diff --git a/src/suinput/dataset.py b/src/suinput/dataset.py index 5bf37a4..8efea57 100644 --- a/src/suinput/dataset.py +++ b/src/suinput/dataset.py @@ -438,9 +438,9 @@ class PinyinInputDataset(IterableDataset): return_tensors="pt", ) - prob = random.random() pg = self.pg_groups[processed_pinyin[0]] if processed_pinyin else 12 - if prob < 0.1: + prob = random.random() + if prob < 0.3: py = "" else: py = processed_pinyin diff --git a/src/trainer/eval_dataset/sample_0.pkl b/src/trainer/eval_dataset/sample_0.pkl index 4069112..566815f 100644 Binary files a/src/trainer/eval_dataset/sample_0.pkl and b/src/trainer/eval_dataset/sample_0.pkl differ diff --git a/src/trainer/eval_dataset/sample_1.pkl b/src/trainer/eval_dataset/sample_1.pkl index 469cd31..ef913cd 100644 Binary files a/src/trainer/eval_dataset/sample_1.pkl and b/src/trainer/eval_dataset/sample_1.pkl differ diff --git a/src/trainer/eval_dataset/sample_2.pkl b/src/trainer/eval_dataset/sample_2.pkl index 640bac7..324b7b2 100644 Binary files a/src/trainer/eval_dataset/sample_2.pkl and b/src/trainer/eval_dataset/sample_2.pkl differ diff --git a/src/trainer/eval_dataset/sample_3.pkl b/src/trainer/eval_dataset/sample_3.pkl index 3cc5b31..1a7f1b4 100644 Binary files a/src/trainer/eval_dataset/sample_3.pkl and b/src/trainer/eval_dataset/sample_3.pkl differ diff --git a/src/trainer/eval_dataset/sample_4.pkl b/src/trainer/eval_dataset/sample_4.pkl index 785ab66..9476367 100644 Binary files a/src/trainer/eval_dataset/sample_4.pkl and b/src/trainer/eval_dataset/sample_4.pkl differ diff --git a/src/trainer/model.py b/src/trainer/model.py index 47d7a7b..5754f20 100644 --- a/src/trainer/model.py +++ b/src/trainer/model.py @@ -112,12 +112,11 @@ class MoEModel(nn.Module): # 2. 4 层标准 Transformer Encoder(从 config 读取参数) encoder_layer = nn.TransformerEncoderLayer( d_model=self.hidden_size, - nhead=self.bert_config.num_attention_heads, + nhead=8, dim_feedforward=self.bert_config.intermediate_size, dropout=self.bert_config.hidden_dropout_prob, activation="gelu", batch_first=True, - norm_first=True, # Pre-LN,与预训练一致 ) self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=4) @@ -128,7 +127,7 @@ class MoEModel(nn.Module): # self.linear = nn.Linear(self.hidden_size, self.hidden_size) - # 3. 专家层:8个领域专家 + 1个共享专家 + # 3. 专家层:n个领域专家 + 1个共享专家 total_experts = num_domain_experts + num_shared_experts self.experts = nn.ModuleList() @@ -156,17 +155,11 @@ class MoEModel(nn.Module): self.output_multiplier * self.hidden_size, ), nn.ReLU(inplace=True), - nn.Linear( - self.output_multiplier * self.hidden_size, - self.output_multiplier * self.hidden_size, - ), - nn.ReLU(inplace=True), nn.Linear( self.output_multiplier * self.hidden_size, self.output_multiplier * self.hidden_size * 2, ), nn.ReLU(inplace=True), - nn.Dropout(0.2), nn.Linear(self.output_multiplier * self.hidden_size * 2, num_classes), ) # 可选:为领域专家和共享专家设置不同权重衰减(通过优化器实现,此处不处理)