feat: 调整拼音输入数据集处理逻辑及模型结构参数

This commit is contained in:
songsenand 2026-02-22 15:19:59 +08:00
parent 350cab20c5
commit 398155721d
7 changed files with 4 additions and 11 deletions

View File

@ -438,9 +438,9 @@ class PinyinInputDataset(IterableDataset):
return_tensors="pt",
)
prob = random.random()
pg = self.pg_groups[processed_pinyin[0]] if processed_pinyin else 12
if prob < 0.1:
prob = random.random()
if prob < 0.3:
py = ""
else:
py = processed_pinyin

View File

@ -112,12 +112,11 @@ class MoEModel(nn.Module):
# 2. 4 层标准 Transformer Encoder(从 config 读取参数)
encoder_layer = nn.TransformerEncoderLayer(
d_model=self.hidden_size,
nhead=self.bert_config.num_attention_heads,
nhead=8,
dim_feedforward=self.bert_config.intermediate_size,
dropout=self.bert_config.hidden_dropout_prob,
activation="gelu",
batch_first=True,
norm_first=True, # Pre-LN与预训练一致
)
self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=4)
@ -128,7 +127,7 @@ class MoEModel(nn.Module):
# self.linear = nn.Linear(self.hidden_size, self.hidden_size)
# 3. 专家层:8个领域专家 + 1个共享专家
# 3. 专家层:n个领域专家 + 1个共享专家
total_experts = num_domain_experts + num_shared_experts
self.experts = nn.ModuleList()
@ -156,17 +155,11 @@ class MoEModel(nn.Module):
self.output_multiplier * self.hidden_size,
),
nn.ReLU(inplace=True),
nn.Linear(
self.output_multiplier * self.hidden_size,
self.output_multiplier * self.hidden_size,
),
nn.ReLU(inplace=True),
nn.Linear(
self.output_multiplier * self.hidden_size,
self.output_multiplier * self.hidden_size * 2,
),
nn.ReLU(inplace=True),
nn.Dropout(0.2),
nn.Linear(self.output_multiplier * self.hidden_size * 2, num_classes),
)
# 可选:为领域专家和共享专家设置不同权重衰减(通过优化器实现,此处不处理)