feat: 调整拼音输入数据集处理逻辑及模型结构参数

This commit is contained in:
songsenand 2026-02-22 15:19:59 +08:00
parent 350cab20c5
commit 398155721d
7 changed files with 4 additions and 11 deletions

View File

@ -438,9 +438,9 @@ class PinyinInputDataset(IterableDataset):
return_tensors="pt", return_tensors="pt",
) )
prob = random.random()
pg = self.pg_groups[processed_pinyin[0]] if processed_pinyin else 12 pg = self.pg_groups[processed_pinyin[0]] if processed_pinyin else 12
if prob < 0.1: prob = random.random()
if prob < 0.3:
py = "" py = ""
else: else:
py = processed_pinyin py = processed_pinyin

View File

@ -112,12 +112,11 @@ class MoEModel(nn.Module):
# 2. 4 层标准 Transformer Encoder(从 config 读取参数) # 2. 4 层标准 Transformer Encoder(从 config 读取参数)
encoder_layer = nn.TransformerEncoderLayer( encoder_layer = nn.TransformerEncoderLayer(
d_model=self.hidden_size, d_model=self.hidden_size,
nhead=self.bert_config.num_attention_heads, nhead=8,
dim_feedforward=self.bert_config.intermediate_size, dim_feedforward=self.bert_config.intermediate_size,
dropout=self.bert_config.hidden_dropout_prob, dropout=self.bert_config.hidden_dropout_prob,
activation="gelu", activation="gelu",
batch_first=True, batch_first=True,
norm_first=True, # Pre-LN与预训练一致
) )
self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=4) self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=4)
@ -128,7 +127,7 @@ class MoEModel(nn.Module):
# self.linear = nn.Linear(self.hidden_size, self.hidden_size) # self.linear = nn.Linear(self.hidden_size, self.hidden_size)
# 3. 专家层:8个领域专家 + 1个共享专家 # 3. 专家层:n个领域专家 + 1个共享专家
total_experts = num_domain_experts + num_shared_experts total_experts = num_domain_experts + num_shared_experts
self.experts = nn.ModuleList() self.experts = nn.ModuleList()
@ -156,17 +155,11 @@ class MoEModel(nn.Module):
self.output_multiplier * self.hidden_size, self.output_multiplier * self.hidden_size,
), ),
nn.ReLU(inplace=True), nn.ReLU(inplace=True),
nn.Linear(
self.output_multiplier * self.hidden_size,
self.output_multiplier * self.hidden_size,
),
nn.ReLU(inplace=True),
nn.Linear( nn.Linear(
self.output_multiplier * self.hidden_size, self.output_multiplier * self.hidden_size,
self.output_multiplier * self.hidden_size * 2, self.output_multiplier * self.hidden_size * 2,
), ),
nn.ReLU(inplace=True), nn.ReLU(inplace=True),
nn.Dropout(0.2),
nn.Linear(self.output_multiplier * self.hidden_size * 2, num_classes), nn.Linear(self.output_multiplier * self.hidden_size * 2, num_classes),
) )
# 可选:为领域专家和共享专家设置不同权重衰减(通过优化器实现,此处不处理) # 可选:为领域专家和共享专家设置不同权重衰减(通过优化器实现,此处不处理)