feat: 调整拼音输入数据集处理逻辑及模型结构参数
This commit is contained in:
parent
350cab20c5
commit
398155721d
|
|
@ -438,9 +438,9 @@ class PinyinInputDataset(IterableDataset):
|
|||
return_tensors="pt",
|
||||
)
|
||||
|
||||
prob = random.random()
|
||||
pg = self.pg_groups[processed_pinyin[0]] if processed_pinyin else 12
|
||||
if prob < 0.1:
|
||||
prob = random.random()
|
||||
if prob < 0.3:
|
||||
py = ""
|
||||
else:
|
||||
py = processed_pinyin
|
||||
|
|
|
|||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
|
|
@ -112,12 +112,11 @@ class MoEModel(nn.Module):
|
|||
# 2. 4 层标准 Transformer Encoder(从 config 读取参数)
|
||||
encoder_layer = nn.TransformerEncoderLayer(
|
||||
d_model=self.hidden_size,
|
||||
nhead=self.bert_config.num_attention_heads,
|
||||
nhead=8,
|
||||
dim_feedforward=self.bert_config.intermediate_size,
|
||||
dropout=self.bert_config.hidden_dropout_prob,
|
||||
activation="gelu",
|
||||
batch_first=True,
|
||||
norm_first=True, # Pre-LN,与预训练一致
|
||||
)
|
||||
self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=4)
|
||||
|
||||
|
|
@ -128,7 +127,7 @@ class MoEModel(nn.Module):
|
|||
|
||||
# self.linear = nn.Linear(self.hidden_size, self.hidden_size)
|
||||
|
||||
# 3. 专家层:8个领域专家 + 1个共享专家
|
||||
# 3. 专家层:n个领域专家 + 1个共享专家
|
||||
total_experts = num_domain_experts + num_shared_experts
|
||||
self.experts = nn.ModuleList()
|
||||
|
||||
|
|
@ -156,17 +155,11 @@ class MoEModel(nn.Module):
|
|||
self.output_multiplier * self.hidden_size,
|
||||
),
|
||||
nn.ReLU(inplace=True),
|
||||
nn.Linear(
|
||||
self.output_multiplier * self.hidden_size,
|
||||
self.output_multiplier * self.hidden_size,
|
||||
),
|
||||
nn.ReLU(inplace=True),
|
||||
nn.Linear(
|
||||
self.output_multiplier * self.hidden_size,
|
||||
self.output_multiplier * self.hidden_size * 2,
|
||||
),
|
||||
nn.ReLU(inplace=True),
|
||||
nn.Dropout(0.2),
|
||||
nn.Linear(self.output_multiplier * self.hidden_size * 2, num_classes),
|
||||
)
|
||||
# 可选:为领域专家和共享专家设置不同权重衰减(通过优化器实现,此处不处理)
|
||||
|
|
|
|||
Loading…
Reference in New Issue