feat: 调整拼音输入数据集处理逻辑及模型结构参数
This commit is contained in:
parent
350cab20c5
commit
398155721d
|
|
@ -438,9 +438,9 @@ class PinyinInputDataset(IterableDataset):
|
||||||
return_tensors="pt",
|
return_tensors="pt",
|
||||||
)
|
)
|
||||||
|
|
||||||
prob = random.random()
|
|
||||||
pg = self.pg_groups[processed_pinyin[0]] if processed_pinyin else 12
|
pg = self.pg_groups[processed_pinyin[0]] if processed_pinyin else 12
|
||||||
if prob < 0.1:
|
prob = random.random()
|
||||||
|
if prob < 0.3:
|
||||||
py = ""
|
py = ""
|
||||||
else:
|
else:
|
||||||
py = processed_pinyin
|
py = processed_pinyin
|
||||||
|
|
|
||||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
|
|
@ -112,12 +112,11 @@ class MoEModel(nn.Module):
|
||||||
# 2. 4 层标准 Transformer Encoder(从 config 读取参数)
|
# 2. 4 层标准 Transformer Encoder(从 config 读取参数)
|
||||||
encoder_layer = nn.TransformerEncoderLayer(
|
encoder_layer = nn.TransformerEncoderLayer(
|
||||||
d_model=self.hidden_size,
|
d_model=self.hidden_size,
|
||||||
nhead=self.bert_config.num_attention_heads,
|
nhead=8,
|
||||||
dim_feedforward=self.bert_config.intermediate_size,
|
dim_feedforward=self.bert_config.intermediate_size,
|
||||||
dropout=self.bert_config.hidden_dropout_prob,
|
dropout=self.bert_config.hidden_dropout_prob,
|
||||||
activation="gelu",
|
activation="gelu",
|
||||||
batch_first=True,
|
batch_first=True,
|
||||||
norm_first=True, # Pre-LN,与预训练一致
|
|
||||||
)
|
)
|
||||||
self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=4)
|
self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=4)
|
||||||
|
|
||||||
|
|
@ -128,7 +127,7 @@ class MoEModel(nn.Module):
|
||||||
|
|
||||||
# self.linear = nn.Linear(self.hidden_size, self.hidden_size)
|
# self.linear = nn.Linear(self.hidden_size, self.hidden_size)
|
||||||
|
|
||||||
# 3. 专家层:8个领域专家 + 1个共享专家
|
# 3. 专家层:n个领域专家 + 1个共享专家
|
||||||
total_experts = num_domain_experts + num_shared_experts
|
total_experts = num_domain_experts + num_shared_experts
|
||||||
self.experts = nn.ModuleList()
|
self.experts = nn.ModuleList()
|
||||||
|
|
||||||
|
|
@ -156,17 +155,11 @@ class MoEModel(nn.Module):
|
||||||
self.output_multiplier * self.hidden_size,
|
self.output_multiplier * self.hidden_size,
|
||||||
),
|
),
|
||||||
nn.ReLU(inplace=True),
|
nn.ReLU(inplace=True),
|
||||||
nn.Linear(
|
|
||||||
self.output_multiplier * self.hidden_size,
|
|
||||||
self.output_multiplier * self.hidden_size,
|
|
||||||
),
|
|
||||||
nn.ReLU(inplace=True),
|
|
||||||
nn.Linear(
|
nn.Linear(
|
||||||
self.output_multiplier * self.hidden_size,
|
self.output_multiplier * self.hidden_size,
|
||||||
self.output_multiplier * self.hidden_size * 2,
|
self.output_multiplier * self.hidden_size * 2,
|
||||||
),
|
),
|
||||||
nn.ReLU(inplace=True),
|
nn.ReLU(inplace=True),
|
||||||
nn.Dropout(0.2),
|
|
||||||
nn.Linear(self.output_multiplier * self.hidden_size * 2, num_classes),
|
nn.Linear(self.output_multiplier * self.hidden_size * 2, num_classes),
|
||||||
)
|
)
|
||||||
# 可选:为领域专家和共享专家设置不同权重衰减(通过优化器实现,此处不处理)
|
# 可选:为领域专家和共享专家设置不同权重衰减(通过优化器实现,此处不处理)
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue