修复分类头输出维度,使用 d_model 替代 hidden_size
This commit is contained in:
parent
6923870171
commit
d82c80f3a9
|
|
@@ -125,16 +125,16 @@ class MoEModel(nn.Module):
|
|||
input_dim=self.hidden_size,
|
||||
d_model=d_model,
|
||||
num_resblocks=num_resblocks,
|
||||
-output_multiplier=2, # 输出维度 = 2 * hidden_size
|
||||
+output_multiplier=2, # 输出维度 = 2 * d_model
|
||||
dropout_prob=dropout_prob,
|
||||
)
|
||||
self.experts.append(expert)
|
||||
|
||||
# 4. 分类头
|
||||
self.classifier = nn.Sequential(
|
||||
-nn.LayerNorm(2 * self.hidden_size), # 专家输出维度
|
||||
+nn.LayerNorm(2 * d_model), # 专家输出维度
|
||||
nn.Dropout(0.1),
|
||||
-nn.Linear(2 * self.hidden_size, num_classes),
|
||||
+nn.Linear(2 * d_model, num_classes),
|
||||
)
|
||||
|
||||
# 可选:为领域专家和共享专家设置不同权重衰减(通过优化器实现,此处不处理)
|
||||
|
|
|
|||
Loading…
Reference in New Issue