修复分类头输出维度,使用 d_model 替代 hidden_size

This commit is contained in:
songsenand 2026-02-13 14:19:57 +08:00
parent 6923870171
commit d82c80f3a9
1 changed file with 3 additions and 3 deletions

View File

@@ -125,16 +125,16 @@ class MoEModel(nn.Module):
input_dim=self.hidden_size,
d_model=d_model,
num_resblocks=num_resblocks,
-output_multiplier=2, # 输出维度 = 2 * hidden_size
+output_multiplier=2, # 输出维度 = 2 * d_model
dropout_prob=dropout_prob,
)
self.experts.append(expert)
# 4. 分类头
self.classifier = nn.Sequential(
-nn.LayerNorm(2 * self.hidden_size), # 专家输出维度
+nn.LayerNorm(2 * d_model), # 专家输出维度
nn.Dropout(0.1),
-nn.Linear(2 * self.hidden_size, num_classes),
+nn.Linear(2 * d_model, num_classes),
)
# 可选:为领域专家和共享专家设置不同权重衰减(通过优化器实现,此处不处理)