diff --git a/src/trainer/model.py b/src/trainer/model.py index cb3e8cf..5ce92e3 100644 --- a/src/trainer/model.py +++ b/src/trainer/model.py @@ -125,16 +125,16 @@ class MoEModel(nn.Module): input_dim=self.hidden_size, d_model=d_model, num_resblocks=num_resblocks, - output_multiplier=2, # 输出维度 = 2 * hidden_size + output_multiplier=2, # 输出维度 = 2 * d_model dropout_prob=dropout_prob, ) self.experts.append(expert) # 4. 分类头 self.classifier = nn.Sequential( - nn.LayerNorm(2 * self.hidden_size), # 专家输出维度 + nn.LayerNorm(2 * d_model), # 专家输出维度 nn.Dropout(0.1), - nn.Linear(2 * self.hidden_size, num_classes), + nn.Linear(2 * d_model, num_classes), ) # 可选:为领域专家和共享专家设置不同权重衰减(通过优化器实现,此处不处理)