From d82c80f3a98e78626c8d8d33a637bc596b987f00 Mon Sep 17 00:00:00 2001
From: songsenand
Date: Fri, 13 Feb 2026 14:19:57 +0800
Subject: [PATCH] =?UTF-8?q?=E4=BF=AE=E5=A4=8D=E5=88=86=E7=B1=BB=E5=A4=B4?=
 =?UTF-8?q?=E8=BE=93=E5=87=BA=E7=BB=B4=E5=BA=A6=EF=BC=8C=E4=BD=BF=E7=94=A8?=
 =?UTF-8?q?=20d=5Fmodel=20=E6=9B=BF=E4=BB=A3=20hidden=5Fsize?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/trainer/model.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/trainer/model.py b/src/trainer/model.py
index cb3e8cf..5ce92e3 100644
--- a/src/trainer/model.py
+++ b/src/trainer/model.py
@@ -125,16 +125,16 @@ class MoEModel(nn.Module):
                 input_dim=self.hidden_size,
                 d_model=d_model,
                 num_resblocks=num_resblocks,
-                output_multiplier=2,  # 输出维度 = 2 * hidden_size
+                output_multiplier=2,  # 输出维度 = 2 * d_model
                 dropout_prob=dropout_prob,
             )
             self.experts.append(expert)
 
         # 4. 分类头
         self.classifier = nn.Sequential(
-            nn.LayerNorm(2 * self.hidden_size),  # 专家输出维度
+            nn.LayerNorm(2 * d_model),  # 专家输出维度
             nn.Dropout(0.1),
-            nn.Linear(2 * self.hidden_size, num_classes),
+            nn.Linear(2 * d_model, num_classes),
         )
 
         # 可选：为领域专家和共享专家设置不同权重衰减（通过优化器实现，此处不处理）