diff --git a/src/trainer/model.py b/src/trainer/model.py
index cc97f25..54ed892 100644
--- a/src/trainer/model.py
+++ b/src/trainer/model.py
@@ -89,8 +89,8 @@ class Expert(nn.Module):
         dropout_prob=0.1,
     ):
         """
-        input_dim : hidden_size of the BERT output (e.g. 312/768)
-        d_model   : internal expert dimension (fixed at 1024)
+        input_dim : input dimension
+        d_model   : internal expert dimension
         output_multiplier : output dim = input_dim * output_multiplier
         dropout_prob : Dropout inside the residual blocks
         """
@@ -154,15 +154,16 @@ class MoEModel(nn.Module):
             batch_first=True,
         )
         self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=4)
-        self.pooler = nn.AdaptiveAvgPool1d(1)
         self.res_blocks = nn.ModuleList([ResidualBlock(self.hidden_size) for _ in range(4)])
+        self.pooler = nn.AdaptiveAvgPool1d(2)
+
         self.total_experts = 20
         self.experts = nn.ModuleList()
         for i in range(self.total_experts):
             expert = Expert(
-                input_dim=self.hidden_size,
+                input_dim=self.hidden_size * 2,
                 d_model=self.experts_dim[i],
                 num_resblocks=num_resblocks,
                 output_multiplier=self.output_multiplier,  # output dim = input_dim * output_multiplier
@@ -171,24 +172,24 @@ class MoEModel(nn.Module):
             self.experts.append(expert)

         self.expert_bias = nn.Embedding(
-            self.total_experts, self.output_multiplier * self.hidden_size
+            self.total_experts, self.output_multiplier * self.hidden_size * 2
         )

         # 4. Classification head
         self.classifier = nn.Sequential(
-            nn.LayerNorm(self.output_multiplier * self.hidden_size),
+            nn.LayerNorm(self.output_multiplier * self.hidden_size * 2),
             nn.Linear(
-                self.output_multiplier * self.hidden_size,
-                self.output_multiplier * self.hidden_size,
-            ),
-            nn.ReLU(inplace=True),
-            nn.Linear(
-                self.output_multiplier * self.hidden_size,
+                self.output_multiplier * self.hidden_size * 2,
                 self.output_multiplier * self.hidden_size * 2,
             ),
             nn.ReLU(inplace=True),
-            nn.Dropout(0.2),
-            nn.Linear(self.output_multiplier * self.hidden_size * 2, num_classes),
+            nn.Linear(
+                self.output_multiplier * self.hidden_size * 2,
+                self.output_multiplier * self.hidden_size * 4,
+            ),
+            nn.ReLU(inplace=True),
+            nn.Dropout(0.1),
+            nn.Linear(self.output_multiplier * self.hidden_size * 4, num_classes),
         )

         # Optional: set different weight decay for domain vs. shared experts (done via the optimizer, not handled here)
@@ -213,11 +214,12 @@ class MoEModel(nn.Module):
             embeddings, src_key_padding_mask=padding_mask
         )  # [B, S, H]

-        # ----- 3. Pooling -----
-        pooled = self.pooler(encoded.transpose(1, 2)).squeeze(-1)
-
         for block in self.res_blocks:
-            pooled = block(pooled)
+            encoded = block(encoded)  # refine token states before pooling
+
+        # ----- 3. Pooling -----
+        pooled = self.pooler(encoded.transpose(1, 2))  # [B, H, 2]
+        pooled = pooled.flatten(1)                     # [B, H*2]

        # ----- 4. Expert routing (hard routing) -----
        if torch.jit.is_tracing():
@@ -466,7 +468,7 @@ class MoEModel(nn.Module):
         train_dataloader,
         eval_dataloader=None,
         monitor: Optional[TrainingMonitor] = None,
-        criterion=nn.CrossEntropyLoss(),
+        criterion=None,
         optimizer=None,
         num_epochs=1,
         stop_batch=1e6,
@@ -475,7 +477,7 @@ class MoEModel(nn.Module):
         clip_grad_norm=1.0,
         mixed_precision=False,
         loss_weight=None,
-        lr=1e-4,
+        lr=6e-5,
         lr_schedule=None,  # new: optional custom learning-rate schedule function
     ):
         """
@@ -512,17 +514,19 @@ class MoEModel(nn.Module):
         if self.device is None:
             self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         self.to(self.device)
-        if loss_weight:
-            loss_weight = 1 / torch.sqrt(torch.sqrt(torch.tensor(loss_weight)))
-            self.loss_weight = loss_weight.to(self.device)
-            criterion.weight = self.loss_weight

         # Switch to training mode
         super().train()

         # Default optimizer
         if optimizer is None:
-            optimizer = optim.AdamW(self.parameters(), lr=lr)  # initial learning rate 1e-4
+            optimizer = optim.AdamW(self.parameters(), lr=lr)
+
+        if criterion is None:
+            weight = None
+            if loss_weight is not None:
+                weight = torch.as_tensor(loss_weight, dtype=torch.float, device=self.device)
+            criterion = nn.CrossEntropyLoss(weight=weight)

         # Mixed-precision gradient scaler
         scaler = amp.GradScaler(enabled=mixed_precision)
@@ -560,7 +564,7 @@ class MoEModel(nn.Module):
             scaler.scale(loss).backward()

             # Gradient accumulation
-            if (batch_idx + 1) % grad_accum_steps == 0:
+            if (batch_idx + 1) % grad_accum_steps == 0:  # step once per full accumulation window
                 scaler.unscale_(optimizer)
                 torch.nn.utils.clip_grad_norm_(self.parameters(), clip_grad_norm)
                 scaler.step(optimizer)
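
Reviewer note on the architecture hunks: switching the pooler from AdaptiveAvgPool1d(1) to AdaptiveAvgPool1d(2) doubles the pooled feature width, which is why every downstream dimension (expert input_dim, expert_bias, classifier) picks up the extra `* 2`. A standalone shape check of the new pooling path, with made-up sizes for illustration (not part of the patch):

    import torch
    import torch.nn as nn

    B, S, H = 4, 128, 312                      # batch, sequence length, hidden size
    encoded = torch.randn(B, S, H)             # encoder output [B, S, H]

    pooler = nn.AdaptiveAvgPool1d(2)           # was AdaptiveAvgPool1d(1)
    pooled = pooler(encoded.transpose(1, 2))   # [B, H, 2]: mean over two halves of the sequence
    pooled = pooled.flatten(1)                 # [B, H*2]: matches input_dim=self.hidden_size * 2
    assert pooled.shape == (B, H * 2)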
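
The `criterion=None` change in the train signature fixes a classic Python pitfall: a default like `criterion=nn.CrossEntropyLoss()` is evaluated once, when the def line runs, so the old `criterion.weight = self.loss_weight` assignment mutated a single module shared by every subsequent call. A minimal demonstration of the pitfall and of the construct-per-call pattern the patch adopts (hypothetical names, illustrative only):

    import torch
    import torch.nn as nn

    def fit(criterion=nn.CrossEntropyLoss()):   # default built once, at def time
        return criterion

    first = fit()
    first.weight = torch.tensor([1.0, 2.0])     # mutate "the default"
    print(fit().weight)                         # tensor([1., 2.]) -- leaked into later calls

    def fit_fixed(criterion=None, loss_weight=None):
        if criterion is None:                   # fresh criterion per call, as in the patch
            criterion = nn.CrossEntropyLoss(weight=loss_weight)
        return criterion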
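
On the gradient-accumulation hunk: `(batch_idx + 1) % grad_accum_steps == 0` steps the optimizer once per full accumulation window; dropping the `+ 1` would fire on batch 0 after a single micro-batch and shift every later step by one. A quick check of the two predicates (illustrative):

    grad_accum_steps = 4
    step_plus_one = [i for i in range(12) if (i + 1) % grad_accum_steps == 0]
    step_bare     = [i for i in range(12) if i % grad_accum_steps == 0]
    print(step_plus_one)  # [3, 7, 11] -- one step per full 4-batch window
    print(step_bare)      # [0, 4, 8]  -- first step after just one batch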