调整模型结构及参数以优化性能

This commit is contained in:
songsenand 2026-02-20 23:21:27 +08:00
parent ae414bae6b
commit 558d7f9fc9
1 changed file with 30 additions and 26 deletions

View File

@ -89,8 +89,8 @@ class Expert(nn.Module):
dropout_prob=0.1, dropout_prob=0.1,
): ):
""" """
input_dim : BERT 输出的 hidden_size 312/768 input_dim : 输入维度
d_model : 专家内部维度固定 1024 d_model : 专家内部维度
output_multiplier : 输出维度 = input_dim * output_multiplier output_multiplier : 输出维度 = input_dim * output_multiplier
dropout_prob : 残差块内 Dropout dropout_prob : 残差块内 Dropout
""" """
@ -154,15 +154,16 @@ class MoEModel(nn.Module):
batch_first=True, batch_first=True,
) )
self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=4) self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=4)
self.pooler = nn.AdaptiveAvgPool1d(1)
self.res_blocks = nn.ModuleList([ResidualBlock(self.hidden_size) for _ in range(4)]) self.res_blocks = nn.ModuleList([ResidualBlock(self.hidden_size) for _ in range(4)])
self.pooler = nn.AdaptiveAvgPool1d(2)
self.total_experts = 20 self.total_experts = 20
self.experts = nn.ModuleList() self.experts = nn.ModuleList()
for i in range(self.total_experts): for i in range(self.total_experts):
expert = Expert( expert = Expert(
input_dim=self.hidden_size, input_dim=self.hidden_size * 2,
d_model=self.experts_dim[i], d_model=self.experts_dim[i],
num_resblocks=num_resblocks, num_resblocks=num_resblocks,
output_multiplier=self.output_multiplier, # 输出维度 = 2 * hidden_size output_multiplier=self.output_multiplier, # 输出维度 = 2 * hidden_size
@ -171,24 +172,24 @@ class MoEModel(nn.Module):
self.experts.append(expert) self.experts.append(expert)
self.expert_bias = nn.Embedding( self.expert_bias = nn.Embedding(
self.total_experts, self.output_multiplier * self.hidden_size self.total_experts, self.output_multiplier * self.hidden_size * 2
) )
# 4. 分类头 # 4. 分类头
self.classifier = nn.Sequential( self.classifier = nn.Sequential(
nn.LayerNorm(self.output_multiplier * self.hidden_size), nn.LayerNorm(self.output_multiplier * self.hidden_size * 2),
nn.Linear( nn.Linear(
self.output_multiplier * self.hidden_size, self.output_multiplier * self.hidden_size * 2,
self.output_multiplier * self.hidden_size,
),
nn.ReLU(inplace=True),
nn.Linear(
self.output_multiplier * self.hidden_size,
self.output_multiplier * self.hidden_size * 2, self.output_multiplier * self.hidden_size * 2,
), ),
nn.ReLU(inplace=True), nn.ReLU(inplace=True),
nn.Dropout(0.2), nn.Linear(
nn.Linear(self.output_multiplier * self.hidden_size * 2, num_classes), self.output_multiplier * self.hidden_size * 2,
self.output_multiplier * self.hidden_size * 4,
),
nn.ReLU(inplace=True),
nn.Dropout(0.1),
nn.Linear(self.output_multiplier * self.hidden_size * 4, num_classes),
) )
# 可选:为领域专家和共享专家设置不同权重衰减(通过优化器实现,此处不处理) # 可选:为领域专家和共享专家设置不同权重衰减(通过优化器实现,此处不处理)
@ -213,11 +214,12 @@ class MoEModel(nn.Module):
embeddings, src_key_padding_mask=padding_mask embeddings, src_key_padding_mask=padding_mask
) # [B, S, H] ) # [B, S, H]
# ----- 3. 池化量 -----
pooled = self.pooler(encoded.transpose(1, 2)).squeeze(-1)
for block in self.res_blocks: for block in self.res_blocks:
pooled = block(pooled) pooled = block(encoded)
# ----- 3. 池化量 -----
pooled = self.pooler(encoded.transpose(1, 2)) # [B, H, 2]
pooled = pooled.flatten(1) # [B, H*2]
# ----- 4. 专家路由(硬路由)----- # ----- 4. 专家路由(硬路由)-----
if torch.jit.is_tracing(): if torch.jit.is_tracing():
@ -466,7 +468,7 @@ class MoEModel(nn.Module):
train_dataloader, train_dataloader,
eval_dataloader=None, eval_dataloader=None,
monitor: Optional[TrainingMonitor] = None, monitor: Optional[TrainingMonitor] = None,
criterion=nn.CrossEntropyLoss(), criterion=None,
optimizer=None, optimizer=None,
num_epochs=1, num_epochs=1,
stop_batch=1e6, stop_batch=1e6,
@ -475,7 +477,7 @@ class MoEModel(nn.Module):
clip_grad_norm=1.0, clip_grad_norm=1.0,
mixed_precision=False, mixed_precision=False,
loss_weight=None, loss_weight=None,
lr=1e-4, lr=6e-5,
lr_schedule=None, # 新增:可选的自定义学习率调度函数 lr_schedule=None, # 新增:可选的自定义学习率调度函数
): ):
""" """
@ -512,17 +514,19 @@ class MoEModel(nn.Module):
if self.device is None: if self.device is None:
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
self.to(self.device) self.to(self.device)
if loss_weight:
loss_weight = 1 / torch.sqrt(torch.sqrt(torch.tensor(loss_weight)))
self.loss_weight = loss_weight.to(self.device)
criterion.weight = self.loss_weight
# 切换到训练模式 # 切换到训练模式
super().train() super().train()
# 默认优化器 # 默认优化器
if optimizer is None: if optimizer is None:
optimizer = optim.AdamW(self.parameters(), lr=lr) # 初始学习率 1e-4 optimizer = optim.AdamW(self.parameters(), lr=lr)
if criterion is None:
if loss_weight is not None:
criterion = nn.CrossEntropyLoss(weight=loss_weight)
else:
criterion = nn.CrossEntropyLoss()
# 混合精度缩放器 # 混合精度缩放器
scaler = amp.GradScaler(enabled=mixed_precision) scaler = amp.GradScaler(enabled=mixed_precision)
@ -560,7 +564,7 @@ class MoEModel(nn.Module):
scaler.scale(loss).backward() scaler.scale(loss).backward()
# 梯度累积 # 梯度累积
if (batch_idx + 1) % grad_accum_steps == 0: if (batch_idx) % grad_accum_steps == 0:
scaler.unscale_(optimizer) scaler.unscale_(optimizer)
torch.nn.utils.clip_grad_norm_(self.parameters(), clip_grad_norm) torch.nn.utils.clip_grad_norm_(self.parameters(), clip_grad_norm)
scaler.step(optimizer) scaler.step(optimizer)