调整模型结构及参数以优化性能
This commit is contained in:
parent
ae414bae6b
commit
558d7f9fc9
|
|
@ -89,8 +89,8 @@ class Expert(nn.Module):
|
||||||
dropout_prob=0.1,
|
dropout_prob=0.1,
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
input_dim : BERT 输出的 hidden_size(如 312/768)
|
input_dim : 输入维度
|
||||||
d_model : 专家内部维度(固定 1024)
|
d_model : 专家内部维度
|
||||||
output_multiplier : 输出维度 = input_dim * output_multiplier
|
output_multiplier : 输出维度 = input_dim * output_multiplier
|
||||||
dropout_prob : 残差块内 Dropout
|
dropout_prob : 残差块内 Dropout
|
||||||
"""
|
"""
|
||||||
|
|
@ -154,15 +154,16 @@ class MoEModel(nn.Module):
|
||||||
batch_first=True,
|
batch_first=True,
|
||||||
)
|
)
|
||||||
self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=4)
|
self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=4)
|
||||||
self.pooler = nn.AdaptiveAvgPool1d(1)
|
|
||||||
self.res_blocks = nn.ModuleList([ResidualBlock(self.hidden_size) for _ in range(4)])
|
self.res_blocks = nn.ModuleList([ResidualBlock(self.hidden_size) for _ in range(4)])
|
||||||
|
|
||||||
|
self.pooler = nn.AdaptiveAvgPool1d(2)
|
||||||
|
|
||||||
self.total_experts = 20
|
self.total_experts = 20
|
||||||
self.experts = nn.ModuleList()
|
self.experts = nn.ModuleList()
|
||||||
|
|
||||||
for i in range(self.total_experts):
|
for i in range(self.total_experts):
|
||||||
expert = Expert(
|
expert = Expert(
|
||||||
input_dim=self.hidden_size,
|
input_dim=self.hidden_size * 2,
|
||||||
d_model=self.experts_dim[i],
|
d_model=self.experts_dim[i],
|
||||||
num_resblocks=num_resblocks,
|
num_resblocks=num_resblocks,
|
||||||
output_multiplier=self.output_multiplier, # 输出维度 = 2 * hidden_size
|
output_multiplier=self.output_multiplier, # 输出维度 = 2 * hidden_size
|
||||||
|
|
@ -171,24 +172,24 @@ class MoEModel(nn.Module):
|
||||||
self.experts.append(expert)
|
self.experts.append(expert)
|
||||||
|
|
||||||
self.expert_bias = nn.Embedding(
|
self.expert_bias = nn.Embedding(
|
||||||
self.total_experts, self.output_multiplier * self.hidden_size
|
self.total_experts, self.output_multiplier * self.hidden_size * 2
|
||||||
)
|
)
|
||||||
|
|
||||||
# 4. 分类头
|
# 4. 分类头
|
||||||
self.classifier = nn.Sequential(
|
self.classifier = nn.Sequential(
|
||||||
nn.LayerNorm(self.output_multiplier * self.hidden_size),
|
nn.LayerNorm(self.output_multiplier * self.hidden_size * 2),
|
||||||
nn.Linear(
|
nn.Linear(
|
||||||
self.output_multiplier * self.hidden_size,
|
self.output_multiplier * self.hidden_size * 2,
|
||||||
self.output_multiplier * self.hidden_size,
|
|
||||||
),
|
|
||||||
nn.ReLU(inplace=True),
|
|
||||||
nn.Linear(
|
|
||||||
self.output_multiplier * self.hidden_size,
|
|
||||||
self.output_multiplier * self.hidden_size * 2,
|
self.output_multiplier * self.hidden_size * 2,
|
||||||
),
|
),
|
||||||
nn.ReLU(inplace=True),
|
nn.ReLU(inplace=True),
|
||||||
nn.Dropout(0.2),
|
nn.Linear(
|
||||||
nn.Linear(self.output_multiplier * self.hidden_size * 2, num_classes),
|
self.output_multiplier * self.hidden_size * 2,
|
||||||
|
self.output_multiplier * self.hidden_size * 4,
|
||||||
|
),
|
||||||
|
nn.ReLU(inplace=True),
|
||||||
|
nn.Dropout(0.1),
|
||||||
|
nn.Linear(self.output_multiplier * self.hidden_size * 4, num_classes),
|
||||||
)
|
)
|
||||||
# 可选:为领域专家和共享专家设置不同权重衰减(通过优化器实现,此处不处理)
|
# 可选:为领域专家和共享专家设置不同权重衰减(通过优化器实现,此处不处理)
|
||||||
|
|
||||||
|
|
@ -213,11 +214,12 @@ class MoEModel(nn.Module):
|
||||||
embeddings, src_key_padding_mask=padding_mask
|
embeddings, src_key_padding_mask=padding_mask
|
||||||
) # [B, S, H]
|
) # [B, S, H]
|
||||||
|
|
||||||
# ----- 3. 池化量 -----
|
|
||||||
pooled = self.pooler(encoded.transpose(1, 2)).squeeze(-1)
|
|
||||||
|
|
||||||
for block in self.res_blocks:
|
for block in self.res_blocks:
|
||||||
pooled = block(pooled)
|
pooled = block(encoded)
|
||||||
|
|
||||||
|
# ----- 3. 池化量 -----
|
||||||
|
pooled = self.pooler(encoded.transpose(1, 2)) # [B, H, 2]
|
||||||
|
pooled = pooled.flatten(1) # [B, H*2]
|
||||||
|
|
||||||
# ----- 4. 专家路由(硬路由)-----
|
# ----- 4. 专家路由(硬路由)-----
|
||||||
if torch.jit.is_tracing():
|
if torch.jit.is_tracing():
|
||||||
|
|
@ -466,7 +468,7 @@ class MoEModel(nn.Module):
|
||||||
train_dataloader,
|
train_dataloader,
|
||||||
eval_dataloader=None,
|
eval_dataloader=None,
|
||||||
monitor: Optional[TrainingMonitor] = None,
|
monitor: Optional[TrainingMonitor] = None,
|
||||||
criterion=nn.CrossEntropyLoss(),
|
criterion=None,
|
||||||
optimizer=None,
|
optimizer=None,
|
||||||
num_epochs=1,
|
num_epochs=1,
|
||||||
stop_batch=1e6,
|
stop_batch=1e6,
|
||||||
|
|
@ -475,7 +477,7 @@ class MoEModel(nn.Module):
|
||||||
clip_grad_norm=1.0,
|
clip_grad_norm=1.0,
|
||||||
mixed_precision=False,
|
mixed_precision=False,
|
||||||
loss_weight=None,
|
loss_weight=None,
|
||||||
lr=1e-4,
|
lr=6e-5,
|
||||||
lr_schedule=None, # 新增:可选的自定义学习率调度函数
|
lr_schedule=None, # 新增:可选的自定义学习率调度函数
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
|
|
@ -512,17 +514,19 @@ class MoEModel(nn.Module):
|
||||||
if self.device is None:
|
if self.device is None:
|
||||||
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
||||||
self.to(self.device)
|
self.to(self.device)
|
||||||
if loss_weight:
|
|
||||||
loss_weight = 1 / torch.sqrt(torch.sqrt(torch.tensor(loss_weight)))
|
|
||||||
self.loss_weight = loss_weight.to(self.device)
|
|
||||||
criterion.weight = self.loss_weight
|
|
||||||
|
|
||||||
# 切换到训练模式
|
# 切换到训练模式
|
||||||
super().train()
|
super().train()
|
||||||
|
|
||||||
# 默认优化器
|
# 默认优化器
|
||||||
if optimizer is None:
|
if optimizer is None:
|
||||||
optimizer = optim.AdamW(self.parameters(), lr=lr) # 初始学习率 1e-4
|
optimizer = optim.AdamW(self.parameters(), lr=lr)
|
||||||
|
|
||||||
|
if criterion is None:
|
||||||
|
if loss_weight is not None:
|
||||||
|
criterion = nn.CrossEntropyLoss(weight=loss_weight)
|
||||||
|
else:
|
||||||
|
criterion = nn.CrossEntropyLoss()
|
||||||
|
|
||||||
# 混合精度缩放器
|
# 混合精度缩放器
|
||||||
scaler = amp.GradScaler(enabled=mixed_precision)
|
scaler = amp.GradScaler(enabled=mixed_precision)
|
||||||
|
|
@ -560,7 +564,7 @@ class MoEModel(nn.Module):
|
||||||
scaler.scale(loss).backward()
|
scaler.scale(loss).backward()
|
||||||
|
|
||||||
# 梯度累积
|
# 梯度累积
|
||||||
if (batch_idx + 1) % grad_accum_steps == 0:
|
if (batch_idx) % grad_accum_steps == 0:
|
||||||
scaler.unscale_(optimizer)
|
scaler.unscale_(optimizer)
|
||||||
torch.nn.utils.clip_grad_norm_(self.parameters(), clip_grad_norm)
|
torch.nn.utils.clip_grad_norm_(self.parameters(), clip_grad_norm)
|
||||||
scaler.step(optimizer)
|
scaler.step(optimizer)
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue