添加损失权重支持并重构部分模块结构
This commit is contained in:
parent
515f261824
commit
94b44e6f71
|
|
@ -424,7 +424,8 @@ class PinyinInputDataset(IterableDataset):
|
|||
|
||||
# Tokenize
|
||||
hint = self.tokenizer(
|
||||
sampled_context + processed_pinyin,
|
||||
sampled_context,
|
||||
processed_pinyin,
|
||||
max_length=self.max_len,
|
||||
padding="max_length",
|
||||
truncation=True,
|
||||
|
|
|
|||
|
|
@ -313,6 +313,11 @@ class QueryEngine:
|
|||
|
||||
return self._char_pinyin_map.get((char, pinyin), 0)
|
||||
|
||||
def get_all_weights(self):
    """Return occurrence counts for every char-pinyin pair, ordered by id.

    NOTE(review): sorting the ids makes this O(n log n), not O(n) as the
    original note claimed.
    """
    ordered_ids = sorted(self._id_to_info)
    return [self._id_to_info[key].count for key in ordered_ids]
|
||||
|
||||
def get_char_info_by_char_pinyin(
|
||||
self, char: str, pinyin: str
|
||||
) -> Optional[CharInfo]:
|
||||
|
|
|
|||
|
|
@ -465,10 +465,12 @@ class MoEModel(nn.Module):
|
|||
criterion=nn.CrossEntropyLoss(),
|
||||
optimizer=None,
|
||||
num_epochs=1,
|
||||
stop_batch=1e6,
|
||||
eval_frequency=500,
|
||||
grad_accum_steps=1,
|
||||
clip_grad_norm=1.0,
|
||||
mixed_precision=False,
|
||||
loss_weight=None,
|
||||
lr=1e-4,
|
||||
lr_schedule=None, # 新增:可选的自定义学习率调度函数
|
||||
):
|
||||
|
|
@ -506,6 +508,10 @@ class MoEModel(nn.Module):
|
|||
if self.device is None:
|
||||
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
||||
self.to(self.device)
|
||||
if loss_weight:
|
||||
loss_weight = 1 / torch.sqrt(torch.tensor(loss_weight))
|
||||
self.loss_weight = loss_weight.to(self.device)
|
||||
criterion.weight = self.loss_weight
|
||||
|
||||
# 切换到训练模式
|
||||
super().train()
|
||||
|
|
|
|||
|
|
@ -13,113 +13,14 @@ from modelscope import AutoModel
|
|||
from tqdm import tqdm
|
||||
|
||||
from .monitor import TrainingMonitor
|
||||
from .model import (
|
||||
EXPORT_HIDE_DIM,
|
||||
eval_dataloader,
|
||||
ResidualBlock,
|
||||
Expert
|
||||
)
|
||||
|
||||
|
||||
def eval_dataloader(path: Union[str, Path, None] = None):
    """Load every pickled evaluation batch found under *path*.

    Parameters
    ----------
    path : str | Path | None
        Directory containing ``*.pkl`` files.  ``None`` (the default)
        resolves to the package's bundled ``eval_dataset`` directory,
        lazily, so merely importing this module never touches package
        resources (the original evaluated ``files(__package__)`` at
        import time).

    Returns
    -------
    list
        One deserialized object per ``*.pkl`` file, in sorted filename
        order for reproducibility (glob order is filesystem-dependent).
    """
    if path is None:
        path = files(__package__) / "eval_dataset"
    batches = []
    for pkl_path in sorted(Path(path).glob("*.pkl")):
        # NOTE(review): pickle.load is unsafe on untrusted input — these
        # files are assumed to be trusted, package-bundled fixtures.
        # Context manager closes each file (the original leaked handles).
        with pkl_path.open("rb") as fh:
            batches.append(pickle.load(fh))
    return batches
|
||||
|
||||
|
||||
def round_to_power_of_two(x):
    """Snap integer *x* to the nearer of two adjacent powers of two.

    The exponent is ``x.bit_length()`` clamped to [7, 9], so the two
    candidates come from {128, 256, 512} and {256, 512, 1024}; any
    x < 1 maps to 0.  NOTE(review): because of the clamp the lower
    candidate is usually *greater* than x, contrary to the original
    inline comment — behavior kept as-is since the outputs feed the
    fixed 128/256/512/1024 dimension table.
    """
    if x < 1:
        return 0
    exponent = min(max(x.bit_length(), 7), 9)
    lo = 1 << exponent
    hi = lo << 1
    # Whichever candidate is closer wins; ties go to the upper one.
    return lo if x - lo < hi - x else hi
|
||||
|
||||
|
||||
# Hidden width per expert id (20 experts): the first three use 1024,
# the last one 256, and everything in between 512.
EXPORT_HIDE_DIM = {
    idx: 1024 if idx < 3 else (256 if idx == 19 else 512)
    for idx in range(20)
}
|
||||
|
||||
|
||||
# ---------------------------- Residual block ----------------------------
class ResidualBlock(nn.Module):
    """Two linear layers with LayerNorm, a skip connection, and ReLU output.

    Data path (identical to the original):
    relu(linear1) -> ln1 -> linear2 -> ln2 -> dropout -> + residual -> relu.
    """

    def __init__(self, dim, dropout_prob=0.0):
        super().__init__()
        # Attribute names are preserved so existing state_dicts still load.
        self.linear1 = nn.Linear(dim, dim)
        self.ln1 = nn.LayerNorm(dim)
        self.linear2 = nn.Linear(dim, dim)
        self.ln2 = nn.LayerNorm(dim)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout_prob)

    def forward(self, x):
        skip = x
        out = self.ln1(self.relu(self.linear1(x)))
        out = self.dropout(self.ln2(self.linear2(out)))
        return self.relu(out + skip)
|
||||
|
||||
|
||||
# ---------------------------- Expert network ----------------------------
class Expert(nn.Module):
    """One MoE expert: input projection, residual trunk, two-layer head.

    Args:
        input_dim: hidden size of the upstream BERT encoder (e.g. 312/768).
        d_model: internal width of the expert (fixed at 1024 by default).
        num_resblocks: number of stacked ResidualBlocks in the trunk.
        output_multiplier: output width = input_dim * output_multiplier.
        dropout_prob: dropout probability inside each residual block.
    """

    def __init__(
        self,
        input_dim,
        d_model=1024,
        num_resblocks=4,
        output_multiplier=2,
        dropout_prob=0.0,
    ):
        super().__init__()
        self.input_dim = input_dim
        self.d_model = d_model
        self.output_dim = input_dim * output_multiplier

        # Project encoder features up to the expert's internal width.
        self.linear_in = nn.Linear(input_dim, d_model)

        # Residual trunk (attribute name kept for state_dict compatibility).
        self.res_blocks = nn.ModuleList(
            ResidualBlock(d_model, dropout_prob) for _ in range(num_resblocks)
        )

        # Output head: d_model -> d_model -> output_dim.
        self.output = nn.Sequential(
            nn.Linear(d_model, d_model),
            nn.ReLU(inplace=True),
            nn.Linear(d_model, self.output_dim),
        )

    def forward(self, x):
        hidden = self.linear_in(x)
        for res_block in self.res_blocks:
            hidden = res_block(hidden)
        return self.output(hidden)
|
||||
|
||||
|
||||
# ---------------------------- 主模型(MoE + 硬路由)------------------------
|
||||
class MoEModel(nn.Module):
|
||||
|
|
|
|||
Loading…
Reference in New Issue