添加损失权重支持并重构部分模块结构

This commit is contained in:
songsenand 2026-02-15 01:06:52 +08:00
parent 515f261824
commit 94b44e6f71
4 changed files with 19 additions and 106 deletions

View File

@@ -424,7 +424,8 @@ class PinyinInputDataset(IterableDataset):
# Tokenize
hint = self.tokenizer(
sampled_context + processed_pinyin,
sampled_context,
processed_pinyin,
max_length=self.max_len,
padding="max_length",
truncation=True,

View File

@@ -313,6 +313,11 @@ class QueryEngine:
return self._char_pinyin_map.get((char, pinyin), 0)
def get_all_weights(self):
    """Return the occurrence count of every char-pinyin pair, ordered by id.

    Iterates ``self._id_to_info`` in ascending key order (O(n log n) for the
    sort) and collects each entry's ``count`` attribute.
    """
    counts = []
    for pair_id in sorted(self._id_to_info):
        counts.append(self._id_to_info[pair_id].count)
    return counts
def get_char_info_by_char_pinyin(
self, char: str, pinyin: str
) -> Optional[CharInfo]:

View File

@@ -465,10 +465,12 @@ class MoEModel(nn.Module):
criterion=nn.CrossEntropyLoss(),
optimizer=None,
num_epochs=1,
stop_batch=1e6,
eval_frequency=500,
grad_accum_steps=1,
clip_grad_norm=1.0,
mixed_precision=False,
loss_weight=None,
lr=1e-4,
lr_schedule=None, # 新增:可选的自定义学习率调度函数
):
@@ -506,6 +508,10 @@
if self.device is None:
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
self.to(self.device)
if loss_weight:
loss_weight = 1 / torch.sqrt(torch.tensor(loss_weight))
self.loss_weight = loss_weight.to(self.device)
criterion.weight = self.loss_weight
# 切换到训练模式
super().train()

View File

@@ -13,112 +13,13 @@ from modelscope import AutoModel
from tqdm import tqdm
from .monitor import TrainingMonitor
def eval_dataloader(path: Union[str, Path] = (files(__package__) / "eval_dataset")):
    """Load every pickled evaluation batch stored under *path*.

    Reads each ``*.pkl`` file in *path* (default: the package-bundled
    ``eval_dataset`` directory) and returns the unpickled objects as a list.
    Glob order is filesystem-dependent, so callers must not rely on ordering.

    NOTE(review): ``pickle.load`` is only safe because these files ship with
    the package; never point *path* at untrusted data.
    """
    batches = []
    for pkl_path in Path(path).glob("*.pkl"):
        # Use a context manager so every file handle is closed; the original
        # comprehension leaked one open handle per file.
        with pkl_path.open("rb") as fh:
            batches.append(pickle.load(fh))
    return batches
def round_to_power_of_two(x):
    """Map an integer *x* to a power of two clamped to [128, 1024].

    Returns 0 when ``x < 1``. Otherwise the exponent ``x.bit_length()`` is
    clamped into [7, 9]; the result is ``2**exp`` unless *x* lies closer to
    ``2**(exp + 1)`` (possible only at the upper clamp, i.e. x >= 768), in
    which case that larger bound is returned. Ties go to the upper bound.
    """
    if x < 1:
        return 0
    exp = min(max(x.bit_length(), 7), 9)
    low = 1 << exp
    high = 1 << (exp + 1)
    # Strict comparison: a tie (x equidistant) selects the upper bound,
    # matching the original implementation exactly.
    return low if (x - low) < (high - x) else high
# Exported hidden width per layer index: layers 0-2 use 1024, the final
# layer (19) shrinks to 256, and everything in between uses 512.
EXPORT_HIDE_DIM = {
    layer: (1024 if layer <= 2 else 256 if layer == 19 else 512)
    for layer in range(20)
}
# ---------------------------- 残差块 ----------------------------
class ResidualBlock(nn.Module):
    """MLP residual block: Linear -> ReLU -> LN -> Linear -> LN -> Dropout,
    added back to the input and passed through a final ReLU."""

    def __init__(self, dim, dropout_prob=0.0):
        super().__init__()
        # Attribute names are kept identical so existing checkpoints
        # (state_dict keys) continue to load.
        self.linear1 = nn.Linear(dim, dim)
        self.ln1 = nn.LayerNorm(dim)
        self.linear2 = nn.Linear(dim, dim)
        self.ln2 = nn.LayerNorm(dim)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout_prob)

    def forward(self, x):
        shortcut = x
        out = self.ln1(self.relu(self.linear1(x)))
        out = self.ln2(self.linear2(out))
        out = self.dropout(out)
        return self.relu(out + shortcut)
# ---------------------------- 专家网络 ----------------------------
class Expert(nn.Module):
    """One expert of the MoE model: input projection, a stack of residual
    blocks, and an output MLP that widens the representation."""

    def __init__(
        self,
        input_dim,
        d_model=1024,
        num_resblocks=4,
        output_multiplier=2,
        dropout_prob=0.0,
    ):
        """
        input_dim : hidden_size of the BERT output (e.g. 312/768)
        d_model : internal expert width (default 1024)
        num_resblocks : number of stacked ResidualBlocks
        output_multiplier : output dim = input_dim * output_multiplier
        dropout_prob : dropout probability inside each residual block
        """
        super().__init__()
        self.input_dim = input_dim
        self.d_model = d_model
        self.output_dim = input_dim * output_multiplier
        # Input projection: input_dim -> d_model
        self.linear_in = nn.Linear(input_dim, d_model)
        # Residual stack.
        # NOTE(review): in the extracted diff view a stray
        # `from .model import (...)` block from another changed file was
        # spliced into this list and its closing bracket was lost; the
        # definition below restores the original construction.
        self.res_blocks = nn.ModuleList(
            [ResidualBlock(d_model, dropout_prob) for _ in range(num_resblocks)]
        )
        # Output projection: d_model -> output_dim
        self.output = nn.Sequential(
            nn.Linear(d_model, d_model),
            nn.ReLU(inplace=True),
            nn.Linear(d_model, self.output_dim),
        )

    def forward(self, x):
        x = self.linear_in(x)
        for block in self.res_blocks:
            x = block(x)
        return self.output(x)
# ---------------------------- 主模型MoE + 硬路由)------------------------