From 94b44e6f711f85b3d9fb20fa8b107263dde48774 Mon Sep 17 00:00:00 2001 From: songsenand Date: Sun, 15 Feb 2026 01:06:52 +0800 Subject: [PATCH] =?UTF-8?q?=E6=B7=BB=E5=8A=A0=E6=8D=9F=E5=A4=B1=E6=9D=83?= =?UTF-8?q?=E9=87=8D=E6=94=AF=E6=8C=81=E5=B9=B6=E9=87=8D=E6=9E=84=E9=83=A8?= =?UTF-8?q?=E5=88=86=E6=A8=A1=E5=9D=97=E7=BB=93=E6=9E=84?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/suinput/dataset.py | 3 +- src/suinput/query.py | 5 ++ src/trainer/model.py | 6 ++ src/trainer/model_with_neck.py | 111 ++------------------------------- 4 files changed, 19 insertions(+), 106 deletions(-) diff --git a/src/suinput/dataset.py b/src/suinput/dataset.py index 3699cfa..b7223d3 100644 --- a/src/suinput/dataset.py +++ b/src/suinput/dataset.py @@ -424,7 +424,8 @@ class PinyinInputDataset(IterableDataset): # Tokenize hint = self.tokenizer( - sampled_context + processed_pinyin, + sampled_context, + processed_pinyin, max_length=self.max_len, padding="max_length", truncation=True, diff --git a/src/suinput/query.py b/src/suinput/query.py index 7f4fba2..a3c139f 100644 --- a/src/suinput/query.py +++ b/src/suinput/query.py @@ -313,6 +313,11 @@ class QueryEngine: return self._char_pinyin_map.get((char, pinyin), 0) + def get_all_weights(self): + """获取所有字符-拼音对出现的次数 - O(n log n)时间复杂度(排序主导)""" + items_sorted = sorted(self._id_to_info.items(), key=lambda x: x[0]) + return [info.count for _, info in items_sorted] + def get_char_info_by_char_pinyin( self, char: str, pinyin: str ) -> Optional[CharInfo]: diff --git a/src/trainer/model.py b/src/trainer/model.py index cbe8fe8..d7d8f32 100644 --- a/src/trainer/model.py +++ b/src/trainer/model.py @@ -465,10 +465,12 @@ class MoEModel(nn.Module): criterion=nn.CrossEntropyLoss(), optimizer=None, num_epochs=1, + stop_batch=1e6, eval_frequency=500, grad_accum_steps=1, clip_grad_norm=1.0, mixed_precision=False, + loss_weight=None, lr=1e-4, lr_schedule=None, # 新增:可选的自定义学习率调度函数 ): @@ -506,6 +508,10 @@ class 
MoEModel(nn.Module): if self.device is None: self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") self.to(self.device) + if loss_weight: + loss_weight = 1 / torch.sqrt(torch.tensor(loss_weight)) + self.loss_weight = loss_weight.to(self.device) + criterion.weight = self.loss_weight # 切换到训练模式 super().train() diff --git a/src/trainer/model_with_neck.py b/src/trainer/model_with_neck.py index 2f36f39..9fd087d 100644 --- a/src/trainer/model_with_neck.py +++ b/src/trainer/model_with_neck.py @@ -13,113 +13,14 @@ from modelscope import AutoModel from tqdm import tqdm from .monitor import TrainingMonitor +from .model import ( + EXPORT_HIDE_DIM, + eval_dataloader, + ResidualBlock, + Expert +) -def eval_dataloader(path: Union[str, Path] = (files(__package__) / "eval_dataset")): - return [pickle.load(file.open("rb")) for file in Path(path).glob("*.pkl")] - - -def round_to_power_of_two(x): - if x < 1: - return 0 - n = x.bit_length() - n = min(max(7, n), 9) - lower = 1 << (n) # 小于等于x的最大2的幂次 - upper = lower << 1 # 大于x的最小2的幂次 - if x - lower < upper - x: - return lower - else: - return upper - - -EXPORT_HIDE_DIM = { - 0: 1024, - 1: 1024, - 2: 1024, - 3: 512, - 4: 512, - 5: 512, - 6: 512, - 7: 512, - 8: 512, - 9: 512, - 10: 512, - 11: 512, - 12: 512, - 13: 512, - 14: 512, - 15: 512, - 16: 512, - 17: 512, - 18: 512, - 19: 256, -} - - -# ---------------------------- 残差块 ---------------------------- -class ResidualBlock(nn.Module): - def __init__(self, dim, dropout_prob=0.0): - super().__init__() - self.linear1 = nn.Linear(dim, dim) - self.ln1 = nn.LayerNorm(dim) - self.linear2 = nn.Linear(dim, dim) - self.ln2 = nn.LayerNorm(dim) - self.relu = nn.ReLU() - self.dropout = nn.Dropout(dropout_prob) - - def forward(self, x): - residual = x - x = self.relu(self.linear1(x)) - x = self.ln1(x) - x = self.linear2(x) - x = self.ln2(x) - x = self.dropout(x) - x = x + residual - return self.relu(x) - - -# ---------------------------- 专家网络 ---------------------------- -class 
Expert(nn.Module): - def __init__( - self, - input_dim, - d_model=1024, - num_resblocks=4, - output_multiplier=2, - dropout_prob=0.0, - ): - """ - input_dim : BERT 输出的 hidden_size(如 312/768) - d_model : 专家内部维度(固定 1024) - output_multiplier : 输出维度 = input_dim * output_multiplier - dropout_prob : 残差块内 Dropout - """ - super().__init__() - self.input_dim = input_dim - self.d_model = d_model - self.output_dim = input_dim * output_multiplier - - # 输入映射:input_dim -> d_model - self.linear_in = nn.Linear(input_dim, d_model) - - # 残差堆叠 - self.res_blocks = nn.ModuleList( - [ResidualBlock(d_model, dropout_prob) for _ in range(num_resblocks)] - ) - - # 输出映射:d_model -> output_dim - self.output = nn.Sequential( - nn.Linear(d_model, d_model), - nn.ReLU(inplace=True), - nn.Linear(d_model, self.output_dim), - ) - - def forward(self, x): - x = self.linear_in(x) - for block in self.res_blocks: - x = block(x) - return self.output(x) - # ---------------------------- 主模型(MoE + 硬路由)------------------------ class MoEModel(nn.Module):