添加损失权重支持并重构部分模块结构

This commit is contained in:
songsenand 2026-02-15 01:06:52 +08:00
parent 515f261824
commit 94b44e6f71
4 changed files with 19 additions and 106 deletions

View File

@@ -424,7 +424,8 @@ class PinyinInputDataset(IterableDataset):
# Tokenize
hint = self.tokenizer(
sampled_context + processed_pinyin,
sampled_context,
processed_pinyin,
max_length=self.max_len,
padding="max_length",
truncation=True,

View File

@@ -313,6 +313,11 @@ class QueryEngine:
return self._char_pinyin_map.get((char, pinyin), 0)
def get_all_weights(self):
    """Return the occurrence count of every char-pinyin pair, ordered by id.

    Iterates ``self._id_to_info`` in ascending key order (O(n log n) for the
    sort) and collects each entry's ``count`` attribute.
    """
    counts = []
    for pair_id in sorted(self._id_to_info):
        counts.append(self._id_to_info[pair_id].count)
    return counts
def get_char_info_by_char_pinyin(
self, char: str, pinyin: str
) -> Optional[CharInfo]:

View File

@@ -465,10 +465,12 @@ class MoEModel(nn.Module):
criterion=nn.CrossEntropyLoss(),
optimizer=None,
num_epochs=1,
stop_batch=1e6,
eval_frequency=500,
grad_accum_steps=1,
clip_grad_norm=1.0,
mixed_precision=False,
loss_weight=None,
lr=1e-4,
lr_schedule=None, # 新增:可选的自定义学习率调度函数
):
@@ -506,6 +508,10 @@
if self.device is None:
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
self.to(self.device)
if loss_weight:
loss_weight = 1 / torch.sqrt(torch.tensor(loss_weight))
self.loss_weight = loss_weight.to(self.device)
criterion.weight = self.loss_weight
# 切换到训练模式
super().train()

View File

@@ -13,112 +13,13 @@ from modelscope import AutoModel
from tqdm import tqdm
from .monitor import TrainingMonitor
def eval_dataloader(path: Union[str, Path] = (files(__package__) / "eval_dataset")):
    """Load every pickled evaluation batch stored under *path*.

    Reads each ``*.pkl`` file in *path* (default: the package-bundled
    ``eval_dataset`` directory) and returns the unpickled objects as a list.
    Glob order is filesystem-dependent, so callers must not rely on ordering.

    NOTE(review): ``pickle.load`` is only safe because these files ship with
    the package; never point *path* at untrusted data.
    """
    batches = []
    for pkl_path in Path(path).glob("*.pkl"):
        # Use a context manager so every file handle is closed; the original
        # comprehension leaked one open handle per file.
        with pkl_path.open("rb") as fh:
            batches.append(pickle.load(fh))
    return batches
def round_to_power_of_two(x):
    """Map an integer *x* to a power of two clamped to [128, 1024].

    Returns 0 when ``x < 1``. Otherwise the exponent ``x.bit_length()`` is
    clamped into [7, 9]; the result is ``2**exp`` unless *x* lies closer to
    ``2**(exp + 1)`` (possible only at the upper clamp, i.e. x >= 768), in
    which case that larger bound is returned. Ties go to the upper bound.
    """
    if x < 1:
        return 0
    exp = min(max(x.bit_length(), 7), 9)
    low = 1 << exp
    high = 1 << (exp + 1)
    # Strict comparison: a tie (x equidistant) selects the upper bound,
    # matching the original implementation exactly.
    return low if (x - low) < (high - x) else high
# Exported hidden width per layer index: layers 0-2 use 1024, the final
# layer (19) shrinks to 256, and everything in between uses 512.
EXPORT_HIDE_DIM = {
    layer: (1024 if layer <= 2 else 256 if layer == 19 else 512)
    for layer in range(20)
}
# ---------------------------- 残差块 ----------------------------
class ResidualBlock(nn.Module):
    """MLP residual block: Linear -> ReLU -> LN -> Linear -> LN -> Dropout,
    added back to the input and passed through a final ReLU."""

    def __init__(self, dim, dropout_prob=0.0):
        super().__init__()
        # Attribute names are kept identical so existing checkpoints
        # (state_dict keys) continue to load.
        self.linear1 = nn.Linear(dim, dim)
        self.ln1 = nn.LayerNorm(dim)
        self.linear2 = nn.Linear(dim, dim)
        self.ln2 = nn.LayerNorm(dim)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout_prob)

    def forward(self, x):
        shortcut = x
        out = self.ln1(self.relu(self.linear1(x)))
        out = self.ln2(self.linear2(out))
        out = self.dropout(out)
        return self.relu(out + shortcut)
# ---------------------------- 专家网络 ----------------------------
class Expert(nn.Module):
    """One expert of the MoE model: input projection, a stack of residual
    blocks, and an output MLP that widens the representation."""

    def __init__(
        self,
        input_dim,
        d_model=1024,
        num_resblocks=4,
        output_multiplier=2,
        dropout_prob=0.0,
    ):
        """
        input_dim : hidden_size of the BERT output (e.g. 312/768)
        d_model : internal expert width (default 1024)
        num_resblocks : number of stacked ResidualBlocks
        output_multiplier : output dim = input_dim * output_multiplier
        dropout_prob : dropout probability inside each residual block
        """
        super().__init__()
        self.input_dim = input_dim
        self.d_model = d_model
        self.output_dim = input_dim * output_multiplier
        # Input projection: input_dim -> d_model
        self.linear_in = nn.Linear(input_dim, d_model)
        # Residual stack.
        # NOTE(review): in the extracted diff view a stray
        # `from .model import (...)` block from another changed file was
        # spliced into this list and its closing bracket was lost; the
        # definition below restores the original construction.
        self.res_blocks = nn.ModuleList(
            [ResidualBlock(d_model, dropout_prob) for _ in range(num_resblocks)]
        )
        # Output projection: d_model -> output_dim
        self.output = nn.Sequential(
            nn.Linear(d_model, d_model),
            nn.ReLU(inplace=True),
            nn.Linear(d_model, self.output_dim),
        )

    def forward(self, x):
        x = self.linear_in(x)
        for block in self.res_blocks:
            x = block(x)
        return self.output(x)
# ---------------------------- 主模型MoE + 硬路由)------------------------