From 94b44e6f711f85b3d9fb20fa8b107263dde48774 Mon Sep 17 00:00:00 2001 From: songsenand Date: Sun, 15 Feb 2026 01:06:52 +0800 Subject: [PATCH] =?UTF-8?q?=E6=B7=BB=E5=8A=A0=E6=8D=9F=E5=A4=B1=E6=9D=83?= =?UTF-8?q?=E9=87=8D=E6=94=AF=E6=8C=81=E5=B9=B6=E9=87=8D=E6=9E=84=E9=83=A8?= =?UTF-8?q?=E5=88=86=E6=A8=A1=E5=9D=97=E7=BB=93=E6=9E=84?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/suinput/dataset.py | 3 +- src/suinput/query.py | 5 ++ src/trainer/model.py | 6 ++ src/trainer/model_with_neck.py | 111 ++------------------------------- 4 files changed, 19 insertions(+), 106 deletions(-) diff --git a/src/suinput/dataset.py b/src/suinput/dataset.py index 3699cfa..b7223d3 100644 --- a/src/suinput/dataset.py +++ b/src/suinput/dataset.py @@ -424,7 +424,8 @@ class PinyinInputDataset(IterableDataset): # Tokenize hint = self.tokenizer( - sampled_context + processed_pinyin, + sampled_context, + processed_pinyin, max_length=self.max_len, padding="max_length", truncation=True, diff --git a/src/suinput/query.py b/src/suinput/query.py index 7f4fba2..a3c139f 100644 --- a/src/suinput/query.py +++ b/src/suinput/query.py @@ -313,6 +313,11 @@ class QueryEngine: return self._char_pinyin_map.get((char, pinyin), 0) + def get_all_weights(self): + """获取所有字符-拼音对出现的次数 - O(n log n)时间复杂度(排序主导)""" + items_sorted = sorted(self._id_to_info.items(), key=lambda x: x[0]) + return [info.count for _, info in items_sorted] + def get_char_info_by_char_pinyin( self, char: str, pinyin: str ) -> Optional[CharInfo]: diff --git a/src/trainer/model.py b/src/trainer/model.py index cbe8fe8..d7d8f32 100644 --- a/src/trainer/model.py +++ b/src/trainer/model.py @@ -465,10 +465,12 @@ class MoEModel(nn.Module): criterion=nn.CrossEntropyLoss(), optimizer=None, num_epochs=1, + stop_batch=1e6, eval_frequency=500, grad_accum_steps=1, clip_grad_norm=1.0, mixed_precision=False, + loss_weight=None, lr=1e-4, lr_schedule=None, # 新增:可选的自定义学习率调度函数 ): @@ -506,6 +508,10 @@ class 
MoEModel(nn.Module): if self.device is None: self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") self.to(self.device) + if loss_weight: + loss_weight = 1 / torch.sqrt(torch.tensor(loss_weight)) + self.loss_weight = loss_weight.to(self.device) + criterion.weight = self.loss_weight # 切换到训练模式 super().train() diff --git a/src/trainer/model_with_neck.py b/src/trainer/model_with_neck.py index 2f36f39..9fd087d 100644 --- a/src/trainer/model_with_neck.py +++ b/src/trainer/model_with_neck.py @@ -13,113 +13,14 @@ from modelscope import AutoModel from tqdm import tqdm from .monitor import TrainingMonitor +from .model import ( + EXPORT_HIDE_DIM, + eval_dataloader, + ResidualBlock, + Expert +) -def eval_dataloader(path: Union[str, Path] = (files(__package__) / "eval_dataset")): - return [pickle.load(file.open("rb")) for file in Path(path).glob("*.pkl")] - - -def round_to_power_of_two(x): - if x < 1: - return 0 - n = x.bit_length() - n = min(max(7, n), 9) - lower = 1 << (n) # 小于等于x的最大2的幂次 - upper = lower << 1 # 大于x的最小2的幂次 - if x - lower < upper - x: - return lower - else: - return upper - - -EXPORT_HIDE_DIM = { - 0: 1024, - 1: 1024, - 2: 1024, - 3: 512, - 4: 512, - 5: 512, - 6: 512, - 7: 512, - 8: 512, - 9: 512, - 10: 512, - 11: 512, - 12: 512, - 13: 512, - 14: 512, - 15: 512, - 16: 512, - 17: 512, - 18: 512, - 19: 256, -} - - -# ---------------------------- 残差块 ---------------------------- -class ResidualBlock(nn.Module): - def __init__(self, dim, dropout_prob=0.0): - super().__init__() - self.linear1 = nn.Linear(dim, dim) - self.ln1 = nn.LayerNorm(dim) - self.linear2 = nn.Linear(dim, dim) - self.ln2 = nn.LayerNorm(dim) - self.relu = nn.ReLU() - self.dropout = nn.Dropout(dropout_prob) - - def forward(self, x): - residual = x - x = self.relu(self.linear1(x)) - x = self.ln1(x) - x = self.linear2(x) - x = self.ln2(x) - x = self.dropout(x) - x = x + residual - return self.relu(x) - - -# ---------------------------- 专家网络 ---------------------------- -class 
Expert(nn.Module): - def __init__( - self, - input_dim, - d_model=1024, - num_resblocks=4, - output_multiplier=2, - dropout_prob=0.0, - ): - """ - input_dim : BERT 输出的 hidden_size(如 312/768) - d_model : 专家内部维度(固定 1024) - output_multiplier : 输出维度 = input_dim * output_multiplier - dropout_prob : 残差块内 Dropout - """ - super().__init__() - self.input_dim = input_dim - self.d_model = d_model - self.output_dim = input_dim * output_multiplier - - # 输入映射:input_dim -> d_model - self.linear_in = nn.Linear(input_dim, d_model) - - # 残差堆叠 - self.res_blocks = nn.ModuleList( - [ResidualBlock(d_model, dropout_prob) for _ in range(num_resblocks)] - ) - - # 输出映射:d_model -> output_dim - self.output = nn.Sequential( - nn.Linear(d_model, d_model), - nn.ReLU(inplace=True), - nn.Linear(d_model, self.output_dim), - ) - - def forward(self, x): - x = self.linear_in(x) - for block in self.res_blocks: - x = block(x) - return self.output(x) - # ---------------------------- 主模型(MoE + 硬路由)------------------------ class MoEModel(nn.Module):