添加损失权重支持并重构部分模块结构

2026-02-15 01:06:52 +08:00 · 2026-02-15 01:06:52 +08:00 · 94b44e6f71
parent 515f261824
commit 94b44e6f71
4 changed files with 19 additions and 106 deletions
--- a/src/suinput/dataset.py
+++ b/src/suinput/dataset.py
@ -424,7 +424,8 @@ class PinyinInputDataset(IterableDataset):
            # Tokenize
            hint = self.tokenizer(
-                sampled_context + processed_pinyin,
+                sampled_context,
                processed_pinyin,
                max_length=self.max_len,
                padding="max_length",
                truncation=True,
--- a/src/suinput/query.py
+++ b/src/suinput/query.py
@ -313,6 +313,11 @@ class QueryEngine:
        return self._char_pinyin_map.get((char, pinyin), 0)
    def get_all_weights(self):
        """Return the ``count`` of every entry in ``_id_to_info``, ordered by key.

        The keys are sorted in ascending order before the counts are
        collected, so the cost is dominated by the sort: O(n log n).

        Returns:
            A list with one ``count`` value per entry, in ascending key order.
        """
        id_table = self._id_to_info
        return [id_table[entry_id].count for entry_id in sorted(id_table)]
    def get_char_info_by_char_pinyin(
        self, char: str, pinyin: str
    ) -> Optional[CharInfo]:
--- a/src/trainer/model.py
+++ b/src/trainer/model.py
@ -465,10 +465,12 @@ class MoEModel(nn.Module):
        criterion=nn.CrossEntropyLoss(),
        optimizer=None,
        num_epochs=1,
        stop_batch=1e6,
        eval_frequency=500,
        grad_accum_steps=1,
        clip_grad_norm=1.0,
        mixed_precision=False,
        loss_weight=None,
        lr=1e-4,
        lr_schedule=None,  # 新增：可选的自定义学习率调度函数
    ):
@ -506,6 +508,10 @@ class MoEModel(nn.Module):
        if self.device is None:
            self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
            self.to(self.device)
        if loss_weight:
            loss_weight = 1 / torch.sqrt(torch.tensor(loss_weight))
            self.loss_weight = loss_weight.to(self.device)
            criterion.weight = self.loss_weight
        # 切换到训练模式
        super().train()
--- a/src/trainer/model_with_neck.py
+++ b/src/trainer/model_with_neck.py
@ -13,112 +13,13 @@ from modelscope import AutoModel
 from tqdm import tqdm
 from .monitor import TrainingMonitor
-
+from .model import (
-
+    EXPORT_HIDE_DIM,
-def eval_dataloader(path: Union[str, Path] = (files(__package__) / "eval_dataset")):
+    eval_dataloader,
-    return [pickle.load(file.open("rb")) for file in Path(path).glob("*.pkl")]
+    ResidualBlock,
-
+    Expert
 def round_to_power_of_two(x):
     """Snap an integer to one of the sizes 128, 256, 512 or 1024.

     The bit length of ``x`` is clamped to the range [7, 9] and used as an
     exponent. That gives a candidate ``2 ** exponent`` and its double, and
     the one closer to ``x`` is returned; a tie goes to the larger value.

     Note that the candidate is ``1 << bit_length``, which is already greater
     than ``x`` whenever the exponent is not clamped. For example 128 maps to
     256, not to 128.

     Args:
         x: An integer (``int.bit_length`` is called on it when ``x >= 1``).

     Returns:
         0 when ``x < 1``; otherwise 128, 256, 512 or 1024.
     """
     if x < 1:
         return 0

     # Clamp the exponent so the candidate stays within 128..512.
     exponent = max(7, x.bit_length())
     exponent = min(exponent, 9)

     candidate = 1 << exponent
     doubled = candidate << 1

     # Strict comparison: an exact midpoint resolves to the larger value.
     return candidate if (x - candidate) < (doubled - x) else doubled
 # Lookup table from the integer keys 0-19 to a hidden-layer width:
 # keys 0-2 -> 1024, keys 3-18 -> 512, key 19 -> 256.
 # NOTE(review): what the keys index is not visible in this file - confirm.
 EXPORT_HIDE_DIM = {
     **dict.fromkeys(range(0, 3), 1024),
     **dict.fromkeys(range(3, 19), 512),
     19: 256,
 }
 # ---------------------------- 残差块 ----------------------------
 class ResidualBlock(nn.Module):
     """Residual block whose layers all map ``dim`` features to ``dim`` features.

     Data flow: Linear -> ReLU -> LayerNorm -> Linear -> LayerNorm -> Dropout,
     after which the block input is added back and a final ReLU is applied.

     Args:
         dim: Feature size used for both Linear layers and both LayerNorms.
         dropout_prob: Probability handed to ``nn.Dropout``.
     """

     def __init__(self, dim, dropout_prob=0.0):
         super().__init__()
         # First stage: projection followed by its normalisation.
         self.linear1 = nn.Linear(dim, dim)
         self.ln1 = nn.LayerNorm(dim)
         # Second stage: projection followed by its normalisation.
         self.linear2 = nn.Linear(dim, dim)
         self.ln2 = nn.LayerNorm(dim)
         # Parameter-free layers shared by the forward pass.
         self.relu = nn.ReLU()
         self.dropout = nn.Dropout(dropout_prob)

     def forward(self, x):
         """Apply both stages to ``x`` and add the skip connection."""
         hidden = self.ln1(self.relu(self.linear1(x)))
         hidden = self.dropout(self.ln2(self.linear2(hidden)))
         # Skip connection first, activation last.
         return self.relu(hidden + x)
 # ---------------------------- 专家网络 ----------------------------
 class Expert(nn.Module):
     """Expert network: input projection, residual stack, two-layer output head."""

     def __init__(
         self,
         input_dim,
         d_model=1024,
         num_resblocks=4,
         output_multiplier=2,
         dropout_prob=0.0,
     ):
         """
         input_dim         : hidden_size of the BERT output (e.g. 312/768)
         d_model           : internal width of the expert (default 1024)
         num_resblocks     : number of stacked residual blocks
         output_multiplier : output dimension = input_dim * output_multiplier
         dropout_prob      : dropout probability inside each residual block
         """
         super().__init__()
         self.input_dim = input_dim
         self.d_model = d_model
         self.output_dim = input_dim * output_multiplier

         # Input projection: input_dim -> d_model
         self.linear_in = nn.Linear(input_dim, d_model)

         # Residual stack, built one block at a time in index order.
         self.res_blocks = nn.ModuleList()
         for _ in range(num_resblocks):
             self.res_blocks.append(ResidualBlock(d_model, dropout_prob))

         # Output head: d_model -> d_model -> output_dim
         head_hidden = nn.Linear(d_model, d_model)
         head_act = nn.ReLU(inplace=True)
         head_out = nn.Linear(d_model, self.output_dim)
         self.output = nn.Sequential(head_hidden, head_act, head_out)

     def forward(self, x):
         """Project ``x``, run it through every residual block, then the head."""
         hidden = self.linear_in(x)
         for res_block in self.res_blocks:
             hidden = res_block(hidden)
         return self.output(hidden)
 # ---------------------------- 主模型（MoE + 硬路由）------------------------