From 5857c90be70128ace26f79a46968ab5c95c90df8 Mon Sep 17 00:00:00 2001 From: songsenand Date: Sun, 22 Feb 2026 12:16:22 +0800 Subject: [PATCH] =?UTF-8?q?=E9=87=8D=E6=9E=84=E4=BB=A3=E7=A0=81=E7=BB=93?= =?UTF-8?q?=E6=9E=84=E5=B9=B6=E4=BC=98=E5=8C=96=E6=B3=A8=E9=87=8A=E6=A0=BC?= =?UTF-8?q?=E5=BC=8F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/trainer/model.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/src/trainer/model.py b/src/trainer/model.py index 1fe3ac6..9d0371f 100644 --- a/src/trainer/model.py +++ b/src/trainer/model.py @@ -10,7 +10,7 @@ import torch.nn.functional as F import torch.optim as optim from loguru import logger from modelscope import AutoModel, AutoTokenizer -from tqdm import tqdm +from tqdm.autonotebook import tqdm from .monitor import TrainingMonitor from suinput.dataset import PG @@ -121,14 +121,13 @@ class MoEModel(nn.Module): ) self.encoder = nn.TransformerEncoder(encoder_layer, num_layers=4) - self.shared_resblocks = nn.ModuleList( - [ResidualBlock(self.hidden_size, 0.1) for _ in range(6)] - ) + # self.shared_resblocks = nn.ModuleList( + # [ResidualBlock(self.hidden_size, 0.1) for _ in range(4)] + # ) self.pooler = nn.AdaptiveAvgPool1d(1) # self.linear = nn.Linear(self.hidden_size, self.hidden_size) - # 3. 专家层:8个领域专家 + 1个共享专家 total_experts = num_domain_experts + num_shared_experts self.experts = nn.ModuleList() @@ -140,12 +139,11 @@ class MoEModel(nn.Module): input_dim=self.hidden_size, d_model=d_model, num_resblocks=num_resblocks, - output_multiplier=self.output_multiplier, # 输出维度 = 2 * hidden_size + output_multiplier=self.output_multiplier, dropout_prob=dropout_prob, ) self.experts.append(expert) - self.expert_bias = nn.Embedding( total_experts, self.output_multiplier * self.hidden_size ) @@ -195,8 +193,8 @@ class MoEModel(nn.Module): ) # [B, S, H] # ----- 3. 池化量 ----- - for block in self.shared_resblocks: - encoded = block(encoded) + # for block in self.shared_resblocks: + # encoded = block(encoded) pooled = self.pooler(encoded.transpose(1, 2)).squeeze(-1) # pooled = self.pooler(encoded.transpose(1, 2)) # [B, H, 2] # pooled = pooled.flatten(1) # [B, H*2] @@ -321,11 +319,11 @@ class MoEModel(nn.Module): return_tensors="pt", ) sample = {} - sample['hint'] = { + sample["hint"] = { "input_ids": hint["input_ids"], "attention_mask": hint["attention_mask"], } - sample['pg'] = torch.tensor([PG[py[0]]]) + sample["pg"] = torch.tensor([PG[py[0]]]) return sample def predict(self, text, py, tokenizer=None): @@ -500,7 +498,7 @@ class MoEModel(nn.Module): f"step: {global_step}, loss: {avg_loss:.4f}, acc: {acc:.4f}, eval_loss: {eval_loss:.4f}" ) batch_loss_sum = 0.0 - if processed_batches >= stop_batch: + if processed_batches + 1 >= stop_batch: break global_step += 1 @@ -534,3 +532,5 @@ class MoEModel(nn.Module): for name, param in self.named_parameters(): if name in freeze_layers: param.requires_grad = False + +