feat: 添加模型扩容两阶段训练功能,支持冻结层训练与全量微调切换

This commit is contained in:
songsenand 2026-04-07 14:46:50 +08:00
parent d14fd09f41
commit c9a96651cd
4 changed files with 2632 additions and 1738 deletions

109
README.md
View File

@ -790,7 +790,114 @@ train-model evaluate \
- 在评估数据集上计算准确率、困惑度等指标 - 在评估数据集上计算准确率、困惑度等指标
- 生成详细的性能报告 - 生成详细的性能报告
### 6.8 导出模型(开发中) ### 6.8 模型扩容两阶段训练
当需要增加模型容量(如增加专家数量、修改层结构等)时,可以使用 `expand-and-train` 命令进行两阶段训练:先冻结匹配层训练新增参数,然后全量微调。
#### 训练策略
1. **冻结阶段**:只训练形状不匹配的新增参数(如新增的专家、扩容的层等)
2. **全量微调阶段**:当验证损失连续 `--frozen-patience` 次不下降时,自动解冻所有层进行全量训练
#### 基础用法
```bash
train-model expand-and-train \
--train-data-path "path/to/train/dataset" \
--eval-data-path "path/to/eval/dataset" \
--base-model-path "./pretrained/model.pt" \
--new-model-spec "model:InputMethodEngine" \
--num-experts 40 \
--frozen-lr 2e-3 \
--full-lr 5e-5 \
--frozen-patience 8
```
#### 完整参数示例
```bash
train-model expand-and-train \
--train-data-path "path/to/train/dataset" \
--eval-data-path "path/to/eval/dataset" \
--output-dir "./expansion_output" \
--base-model-path "./pretrained/model.pt" \
--new-model-spec "custom_model:ExpandedModel" \
--vocab-size 10019 \
--dim 512 \
--num-experts 40 \
--frozen-patience 10 \
--frozen-lr 1e-3 \
--full-lr 1e-4 \
--frozen-scheduler cosine \
--full-scheduler cosine \
--batch-size 128 \
--num-epochs 20 \
--compile
```
#### 参数详解
**模型扩容参数**
- `--base-model-path`: 预训练基础模型检查点路径(必需)
- `--new-model-spec`: 新模型规格,格式:`模块名:类名`,如 `model:InputMethodEngine`(必需)
- 支持任意路径的模块导入,模块文件需包含自定义的模型类
- 自定义模型类必须是 `InputMethodEngine` 的子类
- 示例:`my_model:MyExpandedModel` 对应 `my_model.py` 中的 `MyExpandedModel`
**两阶段训练参数**
- `--frozen-patience`: 冻结阶段验证损失连续不下降的评估次数,触发切换到全量微调(默认:10)
- `--frozen-lr`: 冻结阶段学习率(默认:1e-3)
- `--full-lr`: 全量微调阶段学习率(默认:1e-4)
- `--frozen-scheduler`: 冻结阶段学习率调度器,可选 `cosine` 或 `plateau`(默认:`cosine`)
- `--full-scheduler`: 全量微调阶段学习率调度器,可选 `cosine` 或 `plateau`(默认:`cosine`)
**其他参数**
- 支持所有 `train` 子命令的通用参数(数据参数、模型参数、训练参数等)
- 继承现有的训练基础设施混合精度训练、TensorBoard日志、checkpoint保存等
#### 使用场景
1. **增加专家数量**20→40
- 冻结效果:~70% 参数可冻结(已有专家权重、注意力层等)
- 新增参数新专家网络、gate层
2. **增加top_k值**2→3
- 冻结效果100% 参数可冻结(仅逻辑变化)
- 新增参数:无
3. **修改专家内部结构**如增加resblocks
- 冻结效果:~50% 参数可冻结linear_in/output可冻结
- 新增参数新增的resblocks层
4. **增加Transformer层数**4→5
- 冻结效果:~80% 参数可冻结前4层可冻结
- 新增参数新增的第5层
#### 自定义模型类示例
```python
# my_model.py
from model.model import InputMethodEngine
class MyExpandedModel(InputMethodEngine):
def __init__(self, num_experts=40, **kwargs):
# 调用父类构造函数覆盖num_experts参数
super().__init__(num_experts=num_experts, **kwargs)
# 可以在这里添加额外的层或修改现有层
# 使用命令
# train-model expand-and-train --new-model-spec "my_model:MyExpandedModel" ...
```
#### 注意事项
1. **模型类要求**:自定义模型类必须是 `InputMethodEngine` 的子类
2. **冻结条件**:只有权重形状完全匹配的层才会被冻结
3. **性能保持**MoE层保持"计算所有专家+Top-K选择"方案,确保 `torch.compile` 下的最佳性能
4. **阶段切换**基于评估频率而非epoch建议适当调高 `--eval-frequency`
5. **模块导入**支持任意路径的模块通过Python标准导入机制加载
### 6.9 导出模型(开发中)
当前导出功能尚在开发中: 当前导出功能尚在开发中:

8
big_expert.py Normal file
View File

@ -0,0 +1,8 @@
from model.model import InputMethodEngine
class BigExpert(InputMethodEngine):
    """Expanded model: 40 MoE experts with top-3 routing.

    Intended for use with ``train-model expand-and-train
    --new-model-spec "big_expert:BigExpert"``.
    """

    def __init__(self, *args, **kw):
        # Bug fix: the original body did
        #     self.moe = MoELayer(dim=dim, num_experts=40, top_k=3)
        # where both `MoELayer` and `dim` are undefined names, so construction
        # always raised NameError. Per the documented expand-and-train pattern,
        # the expansion is expressed by overriding constructor arguments of the
        # parent instead of rebuilding the MoE layer by hand.
        # NOTE(review): assumes InputMethodEngine accepts `num_experts` and
        # `top_k` keyword arguments — confirm against the base class signature.
        kw.setdefault("num_experts", 40)
        kw.setdefault("top_k", 3)
        super().__init__(*args, **kw)

View File

@ -45,10 +45,12 @@ class Trainer:
- CrossEntropyLoss损失函数支持weight和label_smoothing - CrossEntropyLoss损失函数支持weight和label_smoothing
- Rich终端美化输出 - Rich终端美化输出
""" """
training_status_data: List[Dict[str, Any]]
def __init__( def __init__(
self, self,
model: InputMethodEngine, model: nn.Module,
train_dataloader: DataLoader, train_dataloader: DataLoader,
eval_dataloader: DataLoader, eval_dataloader: DataLoader,
total_steps: int, total_steps: int,
@ -461,10 +463,10 @@ class Trainer:
if existing_indices: if existing_indices:
# 替换现有记录 # 替换现有记录
for idx in existing_indices: for idx in existing_indices:
self.training_status_data[idx] = status_record self.training_status_data[idx] = status_record # type: ignore
else: else:
# 添加到内存缓存 # 添加到内存缓存
self.training_status_data.append(status_record) self.training_status_data.append(status_record) # type: ignore
# 限制内存中的数据量只保留最近1000条记录 # 限制内存中的数据量只保留最近1000条记录
if len(self.training_status_data) > 1000: if len(self.training_status_data) > 1000:
@ -686,6 +688,444 @@ class Trainer:
self.writer.close() self.writer.close()
def _import_model_module(module_name: str):
    """Import *module_name* normally, falling back to loading it from a .py file.

    Supports both importable modules (on sys.path) and plain file paths:
    a dotted name is mapped to a path (``pkg.mod`` -> ``pkg/mod.py``) and a
    bare name to ``<name>.py`` in the current working directory.

    Raises:
        ImportError: If neither the import system nor a file on disk yields
            the module.
    """
    import importlib
    import importlib.util
    import os

    try:
        return importlib.import_module(module_name)
    except ImportError as import_err:
        # Fall back to loading straight from a file. The original code had two
        # near-duplicate fallback branches and swallowed exec errors from the
        # user's module; here there is a single path and a syntax error in the
        # user file propagates unmasked.
        candidates = [module_name.replace(".", os.sep) + ".py", module_name + ".py"]
        for path in candidates:
            if os.path.exists(path):
                spec = importlib.util.spec_from_file_location(module_name, path)
                if spec is None or spec.loader is None:
                    raise ImportError(f"Cannot load module from file: {path}")
                module = importlib.util.module_from_spec(spec)
                spec.loader.exec_module(module)
                return module
        raise ImportError(
            f"Failed to import module '{module_name}': {import_err}"
        ) from import_err


def load_expanded_model(
    base_model_path: str,
    new_model_spec: str,
    device: torch.device,
    **model_kwargs,
) -> nn.Module:
    """
    Load a pretrained base model into an expanded architecture, freezing matched layers.

    Every state-dict entry whose name AND shape match between the pretrained
    checkpoint and the new model is copied over and its parameter frozen
    (requires_grad=False); everything else (the newly added capacity) keeps its
    fresh initialization and stays trainable.

    Args:
        base_model_path: Path to the pretrained base-model checkpoint.
        new_model_spec: New model spec in the form "module:ClassName",
            e.g. "new_model:NewModel". Arbitrary file paths are supported.
        device: Device the model and copied weights are moved to.
        **model_kwargs: Forwarded to the new model class constructor.

    Returns:
        The expanded model with all matching layers frozen.

    Raises:
        ValueError: If ``new_model_spec`` is not "module:ClassName".
        ImportError: If the module cannot be imported or loaded from file.
        TypeError: If the resolved class is not an InputMethodEngine subclass.
    """
    if ":" not in new_model_spec:
        raise ValueError(
            f"Invalid model spec format: {new_model_spec}. Expected format: 'module:ClassName'"
        )
    module_name, class_name = new_model_spec.split(":", 1)

    module = _import_model_module(module_name)
    model_class = getattr(module, class_name)

    # Custom model classes must subclass the engine so the Trainer contract holds.
    from .model import InputMethodEngine

    if not issubclass(model_class, InputMethodEngine):
        raise TypeError(
            f"Model class {class_name} must be a subclass of InputMethodEngine. "
            f"Got {model_class.__name__} instead."
        )

    new_model = model_class(**model_kwargs)
    new_model.to(device)

    # Checkpoints may be either a wrapper dict or a bare state dict.
    checkpoint = torch.load(base_model_path, map_location=device)
    pretrained_state_dict = (
        checkpoint["model_state_dict"] if "model_state_dict" in checkpoint else checkpoint
    )

    new_state_dict = new_model.state_dict()

    # Copy every entry whose shape matches and remember its name for freezing.
    frozen_layers = []
    for key, tensor in new_state_dict.items():
        pretrained = pretrained_state_dict.get(key)
        if pretrained is not None and pretrained.shape == tensor.shape:
            new_state_dict[key] = pretrained.to(device)
            frozen_layers.append(key)

    new_model.load_state_dict(new_state_dict)

    # Freeze the matched parameters (buffers in frozen_layers have no grad flag
    # and are simply skipped by named_parameters()).
    frozen = set(frozen_layers)
    for name, param in new_model.named_parameters():
        if name in frozen:
            param.requires_grad = False

    logger.info(f"Loaded expanded model with {len(frozen_layers)} frozen layers")
    logger.info(f"Frozen layers: {frozen_layers[:10]}{'...' if len(frozen_layers) > 10 else ''}")
    return new_model
class TwoStageTrainer(Trainer):
    """
    Two-stage trainer: train with matched layers frozen first, then fine-tune everything.

    Stage "frozen": only the parameters left trainable by ``load_expanded_model``
    (the newly added capacity) are updated, at ``frozen_lr``.
    Stage "full": entered automatically once the eval loss has failed to improve
    for ``frozen_patience`` consecutive evaluations; all parameters are unfrozen,
    the optimizer is rebuilt, and training continues at ``full_lr``.
    """

    def __init__(
        self,
        model: nn.Module,
        train_dataloader: DataLoader,
        eval_dataloader: DataLoader,
        total_steps: int,
        output_dir: str = "./output",
        num_epochs: int = 10,
        learning_rate: float = 1e-4,
        min_learning_rate: float = 1e-6,
        weight_decay: float = 0.1,
        warmup_ratio: float = 0.1,
        label_smoothing: float = 0.15,
        loss_weight: Optional[torch.Tensor] = None,
        grad_accum_steps: int = 1,
        clip_grad_norm: float = 1.0,
        eval_frequency: int = 500,
        save_frequency: int = 10000,
        mixed_precision: bool = True,
        device: Optional[torch.device] = None,
        status_file: str = "training_status.json",
        use_tensorboard: bool = True,
        # Two-stage-specific parameters.
        frozen_patience: int = 10,
        frozen_lr: Optional[float] = None,
        full_lr: Optional[float] = None,
        frozen_scheduler: str = "cosine",
        full_scheduler: str = "cosine",
    ):
        """
        Initialize the two-stage trainer.

        Args:
            frozen_patience: Number of consecutive EVALUATIONS (not epochs) with
                no eval-loss improvement before switching to full fine-tuning.
            frozen_lr: Learning rate for the frozen stage (None -> learning_rate).
            full_lr: Learning rate for the full stage (None -> learning_rate).
            frozen_scheduler: Frozen-stage LR scheduler: "cosine" or "plateau".
            full_scheduler: Full-stage LR scheduler: "cosine" or "plateau".
        """
        super().__init__(
            model=model,
            train_dataloader=train_dataloader,
            eval_dataloader=eval_dataloader,
            total_steps=total_steps,
            output_dir=output_dir,
            num_epochs=num_epochs,
            learning_rate=learning_rate,
            min_learning_rate=min_learning_rate,
            weight_decay=weight_decay,
            warmup_ratio=warmup_ratio,
            label_smoothing=label_smoothing,
            loss_weight=loss_weight,
            grad_accum_steps=grad_accum_steps,
            clip_grad_norm=clip_grad_norm,
            eval_frequency=eval_frequency,
            save_frequency=save_frequency,
            mixed_precision=mixed_precision,
            device=device,
            status_file=status_file,
            use_tensorboard=use_tensorboard,
        )
        # Two-stage parameters; stage LRs fall back to the base learning rate.
        self.frozen_patience = frozen_patience
        self.frozen_lr = frozen_lr if frozen_lr is not None else learning_rate
        self.full_lr = full_lr if full_lr is not None else learning_rate
        self.frozen_scheduler = frozen_scheduler
        self.full_scheduler = full_scheduler
        # Stage-tracking state.
        self.current_stage = "frozen"  # "frozen" or "full"
        self.frozen_best_loss = float("inf")
        self.frozen_patience_counter = 0
        logger.info(f"TwoStageTrainer initialized with frozen_patience={frozen_patience}")
        logger.info(
            f"Stage: {self.current_stage}, Frozen LR: {self.frozen_lr:.2e}, Full LR: {self.full_lr:.2e}"
        )
        # Replace the parent's LR scheduler with the frozen-stage scheduler.
        self.lr_scheduler = self._create_stage_lr_scheduler("frozen")

    def _create_stage_lr_scheduler(self, stage: str) -> Callable[[int], float]:
        """Build the step -> learning-rate function for the given stage."""
        if stage == "frozen":
            base_lr = self.frozen_lr
            scheduler_type = self.frozen_scheduler
        else:
            base_lr = self.full_lr
            scheduler_type = self.full_scheduler
        # Bug fix: validate eagerly instead of raising ValueError from inside
        # the closure on the first post-warmup training step.
        if scheduler_type not in ("cosine", "plateau"):
            raise ValueError(f"Unknown scheduler type: {scheduler_type}")
        # Capture plain locals so the closure does not reference self.
        warmup_steps = self.warmup_steps
        total_steps = self.total_steps
        min_learning_rate = self.min_learning_rate
        # Bug fix: guard against total_steps == warmup_steps (ZeroDivisionError).
        decay_steps = max(1, total_steps - warmup_steps)

        def lr_scheduler(step: int) -> float:
            if step < warmup_steps:
                # Linear warmup.
                return base_lr * (step / warmup_steps)
            if scheduler_type == "cosine":
                # Cosine annealing from base_lr down to min_learning_rate.
                progress = (step - warmup_steps) / decay_steps
                cosine_decay = 0.5 * (1 + math.cos(math.pi * progress))
                return min_learning_rate + (base_lr - min_learning_rate) * cosine_decay
            # "plateau": constant LR (plateau reduction would need support in
            # the training loop and is not implemented there).
            return base_lr

        return lr_scheduler

    def _switch_to_full_stage(self):
        """Unfreeze everything and enter the full fine-tuning stage (idempotent)."""
        if self.current_stage == "full":
            return
        logger.info("Switching to full fine-tuning stage")
        self.current_stage = "full"
        # Unfreeze all parameters.
        for param in self.model.parameters():
            param.requires_grad = True
        # Switch to the full-stage LR schedule.
        self.learning_rate = self.full_lr
        self.lr_scheduler = self._create_stage_lr_scheduler("full")
        # Rebuild the optimizer so the newly unfrozen parameters are included
        # (frozen-stage optimizer state is intentionally discarded).
        self.optimizer = optim.AdamW(
            self.model.parameters(),
            lr=self.full_lr,
            weight_decay=self.weight_decay,
            betas=(0.9, 0.999),
            eps=1e-8,
        )
        # Reset frozen-stage bookkeeping.
        self.frozen_best_loss = float("inf")
        self.frozen_patience_counter = 0
        logger.info(f"All layers unfrozen, using full LR: {self.full_lr:.2e}")

    def _update_stage_after_eval(self, eval_loss: float):
        """Update patience bookkeeping after an evaluation; switch stage if exhausted."""
        if self.current_stage != "frozen":
            return
        if eval_loss < self.frozen_best_loss:
            self.frozen_best_loss = eval_loss
            self.frozen_patience_counter = 0
            logger.info(f"Frozen stage new best loss: {eval_loss:.4f}")
        else:
            self.frozen_patience_counter += 1
            logger.info(
                f"Frozen stage patience counter: {self.frozen_patience_counter}/{self.frozen_patience}"
            )
            # Patience exhausted -> unfreeze and fine-tune everything.
            if self.frozen_patience_counter >= self.frozen_patience:
                self._switch_to_full_stage()

    def train(
        self, resume_from: Optional[str] = None, reset_training_state: bool = False
    ):
        """
        Two-stage training loop.

        Args:
            resume_from: Optional checkpoint path to resume training from.
            reset_training_state: If True, load only model weights and restart
                the training state from scratch.
        """
        if resume_from is not None:
            self.load_checkpoint(resume_from, reset_training_state=reset_training_state)
        self._print_training_info()
        # Running state for the loop below.
        global_step = self.current_step
        accumulated_loss = 0.0
        accumulated_accuracy = 0.0
        accumulation_counter = 0
        with self._create_progress_bar() as progress:
            epoch_task = progress.add_task(
                f"[cyan]Epoch {self.current_epoch + 1}/{self.num_epochs} (Stage: {self.current_stage})",
                total=self.total_steps,
            )
            for epoch in range(self.current_epoch, self.num_epochs):
                self.current_epoch = epoch
                for batch_idx, batch in enumerate(self.train_dataloader):
                    # Per-step LR from the active stage's scheduler.
                    current_lr = self._update_learning_rate()
                    loss, metrics = self.train_step(batch)
                    # Accumulate windowed training metrics.
                    accumulated_loss += loss
                    accumulated_accuracy += metrics.get("accuracy", 0.0)
                    accumulation_counter += 1
                    # Gradient accumulation: step the optimizer every grad_accum_steps.
                    if (global_step + 1) % self.grad_accum_steps == 0:
                        self.scaler.unscale_(self.optimizer)
                        torch.nn.utils.clip_grad_norm_(
                            self.model.parameters(), self.clip_grad_norm
                        )
                        self.scaler.step(self.optimizer)
                        self.scaler.update()
                        self.optimizer.zero_grad()
                    progress.update(
                        epoch_task,
                        advance=1,
                        description=f"[cyan]Epoch {epoch + 1}/{self.num_epochs} (Stage: {self.current_stage}) | "
                        f"Step {global_step}/{self.total_steps} | "
                        f"Loss: {loss:.4f} | "
                        f"LR: {current_lr:.2e}",
                    )
                    # Periodic evaluation and logging.
                    if (global_step + 1) % self.eval_frequency == 0:
                        avg_loss = accumulated_loss / accumulation_counter
                        avg_accuracy = accumulated_accuracy / accumulation_counter
                        eval_metrics = self.evaluate()
                        log_metrics = {
                            "train/loss": avg_loss,
                            "train/accuracy": avg_accuracy,
                            "train/learning_rate": current_lr,
                            "train/stage": 0.0 if self.current_stage == "frozen" else 1.0,
                        }
                        if eval_metrics:
                            log_metrics.update(
                                {
                                    "eval/loss": eval_metrics["eval_loss"],
                                    "eval/accuracy": eval_metrics["eval_accuracy"],
                                }
                            )
                            # Bug fix: the best-model update and the stage switch
                            # previously indexed eval_metrics OUTSIDE this guard,
                            # raising KeyError whenever evaluate() returned an
                            # empty result.
                            if eval_metrics["eval_loss"] < self.best_eval_loss:
                                self.best_eval_loss = eval_metrics["eval_loss"]
                                # Only keep best_model.pt; no extra checkpoint files.
                                self.save_checkpoint("best_model.pt", is_best=True)
                            # Possibly switch frozen -> full based on eval loss.
                            self._update_stage_after_eval(eval_metrics["eval_loss"])
                        self._log_to_tensorboard(log_metrics, global_step)
                        log_text = (
                            f"[Epoch {epoch + 1}/{self.num_epochs}] "
                            f"[Stage: {self.current_stage}] "
                            f"[Step {global_step}/{self.total_steps}] "
                            f"Train Loss: {avg_loss:.4f} | "
                            f"Train Acc: {avg_accuracy:.4f} | "
                            f"LR: {current_lr:.2e}"
                        )
                        if eval_metrics:
                            log_text += (
                                f" | Eval Loss: {eval_metrics['eval_loss']:.4f} | "
                                f"Eval Acc: {eval_metrics['eval_accuracy']:.4f}"
                            )
                        progress.console.log(log_text)
                        # Reset the running window.
                        accumulated_loss = 0.0
                        accumulated_accuracy = 0.0
                        accumulation_counter = 0
                    # Periodic checkpoint (overwrites the previous periodic one).
                    if (global_step + 1) % self.save_frequency == 0:
                        self.save_checkpoint("latest_checkpoint.pt", is_periodic=True)
                    global_step += 1
                    self.current_step = global_step
                    if global_step >= self.total_steps:
                        progress.update(epoch_task, completed=self.total_steps)
                        break
                progress.reset(epoch_task)
                # Checkpoint at each epoch boundary.
                self.save_checkpoint(f"epoch_{epoch + 1}.pt")
                if global_step >= self.total_steps:
                    break
        logger.info("Two-stage training completed!")
        self.save_checkpoint("final_model.pt")
        if self.writer is not None:
            self.writer.close()
def worker_init_fn(worker_id: int) -> None: def worker_init_fn(worker_id: int) -> None:
""" """
初始化每个DataLoader worker的随机种子确保可复现性 初始化每个DataLoader worker的随机种子确保可复现性
@ -1082,5 +1522,326 @@ def export(
console.print("[yellow]导出功能待实现[/yellow]") console.print("[yellow]导出功能待实现[/yellow]")
@app.command()
def expand_and_train(
    # Data parameters.
    train_data_path: str = typer.Option(
        ..., "--train-data-path", "-t", help="训练数据集路径"
    ),
    eval_data_path: str = typer.Option(
        ..., "--eval-data-path", "-e", help="评估数据集路径"
    ),
    output_dir: str = typer.Option("./output", "--output-dir", "-o", help="输出目录"),
    # Model parameters.
    base_model_path: str = typer.Option(
        ..., "--base-model-path", help="预训练基础模型检查点路径"
    ),
    new_model_spec: str = typer.Option(
        ..., "--new-model-spec", "-m", help="新模型规格,格式:模块名:类名,如 'model:InputMethodEngine'。支持任意路径,自定义模型类必须是 InputMethodEngine 的子类"
    ),
    vocab_size: int = typer.Option(10019, "--vocab-size", help="词汇表大小"),
    pinyin_vocab_size: int = typer.Option(
        30, "--pinyin-vocab-size", help="拼音词汇表大小"
    ),
    # NOTE(review): "--max_iter_length" uses underscores while every other
    # option is kebab-case; renaming would break existing invocations, so it
    # is left as-is.
    max_iter_length: int = typer.Option(
        1024 * 1024 * 128, "--max_iter_length", help="数据集大小"
    ),
    dim: int = typer.Option(512, "--dim", help="模型维度"),
    num_slots: int = typer.Option(8, "--num-slots", help="历史槽位数量"),
    n_layers: int = typer.Option(4, "--n-layers", help="Transformer层数"),
    n_heads: int = typer.Option(4, "--n-heads", help="注意力头数"),
    num_experts: int = typer.Option(20, "--num-experts", help="MoE专家数量"),
    max_seq_len: int = typer.Option(128, "--max-seq-len", help="最大序列长度"),
    use_pinyin: bool = typer.Option(False, "--use-pinyin", help="是否使用拼音特征"),
    # Two-stage training parameters.
    # NOTE(review): the help text says "epoch数" but switching is actually
    # driven by the number of evaluations (see TwoStageTrainer and the README).
    frozen_patience: int = typer.Option(
        10, "--frozen-patience", help="冻结阶段验证损失连续不下降的epoch数触发切换到全量微调"
    ),
    frozen_lr: float = typer.Option(
        1e-3, "--frozen-lr", help="冻结阶段学习率"
    ),
    full_lr: float = typer.Option(
        1e-4, "--full-lr", help="全量微调阶段学习率"
    ),
    frozen_scheduler: str = typer.Option(
        "cosine", "--frozen-scheduler", help="冻结阶段学习率调度器类型cosine或plateau"
    ),
    full_scheduler: str = typer.Option(
        "cosine", "--full-scheduler", help="全量微调阶段学习率调度器类型cosine或plateau"
    ),
    # Generic training parameters.
    batch_size: int = typer.Option(128, "--batch-size", "-b", help="批次大小"),
    num_epochs: int = typer.Option(10, "--num-epochs", help="训练轮数"),
    min_learning_rate: float = typer.Option(
        1e-9, "--min-learning-rate", help="最小学习率"
    ),
    weight_decay: float = typer.Option(0.1, "--weight-decay", help="权重衰减"),
    warmup_ratio: float = typer.Option(0.1, "--warmup-ratio", help="热身步数比例"),
    label_smoothing: float = typer.Option(
        0.15, "--label-smoothing", help="标签平滑参数"
    ),
    grad_accum_steps: int = typer.Option(1, "--grad-accum-steps", help="梯度累积步数"),
    clip_grad_norm: float = typer.Option(1.0, "--clip-grad-norm", help="梯度裁剪范数"),
    eval_frequency: int = typer.Option(500, "--eval-frequency", help="评估频率"),
    save_frequency: int = typer.Option(1000, "--save-frequency", help="保存频率"),
    # Misc parameters.
    mixed_precision: bool = typer.Option(
        True, "--mixed-precision/--no-mixed-precision", help="是否使用混合精度训练"
    ),
    num_workers: int = typer.Option(
        2, "--num-workers", help="数据加载worker数量流式数据集建议为2"
    ),
    use_tensorboard: bool = typer.Option(
        True, "--tensorboard/--no-tensorboard", help="是否使用TensorBoard"
    ),
    resume_from: Optional[str] = typer.Option(
        None, "--resume-from", help="从检查点恢复训练"
    ),
    reset_training_state: bool = typer.Option(
        False, "--reset-training-state", help="重置训练状态,只加载模型权重从头开始训练"
    ),
    seed: int = typer.Option(42, "--seed", help="随机种子"),
    # NOTE(review): parameter name shadows the builtin `compile`; kept because
    # it defines the CLI flag name.
    compile: bool = typer.Option(
        False,
        "--compile/--no-compile",
        help="是否开启 torch.compile 优化(需 PyTorch 2.0+",
    ),
):
    """
    模型扩容两阶段训练先冻结匹配层训练然后全量微调
    """
    # Use file_system sharing to avoid "too many open files" with many workers.
    torch.multiprocessing.set_sharing_strategy("file_system")
    # Enable TensorFloat32 matmul (silences the UserWarning and speeds up matmul).
    if torch.cuda.is_available():
        torch.set_float32_matmul_precision("high")
    # Seed everything for reproducibility.
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
    console = Console()
    # Print the run configuration as a Rich table.
    console.print(
        Panel.fit("[bold cyan]模型扩容两阶段训练配置[/bold cyan]", border_style="cyan")
    )
    config_table = Table(show_header=True, header_style="bold magenta")
    config_table.add_column("Category", style="cyan")
    config_table.add_column("Parameter", style="green")
    config_table.add_column("Value", style="yellow")
    # Configuration rows, grouped by category.
    config_table.add_row("数据", "训练数据路径", train_data_path)
    config_table.add_row("数据", "评估数据路径", eval_data_path)
    config_table.add_row("数据", "输出目录", output_dir)
    config_table.add_row("数据", "批次大小", str(batch_size))
    config_table.add_row("数据", "Worker数量", str(num_workers))
    config_table.add_row("模型", "基础模型路径", base_model_path)
    config_table.add_row("模型", "新模型规格", new_model_spec)
    config_table.add_row("模型", "词汇表大小", str(vocab_size))
    config_table.add_row("模型", "拼音词汇表", str(pinyin_vocab_size))
    config_table.add_row("模型", "模型维度", str(dim))
    config_table.add_row("模型", "槽位数量", str(num_slots))
    config_table.add_row("模型", "Transformer层数", str(n_layers))
    config_table.add_row("模型", "注意力头数", str(n_heads))
    config_table.add_row("模型", "MoE专家数", str(num_experts))
    config_table.add_row("模型", "使用拼音", str(use_pinyin))
    config_table.add_row("模型", "编译优化", str(compile))
    config_table.add_row("两阶段训练", "冻结阶段耐心值", str(frozen_patience))
    config_table.add_row("两阶段训练", "冻结阶段学习率", f"{frozen_lr:.2e}")
    config_table.add_row("两阶段训练", "全量阶段学习率", f"{full_lr:.2e}")
    config_table.add_row("两阶段训练", "冻结阶段调度器", frozen_scheduler)
    config_table.add_row("两阶段训练", "全量阶段调度器", full_scheduler)
    config_table.add_row("训练", "训练轮数", str(num_epochs))
    config_table.add_row("训练", "最小学习率", f"{min_learning_rate:.2e}")
    config_table.add_row("训练", "权重衰减", str(weight_decay))
    config_table.add_row("训练", "热身比例", str(warmup_ratio))
    config_table.add_row("训练", "标签平滑", str(label_smoothing))
    config_table.add_row("训练", "梯度累积", str(grad_accum_steps))
    config_table.add_row("训练", "梯度裁剪", str(clip_grad_norm))
    config_table.add_row("训练", "混合精度", str(mixed_precision))
    console.print(config_table)
    # Create the output directory.
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)
    # Persist the full run configuration for reproducibility.
    config = {
        "train_data_path": train_data_path,
        "eval_data_path": eval_data_path,
        "output_dir": output_dir,
        "base_model_path": base_model_path,
        "new_model_spec": new_model_spec,
        "vocab_size": vocab_size,
        "pinyin_vocab_size": pinyin_vocab_size,
        "dim": dim,
        "num_slots": num_slots,
        "n_layers": n_layers,
        "n_heads": n_heads,
        "num_experts": num_experts,
        "max_seq_len": max_seq_len,
        "use_pinyin": use_pinyin,
        "frozen_patience": frozen_patience,
        "frozen_lr": frozen_lr,
        "full_lr": full_lr,
        "frozen_scheduler": frozen_scheduler,
        "full_scheduler": full_scheduler,
        "batch_size": batch_size,
        "num_workers": num_workers,
        "num_epochs": num_epochs,
        "min_learning_rate": min_learning_rate,
        "weight_decay": weight_decay,
        "warmup_ratio": warmup_ratio,
        "label_smoothing": label_smoothing,
        "grad_accum_steps": grad_accum_steps,
        "clip_grad_norm": clip_grad_norm,
        "eval_frequency": eval_frequency,
        "save_frequency": save_frequency,
        "mixed_precision": mixed_precision,
        "use_tensorboard": use_tensorboard,
        "seed": seed,
        "max_iter_length": max_iter_length,
        "compile": compile,
    }
    config_file = output_path / "expansion_training_config.json"
    with open(config_file, "w", encoding="utf-8") as f:
        json.dump(config, f, indent=2, ensure_ascii=False)
    logger.info(f"Configuration saved to {config_file}")
    # Build the dataloaders.
    console.print("[bold cyan]正在创建数据加载器...[/bold cyan]")
    # Training dataset (streaming).
    train_dataset = PinyinInputDataset(
        data_path=train_data_path,
        max_workers=-1,  # auto-select worker count
        max_iter_length=max_iter_length,
        max_seq_length=max_seq_len,
        text_field="text",
        py_style_weight=(9, 2, 1),
        shuffle_buffer_size=5000,
        length_weights={1: 10, 2: 50, 3: 50, 4: 40, 5: 15, 6: 10, 7: 5, 8: 2},
    )
    # Training dataloader.
    train_dataloader = create_dataloader(
        dataset=train_dataset,
        batch_size=batch_size,
        num_workers=num_workers,
        pin_memory=torch.cuda.is_available(),
        max_iter_length=max_iter_length,
    )
    # Evaluation dataset: capped at batch_size * 64 samples to keep eval fast.
    eval_dataset = PinyinInputDataset(
        data_path=eval_data_path,
        max_workers=-1,
        max_iter_length=batch_size * 64,  # small eval set
        max_seq_length=max_seq_len,
        text_field="text",
        py_style_weight=(9, 2, 1),
        shuffle_buffer_size=50000,
        length_weights={1: 10, 2: 50, 3: 50, 4: 40, 5: 15, 6: 10, 7: 5, 8: 2},
    )
    eval_dataloader = create_dataloader(
        dataset=eval_dataset,
        batch_size=batch_size,
        num_workers=1,  # fewer workers for evaluation
        pin_memory=torch.cuda.is_available(),
        max_iter_length=batch_size * 64,
    )
    # Build the expanded model (pretrained weights copied, matching layers frozen).
    console.print("[bold cyan]正在创建扩容模型...[/bold cyan]")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # NOTE(review): use_pinyin is collected and shown in the config table but
    # is NOT forwarded here — confirm whether the model constructor needs it.
    model_kwargs = {
        "vocab_size": vocab_size,
        "pinyin_vocab_size": pinyin_vocab_size,
        "dim": dim,
        "num_slots": num_slots,
        "n_layers": n_layers,
        "n_heads": n_heads,
        "num_experts": num_experts,
        "max_seq_len": max_seq_len,
        "compile": compile,
    }
    model = load_expanded_model(
        base_model_path=base_model_path,
        new_model_spec=new_model_spec,
        device=device,
        **model_kwargs,
    )
    console.print(
        f"[green]✓ 扩容模型创建完成,参数量: {sum(p.numel() for p in model.parameters()):,}[/green]"
    )
    # Report how much of the model is frozen.
    total_params = sum(p.numel() for p in model.parameters())
    frozen_params = sum(p.numel() for p in model.parameters() if not p.requires_grad)
    console.print(
        f"[green]✓ 冻结参数: {frozen_params:,}/{total_params:,} ({frozen_params/total_params*100:.1f}%)[/green]"
    )
    # Build the two-stage trainer.
    console.print("[bold cyan]正在创建两阶段训练器...[/bold cyan]")
    trainer = TwoStageTrainer(
        model=model,
        train_dataloader=train_dataloader,
        eval_dataloader=eval_dataloader,
        total_steps=int(max_iter_length * num_epochs / batch_size),
        output_dir=output_dir,
        num_epochs=num_epochs,
        learning_rate=frozen_lr,  # initial LR; overridden by stage-specific LRs
        min_learning_rate=min_learning_rate,
        weight_decay=weight_decay,
        warmup_ratio=warmup_ratio,
        label_smoothing=label_smoothing,
        grad_accum_steps=grad_accum_steps,
        clip_grad_norm=clip_grad_norm,
        eval_frequency=eval_frequency,
        save_frequency=save_frequency,
        mixed_precision=mixed_precision,
        use_tensorboard=use_tensorboard,
        status_file="training_status.json",
        # Two-stage-specific parameters.
        frozen_patience=frozen_patience,
        frozen_lr=frozen_lr,
        full_lr=full_lr,
        frozen_scheduler=frozen_scheduler,
        full_scheduler=full_scheduler,
    )
    console.print("[green]✓ 两阶段训练器创建完成[/green]")
    # Run training.
    console.print("\n[bold cyan]开始两阶段训练...[/bold cyan]")
    console.print(f"开始时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    try:
        trainer.train(
            resume_from=resume_from, reset_training_state=reset_training_state
        )
    except KeyboardInterrupt:
        # Save what we have on Ctrl-C, then fall through.
        # NOTE(review): the "训练完成" success message below still prints after
        # an interruption — possibly misleading; confirm intended behavior.
        console.print("[bold green]训练被终止[/bold green]")
        trainer.save_checkpoint("interrupted_model.pt")
    console.print("[bold green]✓ 两阶段训练完成![/bold green]")
    console.print(f"结束时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    console.print(f"模型和日志保存在: {output_dir}")
if __name__ == "__main__": if __name__ == "__main__":
app() app()

3486
uv.lock

File diff suppressed because it is too large Load Diff