feat: 添加模型扩容两阶段训练功能,支持冻结层训练与全量微调切换
This commit is contained in:
parent
d14fd09f41
commit
c9a96651cd
109
README.md
109
README.md
|
|
@ -790,7 +790,114 @@ train-model evaluate \
|
|||
- 在评估数据集上计算准确率、困惑度等指标
|
||||
- 生成详细的性能报告
|
||||
|
||||
### 6.8 导出模型(开发中)
|
||||
### 6.8 模型扩容两阶段训练
|
||||
|
||||
当需要增加模型容量(如增加专家数量、修改层结构等)时,可以使用 `expand-and-train` 命令进行两阶段训练:先冻结匹配层训练新增参数,然后全量微调。
|
||||
|
||||
#### 训练策略
|
||||
|
||||
1. **冻结阶段**:只训练形状不匹配的新增参数(如新增的专家、扩容的层等)
|
||||
2. **全量微调阶段**:当验证损失连续 `--frozen-patience` 次不下降时,自动解冻所有层进行全量训练
|
||||
|
||||
#### 基础用法
|
||||
|
||||
```bash
|
||||
train-model expand-and-train \
|
||||
--train-data-path "path/to/train/dataset" \
|
||||
--eval-data-path "path/to/eval/dataset" \
|
||||
--base-model-path "./pretrained/model.pt" \
|
||||
--new-model-spec "model:InputMethodEngine" \
|
||||
--num-experts 40 \
|
||||
--frozen-lr 2e-3 \
|
||||
--full-lr 5e-5 \
|
||||
--frozen-patience 8
|
||||
```
|
||||
|
||||
#### 完整参数示例
|
||||
|
||||
```bash
|
||||
train-model expand-and-train \
|
||||
--train-data-path "path/to/train/dataset" \
|
||||
--eval-data-path "path/to/eval/dataset" \
|
||||
--output-dir "./expansion_output" \
|
||||
--base-model-path "./pretrained/model.pt" \
|
||||
--new-model-spec "custom_model:ExpandedModel" \
|
||||
--vocab-size 10019 \
|
||||
--dim 512 \
|
||||
--num-experts 40 \
|
||||
--frozen-patience 10 \
|
||||
--frozen-lr 1e-3 \
|
||||
--full-lr 1e-4 \
|
||||
--frozen-scheduler cosine \
|
||||
--full-scheduler cosine \
|
||||
--batch-size 128 \
|
||||
--num-epochs 20 \
|
||||
--compile
|
||||
```
|
||||
|
||||
#### 参数详解
|
||||
|
||||
**模型扩容参数**
|
||||
- `--base-model-path`: 预训练基础模型检查点路径(必需)
|
||||
- `--new-model-spec`: 新模型规格,格式:`模块名:类名`,如 `model:InputMethodEngine`(必需)
|
||||
- 支持任意路径的模块导入,模块文件需包含自定义的模型类
|
||||
- 自定义模型类必须是 `InputMethodEngine` 的子类
|
||||
- 示例:`my_model:MyExpandedModel` 对应 `my_model.py` 中的 `MyExpandedModel` 类
|
||||
|
||||
**两阶段训练参数**
|
||||
- `--frozen-patience`: 冻结阶段验证损失连续不下降的评估次数,触发切换到全量微调(默认:10)
|
||||
- `--frozen-lr`: 冻结阶段学习率(默认:1e-3)
|
||||
- `--full-lr`: 全量微调阶段学习率(默认:1e-4)
|
||||
- `--frozen-scheduler`: 冻结阶段学习率调度器,可选 `cosine` 或 `plateau`(默认:`cosine`)
|
||||
- `--full-scheduler`: 全量微调阶段学习率调度器,可选 `cosine` 或 `plateau`(默认:`cosine`)
|
||||
|
||||
**其他参数**
|
||||
- 支持所有 `train` 子命令的通用参数(数据参数、模型参数、训练参数等)
|
||||
- 继承现有的训练基础设施:混合精度训练、TensorBoard日志、checkpoint保存等
|
||||
|
||||
#### 使用场景
|
||||
|
||||
1. **增加专家数量**(20→40)
|
||||
- 冻结效果:~70% 参数可冻结(已有专家权重、注意力层等)
|
||||
- 新增参数:新专家网络、gate层
|
||||
|
||||
2. **增加top_k值**(2→3)
|
||||
- 冻结效果:100% 参数可冻结(仅逻辑变化)
|
||||
- 新增参数:无
|
||||
|
||||
3. **修改专家内部结构**(如增加resblocks)
|
||||
- 冻结效果:~50% 参数可冻结(linear_in/output可冻结)
|
||||
- 新增参数:新增的resblocks层
|
||||
|
||||
4. **增加Transformer层数**(4→5)
|
||||
- 冻结效果:~80% 参数可冻结(前4层可冻结)
|
||||
- 新增参数:新增的第5层
|
||||
|
||||
#### 自定义模型类示例
|
||||
|
||||
```python
|
||||
# my_model.py
|
||||
from model.model import InputMethodEngine
|
||||
|
||||
class MyExpandedModel(InputMethodEngine):
|
||||
def __init__(self, num_experts=40, **kwargs):
|
||||
# 调用父类构造函数,覆盖num_experts参数
|
||||
super().__init__(num_experts=num_experts, **kwargs)
|
||||
# 可以在这里添加额外的层或修改现有层
|
||||
|
||||
# 使用命令
|
||||
# train-model expand-and-train --new-model-spec "my_model:MyExpandedModel" ...
|
||||
```
|
||||
|
||||
#### 注意事项
|
||||
|
||||
1. **模型类要求**:自定义模型类必须是 `InputMethodEngine` 的子类
|
||||
2. **冻结条件**:只有权重形状完全匹配的层才会被冻结
|
||||
3. **性能保持**:MoE层保持"计算所有专家+Top-K选择"方案,确保 `torch.compile` 下的最佳性能
|
||||
4. **阶段切换**:基于评估频率而非epoch,建议适当调高 `--eval-frequency`
|
||||
5. **模块导入**:支持任意路径的模块,通过Python标准导入机制加载
|
||||
|
||||
### 6.9 导出模型(开发中)
|
||||
|
||||
当前导出功能尚在开发中:
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,8 @@
|
|||
from model.model import InputMethodEngine
|
||||
|
||||
|
||||
class BigExpert(InputMethodEngine):
    """Example expanded model: replaces the MoE layer with 40 experts / top-3 routing.

    Intended as a `--new-model-spec` target for `expand-and-train`
    (e.g. `my_model:BigExpert`).
    """

    def __init__(self, *args, dim: int = 512, **kw):
        # Bug fix: `dim` was a free (undefined) name in the original body and
        # raised NameError on construction. Accept it as a keyword argument
        # (defaulting to the project-wide model dimension of 512) and forward
        # it to the base constructor so both networks agree on width.
        super().__init__(*args, dim=dim, **kw)

        # NOTE(review): MoELayer is not imported in this snippet — confirm the
        # module it lives in (presumably model.model) and import it there.
        self.moe = MoELayer(dim=dim, num_experts=40, top_k=3)
|
|
@ -45,10 +45,12 @@ class Trainer:
|
|||
- CrossEntropyLoss损失函数(支持weight和label_smoothing)
|
||||
- Rich终端美化输出
|
||||
"""
|
||||
|
||||
training_status_data: List[Dict[str, Any]]
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
model: InputMethodEngine,
|
||||
model: nn.Module,
|
||||
train_dataloader: DataLoader,
|
||||
eval_dataloader: DataLoader,
|
||||
total_steps: int,
|
||||
|
|
@ -461,10 +463,10 @@ class Trainer:
|
|||
if existing_indices:
|
||||
# 替换现有记录
|
||||
for idx in existing_indices:
|
||||
self.training_status_data[idx] = status_record
|
||||
self.training_status_data[idx] = status_record # type: ignore
|
||||
else:
|
||||
# 添加到内存缓存
|
||||
self.training_status_data.append(status_record)
|
||||
self.training_status_data.append(status_record) # type: ignore
|
||||
|
||||
# 限制内存中的数据量,只保留最近1000条记录
|
||||
if len(self.training_status_data) > 1000:
|
||||
|
|
@ -686,6 +688,444 @@ class Trainer:
|
|||
self.writer.close()
|
||||
|
||||
|
||||
def _import_module_by_spec(module_name: str):
    """Import *module_name* as an installed module, falling back to a .py file path.

    Tries, in order: a normal ``importlib.import_module``; a file located at
    ``module_name`` with dots mapped to path separators; a file named
    ``<module_name>.py`` in the current directory. Raises ImportError (chained
    to the original failure) when none of these resolve.
    """
    import importlib
    import importlib.util
    import os

    try:
        return importlib.import_module(module_name)
    except ImportError as import_err:
        # De-duplicated fallback: for a dotless name both candidates below are
        # identical, so iterate unique paths only.
        candidates = []
        for path in (module_name.replace(".", "/") + ".py", module_name + ".py"):
            if path not in candidates:
                candidates.append(path)
        for module_path in candidates:
            if not os.path.exists(module_path):
                continue
            spec = importlib.util.spec_from_file_location(module_name, module_path)
            if spec is None or spec.loader is None:
                continue
            module = importlib.util.module_from_spec(spec)
            spec.loader.exec_module(module)
            return module
        # Preserve the root cause for debugging instead of discarding it.
        raise ImportError(
            f"Failed to import module '{module_name}': {import_err}"
        ) from import_err


def load_expanded_model(
    base_model_path: str,
    new_model_spec: str,
    device: torch.device,
    **model_kwargs,
) -> nn.Module:
    """
    Load a pretrained base model and build the expanded model, freezing matching layers.

    Every tensor in the new model's state dict whose key AND shape match the
    pretrained checkpoint is copied over and its parameter (if any) is frozen
    (``requires_grad = False``); shape-mismatched or new tensors keep their
    fresh initialization and stay trainable.

    Args:
        base_model_path: path to the pretrained base-model checkpoint
        new_model_spec: new-model spec, format "module:ClassName",
            e.g. "new_model:NewModel"
        device: target device
        **model_kwargs: forwarded to the new model's constructor

    Returns:
        The expanded model with matching layers frozen.

    Raises:
        ValueError: if *new_model_spec* is not in "module:ClassName" form.
        ImportError: if the module cannot be imported by name or file path.
        TypeError: if the resolved class is not an InputMethodEngine subclass.
    """
    # Parse the new-model spec
    if ":" not in new_model_spec:
        raise ValueError(f"Invalid model spec format: {new_model_spec}. Expected format: 'module:ClassName'")

    module_name, class_name = new_model_spec.split(":", 1)

    # Import the module (installed package or arbitrary .py file path)
    module = _import_module_by_spec(module_name)

    # Resolve the model class
    model_class = getattr(module, class_name)

    # The class must subclass InputMethodEngine so the training stack can use it
    from .model import InputMethodEngine
    if not issubclass(model_class, InputMethodEngine):
        raise TypeError(
            f"Model class {class_name} must be a subclass of InputMethodEngine. "
            f"Got {model_class.__name__} instead."
        )

    # Build the new (expanded) model
    new_model = model_class(**model_kwargs)
    new_model.to(device)

    # Load the pretrained weights. NOTE(review): torch.load unpickles arbitrary
    # objects — only load checkpoints from trusted sources.
    checkpoint = torch.load(base_model_path, map_location=device)
    if "model_state_dict" in checkpoint:
        pretrained_state_dict = checkpoint["model_state_dict"]
    else:
        pretrained_state_dict = checkpoint

    # New model's state dict (includes both parameters and buffers)
    new_state_dict = new_model.state_dict()

    # Copy over every shape-matching tensor; remember its key for freezing.
    # Use a set for O(1) membership tests in the named_parameters loop below.
    frozen_layers = []
    for key in new_state_dict.keys():
        if key in pretrained_state_dict:
            if new_state_dict[key].shape == pretrained_state_dict[key].shape:
                new_state_dict[key] = pretrained_state_dict[key].to(device)
                frozen_layers.append(key)
    frozen_set = set(frozen_layers)

    # Install the merged state dict
    new_model.load_state_dict(new_state_dict)

    # Freeze parameters whose weights came from the checkpoint (buffers in
    # frozen_set are skipped automatically — they never appear here).
    for name, param in new_model.named_parameters():
        if name in frozen_set:
            param.requires_grad = False

    logger.info(f"Loaded expanded model with {len(frozen_layers)} frozen layers")
    logger.info(f"Frozen layers: {frozen_layers[:10]}{'...' if len(frozen_layers) > 10 else ''}")

    return new_model
|
||||
|
||||
|
||||
class TwoStageTrainer(Trainer):
    """
    Two-stage trainer: first train with matching layers frozen, then fully fine-tune.

    Stage switching is driven by evaluation results (not epochs): while in the
    "frozen" stage, `frozen_patience` consecutive evaluations without an
    eval-loss improvement trigger `_switch_to_full_stage`, which unfreezes all
    parameters, rebuilds the optimizer, and swaps in the full-stage LR schedule.
    """

    def __init__(
        self,
        model: nn.Module,
        train_dataloader: DataLoader,
        eval_dataloader: DataLoader,
        total_steps: int,
        output_dir: str = "./output",
        num_epochs: int = 10,
        learning_rate: float = 1e-4,
        min_learning_rate: float = 1e-6,
        weight_decay: float = 0.1,
        warmup_ratio: float = 0.1,
        label_smoothing: float = 0.15,
        loss_weight: Optional[torch.Tensor] = None,
        grad_accum_steps: int = 1,
        clip_grad_norm: float = 1.0,
        eval_frequency: int = 500,
        save_frequency: int = 10000,
        mixed_precision: bool = True,
        device: Optional[torch.device] = None,
        status_file: str = "training_status.json",
        use_tensorboard: bool = True,
        # Parameters specific to two-stage training
        frozen_patience: int = 10,
        frozen_lr: Optional[float] = None,
        full_lr: Optional[float] = None,
        frozen_scheduler: str = "cosine",
        full_scheduler: str = "cosine",
    ):
        """
        Initialize the two-stage trainer.

        Args:
            frozen_patience: number of consecutive evaluations (runs of
                `evaluate()`, not epochs — see `_update_stage_after_eval`)
                with no eval-loss improvement before switching to full
                fine-tuning
            frozen_lr: frozen-stage learning rate; falls back to
                ``learning_rate`` when None
            full_lr: full-fine-tuning-stage learning rate; falls back to
                ``learning_rate`` when None
            frozen_scheduler: frozen-stage LR scheduler type, "cosine" or "plateau"
            full_scheduler: full-stage LR scheduler type, "cosine" or "plateau"
        """
        super().__init__(
            model=model,
            train_dataloader=train_dataloader,
            eval_dataloader=eval_dataloader,
            total_steps=total_steps,
            output_dir=output_dir,
            num_epochs=num_epochs,
            learning_rate=learning_rate,
            min_learning_rate=min_learning_rate,
            weight_decay=weight_decay,
            warmup_ratio=warmup_ratio,
            label_smoothing=label_smoothing,
            loss_weight=loss_weight,
            grad_accum_steps=grad_accum_steps,
            clip_grad_norm=clip_grad_norm,
            eval_frequency=eval_frequency,
            save_frequency=save_frequency,
            mixed_precision=mixed_precision,
            device=device,
            status_file=status_file,
            use_tensorboard=use_tensorboard,
        )

        # Two-stage training parameters
        self.frozen_patience = frozen_patience
        self.frozen_lr = frozen_lr if frozen_lr is not None else learning_rate
        self.full_lr = full_lr if full_lr is not None else learning_rate
        self.frozen_scheduler = frozen_scheduler
        self.full_scheduler = full_scheduler

        # Stage state machine
        self.current_stage = "frozen"  # "frozen" or "full"
        self.frozen_best_loss = float("inf")
        self.frozen_patience_counter = 0

        logger.info(f"TwoStageTrainer initialized with frozen_patience={frozen_patience}")
        logger.info(f"Stage: {self.current_stage}, Frozen LR: {self.frozen_lr:.2e}, Full LR: {self.full_lr:.2e}")

        # Override the parent's LR scheduler with the frozen-stage schedule
        self.lr_scheduler = self._create_stage_lr_scheduler("frozen")

    def _create_stage_lr_scheduler(self, stage: str) -> Callable[[int], float]:
        """Create the stage-specific learning-rate schedule function (step -> lr)."""
        if stage == "frozen":
            base_lr = self.frozen_lr
            scheduler_type = self.frozen_scheduler
        else:
            base_lr = self.full_lr
            scheduler_type = self.full_scheduler

        # Capture locals so the closure does not reference self (keeps the
        # returned function independent of later attribute changes).
        warmup_steps = self.warmup_steps
        total_steps = self.total_steps
        min_learning_rate = self.min_learning_rate

        def lr_scheduler(step: int) -> float:
            if step < warmup_steps:
                # Linear warmup
                return base_lr * (step / warmup_steps)
            else:
                if scheduler_type == "cosine":
                    # Cosine annealing from base_lr down to min_learning_rate
                    progress = (step - warmup_steps) / (
                        total_steps - warmup_steps
                    )
                    cosine_decay = 0.5 * (1 + math.cos(math.pi * progress))
                    decayed_lr = (
                        min_learning_rate
                        + (base_lr - min_learning_rate) * cosine_decay
                    )
                    return decayed_lr
                elif scheduler_type == "plateau":
                    # Constant LR (a true plateau schedule would need the
                    # training loop to react to eval loss)
                    return base_lr
                else:
                    raise ValueError(f"Unknown scheduler type: {scheduler_type}")

        return lr_scheduler

    def _switch_to_full_stage(self):
        """Switch to the full fine-tuning stage (idempotent)."""
        if self.current_stage == "full":
            return

        logger.info("Switching to full fine-tuning stage")
        self.current_stage = "full"

        # Unfreeze every parameter
        for param in self.model.parameters():
            param.requires_grad = True

        # Swap in the full-stage learning-rate schedule
        self.learning_rate = self.full_lr
        self.lr_scheduler = self._create_stage_lr_scheduler("full")

        # Rebuild the optimizer so the (previously frozen) parameters get
        # fresh optimizer state at the full-stage LR
        self.optimizer = optim.AdamW(
            self.model.parameters(),
            lr=self.full_lr,
            weight_decay=self.weight_decay,
            betas=(0.9, 0.999),
            eps=1e-8,
        )

        # Reset the frozen-stage bookkeeping
        self.frozen_best_loss = float("inf")
        self.frozen_patience_counter = 0

        logger.info(f"All layers unfrozen, using full LR: {self.full_lr:.2e}")

    def _update_stage_after_eval(self, eval_loss: float):
        """Update the training stage from the latest evaluation loss."""
        if self.current_stage == "frozen":
            # Track whether the frozen stage is still improving
            if eval_loss < self.frozen_best_loss:
                self.frozen_best_loss = eval_loss
                self.frozen_patience_counter = 0
                logger.info(f"Frozen stage new best loss: {eval_loss:.4f}")
            else:
                self.frozen_patience_counter += 1
                logger.info(f"Frozen stage patience counter: {self.frozen_patience_counter}/{self.frozen_patience}")

                # Patience exhausted — move to full fine-tuning
                if self.frozen_patience_counter >= self.frozen_patience:
                    self._switch_to_full_stage()

    def train(
        self, resume_from: Optional[str] = None, reset_training_state: bool = False
    ):
        """
        Two-stage training loop (overrides Trainer.train).

        Args:
            resume_from: optional checkpoint path to resume from
            reset_training_state: load only model weights and start training
                from scratch
        """
        # Resume from a checkpoint when one was provided
        if resume_from is not None:
            self.load_checkpoint(resume_from, reset_training_state=reset_training_state)

        # Print the run configuration
        self._print_training_info()

        # Initialize loop state
        global_step = self.current_step
        accumulated_loss = 0.0
        accumulated_accuracy = 0.0
        accumulation_counter = 0

        # Rich progress bar for the whole run
        with self._create_progress_bar() as progress:
            epoch_task = progress.add_task(
                f"[cyan]Epoch {self.current_epoch + 1}/{self.num_epochs} (Stage: {self.current_stage})",
                total=self.total_steps,
            )

            # Training loop
            for epoch in range(self.current_epoch, self.num_epochs):
                self.current_epoch = epoch

                for batch_idx, batch in enumerate(self.train_dataloader):
                    # Update the learning rate for this step
                    current_lr = self._update_learning_rate()

                    # One forward/backward step
                    loss, metrics = self.train_step(batch)

                    # Accumulate metrics for the next log window
                    accumulated_loss += loss
                    accumulated_accuracy += metrics.get("accuracy", 0.0)
                    accumulation_counter += 1

                    # Gradient accumulation: apply the optimizer every
                    # grad_accum_steps steps
                    if (global_step + 1) % self.grad_accum_steps == 0:
                        # Gradient clipping (unscale first under AMP)
                        self.scaler.unscale_(self.optimizer)
                        torch.nn.utils.clip_grad_norm_(
                            self.model.parameters(), self.clip_grad_norm
                        )

                        # Optimizer step + reset
                        self.scaler.step(self.optimizer)
                        self.scaler.update()
                        self.optimizer.zero_grad()

                    # Advance the progress bar
                    progress.update(
                        epoch_task,
                        advance=1,
                        description=f"[cyan]Epoch {epoch + 1}/{self.num_epochs} (Stage: {self.current_stage}) | "
                        f"Step {global_step}/{self.total_steps} | "
                        f"Loss: {loss:.4f} | "
                        f"LR: {current_lr:.2e}",
                    )

                    # Periodic evaluation and logging
                    if (global_step + 1) % self.eval_frequency == 0:
                        # Window-averaged training metrics
                        avg_loss = accumulated_loss / accumulation_counter
                        avg_accuracy = accumulated_accuracy / accumulation_counter

                        # Run evaluation
                        eval_metrics = self.evaluate()

                        # Metrics for TensorBoard (stage encoded as 0/1)
                        log_metrics = {
                            "train/loss": avg_loss,
                            "train/accuracy": avg_accuracy,
                            "train/learning_rate": current_lr,
                            "train/stage": 0.0 if self.current_stage == "frozen" else 1.0,
                        }

                        if eval_metrics:
                            log_metrics.update(
                                {
                                    "eval/loss": eval_metrics["eval_loss"],
                                    "eval/accuracy": eval_metrics["eval_accuracy"],
                                }
                            )

                            # Track the global best model
                            if eval_metrics["eval_loss"] < self.best_eval_loss:
                                self.best_eval_loss = eval_metrics["eval_loss"]
                                # Save only best_model; no extra checkpoint files
                                self.save_checkpoint("best_model.pt", is_best=True)

                            # Drive the frozen -> full stage machine
                            self._update_stage_after_eval(eval_metrics["eval_loss"])

                        # Write to TensorBoard
                        self._log_to_tensorboard(log_metrics, global_step)

                        # Console log line
                        log_text = (
                            f"[Epoch {epoch + 1}/{self.num_epochs}] "
                            f"[Stage: {self.current_stage}] "
                            f"[Step {global_step}/{self.total_steps}] "
                            f"Train Loss: {avg_loss:.4f} | "
                            f"Train Acc: {avg_accuracy:.4f} | "
                            f"LR: {current_lr:.2e}"
                        )

                        if eval_metrics:
                            log_text += (
                                f" | Eval Loss: {eval_metrics['eval_loss']:.4f} | "
                                f"Eval Acc: {eval_metrics['eval_accuracy']:.4f}"
                            )

                        progress.console.log(log_text)

                        # Reset the accumulation window
                        accumulated_loss = 0.0
                        accumulated_accuracy = 0.0
                        accumulation_counter = 0

                    # Periodic checkpoint (overwrites the previous one)
                    if (global_step + 1) % self.save_frequency == 0:
                        self.save_checkpoint("latest_checkpoint.pt", is_periodic=True)

                    # Advance the step counter
                    global_step += 1
                    self.current_step = global_step

                    # Stop when the step budget is exhausted
                    if global_step >= self.total_steps:
                        progress.update(epoch_task, completed=self.total_steps)
                        break

                # Reset the bar for the next epoch
                progress.reset(epoch_task)

                # Save a checkpoint at the end of each epoch
                self.save_checkpoint(f"epoch_{epoch + 1}.pt")

                # Stop when the step budget is exhausted
                if global_step >= self.total_steps:
                    break

        # Training finished
        logger.info("Two-stage training completed!")

        # Save the final model
        self.save_checkpoint("final_model.pt")

        # Close the TensorBoard writer
        if self.writer is not None:
            self.writer.close()
|
||||
|
||||
|
||||
def worker_init_fn(worker_id: int) -> None:
|
||||
"""
|
||||
初始化每个DataLoader worker的随机种子,确保可复现性
|
||||
|
|
@ -1082,5 +1522,326 @@ def export(
|
|||
console.print("[yellow]导出功能待实现[/yellow]")
|
||||
|
||||
|
||||
@app.command()
def expand_and_train(
    # Data parameters
    train_data_path: str = typer.Option(
        ..., "--train-data-path", "-t", help="训练数据集路径"
    ),
    eval_data_path: str = typer.Option(
        ..., "--eval-data-path", "-e", help="评估数据集路径"
    ),
    output_dir: str = typer.Option("./output", "--output-dir", "-o", help="输出目录"),
    # Model parameters
    base_model_path: str = typer.Option(
        ..., "--base-model-path", help="预训练基础模型检查点路径"
    ),
    new_model_spec: str = typer.Option(
        ..., "--new-model-spec", "-m", help="新模型规格,格式:模块名:类名,如 'model:InputMethodEngine'。支持任意路径,自定义模型类必须是 InputMethodEngine 的子类"
    ),
    vocab_size: int = typer.Option(10019, "--vocab-size", help="词汇表大小"),
    pinyin_vocab_size: int = typer.Option(
        30, "--pinyin-vocab-size", help="拼音词汇表大小"
    ),
    max_iter_length: int = typer.Option(
        1024 * 1024 * 128, "--max_iter_length", help="数据集大小"
    ),
    dim: int = typer.Option(512, "--dim", help="模型维度"),
    num_slots: int = typer.Option(8, "--num-slots", help="历史槽位数量"),
    n_layers: int = typer.Option(4, "--n-layers", help="Transformer层数"),
    n_heads: int = typer.Option(4, "--n-heads", help="注意力头数"),
    num_experts: int = typer.Option(20, "--num-experts", help="MoE专家数量"),
    max_seq_len: int = typer.Option(128, "--max-seq-len", help="最大序列长度"),
    use_pinyin: bool = typer.Option(False, "--use-pinyin", help="是否使用拼音特征"),
    # Two-stage training parameters
    frozen_patience: int = typer.Option(
        10, "--frozen-patience", help="冻结阶段验证损失连续不下降的epoch数,触发切换到全量微调"
    ),
    frozen_lr: float = typer.Option(
        1e-3, "--frozen-lr", help="冻结阶段学习率"
    ),
    full_lr: float = typer.Option(
        1e-4, "--full-lr", help="全量微调阶段学习率"
    ),
    frozen_scheduler: str = typer.Option(
        "cosine", "--frozen-scheduler", help="冻结阶段学习率调度器类型:cosine或plateau"
    ),
    full_scheduler: str = typer.Option(
        "cosine", "--full-scheduler", help="全量微调阶段学习率调度器类型:cosine或plateau"
    ),
    # Training parameters
    batch_size: int = typer.Option(128, "--batch-size", "-b", help="批次大小"),
    num_epochs: int = typer.Option(10, "--num-epochs", help="训练轮数"),
    min_learning_rate: float = typer.Option(
        1e-9, "--min-learning-rate", help="最小学习率"
    ),
    weight_decay: float = typer.Option(0.1, "--weight-decay", help="权重衰减"),
    warmup_ratio: float = typer.Option(0.1, "--warmup-ratio", help="热身步数比例"),
    label_smoothing: float = typer.Option(
        0.15, "--label-smoothing", help="标签平滑参数"
    ),
    grad_accum_steps: int = typer.Option(1, "--grad-accum-steps", help="梯度累积步数"),
    clip_grad_norm: float = typer.Option(1.0, "--clip-grad-norm", help="梯度裁剪范数"),
    eval_frequency: int = typer.Option(500, "--eval-frequency", help="评估频率"),
    save_frequency: int = typer.Option(1000, "--save-frequency", help="保存频率"),
    # Misc parameters
    mixed_precision: bool = typer.Option(
        True, "--mixed-precision/--no-mixed-precision", help="是否使用混合精度训练"
    ),
    num_workers: int = typer.Option(
        2, "--num-workers", help="数据加载worker数量(流式数据集建议为2)"
    ),
    use_tensorboard: bool = typer.Option(
        True, "--tensorboard/--no-tensorboard", help="是否使用TensorBoard"
    ),
    resume_from: Optional[str] = typer.Option(
        None, "--resume-from", help="从检查点恢复训练"
    ),
    reset_training_state: bool = typer.Option(
        False, "--reset-training-state", help="重置训练状态,只加载模型权重从头开始训练"
    ),
    seed: int = typer.Option(42, "--seed", help="随机种子"),
    compile: bool = typer.Option(
        False,
        "--compile/--no-compile",
        help="是否开启 torch.compile 优化(需 PyTorch 2.0+)",
    ),
):
    """
    模型扩容两阶段训练:先冻结匹配层训练,然后全量微调
    """
    # NOTE(review): the --frozen-patience help text says "epoch数" but the
    # switch is actually counted in evaluations (see TwoStageTrainer) —
    # consider correcting the help string.
    torch.multiprocessing.set_sharing_strategy("file_system")

    # Enable TensorFloat32 matmul (silences the UserWarning and speeds up GEMMs)
    if torch.cuda.is_available():
        torch.set_float32_matmul_precision("high")

    # Seed all RNGs for reproducibility
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

    console = Console()

    # Print the run configuration
    console.print(
        Panel.fit("[bold cyan]模型扩容两阶段训练配置[/bold cyan]", border_style="cyan")
    )

    config_table = Table(show_header=True, header_style="bold magenta")
    config_table.add_column("Category", style="cyan")
    config_table.add_column("Parameter", style="green")
    config_table.add_column("Value", style="yellow")

    # Fill the configuration table
    config_table.add_row("数据", "训练数据路径", train_data_path)
    config_table.add_row("数据", "评估数据路径", eval_data_path)
    config_table.add_row("数据", "输出目录", output_dir)
    config_table.add_row("数据", "批次大小", str(batch_size))
    config_table.add_row("数据", "Worker数量", str(num_workers))

    config_table.add_row("模型", "基础模型路径", base_model_path)
    config_table.add_row("模型", "新模型规格", new_model_spec)
    config_table.add_row("模型", "词汇表大小", str(vocab_size))
    config_table.add_row("模型", "拼音词汇表", str(pinyin_vocab_size))
    config_table.add_row("模型", "模型维度", str(dim))
    config_table.add_row("模型", "槽位数量", str(num_slots))
    config_table.add_row("模型", "Transformer层数", str(n_layers))
    config_table.add_row("模型", "注意力头数", str(n_heads))
    config_table.add_row("模型", "MoE专家数", str(num_experts))
    config_table.add_row("模型", "使用拼音", str(use_pinyin))
    config_table.add_row("模型", "编译优化", str(compile))

    config_table.add_row("两阶段训练", "冻结阶段耐心值", str(frozen_patience))
    config_table.add_row("两阶段训练", "冻结阶段学习率", f"{frozen_lr:.2e}")
    config_table.add_row("两阶段训练", "全量阶段学习率", f"{full_lr:.2e}")
    config_table.add_row("两阶段训练", "冻结阶段调度器", frozen_scheduler)
    config_table.add_row("两阶段训练", "全量阶段调度器", full_scheduler)

    config_table.add_row("训练", "训练轮数", str(num_epochs))
    config_table.add_row("训练", "最小学习率", f"{min_learning_rate:.2e}")
    config_table.add_row("训练", "权重衰减", str(weight_decay))
    config_table.add_row("训练", "热身比例", str(warmup_ratio))
    config_table.add_row("训练", "标签平滑", str(label_smoothing))
    config_table.add_row("训练", "梯度累积", str(grad_accum_steps))
    config_table.add_row("训练", "梯度裁剪", str(clip_grad_norm))
    config_table.add_row("训练", "混合精度", str(mixed_precision))

    console.print(config_table)

    # Create the output directory
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    # Persist the full run configuration for reproducibility
    config = {
        "train_data_path": train_data_path,
        "eval_data_path": eval_data_path,
        "output_dir": output_dir,
        "base_model_path": base_model_path,
        "new_model_spec": new_model_spec,
        "vocab_size": vocab_size,
        "pinyin_vocab_size": pinyin_vocab_size,
        "dim": dim,
        "num_slots": num_slots,
        "n_layers": n_layers,
        "n_heads": n_heads,
        "num_experts": num_experts,
        "max_seq_len": max_seq_len,
        "use_pinyin": use_pinyin,
        "frozen_patience": frozen_patience,
        "frozen_lr": frozen_lr,
        "full_lr": full_lr,
        "frozen_scheduler": frozen_scheduler,
        "full_scheduler": full_scheduler,
        "batch_size": batch_size,
        "num_workers": num_workers,
        "num_epochs": num_epochs,
        "min_learning_rate": min_learning_rate,
        "weight_decay": weight_decay,
        "warmup_ratio": warmup_ratio,
        "label_smoothing": label_smoothing,
        "grad_accum_steps": grad_accum_steps,
        "clip_grad_norm": clip_grad_norm,
        "eval_frequency": eval_frequency,
        "save_frequency": save_frequency,
        "mixed_precision": mixed_precision,
        "use_tensorboard": use_tensorboard,
        "seed": seed,
        "max_iter_length": max_iter_length,
        "compile": compile,
    }

    config_file = output_path / "expansion_training_config.json"
    with open(config_file, "w", encoding="utf-8") as f:
        json.dump(config, f, indent=2, ensure_ascii=False)

    logger.info(f"Configuration saved to {config_file}")

    # Build dataloaders
    console.print("[bold cyan]正在创建数据加载器...[/bold cyan]")

    # Training dataset (streaming)
    train_dataset = PinyinInputDataset(
        data_path=train_data_path,
        max_workers=-1,  # auto-select worker count
        max_iter_length=max_iter_length,
        max_seq_length=max_seq_len,
        text_field="text",
        py_style_weight=(9, 2, 1),
        shuffle_buffer_size=5000,
        length_weights={1: 10, 2: 50, 3: 50, 4: 40, 5: 15, 6: 10, 7: 5, 8: 2},
    )

    # Training dataloader
    train_dataloader = create_dataloader(
        dataset=train_dataset,
        batch_size=batch_size,
        num_workers=num_workers,
        pin_memory=torch.cuda.is_available(),
        max_iter_length=max_iter_length,
    )

    # Evaluation dataset
    eval_dataset = PinyinInputDataset(
        data_path=eval_data_path,
        max_workers=-1,
        max_iter_length=batch_size * 64,  # keep the eval set small
        max_seq_length=max_seq_len,
        text_field="text",
        py_style_weight=(9, 2, 1),
        shuffle_buffer_size=50000,
        length_weights={1: 10, 2: 50, 3: 50, 4: 40, 5: 15, 6: 10, 7: 5, 8: 2},
    )

    eval_dataloader = create_dataloader(
        dataset=eval_dataset,
        batch_size=batch_size,
        num_workers=1,  # fewer workers for evaluation
        pin_memory=torch.cuda.is_available(),
        max_iter_length=batch_size * 64,
    )

    # Build the expanded model with matching layers frozen
    console.print("[bold cyan]正在创建扩容模型...[/bold cyan]")
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model_kwargs = {
        "vocab_size": vocab_size,
        "pinyin_vocab_size": pinyin_vocab_size,
        "dim": dim,
        "num_slots": num_slots,
        "n_layers": n_layers,
        "n_heads": n_heads,
        "num_experts": num_experts,
        "max_seq_len": max_seq_len,
        "compile": compile,
    }

    model = load_expanded_model(
        base_model_path=base_model_path,
        new_model_spec=new_model_spec,
        device=device,
        **model_kwargs,
    )

    console.print(
        f"[green]✓ 扩容模型创建完成,参数量: {sum(p.numel() for p in model.parameters()):,}[/green]"
    )

    # Report the frozen-parameter ratio
    total_params = sum(p.numel() for p in model.parameters())
    frozen_params = sum(p.numel() for p in model.parameters() if not p.requires_grad)
    console.print(
        f"[green]✓ 冻结参数: {frozen_params:,}/{total_params:,} ({frozen_params/total_params*100:.1f}%)[/green]"
    )

    # Build the two-stage trainer
    console.print("[bold cyan]正在创建两阶段训练器...[/bold cyan]")
    trainer = TwoStageTrainer(
        model=model,
        train_dataloader=train_dataloader,
        eval_dataloader=eval_dataloader,
        total_steps=int(max_iter_length * num_epochs / batch_size),
        output_dir=output_dir,
        num_epochs=num_epochs,
        learning_rate=frozen_lr,  # initial LR (overridden by stage-specific LR)
        min_learning_rate=min_learning_rate,
        weight_decay=weight_decay,
        warmup_ratio=warmup_ratio,
        label_smoothing=label_smoothing,
        grad_accum_steps=grad_accum_steps,
        clip_grad_norm=clip_grad_norm,
        eval_frequency=eval_frequency,
        save_frequency=save_frequency,
        mixed_precision=mixed_precision,
        use_tensorboard=use_tensorboard,
        status_file="training_status.json",
        # Parameters specific to two-stage training
        frozen_patience=frozen_patience,
        frozen_lr=frozen_lr,
        full_lr=full_lr,
        frozen_scheduler=frozen_scheduler,
        full_scheduler=full_scheduler,
    )

    console.print("[green]✓ 两阶段训练器创建完成[/green]")

    # Start training
    console.print("\n[bold cyan]开始两阶段训练...[/bold cyan]")
    console.print(f"开始时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    try:
        trainer.train(
            resume_from=resume_from, reset_training_state=reset_training_state
        )
    except KeyboardInterrupt:
        # Ctrl-C: save an interrupt checkpoint before exiting
        console.print("[bold green]训练被终止[/bold green]")
        trainer.save_checkpoint("interrupted_model.pt")

    console.print("[bold green]✓ 两阶段训练完成![/bold green]")
    console.print(f"结束时间: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
    console.print(f"模型和日志保存在: {output_dir}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: dispatch to the Typer CLI application.
    app()
|
||||
|
|
|
|||
Loading…
Reference in New Issue