From 59bb29e4fd56fd2cb1300cfa7422837384434507 Mon Sep 17 00:00:00 2001 From: songsenand Date: Sun, 5 Apr 2026 22:06:36 +0800 Subject: [PATCH] =?UTF-8?q?feat(benchmark):=20=E6=B7=BB=E5=8A=A0=E6=80=A7?= =?UTF-8?q?=E8=83=BD=E5=9F=BA=E5=87=86=E6=B5=8B=E8=AF=95=E8=84=9A=E6=9C=AC?= =?UTF-8?q?=E7=94=A8=E4=BA=8E=E8=AF=8A=E6=96=AD=E6=A8=A1=E5=9E=8B=E8=AE=AD?= =?UTF-8?q?=E7=BB=83=E7=93=B6=E9=A2=88?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- benchmark.py | 662 +++++++++++++++++++++++++++++++++++++++++++ src/model/model.py | 5 + src/model/trainer.py | 8 + 3 files changed, 675 insertions(+) create mode 100644 benchmark.py diff --git a/benchmark.py b/benchmark.py new file mode 100644 index 0000000..017c659 --- /dev/null +++ b/benchmark.py @@ -0,0 +1,662 @@ +#!/usr/bin/env python3 +""" +性能基准测试脚本 +用于诊断输入法模型训练的性能瓶颈 +""" + +import json +import subprocess +import sys +import time +from pathlib import Path +from typing import Any, Dict, List, Tuple + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.utils.data import DataLoader, Dataset + +# 添加src目录到路径 +sys.path.insert(0, str(Path(__file__).parent)) + +from src.model.model import InputMethodEngine + + +class MockDataset(Dataset): + """模拟数据集用于性能测试""" + + def __init__(self, num_samples=1000, vocab_size=10019, seq_len=128, num_slots=8): + self.num_samples = num_samples + self.vocab_size = vocab_size + self.seq_len = seq_len + self.num_slots = num_slots + + def __len__(self): + return self.num_samples + + def __getitem__(self, idx): + # 生成模拟数据,包含训练所需的所有字段 + return { + "input_ids": torch.randint(0, self.vocab_size, (self.seq_len,)), + "token_type_ids": torch.randint(0, 2, (self.seq_len,)), + "attention_mask": torch.ones(self.seq_len, dtype=torch.long), + "history_slot_ids": torch.randint(0, self.vocab_size, (self.num_slots,)), + "pinyin_ids": torch.randint(0, 30, (24,)), # pinyin_ids长度固定为24 + "label": torch.randint(0, self.vocab_size, (1,)), + } + + +def collate_fn(batch: List[Dict[str, Any]]) -> Dict[str, Any]: + """与trainer.py中相同的collate_fn""" + # 处理tensor字段 + input_ids = torch.stack([item["input_ids"] for item in batch]) + token_type_ids = torch.stack([item["token_type_ids"] for item in batch]) + attention_mask = torch.stack([item["attention_mask"] for item in batch]) + labels = torch.stack([item["label"] for item in batch]) + history_slot_ids = torch.stack([item["history_slot_ids"] for item in batch]) + pinyin_ids = torch.stack([item["pinyin_ids"] for item in batch]) + + # 字符串字段(模拟) + prefixes = [f"prefix_{i}" for i in range(len(batch))] + suffixes = [f"suffix_{i}" for i in range(len(batch))] + pinyins = [f"pinyin_{i}" for i in range(len(batch))] + + return { + "input_ids": input_ids, + "token_type_ids": token_type_ids, + "attention_mask": attention_mask, + "labels": labels, + "history_slot_ids": history_slot_ids, + "pinyin_ids": pinyin_ids, + "prefix": prefixes, + "suffix": suffixes, + "pinyin": pinyins, + } + + +def get_gpu_utilization() -> float: + """获取GPU利用率""" + try: + result = subprocess.run( + [ + "nvidia-smi", + "--query-gpu=utilization.gpu", + "--format=csv,noheader,nounits", + ], + capture_output=True, + text=True, + timeout=2, + ) + if result.returncode == 0: + return float(result.stdout.strip()) + except: + pass + return 0.0 + + +def benchmark_model_only(batch_sizes=[32, 64, 128, 256, 512]): + """仅测试模型前向+反向传播性能(不包含数据加载)""" + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + print(f"\n{'=' * 70}") + print("模型计算性能测试(不包含数据加载)") + print(f"使用设备: 
{device}") + print(f"PyTorch版本: {torch.__version__}") + print(f"CUDA可用: {torch.cuda.is_available()}") + + if torch.cuda.is_available(): + print(f"GPU: {torch.cuda.get_device_name(0)}") + print( + f"GPU内存: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB" + ) + + # 关键修复:使用与ContextEncoder预训练模型匹配的维度 + # 根据components.py,ContextEncoder使用预训练模型,其embedding维度是固定的 + # 从错误信息看,预训练模型维度是512,所以这里必须使用dim=512 + vocab_size = 10019 + pinyin_vocab_size = 30 + dim = 512 # 必须与预训练模型维度匹配! + num_slots = 8 + n_layers = 4 + n_heads = 4 + num_experts = 20 + max_seq_len = 128 + + results = [] + + for batch_size in batch_sizes: + print(f"\n{'=' * 60}") + print(f"测试批量大小: {batch_size}") + + try: + # 创建模型(使用真实训练配置) + model = InputMethodEngine( + vocab_size=vocab_size, + pinyin_vocab_size=pinyin_vocab_size, + dim=dim, + num_slots=num_slots, + n_layers=n_layers, + n_heads=n_heads, + num_experts=num_experts, + max_seq_len=max_seq_len, + ).to(device) + + # 创建优化器和损失函数 + optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4) + criterion = nn.CrossEntropyLoss() + scaler = torch.cuda.amp.GradScaler(enabled=torch.cuda.is_available()) + + # 生成模拟batch(直接在GPU上) + batch = { + "input_ids": torch.randint( + 0, vocab_size, (batch_size, max_seq_len), device=device + ), + "token_type_ids": torch.randint( + 0, 2, (batch_size, max_seq_len), device=device + ), + "attention_mask": torch.ones( + batch_size, max_seq_len, dtype=torch.long, device=device + ), + "history_slot_ids": torch.randint( + 0, vocab_size, (batch_size, num_slots), device=device + ), + "pinyin_ids": torch.randint( + 0, pinyin_vocab_size, (batch_size, 24), device=device + ), + "labels": torch.randint(0, vocab_size, (batch_size, 1), device=device), + } + + # 预热(运行几次以避免初始化开销) + print(" 预热...") + for _ in range(3): + with torch.cuda.amp.autocast(enabled=torch.cuda.is_available()): + logits = model( + input_ids=batch["input_ids"], + token_type_ids=batch["token_type_ids"], + attention_mask=batch["attention_mask"], + pinyin_ids=batch["pinyin_ids"], + history_slot_ids=batch["history_slot_ids"], + ) + loss = criterion(logits, batch["labels"].squeeze(-1)) + scaler.scale(loss).backward() + scaler.step(optimizer) + scaler.update() + optimizer.zero_grad() + + # 测速 + if torch.cuda.is_available(): + torch.cuda.synchronize() + start_time = time.time() + steps = 20 # 运行20步以获得稳定平均值 + + print(f" 运行{steps}步测速...") + for step in range(steps): + with torch.cuda.amp.autocast(enabled=torch.cuda.is_available()): + logits = model( + input_ids=batch["input_ids"], + token_type_ids=batch["token_type_ids"], + attention_mask=batch["attention_mask"], + pinyin_ids=batch["pinyin_ids"], + history_slot_ids=batch["history_slot_ids"], + ) + loss = criterion(logits, batch["labels"].squeeze(-1)) + scaler.scale(loss).backward() + scaler.step(optimizer) + scaler.update() + optimizer.zero_grad() + + if torch.cuda.is_available(): + torch.cuda.synchronize() + elapsed = time.time() - start_time + + # 计算指标 + throughput = steps * batch_size / elapsed + step_time = elapsed / steps * 1000 # 毫秒 + memory_used = ( + torch.cuda.max_memory_allocated() / 1024**2 + if torch.cuda.is_available() + else 0 + ) # MB + + print(f" 结果:") + print(f" 吞吐量: {throughput:.2f} samples/sec") + print(f" 每个step: {step_time:.2f}ms") + print(f" GPU内存峰值: {memory_used:.2f} MB") + + if torch.cuda.is_available(): + gpu_util = get_gpu_utilization() + if gpu_util > 0: + print(f" GPU利用率: {gpu_util:.1f}%") + + results.append( + { + "batch_size": batch_size, + "throughput": throughput, + "step_time": step_time, + "memory": memory_used, + 
"success": True, + } + ) + + # 清理 + del model, optimizer, batch + if torch.cuda.is_available(): + torch.cuda.empty_cache() + + # 检查是否内存不足 + if ( + torch.cuda.is_available() + and memory_used + > torch.cuda.get_device_properties(0).total_memory * 0.8 / 1024**2 + ): + print( + f" ⚠️ 警告: batch_size={batch_size} 使用了超过80% GPU内存,更大的batch_size可能导致OOM" + ) + break + + except Exception as e: + print(f" 错误: {e}") + results.append( + { + "batch_size": batch_size, + "error": str(e), + "success": False, + } + ) + if "CUDA out of memory" in str(e): + print(f" ⚠️ CUDA内存不足,停止测试更大的batch_size") + break + + return results + + +def benchmark_data_loading(num_workers_list=[0, 2, 4, 8, 12], batch_size=128): + """测试数据加载性能""" + print(f"\n{'=' * 70}") + print("测试数据加载性能") + print(f"批量大小: {batch_size}") + + results = [] + + for num_workers in num_workers_list: + print(f"\n测试num_workers: {num_workers}") + + try: + # 创建数据集和数据加载器 + dataset = MockDataset(num_samples=2000) + dataloader = DataLoader( + dataset, + batch_size=batch_size, + num_workers=num_workers, + pin_memory=torch.cuda.is_available(), + collate_fn=collate_fn, + prefetch_factor=2 if num_workers > 0 else None, + ) + + # 预热 + print(" 预热...") + for _ in range(2): + _ = next(iter(dataloader)) + + # 测速 + print(f" 测速中...") + start_time = time.time() + batches = 0 + total_samples = 0 + + for batch in dataloader: + batches += 1 + total_samples += batch["input_ids"].size(0) + + if batches >= 20: # 测试20个batch + break + + elapsed = time.time() - start_time + if batches == 0: + print(f" 警告: 无法加载任何batch") + continue + + throughput = total_samples / elapsed + + print(f" 结果:") + print(f" 数据加载吞吐量: {throughput:.2f} samples/sec") + print(f" 平均每batch加载时间: {elapsed / batches * 1000:.2f}ms") + + results.append( + { + "num_workers": num_workers, + "throughput": throughput, + "avg_batch_time": elapsed / batches * 1000, + "success": True, + } + ) + + except Exception as e: + print(f" 错误: {e}") + results.append( + { + "num_workers": num_workers, + "error": str(e), + "success": False, + } + ) + + return results + + +def profile_training_step(batch_size=128, num_steps=15): + """详细分析训练步骤中各部分耗时""" + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + print(f"\n{'=' * 70}") + print("详细训练步骤分析") + print(f"批量大小: {batch_size}, 步数: {num_steps}") + + # 使用真实训练配置 + vocab_size = 10019 + pinyin_vocab_size = 30 + dim = 512 + num_slots = 8 + n_layers = 4 + n_heads = 4 + num_experts = 20 + max_seq_len = 128 + + timings = { + "data_to_gpu": [], + "forward": [], + "loss_calc": [], + "backward": [], + "optimizer_step": [], + "total_step": [], + } + + try: + # 创建模型和优化器 + model = InputMethodEngine( + vocab_size=vocab_size, + pinyin_vocab_size=pinyin_vocab_size, + dim=dim, + num_slots=num_slots, + n_layers=n_layers, + n_heads=n_heads, + num_experts=num_experts, + max_seq_len=max_seq_len, + ).to(device) + + optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4) + criterion = nn.CrossEntropyLoss() + scaler = torch.cuda.amp.GradScaler(enabled=torch.cuda.is_available()) + + # 创建数据加载器 + dataset = MockDataset(num_samples=1000) + dataloader = DataLoader( + dataset, + batch_size=batch_size, + num_workers=4, + pin_memory=torch.cuda.is_available(), + collate_fn=collate_fn, + ) + + model.train() + + print(f"开始分析...") + for i, batch in enumerate(dataloader): + if i >= num_steps: + break + + step_start = time.time() + + # 1. 
数据移动到GPU + data_start = time.time() + batch_gpu = { + k: v.to(device, non_blocking=True) + for k, v in batch.items() + if isinstance(v, torch.Tensor) + } + data_time = time.time() - data_start + timings["data_to_gpu"].append(data_time) + + # 2. 前向传播 + forward_start = time.time() + with torch.cuda.amp.autocast(enabled=torch.cuda.is_available()): + logits = model( + input_ids=batch_gpu["input_ids"], + token_type_ids=batch_gpu["token_type_ids"], + attention_mask=batch_gpu["attention_mask"], + pinyin_ids=batch_gpu["pinyin_ids"], + history_slot_ids=batch_gpu["history_slot_ids"], + ) + forward_time = time.time() - forward_start + timings["forward"].append(forward_time) + + # 3. 损失计算 + loss_start = time.time() + with torch.cuda.amp.autocast(enabled=torch.cuda.is_available()): + loss = criterion(logits, batch_gpu["labels"].squeeze(-1)) + loss_time = time.time() - loss_start + timings["loss_calc"].append(loss_time) + + # 4. 反向传播 + backward_start = time.time() + scaler.scale(loss).backward() + backward_time = time.time() - backward_start + timings["backward"].append(backward_time) + + # 5. 优化器步骤 + optimizer_start = time.time() + scaler.step(optimizer) + scaler.update() + optimizer.zero_grad() + optimizer_time = time.time() - optimizer_start + timings["optimizer_step"].append(optimizer_time) + + # 总时间 + total_time = time.time() - step_start + timings["total_step"].append(total_time) + + if i % 5 == 0: + print( + f" Step {i}: 总时间={total_time * 1000:.1f}ms, " + f"前向={forward_time * 1000:.1f}ms, " + f"反向={backward_time * 1000:.1f}ms" + ) + + # 打印统计信息 + print(f"\n{'=' * 60}") + print("训练步骤耗时分析(平均值):") + for key, values in timings.items(): + if values: + avg_ms = sum(values) / len(values) * 1000 + min_ms = min(values) * 1000 + max_ms = max(values) * 1000 + if timings["total_step"]: + total_avg = ( + sum(timings["total_step"]) / len(timings["total_step"]) * 1000 + ) + percentage = (avg_ms / total_avg) * 100 + else: + percentage = 0 + print( + f" {key:15s}: {avg_ms:6.1f}ms ({percentage:5.1f}%) [min={min_ms:.1f}ms, max={max_ms:.1f}ms]" + ) + + # 计算瓶颈 + if timings["total_step"]: + total_avg_time = sum(timings["total_step"]) / len(timings["total_step"]) + data_percent = ( + sum(timings["data_to_gpu"]) / sum(timings["total_step"]) * 100 + ) + compute_percent = ( + ( + sum(timings["forward"]) + + sum(timings["backward"]) + + sum(timings["loss_calc"]) + + sum(timings["optimizer_step"]) + ) + / sum(timings["total_step"]) + * 100 + ) + + print(f"\n瓶颈分析:") + print(f" 数据加载/传输: {data_percent:.1f}%") + print(f" 计算(前向+反向): {compute_percent:.1f}%") + + if data_percent > 30: + print(f" ⚠️ 数据加载是瓶颈!建议增加num_workers或使用pin_memory") + if compute_percent > 70: + print(f" ⚠️ 计算是瓶颈!建议优化模型或使用混合精度") + + except Exception as e: + print(f"分析过程中出错: {e}") + import traceback + + traceback.print_exc() + + return timings + + +def check_gpu_status(): + """检查GPU状态""" + print(f"\n{'=' * 70}") + print("GPU状态检查") + + if not torch.cuda.is_available(): + print("CUDA不可用!将在CPU上运行") + print("注意:CPU训练性能会远低于GPU") + return + + print(f"可用GPU数量: {torch.cuda.device_count()}") + + for i in range(torch.cuda.device_count()): + props = torch.cuda.get_device_properties(i) + print(f"\nGPU {i}: {props.name}") + print(f" 计算能力: {props.major}.{props.minor}") + print(f" 总内存: {props.total_memory / 1024**3:.2f} GB") + print(f" SM数量: {props.multi_processor_count}") + + # 当前内存使用 + print(f"\n当前内存使用:") + print(f" 已分配: {torch.cuda.memory_allocated() / 1024**2:.2f} MB") + print(f" 已缓存: {torch.cuda.memory_reserved() / 1024**2:.2f} MB") + + # 检查PyTorch配置 + 
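    # 补充注释(基于 PyTorch 公开文档的一般性说明,非原脚本内容):
    # - torch.backends.cudnn.benchmark=True 让 cuDNN 为每种首次出现的输入尺寸自动搜索最快的算法实现,
    #   适合本项目这种输入形状固定(max_seq_len=128)的训练场景;若输入形状频繁变化反而可能变慢。
    # - torch.backends.cuda.matmul.allow_tf32=True 允许 Ampere 及更新架构的 GPU 在 float32
    #   矩阵乘法中使用 TF32,以极小的精度损失换取更高的吞吐量。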
print(f"\nPyTorch配置:") + print(f" cuDNN基准模式: {torch.backends.cudnn.benchmark}") + print(f" cuDNN确定性: {torch.backends.cudnn.deterministic}") + print(f" TF32启用: {torch.backends.cuda.matmul.allow_tf32}") + + # 建议优化 + print(f"\n建议优化:") + if not torch.backends.cudnn.benchmark: + print(f" ⚙️ 建议设置: torch.backends.cudnn.benchmark = True") + if not torch.backends.cuda.matmul.allow_tf32: + print(f" ⚙️ 建议设置: torch.backends.cuda.matmul.allow_tf32 = True") + + +def main(): + """主函数:运行所有性能测试""" + print("输入法模型性能基准测试") + print("=" * 70) + + # 设置优化标志(在测试前设置) + if torch.cuda.is_available(): + torch.backends.cudnn.benchmark = True + torch.backends.cuda.matmul.allow_tf32 = True + print("已启用cuDNN基准模式和TF32加速") + + # 1. 检查GPU状态 + check_gpu_status() + + # 2. 测试数据加载性能 + print(f"\n{'=' * 70}") + print("阶段1: 数据加载性能测试") + data_results = benchmark_data_loading( + num_workers_list=[0, 2, 4, 8, 12], batch_size=128 + ) + + # 3. 测试模型计算性能 + print(f"\n{'=' * 70}") + print("阶段2: 模型计算性能测试") + # 从较小的batch_size开始测试 + model_results = benchmark_model_only(batch_sizes=[32, 64, 128, 256]) + + # 4. 详细分析训练步骤 + print(f"\n{'=' * 70}") + print("阶段3: 详细训练步骤分析") + step_timings = profile_training_step(batch_size=128, num_steps=15) + + # 5. 给出优化建议 + print(f"\n{'=' * 70}") + print("优化建议汇总") + print("=" * 70) + + # 基于数据加载结果 + if data_results: + successful_results = [r for r in data_results if r.get("success", False)] + if successful_results: + best_workers = max(successful_results, key=lambda x: x["throughput"]) + print(f"1. 数据加载优化:") + print(f" 推荐num_workers: {best_workers['num_workers']}") + print(f" 最佳吞吐量: {best_workers['throughput']:.2f} samples/sec") + + # 基于模型结果 + if model_results: + successful_results = [r for r in model_results if r.get("success", False)] + if successful_results: + best_batch = max(successful_results, key=lambda x: x["throughput"]) + print(f"\n2. 批量大小优化:") + print(f" 推荐batch_size: {best_batch['batch_size']}") + print(f" 最佳吞吐量: {best_batch['throughput']:.2f} samples/sec") + print(f" 内存使用: {best_batch['memory']:.2f} MB") + + # 内存安全建议 + if best_batch["memory"] > 0: + if torch.cuda.is_available(): + total_mem = ( + torch.cuda.get_device_properties(0).total_memory / 1024**2 + ) + usage_percent = (best_batch["memory"] / total_mem) * 100 + if usage_percent < 70: + print(f" ✅ 内存使用安全 ({usage_percent:.1f}% of total)") + else: + print( + f" ⚠️ 内存使用较高 ({usage_percent:.1f}% of total),考虑减小batch_size" + ) + + # 通用建议 + print(f"\n3. 通用优化建议:") + if torch.cuda.is_available(): + print(f" ✅ 使用pin_memory=True (已启用)") + print(f" ✅ 使用混合精度训练 (已启用)") + print(f" ✅ 设置torch.backends.cudnn.benchmark = True (已启用)") + print(f" ✅ 使用非阻塞数据传输 (non_blocking=True)") + else: + print(f" ⚠️ 未检测到GPU,建议使用GPU以获得更好性能") + + # 特定建议 + print(f"\n4. 
针对你的配置的具体建议:") + print(f" - 默认配置: batch_size=128, num_workers=2, dim=512") + print(f" - 如果GPU内存充足,可以尝试增加batch_size") + print(f" - 如果CPU核心多,可以增加num_workers") + + print(f"\n{'=' * 70}") + print("基准测试完成!") + print(f"可以运行完整训练来验证优化效果:") + print(f" python -m src.model.trainer train --help") + + # 保存结果到文件 + try: + results = { + "data_loading": data_results, + "model_compute": model_results, + "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"), + } + with open("benchmark_results.json", "w") as f: + json.dump(results, f, indent=2, default=str) + print(f"\n结果已保存到: benchmark_results.json") + except: + pass + + +if __name__ == "__main__": + main() diff --git a/src/model/model.py b/src/model/model.py index 84a7c9a..ba9d492 100644 --- a/src/model/model.py +++ b/src/model/model.py @@ -40,6 +40,7 @@ class InputMethodEngine(nn.Module): n_heads: int = 4, # 注意力头数 num_experts: int = 20, # MoE 专家数量 max_seq_len: int = 128, # 最大上下文长度 + compile: bool = False, # 是否开启 torch.compile 优化 ): super().__init__() self.dim = dim @@ -76,6 +77,10 @@ class InputMethodEngine(nn.Module): # 5. 分类头 self.classifier = nn.Linear(dim, vocab_size) + # 开启 torch.compile 优化 (如果请求) + if compile: + self.forward = torch.compile(self.forward) + def forward( self, input_ids: torch.Tensor, diff --git a/src/model/trainer.py b/src/model/trainer.py index b952ef4..f2a8582 100644 --- a/src/model/trainer.py +++ b/src/model/trainer.py @@ -867,6 +867,11 @@ def train( False, "--reset-training-state", help="重置训练状态,只加载模型权重从头开始训练" ), seed: int = typer.Option(42, "--seed", help="随机种子"), + compile: bool = typer.Option( + False, + "--compile/--no-compile", + help="是否开启 torch.compile 优化(需 PyTorch 2.0+)", + ), ): """ 训练输入法模型 @@ -905,6 +910,7 @@ def train( config_table.add_row("模型", "注意力头数", str(n_heads)) config_table.add_row("模型", "MoE专家数", str(num_experts)) config_table.add_row("模型", "使用拼音", str(use_pinyin)) + config_table.add_row("模型", "编译优化", str(compile)) config_table.add_row("训练", "训练轮数", str(num_epochs)) config_table.add_row("训练", "学习率", f"{learning_rate:.2e}") @@ -952,6 +958,7 @@ def train( "use_tensorboard": use_tensorboard, "seed": seed, "max_iter_length": max_iter_length, + "compile": compile, } config_file = output_path / "training_config.json" @@ -1016,6 +1023,7 @@ def train( n_heads=n_heads, num_experts=num_experts, max_seq_len=max_seq_len, + compile=compile, ) console.print(