feat: 添加拼音输入法模拟数据集及相关功能实现

2026-02-09 00:43:38 +08:00 · 2026-02-09 00:43:38 +08:00 · f2c260de72
parent 5ea0b0b31c
commit f2c260de72
6 changed files with 833 additions and 7 deletions
--- a/.gitignore
+++ b/.gitignore
@ -213,4 +213,5 @@ cython_debug/
 uv.lock
 *.log
-marimo/
+marimo/
 __marimo__/
--- a/pyproject.toml
+++ b/pyproject.toml
@ -11,6 +11,7 @@ dependencies = [
    "msgpack>=1.1.2",
    "pypinyin>=0.55.0",
    "rich>=14.3.1",
    "transformers>=5.1.0",
    "typer>=0.21.1",
 ]
--- a/src/suinput/counter.py
+++ b/src/suinput/counter.py
@ -16,6 +16,7 @@ from loguru import logger
 from tqdm import trange
 from .char_info import PinyinCharPairsCounter, CharInfo
 from .query import QueryEngine
@ -48,6 +49,12 @@ class PinyinCharStatistics:
            # 启动工作进程
            self._start_workers()
    # 实现一个加载历史快照，并且以历史快照的数据进行初始化的函数
    def load_history_snapshot(self, file_path: str):
        """加载历史快照，并且以历史快照的数据进行初始化"""
        self
    def _start_workers(self):
        """启动工作进程"""
--- a/src/suinput/dataset.py
+++ b/src/suinput/dataset.py
@ -0,0 +1,624 @@
 import torch
 from torch.utils.data import IterableDataset, DataLoader
 from datasets import load_dataset
 from pypinyin import lazy_pinyin
 import random
 from modelscope import AutoTokenizer
 from typing import Tuple, List, Dict, Any
 import re
 import numpy as np
 from loguru import logger
 class PinyinInputDataset(IterableDataset):
    """
    拼音输入法模拟数据集
    特性:
    1. 流式读取数据集，内存友好
    2. 实时拼音转换和多音字处理
    3. 前文上下文多种采样方式
    4. 拼音截断模拟不完整输入
    5. 内置削峰填谷算法平衡数据分布
    6. 缓冲区打乱支持多进程
    """
    def __init__(
        self,
        data_dir: str,
        query_engine,
        tokenizer_name: str = "iic/nlp_structbert_backbone_tiny_std",
        max_len: int = 88,
        text_field: str = "text",
        batch_query_size: int = 1000,
        # 打乱参数
        shuffle: bool = True,
        shuffle_buffer_size: int = 100,
        # 削峰填谷参数
        max_freq: int = 434748359,  # "的"的频率
        min_freq: int = 109,  # "蓚"的频率
        drop_start_freq: int = 30000000,  # 开始丢弃的阈值
        repeat_end_freq: int = 10000,  # 开始重复的阈值
        max_drop_prob: float = 0.8,  # 最大丢弃概率
        max_repeat_expect: float = 50.0,  # 最大重复期望
    ):
        """
        初始化数据集
        Args:
            data_dir: 数据集目录
            query_engine: QueryEngine实例
            tokenizer_name: tokenizer名称
            max_len: 最大序列长度
            text_field: 文本字段名
            batch_query_size: 批量查询大小
            shuffle: 是否打乱数据
            shuffle_buffer_size: 打乱缓冲区大小
            max_freq: 最大字符频率
            min_freq: 最小字符频率
            drop_start_freq: 开始削峰的频率阈值
            repeat_end_freq: 开始填谷的频率阈值
            max_drop_prob: 最高频率字符的丢弃概率
            max_repeat_expect: 最低频率字符的重复期望
        """
        self.query_engine = query_engine
        self.max_len = max_len
        self.text_field = text_field
        self.batch_query_size = batch_query_size
        # 打乱相关参数
        self.shuffle = shuffle
        self.shuffle_buffer_size = shuffle_buffer_size
        self.shuffle_buffer = []
        # 削峰填谷参数
        self.max_freq = max_freq
        self.min_freq = min_freq
        self.drop_start_freq = drop_start_freq
        self.repeat_end_freq = repeat_end_freq
        self.max_drop_prob = max_drop_prob
        self.max_repeat_expect = max_repeat_expect
        # 加载tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
        # 获取总字频用于后续计算
        stats = query_engine.get_statistics()
        self.total_chars = stats.get("valid_input_character_count", 0)
        # 汉字正则表达式
        self.chinese_pattern = re.compile(r"[\u4e00-\u9fff]")
        # 缓存字典
        self.char_info_cache = {}
        # 加载数据集
        self.dataset = load_dataset(data_dir, split="train", streaming=True)
    def is_chinese_char(self, char: str) -> bool:
        """判断是否为中文字符"""
        return bool(self.chinese_pattern.match(char))
    def get_next_chinese_chars(
        self, text: str, start_idx: int, max_count: int = 3
    ) -> List[Tuple[str, str]]:
        """
        获取后续的中文字符及其拼音
        Args:
            text: 完整文本
            start_idx: 起始位置
            max_count: 最大字符数
        Returns:
            列表，每个元素为(字符, 拼音)
        """
        result = []
        count = 0
        for i in range(start_idx + 1, len(text)):
            if count >= max_count:
                break
            char = text[i]
            if self.is_chinese_char(char):
                # 获取拼音（注意：这里需要确保拼音列表长度与text一致）
                try:
                    # 重新计算整个text的拼音可能效率低，但确保准确
                    # 实际实现中可以考虑缓存或优化
                    pinyin_list = lazy_pinyin(text, errors=lambda x: [c for c in x])
                    if i < len(pinyin_list):
                        result.append((char, pinyin_list[i]))
                        count += 1
                except Exception:
                    break
            else:
                # 非汉字，继续查找
                continue
        return result
    def sample_context(self, context: str) -> str:
        """
        三种方式采样前文上下文
        Args:
            context: 原始前文（最多100字符）
        Returns:
            采样后的54个字符
        """
        if not context:
            return ""
        # 确保有足够长度
        context_len = len(context)
        # 随机选择采样方式 (各1/3概率)
        choice = random.random()
        if choice < 0.333:
            # 方式1: 靠近汉字的54个字符
            return context[-54:] if context_len >= 54 else context
        elif choice < 0.667:
            # 方式2: 随机位置取46个连续字符
            if context_len <= 46:
                return context
            start = random.randint(0, context_len - 46)
            return context[start : start + 46]
        else:
            # 方式3: 12+6×7组合
            if context_len < 12:
                return context
            # 最后12个字符
            last_12 = context[-12:]
            # 从剩下的前88个字符中随机取6段，每段7个字符
            remaining = context[:-12] if context_len > 12 else ""
            remaining_len = len(remaining)
            if remaining_len < 7:
                # 如果不够7个字符，直接返回最后12个字符
                return last_12
            segments = []
            for _ in range(6):
                if remaining_len < 7:
                    break
                start = random.randint(0, remaining_len - 7)
                segment = remaining[start : start + 7]
                segments.append(segment)
            # 拼接
            combined = "".join(segments)
            result = combined + last_12
            # 确保总长度为54（可能不足）
            if len(result) < 54:
                # 如果不够，从前面补一些字符
                needed = 54 - len(result)
                if context_len >= needed:
                    result = context[:needed] + result
            # 截断到54字符
            return result[:54]
    def truncate_pinyin(self, pinyin: str) -> str:
        """
        截断拼音
        Args:
            pinyin: 原始拼音
        Returns:
            截断后的拼音，可能为空字符串
        """
        if not pinyin:
            return ""
        # 随机决定截断方式
        rand_val = random.random()
        if rand_val < 0.1:
            # 10%概率截断为空
            return ""
        elif rand_val < 0.6:
            # 50%概率不截断
            return pinyin
        else:
            # 40%概率随机截断
            # 均匀分配剩余概率给各种截断长度
            max_len = len(pinyin)
            if max_len <= 1:
                return pinyin
            # 随机选择截断长度 (1 到 max_len-1)
            trunc_len = random.randint(1, max_len - 1)
            return pinyin[:trunc_len]
    def process_pinyin_sequence(self, pinyin_list: List[str]) -> str:
        """
        处理拼音序列，逐个截断并拼接
        Args:
            pinyin_list: 拼音列表，长度1-4
        Returns:
            拼接后的拼音字符串
        """
        result_parts = []
        for pinyin in pinyin_list:
            truncated = self.truncate_pinyin(pinyin)
            if not truncated:
                # 如果某个拼音截断为空，则停止
                break
            result_parts.append(truncated)
        if not result_parts:
            return ""
        result = "".join(result_parts)
        # 限制最大长度
        if len(result) > 18:
            result = result[:18]
        return result
    def adjust_frequency(self, freq: int) -> int:
        """
        削峰填谷 - 根据频率调整采样
        Args:
            freq: 当前字符频率
        Returns:
            调整后的采样次数，0表示丢弃
        """
        # 1. 削峰处理（高频字，>= 3000W开始丢弃）
        if freq >= self.drop_start_freq:
            # 线性丢弃概率：3000W时丢弃概率为0，434748359时丢弃概率为0.8
            # 使用线性插值计算丢弃概率
            if self.max_freq == self.drop_start_freq:
                drop_prob = 0.0  # 防止除零
            else:
                drop_prob = (
                    self.max_drop_prob
                    * (freq - self.drop_start_freq)
                    / (self.max_freq - self.drop_start_freq)
                )
            # 根据丢弃概率决定是否保留
            if random.random() < drop_prob:
                return 0  # 丢弃该样本
            else:
                return 1  # 保留，但不重复
        # 2. 填谷处理（低频字，<= 1W开始重复）
        elif freq <= self.repeat_end_freq:
            # 线性重复期望：1W时重复期望为0，109时重复期望为50
            # 使用线性插值计算期望重复次数
            if freq <= self.min_freq:
                repeat_expect = self.max_repeat_expect  # 最低频字重复期望为50
            else:
                if self.repeat_end_freq == self.min_freq:
                    repeat_expect = 0  # 防止除零
                else:
                    # 线性插值公式
                    repeat_expect = (
                        self.max_repeat_expect
                        * (self.repeat_end_freq - freq)
                        / (self.repeat_end_freq - self.min_freq)
                    )
            # 期望重复次数转换为实际重复次数
            # 使用泊松分布实现期望重复，确保有随机性
            repeat_count = np.random.poisson(repeat_expect)
            # 确保至少返回1次
            return max(1, repeat_count)
        # 3. 中等频率字（1W < freq < 3000W）
        else:
            return 1  # 保持原样
    def batch_get_char_info(
        self, char_pinyin_pairs: List[Tuple[str, str]]
    ) -> Dict[Tuple[str, str], Any]:
        """
        批量获取字符信息
        Args:
            char_pinyin_pairs: [(字符, 拼音), ...]
        Returns:
            字典，key为(字符, 拼音)，value为(id, 频率)或None
        """
        results = {}
        # 先检查缓存
        uncached_pairs = []
        for pair in char_pinyin_pairs:
            if pair in self.char_info_cache:
                results[pair] = self.char_info_cache[pair]
            else:
                uncached_pairs.append(pair)
        # 批量查询未缓存的
        if uncached_pairs:
            # 使用query_engine批量查询
            char_infos = self.query_engine.batch_get_char_pinyin_info(uncached_pairs)
            for pair, char_info in char_infos.items():
                if char_info:
                    info = {
                        "id": char_info.id,
                        "freq": char_info.count,
                        "char": char_info.char,
                        "pinyin": char_info.pinyin,
                    }
                else:
                    info = None
                results[pair] = info
                self.char_info_cache[pair] = info
        return results
    def _process_batch(self, char_pinyin_batch, char_positions, text):
        """处理批量字符"""
        # 批量查询字符信息
        char_info_map = self.batch_get_char_info(char_pinyin_batch)
        batch_samples = []
        for pos_info in char_positions:
            char = pos_info["char"]
            pinyin = pos_info["pinyin"]
            next_pinyins = pos_info["next_pinyins"]
            context = pos_info["context"]
            # 获取字符信息
            char_info = char_info_map.get((char, pinyin))
            if not char_info:
                continue
            # 削峰填谷调整
            adjust_factor = self.adjust_frequency(char_info["freq"])
            if adjust_factor <= 0:
                continue
            # 前文采样
            sampled_context = self.sample_context(context)
            # 拼音处理
            processed_pinyin = self.process_pinyin_sequence(next_pinyins)
            if not processed_pinyin:
                continue
            # Tokenize
            hint = self.tokenizer(
                sampled_context,
                processed_pinyin,
                max_length=self.max_len,
                padding="max_length",
                truncation=True,
                return_tensors="pt",
            )
            # 生成样本
            sample = {
                "hint": hint,
                "txt": sampled_context,
                "py": processed_pinyin,
                "char_id": torch.tensor([char_info["id"]]),
                "char": char,
                "freq": char_info["freq"],
            }
            # 根据调整因子重复样本
            for _ in range(adjust_factor):
                batch_samples.append(sample)
        return batch_samples
    def _shuffle_and_yield(self, batch_samples):
        """打乱并yield样本"""
        if not self.shuffle:
            for sample in batch_samples:
                yield sample
            return
        # 添加到打乱缓冲区
        self.shuffle_buffer.extend(batch_samples)
        # 如果缓冲区达到指定大小，打乱并输出
        if len(self.shuffle_buffer) >= self.shuffle_buffer_size:
            random.shuffle(self.shuffle_buffer)
            for sample in self.shuffle_buffer:
                yield sample
            self.shuffle_buffer = []
    def __iter__(self):
        """
        迭代器实现，支持多进程
        返回:
            生成器，每次返回一个样本
        """
        # 获取worker信息，为每个worker设置不同的随机种子
        worker_info = torch.utils.data.get_worker_info()
        if worker_info is not None:
            worker_id = worker_info.id
            # 使用base_seed + worker_id确保每个worker有不同但确定的随机序列
            base_seed = torch.initial_seed() if hasattr(torch, "initial_seed") else 42
            seed = base_seed + worker_id
            random.seed(seed % (2**32))
            np.random.seed(seed % (2**32))
        # 重置打乱缓冲区
        self.shuffle_buffer = []
        for item in self.dataset:
            text = item.get(self.text_field, "")
            if not text:
                continue
            # 转换为拼音列表
            pinyin_list = lazy_pinyin(text, errors=lambda x: [c for c in x])
            # 批量收集需要查询的字符信息
            char_pinyin_batch = []
            char_positions = []  # 保存字符位置和上下文信息
            # 遍历文本中的每个字符
            for i, (char, py) in enumerate(zip(text, pinyin_list)):
                if not self.is_chinese_char(char):
                    continue
                # 获取后续最多3个中文字符的拼音
                next_chars = self.get_next_chinese_chars(text, i, max_count=3)
                next_pinyins = [py] + [p for _, p in next_chars]
                # 获取前文上下文（最多100字符）
                context = text[max(0, i - 100) : i]
                # 收集信息用于批量查询
                char_pinyin_batch.append((char, py))
                char_positions.append(
                    {
                        "index": i,
                        "char": char,
                        "pinyin": py,
                        "next_pinyins": next_pinyins,
                        "context": context,
                        "next_chars": next_chars,
                    }
                )
                # 达到批量大小时处理
                if len(char_pinyin_batch) >= self.batch_query_size:
                    batch_samples = self._process_batch(
                        char_pinyin_batch, char_positions, text
                    )
                    yield from self._shuffle_and_yield(batch_samples)
                    char_pinyin_batch = []
                    char_positions = []
            # 处理剩余的字符
            if char_pinyin_batch:
                batch_samples = self._process_batch(
                    char_pinyin_batch, char_positions, text
                )
                yield from self._shuffle_and_yield(batch_samples)
        # 清空缓冲区（处理完所有数据后）
        if self.shuffle_buffer:
            random.shuffle(self.shuffle_buffer)
            for sample in self.shuffle_buffer:
                yield sample
            self.shuffle_buffer = []
    def __len__(self):
        """
        由于是流式数据集，无法预先知道长度
        返回:
            返回一个估计值或-1
        """
        return -1
 # 辅助函数，用于DataLoader
 def worker_init_fn(worker_id):
    """DataLoader worker初始化函数"""
    # 设置每个worker的随机种子
    seed = torch.initial_seed() + worker_id
    random.seed(seed % (2**32))
    np.random.seed(seed % (2**32))
    torch.manual_seed(seed % (2**32))
 def custom_collate(batch):
    """自定义批处理函数"""
    if not batch:
        return {}
    # 处理hint字段
    hints = [item["hint"] for item in batch]
    # 合并所有张量字段
    result = {
        "hint": {
            "input_ids": torch.cat([h["input_ids"] for h in hints]),
            "attention_mask": torch.cat([h["attention_mask"] for h in hints]),
        },
        "char_id": torch.cat([item["char_id"] for item in batch]),
        "char": [item["char"] for item in batch],
        "txt": [item["txt"] for item in batch],
        "py": [item["py"] for item in batch],
    }
    # 如果存在token_type_ids则添加
    if "token_type_ids" in hints[0]:
        result["hint"]["token_type_ids"] = torch.cat(
            [h["token_type_ids"] for h in hints]
        )
    # 如果存在freq则添加
    if "freq" in batch[0]:
        result["freq"] = torch.tensor([item["freq"] for item in batch])
    return result
 # 使用示例
 if __name__ == "__main__":
    from query import QueryEngine
    from tqdm import tqdm
    # 初始化查询引擎
    query_engine = QueryEngine()
    query_engine.load("./pinyin_char_statistics.json")
    # 创建数据集
    dataset = PinyinInputDataset(
        data_dir="/home/songsenand/Data/corpus/CCI-Data/",
        query_engine=query_engine,
        tokenizer_name="iic/nlp_structbert_backbone_tiny_std",
        max_len=88,
        batch_query_size=500,
        shuffle=True,
        shuffle_buffer_size=1000,
    )
    logger.info("数据集初始化")
    dataloader = DataLoader(
        dataset,
        batch_size=256,
        num_workers=12,
        worker_init_fn=worker_init_fn,
        # pin_memory=True,
        collate_fn=custom_collate,
        prefetch_factor=32,
        persistent_workers=True,
        shuffle=False,  # 数据集内部已实现打乱
        timeout=60,
    )
    # 测试数据集
    try:
        iterator = iter(dataset)
        logger.info("测试数据集")
        for i, _ in tqdm(enumerate(dataloader), total=200):
            if i >= 200:
                break
            """
            print(f"Sample {i+1}:")
            print(f"  Char: {sample['char']}, Id: {sample['char_id'].item()}, Freq: {sample.get('freq', 'N/A')}")
            print(f"  Pinyin: {sample['py']}")
            print(f"  Context length: {len(sample['txt'])}")
            print(f"  Hint shape: {sample['hint']['input_ids'].shape}")
            print()
            """
    except StopIteration:
        print("数据集为空")
--- a/src/suinput/query.py
+++ b/src/suinput/query.py
@ -7,7 +7,7 @@ from typing import Dict, List, Optional, Tuple, Any
 import time
 import os
-from .char_info import CharInfo, PinyinCharPairsCounter
+from char_info import CharInfo, PinyinCharPairsCounter
 class QueryEngine:
@ -30,6 +30,7 @@ class QueryEngine:
        self._id_to_info: Dict[int, CharInfo] = {}          # ID -> CharInfo
        self._char_to_ids: Dict[str, List[int]] = {}        # 字符 -> ID列表
        self._pinyin_to_ids: Dict[str, List[int]] = {}      # 拼音 -> ID列表
        self._char_pinyin_to_ids: Dict[Tuple[str, str], int] = {}
        # 辅助索引 - 快速获取详细信息
        self._char_freq: Dict[str, int] = {}                # 字符总频率
@ -62,7 +63,7 @@ class QueryEngine:
        start_time = time.time()
        # 读取并解析文件
-        self._counter_data = self._parse_file(file_path)
+        self._counter_data = self.parse_file(file_path)
        # 构建索引
        self._build_indices()
@ -72,7 +73,7 @@ class QueryEngine:
        return self._counter_data.metadata
-    def _parse_file(self, file_path: str) -> PinyinCharPairsCounter:
+    def parse_file(self, file_path: str) -> PinyinCharPairsCounter:
        """解析文件，支持多种格式"""
        with open(file_path, 'rb') as f:
            data = f.read()
@ -130,6 +131,7 @@ class QueryEngine:
        self._id_to_info.clear()
        self._char_to_ids.clear()
        self._pinyin_to_ids.clear()
        self._char_pinyin_to_ids.clear()
        self._char_freq.clear()
        self._pinyin_freq.clear()
        self._char_pinyin_map.clear()
@ -161,6 +163,7 @@ class QueryEngine:
            # 字符-拼音映射
            self._char_pinyin_map[(char, pinyin)] = char_info.count
            self._char_pinyin_to_ids[(char, pinyin)] = char_info_id
        self._total_pairs = len(self._id_to_info)
        self._index_time = time.time() - start_time
@ -295,7 +298,45 @@ class QueryEngine:
            raise RuntimeError("数据未加载，请先调用load()方法")
        return self._char_pinyin_map.get((char, pinyin), 0)
-    
+
    def get_char_info_by_char_pinyin(self, char: str, pinyin: str) -> Optional[CharInfo]:
        """获取特定字符-拼音对对应的ID和频率 - O(1)时间复杂度
        Args:
            char: 汉字字符
            pinyin: 拼音字符串
        Returns:
            ID和频率
        """
        if not self._loaded:
            raise RuntimeError("数据未加载，请先调用load()方法")
        char_info_id = self._char_pinyin_to_ids.get((char, pinyin), None)
        return self.query_by_id(char_info_id)
    def batch_get_char_pinyin_info(self, pairs: List[Tuple[str, str]]) -> Dict[Tuple[str, str], CharInfo]:
        """批量获取汉字-拼音信息
        Args:
            pairs: 汉字-拼音列表
        Returns:
            字典，key为汉字-拼音对，value为CharInfo对象（不存在则为None）
        """
        if not self._loaded:
            raise RuntimeError("数据未加载，请先调用load()方法")
        result = {}
        for pair in pairs:
            char_info_id = self._char_pinyin_to_ids.get(pair)
            if char_info_id is not None:
                result[pair] = self._id_to_info.get(char_info_id)
            else:
                result[pair] = None
        return result
    def batch_query_by_ids(self, ids: List[int]) -> Dict[int, Optional[CharInfo]]:
        """
        批量ID查询 - O(n)时间复杂度
@ -311,10 +352,10 @@ class QueryEngine:
        results = {}
        for id_value in ids:
-            results[id_value] = self._id_to_info.get(id_value)
+            results[id_value] = self._id_to_info.get(id_value, None)
        return results
-    
+
    def batch_query_by_chars(self, chars: List[str], limit_per_char: int = 0) -> Dict[str, List[Tuple[int, str, int]]]:
        """
        批量字符查询
@ -408,6 +449,7 @@ class QueryEngine:
        self._char_freq.clear()
        self._pinyin_freq.clear()
        self._char_pinyin_map.clear()
        self._char_pinyin_to_ids.clear()
        self._loaded = False
        self._total_pairs = 0
        self._load_time = 0.0
--- a/test/test_query.py
+++ b/test/test_query.py
@ -0,0 +1,151 @@
 # test_query_engine.py
 import pytest
 import tempfile
 import os
 import json
 from suinput.query import QueryEngine
 from suinput.char_info import CharInfo, PinyinCharPairsCounter
 # 将测试数据保存为 JSON 文件
@pytest.fixture
 def json_file_path():
    yield "pinyin_char_statistics.json"
 # 测试 QueryEngine 的基本功能
 class TestQueryEngine:
    def test_load_from_json(self, json_file_path):
        """测试从 JSON 文件加载数据"""
        engine = QueryEngine()
        metadata = engine.load(json_file_path)
        assert engine.is_loaded() is True
        assert metadata["format"] == "json"
        assert metadata["pair_count"] == 20646
    def test_query_by_id(self, json_file_path):
        """测试通过 ID 查询字符信息"""
        engine = QueryEngine()
        engine.load(json_file_path)
        result = engine.query_by_id(8)
        assert result is not None
        assert result.char == "中"
        assert result.pinyin == "zhong"
        assert result.count == 73927282
        result = engine.query_by_id(100000)  # 不存在的 ID
        assert result is None
    def test_query_by_char(self, json_file_path):
        """测试通过字符查询拼音信息"""
        engine = QueryEngine()
        engine.load(json_file_path)
        results = engine.query_by_char("长")
        assert len(results) == 2
        assert results[0] == (159, "zhang", 15424264)
        assert results[1] == (414, "chang", 6663465)
        results_limited = engine.query_by_char("长", limit=1)
        assert len(results_limited) == 1
        assert results_limited[0] == (159, "zhang", 15424264)
        results_empty = engine.query_by_char("X")  # 不存在的字符
        assert results_empty == []
    def test_query_by_pinyin(self, json_file_path):
        """测试通过拼音查询字符信息"""
        engine = QueryEngine()
        engine.load(json_file_path)
        results = engine.query_by_pinyin("zhong")
        assert len(results) == 57
        assert results[0] == (8, "中", 73927282)
        results_empty = engine.query_by_pinyin("xxx")  # 不存在的拼音
        assert results_empty == []
    def test_get_char_frequency(self, json_file_path):
        """测试获取字符总频率"""
        engine = QueryEngine()
        engine.load(json_file_path)
        freq = engine.get_char_frequency("中")
        assert freq == 73927282
        freq_zero = engine.get_char_frequency("X")  # 不存在的字符
        assert freq_zero == 0
    def test_get_pinyin_frequency(self, json_file_path):
        """测试获取拼音总频率"""
        engine = QueryEngine()
        engine.load(json_file_path)
        freq = engine.get_pinyin_frequency("zhong")
        assert freq == 136246123
        freq_zero = engine.get_pinyin_frequency("xxx")  # 不存在的拼音
        assert freq_zero == 0
    def test_get_char_pinyin_count(self, json_file_path):
        """测试获取字符-拼音对的出现次数"""
        engine = QueryEngine()
        engine.load(json_file_path)
        count = engine.get_char_pinyin_count("中", "zhong")
        assert count == 73927282
        count_zero = engine.get_char_pinyin_count("中", "xxx")  # 不存在的拼音
        assert count_zero == 0
    def test_batch_query_by_ids(self, json_file_path):
        """测试批量 ID 查询"""
        engine = QueryEngine()
        engine.load(json_file_path)
        results = engine.batch_query_by_ids([8, 9, 10000000])
        assert len(results) == 3
        assert results[9].char == "为"
    def test_search_chars_by_prefix(self, json_file_path):
        """测试根据字符前缀搜索"""
        engine = QueryEngine()
        engine.load(json_file_path)
        results = engine.search_chars_by_prefix("中")
        assert len(results) == 1
        assert results[0] == ("中", 73927282)
        results_empty = engine.search_chars_by_prefix("X")  # 不存在的前缀
        assert results_empty == []
    def test_get_statistics(self, json_file_path):
        """测试获取统计信息"""
        engine = QueryEngine()
        engine.load(json_file_path)
        stats = engine.get_statistics()
        assert stats["status"] == "loaded"
        assert stats["total_pairs"] == 20646
        assert stats["total_characters"] == 18240
        assert stats["top_chars"][0] == ("的", 439524694)
    def test_clear(self, json_file_path):
        """测试清除数据"""
        engine = QueryEngine()
        engine.load(json_file_path)
        assert engine.is_loaded() is True
        engine.clear()
        assert engine.is_loaded() is False
        assert engine.get_statistics()["status"] == "not_loaded"
    def test_batch_get_char_pinyin_info(self, json_file_path):
        engine = QueryEngine()
        engine.load(json_file_path)
        assert engine.is_loaded() is True
        pairs = engine.batch_get_char_pinyin_info([("我", "wo"), ("你", "ni"), ("他", "ta")])
        assert pairs[("我", "wo")] == engine.get_char_info_by_char_pinyin("我", "wo")
        assert pairs[("你", "ni")] == engine.get_char_info_by_char_pinyin("你", "ni")
        assert pairs[("他", "ta")] == engine.get_char_info_by_char_pinyin("他", "ta")