重构代码结构并优化拼音字符查询逻辑

重构：将 suinput 模块文件移至 tmp_utils 目录
2026-02-11 12:38:48 +08:00 · 2026-02-11 10:13:06 +08:00
7 changed files with 186 additions and 149 deletions
--- a/src/suinput/data/pinyin_char_statistics.json
+++ b/src/suinput/data/pinyin_char_statistics.json
--- a/src/suinput/data/pinyin_group.json
+++ b/src/suinput/data/pinyin_group.json
--- a/src/suinput/dataset.py
+++ b/src/suinput/dataset.py
@ -1,8 +1,9 @@
 import random
 from typing import Any, Dict, List, Tuple
 import os
 import json
 import os
 import random
 from importlib.resources import files
 from pathlib import Path
 from typing import Any, Dict, List, Tuple
 import numpy as np
 import torch
@ -12,9 +13,9 @@ from modelscope import AutoTokenizer
 from pypinyin import lazy_pinyin
 from torch.utils.data import DataLoader, IterableDataset
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
 class PinyinInputDataset(IterableDataset):
    """
    拼音输入法模拟数据集
@ -46,7 +47,7 @@ class PinyinInputDataset(IterableDataset):
        repeat_end_freq: int = 10000,  # 开始重复的阈值
        max_drop_prob: float = 0.8,  # 最大丢弃概率
        max_repeat_expect: float = 50.0,  # 最大重复期望
-        py_group_json_file: Path = Path(__file__).parent.parent.parent / "py_group.json",
+        py_group_json_file: Optional[Dict[str, int]] = None,
    ):
        """
        初始化数据集
@ -97,9 +98,32 @@ class PinyinInputDataset(IterableDataset):
        # 加载数据集
        self.dataset = load_dataset(data_dir, split="train", streaming=True)
-        with open(py_group_json_file, "r") as f:
+        # 加载拼音分组
-            self.py_groups = json.load(f)
+        self.pg_groups = {
-
+            "y": 0,
            "k": 0,
            "e": 0,
            "l": 1,
            "w": 1,
            "f": 1,
            "q": 2,
            "a": 2,
            "s": 2,
            "x": 3,
            "b": 3,
            "r": 3,
            "o": 4,
            "m": 4,
            "z": 4,
            "g": 5,
            "n": 5,
            "c": 5,
            "t": 6,
            "p": 6,
            "d": 6,
            "j": 7,
            "h": 7,
        }
    def get_next_chinese_chars(
        self,
@ -391,7 +415,7 @@ class PinyinInputDataset(IterableDataset):
            if not char_info:
                continue
-            logger.info(f'获取字符信息: {char_info}')
+            logger.info(f"获取字符信息: {char_info}")
            # 削峰填谷调整
            adjust_factor = self.adjust_frequency(char_info["freq"])
            if adjust_factor <= 0:
@ -402,8 +426,6 @@ class PinyinInputDataset(IterableDataset):
            # 拼音处理
            processed_pinyin = self.process_pinyin_sequence(next_pinyins)
            if not processed_pinyin:
                continue
            # Tokenize
            hint = self.tokenizer(
@ -423,6 +445,9 @@ class PinyinInputDataset(IterableDataset):
                "char_id": torch.tensor([char_info["id"]]),
                "char": char,
                "freq": char_info["freq"],
                "pg": torch.tensor(
                    self.pg_groups[processed_pinyin[0]] if processed_pinyin else 8
                ),
            }
            # 根据调整因子重复样本
@ -470,7 +495,8 @@ class PinyinInputDataset(IterableDataset):
            pinyin_list = lazy_pinyin(text, errors=lambda x: [c for c in x])
            # 批量收集需要查询的字符信息
            chinese_positions = [
-                i for i, char in enumerate(text) 
+                i
                for i, char in enumerate(text)
                if self.query_engine.is_chinese_char(char)
            ]
@ -518,7 +544,6 @@ class PinyinInputDataset(IterableDataset):
                )
                yield from self._shuffle_and_yield(batch_samples)
    def __len__(self):
        """
        由于是流式数据集，无法预先知道长度
@ -591,7 +616,7 @@ if __name__ == "__main__":
    # 初始化查询引擎
    query_engine = QueryEngine()
-    query_engine.load("./pinyin_char_pairs_info.json")
+    query_engine.load()
    # 创建数据集
    dataset = PinyinInputDataset(
@ -645,4 +670,3 @@ if __name__ == "__main__":
            """
    except StopIteration:
        print("数据集为空")
--- a/src/suinput/query.py
+++ b/src/suinput/query.py
@ -1,11 +1,14 @@
 # file name: query_engine.py
 import json
 import pickle
 import msgpack
 import gzip
-from typing import Dict, List, Optional, Tuple, Any
+import json
 import time
 import os
 import pickle
 import time
 from importlib.resources import files
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Tuple, Union
 import msgpack
 from .char_info import CharInfo, PinyinCharPairsCounter
@ -45,12 +48,17 @@ class QueryEngine:
        self.min_count = min_count
-    def load(self, file_path: str) -> Dict[str, Any]:
+    def load(
        self,
        file_path: Union[str, Path] = (
            files(__package__) / "data" / "pinyin_char_statistics.json"
        ),
    ) -> Dict[str, Any]:
        """
        加载统计结果文件
        Args:
-            file_path: 文件路径，支持msgpack/pickle/json格式，自动检测压缩
+            file_path: 文件路径，文件支持msgpack/pickle/json格式，自动检测压缩
        Returns:
            元数据字典
@ -75,9 +83,9 @@ class QueryEngine:
        return self._counter_data.metadata
-    def parse_file(self, file_path: str) -> PinyinCharPairsCounter:
+    def parse_file(self, file_path: Union[str, Path]) -> PinyinCharPairsCounter:
        """解析文件，支持多种格式"""
-        with open(file_path, 'rb') as f:
+        with open(file_path, "rb") as f:
            data = f.read()
        # 尝试解压
@ -88,9 +96,9 @@ class QueryEngine:
        # 尝试不同格式
        for parser_name, parser in [
-            ('msgpack', self._parse_msgpack),
+            ("msgpack", self._parse_msgpack),
-            ('pickle', self._parse_pickle),
+            ("pickle", self._parse_pickle),
-            ('json', self._parse_json)
+            ("json", self._parse_json),
        ]:
            try:
                return parser(data)
@ -110,7 +118,7 @@ class QueryEngine:
    def _parse_json(self, data: bytes) -> PinyinCharPairsCounter:
        """解析json格式"""
-        data_str = data.decode('utf-8')
+        data_str = data.decode("utf-8")
        data_dict = json.loads(data_str)
        return self._dict_to_counter(data_dict)
@ -118,10 +126,10 @@ class QueryEngine:
        """字典转PinyinCharPairsCounter"""
        # 转换CharInfo字典
        pairs_dict = {}
-        if 'pairs' in data_dict and data_dict['pairs']:
+        if "pairs" in data_dict and data_dict["pairs"]:
-            for id_str, info_dict in data_dict['pairs'].items():
+            for id_str, info_dict in data_dict["pairs"].items():
                pairs_dict[int(id_str)] = CharInfo(**info_dict)
-            data_dict['pairs'] = pairs_dict
+            data_dict["pairs"] = pairs_dict
        return PinyinCharPairsCounter(**data_dict)
@ -222,7 +230,9 @@ class QueryEngine:
        return results
-    def query_by_pinyin(self, pinyin: str, limit: int = 0) -> List[Tuple[int, str, int]]:
+    def query_by_pinyin(
        self, pinyin: str, limit: int = 0
    ) -> List[Tuple[int, str, int]]:
        """
        通过拼音查询字符信息 - O(1) + O(k)时间复杂度
@ -303,7 +313,9 @@ class QueryEngine:
        return self._char_pinyin_map.get((char, pinyin), 0)
-    def get_char_info_by_char_pinyin(self, char: str, pinyin: str) -> Optional[CharInfo]:
+    def get_char_info_by_char_pinyin(
        self, char: str, pinyin: str
    ) -> Optional[CharInfo]:
        """获取特定字符-拼音对对应的ID和频率 - O(1)时间复杂度
        Args:
@ -319,7 +331,9 @@ class QueryEngine:
        char_info_id = self._char_pinyin_to_ids.get((char, pinyin), None)
        return self.query_by_id(char_info_id)
-    def batch_get_char_pinyin_info(self, pairs: List[Tuple[str, str]]) -> Dict[Tuple[str, str], CharInfo]:
+    def batch_get_char_pinyin_info(
        self, pairs: List[Tuple[str, str]]
    ) -> Dict[Tuple[str, str], CharInfo]:
        """批量获取汉字-拼音信息
        Args:
@ -340,7 +354,6 @@ class QueryEngine:
                result[pair] = None
        return result
    def batch_query_by_ids(self, ids: List[int]) -> Dict[int, Optional[CharInfo]]:
        """
        批量ID查询 - O(n)时间复杂度
@ -360,7 +373,9 @@ class QueryEngine:
        return results
-    def batch_query_by_chars(self, chars: List[str], limit_per_char: int = 0) -> Dict[str, List[Tuple[int, str, int]]]:
+    def batch_query_by_chars(
        self, chars: List[str], limit_per_char: int = 0
    ) -> Dict[str, List[Tuple[int, str, int]]]:
        """
        批量字符查询
@ -380,7 +395,9 @@ class QueryEngine:
        return results
-    def search_chars_by_prefix(self, prefix: str, limit: int = 20) -> List[Tuple[str, int]]:
+    def search_chars_by_prefix(
        self, prefix: str, limit: int = 20
    ) -> List[Tuple[str, int]]:
        """
        根据字符前缀搜索 - O(n)时间复杂度，n为字符总数
@ -409,7 +426,7 @@ class QueryEngine:
        判断是否是汉字
        """
        if not self.is_loaded():
-            raise ValueError('请先调用 load() 方法加载数据')
+            raise ValueError("请先调用 load() 方法加载数据")
        return char in self._char_to_ids
    def get_statistics(self) -> Dict[str, Any]:
@ -422,16 +439,12 @@ class QueryEngine:
        if not self._loaded:
            return {"status": "not_loaded"}
-        top_chars = sorted(
+        top_chars = sorted(self._char_freq.items(), key=lambda x: x[1], reverse=True)[
-            self._char_freq.items(),
+            :10
-            key=lambda x: x[1],
+        ]
            reverse=True
        )[:10]
        top_pinyins = sorted(
-            self._pinyin_freq.items(),
+            self._pinyin_freq.items(), key=lambda x: x[1], reverse=True
            key=lambda x: x[1],
            reverse=True
        )[:10]
        return {
@ -445,7 +458,7 @@ class QueryEngine:
            "index_time_seconds": self._index_time,
            "top_chars": top_chars,
            "top_pinyins": top_pinyins,
-            "metadata": self._counter_data.metadata
+            "metadata": self._counter_data.metadata,
        }
    def is_loaded(self) -> bool:
--- a/src/tmp_utils/main.py
+++ b/src/tmp_utils/main.py
--- a/src/tmp_utils/counter.py
+++ b/src/tmp_utils/counter.py
--- a/src/tmp_utils/group.py
+++ b/src/tmp_utils/group.py
Author	SHA1	Message	Date
songsenand	5b1c6fcb2b	重构代码结构并优化拼音字符查询逻辑	2026-02-11 12:38:48 +08:00
songsenand	1cbb0b07c4	重构：将 suinput 模块文件移至 tmp_utils 目录	2026-02-11 10:13:06 +08:00