chore: remove the old pinyin character statistics files

songsenand 2026-04-02 00:44:16 +08:00
parent 9c8574cf11
commit 14c835dd72
7 changed files with 23068 additions and 144246 deletions

File diff suppressed because it is too large

@@ -0,0 +1,30 @@
{
"add_cross_attention": false,
"attention_probs_dropout_prob": 0.1,
"bos_token_id": null,
"classifier_dropout": null,
"directionality": "bidi",
"eos_token_id": null,
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 512,
"initializer_range": 0.02,
"intermediate_size": 2048,
"is_decoder": false,
"layer_norm_eps": 1e-12,
"max_position_embeddings": 512,
"model_type": "bert",
"num_attention_heads": 8,
"num_hidden_layers": 6,
"pad_token_id": 0,
"pooler_fc_size": 768,
"pooler_num_attention_heads": 12,
"pooler_num_fc_layers": 3,
"pooler_size_per_head": 128,
"pooler_type": "first_token_transform",
"tie_word_embeddings": true,
"transformers_version": "5.1.0",
"type_vocab_size": 4,
"use_cache": true,
"vocab_size": 21128
}
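
For reference, a config like the one above loads directly with Hugging Face transformers. A minimal sketch, assuming the JSON is saved as config.json in a local directory (the diff view does not show file paths, so "./bert_config_dir" is hypothetical):

from transformers import BertConfig, BertModel

# Hypothetical path: the diff does not reveal where config.json lives.
config = BertConfig.from_pretrained("./bert_config_dir")
model = BertModel(config)  # randomly initialized 6-layer, 512-dim encoder
print(config.num_hidden_layers, config.hidden_size, config.vocab_size)  # 6 512 21128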

File diff suppressed because it is too large

@@ -0,0 +1,14 @@
{
"backend": "tokenizers",
"cls_token": "[CLS]",
"do_lower_case": false,
"is_local": true,
"mask_token": "[MASK]",
"model_max_length": 1000000000000000019884624838656,
"pad_token": "[PAD]",
"sep_token": "[SEP]",
"strip_accents": null,
"tokenize_chinese_chars": true,
"tokenizer_class": "BertTokenizer",
"unk_token": "[UNK]"
}
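
The enormous model_max_length is the sentinel transformers uses when no maximum length was recorded (int(1e30)), not a real cap. A minimal sketch of loading these assets, assuming vocab.txt sits in the same directory as this tokenizer_config.json (the dataset code below resolves that directory as <package>/assets/tokenizer):

from transformers import BertTokenizer

# Hypothetical local path; the committed code resolves it via importlib.resources.
tokenizer = BertTokenizer.from_pretrained("./assets/tokenizer")
enc = tokenizer("拼音输入", "pin yin")  # paired segments get token_type_ids 0 and 1
print(enc["input_ids"])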

@@ -1,6 +1,7 @@
-import os
 import random
 from typing import Any, Dict, List, Optional, Tuple
+from importlib.resources import files
+from pathlib import Path

 import numpy as np
 import torch
@@ -11,10 +12,7 @@ from pypinyin import Style, lazy_pinyin
 from pypinyin.contrib.tone_convert import to_initials
 from torch.utils.data import DataLoader, IterableDataset

-# Load the tokenizer and model
-# model = AutoModel.from_pretrained('iic/nlp_structbert_backbone_lite_std')
-# tokenizer = AutoTokenizer.from_pretrained('iic/nlp_structbert_backbone_lite_std')
+from .query import QueryEngine


 class PinyinInputDataset(IterableDataset):
@@ -22,12 +20,12 @@ class PinyinInputDataset(IterableDataset):
         self,
         data_path: str,
         max_workes: int = -1,
-        tokenizer_name: str = "iic/nlp_structbert_backbone_lite_std",
         max_length=128,
         text_field: str = "text",
         py_style_weight=(9, 2, 1),
+        shuffle_buffer_size: int = 5000,
     ):
-        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
+        self.tokenizer = AutoTokenizer.from_pretrained(Path(files(__package__) / "assets" / "tokenizer"))
         self.data_path = data_path
         self.max_length = max_length
         self.text_field = text_field
@@ -35,8 +33,14 @@ class PinyinInputDataset(IterableDataset):
         self.max_workers = max_workes
         self.py_style_weight = py_style_weight
+        self.query_engine = QueryEngine()
+        self.query_engine.load()
+        self.shuffle_buffer_size = shuffle_buffer_size
+        self.buffer = []

-    @staticmethod
-    def smart_multi_segment_encode(texts, tokenizer, max_length=128):
+    def smart_multi_segment_encode(self, texts):
         """
         Smart multi-segment encoding
@@ -46,7 +50,7 @@ class PinyinInputDataset(IterableDataset):
         encoded_segments = []
         for text in texts:
             # Note: do not add special tokens here; they are handled uniformly later
-            encoded = tokenizer.encode(text, add_special_tokens=False)
+            encoded = self.tokenizer.encode(text, add_special_tokens=False)
             encoded_segments.append(encoded)

         # Step 2: build the full sequence
@@ -54,7 +58,7 @@ class PinyinInputDataset(IterableDataset):
         token_type_ids = []

         # Add [CLS]
-        tokens.append(tokenizer.cls_token_id)
+        tokens.append(self.tokenizer.cls_token_id)
         token_type_ids.append(0)  # CLS is usually 0

         # Add each segment
@@ -68,26 +72,26 @@ class PinyinInputDataset(IterableDataset):
             # Add [SEP] (optional for the last segment)
             if seg_idx < len(encoded_segments) - 1:
-                tokens.append(tokenizer.sep_token_id)
+                tokens.append(self.tokenizer.sep_token_id)
                 token_type_ids.append(current_type)
             else:
                 # The last segment gets a [SEP] as well
-                tokens.append(tokenizer.sep_token_id)
+                tokens.append(self.tokenizer.sep_token_id)
                 token_type_ids.append(current_type)

         # Step 3: truncate and pad
-        if len(tokens) > max_length:
-            tokens = tokens[:max_length]
-            token_type_ids = token_type_ids[:max_length]
+        if len(tokens) > self.max_length:
+            tokens = tokens[:self.max_length]
+            token_type_ids = token_type_ids[:self.max_length]
         else:
             # Pad
-            padding_length = max_length - len(tokens)
-            tokens = tokens + [tokenizer.pad_token_id] * padding_length
+            padding_length = self.max_length - len(tokens)
+            tokens = tokens + [self.tokenizer.pad_token_id] * padding_length
             token_type_ids = token_type_ids + [0] * padding_length  # padded positions use 0

         # Step 4: create the attention mask
         attention_mask = [
-            1 if token != tokenizer.pad_token_id else 0 for token in tokens
+            1 if token != self.tokenizer.pad_token_id else 0 for token in tokens
         ]

         return {
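
The hunk ends mid-return, but the assembled layout is [CLS] seg1 [SEP] seg2 [SEP] ... [SEP] [PAD]..., with token_type_ids presumably incremented per segment (which would explain type_vocab_size: 4 in the config above for the four parts encoded below). A standalone sketch of that assembly, under those assumptions:

# Sketch only: mirrors the encoding scheme above, assuming current_type is the
# segment index; tok is any BertTokenizer-like instance.
def multi_segment_encode(tok, segments, max_length=128):
    tokens = [tok.cls_token_id]
    token_type_ids = [0]
    for seg_idx, seg in enumerate(segments):
        ids = tok.encode(seg, add_special_tokens=False)
        tokens += ids + [tok.sep_token_id]
        token_type_ids += [seg_idx] * (len(ids) + 1)
    # Truncate, then pad to exactly max_length.
    tokens = tokens[:max_length] + [tok.pad_token_id] * max(0, max_length - len(tokens))
    token_type_ids = token_type_ids[:max_length] + [0] * max(0, max_length - len(token_type_ids))
    attention_mask = [1 if t != tok.pad_token_id else 0 for t in tokens]
    return {"input_ids": tokens, "token_type_ids": token_type_ids, "attention_mask": attention_mask}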
@@ -104,13 +108,15 @@ class PinyinInputDataset(IterableDataset):
     def get_mask_pinyin(self, text: str, pinyin_list: List[str]) -> Tuple[int, List[str]]:
         mask_pinyin = []
         for i in range(len(text)):
-            if text[i] == pinyin_list[i]:
+            if self.query_engine.is_chinese_char(text[i]):
                 return i - 1, mask_pinyin
             else:
                 py = random.choices(
                     (pinyin_list[i], to_initials(pinyin_list[i]), pinyin_list[i][0]),
                     weights=self.py_style_weight,
                 )[0]
+                if py == "":
+                    py = pinyin_list[i][0]
                 mask_pinyin.append(py)
         return len(text) - 1, mask_pinyin
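
The masking step draws one of three granularities per syllable — full pinyin, initials only, or just the first letter — weighted by py_style_weight=(9, 2, 1). A self-contained sketch of that draw ("zhong" is example data; the empty-string fallback matches the new branch above, since zero-initial syllables like "an" make to_initials return ""):

import random
from pypinyin.contrib.tone_convert import to_initials

py_style_weight = (9, 2, 1)  # full syllable : initials : first letter
syllable = "zhong"           # example pinyin for 中

py = random.choices(
    (syllable, to_initials(syllable), syllable[0]),
    weights=py_style_weight,
)[0]
if py == "":  # zero-initial syllable: fall back to the first letter
    py = syllable[0]
print(py)  # "zhong" ~75%, "zh" ~17%, "z" ~8%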
@@ -135,14 +141,14 @@ class PinyinInputDataset(IterableDataset):
             pinyin_list = self.generate_pinyin(text)
             for i in range(len(text)):
                 # If text[i] is not in the character library, skip it
-                # When i < 44, part1 is text[0:i]
-                # When i >= 44, part1 is text[i-44:i]
-                if text[i] == pinyin_list[i]:
+                # When i < 48, part1 is text[0:i]
+                # When i >= 48, part1 is text[i-48:i]
+                if self.query_engine.is_chinese_char(text[i]):
                     continue
-                if i < 44:
+                if i < 48:
                     part1 = text[0:i]
                 else:
-                    part1 = text[i - 44 : i]
+                    part1 = text[i - 48 : i]
                 # First draw a random pinyin_len in 1-8; the distribution is Gaussian with 3 the most likely value
                 # Fetch the character at text[i + pinyin_len]; if it cannot be fetched, clamp pinyin_len to the end
                 # part2 has length x: take pinyin_list[i:i+pinyin_len] as part2
@@ -152,6 +158,53 @@ class PinyinInputDataset(IterableDataset):
                 )
                 py_end = min(i + pinyin_len, len(text))
                 part2 = self.get_mask_pinyin(text[i:py_end], pinyin_list[i:py_end])
-                # part3 is text; with high probability (0.85) it is empty; otherwise it is the character at i+pinyin_len
-                # encoded = self.smart_multi_segment_encode([pinyin_text], self.tokenizer, self.max_length)
+                # part3 is text; with high probability (0.70) it is empty;
+                # otherwise it is the character at i+pinyin_len plus the x characters after it,
+                # where x is a uniform random integer in 1-16
+                part3 = ""
+                if random.random() > 0.7:
+                    part3 = text[
+                        i + pinyin_len : i
+                        + pinyin_len
+                        + np.random.choice(range(1, 17))
+                    ]
+                # part4 is text; with probability 0.50 it is empty;
+                # otherwise it is 1-5 contiguous substrings.
+                # A substring is drawn by picking one character at random plus
+                # the x characters after it, where x is a uniform random integer in 2-6.
+                # The substrings are joined with |
+                part4 = ""
+                if random.random() > 0.5:
+                    # Generate 1-5 contiguous substrings
+                    num_strings = random.randint(1, 5)
+                    string_list = []
+                    for _ in range(num_strings):
+                        # Pick a random start position
+                        start_pos = random.randint(0, len(text) - 1)
+                        # Pick x in 2-6
+                        x = random.randint(2, 6)
+                        # Take the contiguous substring
+                        end_pos = min(start_pos + x + 1, len(text))
+                        string_list.append(text[start_pos:end_pos])
+                    # Join all substrings with |
+                    part4 = "|".join(string_list)
+                labels = [
+                    self.query_engine.get_char_info_by_char_pinyin(c, p).id
+                    for c, p in zip(text[i:py_end], pinyin_list[i:py_end])
+                ]
+                encoded = self.smart_multi_segment_encode([part1, part2, part3, part4])
+                encoded["label"] = labels
+                batch_samples.append(encoded)
+                if len(batch_samples) >= self.shuffle_buffer_size:
+                    indices = np.random.permutation(len(batch_samples))
+                    self.buffer.extend([batch_samples[i] for i in indices])
+                    batch_samples = []
+        yield from self.buffer
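
The new buffer logic implements approximate shuffling for an IterableDataset: samples accumulate until shuffle_buffer_size is reached, then are emitted in a random permutation. A generic sketch of the pattern (plain integers stand in for the encoded samples):

import numpy as np

def buffered_shuffle(stream, buffer_size=5000):
    # Collect items into a buffer; emit each full buffer in a random
    # permutation. buffer_size trades memory for shuffle quality.
    buf = []
    for item in stream:
        buf.append(item)
        if len(buf) >= buffer_size:
            for j in np.random.permutation(len(buf)):
                yield buf[j]
            buf = []
    # Flush the final partial buffer.
    for j in np.random.permutation(len(buf)):
        yield buf[j]

print(list(buffered_shuffle(range(10), buffer_size=4)))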

@@ -10,7 +10,7 @@ import torch.nn.functional as F
 from loguru import logger
 from modelscope import AutoTokenizer
-from tqdm.autonotebook import tqdm
+from tqdm.notebook import tqdm

 from .components import AttentionPooling, Expert  # , ResidualBlock  # assumed already implemented
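
AttentionPooling and Expert come from .components, which this diff does not show; the inline comment marks them as assumed already implemented. One plausible AttentionPooling sketch (an assumption for illustration, not the repo's actual code):

import torch
import torch.nn as nn

class AttentionPooling(nn.Module):
    # Masked attention pooling: score each position, softmax over valid
    # positions, return the weighted sum of hidden states.
    def __init__(self, hidden_size: int):
        super().__init__()
        self.score = nn.Linear(hidden_size, 1)

    def forward(self, hidden_states: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
        scores = self.score(hidden_states).squeeze(-1)          # (batch, seq)
        scores = scores.masked_fill(attention_mask == 0, -1e9)  # ignore padding
        weights = torch.softmax(scores, dim=-1).unsqueeze(-1)   # (batch, seq, 1)
        return (weights * hidden_states).sum(dim=1)             # (batch, hidden)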

uv.lock (3337 changed lines)

File diff suppressed because it is too large