chore: remove old pinyin character statistics files

songsenand 2026-04-02 00:44:16 +08:00
parent 9c8574cf11
commit 14c835dd72
7 changed files with 23068 additions and 144246 deletions

File diff suppressed because it is too large

View File

@@ -0,0 +1,30 @@
{
"add_cross_attention": false,
"attention_probs_dropout_prob": 0.1,
"bos_token_id": null,
"classifier_dropout": null,
"directionality": "bidi",
"eos_token_id": null,
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 512,
"initializer_range": 0.02,
"intermediate_size": 2048,
"is_decoder": false,
"layer_norm_eps": 1e-12,
"max_position_embeddings": 512,
"model_type": "bert",
"num_attention_heads": 8,
"num_hidden_layers": 6,
"pad_token_id": 0,
"pooler_fc_size": 768,
"pooler_num_attention_heads": 12,
"pooler_num_fc_layers": 3,
"pooler_size_per_head": 128,
"pooler_type": "first_token_transform",
"tie_word_embeddings": true,
"transformers_version": "5.1.0",
"type_vocab_size": 4,
"use_cache": true,
"vocab_size": 21128
}
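For reference, a minimal sketch of instantiating the backbone this config describes, assuming the JSON above is saved locally as config.json (the path is a placeholder, not shown in the diff):

from transformers import BertConfig, BertModel

# Hypothetical local path; the diff does not show where this config.json lives.
config = BertConfig.from_json_file("assets/model/config.json")
model = BertModel(config)  # randomly initialized: 6 layers, 8 heads, hidden size 512
print(config.vocab_size)  # 21128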

File diff suppressed because it is too large

View File

@@ -0,0 +1,14 @@
{
"backend": "tokenizers",
"cls_token": "[CLS]",
"do_lower_case": false,
"is_local": true,
"mask_token": "[MASK]",
"model_max_length": 1000000000000000019884624838656,
"pad_token": "[PAD]",
"sep_token": "[SEP]",
"strip_accents": null,
"tokenize_chinese_chars": true,
"tokenizer_class": "BertTokenizer",
"unk_token": "[UNK]"
}
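A minimal sketch of loading this tokenizer from packaged assets the same way the dataset change below does; "my_pkg" is a placeholder, and a transformers-style AutoTokenizer is assumed (the repo elsewhere imports it from modelscope, which exposes the same from_pretrained interface):

from importlib.resources import files
from pathlib import Path
from transformers import AutoTokenizer

# "my_pkg" is hypothetical; the dataset code resolves the same directory via
# files(__package__). This assumes the package is installed unzipped, so the
# traversable maps to a real filesystem path.
tok_dir = Path(files("my_pkg") / "assets" / "tokenizer")
tokenizer = AutoTokenizer.from_pretrained(tok_dir)
print(tokenizer.cls_token, tokenizer.sep_token)  # [CLS] [SEP]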

View File

@@ -1,6 +1,7 @@
import os
import random
from typing import Any, Dict, List, Optional, Tuple
from importlib.resources import files
from pathlib import Path
import numpy as np
import torch
@@ -11,10 +12,7 @@ from pypinyin import Style, lazy_pinyin
from pypinyin.contrib.tone_convert import to_initials
from torch.utils.data import DataLoader, IterableDataset
# Load the tokenizer and model
# model = AutoModel.from_pretrained('iic/nlp_structbert_backbone_lite_std')
# tokenizer = AutoTokenizer.from_pretrained('iic/nlp_structbert_backbone_lite_std')
from .query import QueryEngine
class PinyinInputDataset(IterableDataset):
@@ -22,12 +20,12 @@ class PinyinInputDataset(IterableDataset):
self,
data_path: str,
max_workers: int = -1,
tokenizer_name: str = "iic/nlp_structbert_backbone_lite_std",
max_length=128,
text_field: str = "text",
py_style_weight=(9, 2, 1),
shuffle_buffer_size: int = 5000,
):
self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
self.tokenizer = AutoTokenizer.from_pretrained(Path(files(__package__) / "assets" / "tokenizer"))
self.data_path = data_path
self.max_length = max_length
self.text_field = text_field
@@ -35,8 +33,14 @@ class PinyinInputDataset(IterableDataset):
self.max_workers = max_workers
self.py_style_weight = py_style_weight
@staticmethod
def smart_multi_segment_encode(texts, tokenizer, max_length=128):
self.query_engine = QueryEngine()
self.query_engine.load()
self.shuffle_buffer_size = shuffle_buffer_size
self.buffer = []
def smart_multi_segment_encode(self, texts):
"""
Smart multi-segment encoding
@@ -46,7 +50,7 @@
encoded_segments = []
for text in texts:
# Note: do not add special tokens here; they are handled uniformly later
encoded = tokenizer.encode(text, add_special_tokens=False)
encoded = self.tokenizer.encode(text, add_special_tokens=False)
encoded_segments.append(encoded)
# Step 2: build the full sequence
@@ -54,7 +58,7 @@
token_type_ids = []
# Add [CLS]
tokens.append(tokenizer.cls_token_id)
tokens.append(self.tokenizer.cls_token_id)
token_type_ids.append(0)  # [CLS] is typically type 0
# Add each segment
@@ -68,26 +72,26 @@
# Add [SEP] (optional after the last segment)
if seg_idx < len(encoded_segments) - 1:
tokens.append(tokenizer.sep_token_id)
tokens.append(self.tokenizer.sep_token_id)
token_type_ids.append(current_type)
else:
# Add [SEP] after the last segment as well
tokens.append(tokenizer.sep_token_id)
tokens.append(self.tokenizer.sep_token_id)
token_type_ids.append(current_type)
# Step 3: truncate and pad
if len(tokens) > max_length:
tokens = tokens[:max_length]
token_type_ids = token_type_ids[:max_length]
if len(tokens) > self.max_length:
tokens = tokens[:self.max_length]
token_type_ids = token_type_ids[:self.max_length]
else:
# Pad
padding_length = max_length - len(tokens)
tokens = tokens + [tokenizer.pad_token_id] * padding_length
padding_length = self.max_length - len(tokens)
tokens = tokens + [self.tokenizer.pad_token_id] * padding_length
token_type_ids = token_type_ids + [0] * padding_length  # padding positions get type 0
# Step 4: create the attention mask
attention_mask = [
1 if token != tokenizer.pad_token_id else 0 for token in tokens
1 if token != self.tokenizer.pad_token_id else 0 for token in tokens
]
return {
@@ -104,13 +108,15 @@
def get_mask_pinyin(self, text: str, pinyin_list: List[str]) -> Tuple[int, List[str]]:
mask_pinyin = []
for i in range(len(text)):
if text[i] == pinyin_list[i]:
if self.query_engine.is_chinese_char(text[i]):
return i - 1, mask_pinyin
else:
# random.choices (not random.choice) accepts weights; it returns a one-element list
py = random.choices(
(pinyin_list[i], to_initials(pinyin_list[i]), pinyin_list[i][0]),
weights=self.py_style_weight,
)[0]
if py == "":
py = pinyin_list[i][0]
mask_pinyin.append(py)
return len(text) - 1, mask_pinyin
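As a standalone sketch of the weighted style draw above: per character it keeps the full pinyin, its initial, or just the first letter, weighted by the default py_style_weight = (9, 2, 1):

import random
from pypinyin.contrib.tone_convert import to_initials

py = "zhong"
choice = random.choices((py, to_initials(py), py[0]), weights=(9, 2, 1))[0]
if choice == "":  # to_initials returns "" for zero-initial syllables such as "an"
    choice = py[0]
print(choice)  # one of "zhong", "zh", "z"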
@@ -135,14 +141,14 @@
pinyin_list = self.generate_pinyin(text)
for i in range(len(text)):
# Skip text[i] if it is not in the character library
# When i < 44, part1 is text[0:i]
# When i >= 44, part1 is text[i-44:i]
if text[i] == pinyin_list[i]:
# When i < 48, part1 is text[0:i]
# When i >= 48, part1 is text[i-48:i]
if self.query_engine.is_chinese_char(text[i]):
continue
if i < 44:
if i < 48:
part1 = text[0:i]
else:
part1 = text[i - 44 : i]
part1 = text[i - 48 : i]
# First draw a random pinyin_len in 1-8; it follows a Gaussian-like distribution peaking at 3
# Take the character at text[i + pinyin_len]; if it points past the end of the text, clamp to len(text)
# part2 is the masked pinyin for pinyin_list[i:i+pinyin_len]
@@ -152,6 +158,53 @@
)
py_end = min(i + pinyin_len, len(text))
# get_mask_pinyin returns (end_offset, pinyin_tokens); keep the token list as part2
_, part2 = self.get_mask_pinyin(text[i:py_end], pinyin_list[i:py_end])
# part3 is text; empty with high probability (0.85); otherwise the character at i+pinyin_len
# encoded = self.smart_multi_segment_encode([pinyin_text], self.tokenizer, self.max_length)
# part3 is text; empty with high probability (0.70)
# otherwise it is the character at i+pinyin_len plus the x characters after it
# x is an integer drawn uniformly from 1-16
part3 = ""
if random.random() > 0.7:
part3 = text[
i + pinyin_len : i
+ pinyin_len
+ np.random.choice(range(1, 17))
]
# part4 is text; empty with probability 0.50
# otherwise it consists of 1-5 contiguous substrings
# each substring is drawn by picking a random character in the text plus the x characters after it
# x is an integer drawn uniformly from 2-6
# the substrings in part4 are joined with |
part4 = ""
if random.random() > 0.5:
# Generate 1-5 contiguous substrings
num_strings = random.randint(1, 5)
string_list = []
for _ in range(num_strings):
# Randomly pick a start position
start_pos = random.randint(0, len(text) - 1)
# Randomly pick x (2-6)
x = random.randint(2, 6)
# Take the contiguous substring
end_pos = min(start_pos + x + 1, len(text))
string_list.append(text[start_pos:end_pos])
# Join all substrings with |
part4 = "|".join(string_list)
labels = [
self.query_engine.get_char_info_by_char_pinyin(
c, p
).id
for c, p in zip(text[i:py_end], pinyin_list[i:py_end])
]
encoded = self.smart_multi_segment_encode([part1, part2, part3, part4])
encoded["label"] = labels
batch_samples.append(encoded)
if len(batch_samples) >= self.shuffle_buffer_size:
indices = np.random.permutation(len(batch_samples))
self.buffer.extend([batch_samples[i] for i in indices])
batch_samples = []
yield from self.buffer
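A minimal usage sketch of the rewritten dataset under the constructor signature above; the corpus path and the batch key are assumptions, not taken from the diff:

from torch.utils.data import DataLoader

ds = PinyinInputDataset(data_path="data/corpus.jsonl")  # hypothetical corpus path
loader = DataLoader(ds, batch_size=32, num_workers=0)  # IterableDataset: no shuffle flag
for batch in loader:
    input_ids = batch["input_ids"]  # key assumed from smart_multi_segment_encode's output
    break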

View File

@@ -10,7 +10,7 @@ import torch.nn.functional as F
from loguru import logger
from modelscope import AutoTokenizer
from tqdm.autonotebook import tqdm
from tqdm.notebook import tqdm
from .components import AttentionPooling, Expert  # , ResidualBlock  # assumed to be implemented

uv.lock (3337 changed lines)

File diff suppressed because it is too large