chore: remove old pinyin character statistics files

songsenand 2026-04-02 00:44:16 +08:00
parent 9c8574cf11
commit 14c835dd72
7 changed files with 23068 additions and 144246 deletions

File diff suppressed because it is too large

View File

@@ -0,0 +1,30 @@
{
"add_cross_attention": false,
"attention_probs_dropout_prob": 0.1,
"bos_token_id": null,
"classifier_dropout": null,
"directionality": "bidi",
"eos_token_id": null,
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 512,
"initializer_range": 0.02,
"intermediate_size": 2048,
"is_decoder": false,
"layer_norm_eps": 1e-12,
"max_position_embeddings": 512,
"model_type": "bert",
"num_attention_heads": 8,
"num_hidden_layers": 6,
"pad_token_id": 0,
"pooler_fc_size": 768,
"pooler_num_attention_heads": 12,
"pooler_num_fc_layers": 3,
"pooler_size_per_head": 128,
"pooler_type": "first_token_transform",
"tie_word_embeddings": true,
"transformers_version": "5.1.0",
"type_vocab_size": 4,
"use_cache": true,
"vocab_size": 21128
}
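For reference, a minimal sketch of instantiating the backbone this config describes, assuming the JSON above is saved locally as config.json (the path is a placeholder, not shown in the diff):

from transformers import BertConfig, BertModel

# Hypothetical local path; the diff does not show where this config.json lives.
config = BertConfig.from_json_file("assets/model/config.json")
model = BertModel(config)  # randomly initialized: 6 layers, 8 heads, hidden size 512
print(config.vocab_size)  # 21128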

File diff suppressed because it is too large

View File

@@ -0,0 +1,14 @@
{
"backend": "tokenizers",
"cls_token": "[CLS]",
"do_lower_case": false,
"is_local": true,
"mask_token": "[MASK]",
"model_max_length": 1000000000000000019884624838656,
"pad_token": "[PAD]",
"sep_token": "[SEP]",
"strip_accents": null,
"tokenize_chinese_chars": true,
"tokenizer_class": "BertTokenizer",
"unk_token": "[UNK]"
}
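A minimal sketch of loading this tokenizer from packaged assets the same way the dataset change below does; "my_pkg" is a placeholder, and a transformers-style AutoTokenizer is assumed (the repo elsewhere imports it from modelscope, which exposes the same from_pretrained interface):

from importlib.resources import files
from pathlib import Path
from transformers import AutoTokenizer

# "my_pkg" is hypothetical; the dataset code resolves the same directory via
# files(__package__). This assumes the package is installed unzipped, so the
# traversable maps to a real filesystem path.
tok_dir = Path(files("my_pkg") / "assets" / "tokenizer")
tokenizer = AutoTokenizer.from_pretrained(tok_dir)
print(tokenizer.cls_token, tokenizer.sep_token)  # [CLS] [SEP]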

View File

@@ -1,6 +1,7 @@
import os
import random
from typing import Any, Dict, List, Optional, Tuple
from importlib.resources import files
from pathlib import Path
import numpy as np
import torch
@@ -11,10 +12,7 @@ from pypinyin import Style, lazy_pinyin
from pypinyin.contrib.tone_convert import to_initials
from torch.utils.data import DataLoader, IterableDataset
# Load the tokenizer and model
# model = AutoModel.from_pretrained('iic/nlp_structbert_backbone_lite_std')
# tokenizer = AutoTokenizer.from_pretrained('iic/nlp_structbert_backbone_lite_std')
from .query import QueryEngine
class PinyinInputDataset(IterableDataset):
@@ -22,12 +20,12 @@ class PinyinInputDataset(IterableDataset):
self,
data_path: str,
max_workers: int = -1,
tokenizer_name: str = "iic/nlp_structbert_backbone_lite_std",
max_length=128,
text_field: str = "text",
py_style_weight=(9, 2, 1),
shuffle_buffer_size: int = 5000,
):
self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
self.tokenizer = AutoTokenizer.from_pretrained(Path(files(__package__) / "assets" / "tokenizer"))
self.data_path = data_path
self.max_length = max_length
self.text_field = text_field
@@ -35,8 +33,14 @@ class PinyinInputDataset(IterableDataset):
self.max_workers = max_workers
self.py_style_weight = py_style_weight
@staticmethod
def smart_multi_segment_encode(texts, tokenizer, max_length=128):
self.query_engine = QueryEngine()
self.query_engine.load()
self.shuffle_buffer_size = shuffle_buffer_size
self.buffer = []
def smart_multi_segment_encode(self, texts):
"""
Smart multi-segment encoding
@@ -46,7 +50,7 @@
encoded_segments = []
for text in texts:
# Note: do not add special tokens here; they are handled uniformly later
encoded = tokenizer.encode(text, add_special_tokens=False)
encoded = self.tokenizer.encode(text, add_special_tokens=False)
encoded_segments.append(encoded)
# Step 2: build the full sequence
@@ -54,7 +58,7 @@
token_type_ids = []
# Add [CLS]
tokens.append(tokenizer.cls_token_id)
tokens.append(self.tokenizer.cls_token_id)
token_type_ids.append(0)  # [CLS] is typically type 0
# Add each segment
@@ -68,26 +72,26 @@
# Add [SEP] (optional after the last segment)
if seg_idx < len(encoded_segments) - 1:
tokens.append(tokenizer.sep_token_id)
tokens.append(self.tokenizer.sep_token_id)
token_type_ids.append(current_type)
else:
# Add [SEP] after the last segment as well
tokens.append(tokenizer.sep_token_id)
tokens.append(self.tokenizer.sep_token_id)
token_type_ids.append(current_type)
# Step 3: truncate and pad
if len(tokens) > max_length:
tokens = tokens[:max_length]
token_type_ids = token_type_ids[:max_length]
if len(tokens) > self.max_length:
tokens = tokens[:self.max_length]
token_type_ids = token_type_ids[:self.max_length]
else:
# Pad
padding_length = max_length - len(tokens)
tokens = tokens + [tokenizer.pad_token_id] * padding_length
padding_length = self.max_length - len(tokens)
tokens = tokens + [self.tokenizer.pad_token_id] * padding_length
token_type_ids = token_type_ids + [0] * padding_length  # padding positions get type 0
# Step 4: create the attention mask
attention_mask = [
1 if token != tokenizer.pad_token_id else 0 for token in tokens
1 if token != self.tokenizer.pad_token_id else 0 for token in tokens
]
return {
@@ -104,13 +108,15 @@
def get_mask_pinyin(self, text: str, pinyin_list: List[str]) -> Tuple[int, List[str]]:
mask_pinyin = []
for i in range(len(text)):
if text[i] == pinyin_list[i]:
if self.query_engine.is_chinese_char(text[i]):
return i - 1, mask_pinyin
else:
# random.choices (not random.choice) accepts weights; it returns a one-element list
py = random.choices(
(pinyin_list[i], to_initials(pinyin_list[i]), pinyin_list[i][0]),
weights=self.py_style_weight,
)[0]
if py == "":
py = pinyin_list[i][0]
mask_pinyin.append(py)
return len(text) - 1, mask_pinyin
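As a standalone sketch of the weighted style draw above: per character it keeps the full pinyin, its initial, or just the first letter, weighted by the default py_style_weight = (9, 2, 1):

import random
from pypinyin.contrib.tone_convert import to_initials

py = "zhong"
choice = random.choices((py, to_initials(py), py[0]), weights=(9, 2, 1))[0]
if choice == "":  # to_initials returns "" for zero-initial syllables such as "an"
    choice = py[0]
print(choice)  # one of "zhong", "zh", "z"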
@@ -135,14 +141,14 @@
pinyin_list = self.generate_pinyin(text)
for i in range(len(text)):
# Skip text[i] if it is not in the character library
# When i < 44, part1 is text[0:i]
# When i >= 44, part1 is text[i-44:i]
if text[i] == pinyin_list[i]:
# When i < 48, part1 is text[0:i]
# When i >= 48, part1 is text[i-48:i]
if self.query_engine.is_chinese_char(text[i]):
continue
if i < 44:
if i < 48:
part1 = text[0:i]
else:
part1 = text[i - 44 : i]
part1 = text[i - 48 : i]
# First draw a random pinyin_len in 1-8; it follows a Gaussian-like distribution peaking at 3
# Take the character at text[i + pinyin_len]; if it points past the end of the text, clamp to len(text)
# part2 is the masked pinyin for pinyin_list[i:i+pinyin_len]
@@ -152,6 +158,53 @@
)
py_end = min(i + pinyin_len, len(text))
# get_mask_pinyin returns (end_offset, pinyin_tokens); keep the token list as part2
_, part2 = self.get_mask_pinyin(text[i:py_end], pinyin_list[i:py_end])
# part3 is text; empty with high probability (0.85); otherwise the character at i+pinyin_len
# encoded = self.smart_multi_segment_encode([pinyin_text], self.tokenizer, self.max_length)
# part3 is text; empty with high probability (0.70)
# otherwise it is the character at i+pinyin_len plus the x characters after it
# x is an integer drawn uniformly from 1-16
part3 = ""
if random.random() > 0.7:
part3 = text[
i + pinyin_len : i
+ pinyin_len
+ np.random.choice(range(1, 17))
]
# part4 is text; empty with probability 0.50
# otherwise it consists of 1-5 contiguous substrings
# each substring is drawn by picking a random character in the text plus the x characters after it
# x is an integer drawn uniformly from 2-6
# the substrings in part4 are joined with |
part4 = ""
if random.random() > 0.5:
# Generate 1-5 contiguous substrings
num_strings = random.randint(1, 5)
string_list = []
for _ in range(num_strings):
# Randomly pick a start position
start_pos = random.randint(0, len(text) - 1)
# Randomly pick x (2-6)
x = random.randint(2, 6)
# Take the contiguous substring
end_pos = min(start_pos + x + 1, len(text))
string_list.append(text[start_pos:end_pos])
# Join all substrings with |
part4 = "|".join(string_list)
labels = [
self.query_engine.get_char_info_by_char_pinyin(
c, p
).id
for c, p in zip(text[i:py_end], pinyin_list[i:py_end])
]
encoded = self.smart_multi_segment_encode([part1, part2, part3, part4])
encoded["label"] = labels
batch_samples.append(encoded)
if len(batch_samples) >= self.shuffle_buffer_size:
indices = np.random.permutation(len(batch_samples))
self.buffer.extend([batch_samples[i] for i in indices])
batch_samples = []
yield from self.buffer
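A minimal usage sketch of the rewritten dataset under the constructor signature above; the corpus path and the batch key are assumptions, not taken from the diff:

from torch.utils.data import DataLoader

ds = PinyinInputDataset(data_path="data/corpus.jsonl")  # hypothetical corpus path
loader = DataLoader(ds, batch_size=32, num_workers=0)  # IterableDataset: no shuffle flag
for batch in loader:
    input_ids = batch["input_ids"]  # key assumed from smart_multi_segment_encode's output
    break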

View File

@@ -10,7 +10,7 @@ import torch.nn.functional as F
from loguru import logger
from modelscope import AutoTokenizer
from tqdm.autonotebook import tqdm
from tqdm.notebook import tqdm
from .components import AttentionPooling, Expert  # , ResidualBlock  # assumed to be implemented

uv.lock (3337 changed lines)

File diff suppressed because it is too large