chore: remove the old pinyin character statistics files

songsenand 2026-04-02 00:44:16 +08:00
parent 9c8574cf11
commit 14c835dd72
7 changed files with 23068 additions and 144246 deletions

File diff suppressed because it is too large

@@ -0,0 +1,30 @@
{
"add_cross_attention": false,
"attention_probs_dropout_prob": 0.1,
"bos_token_id": null,
"classifier_dropout": null,
"directionality": "bidi",
"eos_token_id": null,
"hidden_act": "gelu",
"hidden_dropout_prob": 0.1,
"hidden_size": 512,
"initializer_range": 0.02,
"intermediate_size": 2048,
"is_decoder": false,
"layer_norm_eps": 1e-12,
"max_position_embeddings": 512,
"model_type": "bert",
"num_attention_heads": 8,
"num_hidden_layers": 6,
"pad_token_id": 0,
"pooler_fc_size": 768,
"pooler_num_attention_heads": 12,
"pooler_num_fc_layers": 3,
"pooler_size_per_head": 128,
"pooler_type": "first_token_transform",
"tie_word_embeddings": true,
"transformers_version": "5.1.0",
"type_vocab_size": 4,
"use_cache": true,
"vocab_size": 21128
}
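
For reference, a config like the one above loads directly with Hugging Face transformers. A minimal sketch, assuming the JSON is saved as config.json in a local directory (the diff view does not show file paths, so "./bert_config_dir" is hypothetical):

from transformers import BertConfig, BertModel

# Hypothetical path: the diff does not reveal where config.json lives.
config = BertConfig.from_pretrained("./bert_config_dir")
model = BertModel(config)  # randomly initialized 6-layer, 512-dim encoder
print(config.num_hidden_layers, config.hidden_size, config.vocab_size)  # 6 512 21128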

File diff suppressed because it is too large

@@ -0,0 +1,14 @@
{
"backend": "tokenizers",
"cls_token": "[CLS]",
"do_lower_case": false,
"is_local": true,
"mask_token": "[MASK]",
"model_max_length": 1000000000000000019884624838656,
"pad_token": "[PAD]",
"sep_token": "[SEP]",
"strip_accents": null,
"tokenize_chinese_chars": true,
"tokenizer_class": "BertTokenizer",
"unk_token": "[UNK]"
}
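
The enormous model_max_length is the sentinel transformers uses when no maximum length was recorded (int(1e30)), not a real cap. A minimal sketch of loading these assets, assuming vocab.txt sits in the same directory as this tokenizer_config.json (the dataset code below resolves that directory as <package>/assets/tokenizer):

from transformers import BertTokenizer

# Hypothetical local path; the committed code resolves it via importlib.resources.
tokenizer = BertTokenizer.from_pretrained("./assets/tokenizer")
enc = tokenizer("拼音输入", "pin yin")  # paired segments get token_type_ids 0 and 1
print(enc["input_ids"])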

@@ -1,6 +1,7 @@
-import os
 import random
 from typing import Any, Dict, List, Optional, Tuple
+from importlib.resources import files
+from pathlib import Path

 import numpy as np
 import torch
@@ -11,10 +12,7 @@ from pypinyin import Style, lazy_pinyin
 from pypinyin.contrib.tone_convert import to_initials
 from torch.utils.data import DataLoader, IterableDataset

-# Load the tokenizer and model
-# model = AutoModel.from_pretrained('iic/nlp_structbert_backbone_lite_std')
-# tokenizer = AutoTokenizer.from_pretrained('iic/nlp_structbert_backbone_lite_std')
+from .query import QueryEngine


 class PinyinInputDataset(IterableDataset):
@@ -22,12 +20,12 @@ class PinyinInputDataset(IterableDataset):
         self,
         data_path: str,
         max_workes: int = -1,
-        tokenizer_name: str = "iic/nlp_structbert_backbone_lite_std",
         max_length=128,
         text_field: str = "text",
         py_style_weight=(9, 2, 1),
+        shuffle_buffer_size: int = 5000,
     ):
-        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
+        self.tokenizer = AutoTokenizer.from_pretrained(Path(files(__package__) / "assets" / "tokenizer"))
         self.data_path = data_path
         self.max_length = max_length
         self.text_field = text_field
@@ -35,8 +33,14 @@ class PinyinInputDataset(IterableDataset):
         self.max_workers = max_workes
         self.py_style_weight = py_style_weight
+        self.query_engine = QueryEngine()
+        self.query_engine.load()
+        self.shuffle_buffer_size = shuffle_buffer_size
+        self.buffer = []

-    @staticmethod
-    def smart_multi_segment_encode(texts, tokenizer, max_length=128):
+    def smart_multi_segment_encode(self, texts):
         """
         Smart multi-segment encoding
@@ -46,7 +50,7 @@ class PinyinInputDataset(IterableDataset):
         encoded_segments = []
         for text in texts:
             # Note: do not add special tokens here; they are handled uniformly later
-            encoded = tokenizer.encode(text, add_special_tokens=False)
+            encoded = self.tokenizer.encode(text, add_special_tokens=False)
             encoded_segments.append(encoded)

         # Step 2: build the full sequence
@@ -54,7 +58,7 @@ class PinyinInputDataset(IterableDataset):
         token_type_ids = []

         # Add [CLS]
-        tokens.append(tokenizer.cls_token_id)
+        tokens.append(self.tokenizer.cls_token_id)
         token_type_ids.append(0)  # CLS is usually 0

         # Add each segment
@@ -68,26 +72,26 @@ class PinyinInputDataset(IterableDataset):
             # Add [SEP] (optional for the last segment)
             if seg_idx < len(encoded_segments) - 1:
-                tokens.append(tokenizer.sep_token_id)
+                tokens.append(self.tokenizer.sep_token_id)
                 token_type_ids.append(current_type)
             else:
                 # The last segment gets a [SEP] as well
-                tokens.append(tokenizer.sep_token_id)
+                tokens.append(self.tokenizer.sep_token_id)
                 token_type_ids.append(current_type)

         # Step 3: truncate and pad
-        if len(tokens) > max_length:
-            tokens = tokens[:max_length]
-            token_type_ids = token_type_ids[:max_length]
+        if len(tokens) > self.max_length:
+            tokens = tokens[:self.max_length]
+            token_type_ids = token_type_ids[:self.max_length]
         else:
             # Pad
-            padding_length = max_length - len(tokens)
-            tokens = tokens + [tokenizer.pad_token_id] * padding_length
+            padding_length = self.max_length - len(tokens)
+            tokens = tokens + [self.tokenizer.pad_token_id] * padding_length
             token_type_ids = token_type_ids + [0] * padding_length  # padded positions use 0

         # Step 4: create the attention mask
         attention_mask = [
-            1 if token != tokenizer.pad_token_id else 0 for token in tokens
+            1 if token != self.tokenizer.pad_token_id else 0 for token in tokens
         ]

         return {
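
The hunk ends mid-return, but the assembled layout is [CLS] seg1 [SEP] seg2 [SEP] ... [SEP] [PAD]..., with token_type_ids presumably incremented per segment (which would explain type_vocab_size: 4 in the config above for the four parts encoded below). A standalone sketch of that assembly, under those assumptions:

# Sketch only: mirrors the encoding scheme above, assuming current_type is the
# segment index; tok is any BertTokenizer-like instance.
def multi_segment_encode(tok, segments, max_length=128):
    tokens = [tok.cls_token_id]
    token_type_ids = [0]
    for seg_idx, seg in enumerate(segments):
        ids = tok.encode(seg, add_special_tokens=False)
        tokens += ids + [tok.sep_token_id]
        token_type_ids += [seg_idx] * (len(ids) + 1)
    # Truncate, then pad to exactly max_length.
    tokens = tokens[:max_length] + [tok.pad_token_id] * max(0, max_length - len(tokens))
    token_type_ids = token_type_ids[:max_length] + [0] * max(0, max_length - len(token_type_ids))
    attention_mask = [1 if t != tok.pad_token_id else 0 for t in tokens]
    return {"input_ids": tokens, "token_type_ids": token_type_ids, "attention_mask": attention_mask}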
@@ -104,13 +108,15 @@ class PinyinInputDataset(IterableDataset):
     def get_mask_pinyin(self, text: str, pinyin_list: List[str]) -> Tuple[int, List[str]]:
         mask_pinyin = []
         for i in range(len(text)):
-            if text[i] == pinyin_list[i]:
+            if self.query_engine.is_chinese_char(text[i]):
                 return i - 1, mask_pinyin
             else:
                 py = random.choices(
                     (pinyin_list[i], to_initials(pinyin_list[i]), pinyin_list[i][0]),
                     weights=self.py_style_weight,
                 )[0]
+                if py == "":
+                    py = pinyin_list[i][0]
                 mask_pinyin.append(py)
         return len(text) - 1, mask_pinyin
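
The masking step draws one of three granularities per syllable — full pinyin, initials only, or just the first letter — weighted by py_style_weight=(9, 2, 1). A self-contained sketch of that draw ("zhong" is example data; the empty-string fallback matches the new branch above, since zero-initial syllables like "an" make to_initials return ""):

import random
from pypinyin.contrib.tone_convert import to_initials

py_style_weight = (9, 2, 1)  # full syllable : initials : first letter
syllable = "zhong"           # example pinyin for 中

py = random.choices(
    (syllable, to_initials(syllable), syllable[0]),
    weights=py_style_weight,
)[0]
if py == "":  # zero-initial syllable: fall back to the first letter
    py = syllable[0]
print(py)  # "zhong" ~75%, "zh" ~17%, "z" ~8%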
@@ -135,14 +141,14 @@ class PinyinInputDataset(IterableDataset):
             pinyin_list = self.generate_pinyin(text)
             for i in range(len(text)):
                 # If text[i] is not in the character library, skip it
-                # When i < 44, part1 is text[0:i]
-                # When i >= 44, part1 is text[i-44:i]
-                if text[i] == pinyin_list[i]:
+                # When i < 48, part1 is text[0:i]
+                # When i >= 48, part1 is text[i-48:i]
+                if self.query_engine.is_chinese_char(text[i]):
                     continue
-                if i < 44:
+                if i < 48:
                     part1 = text[0:i]
                 else:
-                    part1 = text[i - 44 : i]
+                    part1 = text[i - 48 : i]
                 # First draw a random pinyin_len in 1-8; the distribution is Gaussian with 3 the most likely value
                 # Fetch the character at text[i + pinyin_len]; if it cannot be fetched, clamp pinyin_len to the end
                 # part2 has length x: take pinyin_list[i:i+pinyin_len] as part2
@@ -152,6 +158,53 @@ class PinyinInputDataset(IterableDataset):
                 )
                 py_end = min(i + pinyin_len, len(text))
                 part2 = self.get_mask_pinyin(text[i:py_end], pinyin_list[i:py_end])
-                # part3 is text; with high probability (0.85) it is empty; otherwise it is the character at i+pinyin_len
-                # encoded = self.smart_multi_segment_encode([pinyin_text], self.tokenizer, self.max_length)
+                # part3 is text; with high probability (0.70) it is empty;
+                # otherwise it is the character at i+pinyin_len plus the x characters after it,
+                # where x is a uniform random integer in 1-16
+                part3 = ""
+                if random.random() > 0.7:
+                    part3 = text[
+                        i + pinyin_len : i
+                        + pinyin_len
+                        + np.random.choice(range(1, 17))
+                    ]
+                # part4 is text; with probability 0.50 it is empty;
+                # otherwise it is 1-5 contiguous substrings.
+                # A substring is drawn by picking one character at random plus
+                # the x characters after it, where x is a uniform random integer in 2-6.
+                # The substrings are joined with |
+                part4 = ""
+                if random.random() > 0.5:
+                    # Generate 1-5 contiguous substrings
+                    num_strings = random.randint(1, 5)
+                    string_list = []
+                    for _ in range(num_strings):
+                        # Pick a random start position
+                        start_pos = random.randint(0, len(text) - 1)
+                        # Pick x in 2-6
+                        x = random.randint(2, 6)
+                        # Take the contiguous substring
+                        end_pos = min(start_pos + x + 1, len(text))
+                        string_list.append(text[start_pos:end_pos])
+                    # Join all substrings with |
+                    part4 = "|".join(string_list)
+                labels = [
+                    self.query_engine.get_char_info_by_char_pinyin(c, p).id
+                    for c, p in zip(text[i:py_end], pinyin_list[i:py_end])
+                ]
+                encoded = self.smart_multi_segment_encode([part1, part2, part3, part4])
+                encoded["label"] = labels
+                batch_samples.append(encoded)
+                if len(batch_samples) >= self.shuffle_buffer_size:
+                    indices = np.random.permutation(len(batch_samples))
+                    self.buffer.extend([batch_samples[i] for i in indices])
+                    batch_samples = []
+        yield from self.buffer
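
The new buffer logic implements approximate shuffling for an IterableDataset: samples accumulate until shuffle_buffer_size is reached, then are emitted in a random permutation. A generic sketch of the pattern (plain integers stand in for the encoded samples):

import numpy as np

def buffered_shuffle(stream, buffer_size=5000):
    # Collect items into a buffer; emit each full buffer in a random
    # permutation. buffer_size trades memory for shuffle quality.
    buf = []
    for item in stream:
        buf.append(item)
        if len(buf) >= buffer_size:
            for j in np.random.permutation(len(buf)):
                yield buf[j]
            buf = []
    # Flush the final partial buffer.
    for j in np.random.permutation(len(buf)):
        yield buf[j]

print(list(buffered_shuffle(range(10), buffer_size=4)))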

@@ -10,7 +10,7 @@ import torch.nn.functional as F
 from loguru import logger
 from modelscope import AutoTokenizer
-from tqdm.autonotebook import tqdm
+from tqdm.notebook import tqdm

 from .components import AttentionPooling, Expert  # , ResidualBlock  # assumed already implemented
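
AttentionPooling and Expert come from .components, which this diff does not show; the inline comment marks them as assumed already implemented. One plausible AttentionPooling sketch (an assumption for illustration, not the repo's actual code):

import torch
import torch.nn as nn

class AttentionPooling(nn.Module):
    # Masked attention pooling: score each position, softmax over valid
    # positions, return the weighted sum of hidden states.
    def __init__(self, hidden_size: int):
        super().__init__()
        self.score = nn.Linear(hidden_size, 1)

    def forward(self, hidden_states: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
        scores = self.score(hidden_states).squeeze(-1)          # (batch, seq)
        scores = scores.masked_fill(attention_mask == 0, -1e9)  # ignore padding
        weights = torch.softmax(scores, dim=-1).unsqueeze(-1)   # (batch, seq, 1)
        return (weights * hidden_states).sum(dim=1)             # (batch, hidden)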

uv.lock (3337 changed lines)

File diff suppressed because it is too large