chore: remove old pinyin character statistics files
parent 9c8574cf11
commit 14c835dd72
File diff suppressed because it is too large
@@ -0,0 +1,30 @@
+{
+  "add_cross_attention": false,
+  "attention_probs_dropout_prob": 0.1,
+  "bos_token_id": null,
+  "classifier_dropout": null,
+  "directionality": "bidi",
+  "eos_token_id": null,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.1,
+  "hidden_size": 512,
+  "initializer_range": 0.02,
+  "intermediate_size": 2048,
+  "is_decoder": false,
+  "layer_norm_eps": 1e-12,
+  "max_position_embeddings": 512,
+  "model_type": "bert",
+  "num_attention_heads": 8,
+  "num_hidden_layers": 6,
+  "pad_token_id": 0,
+  "pooler_fc_size": 768,
+  "pooler_num_attention_heads": 12,
+  "pooler_num_fc_layers": 3,
+  "pooler_size_per_head": 128,
+  "pooler_type": "first_token_transform",
+  "tie_word_embeddings": true,
+  "transformers_version": "5.1.0",
+  "type_vocab_size": 4,
+  "use_cache": true,
+  "vocab_size": 21128
+}
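The added config above describes a compact BERT encoder: 6 layers, hidden size 512, 8 attention heads, a 21128-entry Chinese vocabulary, and type_vocab_size 4 (room for four segment types, which matches the four-part inputs built further down). A minimal sketch of materializing a model from such a file with Hugging Face transformers; the directory path is an assumption for illustration, since the diff does not show where this config lives:

    # Sketch only: the path and the loading call are assumptions, not shown in this commit.
    from transformers import BertConfig, BertModel

    config = BertConfig.from_pretrained("./assets/encoder")  # directory holding the config above
    model = BertModel(config)  # architecture from the config, randomly initialized weights
    assert model.config.num_hidden_layers == 6 and model.config.hidden_size == 512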
File diff suppressed because it is too large
@@ -0,0 +1,14 @@
+{
+  "backend": "tokenizers",
+  "cls_token": "[CLS]",
+  "do_lower_case": false,
+  "is_local": true,
+  "mask_token": "[MASK]",
+  "model_max_length": 1000000000000000019884624838656,
+  "pad_token": "[PAD]",
+  "sep_token": "[SEP]",
+  "strip_accents": null,
+  "tokenize_chinese_chars": true,
+  "tokenizer_class": "BertTokenizer",
+  "unk_token": "[UNK]"
+}
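This tokenizer_config.json pairs with the loading change below, where AutoTokenizer.from_pretrained is pointed at files packaged under assets/tokenizer instead of a hub ID. A minimal sketch of that pattern, assuming a package named mypkg whose assets/tokenizer directory holds this file plus the vocabulary (presumably the diff suppressed as too large); the repo imports AutoTokenizer from modelscope, while transformers is used here for the same call:

    # Sketch: "mypkg" and the asset layout are assumptions for illustration.
    from importlib.resources import files
    from pathlib import Path
    from transformers import AutoTokenizer

    tokenizer_dir = Path(files("mypkg") / "assets" / "tokenizer")  # works for normal (non-zipped) installs
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir)
    print(tokenizer.tokenize("拼音"))  # BertTokenizer splits Chinese text per character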
@@ -1,6 +1,7 @@
 import os
 import random
 from typing import Any, Dict, List, Optional, Tuple
+from importlib.resources import files
+from pathlib import Path

 import numpy as np
 import torch
@@ -11,10 +12,7 @@ from pypinyin import Style, lazy_pinyin
 from pypinyin.contrib.tone_convert import to_initials
 from torch.utils.data import DataLoader, IterableDataset

-# Load the tokenizer and model
-# model = AutoModel.from_pretrained('iic/nlp_structbert_backbone_lite_std')
-
-# tokenizer = AutoTokenizer.from_pretrained('iic/nlp_structbert_backbone_lite_std')
+from .query import QueryEngine


 class PinyinInputDataset(IterableDataset):
@@ -22,12 +20,12 @@ class PinyinInputDataset(IterableDataset):
         self,
         data_path: str,
         max_workers: int = -1,
-        tokenizer_name: str = "iic/nlp_structbert_backbone_lite_std",
         max_length=128,
         text_field: str = "text",
         py_style_weight=(9, 2, 1),
+        shuffle_buffer_size: int = 5000,
     ):
-        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
+        self.tokenizer = AutoTokenizer.from_pretrained(Path(files(__package__) / "assets" / "tokenizer"))
         self.data_path = data_path
         self.max_length = max_length
         self.text_field = text_field
@@ -35,8 +33,14 @@ class PinyinInputDataset(IterableDataset):
         self.max_workers = max_workers
         self.py_style_weight = py_style_weight

-    @staticmethod
-    def smart_multi_segment_encode(texts, tokenizer, max_length=128):
+        self.query_engine = QueryEngine()
+        self.query_engine.load()
+        self.shuffle_buffer_size = shuffle_buffer_size
+        self.buffer = []
+
+
+
+    def smart_multi_segment_encode(self, texts):
         """
         Smart multi-segment encoding

@@ -46,7 +50,7 @@ class PinyinInputDataset(IterableDataset):
         encoded_segments = []
         for text in texts:
             # Note: no special tokens here; they are added uniformly below
-            encoded = tokenizer.encode(text, add_special_tokens=False)
+            encoded = self.tokenizer.encode(text, add_special_tokens=False)
             encoded_segments.append(encoded)

         # Step 2: build the full sequence
@@ -54,7 +58,7 @@ class PinyinInputDataset(IterableDataset):
         token_type_ids = []

         # Add [CLS]
-        tokens.append(tokenizer.cls_token_id)
+        tokens.append(self.tokenizer.cls_token_id)
         token_type_ids.append(0)  # [CLS] is conventionally type 0

         # Add each segment
@@ -68,26 +72,26 @@ class PinyinInputDataset(IterableDataset):

             # Add [SEP] (optional after the last segment)
             if seg_idx < len(encoded_segments) - 1:
-                tokens.append(tokenizer.sep_token_id)
+                tokens.append(self.tokenizer.sep_token_id)
                 token_type_ids.append(current_type)
             else:
                 # append [SEP] after the last segment as well
-                tokens.append(tokenizer.sep_token_id)
+                tokens.append(self.tokenizer.sep_token_id)
                 token_type_ids.append(current_type)

         # Step 3: truncate and pad
-        if len(tokens) > max_length:
-            tokens = tokens[:max_length]
-            token_type_ids = token_type_ids[:max_length]
+        if len(tokens) > self.max_length:
+            tokens = tokens[: self.max_length]
+            token_type_ids = token_type_ids[: self.max_length]
         else:
             # pad up to max_length
-            padding_length = max_length - len(tokens)
-            tokens = tokens + [tokenizer.pad_token_id] * padding_length
+            padding_length = self.max_length - len(tokens)
+            tokens = tokens + [self.tokenizer.pad_token_id] * padding_length
             token_type_ids = token_type_ids + [0] * padding_length  # padded positions get type 0

         # Step 4: create the attention mask
         attention_mask = [
-            1 if token != tokenizer.pad_token_id else 0 for token in tokens
+            1 if token != self.tokenizer.pad_token_id else 0 for token in tokens
         ]

         return {
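The layout smart_multi_segment_encode builds is the standard multi-segment BERT packing: [CLS] seg1 [SEP] seg2 [SEP] ..., with token_type_ids marking which segment each position belongs to. A hand-checkable toy version of the packing arithmetic, with invented token IDs:

    # Toy illustration only; the IDs are made up, not from the real tokenizer.
    CLS, SEP, PAD = 101, 102, 0
    seg_a, seg_b = [11, 12], [21, 22, 23]
    tokens = [CLS] + seg_a + [SEP] + seg_b + [SEP]  # [101, 11, 12, 102, 21, 22, 23, 102]
    type_ids = [0] * 4 + [1] * 4                    # one segment id per position; [CLS] counts as 0
    max_length = 10
    pad_len = max_length - len(tokens)
    tokens += [PAD] * pad_len
    type_ids += [0] * pad_len                       # padding uses type 0, as in the code above
    attention_mask = [1 if t != PAD else 0 for t in tokens]
    assert len(tokens) == len(type_ids) == len(attention_mask) == max_length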
@@ -104,13 +108,15 @@ class PinyinInputDataset(IterableDataset):
     def get_mask_pinyin(self, text: str, pinyin_list: List[str]) -> Tuple[int, List[str]]:
         mask_pinyin = []
         for i in range(len(text)):
-            if text[i] == pinyin_list[i]:
+            if not self.query_engine.is_chinese_char(text[i]):
                 return i - 1, mask_pinyin
             else:
                 py = random.choices(
                     (pinyin_list[i], to_initials(pinyin_list[i]), pinyin_list[i][0]),
                     weights=self.py_style_weight,
                 )[0]
                 if py == "":
                     py = pinyin_list[i][0]
                 mask_pinyin.append(py)
         return len(text) - 1, mask_pinyin
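get_mask_pinyin renders each Chinese character as one of three pinyin forms, the full syllable, the initial (shengmu), or just the first letter, weighted by py_style_weight = (9, 2, 1). The original line called random.choice with a weights argument, which random.choice does not accept; the reconstruction above uses random.choices(...)[0] instead. A standalone sketch of the sampling:

    # Standalone sketch of the weighted pinyin-style sampling.
    import random
    from pypinyin import lazy_pinyin
    from pypinyin.contrib.tone_convert import to_initials

    py = lazy_pinyin("中")[0]                   # "zhong"
    candidates = (py, to_initials(py), py[0])   # ("zhong", "zh", "z")
    pick = random.choices(candidates, weights=(9, 2, 1))[0]
    # ~75% "zhong", ~17% "zh", ~8% "z"; the class falls back to py[0]
    # because to_initials is empty for vowel-initial syllables like "an".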
@@ -135,14 +141,14 @@ class PinyinInputDataset(IterableDataset):
             pinyin_list = self.generate_pinyin(text)
             for i in range(len(text)):
                 # Skip text[i] if it is not in the character library
-                # When i < 44, part1 is text[0:i]
-                # When i >= 44, part1 is text[i-44:i]
-                if text[i] == pinyin_list[i]:
+                # When i < 48, part1 is text[0:i]
+                # When i >= 48, part1 is text[i-48:i]
+                if not self.query_engine.is_chinese_char(text[i]):
                     continue
-                if i < 44:
+                if i < 48:
                     part1 = text[0:i]
                 else:
-                    part1 = text[i - 44 : i]
+                    part1 = text[i - 48 : i]
                 # First draw a random pinyin_len in 1-8; it is Gaussian-distributed with the mode at 3
                 # Take the character at text[i + pinyin_len]; if that position cannot be reached, pinyin_len ...
                 # part2 has length x: take pinyin_list[i:i+pinyin_len] as part2
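The comments above specify pinyin_len as a Gaussian draw over 1-8 with its mode at 3, but the sampling call itself is cut off by the extraction (only its closing parenthesis survives as context in the next hunk). One way to get that shape, as a sketch with an assumed spread:

    # Sketch: a rounded, clipped normal; loc=3 matches the stated mode, scale=1.5 is an assumption.
    import numpy as np

    def sample_pinyin_len(rng: np.random.Generator) -> int:
        val = int(round(rng.normal(loc=3.0, scale=1.5)))
        return min(max(val, 1), 8)

    rng = np.random.default_rng(0)
    lens = [sample_pinyin_len(rng) for _ in range(10000)]
    # A histogram of lens peaks at 3 and tails off toward 1 and 8.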
@@ -152,6 +158,53 @@ class PinyinInputDataset(IterableDataset):
                 )
                 py_end = min(i + pinyin_len, len(text))
                 part2 = self.get_mask_pinyin(text[i:py_end], pinyin_list[i:py_end])
-                # part3 is text; with high probability (0.85) it is empty, otherwise it is the character at i+pinyin_len
-
-                # encoded = self.smart_multi_segment_encode([pinyin_text], self.tokenizer, self.max_length)
+                # part3 is text; with high probability (0.70) it is empty
+                # otherwise it is the character at i+pinyin_len plus the x characters after it
+                # x is an integer in 1-16, uniformly distributed
+                part3 = ""
+                if random.random() > 0.7:
+                    part3 = text[
+                        i + pinyin_len : i
+                        + pinyin_len
+                        + np.random.choice(range(1, 17))
+                    ]
+
+                # part4 is text; empty with probability 0.50
+                # otherwise it consists of 1-5 contiguous substrings
+                # each substring: pick a random start position and take the following x characters
+                # x is an integer in 2-6, uniformly distributed
+                # join the substrings in part4 with '|'
+                part4 = ""
+                if random.random() > 0.5:
+                    # generate 1-5 contiguous substrings
+                    num_strings = random.randint(1, 5)
+                    string_list = []
+                    for _ in range(num_strings):
+                        # pick a random start position
+                        start_pos = random.randint(0, len(text) - 1)
+                        # pick x in 2-6
+                        x = random.randint(2, 6)
+                        # take the contiguous substring
+                        end_pos = min(start_pos + x + 1, len(text))
+                        string_list.append(text[start_pos:end_pos])
+                    # join all substrings with '|'
+                    part4 = "|".join(string_list)
+
+                labels = [
+                    self.query_engine.get_char_info_by_char_pinyin(
+                        c, p
+                    ).id
+                    for c, p in zip(text[i:py_end], pinyin_list[i:py_end])
+                ]
+
+                encoded = self.smart_multi_segment_encode([part1, part2, part3, part4])
+                encoded["label"] = labels
+                batch_samples.append(encoded)
+                if len(batch_samples) >= self.shuffle_buffer_size:
+                    indices = np.random.permutation(len(batch_samples))
+                    self.buffer.extend([batch_samples[i] for i in indices])
+                    batch_samples = []
+            yield from self.buffer
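The batching logic above implements a shuffle buffer, the usual way to approximately shuffle an IterableDataset stream that is too large to shuffle globally. A condensed, self-contained sketch of the general pattern (independent of this class); this version clears the buffer after each flush, which keeps memory bounded and yields each sample exactly once:

    # General shuffle-buffer pattern.
    import random
    from typing import Any, Iterable, Iterator

    def shuffle_stream(stream: Iterable[Any], buffer_size: int = 5000) -> Iterator[Any]:
        buffer: list = []
        for sample in stream:
            buffer.append(sample)
            if len(buffer) >= buffer_size:
                random.shuffle(buffer)
                yield from buffer
                buffer = []
        random.shuffle(buffer)  # flush the shorter tail at end of stream
        yield from buffer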
@@ -10,7 +10,7 @@ import torch.nn.functional as F

 from loguru import logger
 from modelscope import AutoTokenizer
-from tqdm.autonotebook import tqdm
+from tqdm.notebook import tqdm


 from .components import AttentionPooling, Expert  # , ResidualBlock  # assumed already implemented