chore: 删除旧的拼音字符统计文件
This commit is contained in:
parent
9c8574cf11
commit
14c835dd72
File diff suppressed because it is too large
Load Diff
|
|
@ -0,0 +1,30 @@
|
||||||
|
{
|
||||||
|
"add_cross_attention": false,
|
||||||
|
"attention_probs_dropout_prob": 0.1,
|
||||||
|
"bos_token_id": null,
|
||||||
|
"classifier_dropout": null,
|
||||||
|
"directionality": "bidi",
|
||||||
|
"eos_token_id": null,
|
||||||
|
"hidden_act": "gelu",
|
||||||
|
"hidden_dropout_prob": 0.1,
|
||||||
|
"hidden_size": 512,
|
||||||
|
"initializer_range": 0.02,
|
||||||
|
"intermediate_size": 2048,
|
||||||
|
"is_decoder": false,
|
||||||
|
"layer_norm_eps": 1e-12,
|
||||||
|
"max_position_embeddings": 512,
|
||||||
|
"model_type": "bert",
|
||||||
|
"num_attention_heads": 8,
|
||||||
|
"num_hidden_layers": 6,
|
||||||
|
"pad_token_id": 0,
|
||||||
|
"pooler_fc_size": 768,
|
||||||
|
"pooler_num_attention_heads": 12,
|
||||||
|
"pooler_num_fc_layers": 3,
|
||||||
|
"pooler_size_per_head": 128,
|
||||||
|
"pooler_type": "first_token_transform",
|
||||||
|
"tie_word_embeddings": true,
|
||||||
|
"transformers_version": "5.1.0",
|
||||||
|
"type_vocab_size": 4,
|
||||||
|
"use_cache": true,
|
||||||
|
"vocab_size": 21128
|
||||||
|
}
|
||||||
File diff suppressed because it is too large
Load Diff
|
|
@ -0,0 +1,14 @@
|
||||||
|
{
|
||||||
|
"backend": "tokenizers",
|
||||||
|
"cls_token": "[CLS]",
|
||||||
|
"do_lower_case": false,
|
||||||
|
"is_local": true,
|
||||||
|
"mask_token": "[MASK]",
|
||||||
|
"model_max_length": 1000000000000000019884624838656,
|
||||||
|
"pad_token": "[PAD]",
|
||||||
|
"sep_token": "[SEP]",
|
||||||
|
"strip_accents": null,
|
||||||
|
"tokenize_chinese_chars": true,
|
||||||
|
"tokenizer_class": "BertTokenizer",
|
||||||
|
"unk_token": "[UNK]"
|
||||||
|
}
|
||||||
|
|
@ -1,6 +1,7 @@
|
||||||
import os
|
|
||||||
import random
|
import random
|
||||||
from typing import Any, Dict, List, Optional, Tuple
|
from typing import Any, Dict, List, Optional, Tuple
|
||||||
|
from importlib.resources import files
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import torch
|
import torch
|
||||||
|
|
@ -11,10 +12,7 @@ from pypinyin import Style, lazy_pinyin
|
||||||
from pypinyin.contrib.tone_convert import to_initials
|
from pypinyin.contrib.tone_convert import to_initials
|
||||||
from torch.utils.data import DataLoader, IterableDataset
|
from torch.utils.data import DataLoader, IterableDataset
|
||||||
|
|
||||||
# 加载分词器和模型
|
from .query import QueryEngine
|
||||||
# model = AutoModel.from_pretrained('iic/nlp_structbert_backbone_lite_std')
|
|
||||||
|
|
||||||
# tokenizer = AutoTokenizer.from_pretrained('iic/nlp_structbert_backbone_lite_std')
|
|
||||||
|
|
||||||
|
|
||||||
class PinyinInputDataset(IterableDataset):
|
class PinyinInputDataset(IterableDataset):
|
||||||
|
|
@ -22,12 +20,12 @@ class PinyinInputDataset(IterableDataset):
|
||||||
self,
|
self,
|
||||||
data_path: str,
|
data_path: str,
|
||||||
max_workes: int = -1,
|
max_workes: int = -1,
|
||||||
tokenizer_name: str = "iic/nlp_structbert_backbone_lite_std",
|
|
||||||
max_length=128,
|
max_length=128,
|
||||||
text_field: str = "text",
|
text_field: str = "text",
|
||||||
py_style_weight=(9, 2, 1),
|
py_style_weight=(9, 2, 1),
|
||||||
|
shuffle_buffer_size: int = 5000,
|
||||||
):
|
):
|
||||||
self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
|
self.tokenizer = AutoTokenizer.from_pretrained(Path(files(__package__) / "assets" / "tokenizer"))
|
||||||
self.data_path = data_path
|
self.data_path = data_path
|
||||||
self.max_length = max_length
|
self.max_length = max_length
|
||||||
self.text_field = text_field
|
self.text_field = text_field
|
||||||
|
|
@ -35,8 +33,14 @@ class PinyinInputDataset(IterableDataset):
|
||||||
self.max_workers = max_workes
|
self.max_workers = max_workes
|
||||||
self.py_style_weight = py_style_weight
|
self.py_style_weight = py_style_weight
|
||||||
|
|
||||||
@staticmethod
|
self.query_engine = QueryEngine()
|
||||||
def smart_multi_segment_encode(texts, tokenizer, max_length=128):
|
self.query_engine.load()
|
||||||
|
self.shuffle_buffer_size = shuffle_buffer_size
|
||||||
|
self.buffer = []
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def smart_multi_segment_encode(self, texts):
|
||||||
"""
|
"""
|
||||||
智能多段落编码
|
智能多段落编码
|
||||||
|
|
||||||
|
|
@ -46,7 +50,7 @@ class PinyinInputDataset(IterableDataset):
|
||||||
encoded_segments = []
|
encoded_segments = []
|
||||||
for text in texts:
|
for text in texts:
|
||||||
# 注意:不添加特殊标记,我们后面统一处理
|
# 注意:不添加特殊标记,我们后面统一处理
|
||||||
encoded = tokenizer.encode(text, add_special_tokens=False)
|
encoded = self.tokenizer.encode(text, add_special_tokens=False)
|
||||||
encoded_segments.append(encoded)
|
encoded_segments.append(encoded)
|
||||||
|
|
||||||
# 第二步:构建完整序列
|
# 第二步:构建完整序列
|
||||||
|
|
@ -54,7 +58,7 @@ class PinyinInputDataset(IterableDataset):
|
||||||
token_type_ids = []
|
token_type_ids = []
|
||||||
|
|
||||||
# 添加[CLS]
|
# 添加[CLS]
|
||||||
tokens.append(tokenizer.cls_token_id)
|
tokens.append(self.tokenizer.cls_token_id)
|
||||||
token_type_ids.append(0) # CLS通常为0
|
token_type_ids.append(0) # CLS通常为0
|
||||||
|
|
||||||
# 添加各个段落
|
# 添加各个段落
|
||||||
|
|
@ -68,26 +72,26 @@ class PinyinInputDataset(IterableDataset):
|
||||||
|
|
||||||
# 添加[SEP](最后一个段落可以不加)
|
# 添加[SEP](最后一个段落可以不加)
|
||||||
if seg_idx < len(encoded_segments) - 1:
|
if seg_idx < len(encoded_segments) - 1:
|
||||||
tokens.append(tokenizer.sep_token_id)
|
tokens.append(self.tokenizer.sep_token_id)
|
||||||
token_type_ids.append(current_type)
|
token_type_ids.append(current_type)
|
||||||
else:
|
else:
|
||||||
# 最后一个段落加[SEP]
|
# 最后一个段落加[SEP]
|
||||||
tokens.append(tokenizer.sep_token_id)
|
tokens.append(self.tokenizer.sep_token_id)
|
||||||
token_type_ids.append(current_type)
|
token_type_ids.append(current_type)
|
||||||
|
|
||||||
# 第三步:截断和填充
|
# 第三步:截断和填充
|
||||||
if len(tokens) > max_length:
|
if len(tokens) > self.max_length:
|
||||||
tokens = tokens[:max_length]
|
tokens = tokens[:self.max_length]
|
||||||
token_type_ids = token_type_ids[:max_length]
|
token_type_ids = token_type_ids[:self.max_length]
|
||||||
else:
|
else:
|
||||||
# 填充
|
# 填充
|
||||||
padding_length = max_length - len(tokens)
|
padding_length = self.max_length - len(tokens)
|
||||||
tokens = tokens + [tokenizer.pad_token_id] * padding_length
|
tokens = tokens + [self.tokenizer.pad_token_id] * padding_length
|
||||||
token_type_ids = token_type_ids + [0] * padding_length # 填充部分用0
|
token_type_ids = token_type_ids + [0] * padding_length # 填充部分用0
|
||||||
|
|
||||||
# 第四步:创建attention mask
|
# 第四步:创建attention mask
|
||||||
attention_mask = [
|
attention_mask = [
|
||||||
1 if token != tokenizer.pad_token_id else 0 for token in tokens
|
1 if token != self.tokenizer.pad_token_id else 0 for token in tokens
|
||||||
]
|
]
|
||||||
|
|
||||||
return {
|
return {
|
||||||
|
|
@ -104,13 +108,15 @@ class PinyinInputDataset(IterableDataset):
|
||||||
def get_mask_pinyin(self, text: str, pinyin_list: List[str]) -> (int, List[str]):
|
def get_mask_pinyin(self, text: str, pinyin_list: List[str]) -> (int, List[str]):
|
||||||
mask_pinyin = []
|
mask_pinyin = []
|
||||||
for i in range(len(text)):
|
for i in range(len(text)):
|
||||||
if text[i] == pinyin_list[i]:
|
if self.query_engine.is_chinese_char(text[i]):
|
||||||
return i - 1, mask_pinyin
|
return i - 1, mask_pinyin
|
||||||
else:
|
else:
|
||||||
py = random.choice(
|
py = random.choice(
|
||||||
(pinyin_list[i], to_initials(pinyin_list[i]), pinyin_list[i][0]),
|
(pinyin_list[i], to_initials(pinyin_list[i]), pinyin_list[i][0]),
|
||||||
weights=self.py_style_weight,
|
weights=self.py_style_weight,
|
||||||
)
|
)
|
||||||
|
if py == "":
|
||||||
|
py = pinyin_list[i][0]
|
||||||
mask_pinyin.append(py)
|
mask_pinyin.append(py)
|
||||||
return len(text) - 1, mask_pinyin
|
return len(text) - 1, mask_pinyin
|
||||||
|
|
||||||
|
|
@ -135,14 +141,14 @@ class PinyinInputDataset(IterableDataset):
|
||||||
pinyin_list = self.generate_pinyin(text)
|
pinyin_list = self.generate_pinyin(text)
|
||||||
for i in range(len(text)):
|
for i in range(len(text)):
|
||||||
# 如果text[i]不在字符库中,则跳过
|
# 如果text[i]不在字符库中,则跳过
|
||||||
# 当i小于44时候,则将part1取text[0:i]
|
# 当i小于48时候,则将part1取text[0:i]
|
||||||
# 当i大于44时候,则将part1取text[i-44:i]
|
# 当i大于48时候,则将part1取text[i-48:i]
|
||||||
if text[i] == pinyin_list[i]:
|
if self.query_engine.is_chinese_char(text[i]):
|
||||||
continue
|
continue
|
||||||
if i < 44:
|
if i < 48:
|
||||||
part1 = text[0:i]
|
part1 = text[0:i]
|
||||||
else:
|
else:
|
||||||
part1 = text[i - 44 : i]
|
part1 = text[i - 48 : i]
|
||||||
# 首先取随机值pinyin_len(1-8),pinyin_len取值呈高斯分布,最大概率取3
|
# 首先取随机值pinyin_len(1-8),pinyin_len取值呈高斯分布,最大概率取3
|
||||||
# 获取text[i + pinyin_len]字符,如果无法获取所指向的后,如果pinyin_len
|
# 获取text[i + pinyin_len]字符,如果无法获取所指向的后,如果pinyin_len
|
||||||
# part2的长度为x,取pinyin_list[i:i+pinyin_len],为part2
|
# part2的长度为x,取pinyin_list[i:i+pinyin_len],为part2
|
||||||
|
|
@ -152,6 +158,53 @@ class PinyinInputDataset(IterableDataset):
|
||||||
)
|
)
|
||||||
py_end = min(i + pinyin_len, len(text))
|
py_end = min(i + pinyin_len, len(text))
|
||||||
part2 = self.get_mask_pinyin(text[i:py_end], pinyin_list[i:py_end])
|
part2 = self.get_mask_pinyin(text[i:py_end], pinyin_list[i:py_end])
|
||||||
# part3为文本,大概率(0.85)为空,不为空则是i+pinyin_len所指向的字符
|
|
||||||
|
|
||||||
# encoded = self.smart_multi_segment_encode([pinyin_text], self.tokenizer, self.max_length)
|
# part3为文本,大概率(0.70)为空
|
||||||
|
# 不为空则是i+pinyin_len所指向的字符以及所指向字符后x个字符
|
||||||
|
# x为1-16中的任意整数,取值平均分布
|
||||||
|
part3 = ""
|
||||||
|
if random.random() > 0.7:
|
||||||
|
part3 = text[
|
||||||
|
i + pinyin_len : i
|
||||||
|
+ pinyin_len
|
||||||
|
+ np.random.choice(range(1, 17))
|
||||||
|
]
|
||||||
|
|
||||||
|
# part4为文本,0.50的概率为空
|
||||||
|
# 不为空则为1-5个连续字符串
|
||||||
|
# 连续字符串的取值方法为:随机从字符库中取一个字符,以及该字符后x个字符
|
||||||
|
# x为2-6中的任意整数,取值平均分布
|
||||||
|
# 使用|将part4中的字符串连接起来
|
||||||
|
part4 = ""
|
||||||
|
if random.random() > 0.5:
|
||||||
|
# 生成1-5个连续字符串
|
||||||
|
num_strings = random.randint(1, 5)
|
||||||
|
string_list = []
|
||||||
|
for _ in range(num_strings):
|
||||||
|
# 随机选择起始位置
|
||||||
|
start_pos = random.randint(0, len(text) - 1)
|
||||||
|
# 随机选择x的值(2-6)
|
||||||
|
x = random.randint(2, 6)
|
||||||
|
# 获取连续字符串
|
||||||
|
end_pos = min(start_pos + x + 1, len(text))
|
||||||
|
string_list.append(text[start_pos:end_pos])
|
||||||
|
# 用|连接所有字符串
|
||||||
|
part4 = "|".join(string_list)
|
||||||
|
|
||||||
|
labels = [
|
||||||
|
self.query_engine.get_char_info_by_char_pinyin(
|
||||||
|
c, p
|
||||||
|
).id
|
||||||
|
for c, p in zip(text[i:py_end], pinyin_list[i:py_end])
|
||||||
|
]
|
||||||
|
|
||||||
|
encoded = self.smart_multi_segment_encode([part1, part2, part3, part4])
|
||||||
|
encoded["label"] = labels
|
||||||
|
batch_samples.append(encoded)
|
||||||
|
if len(batch_samples) >= self.shuffle_buffer_size:
|
||||||
|
indices = np.random.permutation(len(batch_samples))
|
||||||
|
self.buffer.extend([batch_samples[i] for i in indices])
|
||||||
|
batch_samples = []
|
||||||
|
yield from self.buffer
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -10,7 +10,7 @@ import torch.nn.functional as F
|
||||||
|
|
||||||
from loguru import logger
|
from loguru import logger
|
||||||
from modelscope import AutoTokenizer
|
from modelscope import AutoTokenizer
|
||||||
from tqdm.autonotebook import tqdm
|
from tqdm.notebook import tqdm
|
||||||
|
|
||||||
|
|
||||||
from .components import AttentionPooling, Expert # , ResidualBlock # 假设已实现
|
from .components import AttentionPooling, Expert # , ResidualBlock # 假设已实现
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue