fix(dataset): 添加6%概率返回None以增强数据多样性

This commit is contained in:
songsenand 2026-02-26 14:30:35 +08:00
parent dfcce1f1ed
commit 1178f87713
7 changed files with 4 additions and 1 deletion

View File

@ -276,6 +276,9 @@ class PinyinInputDataset(IterableDataset):
trunc_len = random.randint(1, max_len - 1)
return pinyin[:trunc_len]
"""
rand_val = random.random()
if rand_val <= 0.06:
return None
return pinyin
def process_pinyin_sequence(self, pinyin_list: List[str]) -> str:

View File

@ -16,7 +16,7 @@ if __name__ == "__main__":
# 创建数据集
dataset = PinyinInputDataset(
data_dir="/root/autodl-tmp/data",
data_dir="/home/songsenand/DataSet/data",
query_engine=query_engine,
tokenizer_name="iic/nlp_structbert_backbone_lite_std",
max_len=88,