fix(dataset): 添加6%概率返回None以增强数据多样性
This commit is contained in:
parent
dfcce1f1ed
commit
1178f87713
|
|
@ -276,6 +276,9 @@ class PinyinInputDataset(IterableDataset):
|
||||||
trunc_len = random.randint(1, max_len - 1)
|
trunc_len = random.randint(1, max_len - 1)
|
||||||
return pinyin[:trunc_len]
|
return pinyin[:trunc_len]
|
||||||
"""
|
"""
|
||||||
|
rand_val = random.random()
|
||||||
|
if rand_val <= 0.06:
|
||||||
|
return None
|
||||||
return pinyin
|
return pinyin
|
||||||
|
|
||||||
def process_pinyin_sequence(self, pinyin_list: List[str]) -> str:
|
def process_pinyin_sequence(self, pinyin_list: List[str]) -> str:
|
||||||
|
|
|
||||||
|
|
@ -16,7 +16,7 @@ if __name__ == "__main__":
|
||||||
|
|
||||||
# 创建数据集
|
# 创建数据集
|
||||||
dataset = PinyinInputDataset(
|
dataset = PinyinInputDataset(
|
||||||
data_dir="/root/autodl-tmp/data",
|
data_dir="/home/songsenand/DataSet/data",
|
||||||
query_engine=query_engine,
|
query_engine=query_engine,
|
||||||
tokenizer_name="iic/nlp_structbert_backbone_lite_std",
|
tokenizer_name="iic/nlp_structbert_backbone_lite_std",
|
||||||
max_len=88,
|
max_len=88,
|
||||||
|
|
|
||||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Loading…
Reference in New Issue