diff --git a/src/suinput/dataset.py b/src/suinput/dataset.py index bb51a1d..1e6abc2 100644 --- a/src/suinput/dataset.py +++ b/src/suinput/dataset.py @@ -276,6 +276,9 @@ class PinyinInputDataset(IterableDataset): trunc_len = random.randint(1, max_len - 1) return pinyin[:trunc_len] """ + rand_val = random.random() + if rand_val <= 0.06: + return None return pinyin def process_pinyin_sequence(self, pinyin_list: List[str]) -> str: diff --git a/src/tmp_utils/gen_eval_dataset.py b/src/tmp_utils/gen_eval_dataset.py index 834b370..7ad4700 100644 --- a/src/tmp_utils/gen_eval_dataset.py +++ b/src/tmp_utils/gen_eval_dataset.py @@ -16,7 +16,7 @@ if __name__ == "__main__": # 创建数据集 dataset = PinyinInputDataset( - data_dir="/root/autodl-tmp/data", + data_dir="/home/songsenand/DataSet/data", query_engine=query_engine, tokenizer_name="iic/nlp_structbert_backbone_lite_std", max_len=88, diff --git a/src/trainer/eval_dataset/sample_0.pkl b/src/trainer/eval_dataset/sample_0.pkl index 379539a..f4c5086 100644 Binary files a/src/trainer/eval_dataset/sample_0.pkl and b/src/trainer/eval_dataset/sample_0.pkl differ diff --git a/src/trainer/eval_dataset/sample_1.pkl b/src/trainer/eval_dataset/sample_1.pkl index 47b037e..49280a0 100644 Binary files a/src/trainer/eval_dataset/sample_1.pkl and b/src/trainer/eval_dataset/sample_1.pkl differ diff --git a/src/trainer/eval_dataset/sample_2.pkl b/src/trainer/eval_dataset/sample_2.pkl index bedd1bf..b5c7eba 100644 Binary files a/src/trainer/eval_dataset/sample_2.pkl and b/src/trainer/eval_dataset/sample_2.pkl differ diff --git a/src/trainer/eval_dataset/sample_3.pkl b/src/trainer/eval_dataset/sample_3.pkl index a9b1df3..b46b65b 100644 Binary files a/src/trainer/eval_dataset/sample_3.pkl and b/src/trainer/eval_dataset/sample_3.pkl differ diff --git a/src/trainer/eval_dataset/sample_4.pkl b/src/trainer/eval_dataset/sample_4.pkl index b999ccb..f52a653 100644 Binary files a/src/trainer/eval_dataset/sample_4.pkl and b/src/trainer/eval_dataset/sample_4.pkl differ