diff --git a/src/suinput/dataset.py b/src/suinput/dataset.py index b7223d3..b0eb3ec 100644 --- a/src/suinput/dataset.py +++ b/src/suinput/dataset.py @@ -184,15 +184,15 @@ class PinyinInputDataset(IterableDataset): # 随机选择采样方式 (各1/3概率) choice = random.random() - if choice < 0.333: + if choice < 0.85: # 方式1: 靠近汉字的54个字符 return context[-54:] if context_len >= 54 else context - elif choice < 0.667: + elif choice < 0.95: # 方式2: 随机位置取46个连续字符 if context_len <= 46: return context start = random.randint(0, context_len - 46) - return context[start : start + 46] + return context[start : start + 46] + context[-8:] else: # 方式3: 12+6×7组合 if context_len < 12: diff --git a/src/tmp_utils/gen_eval_dataset.py b/src/tmp_utils/gen_eval_dataset.py index 4e8ef56..912399a 100644 --- a/src/tmp_utils/gen_eval_dataset.py +++ b/src/tmp_utils/gen_eval_dataset.py @@ -31,7 +31,7 @@ if __name__ == "__main__": batch_size=1024, num_workers=1, worker_init_fn=worker_init_fn, - pin_memory=True if torch.cuda.is_available() else False, + # pin_memory=True if torch.cuda.is_available() else False, collate_fn=custom_collate_with_txt, prefetch_factor=8, persistent_workers=True, diff --git a/src/trainer/eval_dataset/sample_0.pkl b/src/trainer/eval_dataset/sample_0.pkl index 3ebc5cf..d766b4d 100644 Binary files a/src/trainer/eval_dataset/sample_0.pkl and b/src/trainer/eval_dataset/sample_0.pkl differ diff --git a/src/trainer/eval_dataset/sample_1.pkl b/src/trainer/eval_dataset/sample_1.pkl index b8149f8..d417f58 100644 Binary files a/src/trainer/eval_dataset/sample_1.pkl and b/src/trainer/eval_dataset/sample_1.pkl differ diff --git a/src/trainer/eval_dataset/sample_2.pkl b/src/trainer/eval_dataset/sample_2.pkl index 1ccb18a..eab9fe9 100644 Binary files a/src/trainer/eval_dataset/sample_2.pkl and b/src/trainer/eval_dataset/sample_2.pkl differ diff --git a/src/trainer/eval_dataset/sample_3.pkl b/src/trainer/eval_dataset/sample_3.pkl index f0a2796..21b6e65 100644 Binary files a/src/trainer/eval_dataset/sample_3.pkl and b/src/trainer/eval_dataset/sample_3.pkl differ diff --git a/src/trainer/eval_dataset/sample_4.pkl b/src/trainer/eval_dataset/sample_4.pkl index 05f6211..592e7c6 100644 Binary files a/src/trainer/eval_dataset/sample_4.pkl and b/src/trainer/eval_dataset/sample_4.pkl differ