From 6ad003133cb60ee4e31fc752f75a7d4b47a41c2d Mon Sep 17 00:00:00 2001 From: songsenand Date: Mon, 6 Apr 2026 06:05:12 +0800 Subject: [PATCH] =?UTF-8?q?fix(dataset):=20=E6=B7=BB=E5=8A=A0=E5=BC=82?= =?UTF-8?q?=E5=B8=B8=E6=8D=95=E8=8E=B7=E9=98=B2=E6=AD=A2=E6=A0=87=E7=AD=BE?= =?UTF-8?q?=E7=94=9F=E6=88=90=E5=A4=B1=E8=B4=A5?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/model/dataset.py | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/src/model/dataset.py b/src/model/dataset.py index db17f61..48cfb69 100644 --- a/src/model/dataset.py +++ b/src/model/dataset.py @@ -7,6 +7,7 @@ from typing import Dict, List, Tuple import numpy as np import torch from datasets import load_dataset +from loguru import logger from modelscope import AutoTokenizer from pypinyin import lazy_pinyin from pypinyin.contrib.tone_convert import to_initials @@ -304,13 +305,19 @@ class PinyinInputDataset(IterableDataset): string_list.append(text[start_pos:end_pos]) # 用|连接所有字符串 part4 = "|".join(string_list) - labels = [ - self.query_engine.get_char_info_by_char_pinyin(c, p).id - for c, p in zip( - text[i : i + pinyin_len], - pinyin_list[i : i + pinyin_len], + try: + labels = [ + self.query_engine.get_char_info_by_char_pinyin(c, p).id + for c, p in zip( + text[i : i + pinyin_len], + pinyin_list[i : i + pinyin_len], + ) + ] + except AttributeError as e: + logger.error( + f"e: {e}, (text, pinyin): {text[i : i + pinyin_len]} - {pinyin_list[i : i + pinyin_len]}" ) - ] + continue if random.random() <= 0.1: labels.append(0)