fix(dataset): 添加异常捕获防止标签生成失败

This commit is contained in:
songsenand 2026-04-06 06:05:12 +08:00
parent 493bfdec1a
commit 6ad003133c
1 changed file with 13 additions and 6 deletions

View File

@@ -7,6 +7,7 @@ from typing import Dict, List, Tuple
import numpy as np import numpy as np
import torch import torch
from datasets import load_dataset from datasets import load_dataset
from loguru import logger
from modelscope import AutoTokenizer from modelscope import AutoTokenizer
from pypinyin import lazy_pinyin from pypinyin import lazy_pinyin
from pypinyin.contrib.tone_convert import to_initials from pypinyin.contrib.tone_convert import to_initials
@@ -304,6 +305,7 @@ class PinyinInputDataset(IterableDataset):
string_list.append(text[start_pos:end_pos]) string_list.append(text[start_pos:end_pos])
# 用|连接所有字符串 # 用|连接所有字符串
part4 = "|".join(string_list) part4 = "|".join(string_list)
try:
labels = [ labels = [
self.query_engine.get_char_info_by_char_pinyin(c, p).id self.query_engine.get_char_info_by_char_pinyin(c, p).id
for c, p in zip( for c, p in zip(
@@ -311,6 +313,11 @@ class PinyinInputDataset(IterableDataset):
pinyin_list[i : i + pinyin_len], pinyin_list[i : i + pinyin_len],
) )
] ]
except AttributeError as e:
logger.error(
f"e: {e}, (text, pinyin): {text[i : i + pinyin_len]} - {pinyin_list[i : i + pinyin_len]}"
)
continue
if random.random() <= 0.1: if random.random() <= 0.1:
labels.append(0) labels.append(0)