fix(dataset): 添加异常捕获防止标签生成失败

This commit is contained in:
songsenand 2026-04-06 06:05:12 +08:00
parent 493bfdec1a
commit 6ad003133c
1 changed file with 13 additions and 6 deletions

View File

@@ -7,6 +7,7 @@ from typing import Dict, List, Tuple
import numpy as np import numpy as np
import torch import torch
from datasets import load_dataset from datasets import load_dataset
from loguru import logger
from modelscope import AutoTokenizer from modelscope import AutoTokenizer
from pypinyin import lazy_pinyin from pypinyin import lazy_pinyin
from pypinyin.contrib.tone_convert import to_initials from pypinyin.contrib.tone_convert import to_initials
@@ -304,13 +305,19 @@ class PinyinInputDataset(IterableDataset):
string_list.append(text[start_pos:end_pos]) string_list.append(text[start_pos:end_pos])
# 用|连接所有字符串 # 用|连接所有字符串
part4 = "|".join(string_list) part4 = "|".join(string_list)
labels = [ try:
self.query_engine.get_char_info_by_char_pinyin(c, p).id labels = [
for c, p in zip( self.query_engine.get_char_info_by_char_pinyin(c, p).id
text[i : i + pinyin_len], for c, p in zip(
pinyin_list[i : i + pinyin_len], text[i : i + pinyin_len],
pinyin_list[i : i + pinyin_len],
)
]
except AttributeError as e:
logger.error(
f"e: {e}, (text, pinyin): {text[i : i + pinyin_len]} - {pinyin_list[i : i + pinyin_len]}"
) )
] continue
if random.random() <= 0.1: if random.random() <= 0.1:
labels.append(0) labels.append(0)