fix(dataset): 添加异常捕获防止标签生成失败
This commit is contained in:
parent
493bfdec1a
commit
6ad003133c
|
|
@@ -7,6 +7,7 @@ from typing import Dict, List, Tuple
|
||||||
import numpy as np
|
import numpy as np
|
||||||
import torch
|
import torch
|
||||||
from datasets import load_dataset
|
from datasets import load_dataset
|
||||||
|
from loguru import logger
|
||||||
from modelscope import AutoTokenizer
|
from modelscope import AutoTokenizer
|
||||||
from pypinyin import lazy_pinyin
|
from pypinyin import lazy_pinyin
|
||||||
from pypinyin.contrib.tone_convert import to_initials
|
from pypinyin.contrib.tone_convert import to_initials
|
||||||
|
|
@@ -304,13 +305,19 @@ class PinyinInputDataset(IterableDataset):
|
||||||
string_list.append(text[start_pos:end_pos])
|
string_list.append(text[start_pos:end_pos])
|
||||||
# 用|连接所有字符串
|
# 用|连接所有字符串
|
||||||
part4 = "|".join(string_list)
|
part4 = "|".join(string_list)
|
||||||
labels = [
|
try:
|
||||||
self.query_engine.get_char_info_by_char_pinyin(c, p).id
|
labels = [
|
||||||
for c, p in zip(
|
self.query_engine.get_char_info_by_char_pinyin(c, p).id
|
||||||
text[i : i + pinyin_len],
|
for c, p in zip(
|
||||||
pinyin_list[i : i + pinyin_len],
|
text[i : i + pinyin_len],
|
||||||
|
pinyin_list[i : i + pinyin_len],
|
||||||
|
)
|
||||||
|
]
|
||||||
|
except AttributeError as e:
|
||||||
|
logger.error(
|
||||||
|
f"e: {e}, (text, pinyin): {text[i : i + pinyin_len]} - {pinyin_list[i : i + pinyin_len]}"
|
||||||
)
|
)
|
||||||
]
|
continue
|
||||||
if random.random() <= 0.1:
|
if random.random() <= 0.1:
|
||||||
labels.append(0)
|
labels.append(0)
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue