SUIME/src/vocabs.rs

74 lines
2.3 KiB
Rust
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

use anyhow::{Context, Result};
use serde::Deserialize;
use std::collections::HashMap;
use std::fs::File;
use std::io::BufReader;
use std::path::Path;
/// Information about a single character–pinyin pair.
#[derive(Clone, Debug, Deserialize)]
pub struct CharInfo {
    /// Numeric identifier of this pair.
    pub id: u32,

    /// The character itself; deserialized from the `char` key.
    #[serde(rename = "char")]
    pub character: String,

    /// Pinyin associated with the character.
    pub pinyin: String,

    /// Number of occurrences recorded for this pair.
    pub count: u64,
}
/// Root structure of the JSON document.
///
/// Only the fields this module needs are declared here.
#[derive(Debug, Deserialize)]
struct RawStatistics {
    /// Entries keyed by the string form of their ID.
    pairs: HashMap<String, CharInfo>,
    // Any other metadata fields in the document are ignored.
}
/// Dictionary lookup engine mapping a numeric ID to its [`CharInfo`].
///
/// Lookups are served from a `HashMap`, so they take expected constant time.
#[derive(Debug)]
pub struct Dictionary {
    /// Loaded entries, indexed by numeric ID.
    id_to_charinfo: HashMap<u32, CharInfo>,
}
impl Dictionary {
    /// Loads a dictionary from a JSON file.
    ///
    /// The root object of the file must contain a `pairs` object whose keys
    /// are decimal ID strings and whose values deserialize into [`CharInfo`].
    ///
    /// # Errors
    ///
    /// Returns an error if the file cannot be opened, if the JSON cannot be
    /// parsed into the expected structure, or if any key of `pairs` is not a
    /// valid `u32`.
    pub fn from_json_file<P: AsRef<Path>>(path: P) -> Result<Self> {
        let file = File::open(path).context("无法打开字典文件")?;
        let raw: RawStatistics =
            serde_json::from_reader(BufReader::new(file)).context("无法解析 JSON 字典")?;

        let mut id_to_charinfo = HashMap::with_capacity(raw.pairs.len());
        for (id_str, info) in raw.pairs {
            // The key is parsed only to reject malformed IDs. The input data
            // is trusted, so the parsed key is not compared against `info.id`;
            // entries are indexed by `info.id`.
            id_str
                .parse::<u32>()
                .with_context(|| format!("无效的 ID 字符串: {}", id_str))?;
            id_to_charinfo.insert(info.id, info);
        }

        Ok(Dictionary { id_to_charinfo })
    }

    /// Returns the character for `id` (used to fill `Candidate.text`),
    /// or `None` if the ID is not in the dictionary.
    pub fn get_char_by_id(&self, id: u32) -> Option<&str> {
        self.id_to_charinfo
            .get(&id)
            .map(|info| info.character.as_str())
    }

    /// Returns the pinyin for `id`, or `None` if the ID is not in the dictionary.
    pub fn get_pinyin_by_id(&self, id: u32) -> Option<&str> {
        self.id_to_charinfo
            .get(&id)
            .map(|info| info.pinyin.as_str())
    }

    /// Returns the occurrence count for `id`, or `None` if the ID is not in
    /// the dictionary.
    pub fn get_count_by_id(&self, id: u32) -> Option<u64> {
        self.id_to_charinfo.get(&id).map(|info| info.count)
    }

    /// Returns a reference to the full [`CharInfo`] for `id`, or `None` if the
    /// ID is not in the dictionary.
    pub fn get_char_info(&self, id: u32) -> Option<&CharInfo> {
        self.id_to_charinfo.get(&id)
    }

    /// Returns the number of entries stored in the dictionary.
    pub fn len(&self) -> usize {
        self.id_to_charinfo.len()
    }

    /// Returns `true` if the dictionary contains no entries.
    pub fn is_empty(&self) -> bool {
        self.id_to_charinfo.is_empty()
    }
}