Merge branch 'main' of https://gitea.winkinshly.site/songsenand/SUIME

feat(tokenizer): 添加自定义分词器配置文件
2026-03-01 10:49:56 +08:00 · 2026-03-01 10:48:52 +08:00
4 changed files with 21465 additions and 1 deletions
--- a/Cargo.toml
+++ b/Cargo.toml
@ -4,3 +4,8 @@ version = "0.1.0"
 edition = "2024"
 [dependencies]
 anyhow = "1.0.102"
 lazy_static = "1.5.0"
 ndarray = "0.17.2"
 tempfile = "3.26.0"
 tokenizers = "0.22.2"
--- a/assets/tokenizer.json
+++ b/assets/tokenizer.json
--- a/src/main.rs
+++ b/src/main.rs
@ -1,3 +1,26 @@
 mod tokenizers;
 use std::path::PathBuf;
 /// Demo entry point: loads the bundled tokenizer definition, encodes one
 /// sample text/pinyin pair and prints the resulting model input.
 ///
 /// Failures are reported on stderr; the process does not panic or return a
 /// non-zero status on error.
 fn main() {
-    println!("Hello, world!");
+
    // Build `<crate root>/assets/tokenizer.json`. `env!` expands
    // `CARGO_MANIFEST_DIR` at compile time, so the path refers to the
    // directory the crate was built from.
    let mut tokenizer_json_path = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
    tokenizer_json_path.push("assets");
    tokenizer_json_path.push("tokenizer.json");
    // Example: using HFTokenizer
    match tokenizers::HFTokenizer::new(tokenizer_json_path) {
        Ok(mut tokenizer) => {
            // First argument is the text, second is the pinyin string.
            match tokenizer.gen_predict_sample("hello world", "ni hao") {
                Ok(model_input) => {
                    println!("Model input generated successfully");
                    println!("Input IDs: {:?}", model_input.input_ids);
                    // `pg` is a one-element 1-D array; print its only value.
                    println!("PG value: {}", model_input.pg[[0]]);
                }
                Err(e) => eprintln!("Error generating model input: {}", e),
            }
        }
        Err(e) => eprintln!("Error loading tokenizer: {}", e),
    }
 }
--- a/src/tokenizers.rs
+++ b/src/tokenizers.rs
@ -0,0 +1,158 @@
 // tokenizer.rs
 use anyhow::Result;
 use ndarray::{Array1, Array2};
 use std::collections::HashMap;
 use std::path::Path;
 use tokenizers::{EncodeInput, Encoding, Tokenizer};
 // Pinyin-group (PG) table: maps the first letter of a pinyin string to its
 // group id (0..=11). Each row below lists the letters sharing one group.
 // Letters without an entry ('i', 'u', 'v') and any other character fall back
 // to the default group chosen by the lookup code.
 // This slice is the seed data for the lazily built `PG` HashMap.
 const PG_MAP: &[(&str, i64)] = &[
    ("r", 0), ("l", 0),
    ("p", 1), ("d", 1),
    ("h", 2), ("f", 2),
    ("g", 3), ("m", 3),
    ("z", 4), ("o", 4),
    ("t", 5), ("q", 5),
    ("b", 6), ("w", 6),
    ("j", 7), ("e", 7),
    ("k", 8), ("c", 8),
    ("s", 9), ("a", 9),
    ("n", 10), ("x", 10),
    ("y", 11),
 ];
 // Lookup table built once, on first access, via lazy_static: every
 // (letter, group) pair of `PG_MAP` becomes an owned-String key in the map.
 lazy_static::lazy_static! {
    static ref PG: HashMap<String, i64> = PG_MAP
        .iter()
        .map(|&(letter, group)| (letter.to_string(), group))
        .collect();
 }
 // Model input bundle: groups the encoded text/pinyin pair (the "hint":
 // input_ids, attention_mask, token_type_ids) together with the pinyin-group
 // (pg) value.
 // All fields are ndarray arrays with i64 elements.
 #[derive(Debug)]
 pub struct ModelInput {
    pub input_ids: Array2<i64>,      // (batch_size, sequence_length)
    pub attention_mask: Array2<i64>, // (batch_size, sequence_length)
    pub token_type_ids: Array2<i64>, // (batch_size, sequence_length)
    pub pg: Array1<i64>, // 1-D vector holding the pinyin-group id(s)
 }
 /// Wraps the `tokenizers` crate's [`Tokenizer`] and exposes an interface that
 /// produces inputs intended for an ONNX Runtime model.
 pub struct HFTokenizer {
    /// The underlying `tokenizers` crate [`Tokenizer`] instance.
    tokenizer: Tokenizer,
 }
 impl HFTokenizer {
    /// 创建一个新的 HFTokenizer 实例
    /// `tokenizer_path_or_name`: 指向预训练 tokenizer 配置文件的路径 (例如 tokenizer.json)
    pub fn new<P: AsRef<Path>>(tokenizer_path_or_name: P) -> Result<Self> {
        // 加载 tokenizer 配置文件 (通常名为 tokenizer.json)
        let tokenizer = Tokenizer::from_file(tokenizer_path_or_name.as_ref()).map_err(|e| {
            anyhow::anyhow!(
                "Failed to load tokenizer from {:?}: {}",
                tokenizer_path_or_name.as_ref(),
                e
            )
        })?;
        Ok(HFTokenizer { tokenizer })
    }
    /// 使用 `tokenizers` crate 的 EncodeInput 构造函数来处理文本对
    /// `text` - 第一句输入 (通常是文本)
    /// `py` - 第二句输入 (通常是拼音)
    fn encode_pair(&mut self, text: &str, py: &str) -> Result<Encoding> {
        let input = EncodeInput::Dual(text.into(), py.into());
        // encode 方法会应用 pre_tokenizer, normalizer, post_processor, truncation, padding
        let encoding = self
            .tokenizer
            .encode(input, true) // true for add_special_tokens
            .map_err(|e| anyhow::anyhow!("Tokenization error: {}", e))?;
        Ok(encoding)
    }
    /// 生成用于预测的样本数据
    ///
    /// # Arguments
    ///
    /// * `text` - 输入的文本字符串
    /// * `py` - 输入的拼音字符串
    ///
    /// # Returns
    ///
    /// * `Result<ModelInput>` - 包含模型所需输入的结构体，包含 hint (input_ids, attention_mask, token_type_ids) 和 pg array
    pub fn gen_predict_sample(&mut self, text: &str, py: &str) -> Result<ModelInput> {
        let encoding = self.encode_pair(text, py)?;
        // 获取分词结果
        let input_ids: Vec<i32> = encoding.get_ids().iter().map(|&id| id as i32).collect();
        let attention_mask: Vec<i32> = encoding
            .get_attention_mask()
            .iter()
            .map(|&mask| mask as i32)
            .collect();
        // token_type_ids 是由 post_processor (如 BertProcessing) 自动生成的
        let token_type_ids: Vec<i32> = encoding
            .get_type_ids()
            .iter()
            .map(|&ty| ty as i32)
            .collect();
        // 获取序列长度
        let seq_len = input_ids.len();
        // 转换为 ndarray::Array，并重塑为 (batch_size=1, sequence_length)
        // ONNX Runtime 通常期望明确的批次维度
        let input_ids_nd = Array1::from_vec(input_ids)
            .into_shape_with_order((1, seq_len))
            .unwrap();
        let attention_mask_nd = Array1::from_vec(attention_mask)
            .into_shape_with_order((1, seq_len))
            .unwrap();
        let token_type_ids_nd = Array1::from_vec(token_type_ids)
            .into_shape_with_order((1, seq_len))
            .unwrap();
        // --- PG Logic ---
        let py_first_char = py
            .chars()
            .next()
            .map(|c| c.to_string())
            .unwrap_or_else(|| "unknown".to_string());
        let pg_val = *PG.get(&py_first_char).unwrap_or(&12); // Default to 12 if key not found
        let pg_nd = Array1::from_elem(1, pg_val); // 直接创建一维数组
        Ok(ModelInput {
            input_ids: input_ids_nd.mapv(|x| x as i64), // Convert i32 to i64 for ONNX
            attention_mask: attention_mask_nd.mapv(|x| x as i64),
            token_type_ids: token_type_ids_nd.mapv(|x| x as i64),
            pg: pg_nd.mapv(|x| x as i64),
        })
    }
    /*
    /// 预测函数 (此函数通常在模型结构体内实现，此处仅为演示如何调用 gen_predict_sample)
    /// 这里只是展示如何准备输入，实际的 forward pass 需要在模型的 impl 中完成
    pub fn prepare_for_prediction(&mut self, text: &str, py: &str) -> Result<ModelInput> {
        self.gen_predict_sample(text, py)
    }*/
 }
Author	SHA1	Message	Date
songsenand	c143d793ec	Merge branch 'main' of https://gitea.winkinshly.site/songsenand/SUIME	2026-03-01 10:49:56 +08:00
songsenand	085d90b5d3	feat(tokenizer): 添加自定义分词器配置文件	2026-03-01 10:48:52 +08:00