15 个不稳定版本

0.8.0 2024年2月27日
0.7.3 2023年1月19日
0.6.3 2023年1月12日
0.6.2 2022年7月16日

#195 in 文本处理

Download history 6/week @ 2024-03-09 39/week @ 2024-03-30 1/week @ 2024-04-06 57/week @ 2024-04-13

每月下载量 199

MIT 许可证

135KB
4K SLoC

ultra-nlp

安装

cargo add ultra-nlp

使用方法

ngrams

let text = "你好世界";

let result = ngrams(text, 2);

assert_eq!(
    result
        .into_iter()
        .collect::<Vec<&str>>(),
    vec!["你好", "好世", "世界"]
);

extract_consecutive_chinese_chars

let text = "foo中文bar字符baz";

let result = extract_consecutive_chinese_chars(text);

assert_eq!(
    result
        .into_iter()
        .collect::<Vec<&str>>(),
    vec!["中文", "字符"]
);

extract_consecutive_letters

let text = "foo中文,bar,字符baz";

let result = extract_consecutive_letters(text);

assert_eq!(
    result
        .into_iter()
        .collect::<Vec<&str>>(),
    vec!["foo中文", "bar", "字符baz"]
);

cedarwood(slow, low memory usage)

忽略不匹配的内容

use ultra_nlp::BehaviorForUnmatched;
use ultra_nlp::cedarwood::{
    segment_fully,
    ForwardDictionary,
};

let text = " 南京市长江大桥, hello world ";
let dict = ForwardDictionary::new(
    vec!["南京", "南京市", "市长", "长江", "大桥", "你好世界"]
).unwrap();

let result = segment_fully(
    text,
    &dict,
    BehaviorForUnmatched::Ignore
);

assert_eq!(
    result
        .iter()
        .map(|x| x.range().extract(text))
        .collect::<Vec<_>>(),
    vec!["南京", "南京市", "市长", "长江", "大桥"]
);

将不匹配的内容作为字符保留

use ultra_nlp::BehaviorForUnmatched;
use ultra_nlp::cedarwood::{
    segment_fully,
    ForwardDictionary,
};

let text = " 南京市长江大桥, hello world ";
let dict = ForwardDictionary::new(
    vec!["南京", "南京市", "市长", "长江", "大桥", "你好世界"]
).unwrap();

let result = segment_fully(
    text,
    &dict,
    BehaviorForUnmatched::KeepAsChars
);

assert_eq!(
    result
        .iter()
        .map(|x| x.range().extract(text))
        .collect::<Vec<_>>(),
    vec![
        " ", "南京", "南京市", "市长", "长江", "大桥", ",", " ", "h", "e", "l", "l", "o", " ", "w", "o", "r", "l", "d", " ",
    ]
);

将不匹配的内容作为单词保留

use ultra_nlp::BehaviorForUnmatched;
use ultra_nlp::cedarwood::{
    segment_fully,
    ForwardDictionary,
};

let text = " 南京市长江大桥, hello world ";
let dict = ForwardDictionary::new(
    vec!["南京", "南京市", "市长", "长江", "大桥", "你好世界"]
).unwrap();

let result = segment_fully(
    text,
    &dict,
    BehaviorForUnmatched::KeepAsWords
);

assert_eq!(
    result
        .iter()
        .map(|x| x.range().extract(text))
        .collect::<Vec<_>>(),
    vec![
        " ", "南京", "南京市", "市长", "长江", "大桥", ", hello world ",
    ]
);

daachorse(fast, high memory usage)

忽略不匹配的内容

use ultra_nlp::BehaviorForUnmatched;
use ultra_nlp::daachorse::{
    segment_fully,
    StandardDictionary,
};

let text = " 南京市长江大桥, hello world ";
let dict = StandardDictionary::new(
    vec!["南京", "南京市", "市长", "长江", "大桥", "你好世界"]
).unwrap();

let result = segment_fully(text, &dict, BehaviorForUnmatched::Ignore);

assert_eq!(
    result
        .iter()
        .map(|x| x.range().extract(text))
        .collect::<Vec<_>>(),
    vec![
      "南京", "南京市", "市长", "长江", "大桥",
    ]
);

将不匹配的内容作为字符保留

use ultra_nlp::BehaviorForUnmatched;
use ultra_nlp::daachorse::{
    segment_fully,
    StandardDictionary,
};

let text = " 南京市长江大桥, hello world ";
let dict = StandardDictionary::new(
    vec!["南京", "南京市", "市长", "长江", "大桥", "你好世界"]
).unwrap();

let result = segment_fully(text, &dict, BehaviorForUnmatched::KeepAsChars);

assert_eq!(
    result
        .iter()
        .map(|x| x.range().extract(text))
        .collect::<Vec<_>>(),
    vec![
        " ", "南京", "南京市", "市长", "长江", "大桥", ",", " ", "h", "e", "l", "l", "o", " ", "w", "o", "r", "l", "d", " ",
    ]
);

将不匹配的内容作为单词保留

use ultra_nlp::BehaviorForUnmatched;
use ultra_nlp::daachorse::{
    segment_fully,
    StandardDictionary,
};

let text = " 南京市长江大桥, hello world ";
let dict = StandardDictionary::new(
    vec!["南京", "南京市", "市长", "长江", "大桥", "你好世界"]
).unwrap();

let result = segment_fully(text, &dict, BehaviorForUnmatched::KeepAsWords);

assert_eq!(
    result
        .iter()
        .map(|x| x.range().extract(text))
        .collect::<Vec<_>>(),
    vec![
        " ", "南京", "南京市", "市长", "长江", "大桥", ", hello world ",
    ]
);

依赖

~2.4–3.5MB
~60K SLoC