15 个不稳定版本
0.8.0 | 2024年2月27日 |
---|---|
0.7.3 | 2023年1月19日 |
0.6.3 | 2023年1月12日 |
0.6.2 | 2022年7月16日 |
#195 in 文本处理
每月下载量 199
135KB
4K SLoC
ultra-nlp
安装
cargo add ultra-nlp
使用方法
ngrams
let text = "你好世界";
let result = ngrams(text, 2);
assert_eq!(
result
.into_iter()
.collect::<Vec<&str>>(),
vec!["你好", "好世", "世界"]
);
extract_consecutive_chinese_chars
let text = "foo中文bar字符baz";
let result = extract_consecutive_chinese_chars(text);
assert_eq!(
result
.into_iter()
.collect::<Vec<&str>>(),
vec!["中文", "字符"]
);
extract_consecutive_letters
let text = "foo中文,bar,字符baz";
let result = extract_consecutive_letters(text);
assert_eq!(
result
.into_iter()
.collect::<Vec<&str>>(),
vec!["foo中文", "bar", "字符baz"]
);
cedarwood(slow, low memory usage)
忽略不匹配的内容
use ultra_nlp::BehaviorForUnmatched;
use ultra_nlp::cedarwood::{
segment_fully,
ForwardDictionary,
};
let text = " 南京市长江大桥, hello world ";
let dict = ForwardDictionary::new(
vec!["南京", "南京市", "市长", "长江", "大桥", "你好世界"]
).unwrap();
let result = segment_fully(
text,
&dict,
BehaviorForUnmatched::Ignore
);
assert_eq!(
result
.iter()
.map(|x| x.range().extract(text))
.collect::<Vec<_>>(),
vec!["南京", "南京市", "市长", "长江", "大桥"]
);
将不匹配的内容作为字符保留
use ultra_nlp::BehaviorForUnmatched;
use ultra_nlp::cedarwood::{
segment_fully,
ForwardDictionary,
};
let text = " 南京市长江大桥, hello world ";
let dict = ForwardDictionary::new(
vec!["南京", "南京市", "市长", "长江", "大桥", "你好世界"]
).unwrap();
let result = segment_fully(
text,
&dict,
BehaviorForUnmatched::KeepAsChars
);
assert_eq!(
result
.iter()
.map(|x| x.range().extract(text))
.collect::<Vec<_>>(),
vec![
" ", "南京", "南京市", "市长", "长江", "大桥", ",", " ", "h", "e", "l", "l", "o", " ", "w", "o", "r", "l", "d", " ",
]
);
将不匹配的内容作为单词保留
use ultra_nlp::BehaviorForUnmatched;
use ultra_nlp::cedarwood::{
segment_fully,
ForwardDictionary,
};
let text = " 南京市长江大桥, hello world ";
let dict = ForwardDictionary::new(
vec!["南京", "南京市", "市长", "长江", "大桥", "你好世界"]
).unwrap();
let result = segment_fully(
text,
&dict,
BehaviorForUnmatched::KeepAsWords
);
assert_eq!(
result
.iter()
.map(|x| x.range().extract(text))
.collect::<Vec<_>>(),
vec![
" ", "南京", "南京市", "市长", "长江", "大桥", ", hello world ",
]
);
daachorse(fast, high memory usage)
忽略不匹配的内容
use ultra_nlp::BehaviorForUnmatched;
use ultra_nlp::daachorse::{
segment_fully,
StandardDictionary,
};
let text = " 南京市长江大桥, hello world ";
let dict = StandardDictionary::new(
vec!["南京", "南京市", "市长", "长江", "大桥", "你好世界"]
).unwrap();
let result = segment_fully(text, &dict, BehaviorForUnmatched::Ignore);
assert_eq!(
result
.iter()
.map(|x| x.range().extract(text))
.collect::<Vec<_>>(),
vec![
"南京", "南京市", "市长", "长江", "大桥",
]
);
将不匹配的内容作为字符保留
use ultra_nlp::BehaviorForUnmatched;
use ultra_nlp::daachorse::{
segment_fully,
StandardDictionary,
};
let text = " 南京市长江大桥, hello world ";
let dict = StandardDictionary::new(
vec!["南京", "南京市", "市长", "长江", "大桥", "你好世界"]
).unwrap();
let result = segment_fully(text, &dict, BehaviorForUnmatched::KeepAsChars);
assert_eq!(
result
.iter()
.map(|x| x.range().extract(text))
.collect::<Vec<_>>(),
vec![
" ", "南京", "南京市", "市长", "长江", "大桥", ",", " ", "h", "e", "l", "l", "o", " ", "w", "o", "r", "l", "d", " ",
]
);
将不匹配的内容作为单词保留
use ultra_nlp::BehaviorForUnmatched;
use ultra_nlp::daachorse::{
segment_fully,
StandardDictionary,
};
let text = " 南京市长江大桥, hello world ";
let dict = StandardDictionary::new(
vec!["南京", "南京市", "市长", "长江", "大桥", "你好世界"]
).unwrap();
let result = segment_fully(text, &dict, BehaviorForUnmatched::KeepAsWords);
assert_eq!(
result
.iter()
.map(|x| x.range().extract(text))
.collect::<Vec<_>>(),
vec![
" ", "南京", "南京市", "市长", "长江", "大桥", ", hello world ",
]
);
依赖
~2.4–3.5MB
~60K SLoC