11 个稳定版本
版本 | 发布日期 |
---|---|
3.1.6 | 2022 年 12 月 26 日 |
3.1.5 | 2021 年 10 月 18 日 |
3.1.4 | 2021 年 8 月 26 日 |
3.1.3 | 2021 年 6 月 7 日 |
1.0.0 | 2021 年 5 月 7 日 |
在“解析器实现”类别中排名 #1019
105KB
1K SLoC
parsercher
解析和搜索标签文档(例如 HTML、XML)。
parsercher 用于解析 HTML、XML 等标签文档。
- 从标签文档创建 Dom 结构树。
- 从 Dom 结构树中搜索标签和文本。
- 从 Dom 结构树中搜索子树。
使用方法
将以下内容添加到您的 Cargo.toml 中:
[dependencies]
parsercher = "3.1.6"
许可协议
示例
从 HTML 中获取文本的示例。
从 HTML 创建 Dom 结构树,并获取 `class` 属性值为 `target` 的 `li` 标签的文本。
use parsercher;
use parsercher::dom::Tag;
let html = r#"
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<title>sample html</title>
</head>
<body>
<ol>
<li class="target">first</li>
<li>second</li>
<li class="target">therd</li>
</ol>
</body>
</html>
"#;
if let Ok(root_dom) = parsercher::parse(&html) {
let mut needle = Tag::new("li");
needle.set_attr("class", "target");
if let Some(texts) = parsercher::search_text_from_tag_children(&root_dom, &needle) {
assert_eq!(texts.len(), 2);
assert_eq!(texts[0], "first".to_string());
assert_eq!(texts[1], "therd".to_string());
}
}
从 Dom 结构树中搜索子树的示例。
查找满足以下条件的子树:`ul` 标签的 `class` 属性值为 `targetList`,且其下有两个 `li` 标签;此外,这两个 `li` 标签的 `class` 属性值必须分别为 `key1` 和 `key2`。
查找内容
<ul class="targetList">
<li class="key1"></li>
<li class="key2"></li>
</ul>
use parsercher;
let doc = r#"
<body>
<ul id="list1" class="targetList">
<li class="key1">1-1</li>
<li class="key2">
<span>1-2</span>
</li>
</ul>
<ul id="list2">
<li class="key1">2-1</li>
<li>2-2</li>
</ul>
<div>
<div>
<ul class="targetList">
<ul id="list3" class="targetList">
<li class="key1">3-1</li>
<li class="item">3-2</li>
<li class="key2">3-3</li>
</ul>
</ul>
</div>
</div>
<ul id="list4">
<li class="key1">4-1</li>
<li class="key2">4-2</li>
</ul>
</body>
"#;
let root_dom = parsercher::parse(&doc).unwrap();
let needle = r#"
<ul class="targetList">
<li class="key1"></li>
<li class="key2"></li>
</ul>
"#;
let result = root_dom.search(&needle).unwrap().unwrap();
for dom in result.iter() {
parsercher::print_dom_tree(&dom);
}
输出结果
<ul class="targetList" id="list1">
<li class="key1">
TEXT: "1-1"
<li class="key2">
<span>
TEXT: "1-2"
<ul class="targetList" id="list3">
<li class="key1">
TEXT: "3-1"
<li class="item">
TEXT: "3-2"
<li class="key2">
TEXT: "3-3"
更复杂的 Dom 结构树示例
use parsercher;
let html = r#"
<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<title>sample html</title>
</head>
<body>
<h1>Hello, world!</h1>
<div id="content"></div>
<ol>
<li>first</li>
<li>second</li>
<li>therd</li>
</ol>
<!-- All script code becomes one text -->
<script>
let content = document.getElementById('content');
content.textContent = 'content';
</script>
</body>
</html>
"#;
if let Ok(dom) = parsercher::parse(&html) {
println!("{:#?}", dom);
}
输出结果
Dom {
dom_type: Tag,
tag: Some(
Tag {
name: "root",
attr: None,
terminated: false,
terminator: false,
},
),
text: None,
comment: None,
children: Some(
[
Dom {
dom_type: Tag,
tag: Some(
Tag {
name: "!DOCTYPE",
attr: Some(
{
"html": "",
},
),
terminated: false,
terminator: false,
},
),
text: None,
comment: None,
children: None,
},
Dom {
dom_type: Tag,
tag: Some(
Tag {
name: "html",
attr: None,
terminated: false,
terminator: false,
},
),
text: None,
comment: None,
children: Some(
[
Dom {
dom_type: Tag,
tag: Some(
Tag {
name: "head",
attr: None,
terminated: false,
terminator: false,
},
),
text: None,
comment: None,
children: Some(
[
Dom {
dom_type: Tag,
tag: Some(
Tag {
name: "meta",
attr: Some(
{
"charset": "UTF-8",
},
),
terminated: false,
terminator: false,
},
),
text: None,
comment: None,
children: None,
},
Dom {
dom_type: Tag,
tag: Some(
Tag {
name: "title",
attr: None,
terminated: false,
terminator: false,
},
),
text: None,
comment: None,
children: Some(
[
Dom {
dom_type: Text,
tag: None,
text: Some(
Text {
text: "sample html",
},
),
comment: None,
children: None,
},
],
),
},
],
),
},
Dom {
dom_type: Tag,
tag: Some(
Tag {
name: "body",
attr: None,
terminated: false,
terminator: false,
},
),
text: None,
comment: None,
children: Some(
[
Dom {
dom_type: Tag,
tag: Some(
Tag {
name: "h1",
attr: None,
terminated: false,
terminator: false,
},
),
text: None,
comment: None,
children: Some(
[
Dom {
dom_type: Text,
tag: None,
text: Some(
Text {
text: "Hello, world!",
},
),
comment: None,
children: None,
},
],
),
},
Dom {
dom_type: Tag,
tag: Some(
Tag {
name: "div",
attr: Some(
{
"id": "content",
},
),
terminated: false,
terminator: false,
},
),
text: None,
comment: None,
children: None,
},
Dom {
dom_type: Tag,
tag: Some(
Tag {
name: "ol",
attr: None,
terminated: false,
terminator: false,
},
),
text: None,
comment: None,
children: Some(
[
Dom {
dom_type: Tag,
tag: Some(
Tag {
name: "li",
attr: None,
terminated: false,
terminator: false,
},
),
text: None,
comment: None,
children: Some(
[
Dom {
dom_type: Text,
tag: None,
text: Some(
Text {
text: "first",
},
),
comment: None,
children: None,
},
],
),
},
Dom {
dom_type: Tag,
tag: Some(
Tag {
name: "li",
attr: None,
terminated: false,
terminator: false,
},
),
text: None,
comment: None,
children: Some(
[
Dom {
dom_type: Text,
tag: None,
text: Some(
Text {
text: "second",
},
),
comment: None,
children: None,
},
],
),
},
Dom {
dom_type: Tag,
tag: Some(
Tag {
name: "li",
attr: None,
terminated: false,
terminator: false,
},
),
text: None,
comment: None,
children: Some(
[
Dom {
dom_type: Text,
tag: None,
text: Some(
Text {
text: "therd",
},
),
comment: None,
children: None,
},
],
),
},
],
),
},
Dom {
dom_type: Comment,
tag: None,
text: None,
comment: Some(
Comment {
comment: " All script code becomes one text ",
},
),
children: None,
},
Dom {
dom_type: Tag,
tag: Some(
Tag {
name: "script",
attr: None,
terminated: false,
terminator: false,
},
),
text: None,
comment: None,
children: Some(
[
Dom {
dom_type: Text,
tag: None,
text: Some(
Text {
text: "\n let content = document.getElementById(\'content\');\n content.textContent = \'content\';\n",
},
),
comment: None,
children: None,
},
],
),
},
],
),
},
],
),
},
],
),
}