1 个不稳定版本
0.1.0 | 2018年11月28日 |
---|
37 in #deserializer
15KB
431 行
一个魔法 HTML 解析器。
目录
派生目标
结构体
基本用法
#[macro_use]
extern crate unhtml_derive;
extern crate unhtml;
use unhtml::{self, FromHtml};
// Basic usage: derive `FromHtml` for a struct. The top-level selector `#test`
// picks the element the field selectors are applied to.
#[derive(FromHtml)]
#[html(selector = "#test")]
struct SingleUser {
// `attr = "inner"` reads the inner HTML of the selected `<p>`.
#[html(selector = "p:nth-child(1)", attr = "inner")]
name: String,
#[html(selector = "p:nth-child(2)", attr = "inner")]
age: u8,
#[html(selector = "p:nth-child(3)", attr = "inner")]
like_lemon: bool,
}
// Parse a complete document; the three `<p>` elements supply the three fields.
let user = SingleUser::from_html(r#"<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>Title</title>
</head>
<body>
<div id="test">
<div>
<p>Hexilee</p>
<p>20</p>
<p>true</p>
</div>
</div>
</body>
</html>"#).unwrap();
assert_eq!("Hexilee", &user.name);
assert_eq!(20, user.age);
assert!(user.like_lemon);
属性
html
目标
derive target
或 field
规范
#[html(selector = "...", attr = "...", default = ...)]
selector
、attr
、default
或 html
本身都可以省略。
这是有效的
#[macro_use]
extern crate unhtml_derive;
extern crate unhtml;
use unhtml::{self, FromHtml};
// Valid even without any `html` attribute: `selector`, `attr` and `default`
// all fall back to their documented default behaviour.
#[derive(FromHtml)]
struct SingleString {
value: String,
}
selector
目标
derive target
或 field
文字类型
字符串
规范
selector 必须是有效的 CSS 选择器;无效的选择器会在编译期触发 panic
// panic (compile time): "<>" is not a valid CSS selector, so deriving
// `FromHtml` with it as the top-level selector fails.
#[macro_use]
extern crate unhtml_derive;
extern crate unhtml;
use unhtml::{self, FromHtml};
#[derive(FromHtml)]
#[html(selector = "<>")]
struct SingleUser {}
// panic (compile time): the same invalid selector "<>", this time on a field.
#[macro_use]
extern crate unhtml_derive;
extern crate unhtml;
use unhtml::*;
#[derive(FromHtml)]
struct SingleUser {
#[html(selector = "<>", attr = "inner")]
name: String,
}
如果选择多个元素且字段类型不是 Vec
,则选择第一个
#[macro_use]
extern crate unhtml_derive;
extern crate unhtml;
use unhtml::{self, FromHtml};
// The selector `a` matches two elements below; because the target is not a
// `Vec`, only the first match is used.
#[derive(FromHtml)]
#[html(selector = "a")]
struct Link {
#[html(attr = "href")]
href: String,
#[html(attr = "inner")]
value: String,
}
let link = Link::from_html(r#"
<a href="https://github.com">Github</a>
<a href="https://google.com">Google</a>
"#).unwrap();
// Both fields come from the first `<a>`.
assert_eq!("https://github.com", &link.href);
assert_eq!("Github", &link.value);
默认行为
根元素的 HTML
#[macro_use]
extern crate unhtml_derive;
extern crate unhtml;
use unhtml::{self, FromHtml};
// No selector given: the fields are read from the root element of the source HTML.
#[derive(FromHtml)]
struct Link {
#[html(attr = "href")]
href: String,
#[html(attr = "inner")]
value: String,
}
let link = Link::from_html(r#"<a href="https://github.com">Github</a>"#).unwrap();
assert_eq!("https://github.com", &link.href);
assert_eq!("Github", &link.value);
attr
目标
字段
文字类型
字符串
规范
inner
指的是innerHtml
- 任何其他
attr
指的是html 元素属性
#[macro_use]
extern crate unhtml_derive;
extern crate unhtml;
use unhtml::{self, FromHtml};
// `attr = "inner"` means the inner HTML; any other `attr` value names an
// attribute of the HTML element.
#[derive(FromHtml)]
struct Link {
// Value of the element's `href` attribute.
#[html(attr = "href")]
href: String,
// Inner HTML of the element.
#[html(attr = "inner")]
value: String,
}
let link = Link::from_html(r#"<a href="https://github.com">Github</a>"#).unwrap();
assert_eq!("https://github.com", &link.href);
assert_eq!("Github", &link.value);
默认行为
整个元素的 HTML(不是 innerHtml
!)
#[macro_use]
extern crate unhtml_derive;
extern crate unhtml;
use unhtml::{self, FromHtml};
// Shows the default when `attr` is omitted.
#[derive(FromHtml)]
struct Link {
#[html(attr = "href")]
href: String,
#[html(attr = "inner")]
value: String,
// No `attr`: receives the HTML of the whole element, not just its inner HTML.
source: String,
}
let link = Link::from_html(r#"<a href="https://github.com">Github</a>"#).unwrap();
assert_eq!("https://github.com", &link.href);
assert_eq!("Github", &link.value);
// `source` includes the `<a ...>` tag itself.
assert_eq!(r#"<a href="https://github.com">Github</a>"#, &link.source);
default
目标
字段
文字类型
任何 literal 类型
规范
- 与
field
相同的类型
#[macro_use]
extern crate unhtml_derive;
extern crate unhtml;
use unhtml::{self, FromHtml};
// Defaults written as literals of the same type as the field. Each default is
// used when nothing usable is selected from the source HTML.
#[derive(FromHtml)]
struct DefaultUser {
// invoke String::from_html
#[html(selector = "#non-exist", default = "Hexilee")]
name: String,
// invoke u8::from<u8>
#[html(default = 20)]
age: u8,
// invoke i64::from<i64>
// This field is required by the `user.assets` assertion below.
#[html(default = -1000)]
assets: i64,
#[html(default = true)]
like_lemon: bool,
}
// "<p></p>" contains none of the expected data, so every field takes its default.
let user = DefaultUser::from_html("<p></p>").unwrap();
assert_eq!("Hexilee", &user.name);
assert_eq!(20, user.age);
assert_eq!(-1000, user.assets);
assert!(user.like_lemon);
字符串
#[macro_use]
extern crate unhtml_derive;
extern crate unhtml;
use unhtml::{self, FromHtml};
// Target type for the string default on `Website::link`.
#[derive(FromHtml)]
struct Link {
#[html(attr = "href")]
href: String,
#[html(attr = "inner")]
value: String,
}
// A default may also be written as a string regardless of the field type;
// the assertions below show "10" becoming `10u8` and the `<a>` markup
// becoming a `Link`.
#[derive(FromHtml)]
struct Website {
#[html(default = "10")]
age: u8,
#[html(default = "<a href='https://github.com'>Github</a>")]
link: Link,
}
let website = Website::from_html("<p></p>").unwrap();
let link = website.link;
assert_eq!(10u8, website.age);
assert_eq!("https://github.com", &link.href);
assert_eq!("Github", &link.value);
默认行为
当选择器未匹配到任何内容时,返回 Err(unhtml::failure::Error)
// panic (run time): `href` has no default and the `<a>` below has no `href`
// attribute, so `from_html` returns `Err` and `unwrap()` panics.
#[macro_use]
extern crate unhtml_derive;
extern crate unhtml;
use unhtml::{self, FromHtml};
#[derive(FromHtml)]
struct Link {
// no default
#[html(attr = "href")]
href: String,
#[html(attr = "inner")]
value: String,
}
let link = Link::from_html(r#"<a>Github</a>"#).unwrap();
字段类型
实现 FromHtml 的任何类型,不使用泛型
// panic: `&str` is not an accepted field type.
// NOTE(review): presumably rejected by the derive macro because a reference
// is not an owned type implementing `FromHtml` — confirm.
#[macro_use]
extern crate unhtml_derive;
extern crate unhtml;
use unhtml::{self, FromHtml};
#[derive(FromHtml)]
struct Link {
// no default
#[html(attr = "href")]
href: &str,
}
// panic: field types must not use generics, and `LinkedList<String>` is a
// generic type (`Vec` is handled separately via `VecFromHtml`).
#[macro_use]
extern crate unhtml_derive;
extern crate unhtml;
use unhtml::{self, FromHtml};
#[derive(FromHtml)]
struct Website {
// no default
#[html(attr = "href")]
hrefs: std::collections::LinkedList<String>,
}
// panic: the slice type `[String]` is not an accepted field type.
// NOTE(review): presumably because it is unsized — confirm.
#[macro_use]
extern crate unhtml_derive;
extern crate unhtml;
use unhtml::{self, FromHtml};
#[derive(FromHtml)]
struct Website {
// no default
#[html(attr = "href")]
hrefs: [String],
}
Vec
应使用
use unhtml::VecFromHtml
// `#[macro_use]` brings the `FromHtml` derive into scope, as in the other examples.
#[macro_use]
extern crate unhtml_derive;
extern crate unhtml;
use unhtml::{self, FromHtml, VecFromHtml};
// One user record, read from three consecutive `<p>` children.
#[derive(FromHtml)]
struct TestUser {
#[html(selector = "p:nth-child(1)", attr = "inner")]
name: String,
#[html(selector = "p:nth-child(2)", attr = "inner")]
age: u8,
#[html(selector = "p:nth-child(3)", attr = "inner")]
like_lemon: bool,
}
// Collects every `div` matched inside `#test` into a `Vec<TestUser>`;
// `VecFromHtml` must be imported for `Vec` fields.
#[derive(FromHtml)]
#[html(selector = "#test")]
struct TestUsers {
#[html(selector = "div")]
users: Vec<TestUser>,
}
let users = TestUsers::from_html(r#"<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>Title</title>
</head>
<body>
<div id="test">
<div>
<p>Hexilee</p>
<p>20</p>
<p>true</p>
</div>
<div>
<p>BigBrother</p>
<p>21</p>
<p>false</p>
</div>
</div>
</body>
</html>"#).unwrap();
// The two inner `<div>` elements become the two entries, in document order.
let hexilee = &users.users[0];
let big_brother = &users.users[1];
assert_eq!("Hexilee", &hexilee.name);
assert_eq!(20, hexilee.age);
assert!(hexilee.like_lemon);
assert_eq!("BigBrother", &big_brother.name);
assert_eq!(21, big_brother.age);
assert!(!big_brother.like_lemon);
如 crate unhtml
的文档所示,如果您想直接使用 Vec<TestUser>
,则可以这样做:
// `#[macro_use]` brings the `FromHtml` derive into scope, as in the other examples.
#[macro_use]
extern crate unhtml_derive;
extern crate unhtml;
use unhtml::{self, VecFromHtml};
// One user record, read from three consecutive `<p>` children.
#[derive(FromHtml)]
struct TestUser {
#[html(selector = "p:nth-child(1)", attr = "inner")]
name: String,
#[html(selector = "p:nth-child(2)", attr = "inner")]
age: u8,
#[html(selector = "p:nth-child(3)", attr = "inner")]
like_lemon: bool,
}
// Use `Vec<TestUser>` directly: `VecFromHtml::from_html` takes the selector
// as its first argument and the source HTML as its second.
let users = Vec::<TestUser>::from_html("#test > div", r#"<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>Title</title>
</head>
<body>
<div id="test">
<div>
<p>Hexilee</p>
<p>20</p>
<p>true</p>
</div>
<div>
<p>BigBrother</p>
<p>21</p>
<p>false</p>
</div>
</div>
</body>
</html>"#).unwrap();
let hexilee = &users[0];
let big_brother = &users[1];
assert_eq!("Hexilee", &hexilee.name);
assert_eq!(20, hexilee.age);
assert!(hexilee.like_lemon);
assert_eq!("BigBrother", &big_brother.name);
assert_eq!(21, big_brother.age);
assert!(!big_brother.like_lemon);
源 HTML
带有顶级选择器
所有源 HTML 都将被解析为 fragment
。顶级元素是 html
,并且没有 DOCTYPE
、head
或 body
。
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>Title</title>
</head>
<body>
<div id="test">
<div>
<p>Hexilee</p>
<p>20</p>
<p>true</p>
</div>
</div>
</body>
</html>
将被解析为
<html lang="en">
<meta charset="UTF-8">
<title>Title</title>
<div id="test">
<div>
<p>Hexilee</p>
<p>20</p>
<p>true</p>
</div>
</div>
</html>
和
<p>Hexilee</p>
将被解析为
<html>
<p>Hexilee</p>
</html>
// panic (run time): the source is parsed as a fragment with no `head` or
// `body` element, so neither selector matches; with no defaults,
// `from_html` returns `Err` and `unwrap()` panics.
#[macro_use]
extern crate unhtml_derive;
extern crate unhtml;
use unhtml::{self, FromHtml};
#[derive(FromHtml)]
struct Document {
// no default
#[html(selector = "head")]
head: String,
#[html(selector = "body")]
body: String,
}
let document = Document::from_html(r#"<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>Title</title>
</head>
<body>
<div id="test">
<div>
<p>Hexilee</p>
<p>20</p>
<p>true</p>
</div>
</div>
</body>
</html>"#).unwrap();
没有顶级选择器
当派生结构体没有 top selector
时,所有源 HTML 都将被解析为 纯 fragment
。没有 DOCTYPE
、html
、head
或 body
。
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>Title</title>
</head>
<body>
<div id="test">
<div>
<p>Hexilee</p>
<p>20</p>
<p>true</p>
</div>
</div>
</body>
</html>
将被解析为
<meta charset="UTF-8">
<title>Title</title>
<div id="test">
<div>
<p>Hexilee</p>
<p>20</p>
<p>true</p>
</div>
</div>
和
<p>Hexilee</p>
将被解析为
<p>Hexilee</p>
#[macro_use]
extern crate unhtml_derive;
extern crate unhtml;
use unhtml::{self, FromHtml};
// No top-level selector: the source is parsed as a plain fragment, so the
// `<a>` element itself is what the fields are read from.
#[derive(FromHtml)]
struct Link {
#[html(attr = "href")]
href: String,
#[html(attr = "inner")]
value: String,
}
let link = Link::from_html(r#"<a href="https://github.com">Github</a>"#).unwrap();
assert_eq!("https://github.com", &link.href);
assert_eq!("Github", &link.value);
依赖项
~5MB
~97K SLoC