8 个版本

0.2.1	2023 年 7 月 13 日
0.2.0	2021 年 4 月 16 日
0.1.5	2020 年 11 月 6 日
0.1.4	2018 年 3 月 6 日

#22 in 解析工具

在 2 个 crate 中使用（通过 caminos-lib）

MIT/Apache 许可证

4MB
43K SLoC

包含 (ELF 可执行文件/库，3MB) examples/calculator_by_hand

gramatica

此 crate 提供了一个二进制文件，用于将语法编译成 Rust 代码，并实现了一个 Earley 解析算法的库，用于解析指定的语法。

使用方法

此 crate 是 gramatica。要使用它，您应先安装它以获取 gramatica_compiler 二进制文件，并将 gramatica 添加到您的项目 Cargo.toml 文件中的依赖项中。

[dependencies]
gramatica = "0.2.1"

然后，如果您已创建了一个语法文件 example.rsg，请执行 gramatica_compiler example.rsg > example.rs。之后，您可以像使用源 Rust 文件一样使用生成的文件 example.rs。

示例：计算器

经典的例子是实现一个计算器。

//This is a just Rust header that it is copied literally
extern crate gramatica;
use std::cmp::Ordering;
use gramatica::{Associativity,EarleyKind,State,Parser,ParsingTablesTrait,AmbiguityInfo};

//Here the proper grammar begins.
//These lines are processed by gramatica_compiler to generate the Token enum and the parsing tables.
//We begin by terminal tokens (symbols that are not in the left of any rule but have a literal representation).
//For this example all terminals are regular expressions. The first argument of re_terminal! is the type entry, as used in a enum.
re_terminal!(Num(f64),"[0-9]*\\.?[0-9]+([eE][-+]?[0-9]+)?");
re_terminal!(Plus,"\\+");
re_terminal!(Minus,"-");
re_terminal!(Star,"\\*");
re_terminal!(Slash,"/");
re_terminal!(Caret,"\\^");
re_terminal!(LPar,"\\(");
re_terminal!(RPar,"\\)");
re_terminal!(NewLine,"\\n");
re_terminal!(_,"\\s+");//Otherwise skip spaces

//Now is the turn of nonterminal tokens. The first one is the default start symbol.
//These have rules written as match clauses, with the pattern being the reduction of the nonterminal token and the expression being the value the token takes when reducing.
//In this case the type of the symbol is empty and so is the expression
nonterminal Input
{
	() => (),
	(Input,Line) => (),
}

//Although the value type of Line is empty we may have code executed on the reduction
nonterminal Line
{
	(NewLine) => (),
	(Expression(value), NewLine) =>
	{
		println!("{}",value);
	},
}

//Finally a token with value type. Each rule creates the value in a different way.
//Most rules are annotated to avoid ambiguities
nonterminal Expression(f64)
{
	(Num(value)) => value,
	#[priority(addition)]
	#[associativity(left)]
	(Expression(l),Plus,Expression(r)) => l+r,
	#[priority(addition)]
	#[associativity(left)]
	(Expression(l),Minus,Expression(r)) => l-r,
	#[priority(multiplication)]
	#[associativity(left)]
	(Expression(l),Star,Expression(r)) => l*r,
	#[priority(multiplication)]
	#[associativity(left)]
	(Expression(l),Slash,Expression(r)) => l/r,
	#[priority(addition)]
	#[associativity(left)]
	(Minus,Expression(value)) => -value,
	#[priority(exponentiation)]
	#[associativity(right)]
	(Expression(l),Caret,Expression(r)) => l.powf(r),
	(LPar,Expression(value),RPar) => value,
}

//The ordering macro-like sets the order of application of the previously annotated rules
ordering!(exponentiation,multiplication,addition);

//Finally an example of using the grammar to parse some lines from stdin.
//We could do this or something similar in a different file if we desired to.
use std::io::BufRead;
fn main()
{
	let stdin=std::io::stdin();
	for rline in stdin.lock().lines()
	{
		let line=rline.unwrap()+"\n";
		println!("line={}",line);
		match Parser::<Token,ParsingTables>::parse(&line,None)
		{
			Err(x) => println!("error parsing: {:?}",x),
			Ok(x) => println!("parsed correctly: {:?}",x),
		};
	}
}

高级词法分析器

为了定义无法用正则表达式表示的终端令牌，您可以使用以下方法。它必须包含一个返回包含匹配字符数和令牌值的选项的 _match 函数。

terminal LitChar(char)
{
	fn _match(parser: &mut Parser<Token,ParsingTables>, source:&str) -> Option<(usize,char)>
	{
		let mut characters=source.chars();
		if (characters.next())==(Some('\''))
		{
			let mut c=characters.next().unwrap();
			let mut size=3;
			if c=='\\'
			{
				c=(characters.next().unwrap());
				size=4;
			}
			if characters.next().unwrap()=='\''
			{
				Some((size,c))
			}
			else
			{
				None
			}
		}
		else
		{
			None
		}
	}
}

从版本 0.1.1 开始，还有一个 keyword_terminal! 宏。

keyword_terminal!(Const,"const");

解析值为匹配子句

每个规则都写成匹配子句，其结束表达式是非终端令牌解析后的值。例如，解析语句列表：

nonterminal Stmts(Vec<StmtKind>)
{
	(Stmt(ref stmt)) => vec![stmt.clone()],
	(Stmts(ref stmts),Stmt(ref stmt)) =>
	{
		let mut new=(stmts.clone());
		new.push(stmt.clone());
		new
	},
}

只有当它们是最终语法树的一部分时，缩减才会执行。

通过注解优先级

为了避免歧义，您有两个选择：确保语法中不包含它们，或者通过引入注解来优先处理规则。在计算器的示例中，我们看到了两种类型：

#[priority(p_name)] 用于声明具有优先级 p_name 的规则。稍后应该有一个类似 ordering!(p_0,p_1,p_2,...) 的宏，用于指示 p_0 应在 p_1 之前缩减。
#[associativity(left/right)] 来决定如何处理嵌套相同规则的情况。

示例：解析 JSON extern crate gramatica; use std::cmp::Ordering; use gramatica::{Associativity,EarleyKind,State,Parser,ParsingTablesTrait,AmbiguityInfo}; //See https://json.js.cn/ use std::rc::Rc; //We define an auxiliar type to store JSON values #[derive(Clone,Debug,PartialEq)] enum JsonValue { Literal(String), Number(f64), Object(Vec<(String,JsonValue)>), Array(Vec<JsonValue>), True, False, Null, } // ---- Start of the grammar ---- keyword_terminal!(True,"true"); keyword_terminal!(False,"false"); keyword_terminal!(Null,"null"); re_terminal!(Number(f64),"[0-9]*\\.?[0-9]+([eE][-+]?[0-9]+)?"); terminal LitStr(String) { //This function has limited escaping capabilities fn _match(parser: &mut Parser<Token,ParsingTables>, source:&str) -> Option<(usize,String)> { let mut ret=None; let mut characters=source.chars(); if (characters.next())!=(Some('"')) { } else { let mut size=1; let mut r=String::from("\""); while true { match characters.next() { None => break, Some('"') => { ret=(Some((size+1,r+&"\""))); break; }, Some('\\') => { match characters.next() { None => break, //Some(c) => r+='\\'+c, Some(c) => { r.push('\\'); r.push(c); } }; size+=2; }, Some(c) => { //r+=&String::from(c); r.push(c); size+=1; }, }; } } ret } } re_terminal!(LBrace,"\\{"); re_terminal!(RBrace,"\\}"); re_terminal!(LBracket,"\\["); re_terminal!(RBracket,"\\]"); re_terminal!(Comma,","); re_terminal!(Colon,":"); re_terminal!(_,"\\s+|\n");//Otherwise skip spaces nonterminal Object(JsonValue) { (LBrace,RBrace) => JsonValue::Object(vec![]), (LBrace,Members(ref list),RBrace) => JsonValue::Object(list.clone()), } nonterminal Members(Vec<(String,JsonValue)>) { (Pair(ref s,ref value)) => vec![(s.clone(),value.clone())], //(Pair,Comma,Members) => (), (Members(ref list),Comma,Pair(ref s,ref value)) => { let mut new=(list.clone()); new.push((s.clone(),value.clone())); new }, } nonterminal Pair(String,JsonValue) { (LitStr(ref s),Colon,Value(ref value)) => (s.clone(),value.clone()), } nonterminal Array(Vec<JsonValue>) { (LBracket,RBracket) => vec![], (LBracket,Elements(ref list),RBracket) => list.clone(), } nonterminal Elements(Vec<JsonValue>) { (Value(ref value)) => vec![value.clone()], //(Value,Comma,Elements) => (), (Elements(ref list),Comma,Value(ref value)) => { let mut new=(list.clone()); new.push(value.clone()); new }, } nonterminal Value(JsonValue) { (LitStr(ref s)) => JsonValue::Literal(s.clone()), (Number(v)) => JsonValue::Number(v), (Object(ref value)) => value.clone(), (Array(ref list)) => JsonValue::Array(list.clone()), (True) => JsonValue::True, (False) => JsonValue::False, (Null) => JsonValue::Null, } // ---- End of the grammar ---- use std::io::{BufRead,Read}; //As example, we parse stdin for a JSON object fn main() { let stdin=std::io::stdin(); let mut buf=String::new(); stdin.lock().read_to_string(&mut buf); match Parser::<Token,ParsingTables>::parse(&buf,None) { Err(x) => println!("error parsing: {:?}",x), Ok(x) => println!("parsed correctly: {:?}",x), }; } 示例：解析基本 XML //A very basic xml grammar extern crate gramatica; use std::cmp::Ordering; use gramatica::{Associativity,EarleyKind,State,Parser,ParsingTablesTrait,AmbiguityInfo}; // see https://www.w3.org/People/Bos/meta-bnf // also http://cs.lmu.edu/~ray/notes/xmlgrammar/ use std::rc::Rc; //We define an auxiliar type to store XML elements #[derive(Clone,Debug,PartialEq)] struct XMLElement { name: String, attrs: Vec<(String,String)>, contents: Vec<XMLContent>, } #[derive(Clone,Debug,PartialEq)] enum XMLContent { Element(XMLElement), Data(String), } // ---- Start of the grammar ---- re_terminal!(Space(String),"(\\s|\n)+"); re_terminal!(Ident(String),"[a-zA-Z\\x80-\\xff_][a-zA-Z0-9\\x80-\\xff_]*"); terminal LitStr(String) { fn _match(parser: &mut Parser<Token,ParsingTables>, source:&str) -> Option<(usize,String)> { let mut ret=None; let mut characters=source.chars(); if (characters.next())!=(Some('"')) { } else { let mut size=1; let mut r=String::from("\""); while true { match characters.next() { None => break, Some('"') => { ret=(Some((size+1,r+&"\""))); break; }, Some('\\') => { match characters.next() { None => break, //Some(c) => r+='\\'+c, Some(c) => { r.push('\\'); r.push(c); } }; size+=2; }, Some(c) => { //r+=&String::from(c); r.push(c); size+=1; }, }; } } ret } } re_terminal!(CloseEmpty,"/>"); re_terminal!(BeginClose,"</"); re_terminal!(Equal,"="); re_terminal!(LT,"<"); re_terminal!(GT,">"); re_terminal!(Other(char),"."); nonterminal Document(XMLElement) { (Element(ref elem)) => elem.clone(), } nonterminal Element(XMLElement) { (EmptyElemTag(ref name,ref attrs)) => XMLElement{name:name.clone(),attrs:attrs.clone(),contents:vec![]}, (STag(ref name, ref attrs),Content(ref content),ETag) => XMLElement{name:name.clone(),attrs:attrs.clone(),contents:content.clone()}, } nonterminal EmptyElemTag(String,Vec<(String,String)>) { (LT,Ident(ref name),Attributes(ref attrs),MaybeSpace,CloseEmpty) => (name.clone(),attrs.clone()), } nonterminal Attributes(Vec<(String,String)>) { () => vec![], (Attributes(ref attrs),Space,Attribute(ref a, ref b)) => { let mut new=(attrs.clone()); new.push((a.clone(),b.clone())); new }, } nonterminal Attribute(String,String) { (Ident(ref a),Equal,LitStr(ref b)) => (a.clone(),b.clone()), } nonterminal STag(String,Vec<(String,String)>) { (LT,Ident(ref name),Attributes(ref attrs),MaybeSpace,GT) => (name.clone(),attrs.clone()), } nonterminal ETag(String) { (BeginClose,Ident(ref s),MaybeSpace,GT) => s.clone(), } nonterminal Content(Vec<XMLContent>) { (CharData(ref s)) => vec![XMLContent::Data(s.clone())], (CharData(ref s),Contents(ref list)) => { let mut new=vec![XMLContent::Data(s.clone())]; new.extend(list.iter().map(|x|x.clone())); new }, } nonterminal Contents(Vec<XMLContent>) { () => vec![], (Contents(ref list),Element(ref elem),CharData(ref s)) => { let mut new=(list.clone()); new.push(XMLContent::Element(elem.clone())); if s!="" { new.push(XMLContent::Data(s.clone())); } new }, } nonterminal MaybeSpace { () => (), (Space) => (), } nonterminal CharData(String) { () => String::new(), (CharData(ref s),Space(ref o)) => format!("{}{}",s,o), (CharData(ref s),Ident(ref o)) => format!("{}{}",s,o), (CharData(ref s),Equal) => format!("{}=",s), (CharData(ref s),Other(o)) => format!("{}{}",s,o), } // ---- End of the grammar ---- use std::io::{BufRead,Read}; //As example, we parse stdin for a XML element fn main() { let stdin=std::io::stdin(); let mut buf=String::new(); stdin.lock().read_to_string(&mut buf); match Parser::<Token,ParsingTables>::parse(&buf,None) { Err(x) => println!("error parsing: {:?}",x), Ok(x) => println!("parsed correctly: {:?}",x), }; }

依赖关系 ~2.2–3MB ~54K SLoC regex

bin+lib gramatica

8 个版本

gramatica

使用方法

最近更改

示例：计算器

高级词法分析器

解析值为匹配子句

通过注解优先级

示例：解析 JSON

示例：解析基本 XML

依赖关系