8 个版本
0.2.1 | 2023 年 7 月 13 日 |
---|---|
0.2.0 | 2021 年 4 月 16 日 |
0.1.5 | 2020 年 11 月 6 日 |
0.1.4 | 2018 年 3 月 6 日 |
#22 in 解析工具
在 2 个 crate 中使用(通过 caminos-lib)
4MB
43K SLoC
包含 (ELF 可执行文件/库,3MB) examples/calculator_by_hand
gramatica
此 crate 提供了一个二进制文件,用于将语法编译成 Rust 代码,并实现了一个 Earley 解析算法的库,用于解析指定的语法。
使用方法
此 crate 是 gramatica
。要使用它,您应先安装它以获取 gramatica_compiler
二进制文件,并将 gramatica
添加到您的项目 Cargo.toml
文件中的依赖项中。
[dependencies]
gramatica = "0.2.1"
然后,如果您已创建了一个语法文件 example.rsg
,请执行 gramatica_compiler example.rsg > example.rs
。之后,您可以像使用源 Rust 文件一样使用生成的文件 example.rs
。
最近更改
- 现在可以使用绑定和可变引用。例如,在规则
(LPar, a @ Left(_), Right(ref mut b), RPar) => (std::mem::take(a),std::mem::take(b))
。 - 添加了
parser::cursor
以替换source_index
,以避免对 Unicode 字符串进行索引。 - 改进了对大文件的管理。
- 为
Parser::parse
添加了vebosity
参数。
示例:计算器
经典的例子是实现一个计算器。
//This is a just Rust header that it is copied literally
extern crate gramatica;
use std::cmp::Ordering;
use gramatica::{Associativity,EarleyKind,State,Parser,ParsingTablesTrait,AmbiguityInfo};
//Here the proper grammar begins.
//These lines are processed by gramatica_compiler to generate the Token enum and the parsing tables.
//We begin by terminal tokens (symbols that are not in the left of any rule but have a literal representation).
//For this example all terminals are regular expressions. The first argument of re_terminal! is the type entry, as used in a enum.
re_terminal!(Num(f64),"[0-9]*\\.?[0-9]+([eE][-+]?[0-9]+)?");
re_terminal!(Plus,"\\+");
re_terminal!(Minus,"-");
re_terminal!(Star,"\\*");
re_terminal!(Slash,"/");
re_terminal!(Caret,"\\^");
re_terminal!(LPar,"\\(");
re_terminal!(RPar,"\\)");
re_terminal!(NewLine,"\\n");
re_terminal!(_,"\\s+");//Otherwise skip spaces
//Now is the turn of nonterminal tokens. The first one is the default start symbol.
//These have rules written as match clauses, with the pattern being the reduction of the nonterminal token and the expression being the value the token takes when reducing.
//In this case the type of the symbol is empty and so is the expression
nonterminal Input
{
() => (),
(Input,Line) => (),
}
//Although the value type of Line is empty we may have code executed on the reduction
nonterminal Line
{
(NewLine) => (),
(Expression(value), NewLine) =>
{
println!("{}",value);
},
}
//Finally a token with value type. Each rule creates the value in a different way.
//Most rules are annotated to avoid ambiguities
nonterminal Expression(f64)
{
(Num(value)) => value,
#[priority(addition)]
#[associativity(left)]
(Expression(l),Plus,Expression(r)) => l+r,
#[priority(addition)]
#[associativity(left)]
(Expression(l),Minus,Expression(r)) => l-r,
#[priority(multiplication)]
#[associativity(left)]
(Expression(l),Star,Expression(r)) => l*r,
#[priority(multiplication)]
#[associativity(left)]
(Expression(l),Slash,Expression(r)) => l/r,
#[priority(addition)]
#[associativity(left)]
(Minus,Expression(value)) => -value,
#[priority(exponentiation)]
#[associativity(right)]
(Expression(l),Caret,Expression(r)) => l.powf(r),
(LPar,Expression(value),RPar) => value,
}
//The ordering macro-like sets the order of application of the previously annotated rules
ordering!(exponentiation,multiplication,addition);
//Finally an example of using the grammar to parse some lines from stdin.
//We could do this or something similar in a different file if we desired to.
use std::io::BufRead;
fn main()
{
let stdin=std::io::stdin();
for rline in stdin.lock().lines()
{
let line=rline.unwrap()+"\n";
println!("line={}",line);
match Parser::<Token,ParsingTables>::parse(&line,None)
{
Err(x) => println!("error parsing: {:?}",x),
Ok(x) => println!("parsed correctly: {:?}",x),
};
}
}
高级词法分析器
为了定义无法用正则表达式表示的终端令牌,您可以使用以下方法。它必须包含一个返回包含匹配字符数和令牌值的选项的 _match 函数。
terminal LitChar(char)
{
fn _match(parser: &mut Parser<Token,ParsingTables>, source:&str) -> Option<(usize,char)>
{
let mut characters=source.chars();
if (characters.next())==(Some('\''))
{
let mut c=characters.next().unwrap();
let mut size=3;
if c=='\\'
{
c=(characters.next().unwrap());
size=4;
}
if characters.next().unwrap()=='\''
{
Some((size,c))
}
else
{
None
}
}
else
{
None
}
}
}
从版本 0.1.1 开始,还有一个 keyword_terminal!
宏。
keyword_terminal!(Const,"const");
解析值为匹配子句
每个规则都写成匹配子句,其结束表达式是非终端令牌解析后的值。例如,解析语句列表:
nonterminal Stmts(Vec<StmtKind>)
{
(Stmt(ref stmt)) => vec![stmt.clone()],
(Stmts(ref stmts),Stmt(ref stmt)) =>
{
let mut new=(stmts.clone());
new.push(stmt.clone());
new
},
}
只有当它们是最终语法树的一部分时,缩减才会执行。
通过注解优先级
为了避免歧义,您有两个选择:确保语法中不包含它们,或者通过引入注解来优先处理规则。在计算器的示例中,我们看到了两种类型:
#[priority(p_name)]
用于声明具有优先级p_name
的规则。稍后应该有一个类似ordering!(p_0,p_1,p_2,...)
的宏,用于指示p_0
应在p_1
之前缩减。#[associativity(left/right)]
来决定如何处理嵌套相同规则的情况。
示例:解析 JSON
extern crate gramatica;
use std::cmp::Ordering;
use gramatica::{Associativity,EarleyKind,State,Parser,ParsingTablesTrait,AmbiguityInfo};
//See https://www.json.org/
use std::rc::Rc;
//We define an auxiliar type to store JSON values
#[derive(Clone,Debug,PartialEq)]
enum JsonValue
{
Literal(String),
Number(f64),
Object(Vec<(String,JsonValue)>),
Array(Vec<JsonValue>),
True,
False,
Null,
}
// ---- Start of the grammar ----
keyword_terminal!(True,"true");
keyword_terminal!(False,"false");
keyword_terminal!(Null,"null");
re_terminal!(Number(f64),"[0-9]*\\.?[0-9]+([eE][-+]?[0-9]+)?");
terminal LitStr(String)
{
//This function has limited escaping capabilities
fn _match(parser: &mut Parser<Token,ParsingTables>, source:&str) -> Option<(usize,String)>
{
let mut ret=None;
let mut characters=source.chars();
if (characters.next())!=(Some('"'))
{
}
else
{
let mut size=1;
let mut r=String::from("\"");
while true
{
match characters.next()
{
None => break,
Some('"') =>
{
ret=(Some((size+1,r+&"\"")));
break;
},
Some('\\') =>
{
match characters.next()
{
None => break,
//Some(c) => r+='\\'+c,
Some(c) =>
{
r.push('\\');
r.push(c);
}
};
size+=2;
},
Some(c) =>
{
//r+=&String::from(c);
r.push(c);
size+=1;
},
};
}
}
ret
}
}
re_terminal!(LBrace,"\\{");
re_terminal!(RBrace,"\\}");
re_terminal!(LBracket,"\\[");
re_terminal!(RBracket,"\\]");
re_terminal!(Comma,",");
re_terminal!(Colon,":");
re_terminal!(_,"\\s+|\n");//Otherwise skip spaces
nonterminal Object(JsonValue)
{
(LBrace,RBrace) => JsonValue::Object(vec![]),
(LBrace,Members(ref list),RBrace) => JsonValue::Object(list.clone()),
}
nonterminal Members(Vec<(String,JsonValue)>)
{
(Pair(ref s,ref value)) => vec![(s.clone(),value.clone())],
//(Pair,Comma,Members) => (),
(Members(ref list),Comma,Pair(ref s,ref value)) =>
{
let mut new=(list.clone());
new.push((s.clone(),value.clone()));
new
},
}
nonterminal Pair(String,JsonValue)
{
(LitStr(ref s),Colon,Value(ref value)) => (s.clone(),value.clone()),
}
nonterminal Array(Vec<JsonValue>)
{
(LBracket,RBracket) => vec![],
(LBracket,Elements(ref list),RBracket) => list.clone(),
}
nonterminal Elements(Vec<JsonValue>)
{
(Value(ref value)) => vec![value.clone()],
//(Value,Comma,Elements) => (),
(Elements(ref list),Comma,Value(ref value)) =>
{
let mut new=(list.clone());
new.push(value.clone());
new
},
}
nonterminal Value(JsonValue)
{
(LitStr(ref s)) => JsonValue::Literal(s.clone()),
(Number(v)) => JsonValue::Number(v),
(Object(ref value)) => value.clone(),
(Array(ref list)) => JsonValue::Array(list.clone()),
(True) => JsonValue::True,
(False) => JsonValue::False,
(Null) => JsonValue::Null,
}
// ---- End of the grammar ----
use std::io::{BufRead,Read};
//As example, we parse stdin for a JSON object
fn main()
{
let stdin=std::io::stdin();
let mut buf=String::new();
stdin.lock().read_to_string(&mut buf);
match Parser::<Token,ParsingTables>::parse(&buf,None)
{
Err(x) => println!("error parsing: {:?}",x),
Ok(x) => println!("parsed correctly: {:?}",x),
};
}
示例:解析基本 XML
//A very basic xml grammar
extern crate gramatica;
use std::cmp::Ordering;
use gramatica::{Associativity,EarleyKind,State,Parser,ParsingTablesTrait,AmbiguityInfo};
// see https://www.w3.org/People/Bos/meta-bnf
// also http://cs.lmu.edu/~ray/notes/xmlgrammar/
use std::rc::Rc;
//We define an auxiliar type to store XML elements
#[derive(Clone,Debug,PartialEq)]
struct XMLElement
{
name: String,
attrs: Vec<(String,String)>,
contents: Vec<XMLContent>,
}
#[derive(Clone,Debug,PartialEq)]
enum XMLContent
{
Element(XMLElement),
Data(String),
}
// ---- Start of the grammar ----
re_terminal!(Space(String),"(\\s|\n)+");
re_terminal!(Ident(String),"[a-zA-Z\\x80-\\xff_][a-zA-Z0-9\\x80-\\xff_]*");
terminal LitStr(String)
{
fn _match(parser: &mut Parser<Token,ParsingTables>, source:&str) -> Option<(usize,String)>
{
let mut ret=None;
let mut characters=source.chars();
if (characters.next())!=(Some('"'))
{
}
else
{
let mut size=1;
let mut r=String::from("\"");
while true
{
match characters.next()
{
None => break,
Some('"') =>
{
ret=(Some((size+1,r+&"\"")));
break;
},
Some('\\') =>
{
match characters.next()
{
None => break,
//Some(c) => r+='\\'+c,
Some(c) =>
{
r.push('\\');
r.push(c);
}
};
size+=2;
},
Some(c) =>
{
//r+=&String::from(c);
r.push(c);
size+=1;
},
};
}
}
ret
}
}
re_terminal!(CloseEmpty,"/>");
re_terminal!(BeginClose,"</");
re_terminal!(Equal,"=");
re_terminal!(LT,"<");
re_terminal!(GT,">");
re_terminal!(Other(char),".");
nonterminal Document(XMLElement)
{
(Element(ref elem)) => elem.clone(),
}
nonterminal Element(XMLElement)
{
(EmptyElemTag(ref name,ref attrs)) => XMLElement{name:name.clone(),attrs:attrs.clone(),contents:vec![]},
(STag(ref name, ref attrs),Content(ref content),ETag) => XMLElement{name:name.clone(),attrs:attrs.clone(),contents:content.clone()},
}
nonterminal EmptyElemTag(String,Vec<(String,String)>)
{
(LT,Ident(ref name),Attributes(ref attrs),MaybeSpace,CloseEmpty) => (name.clone(),attrs.clone()),
}
nonterminal Attributes(Vec<(String,String)>)
{
() => vec![],
(Attributes(ref attrs),Space,Attribute(ref a, ref b)) =>
{
let mut new=(attrs.clone());
new.push((a.clone(),b.clone()));
new
},
}
nonterminal Attribute(String,String)
{
(Ident(ref a),Equal,LitStr(ref b)) => (a.clone(),b.clone()),
}
nonterminal STag(String,Vec<(String,String)>)
{
(LT,Ident(ref name),Attributes(ref attrs),MaybeSpace,GT) => (name.clone(),attrs.clone()),
}
nonterminal ETag(String)
{
(BeginClose,Ident(ref s),MaybeSpace,GT) => s.clone(),
}
nonterminal Content(Vec<XMLContent>)
{
(CharData(ref s)) => vec![XMLContent::Data(s.clone())],
(CharData(ref s),Contents(ref list)) =>
{
let mut new=vec![XMLContent::Data(s.clone())];
new.extend(list.iter().map(|x|x.clone()));
new
},
}
nonterminal Contents(Vec<XMLContent>)
{
() => vec![],
(Contents(ref list),Element(ref elem),CharData(ref s)) =>
{
let mut new=(list.clone());
new.push(XMLContent::Element(elem.clone()));
if s!=""
{
new.push(XMLContent::Data(s.clone()));
}
new
},
}
nonterminal MaybeSpace
{
() => (),
(Space) => (),
}
nonterminal CharData(String)
{
() => String::new(),
(CharData(ref s),Space(ref o)) => format!("{}{}",s,o),
(CharData(ref s),Ident(ref o)) => format!("{}{}",s,o),
(CharData(ref s),Equal) => format!("{}=",s),
(CharData(ref s),Other(o)) => format!("{}{}",s,o),
}
// ---- End of the grammar ----
use std::io::{BufRead,Read};
//As example, we parse stdin for a XML element
fn main()
{
let stdin=std::io::stdin();
let mut buf=String::new();
stdin.lock().read_to_string(&mut buf);
match Parser::<Token,ParsingTables>::parse(&buf,None)
{
Err(x) => println!("error parsing: {:?}",x),
Ok(x) => println!("parsed correctly: {:?}",x),
};
}
依赖关系
~2.2–3MB
~54K SLoC