diff --git a/Cargo.lock b/Cargo.lock index 8094f17..c4e0095 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -111,6 +111,12 @@ version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9bda8e21c04aca2ae33ffc2fd8c23134f3cac46db123ba97bd9d3f3b8a4a85e1" +[[package]] +name = "either" +version = "1.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a26ae43d7bcc3b814de94796a5e736d4029efb0ee900c12e2d54c993ad1a1e07" + [[package]] name = "encode_unicode" version = "0.3.6" @@ -197,6 +203,15 @@ dependencies = [ "winapi-util", ] +[[package]] +name = "itertools" +version = "0.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "25db6b064527c5d482d0423354fcd07a89a2dfe07b67892e62411946db7f07b0" +dependencies = [ + "either", +] + [[package]] name = "lazy_static" version = "1.4.0" @@ -270,9 +285,9 @@ dependencies = [ [[package]] name = "proc-macro2" -version = "1.0.74" +version = "1.0.78" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2de98502f212cfcea8d0bb305bd0f49d7ebdd75b64ba0a68f937d888f4e0d6db" +checksum = "e2422ad645d89c99f8f3e6b88a9fdeca7fabeac836b1002371c4367c8f984aae" dependencies = [ "unicode-ident", ] @@ -440,6 +455,12 @@ version = "1.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f18aa187839b2bdb1ad2fa35ead8c4c2976b64e4363c386d45ac0f7ee85c9233" +[[package]] +name = "ungrammar" +version = "1.16.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a3e5df347f0bf3ec1d670aad6ca5c6a1859cd9ea61d2113125794654ccced68f" + [[package]] name = "unicode-ident" version = "1.0.12" @@ -559,6 +580,21 @@ version = "0.52.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dff9641d1cd4be8d1a070daf9e3773c5f67e78b4d9d42263020c057706765c04" +[[package]] +name = "xshell" +version = "0.1.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"eaad2035244c56da05573d4d7fda5f903c60a5f35b9110e157a14a1df45a9f14" +dependencies = [ + "xshell-macros", +] + +[[package]] +name = "xshell-macros" +version = "0.1.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4916a4a3cad759e499a3620523bf9545cc162d7a06163727dde97ce9aaa4cf39" + [[package]] name = "yansi" version = "1.0.0-rc.1" @@ -572,7 +608,12 @@ dependencies = [ "drop_bomb", "globwalk", "goldenfile", + "itertools", "logos", + "proc-macro2", + "quote", "rowan-test", "text-size", + "ungrammar", + "xshell", ] diff --git a/Cargo.toml b/Cargo.toml index 2a3d280..7d60713 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -10,7 +10,12 @@ logos = "0.13.0" rowan-test = { git = "https://github.com/avast/avast-rowan.git" } text-size = "1.1.1" drop_bomb = "0.1.5" +ungrammar = "1.16.1" +itertools = "0.12.0" +xshell = "0.1" [dev-dependencies] goldenfile = "1.6.0" globwalk = "0.9.1" +proc-macro2 = "1.0.78" +quote = "1.0.35" diff --git a/example.yar b/example.yar index 9f68a9b..6b0f143 100644 --- a/example.yar +++ b/example.yar @@ -10,6 +10,5 @@ rule test $a = "foo" $b = "bar" condition: - $a or - $b and true + $b and not true or false } diff --git a/rustfmt.toml b/rustfmt.toml new file mode 100644 index 0000000..7bd7676 --- /dev/null +++ b/rustfmt.toml @@ -0,0 +1,3 @@ +reorder_modules = false +use_small_heuristics = "Max" +edition = "2021" diff --git a/src/lexer/mod.rs b/src/lexer/mod.rs index bb30898..3c20876 100644 --- a/src/lexer/mod.rs +++ b/src/lexer/mod.rs @@ -1,4 +1,13 @@ -use crate::{parser::syntaxkind::SyntaxKind, syntax::syntax_error::SyntaxError}; +//! This module contains lexer for YARA language. +//! The lexer is implemented using `logos` crate. +//! The lexer is used to convert the input text into a stream of tokens. +//! +//! Logos tokens are converted to `SyntaxKind` which is used in the parser to build the syntax tree. 
+ +use crate::{ + parser::syntax_kind::{SyntaxKind, T}, + syntax::syntax_error::SyntaxError, +}; use logos::Logos; use std::fmt; use std::num::ParseIntError; @@ -136,18 +145,12 @@ pub fn tokenize(text: &str) -> (Vec, Vec) { SyntaxKind::ERROR } }; - tokens.push(Token { - kind: syntaxkind, - len: token_len, - }); + tokens.push(Token { kind: syntaxkind, len: token_len }); offset += range.len(); } // Add EOF token at the end - tokens.push(Token { - kind: SyntaxKind::EOF, - len: 0.into(), - }); + tokens.push(Token { kind: SyntaxKind::EOF, len: 0.into() }); (tokens, errors) } @@ -155,25 +158,25 @@ pub fn tokenize(text: &str) -> (Vec, Vec) { // Convert LogosToken to SyntaxKind fn logos_tokenkind_to_syntaxkind(token: LogosToken) -> SyntaxKind { match token { - LogosToken::Rule => SyntaxKind::RULE, - LogosToken::Strings => SyntaxKind::STRINGS, - LogosToken::Condition => SyntaxKind::CONDITION, - LogosToken::And => SyntaxKind::AND, - LogosToken::Or => SyntaxKind::OR, - LogosToken::Not => SyntaxKind::NOT, + LogosToken::Rule => SyntaxKind::RULE_KW, + LogosToken::Strings => SyntaxKind::STRINGS_KW, + LogosToken::Condition => SyntaxKind::CONDITION_KW, + LogosToken::And => SyntaxKind::AND_KW, + LogosToken::Or => SyntaxKind::OR_KW, + LogosToken::Not => SyntaxKind::NOT_KW, LogosToken::Identifier(_) => SyntaxKind::IDENTIFIER, LogosToken::Variable(_) => SyntaxKind::VARIABLE, - LogosToken::String(_) => SyntaxKind::STRING, - LogosToken::Assign => SyntaxKind::ASSIGN, - LogosToken::Colon => SyntaxKind::COLON, - LogosToken::LBrace => SyntaxKind::LBRACE, - LogosToken::RBrace => SyntaxKind::RBRACE, - LogosToken::LParen => SyntaxKind::LPAREN, - LogosToken::RParen => SyntaxKind::RPAREN, - LogosToken::Comma => SyntaxKind::COMMA, + LogosToken::String(_) => SyntaxKind::STRING_LIT, + LogosToken::Assign => T![=], + LogosToken::Colon => T![:], + LogosToken::LBrace => T!['{'], + LogosToken::RBrace => T!['}'], + LogosToken::LParen => T!['('], + LogosToken::RParen => T![')'], + LogosToken::Comma => 
T![,], LogosToken::Number(_) => SyntaxKind::NUMBER, - LogosToken::True => SyntaxKind::TRUE, - LogosToken::False => SyntaxKind::FALSE, + LogosToken::True => SyntaxKind::TRUE_KW, + LogosToken::False => SyntaxKind::FALSE_KW, LogosToken::Whitespace => SyntaxKind::WHITESPACE, LogosToken::Comment | LogosToken::MultilineComment => SyntaxKind::COMMENT, } @@ -213,18 +216,18 @@ mod tests { assert!(errors.is_empty()); assert_eq!(tokens.len(), 15); assert_eq!(tokens[0].kind, SyntaxKind::WHITESPACE); - assert_eq!(tokens[1].kind, SyntaxKind::RULE); + assert_eq!(tokens[1].kind, SyntaxKind::RULE_KW); assert_eq!(tokens[2].kind, SyntaxKind::WHITESPACE); assert_eq!(tokens[3].kind, SyntaxKind::IDENTIFIER); assert_eq!(tokens[4].kind, SyntaxKind::WHITESPACE); - assert_eq!(tokens[5].kind, SyntaxKind::LBRACE); + assert_eq!(tokens[5].kind, SyntaxKind::L_BRACE); assert_eq!(tokens[6].kind, SyntaxKind::WHITESPACE); - assert_eq!(tokens[7].kind, SyntaxKind::CONDITION); + assert_eq!(tokens[7].kind, SyntaxKind::CONDITION_KW); assert_eq!(tokens[8].kind, SyntaxKind::COLON); assert_eq!(tokens[9].kind, SyntaxKind::WHITESPACE); assert_eq!(tokens[10].kind, SyntaxKind::VARIABLE); assert_eq!(tokens[11].kind, SyntaxKind::WHITESPACE); - assert_eq!(tokens[12].kind, SyntaxKind::RBRACE); + assert_eq!(tokens[12].kind, SyntaxKind::R_BRACE); assert_eq!(tokens[13].kind, SyntaxKind::WHITESPACE); assert_eq!(tokens[14].kind, SyntaxKind::EOF); } @@ -242,20 +245,20 @@ mod tests { assert_eq!(errors.len(), 1); assert_eq!(tokens.len(), 25); assert_eq!(tokens[0].kind, SyntaxKind::WHITESPACE); - assert_eq!(tokens[1].kind, SyntaxKind::RULE); + assert_eq!(tokens[1].kind, SyntaxKind::RULE_KW); assert_eq!(tokens[2].kind, SyntaxKind::WHITESPACE); assert_eq!(tokens[3].kind, SyntaxKind::IDENTIFIER); assert_eq!(tokens[4].kind, SyntaxKind::WHITESPACE); - assert_eq!(tokens[5].kind, SyntaxKind::LBRACE); + assert_eq!(tokens[5].kind, SyntaxKind::L_BRACE); assert_eq!(tokens[6].kind, SyntaxKind::WHITESPACE); - assert_eq!(tokens[7].kind, 
SyntaxKind::CONDITION); + assert_eq!(tokens[7].kind, SyntaxKind::CONDITION_KW); assert_eq!(tokens[8].kind, SyntaxKind::COLON); assert_eq!(tokens[9].kind, SyntaxKind::WHITESPACE); assert_eq!(tokens[10].kind, SyntaxKind::VARIABLE); assert_eq!(tokens[11].kind, SyntaxKind::WHITESPACE); assert_eq!(tokens[12].kind, SyntaxKind::ASSIGN); assert_eq!(tokens[13].kind, SyntaxKind::WHITESPACE); - assert_eq!(tokens[14].kind, SyntaxKind::STRING); + assert_eq!(tokens[14].kind, SyntaxKind::STRING_LIT); assert_eq!(tokens[15].kind, SyntaxKind::WHITESPACE); assert_eq!(tokens[16].kind, SyntaxKind::VARIABLE); assert_eq!(tokens[17].kind, SyntaxKind::WHITESPACE); @@ -263,7 +266,7 @@ mod tests { assert_eq!(tokens[19].kind, SyntaxKind::WHITESPACE); assert_eq!(tokens[20].kind, SyntaxKind::ERROR); assert_eq!(tokens[21].kind, SyntaxKind::WHITESPACE); - assert_eq!(tokens[22].kind, SyntaxKind::RBRACE); + assert_eq!(tokens[22].kind, SyntaxKind::R_BRACE); assert_eq!(tokens[23].kind, SyntaxKind::WHITESPACE); assert_eq!(tokens[24].kind, SyntaxKind::EOF); } diff --git a/src/lib.rs b/src/lib.rs new file mode 100644 index 0000000..a48a7c6 --- /dev/null +++ b/src/lib.rs @@ -0,0 +1,337 @@ +/// This library is used to create a parser for YARA language +/// It should provide also token for whitespaces +/// as we want full fidelity and error resilience.; +use crate::{ + parser::SyntaxKind, + syntax::{ + syntax_error::SyntaxError, + syntax_node::{SyntaxNode, SyntaxToken}, + text_token_source::TextTokenSource, + text_tree_sink::TextTreeSink, + }, +}; + +pub use crate::syntax::ast::*; +pub use crate::syntax::SourceFile; + +// use only for tests +#[cfg(test)] +use rowan_test::{NodeOrToken, WalkEvent}; +#[cfg(test)] +use std::fs; +#[cfg(test)] +use std::io::Write; +use std::ops::Range; +#[cfg(test)] +use text_size::TextRange; + +mod lexer; +mod parser; +mod syntax; + +/// Just a showcase test to see how API for typed layer +/// of AST could work +#[test] +fn api_walktrough() { + // This is a simple YARA rule + 
// without errors + let source_code = " + rule test_rule { + // This is a comment + strings: + $a = \"test\" + condition: + $a or not true + } + "; + + // SourceFile is the main entry point for any given file + // it contains a `parse` method which returns a `Parse` struct + // that contains AST and list of errors + + let parse_struct = SourceFile::parse(source_code); + assert!(parse_struct.errors().is_empty()); + + // To obtian the AST we can use `tree` method + // every tree starts with `SourceFile` node which is the root + let ast = parse_struct.tree(); + + // Now we can travers the tree and obtain the other nodes + // for example we can loop over rules in the file + // in given example we have only one rule + for rule in ast.rules() { + // We can obtain also the identifier of the rule + // and assert its name is `test_rule` + assert_eq!(rule.identifier_token().unwrap().text(), "test_rule"); + + // Each rule also have 'RULE_KW' token + assert!(rule.rule_token().is_some()); + // Yes, just like this we can get the syntax token + assert!(rule.rule_token().unwrap().kind() == SyntaxKind::RULE_KW); + + // Last but not least we can obtain the block of the rule + // which is essentially a block expression + let block = rule.body().unwrap(); + + // Just to showcase, each node can also return comments + // that belongs to that specific node + for comment in block.comments() { + // In this case we have only one comment + assert_eq!(comment.text(), "This is a comment"); + } + + // This block expression consists (for now) of two parts + // optional strings and required condition part + // Firstly we can obtain the strings part + let strings = block.strings().unwrap(); + + // I can show again that we can obtain the tokens + // for example the `STRINGS_KW` token + assert!(strings.strings_token().is_some()); + assert!(strings.strings_token().unwrap().kind() == SyntaxKind::STRINGS_KW); + + // and also `COLON` token + assert!(strings.colon_token().is_some()); + 
assert!(strings.colon_token().unwrap().kind() == SyntaxKind::COLON); + + // Each strings section also contains multiple + // `VARIABLE_STMT` nodes + for variable_stmt in strings.variable_stmts() { + // each variable statement contains a variable token + // an assign token and a literal token + // now I will showm only the pattern token as an example + let pattern = variable_stmt.pattern().unwrap(); + + // For now pattern can be only a string literal + assert!(pattern.string_lit_token().is_some()); + assert!(pattern.string_lit_token().unwrap().kind() == SyntaxKind::STRING_LIT); + } + + // For the condition part, we can similarly get its body which is + // an `EXPRESSION_STMT` node + let condition = block.condition().unwrap(); + let condition_body = condition.expression_stmt().unwrap(); + + // Each expression statement for now consists of either + // `EXPRESSION`, `PREFIX_EXPR` or `LITERAL` node for binary expressions, unary expressions + // and literals respectively, which are essentially the only 3 things we can have in the + // condition so far. 
`EXPR` enum is used to group these 3 types of nodes together + // There is Pratt parser in the background used for operators precedence + let expr = condition_body.expr().unwrap(); + let expression = match &expr { + Expr::Expression(e) => e, + _ => unreachable!(), + }; + + // Now we can obtain `lhs`, `rhs` or `op` nodes for top level expression + // in this case we have `OR` operator + assert!(expression.op_token().is_some()); + assert!(expression.op_token().unwrap().kind() == SyntaxKind::OR_KW); + + // On the left hand side we have a LITERAL token + // It is essentially like I mentioned `EXPR` enum + // therefore we have to match it to obtain the `LITERAL` node + let lhs = expression.lhs().unwrap(); + let lhs_literal = match &lhs { + Expr::Literal(l) => l, + _ => unreachable!(), + }; + assert!(lhs_literal.token().kind() == SyntaxKind::VARIABLE); + assert_eq!(lhs_literal.token().text(), "$a"); + + // On the right hand side we have a `PREFIX_EXPR` node + // which is essentially a unary expression + let rhs = expression.rhs().unwrap(); + let rhs_prefix = match &rhs { + Expr::PrefixExpr(p) => p, + _ => unreachable!(), + }; + + // Prefix expression consists of an operator and an expression + // in this case we have `NOT` operator + assert!(rhs_prefix.op_token().is_some()); + assert!(rhs_prefix.op_token().unwrap().kind() == SyntaxKind::NOT_KW); + + // and the `LITERAL` node which is a `TRUE_KW` token + let rhs_body = rhs_prefix.expr().unwrap(); + let rhs_literal = match &rhs_body { + Expr::Literal(l) => l, + _ => unreachable!(), + }; + assert!(rhs_literal.token().kind() == SyntaxKind::TRUE_KW); + assert_eq!(rhs_literal.token().text(), "true"); + + // Last but not least, in any point we can obtain the syntax node + // for example let's obtain the syntax node for `EXPRESSION_STMT` + let expression_stmt_syntax = condition_body.syntax(); + + assert_eq!(expression_stmt_syntax.text().to_string(), "$a or not true"); + + // Syntax node have also bunch of methods + // for 
example we can obtain the parent node + let parent = expression_stmt_syntax.parent().unwrap(); + assert_eq!(parent.kind(), SyntaxKind::CONDITION); + assert_eq!(parent.text().to_string(), "condition:\n $a or not true"); + + // We can also obtain the children + let children = expression_stmt_syntax.first_child_or_token().unwrap(); + assert_eq!(children.kind(), SyntaxKind::EXPRESSION); + + // and also the next sibling, which in this layer can be also a whitespace + let next_sibling = parent.next_sibling_or_token().unwrap(); + assert_eq!(next_sibling.kind(), SyntaxKind::WHITESPACE); + + // Some helpers: + // for example get token at specific offset. This can be useful + // to obtain the token at given Error offset, to get its text, length etc. + let tkn = expression_stmt_syntax.token_at_offset(151.into()); + + // We can have offset that is between two tokens, so we use `right_biased` method + // to obtain the token on the right side of the offset if it is between two tokens + // or just to get the token type + assert!(tkn.right_biased().unwrap().kind() == SyntaxKind::OR_KW); + + // There is also a method to do a preorder traversal + // Note that we are using those methods just for `EXPRESSION_STMT` subtree + // but it can be also used on root tree and any other subtree + // It works with `WalkEvent` which can be either `Enter` or `Leave` + for (i, event) in expression_stmt_syntax.preorder_with_tokens().enumerate() { + // Assert first couple of events + match event { + WalkEvent::Enter(node) => { + let kind = match &node { + NodeOrToken::Node(it) => it.kind(), + NodeOrToken::Token(it) => it.kind(), + }; + if i == 0 { + assert_eq!(kind, SyntaxKind::EXPRESSION_STMT); + } + if i == 1 { + assert_eq!(kind, SyntaxKind::EXPRESSION); + } + if i == 2 { + assert_eq!(kind, SyntaxKind::LITERAL); + } + if i == 3 { + assert_eq!(kind, SyntaxKind::VARIABLE); + } + } + WalkEvent::Leave(node) => { + let kind = match &node { + NodeOrToken::Node(it) => it.kind(), + NodeOrToken::Token(it) 
=> it.kind(), + }; + if i == 4 { + assert_eq!(kind, SyntaxKind::VARIABLE); + } + } + } + } + + // The last thing I want to showcase are errors + // This is a simple YARA rule with errors + // it has two errors, one is missing `$` before variable + // declaration and the other one is unsupported `nor` operator + let source_code = " + rule test_rule { + // This is a comment + strings: + a = \"test\" + condition: + $a nor not true + } + "; + + let parse_struct = SourceFile::parse(source_code); + + // There are some errors + assert!(!parse_struct.errors().is_empty()); + assert!(parse_struct.errors().len() == 2); + assert!(parse_struct.errors()[0].to_string() == "expected a variable"); + assert!(parse_struct.errors()[1].to_string() == "unsupported expression"); + + // We still have the AST and we can traverse it + let ast = parse_struct.tree(); + + // We loop over rules + for rule in ast.rules() { + assert!(rule.identifier_token().unwrap().text() == "test_rule"); + let block = rule.body().unwrap(); + let condition = block.condition().unwrap(); + let condition_body = condition.expression_stmt().unwrap(); + let expr = condition_body.expr().unwrap(); + // The operator is wrong, therefore from binary expression we have + // a `LITERAL` expression + let expression = match &expr { + Expr::Literal(e) => e, + _ => unreachable!(), + }; + assert!(expression.token().kind() == SyntaxKind::VARIABLE); + + // and we can obtain the error token + let error_token = condition + .syntax() + .children_with_tokens() + .find(|it| it.kind() == SyntaxKind::ERROR) + .unwrap(); + + assert!(error_token.kind() == SyntaxKind::ERROR); + assert!(error_token.as_node().unwrap().text() == "nor"); + } + // We can also search a token that produced the error + // Even though it produces range, ParseErrors only supports text offsets + assert_eq!(parse_struct.errors()[1].range(), TextRange::new(173.into(), 173.into())); + + // But luckily we can obtain the token at the offset + // and from it we can get both 
its text and length + let tkn = ast.syntax().token_at_offset(173.into()).right_biased().unwrap(); + + assert_eq!(tkn.text(), "nor"); + // Error node contains also appropriate nested SyntaxKind + assert_eq!(tkn.kind(), SyntaxKind::IDENTIFIER); + // and also the length as TextRange for specific token + assert_eq!(tkn.text_range(), TextRange::new(173.into(), 176.into())); + // or + assert_eq!(tkn.text().len(), 3); + } +} + +/// This test is used to compare the output of the parser +/// with the expected output +#[test] +fn test_parse_text() { + let mut mint = goldenfile::Mint::new("."); + + for entry in globwalk::glob("tests/*.in").unwrap().flatten() { + // Path to the .in.zip file. + let path = entry.into_path(); + let display_path = path.display(); + + let input = fs::read_to_string(&path) + .unwrap_or_else(|_| panic!("Failed to read input file {:?}", display_path)); + + let ast_struct = SourceFile::parse(&input); + + let out_path = path.with_extension("").with_extension("out"); + + let mut output_file = mint.new_goldenfile(out_path).unwrap(); + + write!(output_file, "{:#?}", ast_struct.tree().syntax).unwrap(); + + // Check errors + let err_path = path.with_extension("").with_extension("err"); + if err_path.exists() { + let expected_errors = fs::read_to_string(&err_path) + .unwrap_or_else(|_| panic!("Failed to read error file {:?}", err_path.display())); + let actual_errors = ast_struct + .errors() + .iter() + .map(|error| format!("{:?}", error)) + .collect::>() + .join("\n"); + assert_eq!(actual_errors, expected_errors); + } else { + assert!(ast_struct.errors().is_empty(), "Unexpected errors: {:?}", ast_struct.errors()); + } + } +} diff --git a/src/main.rs b/src/main.rs deleted file mode 100644 index 6bd6dda..0000000 --- a/src/main.rs +++ /dev/null @@ -1,109 +0,0 @@ -/// This library is used to create a parser for YARA language -/// It should provide also token for whitespaces -/// as we want full fidelity and error resilience.; -use std::{env::args, fs, io::Write, 
path::Path}; - -use rowan_test::{GreenNode, NodeOrToken}; - -use crate::lexer::tokenize; -use crate::parser::{SyntaxKind, TokenSource, TreeSink}; -use crate::syntax::syntax_node::{SyntaxElement, SyntaxNode}; -use crate::syntax::{ - syntax_error::SyntaxError, text_token_source::TextTokenSource, text_tree_sink::TextTreeSink, -}; - -mod lexer; -mod parser; -mod syntax; - -fn main() { - // Take file as an input and parse it into tokens - let arg = args().nth(1).expect("No pathname given"); - let path = Path::new(&arg); - let input = fs::read_to_string(path).unwrap(); - - parse_text(&input); -} - -fn parse_text(text: &str) -> (GreenNode, Vec) { - let (tokens, lexer_errors) = tokenize(text); - let mut token_source = TextTokenSource::new(text, &tokens); - let mut tree_sink = TextTreeSink::new(text, &tokens); - - parser::parse(&mut token_source, &mut tree_sink); - let (tree, mut parser_errors) = tree_sink.finish(); - parser_errors.extend(lexer_errors); - - let syntax_tree = SyntaxNode::new_root(tree.clone()); - - println!("Tokens: \n{:?}", tokens); - println!(); - println!("Errors: \n{:?}", parser_errors); - println!(); - - let indent = 0; - let result = print(indent, syntax_tree.into()); - - print!("{}", result); - - (tree, parser_errors) -} - -fn print(indent: usize, element: SyntaxElement) -> String { - let mut result = String::new(); - let kind: SyntaxKind = element.kind(); - result.push_str(&format!("{:indent$}", "", indent = indent)); - match element { - NodeOrToken::Node(node) => { - result.push_str(&format!("- {:?}\n", kind)); - for child in node.children_with_tokens() { - result.push_str(&print(indent + 2, child)); - } - } - - NodeOrToken::Token(token) => { - result.push_str(&format!("- {:?} {:?}\n", token.text(), kind)); - } - } - result -} - -#[test] -fn test_parse_text() { - let mut mint = goldenfile::Mint::new("."); - - for entry in globwalk::glob("tests/*.in").unwrap().flatten() { - // Path to the .in.zip file. 
- let path = entry.into_path(); - let display_path = path.display(); - - let input = fs::read_to_string(&path) - .unwrap_or_else(|_| panic!("Failed to read input file {:?}", display_path)); - - let (tree, errors) = parse_text(&input); - - let out_path = path.with_extension("").with_extension("out"); - let syntax_tree = SyntaxNode::new_root(tree.clone()); - - let output = print(0, syntax_tree.into()); - - let mut output_file = mint.new_goldenfile(out_path).unwrap(); - - write!(output_file, "{}", output).unwrap(); - - // Check errors - let err_path = path.with_extension("").with_extension("err"); - if err_path.exists() { - let expected_errors = fs::read_to_string(&err_path) - .unwrap_or_else(|_| panic!("Failed to read error file {:?}", err_path.display())); - let actual_errors = errors - .iter() - .map(|error| format!("{:?}", error)) - .collect::>() - .join("\n"); - assert_eq!(actual_errors, expected_errors); - } else { - assert!(errors.is_empty(), "Unexpected errors: {:?}", errors); - } - } -} diff --git a/src/parser/event.rs b/src/parser/event.rs index af2c563..a31bdc6 100644 --- a/src/parser/event.rs +++ b/src/parser/event.rs @@ -1,3 +1,8 @@ +/// This module provides a way to process the events from the parser +/// It is decoupled from the parser +/// +/// The `TreeSink` trait is used to connect parser and tree builder +/// Parser produces a stream of `Event`s and they are converted to a real tree use std::mem; use crate::parser::{ @@ -8,13 +13,21 @@ use crate::parser::{ #[derive(Debug)] pub(crate) enum Event { + /// This event specifies the start of a new node + /// It is either abandoned or completed with `Finish` event + /// + /// All children that are consumed between `Start` and `Finish` are attached to this node Start { kind: SyntaxKind, forward_parent: Option, }, + /// Complete the current node Finish, + /// Add a new token to the current node + /// `n_raw_tokens` is used for consuming multiple tokens at once that should be glued together + /// this is not 
supported in YARA subset, but will be used in the future Token { kind: SyntaxKind, n_raw_tokens: u8, @@ -27,32 +40,24 @@ pub(crate) enum Event { impl Event { pub(crate) fn tombstone() -> Self { - Event::Start { - kind: TOMBSTONE, - forward_parent: None, - } + Event::Start { kind: TOMBSTONE, forward_parent: None } } } +/// Generate the syntax tree by processing the events pub(crate) fn process(sink: &mut dyn TreeSink, mut events: Vec) { let mut forward_parents = Vec::new(); for i in 0..events.len() { match mem::replace(&mut events[i], Event::tombstone()) { - Event::Start { - kind, - forward_parent, - } => { + Event::Start { kind, forward_parent } => { forward_parents.push(kind); let mut idx = i; let mut fp = forward_parent; while let Some(fwd) = fp { idx += fwd as usize; fp = match mem::replace(&mut events[idx], Event::tombstone()) { - Event::Start { - kind, - forward_parent, - } => { + Event::Start { kind, forward_parent } => { forward_parents.push(kind); forward_parent } diff --git a/src/parser/grammar.rs b/src/parser/grammar.rs index 1268919..f07246f 100644 --- a/src/parser/grammar.rs +++ b/src/parser/grammar.rs @@ -1,13 +1,18 @@ +/// This is the hand-written parser and `grammar` of YARA language + mod expressions; mod items; use crate::parser::{ grammar::expressions::rule_body, parser::{CompletedMarker, Marker, Parser}, + syntax_kind::T, token_set::TokenSet, SyntaxKind::{self, *}, }; +/// Parse a source file +/// Each YARA file is a SOURCE_FILE that has some content pub(crate) fn parse_source_file(p: &mut Parser) { let m = p.start(); @@ -15,22 +20,13 @@ pub(crate) fn parse_source_file(p: &mut Parser) { m.complete(p, SOURCE_FILE); } +/// To recover from error, we can parse block of a rule on its own fn error_block(p: &mut Parser, message: &str) { - assert!(p.at(LBRACE)); + assert!(p.at(T!['{'])); let m = p.start(); p.error(message); - p.bump(LBRACE); + p.bump(T!['{']); rule_body(p); - p.eat(RBRACE); + p.eat(T!['}']); m.complete(p, ERROR); } - -fn name_r(p: &mut 
Parser<'_>, recovery: TokenSet) { - if p.at(IDENTIFIER) { - let m = p.start(); - p.bump(IDENTIFIER); - m.complete(p, IDENTIFIER); - } else { - p.err_recover("expected a name", recovery); - } -} diff --git a/src/parser/grammar/expressions.rs b/src/parser/grammar/expressions.rs index ba715d7..1d34e00 100644 --- a/src/parser/grammar/expressions.rs +++ b/src/parser/grammar/expressions.rs @@ -2,25 +2,37 @@ mod atom; use super::*; +/// Recovery set for `strings` block. This also should be adjusted and tweaked to +/// better represents recovery set later on +const VARIABLE_RECOVERY_SET: TokenSet = TokenSet::new(&[T![strings]]); + +/// Parse a rule body +/// A rule body consists `{`, rule_body and `}` +/// This can probably be later simplified to not have both +/// `rule_body` and `block_expr`. pub(crate) fn block_expr(p: &mut Parser) { - if !p.at(LBRACE) { + if !p.at(T!['{']) { p.error("expected a block expression"); return; } let m = p.start(); - p.bump(LBRACE); + p.bump(T!['{']); rule_body(p); - p.expect(RBRACE); + p.expect(T!['}']); m.complete(p, BLOCK_EXPR); } +/// Parse a rule body +/// A rule body consists of `strings` and `condition` blocks +/// `strings` part is optional but condition is required +/// but each of them can be defined only once and have to be in order pub(super) fn rule_body(p: &mut Parser) { let mut has_strings = false; let mut has_condition = false; - while !p.at(EOF) && !p.at(RBRACE) { + while !p.at(EOF) && !p.at(T!['}']) { match p.current() { - // add metadata later - STRINGS => { + // add metadata support later + T![strings] => { if has_strings { p.error("only one strings block is allowed"); } @@ -30,7 +42,7 @@ pub(super) fn rule_body(p: &mut Parser) { strings(p); has_strings = true; } - CONDITION => { + T![condition] => { if has_condition { p.error("only one condition block is allowed"); } @@ -38,64 +50,82 @@ pub(super) fn rule_body(p: &mut Parser) { has_condition = true; } _ => { + // It did not contain strings or condition in valid form + // 
but we can still try to parse their body and throw an error for parent + // for now it just looks at next 2 tokens to differenciate between valid strings + // body or condition body. This should probably be adjusted later p.err_and_bump("expected strings or condition"); + if p.current() == T![:] { + p.eat(T![:]); + if p.current() == T![variable] && p.nth(1) == T![=] { + strings_body(p) + } else { + condition_body(p); + } + } } } } } +/// Parse a `strings` block +/// It consists of `strings` keyword,`:` token and strings body fn strings(p: &mut Parser) { - assert!(p.at(STRINGS)); + assert!(p.at(T![strings])); let m = p.start(); - p.bump(STRINGS); - p.expect(COLON); + p.bump(T![strings]); + p.expect(T![:]); strings_body(p); m.complete(p, STRINGS); } +/// Parse a `condition` block +/// It consists of `condition` keyword,`:` token and condition body fn condition(p: &mut Parser) { - assert!(p.at(CONDITION)); + assert!(p.at(T![condition])); let m = p.start(); - p.bump(CONDITION); - p.expect(COLON); + p.bump(T![condition]); + p.expect(T![:]); condition_body(p); m.complete(p, CONDITION); } -const VARIABLE_RECOVERY_SET: TokenSet = TokenSet::new(&[VARIABLE]); - +/// Parse a `strings` body +/// It consists of a list of `variable` and `=` token and a string pub(super) fn strings_body(p: &mut Parser) { // add support for meta also - while !p.at(EOF) && !p.at(STRINGS) && !p.at(CONDITION) && !p.at(RBRACE) { + while !p.at(EOF) && !p.at(T![condition]) && !p.at(T!['}']) { let m = p.start(); - if p.at(VARIABLE) { - let m = p.start(); - p.bump(VARIABLE); - m.complete(p, VARIABLE); + if p.at(T![variable]) { + p.bump(T![variable]); } else { p.err_recover("expected a variable", VARIABLE_RECOVERY_SET); } - p.expect(ASSIGN); + p.expect(T![=]); // so far only strings are supported, later add match for hex strings and regex - string(p); + pattern(p); m.complete(p, VARIABLE_STMT); } } +/// Parse a string. 
For now string can be only basic plaintext string // add support for hex and regex strings later on -fn string(p: &mut Parser) { +fn pattern(p: &mut Parser) { let m = p.start(); match p.current() { - STRING => p.bump(STRING), + STRING_LIT => p.bump(STRING_LIT), _ => p.err_and_bump("expected a string"), } // add string modifiers - m.complete(p, STRING); + m.complete(p, PATTERN); } +/// Parse a `condition` body +/// It consists of a list of expressions +/// Pratt parser is used to parse expressions pub(super) fn condition_body(p: &mut Parser) { // add support for meta also - while !p.at(EOF) && !p.at(STRINGS) && !p.at(CONDITION) && !p.at(RBRACE) { + while !p.at(EOF) && !p.at(T!['}']) { let m = p.start(); if let Some(cm) = expression(p, Some(m), 1) { let m = cm.precede(p); @@ -113,12 +143,18 @@ enum Associativity { fn current_op(p: &mut Parser) -> (u8, SyntaxKind, Associativity) { match p.current() { // add support for other operators - AND => (4, AND, Associativity::Left), - OR => (3, OR, Associativity::Left), + T![and] => (4, T![and], Associativity::Left), + T![or] => (3, T![or], Associativity::Left), _ => (0, ERROR, Associativity::Left), } } +/// Parse an expression using a Pratt parser. 
+/// +/// Expression can be binary, unary or literal +/// This is also used to reflect operator precedence and associativity +/// It is inspired by Pratt parser used in rust-analyter +/// fn expression(p: &mut Parser, m: Option, bp: u8) -> Option { let m = m.unwrap_or_else(|| p.start()); let mut lhs = match lhs(p) { @@ -147,11 +183,12 @@ fn expression(p: &mut Parser, m: Option, bp: u8) -> Option Option { let m; let kind = match p.current() { // unary operators - NOT => { + T![not] => { m = p.start(); p.bump_any(); PREFIX_EXPR diff --git a/src/parser/grammar/expressions/atom.rs b/src/parser/grammar/expressions/atom.rs index 8a87972..0cd918c 100644 --- a/src/parser/grammar/expressions/atom.rs +++ b/src/parser/grammar/expressions/atom.rs @@ -1,9 +1,15 @@ use super::*; +/// Recover set for expressions, FIRST set is used +const EXPR_RECOVERY_SET: TokenSet = TokenSet::new(&[T![variable], T![true], T![false], T![not]]); + // So far the only literals we support are true, false and variables // numbers will be added later -pub(crate) const LITERAL_FIRST: TokenSet = TokenSet::new(&[TRUE, FALSE, VARIABLE]); +pub(crate) const LITERAL_FIRST: TokenSet = + TokenSet::new(&[T![true], T![false], T![variable], T![string_lit], NUMBER]); +/// Parse a literal +/// Literal right now is only: true, false, variable, string_lit or number pub(crate) fn literal(p: &mut Parser) -> Option { if !p.at_ts(LITERAL_FIRST) { return None; @@ -13,9 +19,8 @@ pub(crate) fn literal(p: &mut Parser) -> Option { Some(m.complete(p, LITERAL)) } -const EXPR_RECOVERY_SET: TokenSet = TokenSet::new(&[VARIABLE, TRUE, FALSE, NOT]); - -// add support for while/for loops, if/else statements, etc. +/// Add support for while/for loops, if/else statements, etc. 
+/// Right now the only atom in expression is literal pub(super) fn atom_expr(p: &mut Parser) -> Option { if let Some(m) = literal(p) { return Some(m); @@ -25,7 +30,7 @@ pub(super) fn atom_expr(p: &mut Parser) -> Option { #[allow(clippy::match_single_binding)] match p.current() { _ => { - p.err_recover("expected expression", EXPR_RECOVERY_SET); + p.err_recover("unsupported expression", EXPR_RECOVERY_SET); #[allow(clippy::needless_return)] return None; } diff --git a/src/parser/grammar/items.rs b/src/parser/grammar/items.rs index ff500ad..d6e93b6 100644 --- a/src/parser/grammar/items.rs +++ b/src/parser/grammar/items.rs @@ -1,14 +1,15 @@ use super::*; +/// This is a recover set for rule, FIRST set is used to recover from error +/// This will probably also need to be tweaked, adjusted and extended in the future pub(super) const RULE_RECOVERY_SET: TokenSet = TokenSet::new( - // Add import here when it is supported - &[ - RULE, // rule - ], + // imports could be here when it is supported + &[T![rule]], ); +/// Process the content of a file, stop on `EOF` token or `}` if `stop_on_r_brace` is true pub(super) fn mod_content(p: &mut Parser, stop_on_r_brace: bool) { - while !(p.at(EOF) || p.at(RBRACE) && stop_on_r_brace) { + while !(p.at(EOF) || p.at(T!['}']) && stop_on_r_brace) { process_top_level(p, stop_on_r_brace); } } @@ -22,18 +23,23 @@ pub(super) fn process_top_level(p: &mut Parser, stop_on_r_brace: bool) { } Err(m) => m, }; + + // On top level we can right now only have rules + // So if rules are not successfully parsed, we can just abandon the marker + // and either create an error block and try to parse it as a rule body and throw + // an error or just throw an error m.abandon(p); match p.current() { - LBRACE => { + T!['{'] => { error_block(p, "expected an item"); } - RBRACE if !stop_on_r_brace => { + T!['}'] if !stop_on_r_brace => { let e = p.start(); p.error("unmatched }"); - p.bump(RBRACE); + p.bump(T!['}']); e.complete(p, ERROR); } - EOF | RBRACE => 
p.error("expected an item"), + EOF | T!['}'] => p.error("expected an item"), _ => p.err_and_bump("expected an item"), } } @@ -43,15 +49,21 @@ pub(super) fn process_top_level(p: &mut Parser, stop_on_r_brace: bool) { pub(super) fn opt_rule_import_include(p: &mut Parser, m: Marker) -> Result<(), Marker> { // add rule modifiers to match current and lookahead next with p.nth(1) for RULE or ERROR match p.current() { - RULE => rule(p, m), + T![rule] => rule(p, m), _ => return Err(m), } Ok(()) } +// Parse a rule +// It consists of rule name [`IDENTIFIER`] and a body [`block_expr`] fn rule(p: &mut Parser, m: Marker) { - p.bump(RULE); - name_r(p, RULE_RECOVERY_SET); + p.bump(T![rule]); + if p.at(IDENTIFIER) { + p.bump(IDENTIFIER); + } else { + p.err_recover("expected a name", RULE_RECOVERY_SET); + } // add optional support for rule tags expressions::block_expr(p); m.complete(p, RULE); diff --git a/src/parser/mod.rs b/src/parser/mod.rs index 5585781..0d1c3ca 100644 --- a/src/parser/mod.rs +++ b/src/parser/mod.rs @@ -1,6 +1,12 @@ -pub mod syntaxkind; +//! YARA parser +//! +//! It uses abstract `TokenSource` and `TreeSink` traits. +//! It is cursor into the sequence of tokens. Parsing happens in +//! `grammar` module. -pub use syntaxkind::SyntaxKind; +pub mod syntax_kind; + +pub use syntax_kind::SyntaxKind; mod event; mod grammar; #[allow(clippy::module_inception)] @@ -12,14 +18,24 @@ use grammar::parse_source_file; #[derive(Debug, Clone, PartialEq, Eq, Hash)] pub struct ParseError(pub String); +/// `TokenSource` abstracts the source of the tokens parser uses. +/// +/// This allows us to treat text and token trees in the same way. 
pub trait TokenSource { + /// Returns the current token fn current(&self) -> Token; + /// Lookahead `n` tokens fn lookahead_nth(&self, n: usize) -> Token; + /// Advance the cursor to the next token fn bump(&mut self); + + /// Check if the current token is keyword + fn is_keyword(&self, kw: &str) -> bool; } +/// `Token` abstracts the cursor for `TokenSource` #[derive(Debug, Copy, Clone, Eq, PartialEq)] pub struct Token { pub kind: SyntaxKind, @@ -27,13 +43,18 @@ pub struct Token { pub is_jointed_to_next: bool, } +/// `TreeSink` abstracts detail of syntax tree creation. pub trait TreeSink { + /// Adds new token to specific node fn token(&mut self, kind: SyntaxKind, n_tokens: u8); + /// Start new node fn start_node(&mut self, kind: SyntaxKind); + /// Finish the current node and return control to the parent fn finish_node(&mut self); + /// Create an error with message fn error(&mut self, error: ParseError); } diff --git a/src/parser/parser.rs b/src/parser/parser.rs index a9fef81..5bb63d1 100644 --- a/src/parser/parser.rs +++ b/src/parser/parser.rs @@ -6,10 +6,14 @@ use crate::parser::{ event::Event, token_set::TokenSet, ParseError, - SyntaxKind::{self, EOF, ERROR, LBRACE, RBRACE, TOMBSTONE}, + SyntaxKind::{self, EOF, ERROR, TOMBSTONE}, TokenSource, }; +/// This structure provides API for navigating through the token stream +/// and constructs a parse tree. +/// The parsing process is in `grammar/` module. 
+/// The result of parsing is a stream of `Event`s pub(crate) struct Parser<'t> { token_source: &'t mut dyn TokenSource, events: Vec, @@ -18,21 +22,19 @@ impl<'t> Parser<'t> { pub(crate) fn new(token_source: &'t mut dyn TokenSource) -> Parser<'t> { - Parser { - token_source, - events: Vec::new(), - steps: Cell::new(0), - } + Parser { token_source, events: Vec::new(), steps: Cell::new(0) } } pub(crate) fn finish(self) -> Vec { self.events } + /// Returns the current token pub(crate) fn current(&self) -> SyntaxKind { self.nth(0) } + /// Lookahead `n` tokens pub(crate) fn nth(&self, n: usize) -> SyntaxKind { assert!(n < 3); @@ -43,11 +45,13 @@ impl<'t> Parser<'t> { self.token_source.lookahead_nth(n).kind } + /// Check if the current token is specific `SyntaxKind` kind pub(crate) fn at(&self, kind: SyntaxKind) -> bool { // currently we don't need support for composite tokens (e.g. `>>`) self.token_source.lookahead_nth(0).kind == kind } + /// Consume the next token if it is of expected kind pub(crate) fn eat(&mut self, kind: SyntaxKind) -> bool { if !self.at(kind) { return false; @@ -59,20 +63,26 @@ impl<'t> Parser<'t> { true } + /// Check if current token is in the given set of tokens pub(crate) fn at_ts(&self, kinds: TokenSet) -> bool { kinds.contains(self.current()) } + /// Starts a new node in the syntax tree + /// All nodes that are consumed between the start and finish of the `Marker` + /// belong to the same node pub(crate) fn start(&mut self) -> Marker { let pos = self.events.len() as u32; self.push_event(Event::tombstone()); Marker::new(pos) } + /// Consumes the next token if it is of expected kind pub(crate) fn bump(&mut self, kind: SyntaxKind) { assert!(self.eat(kind)); } + /// Consume any token pub(crate) fn bump_any(&mut self) { let kind = self.nth(0); if kind == EOF { @@ -81,6 +91,7 @@ impl<'t> Parser<'t> { self.do_bump(kind, 1); } + /// Create a Token event fn do_bump(&mut self, kind: SyntaxKind, n_raw_tokens: u8) { for _ 
in 0..n_raw_tokens { self.token_source.bump(); @@ -93,11 +104,16 @@ impl<'t> Parser<'t> { self.events.push(event); } + /// Report an error with specified message + /// This can be in future extended to support also range + /// Right now `ParseError` is just converted to `SyntaxError` + /// after the parsing is done and uses just token offset (not range) pub(crate) fn error>(&mut self, message: T) { let msg = ParseError(message.into()); self.push_event(Event::Error { msg }); } + /// Consume the next token if it is of expected kind, otherwise report an error pub(crate) fn expect(&mut self, kind: SyntaxKind) -> bool { if self.eat(kind) { return true; @@ -106,10 +122,15 @@ impl<'t> Parser<'t> { false } + /// Create an error node and consume the next token + /// This token belongs to the `Error` node pub(crate) fn err_and_bump(&mut self, message: &str) { self.err_recover(message, TokenSet::EMPTY) } + /// Create an error node and consume the next token if it is of expected kind + /// If the current token belongs to given recovery set, it just reports an error + /// and tries to recover pub(crate) fn err_recover(&mut self, message: &str, recovery: TokenSet) { if self.at_ts(recovery) { self.error(message); @@ -123,6 +144,8 @@ impl<'t> Parser<'t> { } } +/// Marker that is used to mark the start of a new node in the syntax tree +/// It groups specific node/tokens that belong to this node pub(crate) struct Marker { pos: u32, bomb: DropBomb, @@ -130,12 +153,10 @@ pub(crate) struct Marker { impl Marker { fn new(pos: u32) -> Marker { - Marker { - pos, - bomb: DropBomb::new("Marker must be either completed or abandoned"), - } + Marker { pos, bomb: DropBomb::new("Marker must be either completed or abandoned") } } + /// Finish the syntax tree node and assign specific kind to it pub(crate) fn complete(mut self, p: &mut Parser, kind: SyntaxKind) -> CompletedMarker { self.bomb.defuse(); let idx = self.pos as usize; @@ -149,15 +170,14 @@ impl Marker { CompletedMarker::new(self.pos, 
kind) } + /// Abandon the syntax tree node + /// all the children are then attached to the parent of this node pub(crate) fn abandon(mut self, p: &mut Parser) { self.bomb.defuse(); let idx = self.pos as usize; if idx == p.events.len() - 1 { match p.events.pop() { - Some(Event::Start { - kind: TOMBSTONE, - forward_parent: None, - }) => (), + Some(Event::Start { kind: TOMBSTONE, forward_parent: None }) => (), _ => unreachable!(), } } @@ -174,6 +194,9 @@ impl CompletedMarker { CompletedMarker { pos, kind } } + /// This allows us to create a new node which should precede the current node + /// Parser could start node `A`, complete it. Afterwards it decides that it should + /// have started node `B` before `A` was started. This allows exactly that. pub(crate) fn precede(self, p: &mut Parser) -> Marker { let new_pos = p.start(); let idx = self.pos as usize; diff --git a/src/parser/syntax_kind.rs b/src/parser/syntax_kind.rs new file mode 100644 index 0000000..a09082d --- /dev/null +++ b/src/parser/syntax_kind.rs @@ -0,0 +1,32 @@ +//! SyntaxKind is the main enum for the syntax tree. +//! It represents the kind of a node in the syntax tree +//! for YARA language +//! +//! all the variants are generated and located in `syntax_kind/generated.rs` + +mod generated; + +#[allow(unreachable_pub)] +pub use self::generated::{SyntaxKind, T}; + +impl From for SyntaxKind { + #[inline] + fn from(d: u16) -> SyntaxKind { + assert!(d <= (SyntaxKind::__LAST as u16)); + unsafe { std::mem::transmute::(d) } + } +} + +impl From for u16 { + #[inline] + fn from(k: SyntaxKind) -> u16 { + k as u16 + } +} + +impl SyntaxKind { + #[inline] + pub fn is_trivia(self) -> bool { + matches!(self, SyntaxKind::WHITESPACE | SyntaxKind::COMMENT) + } +} diff --git a/src/parser/syntax_kind/generated.rs b/src/parser/syntax_kind/generated.rs new file mode 100644 index 0000000..bd253f4 --- /dev/null +++ b/src/parser/syntax_kind/generated.rs @@ -0,0 +1,92 @@ +//! Generated by `sourcegen_ast`, do not edit by hand. 
+ +#![allow(bad_style, missing_docs, unreachable_pub, clippy::upper_case_acronyms)] +#[doc = r" The kind of syntax node, e.g. `IDENTIFIER`, `RULE_KW`, or `AND`."] +#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Debug)] +#[repr(u16)] +pub enum SyntaxKind { + #[doc(hidden)] + TOMBSTONE, + #[doc(hidden)] + EOF, + COLON, + L_PAREN, + R_PAREN, + L_BRACE, + R_BRACE, + COMMA, + ASSIGN, + AND_KW, + OR_KW, + NOT_KW, + TRUE_KW, + FALSE_KW, + RULE_KW, + STRINGS_KW, + CONDITION_KW, + STRING_LIT, + NUMBER, + IDENTIFIER, + VARIABLE, + WHITESPACE, + COMMENT, + ERROR, + RULE, + STRINGS, + CONDITION, + SOURCE_FILE, + BLOCK_EXPR, + PREFIX_EXPR, + LITERAL, + EXPRESSION, + EXPRESSION_STMT, + VARIABLE_STMT, + PATTERN, + #[doc(hidden)] + __LAST, +} +use self::SyntaxKind::*; +impl SyntaxKind { + pub fn is_keyword(self) -> bool { + matches!( + self, + AND_KW | OR_KW | NOT_KW | TRUE_KW | FALSE_KW | RULE_KW | STRINGS_KW | CONDITION_KW + ) + } + pub fn is_punct(self) -> bool { + matches!(self, COLON | L_PAREN | R_PAREN | L_BRACE | R_BRACE | COMMA | ASSIGN) + } + pub fn is_literal(self) -> bool { + matches!(self, STRING_LIT | NUMBER) + } + pub fn from_keyword(ident: &str) -> Option { + let kw = match ident { + "and" => AND_KW, + "or" => OR_KW, + "not" => NOT_KW, + "true" => TRUE_KW, + "false" => FALSE_KW, + "rule" => RULE_KW, + "strings" => STRINGS_KW, + "condition" => CONDITION_KW, + _ => return None, + }; + Some(kw) + } + pub fn from_char(c: char) -> Option { + let tok = match c { + ':' => COLON, + '(' => L_PAREN, + ')' => R_PAREN, + '{' => L_BRACE, + '}' => R_BRACE, + ',' => COMMA, + '=' => ASSIGN, + _ => return None, + }; + Some(tok) + } +} +#[macro_export] +macro_rules ! 
T { [:] => { $ crate :: SyntaxKind :: COLON } ; ['('] => { $ crate :: SyntaxKind :: L_PAREN } ; [')'] => { $ crate :: SyntaxKind :: R_PAREN } ; ['{'] => { $ crate :: SyntaxKind :: L_BRACE } ; ['}'] => { $ crate :: SyntaxKind :: R_BRACE } ; [,] => { $ crate :: SyntaxKind :: COMMA } ; [=] => { $ crate :: SyntaxKind :: ASSIGN } ; [and] => { $ crate :: SyntaxKind :: AND_KW } ; [or] => { $ crate :: SyntaxKind :: OR_KW } ; [not] => { $ crate :: SyntaxKind :: NOT_KW } ; [true] => { $ crate :: SyntaxKind :: TRUE_KW } ; [false] => { $ crate :: SyntaxKind :: FALSE_KW } ; [rule] => { $ crate :: SyntaxKind :: RULE_KW } ; [strings] => { $ crate :: SyntaxKind :: STRINGS_KW } ; [condition] => { $ crate :: SyntaxKind :: CONDITION_KW } ; [identifier] => { $ crate :: SyntaxKind :: IDENTIFIER } ; [variable] => { $ crate :: SyntaxKind :: VARIABLE } ; [string_lit] => { $ crate :: SyntaxKind :: STRING_LIT } ; } +pub use T; diff --git a/src/parser/syntaxkind.rs b/src/parser/syntaxkind.rs deleted file mode 100644 index fc47c2c..0000000 --- a/src/parser/syntaxkind.rs +++ /dev/null @@ -1,60 +0,0 @@ -#![allow(clippy::upper_case_acronyms)] - -#[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Debug)] -#[repr(u16)] -pub enum SyntaxKind { - TOMBSTONE, - EOF, - RULE, - STRINGS, - CONDITION, - AND, - OR, - NOT, - IDENTIFIER, - VARIABLE, - STRING, - ASSIGN, - COLON, - LBRACE, - RBRACE, - LPAREN, - RPAREN, - COMMA, - NUMBER, - TRUE, - FALSE, - WHITESPACE, - COMMENT, - ERROR, - SOURCE_FILE, - BLOCK_EXPR, - PREFIX_EXPR, - LITERAL, - EXPRESSION, - EXPRESSION_STMT, - VARIABLE_STMT, - __LAST, -} - -impl From for SyntaxKind { - #[inline] - fn from(d: u16) -> SyntaxKind { - assert!(d <= (SyntaxKind::__LAST as u16)); - unsafe { std::mem::transmute::(d) } - } -} - -impl From for u16 { - #[inline] - fn from(k: SyntaxKind) -> u16 { - k as u16 - } -} - -impl SyntaxKind { - #[inline] - pub fn is_trivia(self) -> bool { - matches!(self, SyntaxKind::WHITESPACE | SyntaxKind::COMMENT) - } -} diff --git 
a/src/parser/token_set.rs b/src/parser/token_set.rs index fe3d907..36680ab 100644 --- a/src/parser/token_set.rs +++ b/src/parser/token_set.rs @@ -1,3 +1,7 @@ +//! A bit set for `SyntaxKind` +//! This can be used to create a new set of tokens +//! mainly for Error recovery FIRST/FOLLOW sets + use crate::parser::SyntaxKind; #[derive(Clone, Copy)] diff --git a/src/syntax/ast.rs b/src/syntax/ast.rs new file mode 100644 index 0000000..877cc2c --- /dev/null +++ b/src/syntax/ast.rs @@ -0,0 +1,100 @@ +//! AST layer that is on top of the untyped syntax tree. +//! AST methods are mainly generated using `tests/sourcegen_ast` file + +mod generated; +mod traits; +mod expr_ext; +mod operators; + +use std::marker::PhantomData; + +use crate::{ + syntax::syntax_node::{SyntaxNode, SyntaxNodeChildren, SyntaxToken}, + SyntaxKind, +}; + +pub use self::{ + generated::{nodes::*, tokens::*}, + traits::HasComments, +}; + +/// Zero runtime cost conversion to AST layer +pub trait AstNode { + fn can_cast(kind: SyntaxKind) -> bool + where + Self: Sized; + + fn cast(syntax: SyntaxNode) -> Option + where + Self: Sized; + + fn syntax(&self) -> &SyntaxNode; + + fn clone_for_update(&self) -> Self + where + Self: Sized, + { + Self::cast(self.syntax().clone_for_update()).unwrap() + } + + fn clone_subtree(&self) -> Self + where + Self: Sized, + { + Self::cast(self.syntax().clone_subtree()).unwrap() + } +} + +/// Same as `AstNode` but for tokens +pub trait AstToken { + fn can_cast(kind: SyntaxKind) -> bool + where + Self: Sized; + + fn cast(syntax: SyntaxToken) -> Option + where + Self: Sized; + + fn syntax(&self) -> &SyntaxToken; + + fn text(&self) -> &str { + self.syntax().text() + } +} + +/// An iterator over `SyntaxNode` children of a particular AST type` +#[derive(Debug, Clone)] +pub struct AstChildren { + inner: SyntaxNodeChildren, + ph: PhantomData, +} + +impl AstChildren { + pub fn new(parent: &SyntaxNode) -> Self { + AstChildren { inner: parent.children(), ph: PhantomData } + } +} + +impl 
Iterator for AstChildren { + type Item = N; + + fn next(&mut self) -> Option { + self.inner.find_map(N::cast) + } +} + +pub mod support { + use super::{AstChildren, AstNode, SyntaxKind, SyntaxNode, SyntaxToken}; + + pub fn child(parent: &SyntaxNode) -> Option { + parent.children().find_map(N::cast) + } + + pub fn children(parent: &SyntaxNode) -> AstChildren { + AstChildren::new(parent) + } + + pub fn token(parent: &SyntaxNode, kind: SyntaxKind) -> Option { + parent.children_with_tokens().filter_map(|it| it.into_token()).find(|it| it.kind() == kind) + } +} diff --git a/src/syntax/ast/expr_ext.rs b/src/syntax/ast/expr_ext.rs new file mode 100644 index 0000000..776110a --- /dev/null +++ b/src/syntax/ast/expr_ext.rs @@ -0,0 +1,103 @@ +//! Similarly to `operators.rs`, it contains various extensions +//! and methods for `ast::Expr` and `ast::Literal` nodes. +//! `LiteralKind` type will probably also be changed during integration +//! It is for now just to showcase its abilities + +use crate::{ + syntax::ast::{ + self, + operators::{BinaryOp, LogicOp, UnaryOp}, + support, AstNode, AstToken, + }, + SyntaxToken, T, +}; + +impl ast::PrefixExpr { + pub fn op_kind(&self) -> Option { + let res = match self.op_token()?.kind() { + T![not] => UnaryOp::Not, + _ => return None, + }; + Some(res) + } + + pub fn op_token(&self) -> Option { + self.syntax().first_child_or_token()?.into_token() + } +} + +impl ast::Expression { + pub fn op_details(&self) -> Option<(SyntaxToken, BinaryOp)> { + self.syntax().children_with_tokens().filter_map(|it| it.into_token()).find_map(|c| { + let bin_op = match c.kind() { + T![and] => BinaryOp::LogicOp(LogicOp::And), + T![or] => BinaryOp::LogicOp(LogicOp::Or), + _ => return None, + }; + Some((c, bin_op)) + }) + } + + pub fn op_kind(&self) -> Option { + self.op_details().map(|t| t.1) + } + + pub fn op_token(&self) -> Option { + self.op_details().map(|t| t.0) + } + + pub fn lhs(&self) -> Option { + support::children(self.syntax()).next() + } + + pub fn 
rhs(&self) -> Option { + support::children(self.syntax()).nth(1) + } + + pub fn sub_exprs(&self) -> (Option, Option) { + let mut children = support::children(self.syntax()); + let first = children.next(); + let second = children.next(); + (first, second) + } +} + +#[derive(Clone, Debug, PartialEq, Eq, Hash)] +pub enum LiteralKind { + String(ast::StringLit), + Number(ast::Number), + Variable(ast::Variable), + Bool(bool), +} + +impl ast::Literal { + pub fn token(&self) -> SyntaxToken { + self.syntax() + .children_with_tokens() + .find(|e| !e.kind().is_trivia()) + .and_then(|e| e.into_token()) + .unwrap() + } + + pub fn kind(&self) -> LiteralKind { + let token = self.token(); + + if let Some(number) = ast::Number::cast(token.clone()) { + return LiteralKind::Number(number); + } + + if let Some(variable) = ast::Variable::cast(token.clone()) { + return LiteralKind::Variable(variable); + } + + if let Some(string) = ast::StringLit::cast(token.clone()) { + return LiteralKind::String(string); + } + + match token.kind() { + T![true] => LiteralKind::Bool(true), + T![false] => LiteralKind::Bool(false), + _ => unreachable!(), + } + } +} diff --git a/src/syntax/ast/generated.rs b/src/syntax/ast/generated.rs new file mode 100644 index 0000000..61eb636 --- /dev/null +++ b/src/syntax/ast/generated.rs @@ -0,0 +1,4 @@ +#[rustfmt::skip] +pub mod nodes; +#[rustfmt::skip] +pub mod tokens; diff --git a/src/syntax/ast/generated/nodes.rs b/src/syntax/ast/generated/nodes.rs new file mode 100644 index 0000000..1a9e7ed --- /dev/null +++ b/src/syntax/ast/generated/nodes.rs @@ -0,0 +1,442 @@ +//! Generated by `sourcegen_ast`, do not edit by hand. 
+ +#![allow(clippy::enum_variant_names)] +use crate::{ + syntax::ast::{self, support, AstChildren, AstNode}, + SyntaxKind::{self, *}, + SyntaxNode, SyntaxToken, T, +}; + +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub struct SourceFile { + pub(crate) syntax: SyntaxNode, +} +impl ast::HasComments for SourceFile {} +impl SourceFile { + pub fn rules(&self) -> AstChildren { + support::children(&self.syntax) + } +} + +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub struct Rule { + pub(crate) syntax: SyntaxNode, +} +impl ast::HasComments for Rule {} +impl Rule { + pub fn rule_token(&self) -> Option { + support::token(&self.syntax, T![rule]) + } + pub fn identifier_token(&self) -> Option { + support::token(&self.syntax, T![identifier]) + } + pub fn body(&self) -> Option { + support::child(&self.syntax) + } +} + +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub struct BlockExpr { + pub(crate) syntax: SyntaxNode, +} +impl ast::HasComments for BlockExpr {} +impl BlockExpr { + pub fn l_brace_token(&self) -> Option { + support::token(&self.syntax, T!['{']) + } + pub fn strings(&self) -> Option { + support::child(&self.syntax) + } + pub fn condition(&self) -> Option { + support::child(&self.syntax) + } + pub fn r_brace_token(&self) -> Option { + support::token(&self.syntax, T!['}']) + } +} + +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub struct Strings { + pub(crate) syntax: SyntaxNode, +} +impl ast::HasComments for Strings {} +impl Strings { + pub fn strings_token(&self) -> Option { + support::token(&self.syntax, T![strings]) + } + pub fn colon_token(&self) -> Option { + support::token(&self.syntax, T![:]) + } + pub fn variable_stmts(&self) -> AstChildren { + support::children(&self.syntax) + } +} + +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub struct Condition { + pub(crate) syntax: SyntaxNode, +} +impl ast::HasComments for Condition {} +impl Condition { + pub fn condition_token(&self) -> Option { + support::token(&self.syntax, T![condition]) + } + pub fn 
colon_token(&self) -> Option { + support::token(&self.syntax, T![:]) + } + pub fn expression_stmt(&self) -> Option { + support::child(&self.syntax) + } +} + +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub struct VariableStmt { + pub(crate) syntax: SyntaxNode, +} +impl VariableStmt { + pub fn variable_token(&self) -> Option { + support::token(&self.syntax, T![variable]) + } + pub fn assign_token(&self) -> Option { + support::token(&self.syntax, T![=]) + } + pub fn pattern(&self) -> Option { + support::child(&self.syntax) + } +} + +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub struct Pattern { + pub(crate) syntax: SyntaxNode, +} +impl Pattern { + pub fn string_lit_token(&self) -> Option { + support::token(&self.syntax, T![string_lit]) + } +} + +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub struct ExpressionStmt { + pub(crate) syntax: SyntaxNode, +} +impl ExpressionStmt { + pub fn expr(&self) -> Option { + support::child(&self.syntax) + } +} + +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub struct Expression { + pub(crate) syntax: SyntaxNode, +} +impl Expression {} + +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub struct PrefixExpr { + pub(crate) syntax: SyntaxNode, +} +impl PrefixExpr { + pub fn not_token(&self) -> Option { + support::token(&self.syntax, T![not]) + } + pub fn expr(&self) -> Option { + support::child(&self.syntax) + } +} + +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub struct Literal { + pub(crate) syntax: SyntaxNode, +} +impl Literal {} + +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub enum Expr { + Expression(Expression), + PrefixExpr(PrefixExpr), + Literal(Literal), +} + +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub struct AnyHasComments { + pub(crate) syntax: SyntaxNode, +} +impl ast::HasComments for AnyHasComments {} +impl AstNode for SourceFile { + fn can_cast(kind: SyntaxKind) -> bool { + kind == SOURCE_FILE + } + fn cast(syntax: SyntaxNode) -> Option { + if Self::can_cast(syntax.kind()) { + Some(Self { syntax }) + 
} else { + None + } + } + fn syntax(&self) -> &SyntaxNode { + &self.syntax + } +} +impl AstNode for Rule { + fn can_cast(kind: SyntaxKind) -> bool { + kind == RULE + } + fn cast(syntax: SyntaxNode) -> Option { + if Self::can_cast(syntax.kind()) { + Some(Self { syntax }) + } else { + None + } + } + fn syntax(&self) -> &SyntaxNode { + &self.syntax + } +} +impl AstNode for BlockExpr { + fn can_cast(kind: SyntaxKind) -> bool { + kind == BLOCK_EXPR + } + fn cast(syntax: SyntaxNode) -> Option { + if Self::can_cast(syntax.kind()) { + Some(Self { syntax }) + } else { + None + } + } + fn syntax(&self) -> &SyntaxNode { + &self.syntax + } +} +impl AstNode for Strings { + fn can_cast(kind: SyntaxKind) -> bool { + kind == STRINGS + } + fn cast(syntax: SyntaxNode) -> Option { + if Self::can_cast(syntax.kind()) { + Some(Self { syntax }) + } else { + None + } + } + fn syntax(&self) -> &SyntaxNode { + &self.syntax + } +} +impl AstNode for Condition { + fn can_cast(kind: SyntaxKind) -> bool { + kind == CONDITION + } + fn cast(syntax: SyntaxNode) -> Option { + if Self::can_cast(syntax.kind()) { + Some(Self { syntax }) + } else { + None + } + } + fn syntax(&self) -> &SyntaxNode { + &self.syntax + } +} +impl AstNode for VariableStmt { + fn can_cast(kind: SyntaxKind) -> bool { + kind == VARIABLE_STMT + } + fn cast(syntax: SyntaxNode) -> Option { + if Self::can_cast(syntax.kind()) { + Some(Self { syntax }) + } else { + None + } + } + fn syntax(&self) -> &SyntaxNode { + &self.syntax + } +} +impl AstNode for Pattern { + fn can_cast(kind: SyntaxKind) -> bool { + kind == PATTERN + } + fn cast(syntax: SyntaxNode) -> Option { + if Self::can_cast(syntax.kind()) { + Some(Self { syntax }) + } else { + None + } + } + fn syntax(&self) -> &SyntaxNode { + &self.syntax + } +} +impl AstNode for ExpressionStmt { + fn can_cast(kind: SyntaxKind) -> bool { + kind == EXPRESSION_STMT + } + fn cast(syntax: SyntaxNode) -> Option { + if Self::can_cast(syntax.kind()) { + Some(Self { syntax }) + } else { + None + 
} + } + fn syntax(&self) -> &SyntaxNode { + &self.syntax + } +} +impl AstNode for Expression { + fn can_cast(kind: SyntaxKind) -> bool { + kind == EXPRESSION + } + fn cast(syntax: SyntaxNode) -> Option { + if Self::can_cast(syntax.kind()) { + Some(Self { syntax }) + } else { + None + } + } + fn syntax(&self) -> &SyntaxNode { + &self.syntax + } +} +impl AstNode for PrefixExpr { + fn can_cast(kind: SyntaxKind) -> bool { + kind == PREFIX_EXPR + } + fn cast(syntax: SyntaxNode) -> Option { + if Self::can_cast(syntax.kind()) { + Some(Self { syntax }) + } else { + None + } + } + fn syntax(&self) -> &SyntaxNode { + &self.syntax + } +} +impl AstNode for Literal { + fn can_cast(kind: SyntaxKind) -> bool { + kind == LITERAL + } + fn cast(syntax: SyntaxNode) -> Option { + if Self::can_cast(syntax.kind()) { + Some(Self { syntax }) + } else { + None + } + } + fn syntax(&self) -> &SyntaxNode { + &self.syntax + } +} +impl From for Expr { + fn from(node: Expression) -> Expr { + Expr::Expression(node) + } +} +impl From for Expr { + fn from(node: PrefixExpr) -> Expr { + Expr::PrefixExpr(node) + } +} +impl From for Expr { + fn from(node: Literal) -> Expr { + Expr::Literal(node) + } +} +impl AstNode for Expr { + fn can_cast(kind: SyntaxKind) -> bool { + matches!(kind, EXPRESSION | PREFIX_EXPR | LITERAL) + } + fn cast(syntax: SyntaxNode) -> Option { + let res = match syntax.kind() { + EXPRESSION => Expr::Expression(Expression { syntax }), + PREFIX_EXPR => Expr::PrefixExpr(PrefixExpr { syntax }), + LITERAL => Expr::Literal(Literal { syntax }), + _ => return None, + }; + Some(res) + } + fn syntax(&self) -> &SyntaxNode { + match self { + Expr::Expression(it) => &it.syntax, + Expr::PrefixExpr(it) => &it.syntax, + Expr::Literal(it) => &it.syntax, + } + } +} +impl AnyHasComments { + #[inline] + pub fn new(node: T) -> AnyHasComments { + AnyHasComments { syntax: node.syntax().clone() } + } +} +impl AstNode for AnyHasComments { + fn can_cast(kind: SyntaxKind) -> bool { + matches!(kind, 
SOURCE_FILE | RULE | BLOCK_EXPR | STRINGS | CONDITION) + } + fn cast(syntax: SyntaxNode) -> Option { + Self::can_cast(syntax.kind()).then_some(AnyHasComments { syntax }) + } + fn syntax(&self) -> &SyntaxNode { + &self.syntax + } +} +impl std::fmt::Display for Expr { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + std::fmt::Display::fmt(self.syntax(), f) + } +} +impl std::fmt::Display for SourceFile { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + std::fmt::Display::fmt(self.syntax(), f) + } +} +impl std::fmt::Display for Rule { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + std::fmt::Display::fmt(self.syntax(), f) + } +} +impl std::fmt::Display for BlockExpr { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + std::fmt::Display::fmt(self.syntax(), f) + } +} +impl std::fmt::Display for Strings { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + std::fmt::Display::fmt(self.syntax(), f) + } +} +impl std::fmt::Display for Condition { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + std::fmt::Display::fmt(self.syntax(), f) + } +} +impl std::fmt::Display for VariableStmt { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + std::fmt::Display::fmt(self.syntax(), f) + } +} +impl std::fmt::Display for Pattern { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + std::fmt::Display::fmt(self.syntax(), f) + } +} +impl std::fmt::Display for ExpressionStmt { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + std::fmt::Display::fmt(self.syntax(), f) + } +} +impl std::fmt::Display for Expression { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + std::fmt::Display::fmt(self.syntax(), f) + } +} +impl std::fmt::Display for PrefixExpr { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + std::fmt::Display::fmt(self.syntax(), f) + } +} 
+impl std::fmt::Display for Literal { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + std::fmt::Display::fmt(self.syntax(), f) + } +} diff --git a/src/syntax/ast/generated/tokens.rs b/src/syntax/ast/generated/tokens.rs new file mode 100644 index 0000000..4fb388b --- /dev/null +++ b/src/syntax/ast/generated/tokens.rs @@ -0,0 +1,132 @@ +//! Generated by `sourcegen_ast`, do not edit by hand. + +use crate::{ + syntax::ast::AstToken, + SyntaxKind::{self, *}, + SyntaxToken, +}; + +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub struct Whitespace { + pub(crate) syntax: SyntaxToken, +} +impl std::fmt::Display for Whitespace { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + std::fmt::Display::fmt(&self.syntax, f) + } +} +impl AstToken for Whitespace { + fn can_cast(kind: SyntaxKind) -> bool { + kind == WHITESPACE + } + fn cast(syntax: SyntaxToken) -> Option { + if Self::can_cast(syntax.kind()) { + Some(Self { syntax }) + } else { + None + } + } + fn syntax(&self) -> &SyntaxToken { + &self.syntax + } +} + +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub struct Comment { + pub(crate) syntax: SyntaxToken, +} +impl std::fmt::Display for Comment { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + std::fmt::Display::fmt(&self.syntax, f) + } +} +impl AstToken for Comment { + fn can_cast(kind: SyntaxKind) -> bool { + kind == COMMENT + } + fn cast(syntax: SyntaxToken) -> Option { + if Self::can_cast(syntax.kind()) { + Some(Self { syntax }) + } else { + None + } + } + fn syntax(&self) -> &SyntaxToken { + &self.syntax + } +} + +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub struct StringLit { + pub(crate) syntax: SyntaxToken, +} +impl std::fmt::Display for StringLit { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + std::fmt::Display::fmt(&self.syntax, f) + } +} +impl AstToken for StringLit { + fn can_cast(kind: SyntaxKind) -> bool { + kind == STRING_LIT + } + fn cast(syntax: 
SyntaxToken) -> Option { + if Self::can_cast(syntax.kind()) { + Some(Self { syntax }) + } else { + None + } + } + fn syntax(&self) -> &SyntaxToken { + &self.syntax + } +} + +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub struct Number { + pub(crate) syntax: SyntaxToken, +} +impl std::fmt::Display for Number { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + std::fmt::Display::fmt(&self.syntax, f) + } +} +impl AstToken for Number { + fn can_cast(kind: SyntaxKind) -> bool { + kind == NUMBER + } + fn cast(syntax: SyntaxToken) -> Option { + if Self::can_cast(syntax.kind()) { + Some(Self { syntax }) + } else { + None + } + } + fn syntax(&self) -> &SyntaxToken { + &self.syntax + } +} + +#[derive(Debug, Clone, PartialEq, Eq, Hash)] +pub struct Variable { + pub(crate) syntax: SyntaxToken, +} +impl std::fmt::Display for Variable { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + std::fmt::Display::fmt(&self.syntax, f) + } +} +impl AstToken for Variable { + fn can_cast(kind: SyntaxKind) -> bool { + kind == VARIABLE + } + fn cast(syntax: SyntaxToken) -> Option { + if Self::can_cast(syntax.kind()) { + Some(Self { syntax }) + } else { + None + } + } + fn syntax(&self) -> &SyntaxToken { + &self.syntax + } +} diff --git a/src/syntax/ast/mod.rs b/src/syntax/ast/mod.rs deleted file mode 100644 index 2dd05f8..0000000 --- a/src/syntax/ast/mod.rs +++ /dev/null @@ -1,72 +0,0 @@ -#[derive(Debug, PartialEq, Eq, Clone, Copy)] -pub struct CommentKind { - pub shape: CommentShape, -} - -#[derive(Debug, PartialEq, Eq, Clone, Copy)] -pub enum CommentShape { - Line, - Block, -} - -impl CommentShape { - pub fn is_line(self) -> bool { - self == CommentShape::Line - } - - pub fn is_block(self) -> bool { - self == CommentShape::Block - } -} - -impl CommentKind { - const BY_PREFIX: [(&'static str, CommentKind); 5] = [ - ( - "/**/", - CommentKind { - shape: CommentShape::Block, - }, - ), - ( - "/***", - CommentKind { - shape: CommentShape::Block, - 
}, - ), - ( - "////", - CommentKind { - shape: CommentShape::Line, - }, - ), - ( - "//", - CommentKind { - shape: CommentShape::Line, - }, - ), - ( - "/*", - CommentKind { - shape: CommentShape::Block, - }, - ), - ]; - - pub(crate) fn from_text(text: &str) -> CommentKind { - let &(_prefix, kind) = CommentKind::BY_PREFIX - .iter() - .find(|&(prefix, _kind)| text.starts_with(prefix)) - .unwrap(); - kind - } - - pub fn prefix(&self) -> &'static str { - let &(prefix, _) = CommentKind::BY_PREFIX - .iter() - .rev() - .find(|(_, kind)| kind == self) - .unwrap(); - prefix - } -} diff --git a/src/syntax/ast/operators.rs b/src/syntax/ast/operators.rs new file mode 100644 index 0000000..48671e7 --- /dev/null +++ b/src/syntax/ast/operators.rs @@ -0,0 +1,39 @@ +//! Implementations of operators for the AST. +//! This uses own type and it can be reused for both AST and HIR +//! This will probably be change during parser integration into `YARA-X` + +use std::fmt; + +#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] +pub enum UnaryOp { + Not, +} + +#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] +pub enum BinaryOp { + LogicOp(LogicOp), +} + +#[derive(Copy, Clone, Debug, PartialEq, Eq, Hash)] +pub enum LogicOp { + And, + Or, +} + +impl fmt::Display for LogicOp { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + let res = match self { + LogicOp::And => "and", + LogicOp::Or => "or", + }; + f.write_str(res) + } +} + +impl fmt::Display for BinaryOp { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + match self { + BinaryOp::LogicOp(op) => fmt::Display::fmt(op, f), + } + } +} diff --git a/src/syntax/ast/traits.rs b/src/syntax/ast/traits.rs new file mode 100644 index 0000000..b7c0845 --- /dev/null +++ b/src/syntax/ast/traits.rs @@ -0,0 +1,32 @@ +//! Implementation of these traits is in `generated/` folder +//! So far the only implemented trait is `HasComments` which is used +//! to iterate over comments in the syntax tree +//! 
This can be easily extended to support other traits + +use crate::syntax::ast::{self, AstNode}; +use crate::syntax::syntax_node::SyntaxElementChildren; + +use super::AstToken; + +pub trait HasComments: AstNode { + fn comments(&self) -> CommentIter { + CommentIter { iter: self.syntax().children_with_tokens() } + } +} + +impl CommentIter { + pub fn from_syntax_node(syntax_node: &ast::SyntaxNode) -> CommentIter { + CommentIter { iter: syntax_node.children_with_tokens() } + } +} + +pub struct CommentIter { + iter: SyntaxElementChildren, +} + +impl Iterator for CommentIter { + type Item = ast::Comment; + fn next(&mut self) -> Option { + self.iter.by_ref().find_map(|el| el.into_token().and_then(ast::Comment::cast)) + } +} diff --git a/src/syntax/mod.rs b/src/syntax/mod.rs index 0ebb190..6010822 100644 --- a/src/syntax/mod.rs +++ b/src/syntax/mod.rs @@ -1,5 +1,138 @@ +//! Syntax tree representation +//! +//! Properties: +//! - errors handling +//! - full-fidelity representation +//! - easy to navigate +//! - in future easy to extend with incremental re-parsing +//! +//! It is inspired by the Swift's libSyntax and the Rust's rowan. +//! [Swift]: +//! [Rust-analyzer]: +//! +//! It uses modified rowan crate for storing all the information in fast and convinient way. +//! [Rowan]: +//! +//! More detailed information can be also found in `rust-analyzer` syntax documentation +//! [Rust-analyzer]: + pub mod ast; pub mod syntax_error; pub mod syntax_node; pub mod text_token_source; pub mod text_tree_sink; +#[cfg(test)] +mod tests; + +pub use rowan_test::GreenNode; +use std::{marker::PhantomData, sync::Arc}; + +use crate::{ + lexer::tokenize, + parser::{self, SyntaxKind}, + syntax::{ast::AstNode, syntax_node::SyntaxNode}, + SyntaxError, TextTokenSource, TextTreeSink, +}; + +macro_rules! 
format_to { + ($buf:expr) => (); + ($buf:expr, $lit:literal $($arg:tt)*) => { + { use ::std::fmt::Write as _; let _ = ::std::write!($buf, $lit $($arg)*); } + }; +} + +/// A result of a successful parsing of a source file. +/// It provides AST and list of errors. +/// We always produce a syntax tree, even for invalid files. +pub struct Parse { + green: GreenNode, + errors: Arc>, + _ty: PhantomData T>, +} + +impl Clone for Parse { + fn clone(&self) -> Parse { + Parse { green: self.green.clone(), errors: self.errors.clone(), _ty: PhantomData } + } +} + +impl Parse { + fn new(green: GreenNode, errors: Vec) -> Parse { + Parse { green, errors: Arc::new(errors), _ty: PhantomData } + } + + pub fn syntax_node(&self) -> SyntaxNode { + SyntaxNode::new_root(self.green.clone()) + } +} + +impl Parse { + #[allow(clippy::wrong_self_convention)] + pub fn to_syntax(self) -> Parse { + Parse { green: self.green, errors: self.errors, _ty: PhantomData } + } + + pub fn tree(&self) -> T { + T::cast(self.syntax_node()).unwrap() + } + + pub fn errors(&self) -> &[SyntaxError] { + &self.errors + } + + pub fn ok(self) -> Result>> { + if self.errors.is_empty() { + Ok(self.tree()) + } else { + Err(self.errors) + } + } +} + +impl Parse { + pub fn cast(self) -> Option> { + if N::cast(self.syntax_node()).is_some() { + Some(Parse { green: self.green, errors: self.errors, _ty: PhantomData }) + } else { + None + } + } +} + +impl Parse { + pub fn debug_dump(&self) -> String { + let mut buf = format!("{:#?}", self.tree().syntax()); + for err in self.errors.iter() { + format_to!(buf, "error {:?}: {}\n", err.range(), err); + } + buf + } +} + +/// Source file represents single YARA file that can contain multiple rules +/// So far only subset of YARA is supported +/// YARA file is at this point represented as a string on input +pub use crate::syntax::ast::SourceFile; + +impl SourceFile { + pub fn parse(text: &str) -> Parse { + let (green, errors) = parse_text(text); + let root = 
SyntaxNode::new_root(green.clone()); + + assert_eq!(root.kind(), SyntaxKind::SOURCE_FILE); + Parse { green, errors: Arc::new(errors), _ty: PhantomData } + } +} + +/// Parses the given string representation of file into a syntax tree. +fn parse_text(text: &str) -> (GreenNode, Vec) { + let (tokens, lexer_errors) = tokenize(text); + let mut token_source = TextTokenSource::new(text, &tokens); + let mut tree_sink = TextTreeSink::new(text, &tokens); + + parser::parse(&mut token_source, &mut tree_sink); + let (tree, mut parser_errors) = tree_sink.finish(); + parser_errors.extend(lexer_errors); + + (tree, parser_errors) +} diff --git a/src/syntax/syntax_error.rs b/src/syntax/syntax_error.rs index 312d7b9..cb6fab0 100644 --- a/src/syntax/syntax_error.rs +++ b/src/syntax/syntax_error.rs @@ -2,6 +2,9 @@ use std::fmt; use text_size::{TextRange, TextSize}; +/// Represents an error that can happen during parsing or lexing +/// This can be also used in further AST validations to throw another error +/// Each error has a message and a range #[derive(Debug, Clone, PartialEq, Eq, Hash)] pub struct SyntaxError(String, TextRange); diff --git a/src/syntax/syntax_node.rs b/src/syntax/syntax_node.rs index 8d762d0..8e33b66 100644 --- a/src/syntax/syntax_node.rs +++ b/src/syntax/syntax_node.rs @@ -1,15 +1,14 @@ -//! This module defines Concrete Syntax Tree (CST), used by rust-analyzer. +//! This module represents CST for YARA language //! -//! The CST includes comments and whitespace, provides a single node type, -//! `SyntaxNode`, and a basic traversal API (parent, children, siblings). -//! -//! The *real* implementation is in the (language-agnostic) `rowan` crate, this -//! module just wraps its API. +//! THe CST includes trivia such as comments or whitespaces +//! `SyntaxNode` provides basic API that allows to travers the tree +//! to find parent, children or siblings +//! 
This is just a wrapper around `rowan` crate API use rowan_test::{GreenNodeBuilder, Language}; use text_size::TextSize; -use crate::parser::{self, syntaxkind::SyntaxKind}; +use crate::parser::{self, syntax_kind::SyntaxKind}; use crate::SyntaxError; pub(crate) use rowan_test::GreenNode; @@ -62,7 +61,6 @@ impl SyntaxTreeBuilder { } pub fn error(&mut self, error: parser::ParseError, text_pos: TextSize) { - self.errors - .push(SyntaxError::new_at_offset(error.0, text_pos)) + self.errors.push(SyntaxError::new_at_offset(error.0, text_pos)) } } diff --git a/src/syntax/tests.rs b/src/syntax/tests.rs new file mode 100644 index 0000000..e4c67ef --- /dev/null +++ b/src/syntax/tests.rs @@ -0,0 +1,3 @@ +mod sourcegen_ast; +mod ast_src; +pub mod tools; diff --git a/src/syntax/tests/ast_src.rs b/src/syntax/tests/ast_src.rs new file mode 100644 index 0000000..55f39ef --- /dev/null +++ b/src/syntax/tests/ast_src.rs @@ -0,0 +1,72 @@ +//! Defines input for generation of AST and `SyntaxKind` + +pub(crate) struct KindsSrc<'a> { + pub(crate) punct: &'a [(&'a str, &'a str)], + pub(crate) keywords: &'a [&'a str], + pub(crate) literals: &'a [&'a str], + pub(crate) tokens: &'a [&'a str], + pub(crate) nodes: &'a [&'a str], +} + +pub(crate) const KINDS_SRC: KindsSrc = KindsSrc { + punct: &[ + (":", "COLON"), + ("(", "L_PAREN"), + (")", "R_PAREN"), + ("{", "L_BRACE"), + ("}", "R_BRACE"), + (",", "COMMA"), + ("=", "ASSIGN"), + ], + keywords: &["and", "or", "not", "true", "false", "rule", "strings", "condition"], + literals: &["STRING_LIT", "NUMBER"], + tokens: &["IDENTIFIER", "VARIABLE", "WHITESPACE", "COMMENT", "ERROR"], + nodes: &[ + "RULE", + "STRINGS", + "CONDITION", + "SOURCE_FILE", + "BLOCK_EXPR", + "PREFIX_EXPR", + "LITERAL", + "EXPRESSION", + "EXPRESSION_STMT", + "VARIABLE_STMT", + "PATTERN", + ], +}; + +#[derive(Default, Debug)] +pub(crate) struct AstSrc { + pub(crate) tokens: Vec, + pub(crate) nodes: Vec, + pub(crate) enums: Vec, +} + +#[derive(Debug)] +pub(crate) struct AstNodeSrc { + 
pub(crate) doc: Vec, + pub(crate) name: String, + pub(crate) traits: Vec, + pub(crate) fields: Vec, +} + +#[derive(Debug, Eq, PartialEq)] +pub(crate) enum Field { + Token(String), + Node { name: String, ty: String, cardinality: Cardinality }, +} + +#[derive(Debug, Eq, PartialEq)] +pub(crate) enum Cardinality { + Optional, + Many, +} + +#[derive(Debug)] +pub(crate) struct AstEnumSrc { + pub(crate) doc: Vec, + pub(crate) name: String, + pub(crate) traits: Vec, + pub(crate) variants: Vec, +} diff --git a/src/syntax/tests/sourcegen_ast.rs b/src/syntax/tests/sourcegen_ast.rs new file mode 100644 index 0000000..3a131ab --- /dev/null +++ b/src/syntax/tests/sourcegen_ast.rs @@ -0,0 +1,615 @@ +//! This module is basically one test that generates `SyntaxKind` +//! and wrappers around `SyntaxNode` to provide AST layer +//! It uses `ungrammar` crate to parse `yara.ungram` file and generate AST +//! It is not a grammar, it does not validate anything. Just generates methods +//! and types for AST layer + +use std::{collections::HashSet, fmt::Write}; + +use itertools::Itertools; +use proc_macro2::{Punct, Spacing}; +use quote::{format_ident, quote}; +use ungrammar::{Grammar, Rule}; + +use crate::syntax::tests::ast_src::{ + AstEnumSrc, AstNodeSrc, AstSrc, Cardinality, Field, KindsSrc, KINDS_SRC, +}; + +use crate::syntax::tests::tools::{ + add_preamble, ensure_file_contents, project_root, reformat, to_pascal_case, to_upper_snake_case, +}; + +use super::tools::to_lower_snake_case; + +#[test] +fn sourcegen_ast() { + let syntax_kinds = generate_syntax_kinds(KINDS_SRC); + let syntax_kinds_file = project_root().join("src/parser/syntax_kind/generated.rs"); + ensure_file_contents(syntax_kinds_file.as_path(), &syntax_kinds); + + let grammar = std::fs::read_to_string(project_root().join("yara.ungram")) + .expect("Failed to read grammar file"); + + let grammar = grammar.parse::().unwrap(); + let ast = lower(&grammar); + + let ast_tokens = generate_tokens(&ast); + let ast_tokens_file = 
project_root().join("src/syntax/ast/generated/tokens.rs"); + ensure_file_contents(ast_tokens_file.as_path(), &ast_tokens); + + let ast_nodes = generate_nodes(KINDS_SRC, &ast); + let ast_nodes_file = project_root().join("src/syntax/ast/generated/nodes.rs"); + ensure_file_contents(ast_nodes_file.as_path(), &ast_nodes); +} + +fn generate_tokens(grammar: &AstSrc) -> String { + let tokens = grammar.tokens.iter().map(|token| { + let name = format_ident!("{}", token); + let kind = format_ident!("{}", to_upper_snake_case(token)); + quote! { + #[derive(Debug, Clone, PartialEq, Eq, Hash)] + pub struct #name { + pub(crate) syntax: SyntaxToken, + } + impl std::fmt::Display for #name { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + std::fmt::Display::fmt(&self.syntax, f) + } + } + impl AstToken for #name { + fn can_cast(kind: SyntaxKind) -> bool { kind == #kind } + fn cast(syntax: SyntaxToken) -> Option { + if Self::can_cast(syntax.kind()) { Some(Self { syntax }) } else { None } + } + fn syntax(&self) -> &SyntaxToken { &self.syntax } + } + } + }); + + add_preamble( + "sourcegen_ast", + reformat( + quote! { + use crate::{SyntaxKind::{self, *}, SyntaxToken, syntax::ast::AstToken}; + #(#tokens)* + } + .to_string(), + ), + ) + .replace("#[derive", "\n#[derive") +} + +fn generate_nodes(kinds: KindsSrc<'_>, grammar: &AstSrc) -> String { + let (node_defs, node_boilerplate_impls): (Vec<_>, Vec<_>) = grammar + .nodes + .iter() + .map(|node| { + let name = format_ident!("{}", node.name); + let kind = format_ident!("{}", to_upper_snake_case(&node.name)); + let traits = node.traits.iter().map(|trait_name| { + let trait_name = format_ident!("{}", trait_name); + quote!(impl ast::#trait_name for #name {}) + }); + + let methods = node.fields.iter().map(|field| { + let method_name = field.method_name(); + let ty = field.ty(); + + if field.is_many() { + quote! 
{ + pub fn #method_name(&self) -> AstChildren<#ty> { + support::children(&self.syntax) + } + } + } else if let Some(token_kind) = field.token_kind() { + quote! { + pub fn #method_name(&self) -> Option<#ty> { + support::token(&self.syntax, #token_kind) + } + } + } else { + quote! { + pub fn #method_name(&self) -> Option<#ty> { + support::child(&self.syntax) + } + } + } + }); + ( + quote! { + #[pretty_doc_comment_placeholder_workaround] + #[derive(Debug, Clone, PartialEq, Eq, Hash)] + pub struct #name { + pub(crate) syntax: SyntaxNode, + } + + #(#traits)* + + impl #name { + #(#methods)* + } + }, + quote! { + impl AstNode for #name { + fn can_cast(kind: SyntaxKind) -> bool { + kind == #kind + } + fn cast(syntax: SyntaxNode) -> Option { + if Self::can_cast(syntax.kind()) { Some(Self { syntax }) } else { None } + } + fn syntax(&self) -> &SyntaxNode { &self.syntax } + } + }, + ) + }) + .unzip(); + + let (enum_defs, enum_boilerplate_impls): (Vec<_>, Vec<_>) = grammar + .enums + .iter() + .map(|en| { + let variants: Vec<_> = en.variants.iter().map(|var| format_ident!("{}", var)).collect(); + let name = format_ident!("{}", en.name); + let kinds: Vec<_> = variants + .iter() + .map(|name| format_ident!("{}", to_upper_snake_case(&name.to_string()))) + .collect(); + let traits = en.traits.iter().map(|trait_name| { + let trait_name = format_ident!("{}", trait_name); + quote!(impl ast::#trait_name for #name {}) + }); + + let ast_node = quote! { + impl AstNode for #name { + fn can_cast(kind: SyntaxKind) -> bool { + matches!(kind, #(#kinds)|*) + } + fn cast(syntax: SyntaxNode) -> Option { + let res = match syntax.kind() { + #( + #kinds => #name::#variants(#variants { syntax }), + )* + _ => return None, + }; + Some(res) + } + fn syntax(&self) -> &SyntaxNode { + match self { + #( + #name::#variants(it) => &it.syntax, + )* + } + } + } + }; + + ( + quote! 
{ + #[pretty_doc_comment_placeholder_workaround] + #[derive(Debug, Clone, PartialEq, Eq, Hash)] + pub enum #name { + #(#variants(#variants),)* + } + + #(#traits)* + }, + quote! { + #( + impl From<#variants> for #name { + fn from(node: #variants) -> #name { + #name::#variants(node) + } + } + )* + #ast_node + }, + ) + }) + .unzip(); + + let (any_node_defs, any_node_boilerplate_impls): (Vec<_>, Vec<_>) = grammar + .nodes + .iter() + .flat_map(|node| node.traits.iter().map(move |t| (t, node))) + .into_group_map() + .into_iter() + .sorted_by_key(|(k, _)| k.to_owned()) + .map(|(trait_name, nodes)| { + let name = format_ident!("Any{}", trait_name); + let trait_name = format_ident!("{}", trait_name); + let kinds: Vec<_> = nodes + .iter() + .map(|name| format_ident!("{}", to_upper_snake_case(&name.name.to_string()))) + .collect(); + ( + quote! { + #[pretty_doc_comment_placeholder_workaround] + #[derive(Debug, Clone, PartialEq, Eq, Hash)] + pub struct #name { + pub(crate) syntax: SyntaxNode, + } + impl ast::#trait_name for #name {} + }, + quote! { + impl #name { + #[inline] + pub fn new(node: T) -> #name { + #name { + syntax: node.syntax().clone() + } + } + } + impl AstNode for #name { + fn can_cast(kind: SyntaxKind) -> bool { + matches!(kind, #(#kinds)|*) + } + fn cast(syntax: SyntaxNode) -> Option { + Self::can_cast(syntax.kind()).then_some(#name { syntax }) + } + fn syntax(&self) -> &SyntaxNode { + &self.syntax + } + } + }, + ) + }) + .unzip(); + + let enum_names = grammar.enums.iter().map(|it| &it.name); + let node_names = grammar.nodes.iter().map(|it| &it.name); + + let display_impls = + enum_names.chain(node_names.clone()).map(|it| format_ident!("{}", it)).map(|name| { + quote! 
{ + impl std::fmt::Display for #name { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + std::fmt::Display::fmt(self.syntax(), f) + } + } + } + }); + + let defined_nodes: HashSet<_> = node_names.collect(); + + for node in kinds + .nodes + .iter() + .map(|kind| to_pascal_case(kind)) + .filter(|name| !defined_nodes.iter().any(|&it| it == name)) + { + drop(node); + } + + let ast = quote! { + #![allow(clippy::enum_variant_names)] + use crate::{ + SyntaxNode, SyntaxToken, SyntaxKind::{self, *}, + syntax::ast::{self, AstNode, AstChildren, support}, + T, + }; + + #(#node_defs)* + #(#enum_defs)* + #(#any_node_defs)* + #(#node_boilerplate_impls)* + #(#enum_boilerplate_impls)* + #(#any_node_boilerplate_impls)* + #(#display_impls)* + }; + + let ast = ast.to_string().replace("T ! [", "T!["); + let mut res = String::with_capacity(ast.len() * 2); + + let mut docs = + grammar.nodes.iter().map(|it| &it.doc).chain(grammar.enums.iter().map(|it| &it.doc)); + + for chunk in ast.split("# [pretty_doc_comment_placeholder_workaround] ") { + res.push_str(chunk); + if let Some(doc) = docs.next() { + write_doc_comment(doc, &mut res); + } + } + + let res = add_preamble("sourcegen_ast", reformat(res)); + res.replace("#[derive", "\n#[derive") +} + +fn write_doc_comment(contents: &[String], dest: &mut String) { + for line in contents { + writeln!(dest, "///{}", line).unwrap(); + } +} + +fn generate_syntax_kinds(grammar: KindsSrc<'_>) -> String { + let (single_byte_tokens_values, single_byte_tokens): (Vec<_>, Vec<_>) = grammar + .punct + .iter() + .filter(|(token, _name)| token.len() == 1) + .map(|(token, name)| (token.chars().next().unwrap(), format_ident!("{}", name))) + .unzip(); + + let punctuation_values = grammar.punct.iter().map(|(token, _name)| { + if "{}[]()".contains(token) { + let c = token.chars().next().unwrap(); + quote! { #c } + } else { + let cs = token.chars().map(|c| Punct::new(c, Spacing::Joint)); + quote! 
{ #(#cs)* } + } + }); + + let punctuation = + grammar.punct.iter().map(|(_token, name)| format_ident!("{}", name)).collect::>(); + + let keywords_values = &grammar.keywords; + let keywords_idents = keywords_values.iter().map(|kw| format_ident!("{}", kw)); + let keywords = keywords_values + .iter() + .map(|kw| format_ident!("{}_KW", to_upper_snake_case(kw))) + .collect::>(); + + let literals = + grammar.literals.iter().map(|name| format_ident!("{}", name)).collect::>(); + + let tokens = grammar.tokens.iter().map(|name| format_ident!("{}", name)).collect::>(); + + let nodes = grammar.nodes.iter().map(|name| format_ident!("{}", name)).collect::>(); + + let ast = quote! { + #![allow(bad_style, missing_docs, unreachable_pub, clippy::upper_case_acronyms)] + /// The kind of syntax node, e.g. `IDENTIFIER`, `RULE_KW`, or `AND`. + #[derive(Clone, Copy, PartialEq, Eq, PartialOrd, Ord, Hash, Debug)] + #[repr(u16)] + pub enum SyntaxKind { + // Technical SyntaxKinds: they appear temporally during parsing, + // but never end up in the final tree + #[doc(hidden)] + TOMBSTONE, + #[doc(hidden)] + EOF, + #(#punctuation,)* + #(#keywords,)* + #(#literals,)* + #(#tokens,)* + #(#nodes,)* + + // Technical kind so that we can cast from u16 safely + #[doc(hidden)] + __LAST, + } + use self::SyntaxKind::*; + + impl SyntaxKind { + pub fn is_keyword(self) -> bool { + matches!(self, #(#keywords)|*) + } + + pub fn is_punct(self) -> bool { + matches!(self, #(#punctuation)|*) + } + + pub fn is_literal(self) -> bool { + matches!(self, #(#literals)|*) + } + + pub fn from_keyword(ident: &str) -> Option { + let kw = match ident { + #(#keywords_values => #keywords,)* + _ => return None, + }; + Some(kw) + } + + pub fn from_char(c: char) -> Option { + let tok = match c { + #(#single_byte_tokens_values => #single_byte_tokens,)* + _ => return None, + }; + Some(tok) + } + } + + #[macro_export] + macro_rules! 
T { + #([#punctuation_values] => { $crate::SyntaxKind::#punctuation };)* + #([#keywords_idents] => { $crate::SyntaxKind::#keywords };)* + [identifier] => { $crate::SyntaxKind::IDENTIFIER }; + [variable] => { $crate::SyntaxKind::VARIABLE }; + [string_lit] => { $crate::SyntaxKind::STRING_LIT }; + } + pub use T; + }; + + add_preamble("sourcegen_ast", reformat(ast.to_string())) +} + +impl Field { + fn is_many(&self) -> bool { + matches!(self, Field::Node { cardinality: Cardinality::Many, .. }) + } + fn token_kind(&self) -> Option { + match self { + Field::Token(token) => { + let token: proc_macro2::TokenStream = token.parse().unwrap(); + Some(quote! { T![#token] }) + } + _ => None, + } + } + fn method_name(&self) -> proc_macro2::Ident { + match self { + Field::Token(name) => { + let name = match name.as_str() { + "'{'" => "l_brace", + "'}'" => "r_brace", + "'('" => "l_paren", + "')'" => "r_paren", + ":" => "colon", + "," => "comma", + "=" => "assign", + _ => name, + }; + format_ident!("{}_token", name) + } + Field::Node { name, .. } => { + if name == "type" { + format_ident!("ty") + } else { + format_ident!("{}", name) + } + } + } + } + fn ty(&self) -> proc_macro2::Ident { + match self { + Field::Token(_) => format_ident!("SyntaxToken"), + Field::Node { ty, .. 
} => format_ident!("{}", ty), + } + } +} + +fn lower(grammar: &Grammar) -> AstSrc { + let mut res = AstSrc { + tokens: "Whitespace Comment StringLit Number Variable" + .split_ascii_whitespace() + .map(|it| it.to_string()) + .collect::>(), + ..Default::default() + }; + + let nodes = grammar.iter().collect::>(); + + for &node in &nodes { + let name = grammar[node].name.clone(); + let rule = &grammar[node].rule; + match lower_enum(grammar, rule) { + Some(variants) => { + let enum_src = AstEnumSrc { doc: Vec::new(), name, traits: Vec::new(), variants }; + res.enums.push(enum_src); + } + None => { + let mut fields = Vec::new(); + lower_rule(&mut fields, grammar, None, rule); + res.nodes.push(AstNodeSrc { doc: Vec::new(), name, traits: Vec::new(), fields }); + } + } + } + + extract_struct_traits(&mut res); + res +} + +fn lower_enum(grammar: &Grammar, rule: &Rule) -> Option> { + let alternatives = match rule { + Rule::Alt(it) => it, + _ => return None, + }; + + let mut variants = Vec::new(); + for alternative in alternatives { + match alternative { + Rule::Node(it) => variants.push(grammar[*it].name.clone()), + Rule::Token(it) if grammar[*it].name == ";" => (), + _ => return None, + } + } + Some(variants) +} + +fn lower_rule(acc: &mut Vec, grammar: &Grammar, label: Option<&String>, rule: &Rule) { + if lower_comma_list(acc, grammar, label, rule) { + return; + } + + match rule { + Rule::Node(node) => { + let ty = grammar[*node].name.clone(); + let name = label.cloned().unwrap_or_else(|| to_lower_snake_case(&ty)); + let field = Field::Node { name, ty, cardinality: Cardinality::Optional }; + acc.push(field); + } + Rule::Token(token) => { + assert!(label.is_none()); + let mut name = grammar[*token].name.clone(); + // add support for numbers + if name != "number" && name != "string" { + if "[](){}".contains(&name) { + name = format!("'{}'", name); + } + let field = Field::Token(name); + acc.push(field); + } + } + Rule::Rep(inner) => { + if let Rule::Node(node) = &**inner { + 
let ty = grammar[*node].name.clone(); + let name = + label.cloned().unwrap_or_else(|| format!("{}s", &to_lower_snake_case(&ty))); + let field = Field::Node { name, ty, cardinality: Cardinality::Many }; + acc.push(field); + return; + } + panic!("Unsupported rule: {:?}", rule); + } + Rule::Labeled { label: l, rule } => { + assert!(label.is_none()); + let manually_implemented = matches!(l.as_str(), "lhs" | "rhs" | "op" | "value"); + if manually_implemented { + return; + } + lower_rule(acc, grammar, Some(l), rule); + } + Rule::Seq(rules) | Rule::Alt(rules) => { + for rule in rules { + lower_rule(acc, grammar, label, rule) + } + } + Rule::Opt(rule) => lower_rule(acc, grammar, label, rule), + } +} + +fn lower_comma_list( + acc: &mut Vec, + grammar: &Grammar, + label: Option<&String>, + rule: &Rule, +) -> bool { + let rule = match rule { + Rule::Seq(it) => it, + _ => return false, + }; + + let (node, repeat, trailing_comma) = match rule.as_slice() { + [Rule::Node(node), Rule::Rep(repeat), Rule::Opt(trailing_comma)] => { + (node, repeat, trailing_comma) + } + _ => return false, + }; + + let repeat = match &**repeat { + Rule::Seq(it) => it, + _ => return false, + }; + + match repeat.as_slice() { + [comma, Rule::Node(n)] if comma == &**trailing_comma && n == node => (), + _ => return false, + } + + let ty = grammar[*node].name.clone(); + let name = label.cloned().unwrap_or_else(|| format!("{}s", &to_lower_snake_case(&ty))); + let field = Field::Node { name, ty, cardinality: Cardinality::Many }; + acc.push(field); + true +} + +//TODO: possible deduplication and enum extraction and struct traits, so far not needed +fn extract_struct_traits(ast: &mut AstSrc) { + let nodes_with_comments = ["SourceFile", "Rule", "BlockExpr", "Strings", "Condition"]; + + for node in &mut ast.nodes { + if nodes_with_comments.contains(&&*node.name) { + node.traits.push("HasComments".into()); + } + } +} diff --git a/src/syntax/tests/tools.rs b/src/syntax/tests/tools.rs new file mode 100644 index 
0000000..c9e35d2 --- /dev/null +++ b/src/syntax/tests/tools.rs @@ -0,0 +1,107 @@ +// Lot of helping functions used mainly for generating code, +// modifying it and formating it + +use std::{ + fs, + path::{Path, PathBuf}, +}; +use xshell::{cmd, pushenv}; + +pub fn add_preamble(generator: &'static str, mut text: String) -> String { + let preamble = format!("//! Generated by `{}`, do not edit by hand.\n\n", generator); + text.insert_str(0, &preamble); + text +} + +fn ensure_rustfmt() { + let version = cmd!("rustfmt --version").read().unwrap_or_default(); + if !version.contains("stable") { + panic!( + "Failed to run rustfmt from toolchain 'stable'. \ + Please run `rustup component add rustfmt --toolchain stable` to install it.", + ) + } +} + +pub fn reformat(text: String) -> String { + let _e = pushenv("RUSTUP_TOOLCHAIN", "stable"); + ensure_rustfmt(); + let rustfmt_toml = project_root().join("rustfmt.toml"); + let mut stdout = cmd!("rustfmt --config-path {rustfmt_toml}").stdin(text).read().unwrap(); + if !stdout.ends_with('\n') { + stdout.push('\n'); + } + stdout +} + +pub fn ensure_file_contents(file: &Path, contents: &str) { + if let Ok(old_contents) = fs::read_to_string(file) { + if normalize_newlines(&old_contents) == normalize_newlines(contents) { + // File is already up to date. 
+ return; + } + } + + let display_path = file.strip_prefix(&project_root()).unwrap_or(file); + eprintln!( + "\n\x1b[31;1merror\x1b[0m: {} was not up-to-date, updating\n", + display_path.display() + ); + if std::env::var("CI").is_ok() { + eprintln!(" NOTE: run `cargo test` locally and commit the updated files\n"); + } + if let Some(parent) = file.parent() { + let _ = fs::create_dir_all(parent); + } + fs::write(file, contents).unwrap(); + panic!("some file was not up to date and has been updated, simply re-run the tests") +} + +fn normalize_newlines(s: &str) -> String { + s.replace("\r\n", "\n") +} + +pub fn project_root() -> PathBuf { + let dir = env!("CARGO_MANIFEST_DIR"); + PathBuf::from(dir).to_owned() +} + +pub fn to_lower_snake_case(s: &str) -> String { + to_snake_case(s, char::to_ascii_lowercase) +} +pub fn to_upper_snake_case(s: &str) -> String { + to_snake_case(s, char::to_ascii_uppercase) +} +fn to_snake_case char>(s: &str, change_case: F) -> String { + let mut buf = String::with_capacity(s.len()); + let mut prev = false; + for c in s.chars() { + // `&& prev` is required to not insert `_` before the first symbol. + if c.is_ascii_uppercase() && prev { + // This check is required to not translate `Weird_Case` into `weird__case`. 
+ if !buf.ends_with('_') { + buf.push('_') + } + } + prev = true; + + buf.push(change_case(&c)); + } + buf +} + +pub fn to_pascal_case(s: &str) -> String { + let mut buf = String::with_capacity(s.len()); + let mut prev_is_underscore = true; + for c in s.chars() { + if c == '_' { + prev_is_underscore = true; + } else if prev_is_underscore { + buf.push(c.to_ascii_uppercase()); + prev_is_underscore = false; + } else { + buf.push(c.to_ascii_lowercase()); + } + } + buf +} diff --git a/src/syntax/text_token_source.rs b/src/syntax/text_token_source.rs index 279d7f1..f52df84 100644 --- a/src/syntax/text_token_source.rs +++ b/src/syntax/text_token_source.rs @@ -4,6 +4,8 @@ use crate::{ }; use text_size::{TextRange, TextSize}; +/// A source of tokens for the parser. +/// It takes tokens from a source text and store them into token-offset pairs pub(crate) struct TextTokenSource<'t> { text: &'t str, @@ -29,6 +31,13 @@ impl<'t> TokenSource for TextTokenSource<'t> { let pos = self.curr.1 + 1; self.curr = (mk_token(pos, &self.token_offset_pairs), pos); } + + fn is_keyword(&self, kw: &str) -> bool { + self.token_offset_pairs + .get(self.curr.1) + .map(|(token, offset)| &self.text[TextRange::at(*offset, token.len)] == kw) + .unwrap_or(false) + } } fn mk_token(pos: usize, token_offset_pairs: &[(Token, TextSize)]) -> parser::Token { @@ -42,12 +51,10 @@ fn mk_token(pos: usize, token_offset_pairs: &[(Token, TextSize)]) -> parser::Tok ), None => (EOF, false), }; - parser::Token { - kind, - is_jointed_to_next, - } + parser::Token { kind, is_jointed_to_next } } +/// Generate token-offset pairs impl<'t> TextTokenSource<'t> { pub(crate) fn new(text: &'t str, raw_tokens: &'t [Token]) -> TextTokenSource<'t> { let token_offset_pairs: Vec<_> = raw_tokens @@ -55,11 +62,7 @@ impl<'t> TextTokenSource<'t> { .filter_map({ let mut len = 0.into(); move |token| { - let pair = if token.kind.is_trivia() { - None - } else { - Some((*token, len)) - }; + let pair = if token.kind.is_trivia() { None } else { 
Some((*token, len)) }; len += token.len; pair } @@ -67,10 +70,6 @@ impl<'t> TextTokenSource<'t> { .collect(); let first = mk_token(0, &token_offset_pairs); - TextTokenSource { - text, - token_offset_pairs, - curr: (first, 0), - } + TextTokenSource { text, token_offset_pairs, curr: (first, 0) } } } diff --git a/src/syntax/text_tree_sink.rs b/src/syntax/text_tree_sink.rs index 63f2992..589e4f3 100644 --- a/src/syntax/text_tree_sink.rs +++ b/src/syntax/text_tree_sink.rs @@ -4,11 +4,11 @@ use text_size::{TextRange, TextSize}; use crate::{ lexer::Token, parser::{ParseError, SyntaxKind, TreeSink}, - syntax::{ - ast, syntax_error::SyntaxError, syntax_node::GreenNode, syntax_node::SyntaxTreeBuilder, - }, + syntax::{syntax_error::SyntaxError, syntax_node::GreenNode, syntax_node::SyntaxTreeBuilder}, }; +/// Used to connect parser and specific SyntaxTree representation +/// It also handles attaching trivia (whitespaces and comments) to the nodes pub(crate) struct TextTreeSink<'a> { text: &'a str, tokens: &'a [Token], @@ -54,10 +54,8 @@ impl<'a> TreeSink for TextTreeSink<'a> { State::Normal => (), } - let n_trivias = self.tokens[self.token_pos..] 
- .iter() - .take_while(|it| it.kind.is_trivia()) - .count(); + let n_trivias = + self.tokens[self.token_pos..].iter().take_while(|it| it.kind.is_trivia()).count(); let leading_trivias = &self.tokens[self.token_pos..self.token_pos + n_trivias]; let mut trivia_end = self.text_pos + leading_trivias.iter().map(|it| it.len).sum::(); @@ -146,6 +144,7 @@ fn n_attached_trivias<'a>( trivias: impl Iterator, ) -> usize { match kind { + // Nodes that are supported to have attached trivias SyntaxKind::RULE | SyntaxKind::BLOCK_EXPR | SyntaxKind::STRINGS | SyntaxKind::CONDITION => { let mut res = 0; let trivias = trivias.enumerate().peekable(); diff --git a/tests/test1.out b/tests/test1.out index f4c65e8..5e20fa0 100644 --- a/tests/test1.out +++ b/tests/test1.out @@ -1,33 +1,31 @@ -- SOURCE_FILE - - RULE - - "rule" RULE - - " " WHITESPACE - - IDENTIFIER - - "test" IDENTIFIER - - "\n" WHITESPACE - - BLOCK_EXPR - - "{" LBRACE - - "\n\t" WHITESPACE - - STRINGS - - "strings" STRINGS - - ":" COLON - - "\n\t\t" WHITESPACE - - VARIABLE_STMT - - VARIABLE - - "$a" VARIABLE - - " " WHITESPACE - - "=" ASSIGN - - " " WHITESPACE - - STRING - - "\"foo\"" STRING - - "\n\t" WHITESPACE - - CONDITION - - "condition" CONDITION - - ":" COLON - - "\n\t\t" WHITESPACE - - EXPRESSION_STMT - - LITERAL - - "$a" VARIABLE - - "\n" WHITESPACE - - "}" RBRACE - - "\n" WHITESPACE +SOURCE_FILE@0..54 + RULE@0..53 + RULE_KW@0..4 "rule" + WHITESPACE@4..5 " " + IDENTIFIER@5..9 "test" + WHITESPACE@9..10 "\n" + BLOCK_EXPR@10..53 + L_BRACE@10..11 "{" + WHITESPACE@11..13 "\n\t" + STRINGS@13..34 + STRINGS_KW@13..20 "strings" + COLON@20..21 ":" + WHITESPACE@21..24 "\n\t\t" + VARIABLE_STMT@24..34 + VARIABLE@24..26 "$a" + WHITESPACE@26..27 " " + ASSIGN@27..28 "=" + WHITESPACE@28..29 " " + PATTERN@29..34 + STRING_LIT@29..34 "\"foo\"" + WHITESPACE@34..36 "\n\t" + CONDITION@36..51 + CONDITION_KW@36..45 "condition" + COLON@45..46 ":" + WHITESPACE@46..49 "\n\t\t" + EXPRESSION_STMT@49..51 + LITERAL@49..51 + VARIABLE@49..51 "$a" 
+ WHITESPACE@51..52 "\n" + R_BRACE@52..53 "}" + WHITESPACE@53..54 "\n" diff --git a/tests/test2.out b/tests/test2.out index 68899c1..9b388b5 100644 --- a/tests/test2.out +++ b/tests/test2.out @@ -1,48 +1,45 @@ -- SOURCE_FILE - - RULE - - "rule" RULE - - " " WHITESPACE - - IDENTIFIER - - "test" IDENTIFIER - - "\n" WHITESPACE - - BLOCK_EXPR - - "{" LBRACE - - "\n\t" WHITESPACE - - STRINGS - - "strings" STRINGS - - ":" COLON - - "\n\t\t" WHITESPACE - - VARIABLE_STMT - - VARIABLE - - "$a" VARIABLE - - " " WHITESPACE - - "=" ASSIGN - - " " WHITESPACE - - STRING - - "\"foo\"" STRING - - "\n\t\t" WHITESPACE - - VARIABLE_STMT - - VARIABLE - - "$b" VARIABLE - - " " WHITESPACE - - "=" ASSIGN - - " " WHITESPACE - - STRING - - "\"bar\"" STRING - - "\n\t" WHITESPACE - - CONDITION - - "condition" CONDITION - - ":" COLON - - "\n\t\t" WHITESPACE - - EXPRESSION_STMT - - EXPRESSION - - LITERAL - - "$a" VARIABLE - - " " WHITESPACE - - "or" OR - - "\n\t\t" WHITESPACE - - LITERAL - - "$b" VARIABLE - - "\n" WHITESPACE - - "}" RBRACE - - "\n" WHITESPACE +SOURCE_FILE@0..75 + RULE@0..74 + RULE_KW@0..4 "rule" + WHITESPACE@4..5 " " + IDENTIFIER@5..9 "test" + WHITESPACE@9..10 "\n" + BLOCK_EXPR@10..74 + L_BRACE@10..11 "{" + WHITESPACE@11..13 "\n\t" + STRINGS@13..47 + STRINGS_KW@13..20 "strings" + COLON@20..21 ":" + WHITESPACE@21..24 "\n\t\t" + VARIABLE_STMT@24..34 + VARIABLE@24..26 "$a" + WHITESPACE@26..27 " " + ASSIGN@27..28 "=" + WHITESPACE@28..29 " " + PATTERN@29..34 + STRING_LIT@29..34 "\"foo\"" + WHITESPACE@34..37 "\n\t\t" + VARIABLE_STMT@37..47 + VARIABLE@37..39 "$b" + WHITESPACE@39..40 " " + ASSIGN@40..41 "=" + WHITESPACE@41..42 " " + PATTERN@42..47 + STRING_LIT@42..47 "\"bar\"" + WHITESPACE@47..49 "\n\t" + CONDITION@49..72 + CONDITION_KW@49..58 "condition" + COLON@58..59 ":" + WHITESPACE@59..62 "\n\t\t" + EXPRESSION_STMT@62..72 + EXPRESSION@62..72 + LITERAL@62..64 + VARIABLE@62..64 "$a" + WHITESPACE@64..65 " " + OR_KW@65..67 "or" + WHITESPACE@67..70 "\n\t\t" + LITERAL@70..72 + 
VARIABLE@70..72 "$b" + WHITESPACE@72..73 "\n" + R_BRACE@73..74 "}" + WHITESPACE@74..75 "\n" diff --git a/tests/test3.out b/tests/test3.out index 1d407f9..0b65420 100644 --- a/tests/test3.out +++ b/tests/test3.out @@ -1,62 +1,59 @@ -- SOURCE_FILE - - "//Global comment" COMMENT - - "\n\n" WHITESPACE - - RULE - - "//Rule comment" COMMENT - - "\n" WHITESPACE - - "rule" RULE - - " " WHITESPACE - - IDENTIFIER - - "test" IDENTIFIER - - "\n" WHITESPACE - - BLOCK_EXPR - - "{" LBRACE - - "\n\t" WHITESPACE - - "//Rule block comment" COMMENT - - "\n\n\t" WHITESPACE - - STRINGS - - "//String comment" COMMENT - - "\n\t" WHITESPACE - - "strings" STRINGS - - ":" COLON - - "\n\t\t" WHITESPACE - - VARIABLE_STMT - - VARIABLE - - "$a" VARIABLE - - " " WHITESPACE - - "=" ASSIGN - - " " WHITESPACE - - STRING - - "\"foo\"" STRING - - "\n\t\t" WHITESPACE - - VARIABLE_STMT - - VARIABLE - - "$b" VARIABLE - - " " WHITESPACE - - "=" ASSIGN - - " " WHITESPACE - - STRING - - "\"bar\"" STRING - - "\n\t" WHITESPACE - - CONDITION - - "condition" CONDITION - - ":" COLON - - "\n\t\t" WHITESPACE - - EXPRESSION_STMT - - EXPRESSION - - LITERAL - - "$a" VARIABLE - - " " WHITESPACE - - "or" OR - - "\n\t\t" WHITESPACE - - EXPRESSION - - LITERAL - - "$b" VARIABLE - - " " WHITESPACE - - "and" AND - - " " WHITESPACE - - LITERAL - - "true" TRUE - - "\n" WHITESPACE - - "}" RBRACE - - "\n" WHITESPACE +SOURCE_FILE@0..158 + COMMENT@0..16 "//Global comment" + WHITESPACE@16..18 "\n\n" + RULE@18..157 + COMMENT@18..32 "//Rule comment" + WHITESPACE@32..33 "\n" + RULE_KW@33..37 "rule" + WHITESPACE@37..38 " " + IDENTIFIER@38..42 "test" + WHITESPACE@42..43 "\n" + BLOCK_EXPR@43..157 + L_BRACE@43..44 "{" + WHITESPACE@44..46 "\n\t" + COMMENT@46..66 "//Rule block comment" + WHITESPACE@66..69 "\n\n\t" + STRINGS@69..121 + COMMENT@69..85 "//String comment" + WHITESPACE@85..87 "\n\t" + STRINGS_KW@87..94 "strings" + COLON@94..95 ":" + WHITESPACE@95..98 "\n\t\t" + VARIABLE_STMT@98..108 + VARIABLE@98..100 "$a" + WHITESPACE@100..101 
" " + ASSIGN@101..102 "=" + WHITESPACE@102..103 " " + PATTERN@103..108 + STRING_LIT@103..108 "\"foo\"" + WHITESPACE@108..111 "\n\t\t" + VARIABLE_STMT@111..121 + VARIABLE@111..113 "$b" + WHITESPACE@113..114 " " + ASSIGN@114..115 "=" + WHITESPACE@115..116 " " + PATTERN@116..121 + STRING_LIT@116..121 "\"bar\"" + WHITESPACE@121..123 "\n\t" + CONDITION@123..155 + CONDITION_KW@123..132 "condition" + COLON@132..133 ":" + WHITESPACE@133..136 "\n\t\t" + EXPRESSION_STMT@136..155 + EXPRESSION@136..155 + LITERAL@136..138 + VARIABLE@136..138 "$a" + WHITESPACE@138..139 " " + OR_KW@139..141 "or" + WHITESPACE@141..144 "\n\t\t" + EXPRESSION@144..155 + LITERAL@144..146 + VARIABLE@144..146 "$b" + WHITESPACE@146..147 " " + AND_KW@147..150 "and" + WHITESPACE@150..151 " " + LITERAL@151..155 + TRUE_KW@151..155 "true" + WHITESPACE@155..156 "\n" + R_BRACE@156..157 "}" + WHITESPACE@157..158 "\n" diff --git a/tests/test4.out b/tests/test4.out index 699f73c..c28fc99 100644 --- a/tests/test4.out +++ b/tests/test4.out @@ -1,62 +1,60 @@ -- SOURCE_FILE - - "//Global comment" COMMENT - - "\n\n" WHITESPACE - - RULE - - "//Rule comment" COMMENT - - "\n" WHITESPACE - - "rule" RULE - - " " WHITESPACE - - IDENTIFIER - - "test" IDENTIFIER - - "\n" WHITESPACE - - BLOCK_EXPR - - "{" LBRACE - - "\n\t" WHITESPACE - - "//Rule block comment" COMMENT - - "\n\n\t" WHITESPACE - - STRINGS - - "//String comment" COMMENT - - "\n\t" WHITESPACE - - "strings" STRINGS - - ":" COLON - - "\n\t\t" WHITESPACE - - VARIABLE_STMT - - ERROR - - "a" IDENTIFIER - - " " WHITESPACE - - "=" ASSIGN - - " " WHITESPACE - - STRING - - "\"foo\"" STRING - - "\n\t\t" WHITESPACE - - VARIABLE_STMT - - VARIABLE - - "$b" VARIABLE - - " " WHITESPACE - - "=" ASSIGN - - " " WHITESPACE - - STRING - - "\"bar\"" STRING - - "\n\t" WHITESPACE - - CONDITION - - "condition" CONDITION - - ":" COLON - - "\n\t\t" WHITESPACE - - EXPRESSION_STMT - - EXPRESSION - - LITERAL - - "$a" VARIABLE - - " " WHITESPACE - - "or" OR - - "\n\t\t" WHITESPACE - - 
EXPRESSION - - LITERAL - - "$b" VARIABLE - - " " WHITESPACE - - "and" AND - - " " WHITESPACE - - LITERAL - - "true" TRUE - - "\n" WHITESPACE - - "}" RBRACE - - "\n" WHITESPACE +SOURCE_FILE@0..157 + COMMENT@0..16 "//Global comment" + WHITESPACE@16..18 "\n\n" + RULE@18..156 + COMMENT@18..32 "//Rule comment" + WHITESPACE@32..33 "\n" + RULE_KW@33..37 "rule" + WHITESPACE@37..38 " " + IDENTIFIER@38..42 "test" + WHITESPACE@42..43 "\n" + BLOCK_EXPR@43..156 + L_BRACE@43..44 "{" + WHITESPACE@44..46 "\n\t" + COMMENT@46..66 "//Rule block comment" + WHITESPACE@66..69 "\n\n\t" + STRINGS@69..120 + COMMENT@69..85 "//String comment" + WHITESPACE@85..87 "\n\t" + STRINGS_KW@87..94 "strings" + COLON@94..95 ":" + WHITESPACE@95..98 "\n\t\t" + VARIABLE_STMT@98..107 + ERROR@98..99 + IDENTIFIER@98..99 "a" + WHITESPACE@99..100 " " + ASSIGN@100..101 "=" + WHITESPACE@101..102 " " + PATTERN@102..107 + STRING_LIT@102..107 "\"foo\"" + WHITESPACE@107..110 "\n\t\t" + VARIABLE_STMT@110..120 + VARIABLE@110..112 "$b" + WHITESPACE@112..113 " " + ASSIGN@113..114 "=" + WHITESPACE@114..115 " " + PATTERN@115..120 + STRING_LIT@115..120 "\"bar\"" + WHITESPACE@120..122 "\n\t" + CONDITION@122..154 + CONDITION_KW@122..131 "condition" + COLON@131..132 ":" + WHITESPACE@132..135 "\n\t\t" + EXPRESSION_STMT@135..154 + EXPRESSION@135..154 + LITERAL@135..137 + VARIABLE@135..137 "$a" + WHITESPACE@137..138 " " + OR_KW@138..140 "or" + WHITESPACE@140..143 "\n\t\t" + EXPRESSION@143..154 + LITERAL@143..145 + VARIABLE@143..145 "$b" + WHITESPACE@145..146 " " + AND_KW@146..149 "and" + WHITESPACE@149..150 " " + LITERAL@150..154 + TRUE_KW@150..154 "true" + WHITESPACE@154..155 "\n" + R_BRACE@155..156 "}" + WHITESPACE@156..157 "\n" diff --git a/tests/test5.err b/tests/test5.err index af68e68..fddbdca 100644 --- a/tests/test5.err +++ b/tests/test5.err @@ -1 +1 @@ -SyntaxError("expected expression", 144..144) \ No newline at end of file +SyntaxError("unsupported expression", 144..144) \ No newline at end of file diff --git 
a/tests/test5.out b/tests/test5.out index 5753b77..52d454e 100644 --- a/tests/test5.out +++ b/tests/test5.out @@ -1,62 +1,59 @@ -- SOURCE_FILE - - "//Global comment" COMMENT - - "\n\n" WHITESPACE - - RULE - - "//Rule comment" COMMENT - - "\n" WHITESPACE - - "rule" RULE - - " " WHITESPACE - - IDENTIFIER - - "test" IDENTIFIER - - "\n" WHITESPACE - - BLOCK_EXPR - - "{" LBRACE - - "\n\t" WHITESPACE - - "//Rule block comment" COMMENT - - "\n\n\t" WHITESPACE - - STRINGS - - "//String comment" COMMENT - - "\n\t" WHITESPACE - - "strings" STRINGS - - ":" COLON - - "\n\t\t" WHITESPACE - - VARIABLE_STMT - - VARIABLE - - "$a" VARIABLE - - " " WHITESPACE - - "=" ASSIGN - - " " WHITESPACE - - STRING - - "\"foo\"" STRING - - "\n\t\t" WHITESPACE - - VARIABLE_STMT - - VARIABLE - - "$b" VARIABLE - - " " WHITESPACE - - "=" ASSIGN - - " " WHITESPACE - - STRING - - "\"bar\"" STRING - - "\n\t" WHITESPACE - - CONDITION - - "condition" CONDITION - - ":" COLON - - "\n\t\t" WHITESPACE - - EXPRESSION_STMT - - EXPRESSION - - EXPRESSION - - LITERAL - - "$a" VARIABLE - - " " WHITESPACE - - "or" OR - - "\n\t\t" WHITESPACE - - ERROR - - "b" IDENTIFIER - - " " WHITESPACE - - "and" AND - - " " WHITESPACE - - LITERAL - - "true" TRUE - - "\n" WHITESPACE - - "}" RBRACE - - "\n" WHITESPACE +SOURCE_FILE@0..157 + COMMENT@0..16 "//Global comment" + WHITESPACE@16..18 "\n\n" + RULE@18..156 + COMMENT@18..32 "//Rule comment" + WHITESPACE@32..33 "\n" + RULE_KW@33..37 "rule" + WHITESPACE@37..38 " " + IDENTIFIER@38..42 "test" + WHITESPACE@42..43 "\n" + BLOCK_EXPR@43..156 + L_BRACE@43..44 "{" + WHITESPACE@44..46 "\n\t" + COMMENT@46..66 "//Rule block comment" + WHITESPACE@66..69 "\n\n\t" + STRINGS@69..121 + COMMENT@69..85 "//String comment" + WHITESPACE@85..87 "\n\t" + STRINGS_KW@87..94 "strings" + COLON@94..95 ":" + WHITESPACE@95..98 "\n\t\t" + VARIABLE_STMT@98..108 + VARIABLE@98..100 "$a" + WHITESPACE@100..101 " " + ASSIGN@101..102 "=" + WHITESPACE@102..103 " " + PATTERN@103..108 + STRING_LIT@103..108 "\"foo\"" 
+ WHITESPACE@108..111 "\n\t\t" + VARIABLE_STMT@111..121 + VARIABLE@111..113 "$b" + WHITESPACE@113..114 " " + ASSIGN@114..115 "=" + WHITESPACE@115..116 " " + PATTERN@116..121 + STRING_LIT@116..121 "\"bar\"" + WHITESPACE@121..123 "\n\t" + CONDITION@123..154 + CONDITION_KW@123..132 "condition" + COLON@132..133 ":" + WHITESPACE@133..136 "\n\t\t" + EXPRESSION_STMT@136..154 + EXPRESSION@136..154 + EXPRESSION@136..145 + LITERAL@136..138 + VARIABLE@136..138 "$a" + WHITESPACE@138..139 " " + OR_KW@139..141 "or" + WHITESPACE@141..144 "\n\t\t" + ERROR@144..145 + IDENTIFIER@144..145 "b" + WHITESPACE@145..146 " " + AND_KW@146..149 "and" + WHITESPACE@149..150 " " + LITERAL@150..154 + TRUE_KW@150..154 "true" + WHITESPACE@154..155 "\n" + R_BRACE@155..156 "}" + WHITESPACE@156..157 "\n" diff --git a/tests/test6.err b/tests/test6.err index b6080fd..54dd076 100644 --- a/tests/test6.err +++ b/tests/test6.err @@ -7,7 +7,7 @@ SyntaxError("expected strings or condition", 106..106) SyntaxError("expected strings or condition", 114..114) SyntaxError("expected strings or condition", 117..117) SyntaxError("expected strings or condition", 119..119) -SyntaxError("expected expression", 139..139) -SyntaxError("expected expression", 141..141) -SyntaxError("expected expression", 150..150) +SyntaxError("unsupported expression", 139..139) +SyntaxError("unsupported expression", 141..141) +SyntaxError("unsupported expression", 150..150) SyntaxError("Invalid character", 98..99) \ No newline at end of file diff --git a/tests/test6.out b/tests/test6.out index 9aeedac..f5126ed 100644 --- a/tests/test6.out +++ b/tests/test6.out @@ -1,60 +1,60 @@ -- SOURCE_FILE - - "//Global comment" COMMENT - - "\n\n" WHITESPACE - - RULE - - "//Rule comment" COMMENT - - "\n" WHITESPACE - - "rule" RULE - - " " WHITESPACE - - ERROR - - "condition" CONDITION - - "\n" WHITESPACE - - BLOCK_EXPR - - "{" LBRACE - - "\n\t" WHITESPACE - - "//Rule block comment" COMMENT - - "\n\n\t" WHITESPACE - - "//String comment" COMMENT - - "\n\t" 
WHITESPACE - - ERROR - - "string" IDENTIFIER - - ERROR - - "*" ERROR - - "\n\t\t" WHITESPACE - - ERROR - - "a" IDENTIFIER - - " " WHITESPACE - - ERROR - - "=" ASSIGN - - " " WHITESPACE - - ERROR - - "00000" NUMBER - - "\n\t\t" WHITESPACE - - ERROR - - "$b" VARIABLE - - " " WHITESPACE - - ERROR - - "=" ASSIGN - - " " WHITESPACE - - ERROR - - "\"bar\"" STRING - - "\n\t" WHITESPACE - - CONDITION - - "condition" CONDITION - - ":" COLON - - "\n\t\t" WHITESPACE - - ERROR - - "a" IDENTIFIER - - " " WHITESPACE - - ERROR - - "ord" IDENTIFIER - - "\n\t\t" WHITESPACE - - EXPRESSION_STMT - - LITERAL - - "$b" VARIABLE - - " " WHITESPACE - - ERROR - - "ant" IDENTIFIER - - " \n" WHITESPACE - - "}" RBRACE - - "\n" WHITESPACE +SOURCE_FILE@0..157 + COMMENT@0..16 "//Global comment" + WHITESPACE@16..18 "\n\n" + RULE@18..156 + COMMENT@18..32 "//Rule comment" + WHITESPACE@32..33 "\n" + RULE_KW@33..37 "rule" + WHITESPACE@37..38 " " + ERROR@38..47 + CONDITION_KW@38..47 "condition" + WHITESPACE@47..48 "\n" + BLOCK_EXPR@48..156 + L_BRACE@48..49 "{" + WHITESPACE@49..51 "\n\t" + COMMENT@51..71 "//Rule block comment" + WHITESPACE@71..74 "\n\n\t" + COMMENT@74..90 "//String comment" + WHITESPACE@90..92 "\n\t" + ERROR@92..98 + IDENTIFIER@92..98 "string" + ERROR@98..99 + ERROR@98..99 "*" + WHITESPACE@99..102 "\n\t\t" + ERROR@102..103 + IDENTIFIER@102..103 "a" + WHITESPACE@103..104 " " + ERROR@104..105 + ASSIGN@104..105 "=" + WHITESPACE@105..106 " " + ERROR@106..111 + NUMBER@106..111 "00000" + WHITESPACE@111..114 "\n\t\t" + ERROR@114..116 + VARIABLE@114..116 "$b" + WHITESPACE@116..117 " " + ERROR@117..118 + ASSIGN@117..118 "=" + WHITESPACE@118..119 " " + ERROR@119..124 + STRING_LIT@119..124 "\"bar\"" + WHITESPACE@124..126 "\n\t" + CONDITION@126..153 + CONDITION_KW@126..135 "condition" + COLON@135..136 ":" + WHITESPACE@136..139 "\n\t\t" + ERROR@139..140 + IDENTIFIER@139..140 "a" + WHITESPACE@140..141 " " + ERROR@141..144 + IDENTIFIER@141..144 "ord" + WHITESPACE@144..147 "\n\t\t" + 
EXPRESSION_STMT@147..149 + LITERAL@147..149 + VARIABLE@147..149 "$b" + WHITESPACE@149..150 " " + ERROR@150..153 + IDENTIFIER@150..153 "ant" + WHITESPACE@153..155 " \n" + R_BRACE@155..156 "}" + WHITESPACE@156..157 "\n" diff --git a/tests/test7.in b/tests/test7.in new file mode 100644 index 0000000..c19bdf8 --- /dev/null +++ b/tests/test7.in @@ -0,0 +1,15 @@ +//Global comment + +//Rule comment +rule test +{ + //Rule block comment + + //String comment + strings: + $a = "foo" + $b = "bar" + condition: + $a and + $b or true +} diff --git a/tests/test7.out b/tests/test7.out new file mode 100644 index 0000000..ca09462 --- /dev/null +++ b/tests/test7.out @@ -0,0 +1,59 @@ +SOURCE_FILE@0..158 + COMMENT@0..16 "//Global comment" + WHITESPACE@16..18 "\n\n" + RULE@18..157 + COMMENT@18..32 "//Rule comment" + WHITESPACE@32..33 "\n" + RULE_KW@33..37 "rule" + WHITESPACE@37..38 " " + IDENTIFIER@38..42 "test" + WHITESPACE@42..43 "\n" + BLOCK_EXPR@43..157 + L_BRACE@43..44 "{" + WHITESPACE@44..46 "\n\t" + COMMENT@46..66 "//Rule block comment" + WHITESPACE@66..69 "\n\n\t" + STRINGS@69..121 + COMMENT@69..85 "//String comment" + WHITESPACE@85..87 "\n\t" + STRINGS_KW@87..94 "strings" + COLON@94..95 ":" + WHITESPACE@95..98 "\n\t\t" + VARIABLE_STMT@98..108 + VARIABLE@98..100 "$a" + WHITESPACE@100..101 " " + ASSIGN@101..102 "=" + WHITESPACE@102..103 " " + PATTERN@103..108 + STRING_LIT@103..108 "\"foo\"" + WHITESPACE@108..111 "\n\t\t" + VARIABLE_STMT@111..121 + VARIABLE@111..113 "$b" + WHITESPACE@113..114 " " + ASSIGN@114..115 "=" + WHITESPACE@115..116 " " + PATTERN@116..121 + STRING_LIT@116..121 "\"bar\"" + WHITESPACE@121..123 "\n\t" + CONDITION@123..155 + CONDITION_KW@123..132 "condition" + COLON@132..133 ":" + WHITESPACE@133..136 "\n\t\t" + EXPRESSION_STMT@136..155 + EXPRESSION@136..155 + EXPRESSION@136..147 + LITERAL@136..138 + VARIABLE@136..138 "$a" + WHITESPACE@138..139 " " + AND_KW@139..142 "and" + WHITESPACE@142..145 "\n\t\t" + LITERAL@145..147 + VARIABLE@145..147 "$b" + 
WHITESPACE@147..148 " " + OR_KW@148..150 "or" + WHITESPACE@150..151 " " + LITERAL@151..155 + TRUE_KW@151..155 "true" + WHITESPACE@155..156 "\n" + R_BRACE@156..157 "}" + WHITESPACE@157..158 "\n" diff --git a/tests/test8.err b/tests/test8.err new file mode 100644 index 0000000..7f0b377 --- /dev/null +++ b/tests/test8.err @@ -0,0 +1 @@ +SyntaxError("expected strings or condition", 87..87) \ No newline at end of file diff --git a/tests/test8.in b/tests/test8.in new file mode 100644 index 0000000..6927341 --- /dev/null +++ b/tests/test8.in @@ -0,0 +1,13 @@ +//Global comment + +//Rule comment +rule test +{ + //Rule block comment + + //String comment + string: + $b = "bar" + condition: + $b and not true or false +} diff --git a/tests/test8.out b/tests/test8.out new file mode 100644 index 0000000..d9c69bd --- /dev/null +++ b/tests/test8.out @@ -0,0 +1,54 @@ +SOURCE_FILE@0..149 + COMMENT@0..16 "//Global comment" + WHITESPACE@16..18 "\n\n" + RULE@18..148 + COMMENT@18..32 "//Rule comment" + WHITESPACE@32..33 "\n" + RULE_KW@33..37 "rule" + WHITESPACE@37..38 " " + IDENTIFIER@38..42 "test" + WHITESPACE@42..43 "\n" + BLOCK_EXPR@43..148 + L_BRACE@43..44 "{" + WHITESPACE@44..46 "\n\t" + COMMENT@46..66 "//Rule block comment" + WHITESPACE@66..69 "\n\n\t" + COMMENT@69..85 "//String comment" + WHITESPACE@85..87 "\n\t" + ERROR@87..93 + IDENTIFIER@87..93 "string" + COLON@93..94 ":" + WHITESPACE@94..97 "\n\t\t" + VARIABLE_STMT@97..107 + VARIABLE@97..99 "$b" + WHITESPACE@99..100 " " + ASSIGN@100..101 "=" + WHITESPACE@101..102 " " + PATTERN@102..107 + STRING_LIT@102..107 "\"bar\"" + WHITESPACE@107..109 "\n\t" + CONDITION@109..146 + CONDITION_KW@109..118 "condition" + COLON@118..119 ":" + WHITESPACE@119..122 "\n\t\t" + EXPRESSION_STMT@122..146 + EXPRESSION@122..146 + EXPRESSION@122..137 + LITERAL@122..124 + VARIABLE@122..124 "$b" + WHITESPACE@124..125 " " + AND_KW@125..128 "and" + WHITESPACE@128..129 " " + PREFIX_EXPR@129..137 + NOT_KW@129..132 "not" + WHITESPACE@132..133 " " + 
LITERAL@133..137 + TRUE_KW@133..137 "true" + WHITESPACE@137..138 " " + OR_KW@138..140 "or" + WHITESPACE@140..141 " " + LITERAL@141..146 + FALSE_KW@141..146 "false" + WHITESPACE@146..147 "\n" + R_BRACE@147..148 "}" + WHITESPACE@148..149 "\n" diff --git a/yara.ungram b/yara.ungram new file mode 100644 index 0000000..4b0ec97 --- /dev/null +++ b/yara.ungram @@ -0,0 +1,43 @@ +SourceFile = Rule* + +Rule = + 'rule' 'identifier' + body:BlockExpr + +BlockExpr = + '{' + Strings? Condition + '}' + +Strings = + 'strings' ':' + VariableStmt* + +VariableStmt = + 'variable' '=' Pattern + +Pattern = + 'string_lit' + +Condition = + 'condition' ':' + ExpressionStmt + +ExpressionStmt = Expr + +Expr = + Expression +| PrefixExpr +| Literal + +Expression = + lhs:Expr + op:('and' | 'or') + rhs:Expr + +PrefixExpr = + 'not' + Expr + +Literal = + value:('true' | 'false' | 'variable')