diff --git a/Cargo.lock b/Cargo.lock index 46ab217..a3292bb 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -4,9 +4,9 @@ version = 3 [[package]] name = "aho-corasick" -version = "0.7.20" +version = "1.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "cc936419f96fa211c1b9166887b38e5e40b19958e5b895be7c1f93adec7071ac" +checksum = "8e60d3430d3a69478ad0993f19238d2df97c507009a52b3c10addcd7f6bcb916" dependencies = [ "memchr", ] @@ -415,6 +415,7 @@ dependencies = [ "log", "model", "parsing_handlers", + "regex", "tree-sitter", "tree-sitter-java", "uuid", @@ -492,26 +493,32 @@ dependencies = [ [[package]] name = "regex" -version = "1.7.3" +version = "1.10.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8b1f693b24f6ac912f4893ef08244d70b6067480d2f1a46e950c9691e6749d1d" +checksum = "b91213439dad192326a0d7c6ee3955910425f441d7038e0d6933b0aec5c4517f" dependencies = [ "aho-corasick", "memchr", + "regex-automata", "regex-syntax", ] [[package]] name = "regex-automata" -version = "0.4.3" +version = "0.4.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f804c7828047e88b2d32e2d7fe5a105da8ee3264f01902f796c8e067dc2483f" +checksum = "38caf58cc5ef2fed281f89292ef23f6365465ed9a41b7a7754eb4e26496c92df" +dependencies = [ + "aho-corasick", + "memchr", + "regex-syntax", +] [[package]] name = "regex-syntax" -version = "0.6.29" +version = "0.8.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f162c6dd7b008981e4d40210aca20b4bd0f9b60ca9271061b07f78537722f2e1" +checksum = "7a66a03ae7c801facd77a29370b4faec201768915ac14a721ba36f20bc9c209b" [[package]] name = "rustc-hash" diff --git a/bin/tests/node_reordering/merge.output.java b/bin/tests/node_reordering/merge.output.java deleted file mode 100644 index 253237e..0000000 --- a/bin/tests/node_reordering/merge.output.java +++ /dev/null @@ -1,6 +0,0 @@ - -<<<<<<<<< - public class Test { class A { } class B { } interface I { } } -========= - public class Test { class A { } class C { } interface I { } } ->>>>>>>>> diff --git a/bin/tests/node_reordering/base.java b/bin/tests/scenarios/node_reordering/base.java similarity index 100% rename from bin/tests/node_reordering/base.java rename to bin/tests/scenarios/node_reordering/base.java diff --git a/bin/tests/node_reordering/left.java b/bin/tests/scenarios/node_reordering/left.java similarity index 100% rename from bin/tests/node_reordering/left.java rename to bin/tests/scenarios/node_reordering/left.java diff --git a/bin/tests/node_reordering/merge.java b/bin/tests/scenarios/node_reordering/merge.java similarity index 100% rename from bin/tests/node_reordering/merge.java rename to bin/tests/scenarios/node_reordering/merge.java diff --git a/bin/tests/node_reordering/right.java b/bin/tests/scenarios/node_reordering/right.java similarity index 100% rename from bin/tests/node_reordering/right.java rename to bin/tests/scenarios/node_reordering/right.java diff --git a/matching/src/ordered/mod.rs b/matching/src/ordered/mod.rs index 9f5f3cf..6425aa3 100644 --- a/matching/src/ordered/mod.rs +++ b/matching/src/ordered/mod.rs @@ -1,5 +1,6 @@ use crate::{ - matching_configuration::MatchingConfiguration, matching_entry::MatchingEntry, MatchingRepresentation, Matchings + matching_configuration::MatchingConfiguration, matching_entry::MatchingEntry, + MatchingRepresentation, Matchings, }; use model::{cst_node::NonTerminal, CSTNode}; use unordered_pair::UnorderedPair; @@ -36,7 +37,8 @@ pub fn calculate_matchings<'a>( .. }), ) => { - let root_matching: usize = (left.get_matching_representation() == right.get_matching_representation()).into(); + let root_matching: usize = + (left.get_matching_representation() == right.get_matching_representation()).into(); let m = children_left.len(); let n = children_right.len(); diff --git a/matching/src/unordered/mod.rs b/matching/src/unordered/mod.rs index 37f6043..e9fee02 100644 --- a/matching/src/unordered/mod.rs +++ b/matching/src/unordered/mod.rs @@ -32,5 +32,7 @@ pub fn calculate_matchings<'a>( } fn all_children_labeled(node: &NonTerminal, config: &MatchingConfiguration) -> bool { - node.children.iter().all(|child| child.get_identifier().is_some()) + node.children + .iter() + .all(|child| child.get_identifier().is_some()) } diff --git a/matching/src/unordered/unique_label.rs b/matching/src/unordered/unique_label.rs index 3d979f2..69ac897 100644 --- a/matching/src/unordered/unique_label.rs +++ b/matching/src/unordered/unique_label.rs @@ -28,8 +28,9 @@ pub fn calculate_matchings<'a>( for child_left in children_left { for child_right in children_right { - let is_same_identifier = child_left.get_identifier().is_some() && child_left.get_identifier() == child_right.get_identifier(); - + let is_same_identifier = child_left.get_identifier().is_some() + && child_left.get_identifier() == child_right.get_identifier(); + if is_same_identifier { let child_matchings = crate::calculate_matchings(child_left, child_right, config); diff --git a/matching_handlers/src/java/mod.rs b/matching_handlers/src/java/mod.rs index 4bd4d46..05e94f0 100644 --- a/matching_handlers/src/java/mod.rs +++ b/matching_handlers/src/java/mod.rs @@ -15,7 +15,7 @@ use self::{ pub fn get_default_java_matching_handlers<'a>() -> MatchingHandlers<'a> { let mut matching_handlers: MatchingHandlers<'a> = MatchingHandlers::new(); - + matching_handlers.register( "constructor_declaration", compute_matching_score_for_method_declaration, diff --git a/model/src/cst_node.rs b/model/src/cst_node.rs index 5a267db..3ddf9cf 100644 --- a/model/src/cst_node.rs +++ b/model/src/cst_node.rs @@ -68,7 +68,7 @@ impl CSTNode<'_> { pub fn get_identifier(&self) -> Option> { match self { CSTNode::Terminal(node) => Some(vec![node.kind, node.value]), - CSTNode::NonTerminal(node) => node.get_identifier() + CSTNode::NonTerminal(node) => node.get_identifier(), } } } diff --git a/parsing/Cargo.toml b/parsing/Cargo.toml index c4470f2..f100d45 100644 --- a/parsing/Cargo.toml +++ b/parsing/Cargo.toml @@ -8,6 +8,7 @@ edition = "2021" [dependencies] tree-sitter = "0.20.9" tree-sitter-java = "0.20.0" +regex = "1.10.5" model = { path = "../model" } parsing_handlers = { path = "../parsing_handlers" } log = { workspace = true } diff --git a/parsing/src/parse.rs b/parsing/src/parse.rs index 5c3c013..b1241dc 100644 --- a/parsing/src/parse.rs +++ b/parsing/src/parse.rs @@ -7,6 +7,11 @@ use model::{ }; use tree_sitter::{Node, Query, QueryCursor}; +enum IdentifierExtractor { + RegularExpression(&'static str), + TreeSitterQuery(&'static str), +} + fn extract_identifier_from_node<'a>( node: Node, src: &'a str, @@ -15,7 +20,8 @@ fn extract_identifier_from_node<'a>( let queries = HashMap::from([ ( "constructor_declaration", - r#" + IdentifierExtractor::TreeSitterQuery( + r#" ( constructor_declaration name: @@ -35,7 +41,7 @@ fn extract_identifier_from_node<'a>( @argument_type ) ( - spread_parameter (type_identifier) @spread + spread_parameter (type_identifier) @spread_parameter "..." @spread_indicator ) ] ) @@ -45,10 +51,12 @@ fn extract_identifier_from_node<'a>( "#, + ), ), ( "method_declaration", - r#" + IdentifierExtractor::TreeSitterQuery( + r#" ( method_declaration name: @@ -68,7 +76,7 @@ fn extract_identifier_from_node<'a>( @argument_type ) ( - spread_parameter (type_identifier) @spread + spread_parameter (type_identifier) @spread_parameter "..." @spread_indicator ) ] ) @@ -78,55 +86,78 @@ fn extract_identifier_from_node<'a>( "#, + ), ), ( "field_declaration", - r#"(variable_declarator name: _ @name)"#, + IdentifierExtractor::TreeSitterQuery(r#"(variable_declarator name: _ @name)"#), ), ( "import_declaration", - r#"(import_declaration ( scoped_identifier ) @namespace)"#, + IdentifierExtractor::TreeSitterQuery( + r#"(import_declaration ( scoped_identifier ) @namespace)"#, + ), ), ( "class_declaration", - r#"(class_declaration name: (identifier) @name)"#, + IdentifierExtractor::RegularExpression( + r#"class [A-Za-z_][A-Za-z0-9_]*"#, + ), ), ( "enum_declaration", - r#"(enum_declaration name: (identifier) @name)"#, + IdentifierExtractor::RegularExpression( + r#"enum [A-Za-z_][A-Za-z0-9_]*"#, + ), ), ( "interface_declaration", - r#"(interface_declaration name: (identifier) @name)"#, + IdentifierExtractor::RegularExpression( + r#"interface [A-Za-z_][A-Za-z0-9_]*"#, + ), ), ]); - let query_string = queries.get(node.kind())?; - log::debug!("Using {:?} as query_string", query_string); - let query = Query::new(config.language, &query_string).ok()?; - let mut cursor = QueryCursor::new(); - let identifier = cursor - .matches(&query, node, src.as_bytes()) - .into_iter() - .flat_map(|a_match| { - a_match - .captures - .iter() - .filter(|capture| { - capture.node.start_byte() >= node.start_byte() - && capture.node.end_byte() <= node.end_byte() - + let identifier_extractor = queries.get(node.kind())?; + let identifier = match identifier_extractor { + IdentifierExtractor::RegularExpression(regex) => { + let identifier = regex::Regex::new(regex) + .unwrap() + .find(node.utf8_text(src.as_bytes()).ok()?) + .map(|m| m.as_str())?; + Some(vec![identifier]) + } + IdentifierExtractor::TreeSitterQuery(query_string) => { + let query = Query::new(config.language, query_string).ok()?; + let mut cursor = QueryCursor::new(); + let identifier = cursor + .matches(&query, node, src.as_bytes()) + .into_iter() + .flat_map(|a_match| { + a_match + .captures + .iter() + .filter(|capture| { + capture.node.start_byte() >= node.start_byte() + && capture.node.end_byte() <= node.end_byte() + }) + .filter_map(|capture_index| { + capture_index.node.utf8_text(src.as_bytes()).ok() + }) + }) + .collect(); + Some(identifier) + } + }; - }) - .filter_map(|capture_index| capture_index.node.utf8_text(src.as_bytes()).ok()) - }) - .collect(); + log::debug!( + "Found {:?} as identifier for node {:?}", + identifier, + node.utf8_text(src.as_bytes()).ok()? + ); - log::debug!("Found {:?} as identifier for node {:?}", identifier, node.utf8_text(src.as_bytes()).unwrap()); - - Some(identifier - ) + identifier } fn explore_node<'a>(node: Node, src: &'a str, config: &'a ParserConfiguration) -> CSTNode<'a> {