Skip to content

Commit

Permalink
feat: Introduce custom identifier extraction mecanism (#62)
Browse files Browse the repository at this point in the history
Up until now, we used matching handlers to capture special node
identifiers, diverging from their intended purpose. This approach also
required users to add Rust code to the project, limiting the tool's
generalization and extensibility.

This PR changes how node identifiers are extracted by moving the
extraction process to the parsing step. This improves performance (as it
runs only once per parse) and leverages Tree-sitter’s pattern matching
query functionality.

Users can now provide a configuration with node types and a Tree-sitter
query expression to extract identifiers. For example, in a Java class, a
user can extract a field declaration identifier using the query
`(variable_declarator name: _ @field_name)`, which captures the field
name.

However, Tree-sitter pattern matching can fall short in some cases. For
instance, when trying to retrieve the identifier for a class with an
inner class:

```java
class A {
    class B {
    }
}
```

Using the query `(class_declaration (identifier) @class_name)` matches
both classes A and B, resulting in `[A, B]` as the identifier, which is
incorrect. Since Tree-sitter’s query language doesn’t support matching a
single entry - this has to be done in userland code, which would
complicate the identifier extraction process.

To address this, this PR introduces the option to use a Regular
Expression for identifier extraction. The regular expression runs on the
node’s source code and captures only the first match. In this case,
`class [A-Za-z_][A-Za-z0-9_]*` correctly matches the class name, and we
can safely discard the match for class B (since only the first match is
considered).

These changes simplify the introduction of new extractors and eliminate
approximately 600 lines of Rust code (tests and source code) previously
used for node identifier extraction.
  • Loading branch information
jpedroh authored Jul 21, 2024
1 parent f6d66a5 commit 2c6a135
Show file tree
Hide file tree
Showing 24 changed files with 275 additions and 676 deletions.
23 changes: 15 additions & 8 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

37 changes: 5 additions & 32 deletions matching/src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,50 +1,23 @@
mod matches;
mod matching;
pub mod matching_configuration;
mod matching_entry;
mod matchings;
pub mod ordered;
pub mod unordered;

use matches::Matches;
use matching_configuration::MatchingConfiguration;
pub use matching_entry::MatchingEntry;
pub use matchings::Matchings;
use model::cst_node::Terminal;
use unordered_pair::UnorderedPair;

/**
* TODO: This probably belongs on the node declaration itself, but for that we
* need to move the identifiers extraction into there which would be a pain now.
* Furthermore, in the future, we want to move the extraction of identifiers
* from programmatic code to use Tree Sitter query syntax.
*/
fn are_nodes_matching_representations_equal<'a>(
left: &'a model::CSTNode,
right: &'a model::CSTNode,
config: &'a MatchingConfiguration<'a>,
) -> bool {
config
.handlers
.compute_matching_score(left, right)
.map(|score| score == 1)
.unwrap_or(match (left, right) {
(
model::CSTNode::NonTerminal(left_non_terminal),
model::CSTNode::NonTerminal(right_non_terminal),
) => left_non_terminal.kind == right_non_terminal.kind,
(model::CSTNode::Terminal(left_terminal), model::CSTNode::Terminal(right_terminal)) => {
left_terminal.kind == right_terminal.kind
&& left_terminal.value == right_terminal.value
}
(_, _) => false,
})
}

pub fn calculate_matchings<'a>(
left: &'a model::CSTNode,
right: &'a model::CSTNode,
config: &'a MatchingConfiguration<'a>,
) -> Matchings<'a> {
if !are_nodes_matching_representations_equal(left, right, config) {
if !left.matches(right) {
return Matchings::empty();
}

Expand All @@ -61,12 +34,12 @@ pub fn calculate_matchings<'a>(
}
}
(
model::CSTNode::Terminal(Terminal {
model::CSTNode::Terminal(model::cst_node::Terminal {
kind: kind_left,
value: value_left,
..
}),
model::CSTNode::Terminal(Terminal {
model::CSTNode::Terminal(model::cst_node::Terminal {
kind: kind_right,
value: value_right,
..
Expand Down
19 changes: 19 additions & 0 deletions matching/src/matches.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
use model::CSTNode;

pub trait Matches {
fn matches(&self, right: &CSTNode) -> bool;
}

impl Matches for CSTNode<'_> {
fn matches(&self, right: &CSTNode) -> bool {
match (self, right) {
(CSTNode::Terminal(left), CSTNode::Terminal(right)) => {
left.get_identifier() == right.get_identifier()
}
(CSTNode::NonTerminal(left), CSTNode::NonTerminal(right)) => {
left.kind == right.kind && left.get_identifier() == right.get_identifier()
}
(_, _) => false,
}
}
}
16 changes: 1 addition & 15 deletions matching/src/matching_configuration.rs
Original file line number Diff line number Diff line change
@@ -1,10 +1,8 @@
use matching_handlers::MatchingHandlers;
use model::Language;
use std::collections::HashSet;

pub struct MatchingConfiguration<'a> {
pub(crate) delimiters: HashSet<&'static str>,
pub(crate) kinds_with_label: HashSet<&'static str>,
#[allow(dead_code)]
pub(crate) handlers: MatchingHandlers<'a>,
}

Expand All @@ -18,18 +16,6 @@ impl From<Language> for MatchingConfiguration<'_> {
fn from(language: Language) -> Self {
match language {
Language::Java => MatchingConfiguration {
delimiters: ["{", "}", ";"].into(),
kinds_with_label: [
"compact_constructor_declaration",
"constructor_declaration",
"field_declaration",
"method_declaration",
"import_declaration",
"class_declaration",
"interface_declaration",
"enum_declaration",
]
.into(),
handlers: MatchingHandlers::from(Language::Java),
},
}
Expand Down
20 changes: 15 additions & 5 deletions matching/src/ordered/mod.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
use crate::{
matching_configuration::MatchingConfiguration, matching_entry::MatchingEntry, Matchings,
matches::Matches, matching_configuration::MatchingConfiguration, matching_entry::MatchingEntry,
Matchings,
};
use model::{cst_node::NonTerminal, CSTNode};
use unordered_pair::UnorderedPair;
Expand Down Expand Up @@ -36,10 +37,7 @@ pub fn calculate_matchings<'a>(
..
}),
) => {
let root_matching: usize = config
.handlers
.compute_matching_score(left, right)
.unwrap_or((left.kind() == right.kind()).into());
let root_matching: usize = (left.matches(right)).into();

let m = children_left.len();
let n = children_right.len();
Expand Down Expand Up @@ -128,6 +126,7 @@ mod tests {
start_position: Point { row: 0, column: 0 },
end_position: Point { row: 1, column: 7 },
children: vec![child.clone()],
..Default::default()
});
let right = CSTNode::NonTerminal(NonTerminal {
id: uuid::Uuid::new_v4(),
Expand All @@ -136,6 +135,7 @@ mod tests {
start_position: Point { row: 0, column: 0 },
end_position: Point { row: 1, column: 7 },
children: vec![child.clone()],
..Default::default()
});

let matching_configuration = MatchingConfiguration::default();
Expand Down Expand Up @@ -173,6 +173,7 @@ mod tests {
children: vec![left_child.clone()],
start_position: Point { row: 1, column: 0 },
end_position: Point { row: 0, column: 7 },
..Default::default()
});
let right = CSTNode::NonTerminal(NonTerminal {
id: uuid::Uuid::new_v4(),
Expand All @@ -181,6 +182,7 @@ mod tests {
children: vec![right_child.clone()],
start_position: Point { row: 1, column: 0 },
end_position: Point { row: 0, column: 7 },
..Default::default()
});

let matching_configuration = MatchingConfiguration::from(Language::Java);
Expand Down Expand Up @@ -216,6 +218,7 @@ mod tests {
start_position: Point { row: 0, column: 0 },
end_position: Point { row: 0, column: 7 },
children: vec![common_child.clone()],
..Default::default()
});
let right = CSTNode::NonTerminal(NonTerminal {
id: uuid::Uuid::new_v4(),
Expand All @@ -224,6 +227,7 @@ mod tests {
start_position: Point { row: 0, column: 0 },
end_position: Point { row: 0, column: 7 },
children: vec![common_child.clone(), unique_right_child],
..Default::default()
});

let matching_configuration = MatchingConfiguration::from(language::Language::Java);
Expand Down Expand Up @@ -252,6 +256,7 @@ mod tests {
start_position: Point { row: 0, column: 0 },
end_position: Point { row: 0, column: 7 },
children: vec![common_child.clone()],
..Default::default()
});
let right = CSTNode::NonTerminal(NonTerminal {
id: uuid::Uuid::new_v4(),
Expand All @@ -260,6 +265,7 @@ mod tests {
start_position: Point { row: 0, column: 0 },
end_position: Point { row: 0, column: 7 },
children: vec![common_child.clone()],
..Default::default()
});

let matching_configuration = MatchingConfiguration::from(language::Language::Java);
Expand All @@ -279,6 +285,7 @@ mod tests {
end_position: Point { row: 0, column: 7 },
value: "value_b",
is_block_end_delimiter: false,
..Default::default()
});

let intermediate = CSTNode::NonTerminal(NonTerminal {
Expand All @@ -288,6 +295,7 @@ mod tests {
start_position: Point { row: 0, column: 0 },
end_position: Point { row: 0, column: 7 },
children: vec![leaf],
..Default::default()
});

let left = CSTNode::NonTerminal(NonTerminal {
Expand All @@ -297,6 +305,7 @@ mod tests {
start_position: Point { row: 0, column: 0 },
end_position: Point { row: 0, column: 7 },
children: vec![intermediate.clone()],
..Default::default()
});
let right = CSTNode::NonTerminal(NonTerminal {
id: uuid::Uuid::new_v4(),
Expand All @@ -305,6 +314,7 @@ mod tests {
start_position: Point { row: 0, column: 0 },
end_position: Point { row: 0, column: 7 },
children: vec![intermediate.clone()],
..Default::default()
});

let matching_configuration = MatchingConfiguration::default();
Expand Down
9 changes: 3 additions & 6 deletions matching/src/unordered/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ pub fn calculate_matchings<'a>(
) -> crate::Matchings<'a> {
match (left, right) {
(model::CSTNode::NonTerminal(left_nt), model::CSTNode::NonTerminal(right_nt)) => {
if all_children_labeled(left_nt, config) && all_children_labeled(right_nt, config) {
if all_children_labeled(left_nt) && all_children_labeled(right_nt) {
log::debug!(
"Matching children of \"{}\" with \"{}\" using unique label matching.",
left.kind(),
Expand All @@ -31,9 +31,6 @@ pub fn calculate_matchings<'a>(
}
}

fn all_children_labeled(node: &NonTerminal, config: &MatchingConfiguration) -> bool {
node.children
.iter()
.filter(|child| !config.delimiters.contains(child.kind()))
.all(|child| config.kinds_with_label.contains(child.kind()))
fn all_children_labeled(node: &NonTerminal) -> bool {
node.children.iter().all(|child| child.has_identifier())
}
15 changes: 10 additions & 5 deletions matching/src/unordered/unique_label.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,12 +28,17 @@ pub fn calculate_matchings<'a>(

for child_left in children_left {
for child_right in children_right {
let is_same_identifier = config
.handlers
.compute_matching_score(child_left, child_right)
.unwrap_or_else(|| (child_left.kind() == child_right.kind()).into());
let is_same_identifier = match (child_left, child_right) {
(CSTNode::Terminal(left), CSTNode::Terminal(right)) => {
left.get_identifier() == right.get_identifier()
}
(CSTNode::NonTerminal(left), CSTNode::NonTerminal(right)) => {
left.get_identifier() == right.get_identifier()
}
(_, _) => false,
};

if is_same_identifier == 1 {
if is_same_identifier {
let child_matchings =
crate::calculate_matchings(child_left, child_right, config);

Expand Down
Loading

0 comments on commit 2c6a135

Please sign in to comment.