From fd5d9432d2bcecee669d8836c6d3ab73105b1cca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jo=C3=A3o=20Pedro=20Henrique?= Date: Wed, 1 May 2024 18:02:06 -0300 Subject: [PATCH] feat(matching): use the Hungarian Algorithm for unordered matching (#50) Our current approach to unordered node matching relies on a naive assumption: that all nodes possess an identifier. While this holds true for most nodes we've encountered thus far, such as method and property declarations within a Java class, it proves insufficient when attempting to match nodes lacking a label, like static blocks in Java. In such cases, calculations for matchings may yield incorrect results, consequently leading to erroneous merges. This pull request introduces a solution for matching unordered nodes by modeling it as the Assignment Problem and using the Hungarian Algorithm to solve it. This approach mirrors the one used in jDime. Given the widespread recognition of the Hungarian Algorithm, we rely on the implementation provided by the [pathfinding](https://github.com/evenfurther/pathfinding) crate. This simplifies our implementation efforts, as we only need to provide the weights matrix and extract the matching information from the solution. A workaround had to be implemented since pathfinding expects the input weights matrix to have the same number of rows and columns, which might not always be true in our case since the two nodes can have different numbers of children. The solution is to pad the matrix to a square one, initializing the extra rows/columns with 0. For now, our naive label implementation has been bypassed and is not being utilized. In a future pull request, the idea is to resort to the Hungarian Algorithm only if the nodes are unlabeled, as it is significantly more complex than merely matching identifiers. 
--- Cargo.lock | 124 +++++++++++++++++- .../unordered_with_non_labelled/base.java | 5 + .../unordered_with_non_labelled/left.java | 13 ++ .../unordered_with_non_labelled/merge.java | 1 + .../unordered_with_non_labelled/right.java | 18 +++ matching/Cargo.toml | 1 + matching/src/lib.rs | 3 +- matching/src/unordered/assignment_problem.rs | 95 ++++++++++++++ matching/src/unordered/mod.rs | 1 + 9 files changed, 254 insertions(+), 7 deletions(-) create mode 100644 bin/tests/scenarios/unordered_with_non_labelled/base.java create mode 100644 bin/tests/scenarios/unordered_with_non_labelled/left.java create mode 100644 bin/tests/scenarios/unordered_with_non_labelled/merge.java create mode 100644 bin/tests/scenarios/unordered_with_non_labelled/right.java create mode 100644 matching/src/unordered/assignment_problem.rs create mode 100644 matching/src/unordered/mod.rs diff --git a/Cargo.lock b/Cargo.lock index bf99a14..b3feb0c 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -74,6 +74,12 @@ dependencies = [ "wait-timeout", ] +[[package]] +name = "autocfg" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f1fdabc7756949593fe60f30ec81974b613357de856987752631dea1e3394c80" + [[package]] name = "bin" version = "0.1.0" @@ -164,6 +170,18 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "acbf1af155f9b9ef647e42cdc158db4b64a1b61f743629225fde6f3e0be2a7c7" +[[package]] +name = "deprecate-until" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7a3767f826efbbe5a5ae093920b58b43b01734202be697e1354914e862e8e704" +dependencies = [ + "proc-macro2", + "quote", + "semver", + "syn", +] + [[package]] name = "difflib" version = "0.4.0" @@ -204,6 +222,12 @@ dependencies = [ "termcolor", ] +[[package]] +name = "equivalent" +version = "1.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" + [[package]] name = "errno" version = "0.3.8" @@ -214,6 +238,12 @@ dependencies = [ "windows-sys 0.52.0", ] +[[package]] +name = "fixedbitset" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0ce7134b9999ecaf8bcd65542e436736ef32ddca1b3e06094cb6ec5755203b80" + [[package]] name = "getrandom" version = "0.2.12" @@ -225,6 +255,12 @@ dependencies = [ "wasi", ] +[[package]] +name = "hashbrown" +version = "0.14.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1" + [[package]] name = "heck" version = "0.4.1" @@ -243,6 +279,25 @@ version = "2.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4" +[[package]] +name = "indexmap" +version = "2.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "168fb715dda47215e360912c096649d23d58bf392ac62f73919e831745e40f26" +dependencies = [ + "equivalent", + "hashbrown", +] + +[[package]] +name = "integer-sqrt" +version = "0.1.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "276ec31bcb4a9ee45f58bec6f9ec700ae4cf4f4f8f2fa7e06cb406bd5ffdd770" +dependencies = [ + "num-traits", +] + [[package]] name = "is-terminal" version = "0.4.10" @@ -288,6 +343,7 @@ dependencies = [ "log", "matching_handlers", "model", + "pathfinding", "unordered-pair", "uuid", ] @@ -336,6 +392,15 @@ dependencies = [ "winapi", ] +[[package]] +name = "num-traits" +version = "0.2.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "da0df0e5185db44f69b44f26786fe401b6c293d1907744beaa7fa62b2e5a517a" +dependencies = [ + "autocfg", +] + [[package]] name = "overload" version = "0.1.1" @@ -353,6 +418,21 @@ dependencies = [ "uuid", ] +[[package]] +name = "pathfinding" +version = "4.9.1" 
+source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0a21c30f03223ae4a4c892f077b3189133689b8a659a84372f8422384ce94c9" +dependencies = [ + "deprecate-until", + "fixedbitset", + "indexmap", + "integer-sqrt", + "num-traits", + "rustc-hash", + "thiserror", +] + [[package]] name = "predicates" version = "3.0.4" @@ -383,18 +463,18 @@ dependencies = [ [[package]] name = "proc-macro2" -version = "1.0.69" +version = "1.0.81" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "134c189feb4956b20f6f547d2cf727d4c0fe06722b20a0eec87ed445a97f92da" +checksum = "3d1597b0c024618f09a9c3b8655b7e430397a36d23fdafec26d6965e9eec3eba" dependencies = [ "unicode-ident", ] [[package]] name = "quote" -version = "1.0.33" +version = "1.0.36" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5267fca4496028628a95160fc423a33e8b2e6af8a5302579e322e4b520293cae" +checksum = "0fa76aaf39101c457836aec0ce2316dbdc3ab723cdda1c6bd4e6ad4208acaca7" dependencies = [ "proc-macro2", ] @@ -422,6 +502,12 @@ version = "0.6.29" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f162c6dd7b008981e4d40210aca20b4bd0f9b60ca9271061b07f78537722f2e1" +[[package]] +name = "rustc-hash" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" + [[package]] name = "rustix" version = "0.38.28" @@ -435,6 +521,12 @@ dependencies = [ "windows-sys 0.52.0", ] +[[package]] +name = "semver" +version = "1.0.22" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "92d43fe69e652f3df9bdc2b85b2854a0825b86e4fb76bc44d945137d053639ca" + [[package]] name = "serde" version = "1.0.193" @@ -463,9 +555,9 @@ checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623" [[package]] name = "syn" -version = "2.0.39" +version = "2.0.60" source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "23e78b90f2fcf45d3e842032ce32e3f2d1545ba6636271dcbf24fa306d87be7a" +checksum = "909518bc7b1c9b779f1bbf07f2929d35af9f0f37e47c6e9ef7f9dddc1e1821f3" dependencies = [ "proc-macro2", "quote", @@ -487,6 +579,26 @@ version = "0.4.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3369f5ac52d5eb6ab48c6b4ffdc8efbcad6b89c765749064ba298f2c68a16a76" +[[package]] +name = "thiserror" +version = "1.0.59" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f0126ad08bff79f29fc3ae6a55cc72352056dfff61e3ff8bb7129476d44b23aa" +dependencies = [ + "thiserror-impl", +] + +[[package]] +name = "thiserror-impl" +version = "1.0.59" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d1cd413b5d558b4c5bf3680e324a6fa5014e7b7c067a51e69dbdf47eb7148b66" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "tree-sitter" version = "0.20.9" diff --git a/bin/tests/scenarios/unordered_with_non_labelled/base.java b/bin/tests/scenarios/unordered_with_non_labelled/base.java new file mode 100644 index 0000000..723dfb4 --- /dev/null +++ b/bin/tests/scenarios/unordered_with_non_labelled/base.java @@ -0,0 +1,5 @@ +public class Main { + static { + System.out.println("I'm a static block"); + } +} diff --git a/bin/tests/scenarios/unordered_with_non_labelled/left.java b/bin/tests/scenarios/unordered_with_non_labelled/left.java new file mode 100644 index 0000000..8ea28e6 --- /dev/null +++ b/bin/tests/scenarios/unordered_with_non_labelled/left.java @@ -0,0 +1,13 @@ +public class Main { + static { + int x = 0; + } + + static { + System.out.println("I'm a static block"); + } + + public Main() { + System.out.println("I'm a constructor"); + } +} diff --git a/bin/tests/scenarios/unordered_with_non_labelled/merge.java b/bin/tests/scenarios/unordered_with_non_labelled/merge.java new file mode 100644 index 0000000..263386c --- /dev/null +++ b/bin/tests/scenarios/unordered_with_non_labelled/merge.java @@ 
-0,0 +1 @@ + public class Main { static { int <<<<<<<<< x ========= y >>>>>>>>> = <<<<<<<<< 0 ========= 2 >>>>>>>>> ; } static { System . out . println ( "I'm a static block" ) ; } public Main ( ) { System . out . println ( "I'm a constructor" ) ; int y = 3 ; } static { System . out . println ( "I don't know what's going on" ) ; } } diff --git a/bin/tests/scenarios/unordered_with_non_labelled/right.java b/bin/tests/scenarios/unordered_with_non_labelled/right.java new file mode 100644 index 0000000..d58f44b --- /dev/null +++ b/bin/tests/scenarios/unordered_with_non_labelled/right.java @@ -0,0 +1,18 @@ +public class Main { + static { + System.out.println("I'm a static block"); + } + + static { + int y = 2; + } + + static { + System.out.println("I don't know what's going on"); + } + + public Main() { + System.out.println("I'm a constructor"); + int y = 3; + } +} diff --git a/matching/Cargo.toml b/matching/Cargo.toml index 69adda6..8058d77 100644 --- a/matching/Cargo.toml +++ b/matching/Cargo.toml @@ -10,6 +10,7 @@ model = { path = "../model" } matching_handlers = { path = "../matching_handlers" } unordered-pair = "0.2.4" log = { workspace = true } +pathfinding = "4.9.1" [dev-dependencies] uuid = { workspace = true } diff --git a/matching/src/lib.rs b/matching/src/lib.rs index 8c47c9a..3fe849c 100644 --- a/matching/src/lib.rs +++ b/matching/src/lib.rs @@ -1,3 +1,4 @@ +mod unordered; mod matching; mod matching_entry; mod matchings; @@ -24,7 +25,7 @@ pub fn calculate_matchings<'a>( ) => { if non_terminal_left.are_children_unordered && non_terminal_right.are_children_unordered { - unordered_tree_matching::unordered_tree_matching(left, right, matching_handlers) + unordered::assignment_problem::calculate_matchings(left, right, matching_handlers) } else { ordered_tree_matching::ordered_tree_matching(left, right, matching_handlers) } diff --git a/matching/src/unordered/assignment_problem.rs b/matching/src/unordered/assignment_problem.rs new file mode 100644 index 
0000000..6670916 --- /dev/null +++ b/matching/src/unordered/assignment_problem.rs @@ -0,0 +1,95 @@ +use std::cmp::max; + +use matching_handlers::MatchingHandlers; +use model::{cst_node::NonTerminal, CSTNode}; +use pathfinding::{kuhn_munkres::Weights, matrix}; +use unordered_pair::UnorderedPair; + +use crate::{MatchingEntry, Matchings}; + +pub fn calculate_matchings<'a>( + left: &'a CSTNode, + right: &'a CSTNode, + matching_handlers: &'a MatchingHandlers<'a>, +) -> crate::Matchings<'a> { + match (left, right) { + ( + CSTNode::NonTerminal(NonTerminal { + kind: kind_left, + children: children_left, + .. + }), + CSTNode::NonTerminal(NonTerminal { + kind: kind_right, + children: children_right, + .. + }), + ) => { + if kind_left != kind_right { + return Matchings::empty(); + } + + let children_matchings = children_left + .iter() + .map(|left_child| { + children_right + .iter() + .map(|right_child| { + let w = crate::calculate_matchings(left_child, right_child, matching_handlers); + let matching = w + .get_matching_entry(left_child, right_child) + .unwrap_or_default(); + (matching.score, w) + }) + .collect() + }) + .collect(); + + solve_assignment_problem(left, right, children_matchings) + } + (_, _) => unreachable!( + "Unordered matching must never be called if the nodes are not NonTerminals." 
+ ), + } +} + +fn solve_assignment_problem<'a>( + left: &'a CSTNode, + right: &'a CSTNode, + children_matchings: Vec)>> +) -> Matchings<'a> { + let m = children_matchings.len(); + let n = children_matchings[0].len(); + let max_size = max(m, n); + + let mut matrix: Vec> = vec![vec![0; max_size]; max_size]; + for i in 0..m { + for j in 0..n { + matrix[i][j] = children_matchings[i][j].0.try_into().unwrap(); + } + } + + let weights_matrix = matrix::Matrix::from_rows(matrix) + .expect("Could not build weights matrix for assignment problem."); + let (max_matching, best_matches) = pathfinding::kuhn_munkres::kuhn_munkres(&weights_matrix); + + let mut result = Matchings::empty(); + + for i in 0..best_matches.len() { + let j = best_matches[i]; + let cur_matching = weights_matrix.at(i, j); + if cur_matching > 0 { + result.extend(children_matchings[i][j].1.clone()); + } + } + + result.extend(Matchings::from_single( + UnorderedPair(left, right), + MatchingEntry { + score: max_matching as usize + 1, + is_perfect_match: left.contents() == right.contents(), + }, + )); + + result +} diff --git a/matching/src/unordered/mod.rs b/matching/src/unordered/mod.rs new file mode 100644 index 0000000..d98f2cd --- /dev/null +++ b/matching/src/unordered/mod.rs @@ -0,0 +1 @@ +pub mod assignment_problem; \ No newline at end of file