From 8f6aa4cead9fe8da8d8c4ed1b3876893c7e1c00c Mon Sep 17 00:00:00 2001
From: Devin Jeanpierre
Date: Fri, 22 Sep 2023 13:34:50 -0700
Subject: [PATCH] Implement a matcher for pattern matching with globs.

This is the first step to implementing wildcard `$foo...` support in
--mode=py.expr/py.stmt ([#6](https://github.com/ssbr/refex/issues/6)).
I actually wanted to go the rest of the way with it, but may as well start
sending out PRs for intermediate steps.

PiperOrigin-RevId: 567705007
---
 refex/python/matchers/base_matchers.py      | 141 +++++++++++++++++-
 refex/python/matchers/test_base_matchers.py | 150 +++++++++++++++++++-
 2 files changed, 281 insertions(+), 10 deletions(-)

diff --git a/refex/python/matchers/base_matchers.py b/refex/python/matchers/base_matchers.py
index e9b8e40..9b409c3 100644
--- a/refex/python/matchers/base_matchers.py
+++ b/refex/python/matchers/base_matchers.py
@@ -78,17 +78,13 @@
 """
 # pyformat: enable
 
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
+import collections
 import re
-from typing import Container, Dict, List, Sequence, Hashable
+from typing import Container, Dict, Hashable, Iterable, List, Sequence
 import weakref
 
 import attr
 import cached_property
-
 from refex import formatting
 from refex import match
 from refex.python import matcher
@@ -755,3 +751,136 @@ def _match(self, context, candidate):
           matcher.create_match(context.parsed_file, candidate))
     else:
       return None
+
+
+# TODO: make this public after glob support is implemented, and it's determined
+# this does the right thing.
+# In particular, at time of writing, it does completely the wrong thing with
+# bindings -- you can't add a bound GlobStar() :(
+# @matcher.safe_to_eval
+@attr.s(frozen=True)
+class GlobStar(matcher.Matcher):
+  """Matches any sequence of items in a sequence.
+
+  Only valid within :class:`Glob`.
+  """
+
+  def _match(self, context, candidate):
+    del context, candidate  # unused
+    # _match isn't called by Glob; it instead specially recognizes GlobStar
+    # inside its own search algorithm. GlobStar is a bug when present in
+    # any other context.
+    raise matcher.MatchError('GlobStar used outside of Glob')
+
+
+def _blockify_glob_matchers(
+    matchers: Iterable[matcher.Matcher],
+) -> list[GlobStar | list[matcher.Matcher]]:
+  """Splits matchers into GlobStar()s and blocks of consecutive non-* matchers.
+
+  Args:
+    matchers: the submatchers of a Glob, in order.
+
+  Returns:
+    A list whose elements are either a GlobStar (one per GlobStar in the
+    input) or a list of the consecutive non-GlobStar matchers found between
+    them. The result is never empty: an empty input yields ``[[]]``.
+  """
+  blocks = []
+  current = []
+  for m in matchers:
+    if isinstance(m, GlobStar):
+      if current:
+        blocks.append(current)
+      current = m
+    else:
+      if isinstance(current, GlobStar):
+        blocks.append(current)
+        current = []
+      current.append(m)
+  blocks.append(current)
+
+  return blocks
+
+
+# TODO: make this public after glob support is implemented (see GlobStar)
+@attr.s(frozen=True)
+class Glob(matcher.Matcher):
+  """Matches a sequence, with :class:`GlobStar` wildcards.
+
+  For example, ``Glob(['a', GlobStar(), 'b'])`` matches any sequence which
+  starts with ``'a'`` and ends with ``'b'``.
+
+  :class:`GlobStar` is only valid directly within the body of a `Glob`.
+  """
+
+  _matchers = matcher.submatcher_list_attrib()
+
+  @cached_property.cached_property
+  def _blocked_matchers(self):
+    return _blockify_glob_matchers(self._matchers)
+
+  def _match(self, context, candidate):
+    if not isinstance(candidate, collections.abc.Sequence):
+      # No match is reported as None, like every other exit below.
+      return None
+
+    # https://research.swtch.com/glob
+    #
+    # The following algorithm is courtesy of the insight (from Russ Cox and
+    # others): you can do a backtracking search to find the substrings
+    # ("blocks" here), but not backtrack past the most recent GlobStar, as long
+    # as you take the "earliest possible" choice.
+    #
+    # (The algorithm looks a bit different because we're borrowing the idea,
+    # not the code.)
+
+    # TODO: allow for ``Bind('name', GlobStar())``
+
+    is_search = False
+    pos = 0
+    bindings = {}
+    for block_i, block in enumerate(self._blocked_matchers):
+      if isinstance(block, GlobStar):
+        is_search = True
+        continue
+
+      if is_search:
+        # searched blocks can terminate at the earliest possible point, with the
+        # sole exception of the last block. We handle that by moving the search
+        # to the one place that it could possibly match, for free performance.
+        if block_i == len(self._blocked_matchers) - 1:
+          # Never move backwards: the final block must not overlap items that
+          # an earlier block consumed (``ab*bc`` must not match ``abc``).
+          pos = max(pos, len(candidate) - len(block))
+          search_end = pos + 1
+        else:
+          search_end = len(candidate)
+      else:
+        search_end = pos + 1  # only one candidate to search.
+
+      is_search = False
+      result = None
+      for match_start in range(pos, search_end):
+        match_end = match_start + len(block)
+        result = ItemsAre(block).match(
+            context, candidate[match_start:match_end]
+        )
+        if result is not None:
+          pos = match_end
+          break
+
+      if result is None:
+        return None
+      bindings = matcher.merge_bindings(bindings, result.bindings)
+      if bindings is None:
+        return None
+
+    # A trailing GlobStar (is_search still set) absorbs any remaining items.
+    if pos != len(candidate) and not is_search:
+      return None
+
+    return matcher.MatchInfo(
+        matcher.create_match(context.parsed_file, candidate),
+        bindings,
+    )
diff --git a/refex/python/matchers/test_base_matchers.py b/refex/python/matchers/test_base_matchers.py
index 1baf1ac..2815485 100644
--- a/refex/python/matchers/test_base_matchers.py
+++ b/refex/python/matchers/test_base_matchers.py
@@ -15,10 +15,6 @@
 # python3 python2
 """Tests for refex.python.matchers.base_matchers."""
 
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
 import ast
 from unittest import mock
 
@@ -758,5 +754,151 @@ def test_match_lines(self):
         ['c = d', 'g = h'])
 
 
+class GlobTest(parameterized.TestCase):
+
+  @parameterized.parameters(['abc'], [['a', 'b', 'c']])
+  def test_sequence(self, abc_seq):
+    self.assertIsNotNone(
+        base_matchers.Glob(['a', 'b', 'c']).match(_FAKE_CONTEXT, abc_seq)
+    )
+
+  @parameterized.parameters(
+      'prefix_abc',
+      'abc_suffix',
+      '',
+      'axc',
+  )
+  def test_sequence_nomatch(self, not_abc):
+    self.assertIsNone(
+        base_matchers.Glob(['a', 'b', 'c']).match(_FAKE_CONTEXT, not_abc)
+    )
+
+  def test_empty(self):
+    empty_glob = base_matchers.Glob([])
+    self.assertIsNotNone(empty_glob.match(_FAKE_CONTEXT, ''))
+    self.assertIsNone(empty_glob.match(_FAKE_CONTEXT, 'x'))
+
+  def test_non_sequence(self):
+    self.assertIsNone(base_matchers.Glob([]).match(_FAKE_CONTEXT, 1))
+
+  @parameterized.parameters(
+      '',
+      'x',
+  )
+  def test_star(self, seq):
+    self.assertIsNotNone(
+        base_matchers.Glob([base_matchers.GlobStar()]).match(_FAKE_CONTEXT, seq)
+    )
+    self.assertIsNotNone(
+        base_matchers.Glob(
+            [base_matchers.GlobStar(), base_matchers.GlobStar()]
+        ).match(_FAKE_CONTEXT, seq)
+    )
+
+  @parameterized.parameters(
+      'ab',
+      'abab',
+  )
+  def test_prefix_star(self, seq):
+    self.assertIsNotNone(
+        base_matchers.Glob(['a', 'b', base_matchers.GlobStar()]).match(
+            _FAKE_CONTEXT, seq
+        )
+    )
+
+  @parameterized.parameters(
+      '',
+      'ba',
+  )
+  def test_prefix_star_nomatch(self, seq):
+    self.assertIsNone(
+        base_matchers.Glob(['a', base_matchers.GlobStar()]).match(
+            _FAKE_CONTEXT, seq
+        )
+    )
+
+  @parameterized.parameters(
+      'ab',
+      'abab',
+  )
+  def test_star_suffix(self, seq):
+    self.assertIsNotNone(
+        base_matchers.Glob([base_matchers.GlobStar(), 'a', 'b']).match(
+            _FAKE_CONTEXT, seq
+        )
+    )
+
+  def test_backtracking(self):
+    self.assertIsNotNone(
+        base_matchers.Glob([
+            base_matchers.GlobStar(),
+            'a',
+            'b',
+            base_matchers.GlobStar(),
+            'c',
+            base_matchers.GlobStar(),
+        ]).match(_FAKE_CONTEXT, 'abcab')
+    )
+
+  @parameterized.parameters(
+      '',
+      'ab',
+  )
+  def test_star_suffix_nomatch(self, seq):
+    self.assertIsNone(
+        base_matchers.Glob([base_matchers.GlobStar(), 'a']).match(
+            _FAKE_CONTEXT, seq
+        )
+    )
+
+  @parameterized.parameters(
+      'abcd',
+      'a bcd',
+      'abc d',
+      'a bc d',
+  )
+  def test_sandwich(self, seq):
+    glob = base_matchers.Glob([
+        'a',
+        base_matchers.GlobStar(),
+        'b',
+        'c',
+        base_matchers.GlobStar(),
+        'd',
+    ])
+    self.assertIsNotNone(glob.match(_FAKE_CONTEXT, seq))
+
+  def test_suffix_does_not_overlap_prefix(self):
+    glob = base_matchers.Glob(['a', 'b', base_matchers.GlobStar(), 'b', 'c'])
+    self.assertIsNone(glob.match(_FAKE_CONTEXT, 'abc'))
+    self.assertIsNotNone(glob.match(_FAKE_CONTEXT, 'abbc'))
+
+  def test_greediness(self):
+    """GlobStar matches as little as possible, so that it can match in O(n).
+
+    That is, the algorithm for globbing a*bc*d... is linear if we try to match
+    a, then the _earliest possible_ bc, then d, ...
+    """
+    glob = base_matchers.Glob([
+        base_matchers.Bind('prefix', base_matchers.Anything()),
+        base_matchers.GlobStar(),
+        base_matchers.Bind('middle', base_matchers.Anything()),
+        base_matchers.GlobStar(),
+        base_matchers.Bind('end', base_matchers.Anything()),
+    ])
+    m = glob.match(_FAKE_CONTEXT, 'abcdefg')
+    self.assertIsNotNone(m)
+    self.assertEqual(m.match, match.StringMatch('abcdefg'))
+    bindings = {k: v.value.string for k, v in m.bindings.items()}
+    self.assertEqual(
+        bindings,
+        {
+            'prefix': 'a',
+            'middle': 'b',
+            'end': 'g',
+        },
+    )
+
+
 if __name__ == '__main__':
   absltest.main()