From 8f6aa4cead9fe8da8d8c4ed1b3876893c7e1c00c Mon Sep 17 00:00:00 2001
From: Devin Jeanpierre <jeanpierreda@google.com>
Date: Fri, 22 Sep 2023 13:34:50 -0700
Subject: [PATCH] Implement a matcher for pattern matching with globs.

This is the first step to implementing wildcard `$foo...` support in --mode=py.expr/py.stmt ([#6](https://github.com/ssbr/refex/issues/6)).

I actually wanted to go the rest of the way with it, but may as well start sending out PRs for intermediate steps.

PiperOrigin-RevId: 567705007
---
 refex/python/matchers/base_matchers.py      | 128 +++++++++++++++++-
 refex/python/matchers/test_base_matchers.py | 142 +++++++++++++++++++-
 2 files changed, 260 insertions(+), 10 deletions(-)

diff --git a/refex/python/matchers/base_matchers.py b/refex/python/matchers/base_matchers.py
index e9b8e40..9b409c3 100644
--- a/refex/python/matchers/base_matchers.py
+++ b/refex/python/matchers/base_matchers.py
@@ -78,17 +78,13 @@
 """
 # pyformat: enable
 
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
+import collections
 import re
-from typing import Container, Dict, List, Sequence, Hashable
+from typing import Container, Dict, Hashable, Iterable, List, Sequence
 import weakref
 
 import attr
 import cached_property
-
 from refex import formatting
 from refex import match
 from refex.python import matcher
@@ -755,3 +751,123 @@ def _match(self, context, candidate):
           matcher.create_match(context.parsed_file, candidate))
     else:
       return None
+
+
+# TODO: make this public after glob support is implemented, and it's determined
+# this does the right thing.
+# In particular, at time of writing, it does completely the wrong thing with
+# bindings -- you can't add a bound GlobStar() :(
+# @matcher.safe_to_eval
+@attr.s(frozen=True)
+class GlobStar(matcher.Matcher):
+  """Matches any sequence of items in a sequence.
+
+  Only valid within :class:`Glob`; its own ``_match`` raises ``MatchError``.
+  """
+
+  def _match(self, context, candidate):
+    del context, candidate  # unused
+    # _match isn't called by Glob; it instead specially recognizes GlobStar
+    # inside its own search algorithm. A GlobStar is a bug when present in
+    # any other context.
+    raise matcher.MatchError('GlobStar used outside of Glob')
+
+
+def _blockify_glob_matchers(
+    matchers: Iterable[matcher.Matcher],
+) -> list[GlobStar | list[matcher.Matcher]]:
+  """Splits matchers into GlobStar()s and lists of adjacent non-* matchers."""
+  grouped = []
+  for item in matchers:
+    if isinstance(item, GlobStar):
+      # Every star stays a standalone entry, even next to another star.
+      grouped.append(item)
+    elif grouped and isinstance(grouped[-1], list):
+      # Extend the run of non-star matchers that is already open.
+      grouped[-1].append(item)
+    else:
+      # First non-star matcher overall, or the first one after a star.
+      grouped.append([item])
+  if not grouped:
+    # No matchers at all is represented as a single empty run.
+    grouped.append([])
+  return grouped
+
+
+# TODO: make this public after glob support is implemented (see GlobStar)
+@attr.s(frozen=True)
+class Glob(matcher.Matcher):
+  """Matches a sequence, with :class:`GlobStar` wildcards.
+
+  For example, ``Glob(['a', GlobStar(), 'b'])`` matches any sequence which
+  starts with ``'a'`` and ends with ``'b'``.
+
+  :class:`GlobStar` is only valid directly within the body of a ``Glob``.
+  """
+
+  _matchers = matcher.submatcher_list_attrib()
+
+  @cached_property.cached_property
+  def _blocked_matchers(self):
+    """The matchers, grouped into GlobStar()s and runs of other matchers."""
+    return _blockify_glob_matchers(self._matchers)
+
+  def _match(self, context, candidate):
+    """Returns a MatchInfo if candidate matches the glob, else None."""
+    if not isinstance(candidate, collections.abc.Sequence):
+      return None
+
+    # https://research.swtch.com/glob
+    #
+    # The following algorithm is courtesy of the insight (from Russ Cox and
+    # others): you can do a backtracking search to find the substrings
+    # ("blocks" here), but not backtrack past the most recent GlobStar, as long
+    # as you take the "earliest possible" choice. (The algorithm looks a bit
+    # different because we're borrowing the idea, not the code.)
+
+    # TODO: allow for ``Bind('name', GlobStar())``
+
+    is_search = False
+    pos = 0
+    bindings = {}
+    for block_i, block in enumerate(self._blocked_matchers):
+      if isinstance(block, GlobStar):
+        is_search = True
+        continue
+
+      if is_search:
+        # Searched blocks stop at the earliest match, except the last block,
+        # which must end the sequence, so jump to its only possible start --
+        # clamped to pos so it can't overlap items that were already consumed.
+        if block_i == len(self._blocked_matchers) - 1:
+          pos = max(pos, len(candidate) - len(block))
+          search_end = pos + 1
+        else:
+          search_end = len(candidate)
+      else:
+        search_end = pos + 1  # only one candidate to search.
+
+      is_search = False
+      result = None
+      for match_start in range(pos, search_end):
+        match_end = match_start + len(block)
+        result = ItemsAre(block).match(
+            context, candidate[match_start:match_end]
+        )
+        if result is not None:
+          pos = match_end
+          break
+
+      if result is None:
+        return None
+      bindings = matcher.merge_bindings(bindings, result.bindings)
+      if bindings is None:
+        return None
+    # Leftover items are only OK when the glob ends with a GlobStar.
+    if pos != len(candidate) and not is_search:
+      return None
+
+    return matcher.MatchInfo(
+        matcher.create_match(context.parsed_file, candidate),
+        bindings,
+    )
diff --git a/refex/python/matchers/test_base_matchers.py b/refex/python/matchers/test_base_matchers.py
index 1baf1ac..2815485 100644
--- a/refex/python/matchers/test_base_matchers.py
+++ b/refex/python/matchers/test_base_matchers.py
@@ -15,10 +15,6 @@
 # python3 python2
 """Tests for refex.python.matchers.base_matchers."""
 
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
 import ast
 from unittest import mock
 
@@ -758,5 +754,143 @@ def test_match_lines(self):
         ['c = d', 'g = h'])
 
 
+class GlobTest(parameterized.TestCase):
+
+  @parameterized.parameters(['abc'], [['a', 'b', 'c']])  # str and list inputs
+  def test_sequence(self, abc_seq):
+    self.assertIsNotNone(
+        base_matchers.Glob(['a', 'b', 'c']).match(_FAKE_CONTEXT, abc_seq)
+    )
+
+  @parameterized.parameters(
+      'prefix_abc',
+      'abc_suffix',
+      '',
+      'axc',
+  )
+  def test_sequence_nomatch(self, not_abc):
+    self.assertIsNone(
+        base_matchers.Glob(['a', 'b', 'c']).match(_FAKE_CONTEXT, not_abc)
+    )
+
+  def test_empty(self):
+    empty_glob = base_matchers.Glob([])
+    self.assertIsNotNone(empty_glob.match(_FAKE_CONTEXT, ''))
+    self.assertIsNone(empty_glob.match(_FAKE_CONTEXT, 'x'))  # leftover item
+
+  @parameterized.parameters(
+      '',
+      'x',
+  )
+  def test_star(self, seq):
+    self.assertIsNotNone(
+        base_matchers.Glob([base_matchers.GlobStar()]).match(_FAKE_CONTEXT, seq)
+    )
+    self.assertIsNotNone(
+        base_matchers.Glob(
+            [base_matchers.GlobStar(), base_matchers.GlobStar()]
+        ).match(_FAKE_CONTEXT, seq)
+    )
+
+  @parameterized.parameters(
+      'ab',
+      'abab',
+  )
+  def test_prefix_star(self, seq):
+    self.assertIsNotNone(
+        base_matchers.Glob(['a', 'b', base_matchers.GlobStar()]).match(
+            _FAKE_CONTEXT, seq
+        )
+    )
+
+  @parameterized.parameters(
+      '',
+      'ba',
+  )
+  def test_prefix_star_nomatch(self, seq):
+    self.assertIsNone(
+        base_matchers.Glob(['a', base_matchers.GlobStar()]).match(
+            _FAKE_CONTEXT, seq
+        )
+    )
+
+  @parameterized.parameters(
+      'ab',
+      'abab',
+  )
+  def test_star_suffix(self, seq):
+    self.assertIsNotNone(
+        base_matchers.Glob([base_matchers.GlobStar(), 'a', 'b']).match(
+            _FAKE_CONTEXT, seq
+        )
+    )
+
+  def test_backtracking(self):
+    self.assertIsNotNone(
+        base_matchers.Glob([
+            base_matchers.GlobStar(),
+            'a',
+            'b',
+            base_matchers.GlobStar(),
+            'c',
+            base_matchers.GlobStar(),
+        ]).match(_FAKE_CONTEXT, 'abcab')  # 'ab' occurs both before and after 'c'
+    )
+
+  @parameterized.parameters(
+      '',
+      'ab',
+  )
+  def test_star_suffix_nomatch(self, seq):
+    self.assertIsNone(
+        base_matchers.Glob([base_matchers.GlobStar(), 'a']).match(
+            _FAKE_CONTEXT, seq
+        )
+    )
+
+  @parameterized.parameters(
+      'abcd',
+      'a  bcd',
+      'abc  d',
+      'a  bc  d',
+  )
+  def test_sandwich(self, seq):
+    glob = base_matchers.Glob([
+        'a',
+        base_matchers.GlobStar(),
+        'b',
+        'c',
+        base_matchers.GlobStar(),
+        'd',
+    ])
+    self.assertIsNotNone(glob.match(_FAKE_CONTEXT, seq))
+
+  def test_greediness(self):
+    """GlobStar matches as little as possible, so that it can match in O(n).
+
+    That is, the algorithm for globbing a*bc*d... is linear if we try to match
+    a, then the _earliest possible_ bc, then d, ...
+    """
+    glob = base_matchers.Glob([
+        base_matchers.Bind('prefix', base_matchers.Anything()),
+        base_matchers.GlobStar(),
+        base_matchers.Bind('middle', base_matchers.Anything()),
+        base_matchers.GlobStar(),
+        base_matchers.Bind('end', base_matchers.Anything()),
+    ])
+    m = glob.match(_FAKE_CONTEXT, 'abcdefg')
+    self.assertIsNotNone(m)
+    self.assertEqual(m.match, match.StringMatch('abcdefg'))
+    bindings = {k: v.value.string for k, v in m.bindings.items()}
+    self.assertEqual(
+        bindings,
+        {
+            'prefix': 'a',
+            'middle': 'b',  # earliest position after the prefix
+            'end': 'g',  # the final block is anchored to the end
+        },
+    )
+
+
 if __name__ == '__main__':
   absltest.main()