Skip to content

Commit

Permalink
Implement a matcher for pattern matching with globs.
Browse files Browse the repository at this point in the history
This is the first step to implementing wildcard `$foo...` support in --mode=py.expr/py.stmt ([#6](#6)).

I actually wanted to go the rest of the way with it, but may as well start sending out PRs for intermediate steps.

PiperOrigin-RevId: 567705007
  • Loading branch information
ssbr authored and copybara-github committed Sep 22, 2023
1 parent 1657b45 commit 8f6aa4c
Show file tree
Hide file tree
Showing 2 changed files with 260 additions and 10 deletions.
128 changes: 122 additions & 6 deletions refex/python/matchers/base_matchers.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,17 +78,13 @@
"""
# pyformat: enable

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import collections
import re
from typing import Container, Dict, List, Sequence, Hashable
from typing import Container, Dict, Hashable, Iterable, List, Sequence
import weakref

import attr
import cached_property

from refex import formatting
from refex import match
from refex.python import matcher
Expand Down Expand Up @@ -755,3 +751,123 @@ def _match(self, context, candidate):
matcher.create_match(context.parsed_file, candidate))
else:
return None


# TODO: make this public after glob support is implemented, and it's determined
# this does the right thing.
# In particular, at time of writing, it does completely the wrong thing with
# bindings -- you can't add a bound GlobStar() :(
# @matcher.safe_to_eval
@attr.s(frozen=True)
class GlobStar(matcher.Matcher):
  """Wildcard standing for any run of items inside a sequence.

  Only valid as a direct element of a :class:`Glob`.
  """

  def _match(self, context, candidate):
    """Always raises: a GlobStar cannot be matched on its own.

    Glob does not call this method; it recognizes GlobStar instances itself
    as part of its own search algorithm. Reaching this code therefore means a
    GlobStar was used somewhere other than directly inside a Glob, which is a
    bug in the caller.

    Raises:
      matcher.MatchError: unconditionally.
    """
    del context, candidate  # unused
    message = 'GlobStar used outside of Glob'
    raise matcher.MatchError(message)


def _blockify_glob_matchers(
    matchers: Iterable[matcher.Matcher],
) -> list[GlobStar | list[matcher.Matcher]]:
  """Groups matchers into GlobStar() items and runs of non-star matchers.

  Every GlobStar in the input becomes its own entry in the result, and each
  maximal run of consecutive non-GlobStar matchers becomes a single list
  entry, in the original order. An empty input yields ``[[]]``.

  Args:
    matchers: The matchers to group.

  Returns:
    A list whose entries are either a GlobStar or a list of matchers.
  """
  grouped = []
  # Either the list run currently being built, or the most recent GlobStar
  # that has not been emitted yet.
  pending = []
  for item in matchers:
    if not isinstance(item, GlobStar):
      if isinstance(pending, GlobStar):
        # The star is finished; emit it and start a fresh run.
        grouped.append(pending)
        pending = []
      pending.append(item)
      continue

    # A star: emit whatever came before it. An empty run is falsy and is
    # dropped here; a preceding GlobStar is emitted too (this relies on
    # GlobStar instances being truthy -- TODO confirm Matcher defines no
    # __bool__/__len__).
    if pending:
      grouped.append(pending)
    pending = item

  # Whatever is still pending (possibly an empty list) closes the result.
  grouped.append(pending)
  return grouped


# TODO: make this public after glob support is implemented (see GlobStar)
@attr.s(frozen=True)
class Glob(matcher.Matcher):
  """Matches a sequence, with :class:`GlobStar` wildcards.

  For example, ``Glob(['a', GlobStar(), 'b'])`` matches any sequence which
  starts with ``'a'`` and ends with ``'b'``.

  :class:`GlobStar` is only valid directly within the body of a ``Glob``.
  """

  _matchers = matcher.submatcher_list_attrib()

  @cached_property.cached_property
  def _blocked_matchers(self):
    """The submatchers grouped into GlobStar()s and runs of other matchers."""
    return _blockify_glob_matchers(self._matchers)

  def _match(self, context, candidate):
    """Matches ``candidate`` against the glob pattern.

    Args:
      context: The match context, forwarded to the submatchers.
      candidate: The object to match. Only instances of
        ``collections.abc.Sequence`` can match.

    Returns:
      A ``matcher.MatchInfo`` spanning the whole candidate and carrying the
      merged bindings of every matched block, or ``None`` if nothing matches.
    """
    if not isinstance(candidate, collections.abc.Sequence):
      # "No match" is None everywhere else in this method; returning False
      # here would be mistaken for a match by ``is not None`` checks.
      return None

    # https://research.swtch.com/glob
    #
    # The following algorithm is courtesy of the insight (from Russ Cox and
    # others): you can do a backtracking search to find the substrings
    # ("blocks" here), but not backtrack past the most recent GlobStar, as long
    # as you take the "earliest possible" choice.
    #
    # (The algorithm looks a bit different because we're borrowing the idea,
    # not the code.)

    # TODO: allow for ``Bind('name', GlobStar())``

    blocks = self._blocked_matchers
    last_block_i = len(blocks) - 1

    # True while the previous block was a GlobStar, i.e. the next block may
    # start anywhere at or after ``pos`` instead of exactly at ``pos``.
    is_search = False
    # Index of the first candidate item not yet consumed by a matched block.
    pos = 0
    bindings = {}
    for block_i, block in enumerate(blocks):
      if isinstance(block, GlobStar):
        is_search = True
        continue

      if is_search:
        # searched blocks can terminate at the earliest possible point, with the
        # sole exception of the last block. We handle that by moving the search
        # to the one place that it could possibly match, for free performance.
        if block_i == last_block_i:
          tail_start = len(candidate) - len(block)
          if tail_start < pos:
            # The final block would have to overlap items already consumed by
            # earlier blocks (or start before the candidate), e.g. matching
            # ['a', GlobStar(), 'a'] against 'a'. That is not a match.
            return None
          pos = tail_start
          search_end = pos + 1
        else:
          search_end = len(candidate)
      else:
        search_end = pos + 1  # only one candidate to search.

      is_search = False
      result = None
      for match_start in range(pos, search_end):
        match_end = match_start + len(block)
        result = ItemsAre(block).match(
            context, candidate[match_start:match_end]
        )
        if result is not None:
          pos = match_end
          break

      if result is None:
        return None
      # NOTE: a binding conflict fails the whole match; later start positions
      # for this block are not retried.
      bindings = matcher.merge_bindings(bindings, result.bindings)
      if bindings is None:
        return None

    # Leftover items are only acceptable when the pattern ends in a GlobStar,
    # which is the one case where is_search is still set after the loop.
    if pos != len(candidate) and not is_search:
      return None

    return matcher.MatchInfo(
        matcher.create_match(context.parsed_file, candidate),
        bindings,
    )
142 changes: 138 additions & 4 deletions refex/python/matchers/test_base_matchers.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,6 @@
# python3 python2
"""Tests for refex.python.matchers.base_matchers."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import ast
from unittest import mock

Expand Down Expand Up @@ -758,5 +754,143 @@ def test_match_lines(self):
['c = d', 'g = h'])


class GlobTest(parameterized.TestCase):
  """Tests for ``base_matchers.Glob`` and ``base_matchers.GlobStar``."""

  @parameterized.parameters(['abc'], [['a', 'b', 'c']])
  def test_sequence(self, abc_seq):
    """A star-free glob matches a str and a list holding the same items."""
    glob = base_matchers.Glob(['a', 'b', 'c'])
    self.assertIsNotNone(glob.match(_FAKE_CONTEXT, abc_seq))

  @parameterized.parameters(
      'prefix_abc',
      'abc_suffix',
      '',
      'axc',
  )
  def test_sequence_nomatch(self, not_abc):
    """A star-free glob must match the entire candidate."""
    glob = base_matchers.Glob(['a', 'b', 'c'])
    self.assertIsNone(glob.match(_FAKE_CONTEXT, not_abc))

  def test_empty(self):
    """The empty glob matches only the empty sequence."""
    empty_glob = base_matchers.Glob([])
    matched = empty_glob.match(_FAKE_CONTEXT, '')
    unmatched = empty_glob.match(_FAKE_CONTEXT, 'x')
    self.assertIsNotNone(matched)
    self.assertIsNone(unmatched)

  @parameterized.parameters(
      '',
      'x',
  )
  def test_star(self, seq):
    """One star, or two stars in a row, match the candidate."""
    star = base_matchers.GlobStar
    single = base_matchers.Glob([star()])
    self.assertIsNotNone(single.match(_FAKE_CONTEXT, seq))
    double = base_matchers.Glob([star(), star()])
    self.assertIsNotNone(double.match(_FAKE_CONTEXT, seq))

  @parameterized.parameters(
      'ab',
      'abab',
  )
  def test_prefix_star(self, seq):
    """A fixed prefix followed by a star."""
    glob = base_matchers.Glob(['a', 'b', base_matchers.GlobStar()])
    self.assertIsNotNone(glob.match(_FAKE_CONTEXT, seq))

  @parameterized.parameters(
      '',
      'ba',
  )
  def test_prefix_star_nomatch(self, seq):
    """The fixed prefix has to be at the very start of the candidate."""
    glob = base_matchers.Glob(['a', base_matchers.GlobStar()])
    self.assertIsNone(glob.match(_FAKE_CONTEXT, seq))

  @parameterized.parameters(
      'ab',
      'abab',
  )
  def test_star_suffix(self, seq):
    """A star followed by a fixed suffix."""
    glob = base_matchers.Glob([base_matchers.GlobStar(), 'a', 'b'])
    self.assertIsNotNone(glob.match(_FAKE_CONTEXT, seq))

  def test_backtracking(self):
    """Fixed blocks between stars are found inside the candidate."""
    star = base_matchers.GlobStar
    pattern = [
        star(),
        'a',
        'b',
        star(),
        'c',
        star(),
    ]
    glob = base_matchers.Glob(pattern)
    self.assertIsNotNone(glob.match(_FAKE_CONTEXT, 'abcab'))

  @parameterized.parameters(
      '',
      'ab',
  )
  def test_star_suffix_nomatch(self, seq):
    """The fixed suffix has to be at the very end of the candidate."""
    glob = base_matchers.Glob([base_matchers.GlobStar(), 'a'])
    self.assertIsNone(glob.match(_FAKE_CONTEXT, seq))

  @parameterized.parameters(
      'abcd',
      'a bcd',
      'abc d',
      'a bc d',
  )
  def test_sandwich(self, seq):
    """Fixed blocks at both ends and in the middle, separated by stars."""
    star = base_matchers.GlobStar
    pattern = [
        'a',
        star(),
        'b',
        'c',
        star(),
        'd',
    ]
    glob = base_matchers.Glob(pattern)
    self.assertIsNotNone(glob.match(_FAKE_CONTEXT, seq))

  def test_greediness(self):
    """GlobStar matches as little as possible, so that it can match in O(n).

    That is, the algorithm for globbing a*bc*d... is linear if we try to match
    a, then the _earliest possible_ bc, then d, ...
    """
    star = base_matchers.GlobStar
    anything = base_matchers.Anything
    glob = base_matchers.Glob([
        base_matchers.Bind('prefix', anything()),
        star(),
        base_matchers.Bind('middle', anything()),
        star(),
        base_matchers.Bind('end', anything()),
    ])
    result = glob.match(_FAKE_CONTEXT, 'abcdefg')
    self.assertIsNotNone(result)
    self.assertEqual(result.match, match.StringMatch('abcdefg'))

    # 'middle' binds the earliest item after the prefix, while 'end' is pinned
    # to the final item of the candidate.
    expected = {
        'prefix': 'a',
        'middle': 'b',
        'end': 'g',
    }
    bound_strings = {
        name: bound.value.string for name, bound in result.bindings.items()
    }
    self.assertEqual(bound_strings, expected)


# Run the tests in this module when the file is executed directly.
if __name__ == '__main__':
  absltest.main()

0 comments on commit 8f6aa4c

Please sign in to comment.