From 97ee2d6f1a52f55c72ecf555388b9e507b4223a5 Mon Sep 17 00:00:00 2001 From: Devin Jeanpierre Date: Mon, 25 Sep 2023 13:57:49 -0700 Subject: [PATCH] Special-case `*$...` as equivalent to `$...`. This is non-ambiguous: a `$...` only "makes "sense" in a repeated context, and a `Starred()` ast node is not a repeated context. So we're free to interpret it in some other, magical way instead. Since we actually specifically want to support a syntax like `**$...`, `*$...` should match its semantics. PiperOrigin-RevId: 568317031 --- refex/python/matcher.py | 178 +++++++++++------- refex/python/matchers/ast_matchers.py | 16 ++ refex/python/matchers/base_matchers.py | 25 ++- refex/python/matchers/syntax_matchers.py | 73 ++++--- refex/python/matchers/test_base_matchers.py | 20 +- refex/python/matchers/test_syntax_matchers.py | 34 +++- refex/python/python_pattern.py | 114 +++++++---- refex/python/syntactic_template.py | 6 +- refex/python/test_python_pattern.py | 36 +++- refex/test_cli.py | 27 ++- 10 files changed, 367 insertions(+), 162 deletions(-) diff --git a/refex/python/matcher.py b/refex/python/matcher.py index d17c47a..6bf2e0d 100644 --- a/refex/python/matcher.py +++ b/refex/python/matcher.py @@ -203,74 +203,6 @@ def register_constant(name: str, constant: Any): registered_constants[name] = constant -def coerce(value): # Nobody uses coerce. pylint: disable=redefined-builtin - """Returns the 'intended' matcher given by `value`. - - If `value` is already a matcher, then this is what is returned. - - If `value` is anything else, then coerce returns `ImplicitEquals(value)`. - - Args: - value: Either a Matcher, or a value to compare for equality. - """ - if isinstance(value, Matcher): - return value - else: - return ImplicitEquals(value) - - -def _coerce_list(values): - return [coerce(v) for v in values] - - -# TODO(b/199577701): drop the **kwargs: Any in the *_attrib functions. - -_IS_SUBMATCHER_ATTRIB = __name__ + '._IS_SUBMATCHER_ATTRIB' -_IS_SUBMATCHER_LIST_ATTRIB = __name__ + '._IS_SUBMATCHER_LIST_ATTRIB' - - -def submatcher_attrib(*args, walk: bool = True, **kwargs: Any): - """Creates an attr.ib that is marked as a submatcher. - - This will cause the matcher to be automatically walked as part of the - computation of .bind_variables. Any submatcher that can introduce a binding - must be listed as a submatcher_attrib or submatcher_list_attrib. - - Args: - *args: Forwarded to attr.ib. - walk: Whether or not to walk to accumulate .bind_variables. - **kwargs: Forwarded to attr.ib. - - Returns: - An attr.ib() - """ - if walk: - kwargs.setdefault('metadata', {})[_IS_SUBMATCHER_ATTRIB] = True - kwargs.setdefault('converter', coerce) - return attr.ib(*args, **kwargs) - - -def submatcher_list_attrib(*args, walk: bool = True, **kwargs: Any): - """Creates an attr.ib that is marked as an iterable of submatchers. - - This will cause the matcher to be automatically walked as part of the - computation of .bind_variables. Any submatcher that can introduce a binding - must be listed as a submatcher_attrib or submatcher_list_attrib. - - Args: - *args: Forwarded to attr.ib. - walk: Whether or not to walk to accumulate .bind_variables. - **kwargs: Forwarded to attr.ib. - - Returns: - An attr.ib() - """ - if walk: - kwargs.setdefault('metadata', {})[_IS_SUBMATCHER_LIST_ATTRIB] = True - kwargs.setdefault('converter', _coerce_list) - return attr.ib(*args, **kwargs) - - # TODO: make MatchObject, MatchInfo, and Matcher generic, parameterized # by match type. Since pytype doesn't support generics yet, that's not an # option, but it'd greatly clarify the API by allowing us to classify matchers @@ -921,6 +853,16 @@ def bind_variables(self): type_filter = None +class ContextualMatcher(Matcher): + """A matcher which requires special understanding in context. + + By default, contextual matchers are not allowed inside of a submatcher + attribute. + To allow one, specify, for instance, + ``submatcher_attrib(contextual=MyContextualMatcher)``. + """ + + pass def accumulating_matcher(f): @@ -1026,6 +968,106 @@ class ParseError(Exception): """ +def coerce(value): # Nobody uses coerce. pylint: disable=redefined-builtin + """Returns the 'intended' matcher given by `value`. + + If `value` is already a matcher, then this is what is returned. + + If `value` is anything else, then coerce returns `ImplicitEquals(value)`. + + Args: + value: Either a Matcher, or a value to compare for equality. + """ + if isinstance(value, Matcher): + return value + else: + return ImplicitEquals(value) + + +def _coerce_list(values): + return [coerce(v) for v in values] + + +# TODO(b/199577701): drop the **kwargs: Any in the *_attrib functions. + +_IS_SUBMATCHER_ATTRIB = __name__ + '._IS_SUBMATCHER_ATTRIB' +_IS_SUBMATCHER_LIST_ATTRIB = __name__ + '._IS_SUBMATCHER_LIST_ATTRIB' + + +def _submatcher_validator(old_validator, contextual): + def validator(o: object, attribute: attr.Attribute, m: Matcher): + if isinstance(m, ContextualMatcher) and not isinstance(m, contextual): + raise TypeError( + f'Cannot use a `{m}` in `{type(o).__name__}.{attribute.name}`.' + ) + if old_validator is not None: + old_validator(o, attribute, m) + + return validator + + +def submatcher_attrib( + *args, + walk: bool = True, + contextual: type[ContextualMatcher] + | tuple[type[ContextualMatcher], ...] = (), + **kwargs: Any, +): + """Creates an attr.ib that is marked as a submatcher. + + This will cause the matcher to be automatically walked as part of the + computation of .bind_variables. Any submatcher that can introduce a binding + must be listed as a submatcher_attrib or submatcher_list_attrib. + + Args: + *args: Forwarded to attr.ib. + walk: Whether or not to walk to accumulate .bind_variables. + contextual: The contextual matcher classes to allow, if any. + **kwargs: Forwarded to attr.ib. + + Returns: + An attr.ib() + """ + if walk: + kwargs.setdefault('metadata', {})[_IS_SUBMATCHER_ATTRIB] = True + kwargs.setdefault('converter', coerce) + kwargs['validator'] = _submatcher_validator( + kwargs.get('validator'), contextual + ) + return attr.ib(*args, **kwargs) + + +def submatcher_list_attrib( + *args, + walk: bool = True, + contextual: type[ContextualMatcher] + | tuple[type[ContextualMatcher], ...] = (), + **kwargs: Any, +): + """Creates an attr.ib that is marked as an iterable of submatchers. + + This will cause the matcher to be automatically walked as part of the + computation of .bind_variables. Any submatcher that can introduce a binding + must be listed as a submatcher_attrib or submatcher_list_attrib. + + Args: + *args: Forwarded to attr.ib. + walk: Whether or not to walk to accumulate .bind_variables. + contextual: The contextual matcher classes to allow, if any. + **kwargs: Forwarded to attr.ib. + + Returns: + An attr.ib() + """ + if walk: + kwargs.setdefault('metadata', {})[_IS_SUBMATCHER_LIST_ATTRIB] = True + kwargs.setdefault('converter', _coerce_list) + kwargs['validator'] = attr.validators.deep_iterable( + _submatcher_validator(kwargs.get('validator'), contextual) + ) + return attr.ib(*args, **kwargs) + + @attr.s(frozen=True) class _CompareById: """Wrapper object to compare things by identity.""" diff --git a/refex/python/matchers/ast_matchers.py b/refex/python/matchers/ast_matchers.py index ede7ad0..b50bf90 100644 --- a/refex/python/matchers/ast_matchers.py +++ b/refex/python/matchers/ast_matchers.py @@ -206,3 +206,19 @@ def _match(self, context, candidate): (type(...),)) type_filter = frozenset({ast.Constant}) + + +##################################### +# High level AST matching overrides # +##################################### + +# Firstly, a `*$...` is always equivalent to a `$...`. + +_old_starred = Starred # pylint: disable=undefined-variable + + +def Starred(**kw): # pylint: disable=invalid-name + value = kw.get('value') + if isinstance(value, base_matchers.GlobStar): + return value + return _old_starred(**kw) diff --git a/refex/python/matchers/base_matchers.py b/refex/python/matchers/base_matchers.py index ba83d96..273e1a8 100644 --- a/refex/python/matchers/base_matchers.py +++ b/refex/python/matchers/base_matchers.py @@ -125,15 +125,17 @@ def _match(self, context, candidate): raise TestOnlyRaisedError(self.message) -@attr.s(init=False, frozen=True) +@attr.s(frozen=True) class _NAryMatcher(matcher.Matcher): """Base class for matchers which take arbitrarily many submatchers in init.""" _matchers = matcher.submatcher_list_attrib() + +# override __init__ to take *args +class _NAryMatcher(_NAryMatcher): def __init__(self, *matchers): - super(_NAryMatcher, self).__init__() - self.__dict__['_matchers'] = matchers + super().__init__(matchers) @matcher.safe_to_eval @@ -275,7 +277,6 @@ class Bind(matcher.Matcher): `, or None for the default strategy (``KEEP_LAST``). """ - _NAME_REGEX = re.compile(r'\A(?!__)[a-zA-Z_]\w*\Z') name = attr.ib() _submatcher = matcher.submatcher_attrib(default=Anything()) @@ -284,7 +285,12 @@ class Bind(matcher.Matcher): validator=attr.validators.in_(frozenset(matcher.BindConflict) | {None})) _on_merge = attr.ib( default=None, - validator=attr.validators.in_(frozenset(matcher.BindMerge) | {None})) + validator=attr.validators.in_(frozenset(matcher.BindMerge) | {None}), + ) + + # Constants go after attrs-fields to work around static analysis tooling: + # b/301979723 + _NAME_REGEX = re.compile(r'\A(?!__)[a-zA-Z_]\w*\Z') @name.validator def _name_validator(self, attribute, value): @@ -781,12 +787,15 @@ def _match(self, context, candidate): # bindings -- you can't add a bound GlobStar() :( # @matcher.safe_to_eval @attr.s(frozen=True) -class GlobStar(matcher.Matcher): +class GlobStar(matcher.ContextualMatcher): """Matches any sequence of items in a sequence. - Only valid within :class:`Glob`. + Only valid within special matchers like :class:`Glob`. """ + def __str__(self): + return '$...' + def _match(self, context, candidate): del context, candidate # unused # _match isn't called by GlobMatcher; it instead specially recognizes it @@ -827,7 +836,7 @@ class Glob(matcher.Matcher): class:`GlobStar()` is only valid directly within the body of a `Glob`. """ - _matchers = matcher.submatcher_list_attrib() + _matchers = matcher.submatcher_list_attrib(contextual=GlobStar) @cached_property.cached_property def _blocked_matchers(self): diff --git a/refex/python/matchers/syntax_matchers.py b/refex/python/matchers/syntax_matchers.py index 5f3b3a1..03d81f3 100644 --- a/refex/python/matchers/syntax_matchers.py +++ b/refex/python/matchers/syntax_matchers.py @@ -103,10 +103,6 @@ # pylint: disable=g-classes-have-attributes -from __future__ import absolute_import -from __future__ import division -from __future__ import print_function - import abc import ast import inspect @@ -125,34 +121,38 @@ from refex.python.matchers import base_matchers -def _remap_macro_variables(pattern: str) -> tuple[str, dict[str, str], set[str]]: +def _remap_macro_variables(pattern: str) -> tuple[str, dict[str, str], set[str], set[str], +]: """Renames the variables from the source pattern to give valid Python. Args: - pattern: A source pattern containing metavariables like "$foo". + pattern: A source pattern containing metavariables like ``$foo``, or + repeating metavariables like ``$foo...`` Returns: - (remapped_source, variables, anonymous_variables) + (remapped_source, variables, anonymous_variables, repeating) * remapped_source is the pattern, but with all dollar-prefixed variables replaced with unique non-dollar-prefixed versions. * variables is the mapping of the original name to the remapped name. * anonymous_variables is a set of remapped names that came from `_`. + * repeating is the set of remapped-names which are defined to repeat many times. Raises: SyntaxError: The pattern can't be parsed. """ - remapped_tokens, metavar_indices = python_pattern.token_pattern(pattern) + remapped_tokens, metavar_indices, repeating_metavar_indices = python_pattern.token_pattern(pattern) taken_tokens = { - token[1] + token.string for i, token in enumerate(remapped_tokens) if i not in metavar_indices } original_to_unique = {} anonymous_unique = set() + repeating_unique = set() - for metavar_index in metavar_indices: - metavar_token = list(remapped_tokens[metavar_index]) - variable = metavar_token[1] + for metavar_index in itertools.chain(metavar_indices, repeating_metavar_indices): + metavar_token = remapped_tokens[metavar_index] + variable = metavar_token.string if variable in original_to_unique: remapped_name = original_to_unique[variable] @@ -175,23 +175,26 @@ def _remap_macro_variables(pattern: str) -> tuple[str, dict[str, str], set[str]] else: original_to_unique[variable] = remapped_name break - metavar_token[1] = remapped_name - remapped_tokens[metavar_index] = tuple(metavar_token) + metavar_token = metavar_token._replace(string=remapped_name) + remapped_tokens[metavar_index] = metavar_token + if metavar_index in repeating_metavar_indices: + repeating_unique.add(remapped_name) return ( tokenize.untokenize(remapped_tokens), original_to_unique, anonymous_unique, + repeating_unique, ) -def _rewrite_submatchers(pattern, restrictions): +def _rewrite_submatchers(pattern: str, restrictions: dict[str, matcher.Matcher]): """Rewrites pattern/restrictions to erase metasyntactic variables. Args: pattern: a pattern containing $variables. restrictions: a dictionary of variables to submatchers. If a variable is - missing, Anything() is used instead. + not specified, Anything() is used instead. Returns: (remapped_pattern, variables, new_submatchers) @@ -200,22 +203,33 @@ def _rewrite_submatchers(pattern, restrictions): * variables is the mapping of the original name to the remapped name. * new_submatchers is a dict from remapped names to submatchers. Every non-anonymous variable is put in a Bind() node, which has a submatcher - taken from `restrictions`. + taken from ``restrictions``. + Repeated anonymous wildcards use ``GlobStar()``. Raises: KeyError: if restrictions has a key that isn't a variable name. """ - pattern, variables, anonymous_remapped = _remap_macro_variables(pattern) + pattern, variables, anonymous_remapped, repeating_remapped = _remap_macro_variables(pattern) incorrect_variables = set(restrictions) - set(variables) if incorrect_variables: raise KeyError('Some variables specified in restrictions were missing. ' 'Did you misplace a "$"? Missing variables: %r' % incorrect_variables) - submatchers = { - new_name: base_matchers.Anything() for new_name in anonymous_remapped - } + submatchers = {} + for new_name in anonymous_remapped: + if new_name in repeating_remapped: + m = base_matchers.GlobStar() + else: + m = base_matchers.Anything() + submatchers[new_name] = m + for old_name, new_name in variables.items(): + if new_name in repeating_remapped: + raise ValueError( + 'Repeated variables are not supported:' + ' use `$...` (unnamed repeated wildcard)' + f' instead of named `${old_name}...`.') submatchers[new_name] = base_matchers.Bind( old_name, restrictions.get(old_name, base_matchers.Anything()), @@ -286,21 +300,20 @@ def _ast_pattern(tree, variables): # but does that even happen IRL? # TODO: use a stack. if isinstance(tree, list): - return base_matchers.ItemsAre([_ast_pattern(e, variables) for e in tree]) + return base_matchers.Glob([_ast_pattern(e, variables) for e in tree]) if not isinstance(tree, ast.AST): # e.g. the identifier for an ast.Name. return base_matchers.Equals(tree) if isinstance(tree, ast.Name): if tree.id in variables: return variables[tree.id] - return getattr(ast_matchers, - type(tree).__name__)( - **{ - field: _ast_pattern(getattr(tree, field), variables) - for field in type(tree)._fields - # Filter out variable ctx. - if field != 'ctx' or not isinstance(tree, ast.Name) - }) + kwargs = { + field: _ast_pattern(getattr(tree, field), variables) + for field in type(tree)._fields + # Filter out variable ctx. + if field != 'ctx' or not isinstance(tree, ast.Name) + } + return getattr(ast_matchers, type(tree).__name__)(**kwargs) def _verify_variables(tree, variables): diff --git a/refex/python/matchers/test_base_matchers.py b/refex/python/matchers/test_base_matchers.py index 2815485..2978625 100644 --- a/refex/python/matchers/test_base_matchers.py +++ b/refex/python/matchers/test_base_matchers.py @@ -685,10 +685,15 @@ def test_repr(self): self.assertEqual( repr( base_matchers.RecursivelyWrapped( - ast_matchers.Num(), lambda i: ast_matchers.UnaryOp( - op=ast_matchers.Invert(), operand=i))), - 'RecursivelyWrapped(_matchers=(Num(n=Anything()),' - ' UnaryOp(op=Invert(), operand=_Recurse(...))))') + ast_matchers.Num(), + lambda i: ast_matchers.UnaryOp( + op=ast_matchers.Invert(), operand=i + ), + ) + ), + 'RecursivelyWrapped(_matchers=[Num(n=Anything()),' + ' UnaryOp(op=Invert(), operand=_Recurse(...))])', + ) def test_recursive_bindings(self): """Recursive matchers cover both recursive/base cases in .bind_variables. @@ -756,6 +761,13 @@ def test_match_lines(self): class GlobTest(parameterized.TestCase): + def test_globstar_in_bad_location(self): + with self.assertRaises(TypeError) as cm: + base_matchers.AllOf(base_matchers.GlobStar()) + self.assertIn( + 'Cannot use a `$...` in `AllOf._matchers`.', str(cm.exception) + ) + @parameterized.parameters(['abc'], [['a', 'b', 'c']]) def test_sequence(self, abc_seq): self.assertIsNotNone( diff --git a/refex/python/matchers/test_syntax_matchers.py b/refex/python/matchers/test_syntax_matchers.py index 0060fa6..f1d45b7 100644 --- a/refex/python/matchers/test_syntax_matchers.py +++ b/refex/python/matchers/test_syntax_matchers.py @@ -55,20 +55,21 @@ def test_error_spacing_line(self): def test_identity(self): self.assertEqual( - syntax_matchers._remap_macro_variables('a + b'), ('a + b', {}, set()) + syntax_matchers._remap_macro_variables('a + b'), + ('a + b', {}, set(), set()), ) def test_remap(self): self.assertEqual( syntax_matchers._remap_macro_variables('a + $b'), - ('a + gensym_b', {'b': 'gensym_b'}, set()), + ('a + gensym_b', {'b': 'gensym_b'}, set(), set()), ) def test_remap_twice(self): # But why would you _do_ this? self.assertEqual( syntax_matchers._remap_macro_variables('gensym_b + $b'), - ('gensym_b + gensym0_b', {'b': 'gensym0_b'}, set()), + ('gensym_b + gensym0_b', {'b': 'gensym0_b'}, set(), set()), ) def test_remap_doesnt_eat_tokens(self): @@ -79,7 +80,7 @@ def test_remap_doesnt_eat_tokens(self): # columns to regenerate where things should go: # 1) eating whitespace: 'gensym_ain b' # 2) leavint the $ empty and causing a pahton indent: ' gensym_a in b' - ('gensym_a in b', {'a': 'gensym_a'}, set()), + ('gensym_a in b', {'a': 'gensym_a'}, set(), set()), ) def test_remap_is_noninvasive(self): @@ -87,7 +88,7 @@ def test_remap_is_noninvasive(self): for s in ('# $cash', '"$money"'): with self.subTest(s=s): self.assertEqual( - syntax_matchers._remap_macro_variables(s), (s, {}, set()) + syntax_matchers._remap_macro_variables(s), (s, {}, set(), set()) ) @@ -196,7 +197,7 @@ def test_restrictions(self): self.assertIsNotNone(m.match(matcher.MatchContext(parsed), expr_match)) self.assertIsNone(m.match(matcher.MatchContext(parsed), expr_nomatch)) - def test_repeated_variable(self): + def test_reused_variable(self): self.assertEqual( self.get_all_match_strings( syntax_matchers.ExprPattern('$x + $x'), @@ -217,6 +218,27 @@ def test_variable_conflict(self): syntax_matchers.ExprPattern('$x'), base_matchers.Bind('x')), '1'), ['1']) + def test_repeated_wildcard_in_bad_location(self): + with self.assertRaises(TypeError) as cm: + syntax_matchers.ExprPattern('$... + 3') + self.assertIn('Cannot use a `$...` in `BinOp.left`.', str(cm.exception)) + + def test_bad_dict_glob(self): + """Tests separated globbing of keys and values. + + Under the hood, the AST is separate for keys and values, but + this is almost an implementation detail -- not every AST would do this -- + and definitely surprising. + + We should, for the purpose of globs, pretend it's a list of pairs. + """ + + parsed = matcher.parse_ast('{key1: value1, key2: value2}', '') + expr = parsed.tree.body[0].value + m = syntax_matchers.ExprPattern('{$...:$..., key1:value2, $...:$...}') + # TODO(b/301637225): This shouldn't match. + self.assertIsNotNone(m.match(_FAKE_CONTEXT, expr)) + class StmtPatternTest(matcher_test_util.MatcherTestCase): diff --git a/refex/python/python_pattern.py b/refex/python/python_pattern.py index 3afd714..7811cec 100644 --- a/refex/python/python_pattern.py +++ b/refex/python/python_pattern.py @@ -25,32 +25,52 @@ _VARIABLE_REGEX = re.compile(r'\A[a-zA-Z_][a-zA-Z0-9_]*\Z') -# TODO: replace tuple token manipulation with namedtuple manipulation, -# when we can be Python3-only, in this and its callers. -# For example: -# Py2: print(tok[1]) -# tok = list(tok); tok[1] = ''; tok = tuple(tok) -# Py3: print(tok.string) -# tok = tok._replace(string='') - -def token_pattern(pattern): +def token_pattern( + pattern: str, +) -> tuple[list[tokenize.TokenInfo], set[int], set[int]]: """Tokenizes a source pattern containing metavariables like "$foo". + The following special forms are accepted: + + * Matches anything once, binding to the name ``foo``: ``$foo``. + * Matches anything ``n`` times, binding to the name ``bars``: ``$bars...``. + * Matches anything ``n`` times, binding to the name ``_``: ``$...``. + + (The repeated matching syntax makes the assumption that there's no useful + place in _Python_ for an identifier to be followed directly by a ``...``. If, + one day, Python allows an identifier directly followed by a ``...``, this is + possible to handle by extending the pattern syntax to match this via e.g. + ``${x}...`` vs ``${x...}``. However, this would not be a desirable state of + affairs.) + Args: pattern: A Python source pattern. Returns: - (tokenized, metavar_indices). + (tokenized, nonrepeating_metavar_indices, repeating_metavar_indices). tokenized: A list of source tokens, omitting the metavariable marker ($). - metavar_indices: - A set of token indexes. tokenized[i] is a metavariable token if and only - if i is in metavar_indices. + nonrepeating_metavar_indices: + A set of token indexes. tokenized[i] is a nonrepeating metavariable token + if and only if i is in nonrepeating_metavar_indices. + repeating_metavar_indices: + A set of token indexes. tokenized[i] is a repeating metavariable token + if and only if i is in repeating_metavar_indices. Raises: SyntaxError: The pattern can't be parsed. """ + # A note on column spans: + # + # untokenize() uses the gap between the end_col of the last token and the + # start_col of the next token to decide how many spaces to put -- there is no + # "space token". As a result, if we do nothing, in the expression `$x...`, + # the `$` and the `...` will each be filled with spaces when the `$` and `...` + # tokens are removed, even if `x` is replaced with a long token string such + # as `foooooo`. To prevent this, we must mutate the start/end cols, resulting + # in tokens with a string value shorter than the col span. + # Work around Python 3.6.7's newline requirement. See b/118359498. if pattern.endswith('\n'): added_newline = False @@ -61,14 +81,30 @@ def token_pattern(pattern): try: tokens = list(tokenize.generate_tokens(io.StringIO(pattern).readline)) except tokenize.TokenError as e: - raise SyntaxError("Couldn't tokenize %r: %s" % (pattern, e)) + raise SyntaxError("Couldn't tokenize %r: %s" % (pattern, e)) from None retokenized = [] - metavar_indices = set() + nonrepeating_metavar_indices = set() + repeating_metavar_indices = set() tokens_it = iter(tokens) for tok in tokens_it: - if tok[1] != '$': + if tok[1] == '...': + # If the last token was a nonrepeating variable, upgrade it to repeating. + # otherwise, add `...`. + last_token_index = len(retokenized) - 1 + if last_token_index in nonrepeating_metavar_indices: + last_token = retokenized[last_token_index] + if last_token.end != tok.start: + raise SyntaxError( + f'No spaces allowed between metavariable and `...`: {pattern!r}' + ) + retokenized[last_token_index] = last_token._replace(end=tok.end) + nonrepeating_metavar_indices.remove(last_token_index) + repeating_metavar_indices.add(last_token_index) + else: + retokenized.append(tok) + elif tok[1] != '$': # Just a note: in the presence of errors, even whitespace gets added as # error tokens, so we're including that here on purpose. retokenized.append(tok) @@ -79,32 +115,32 @@ def token_pattern(pattern): except StopIteration: # This should never happen, because we get an ENDMARKER token. # But who knows, the token stream may change in the future. - raise SyntaxError('Expected variable after $, got EOF') - variable = variable_token[1] - if not _VARIABLE_REGEX.match(variable): + raise SyntaxError('Expected variable after $, got EOF') from None + if variable_token.string == '...': + is_repeated = True + variable_token = variable_token._replace(string='_') + elif _VARIABLE_REGEX.match(variable_token.string): + is_repeated = False + else: + raise SyntaxError( + 'Expected variable name or ``...`` after $, but next token' + f" {variable_token.string!r} wasn't /{_VARIABLE_REGEX.pattern}/" + ' or ...' + ) + + if tok.end != variable_token.start: raise SyntaxError( - "Expected variable after $, but next token (%r) didn't match %s" % - (variable, _VARIABLE_REGEX.pattern)) - - start_row, start_col = variable_token[2] - # untokenize() uses the gap between the end_col of the last token and the - # start_col of this token to decide how many spaces to put -- there is no - # "space token". As a result, if we do nothing, the place where the "$" - # was will become a space. This is usually fine, but causes phantom - # indents and syntax errors if the $ was the first character on the line. - # e.g. it could not even parse the simple expression "$foo" - # To avoid this, we must remove 1 from start_col to make up for it. - if tok[2][1] != start_col - 1: - # newlines get a NL token, so we only need to worry about columns. - raise SyntaxError('No spaces allowed between $ and variable name: %r' % - pattern) - variable_token_mut = list(variable_token) - variable_token_mut[2] = (start_row, start_col - 1) - metavar_indices.add(len(retokenized)) - retokenized.append(tuple(variable_token_mut)) + f'No spaces allowed between $ and next token: {pattern!r}' + ) + variable_token = variable_token._replace(start=tok.start) + if is_repeated: + repeating_metavar_indices.add(len(retokenized)) + else: + nonrepeating_metavar_indices.add(len(retokenized)) + retokenized.append(variable_token) # Undo damage required to work around Python 3.6.7's newline requirement # See b/118359498 for details. if added_newline and len(retokenized) >= 2 and retokenized[-2][1] == '\n': del retokenized[-2] - return retokenized, metavar_indices + return retokenized, nonrepeating_metavar_indices, repeating_metavar_indices diff --git a/refex/python/syntactic_template.py b/refex/python/syntactic_template.py index 603cad9..1cbcc21 100644 --- a/refex/python/syntactic_template.py +++ b/refex/python/syntactic_template.py @@ -69,7 +69,11 @@ class _LexicalTemplate: def __attrs_post_init__(self): # Because we have frozen=True, creating values for _tokens and _var_to_i # is a little complex, and requires invoking object.__setattr__. - tokenized, metavar_indices = python_pattern.token_pattern(self.template) + tokenized, metavar_indices, repeating_indices = ( + python_pattern.token_pattern(self.template) + ) + if repeating_indices: + raise ValueError('Repeated substitutions are not yet supported') var_to_i = {} for i in metavar_indices: var = tokenized[i][1] diff --git a/refex/python/test_python_pattern.py b/refex/python/test_python_pattern.py index 29ec6c9..0fac579 100644 --- a/refex/python/test_python_pattern.py +++ b/refex/python/test_python_pattern.py @@ -26,18 +26,46 @@ class PythonPatternTest(parameterized.TestCase): @parameterized.parameters('', 'x', 'x y') def test_simple_nonpattern(self, pattern): - tokenized, _ = python_pattern.token_pattern(pattern) + tokenized, nonrepeating, repeating = python_pattern.token_pattern(pattern) + self.assertEmpty(nonrepeating) + self.assertEmpty(repeating) self.assertEqual(tokenize.untokenize(tokenized), pattern) @parameterized.parameters('$x', 'foo + $x', 'import $x', '$x "$y"', '$x = 0') - def test_simple_pattern(self, pattern): - tokenized, [metavar_i] = python_pattern.token_pattern(pattern) + def test_nonrepeating_pattern(self, pattern): + tokenized, [metavar_i], [] = python_pattern.token_pattern(pattern) # token text is 'x' -- that's the only variable in the pattern. self.assertEqual(tokenized[metavar_i][1], 'x') # it round trips to the same string except $x -> x self.assertEqual(tokenize.untokenize(tokenized), pattern.replace('$x', 'x')) - @parameterized.parameters('$1', '$', '$\n', '$[', '$""', '$ x', '$\nx') + @parameterized.parameters( + '$x...', 'foo + $x...', 'import $x...', '$x... "$y..."', '$x... = 0' + ) + def test_repeating_pattern(self, pattern): + tokenized, [], [metavar_i] = python_pattern.token_pattern(pattern) + # token text is 'x' -- that's the only variable in the pattern. + self.assertEqual(tokenized[metavar_i][1], 'x') + # it round trips to the same string except $x... -> x + self.assertEqual( + tokenize.untokenize(tokenized), pattern.replace('$x...', 'x') + ) + + @parameterized.parameters( + '$...', 'foo + $...', 'import $...', '$... "$..."', '$... = 0' + ) + def test_anonymous_repeating_pattern(self, pattern): + tokenized, [], [metavar_i] = python_pattern.token_pattern(pattern) + # token text is '_' -- that's the only variable in the pattern. + self.assertEqual(tokenized[metavar_i][1], '_') + # it round trips to the same string except $... -> _ + self.assertEqual( + tokenize.untokenize(tokenized), pattern.replace('$...', '_', 1) + ) + + @parameterized.parameters( + '$1', '$', '$\n', '$[', '$""', '$ x', '$\nx', '$ ...' + ) def test_syntax_error(self, pattern): with self.assertRaises(SyntaxError): python_pattern.token_pattern(pattern) diff --git a/refex/test_cli.py b/refex/test_cli.py index d935cf4..6087cfe 100644 --- a/refex/test_cli.py +++ b/refex/test_cli.py @@ -768,9 +768,31 @@ def test_py_ez_sub_extra_variables(self): ['--mode=%s' % mode, '$x', '--sub=$y+$z', os.devnull]) self.assertEqual( message, - 'The substitution template(s) referenced variables not matched in the Python matcher: `y`, `z`' + 'The substitution template(s) referenced variables not matched in' + ' the Python matcher: `y`, `z`', ) + @parameterized.parameters( + '[$..., 3, $...]', + '[*$..., 3, *$...]', + '[$..., 5]', + '[1, $...]', + '[$..., 1, $..., 2, $..., 3, $..., 4, $..., 5, $...]', + ) + def test_py_ez_sub_glob_list(self, pattern): + for mode in ['py.expr', 'py.stmt']: + with self.subTest(mode=mode): + f = self.create_tempfile(content='[1, 2, 3, 4, 5]\n') + + self.main([ + '--mode=%s' % mode, + pattern, + '--sub=good', + '-i', + f.full_path, + ]) + self.assertEqual(f.read_text(), 'good\n') + @parameterized.parameters( # Statement fragments can't be reparsed. 'for a in foo: pass', @@ -782,7 +804,8 @@ def test_py_ez_sub_extra_variables(self): # and 'bar', two valid lines. # But '(foo\n,bar)' and '(foo,\n bar)' can't parse as two expressions, # only as one expression. - '(foo,\n bar)') + '(foo,\n bar)', + ) def test_py_ez_sub_bad_reparse(self, source): """Substitution works for suite statements and other parser edge cases.""" f = self.create_tempfile(content=source)