preparing for 1.5.0

bentheiii · Apr 8, 2024 · 02d291f · 02d291f
1 parent 6bb7bf4
commit 02d291f
Show file tree

Hide file tree

Showing 11 changed files with 259 additions and 104 deletions.
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -10,7 +10,7 @@ jobs:
   unittest:
     strategy:
       matrix:
-        python-version: ["3.7", "3.8", "3.9", "3.10", "3.11", "3.12"] # format: 3.7, 3.8, 3.9
+        python-version: [ "3.8", "3.9", "3.10", "3.11", "3.12"] # format: 3.7, 3.8, 3.9
         platform: [ubuntu-latest, macos-latest, windows-latest]
       fail-fast: false
     runs-on: ${{ matrix.platform }}

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,4 +1,14 @@
 # envolved Changelog
+## 1.5.0
+### Removed
+* `envolved` no longer supports python 3.7
+### Added
+* `FindIterCollectionParser`
+### Fixed
+* `CollectionParser`'s `opener` and `closer` arguments now correctly handle matches that would be split by the delimiter
+* `CollectionParser`'s `closer` argument now correctly handles overlapping matches
+* `CollectionParser`'s `closer` argument is now faster when using non-regex matches
+* `CollectionParser.pair_wise_delimited` will now be more memory efficient when using a mapping `value_type`
 ## 1.4.0
 ### Deprecated
 * this is the last release to support python 3.7

diff --git a/docs/string_parsing.rst b/docs/string_parsing.rst
@@ -75,9 +75,10 @@ Utility Parsers
     :param delimiter: The delimiter string or pattern to split the string on.
     :param inner_parser: The parser to use to parse the elements of the collection. Note this parser is treated the
      same as an EnvVar type, so :ref:`string_parsing:Special parsers` apply.
-    :param output_type: The type to use to aggregate the parsed items to a collection defaults to list.
+    :param output_type: The type to use to aggregate the parsed items to a collection. Defaults to list.
     :param opener: If set, specifies a string or pattern that should be at the beginning of the delimited string.
-    :param closer: If set, specifies a string or pattern that should be at the end of the delimited string.
+    :param closer: If set, specifies a string or pattern that should be at the end of the delimited string. Note that providing
+     a pattern will slow down the parsing process.
     :param strip: Whether or not to strip whitespaces from the beginning and end of each item.
 
     .. code-block::
@@ -140,6 +141,36 @@ Utility Parsers
 
             assert server_params_ev.get() == {"host": "localhost", "port": 8080, "is_ssl": False}
 
+.. class:: FindIterCollectionParser(element_pattern: typing.Pattern, element_func: collections.abc.Callable[[re.Match], E], \
+                    output_type: collections.abc.Callable[[collections.abc.Iterator[E]], G] = list, \
+                    opener: str | typing.Pattern = '', closer: str | typing.Pattern = '')
+
+    A parser to translate a string to a collection of values by splitting the string into contiguous elements that match
+    a regex pattern. This parser is useful for parsing strings that have a repeating, complex structure, or in cases where
+    a :class:`naive split <CollectionParser>` would split the string incorrectly.
+
+    :param element_pattern: A regex pattern to find the elements in the string.
+    :param element_func: A function that takes a regex match object and returns an element.
+    :param output_type: The type to use to aggregate the parsed items to a collection. Defaults to list.
+    :param opener: If set, specifies a string or pattern that should be at the beginning of the string.
+    :param closer: If set, specifies a string or pattern that should be at the end of the string. Note that providing
+     a pattern will slow down the parsing process.
+
+    .. code-block::
+        :caption: Using FindIterCollectionParser to parse a string of comma-separated groups of numbers.
+
+        def parse_group(match: re.Match) -> set[int]:
+            return {int(x) for x in match.group(1).split(',')}
+
+        groups_ev = env_var("GROUPS", type=FindIterCollectionParser(
+            re.compile(r"{([,\d]+)},?"),
+            parse_group
+        ))
+
+        os.environ["GROUPS"] = "{1,2,3},{4,5,6},{7,8,9}"
+
+        assert groups_ev.get() == [{1, 2, 3}, {4, 5, 6}, {7, 8, 9}]
+
 
 .. class:: MatchParser(cases: collections.abc.Iterable[tuple[typing.Pattern[str] | str, T]] | \
             collections.abc.Mapping[str, T] | type[enum.Enum], fallback: T = ...)

diff --git a/envolved/_version.py b/envolved/_version.py
@@ -1 +1 @@
-__version__ = "1.4.0"
+__version__ = "1.5.0"
diff --git a/envolved/envparser.py b/envolved/envparser.py
@@ -120,50 +120,12 @@ def get(self, case_sensitive: bool, key: str) -> str:
         return ret
 
 
-class NonAuditingEnvParser(ReloadingEnvParser):
-    def get(self, case_sensitive: bool, key: str) -> str:
-        if case_sensitive:
-            return getenv_unsafe(key)
-
-        def out_of_date() -> str:
-            self.reload()
-            return get_case_insensitive(retry_allowed=False)
-
-        lowered = key.lower()
-
-        def get_case_insensitive(retry_allowed: bool) -> str:
-            if retry_allowed and lowered not in self.environ_case_insensitive:
-                # if a retry is allowed, and no candidates are available, we need to retry
-                return out_of_date()
-            candidates = self.environ_case_insensitive[lowered]
-            if key in candidates:
-                preferred_key = key
-            elif retry_allowed and has_env(key):
-                # key is not a candidate, but it is in the env
-                return out_of_date()
-            elif len(candidates) == 1:
-                (preferred_key,) = candidates
-            elif retry_allowed:
-                return out_of_date()
-            else:
-                raise CaseInsensitiveAmbiguityError(candidates)
-            ret = getenv(preferred_key)
-            if ret is None:
-                assert retry_allowed
-                return out_of_date()
-            return ret
-
-        return get_case_insensitive(retry_allowed=True)
-
-
 EnvParser: Type[BaseEnvParser]
 if name == "nt":
     # in windows, all env vars are uppercase
     EnvParser = CaseInsensitiveEnvParser
-elif sys.version_info >= (3, 8):  # adding audit hooks is only supported in python 3.8+
-    EnvParser = AuditingEnvParser
 else:
-    EnvParser = NonAuditingEnvParser
+    EnvParser = AuditingEnvParser
 
 
 env_parser = EnvParser()

diff --git a/envolved/parsers.py b/envolved/parsers.py
@@ -2,7 +2,6 @@
 
 import re
 from enum import Enum, auto
-from functools import lru_cache
 from itertools import chain
 from sys import version_info
 from typing import (
@@ -138,6 +137,68 @@ def _duplicate_avoiding_dict(pairs: Iterator[Tuple[K, V]]) -> Dict[K, V]:
     return ret
 
 
+def strip_opener_idx(x: str, opener: Pattern[str]) -> int:
+    opener_match = opener.match(x)
+    if not opener_match:
+        raise ValueError("position 0, expected opener")
+    return opener_match.end()
+
+
+def strip_closer_idx(x: str, closer: Needle, pos: int) -> int:
+    if isinstance(closer, str):
+        if len(closer) + pos > len(x) or not x.endswith(closer):
+            raise ValueError("expected string to end in closer")
+        return len(x) - len(closer)
+    else:
+        assert isinstance(closer, Pattern)
+        # now we have a problem, as the standard re module doesn't support reverse matches
+        closer_matches = closer.finditer(x, pos)
+        closer_match = None
+        for closer_match in closer_matches:  # noqa: B007
+            # we iterate to find the last match
+            pass
+        if not closer_match:
+            raise ValueError("expected string to end in closer")
+        else:
+            while closer_match.end() != len(x):
+                # finditer only yields non-overlapping matches, so it could have missed an overlapping
+                # match; if there is one, it starts after the start of the last match (but before its end)
+                closer_match = closer.search(x, closer_match.start() + 1)
+                # if there is a match, it's an overlapping match, but it doesn't necessarily end at
+                # the end of the string
+                if not closer_match:
+                    raise ValueError("expected string to end in closer")
+        return closer_match.start()
+
+
+def strip_opener_and_closer(x: str, opener: Pattern[str], closer: Needle) -> str:
+    start_idx = strip_opener_idx(x, opener)
+    end_idx = strip_closer_idx(x, closer, start_idx)
+
+    if start_idx != 0 or end_idx != len(x):
+        return x[start_idx:end_idx]
+    return x
+
+
+def value_parser_func(value_type: Union[ParserInput[V], Mapping[K, ParserInput[V]]]) -> Callable[[K], Parser[V]]:
+    if isinstance(value_type, Mapping):
+        value_parsers = {k: parser(v) for k, v in value_type.items()}
+
+        def get_value_parser(key: K) -> Parser[V]:
+            try:
+                return value_parsers[key]
+            except KeyError:
+                # in case the mapping has a default value or the like
+                return parser(value_type[key])
+    else:
+        _value_parser = parser(value_type)
+
+        def get_value_parser(key: K) -> Parser[V]:
+            return _value_parser
+
+    return get_value_parser
+
+
 class CollectionParser(Generic[G, E]):
     """
     A parser that splits a string by a delimiter, and parses each part individually.
@@ -149,45 +210,20 @@ def __init__(
         inner_parser: ParserInput[E],
         output_type: Callable[[Iterator[E]], G] = list,  # type: ignore[assignment]
         opener: Needle = empty_pattern,
-        closer: Needle = empty_pattern,
+        closer: Needle = "",
         *,
         strip: bool = True,
     ):
-        """
-        :param delimiter: The delimiter to split by.
-        :param inner_parser: The inner parser to apply to each element.
-        :param output_type: The aggregator function of all the parsed elements.
-        :param opener: Optional opener that must be present at the start of the string.
-        :param closer: Optional closer that must be present at the end of the string.
-        """
         self.delimiter_pattern = needle_to_pattern(delimiter)
         self.inner_parser = parser(inner_parser)
         self.output_type = output_type
         self.opener_pattern = needle_to_pattern(opener)
-        self.closer_pattern = needle_to_pattern(closer)
+        self.closer = closer
         self.strip = strip
 
     def __call__(self, x: str) -> G:
-        opener_match = self.opener_pattern.match(x)
-        if not opener_match:
-            raise ValueError("position 0, expected opener")
-        x = x[opener_match.end() :]
-        raw_elements = self.delimiter_pattern.split(x)
-        closer_matches = self.closer_pattern.finditer(raw_elements[-1])
-
-        closer_match = None
-        for closer_match in closer_matches:  # noqa: B007
-            pass
-        if not closer_match:
-            raise ValueError("expected string to end in closer")
-        elif closer_match.end() != len(raw_elements[-1]):
-            raise ValueError(
-                "expected closer to match end of string, got unexpected suffix: "
-                + raw_elements[-1][closer_match.end() :]
-            )
-
-        raw_elements[-1] = raw_elements[-1][: closer_match.start()]
-        raw_items = iter(raw_elements)
+        x = strip_opener_and_closer(x, self.opener_pattern, self.closer)
+        raw_items = iter(self.delimiter_pattern.split(x))
         if self.strip:
             raw_items = (r.strip() for r in raw_items)
         elements = (self.inner_parser(r) for r in raw_items)
@@ -201,36 +237,14 @@ def pair_wise_delimited(
         key_type: ParserInput[K],
         value_type: Union[ParserInput[V], Mapping[K, ParserInput[V]]],
         output_type: Callable[[Iterator[Tuple[K, V]]], G] = _duplicate_avoiding_dict,  # type: ignore[assignment]
-        *,
         key_first: bool = True,
         strip_keys: bool = True,
         strip_values: bool = True,
         **kwargs: Any,
     ) -> Parser[G]:
-        """
-        Create a collectionParser that aggregates to key-value pairs.
-        :param pair_delimiter: The separator between different key-value pairs.
-        :param key_value_delimiter: The separator between each key and value.
-        :param key_type: The parser for key elements.
-        :param value_type: The parser for value elements. Can also be a mapping, parsing each key under a different
-         parser.
-        :param output_type: The tuple aggregator function. Defaults to a duplicate-checking dict.
-        :param key_first: If set to false, will evaluate the part behind the key-value separator as a value.
-        :param kwargs: forwarded to `CollectionParser.__init__`
-        """
         key_value_delimiter = needle_to_pattern(key_value_delimiter)
         key_parser = parser(key_type)
-        get_value_parser: Callable[[K], Parser]
-        if isinstance(value_type, Mapping):
-
-            @lru_cache(None)
-            def get_value_parser(key: K) -> Parser[V]:
-                return parser(value_type[key])
-        else:
-            _value_parser = parser(value_type)
-
-            def get_value_parser(key: K) -> Parser[V]:
-                return _value_parser
+        get_value_parser = value_parser_func(value_type)
 
         def combined_parser(s: str) -> Tuple[K, V]:
             split = key_value_delimiter.split(s, maxsplit=2)
@@ -250,6 +264,38 @@ def combined_parser(s: str) -> Tuple[K, V]:
         return cls(pair_delimiter, combined_parser, output_type, **kwargs)  # type: ignore[arg-type]
 
 
+def find_iter_contingient(x: str, pattern: Pattern[str]) -> Iterator[re.Match[str]]:
+    start_idx = 0
+    while start_idx < len(x):
+        match = pattern.match(x, start_idx)
+        if match is None:
+            raise ValueError(f"could not match pattern {pattern} at position {start_idx}")
+        start_idx = match.end()
+        yield match
+
+
+class FindIterCollectionParser(Generic[G, E]):
+    def __init__(
+        self,
+        element_pattern: Pattern[str],
+        element_func: Callable[[re.Match[str]], E],
+        output_type: Callable[[Iterator[E]], G] = list,  # type: ignore[assignment]
+        opener: Needle = empty_pattern,
+        closer: Needle = "",
+    ):
+        self.prefix_pattern = element_pattern
+        self.element_func = element_func
+        self.output_type = output_type
+        self.opener_pattern = needle_to_pattern(opener)
+        self.closer = closer
+
+    def __call__(self, x: str) -> G:
+        x = strip_opener_and_closer(x, self.opener_pattern, self.closer)
+        raw_matches = find_iter_contingient(x, self.prefix_pattern)
+        elements = (self.element_func(r) for r in raw_matches)
+        return self.output_type(elements)
+
+
 class NoFallback(Enum):
     no_fallback = auto()
 

diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "envolved"
-version = "1.4.0"
+version = "1.5.0"
 description = ""
 authors = ["ben avrahami <[email protected]>"]
 license = "MIT"
@@ -12,11 +12,8 @@ packages = [
 ]
 
 [tool.poetry.dependencies]
-python = "^3.7"
-typing-extensions = [
-    {version="<4.8.0", python=">=3.7, <3.8"},
-    {version="*", python=">=3.8"},
-]
+python = "^3.8"
+typing-extensions = "*"
 
 [tool.poetry.group.dev.dependencies]
 pytest = "*"
@@ -39,7 +36,7 @@ build-backend = "poetry.masonry.api"
 
 
 [tool.ruff]
-target-version = "py37"
+target-version = "py38"
 line-length = 120
 output-format = "full"
 [tool.ruff.lint]
@@ -108,3 +105,7 @@ keep-runtime-typing = true
     "PTH",  # use pathlib
     "PERF",  # performance anti-patterns
 ]
+
+"type_checking/**" = [
+    "INP001",  # implicit namespace packages
+]
diff --git a/scripts/test_type_hinting.sh b/scripts/test_type_hinting.sh
@@ -0,0 +1 @@
+python -m mypy --show-error-codes --check-untyped-defs type_checking
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		python -m mypy --show-error-codes --check-untyped-defs type_checking