Add flake8 and fix issues raised by it.
NOTE: For now, ignoring checks for missing docstrings, since there are so many.
      Those should be added and the checks enabled.
jclerman committed Mar 29, 2024
1 parent 3e43307 commit b415da9
Showing 11 changed files with 104 additions and 110 deletions.
3 changes: 3 additions & 0 deletions .flake8
@@ -0,0 +1,3 @@
[flake8]
ignore = E203, E266, E501, W503, B028, D100, D101, D102, D103, D104
max-line-length = 120
9 changes: 8 additions & 1 deletion .pre-commit-config.yaml
@@ -1,7 +1,7 @@
---
repos:
- repo: https://github.com/psf/black
rev: 22.8.0
rev: 23.9.1
hooks:
- id: black
language_version: python3
@@ -11,3 +11,10 @@ repos:
hooks:
- id: isort
args: [ --profile, black ]

- repo: https://github.com/pycqa/flake8
rev: 6.1.0
hooks:
- id: flake8
language: python
additional_dependencies: [flake8-bugbear, flake8-comprehensions, flake8-print, flake8-docstrings]
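
As a rough illustration of what the newly added plugins report (the specific error codes are assumed from the plugins' usual behaviour and are not spelled out in this commit), a toy module like the one below would trip both the print check and the docstring checks; the docstring codes it would raise are among the D100-D104 codes the new .flake8 ignores for now.

"""Toy module, used only to illustrate the new checks; not part of this repository."""


def report_match(word, tag):  # flake8-docstrings: D103 (missing function docstring), currently ignored via .flake8
    print("FOUND:", word, tag)  # flake8-print: T201; this commit replaces such calls with sys.stdout.write
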
11 changes: 11 additions & 0 deletions philter_lite/__init__.py
@@ -1,3 +1,4 @@
"""The philter_lite package."""
from importlib import metadata

from philter_lite.coordinate_map import CoordinateMap
@@ -8,6 +9,16 @@
from .asterisk import transform_text_asterisk
from .i2b2 import transform_text_i2b2

__all__ = [
"CoordinateMap",
"Filter",
"filter_from_dict",
"load_filters",
"detect_phi",
"transform_text_asterisk",
"transform_text_i2b2",
]

_DISTRIBUTION_METADATA = metadata.metadata("philter_lite")

__author__ = _DISTRIBUTION_METADATA["Author"]
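
As a quick sanity check (not part of the commit), the new export list can be verified against what the installed package advertises; the set below simply restates the __all__ entries added above.

import philter_lite

# These names are exactly the __all__ entries introduced in this commit.
assert set(philter_lite.__all__) == {
    "CoordinateMap",
    "Filter",
    "filter_from_dict",
    "load_filters",
    "detect_phi",
    "transform_text_asterisk",
    "transform_text_i2b2",
}
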
4 changes: 2 additions & 2 deletions philter_lite/asterisk.py
@@ -1,7 +1,8 @@
from .coordinate_map import CoordinateMap, PUNCTUATION_MATCHER
from .coordinate_map import PUNCTUATION_MATCHER, CoordinateMap


def save_to_asterisk(contents, output_file):
"""Write some data to a text file, using utf-8 encoding."""
with open(output_file, "w", encoding="utf-8", errors="surrogateescape") as f:
f.write(contents)

@@ -11,7 +12,6 @@ def transform_text_asterisk(txt, include_map: CoordinateMap):
# read the text by character, any non-punc non-overlaps will be replaced
contents = []
for i in range(len(txt)):

if i < last_marker:
continue

83 changes: 39 additions & 44 deletions philter_lite/coordinate_map.py
@@ -6,31 +6,32 @@


class CoordinateMap:
"""Hits are stored in a coordinate map data structure
"""Internal data structure mapping filepaths to a map of int:string (coordinate start --> stop).
This class stores start coordinates for any matches found for this pattern"""
Hits are stored in a coordinate map data structure.
def __init__(self):
"""internal data structure maps filepaths to a map of int:string (coordinate start --> stop)
map is the internal structure of
{ filename : { startcoordinate : stop_coordinate}}
eg: { "data/foo.txt": {123:126, 19:25} }
coord2pattern keeps reference of the patterns
that matched this coorinate (can be multiple patterns)
This class stores start coordinates for any matches found for this pattern.
all_coords keeps a reference of all coordinates mapped by filename,
allowing us to easily check if these coordinates have been matched yet"""
Attributes:
map: Has the internal structure of
{ filename : { startcoordinate : stop_coordinate}}
eg: { "data/foo.txt": {123:126, 19:25} }
coord2pattern: Keeps reference of the patterns that matched this coordinate
(can be multiple patterns).
all_coords: Keeps a reference of all coordinates mapped by filename,
allowing us to easily check if these coordinates have been matched yet.
"""

def __init__(self):
"""Initialize."""
self.map = {}
self.coord2pattern = {}
self.all_coords = {}

def add(self, start, stop, overlap=False, pattern=""):
"""adds a new coordinate to the coordinate map
"""Add a new coordinate to the coordinate map.
if overlap is false, this will reject any overlapping hits (usually from multiple regex scan runs)
If overlap is false, this will reject any overlapping hits (usually from multiple regex scan runs).
"""
if not overlap:
if self.does_overlap(start, stop):
@@ -47,13 +48,15 @@ def add(self, start, stop, overlap=False, pattern=""):
return True, None

def add_pattern(self, start, stop, pattern):
"""adds this pattern to this start coord"""
"""Add this pattern to this start coord."""
self.coord2pattern[start] = []
self.coord2pattern[start].append(pattern)

def add_extend(self, start, stop, pattern=""):
"""adds a new coordinate to the coordinate map
if overlaps with another, will extend to the larger size"""
"""Add a new coordinate to the coordinate map.
If overlaps with another, will extend to the larger size.
"""
overlaps = self.max_overlap(start, stop)

def clear_overlaps(lst):
@@ -78,7 +81,7 @@ def clear_overlaps(lst):
return True, None

def remove(self, start, stop):
"""Removes this coordinate pairing from the map, all_coords, and coord2pattern"""
"""Remove this coordinate pairing from the map, all_coords, and coord2pattern."""
# delete from our map structure
if start in self.map:
del self.map[start]
@@ -89,7 +92,7 @@ def remove(self, start, stop):
return True, None

def scan(self):
"""does an inorder scan of the coordinates and their values"""
"""Do an inorder scan of the coordinates and their values."""
for fn in self.map:
coords = list(self.map[fn].keys())
coords.sort()
@@ -105,20 +108,19 @@ def get_coords(self, start):
return start, stop

def filecoords(self):
"""generator does an inorder scan of the coordinates for this file"""
"""Provide a generator of an in-order scan of the coordinates for this file."""
coords = sorted(self.map.keys())
for coord in coords:
yield coord, self.map[coord]

def does_exist(self, index):
"""Simple check to see if this index is a hit (start of coordinates)"""
"""Simply check to see if this index is a hit (start of coordinates)."""
if index in self.map:
return True
return False

def does_overlap(self, start, stop):
"""Check if this coordinate overlaps with any existing range"""

"""Check if this coordinate overlaps with any existing range."""
ranges = [list(range(key, self.map[key] + 1)) for key in self.map]
all_coords = [item for sublist in ranges for item in sublist]
# removing all_coords implementation until we write some tests
@@ -128,10 +130,11 @@ def does_overlap(self, start, stop):
return False

def calc_overlap(self, start, stop):
"""given a set of coordinates, will calculate all overlaps
perf: stop after we know we won't hit any more
perf: use binary search approach"""
"""Given a set of coordinates, calculate all overlaps.
perf: stop after we know we won't hit any more
perf: use binary search approach
"""
overlaps = []
for s in self.map:
e = self.map[s]
@@ -149,10 +152,11 @@ def calc_overlap(self, start, stop):
return overlaps

def max_overlap(self, start, stop):
"""given a set of coordinates, will calculate max of all overlaps
perf: stop after we know we won't hit any more
perf: use binary search approach"""
"""Given a set of coordinates, calculate max of all overlaps.
perf: stop after we know we won't hit any more
perf: use binary search approach
"""
overlaps = []
for s in self.map:
e = self.map[s]
@@ -168,9 +172,7 @@ def max_overlap(self, start, stop):
}
)
else:
overlaps.append(
{"orig_start": s, "orig_end": e, "new_start": s, "new_stop": e}
)
overlaps.append({"orig_start": s, "orig_end": e, "new_start": s, "new_stop": e})

elif s <= stop <= e:
if start <= s:
@@ -183,15 +185,12 @@ def max_overlap(self, start, stop):
}
)
else:
overlaps.append(
{"orig_start": s, "orig_end": e, "new_start": s, "new_stop": e}
)
overlaps.append({"orig_start": s, "orig_end": e, "new_start": s, "new_stop": e})

return overlaps

def get_complement(self, text):
"""get the complementary coordinates of the input coordinate map (excludes punctuation)"""

"""Get the complementary coordinates of the input coordinate map (excludes punctuation)."""
complement_coordinate_map = {}

current_map_coordinates: List[int] = []
@@ -201,9 +200,7 @@ def get_complement(self, text):
current_map_coordinates += range(start, stop)

text_coordinates = list(range(0, len(text)))
complement_coordinates = list(
set(text_coordinates) - set(current_map_coordinates)
)
complement_coordinates = list(set(text_coordinates) - set(current_map_coordinates))

# Remove punctuation from complement coordinates
for i in range(0, len(text)):
@@ -214,9 +211,7 @@ def get_complement(self, text):
# Group complement coordinates into ranges
def to_ranges(iterable):
iterable = sorted(set(iterable))
for key, group in itertools.groupby(
enumerate(iterable), lambda t: t[1] - t[0]
):
for _key, group in itertools.groupby(enumerate(iterable), lambda t: t[1] - t[0]):
group_list = list(group)
yield group_list[0][1], group_list[-1][1] + 1

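
To make the rewritten class docstring concrete, here is a short usage sketch based only on the methods visible in this diff; the exact return values of a rejected add are not shown above, so they are left unchecked.

from philter_lite import CoordinateMap

coord_map = CoordinateMap()
coord_map.add(19, 25, pattern="name")  # accepted: nothing overlaps yet
coord_map.add(20, 30)                  # rejected: 20-30 overlaps 19-25 and overlap defaults to False
coord_map.add_extend(20, 30)           # accepted: per the docstring, the 19-25 hit is extended to cover 19-30
print(coord_map.does_overlap(5, 10))   # False: no recorded hit touches that span
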
12 changes: 5 additions & 7 deletions philter_lite/filters/__init__.py
@@ -130,23 +130,21 @@ def filter_from_dict(


def load_filters(filter_path) -> List[Filter]:
"""Loads filters from a file on disk.
"""Load filters from a file on disk.
File must be a toml file with a key of `filters`
File must be a toml file with a key of `filters`.
"""
if not os.path.exists(filter_path):
raise Exception("Filepath does not exist", filter_path)
with open(filter_path, "r") as fil_file:
return [filter_from_dict(x) for x in toml.loads(fil_file.read())["filters"]]


def _precompile(regex: str):
"""precompiles our regex to speed up pattern matching"""
def _precompile(regex: str) -> Pattern[str]:
"""Precompile our regex to speed up pattern matching."""
# NOTE: this is not thread safe! but we want to print a more detailed warning message
with warnings.catch_warnings():
warnings.simplefilter(
action="error", category=FutureWarning
) # in order to print a detailed message
warnings.simplefilter(action="error", category=FutureWarning) # in order to print a detailed message
try:
re_compiled = re.compile(regex)
except FutureWarning:
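
A minimal usage sketch of the documented load_filters contract follows; the path is hypothetical, and this commit only shows that the file must be TOML with a top-level `filters` key whose entries are passed to filter_from_dict.

from philter_lite import load_filters

# "filters.toml" is a hypothetical path; load_filters raises if the file does not exist.
filters = load_filters("filters.toml")
print(f"loaded {len(filters)} filter definitions")
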
14 changes: 5 additions & 9 deletions philter_lite/filters/stanford_ner.py
@@ -1,6 +1,7 @@
import os
import re
import subprocess
import sys

from nltk.tag.stanford import StanfordNERTagger

@@ -9,16 +10,12 @@
from . import NerFilter


def build_ner_tagger(
classifier, tagger_jar, download: bool = True
) -> StanfordNERTagger:
def build_ner_tagger(classifier, tagger_jar, download: bool = True) -> StanfordNERTagger:
if not os.path.exists(classifier) and not download:
raise Exception("Filepath does not exist", classifier)
else:
# download the ner data
process = subprocess.Popen(
"cd generate_dataset && ./download_ner.sh".split(), stdout=subprocess.PIPE
)
process = subprocess.Popen("cd generate_dataset && ./download_ner.sh".split(), stdout=subprocess.PIPE)
process.communicate()

if not os.path.exists(tagger_jar):
@@ -34,7 +31,7 @@ def map_ner(
stanford_ner_tagger: StanfordNERTagger,
pre_process=r"[^a-zA-Z0-9]+",
) -> CoordinateMap:
"""map NER tagging"""
"""Map NER tagging."""
pos_set = set()
if pattern.pos:
pos_set = set(pattern.pos)
@@ -62,7 +59,6 @@ def map_ner(
# add these coordinates to our coordinate map
start_coordinate = 0
for word in cleaned:

word_clean = re.sub(pre_process, "", word.lower().strip())
if len(word_clean) == 0:
# got a blank space or something without any characters or digits, move forward
@@ -75,7 +71,7 @@ def map_ner(
if ner_tag in pos_set:
stop = start + len(word)
coord_map.add_extend(start, stop)
print("FOUND: ", word, "NER: ", ner_tag, start, stop)
sys.stdout.write(f"FOUND: {word} NER: {ner_tag} {start} {stop}")

# advance our start coordinate
start_coordinate += len(word)
(Diffs for the remaining 4 changed files are not shown here.)
