fix(minor code defects): (#14)

* feat(Update dependencies and clean up.): * feat(update nltk): * fix(minor code defects): * fix(another usage of PUNCTUATION_MATCHER): * fix(refactor PUNCTUATION_MATCHER in asterisk): * feat(Update dependencies and clean up.): * feat(Bump version): * fix(version): * fix(typo):
SironaMedical · Mar 29, 2024 · 3e43307 · 3e43307
1 parent 35e0b0b
commit 3e43307
Show file tree

Hide file tree

Showing 6 changed files with 15 additions and 29 deletions.
diff --git a/philter_lite/asterisk.py b/philter_lite/asterisk.py
@@ -1,6 +1,4 @@
-import re
-
-from .coordinate_map import CoordinateMap
+from .coordinate_map import CoordinateMap, PUNCTUATION_MATCHER
 
 
 def save_to_asterisk(contents, output_file):
@@ -10,10 +8,9 @@ def save_to_asterisk(contents, output_file):
 
 def transform_text_asterisk(txt, include_map: CoordinateMap):
     last_marker = 0
-    punctuation_matcher = re.compile(r"[^a-zA-Z0-9*]")
     # read the text by character, any non-punc non-overlaps will be replaced
     contents = []
-    for i in range(0, len(txt)):
+    for i in range(len(txt)):
 
         if i < last_marker:
             continue
@@ -23,7 +20,7 @@ def transform_text_asterisk(txt, include_map: CoordinateMap):
             start, stop = include_map.get_coords(i)
             contents.append(txt[start:stop])
             last_marker = stop
-        elif punctuation_matcher.match(txt[i]):
+        elif PUNCTUATION_MATCHER.match(txt[i]):
             contents.append(txt[i])
         else:
             contents.append("*")

diff --git a/philter_lite/coordinate_map.py b/philter_lite/coordinate_map.py
@@ -2,13 +2,15 @@
 import re
 from typing import List
 
+PUNCTUATION_MATCHER = re.compile(r"[^a-zA-Z0-9*]")
+
 
 class CoordinateMap:
     """Hits are stored in a coordinate map data structure
 
     This class stores start coordinates for any matches found for this pattern"""
 
-    def __init__(self, pattern={"title": "untitled"}, debug=False):
+    def __init__(self):
         """internal data structure maps filepaths to a map of int:string (coordinate start --> stop)
 
         map is the internal structure of
@@ -23,14 +25,13 @@ def __init__(self, pattern={"title": "untitled"}, debug=False):
 
         self.map = {}
         self.coord2pattern = {}
-        self.pattern = pattern
-        self.debug = debug
         self.all_coords = {}
 
     def add(self, start, stop, overlap=False, pattern=""):
         """adds a new coordinate to the coordinate map
 
-        if overlap is false, this will reject any overlapping hits (usually from multiple regex scan runs)"""
+        if overlap is false, this will reject any overlapping hits (usually from multiple regex scan runs)
+        """
         if not overlap:
             if self.does_overlap(start, stop):
                 return False, "Error, overlaps were found: {} {}".format(start, stop)
@@ -62,26 +63,17 @@ def clear_overlaps(lst):
         if len(overlaps) == 0:
             # no overlap, just save these coordinates
             self.add(start, stop, pattern=pattern, overlap=True)
-            # if filename == "./data/i2b2_notes/167-02.txt":
-            # 	print("No overlaps:")
-            # 	print(filename,start,stop,pattern)
         elif len(overlaps) == 1:
             clear_overlaps(overlaps)
             # 1 overlap, save this value
             o = overlaps[0]
             self.add(o["new_start"], o["new_stop"], pattern=pattern, overlap=True)
-            # if filename == "./data/i2b2_notes/167-02.txt":
-            # 	print("One overlap:")
-            # 	print(filename,start,stop,pattern)
         else:
             clear_overlaps(overlaps)
             # greater than 1 overlap, by default this is sorted because of scan order
             o1 = overlaps[0]
             o2 = overlaps[-1]
             self.add(o2["new_start"], o1["new_stop"], pattern=pattern, overlap=True)
-            # if filename == "./data/i2b2_notes/167-02.txt":
-            # 	print("Multiple overlaps:")
-            # 	print(filename,start,stop,pattern)
 
         return True, None
 
@@ -214,9 +206,8 @@ def get_complement(self, text):
         )
 
         # Remove punctuation from complement coordinates
-        punctuation_matcher = re.compile(r"[^a-zA-Z0-9*]")
         for i in range(0, len(text)):
-            if punctuation_matcher.match(text[i]):
+            if PUNCTUATION_MATCHER.match(text[i]):
                 if i in complement_coordinates:
                     complement_coordinates.remove(i)
 

diff --git a/philter_lite/filters/__init__.py b/philter_lite/filters/__init__.py
@@ -149,7 +149,7 @@ def _precompile(regex: str):
         )  # in order to print a detailed message
         try:
             re_compiled = re.compile(regex)
-        except FutureWarning as warn:
+        except FutureWarning:
             warnings.simplefilter(action="ignore", category=FutureWarning)
             re_compiled = re.compile(regex)  # assign nevertheless
     return re_compiled

diff --git a/philter_lite/filters/filter_db.py b/philter_lite/filters/filter_db.py
@@ -1,5 +1,5 @@
 from importlib import resources
-from typing import Any, Dict, MutableMapping
+from typing import Any, MutableMapping
 
 import toml
 

diff --git a/philter_lite/philter.py b/philter_lite/philter.py
@@ -5,7 +5,7 @@
 
 import nltk
 
-from philter_lite.coordinate_map import CoordinateMap
+from philter_lite.coordinate_map import PUNCTUATION_MATCHER, CoordinateMap
 
 from .filters import Filter, PosFilter, RegexContextFilter, RegexFilter, SetFilter
 
@@ -118,7 +118,7 @@ def detect_phi(
     # create intersection maps for all phi types and add them to a dictionary containing all maps
     # get full exclude map (only updated either on-command by map_regex_context or at the very end of map_
     # coordinates)
-    full_exclude_map = include_map.get_complement(text_data)
+    # full_exclude_map = include_map.get_complement(text_data)
 
     for phi_type in phi_type_list:
         for start, stop in phi_type_dict[phi_type].filecoords():
@@ -245,8 +245,6 @@ def _map_regex_context(
             full_exclude_map[start] = stop
 
     # 1. Get coordinates of all include and exclude matches
-
-    punctuation_matcher = re.compile(r"[^a-zA-Z0-9*]")
     # 2. Find all expressions that match the regular expression
     matches = regex.finditer(text)
     for m in matches:
@@ -280,7 +278,7 @@ def _map_regex_context(
         coord_tracker = 0
         for element in split_match:
             if element != "":
-                if not punctuation_matcher.match(element[0]):
+                if not PUNCTUATION_MATCHER.match(element[0]):
                     current_start = match_start + coord_tracker
                     current_end = current_start + len(element)
                     tokenized_matches.append((current_start, current_end))

diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "philter-lite"
-version = "0.4.0"
+version = "0.5.0"
 description = "Open-source PHI-filtering software. A fork of philter-ucsf."
 readme = "README.md"
 authors = [