From b415da9544e1cafe3081900829fa5d96480354b6 Mon Sep 17 00:00:00 2001
From: Jeff Lerman <jeff@sironamedical.com>
Date: Fri, 29 Mar 2024 12:59:02 -0700
Subject: [PATCH 1/2] Add flake8 and fix issues raised by it.

NOTE: For now, ignoring checks for missing docstrings, since there are so many.
      Those should be added and the checks enabled.
---
 .flake8                              |  3 +
 .pre-commit-config.yaml              |  9 ++-
 philter_lite/__init__.py             | 11 ++++
 philter_lite/asterisk.py             |  4 +-
 philter_lite/coordinate_map.py       | 83 +++++++++++++---------------
 philter_lite/filters/__init__.py     | 12 ++--
 philter_lite/filters/stanford_ner.py | 14 ++---
 philter_lite/main.py                 | 30 ++++------
 philter_lite/philter.py              | 34 +++++-------
 pyproject.toml                       |  2 +
 tests/test_philter.py                | 12 ++--
 11 files changed, 104 insertions(+), 110 deletions(-)
 create mode 100644 .flake8

diff --git a/.flake8 b/.flake8
new file mode 100644
index 0000000..70f988c
--- /dev/null
+++ b/.flake8
@@ -0,0 +1,3 @@
+[flake8]
+    ignore = E203, E266, E501, W503, B028, D100, D101, D102, D103, D104
+    max-line-length = 120
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 8b99214..aefd491 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,7 +1,7 @@
 ---
 repos:
   - repo: https://github.com/psf/black
-    rev: 22.8.0
+    rev: 23.9.1
     hooks:
       - id: black
         language_version: python3
@@ -11,3 +11,10 @@ repos:
     hooks:
       - id: isort
         args: [ --profile, black ]
+
+  - repo: https://github.com/pycqa/flake8
+    rev: 6.1.0
+    hooks:
+      - id: flake8
+        language: python
+        additional_dependencies: [flake8-bugbear, flake8-comprehensions, flake8-print, flake8-docstrings]
\ No newline at end of file
diff --git a/philter_lite/__init__.py b/philter_lite/__init__.py
index 7aec798..10cfb0c 100644
--- a/philter_lite/__init__.py
+++ b/philter_lite/__init__.py
@@ -1,3 +1,4 @@
+"""The philter_lite package."""
 from importlib import metadata
 
 from philter_lite.coordinate_map import CoordinateMap
@@ -8,6 +9,16 @@
 from .asterisk import transform_text_asterisk
 from .i2b2 import transform_text_i2b2
 
+__all__ = [
+    "CoordinateMap",
+    "Filter",
+    "filter_from_dict",
+    "load_filters",
+    "detect_phi",
+    "transform_text_asterisk",
+    "transform_text_i2b2",
+]
+
 _DISTRIBUTION_METADATA = metadata.metadata("philter_lite")
 
 __author__ = _DISTRIBUTION_METADATA["Author"]
diff --git a/philter_lite/asterisk.py b/philter_lite/asterisk.py
index 9f3bbda..388d163 100644
--- a/philter_lite/asterisk.py
+++ b/philter_lite/asterisk.py
@@ -1,7 +1,8 @@
-from .coordinate_map import CoordinateMap, PUNCTUATION_MATCHER
+from .coordinate_map import PUNCTUATION_MATCHER, CoordinateMap
 
 
 def save_to_asterisk(contents, output_file):
+    """Write some data to a text file, using utf-8 encoding."""
     with open(output_file, "w", encoding="utf-8", errors="surrogateescape") as f:
         f.write(contents)
 
@@ -11,7 +12,6 @@ def transform_text_asterisk(txt, include_map: CoordinateMap):
     # read the text by character, any non-punc non-overlaps will be replaced
     contents = []
     for i in range(len(txt)):
-
         if i < last_marker:
             continue
 
diff --git a/philter_lite/coordinate_map.py b/philter_lite/coordinate_map.py
index b35d426..ea6afca 100644
--- a/philter_lite/coordinate_map.py
+++ b/philter_lite/coordinate_map.py
@@ -6,31 +6,32 @@
 
 
 class CoordinateMap:
-    """Hits are stored in a coordinate map data structure
+    """Internal data structure mapping filepaths to a map of int:string (coordinate start --> stop).
 
-    This class stores start coordinates for any matches found for this pattern"""
+    Hits are stored in a coordinate map data structure.
 
-    def __init__(self):
-        """internal data structure maps filepaths to a map of int:string (coordinate start --> stop)
-
-        map is the internal structure of
-        { filename : { startcoordinate : stop_coordinate}}
-        eg: { "data/foo.txt": {123:126, 19:25} }
-
-        coord2pattern keeps reference of the patterns
-        that matched this coorinate (can be multiple patterns)
+    This class stores start coordinates for any matches found for this pattern.
 
-        all_coords keeps a reference of all coordinates mapped by filename,
-        allowing us to easily check if these coordinates have been matched yet"""
+    Attributes:
+        map: Has the internal structure of
+            { filename : { startcoordinate : stop_coordinate}}
+            eg: { "data/foo.txt": {123:126, 19:25} }
+        coord2pattern: Keeps reference of the patterns that matched this coordinate
+            (can be multiple patterns).
+        all_coords: Keeps a reference of all coordinates mapped by filename,
+            allowing us to easily check if these coordinates have been matched yet.
+    """
 
+    def __init__(self):
+        """Initialize."""
         self.map = {}
         self.coord2pattern = {}
         self.all_coords = {}
 
     def add(self, start, stop, overlap=False, pattern=""):
-        """adds a new coordinate to the coordinate map
+        """Add a new coordinate to the coordinate map.
 
-        if overlap is false, this will reject any overlapping hits (usually from multiple regex scan runs)
+        If overlap is false, this will reject any overlapping hits (usually from multiple regex scan runs).
         """
         if not overlap:
             if self.does_overlap(start, stop):
@@ -47,13 +48,15 @@ def add(self, start, stop, overlap=False, pattern=""):
         return True, None
 
     def add_pattern(self, start, stop, pattern):
-        """adds this pattern to this start coord"""
+        """Add this pattern to this start coord."""
         self.coord2pattern[start] = []
         self.coord2pattern[start].append(pattern)
 
     def add_extend(self, start, stop, pattern=""):
-        """adds a new coordinate to the coordinate map
-        if overlaps with another, will extend to the larger size"""
+        """Add a new coordinate to the coordinate map.
+
+        If overlaps with another, will extend to the larger size.
+        """
         overlaps = self.max_overlap(start, stop)
 
         def clear_overlaps(lst):
@@ -78,7 +81,7 @@ def clear_overlaps(lst):
         return True, None
 
     def remove(self, start, stop):
-        """Removes this coordinate pairing from the map, all_coords, and coord2pattern"""
+        """Remove this coordinate pairing from the map, all_coords, and coord2pattern."""
         # delete from our map structure
         if start in self.map:
             del self.map[start]
@@ -89,7 +92,7 @@ def remove(self, start, stop):
         return True, None
 
     def scan(self):
-        """does an inorder scan of the coordinates and their values"""
+        """Do an in-order scan of the coordinates and their values."""
         for fn in self.map:
             coords = list(self.map[fn].keys())
             coords.sort()
@@ -105,20 +108,19 @@ def get_coords(self, start):
         return start, stop
 
     def filecoords(self):
-        """generator does an inorder scan of the coordinates for this file"""
+        """Provide a generator of an in-order scan of the coordinates for this file."""
         coords = sorted(self.map.keys())
         for coord in coords:
             yield coord, self.map[coord]
 
     def does_exist(self, index):
-        """Simple check to see if this index is a hit (start of coordinates)"""
+        """Simply check to see if this index is a hit (start of coordinates)."""
         if index in self.map:
             return True
         return False
 
     def does_overlap(self, start, stop):
-        """Check if this coordinate overlaps with any existing range"""
-
+        """Check if this coordinate overlaps with any existing range."""
         ranges = [list(range(key, self.map[key] + 1)) for key in self.map]
         all_coords = [item for sublist in ranges for item in sublist]
         # removing all_coords implementation until we write some tests
@@ -128,10 +130,11 @@ def does_overlap(self, start, stop):
         return False
 
     def calc_overlap(self, start, stop):
-        """given a set of coordinates, will calculate all overlaps
-        perf: stop after we know we won't hit any more
-        perf: use binary search approach"""
+        """Given a set of coordinates, calculate all overlaps.
 
+        perf: stop after we know we won't hit any more
+        perf: use binary search approach
+        """
         overlaps = []
         for s in self.map:
             e = self.map[s]
@@ -149,10 +152,11 @@ def calc_overlap(self, start, stop):
         return overlaps
 
     def max_overlap(self, start, stop):
-        """given a set of coordinates, will calculate max of all overlaps
-        perf: stop after we know we won't hit any more
-        perf: use binary search approach"""
+        """Given a set of coordinates, calculate max of all overlaps.
 
+        perf: stop after we know we won't hit any more
+        perf: use binary search approach
+        """
         overlaps = []
         for s in self.map:
             e = self.map[s]
@@ -168,9 +172,7 @@ def max_overlap(self, start, stop):
                         }
                     )
                 else:
-                    overlaps.append(
-                        {"orig_start": s, "orig_end": e, "new_start": s, "new_stop": e}
-                    )
+                    overlaps.append({"orig_start": s, "orig_end": e, "new_start": s, "new_stop": e})
 
             elif s <= stop <= e:
                 if start <= s:
@@ -183,15 +185,12 @@ def max_overlap(self, start, stop):
                         }
                     )
                 else:
-                    overlaps.append(
-                        {"orig_start": s, "orig_end": e, "new_start": s, "new_stop": e}
-                    )
+                    overlaps.append({"orig_start": s, "orig_end": e, "new_start": s, "new_stop": e})
 
         return overlaps
 
     def get_complement(self, text):
-        """get the complementary coordinates of the input coordinate map (excludes punctuation)"""
-
+        """Get the complementary coordinates of the input coordinate map (excludes punctuation)."""
         complement_coordinate_map = {}
 
         current_map_coordinates: List[int] = []
@@ -201,9 +200,7 @@ def get_complement(self, text):
             current_map_coordinates += range(start, stop)
 
         text_coordinates = list(range(0, len(text)))
-        complement_coordinates = list(
-            set(text_coordinates) - set(current_map_coordinates)
-        )
+        complement_coordinates = list(set(text_coordinates) - set(current_map_coordinates))
 
         # Remove punctuation from complement coordinates
         for i in range(0, len(text)):
@@ -214,9 +211,7 @@ def get_complement(self, text):
         # Group complement coordinates into ranges
         def to_ranges(iterable):
             iterable = sorted(set(iterable))
-            for key, group in itertools.groupby(
-                enumerate(iterable), lambda t: t[1] - t[0]
-            ):
+            for _key, group in itertools.groupby(enumerate(iterable), lambda t: t[1] - t[0]):
                 group_list = list(group)
                 yield group_list[0][1], group_list[-1][1] + 1
 
diff --git a/philter_lite/filters/__init__.py b/philter_lite/filters/__init__.py
index 061fffc..9639abd 100644
--- a/philter_lite/filters/__init__.py
+++ b/philter_lite/filters/__init__.py
@@ -130,9 +130,9 @@ def filter_from_dict(
 
 
 def load_filters(filter_path) -> List[Filter]:
-    """Loads filters from a file on disk.
+    """Load filters from a file on disk.
 
-    File must be a toml file with a key of `filters`
+    File must be a toml file with a key of `filters`.
     """
     if not os.path.exists(filter_path):
         raise Exception("Filepath does not exist", filter_path)
@@ -140,13 +140,11 @@ def load_filters(filter_path) -> List[Filter]:
         return [filter_from_dict(x) for x in toml.loads(fil_file.read())["filters"]]
 
 
-def _precompile(regex: str):
-    """precompiles our regex to speed up pattern matching"""
+def _precompile(regex: str) -> Pattern[str]:
+    """Precompile our regex to speed up pattern matching."""
     # NOTE: this is not thread safe! but we want to print a more detailed warning message
     with warnings.catch_warnings():
-        warnings.simplefilter(
-            action="error", category=FutureWarning
-        )  # in order to print a detailed message
+        warnings.simplefilter(action="error", category=FutureWarning)  # in order to print a detailed message
         try:
             re_compiled = re.compile(regex)
         except FutureWarning:
diff --git a/philter_lite/filters/stanford_ner.py b/philter_lite/filters/stanford_ner.py
index 1e93228..afc999d 100644
--- a/philter_lite/filters/stanford_ner.py
+++ b/philter_lite/filters/stanford_ner.py
@@ -1,6 +1,7 @@
 import os
 import re
 import subprocess
+import sys
 
 from nltk.tag.stanford import StanfordNERTagger
 
@@ -9,16 +10,12 @@
 from . import NerFilter
 
 
-def build_ner_tagger(
-    classifier, tagger_jar, download: bool = True
-) -> StanfordNERTagger:
+def build_ner_tagger(classifier, tagger_jar, download: bool = True) -> StanfordNERTagger:
     if not os.path.exists(classifier) and not download:
         raise Exception("Filepath does not exist", classifier)
     else:
         # download the ner data
-        process = subprocess.Popen(
-            "cd generate_dataset && ./download_ner.sh".split(), stdout=subprocess.PIPE
-        )
+        process = subprocess.Popen("cd generate_dataset && ./download_ner.sh".split(), stdout=subprocess.PIPE)
         process.communicate()
 
     if not os.path.exists(tagger_jar):
@@ -34,7 +31,7 @@ def map_ner(
     stanford_ner_tagger: StanfordNERTagger,
     pre_process=r"[^a-zA-Z0-9]+",
 ) -> CoordinateMap:
-    """map NER tagging"""
+    """Map NER tagging."""
     pos_set = set()
     if pattern.pos:
         pos_set = set(pattern.pos)
@@ -62,7 +59,6 @@ def map_ner(
     # add these coordinates to our coordinate map
     start_coordinate = 0
     for word in cleaned:
-
         word_clean = re.sub(pre_process, "", word.lower().strip())
         if len(word_clean) == 0:
             # got a blank space or something without any characters or digits, move forward
@@ -75,7 +71,7 @@ def map_ner(
             if ner_tag in pos_set:
                 stop = start + len(word)
                 coord_map.add_extend(start, stop)
-                print("FOUND: ", word, "NER: ", ner_tag, start, stop)
+                sys.stdout.write(f"FOUND: {word}  NER: {ner_tag} {start} {stop}")
 
         # advance our start coordinate
         start_coordinate += len(word)
diff --git a/philter_lite/main.py b/philter_lite/main.py
index 578c5d1..c562c0f 100644
--- a/philter_lite/main.py
+++ b/philter_lite/main.py
@@ -1,6 +1,7 @@
 import argparse
 import distutils.util
 import os
+import sys
 
 import philter_lite
 
@@ -13,16 +14,14 @@ def main():
         "-i",
         "--input",
         default="./data/i2b2_notes/",
-        help="Path to the directory or the file that contains the PHI note, the default is "
-        "./data/i2b2_notes/",
+        help="Path to the directory or the file that contains the PHI note, the default is ./data/i2b2_notes/",
         type=str,
     )
     ap.add_argument(
         "-a",
         "--anno",
         default="./data/i2b2_anno/",
-        help="Path to the directory or the file that contains the PHI annotation, the default is "
-        "./data/i2b2_anno/",
+        help="Path to the directory or the file that contains the PHI annotation, the default is ./data/i2b2_anno/",
         type=str,
     )
     ap.add_argument(
@@ -84,8 +83,7 @@ def main():
     ap.add_argument(
         "--outputformat",
         default="i2b2",
-        help='Define format of annotation, allowed values are "asterisk", "i2b2". Default '
-        'is "asterisk"',
+        help='Define format of annotation, allowed values are "asterisk", "i2b2". Default is "asterisk"',
         type=str,
     )
     ap.add_argument(
@@ -142,29 +140,21 @@ def main():
         }
 
     if verbose:
-        print("RUNNING ", philter_config["filters"])
+        sys.stderr.write(f'RUNNING {philter_config["filters"]}')
 
     filters = philter_lite.load_filters(philter_config["filters"])
 
-    for root, dirs, files in os.walk(philter_config["finpath"]):
+    for root, _dirs, files in os.walk(philter_config["finpath"]):
         for file in files:
             with open(os.path.join(root, file)) as inf:
                 text = inf.read()
-                include_map, exclude_map, data_tracker = philter_lite.detect_phi(
-                    text, patterns=filters
-                )
+                include_map, exclude_map, data_tracker = philter_lite.detect_phi(text, patterns=filters)
                 if philter_config["outformat"] == "i2b2":
-                    with open(
-                        os.path.join(philter_config["foutpath"], f"{file}.txt"), "w"
-                    ) as fout:
+                    with open(os.path.join(philter_config["foutpath"], f"{file}.txt"), "w") as fout:
                         fout.write(philter_lite.transform_text_i2b2(data_tracker))
                 elif philter_config["outformat"] == "asterisk":
-                    with open(
-                        os.path.join(philter_config["foutpath"], f"{file}.txt"), "w"
-                    ) as fout:
-                        fout.write(
-                            philter_lite.transform_text_asterisk(text, include_map)
-                        )
+                    with open(os.path.join(philter_config["foutpath"], f"{file}.txt"), "w") as fout:
+                        fout.write(philter_lite.transform_text_asterisk(text, include_map))
 
 
 if __name__ == "__main__":
diff --git a/philter_lite/philter.py b/philter_lite/philter.py
index 5be585d..e208659 100644
--- a/philter_lite/philter.py
+++ b/philter_lite/philter.py
@@ -53,8 +53,8 @@ def detect_phi(
     patterns: List[Filter],
     phi_type_list: List[str] = DEFAULT_PHI_TYPE_LIST,
 ):
-    """Runs the set, or regex on the input data
-    generating a coordinate map of hits given
+    """Run the set or regex on the input data, generating a coordinate map of hits given.
+
     (this performs a dry run on the data and doesn't transform)
     """
     # create coordinate maps for each pattern
@@ -81,8 +81,8 @@ def detect_phi(
 
     pos_list = _get_pos(text_data)
 
-    # Create inital self.exclude/include for file
-    for i, pat in enumerate(patterns):
+    # Create initial self.exclude/include for file
+    for pat in patterns:
         pattern_coord = pattern_coords[pat.title]
 
         if pat.type == "regex" and isinstance(pat, RegexFilter):
@@ -98,9 +98,7 @@ def detect_phi(
                 pattern=pat,
             )
         elif pat.type == "pos_matcher" and isinstance(pat, PosFilter):
-            _map_parts_of_speech(
-                pos_list=pos_list, coord_map=pattern_coord, pattern=pat
-            )
+            _map_parts_of_speech(pos_list=pos_list, coord_map=pattern_coord, pattern=pat)
         elif pat.type == "match_all":
             _match_all(text=text_data, coord_map=pattern_coord)
         else:
@@ -161,8 +159,9 @@ def _map_regex(
     coord_map: CoordinateMap,
     pre_process=REGEX_NON_ALPHANUM_CHAR,
 ) -> CoordinateMap:
-    """Creates a coordinate map from the pattern on this data
-    generating a coordinate map of hits given (dry run doesn't transform)
+    """Create a coordinate map from the pattern on this data.
+
+    Generates a coordinate map of hits given (dry run doesn't transform).
     """
     regex = pattern.data
 
@@ -217,9 +216,7 @@ def _map_regex_context(
     include_map: CoordinateMap,
     pre_process=REGEX_NON_ALPHANUM_CHAR,
 ) -> CoordinateMap:
-    """map_regex_context creates a coordinate map from combined regex + PHI coordinates
-    of all previously mapped patterns
-    """
+    """Create a CoordinateMap from combined regex + PHI coordinates of all previously mapped patterns."""
     regex = pattern.data
     context = pattern.context
     try:
@@ -248,7 +245,6 @@ def _map_regex_context(
     # 2. Find all patterns expressions that match regular expression
     matches = regex.finditer(text)
     for m in matches:
-
         # initialize phi_left and phi_right
         phi_left = False
         phi_right = False
@@ -301,15 +297,14 @@ def _map_regex_context(
 
 
 def _match_all(text, coord_map: CoordinateMap) -> CoordinateMap:
-    """Simply maps to the entirety of the file"""
+    """Simply map to the entirety of the file."""
     # add the entire length of the file
     coord_map.add(0, len(text))
     return coord_map
 
 
 def _map_set(pos_list, coord_map: CoordinateMap, pattern: SetFilter) -> CoordinateMap:
-    """Creates a coordinate mapping of words any words in this set"""
-
+    """Create a coordinate mapping of any words in this set."""
     set_data = pattern.data
 
     # get part of speech we will be sending through this set
@@ -343,11 +338,8 @@ def _map_set(pos_list, coord_map: CoordinateMap, pattern: SetFilter) -> Coordina
     return coord_map
 
 
-def _map_parts_of_speech(
-    pos_list, pattern: PosFilter, coord_map: CoordinateMap
-) -> CoordinateMap:
-    """Creates a coordinate mapping of words which match this part of speech (POS)"""
-
+def _map_parts_of_speech(pos_list, pattern: PosFilter, coord_map: CoordinateMap) -> CoordinateMap:
+    """Create a coordinate mapping of words which match this part of speech (POS)."""
     pos_set = set(pattern.pos)
 
     # Use pre-process to split sentence by spaces AND symbols, while preserving spaces in the split list
diff --git a/pyproject.toml b/pyproject.toml
index d0dea46..6714869 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -36,6 +36,8 @@ philter_lite = 'philter_lite.main:main'
 requires = ["poetry_core>=1.8.1"]
 build-backend = "poetry.core.masonry.api"
 
+[tool.black]
+line-length = 120
 
 [tool.mypy]
 check_untyped_defs = true
diff --git a/tests/test_philter.py b/tests/test_philter.py
index 47a94d6..6a944d4 100644
--- a/tests/test_philter.py
+++ b/tests/test_philter.py
@@ -100,16 +100,16 @@ def test_filter_from_dict_missing_file():
         "filepath": "filters/regex/addresses/non_existent.txt",
     }
 
-    with pytest.raises(Exception):
+    # TODO: This test appears to be intended to test that a reference to a file that doesn't exist
+    #       causes an exception - but the exception raised has nothing to do with that file-path.
+    #       Instead, the complaint is about a missing value for the "keyword" key.
+    #       Should be investigated.
+    with pytest.raises(Exception):  # noqa: B017
         filter_from_dict(filter_dict)
 
 
 def test_default_config():
-    filters = load_filters(
-        os.path.join(
-            os.path.dirname(philter_lite.__file__), "configs/philter_delta.toml"
-        )
-    )
+    filters = load_filters(os.path.join(os.path.dirname(philter_lite.__file__), "configs/philter_delta.toml"))
     assert len(filters) > 0
 
 

From 7e607c72bc65355d5db2b61944d14dbb8a9eae10 Mon Sep 17 00:00:00 2001
From: Jeff Lerman <jeff@sironamedical.com>
Date: Fri, 29 Mar 2024 13:13:12 -0700
Subject: [PATCH 2/2] add end-of-file fixer, and fix remaining flake8 issues

---
 .pre-commit-config.yaml                 | 7 ++++++-
 data/i2b2_anno/110-01.txt               | 4 ----
 data/i2b2_anno/110-02.txt               | 5 -----
 data/i2b2_anno/110-03.txt               | 4 ----
 data/i2b2_anno/110-04.txt               | 3 ---
 data/i2b2_anno/111-01.txt               | 5 -----
 data/i2b2_notes/110-01.txt              | 4 ----
 data/i2b2_notes/110-02.txt              | 5 -----
 data/i2b2_notes/110-03.txt              | 4 ----
 data/i2b2_notes/110-04.txt              | 3 ---
 data/i2b2_notes/111-01.txt              | 5 -----
 data/i2b2_xml/110-01.xml                | 2 +-
 data/i2b2_xml/110-02.xml                | 2 +-
 data/i2b2_xml/110-03.xml                | 2 +-
 data/i2b2_xml/110-04.xml                | 2 +-
 data/i2b2_xml/111-01.xml                | 2 +-
 data/phi/fn.txt                         | 2 +-
 data/phi/fp.txt                         | 2 +-
 data/phi/summary.json                   | 2 +-
 data/phi_notes_i2b2.json                | 2 +-
 philter_lite/configs/philter_delta.toml | 1 -
 philter_lite/data/i2b2_anno/110-01.txt  | 4 ----
 philter_lite/data/i2b2_anno/110-02.txt  | 5 -----
 philter_lite/data/i2b2_anno/110-03.txt  | 4 ----
 philter_lite/data/i2b2_anno/110-04.txt  | 3 ---
 philter_lite/data/i2b2_anno/111-01.txt  | 5 -----
 philter_lite/data/i2b2_notes/110-01.txt | 4 ----
 philter_lite/data/i2b2_notes/110-02.txt | 5 -----
 philter_lite/data/i2b2_notes/110-03.txt | 4 ----
 philter_lite/data/i2b2_notes/110-04.txt | 3 ---
 philter_lite/data/i2b2_notes/111-01.txt | 5 -----
 philter_lite/data/i2b2_xml/110-01.xml   | 2 +-
 philter_lite/data/i2b2_xml/110-02.xml   | 2 +-
 philter_lite/data/i2b2_xml/110-03.xml   | 2 +-
 philter_lite/data/i2b2_xml/110-04.xml   | 2 +-
 philter_lite/data/i2b2_xml/111-01.xml   | 2 +-
 philter_lite/data/phi/fn.txt            | 2 +-
 philter_lite/data/phi/fp.txt            | 2 +-
 philter_lite/data/phi/summary.json      | 2 +-
 philter_lite/data/phi_notes_i2b2.json   | 2 +-
 philter_lite/i2b2.py                    | 2 +-
 tests/test_package_metadata.py          | 1 -
 42 files changed, 25 insertions(+), 106 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index aefd491..e4fd591 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -17,4 +17,9 @@ repos:
     hooks:
       - id: flake8
         language: python
-        additional_dependencies: [flake8-bugbear, flake8-comprehensions, flake8-print, flake8-docstrings]
\ No newline at end of file
+        additional_dependencies: [flake8-bugbear, flake8-comprehensions, flake8-print, flake8-docstrings]
+
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v4.4.0
+    hooks:
+      - id: end-of-file-fixer
diff --git a/data/i2b2_anno/110-01.txt b/data/i2b2_anno/110-01.txt
index 5a53ead..8538d0b 100644
--- a/data/i2b2_anno/110-01.txt
+++ b/data/i2b2_anno/110-01.txt
@@ -48,7 +48,3 @@ DD: **/**/**
 DT: **/**/**
 DV: **/**/**
           Approved but not reviewed by Attending Provider         
-
-
-
-
diff --git a/data/i2b2_anno/110-02.txt b/data/i2b2_anno/110-02.txt
index e238bbe..17a86f1 100644
--- a/data/i2b2_anno/110-02.txt
+++ b/data/i2b2_anno/110-02.txt
@@ -66,8 +66,3 @@ P:   Follow up with Dr. ***** in 3 months.
 
 
 GPP/******/*******
-
-
-
-
-
diff --git a/data/i2b2_anno/110-03.txt b/data/i2b2_anno/110-03.txt
index bd7b4ff..d4fdcfa 100644
--- a/data/i2b2_anno/110-03.txt
+++ b/data/i2b2_anno/110-03.txt
@@ -100,7 +100,3 @@ GI prophylaxis: Zantac
 
 _______________________
 **** ****, MD, EHMS pager *****
-
-
-
-
diff --git a/data/i2b2_anno/110-04.txt b/data/i2b2_anno/110-04.txt
index 97408d3..5cadd64 100644
--- a/data/i2b2_anno/110-04.txt
+++ b/data/i2b2_anno/110-04.txt
@@ -176,6 +176,3 @@ Plan
 ______________________________
 
 **** * *********, M.D.
-
-
-
diff --git a/data/i2b2_anno/111-01.txt b/data/i2b2_anno/111-01.txt
index e5a2271..cdae610 100644
--- a/data/i2b2_anno/111-01.txt
+++ b/data/i2b2_anno/111-01.txt
@@ -66,8 +66,3 @@ ___________________________________                    *****/*****
 Dictated by:  ******* ******, M.D.    **** 
 
           Not reviewed by Attending Physician         
-
-
-
-
-
diff --git a/data/i2b2_notes/110-01.txt b/data/i2b2_notes/110-01.txt
index 45a6405..da5997e 100644
--- a/data/i2b2_notes/110-01.txt
+++ b/data/i2b2_notes/110-01.txt
@@ -48,7 +48,3 @@ DD: 04/07/69
 DT: 04/15/69
 DV: 04/07/69
           Approved but not reviewed by Attending Provider         
-
-
-
-
diff --git a/data/i2b2_notes/110-02.txt b/data/i2b2_notes/110-02.txt
index 0b5d6fb..f2b4f20 100644
--- a/data/i2b2_notes/110-02.txt
+++ b/data/i2b2_notes/110-02.txt
@@ -66,8 +66,3 @@ Gilbert P. Perez, M.D.
 
 
 GPP/church/olinger
-
-
-
-
-
diff --git a/data/i2b2_notes/110-03.txt b/data/i2b2_notes/110-03.txt
index eae3007..575250b 100644
--- a/data/i2b2_notes/110-03.txt
+++ b/data/i2b2_notes/110-03.txt
@@ -100,7 +100,3 @@ GI prophylaxis: Zantac
 
 _______________________
 Mike Ivan, MD, EHMS pager 84710
-
-
-
-
diff --git a/data/i2b2_notes/110-04.txt b/data/i2b2_notes/110-04.txt
index dbf9bf6..1d45058 100644
--- a/data/i2b2_notes/110-04.txt
+++ b/data/i2b2_notes/110-04.txt
@@ -176,6 +176,3 @@ Plan
 ______________________________
 
 Owen M Gallagher, M.D.
-
-
-
diff --git a/data/i2b2_notes/111-01.txt b/data/i2b2_notes/111-01.txt
index 85196e1..39018f9 100644
--- a/data/i2b2_notes/111-01.txt
+++ b/data/i2b2_notes/111-01.txt
@@ -66,8 +66,3 @@ FILBERT BRIGHT, M.D.    FB59                     D:07/20/83
 Dictated by:  FILBERT BRIGHT, M.D.    FB59 
 
           Not reviewed by Attending Physician         
-
-
-
-
-
diff --git a/data/i2b2_xml/110-01.xml b/data/i2b2_xml/110-01.xml
index c0c8326..92180df 100644
--- a/data/i2b2_xml/110-01.xml
+++ b/data/i2b2_xml/110-01.xml
@@ -65,4 +65,4 @@ DV: 04/07/69
 <DATE TYPE="DATE" comment="" end="1729" id="P7" start="1721" text="04/15/69" />
 <DATE TYPE="DATE" comment="" end="1742" id="P8" start="1734" text="04/07/69" />
 </TAGS>
-</deIdi2b2>
\ No newline at end of file
+</deIdi2b2>
diff --git a/data/i2b2_xml/110-02.xml b/data/i2b2_xml/110-02.xml
index a4653d7..7ed6313 100644
--- a/data/i2b2_xml/110-02.xml
+++ b/data/i2b2_xml/110-02.xml
@@ -83,4 +83,4 @@ GPP/church/olinger
 <NAME TYPE="DOCTOR" comment="" end="1320" id="P6" start="1314" text="church" />
 <NAME TYPE="DOCTOR" comment="" end="1328" id="P7" start="1321" text="olinger" />
 </TAGS>
-</deIdi2b2>
\ No newline at end of file
+</deIdi2b2>
diff --git a/data/i2b2_xml/110-03.xml b/data/i2b2_xml/110-03.xml
index 547b70b..f40dffa 100644
--- a/data/i2b2_xml/110-03.xml
+++ b/data/i2b2_xml/110-03.xml
@@ -119,4 +119,4 @@ Mike Ivan, MD, EHMS pager 84710
 <NAME TYPE="DOCTOR" comment="" end="3592" id="P10" start="3583" text="Mike Ivan" />
 <CONTACT TYPE="PHONE" comment="" end="3614" id="P12" start="3609" text="84710" />
 </TAGS>
-</deIdi2b2>
\ No newline at end of file
+</deIdi2b2>
diff --git a/data/i2b2_xml/110-04.xml b/data/i2b2_xml/110-04.xml
index 50c3069..0ebd892 100644
--- a/data/i2b2_xml/110-04.xml
+++ b/data/i2b2_xml/110-04.xml
@@ -192,4 +192,4 @@ Owen M Gallagher, M.D.
 <NAME TYPE="DOCTOR" comment="" end="3877" id="P8" start="3872" text="Perez" />
 <NAME TYPE="DOCTOR" comment="" end="4080" id="P9" start="4064" text="Owen M Gallagher" />
 </TAGS>
-</deIdi2b2>
\ No newline at end of file
+</deIdi2b2>
diff --git a/data/i2b2_xml/111-01.xml b/data/i2b2_xml/111-01.xml
index 9f7470e..ea6b9f9 100644
--- a/data/i2b2_xml/111-01.xml
+++ b/data/i2b2_xml/111-01.xml
@@ -89,4 +89,4 @@ Dictated by:  FILBERT BRIGHT, M.D.    FB59
 <NAME TYPE="DOCTOR" comment="" end="1773" id="P12" start="1759" text="FILBERT BRIGHT" />
 <NAME TYPE="USERNAME" comment="" end="1787" id="P13" start="1783" text="FB59" />
 </TAGS>
-</deIdi2b2>
\ No newline at end of file
+</deIdi2b2>
diff --git a/data/phi/fn.txt b/data/phi/fn.txt
index b979605..120c3b1 100644
--- a/data/phi/fn.txt
+++ b/data/phi/fn.txt
@@ -1,4 +1,4 @@
 [
     "SILVER",
     "church"
-]
\ No newline at end of file
+]
diff --git a/data/phi/fp.txt b/data/phi/fp.txt
index a694fda..0dca58b 100644
--- a/data/phi/fp.txt
+++ b/data/phi/fp.txt
@@ -30,4 +30,4 @@
     "Will",
     "Will",
     "Will"
-]
\ No newline at end of file
+]
diff --git a/data/phi/summary.json b/data/phi/summary.json
index 2504d4b..5c380c2 100644
--- a/data/phi/summary.json
+++ b/data/phi/summary.json
@@ -74,4 +74,4 @@
             "num_false_negatives": 0
         }
     }
-}
\ No newline at end of file
+}
diff --git a/data/phi_notes_i2b2.json b/data/phi_notes_i2b2.json
index 24a7dfe..c50b356 100644
--- a/data/phi_notes_i2b2.json
+++ b/data/phi_notes_i2b2.json
@@ -392,4 +392,4 @@
             }
         ]
     }
-}
\ No newline at end of file
+}
diff --git a/philter_lite/configs/philter_delta.toml b/philter_lite/configs/philter_delta.toml
index bb40411..d10707f 100644
--- a/philter_lite/configs/philter_delta.toml
+++ b/philter_lite/configs/philter_delta.toml
@@ -2245,4 +2245,3 @@ context = "left_or_right"
 context_filter = "all"
 notes = ""
 keyword = "regex_context.initials"
-
diff --git a/philter_lite/data/i2b2_anno/110-01.txt b/philter_lite/data/i2b2_anno/110-01.txt
index 5a53ead..8538d0b 100644
--- a/philter_lite/data/i2b2_anno/110-01.txt
+++ b/philter_lite/data/i2b2_anno/110-01.txt
@@ -48,7 +48,3 @@ DD: **/**/**
 DT: **/**/**
 DV: **/**/**
           Approved but not reviewed by Attending Provider         
-
-
-
-
diff --git a/philter_lite/data/i2b2_anno/110-02.txt b/philter_lite/data/i2b2_anno/110-02.txt
index e238bbe..17a86f1 100644
--- a/philter_lite/data/i2b2_anno/110-02.txt
+++ b/philter_lite/data/i2b2_anno/110-02.txt
@@ -66,8 +66,3 @@ P:   Follow up with Dr. ***** in 3 months.
 
 
 GPP/******/*******
-
-
-
-
-
diff --git a/philter_lite/data/i2b2_anno/110-03.txt b/philter_lite/data/i2b2_anno/110-03.txt
index bd7b4ff..d4fdcfa 100644
--- a/philter_lite/data/i2b2_anno/110-03.txt
+++ b/philter_lite/data/i2b2_anno/110-03.txt
@@ -100,7 +100,3 @@ GI prophylaxis: Zantac
 
 _______________________
 **** ****, MD, EHMS pager *****
-
-
-
-
diff --git a/philter_lite/data/i2b2_anno/110-04.txt b/philter_lite/data/i2b2_anno/110-04.txt
index 97408d3..5cadd64 100644
--- a/philter_lite/data/i2b2_anno/110-04.txt
+++ b/philter_lite/data/i2b2_anno/110-04.txt
@@ -176,6 +176,3 @@ Plan
 ______________________________
 
 **** * *********, M.D.
-
-
-
diff --git a/philter_lite/data/i2b2_anno/111-01.txt b/philter_lite/data/i2b2_anno/111-01.txt
index e5a2271..cdae610 100644
--- a/philter_lite/data/i2b2_anno/111-01.txt
+++ b/philter_lite/data/i2b2_anno/111-01.txt
@@ -66,8 +66,3 @@ ___________________________________                    *****/*****
 Dictated by:  ******* ******, M.D.    **** 
 
           Not reviewed by Attending Physician         
-
-
-
-
-
diff --git a/philter_lite/data/i2b2_notes/110-01.txt b/philter_lite/data/i2b2_notes/110-01.txt
index 45a6405..da5997e 100644
--- a/philter_lite/data/i2b2_notes/110-01.txt
+++ b/philter_lite/data/i2b2_notes/110-01.txt
@@ -48,7 +48,3 @@ DD: 04/07/69
 DT: 04/15/69
 DV: 04/07/69
           Approved but not reviewed by Attending Provider         
-
-
-
-
diff --git a/philter_lite/data/i2b2_notes/110-02.txt b/philter_lite/data/i2b2_notes/110-02.txt
index 0b5d6fb..f2b4f20 100644
--- a/philter_lite/data/i2b2_notes/110-02.txt
+++ b/philter_lite/data/i2b2_notes/110-02.txt
@@ -66,8 +66,3 @@ Gilbert P. Perez, M.D.
 
 
 GPP/church/olinger
-
-
-
-
-
diff --git a/philter_lite/data/i2b2_notes/110-03.txt b/philter_lite/data/i2b2_notes/110-03.txt
index eae3007..575250b 100644
--- a/philter_lite/data/i2b2_notes/110-03.txt
+++ b/philter_lite/data/i2b2_notes/110-03.txt
@@ -100,7 +100,3 @@ GI prophylaxis: Zantac
 
 _______________________
 Mike Ivan, MD, EHMS pager 84710
-
-
-
-
diff --git a/philter_lite/data/i2b2_notes/110-04.txt b/philter_lite/data/i2b2_notes/110-04.txt
index dbf9bf6..1d45058 100644
--- a/philter_lite/data/i2b2_notes/110-04.txt
+++ b/philter_lite/data/i2b2_notes/110-04.txt
@@ -176,6 +176,3 @@ Plan
 ______________________________
 
 Owen M Gallagher, M.D.
-
-
-
diff --git a/philter_lite/data/i2b2_notes/111-01.txt b/philter_lite/data/i2b2_notes/111-01.txt
index 85196e1..39018f9 100644
--- a/philter_lite/data/i2b2_notes/111-01.txt
+++ b/philter_lite/data/i2b2_notes/111-01.txt
@@ -66,8 +66,3 @@ FILBERT BRIGHT, M.D.    FB59                     D:07/20/83
 Dictated by:  FILBERT BRIGHT, M.D.    FB59 
 
           Not reviewed by Attending Physician         
-
-
-
-
-
diff --git a/philter_lite/data/i2b2_xml/110-01.xml b/philter_lite/data/i2b2_xml/110-01.xml
index c0c8326..92180df 100644
--- a/philter_lite/data/i2b2_xml/110-01.xml
+++ b/philter_lite/data/i2b2_xml/110-01.xml
@@ -65,4 +65,4 @@ DV: 04/07/69
 <DATE TYPE="DATE" comment="" end="1729" id="P7" start="1721" text="04/15/69" />
 <DATE TYPE="DATE" comment="" end="1742" id="P8" start="1734" text="04/07/69" />
 </TAGS>
-</deIdi2b2>
\ No newline at end of file
+</deIdi2b2>
diff --git a/philter_lite/data/i2b2_xml/110-02.xml b/philter_lite/data/i2b2_xml/110-02.xml
index a4653d7..7ed6313 100644
--- a/philter_lite/data/i2b2_xml/110-02.xml
+++ b/philter_lite/data/i2b2_xml/110-02.xml
@@ -83,4 +83,4 @@ GPP/church/olinger
 <NAME TYPE="DOCTOR" comment="" end="1320" id="P6" start="1314" text="church" />
 <NAME TYPE="DOCTOR" comment="" end="1328" id="P7" start="1321" text="olinger" />
 </TAGS>
-</deIdi2b2>
\ No newline at end of file
+</deIdi2b2>
diff --git a/philter_lite/data/i2b2_xml/110-03.xml b/philter_lite/data/i2b2_xml/110-03.xml
index 547b70b..f40dffa 100644
--- a/philter_lite/data/i2b2_xml/110-03.xml
+++ b/philter_lite/data/i2b2_xml/110-03.xml
@@ -119,4 +119,4 @@ Mike Ivan, MD, EHMS pager 84710
 <NAME TYPE="DOCTOR" comment="" end="3592" id="P10" start="3583" text="Mike Ivan" />
 <CONTACT TYPE="PHONE" comment="" end="3614" id="P12" start="3609" text="84710" />
 </TAGS>
-</deIdi2b2>
\ No newline at end of file
+</deIdi2b2>
diff --git a/philter_lite/data/i2b2_xml/110-04.xml b/philter_lite/data/i2b2_xml/110-04.xml
index 50c3069..0ebd892 100644
--- a/philter_lite/data/i2b2_xml/110-04.xml
+++ b/philter_lite/data/i2b2_xml/110-04.xml
@@ -192,4 +192,4 @@ Owen M Gallagher, M.D.
 <NAME TYPE="DOCTOR" comment="" end="3877" id="P8" start="3872" text="Perez" />
 <NAME TYPE="DOCTOR" comment="" end="4080" id="P9" start="4064" text="Owen M Gallagher" />
 </TAGS>
-</deIdi2b2>
\ No newline at end of file
+</deIdi2b2>
diff --git a/philter_lite/data/i2b2_xml/111-01.xml b/philter_lite/data/i2b2_xml/111-01.xml
index 9f7470e..ea6b9f9 100644
--- a/philter_lite/data/i2b2_xml/111-01.xml
+++ b/philter_lite/data/i2b2_xml/111-01.xml
@@ -89,4 +89,4 @@ Dictated by:  FILBERT BRIGHT, M.D.    FB59
 <NAME TYPE="DOCTOR" comment="" end="1773" id="P12" start="1759" text="FILBERT BRIGHT" />
 <NAME TYPE="USERNAME" comment="" end="1787" id="P13" start="1783" text="FB59" />
 </TAGS>
-</deIdi2b2>
\ No newline at end of file
+</deIdi2b2>
diff --git a/philter_lite/data/phi/fn.txt b/philter_lite/data/phi/fn.txt
index 946ea46..2fb7825 100644
--- a/philter_lite/data/phi/fn.txt
+++ b/philter_lite/data/phi/fn.txt
@@ -2,4 +2,4 @@
     "SILVER",
     "church",
     "holmes"
-]
\ No newline at end of file
+]
diff --git a/philter_lite/data/phi/fp.txt b/philter_lite/data/phi/fp.txt
index e8eb5a9..46ba7db 100644
--- a/philter_lite/data/phi/fp.txt
+++ b/philter_lite/data/phi/fp.txt
@@ -35,4 +35,4 @@
     "EHMS",
     "New",
     "XGT"
-]
\ No newline at end of file
+]
diff --git a/philter_lite/data/phi/summary.json b/philter_lite/data/phi/summary.json
index d298511..b2e6a06 100644
--- a/philter_lite/data/phi/summary.json
+++ b/philter_lite/data/phi/summary.json
@@ -80,4 +80,4 @@
             "num_false_negatives": 1
         }
     }
-}
\ No newline at end of file
+}
diff --git a/philter_lite/data/phi_notes_i2b2.json b/philter_lite/data/phi_notes_i2b2.json
index 24a7dfe..c50b356 100644
--- a/philter_lite/data/phi_notes_i2b2.json
+++ b/philter_lite/data/phi_notes_i2b2.json
@@ -392,4 +392,4 @@
             }
         ]
     }
-}
\ No newline at end of file
+}
diff --git a/philter_lite/i2b2.py b/philter_lite/i2b2.py
index 40e1abf..1a04123 100644
--- a/philter_lite/i2b2.py
+++ b/philter_lite/i2b2.py
@@ -7,7 +7,7 @@ def save_to_i2b2(contents, output_file):
 
 
 def transform_text_i2b2(tagdata: DataTracker):
-    """creates a string in i2b2-XML format"""
+    """Create a string in i2b2-XML format."""
     root = "Philter"
     contents = [
         '<?xml version="1.0" ?>\n',
diff --git a/tests/test_package_metadata.py b/tests/test_package_metadata.py
index 2d6d451..28c1d0b 100644
--- a/tests/test_package_metadata.py
+++ b/tests/test_package_metadata.py
@@ -1,6 +1,5 @@
 """Confirm that project metadata is set correctly."""
 import os
-import re
 from typing import Any, Mapping
 
 import toml