Add flake8 and fix issues raised by it.
NOTE: For now, ignoring checks for missing docstrings, since there are so many.
      Those should be added and the checks enabled.
jclerman committed Mar 29, 2024
1 parent 3e43307 commit b415da9
Showing 11 changed files with 104 additions and 110 deletions.
3 changes: 3 additions & 0 deletions .flake8
@@ -0,0 +1,3 @@
[flake8]
ignore = E203, E266, E501, W503, B028, D100, D101, D102, D103, D104
max-line-length = 120
9 changes: 8 additions & 1 deletion .pre-commit-config.yaml
@@ -1,7 +1,7 @@
---
repos:
- repo: https://github.com/psf/black
rev: 22.8.0
rev: 23.9.1
hooks:
- id: black
language_version: python3
@@ -11,3 +11,10 @@ repos:
hooks:
- id: isort
args: [ --profile, black ]

- repo: https://github.com/pycqa/flake8
rev: 6.1.0
hooks:
- id: flake8
language: python
additional_dependencies: [flake8-bugbear, flake8-comprehensions, flake8-print, flake8-docstrings]
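
As a rough illustration of what the newly added plugins report (the specific error codes are assumed from the plugins' usual behaviour and are not spelled out in this commit), a toy module like the one below would trip both the print check and the docstring checks; the docstring codes it would raise are among the D100-D104 codes the new .flake8 ignores for now.

"""Toy module, used only to illustrate the new checks; not part of this repository."""


def report_match(word, tag):  # flake8-docstrings: D103 (missing function docstring), currently ignored via .flake8
    print("FOUND:", word, tag)  # flake8-print: T201; this commit replaces such calls with sys.stdout.write
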
11 changes: 11 additions & 0 deletions philter_lite/__init__.py
@@ -1,3 +1,4 @@
"""The philter_lite package."""
from importlib import metadata

from philter_lite.coordinate_map import CoordinateMap
@@ -8,6 +9,16 @@
from .asterisk import transform_text_asterisk
from .i2b2 import transform_text_i2b2

__all__ = [
"CoordinateMap",
"Filter",
"filter_from_dict",
"load_filters",
"detect_phi",
"transform_text_asterisk",
"transform_text_i2b2",
]

_DISTRIBUTION_METADATA = metadata.metadata("philter_lite")

__author__ = _DISTRIBUTION_METADATA["Author"]
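
As a quick sanity check (not part of the commit), the new export list can be verified against what the installed package advertises; the set below simply restates the __all__ entries added above.

import philter_lite

# These names are exactly the __all__ entries introduced in this commit.
assert set(philter_lite.__all__) == {
    "CoordinateMap",
    "Filter",
    "filter_from_dict",
    "load_filters",
    "detect_phi",
    "transform_text_asterisk",
    "transform_text_i2b2",
}
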
4 changes: 2 additions & 2 deletions philter_lite/asterisk.py
@@ -1,7 +1,8 @@
from .coordinate_map import CoordinateMap, PUNCTUATION_MATCHER
from .coordinate_map import PUNCTUATION_MATCHER, CoordinateMap


def save_to_asterisk(contents, output_file):
"""Write some data to a text file, using utf-8 encoding."""
with open(output_file, "w", encoding="utf-8", errors="surrogateescape") as f:
f.write(contents)

@@ -11,7 +12,6 @@ def transform_text_asterisk(txt, include_map: CoordinateMap):
# read the text by character, any non-punc non-overlaps will be replaced
contents = []
for i in range(len(txt)):

if i < last_marker:
continue

83 changes: 39 additions & 44 deletions philter_lite/coordinate_map.py
@@ -6,31 +6,32 @@


class CoordinateMap:
"""Hits are stored in a coordinate map data structure
"""Internal data structure mapping filepaths to a map of int:string (coordinate start --> stop).
This class stores start coordinates for any matches found for this pattern"""
Hits are stored in a coordinate map data structure.
def __init__(self):
"""internal data structure maps filepaths to a map of int:string (coordinate start --> stop)
map is the internal structure of
{ filename : { startcoordinate : stop_coordinate}}
eg: { "data/foo.txt": {123:126, 19:25} }
coord2pattern keeps reference of the patterns
that matched this coorinate (can be multiple patterns)
This class stores start coordinates for any matches found for this pattern.
all_coords keeps a reference of all coordinates mapped by filename,
allowing us to easily check if these coordinates have been matched yet"""
Attributes:
map: Has the internal structure of
{ filename : { startcoordinate : stop_coordinate}}
eg: { "data/foo.txt": {123:126, 19:25} }
coord2pattern: Keeps reference of the patterns that matched this coordinate
(can be multiple patterns).
all_coords: Keeps a reference of all coordinates mapped by filename,
allowing us to easily check if these coordinates have been matched yet.
"""

def __init__(self):
"""Initialize."""
self.map = {}
self.coord2pattern = {}
self.all_coords = {}

def add(self, start, stop, overlap=False, pattern=""):
"""adds a new coordinate to the coordinate map
"""Add a new coordinate to the coordinate map.
if overlap is false, this will reject any overlapping hits (usually from multiple regex scan runs)
If overlap is false, this will reject any overlapping hits (usually from multiple regex scan runs).
"""
if not overlap:
if self.does_overlap(start, stop):
@@ -47,13 +48,15 @@ def add(self, start, stop, overlap=False, pattern=""):
return True, None

def add_pattern(self, start, stop, pattern):
"""adds this pattern to this start coord"""
"""Add this pattern to this start coord."""
self.coord2pattern[start] = []
self.coord2pattern[start].append(pattern)

def add_extend(self, start, stop, pattern=""):
"""adds a new coordinate to the coordinate map
if overlaps with another, will extend to the larger size"""
"""Add a new coordinate to the coordinate map.
If overlaps with another, will extend to the larger size.
"""
overlaps = self.max_overlap(start, stop)

def clear_overlaps(lst):
@@ -78,7 +81,7 @@ def clear_overlaps(lst):
return True, None

def remove(self, start, stop):
"""Removes this coordinate pairing from the map, all_coords, and coord2pattern"""
"""Remove this coordinate pairing from the map, all_coords, and coord2pattern."""
# delete from our map structure
if start in self.map:
del self.map[start]
@@ -89,7 +92,7 @@ def remove(self, start, stop):
return True, None

def scan(self):
"""does an inorder scan of the coordinates and their values"""
"""Do an inorder scan of the coordinates and their values."""
for fn in self.map:
coords = list(self.map[fn].keys())
coords.sort()
@@ -105,20 +108,19 @@ def get_coords(self, start):
return start, stop

def filecoords(self):
"""generator does an inorder scan of the coordinates for this file"""
"""Provide a generator of an in-order scan of the coordinates for this file."""
coords = sorted(self.map.keys())
for coord in coords:
yield coord, self.map[coord]

def does_exist(self, index):
"""Simple check to see if this index is a hit (start of coordinates)"""
"""Simply check to see if this index is a hit (start of coordinates)."""
if index in self.map:
return True
return False

def does_overlap(self, start, stop):
"""Check if this coordinate overlaps with any existing range"""

"""Check if this coordinate overlaps with any existing range."""
ranges = [list(range(key, self.map[key] + 1)) for key in self.map]
all_coords = [item for sublist in ranges for item in sublist]
# removing all_coords implementation until we write some tests
@@ -128,10 +130,11 @@ def does_overlap(self, start, stop):
return False

def calc_overlap(self, start, stop):
"""given a set of coordinates, will calculate all overlaps
perf: stop after we know we won't hit any more
perf: use binary search approach"""
"""Given a set of coordinates, calculate all overlaps.
perf: stop after we know we won't hit any more
perf: use binary search approach
"""
overlaps = []
for s in self.map:
e = self.map[s]
@@ -149,10 +152,11 @@ def calc_overlap(self, start, stop):
return overlaps

def max_overlap(self, start, stop):
"""given a set of coordinates, will calculate max of all overlaps
perf: stop after we know we won't hit any more
perf: use binary search approach"""
"""Given a set of coordinates, calculate max of all overlaps.
perf: stop after we know we won't hit any more
perf: use binary search approach
"""
overlaps = []
for s in self.map:
e = self.map[s]
@@ -168,9 +172,7 @@ def max_overlap(self, start, stop):
}
)
else:
overlaps.append(
{"orig_start": s, "orig_end": e, "new_start": s, "new_stop": e}
)
overlaps.append({"orig_start": s, "orig_end": e, "new_start": s, "new_stop": e})

elif s <= stop <= e:
if start <= s:
@@ -183,15 +185,12 @@ def max_overlap(self, start, stop):
}
)
else:
overlaps.append(
{"orig_start": s, "orig_end": e, "new_start": s, "new_stop": e}
)
overlaps.append({"orig_start": s, "orig_end": e, "new_start": s, "new_stop": e})

return overlaps

def get_complement(self, text):
"""get the complementary coordinates of the input coordinate map (excludes punctuation)"""

"""Get the complementary coordinates of the input coordinate map (excludes punctuation)."""
complement_coordinate_map = {}

current_map_coordinates: List[int] = []
@@ -201,9 +200,7 @@ def get_complement(self, text):
current_map_coordinates += range(start, stop)

text_coordinates = list(range(0, len(text)))
complement_coordinates = list(
set(text_coordinates) - set(current_map_coordinates)
)
complement_coordinates = list(set(text_coordinates) - set(current_map_coordinates))

# Remove punctuation from complement coordinates
for i in range(0, len(text)):
@@ -214,9 +211,7 @@ def get_complement(self, text):
# Group complement coordinates into ranges
def to_ranges(iterable):
iterable = sorted(set(iterable))
for key, group in itertools.groupby(
enumerate(iterable), lambda t: t[1] - t[0]
):
for _key, group in itertools.groupby(enumerate(iterable), lambda t: t[1] - t[0]):
group_list = list(group)
yield group_list[0][1], group_list[-1][1] + 1

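
To make the rewritten class docstring concrete, here is a short usage sketch based only on the methods visible in this diff; the exact return values of a rejected add are not shown above, so they are left unchecked.

from philter_lite import CoordinateMap

coord_map = CoordinateMap()
coord_map.add(19, 25, pattern="name")  # accepted: nothing overlaps yet
coord_map.add(20, 30)                  # rejected: 20-30 overlaps 19-25 and overlap defaults to False
coord_map.add_extend(20, 30)           # accepted: per the docstring, the 19-25 hit is extended to cover 19-30
print(coord_map.does_overlap(5, 10))   # False: no recorded hit touches that span
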
12 changes: 5 additions & 7 deletions philter_lite/filters/__init__.py
@@ -130,23 +130,21 @@ def filter_from_dict(


def load_filters(filter_path) -> List[Filter]:
"""Loads filters from a file on disk.
"""Load filters from a file on disk.
File must be a toml file with a key of `filters`
File must be a toml file with a key of `filters`.
"""
if not os.path.exists(filter_path):
raise Exception("Filepath does not exist", filter_path)
with open(filter_path, "r") as fil_file:
return [filter_from_dict(x) for x in toml.loads(fil_file.read())["filters"]]


def _precompile(regex: str):
"""precompiles our regex to speed up pattern matching"""
def _precompile(regex: str) -> Pattern[str]:
"""Precompile our regex to speed up pattern matching."""
# NOTE: this is not thread safe! but we want to print a more detailed warning message
with warnings.catch_warnings():
warnings.simplefilter(
action="error", category=FutureWarning
) # in order to print a detailed message
warnings.simplefilter(action="error", category=FutureWarning) # in order to print a detailed message
try:
re_compiled = re.compile(regex)
except FutureWarning:
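
A minimal usage sketch of the documented load_filters contract follows; the path is hypothetical, and this commit only shows that the file must be TOML with a top-level `filters` key whose entries are passed to filter_from_dict.

from philter_lite import load_filters

# "filters.toml" is a hypothetical path; load_filters raises if the file does not exist.
filters = load_filters("filters.toml")
print(f"loaded {len(filters)} filter definitions")
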
14 changes: 5 additions & 9 deletions philter_lite/filters/stanford_ner.py
@@ -1,6 +1,7 @@
import os
import re
import subprocess
import sys

from nltk.tag.stanford import StanfordNERTagger

@@ -9,16 +10,12 @@
from . import NerFilter


def build_ner_tagger(
classifier, tagger_jar, download: bool = True
) -> StanfordNERTagger:
def build_ner_tagger(classifier, tagger_jar, download: bool = True) -> StanfordNERTagger:
if not os.path.exists(classifier) and not download:
raise Exception("Filepath does not exist", classifier)
else:
# download the ner data
process = subprocess.Popen(
"cd generate_dataset && ./download_ner.sh".split(), stdout=subprocess.PIPE
)
process = subprocess.Popen("cd generate_dataset && ./download_ner.sh".split(), stdout=subprocess.PIPE)
process.communicate()

if not os.path.exists(tagger_jar):
@@ -34,7 +31,7 @@ def map_ner(
stanford_ner_tagger: StanfordNERTagger,
pre_process=r"[^a-zA-Z0-9]+",
) -> CoordinateMap:
"""map NER tagging"""
"""Map NER tagging."""
pos_set = set()
if pattern.pos:
pos_set = set(pattern.pos)
@@ -62,7 +59,6 @@ def map_ner(
# add these coordinates to our coordinate map
start_coordinate = 0
for word in cleaned:

word_clean = re.sub(pre_process, "", word.lower().strip())
if len(word_clean) == 0:
# got a blank space or something without any characters or digits, move forward
@@ -75,7 +71,7 @@ def map_ner(
if ner_tag in pos_set:
stop = start + len(word)
coord_map.add_extend(start, stop)
print("FOUND: ", word, "NER: ", ner_tag, start, stop)
sys.stdout.write(f"FOUND: {word} NER: {ner_tag} {start} {stop}")

# advance our start coordinate
start_coordinate += len(word)
(Diffs for the remaining 4 changed files are not shown here.)
