Skip to content

Commit

Permalink
fix(minor code defects): (#14)
Browse files Browse the repository at this point in the history
* feat(Update dependencies and clean up.):

* feat(update nltk):

* fix(minor code defects):

* fix(another usage of PUNCTUATION_MATCHER):

* fix(refactor PUNCTUATION_MATCHER in asterisk):

* feat(Update dependencies and clean up.):

* feat(Bump version):

* fix(version):

* fix(typo):
  • Loading branch information
msoedov authored Mar 29, 2024
1 parent 35e0b0b commit 3e43307
Show file tree
Hide file tree
Showing 6 changed files with 15 additions and 29 deletions.
9 changes: 3 additions & 6 deletions philter_lite/asterisk.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
import re

from .coordinate_map import CoordinateMap
from .coordinate_map import CoordinateMap, PUNCTUATION_MATCHER


def save_to_asterisk(contents, output_file):
Expand All @@ -10,10 +8,9 @@ def save_to_asterisk(contents, output_file):

def transform_text_asterisk(txt, include_map: CoordinateMap):
last_marker = 0
punctuation_matcher = re.compile(r"[^a-zA-Z0-9*]")
# read the text by character, any non-punc non-overlaps will be replaced
contents = []
for i in range(0, len(txt)):
for i in range(len(txt)):

if i < last_marker:
continue
Expand All @@ -23,7 +20,7 @@ def transform_text_asterisk(txt, include_map: CoordinateMap):
start, stop = include_map.get_coords(i)
contents.append(txt[start:stop])
last_marker = stop
elif punctuation_matcher.match(txt[i]):
elif PUNCTUATION_MATCHER.match(txt[i]):
contents.append(txt[i])
else:
contents.append("*")
Expand Down
21 changes: 6 additions & 15 deletions philter_lite/coordinate_map.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,15 @@
import re
from typing import List

PUNCTUATION_MATCHER = re.compile(r"[^a-zA-Z0-9*]")


class CoordinateMap:
"""Hits are stored in a coordinate map data structure
This class stores start coordinates for any matches found for this pattern"""

def __init__(self, pattern={"title": "untitled"}, debug=False):
def __init__(self):
"""internal data structure maps filepaths to a map of int:string (coordinate start --> stop)
map is the internal structure of
Expand All @@ -23,14 +25,13 @@ def __init__(self, pattern={"title": "untitled"}, debug=False):

self.map = {}
self.coord2pattern = {}
self.pattern = pattern
self.debug = debug
self.all_coords = {}

def add(self, start, stop, overlap=False, pattern=""):
"""adds a new coordinate to the coordinate map
if overlap is false, this will reject any overlapping hits (usually from multiple regex scan runs)"""
if overlap is false, this will reject any overlapping hits (usually from multiple regex scan runs)
"""
if not overlap:
if self.does_overlap(start, stop):
return False, "Error, overlaps were found: {} {}".format(start, stop)
Expand Down Expand Up @@ -62,26 +63,17 @@ def clear_overlaps(lst):
if len(overlaps) == 0:
# no overlap, just save these coordinates
self.add(start, stop, pattern=pattern, overlap=True)
# if filename == "./data/i2b2_notes/167-02.txt":
# print("No overlaps:")
# print(filename,start,stop,pattern)
elif len(overlaps) == 1:
clear_overlaps(overlaps)
# 1 overlap, save this value
o = overlaps[0]
self.add(o["new_start"], o["new_stop"], pattern=pattern, overlap=True)
# if filename == "./data/i2b2_notes/167-02.txt":
# print("One overlap:")
# print(filename,start,stop,pattern)
else:
clear_overlaps(overlaps)
# greater than 1 overlap, by default this is sorted because of scan order
o1 = overlaps[0]
o2 = overlaps[-1]
self.add(o2["new_start"], o1["new_stop"], pattern=pattern, overlap=True)
# if filename == "./data/i2b2_notes/167-02.txt":
# print("Multiple overlaps:")
# print(filename,start,stop,pattern)

return True, None

Expand Down Expand Up @@ -214,9 +206,8 @@ def get_complement(self, text):
)

# Remove punctuation from complement coordinates
punctuation_matcher = re.compile(r"[^a-zA-Z0-9*]")
for i in range(0, len(text)):
if punctuation_matcher.match(text[i]):
if PUNCTUATION_MATCHER.match(text[i]):
if i in complement_coordinates:
complement_coordinates.remove(i)

Expand Down
2 changes: 1 addition & 1 deletion philter_lite/filters/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,7 @@ def _precompile(regex: str):
) # in order to print a detailed message
try:
re_compiled = re.compile(regex)
except FutureWarning as warn:
except FutureWarning:
warnings.simplefilter(action="ignore", category=FutureWarning)
re_compiled = re.compile(regex) # assign nevertheless
return re_compiled
Expand Down
2 changes: 1 addition & 1 deletion philter_lite/filters/filter_db.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from importlib import resources
from typing import Any, Dict, MutableMapping
from typing import Any, MutableMapping

import toml

Expand Down
8 changes: 3 additions & 5 deletions philter_lite/philter.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

import nltk

from philter_lite.coordinate_map import CoordinateMap
from philter_lite.coordinate_map import PUNCTUATION_MATCHER, CoordinateMap

from .filters import Filter, PosFilter, RegexContextFilter, RegexFilter, SetFilter

Expand Down Expand Up @@ -118,7 +118,7 @@ def detect_phi(
# create intersection maps for all phi types and add them to a dictionary containing all maps
# get full exclude map (only updated either on-command by map_regex_context or at the very end of
# map_coordinates)
full_exclude_map = include_map.get_complement(text_data)
# full_exclude_map = include_map.get_complement(text_data)

for phi_type in phi_type_list:
for start, stop in phi_type_dict[phi_type].filecoords():
Expand Down Expand Up @@ -245,8 +245,6 @@ def _map_regex_context(
full_exclude_map[start] = stop

# 1. Get coordinates of all include and exclude matches

punctuation_matcher = re.compile(r"[^a-zA-Z0-9*]")
# 2. Find all patterns expressions that match regular expression
matches = regex.finditer(text)
for m in matches:
Expand Down Expand Up @@ -280,7 +278,7 @@ def _map_regex_context(
coord_tracker = 0
for element in split_match:
if element != "":
if not punctuation_matcher.match(element[0]):
if not PUNCTUATION_MATCHER.match(element[0]):
current_start = match_start + coord_tracker
current_end = current_start + len(element)
tokenized_matches.append((current_start, current_end))
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "philter-lite"
version = "0.4.0"
version = "0.5.0"
description = "Open-source PHI-filtering software. A fork of philter-ucsf."
readme = "README.md"
authors = [
Expand Down

0 comments on commit 3e43307

Please sign in to comment.