# coding: utf-8

# In[ ]:

# Imports & Setup

"""
Description: Flags duplicate patients in the argument .csv file by adding a column called 'Flag'
that identifies unique patient tests: rows that share the same 'Flag' number are duplicates of
the same person, while every other row keeps its own row index as its 'Flag'.

Sample Usage: Run 'python3 flag_duplicates.py [$filename$.csv]' to flag all duplicates and write
a new .csv to the path ./filename.flagged_duplicates.csv, where filename is replaced with the
name of the input file.
"""
import pandas as pd
import numpy as np
import math as mt
import sys
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

# In[ ]:

# Loading in data

# Ensuring that a file path has been passed into the script
improper_arg_msg = "Improper arguments passed in - use 'python3 flag_duplicates.py [$filename$.csv]'"
if len(sys.argv) < 2:
    assert False, improper_arg_msg
elif ".csv" not in sys.argv[1]:
    print("Using the sample dataset...\n", improper_arg_msg)
    sys.argv[1] = "deduplicator_sample_data_scramble.csv"

# Getting the input path and loading the dataset
path = sys.argv[1]
lab_confirmed_flu = pd.read_csv(path)
lab_confirmed_flu
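# Note on the expected input (inferred from the column accesses below rather than from a formal
# schema, so treat it as an assumption): the CSV should provide either a 'Patient' column or
# 'last_name'/'first_name' columns, plus 'DOB' and 'Collected', and one of the recognised
# test-result layouts ('Test', 'Result', 'flua'/'flub', or one of the panel-style column sets
# handled in the matching loop further down).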

# In[ ]:

# Find fuzzy string matches for each patient
try:
    patients = lab_confirmed_flu["Patient"]
except KeyError:
    # No 'Patient' column -- build one from the name columns instead
    try:
        lab_confirmed_flu["Patient"] = np.core.defchararray.add(lab_confirmed_flu['last_name'],
                                                                lab_confirmed_flu['first_name'])
    except Exception:
        lab_confirmed_flu["Patient"] = (np.asarray(lab_confirmed_flu["last_name"]) +
                                        " " +
                                        np.asarray(lab_confirmed_flu.first_name))
    patients = lab_confirmed_flu["Patient"]

# The first match of each list will be the row the patient was from
matches = [process.extract(query=patient,
                           choices=patients,
                           limit=max(25, int(len(patients) ** .5)))
           for patient in patients]
matches
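# Illustration (hypothetical values): because `choices` is a pandas Series, process.extract
# yields (value, score, index) triples, which is why the loop below reads match[1] for the
# fuzzy score and match[2] for the row index. One entry of `matches` looks roughly like
#     [('Smith John', 100, 0), ('Smith Jon', 95, 7), ('Doe Jane', 30, 3), ...]
# with the patient's own perfect-score row expected first, as noted in the comment above.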

# In[ ]:

# Find true duplicates and match them up

# Maps frozensets of 2 filtered match indices >> patient index (-1 if they are not true matches)
filtered_match_ids = dict()
# The column to be added, containing the updated patient_id for each index
flags = np.arange(len(patients))
# Ensures that one patient's index is not set multiple times (a single patient's index should not
# match via manual and automated detection more than once)
already_matched = set()
# Keeps track of all matches for displaying later
all_matches_in_dataset = []


def validate_matches(filtered_match_index, filtered_match_ids):
    """Returns whether every key in filtered_match_ids that contains filtered_match_index
    (and is not marked -1) maps to the same value.

    :param int filtered_match_index: the index of a match
    :param dict filtered_match_ids: maps frozensets of 2 filtered match indices >> patient index
        (-1 if they are not true matches)
    :return bool: whether all such keys map to the same value
    """
    match_keys = [key for key in filtered_match_ids
                  if filtered_match_index in key and filtered_match_ids[key] != -1]
    reference_value = filtered_match_ids[match_keys[0]]
    for key in match_keys:
        if filtered_match_ids[key] != reference_value:
            return False
    return True
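# Illustration (hypothetical values): given
#     filtered_match_ids = {frozenset({2, 5}): 2, frozenset({2, 9}): 2, frozenset({2, 7}): -1}
# validate_matches(2, filtered_match_ids) returns True, because every key containing index 2
# that is not marked -1 maps to the same flag value (2); if frozenset({2, 9}) mapped to a
# different value, it would return False.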
for patient_match_list in matches:
    patient = patient_match_list[0]
    patient_index = int(patient[2])
    # Narrow down all fuzzy string scores to only potential duplicates of "patient"
    all_matches = np.asarray(patient_match_list[1:])
    filtered_matches = all_matches[np.asarray([int(match[1]) > 65 for match in all_matches])]
    # For all filtered matches, find true duplicates and give them the same patient_id
    # Note: simply because another patient passed the filter does NOT mean they are a true match
    for filtered_match in filtered_matches:
        filtered_match_index = int(filtered_match[2])
        possible_match_key = frozenset([patient_index, filtered_match_index])
        # If this possible_match_key has already been checked and IS NOT a match
        if possible_match_key in filtered_match_ids and filtered_match_ids[possible_match_key] == -1:
            continue
        # If this possible_match_key has already been checked and IS a match, use the same value
        elif possible_match_key in filtered_match_ids:
            flags[patient_index] = filtered_match_ids[possible_match_key]
            break
        # If the possible_match_key has not already been checked, determine whether it is a true match
        else:
            patient_row = lab_confirmed_flu.iloc[patient_index]
            filtered_match_row = lab_confirmed_flu.iloc[filtered_match_index]
            # Uncertain based on the data -- ask the user to take a closer look
            if ((type(patient_row.DOB) == float and np.isnan(patient_row.DOB)) or
                    (type(filtered_match_row.DOB) == float and np.isnan(filtered_match_row.DOB)) or
                    (type(patient_row.Collected) == float and np.isnan(patient_row.Collected)) or
                    (type(filtered_match_row.Collected) == float and np.isnan(filtered_match_row.Collected))):
                msg = "Please press 'Y' if the two patients are matches and anything else if they are not: "
                print("\n\n===========================================================\nPlease examine the following:\n")
                print("\tTarget Patient:\n", patient_row)
                print("\tPotential match:\n", filtered_match_row)
                is_match = input("\n" + msg).strip().lower() == 'y'
            # Highly probable matches based on DOB + Collection Time + Test
            else:
                DOB_match = patient_row["DOB"] == filtered_match_row["DOB"]
                Collected_match = patient_row["Collected"] == filtered_match_row["Collected"]
                # The test columns differ between source datasets, so fall through the known layouts
                try:
                    Test_match = patient_row["Test"] == filtered_match_row["Test"]
                except KeyError:  # cc.dedup
                    try:
                        Test_match = patient_row['Result'] == filtered_match_row['Result']
                    except KeyError:  # cho.a.dedup
                        try:
                            Test_match = (patient_row['flua'] == filtered_match_row['flua'] and
                                          patient_row['flub'] == filtered_match_row['flub'])
                        except KeyError:  # cho.b.dedup
                            try:
                                Test_match = (patient_row['influenza.a.h1'] == filtered_match_row['influenza.a.h1'] and
                                              patient_row['influenza.a.h3'] == filtered_match_row['influenza.a.h3'] and
                                              patient_row['x2009.inf.a.h1n1.rvp'] == filtered_match_row['x2009.inf.a.h1n1.rvp'] and
                                              patient_row['flu.b'] == filtered_match_row['flu.b'] and
                                              patient_row['rsv.a'] == filtered_match_row['rsv.a'] and
                                              patient_row['rsv.b'] == filtered_match_row['rsv.b'] and
                                              patient_row['parainfluenza.1'] == filtered_match_row['parainfluenza.1'] and
                                              patient_row['parainfluenza.2'] == filtered_match_row['parainfluenza.2'] and
                                              patient_row['parainfluenza.3'] == filtered_match_row['parainfluenza.3'] and
                                              patient_row['rhinovirus'] == filtered_match_row['rhinovirus'] and
                                              patient_row['adenovirus'] == filtered_match_row['adenovirus'] and
                                              patient_row['metapneumovirus'] == filtered_match_row['metapneumovirus'])
                            except KeyError:  # CEIP Shoo the Flu
                                try:
                                    Test_match = (patient_row['FLU TEST TYPE'] == filtered_match_row['FLU TEST TYPE'] and
                                                  patient_row['INFLUENZA ANTIGEN DETECTION'] == filtered_match_row['INFLUENZA ANTIGEN DETECTION'] and
                                                  patient_row['FLU B'] == filtered_match_row['FLU B'] and
                                                  patient_row['INFLUENZA A H1'] == filtered_match_row['INFLUENZA A H1'] and
                                                  patient_row['INFLUENZA A H3'] == filtered_match_row['INFLUENZA A H3'] and
                                                  patient_row['2009 INF A/H1N1 RVP'] == filtered_match_row['2009 INF A/H1N1 RVP'] and
                                                  patient_row['RSV A'] == filtered_match_row['RSV A'] and
                                                  patient_row['RSV B'] == filtered_match_row['RSV B'] and
                                                  patient_row['PARAINFLUENZA 1'] == filtered_match_row['PARAINFLUENZA 1'] and
                                                  patient_row['PARAINFLUENZA 2'] == filtered_match_row['PARAINFLUENZA 2'] and
                                                  patient_row['PARAINFLUENZA 3'] == filtered_match_row['PARAINFLUENZA 3'])
                                except KeyError:
                                    # Print diagnostics before failing so the unrecognised layout can be inspected
                                    print(path)
                                    print("\n\nSample Patient Row:\n\n")
                                    print(patient_row)
                                    assert False, "All cases should've been covered"
                is_match = DOB_match and Collected_match and Test_match
            # Storing the result of our comparison in filtered_match_ids
            if is_match:
                all_matches_in_dataset.append(patient_row)
                contradiction_msg = ("The newly matched patient has already been matched -- "
                                     "this is a contradiction. Patient: " + str(patient))
                assert (filtered_match_index not in already_matched) or validate_matches(filtered_match_index, filtered_match_ids), contradiction_msg
                filtered_match_ids[possible_match_key] = flags[patient_index]
                flags[filtered_match_index] = flags[patient_index]
                already_matched.add(filtered_match_index)
            else:
                filtered_match_ids[possible_match_key] = -1

lab_confirmed_flu["Flag"] = flags
lab_confirmed_flu

# In[ ]:

# Saving the result
assert path[-4:] == ".csv", improper_arg_msg
new_path = path[:-4] + ".flagged_duplicates.csv"
lab_confirmed_flu.to_csv(path_or_buf=new_path, index=False)

# In[ ]:

if len(all_matches_in_dataset) > 0:
    print("Dataset:", new_path)
    print("All matches in dataset:\n\n", all_matches_in_dataset)