-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathanonymize_txt.py
115 lines (98 loc) · 4.59 KB
/
anonymize_txt.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
import argparse
from text_anonymizer import TextAnonymizer
def main():
    """Anonymize a plain-text file from the command line.

    Reads ``source_file``, groups its non-blank lines into documents, passes
    each document to ``TextAnonymizer.anonymize`` and writes the
    whitespace-normalized results to ``target_file`` using the same encoding
    as the source.  The combined statistics are printed at the end; when
    ``--debug true`` is given, each original/anonymized pair and the combined
    details are printed as well.

    Raises:
        SystemExit: with code -1 when reading, anonymizing or writing fails.
            argparse also raises it for invalid arguments and ``--help``.
    """
    parser = argparse.ArgumentParser(
        description='Anonymize txt file',
        # The example must only use options that this script defines.
        epilog="Example: python anonymize_txt.py file_in.txt file_out.txt --languages=fi,en",
    )
    parser.add_argument('source_file', type=str, help='Text file to be anonymized')
    parser.add_argument('target_file', type=str, help='Name or path of (anonymized) destination file.')
    parser.add_argument('--debug', type=str, help='Toggle debug logging. Shows scores within labels. (True/False)')
    parser.add_argument('--languages', type=str, help='Selected languages (comma separated). Default: fi,en')
    parser.add_argument('--encoding', type=str, help='Source encoding. Default: UTF-8')
    parser.add_argument('--separator', type=str, help='String separator for newlines. Default: None')
    args = parser.parse_args()

    source_file = args.source_file
    target_file = args.target_file
    # Both positionals are required by argparse, but an empty string still
    # gets through; reject it here instead of failing later on an unusable path.
    if not source_file or not target_file:
        parser.error('source_file and target_file must not be empty')

    debug = bool(args.debug) and args.debug.lower() == "true"
    languages = args.languages.split(',') if args.languages else ['fi', 'en']
    source_encoding = args.encoding or 'UTF-8'
    separator = args.separator or None

    print("Anonymizing file: {i}. ".format(i=source_file))
    print("")
    print("Parameters:")
    print("- Source file: {s}".format(s=source_file))
    print("- Target file: {s}".format(s=target_file))
    print("- Debug mode: {s}".format(s=debug))
    print("")

    text_anonymizer = TextAnonymizer(languages=languages, debug_mode=debug)
    statistics = []
    details = []

    def anonymize(doc: list) -> tuple:
        """Anonymize one document given as a list of text lines.

        Returns ``(anonymized_text, statistics, details)`` read from the
        anonymizer result, or ``(None, None, None)`` when the joined text is
        empty.
        """
        text = ' '.join(doc)
        if not text:
            return None, None, None
        result = text_anonymizer.anonymize(text)
        return result.anonymized_text, result.statistics, result.details

    def prepare_raw_text(line):
        """Collapse each run of whitespace in *line* to one space and trim it."""
        # str.replace() matches its argument literally, so replacing '\s+'
        # never changed anything; split/join does the intended normalization.
        return ' '.join(line.split())

    try:
        # Read the source completely before opening the target: mode 'w+'
        # truncates immediately, which would wipe the input when both paths
        # are the same and would leave an empty target if reading fails.
        with open(source_file, mode='r', newline='', encoding=source_encoding) as in_file:
            lines = in_file.readlines()

        # use same encoding for source and target file
        with open(target_file, mode='w+', newline='', encoding=source_encoding) as outfile:
            doc = []
            newline_counter = 0
            total_lines = len(lines)
            for line_counter, line in enumerate(lines, start=1):
                # newline='' keeps '\r\n' endings untranslated, so a blank
                # line is detected by stripping rather than comparing to '\n'.
                if line.strip():
                    doc.append(prepare_raw_text(line))
                else:
                    newline_counter += 1

                # NOTE(review): the counter is not reset by text lines, so a
                # document is flushed after every second blank line whether or
                # not the two are consecutive - confirm this is intended.
                if newline_counter < 2 and line_counter < total_lines:
                    continue

                newline_counter = 0
                anonymized, stats, detail = anonymize(doc)
                if anonymized:
                    anonymized = ' '.join(anonymized.split())
                if stats:
                    statistics.append(stats)
                if detail:
                    details.append(detail)
                if debug and doc:
                    print('>>> Original: ')
                    # Lines are trimmed now, so join with a space to show the
                    # exact text that was handed to the anonymizer.
                    print(' '.join(doc))
                    print('>>> Anonymized: ')
                    print(anonymized)
                    print('---')
                doc = []
                if anonymized:
                    # NOTE(review): without --separator the documents are
                    # written back-to-back with nothing in between.
                    outfile.write(anonymized)
                    if separator:
                        # The placeholder was previously written verbatim
                        # because the string was never formatted.
                        outfile.write("\n{separator}\n".format(separator=separator))
    except Exception as e:
        print("Error: ", e)
        if 'codec' in str(e):
            print("Hint: Possibly invalid encoding. Please check the encoding of the source file. Use --encoding=... option to set the correct encoding.")
        # exit() is a helper injected by the site module and may be missing;
        # raising SystemExit gives the same exit status without relying on it.
        raise SystemExit(-1)

    combined_stats = text_anonymizer.combine_statistics(statistics)
    combined_details = text_anonymizer.combine_details(details)
    print("Statistics: ", combined_stats)
    if debug:
        print("Details: ", combined_details)
    print("\nFinished. Wrote anonymized version to: "+target_file)
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()