-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy path3_process_text.py
133 lines (107 loc) · 4.56 KB
/
3_process_text.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
import numpy as np
import pandas as pd
import sys
import csv
import os
print 'Number of arguments:', len(sys.argv), 'arguments.'
print 'Argument List:', str(sys.argv)
if len(sys.argv) >= 3:
filename_in = sys.argv[1]
filename_labels = sys.argv[2]
elif len(sys.argv) == 2:
print "Only one argument found, please provide path to pmids and labels file"
else:
print "No command line arguments found."
filename_out = filename_in.replace(".csv", "_processed.txt")
print "Targeting file " + filename_in + " for processing into " + filename_out
print "Using labels file: " + filename_labels
#### Process text
## define a function that takes a dataframe with text in field text
## removes copyright notices, and normalizes whitespaces to a single space
def process(df):
# removing whitespaces, punctuations, stopwords, and stemming words
documents = []
failures = []
import re
pattern = re.compile('[\W_]+')
for document in df['text']:
try:
## remove trailing terms like Elsevier and All rights reserved
document = re.sub(string = document.lower(), pattern = 'copyright.{0,100}?$', repl = '')
## remove anything after a copyright.
## should check for "published" or "rights reserved"
## remove everything except alphanumerics and whitespace
## word2vec can handle multiple whitespaces between tokens
## combine into one string
outstr = pattern.sub(' ', document)
## append outstr
documents.append(outstr)
except:
documents.append(np.nan)
## record failures
failures.append(df[df['text'] == document].index.tolist())
continue
return [documents, failures]
#### Load, process, save
## loads a file, processes and saves the result
## takes a single integer as input
## check file exists, skip if not
if not os.path.isfile(filename_in):
print 'Failed to find input file: ' + filename_in
## Reading raw file with pmid, year, title, and abstracts
## this file is created after extraction from XML
raw = pd.read_table(filename_in, sep="|", quotechar='"', error_bad_lines=False)
## remove years 2000 and earlier
#year_mask = raw['year'] >= 2000 # removed for ctsa labeled articles
#raw_concat = raw[year_mask].copy()
raw_concat = raw.copy()
## combine title and abstracts into one text field and drop the parents
raw_concat["text"] = raw_concat["title"] + " " + raw_concat["abstract"]
#raw_concat.drop('pmid', axis=1, inplace=True) ## for labeled articles, you want to keep pmid to merge on later!
raw_concat.drop('year', axis=1, inplace=True)
raw_concat.drop('title', axis=1, inplace=True)
raw_concat.drop('abstract', axis=1, inplace=True)
## check for and remove any rows with null text field
## don't care if pmid or year is incomplete here
print(raw_concat.shape)
df = raw_concat[raw_concat['text'].notnull()]
#### subset of CTSA LABELED data.
## first load labels
labels = pd.read_csv(filename_labels)
def add_label_tag(num):
s = "__label__" + str(int(num))
return s
labels["label"] = map(add_label_tag, labels["group"])
labels.drop('group', axis=1, inplace=True)
## merge df onto labels on pmid
df_labels = labels.merge(df, on = 'pmid', how = 'inner')
## inner to only keep the articles with nonnull text
## check for pmids that don't appear - should be empty
print "PMIDs that get removed, this should be an empty list:"
print list(set(labels["pmid"]) - set(df_labels["pmid"]))
## this was an issue with the original XML not containing all pmids
## fixed with updated R parsing script.
# cleanup and drop
#df_labels.drop('pmid', axis=1, inplace=True)
print(df_labels.shape)
## the result is a nice ordered data.frame.
#### Preprocess
## now apply processing function to the real df
df_labels["output"] = process(df_labels)[0] # [0] for output not failures
print df_labels.shape
#df["output"] = process(df)[0] # [0] for output not failures
#print df.shape
#print df.head()
## investigate failures!
#failures = process_documents(df)[1]
#df_failures = df.iloc[[i[0] for i in failures],:]
#print "Failures occured on: " + str(df_failures.shape[0]) + " documents."
#df_failures.head()
## make a copy and filter for non null output strings
df_tosave = df_labels.copy()[df_labels['output'].notnull()]
#df_tosave = df.copy()[df['output'].notnull()]
df_tosave.drop('text', axis=1, inplace=True)
print df_tosave.shape
df_tosave.to_csv(filename_out, sep = ",", header=False, index=False, quoting=csv.QUOTE_NONE)
## save without header or index/rownames or quotes around the string.
print "Done!"