forked from munibanust/febrl
-
Notifications
You must be signed in to change notification settings - Fork 0
/
tcsv.py
296 lines (243 loc) · 11.6 KB
/
tcsv.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
# =============================================================================
# tcsv.py - Module for parsing malformed CSV and other delimited files
#
# Freely extensible biomedical record linkage (Febrl) Version 0.2.2
# See http://datamining.anu.edu.au/projects/linkage.html
#
# =============================================================================
# AUSTRALIAN NATIONAL UNIVERSITY OPEN SOURCE LICENSE (ANUOS LICENSE)
# VERSION 1.1
#
# The contents of this file are subject to the ANUOS License Version 1.1 (the
# "License"); you may not use this file except in compliance with the License.
# Software distributed under the License is distributed on an "AS IS" basis,
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License for
# the specific language governing rights and limitations under the License.
# The Original Software is "tcsv.py".
# The Initial Developers of the Original Software are Dr Peter Christen
# (Department of Computer Science, Australian National University) and Dr Tim
# Churches (Centre for Epidemiology and Research, New South Wales Department
# of Health). Copyright (C) 2002, 2003 the Australian National University and
# others. All Rights Reserved.
# Contributors:
#
# =============================================================================
"""Module tcsv.py - Module for parsing malformed CSV and other delimited files.
Author: Tim Churches
"""
# -----------------------------------------------------------------------------
import re # Regular expression module
# -----------------------------------------------------------------------------
class delimited_parser:
    """A parser for comma-separated value (CSV) and other delimited strings.

    The parser is deliberately forgiving: unbalanced quotes, quotes embedded
    in the middle of a field and escaped delimiters are all handled on a
    best-effort basis instead of raising an error.

    All the '*_chars' attributes are plain strings of single characters. They
    are escaped before being placed into regular expressions, so characters
    which are special to the re module (such as a backslash, '^' or ']') can
    safely be used. An empty string means 'no such character'.

    Maybe change 'null_value' to 'missing' ?? (PC, 22/8/2002)
    """

    def __init__(self, delimiter_chars=',', quote_chars='"', escape_chars='/',
                 comment_chars='#', null_chars='.',
                 null_strings=None, null_value='',
                 as_strings=0):
        """Creates an instance of a delimited string parser, which parses a
        delimited string into a sequence of fields.

        ATTRIBUTES:
          delimiter_chars  A string of characters each of which may act as a
                           field delimiter.
          quote_chars      A string of characters, each of which may act as a
                           quoting character enclosing a field. The first
                           character in this string is substituted for all
                           the others in the output.
          escape_chars     A string of characters, each of which may act as an
                           escaping character for delimiter characters
                           embedded in fields.
          comment_chars    A string of characters, each of which signifies a
                           comment line if it appears as the first character
                           on that line (that is, as the first character in
                           line).
          null_chars       A string of characters, each of which is
                           interpreted as signifying a null or missing value
                           in line.
          null_strings     A sequence of strings, each of which is interpreted
                           as signifying a null or missing value in line.
                           If not given, a fresh list containing 'null',
                           'None', 'none' and 'missing' is used.
          null_value       The value to return when an element of null_chars
                           or null_strings is encountered.
          as_strings       If true, return each field as a string, not as an
                           appropriate data type.
        """
        # Build the default list per instance so that parsers never share
        # (and accidentally modify) one common list object.
        if null_strings is None:
            null_strings = ['null', 'None', 'none', 'missing']

        self.delimiter_chars = delimiter_chars
        self.quote_chars = quote_chars
        self.escape_chars = escape_chars
        self.comment_chars = comment_chars
        self.null_chars = null_chars
        self.null_strings = null_strings
        self.null_value = null_value
        self.as_strings = as_strings

    # -------------------------------------------------------------------------

    def _char_class(self, chars):
        """Returns a regular expression fragment matching any one character
        of the string 'chars'.

        The characters are escaped so that none of them is interpreted as
        regular expression syntax. For an empty string a fragment that can
        never match is returned (an empty '[]' would not compile).
        """
        if not chars:
            return '(?!)'
        return '[' + re.escape(chars) + ']'

    # -------------------------------------------------------------------------

    def typefld(self, fld):
        """Returns an appropriate type of value depending on contents of fld,
        which should be a string. Utility function used by parse().

        If a string is a number but starts with a '0' it is handled as a
        normal string (useful for e.g. postcodes like '0200'). Thanks to
        Marion Sturtevant for pointing this out.

        ARGUMENTS:
          fld  A string to be returned as the appropriate type

        RETURNS:
          An integer or a floating-point number if the stripped field starts
          with '.' or a digit from 1 to 9 and can be converted, the
          'null_value' attribute if the field is empty or listed in
          'null_strings' or 'null_chars', and the stripped string otherwise.
        """
        fld = fld.strip()

        # Check if it's a number not starting with a '0'. int() already
        # handles arbitrarily large values, so no separate long() is needed.
        #
        if fld and (fld[0] in '.123456789'):
            for convert in (int, float):
                try:
                    return convert(fld)
                except ValueError:
                    pass  # Not this kind of number, try the next conversion

        # Assume it's a string (handle numbers starting with a '0' as strings)
        #
        if (fld == '') or (fld in self.null_strings):
            return self.null_value
        if (len(fld) == 1) and (fld in self.null_chars):
            return self.null_value
        return fld

    # -------------------------------------------------------------------------

    def parse(self, line):
        """Parses the argument line into a sequence of fields.

        ARGUMENTS:
          line  The string to be split into fields

        RETURNS:
          None if the line starts with one of the comment characters,
          otherwise a list with one entry per field. Unquoted fields are
          converted with typefld(); quoted and escaped fields are returned as
          strings. If the 'as_strings' attribute is true every entry is
          converted to a string.
        """
        # The patterns are built on every call because the '*_chars'
        # attributes may be changed between calls.
        #
        quote = self._char_class(self.quote_chars)

        # The capturing group makes split() also return the delimiters, which
        # are needed to re-join fields that were split inside quotes.
        #
        delimiter_pattern = re.compile(
            '(' + self._char_class(self.delimiter_chars) + ')')
        quote_pattern = re.compile(quote)
        embedded_quote_pattern = re.compile(
            '^(.*)(?:' + quote + ')(.*)(?:' + quote + ')(.*)$')
        single_embedded_quote_pattern = re.compile(
            '^(.*)(?:' + quote + ')(.*)$')
        even_dequote_pattern = re.compile(
            '(?:^' + quote + ')(.*)(?:' + quote + '$)')
        odd_opening_dequote_pattern = re.compile('(?:^' + quote + ')(.*)')
        odd_closing_dequote_pattern = re.compile('(.*)(?:' + quote + '$)')
        escape_pattern = re.compile(self._char_class(self.escape_chars) + '$')
        comment_pattern = re.compile(
            '^' + self._char_class(self.comment_chars))

        # Comment lines contain no fields at all (regardless of as_strings)
        #
        if comment_pattern.search(line) is not None:
            return None

        # An even number of quotes in the whole line means that an opening
        # quote can be expected to be closed further along the line.
        #
        balanced_quotes = (len(quote_pattern.findall(line)) % 2 == 0)

        pieces = delimiter_pattern.split(line)
        raw_fields = pieces[0::2]
        delimiters = [''] + pieces[1::2]  # Delimiter in front of each field

        parsed_line = []
        quoted = 0   # Currently inside a quoted field that was split
        escaped = 0  # Previous field ended with an escaped delimiter

        for delimiter, fld in zip(delimiters, raw_fields):
            n_quotes = len(quote_pattern.findall(fld))

            if (n_quotes > 1) and (n_quotes % 2 == 0):
                # Field contains matching quotes
                #
                fld = fld.strip()
                enclosed = even_dequote_pattern.search(fld)
                embedded = embedded_quote_pattern.search(fld)
                if enclosed is not None:  # Whole field quoted, drop quotes
                    parsed_line.append(enclosed.group(1))
                elif embedded is not None:  # Quotes inside the field
                    first_quote = self.quote_chars[0]
                    fld = embedded.group(1) + first_quote + \
                        embedded.group(2) + first_quote + ' ' + \
                        embedded.group(3)
                    parsed_line.append(fld.strip())
                else:
                    parsed_line.append(self.typefld(fld))

            elif (n_quotes % 2 == 1) and (not quoted):
                # Either the start of a quoted field that contains
                # delimiters, or a stray quote within a field
                #
                fld = fld.lstrip()
                opening = odd_opening_dequote_pattern.search(fld)
                stray = single_embedded_quote_pattern.search(fld)
                if opening is not None:
                    parsed_line.append(opening.group(1))
                    quoted = 1
                elif stray is not None:
                    parsed_line.append(stray.group(1) + self.quote_chars[0] +
                                       stray.group(2))
                else:
                    parsed_line.append(fld)  # Never silently drop a field

            elif (n_quotes % 2 == 1) and quoted:
                # End of a quoted field, re-join with the original delimiter
                #
                fld = fld.rstrip()
                closing = odd_closing_dequote_pattern.search(fld)
                if closing is not None:
                    fld = closing.group(1)
                # Otherwise the quote is not at the end of the field: keep
                # the text unchanged rather than failing on malformed input.
                parsed_line[-1] = parsed_line[-1] + delimiter + fld
                quoted = 0

            elif (n_quotes == 0) and quoted:
                if balanced_quotes:  # Still within the quoted field
                    parsed_line[-1] = parsed_line[-1] + delimiter + fld
                else:  # Opening quote is never closed, give up on it
                    parsed_line.append(fld)
                    quoted = 0

            elif (escape_pattern.search(fld) is not None) and (not escaped):
                # Field ends with an escape character, so the delimiter
                # following it is part of the field value
                #
                parsed_line.append(fld[:-1])
                escaped = 1

            elif escaped:
                if escape_pattern.search(fld) is None:
                    escaped = 0
                    parsed_line[-1] = parsed_line[-1] + delimiter + fld
                else:  # Yet another escaped delimiter follows
                    parsed_line[-1] = parsed_line[-1] + delimiter + fld[:-1]

            else:
                parsed_line.append(self.typefld(fld))

        if self.as_strings:
            return [str(fld) for fld in parsed_line]
        return parsed_line

    # -------------------------------------------------------------------------

    def testparse(self, line):
        """Prints the argument line and the result of parsing it, followed by
        an empty line. Used by the test examples of this module.
        """
        # A single pre-formatted argument gives the same output with both the
        # print statement and the print function.
        print("Input: %s" % (line,))
        print("Output: %s" % (self.parse(line),))
        print("")
# ---------------------------------------------------------------------------
# Various test examples, call module as command line argument: 'python tcsv.py'
if __name__ == "__main__":

    p = delimited_parser()  # Get a parsing object

    # Each demonstration step consists of the heading to print, an optional
    # (attribute name, value) pair to set on the parser after the heading has
    # been printed, and the lines to parse with that setting.
    #
    demo_steps = [
        ("With defaults", None, [
            '1, 2,"3",4,"five, cinque or lima",6," seven ",8,8.5,9,0',
        ]),
        ("With as_strings=1", ('as_strings', 1), [
            '1, 2,"3",4,"five, cinque or lima",6," seven ",8,8.5,9,0',
        ]),
        ("With defaults", ('as_strings', 0), [
            '1, 2,"3 ", 4 , "five, cinque or lima",6," seven ",8,9,0',
            '1, 2,"3 ", 4 , "five, cinque or lima,6," seven ",8,9,0',
            '1, 2,"3 ", 4 , five, cinque or lima",6," seven ",8,9,0',
        ]),
        ("With delimiter_chars='\t'", ('delimiter_chars', '\t'), [
            '1\t2\t3\tHello world\t4',
            '1, 2,"3 ", 4 , five/, cinque or lima,6," seven ",8,9,0',
        ]),
        ("With delimiter_chars=',;'", ('delimiter_chars', ',;'), [
            '1,2;3,4;5;6,7,8,9;0',
            '1,2;.,4;5;6,null,8,9;0',
            '1,2;.,4;5;6,null,,98765432198765432198721987654321987654321;0',
        ]),
        ("With defaults", ('delimiter_chars', ','), [
            'one, two,three, "four,five,six,,,,seven", eight,"nine , ten",eleven',
            'one, two,three, four/,five/,six/,/,/,/,seven, eight,"nine , ten",eleven',
            '#one, two,three, four/,five/,six/,/,/,/,seven, eight,"nine , ten",eleven',
            '1, 2,"3 ", 4 , "five cinque"" or lima",6," seven ",8,9,0',
            '1, 2,"3 ", 4 , five, cinque"" or lima,6," seven ",8,9,0',
        ]),
        ("With delimiter_chars=' '", ('delimiter_chars', ' '), [
            '1 2 3 4 5 6 7 8 9 0',
            'words delimited by double spaces',
        ]),
        ("With delimiter_chars=' '", ('delimiter_chars', ' '), [
            'words delimited by double spaces',
        ]),
        ("With defaults", ('delimiter_chars', ','), [
            '1988,"ayr" gobi rd,ulan bator,mongolia,3456',
            '1988,""ayr"" gobi rd,ulan bator,mongolia,3456',
            ' 1988, 13th flr "tumbi-umbi st, ulan bator ,mongolia, 3456 ',
            '1996,"old mill culkinny rd,brookhome,Missing,2333,',
        ]),
        ("With quote_chars set to double and single quotes",
         ('quote_chars', '"' + "'"), [
            '1, 2,"3",4,' + "'five, cinque or lima',6,' seven ',8,8.5,9,0",
            '1988,"ayr'+ "' gobi rd,ulan bator,mongolia,3456",
        ]),
    ]

    for heading, setting, test_lines in demo_steps:
        print(heading)
        if setting is not None:
            setattr(p, setting[0], setting[1])
        for test_line in test_lines:
            p.testparse(test_line)