forked from wireservice/csvkit
-
Notifications
You must be signed in to change notification settings - Fork 0
/
csvclean
executable file
·66 lines (55 loc) · 2.75 KB
/
csvclean
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
#!/usr/bin/env python
from os.path import splitext
import sys
from csvkit import CSVKitReader, CSVKitWriter
from csvkit.cli import CSVKitUtility
from csvkit.cleanup import RowChecker
class CSVClean(CSVKitUtility):
    """Command-line utility that runs a CSV file through ``RowChecker``.

    In normal mode the rows yielded by ``RowChecker.checked_rows()`` are
    written to ``<base>_out.csv`` and, if the checker recorded any errors,
    those are written to ``<base>_err.csv`` (``<base>`` is the input file
    name with its extension removed).  With ``-n``/``--dry-run`` no files
    are created and only the report is printed.
    """

    description = 'Fix errors in a CSV file.'

    def add_arguments(self):
        """Register the ``-n``/``--dry-run`` flag on the argument parser."""
        self.argparser.add_argument(
            '-n', '--dry-run', dest='dryrun', action='store_true',
            # The report is written with sys.stdout.write() in main(), so the
            # help text names STDOUT.
            help='If this argument is present, no output will be created. '
                 'Information about what would have been done will be '
                 'printed to STDOUT.')

    def main(self):
        """Check the input file, then report on it or write the result files.

        All status messages are written to ``sys.stdout``.  In dry-run mode
        nothing is written to disk.  Otherwise ``<base>_out.csv`` is always
        created and ``<base>_err.csv`` is created only when the checker
        recorded errors.  Both files are closed before the summary is
        printed.
        """
        reader = CSVKitReader(self.args.file, **self.reader_kwargs)

        if self.args.dryrun:
            checker = RowChecker(reader)
            # The rows themselves are discarded; presumably iterating is what
            # populates checker.errs and the join counters read below.
            for _row in checker.checked_rows():
                pass

            if checker.errs:
                for e in checker.errs:
                    sys.stdout.write("Line %i: %s\n" % (e.line_number, e.msg))
            else:
                sys.stdout.write("No errors.\n")

            if checker.joins:
                sys.stdout.write(
                    "%i rows would have been joined/reduced to %i rows "
                    "after eliminating expected internal line breaks.\n"
                    % (checker.rows_joined, checker.joins))
            return

        base, _ext = splitext(self.args.file.name)

        # should we preserve delimiters and other dialect args from CLI?
        # The `with` block guarantees the output file is flushed and closed
        # even if checking or writing raises part-way through.
        with open("%s_out.csv" % base, "w") as out_handle:
            cleaned_file = CSVKitWriter(out_handle, **self.writer_kwargs)
            checker = RowChecker(reader)
            cleaned_file.writerow(checker.column_names)
            for row in checker.checked_rows():
                cleaned_file.writerow(row)

        if checker.errs:
            # should we preserve delimiters and other dialect args from CLI?
            err_filename = "%s_err.csv" % base
            with open(err_filename, "w") as err_handle:
                err_file = CSVKitWriter(err_handle, **self.writer_kwargs)
                # Error rows carry two extra leading columns ahead of the
                # original column names.
                err_header = ['line_number', 'msg']
                err_header.extend(checker.column_names)
                err_file.writerow(err_header)
                for e in checker.errs:
                    err_file.writerow(self._format_error_row(e))

            err_count = len(checker.errs)
            sys.stdout.write(
                "%i error%s logged to %s\n"
                % (err_count, "" if err_count == 1 else "s", err_filename))
        else:
            sys.stdout.write("No errors.\n")

        if checker.joins:
            sys.stdout.write(
                "%i rows were joined/reduced to %i rows "
                "after eliminating expected internal line breaks.\n"
                % (checker.rows_joined, checker.joins))

    def _format_error_row(self, e):
        """Return the error-file row for the checker error ``e``.

        The row is ``e.line_number``, then ``e.msg``, followed by the values
        of ``e.row``, matching the ``line_number, msg, <columns...>`` header
        written by ``main``.
        """
        err_row = [e.line_number, e.msg]
        err_row.extend(e.row)
        return err_row
if __name__ == '__main__':
    # Script entry point: build the utility and run it straight away.
    CSVClean().main()