forked from wireservice/csvkit
-
Notifications
You must be signed in to change notification settings - Fork 0
/
csvjoin
executable file
·105 lines (79 loc) · 4.69 KB
/
csvjoin
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
#!/usr/bin/env python
import sys
from csvkit import CSVKitReader, CSVKitWriter
from csvkit import join
from csvkit.cli import CSVFileType, CSVKitUtility, match_column_identifier
class CSVJoin(CSVKitUtility):
description = 'Execute a SQL-like join to merge CSV files on a specified column or columns.'
epilog = 'Note that the join operation requires reading all files into memory. Don\'t try this on very large files.'
override_flags = 'f'
def add_arguments(self):
self.argparser.add_argument('files', metavar="FILES", nargs='+', type=CSVFileType(),
help='The CSV files to operate on. If only one is specified, it will be copied to STDOUT.')
self.argparser.add_argument('-c', '--columns', dest='columns',
help='The column name(s) on which to join. Should be either one name (or index) or a comma-separated list with one name (or index) for each file, in the same order that the files were specified. May also be left unspecified, in which case the two files will be joined sequentially without performing any matching.')
self.argparser.add_argument('--outer', dest='outer_join', action='store_true',
help='Perform a full outer join, rather than the default inner join.')
self.argparser.add_argument('--left', dest='left_join', action='store_true',
help='Perform a left outer join, rather than the default inner join. If more than two files are provided this will be executed as a sequence of left outer joins, starting at the left.')
self.argparser.add_argument('--right', dest='right_join', action='store_true',
help='Perform a right outer join, rather than the default inner join. If more than two files are provided this will be executed as a sequence of right outer joins, starting at the right.')
def main(self):
if len(self.args.files) < 2:
sys.exit('You must specify at least two files to join.')
if self.args.columns:
join_column_names = self._parse_join_column_names(self.args.columns)
if len(join_column_names) == 1:
join_column_names = join_column_names * len(self.args.files)
if len(join_column_names) != len(self.args.files):
sys.exit('The number of join column names must match the number of files, or be a single column name that exists in all files.')
if (self.args.left_join or self.args.right_join or self.args.outer_join) and not self.args.columns:
sys.exit('You must provide join column names when performing an outer join.')
if self.args.left_join and self.args.right_join:
sys.exit('It is not valid to specify both a left and a right join.')
tables = []
for f in self.args.files:
tables.append(list(CSVKitReader(f, **self.reader_kwargs)))
join_column_ids = []
for i, t in enumerate(tables):
join_column_ids.append(match_column_identifier(t[0], join_column_names[i]))
jointab = []
if self.args.left_join:
# Left outer join
jointab = tables[0]
for i, t in enumerate(tables[1:]):
jointab = join.left_outer_join(jointab, join_column_ids[0], t, join_column_ids[i + 1])
elif self.args.right_join:
# Right outer join
jointab = tables[-1]
remaining_tables = tables[:-1]
remaining_tables.reverse()
for i, t in enumerate(remaining_tables):
jointab = join.right_outer_join(t, join_column_ids[-(i + 2)], jointab, join_column_ids[-1])
elif self.args.outer_join:
# Full outer join
jointab = tables[0]
for i, t in enumerate(tables[1:]):
jointab = join.full_outer_join(jointab, join_column_ids[0], t, join_column_ids[i + 1])
else:
if self.args.columns:
# Inner join
jointab = tables[0]
for i, t in enumerate(tables[1:]):
jointab = join.inner_join(jointab, join_column_ids[0], t, join_column_ids[i + 1])
else:
jointab = tables[0]
# Sequential join
for t in tables[1:]:
jointab = join.sequential_join(jointab, t)
output = CSVKitWriter(sys.stdout, **self.writer_kwargs)
for row in jointab:
output.writerow(row)
def _parse_join_column_names(self, join_string):
"""
Parse a list of join columns.
"""
return map(str.strip, join_string.split(','))
if __name__ == '__main__':
utility = CSVJoin()
utility.main()