forked from wireservice/csvkit
-
Notifications
You must be signed in to change notification settings - Fork 0
/
csvstat
executable file
·106 lines (81 loc) · 3.17 KB
/
csvstat
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
#!/usr/bin/env python
import datetime
import random
import sys
from csvkit import table
from csvkit.cli import CSVKitUtility
from heapq import nlargest
from operator import itemgetter
class CSVStat(CSVKitUtility):
    """
    Command-line utility that writes descriptive statistics for every column
    of a CSV file to standard output.

    Written for Python 2: it relies on the ``unicode`` builtin.
    """
    description = 'Print descriptive statistics for each column in a CSV file.'
    # NOTE(review): presumably tells the base class to drop its 'l' flag --
    # confirm against CSVKitUtility.
    override_flags = 'l'

    def add_arguments(self):
        """
        Add utility-specific arguments; this utility defines none.
        """
        pass

    def main(self):
        """
        Load the input file into a table and report on each column.

        For every column the header line (position and name) is written,
        followed by:

        * ``Empty column`` when the column has no type, or
        * the type and nullability, and then either the full list of
          distinct values (at most five distinct values, non-boolean
          columns) or summary statistics: min/max for non-text, non-boolean
          columns; sum, mean and median for int/float columns; the number
          of unique values; the most frequent values when any value
          repeats; and the maximum length for text columns.

        A blank line and the row count are written after the last column.
        """
        tab = table.Table.from_csv(self.args.file, **self.reader_kwargs)

        for c in tab:
            # Nulls are excluded from every statistic below.
            values = sorted(v for v in c if v is not None)
            uniques = set(c)
            uniques.discard(None)

            sys.stdout.write((u'%3i. %s\n' % (c.order + 1, c.name)).encode('utf-8'))

            if c.type is None:
                sys.stdout.write(u'\tEmpty column\n')
                continue

            sys.stdout.write(u'\t%s\n' % c.type)
            sys.stdout.write(u'\tNulls: %s\n' % (u'Yes' if c.nullable else u'No'))

            if len(uniques) <= 5 and c.type is not bool:
                # Convert each value to text first: join() raises TypeError
                # on non-string items such as ints or dates.
                listing = u', '.join([unicode(u) for u in uniques])
                sys.stdout.write((u'\tValues: %s\n' % listing).encode('utf-8'))
            else:
                # Skip min/max for strings and bools
                if c.type not in (unicode, bool):
                    minval = min(values)
                    maxval = max(values)

                    if c.type in (datetime.datetime, datetime.date, datetime.time):
                        minval = minval.isoformat()
                        maxval = maxval.isoformat()

                    # Write the (possibly ISO-formatted) values computed
                    # above rather than recomputing the raw min/max.
                    sys.stdout.write(u'\tMin: %s\n' % minval)
                    sys.stdout.write(u'\tMax: %s\n' % maxval)

                    if c.type in (int, float):
                        total = sum(values)
                        sys.stdout.write(u'\tSum: %s\n' % total)
                        # float() prevents Python 2 integer division from
                        # truncating the mean of an int column.
                        sys.stdout.write(u'\tMean: %s\n' % (float(total) / len(values)))
                        sys.stdout.write(u'\tMedian: %s\n' % median(values))

                sys.stdout.write(u'\tUnique values: %i\n' % len(uniques))

                # Only show frequencies when at least one value repeats.
                if len(uniques) != len(values):
                    sys.stdout.write(u'\t5 most frequent values:\n')
                    for top in freq(values):
                        sys.stdout.write(u'\t\t%s:\t%s\n' % (top[0], top[1]))

                if c.type is unicode:
                    sys.stdout.write(u'\tMax length: %i\n' % c.max_length)

        sys.stdout.write(u'\n')
        sys.stdout.write(u'Row count: %s\n' % tab.count_rows())
def median(l):
    """
    Compute the median of a list.

    The list is indexed at its middle position(s) directly and is not
    sorted here, so the caller must pass values already in ascending order.

    For an odd number of items the middle item is returned as-is; for an
    even number the two middle items are averaged and a float is returned.

    Raises IndexError if the list is empty.
    """
    length = len(l)
    # Floor division keeps the index an int on Python 3 as well as Python 2.
    middle = length // 2

    if length % 2 == 1:
        return l[middle]

    return float(l[middle - 1] + l[middle]) / 2
def freq(l, n=5):
    """
    Count the number of times each value occurs in a column and return the
    most frequent ones.

    Values are tallied by their ``str()`` representation. The result is a
    list of at most ``n`` ``(value_as_str, count)`` tuples, highest count
    first. ``n`` defaults to 5.
    """
    count = {}

    for x in l:
        # NOTE(review): on Python 2, str() of a non-ASCII unicode value
        # raises UnicodeEncodeError -- confirm the inputs are safe.
        s = str(x)
        count[s] = count.get(s, 0) + 1

    # Pick the n entries with the largest counts as (key, value) tuples.
    # items() is used because it exists on both Python 2 and Python 3.
    return nlargest(n, count.items(), key=itemgetter(1))
# Run the utility only when this file is executed as a script.
if __name__ == '__main__':
    CSVStat().main()