csvgrep

#!/usr/bin/env python

import re
import sys

from csvkit import CSVKitReader, CSVKitWriter
from csvkit.cli import CSVKitUtility, CSVFileType, parse_column_identifiers, print_column_names
from csvkit.grep import FilteringCSVReader

class CSVGrep(CSVKitUtility):
    """Command-line utility that filters the rows of a CSV file by a pattern.

    The header row is always written to the output. After it, only the rows
    yielded by ``FilteringCSVReader`` for the selected columns and pattern
    are written.
    """

    description = 'Search CSV files. Like the unix "grep" command, but for tabular data.'
    # NOTE(review): presumably tells CSVKitUtility not to register its own
    # file argument, since add_arguments() defines FILE itself -- confirm
    # against the base class.
    override_flags = 'f'

    def add_arguments(self):
        """Register csvgrep's options and positional arguments on the parser."""
        self.argparser.add_argument('-n', '--names', dest='names_only', action='store_true',
                        help='Display column names and indices from the input CSV and exit.')
        self.argparser.add_argument('-c', '--columns', dest='columns',
                        help='A comma separated list of column indices or names to be searched.')
        self.argparser.add_argument('-r', '--regex', dest='regex', action='store_true',
                        help='If specified, the search pattern will be treated as a Python regular expression.')
        self.argparser.add_argument('-i', '--invert-match', dest='inverse', action='store_true',
                        help='If specified, select non-matching instead of matching rows.')
        # NOTE(review): PATTERN and FILE are both optional positionals, so a
        # lone positional argument is bound to PATTERN. `csvgrep -n data.csv`
        # therefore appears to read STDIN rather than data.csv -- confirm
        # whether that is intended.
        self.argparser.add_argument('pattern', metavar="PATTERN", nargs='?',
                        help='The pattern to search for.')
        self.argparser.add_argument('file', metavar="FILE", nargs='?', type=CSVFileType(), default=sys.stdin,
                        help='The CSV file to operate on. If omitted, will accept input on STDIN.')

    def main(self):
        """Run the search and write the header plus selected rows to STDOUT.

        With ``-n`` the column names are printed and the process exits.
        Otherwise a pattern is required; if it is missing (or empty) the
        process exits with an error message.
        """
        if self.args.names_only:
            print_column_names(self.args.file, sys.stdout, **self.reader_kwargs)
            sys.exit()

        if not self.args.pattern:
            sys.exit('A pattern must be specified unless using the -n option.')

        rows = CSVKitReader(self.args.file, **self.reader_kwargs)
        # Use the builtin next() rather than the Python 2-only .next() method
        # so the header read works on both Python 2.6+ and Python 3.
        column_names = next(rows)

        column_ids = parse_column_identifiers(self.args.columns, column_names)

        # With -r the pattern is compiled once up front; otherwise the raw
        # string is handed to the filter unchanged.
        if self.args.regex:
            pattern = re.compile(self.args.pattern)
        else:
            pattern = self.args.pattern

        # Every selected column is searched with the same pattern object.
        patterns = dict.fromkeys(column_ids, pattern)

        output = CSVKitWriter(sys.stdout, **self.writer_kwargs)
        output.writerow(column_names)

        # The header row has already been consumed from `rows` above, hence
        # header=False here.
        filter_reader = FilteringCSVReader(rows, header=False, patterns=patterns, inverse=self.args.inverse)

        for row in filter_reader:
            output.writerow(row)
                
if __name__ == "__main__":
    # Script entry point: build the utility and run it immediately.
    CSVGrep().main()