# bbab2csv

#!/usr/bin/python
#
# bbab2csv - BlackBerry Address Book to CSV
# Copyright 2011 Brandon Mintern, bmintern@gmail.com
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
# Hosted on github
 
# Help text that is logged to STDERR when the program is started without a
# TARBALL argument or when the given TARBALL cannot be opened.
usage = """
Usage: This program is designed to process the result of a backup produced by
the barrybackup command line program. The result of such a backup is a tarball
at ~/.barry/backup/<PIN>/<PIN>-<YYYYMMDD>-<HHMMSS>.tar.gz. Inside that tarball
is a directory named "Address Book" which contains a binary file for each
contact.

To run it, execute
  bbab2csv TARBALL
where TARBALL is the file generated by barrybackup. The resulting CSV will
be printed to STDOUT, and unrecognized data will be printed to STDERR.
"""

# The layout of each BlackBerry contact file seems to be:
# 1. 2-byte little-endian field length l
# 2. field type byte
# 3. l-byte field
# 4. (1-3) repeated or 0x00
#
# The following list of tuples indicates the field types (see #2 above) we
# know about. This could be improved by moving the information to a separate
# file and then reading/parsing it here.
#
# The order in which field types appear is the same order in which the columns
# will be output in the resulting CSV.
#
# After running the program, any unknown fields will be reported to standard
# error. That information can be used to update this program to work better in
# the future.

# Column title of the contact's name. It is kept in its own variable because
# the output code also uses it as the sort key.
name = "Name"

# (field type byte, CSV column title) pairs. The order of this list is the
# order of the columns in the generated CSV.
fields = [
    (0x20, name),
    (0x56, "Nickname"),
    (0x21, "Organization"),
    (0x07, "Home Phone"),
    (0x08, "Mobile Phone"),
    (0x06, "Work Phone"),
    (0x11, "Home Phone 2"),
    (0x12, "Mobile Phone 2"),
    (0x0a, "BB PIN"),
    (0x3d, "Home Address"),
    (0x3e, "Home Address 2"),
    (0x45, "Home City"),
    (0x46, "Home State"),
    (0x47, "Home ZIP"),
    (0x40, "Home Directions"),
    (0x23, "Work Address"),
    (0x24, "Work Address 2"),
    (0x26, "Work City"),
    (0x27, "Work State"),
]

# Lookup table: field type byte -> column title
field_types = dict(fields)
# Column titles alone, in output order
field_names = [title for _type_byte, title in fields]

import sys, csv, tarfile
from os import path
from struct import unpack
from collections import defaultdict

def read_bb_field (fh):
    """
    Reads one field record from an open BlackBerry Address Book contact file.

    A record is a 2-byte little-endian length, a 1-byte field type, and then
    that many bytes of field data (to the best of my knowledge; see the
    layout notes above).

    Returns a tuple of (field_name, field_value), with field_name being the
    column title from field_types if it is a known field type and the integer
    type byte otherwise. Unknown field types are also reported through log().

    If the record header cannot be read completely (fewer than 2 length
    bytes, or no type byte after them), it returns (False, <read value>) to
    indicate a likely end-of-file condition.
    """
    # Read the first two bytes to determine the length of the field data
    header = fh.read(2)
    if len(header) < 2:
        return False, header
    length = unpack("<H", header)[0]
    # Read the 1-byte field type. In a file that ends right after the length,
    # this read comes back empty; unpacking it would raise struct.error, so
    # report it the same way as any other end of input.
    type_byte = fh.read(1)
    if len(type_byte) < 1:
        return False, header
    type_code = unpack("<B", type_byte)[0]
    # Read the field itself, assumed to be length bytes as indicated above
    field = fh.read(length)
    known_name = field_types.get(type_code)
    if known_name is None:
        # Log unknown field types. Note that we can improve this a lot by:
        #   1. Adding known irrelevant field types that aren't logged
        #   2. Generating a better string representation of the field than
        #      simply dumping the binary value in there as a string.
        log('Unknown field type: %x, value: "%s"' % (type_code, field))
        return type_code, field
    # All known fields are expected to be NULL-terminated strings. Strip the
    # NULL character, but only if it is really there, so that an empty or
    # unterminated field does not lose a byte of real data.
    if field[-1:] == b"\0":
        field = field[:-1]
    return known_name, field

def iterate_contacts (tarball):
    """
    Take an open tarfile object and yield open contact files from the Address
    Book, i.e. every member whose directory part is exactly "Address Book".

    Members for which the tarball cannot provide a file object (extractfile()
    returns None, e.g. for a sub-directory) are skipped.

    Each contact file is closed when iteration is resumed, and also when the
    generator is closed or the consumer raises while holding the file.
    """
    for member in tarball:
        if path.split(member.name)[0] != "Address Book":
            continue
        contact = tarball.extractfile(member)
        if contact is None:
            # Not a regular file or link, so there is nothing to read
            continue
        try:
            yield contact
        finally:
            # Runs on normal resumption as well as on early termination
            contact.close()

def log (msg):
    """
    Write msg, followed by a newline, to standard error.

    This uses sys.stderr.write() instead of the "print >>" statement, which
    only exists in Python 2, so the function itself runs on Python 2 and 3.
    """
    sys.stderr.write("%s\n" % (msg,))

# Open the backup tarball named on the command line. Every failure to do so
# logs the usage text to STDERR and exits with status 2.
if len(sys.argv) < 2:
    log(usage)
    sys.exit(2)
try:
    tarball = tarfile.open(sys.argv[1])
except IOError:
    log('\nNo such file: "%s"' % sys.argv[1])
    log(usage)
    sys.exit(2)
except tarfile.TarError:
    # The file could be opened but tarfile cannot handle it (for example, it
    # is not a tar archive at all)
    log('\nNot a readable tarball: "%s"' % sys.argv[1])
    log(usage)
    sys.exit(2)

contacts = []
try:
    for contactfile in iterate_contacts(tarball):
        # We'll store each field from the contact file in a dict mapping field
        # name to list of field values. It's important to store it as a list
        # because the name field (for example) appears more than once
        contact = defaultdict(list)
        while True:
            field_type, field = read_bb_field(contactfile)
            if field_type is False: # end of input
                break
            contact[field_type].append(field)
        contacts.append(contact)
finally:
    # We're done with the tarball (or reading it failed); release it
    tarball.close()

# Get CSV output object and write header row
csvout = csv.writer(sys.stdout)
csvout.writerow(field_names)

# Iterate over the contacts in order by name. Note that x[name] will never
# fail because x is a defaultdict(list)
for contact in sorted(contacts, key=lambda x: x[name]):
    # Write out the contact info for known fields to the CSV. Here we join the
    # list items corresponding to each field
    csvout.writerow([" ".join(contact[column]) for column in field_names])