# -*- coding: utf-8 -*-
#
# hashtool - hashing of entire trees of files
# Copyright (C) 2015 - Felipe Machado
#
#
# Python future imports
from __future__ import print_function
# Python standard library
import sys
import os
import os.path
import hashlib
import json
import math
import argparse
import datetime
import traceback
import time

# Python 2 compatibility: on Python 2, plain input() evaluates whatever the
# user types, so prefer raw_input() when it exists; on Python 3 this is a no-op
try:
  input = raw_input
except NameError:
  pass
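
# Module-level state shared by buildTreeHash() and calculateHashes() so that
# an interrupted run can resume from the file that failed:
#   MAX_TRIES            - attempts per file before giving up
#   last_processed_index - index of the last file handled in the sorted list
#   count                - bytes hashed so far, used for the progress report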
MAX_TRIES = 5
last_processed_index = 0
count = 0
###############################################################################
def calculateFileHash( path, block_size=50*(2**20), hash_functions = ["sha1"] ):
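  """Return a dict mapping each name in hash_functions to the hex digest
  of the file at path, or None if the file cannot be opened."""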
  # Compact form:
  #   return hashlib.sha1( open( path, 'rb' ).read() ).hexdigest()
  # but reading and updating in blocks is much more efficient for large
  # files, and doesn't require much RAM
  try:
    f = open( path, 'rb' )
  except (OSError, IOError) as e:
    print( e )
    print( traceback.format_exc() )
    return None
  hash_objs = {}
  for hash_function in hash_functions:
    hash_objs[ hash_function ] = hashlib.new( hash_function )
  while True:
    data = f.read( block_size )
    if not data:
      break
    for hash_function in hash_functions:
      hash_objs[ hash_function ].update( data )
  for hash_function in hash_functions:
    hash_objs[ hash_function ] = hash_objs[ hash_function ].hexdigest()
  f.close()
  return hash_objs
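
# Hypothetical usage sketch (the path and digests below are illustrative):
#   calculateFileHash( "/tmp/example.bin", hash_functions=["sha1", "md5"] )
#   returns e.g. { "sha1": "<40 hex chars>", "md5": "<32 hex chars>" },
#   or None when the file cannot be opened.
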
def calculateHashes( files_info, files_info_keys, total_size,
                     hash_functions = ["sha1"], continue_data = {} ):
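  """Fill files_info[path]["hashes"] for every path in files_info_keys.

  Resumes from the module-level last_processed_index, reuses hashes found
  in continue_data (loaded from a previous partial run), and prints the
  progress percentage to stderr as bytes are hashed."""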
  global last_processed_index, count
  percentage = 0
  for i in range( last_processed_index, len( files_info_keys ) ):
    file_path = files_info_keys[ i ]
    file_hashes = None
    tries = 0
    while file_hashes is None:
      # reuse the hash from a previous partial run, if available
      if file_path in continue_data and "hashes" in continue_data[ file_path ]:
        file_hashes = continue_data[ file_path ][ "hashes" ]
        break
      try:
        file_hashes = calculateFileHash( file_path,
                                         hash_functions=hash_functions )
      except Exception as e:
        print( "Error calculating file hash: %s" % file_path )
        print( e )
        print( traceback.format_exc() )
        file_hashes = None
      if file_hashes is None:
        # calculateFileHash() also returns None (without raising) when the
        # file cannot be opened; count both cases as a failed attempt so
        # this loop cannot spin forever
        tries += 1
        if tries >= MAX_TRIES:
          raise IOError( "giving up on %s after %d tries" % ( file_path, tries ) )
        time.sleep( 10 )
    files_info[file_path]["hashes"] = file_hashes
    # Progress counting and printing
    count += files_info[file_path]["size"]
    last_processed_index = i
    if math.floor( ( 10000 * count ) / total_size ) > percentage:
      percentage = math.floor( ( 10000 * count ) / total_size )
      print( datetime.datetime.now(), percentage / 100, "%", file=sys.stderr )

def createFilesInfoTable( path, files_info=None ):
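  """Recursively walk path, storing each file's size in files_info keyed by
  its full path; return ( files_info, total size in bytes of the tree )."""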
  if files_info is None:
    files_info = {}  # fresh dict per top-level call, not a shared default
  total_size = 0
  try:
    path_listdir = os.listdir( path )
  except (OSError, IOError) as e:
    print( e )
    print( traceback.format_exc() )
    return ( None, 0 )
  for f in path_listdir:
    filepath = os.path.join( path, f )
    if os.path.isdir( filepath ):
      ( subfolder_info, subfolder_size ) = createFilesInfoTable(
          filepath, files_info )
      total_size += subfolder_size
    else:
      file_size = os.path.getsize( filepath )
      files_info[filepath] = {
        "size": file_size
      }
      total_size += file_size
  return ( files_info, total_size )

###############################################################################
###############################################################################
# Public function #
###################
def buildTreeHash( args ):
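  """Hash every file under the folders listed in args.path, optionally
  resuming from a partial JSON result (args.partial), and write the
  files-info table as pretty-printed JSON to args.out (or to stdout)."""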
  root_folders = args.path
  output_file = args.out
  continue_data = {}
  if args.partial is not None:
    with open( args.partial ) as partial_file:
      continue_data = json.load( partial_file )
  files_info = {}
  total_size = 0
  for path in root_folders:
    print("Folder: %s" % path, file=sys.stderr)
    (path_files_info, path_total_size) = createFilesInfoTable(path, files_info)
    total_size += path_total_size
  files_info_keys = sorted( files_info.keys() )
  print( "Total files:", len( files_info_keys ) )
  global last_processed_index, count
  last_processed_index = 0
  count = 0
  while True:
    try:
      calculateHashes( files_info, files_info_keys, total_size, args.hashes,
                       continue_data )
      break
    except ( KeyboardInterrupt, Exception ) as e:
      print( "Error calculating file hashes (",
             files_info_keys[ last_processed_index ], ")" )
      print( e )
      print( traceback.format_exc() )
      answer = input( "Try again? (Yes/No/Skip): " )
      if answer == "No":
        break
      elif answer == "Skip":
        last_processed_index += 1
  # JSON pretty print
  json_string = json.dumps( files_info, sort_keys=True, indent=4,
                            separators=(',', ': ') )
  if output_file:
    with open( output_file, 'w' ) as out_file:
      out_file.write( json_string )
  else:
    print( json_string )
###############################################################################
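
# The command-line entry point is not part of this file. The block below is
# a minimal, hypothetical sketch of the argparse wiring buildTreeHash()
# expects; only the four attribute names it reads (path, out, partial,
# hashes) come from the code above, and every flag name and default here is
# an assumption.
if __name__ == "__main__":
  parser = argparse.ArgumentParser(
      description="hashtool - hashing of entire trees of files" )
  parser.add_argument( "path", nargs="+",
                       help="root folder(s) whose files will be hashed" )
  parser.add_argument( "--out", default=None,
                       help="output JSON file (default: print to stdout)" )
  parser.add_argument( "--partial", default=None,
                       help="JSON output of an earlier interrupted run, "
                            "used to resume without re-hashing" )
  parser.add_argument( "--hashes", nargs="+", default=["sha1"],
                       help="hashlib algorithm names, e.g. sha1 md5 sha256" )
  buildTreeHash( parser.parse_args() )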