apdf.py

#!/usr/bin/env python
"""
Analyzes PDF files by looking at their characteristics in order to add some
intelligence to the determination of whether they are malicious or benign.

Usage:
$ apdf.py [-h] [-m MOVE] [-y YARARULES] Path

Produces a high level overview of a PDF to quickly determine if further
analysis is needed based on its characteristics

positional arguments:
  Path                  Path to directory/file(s) to be scanned

optional arguments:
  -h, --help            show this help message and exit
  -m MOVE, --move MOVE  Directory to move files triggering YARA hits to
  -y YARARULES, --yararules YARARULES
                        Path to YARA rules. Rules should contain a weighted
                        score in the metadata section. (i.e. weight = 3)

example: python apdf.py -m tmp/badness -y foo/pdf.yara bar/getsome.pdf
"""

# apdf.py was created by Glenn P. Edwards Jr.
#         http://hiddenillusion.blogspot.com
#                 @hiddenillusion
# Version 0.2 
# Date: 10-11-2012
# Requirements:
#    - Python 2.x
#    - YARA (http://plusvic.github.io/yara/)
#    - pdfid (http://blog.didierstevens.com/programs/pdf-tools/)
#
# Optional:    
#    * This script will work without these but may miss some conditions to evaluate 
#     based on the missing data they would provide (i.e. - # of Pages) *
#
#    - pdfinfo (www.foolabs.com/xpdf/download.html)
#    - a "weight" field within the YARA's rule meta should be added to help in the 
#     final evaluation
#        i.e. - rule pdf_example {meta: weight = 3 strings: $s = "evil" 
#       condition: $s}
#
# To-Do:
#    - suppress pdfid's output log
#    - be able to print out which conditions it met in the rules


import argparse
import binascii
import datetime
import hashlib
import os
import re
import shutil 
import subprocess
import sys
import time
import zipfile

from decimal import Decimal

"""
Chose to _import_ PDFiD instead of just using subprocess to spawn it so it can be 
statically compiled for use on Windows.  

If you don't have it installed on your system, you can just download it and have 
it in the same directory as this script.
"""

try:
    import pdfid 
except ImportError:
    print "[!] PDFiD not installed"
    sys.exit()
try:
    import yara
except ImportError:
    print "[!] Yara not installed"
    sys.exit()    
    
# Per-file analysis state.  eval() empties all three lists once a file has
# been scored so the next file starts fresh.
counter, page_counter = [], []  # characteristic tags / PDFiD page counts
yscore = []                     # weights taken from matching YARA rules
# Directory that files with YARA hits are moved to; stays False unless the
# --move option supplies one.
ydir = False

# Separator lines used in the console report
trailer = "=" * 35
filler = "-" * 35


def main():
    """
    Dispatch the user-supplied path: a single file goes to fileID(), a
    directory is walked with pwalk().  Anything else is ignored.
    """
    target = args['Path']
    if os.path.isfile(target):
        fileID(target)
        return
    if os.path.isdir(target):
        pwalk(target)

# Quote idea credited to: https://github.com/marpaia/jadPY ... useful for 
# Windows, what can I say...
def q(s):
    """Return ``s`` wrapped in a pair of double quotes."""
    return "\"" + s + "\""

def sha256(pdf):
    """
    Return the SHA-256 hex digest of the file at ``pdf``.

    The file is hashed in fixed-size chunks so that large samples are not
    read into memory in one piece.  If the file cannot be opened or read, the
    error is printed and None is returned instead of a digest.
    """
    digest = hashlib.sha256()
    try:
        # 'with' guarantees the handle is released even if a read fails
        with open(pdf, "rb") as f:
            for chunk in iter(lambda: f.read(65536), b""):
                digest.update(chunk)
    except (IOError, OSError) as msg:
        print(msg)
        return None

    return digest.hexdigest()
    
def fileID(pdf):
    """
    Analyze ``pdf`` if it looks like a PDF; walk it if it is a directory.

    Generally the PDF header will be within the first (4) bytes, but since the
    PDF specs say it can be within the first (1024) bytes I'd rather check for
    at least (1) instance of it within that larger range.  This limits the
    chance of a PDF using a header evasion trick and then not getting
    analyzed.  This evasion behavior could later be detected with a YARA rule.

    Files without the ``%PDF`` marker in that range are skipped silently;
    files that cannot be read are reported and skipped.
    """
    # A directory cannot be open()ed, so it has to be handled before the read
    if os.path.isdir(pdf):
        pwalk(pdf)
        return

    try:
        # Only the header is needed, so release the handle before the
        # analysis starts -- a still-open handle can get in the way of the
        # file being moved later on (e.g. on Windows).
        with open(pdf, 'rb') as f:
            head = f.read(1024)
    except (IOError, OSError) as msg:
        print("[!] Unable to read %s: %s" % (pdf, msg))
        return

    if b'\x25\x50\x44\x46' in head:
        print("\n" + trailer)
        print("[+] Analyzing: %s" % pdf)
        print(filler)
        print("[-] Sha256: %s" % sha256(pdf))
        info(pdf)
    
def pwalk(ploc):
    """Recursively walk ``ploc`` and hand every file found to fileID()."""
    for dirpath, _subdirs, filenames in os.walk(ploc):
        for filename in filenames:
            fileID(os.path.join(dirpath, filename))

        
def info(pdf):
    """
    Run the optional ``pdfinfo`` tool on ``pdf`` and record parser complaints.

    Lines written to pdfinfo's stderr are matched against known error
    messages and the matching tag ("sketchy", "eof", "trailer" or "xref") is
    appended to the module-level ``counter`` list.  Processing of the output
    stops at the first XRef problem.  If pdfinfo cannot be run, the error is
    printed and the analysis simply continues without this data.

    Afterwards the file is always handed to id() for the PDFiD checks.
    """
    try:
        # The file name comes from whatever is being scanned, so it is passed
        # as a separate argument instead of being pasted into a shell command.
        p = subprocess.Popen(["pdfinfo", pdf],
                            stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE,
                            universal_newlines=True)
        # communicate() drains both pipes and waits for the process, so a
        # chatty stdout cannot stall the child while stderr is being read.
        _, errors = p.communicate()
        for line in errors.splitlines():
            if re.search('Unterminated hex string|Loop in Pages tree|Illegal digit in hex char in name', line):
                counter.append("sketchy")
                print("[-] Sketchyness detected")
            elif re.search('Unexpected end of file in flate stream|End of file inside array', line):
                counter.append("eof")
                print("[-] EoF problem")
            elif re.search('Couldn\'t find trailer dictionary', line):
                counter.append("trailer")
            elif re.search('Invalid XRef entry|No valid XRef size in trailer|Couldn\'t read xref table', line):
                counter.append("xref")
                print("[-] Invalid XREF")
                break
    except Exception as msg:
        # pdfinfo is optional: report the problem and carry on
        print("[!] pdfinfo error: %s" % msg)
    id(pdf)


def id(pdf):
    """
    Parse PDFiD's report for ``pdf`` and record suspicious characteristics.

    Findings are printed and the matching tags ("header", "js",
    "mucho_javascript", "acroform", "aa", "oa", "launch", "embed", "entropy",
    "page") are appended to the module-level ``counter`` list.  The /Page and
    /Pages counts are also appended to ``page_counter``.  Entropy and page
    checks are skipped, with a notice, when PDFiD did not report the values
    they need.  Afterwards the file is handed to yarascan().

    NOTE: this name shadows the id() builtin; it is kept because info() calls
    it by this name.
    """
    from decimal import InvalidOperation

    try:
        # (dir, allNames, extraData, disarm, force), force)
        command = pdfid.PDFiD2String(pdfid.PDFiD(pdf, True, True,
                                        False, True), True)
        extra = True
    except Exception:
        # I've observed some files raising errors with the 'extraData' switch
        command = pdfid.PDFiD2String(pdfid.PDFiD(pdf, True, False, False,
                                                True), True)
        print("[!] PDFiD couldn't parse extra data")
        extra = False

    # Values picked out of the report; None means "not reported"
    tentropy = ientropy = oentropy = None
    page_count = pages_count = None

    for line in command.split('\n'):
        count = re.split(r'[\s]+', line)
        if "PDF Header" in line and not re.match(r'%PDF-1\.\d', count[3]):
            counter.append("header")
            print("[-] Invalid version number : \"%s\"" % count[3])
        elif "/Page " in line:
            page_count = count[2]
            page_counter.append(count[2])
        elif "/Pages " in line:
            pages_count = count[2]
            page_counter.append(count[2])
        elif "/JS " in line and not re.match('0', count[2]):
            counter.append("js")
            print("[-] JavaScript count.......: %s" % count[2])
            # Compare the leading number numerically rather than as text
            js_total = re.match(r'\d+', count[2])
            if js_total and int(js_total.group()) > 1:
                counter.append("mucho_javascript")
                print("\t[*] That's a lot of js ...")
        elif "/AcroForm " in line and not re.match('0', count[2]):
            counter.append("acroform")
            print("[-] AcroForm...............: %s" % count[2])
        elif "/AA " in line and not re.match('0', count[2]):
            counter.append("aa")
            print("[-] Additional Action......: %s" % count[2])
        elif "/OpenAction " in line and not re.match('0', count[2]):
            counter.append("oa")
            print("[-] Open Action............: %s" % count[2])
        elif "/Launch " in line and not re.match('0', count[2]):
            counter.append("launch")
            print("[-] Launch Action..........: %s" % count[2])
        elif "/EmbeddedFiles " in line and not re.match('0', count[2]):
            counter.append("embed")
            print("[-] Embedded File..........: %s" % count[2])
        elif "Total entropy:" in line:
            tentropy = count[3]
            print("[-] Total Entropy..........: %7s" % count[3])
        elif "Entropy inside streams:" in line:
            ientropy = count[4]
            print("[-] Entropy inside streams : %7s" % count[4])
        elif "Entropy outside streams:" in line:
            oentropy = count[4]
            print("[-] Entropy outside streams: %7s" % count[4])

    # Entropy levels:
    # 0 = orderly, 8 = random
    # ASCII text file = ~2/4
    # ZIP archive = ~ 7/8
    # PDF Malicious
    #         - total   : 6.3
    #         - inside  : 6.6
    #         - outside : 4.9
    # PDF Benign
    #         - total   : 6.7
    #         - inside  : 7.2
    #         - outside : 5.1
    # Determine if Total Entropy & Entropy Inside Stream are significantly
    # different than Entropy Outside Streams -> i.e. might indicate a payload
    # w/ long, uncompressed NOP-sled
    # ref = http://blog.didierstevens.com/2009/05/14/malformed-pdf-documents
    if extra and None not in (tentropy, ientropy, oentropy):
        try:
            te_long = Decimal(tentropy)
            te_short = Decimal(tentropy[0:3])
            ie_short = Decimal(ientropy[0:3])
            oe_long = Decimal(oentropy)
        except InvalidOperation:
            print("[!] PDFiD reported non-numeric entropy values")
        else:
            # I know 'entropy' might get added twice to the counter
            # (doesn't matter) but I wanted to separate these to be alerted
            # on them individually
            togo = (8 - oe_long) # Don't want to apply this if it goes over the max of 8
            if togo > 2:
                if oe_long + 2 > te_long:
                    counter.append("entropy")
                    print("\t[*] Entropy of outside stream is questionable:")
                    print("\t[-] Outside (%s) +2 (%s) > Total (%s)" %
                                (oe_long, oe_long + 2, te_long))
            elif oe_long > te_long:
                counter.append("entropy")
                print("\t[*] Entropy of outside stream is questionable:")
                print("\t[-] Outside (%s) > Total (%s)" % (oe_long, te_long))
            low = Decimal("2.0")
            if te_short <= low or ie_short <= low:
                counter.append("entropy")
                print("\t[*] LOW entropy detected:")
                print("\t[-] Total (%s) or Inside (%s) <= 2.0" %
                            (te_short, ie_short))
    elif extra:
        print("[!] PDFiD didn't report all entropy values")

    # Process the /Page(s) results here just to make sure they were both read
    if page_count is None:
        print("[!] PDFiD didn't report a /Page count")
    elif re.match('0', page_count):
        counter.append("page")
        if pages_count is None:
            print("[-] Page count suspicious:")
            print("\t[*] /Page = (%s) and no /Pages entry reported" %
                        page_count)
        elif re.match('0', pages_count):
            print("[-] Page count suspicious:")
            print("\t[*] Both /Page (%s) and /Pages (%s) = 0" %
                        (page_count, pages_count))
        else:
            print("[-] Page count suspicious, no individual pages defined:")
            print("\t[*] /Page = (%s) , /Pages = (%s)" %
                        (page_count, pages_count))
    elif re.match('1$', page_count):
        counter.append("page")
        print("[-] (1) page PDF")

    yarascan(pdf)


def yarascan(pdf):
    """
    Match ``pdf`` against the compiled YARA rules and collect rule weights.

    For every matching rule, each metadata entry whose key contains "weight"
    is added to the module-level ``yscore`` list (values that cannot be read
    as an integer are reported and ignored).  If a move directory is
    configured in ``ydir``, the file is moved there once, after all matches
    have been processed.  Any error is printed and does not stop the
    analysis; eval() is always called at the end.
    """
    try:
        ymatch = r.match(pdf)
        if ymatch:
            print("[-] YARA hit(s): %s" % ymatch)
            for rule in ymatch:
                for key, value in rule.meta.items():
                    # If the YARA rule has a weight in its metadata then
                    # parse that for later calculation
                    if "weight" in key:
                        try:
                            yscore.append(int(value))
                        except (TypeError, ValueError):
                            print("[!] Ignoring non-numeric YARA weight: %r"
                                  % (value,))
            if ydir:
                print("[-] Moving malicious file to: %s" % ydir)
                # This will move the file if _any_ YARA rule triggers...
                # which might trick you if the rule that triggers on it
                # doesn't have a weight or is displayed in the output
                try:
                    if not os.path.exists(ydir):
                        os.makedirs(ydir)
                    # Done once per file: after the first move the source
                    # path no longer exists
                    shutil.move(pdf, ydir)
                except (IOError, OSError, shutil.Error) as msg:
                    print("[!] Unable to move %s: %s" % (pdf, msg))
    except Exception as msg:
        print(msg)

    eval(counter)
    

def eval(counter):
    """
    Score the characteristics collected for one PDF and print a verdict.

    ``counter`` is the list of characteristic tags.  The combinations below
    add to a severity score, the summed YARA weights are added on top, and
    the overall score is rated:

    Rating system: 0 (benign), >=2 (sketchy), >=3 (medium), >=5 (high)

    All per-file state (``counter``, ``page_counter``, ``yscore``) is emptied
    before returning so the next file starts fresh.
    """
    print(filler)
    ytotal = sum(yscore)
    print("[-] Total YARA score.......: %s" % ytotal)

    # Below are various combinations used to add some intelligence and help
    # evaluate if a file is malicious or benign.
    # This is where you can add your own thoughts or modify existing checks.
    #
    # Each chain is evaluated top to bottom and only its FIRST entry whose
    # tags are all present contributes its points; chains are independent.
    chains = (
        # HIGH
        (
            (("page", "launch", "js"), 5),
            (("page", "xref"), 5),
            (("page", "aa", "js"), 5),
            (("page", "oa", "js"), 5),
        ),
        # MEDIUM
        (
            (("header", "xref"), 3),
            (("header", "js", "page"), 3),
            (("header", "launch", "page"), 3),
            (("header", "aa", "page"), 3),
        ),
        (
            (("page", "mucho_javascript"), 3),
            (("page", "acroform", "embed"), 3),
            (("page", "acroform", "js"), 3),
        ),
        (
            (("entropy", "page"), 3),
            (("entropy", "aa"), 3),
            (("entropy", "oa"), 3),
            (("entropy", "js"), 3),
        ),
        ((("oa", "js"), 3),),
        ((("aa", "mucho_javascript"), 3),),
        # Heuristically sketchy
        ((("page", "js"), 2),),
        (
            (("sketchy", "page"), 2),
            (("sketchy", "aa"), 2),
            (("sketchy", "oa"), 2),
            (("sketchy", "launch"), 2),
            (("sketchy", "eof"), 1),
        ),
        ((("page", "aa"), 1),),
        ((("page", "header"), 1),),
        ((("header", "embed"), 1),),
    )

    sev = 0
    for chain in chains:
        for required, points in chain:
            if all(tag in counter for tag in required):
                sev += points
                break

    print("[-] Total severity score...: %s" % sev)
    overall = ytotal + sev
    print("[-] Overall score..........: %s" % overall)

    # Highest threshold first; the first one reached decides the verdict
    verdicts = (
        (5, "\n[!] HIGH probability of being malicious"),
        (3, "\n[!] MEDIUM probability of being malicious"),
        (2, "\n[!] Heuristically sketchy"),
        (0, "\n[-] Scanning didn't determine anything warranting suspicion"),
    )
    for threshold, verdict in verdicts:
        if overall >= threshold:
            print(trailer + verdict)
            break

    # Clear out the scores to start fresh for the next analysis
    counter[:] = []
    page_counter[:] = []
    yscore[:] = []

if __name__ == "__main__":
    # Command-line entry point: parse the options, compile the YARA rules
    # into the module-level `r`, set the optional move directory `ydir`,
    # then start the scan.
    parser = argparse.ArgumentParser(description='Produces a high level overview'\
                ' of a PDF to quickly determine if further analysis is needed based '\
                'on it\'s characteristics')
    parser.add_argument('-m','--move', help='Directory to move files triggering '\
                'YARA hits to', required=False)
    # No argparse default here: a default value would always be truthy and
    # make the fallback locations below unreachable.
    parser.add_argument('-y','--yararules', help='Path to YARA rules.  Rules '\
                'should contain a weighted score in the metadata section. '\
                '(i.e. weight = 3)',
                required=False,
                default=None)
    parser.add_argument('Path', help='Path to directory/file(s) to be scanned')
    args = vars(parser.parse_args())

    # Verify supplied path exists or die (parser.error() exits by itself)
    if not os.path.exists(args['Path']):
        parser.error("The supplied path does not exist")

    # Configure YARA rules: an explicit -y wins, otherwise try the local
    # default and then the REMnux location
    if args['yararules']:
        rules = args['yararules']
    else:
        rules = "pdf_rules.yara"
        if not os.path.exists(rules):
            rules = '/usr/local/etc/capabilities.yara' # REMnux location

    if not os.path.exists(rules):
        parser.error("Correct path to YARA rules?")

    try:
        r = yara.compile(rules)
    except Exception as msg:
        print("[!] YARA compile error: %s" % msg)
        # Non-zero status so callers can tell the scan never started
        sys.exit(1)

    if args['move']:
        ydir = args['move']

    main()