#!/usr/bin/python3
# annoatlas
#
# A terminal prompt interface to combine an annotation file with a
# Human Protein Atlas dataset
#
# By Daniel R Faulkner

from lib import libAnnoAtlas
from lib import libAnnoShared
import argparse

## Command line options:
### Parse the command line arguments
parser = argparse.ArgumentParser(description="Combine annotation and Human Protein Atlas datasets")
# Arguments:
# Required (argparse opens these files itself while parsing)
parser.add_argument("input", help="Annotation filename", type=argparse.FileType('r'))
parser.add_argument("atlas", help="Human Protein Atlas filename", type=argparse.FileType('r'))
parser.add_argument("output", help="Output filename", type=argparse.FileType('w'))
# Optional
parser.add_argument("-c", "--column", help="Feature name column (annotation file)", nargs=1, type=int)
parser.add_argument("-r", "--regex", help="Use regular expression string matching", action="store_true")
parser.add_argument("-a", "--atlascols", help="Human Protein Atlas columns to include", action="store")

# Any commands entered without a flag
args = parser.parse_args()

# combineEntries takes the regex switch as an integer flag (0 or 1)
regex = 1 if args.regex else 0

# nargs=1 makes argparse deliver a one-element list; -1 means "no column given"
GeneCol = args.column[0] if args.column else -1

# Comma-separated list of atlas columns; empty list when the option is absent
atlasCol = args.atlascols.split(',') if args.atlascols else []

# Run the command. The with-statement closes all three files even when
# combineEntries raises, so the output is flushed and no handle is leaked.
with args.input, args.atlas, args.output:
    libAnnoAtlas.combineEntries(args.input, args.atlas, args.output, atlasCol, regex, GeneCol)

# --- lib/libAnnoAtlas.py: module preamble ---
# libAnnoAtlas
# Functions to compare an annotation file with the Human Protein Atlas dataset
#
# By Daniel R Faulkner

from lib import libAnnoShared
import re

EmptyChar = "."
# Character to use for empty fields

# Creates a class for addressing Human Protein Atlas files
# Usage example:
#atlasobj = atlas(open('atlasfilename'))
#HPAline = atlasobj.returndataline('GeneName')
class atlas(object):
    """Functions and operations which use the Human Protein Atlas dataset.

    The dataset is read as tab-separated text whose first line is a header.
    Lookups rescan the data lines of the underlying file object, so the
    file must stay open (and seekable) for the lifetime of this object.
    """

    def __init__(self, atlasfileobj):
        """Perform object setup and perform any preprocessing.

        Args:
            atlasfileobj: Seekable file object for the dataset. It is
                rewound to the start and its header line is consumed.
        """
        self.fileobj = atlasfileobj
        # Store the header and the start position for the data entries
        atlasfileobj.seek(0)
        self.header = atlasfileobj.readline()
        # Record the position straight after the header. For text files
        # tell() returns an opaque value, so it must not be adjusted with
        # arithmetic such as subtracting the length of a line.
        self.datastart = atlasfileobj.tell()
        # Determine key column numbers
        # Human Protein Atlas column titles
        geneName = "Gene"
        geneAlt = '"Gene synonym"'
        ensemblID = "Ensembl"
        # Convert to column numbers
        self.geneNameCol = libAnnoShared.columnnum(self.header, geneName)
        self.geneAltCol = libAnnoShared.columnnum(self.header, geneAlt)
        self.ensemblIDCol = libAnnoShared.columnnum(self.header, ensemblID)

    def returndataline(self, gene):
        """Return the correlating entry from the Human Protein Atlas dataset.

        The comparison is case-insensitive and ignores surrounding
        whitespace. A line matches when *gene* equals its gene name, one of
        its comma-separated synonyms (quotes removed), or its Ensembl ID.

        Args:
            gene: Gene name, synonym or Ensembl ID to search for.

        Returns:
            The first matching line exactly as read from the file, or an
            empty string when no line matches.
        """
        gene = gene.strip().upper()
        self.fileobj.seek(self.datastart)
        # readline() (rather than file iteration) keeps seek/tell usable
        for line in iter(self.fileobj.readline, ""):
            fields = line.split('\t')  # Divide into separate fields
            try:
                name = fields[self.geneNameCol].strip()
                ID = fields[self.ensemblIDCol].strip()
                alts = fields[self.geneAltCol]
            except IndexError:
                # Blank or truncated line (e.g. a trailing empty line):
                # it cannot hold a match, so skip it instead of failing.
                continue
            synonyms = [
                item.strip().replace('"', "").upper()
                for item in alts.split(',')
            ]
            # Gene name, then synonyms, then Ensembl ID; first hit wins
            if gene == name.upper() or gene in synonyms or gene == ID.upper():
                return line
        return ""