Skip to content

Commit

Permalink
Adding Protein Atlas utility
Browse files Browse the repository at this point in the history
  • Loading branch information
DanielFaulkner committed Feb 8, 2021
1 parent 3ebc092 commit 63e3e69
Show file tree
Hide file tree
Showing 5 changed files with 214 additions and 17 deletions.
46 changes: 46 additions & 0 deletions annoatlas.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
#!/usr/bin/python3
# annoatlas
#
# A terminal prompt interface to combine an annotation file with a Human Protein Atlas dataset
#
# By Daniel R Faulkner

from lib import libAnnoAtlas
from lib import libAnnoShared
import argparse

## Command line options:
### Parse the command line arguments
parser = argparse.ArgumentParser(description="Combine annotation and Human Protein Atlas datasets")
# Arguments:
# Required (argparse opens these files for us via FileType)
parser.add_argument("input", help="Annotation filename", type=argparse.FileType('r'))
parser.add_argument("atlas", help="Human Protein Atlas filename", type=argparse.FileType('r'))
parser.add_argument("output", help="Output filename", type=argparse.FileType('w'))
# Optional
parser.add_argument("-c","--column", help="Feature name column (annotation file)", nargs=1, type=int)
parser.add_argument("-r","--regex", help="Use regular expression string matching", action="store_true")
parser.add_argument("-a","--atlascols", help="Human Protein Atlas columns to include", action="store")

# Parse the positional and optional arguments given on the command line
args = parser.parse_args()

# Convert the parsed options into the values combineEntries expects
# The library takes the regular expression flag as 0/1
regex = 1 if args.regex else 0

# nargs=1 stores the column as a one item list; -1 means no column was specified
GeneCol = args.column[0] if args.column else -1

# A comma separated string becomes a list of terms; an empty list means no columns were requested
atlasCol = args.atlascols.split(',') if args.atlascols else []

# Run the command
try:
    libAnnoAtlas.combineEntries(args.input,args.atlas,args.output, atlasCol, regex, GeneCol)
finally:
    # Close files, even if the command failed part way through,
    # so that any output already written is flushed to disk
    args.input.close()
    args.atlas.close()
    args.output.close()
147 changes: 147 additions & 0 deletions lib/libAnnoAtlas.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
# libAnnoAtlas
# Functions to compare an annotation file with the Human Protein Atlas dataset
#
# By Daniel R Faulkner

from lib import libAnnoShared
import re

EmptyChar = "." # Character to use for empty fields

# Creates a class for addressing Human Protein Atlas files
# Usage example:
#atlasobj = atlas(open('atlasfilename'))
#HPAline = atlasobj.returndataline('GeneName')
class atlas(object):
    """Functions and operations which use the Human Protein Atlas dataset

    The file object passed in is kept and re-read on every lookup, so it
    must support seek/tell and remain open while this object is in use.
    """

    def __init__(self, atlasfileobj):
        """Perform object setup and perform any preprocessing

        INPUT: An open, seekable Human Protein Atlas file object whose
        first line is the header.
        """
        self.fileobj = atlasfileobj
        # Store the header and store the start position for the data entries
        atlasfileobj.seek(0)
        self.header = atlasfileobj.readline()
        # Take the position directly after the header. In text mode tell()
        # returns an opaque value, so arithmetic such as subtracting the
        # length of a line is only correct by accident (it breaks with
        # multi-byte characters or translated line endings).
        self.datastart = atlasfileobj.tell()
        # Determine key column numbers
        # Human Protein Atlas column titles
        geneName = "Gene"
        geneAlt = '"Gene synonym"'
        ensemblID = "Ensembl"
        # Convert to column numbers
        self.geneNameCol = libAnnoShared.columnnum(self.header,geneName)
        self.geneAltCol = libAnnoShared.columnnum(self.header,geneAlt)
        self.ensemblIDCol = libAnnoShared.columnnum(self.header,ensemblID)

    def returndataline(self, gene):
        """Return the correlating entry from the Human Protein Atlas dataset

        INPUT: A gene name, gene synonym or Ensembl ID. Surrounding
        whitespace and letter case are ignored.
        OUTPUT: The first matching line of the file, unmodified (including
        its line ending), or an empty string if nothing matches.
        """
        query = gene.strip().upper()
        # Every lookup rescans the data entries from the top of the file
        self.fileobj.seek(self.datastart)
        # readline returns "" at the end of the file, which ends the loop
        for line in iter(self.fileobj.readline, ""):
            fields = line.split('\t') # Divide into separate fields
            name = fields[self.geneNameCol].strip().upper()
            ensembl = fields[self.ensemblIDCol].strip().upper()
            # Synonyms are a comma separated list which may be wrapped in quotes
            synonyms = []
            for item in fields[self.geneAltCol].split(','):
                synonyms.append(item.strip().replace('"',"").upper())
            # The first entry matching on name, synonym or Ensembl ID wins
            if query == name or query in synonyms or query == ensembl:
                return line
        return ""

# Adds columns from a Human Protein Atlas file to an annotation file.
# Usage example:
#combineEntries(open('annofilename'), open('atlasfilename'), open('outputfile','w'), ['Position','Chromosome'], 0, 9)
def _selectAtlasColumns(headerfields, atlasterms, regex):
    """Convert the requested atlas terms into a list of header column numbers"""
    # Add all columns if no columns specified
    if len(atlasterms)==0:
        return list(range(len(headerfields)))
    atlascols = []
    searchterms = []
    for item in atlasterms:
        # If a number is passed as a search term use as a column number
        try:
            col = int(item)
        except (TypeError, ValueError):
            # Not a number: keep as a title search term.
            # Remove any quotes (which may confuse matches)
            searchterms.append(item.replace('"',""))
            continue
        # Out of range column numbers are ignored
        if -1 < col < len(headerfields) and col not in atlascols:
            atlascols.append(col)
    # Remaining terms are matched against the header titles, in header order
    for count, item in enumerate(headerfields):
        if regex:
            matched = libAnnoShared.regexMatch(item,searchterms)
        else:
            # Quotes were removed from the search terms, so remove them from
            # the title too; otherwise a quoted title could never match.
            title = item.replace('"',"").upper()
            matched = any(search.upper()==title for search in searchterms)
        if matched and count not in atlascols:
            atlascols.append(count)
    return atlascols

def combineEntries(annofile, atlasfile, outputfile, atlasterms = [], regex = 0, annocol=-1):
    """Adds Human Protein Atlas information to an annotation file

    INPUT:
    annofile   - Open annotation file object to read.
    atlasfile  - Open, seekable Human Protein Atlas file object.
    outputfile - Open file object the combined lines are written to.
    atlasterms - Atlas columns to add: column numbers and/or column titles.
                 All columns are added if the list is empty. (The default
                 list is never modified.)
    regex      - If true, titles are matched as regular expressions rather
                 than as exact (case insensitive) strings.
    annocol    - Annotation column number holding the gene name. If negative
                 the name is taken from the repID of the parsed annotation.
    OUTPUT: Nothing is returned. Each annotation line is written with the
    chosen atlas fields appended; fields with no atlas value use EmptyChar.
    """
    # Check which atlas columns to use
    atlasobj = atlas(atlasfile)
    headerfields = atlasobj.header.strip().split('\t')
    atlascols = _selectAtlasColumns(headerfields, atlasterms, regex)
    # Process the annotation files
    filetype = libAnnoShared.detectFileType(annofile)
    line = annofile.readline()
    header = line
    # Skip the leading comment lines, keeping the last one as the header.
    # startswith copes with the empty string returned at the end of the
    # file, where indexing line[0] would raise an IndexError.
    while line.startswith("#"):
        header = line
        line = annofile.readline()
    # Write out updated header line
    if header.startswith("#"): # Check this is a true header line
        titles = [headerfields[item] for item in atlascols]
        outputfile.write("\t".join([header.strip()]+titles)+"\n")
    # Add data lines
    lookups = {} # Atlas lines already found, as each lookup rescans the atlas file
    while line:
        # Get the gene name
        if annocol>-1:
            geneName = line.split('\t')[annocol]
        else:
            geneName = libAnnoShared.Annotation(line, filetype, header).repID
        # Lookup the line (names are compared stripped and case insensitively)
        key = geneName.strip().upper()
        if key not in lookups:
            lookups[key] = atlasobj.returndataline(geneName)
        atlasline = lookups[key].strip()
        # An empty string splits to [''], which would place a blank value in
        # the first column instead of EmptyChar, so treat no match as no fields
        atlasfields = atlasline.split('\t') if atlasline else []
        numfields = len(atlasfields)
        # Output the desired fields
        outputfields = [line.strip()]
        for item in atlascols:
            if item<numfields:
                outputfields.append(atlasfields[item])
            else:
                outputfields.append(EmptyChar)
        outputfile.write("\t".join(outputfields)+"\n")
        line = annofile.readline()
2 changes: 1 addition & 1 deletion lib/libAnnoFeat.py
Original file line number Diff line number Diff line change
Expand Up @@ -254,7 +254,7 @@ def featureClosestAddColumn(annofileobj, reftrackobj, outfileobj, senseorder=0,
while line[0]=="#":
header = line
line = annofileobj.readline()
extracolstmp = "{} Name\t{} Type\t{} Strand\t{} Distance\tWithin Name\tWithin Strand\tWithin Type\tWithin Distance\t{} Name\t{} Type\t{} Strand\t{} Distance\n"
extracolstmp = "\t{} Name\t{} Type\t{} Strand\t{} Distance\tWithin Name\tWithin Strand\tWithin Type\tWithin Distance\t{} Name\t{} Type\t{} Strand\t{} Distance\n"
if senseorder:
extracols = extracolstmp.format("AntiSense","AntiSense","AntiSense","AntiSense","Sense","Sense","Sense","Sense")
else:
Expand Down
17 changes: 1 addition & 16 deletions lib/libAnnoFilter.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
#
# By Daniel R Faulkner

from lib.libAnnoShared import columnnum
from lib.libAnnoShared import columnnum, regexMatch
import re

## Filter functions
Expand Down Expand Up @@ -93,21 +93,6 @@ def filterChromosome(annoobj, names=[], exclude=0, regex=0):

## Supporting Functions

# Support regular expressions in the names list
def regexMatch(query, list):
    """Loops through a list of regular expressions until a match is found

    INPUT: A string to examine. A list of regular expression search terms.
    OUTPUT: Returns 1 if one of the expressions is found anywhere in the
    query, otherwise 0 (including when the list is empty).
    Raises an Exception naming the first term which fails to compile.
    """
    # Note: the 'list' parameter hides the builtin within this function
    for name in list:
        try:
            regname = re.compile(name)
        except re.error as err:
            # Only report genuine pattern errors, keeping the original cause
            raise Exception(name+" is not a compatible regular expression.") from err
        if regname.search(query):
            # No need to test the remaining expressions
            return 1
    return 0

# Produce a dictionary object of first open reading frame positions
# INPUT: The ORF fileobject
# OUTPUT: Dictionary of end positions
Expand Down
19 changes: 19 additions & 0 deletions lib/libAnnoShared.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
#
# By Daniel R Faulkner

import re

# Return the column number from a name in the header.
# Note: This is case sensitive. A future improvement may be to return a dictionary with all header columns.
def columnnum(header, name):
Expand All @@ -16,6 +18,23 @@ def columnnum(header, name):
count = count +1
return column

# Checks the query against a list of regular expressions.
# INPUT: A string to examine. A list of regular expression search terms.
# OUTPUT: Returns 0 if no match is found or 1 if one of the regular expressions matches
def regexMatch(query, list):
    """Loops through a list of regular expressions until a match is found

    INPUT: A string to examine. A list of regular expression search terms.
    OUTPUT: Returns 0 if no match is found (or the list is empty) or 1 if
    one of the regular expressions matches any part of the query.
    Raises an Exception naming the first term which fails to compile.
    """
    # Note: the 'list' parameter hides the builtin within this function
    for name in list:
        try:
            regname = re.compile(name)
        except re.error as err:
            # Only report genuine pattern errors, keeping the original cause
            raise Exception(name+" is not a compatible regular expression.") from err
        if regname.search(query):
            # Stop at the first expression which matches
            return 1
    return 0

# Creates an object for an entry in an annotation file and applies basic conversions to make the data consistent
# INPUT: Line from annotation file, the input file format and the header line where available.
# NOTE: Scores are stored as integers. This could be changed to a float for greater accuracy. (Supported by DFAM and GTF)
Expand Down

0 comments on commit 63e3e69

Please sign in to comment.