Adding Protein Atlas utility

DanielFaulkner · Feb 8, 2021 · 63e3e69 · 63e3e69
1 parent 3ebc092
commit 63e3e69
Show file tree

Hide file tree

Showing 5 changed files with 214 additions and 17 deletions.
diff --git a/annoatlas.py b/annoatlas.py
@@ -0,0 +1,46 @@
+#!/usr/bin/python3
+# annoatlas
+#
+# A terminal prompt interface to combine an annotation file with a Human Protein Atlas dataset
+#
+# By Daniel R Faulkner
+
+from lib import libAnnoAtlas
+from lib import libAnnoShared
+import argparse
+
+## Command line options:
+### Parse the command line arguments
+parser = argparse.ArgumentParser(description="Combine annotation and Human Protein Atlas datasets")
+# Arguments:
+# Required
+# (argparse opens these files while parsing, which also creates the output file)
+parser.add_argument("input", help="Annotation filename", type=argparse.FileType('r'))
+parser.add_argument("atlas", help="Human Protein Atlas filename", type=argparse.FileType('r'))
+parser.add_argument("output", help="Output filename", type=argparse.FileType('w'))
+# Optional
+parser.add_argument("-c","--column", help="Feature name column (annotation file)", nargs=1, type=int)
+parser.add_argument("-r","--regex", help="Use regular expression string matching", action="store_true")
+parser.add_argument("-a","--atlascols", help="Human Protein Atlas columns to include", action="store")
+
+# Any commands entered without a flag
+args = parser.parse_args()
+
+# combineEntries takes the regular expression option as a 0 or 1 flag
+regex = 1 if args.regex else 0
+
+# nargs=1 stores the column number in a one item list.
+# -1 leaves combineEntries to find the gene name without a column number.
+GeneCol = args.column[0] if args.column else -1
+
+# Comma separated column titles or numbers. An empty list selects every atlas column.
+atlasCol = args.atlascols.split(',') if args.atlascols else []
+
+# Run the command, closing the files even if the processing fails
+try:
+    libAnnoAtlas.combineEntries(args.input,args.atlas,args.output, atlasCol, regex, GeneCol)
+finally:
+    # Close files
+    args.input.close()
+    args.atlas.close()
+    args.output.close()
diff --git a/lib/libAnnoAtlas.py b/lib/libAnnoAtlas.py
@@ -0,0 +1,147 @@
+# libAnnoAtlas
+# Functions to compare an annotation file with the Human Protein Atlas dataset
+#
+# By Daniel R Faulkner
+
+from lib import libAnnoShared
+import re
+
+EmptyChar = "."     # Character to use for empty fields
+
+# Creates a class for addressing Human Protein Atlas files
+# Usage example:
+#atlasobj = atlas(open('atlasfilename'))
+#HPAline = atlasobj.returndataline('GeneName')
+class atlas(object):
+    """Functions and operations which use the Human Protein Atlas dataset"""
+    def __init__(self, atlasfileobj):
+        """Store the header line, the data start position and the key column numbers
+
+        atlasfileobj -- Seekable file object for a tab separated Human Protein Atlas file
+        """
+        self.fileobj = atlasfileobj
+        # Store the header and store the start position for the data entries
+        atlasfileobj.seek(0)
+        self.header = atlasfileobj.readline()
+        # Text mode positions are opaque values which only tell() can supply,
+        # so the position is read directly rather than calculated from a line length
+        self.datastart = atlasfileobj.tell()
+        # Determine key column numbers
+        # Human Protein Atlas column titles
+        geneName = "Gene"
+        geneAlt = '"Gene synonym"'
+        ensemblID = "Ensembl"
+        # Convert to column numbers
+        self.geneNameCol = libAnnoShared.columnnum(self.header,geneName)
+        self.geneAltCol = libAnnoShared.columnnum(self.header,geneAlt)
+        self.ensemblIDCol = libAnnoShared.columnnum(self.header,ensemblID)
+    def returndataline(self, gene):
+        """Return the correlating entry from the Human Protein Atlas dataset
+
+        gene -- Gene name, gene synonym or Ensembl ID (case and surrounding spaces ignored)
+
+        Returns the first matching line unaltered, or an empty string if there is no match.
+        """
+        gene = gene.strip().upper()
+        # An empty name would otherwise match any entry without synonyms
+        if gene=="":
+            return ""
+        # The highest column number needed, to identify incomplete lines
+        lastcol = max(self.geneNameCol, self.geneAltCol, self.ensemblIDCol)
+        # Process each entry looking for a match
+        self.fileobj.seek(self.datastart)
+        for line in iter(self.fileobj.readline, ""):
+            fields = line.split('\t')   # Divide into separate fields
+            if len(fields)<=lastcol:
+                continue    # Blank or incomplete line
+            # Check gene name
+            if gene == fields[self.geneNameCol].strip().upper():
+                return line
+            # Check gene synonyms
+            alts = fields[self.geneAltCol].split(',')
+            if gene in [item.strip().replace('"',"").upper() for item in alts]:
+                return line
+            # Check Ensembl ID
+            if gene == fields[self.ensemblIDCol].strip().upper():
+                return line
+        return ""
+
+# Adds columns from a Human Protein Atlas file to an annotation file.
+# Usage example:
+#combineEntries(open('annofilename'), open('atlasfilename'), open('outputfile','w'), ['Position','Chromosome'], 0, 9):
+def combineEntries(annofile, atlasfile, outputfile, atlasterms = [], regex = 0, annocol=-1):
+    """Adds Human Protein Atlas information to an annotation file
+
+    annofile   -- Annotation file object, opened for reading
+    atlasfile  -- Human Protein Atlas file object (tab separated, header on the first line)
+    outputfile -- File object the combined lines are written to
+    atlasterms -- Atlas columns to add: column numbers (counted from 0) and/or column titles.
+                  An empty list adds every atlas column.
+    regex      -- If true the column titles are treated as regular expressions
+    annocol    -- Annotation column holding the gene name, or -1 to use the
+                  repID reported by libAnnoShared.Annotation
+
+    Requested fields which are not present, including every field of an
+    unmatched gene, are written as EmptyChar.
+    """
+    # Check which atlas columns to use
+    atlasobj = atlas(atlasfile)
+    headerfields = atlasobj.header.strip().split('\t')
+    atlascols = []
+    searchterms = []
+    for item in atlasterms:
+        try:
+            # A number is used directly as a column number
+            col = int(item)
+        except ValueError:
+            # Anything else is a column title. Quotes are removed as they may confuse matches.
+            searchterms.append(item.replace('"',""))
+            continue
+        # Column numbers outside the header are ignored
+        if -1 < col < len(headerfields) and col not in atlascols:
+            atlascols.append(col)
+    # Compare every header title against the search terms. The column number always
+    # follows the header position, however many search terms are supplied.
+    exactterms = [term.upper() for term in searchterms]
+    for count, item in enumerate(headerfields):
+        if regex:   # Regular expression match (if regular expression flag used)
+            matched = libAnnoShared.regexMatch(item,searchterms)
+        else:       # Exact match, ignoring case and any quotes around the title
+            matched = item.replace('"',"").upper() in exactterms
+        if matched and count not in atlascols:
+            atlascols.append(count)
+    # Add all columns if no columns specified
+    if len(atlasterms)==0:
+        atlascols = list(range(len(headerfields)))
+    # Process the annotation files
+    filetype = libAnnoShared.detectFileType(annofile)
+    line = annofile.readline()
+    header = line
+    # Skip the comment lines, keeping the last one as the header.
+    # startswith is used as the empty string returned at the end of the file has no first character.
+    while line.startswith("#"):
+        header = line
+        line = annofile.readline()
+    # Write out updated header line
+    if header.startswith("#"):  # Check this is a true header line
+        titles = [headerfields[item] for item in atlascols]
+        outputfile.write("\t".join([header.strip()]+titles)+"\n")
+    # Add data lines
+    while line:
+        # Get the gene name
+        if annocol>-1:
+            geneName = line.split('\t')[annocol]
+        else:
+            geneName = libAnnoShared.Annotation(line, filetype, header).repID
+        # Lookup the line. No match returns an empty string, which has no fields.
+        atlasline = atlasobj.returndataline(geneName).strip()
+        atlasfields = atlasline.split('\t') if atlasline else []
+        # Output the desired fields
+        outputfields = [line.strip()]
+        for item in atlascols:
+            if item<len(atlasfields):
+                outputfields.append(atlasfields[item])
+            else:
+                outputfields.append(EmptyChar)
+        outputfile.write("\t".join(outputfields)+"\n")
+        line = annofile.readline()
diff --git a/lib/libAnnoFeat.py b/lib/libAnnoFeat.py
@@ -254,7 +254,7 @@ def featureClosestAddColumn(annofileobj, reftrackobj, outfileobj, senseorder=0,
     while line[0]=="#":
         header = line
         line = annofileobj.readline()
-    extracolstmp = "{} Name\t{} Type\t{} Strand\t{} Distance\tWithin Name\tWithin Strand\tWithin Type\tWithin Distance\t{} Name\t{} Type\t{} Strand\t{} Distance\n"
+    extracolstmp = "\t{} Name\t{} Type\t{} Strand\t{} Distance\tWithin Name\tWithin Strand\tWithin Type\tWithin Distance\t{} Name\t{} Type\t{} Strand\t{} Distance\n"
     if senseorder:
         extracols = extracolstmp.format("AntiSense","AntiSense","AntiSense","AntiSense","Sense","Sense","Sense","Sense")
     else:

diff --git a/lib/libAnnoFilter.py b/lib/libAnnoFilter.py
@@ -3,7 +3,7 @@
 #
 # By Daniel R Faulkner
 
-from lib.libAnnoShared import columnnum
+from lib.libAnnoShared import columnnum, regexMatch
 import re
 
 ## Filter functions
@@ -93,21 +93,6 @@ def filterChromosome(annoobj, names=[], exclude=0, regex=0):
 
 ## Supporting Functions
 
-# Support regular expressions in the names list
-def regexMatch(query, list):
-    """Loops through a list of regular expressions until a match is found"""
-    # Loop through the list looking for a match
-    passed = 0
-    for name in list:
-        try:
-            regname = re.compile(name)
-        except:
-            raise Exception(name+" is not a compatible regular expression.")
-        if regname.search(query):
-            passed=1
-            break
-    return passed
-
 # Produce a dictionary object of first open reading frame positions
 # INPUT: The ORF fileobject
 # OUTPUT: Dictionary of end positions

diff --git a/lib/libAnnoShared.py b/lib/libAnnoShared.py
@@ -3,6 +3,8 @@
 #
 # By Daniel R Faulkner
 
+import re
+
 # Return the column number from a name in the header.
 # Note: This is case sensitive. An advancement in future maybe to return a dictionary with all header columns.
 def columnnum(header, name):
@@ -16,6 +18,23 @@ def columnnum(header, name):
             count = count +1
     return column
 
+# Checks the query against a list of regular expressions.
+# INPUT: A string to examine. A list of regular expression search terms.
+# OUTPUT: Returns 0 if no match is found or 1 if one of the regular expressions matches
+def regexMatch(query, list):
+    """Loops through a list of regular expressions until a match is found
+
+    Raises an Exception naming an invalid search term, if one is reached before a match.
+    """
+    for name in list:
+        try:
+            regname = re.compile(name)
+        except re.error as err:     # Other errors, such as interrupts, are left alone
+            raise Exception(name+" is not a compatible regular expression.") from err
+        if regname.search(query):   # A match anywhere in the query is sufficient
+            return 1
+    return 0
+
 # Creates an object for an entry in an annotation file and applies basic conversions to make the data consistant
 # INPUT: Line from annotation file, the input file format and the header line where available.
 # NOTE: Scores are stored as integers. This could be changed to a float for greater accuracy. (Supported by DFAM and GTF)