Merge pull request #9 from DanielFaulkner/extratesting

Updated tests and fixes where errors found in running the tests.
DanielFaulkner · Feb 9, 2021 · 58ad283 · 58ad283
2 parents ced0da4 + aa0d832
commit 58ad283
Show file tree

Hide file tree

Showing 9 changed files with 144 additions and 26 deletions.
diff --git a/annofeat.py b/annofeat.py
@@ -18,19 +18,23 @@
 parser.add_argument("reference", help="Reference filename", type=argparse.FileType('r'))
 parser.add_argument("output", help="Output filename", type=argparse.FileType('w'))
 # Optional
-parser.add_argument("-m","--margin", help="Basepair margin to include", nargs=1, type=int)
+parser.add_argument("-m","--margin", help="Margin to include in basepairs [use with -o]", nargs=1, type=int)
 parser.add_argument("-a","--all", help="List all features", action="store_true")
-parser.add_argument("-t","--title", help="Column title", action="store")
-parser.add_argument("-c","--closest", help="List closest features regardless of distance", action="store_true")
+parser.add_argument("-t","--title", help="Column title [use with -o]", action="store")
+parser.add_argument("-o","--overlap", help="Only examine overlapping region", action="store_true")
 parser.add_argument("-s","--sense", help="Order closest items by sense and antisense", action="store_true")
+parser.add_argument("-f","--features", help="Features to include when look for closest feature", action="store")
 
 # Any commands entered without a flag
 args = parser.parse_args()
 
 # Setup variables:
+features=['exon','transcript']
+
 all = 0
 if args.all:
     all = 1
+    features=[]
 
 margin = 0
 if args.margin:
@@ -40,22 +44,26 @@
 if args.title:
     title = args.title
 
-useclosest = 0
-if args.closest:
-    useclosest = 1
+useoverlap = 0
+if args.overlap:
+    useoverlap = 1
 
 usesense = 0
 if args.sense:
     usesense = 1
 
+if args.features:
+    features=args.features.split(',')
+
 # Run the command
 print("Indexing reference file")
 trackobj = libAnnoShared.loadTrackFile(args.reference)
-if useclosest:
-    libAnnoFeat.featureClosestAddColumn(args.query,trackobj,args.output,usesense, all)
-else:
-    print("Processing annotations *This may take some time*")
+print("Processing annotations *This may take some time if used with large unsorted files*")
+if useoverlap:
     libAnnoFeat.featureOverlappingAddColumn(args.query,trackobj,args.output,margin,title,all)
+else:
+    libAnnoFeat.featureClosestAddColumn(args.query,trackobj,args.output,usesense, all, features)
+
 
 # Close files
 args.query.close()

diff --git a/lib/libAnnoAtlas.py b/lib/libAnnoAtlas.py
@@ -8,6 +8,8 @@
 
 EmptyChar = "."     # Character to use for empty fields
 
+# TODO: Additional error checking to return an error message if an out of range column number is used
+
 # Creates a class for addressing Human Protein Atlas files
 # Usage example:
 #atlasobj = atlas(open('atlasfilename'))

diff --git a/lib/libAnnoFeat.py b/lib/libAnnoFeat.py
@@ -244,7 +244,7 @@ def featureOverlappingAddColumn(annofileobj, reftrackobj, outfileobj, margin=0,
 # Simple usage example:
 #trackobj = libAnnoShared.loadTrackFile(open("RefFilename.gtf"))            # Load the files into a track object
 #featureClosestAddColumn(open('queryfile.bed'), trackobj, open('output.tsv','w'))  # Outputs a file with additional columns on nearby features
-def featureClosestAddColumn(annofileobj, reftrackobj, outfileobj, senseorder=0, returnall=0):
+def featureClosestAddColumn(annofileobj, reftrackobj, outfileobj, senseorder=0, returnall=0, features=[]):
     """Adds a column detailing the feature in a region"""
     type = libAnnoShared.detectFileType(annofileobj)
     annofileobj.seek(0)
@@ -274,7 +274,7 @@ def featureClosestAddColumn(annofileobj, reftrackobj, outfileobj, senseorder=0,
             if annoObj.alignStart<startPrev:# Reset start position if the current annotation preceeds the previous one
                 startpos=None               # If no start position given the function moves to the start of the chromosome
                 startPrev = annoObj.alignStart
-            before, within, after, startpos = featureClosest(annoObj, reftrackobj, startpos, [], returnall)
+            before, within, after, startpos = featureClosest(annoObj, reftrackobj, startpos, features, returnall)
             # Check if we are outputting based on file position or sense/antisense.
             # TODO
             newcolstmp = "\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n"

diff --git a/lib/libAnnoShared.py b/lib/libAnnoShared.py
@@ -257,6 +257,7 @@ def indexFile(self):
                     annoentry = Annotation(line,self.type,self.header)
                     chrName = annoentry.chrName
                     startpos = annoentry.alignStart
+                chrName = chrName.upper()
                 # Check if the annotations are ordered by start position, smallest to largest
                 if startpos<curStart and chrName==curChr:   # Check if the start position has decreased while the chromosome has stayed the same
                     self.sorted = 0
@@ -265,7 +266,7 @@ def indexFile(self):
                 if chrName!=curChr:         # If the new chr name is different, update the start position
                     if chrName not in indexChr:
                         position = self.fileobj.tell()-len(line)
-                        indexChr[chrName.upper()] = position
+                        indexChr[chrName] = position
                     else:                               # NOTE: Need to account for unordered files!
                         #print("WARNING: File not sorted by chromosome, access times will be longer.")
                         self.warningmsg = "WARNING: File not grouped by chromosome, access times will be longer."

diff --git a/lib/libAnnoSort.py b/lib/libAnnoSort.py
@@ -25,11 +25,12 @@ def sort(trackobj, outputfile):
         # Create a list of alignment start positions and line positions
         line = trackobj.fileobj.readline()
         while line:
-            annoentry = libAnnoShared.Annotation(line, trackobj.type, trackobj.header)
-            if annoentry.chrName.upper()==chromsome:
-                chrlineindex.append([annoentry.alignStart,trackobj.fileobj.tell()-len(line)])
-            elif annoentry.chrName.upper()!=chromsome and trackobj.ordered==1:
-                line = None
+            if line[0]!="#":
+                annoentry = libAnnoShared.Annotation(line, trackobj.type, trackobj.header)
+                if annoentry.chrName.upper()==chromsome:
+                    chrlineindex.append([annoentry.alignStart,trackobj.fileobj.tell()-len(line)])
+                elif annoentry.chrName.upper()!=chromsome and trackobj.ordered==1:
+                    line = None
             if line:
                 line = trackobj.fileobj.readline()
         # Sort the start positions

diff --git a/test/atlastest.py b/test/atlastest.py
@@ -0,0 +1,47 @@
+#!/usr/bin/python3
+
+# Small test program to call the annoatlas.py utility and add Human Protein Atlas columns to an annotation file.
+# For use with the linux operating system. May need modifiying to work with other operating system types.
+
+import subprocess
+import os
+import sys
+import zipfile
+
+# Variables
+annotationFile = "sampledata/atlasInput.bed"    # Annotation file with a column of gene IDs
+outputFile1 = "CombinedOutput1.tsv"               # Output filename
+outputFile2 = "CombinedOutput2.tsv"               # Output filename
+AtlasName = "proteinatlas.tsv"
+AtlasPath = "https://www.proteinatlas.org/download/proteinatlas.tsv.zip"
+GeneCol = 6                     # Column with gene names in the annotation file
+terms1 = "9,10,50,Chromosome"   # Exact search terms or column numbers
+terms2 = "RNA"                  # Search terms using regular expression
+
+# Create test directory
+command = "mkdir atlasfiles"
+subprocess.call(command, shell=True)
+
+# Atlas file locator:
+print("Locating Human Protein Atlas file")
+if not os.path.exists(AtlasName):
+    print("\nHuman Protein Atlas data file not found in this directory - "+AtlasName)
+    print("\nIf you have previously downloaded the file please place a copy or link with the same filename in this directoy.")
+    print("\nIf you do no have this file it can be downloaded from : "+AtlasPath)
+    print("\nDownload Human Protein Atlas tsv data file (10mb) (Y) or exit (N)")
+    usrin = input('(default N):')
+    if usrin.upper()=="Y":
+        command = "wget "+AtlasPath
+        subprocess.call(command, shell=True)
+        atlaszipped = zipfile.ZipFile("proteinatlas.tsv.zip")
+        atlaszipped.extractall()
+    else:
+        sys.exit()
+
+# Combine columns (specified by number)
+command = "python3 ../annoatlas.py "+annotationFile+" "+AtlasName+" atlasfiles/"+outputFile1+" -c "+str(GeneCol)+" -a "+terms1
+subprocess.call(command, shell=True)
+
+# Combine columns (specified by regular expression)
+command = "python3 ../annoatlas.py "+annotationFile+" "+AtlasName+" atlasfiles/"+outputFile2+" -c "+str(GeneCol)+" -a "+terms2+" -r"
+subprocess.call(command, shell=True)
diff --git a/test/featuretest.py b/test/featuretest.py
@@ -5,20 +5,31 @@
 
 import subprocess
 
-# Variables
+# Variabl]es
 queryfile = "GTF.gtf"
 reffile = "GTF.gtf"
-outputfile = "AddedFeatureFile.gtf"
-header = "Overlapping"
-all = 0
-margin = 0
+outputfile1 = "OverlappingFeatureFile.gtf"
+outputfile2 = "ClosestFeatureFile.gtf"
+title = "Overlapping"   # Preceding title (used with overlap option)
+all = 1                 # Return all or highest priority match
+margin = 0              # Extend the overlapping region by this many base pairs (used with overlap option)
+sense = 1               # Order the results by sense/antisense (not used with overlap option)
 
 # Create test directory
 command = "mkdir featurefiles"
 subprocess.call(command, shell=True)
 
 # Run the annotation utility to add overlapping feature columns
-command = "python3 ../annofeat.py sampledata/"+queryfile+" sampledata/"+reffile+" featurefiles/"+outputfile+" -m "+str(margin)+" -t "+header
+# Overlapping features
+command = "python3 ../annofeat.py sampledata/"+queryfile+" sampledata/"+reffile+" featurefiles/"+outputfile1+" -m "+str(margin)+" -t "+title+" -o"
 if all:
     command = command+" -a"
 subprocess.call(command, shell=True)
+
+# Closest feature
+command = "python3 ../annofeat.py sampledata/"+queryfile+" sampledata/"+reffile+" featurefiles/"+outputfile2
+if all:
+    command = command+" -a"
+if sense:
+    command = command+" -s"
+subprocess.call(command, shell=True)
diff --git a/test/scriptexample.sh b/test/scriptexample.sh
@@ -12,5 +12,5 @@ echo
 echo Expanding the gtf file so all columns are separated by a tab
 python3 ../annotabify.py scriptoutput/convertedfile.gtf scriptoutput/tabbifiedfile.gtf -t 1
 echo
-echo Adding overlapping feature columns to the annotation file. Comparing to itself. Treating file as gtf formatted for purpose of parsing the file.
-python3 ../annofeat.py scriptoutput/tabbifiedfile.gtf scriptoutput/tabbifiedfile.gtf scriptoutput/overlappingfeatures.gtf -t Overlapping -a
+echo Adding closest feature columns to the annotation file. Comparing to itself. Treating file as gtf formatted for purpose of parsing the file.
+python3 ../annofeat.py scriptoutput/tabbifiedfile.gtf scriptoutput/tabbifiedfile.gtf scriptoutput/closestfeatures.gtf -a
diff --git a/test/sorttest.py b/test/sorttest.py
@@ -0,0 +1,48 @@
+#!/usr/bin/python3
+
+# Small test program to call the annosort.py script and sort an annotation file.
+# For use with the linux operating system. May need modifiying to work with other operating system types.
+
+import subprocess
+import random
+
+# Variables
+inputfile = "GTF.gtf"
+unsortedfile = "unsorted.gtf"
+sortedfile = "sorted.gtf"
+
+# Create test directory
+command = "mkdir sortedfiles"
+subprocess.call(command, shell=True)
+
+# Scramble input file:  (use with small files)
+print("Randomising input file")
+inputobj = open("sampledata/"+inputfile)
+outputobj = open("sortedfiles/"+unsortedfile,'w')
+line = inputobj.readline()
+while line[0]=="#":
+    outputobj.write(line)
+    line = inputobj.readline()
+linestore = []
+while line:
+    linestore.append(line)
+    line = inputobj.readline()
+while len(linestore)>0:
+    lineout = linestore.pop(random.randrange(len(linestore)))
+    outputobj.write(lineout)
+outputobj.close()
+
+# Display the status of the unsorted file
+print("Unsorted file status:")
+command = "python3 ../annosort.py sortedfiles/"+unsortedfile+" -s"
+subprocess.call(command, shell=True)
+
+# Sort the unsorted file
+print("Sorting file")
+command = "python3 ../annosort.py sortedfiles/"+unsortedfile+" -o sortedfiles/"+sortedfile
+subprocess.call(command, shell=True)
+
+# Display the status of the sorted file
+print("Sorted file status:")
+command = "python3 ../annosort.py sortedfiles/"+sortedfile+" -s"
+subprocess.call(command, shell=True)