Skip to content

Commit

Permalink
Merge pull request #9 from DanielFaulkner/extratesting
Browse files Browse the repository at this point in the history
Updated tests and fixes where errors found in running the tests.
  • Loading branch information
DanielFaulkner authored Feb 9, 2021
2 parents ced0da4 + aa0d832 commit 58ad283
Show file tree
Hide file tree
Showing 9 changed files with 144 additions and 26 deletions.
28 changes: 18 additions & 10 deletions annofeat.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,19 +18,23 @@
parser.add_argument("reference", help="Reference filename", type=argparse.FileType('r'))
parser.add_argument("output", help="Output filename", type=argparse.FileType('w'))
# Optional
parser.add_argument("-m","--margin", help="Basepair margin to include", nargs=1, type=int)
parser.add_argument("-m","--margin", help="Margin to include in basepairs [use with -o]", nargs=1, type=int)
parser.add_argument("-a","--all", help="List all features", action="store_true")
parser.add_argument("-t","--title", help="Column title", action="store")
parser.add_argument("-c","--closest", help="List closest features regardless of distance", action="store_true")
parser.add_argument("-t","--title", help="Column title [use with -o]", action="store")
parser.add_argument("-o","--overlap", help="Only examine overlapping region", action="store_true")
parser.add_argument("-s","--sense", help="Order closest items by sense and antisense", action="store_true")
parser.add_argument("-f","--features", help="Features to include when look for closest feature", action="store")

# Any commands entered without a flag
args = parser.parse_args()

# Setup variables:
features=['exon','transcript']

all = 0
if args.all:
all = 1
features=[]

margin = 0
if args.margin:
Expand All @@ -40,22 +44,26 @@
if args.title:
title = args.title

useclosest = 0
if args.closest:
useclosest = 1
useoverlap = 0
if args.overlap:
useoverlap = 1

usesense = 0
if args.sense:
usesense = 1

if args.features:
features=args.features.split(',')

# Run the command
print("Indexing reference file")
trackobj = libAnnoShared.loadTrackFile(args.reference)
if useclosest:
libAnnoFeat.featureClosestAddColumn(args.query,trackobj,args.output,usesense, all)
else:
print("Processing annotations *This may take some time*")
print("Processing annotations *This may take some time if used with large unsorted files*")
if useoverlap:
libAnnoFeat.featureOverlappingAddColumn(args.query,trackobj,args.output,margin,title,all)
else:
libAnnoFeat.featureClosestAddColumn(args.query,trackobj,args.output,usesense, all, features)


# Close files
args.query.close()
Expand Down
2 changes: 2 additions & 0 deletions lib/libAnnoAtlas.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@

EmptyChar = "." # Character to use for empty fields

# TODO: Additional error checking to return an error message if an out of range column number is used

# Creates a class for addressing Human Protein Atlas files
# Usage example:
#atlasobj = atlas(open('atlasfilename'))
Expand Down
4 changes: 2 additions & 2 deletions lib/libAnnoFeat.py
Original file line number Diff line number Diff line change
Expand Up @@ -244,7 +244,7 @@ def featureOverlappingAddColumn(annofileobj, reftrackobj, outfileobj, margin=0,
# Simple usage example:
#trackobj = libAnnoShared.loadTrackFile(open("RefFilename.gtf")) # Load the files into a track object
#featureClosestAddColumn(open('queryfile.bed'), trackobj, open('output.tsv','w')) # Outputs a file with additional columns on nearby features
def featureClosestAddColumn(annofileobj, reftrackobj, outfileobj, senseorder=0, returnall=0):
def featureClosestAddColumn(annofileobj, reftrackobj, outfileobj, senseorder=0, returnall=0, features=[]):
"""Adds a column detailing the feature in a region"""
type = libAnnoShared.detectFileType(annofileobj)
annofileobj.seek(0)
Expand Down Expand Up @@ -274,7 +274,7 @@ def featureClosestAddColumn(annofileobj, reftrackobj, outfileobj, senseorder=0,
if annoObj.alignStart<startPrev:# Reset start position if the current annotation preceeds the previous one
startpos=None # If no start position given the function moves to the start of the chromosome
startPrev = annoObj.alignStart
before, within, after, startpos = featureClosest(annoObj, reftrackobj, startpos, [], returnall)
before, within, after, startpos = featureClosest(annoObj, reftrackobj, startpos, features, returnall)
# Check if we are outputting based on file position or sense/antisense.
# TODO
newcolstmp = "\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\n"
Expand Down
3 changes: 2 additions & 1 deletion lib/libAnnoShared.py
Original file line number Diff line number Diff line change
Expand Up @@ -257,6 +257,7 @@ def indexFile(self):
annoentry = Annotation(line,self.type,self.header)
chrName = annoentry.chrName
startpos = annoentry.alignStart
chrName = chrName.upper()
# Check if the annotations are ordered by start position, smallest to largest
if startpos<curStart and chrName==curChr: # Check if the start position has decreased while the chromosome has stayed the same
self.sorted = 0
Expand All @@ -265,7 +266,7 @@ def indexFile(self):
if chrName!=curChr: # If the new chr name is different, update the start position
if chrName not in indexChr:
position = self.fileobj.tell()-len(line)
indexChr[chrName.upper()] = position
indexChr[chrName] = position
else: # NOTE: Need to account for unordered files!
#print("WARNING: File not sorted by chromosome, access times will be longer.")
self.warningmsg = "WARNING: File not grouped by chromosome, access times will be longer."
Expand Down
11 changes: 6 additions & 5 deletions lib/libAnnoSort.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,11 +25,12 @@ def sort(trackobj, outputfile):
# Create a list of alignment start positions and line positions
line = trackobj.fileobj.readline()
while line:
annoentry = libAnnoShared.Annotation(line, trackobj.type, trackobj.header)
if annoentry.chrName.upper()==chromsome:
chrlineindex.append([annoentry.alignStart,trackobj.fileobj.tell()-len(line)])
elif annoentry.chrName.upper()!=chromsome and trackobj.ordered==1:
line = None
if line[0]!="#":
annoentry = libAnnoShared.Annotation(line, trackobj.type, trackobj.header)
if annoentry.chrName.upper()==chromsome:
chrlineindex.append([annoentry.alignStart,trackobj.fileobj.tell()-len(line)])
elif annoentry.chrName.upper()!=chromsome and trackobj.ordered==1:
line = None
if line:
line = trackobj.fileobj.readline()
# Sort the start positions
Expand Down
47 changes: 47 additions & 0 deletions test/atlastest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
#!/usr/bin/python3

# Small test program to call the annoatlas.py utility and add Human Protein Atlas columns to an annotation file.
# For use with the linux operating system. May need modifiying to work with other operating system types.

import subprocess
import os
import sys
import zipfile

# Variables
annotationFile = "sampledata/atlasInput.bed" # Annotation file with a column of gene IDs
outputFile1 = "CombinedOutput1.tsv" # Output filename
outputFile2 = "CombinedOutput2.tsv" # Output filename
AtlasName = "proteinatlas.tsv"
AtlasPath = "https://www.proteinatlas.org/download/proteinatlas.tsv.zip"
GeneCol = 6 # Column with gene names in the annotation file
terms1 = "9,10,50,Chromosome" # Exact search terms or column numbers
terms2 = "RNA" # Search terms using regular expression

# Create test directory
command = "mkdir atlasfiles"
subprocess.call(command, shell=True)

# Atlas file locator:
print("Locating Human Protein Atlas file")
if not os.path.exists(AtlasName):
print("\nHuman Protein Atlas data file not found in this directory - "+AtlasName)
print("\nIf you have previously downloaded the file please place a copy or link with the same filename in this directoy.")
print("\nIf you do no have this file it can be downloaded from : "+AtlasPath)
print("\nDownload Human Protein Atlas tsv data file (10mb) (Y) or exit (N)")
usrin = input('(default N):')
if usrin.upper()=="Y":
command = "wget "+AtlasPath
subprocess.call(command, shell=True)
atlaszipped = zipfile.ZipFile("proteinatlas.tsv.zip")
atlaszipped.extractall()
else:
sys.exit()

# Combine columns (specified by number)
command = "python3 ../annoatlas.py "+annotationFile+" "+AtlasName+" atlasfiles/"+outputFile1+" -c "+str(GeneCol)+" -a "+terms1
subprocess.call(command, shell=True)

# Combine columns (specified by regular expression)
command = "python3 ../annoatlas.py "+annotationFile+" "+AtlasName+" atlasfiles/"+outputFile2+" -c "+str(GeneCol)+" -a "+terms2+" -r"
subprocess.call(command, shell=True)
23 changes: 17 additions & 6 deletions test/featuretest.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,20 +5,31 @@

import subprocess

# Variables
# Variabl]es
queryfile = "GTF.gtf"
reffile = "GTF.gtf"
outputfile = "AddedFeatureFile.gtf"
header = "Overlapping"
all = 0
margin = 0
outputfile1 = "OverlappingFeatureFile.gtf"
outputfile2 = "ClosestFeatureFile.gtf"
title = "Overlapping" # Preceding title (used with overlap option)
all = 1 # Return all or highest priority match
margin = 0 # Extend the overlapping region by this many base pairs (used with overlap option)
sense = 1 # Order the results by sense/antisense (not used with overlap option)

# Create test directory
command = "mkdir featurefiles"
subprocess.call(command, shell=True)

# Run the annotation utility to add overlapping feature columns
command = "python3 ../annofeat.py sampledata/"+queryfile+" sampledata/"+reffile+" featurefiles/"+outputfile+" -m "+str(margin)+" -t "+header
# Overlapping features
command = "python3 ../annofeat.py sampledata/"+queryfile+" sampledata/"+reffile+" featurefiles/"+outputfile1+" -m "+str(margin)+" -t "+title+" -o"
if all:
command = command+" -a"
subprocess.call(command, shell=True)

# Closest feature
command = "python3 ../annofeat.py sampledata/"+queryfile+" sampledata/"+reffile+" featurefiles/"+outputfile2
if all:
command = command+" -a"
if sense:
command = command+" -s"
subprocess.call(command, shell=True)
4 changes: 2 additions & 2 deletions test/scriptexample.sh
Original file line number Diff line number Diff line change
Expand Up @@ -12,5 +12,5 @@ echo
echo Expanding the gtf file so all columns are separated by a tab
python3 ../annotabify.py scriptoutput/convertedfile.gtf scriptoutput/tabbifiedfile.gtf -t 1
echo
echo Adding overlapping feature columns to the annotation file. Comparing to itself. Treating file as gtf formatted for purpose of parsing the file.
python3 ../annofeat.py scriptoutput/tabbifiedfile.gtf scriptoutput/tabbifiedfile.gtf scriptoutput/overlappingfeatures.gtf -t Overlapping -a
echo Adding closest feature columns to the annotation file. Comparing to itself. Treating file as gtf formatted for purpose of parsing the file.
python3 ../annofeat.py scriptoutput/tabbifiedfile.gtf scriptoutput/tabbifiedfile.gtf scriptoutput/closestfeatures.gtf -a
48 changes: 48 additions & 0 deletions test/sorttest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
#!/usr/bin/python3

# Small test program to call the annosort.py script and sort an annotation file.
# For use with the linux operating system. May need modifiying to work with other operating system types.

import subprocess
import random

# Variables
inputfile = "GTF.gtf"
unsortedfile = "unsorted.gtf"
sortedfile = "sorted.gtf"

# Create test directory
command = "mkdir sortedfiles"
subprocess.call(command, shell=True)

# Scramble input file: (use with small files)
print("Randomising input file")
inputobj = open("sampledata/"+inputfile)
outputobj = open("sortedfiles/"+unsortedfile,'w')
line = inputobj.readline()
while line[0]=="#":
outputobj.write(line)
line = inputobj.readline()
linestore = []
while line:
linestore.append(line)
line = inputobj.readline()
while len(linestore)>0:
lineout = linestore.pop(random.randrange(len(linestore)))
outputobj.write(lineout)
outputobj.close()

# Display the status of the unsorted file
print("Unsorted file status:")
command = "python3 ../annosort.py sortedfiles/"+unsortedfile+" -s"
subprocess.call(command, shell=True)

# Sort the unsorted file
print("Sorting file")
command = "python3 ../annosort.py sortedfiles/"+unsortedfile+" -o sortedfiles/"+sortedfile
subprocess.call(command, shell=True)

# Display the status of the sorted file
print("Sorted file status:")
command = "python3 ../annosort.py sortedfiles/"+sortedfile+" -s"
subprocess.call(command, shell=True)

0 comments on commit 58ad283

Please sign in to comment.