Skip to content

Commit

Permalink
Merge pull request #28 from yhoogstrate/mem_red_3
Browse files Browse the repository at this point in the history
Another reduction of memory footprint
  • Loading branch information
yhoogstrate committed Mar 15, 2016
2 parents 240565f + 8d579d9 commit 72a3297
Show file tree
Hide file tree
Showing 19 changed files with 139 additions and 95 deletions.
8 changes: 8 additions & 0 deletions Changelog
Original file line number Diff line number Diff line change
@@ -1,3 +1,11 @@
2016-03-15 Youri Hoogstrate

* Version 2.12.1: Another reduction of memory footprint - from
linear scale (as the number of samples increases) to chunk-wise.

* Changed verbosity settings: the --verbose argument is now required
to get detailed output.

2016-03-14 Youri Hoogstrate

* Version 2.12.0: Huge reduction of memory footprint - from
Expand Down
24 changes: 24 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ Bioinformatics first published online December 10, 2015
- [--strand-specific-matching](#--strand-specific-matching)
- [--acceptor-donor-order-specific-matching](#--acceptor-donor-order-specific-matching)
- [Input formats](#input-formats)
- [--verbose](#--verbose)
- [Galaxy](#galaxy-1)
- [Examples](#examples)
- [Example 01: one sample, two tools](#example-01-one-sample-two-tools)
Expand Down Expand Up @@ -441,6 +442,29 @@ Or run the following command line argument to get an overview of the versions at

fuma --formats

#### --verbose ####

If you would like to see additional statistics during runtime (or post-runtime
if you store the output) you should run FuMa with the `--verbose` argument:

fuma \
-a "hg19:genes_hg19.bed" \
\
-s "chimerascan:chimerascan:FOO_chimerascan/chimeras.bedpe" \
"defuse:defuse:FOO_defuse/results.tsv" \
-l "chimerascan:hg19" \
"defuse:hg19" \
-f "list" \
-o "chimerascan_defuse_overlap.txt" \
--verbose

This allows the user to inspect the number of duplicate fusions and the
number of genes parsed from the gene set, and shows which datasets
are being compared at run time.

* Note: As of version 2.12.1 this argument is required to get this output;
in previous versions it was enabled by default.

### Galaxy ###

After having FuMa installed in Galaxy via the toolshed, it can be opened by typing '*fuma*' in the '*search tools*' field on the left panel in galaxy. When it has opened, the interface should be similar to [Fig. S2: FuMa in Galaxy](#fig-s2-fuma-in-galaxy). The main input of the Galaxy wrapper is a set of datasets. You can add as many datasets as the server can handle in terms of resources. For each dataset the user needs to specify (1) the history item in galaxy that contains the output file of the fusion gene detection experiment, (2) the corresponding file format and name of the tool that corresponds to the history item and (3) a corresponding gene annotation file (in BED format). Lastly, the user can specify the desired output format and proceed with the analysis.
Expand Down
4 changes: 2 additions & 2 deletions fuma/CompareFusionsBySpanningGenes.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@


class CompareFusionsBySpanningGenes:
logger = logging.getLogger("FuMA::Readers::CompareFusionsBySpanningGenes")
logger = logging.getLogger("FuMa::CompareFusionsBySpanningGenes")

def __init__(self,experiment_1,experiment_2,args):
self.experiment_1 = experiment_1
Expand All @@ -37,7 +37,7 @@ def __init__(self,experiment_1,experiment_2,args):
self.args = args

def find_overlap(self):
self.logger.info("Comparing: '"+self.experiment_1.name+"' with '"+self.experiment_2.name + "'" + " - using '"+self.args.matching_method+"'-based matching")
self.logger.debug("Comparing: '"+self.experiment_1.name+"' with '"+self.experiment_2.name + "'" + " - using '"+self.args.matching_method+"'-based matching")
overlap_between_experiments = FusionDetectionExperiment(self.experiment_1.name+"_vs._"+self.experiment_2.name)

if(self.experiment_1.genes_spanning_left_junction and self.experiment_2.genes_spanning_left_junction and self.experiment_1.genes_spanning_right_junction and self.experiment_2.genes_spanning_right_junction):
Expand Down
12 changes: 6 additions & 6 deletions fuma/FusionDetectionExperiment.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
from fuma.Fusion import STRAND_REVERSE

class FusionDetectionExperiment:
logger = logging.getLogger("FuMA::Readers::FusionDetectionExperiment")
logger = logging.getLogger("FuMa::FusionDetectionExperiment")

def __init__(self,name):
self.name = name
Expand Down Expand Up @@ -230,7 +230,7 @@ def annotate_genes(self,gene_annotation):

def annotate_genes_left(self,gene_annotation):
if(not self.genes_spanning_left_junction):
self.logger.info("Annotating genes on the left junction: "+self.name+" - "+gene_annotation.name)
self.logger.debug("Annotating genes on the left junction: "+self.name+" - "+gene_annotation.name)

for fusion in self.__iter__():
if(fusion.annotated_genes_left == None): # if object is not set, make it an empty list
Expand All @@ -243,7 +243,7 @@ def annotate_genes_left(self,gene_annotation):

def annotate_genes_right(self,gene_annotation):
if(not self.genes_spanning_right_junction):
self.logger.info("Annotating genes on the right junction: "+self.name+" - "+gene_annotation.name)
self.logger.debug("Annotating genes on the right junction: "+self.name+" - "+gene_annotation.name)

for fusion in self:
if(fusion.annotated_genes_right == None): # if object is not set, make it an empty list
Expand Down Expand Up @@ -360,9 +360,9 @@ def remove_duplicates(self,args):
self.add_fusion(fusion)

if(self.name.find("vs.") == -1):
self.logger.info("* Full: "+str(old_count))
self.logger.info("* Gene-spanning: "+str(old_count-stats_non_gene_spanning))
self.logger.info("* Unique: "+str(len(self)))
self.logger.debug("* Full: "+str(old_count))
self.logger.debug("* Gene-spanning: "+str(old_count-stats_non_gene_spanning))
self.logger.debug("* Unique: "+str(len(self)))

return len(self)

Expand Down
2 changes: 1 addition & 1 deletion fuma/GeneAnnotation.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ def __init__(self,name):
self.gas = HTSeq.GenomicArrayOfSets("auto", stranded=False)

def add_annotation(self,gene,chromosome,start,stop):
self.logger.debug("Adding annotation "+str(self.n)+": "+chromosome+":"+str(start)+"-"+str(stop)+" = "+str(gene))
#self.logger.debug("Adding annotation "+str(self.n)+": "+chromosome+":"+str(start)+"-"+str(stop)+" = "+str(gene))
self.gas[HTSeq.GenomicInterval(chromosome,start,stop)] += gene
self.n += 1

Expand Down
50 changes: 30 additions & 20 deletions fuma/OverlapComplex.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,8 @@


class OverlapComplex:
logger = logging.getLogger("FuMa::OverlapComplex")

def __init__(self):
self.datasets = []
self.dataset_names = []
Expand Down Expand Up @@ -77,16 +79,16 @@ def overlay_fusions(self,sparse=True,export_dir=False,args=None):
This makes the algorithm much more effictent (reduces space
complexity from 0.5(n^2) => 2n).
"""

n = len(self.datasets)

self.logger.info("Determining the overlap of fusion genes in "+str(n)+" datasets")

self.matrix_tmp = {}

for i in range(len(self.datasets)):
self.matrix_tmp[str(i+1)] = self.datasets[i]

#comparisons = self.find_combination_table(n)

if(args.format=="list" and export_dir != False):
if args.long_gene_size > 0:
large_genes = "Spans large gene (>"+str(args.long_gene_size)+"bp)"
Expand All @@ -95,23 +97,24 @@ def overlay_fusions(self,sparse=True,export_dir=False,args=None):

export_dir.write("Left-genes\tRight-genes\t"+large_genes+"\t"+"\t".join(self.dataset_names)+"\n")

ri = 0
for r in self.find_combination_table(len(self.datasets)):
ri = len(self.datasets)-1
r_0 = self.find_combination_table_r_i(len(self.datasets),ri,0)

# First cleanup the memory - reduces space complexity from 0.5(n^2) => 2n. In addition, memory should decrease in time
dont_remove = []
matches_this_iteration = set([])

for c in r:
keys = self.create_keys(c)
#for c in r:
#keys = self.create_keys(c)

dont_remove.append(keys[0])
dont_remove.append(keys[1])
#dont_remove.append(keys[0])
#dont_remove.append(keys[1])

if(args.format != "list"):
for candidate in self.matrix_tmp.keys():
if candidate not in dont_remove:
del(self.matrix_tmp[candidate])
#if(args.format != "list"):
#for candidate in self.matrix_tmp.keys():
#if candidate not in dont_remove:
#del(self.matrix_tmp[candidate])

# Then run analysis
for c in r:
Expand All @@ -129,8 +132,9 @@ def overlay_fusions(self,sparse=True,export_dir=False,args=None):
self.matches_total[keys[2]] = len(matches[0])

if(args.format=="list"):# Write those that are not marked to go to the next iteration to a file
if(len(r[0]) > 2):
for export_key in previous_comparisons:#comparisons[ri-1]:
if(len(r_0) > 2):
for export_key in self.find_combination_table_r(len(self.datasets),ri-1):#previous_comparisons:#comparisons[ri-1]:
export_key = [str(x) for x in export_key]
export_key = '.'.join(export_key)

self.matrix_tmp[export_key].export_to_list(export_dir,self.dataset_names,matches_this_iteration,args)
Expand All @@ -140,10 +144,10 @@ def overlay_fusions(self,sparse=True,export_dir=False,args=None):
self.matrix_tmp[export_key].export_to_list(export_dir,self.dataset_names,matches_this_iteration,args)
#del(self.matrix_tmp[export_key]) ## if this was once in a list to be removed, remove...

previous_comparisons = r
ri += 1

if(args.format == "list" and export_dir != False):
export_key = '.'.join(r[0])
export_key = '.'.join([str(x) for x in r_0])
self.matrix_tmp[export_key].export_to_list(export_dir,self.dataset_names,set([]),args) ## if this was once in a list to be removed, remove...?

return matches
Expand All @@ -152,11 +156,17 @@ def find_combination_table(self,n):
in_list = range(1,n+1)

for r in range(2,len(in_list)+1):
table_i_tmp = itertools.combinations(in_list,r)
table_i = []
for i in table_i_tmp:
table_i.append(list(i))
yield table_i
yield (list(j) for j in itertools.combinations(in_list,r))

def find_combination_table_r(self,n,r):
    """Lazily enumerate the combinations of a single overlap level.

    The dataset indices 1..n are grouped into combinations of r+2
    members (r=0 gives the pairs, r=1 the triplets, and so on), in
    the order produced by itertools.combinations.

    Returns a generator that yields every combination as a list.
    """
    group_size = r + 2
    dataset_indices = range(1, n + 1)
    return (list(combo) for combo in itertools.combinations(dataset_indices, group_size))

def find_combination_table_r_i(self,n,r,i):
    """Return the i-th combination of a single overlap level as a list.

    The dataset indices 1..n are grouped into combinations of r+2
    members (r=0 gives the pairs, r=1 the triplets, and so on), in
    the order produced by itertools.combinations; the combination at
    position i of that sequence is returned.

    Raises IndexError if position i does not exist.
    """
    in_list = range(1,n+1)
    r += 2
    combinations = itertools.combinations(in_list,r)

    if isinstance(i, int) and i >= 0:
        # Stream up to the requested position rather than materialising
        # all C(n, r) combinations in memory only to pick a single one.
        for combination in itertools.islice(combinations, i, i + 1):
            return list(combination)
        raise IndexError("list index out of range")

    # Negative (or otherwise unusual) indices can only be resolved
    # against the complete table, so fall back to building it.
    return list(list(combinations)[i])

def set_annotation(self,arg_gene_annotation):
self.gene_annotation = arg_gene_annotation
Expand Down
4 changes: 2 additions & 2 deletions fuma/ParseBED.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@
import HTSeq

class ParseBED(GeneAnnotation):
logger = logging.getLogger("FuMA::ParseBED")
logger = logging.getLogger("FuMa::ParseBED")

def __init__(self,filename,name,long_gene_size):
GeneAnnotation.__init__(self,name)
Expand All @@ -57,7 +57,7 @@ def parse(self,filename):
if(len(line) > 0):
self.parse_line(line)

self.logger.info('Size of Gene Annotation: '+str(len(self)))
self.logger.debug('Size of Gene Annotation: '+str(len(self)))

def cleanup_chr_name(self,chr_name):
"""Given the large number of fusion genes, we remove all 'chr'
Expand Down
Loading

0 comments on commit 72a3297

Please sign in to comment.