Merge pull request #28 from yhoogstrate/mem_red_3

Another reduction of memory footprint
yhoogstrate · Mar 15, 2016 · 72a3297 · 72a3297
2 parents 240565f + 8d579d9
commit 72a3297
Show file tree

Hide file tree

Showing 19 changed files with 139 additions and 95 deletions.
diff --git a/Changelog b/Changelog
@@ -1,3 +1,11 @@
+2016-03-15  Youri Hoogstrate
+
+	* Version 2.12.1: Another reduction of memory footprint - from
+	  linear scale (as the number of samples increase) to chunk-wise.
+
+	* Changed verbosity settings, requires the --verbose argument to get
+	  a detailed output.
+
 2016-03-14  Youri Hoogstrate
 
 	* Version 2.12.0: Huge reduction of memory footprint - from

diff --git a/README.md b/README.md
@@ -30,6 +30,7 @@ Bioinformatics first published online December 10, 2015
          - [--strand-specific-matching](#--strand-specific-matching)
          - [--acceptor-donor-order-specific-matching](#--acceptor-donor-order-specific-matching)
          - [Input formats](#input-formats)
+         - [--verbose](#--verbose)
     - [Galaxy](#galaxy-1)
 - [Examples](#examples)
     - [Example 01: one sample, two tools](#example-01-one-sample-two-tools)
@@ -441,6 +442,29 @@ Or run the following command line argument to get an overview of the versions at
 
 	fuma --formats
 
+#### --verbose ####
+
+If you would like to see additional statistics during runtime (or post-runtime
+if you store the output) you should run FuMa with the `--verbose` argument:
+
+	fuma \
+	    -a  "hg19:genes_hg19.bed" \
+	    \
+	    -s  "chimerascan:chimerascan:FOO_chimerascan/chimeras.bedpe" \
+	        "defuse:defuse:FOO_defuse/results.tsv" \
+	    -l  "chimerascan:hg19" \
+	        "defuse:hg19" \
+	    -f  "list" \
+	    -o  "chimerascan_defuse_overlap.txt" \
+	    --verbose
+
+This allows the user to inspect the number of duplicate fusions and the
+number of genes parsed from the gene set, and to see which datasets
+are being compared at run time.
+
+* Note: As of 2.12.1 this argument is required to get this output; in
+previous versions it was enabled by default.
+
 ### Galaxy ###
 
 After having FuMa installed in Galaxy via the toolshed, it can be opened by typing '*fuma*' in the '*search tools*' field on the left panel in galaxy. When it has opened, the interface should be similar to [Fig. S2: FuMa in Galaxy](#fig-s2-fuma-in-galaxy). The main input of the Galaxy wrapper is a set of datasets. You can add as many datasets as the server can handle in terms of resources. For each dataset the user needs to specify (1) the history item in galaxy that contains the output file of the fusion gene detection experiment, (2) the corresponding file format and name of the tool that corresponds to the history item and (3) a corresponding gene annotation file (in BED format). Lastly, the user can specify the desired output format and proceed with the analysis.

diff --git a/fuma/CompareFusionsBySpanningGenes.py b/fuma/CompareFusionsBySpanningGenes.py
@@ -28,7 +28,7 @@
 
 
 class CompareFusionsBySpanningGenes:
-	logger = logging.getLogger("FuMA::Readers::CompareFusionsBySpanningGenes")
+	logger = logging.getLogger("FuMa::CompareFusionsBySpanningGenes")
 
 	def __init__(self,experiment_1,experiment_2,args):
 		self.experiment_1 = experiment_1
@@ -37,7 +37,7 @@ def __init__(self,experiment_1,experiment_2,args):
 		self.args = args
 
 	def find_overlap(self):
-		self.logger.info("Comparing: '"+self.experiment_1.name+"' with '"+self.experiment_2.name + "'" + " - using '"+self.args.matching_method+"'-based matching")
+		self.logger.debug("Comparing: '"+self.experiment_1.name+"' with '"+self.experiment_2.name + "'" + " - using '"+self.args.matching_method+"'-based matching")
 		overlap_between_experiments = FusionDetectionExperiment(self.experiment_1.name+"_vs._"+self.experiment_2.name)
 
 		if(self.experiment_1.genes_spanning_left_junction and self.experiment_2.genes_spanning_left_junction and self.experiment_1.genes_spanning_right_junction and self.experiment_2.genes_spanning_right_junction):

diff --git a/fuma/FusionDetectionExperiment.py b/fuma/FusionDetectionExperiment.py
@@ -31,7 +31,7 @@
 from fuma.Fusion import STRAND_REVERSE
 
 class FusionDetectionExperiment:
-	logger = logging.getLogger("FuMA::Readers::FusionDetectionExperiment")
+	logger = logging.getLogger("FuMa::FusionDetectionExperiment")
 
 	def __init__(self,name):
 		self.name = name
@@ -230,7 +230,7 @@ def annotate_genes(self,gene_annotation):
 
 	def annotate_genes_left(self,gene_annotation):
 		if(not self.genes_spanning_left_junction):
-			self.logger.info("Annotating genes on the left junction: "+self.name+" - "+gene_annotation.name)
+			self.logger.debug("Annotating genes on the left junction: "+self.name+" - "+gene_annotation.name)
 
 			for fusion in self.__iter__():
 				if(fusion.annotated_genes_left == None):				# if object is not set, make it an empty list
@@ -243,7 +243,7 @@ def annotate_genes_left(self,gene_annotation):
 
 	def annotate_genes_right(self,gene_annotation):
 		if(not self.genes_spanning_right_junction):
-			self.logger.info("Annotating genes on the right junction: "+self.name+" - "+gene_annotation.name)
+			self.logger.debug("Annotating genes on the right junction: "+self.name+" - "+gene_annotation.name)
 
 			for fusion in self:
 				if(fusion.annotated_genes_right == None):				# if object is not set, make it an empty list
@@ -360,9 +360,9 @@ def remove_duplicates(self,args):
 			self.add_fusion(fusion)
 
 		if(self.name.find("vs.") == -1):
-			self.logger.info("* Full: "+str(old_count))
-			self.logger.info("* Gene-spanning: "+str(old_count-stats_non_gene_spanning))
-			self.logger.info("* Unique: "+str(len(self)))
+			self.logger.debug("* Full: "+str(old_count))
+			self.logger.debug("* Gene-spanning: "+str(old_count-stats_non_gene_spanning))
+			self.logger.debug("* Unique: "+str(len(self)))
 
 		return len(self)
 

diff --git a/fuma/GeneAnnotation.py b/fuma/GeneAnnotation.py
@@ -36,7 +36,7 @@ def __init__(self,name):
 		self.gas = HTSeq.GenomicArrayOfSets("auto", stranded=False)
 
 	def add_annotation(self,gene,chromosome,start,stop):
-		self.logger.debug("Adding annotation "+str(self.n)+": "+chromosome+":"+str(start)+"-"+str(stop)+" = "+str(gene))
+		#self.logger.debug("Adding annotation "+str(self.n)+": "+chromosome+":"+str(start)+"-"+str(stop)+" = "+str(gene))
 		self.gas[HTSeq.GenomicInterval(chromosome,start,stop)] += gene
 		self.n += 1
 

diff --git a/fuma/OverlapComplex.py b/fuma/OverlapComplex.py
@@ -31,6 +31,8 @@
 
 
 class OverlapComplex:
+	logger = logging.getLogger("FuMa::OverlapComplex")
+
 	def __init__(self):
 		self.datasets = []
 		self.dataset_names = []
@@ -77,16 +79,16 @@ def overlay_fusions(self,sparse=True,export_dir=False,args=None):
 		This makes the algorithm much more efficient (reduces space
 		complexity from 0.5(n^2) => 2n).
 		"""
-
 		n = len(self.datasets)
 
+		self.logger.info("Determining the overlap of fusion genes in "+str(n)+" datasets")
+
 		self.matrix_tmp = {}
 
 		for i in range(len(self.datasets)):
 			self.matrix_tmp[str(i+1)] = self.datasets[i]
 
 		#comparisons = self.find_combination_table(n)
-
 		if(args.format=="list" and export_dir != False):
 			if args.long_gene_size > 0:
 				large_genes = "Spans large gene (>"+str(args.long_gene_size)+"bp)"
@@ -95,23 +97,24 @@ def overlay_fusions(self,sparse=True,export_dir=False,args=None):
 
 			export_dir.write("Left-genes\tRight-genes\t"+large_genes+"\t"+"\t".join(self.dataset_names)+"\n")
 
+		ri = 0
 		for r in self.find_combination_table(len(self.datasets)):
-			ri = len(self.datasets)-1
+			r_0 = self.find_combination_table_r_i(len(self.datasets),ri,0)
 
 			# First cleanup the memory - reduces space complexity from 0.5(n^2) => 2n. In addition, memory should decrease in time
 			dont_remove = []
 			matches_this_iteration = set([])
 
-			for c in r:
-				keys = self.create_keys(c)
+			#for c in r:
+				#keys = self.create_keys(c)
 
-				dont_remove.append(keys[0])
-				dont_remove.append(keys[1])
+				#dont_remove.append(keys[0])
+				#dont_remove.append(keys[1])
 
-			if(args.format != "list"):
-				for candidate in self.matrix_tmp.keys():
-					if candidate not in dont_remove:
-						del(self.matrix_tmp[candidate])
+			#if(args.format != "list"):
+				#for candidate in self.matrix_tmp.keys():
+					#if candidate not in dont_remove:
+						#del(self.matrix_tmp[candidate])
 
 			# Then run analysis
 			for c in r:
@@ -129,8 +132,9 @@ def overlay_fusions(self,sparse=True,export_dir=False,args=None):
 				self.matches_total[keys[2]] = len(matches[0])
 
 			if(args.format=="list"):# Write those that are not marked to go to the next iteration to a file
-				if(len(r[0]) > 2):
-					for export_key in previous_comparisons:#comparisons[ri-1]:
+				if(len(r_0) > 2):
+					for export_key in self.find_combination_table_r(len(self.datasets),ri-1):#previous_comparisons:#comparisons[ri-1]:
+						export_key = [str(x) for x in export_key]
 						export_key = '.'.join(export_key)
 
 						self.matrix_tmp[export_key].export_to_list(export_dir,self.dataset_names,matches_this_iteration,args)
@@ -140,10 +144,10 @@ def overlay_fusions(self,sparse=True,export_dir=False,args=None):
 						self.matrix_tmp[export_key].export_to_list(export_dir,self.dataset_names,matches_this_iteration,args)
 						#del(self.matrix_tmp[export_key]) ## if this was once in a list to be removed, remove...
 
-			previous_comparisons = r
+			ri += 1
 
 		if(args.format == "list" and export_dir != False):
-			export_key = '.'.join(r[0])
+			export_key = '.'.join([str(x) for x in r_0])
 			self.matrix_tmp[export_key].export_to_list(export_dir,self.dataset_names,set([]),args) ## if this was once in a list to be removed, remove...?
 
 		return matches
@@ -152,11 +156,17 @@ def find_combination_table(self,n):
 		in_list = range(1,n+1)
 
 		for r in range(2,len(in_list)+1):
-			table_i_tmp = itertools.combinations(in_list,r)
-			table_i = []
-			for i in table_i_tmp:
-				table_i.append(list(i))
-			yield table_i
+			yield (list(j) for j in itertools.combinations(in_list,r))
+
+	def find_combination_table_r(self,n,r):
+		in_list = range(1,n+1)
+		r += 2
+		return (list(j) for j in itertools.combinations(in_list,r))
+
+	def find_combination_table_r_i(self,n,r,i):
+		in_list = range(1,n+1)
+		r += 2
+		return list(list(itertools.combinations(in_list,r))[i])
 
 	def set_annotation(self,arg_gene_annotation):
 		self.gene_annotation = arg_gene_annotation

diff --git a/fuma/ParseBED.py b/fuma/ParseBED.py
@@ -41,7 +41,7 @@
 import HTSeq
 
 class ParseBED(GeneAnnotation):
-	logger = logging.getLogger("FuMA::ParseBED")
+	logger = logging.getLogger("FuMa::ParseBED")
 
 	def __init__(self,filename,name,long_gene_size):
 		GeneAnnotation.__init__(self,name)
@@ -57,7 +57,7 @@ def parse(self,filename):
 				if(len(line) > 0):
 					self.parse_line(line)
 
-		self.logger.info('Size of Gene Annotation: '+str(len(self)))
+		self.logger.debug('Size of Gene Annotation: '+str(len(self)))
 
 	def cleanup_chr_name(self,chr_name):
 		"""Given the large number of fusion genes, we remove all 'chr'