FastaStats shows Contig N50 as well.

josiahseaman · Jan 23, 2018 · e39cac1 · e39cac1
1 parent b5a74b1
commit e39cac1
Show file tree

Hide file tree

Showing 2 changed files with 36 additions and 13 deletions.
diff --git a/DNASkittleUtils/FastaStats.py b/DNASkittleUtils/FastaStats.py
@@ -12,6 +12,7 @@
 # N50 = length of the shortest contig such that contigs of that length or longer add up to at least half of the total assembly length
 from __future__ import print_function, division, absolute_import, with_statement
 import sys
+from itertools import chain
 
 from DNASkittleUtils.Contigs import read_contigs
 
@@ -25,7 +26,7 @@ def cumulative_sum(numbers_list):
     return running_sums
 
 
-def collect_n50_stats(scaffold_lengths):
+def collect_n50_stats(scaffold_lengths, prefix=''):
     """N50:
     the length of the shortest contig such that the sum of contigs of equal
     length or longer is at least 50% of the total length of all contigs"""
@@ -37,40 +38,62 @@ def collect_n50_stats(scaffold_lengths):
     csum = cumulative_sum(all_len)
 
     assembly_size = sum(scaffold_lengths)
-    stats['N'] = int(assembly_size)
+    stats[prefix + 'N'] = int(assembly_size)
     halfway_point = (assembly_size // 2)
 
     # get index for cumsum >= N/2
     for i, x in enumerate(csum):
         if x >= halfway_point:
-            stats['N50'] = all_len[i]
+            stats[prefix + 'N50'] = all_len[i]
             break
 
     # N90
-    stats['nx90'] = int(assembly_size * 0.90)
+    stats[prefix + 'nx90'] = int(assembly_size * 0.90)
 
     # index for csumsum >= 0.9*N
     for i, x in enumerate(csum):
-        if x >= stats['nx90']:
-            stats['N90'] = all_len[i]
+        if x >= stats[prefix + 'nx90']:
+            stats[prefix + 'N90'] = all_len[i]
             break
 
     return stats
 
 
 def scaffold_lengths_from_fasta(input_fasta_path):
-    contigs = read_contigs(input_fasta_path)
-    lengths = [len(x.seq) for x in contigs]
+    """Pass `input_fasta_path` to read_contigs and return a (scaffolds, lengths)
+    pair, where `lengths` holds len(x.seq) for every x in `scaffolds`."""
+    scaffolds = read_contigs(input_fasta_path)
+    lengths = [len(x.seq) for x in scaffolds]
+    return scaffolds, lengths
+
+
+def split_by_N(scaffolds):
+    """Return the lengths of the pieces obtained by splitting each scaffold's
+    `.seq` on the character 'N'.
+
+    Lengths are returned in input order (scaffold by scaffold, left to right).
+    Adjacent 'N' characters produce zero-length pieces; they are kept, and add
+    nothing to the summed length."""
+    lengths = []
+    for scaffold in scaffolds:
+        # A list (not a set) keeps the output order deterministic between runs.
+        lengths.extend(len(piece) for piece in scaffold.seq.split('N'))
     return lengths
 
 
 def all_stats(input_fasta):
-    lengths = scaffold_lengths_from_fasta(input_fasta)
-    return collect_n50_stats(lengths)
+    """Return a single dict combining the 'Scaffold '-prefixed stats, the
+    'Contig '-prefixed stats (scaffolds split on 'N'), and an 'N%' entry."""
+    scaffolds, lengths = scaffold_lengths_from_fasta(input_fasta)
+    scaffold_stats = collect_n50_stats(lengths, prefix='Scaffold ')
+    contig_lengths = split_by_N(scaffolds)
+    contig_stats = collect_n50_stats(contig_lengths, prefix='Contig ')
+    scaffold_stats.update(contig_stats)
+    # N% = share of the scaffold total that is missing from the contig total, as a percentage
+    scaffold_stats['N%'] = (1 - (scaffold_stats['Contig N'] / float(scaffold_stats['Scaffold N']))) * 100
+    return scaffold_stats
 
 
 if __name__ == '__main__':
-    input_fasta_name= sys.argv[1]
+    # The first command-line argument is handed to all_stats as the input FASTA.
+    input_fasta_name = sys.argv[1]
     assembly_stats = all_stats(input_fasta_name)
-    for key in assembly_stats:
+    # Known labels are printed first, in this fixed order.
+    label_order = ['Scaffold N', 'Scaffold N50', 'Scaffold N90', 'Scaffold nx90',
+                   'Contig N', 'Contig N50', 'Contig N90', 'Contig nx90',
+                   'N%']
+    for key in label_order:
         print(key + ":", "{:,}".format(assembly_stats[key]))
+    # Any remaining keys not listed in label_order are printed afterwards.
+    for key in assembly_stats:  # unordered labels
+        if key not in label_order:
+            print(key + ":", "{:,}".format(assembly_stats[key]))
+
+
diff --git a/setup.py b/setup.py
@@ -2,7 +2,7 @@
 
 setup(
     name='DNASkittleUtils',
-    version='1.0.7',
+    version='1.0.9',
     description='Bioinformatics functions that have been useful in multiple projects.  Manipulating FASTA files, executing pipelines, etc.',
     author='Josiah Seaman',
     author_email='[email protected]',