Skip to content

Commit

Permalink
FastaStats shows Contig N50 as well.
Browse files Browse the repository at this point in the history
  • Loading branch information
josiahseaman committed Jan 23, 2018
1 parent b5a74b1 commit e39cac1
Show file tree
Hide file tree
Showing 2 changed files with 36 additions and 13 deletions.
47 changes: 35 additions & 12 deletions DNASkittleUtils/FastaStats.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
# N50 = length of the shortest contig such that contigs of that length or longer together contain at least half of the total assembly length
from __future__ import print_function, division, absolute_import, with_statement
import sys
from itertools import chain

from DNASkittleUtils.Contigs import read_contigs

Expand All @@ -25,7 +26,7 @@ def cumulative_sum(numbers_list):
return running_sums


def collect_n50_stats(scaffold_lengths):
def collect_n50_stats(scaffold_lengths, prefix=''):
"""N50:
the length of the shortest contig such that the sum of contigs of equal
length or longer is at least 50% of the total length of all contigs"""
Expand All @@ -37,40 +38,62 @@ def collect_n50_stats(scaffold_lengths):
csum = cumulative_sum(all_len)

assembly_size = sum(scaffold_lengths)
stats['N'] = int(assembly_size)
stats[prefix + 'N'] = int(assembly_size)
halfway_point = (assembly_size // 2)

# get index for cumsum >= N/2
for i, x in enumerate(csum):
if x >= halfway_point:
stats['N50'] = all_len[i]
stats[prefix + 'N50'] = all_len[i]
break

# N90
stats['nx90'] = int(assembly_size * 0.90)
stats[prefix + 'nx90'] = int(assembly_size * 0.90)

# index for csumsum >= 0.9*N
for i, x in enumerate(csum):
if x >= stats['nx90']:
stats['N90'] = all_len[i]
if x >= stats[prefix + 'nx90']:
stats[prefix + 'N90'] = all_len[i]
break

return stats


def scaffold_lengths_from_fasta(input_fasta_path):
    """Read a FASTA file and return its scaffolds together with their lengths.

    :param input_fasta_path: path passed straight to ``read_contigs``
    :return: tuple ``(scaffolds, lengths)`` where ``lengths`` holds
        ``len(x.seq)`` for every record ``x`` in ``scaffolds``, in order
    """
    # NOTE(review): scaffolds is iterated here and returned for reuse, so this
    # presumes read_contigs returns a re-iterable sequence -- confirm.
    scaffolds = read_contigs(input_fasta_path)
    lengths = [len(x.seq) for x in scaffolds]
    return scaffolds, lengths


def split_by_N(scaffolds):
    """Return the lengths of the contigs obtained by cutting scaffolds at 'N'.

    Each scaffold's ``seq`` is split on the uppercase character 'N' and every
    non-empty piece counts as one contig.  ``str.split`` emits an empty string
    between adjacent N's and at N-padded ends; those are dropped so that a gap
    of k N's does not contribute k - 1 zero-length "contigs".

    :param scaffolds: iterable of objects exposing a string ``seq`` attribute
    :return: list of int contig lengths, in scaffold order and left to right
        within each scaffold
    """
    lengths = []
    for scaffold in scaffolds:
        lengths.extend(len(piece) for piece in scaffold.seq.split('N') if piece)
    return lengths


def all_stats(input_fasta):
    """Collect scaffold-level and contig-level statistics for a FASTA file.

    Scaffold statistics are computed from the whole-record lengths; contig
    statistics from the pieces produced by ``split_by_N``.  Both sets are
    merged into one dict, with keys prefixed by 'Scaffold ' and 'Contig '
    respectively, plus 'N%': the percentage of the total scaffold length that
    is not part of any contig.

    :param input_fasta: path to the FASTA file
    :return: dict mapping statistic label to its value
    """
    scaffolds, lengths = scaffold_lengths_from_fasta(input_fasta)
    stats = collect_n50_stats(lengths, prefix='Scaffold ')
    stats.update(collect_n50_stats(split_by_N(scaffolds), prefix='Contig '))

    scaffold_total = stats['Scaffold N']
    if scaffold_total:
        stats['N%'] = (1 - (stats['Contig N'] / float(scaffold_total))) * 100
    else:
        # Empty assembly: avoid dividing by zero; there is nothing to be N.
        stats['N%'] = 0.0
    return stats


if __name__ == '__main__':
    # Usage: FastaStats.py <input.fasta>
    input_fasta_name = sys.argv[1]
    assembly_stats = all_stats(input_fasta_name)

    # Well-known statistics are reported first, in a fixed, readable order.
    label_order = ['Scaffold N', 'Scaffold N50', 'Scaffold N90', 'Scaffold nx90',
                   'Contig N', 'Contig N50', 'Contig N90', 'Contig nx90',
                   'N%']
    for key in label_order:
        # A statistic may be missing (N50/N90 are only stored when the
        # cumulative-sum scan finds a match), so skip absent labels instead
        # of raising KeyError.
        if key in assembly_stats:
            print(key + ":", "{:,}".format(assembly_stats[key]))
    for key in assembly_stats:  # unordered labels
        if key not in label_order:
            print(key + ":", "{:,}".format(assembly_stats[key]))


2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

setup(
name='DNASkittleUtils',
version='1.0.7',
version='1.0.9',
description='Bioinformatics functions that have been useful in multiple projects. Manipulating FASTA files, executing pipelines, etc.',
author='Josiah Seaman',
author_email='[email protected]',
Expand Down

0 comments on commit e39cac1

Please sign in to comment.