-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
6 changed files
with
253 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,101 @@ | ||
#!/usr/bin/env python | ||
""" | ||
A script to generate the input coverage table for CONCOCT using a BEDFile. | ||
Output is written to stdout. The BEDFile defines the regions used as | ||
subcontigs for concoct. This makes it possible to get the coverage for | ||
subcontigs without specifically mapping reads against the subcontigs. | ||
@author: inodb, alneberg | ||
""" | ||
from __future__ import print_function | ||
import sys | ||
import os | ||
import argparse | ||
import subprocess | ||
import glob | ||
from signal import signal, SIGPIPE, SIG_DFL | ||
import pandas as pd | ||
from datetime import datetime as dt | ||
|
||
def check_bed_file_for_errors(bedfile):
    """Validate a CONCOCT cutup BED file before use.

    Each non-empty line must have four tab-separated columns where the
    original contig id (column 1) does NOT contain 'concoct_part_' and the
    subcontig id (column 4) DOES. On the first offending or malformed line,
    an error is written to stderr and the process exits with status -1.
    """
    with open(bedfile) as ifh:
        for line in ifh:
            line = line.strip()
            if not line:
                # Tolerate blank lines instead of crashing on the unpack below.
                continue
            try:
                original_id, _, _, cutup_id = line.split('\t')
                # Explicit checks instead of `assert` so validation still
                # runs when Python is started with -O.
                if 'concoct_part_' in original_id or 'concoct_part_' not in cutup_id:
                    raise ValueError
            except ValueError:
                # Also reached when the line does not have exactly 4 columns.
                sys.stderr.write(("ERROR! Something is wrong with the line:\n'{}'\n"
                    "Perhaps 'concoct_part_' is misplaced or missing? Exiting!\n").format(line))
                sys.exit(-1)
|
||
def generate_input_table(bedfile, bamfiles, samplenames=None):
    """Print the CONCOCT coverage table to stdout.

    Runs `samtools bedcov` over the BED regions for all BAM files and
    converts the summed per-region base counts into mean coverage depths.

    Parameters
    ----------
    bedfile : str
        BED file with columns: contig id, start, end, subcontig id.
    bamfiles : list of str
        BAM files with mappings against the original (uncut) contigs.
    samplenames : list of str, optional
        Column labels; defaults to the BAM file basenames without extension.

    Raises
    ------
    Exception
        If `samtools bedcov` exits with a non-zero status.
    """
    p = subprocess.Popen(["samtools", "bedcov", bedfile] + bamfiles,
                         stdout=subprocess.PIPE)
    # stderr is not piped, so `err` is always None; samtools' own error
    # messages go straight to this process' stderr.
    out, err = p.communicate()
    if p.returncode != 0:
        raise Exception('Error with running samtools bedcov')

    # Header: one mean-coverage column per sample.
    if samplenames is None:
        # Use the BAM file names (minus extension) if no names were given.
        col_names = [os.path.splitext(os.path.basename(bamfile))[0]
                     for bamfile in bamfiles]
    else:
        col_names = samplenames
    header = ["cov_mean_sample_{}".format(n) for n in col_names]

    # Local import kept for Python 2/3 compatibility, as in the original.
    if sys.version_info[0] < 3:
        from StringIO import StringIO
    else:
        from io import StringIO

    fh = StringIO(out.decode('utf-8'))
    # pd.read_table is deprecated; read_csv with sep='\t' is equivalent.
    df = pd.read_csv(fh, header=None, sep='\t')
    # Columns 4+ hold the summed base coverage per sample; divide by the
    # region length (end - start) to get the mean depth per subcontig.
    avg_coverage_depth = df[df.columns[4:]].divide((df[2] - df[1]), axis=0)
    avg_coverage_depth.index = df[3]
    avg_coverage_depth.columns = header
    avg_coverage_depth.to_csv(sys.stdout, index_label='contig',
                              sep='\t', float_format='%.3f')
||
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("bedfile", help="Contigs BEDFile with four columns representing: 'Contig ID, Start Position, "
            "End Position and SubContig ID' respectively. The Subcontig ID must contain the pattern "
            "'concoct_part_[0-9]*' while the contigs which are not cutup cannot contain this pattern. "
            "This file can be generated by the cut_up_fasta.py script.")
    parser.add_argument("bamfiles", nargs='+', help="BAM files with mappings to the original contigs.")
    parser.add_argument("--samplenames", default=None, help="File with sample names, one line each. Should be same nr "
            "of bamfiles. Default sample names used are the file names of the bamfiles, excluding the file extension.")
    args = parser.parse_args()

    # Get sample names
    if args.samplenames is not None:
        # splitlines() drops newlines without eating the final character of
        # a file that lacks a trailing newline (the old `s[:-1]` did), and
        # the `with` block closes the file handle deterministically.
        with open(args.samplenames) as sfh:
            samplenames = sfh.read().splitlines()
        if len(samplenames) != len(args.bamfiles):
            raise Exception("Nr of names ({0}) in samplenames should be equal to nr of given "
                            "bamfiles ({1})".format(len(samplenames), len(args.bamfiles)))
    else:
        samplenames = None

    # ignore broken pipe error when piping output
    # http://newbebweb.blogspot.pt/2012/02/python-head-ioerror-errno-32-broken.html
    signal(SIGPIPE, SIG_DFL)
    check_bed_file_for_errors(args.bedfile)
    generate_input_table(args.bedfile, args.bamfiles, samplenames=samplenames)
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
#!/usr/bin/env python | ||
"""extract_fasta_bins.py | ||
Extract a fasta file for each cluster from a concoct result file. | ||
""" | ||
|
||
import argparse | ||
import sys | ||
import os | ||
from Bio import SeqIO | ||
import pandas as pd | ||
from collections import defaultdict | ||
|
||
def main(args):
    """Write one fasta file per cluster into args.output_path.

    Reads every sequence from args.fasta_file, groups contig ids by cluster
    id according to args.cluster_file (CSV whose first two columns must be
    'contig_id' and 'cluster_id'), and writes each group to
    '<output_path>/<cluster_id>.fa'. Exits with status -1 on a wrong header.
    """
    # Index all sequences by id for O(1) lookup per contig.
    all_seqs = {seq.id: seq for seq in SeqIO.parse(args.fasta_file, "fasta")}

    df = pd.read_csv(args.cluster_file)
    # Explicit check instead of `assert` (stripped under -O); slicing with
    # [:2] also avoids an uncaught IndexError on a one-column file.
    if list(df.columns[:2]) != ['contig_id', 'cluster_id']:
        sys.stderr.write("ERROR! Header line was not 'contig_id, cluster_id', please adjust your input file. Exiting!\n")
        sys.exit(-1)

    cluster_to_contigs = defaultdict(list)
    for _, row in df.iterrows():
        cluster_to_contigs[row['cluster_id']].append(row['contig_id'])

    for cluster_id, contig_ids in cluster_to_contigs.items():
        output_file = os.path.join(args.output_path, "{0}.fa".format(cluster_id))
        seqs = [all_seqs[contig_id] for contig_id in contig_ids]
        with open(output_file, 'w') as ofh:
            SeqIO.write(seqs, ofh, 'fasta')
|
||
if __name__ == "__main__":
    # Build the CLI and hand the parsed arguments straight to main().
    arg_parser = argparse.ArgumentParser(description=__doc__)
    arg_parser.add_argument("fasta_file", help="Input Fasta file.")
    arg_parser.add_argument("cluster_file", help="Concoct output cluster file")
    arg_parser.add_argument("--output_path", default=os.getcwd(),
                            help="Directory where files will be printed")
    main(arg_parser.parse_args())
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,66 @@ | ||
#!/usr/bin/env python | ||
""" | ||
With contigs cutup with cut_up_fasta.py as input, sees to that the consecutive | ||
parts of the original contigs are merged. | ||
prints result to stdout. | ||
@author: alneberg | ||
""" | ||
from __future__ import print_function | ||
import sys | ||
import os | ||
import argparse | ||
from collections import defaultdict, Counter | ||
import re | ||
|
||
# Matches '<original_id>.concoct_part_<nr>'. Raw string avoids the invalid
# '\.' escape warning raised by the original non-raw pattern.
CONTIG_PART_EXPR = re.compile(r"(.*)\.concoct_part_([0-9]*)")


def original_contig_name_special(s):
    """Split a cutup contig id into (original_id, part_nr).

    On a regex match the part number is returned as a string; ids without
    the 'concoct_part_' suffix return (s, 0) with an integer zero — callers
    only use the value as a dictionary key, so the type mismatch is benign.
    """
    try:
        return CONTIG_PART_EXPR.match(s).group(1, 2)
    except AttributeError:  # No matches for concoct_part regex
        return s, 0
|
||
def main(args):
    """Merge cluster assignments of cutup contig parts back to whole contigs.

    Reads the CSV clustering result (must start with a 'contig_id' header
    line), groups the parts of each original contig, assigns each original
    contig its majority cluster (warning on stderr when the parts disagree)
    and prints the merged 'contig_id,cluster_id' table to stdout.
    """
    # Removed unused locals `all_seqs` and `merged_contigs_stack` from the
    # original implementation.
    all_originals = defaultdict(dict)
    first = True
    with open(args.cutup_clustering_result, 'r') as ifh:
        for line in ifh:
            if first:
                first = False
                if 'contig_id' not in line:
                    sys.stderr.write(("ERROR! The term 'contig_id' was not found on the first row. Please make sure that there "
                                      "is a header line before continuing. Exiting\n"))
                    sys.exit(-1)
                continue
            line = line.strip()
            contig_id, cluster_id = line.split(',')
            original_contig_name, part_id = original_contig_name_special(contig_id)
            all_originals[original_contig_name][part_id] = cluster_id

    sys.stdout.write("contig_id,cluster_id\n")
    for original_contig_id, part_ids_d in all_originals.items():
        if len(part_ids_d) > 1:
            # Majority vote over the per-part cluster assignments.
            c = Counter(part_ids_d.values())
            cluster_id = c.most_common(1)[0][0]
            if len(c) > 1:
                sys.stderr.write("No consensus cluster for contig {}: {}\t Chosen cluster: {}\n".format(
                    original_contig_id, list(c.items()), cluster_id))
        else:
            cluster_id = list(part_ids_d.values())[0]

        sys.stdout.write("{},{}\n".format(original_contig_id, cluster_id))
|
||
if __name__ == "__main__":
    # Parse the single positional argument and dispatch to main().
    cli = argparse.ArgumentParser(description=__doc__)
    cli.add_argument("cutup_clustering_result", help=("Input cutup clustering result."))
    main(cli.parse_args())
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
#!/usr/bin/env python3
# Pretty-print a comment string as a centered, delimiter-framed banner.
# Usage: script.py "<comment text>" "<delimiter char>"
import sys

comment_text = sys.argv[1]
delim = sys.argv[2]

WIDTH = 120     # total banner width in characters
MAX_LINE = 90   # wrap threshold for the centered text

print('\n' + delim * WIDTH)


def _framed_row(text):
    # Center `text` between 5-character delimiter edges. Padding is
    # computed in float and truncated toward zero with int(), exactly
    # mirroring the original arithmetic; negative padding collapses to
    # an empty string via Python's negative sequence repetition.
    left_f = (WIDTH - len(text)) / 2 - 5
    right = int(WIDTH - left_f - len(text) - 10)
    left = int(left_f)
    return delim * 5 + " " * left + text + " " * right + delim * 5


current = ""
for word in comment_text.split(" "):
    if (len(current) + 1 + len(word)) > MAX_LINE:
        print(_framed_row(current))
        current = word
    else:
        # NOTE(review): the very first word joins an empty string, so each
        # banner's first row carries a leading space — kept as-is to
        # preserve the original output byte-for-byte.
        current = current + " " + word

print(_framed_row(current))

print(delim * WIDTH + '\n')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters