Commit 8453b6b

added modded_scripts

billytaj committed Jul 12, 2024
2 parents c8ce586 + c8ce586

Showing 6 changed files with 253 additions and 2 deletions.
7 changes: 6 additions & 1 deletion Dockerfile
@@ -348,12 +348,15 @@ RUN wget https://github.com/COMBINE-lab/salmon/releases/download/v1.10.0/salmon-
&& tar -xzvf salmon-1.10.0_linux_x86_64.tar.gz \
&& rm *.tar.gz


ENV PATH="${PATH}:/quackers_tools/salmon-latest_linux_x86_64/bin"
ENV PATH="${PATH}:/quackers_tools/fastANI"
ENV PATH="${PATH}:/quackers_tools/fasttree"
ENV PATH="${PATH}:/quackers_tools/mash"
ENV PATH="${PATH}:/quackers_tools/adapterremoval"
ENV PATH="${PATH}:/quackers_tools/cdhit_dup"
ENV PATH="${PATH}:/quackers_tools/megahit/bin"



WORKDIR /quackers_pipe
@@ -369,9 +372,11 @@ RUN wget https://raw.githubusercontent.com/ParkinsonLab/quackers/v1.0.0/Config.i

WORKDIR /quackers_pipe/scripts

RUN wget https://raw.githubusercontent.com/ParkinsonLab/quackers/v1.0.0/scripts/clean_reads_reconcile.py
RUN wget https://raw.githubusercontent.com/ParkinsonLab/quackers/v1.0.0/scripts/contig_reconcile.py
RUN wget https://raw.githubusercontent.com/ParkinsonLab/quackers/v1.0.0/scripts/sam_sift.py

WORKDIR /quackers_pipe/modded_scripts



101 changes: 101 additions & 0 deletions modded_scripts/concoct_coverage_table.py
@@ -0,0 +1,101 @@
#!/usr/bin/env python
"""
A script to generate the input coverage table for CONCOCT using a BEDFile.
Output is written to stdout. The BEDFile defines the regions used as
subcontigs for CONCOCT. This makes it possible to get the coverage for
subcontigs without specifically mapping reads against the subcontigs.
@author: inodb, alneberg
"""
from __future__ import print_function
import sys
import os
import argparse
import subprocess
from signal import signal, SIGPIPE, SIG_DFL
import pandas as pd


def check_bed_file_for_errors(bedfile):
    """Verify that only the subcontig IDs carry the 'concoct_part_' pattern."""
    with open(bedfile) as ifh:
        for line in ifh:
            line = line.strip()
            original_id, _, _, cutup_id = line.split('\t')
            try:
                assert 'concoct_part_' not in original_id
                assert 'concoct_part_' in cutup_id
            except AssertionError:
                sys.stderr.write(("ERROR! Something is wrong with the line:\n'{}'\n"
                                  "Perhaps 'concoct_part_' is misplaced or missing? Exiting!\n").format(line))
                sys.exit(-1)


def generate_input_table(bedfile, bamfiles, samplenames=None):
    """Runs samtools bedcov and prints the table in the format required for
    running CONCOCT."""
    p = subprocess.Popen(["samtools", "bedcov", bedfile] + bamfiles, stdout=subprocess.PIPE)
    out, _ = p.communicate()
    if p.returncode != 0:
        raise Exception('Error running samtools bedcov (exit code {})'.format(p.returncode))

    # Header: use the bam file names if no sample names were given
    if samplenames is None:
        col_names = [os.path.splitext(os.path.basename(bamfile))[0] for bamfile in bamfiles]
    else:
        col_names = samplenames
    header = ["cov_mean_sample_{}".format(n) for n in col_names]

    # Content
    if sys.version_info[0] < 3:
        from StringIO import StringIO
    else:
        from io import StringIO

    fh = StringIO(out.decode('utf-8'))
    df = pd.read_table(fh, header=None)
    # samtools bedcov reports the summed per-base depth for each region;
    # dividing by the region length (end - start) gives the mean depth.
    avg_coverage_depth = df[df.columns[4:]].divide((df[2] - df[1]), axis=0)
    avg_coverage_depth.index = df[3]
    avg_coverage_depth.columns = header
    avg_coverage_depth.to_csv(sys.stdout, index_label='contig', sep='\t', float_format='%.3f')


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("bedfile", help="Contigs BEDFile with four columns representing: 'Contig ID, Start Position, "
                        "End Position and SubContig ID' respectively. The SubContig ID must contain the pattern "
                        "'concoct_part_[0-9]*' while contigs which are not cut up must not contain this pattern. "
                        "This file can be generated with the cut_up_fasta.py script.")
    parser.add_argument("bamfiles", nargs='+', help="BAM files with mappings to the original contigs.")
    parser.add_argument("--samplenames", default=None, help="File with sample names, one per line. Should contain "
                        "the same number of names as there are bamfiles. Default sample names are the file names "
                        "of the bamfiles, excluding the file extension.")
    args = parser.parse_args()

    # Get sample names
    if args.samplenames is not None:
        samplenames = [s.strip() for s in open(args.samplenames).readlines()]
        if len(samplenames) != len(args.bamfiles):
            raise Exception("Nr of names ({0}) in samplenames should be equal to nr of given "
                            "bamfiles ({1})".format(len(samplenames), len(args.bamfiles)))
    else:
        samplenames = None

    # Ignore broken pipe errors when piping the output
    # http://newbebweb.blogspot.pt/2012/02/python-head-ioerror-errno-32-broken.html
    signal(SIGPIPE, SIG_DFL)
    check_bed_file_for_errors(args.bedfile)
    generate_input_table(args.bedfile, args.bamfiles, samplenames=samplenames)
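
Note: the mean-depth arithmetic in generate_input_table can be checked by hand. A minimal sketch, using made-up region rows and depth sums in place of a real samtools bedcov run:

# Sketch of the bedcov -> mean coverage math used by generate_input_table.
# The two regions and all depth values below are invented for illustration.
import pandas as pd
from io import StringIO

# samtools bedcov output: the four BED columns (contig, start, end,
# subcontig id) followed by one summed-depth column per BAM file.
bedcov_out = ("contig_1\t0\t10000\tcontig_1.concoct_part_0\t50000\t20000\n"
              "contig_1\t10000\t15000\tcontig_1.concoct_part_1\t10000\t2500\n")
df = pd.read_table(StringIO(bedcov_out), header=None)

# Mean depth = summed per-base depth / region length (end - start)
mean_depth = df[df.columns[4:]].divide(df[2] - df[1], axis=0)
mean_depth.index = df[3]
print(mean_depth)  # part_0: 5.0 and 2.0; part_1: 2.0 and 0.5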

44 changes: 44 additions & 0 deletions modded_scripts/extract_fasta_bins.py
@@ -0,0 +1,44 @@
#!/usr/bin/env python
"""extract_fasta_bins.py
Extract a fasta file for each cluster from a CONCOCT result file.
"""

import argparse
import os
import sys
from collections import defaultdict

import pandas as pd
from Bio import SeqIO


def main(args):
    # Index all contig sequences by their fasta ID
    all_seqs = {}
    for seq in SeqIO.parse(args.fasta_file, "fasta"):
        all_seqs[seq.id] = seq

    df = pd.read_csv(args.cluster_file)
    try:
        assert df.columns[0] == 'contig_id'
        assert df.columns[1] == 'cluster_id'
    except AssertionError:
        sys.stderr.write("ERROR! Header line was not 'contig_id, cluster_id', please adjust your input file. Exiting!\n")
        sys.exit(-1)

    # Group contig IDs by the cluster they were assigned to
    cluster_to_contigs = defaultdict(list)
    for _, row in df.iterrows():
        cluster_to_contigs[row['cluster_id']].append(row['contig_id'])

    # Write one fasta file per cluster
    for cluster_id, contig_ids in cluster_to_contigs.items():
        output_file = os.path.join(args.output_path, "{0}.fa".format(cluster_id))
        seqs = [all_seqs[contig_id] for contig_id in contig_ids]
        with open(output_file, 'w') as ofh:
            SeqIO.write(seqs, ofh, 'fasta')


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("fasta_file", help="Input fasta file.")
    parser.add_argument("cluster_file", help="CONCOCT output cluster file.")
    parser.add_argument("--output_path", default=os.getcwd(),
                        help="Directory where the per-cluster fasta files will be written.")
    args = parser.parse_args()

    main(args)
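
For reference, a minimal sketch of how this script is driven, assuming hypothetical file names and the two-column CSV that CONCOCT emits:

# Hypothetical inputs for extract_fasta_bins; all file names are made up.
# clustering_merged.csv is expected to look like:
#   contig_id,cluster_id
#   contig_1,0
#   contig_2,3
import argparse
import os

args = argparse.Namespace(
    fasta_file="assembly.fa",              # the original (uncut) contigs
    cluster_file="clustering_merged.csv",  # CONCOCT cluster assignments
    output_path=os.getcwd(),               # 0.fa, 3.fa, ... appear here
)
# main(args) would then write one fasta bin per cluster id.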
66 changes: 66 additions & 0 deletions modded_scripts/merge_cutup_clustering.py
@@ -0,0 +1,66 @@
#!/usr/bin/env python
"""
With contigs cut up by cut_up_fasta.py as input, ensures that the consecutive
parts of the original contigs are merged. Prints the result to stdout.
@author: alneberg
"""
from __future__ import print_function
import sys
import argparse
from collections import defaultdict, Counter
import re

CONTIG_PART_EXPR = re.compile(r"(.*)\.concoct_part_([0-9]*)")


def original_contig_name_special(s):
    """Split a subcontig ID into the original contig ID and its part number."""
    try:
        original_id, part_nr = CONTIG_PART_EXPR.match(s).group(1, 2)
        return original_id, part_nr
    except AttributeError:  # No match for the concoct_part regex
        return s, 0


def main(args):
    # Map each original contig to {part id: cluster id}
    all_originals = defaultdict(dict)
    first = True
    with open(args.cutup_clustering_result, 'r') as ifh:
        for line in ifh:
            if first:
                first = False
                if 'contig_id' not in line:
                    sys.stderr.write(("ERROR! The term 'contig_id' was not found on the first row. Please make sure "
                                      "that there is a header line before continuing. Exiting\n"))
                    sys.exit(-1)
                continue
            line = line.strip()
            contig_id, cluster_id = line.split(',')
            original_contig_name, part_id = original_contig_name_special(contig_id)
            all_originals[original_contig_name][part_id] = cluster_id

    sys.stdout.write("contig_id,cluster_id\n")
    for original_contig_id, part_ids_d in all_originals.items():
        if len(part_ids_d) > 1:
            # Majority vote over the cluster assignments of all parts
            c = Counter(part_ids_d.values())
            cluster_id = c.most_common(1)[0][0]
            c_string = [(a, b) for a, b in c.items()]
            if len(c) > 1:
                sys.stderr.write("No consensus cluster for contig {}: {}\tChosen cluster: {}\n".format(
                    original_contig_id, c_string, cluster_id))
        else:
            cluster_id = list(part_ids_d.values())[0]

        sys.stdout.write("{},{}\n".format(original_contig_id, cluster_id))


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("cutup_clustering_result", help="Input cutup clustering result.")
    args = parser.parse_args()

    main(args)
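
The majority vote above is easy to trace with a toy example; a sketch with made-up part and cluster IDs:

# Sketch of the per-contig majority vote, with invented cluster labels.
from collections import Counter

# Three cut-up parts of one contig and the clusters they were assigned to
part_ids_d = {"0": "5", "1": "5", "2": "7"}

c = Counter(part_ids_d.values())     # Counter({'5': 2, '7': 1})
cluster_id = c.most_common(1)[0][0]  # '5' wins the vote
assert cluster_id == "5"
# len(c) > 1 here, so the script would also log a "No consensus" warning.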
35 changes: 35 additions & 0 deletions modded_scripts/print_comment.py
@@ -0,0 +1,35 @@
#!/usr/bin/env python3
# This script prints any comment in a structured and pretty way: the text is
# word-wrapped at 90 characters and centred inside a 120-character-wide
# banner drawn with the given delimiter character.
import sys

comm = sys.argv[1]
delim = sys.argv[2]

print('\n' + delim * 120)

max_len = 90

cut = comm.split(" ")
line = ""
for word in cut:
    if (len(line) + 1 + len(word)) > max_len:
        # The line is full: centre it inside the banner and start a new one
        edge1 = (120 - len(line)) / 2 - 5
        edge2 = 120 - edge1 - len(line) - 10

        edge1 = int(edge1)
        edge2 = int(edge2)
        print(delim * 5 + " " * edge1 + line + " " * edge2 + delim * 5)
        line = word
    else:
        line = line + " " + word

# Flush the last (possibly partial) line
edge1 = (120 - len(line)) / 2 - 5
edge2 = 120 - edge1 - len(line) - 10

edge1 = int(edge1)
edge2 = int(edge2)

print(delim * 5 + " " * edge1 + line + " " * edge2 + delim * 5)

print(delim * 120 + '\n')
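
A quick sketch of what an invocation produces, assuming the script is run from the modded_scripts directory with hypothetical arguments:

# Hypothetical invocation of print_comment.py; the message is made up.
import subprocess

# Prints a banner roughly like:
# ======================= ... 120 '=' characters ... =======================
# =====                assembling sample_1 with megahit                =====
# ======================= ... 120 '=' characters ... =======================
subprocess.run(["python3", "print_comment.py",
                "assembling sample_1 with megahit", "="])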



2 changes: 1 addition & 1 deletion quackers_paths.py
@@ -336,7 +336,7 @@ def __init__(self, output_folder_path, config_path = None):


self.tool_install_path = "/quackers_tools"
-self.temp_internal_scripts_path = "/home/billy/storage/quackers"
+self.temp_internal_scripts_path = "/quackers_pipe"
#self.mwrap_temp_path = os.path.join(self.temp_internal_scripts_path, "modded_scripts")

self.megahit_path = os.path.join(self.tool_install_path, "megahit", "bin", "megahit")
