-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
6 changed files
with
253 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,101 @@ | ||
#!/usr/bin/env python | ||
""" | ||
A script to generate the input coverage table for CONCOCT using a BEDFile. | ||
Output is written to stdout. The BEDFile defines the regions used as | ||
subcontigs for concoct. This makes it possible to get the coverage for | ||
subcontigs without specifically mapping reads against the subcontigs. | ||
@author: inodb, alneberg | ||
""" | ||
from __future__ import print_function | ||
import sys | ||
import os | ||
import argparse | ||
import subprocess | ||
import glob | ||
from signal import signal, SIGPIPE, SIG_DFL | ||
import pandas as pd | ||
from datetime import datetime as dt | ||
|
||
def check_bed_file_for_errors(bedfile):
    """Validate a CONCOCT cutup BED file before use.

    Each non-empty line must have four tab-separated columns where the
    original contig id (column 1) does NOT contain 'concoct_part_' and the
    subcontig id (column 4) DOES. On the first offending or malformed line,
    an error is written to stderr and the process exits with status -1.
    """
    with open(bedfile) as ifh:
        for line in ifh:
            line = line.strip()
            if not line:
                # Tolerate blank lines instead of crashing on the unpack below.
                continue
            try:
                original_id, _, _, cutup_id = line.split('\t')
                # Explicit checks instead of `assert` so validation still
                # runs when Python is started with -O.
                if 'concoct_part_' in original_id or 'concoct_part_' not in cutup_id:
                    raise ValueError
            except ValueError:
                # Also reached when the line does not have exactly 4 columns.
                sys.stderr.write(("ERROR! Something is wrong with the line:\n'{}'\n"
                    "Perhaps 'concoct_part_' is misplaced or missing? Exiting!\n").format(line))
                sys.exit(-1)
|
||
def generate_input_table(bedfile, bamfiles, samplenames=None):
    """Print the CONCOCT coverage table to stdout.

    Runs `samtools bedcov` over the BED regions for all BAM files and
    converts the summed per-region base counts into mean coverage depths.

    Parameters
    ----------
    bedfile : str
        BED file with columns: contig id, start, end, subcontig id.
    bamfiles : list of str
        BAM files with mappings against the original (uncut) contigs.
    samplenames : list of str, optional
        Column labels; defaults to the BAM file basenames without extension.

    Raises
    ------
    Exception
        If `samtools bedcov` exits with a non-zero status.
    """
    p = subprocess.Popen(["samtools", "bedcov", bedfile] + bamfiles,
                         stdout=subprocess.PIPE)
    # stderr is not piped, so `err` is always None; samtools' own error
    # messages go straight to this process' stderr.
    out, err = p.communicate()
    if p.returncode != 0:
        raise Exception('Error with running samtools bedcov')

    # Header: one mean-coverage column per sample.
    if samplenames is None:
        # Use the BAM file names (minus extension) if no names were given.
        col_names = [os.path.splitext(os.path.basename(bamfile))[0]
                     for bamfile in bamfiles]
    else:
        col_names = samplenames
    header = ["cov_mean_sample_{}".format(n) for n in col_names]

    # Local import kept for Python 2/3 compatibility, as in the original.
    if sys.version_info[0] < 3:
        from StringIO import StringIO
    else:
        from io import StringIO

    fh = StringIO(out.decode('utf-8'))
    # pd.read_table is deprecated; read_csv with sep='\t' is equivalent.
    df = pd.read_csv(fh, header=None, sep='\t')
    # Columns 4+ hold the summed base coverage per sample; divide by the
    # region length (end - start) to get the mean depth per subcontig.
    avg_coverage_depth = df[df.columns[4:]].divide((df[2] - df[1]), axis=0)
    avg_coverage_depth.index = df[3]
    avg_coverage_depth.columns = header
    avg_coverage_depth.to_csv(sys.stdout, index_label='contig',
                              sep='\t', float_format='%.3f')
||
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument("bedfile", help="Contigs BEDFile with four columns representing: 'Contig ID, Start Position, "
            "End Position and SubContig ID' respectively. The Subcontig ID must contain the pattern "
            "'concoct_part_[0-9]*' while the contigs which are not cutup cannot contain this pattern. "
            "This file can be generated by the cut_up_fasta.py script.")
    parser.add_argument("bamfiles", nargs='+', help="BAM files with mappings to the original contigs.")
    parser.add_argument("--samplenames", default=None, help="File with sample names, one line each. Should be same nr "
            "of bamfiles. Default sample names used are the file names of the bamfiles, excluding the file extension.")
    args = parser.parse_args()

    # Get sample names
    if args.samplenames is not None:
        # splitlines() drops newlines without eating the final character of
        # a file that lacks a trailing newline (the old `s[:-1]` did), and
        # the `with` block closes the file handle deterministically.
        with open(args.samplenames) as sfh:
            samplenames = sfh.read().splitlines()
        if len(samplenames) != len(args.bamfiles):
            raise Exception("Nr of names ({0}) in samplenames should be equal to nr of given "
                            "bamfiles ({1})".format(len(samplenames), len(args.bamfiles)))
    else:
        samplenames = None

    # ignore broken pipe error when piping output
    # http://newbebweb.blogspot.pt/2012/02/python-head-ioerror-errno-32-broken.html
    signal(SIGPIPE, SIG_DFL)
    check_bed_file_for_errors(args.bedfile)
    generate_input_table(args.bedfile, args.bamfiles, samplenames=samplenames)
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
#!/usr/bin/env python | ||
"""extract_fasta_bins.py | ||
Extract a fasta file for each cluster from a concoct result file. | ||
""" | ||
|
||
import argparse | ||
import sys | ||
import os | ||
from Bio import SeqIO | ||
import pandas as pd | ||
from collections import defaultdict | ||
|
||
def main(args):
    """Write one fasta file per cluster into args.output_path.

    Reads every sequence from args.fasta_file, groups contig ids by cluster
    id according to args.cluster_file (CSV whose first two columns must be
    'contig_id' and 'cluster_id'), and writes each group to
    '<output_path>/<cluster_id>.fa'. Exits with status -1 on a wrong header.
    """
    # Index all sequences by id for O(1) lookup per contig.
    all_seqs = {seq.id: seq for seq in SeqIO.parse(args.fasta_file, "fasta")}

    df = pd.read_csv(args.cluster_file)
    # Explicit check instead of `assert` (stripped under -O); slicing with
    # [:2] also avoids an uncaught IndexError on a one-column file.
    if list(df.columns[:2]) != ['contig_id', 'cluster_id']:
        sys.stderr.write("ERROR! Header line was not 'contig_id, cluster_id', please adjust your input file. Exiting!\n")
        sys.exit(-1)

    cluster_to_contigs = defaultdict(list)
    for _, row in df.iterrows():
        cluster_to_contigs[row['cluster_id']].append(row['contig_id'])

    for cluster_id, contig_ids in cluster_to_contigs.items():
        output_file = os.path.join(args.output_path, "{0}.fa".format(cluster_id))
        seqs = [all_seqs[contig_id] for contig_id in contig_ids]
        with open(output_file, 'w') as ofh:
            SeqIO.write(seqs, ofh, 'fasta')
|
||
if __name__ == "__main__":
    # Build the CLI and hand the parsed arguments straight to main().
    arg_parser = argparse.ArgumentParser(description=__doc__)
    arg_parser.add_argument("fasta_file", help="Input Fasta file.")
    arg_parser.add_argument("cluster_file", help="Concoct output cluster file")
    arg_parser.add_argument("--output_path", default=os.getcwd(),
                            help="Directory where files will be printed")
    main(arg_parser.parse_args())
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,66 @@ | ||
#!/usr/bin/env python | ||
""" | ||
With contigs cutup with cut_up_fasta.py as input, sees to that the consecutive | ||
parts of the original contigs are merged. | ||
prints result to stdout. | ||
@author: alneberg | ||
""" | ||
from __future__ import print_function | ||
import sys | ||
import os | ||
import argparse | ||
from collections import defaultdict, Counter | ||
import re | ||
|
||
# Matches '<original_id>.concoct_part_<nr>'. Raw string avoids the invalid
# '\.' escape warning raised by the original non-raw pattern.
CONTIG_PART_EXPR = re.compile(r"(.*)\.concoct_part_([0-9]*)")


def original_contig_name_special(s):
    """Split a cutup contig id into (original_id, part_nr).

    On a regex match the part number is returned as a string; ids without
    the 'concoct_part_' suffix return (s, 0) with an integer zero — callers
    only use the value as a dictionary key, so the type mismatch is benign.
    """
    try:
        return CONTIG_PART_EXPR.match(s).group(1, 2)
    except AttributeError:  # No matches for concoct_part regex
        return s, 0
|
||
def main(args):
    """Merge cluster assignments of cutup contig parts back to whole contigs.

    Reads the CSV clustering result (must start with a 'contig_id' header
    line), groups the parts of each original contig, assigns each original
    contig its majority cluster (warning on stderr when the parts disagree)
    and prints the merged 'contig_id,cluster_id' table to stdout.
    """
    # Removed unused locals `all_seqs` and `merged_contigs_stack` from the
    # original implementation.
    all_originals = defaultdict(dict)
    first = True
    with open(args.cutup_clustering_result, 'r') as ifh:
        for line in ifh:
            if first:
                first = False
                if 'contig_id' not in line:
                    sys.stderr.write(("ERROR! The term 'contig_id' was not found on the first row. Please make sure that there "
                                      "is a header line before continuing. Exiting\n"))
                    sys.exit(-1)
                continue
            line = line.strip()
            contig_id, cluster_id = line.split(',')
            original_contig_name, part_id = original_contig_name_special(contig_id)
            all_originals[original_contig_name][part_id] = cluster_id

    sys.stdout.write("contig_id,cluster_id\n")
    for original_contig_id, part_ids_d in all_originals.items():
        if len(part_ids_d) > 1:
            # Majority vote over the per-part cluster assignments.
            c = Counter(part_ids_d.values())
            cluster_id = c.most_common(1)[0][0]
            if len(c) > 1:
                sys.stderr.write("No consensus cluster for contig {}: {}\t Chosen cluster: {}\n".format(
                    original_contig_id, list(c.items()), cluster_id))
        else:
            cluster_id = list(part_ids_d.values())[0]

        sys.stdout.write("{},{}\n".format(original_contig_id, cluster_id))
|
||
if __name__ == "__main__":
    # Parse the single positional argument and dispatch to main().
    cli = argparse.ArgumentParser(description=__doc__)
    cli.add_argument("cutup_clustering_result", help=("Input cutup clustering result."))
    main(cli.parse_args())
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
#!/usr/bin/env python3
# Pretty-print a comment string as a centered, delimiter-framed banner.
# Usage: script.py "<comment text>" "<delimiter char>"
import sys

comment_text = sys.argv[1]
delim = sys.argv[2]

WIDTH = 120     # total banner width in characters
MAX_LINE = 90   # wrap threshold for the centered text

print('\n' + delim * WIDTH)


def _framed_row(text):
    # Center `text` between 5-character delimiter edges. Padding is
    # computed in float and truncated toward zero with int(), exactly
    # mirroring the original arithmetic; negative padding collapses to
    # an empty string via Python's negative sequence repetition.
    left_f = (WIDTH - len(text)) / 2 - 5
    right = int(WIDTH - left_f - len(text) - 10)
    left = int(left_f)
    return delim * 5 + " " * left + text + " " * right + delim * 5


current = ""
for word in comment_text.split(" "):
    if (len(current) + 1 + len(word)) > MAX_LINE:
        print(_framed_row(current))
        current = word
    else:
        # NOTE(review): the very first word joins an empty string, so each
        # banner's first row carries a leading space — kept as-is to
        # preserve the original output byte-for-byte.
        current = current + " " + word

print(_framed_row(current))

print(delim * WIDTH + '\n')
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters