Skip to content

Commit

Permalink
changes to curated markers and corresponding workflows with phylogenies
Browse files Browse the repository at this point in the history
  • Loading branch information
elizabethmcd committed Dec 19, 2019
1 parent 38699bb commit 4661093
Show file tree
Hide file tree
Showing 5 changed files with 12 additions and 17 deletions.
16 changes: 7 additions & 9 deletions bin/create-genome-phylogeny
Original file line number Diff line number Diff line change
Expand Up @@ -23,14 +23,14 @@ from Bio.SeqRecord import SeqRecord
from collections import defaultdict, Counter

# Arguments
parser = argparse.ArgumentParser(description = "Create ribosomal phylogenies using specific ribosomal markers for archaea and/or bacteria")
parser = argparse.ArgumentParser(description = "Create ribosomal phylogenies using specific ribosomal markers for archaea or bacteria")
parser._action_groups.pop()
required = parser.add_argument_group("required arguments")
optional = parser.add_argument_group("optional arguments")
metadata = parser.add_argument_group("metadata output files for ITOL")
required.add_argument('--input', metavar='INPUT', help='Directory where genomes to be screened are held')
required.add_argument('--output', metavar='OUTPUT', help='Directory to store results and intermediate files')
required.add_argument('--domain', metavar='DOMAIN', help='archaea, bacteria, all')
required.add_argument('--domain', metavar='DOMAIN', help='archaea, bacteria')
required.add_argument('--phylogeny', metavar='PHY', help='fastree, raxml')
optional.add_argument('--threads',metavar='THREADS',help='Optional: number of threads for calculating a tree using RAxML. This is not taken into account using Fastree')
optional.add_argument('--loci', metavar='LOCI', default='12', help='Output genomes with less than x number of loci. By default prints genomes that have less than 12 ribosomal loci markers.')
Expand Down Expand Up @@ -105,7 +105,7 @@ if os.path.isdir(OUTPUT) == True:
sys.exit()

# check for ribosomal markers directory
if os.path.isdir("ribosomal_markers/") == False:
if os.path.isdir("curated_markers/ribosomal_markers/") == False:
print(" The ribosomal markers directory could not be found."+"\n"+" Please either download the markers from https://github.com/elizabethmcd/metabolisHMM/releases/download/v2.0/metabolisHMM_v2.0_markers.tgz and decompress the tarball, or move the directory to where you are running the workflow from.")
sys.exit()

Expand Down Expand Up @@ -134,16 +134,14 @@ elif METADATA == 'OFF':


# different ribosomal markers for archaea/bacteria/all
bacteria_list = ['rpL14','rpL15','rpL16','rpL18','rpL22','rpL24','rpL2','rpL3','rpL4','rpL5','rpL6','rpS10','rpS17','rpS19','rpS3','rpS8']
archaea_list = ['rpL14','rpL15','rpL18','rpL22','rpL24','rpL2','rpL3','rpL4','rpL5','rpL6','rpS17','rpS19','rpS3','rpS8']
all_list = ['rpL14','rpL15','rpL18','rpL22','rpL24','rpL2', 'rpL3','rpL4','rpL5','rpL6','rpS17','rpS19','rpS3','rpS8']
bacteria_list = ['rpL14_bact','rpL15_bact','rpL16_bact','rpL18_bact','rpL22_bact','rpL24_bact','rpL2_bact','rpL3_bact','rpL4_bact','rpL5_bact','rpL6_bact','rpS10_bact','rpS17_bact','rpS19_bact','rpS3_bact','rpS8_bact']
archaea_list = ['rpL14_arch','rpL15_arch','rpL16_arch','rpL18_arch','rpL22_arch','rpL24_arch','rpL2_arch','rpL3_arch','rpL4_arch','rpL5_arch','rpL6_arch', 'rpS10_arch','rpS17_arch','rpS19_arch','rpS3_arch','rpS8_arch']


if DOMAIN == 'archaea':
prot_list=archaea_list
elif DOMAIN == 'bacteria':
prot_list=bacteria_list
elif DOMAIN == 'all':
prot_list=all_list


# if .fna predict CDS and reformat header names because prodigal makes them stupid
Expand Down Expand Up @@ -187,7 +185,7 @@ for genome in reformatted_genomes:
dir=name
os.makedirs(OUTPUT+"/out/"+dir)
for prot in prot_list:
marker ="ribosomal_markers/"+prot+"_bact.hmm"
marker ="curated_markers/ribosomal_markers/"+prot+".hmm"
outname= OUTPUT + "/out/"+dir+"/"+name + "-" + prot + ".out"
cmd = ["hmmsearch", "--tblout="+outname, marker, genome]
subprocess.call(cmd, stdout=FNULL)
Expand Down
11 changes: 4 additions & 7 deletions bin/single-marker-phylogeny
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ optional.add_argument("--threads",metavar='THREADS',help="number of threads for
optional.add_argument("--kofam", metavar='KOFAM', help="Use KEGG HMMs from the KofamKOALA set. Options = ON or OFF")
optional.add_argument("--ko_list", metavar='KOLIST', help="Point to location of the KofamKoala ko_list file if using the KofamKOALA KEGG HMMs")
ribosomal.add_argument("--ribo_tree", metavar='RIBO', default='OFF', help="Make corresponding ribosomal phylogeny of genomes containing hits of the provided single marker. " )
ribosomal.add_argument("--domain", metavar='DOMAIN', help="If constructing corresponding ribosomal tree, select the domain for which your hits belong to. Options: bacteria, archaea, all")
ribosomal.add_argument("--domain", metavar='DOMAIN', help="If constructing corresponding ribosomal tree, select the domain for which your hits belong to. Options: bacteria, archaea")
ribosomal.add_argument('--loci', metavar='LOCI', default='12', help='Output genomes with less than x number of loci. By default prints genomes that have less than 12 ribosomal loci markers.')
metadata.add_argument('--metadata', metavar='METADATA',help='Option for outputting ITOL formatted metadata files. ON or OFF')
metadata.add_argument('--names', metavar='NAMES', help="Provided .csv formatted metadata file of filenames and corresponding taxonomical or group names")
Expand Down Expand Up @@ -284,15 +284,12 @@ elif PHYTOOL == "raxml":

# Corresponding ribosomal tree of hits and options
# lists
bacteria_list = ['rpL14','rpL15','rpL16','rpL18','rpL22','rpL24','rpL2','rpL3','rpL4','rpL5','rpL6','rpS10','rpS17','rpS19','rpS3','rpS8']
archaea_list = ['rpL14','rpL15','rpL18','rpL22','rpL24','rpL2','rpL3','rpL4','rpL5','rpL6','rpS17','rpS19','rpS3','rpS8']
all_list = ['rpL14','rpL15','rpL18','rpL22','rpL24','rpL2', 'rpL3','rpL4','rpL5','rpL6','rpS17','rpS19','rpS3','rpS8']
bacteria_list = ['rpL14_bact','rpL15_bact','rpL16_bact','rpL18_bact','rpL22_bact','rpL24_bact','rpL2_bact','rpL3_bact','rpL4_bact','rpL5_bact','rpL6_bact','rpS10_bact','rpS17_bact','rpS19_bact','rpS3_bact','rpS8_bact']
archaea_list = ['rpL14_arch','rpL15_arch','rpL16_arch','rpL18_arch','rpL22_arch','rpL24_arch','rpL2_arch','rpL3_arch','rpL4_arch','rpL5_arch','rpL6_arch', 'rpS10_arch','rpS17_arch','rpS19_arch','rpS3_arch','rpS8_arch']
if DOMAIN == 'archaea':
prot_list=archaea_list
elif DOMAIN == 'bacteria':
prot_list=bacteria_list
elif DOMAIN == 'all':
prot_list=all_list
# runs phylogeny on specific hits of single marker
if RIBO == 'ON':
print("Making corresponding ribosomal phylogeny of single marker hits...")
Expand All @@ -306,7 +303,7 @@ if RIBO == 'ON':
genome = OUTPUT + "/genomes/"+hit+".reformatted.faa"
os.makedirs(OUTPUT+"/ribo_out/"+hit)
for prot in prot_list:
marker ="ribosomal_markers/"+prot+"_bact.hmm"
marker ="curated_markers/ribosomal_markers/"+prot+".hmm"
outname= OUTPUT + "/ribo_out/"+hit+"/"+hit + "-" + prot + ".out"
cmd = ["hmmsearch", "--tblout="+outname, marker, genome]
subprocess.call(cmd, stdout=FNULL)
Expand Down
2 changes: 1 addition & 1 deletion bin/summarize-metabolism
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ out_intm = OUTPUT + "/out"
out_results = OUTPUT + "/results"
out_genomes = OUTPUT + "/genomes"
OUTFILE = args.summary
markers=glob.glob("metabolic_markers/*.hmm")
markers=glob.glob("curated_markers/metabolic_markers/*.hmm")
genomes=glob.glob(GENOMEFILES)
PLOTTING = args.plotting

Expand Down
Binary file removed markerdb_v1.8.tgz
Binary file not shown.
Binary file added metabolisHMM_markers_v1.9.tgz
Binary file not shown.

0 comments on commit 4661093

Please sign in to comment.