diff --git a/anvio/__init__.py b/anvio/__init__.py index a941ecbb0..4e9e19503 100644 --- a/anvio/__init__.py +++ b/anvio/__init__.py @@ -2541,6 +2541,26 @@ def TABULATE(table, header, numalign="right", max_width=0): "and internal anvi'o heuristics control whether or not indels should be reported, but with this " "flag all indels are reported."} ), + 'list-defline-variables': ( + ['--list-defline-variables'], + {'default': False, + 'action': 'store_true', + 'help': "When declared, anvi'o will list the variable names that can be used to construct deflines in " + "FASTA outputs from the user-defined `--defline-format` strings."} + ), + 'defline-format': ( + ['--defline-format'], + {'default': '{gene_caller_id}', + 'metavar': "F-STRING", + 'help': "Proivide a defline template for anvi'o to use when generating the FASTA output. The way this " + "works is actually quite simple: first you learn about all the options that exist using the " + "`--list-defline-variables`, and then use them to create your template. Available variables " + "should be listed within curly brackets, which will be evaluated in contex. Anything outside " + "of curly brackets will be kept as is. For instance, if you would like your defline to have " + "the gene caller ID after the contig name in which it occurs, you can use this template: " + "'{contig_name}_{gene_caller_id}', and your defline will look like '>XXX_182'. See more " + "examples in online help."} + ), 'report-extended-deflines': ( ['--report-extended-deflines'], {'default': False, diff --git a/anvio/dbops.py b/anvio/dbops.py index fe03f1f2c..3b9ef769d 100644 --- a/anvio/dbops.py +++ b/anvio/dbops.py @@ -1120,10 +1120,42 @@ def get_gene_amino_acid_sequence(self, gene_caller_ids): return sequences - def get_sequences_for_gene_callers_ids(self, gene_caller_ids_list=[], output_file_path=None, reverse_complement_if_necessary=True, include_aa_sequences=False, flank_length=0, - output_file_path_external_gene_calls=None, simple_headers=False, report_aa_sequences=False, wrap=120, rna_alphabet=False): + def get_sequences_for_gene_callers_ids(self, gene_caller_ids_list=[], output_file_path=None, reverse_complement_if_necessary=True, + include_aa_sequences=False, flank_length=0, output_file_path_external_gene_calls=None, + simple_headers=False, list_defline_variables=False, defline_format='{gene_caller_id}', + report_aa_sequences=False, wrap=120, rna_alphabet=False): + + ################################################################################################## + # + # DEFLIINE FORMATTING REPORTING RELATED PRE-CHECKS + # + ################################################################################################## + # available options to determine deflines through user-provided f-strings. the dictionary is + # populated below, and if you make any changes here, please don't forget to update it there too: + defline_data_dict = {'gene_caller_id': None, + 'contig_name': None, + 'start': None, + 'stop': None, + 'direction': None, + 'length': None, + 'contigs_db_project_name': None} + + # if the user needs to see the list, show the list and quit + if list_defline_variables: + self.run.warning(f"Here are the variables you can use to provide a user-defined defline template: ") + for key in defline_data_dict.keys(): + self.run.info_single("{%s}" % key) + self.run.info_single("Remember, by default, anvi'o will only use '{gene_caller_id}' to format the deflines of " + "FASTA files it produces.", level=0, nl_before=1, nl_after=1, mc='red') + + sys.exit() + + ################################################################################################## + # + # BUNCH OF SANITY CHECKS BEFORE WE GET INTO BUSINESS + # + ################################################################################################## - # bunch of sanity checks below if not isinstance(gene_caller_ids_list, list): raise ConfigError("Gene caller's ids must be of type 'list'") @@ -1166,6 +1198,20 @@ def get_sequences_for_gene_callers_ids(self, gene_caller_ids_list=[], output_fil "also ask FASTA file headers for gene sequences to be not simple. External gene calls file and the FASTA " "file must match, and anvi'o will have to take care of it without your supervision.") + # if we came all the way down here without a defline format, let's set one up: + if not defline_format: + defline_format = "{gene_caller_id}" + + # we will also check if the `defline_format` is composed of variables that are defined in + # the `defline_data_dict` which is filled later + utils.get_f_string_evaluated_by_dict(defline_format, defline_data_dict) + + ################################################################################################## + # + # BUSINESS TIME + # + ################################################################################################## + # finally getting our sequences initialized. please NOTE that we do it only if there are no # contig sequences available OR if the gene caller ids of interest is not represented among # those that were previously initialized. @@ -1255,6 +1301,17 @@ def get_sequences_for_gene_callers_ids(self, gene_caller_ids_list=[], output_fil else: gene_call['aa_sequence'] = None + # let's populate the dictionary that holds all the information that could be used to report + # gene FASTA files. if you change anything in this dictionary, please don't forget to + # update the list of variables where it is first defined in this function. + defline_data_dict = {'gene_caller_id': gene_callers_id, + 'contig_name': gene_call['contig'], + 'start': gene_call['start'], + 'stop': gene_call['stop'], + 'direction': gene_call['direction'], + 'length': gene_call['length'], + 'contigs_db_project_name': self.a_meta['project_name_str']} + if output_file_path_external_gene_calls: # if the user is asking for an external gene calls file, the FASTA file for sequences # should not start with digits and we also need to set the contig name in sequences @@ -1266,10 +1323,9 @@ def get_sequences_for_gene_callers_ids(self, gene_caller_ids_list=[], output_fil gene_call['start'] = 0 gene_call['stop'] = gene_call['length'] else: - if simple_headers: - gene_call['header'] = '%d' % (gene_callers_id) - else: - gene_call['header'] = '%d ' % (gene_callers_id) + ';'.join(['%s:%s' % (k, str(gene_call[k])) for k in ['contig', 'start', 'stop', 'direction', 'rev_compd', 'length']]) + gene_call['header'] = utils.get_f_string_evaluated_by_dict(defline_format, defline_data_dict) + if not simple_headers: + gene_call['header'] += gene_call['header'] + ' ' + ';'.join(['%s:%s' % (k, str(gene_call[k])) for k in ['contig', 'start', 'stop', 'direction', 'rev_compd', 'length']]) # adding the updated gene call to our sequences dict. sequences_dict[gene_callers_id] = gene_call @@ -4247,7 +4303,7 @@ def init(self): # set a project name for the contigs database without any funny # characters to make sure it can be used programmatically later. - self.meta['project_name_str'] = self.meta['project_name'].translate({ord(c): "_" for c in "\"'!@#$%^&*()[]{};:,./<>?\|`~-=_+ "}).replace('__', '_') \ + self.meta['project_name_str'] = self.meta['project_name'].strip().translate({ord(c): "_" for c in "\"'!@#$%^&*()[]{};:,./<>?\|`~-=_+ "}).replace('__', '_').strip('_') \ if self.meta['project_name'] else '___'.join(['UNKNOWN', self.meta['contigs_db_hash']]) if 'creation_date' not in self.meta: diff --git a/anvio/docs/programs/anvi-get-sequences-for-gene-calls.md b/anvio/docs/programs/anvi-get-sequences-for-gene-calls.md index 0186bdf98..20d2c6d02 100644 --- a/anvio/docs/programs/anvi-get-sequences-for-gene-calls.md +++ b/anvio/docs/programs/anvi-get-sequences-for-gene-calls.md @@ -1,4 +1,4 @@ -This program allows you to **export the sequences of your gene calls** from a %(contigs-db)s or %(genomes-storage-db)s in the form of a %(genes-fasta)s. +This program allows you to **export the sequences of your gene calls** from a %(contigs-db)s or %(genomes-storage-db)s in the form of a %(genes-fasta)s. If you want other information about your gene calls from a %(contigs-db)s, you can run %(anvi-export-gene-calls)s (which outputs a %(gene-calls-txt)s) or get the coverage and detection information with %(anvi-export-gene-coverage-and-detection)s. @@ -11,7 +11,7 @@ anvi-get-sequences-for-gene-calls -c %(contigs-db)s \ -o path/to/output {{ codestop }} -This is create a %(genes-fasta)s that contains every gene in your contigs database. If you only want a specific subset of genes, you can run the following: +This is create a %(genes-fasta)s that contains every gene in your contigs database. If you only want a specific subset of genes, you can run the following: {{ codestart }} anvi-get-sequences-for-gene-calls -c %(contigs-db)s \ @@ -20,7 +20,91 @@ anvi-get-sequences-for-gene-calls -c %(contigs-db)s \ --delimiter , {{ codestop }} -Now the resulting %(genes-fasta)s will contain only those three genes. +Now the resulting %(genes-fasta)s will contain only those three genes. + +Please note that this program allows you to format the deflines of the resulting FASTA file to a great extent. For this, it uses a set of previously-defined variables you can use to define a template of your liking. You can learn about the available variables, you can include the following flag in your command: + +{{ codestart }} +anvi-get-sequences-for-gene-calls -c %(contigs-db)s \ + -o path/to/output \ + --list-defline-variables +{{ codestop }} + +which will give you an output similar to this: + +``` +WARNING +=============================================== +Here are the variables you can use to provide a user-defined defline template: + +* {gene_caller_id} +* {contig_name} +* {start} +* {stop} +* {direction} +* {length} +* {contigs_db_project_name} + +Remember, by default, anvi'o will only use '{gene_caller_id}' to format the +deflines of FASTA files it produces. +``` + +Now you can use the `--defline-format` parameter with any combination of these variables to control the output FASTA file deflines. Here are some examples using the %(contigs-db)s file included in the Infant Gut Dataset, and the contents of the resulting FASTA file: + +``` +anvi-get-sequences-for-gene-calls -c INFANT-GUT-TUTORIAL/CONTIGS.db \ + --gene-caller-ids 77 \ + -o 77.fa \ + --defline-format "{gene_caller_id}" +``` + +``` +>77 +ATGAGTAATTTATTGCGAGCAGAAGGCGTGTCTTATCAAGTAAATGGTCGTAGCATTCTTTCTGATATTGATTTGTCATTTGAAACAGGCAGCAATACAACAATTGTTGGTCCTTCAGGT +AGCGGGAAAAGTACATTTTTAAAAATTTTATCTTCATTATTAAGTCCTACAGAAGGCGAAATTTTTTATCAAGAAGCGCCAATTACTACAATGCCAATCGAAACATACCGCCAAAAGGTT +TCTTATTGTTTTCAGCAGCCAACTTTATTTGGTGAAACCGTGTATGATAATTTGTTATTTCCATTTACCGTCAGACAAGAAGCGTTTAATCAGGAAAAAGTCGTGGCATTACTCCAACAA +GTGAAATTGCCCGCTGCTTATCTTGAAAAGAAAATAGCCGAACTCTCTGGTGGTGAGCGACAACGGGTTGCTTTGCTACGAAACATTATTTTTGTACCAGATGTTTTATTATTAGACGAA +GTTACAACGGGATTAGATGAAGAAAGCAAACAGATTGTCAATCAATTGTTAAACCAATTAAACAAAGAGCAAGGAGTCACGCTGGTTCGTGTCACGCATGATACCGAAGAAATTCAGCAA +GCACAGCAAGTGATTCGTATTGTAGCAGGAAAGGTGGCGCCGACAGATGGATTTAGCAGTTAA +``` + +The simple option shown above also is the default defline format anvi'o uses. Here is a more sophisticated example: + +``` +anvi-get-sequences-for-gene-calls -c INFANT-GUT-TUTORIAL/CONTIGS.db \ + --gene-caller-ids 77 \ + -o 77.fa \ + --defline-format "{contigs_db_project_name}_{contig_name}_{gene_caller_id}" +``` + +``` +>Infant_Gut_Contigs_from_Sharon_et_al_Day17a_QCcontig1_77 +ATGAGTAATTTATTGCGAGCAGAAGGCGTGTCTTATCAAGTAAATGGTCGTAGCATTCTTTCTGATATTGATTTGTCATTTGAAACAGGCAGCAATACAACAATTGTTGGTCCTTCAGGT +AGCGGGAAAAGTACATTTTTAAAAATTTTATCTTCATTATTAAGTCCTACAGAAGGCGAAATTTTTTATCAAGAAGCGCCAATTACTACAATGCCAATCGAAACATACCGCCAAAAGGTT +TCTTATTGTTTTCAGCAGCCAACTTTATTTGGTGAAACCGTGTATGATAATTTGTTATTTCCATTTACCGTCAGACAAGAAGCGTTTAATCAGGAAAAAGTCGTGGCATTACTCCAACAA +GTGAAATTGCCCGCTGCTTATCTTGAAAAGAAAATAGCCGAACTCTCTGGTGGTGAGCGACAACGGGTTGCTTTGCTACGAAACATTATTTTTGTACCAGATGTTTTATTATTAGACGAA +GTTACAACGGGATTAGATGAAGAAAGCAAACAGATTGTCAATCAATTGTTAAACCAATTAAACAAAGAGCAAGGAGTCACGCTGGTTCGTGTCACGCATGATACCGAAGAAATTCAGCAA +GCACAGCAAGTGATTCGTATTGTAGCAGGAAAGGTGGCGCCGACAGATGGATTTAGCAGTTAA +``` + +And finally, here is an example that is quite comprehensive in its request from anvi'o: + +``` +anvi-get-sequences-for-gene-calls -c INFANT-GUT-TUTORIAL/CONTIGS.db \ + --gene-caller-ids 77 \ + -o 77.fa \ + --defline-format "{gene_caller_id} contig_name:{contig_name}|gene_start:{start}|gene_stop:{stop}|gene_length:{length}|gene_direction:{direction}" +``` + +``` +>77 contig_name:Day17a_QCcontig1|gene_start:67276|gene_stop:67939|gene_length:663|gene_direction:f +ATGAGTAATTTATTGCGAGCAGAAGGCGTGTCTTATCAAGTAAATGGTCGTAGCATTCTTTCTGATATTGATTTGTCATTTGAAACAGGCAGCAATACAACAATTGTTGGTCCTTCAGGT +AGCGGGAAAAGTACATTTTTAAAAATTTTATCTTCATTATTAAGTCCTACAGAAGGCGAAATTTTTTATCAAGAAGCGCCAATTACTACAATGCCAATCGAAACATACCGCCAAAAGGTT +TCTTATTGTTTTCAGCAGCCAACTTTATTTGGTGAAACCGTGTATGATAATTTGTTATTTCCATTTACCGTCAGACAAGAAGCGTTTAATCAGGAAAAAGTCGTGGCATTACTCCAACAA +GTGAAATTGCCCGCTGCTTATCTTGAAAAGAAAATAGCCGAACTCTCTGGTGGTGAGCGACAACGGGTTGCTTTGCTACGAAACATTATTTTTGTACCAGATGTTTTATTATTAGACGAA +GTTACAACGGGATTAGATGAAGAAAGCAAACAGATTGTCAATCAATTGTTAAACCAATTAAACAAAGAGCAAGGAGTCACGCTGGTTCGTGTCACGCATGATACCGAAGAAATTCAGCAA +GCACAGCAAGTGATTCGTATTGTAGCAGGAAAGGTGGCGCCGACAGATGGATTTAGCAGTTAA +``` You also have the option to report the output in [gff3 format](https://github.com/The-Sequence-Ontology/Specifications/blob/master/gff3.md), report extended deflines for each gene, or report amino acid sequences instead of nucleotide sequences. @@ -33,6 +117,6 @@ anvi-get-sequences-for-gene-calls -g %(genomes-storage-db)s \ -o path/to/output {{ codestop }} -This will create a %(genes-fasta)s that contains every gene in your genomes storage database. To focus on only a subset of the genomes contained in your database, use the flag `--genome-names`. You can provide a comma-delimited list of genome names or a flat text file that contains one genome per line. Alternatively, you could provide a list of gene-caller-ids as specified above. +This will create a %(genes-fasta)s that contains every gene in your genomes storage database. To focus on only a subset of the genomes contained in your database, use the flag `--genome-names`. You can provide a comma-delimited list of genome names or a flat text file that contains one genome per line. Alternatively, you could provide a list of gene-caller-ids as specified above. You also have the option to report the output in [gff3 format](https://github.com/The-Sequence-Ontology/Specifications/blob/master/gff3.md), report extended deflines for each gene, or report amino acid sequences instead of nucleotide sequences. diff --git a/anvio/docs/programs/anvi-get-sequences-for-hmm-hits.md b/anvio/docs/programs/anvi-get-sequences-for-hmm-hits.md index 6ff0c37ea..626e9aa4e 100644 --- a/anvio/docs/programs/anvi-get-sequences-for-hmm-hits.md +++ b/anvio/docs/programs/anvi-get-sequences-for-hmm-hits.md @@ -81,6 +81,209 @@ anvi-get-sequences-for-hmm-hits -c %(contigs-db)s \ -o %(genes-fasta)s {{ codestop }} +### Change the formatting of the output FASTA files + +Please note that this program allows you to format the deflines of the resulting FASTA file to a great extent whenever possible. For this, it uses a set of previously-defined variables you can use to define a template of your liking. You can learn about the available variables, you can include the following flag in your command: + +{{ codestart }} +anvi-get-sequences-for-hmm-hits -c %(contigs-db)s \ + --list-defline-variables +{{ codestop }} + +Which will give you an output similar to the one below: + +``` +WARNING +=============================================== +Here are the variables you can use to provide a user-defined defline template: + +* {gene_name} +* {gene_callers_id} +* {contig_name} +* {gene_unique_id} +* {bin_name} +* {source} +* {e_value} +* {start} +* {stop} +* {length} + +Remember, by default, anvi'o will use the following template to format the +deflines of FASTA files it produces whenever possible. + +{gene_name}___{gene_unique_id} bin_id:{bin_name}|source:{source}|e_value:{e_value}|contig:{contig_name}|gene_callers_id:{gene_callers_id}|start:{start}|stop:{stop}|length:{length} +``` + +With this default template, %(anvi-get-sequences-for-hmm-hits)s will provide a FASTA file with quite a comprehensive defline. The following examples use the data pack for the Infant Gut Dataset, after running the following command in the data directory: + +{{ codestart }} +%(anvi-import-collection)s additional-files/collections/merens.txt \ + -p PROFILE.db \ + -c CONTIGS.db \ + -C merens +{{ codestop }} + +Here are a few commands that show how different deflines impact the FASTA output, starting with the default defline format: + +``` +anvi-get-sequences-for-hmm-hits -c CONTIGS.db \ + -o OUTPUT.fa +``` + +``` +>RsfS___Bacteria_71___a2cb9835d40f1ea052bc7aea2ef8c12924c76f4fa00e00b69f7dbecb bin_id:CONTIGS|source:Bacteria_71|e_value:2.6e-14|contig:Day17a_QCcontig1|gene_callers_id:22|start:17642|stop:17792|length:150 +>SecE___Bacteria_71___fe4bf2883dfee62dcf2ed93acde8df5de5be421675d7231997dc5f91 bin_id:CONTIGS|source:Bacteria_71|e_value:3.6e-22|contig:Day17a_QCcontig1|gene_callers_id:105|start:95904|stop:96075|length:171 +>Ribosomal_L1___Bacteria_71___9a3cc2232443c0c0e995fef5f76f34d52fb111476e0e2fa2dbe0f09e bin_id:CONTIGS|source:Bacteria_71|e_value:8.4e-59|contig:Day17a_QCcontig1|gene_callers_id:116|start:106378|stop:107068|length:690 +>SecG___Bacteria_71___d23b8716a0b03543405ac9835ce127b981fc1c067d9b1e48edf9d861 bin_id:CONTIGS|source:Bacteria_71|e_value:1e-20|contig:Day17a_QCcontig1|gene_callers_id:205|start:200881|stop:201118|length:237 +>SmpB___Bacteria_71___a95dd711a5d10cfb4bd9ca3d836aed077d35647161d4908a9c8252fe bin_id:CONTIGS|source:Bacteria_71|e_value:2.8e-66|contig:Day17a_QCcontig1|gene_callers_id:208|start:204418|stop:204883|length:465 +``` + +--- + +``` +anvi-get-sequences-for-hmm-hits -c CONTIGS.db \ + -o OUTPUT.fa \ + -p PROFILE.db \ + -C merens +``` + +``` +>RsfS___Bacteria_71___a2cb9835d40f1ea052bc7aea2ef8c12924c76f4fa00e00b69f7dbecb bin_id:E_facealis|source:Bacteria_71|e_value:2.6e-14|contig:Day17a_QCcontig1|gene_callers_id:22|start:17642|stop:17792|length:150 +>SecE___Bacteria_71___fe4bf2883dfee62dcf2ed93acde8df5de5be421675d7231997dc5f91 bin_id:E_facealis|source:Bacteria_71|e_value:3.6e-22|contig:Day17a_QCcontig1|gene_callers_id:105|start:95904|stop:96075|length:171 +>Ribosomal_L1___Bacteria_71___9a3cc2232443c0c0e995fef5f76f34d52fb111476e0e2fa2dbe0f09e bin_id:E_facealis|source:Bacteria_71|e_value:8.4e-59|contig:Day17a_QCcontig1|gene_callers_id:116|start:106378|stop:107068|length:690 +>SecG___Bacteria_71___d23b8716a0b03543405ac9835ce127b981fc1c067d9b1e48edf9d861 bin_id:E_facealis|source:Bacteria_71|e_value:1e-20|contig:Day17a_QCcontig1|gene_callers_id:205|start:200881|stop:201118|length:237 +>SmpB___Bacteria_71___a95dd711a5d10cfb4bd9ca3d836aed077d35647161d4908a9c8252fe bin_id:E_facealis|source:Bacteria_71|e_value:2.8e-66|contig:Day17a_QCcontig1|gene_callers_id:208|start:204418|stop:204883|length:465 +``` + +--- + +``` +anvi-get-sequences-for-hmm-hits -p PROFILE.db \ + -c CONTIGS.db \ + -C merens \ + -o OUTPUT.fa \ + --hmm-source Bacteria_71 \ + --gene-names Ribosomal_L27,Ribosomal_L28,Ribosomal_L3 +``` + +``` +>Ribosomal_L27___Bacteria_71___477a28bece0b76ebaef1a9415cbd15b422ec581619e437a79f227f2f bin_id:E_facealis|source:Bacteria_71|e_value:4.1e-37|contig:Day17a_QCcontig2|gene_callers_id:1130|start:86504|stop:86792|length:288 +>Ribosomal_L3___Bacteria_71___611453ec6ee187ebae05815acd0d20efa3be77b791a69055aa9ae776 bin_id:S_epidermidis|source:Bacteria_71|e_value:1.4e-19|contig:Day17a_QCcontig7|gene_callers_id:2339|start:19929|stop:20592|length:663 +>Ribosomal_L3___Bacteria_71___3e44e2b950f135b147b3a5ca7813581228c3d91c7d2cb69c8543237d bin_id:E_facealis|source:Bacteria_71|e_value:1.5e-20|contig:Day17a_QCcontig16|gene_callers_id:3080|start:229935|stop:230565|length:630 +>Ribosomal_L3___Bacteria_71___0d18b80b3c3eb77af3683e64fa4278ac08929c53c8a4593c205b8849 bin_id:P_rhinitidis|source:Bacteria_71|e_value:9.1e-17|contig:Day17a_QCcontig21|gene_callers_id:3313|start:54724|stop:55357|length:633 +>Ribosomal_L28___Bacteria_71___e36828b18440c369bcf880a267cea6d55fa65cf1b0b1d66c8784901a bin_id:E_facealis|source:Bacteria_71|e_value:2.2e-24|contig:Day17a_QCcontig23|gene_callers_id:3633|start:124223|stop:124412|length:189 +>Ribosomal_L27___Bacteria_71___bd93789ee7b2f24fefa4ddfc0949be06573a6f6f3cdc85a6ada54323 bin_id:S_epidermidis|source:Bacteria_71|e_value:1e-36|contig:Day17a_QCcontig29|gene_callers_id:4151|start:9097|stop:9382|length:285 +>Ribosomal_L27___Bacteria_71___2b6e4f252a14a16375d107711cde0363f266f1981fa353a2421fd989 bin_id:S_aureus|source:Bacteria_71|e_value:7.2e-37|contig:Day17a_QCcontig56|gene_callers_id:6123|start:46552|stop:46837|length:285 +>Ribosomal_L27___Bacteria_71___51f9c99d5530e7342657805b778105e8e75c178b1780140163e528e3 bin_id:P_rhinitidis|source:Bacteria_71|e_value:3.8e-36|contig:Day17a_QCcontig58|gene_callers_id:6253|start:92104|stop:92401|length:297 +>Ribosomal_L28___Bacteria_71___4be054e14a9cb7f80dea306fbbcb391a195acda0a705853b6b010a2e bin_id:P_avidum|source:Bacteria_71|e_value:1.7e-26|contig:Day17a_QCcontig60|gene_callers_id:6345|start:76696|stop:76933|length:237 +``` + +--- + +``` +anvi-get-sequences-for-hmm-hits -p PROFILE.db \ + -c CONTIGS.db \ + -C merens \ + -o OUTPUT.fa \ + --gene-names Ribosomal_L27,Ribosomal_L28,Ribosomal_L3 \ + --defline-format "{" +``` + +``` +Init .........................................: 4451 splits in 13 bin(s) + + +Config Error: Your f-string syntax is not working for anvi'o :/ Perhaps you forgot to open or + close a curly bracket? +``` + +--- + +``` +anvi-get-sequences-for-hmm-hits -p PROFILE.db \ + -c CONTIGS.db \ + -C merens \ + -o OUTPUT.fa \ + --gene-names Ribosomal_L27,Ribosomal_L28,Ribosomal_L3 \ + --defline-format "{lol}" +``` + +``` +Config Error: Some of the variables in your f-string does not occur in the source dictionary + :/ Here is the list of those that are not matching to anything: lol. In the + meantime, these are the known keys: gene_name, gene_callers_id, contig_name, + gene_unique_id, bin_name, source, e_value, start, stop, length. +``` + +--- + +``` +anvi-get-sequences-for-hmm-hits -p PROFILE.db \ + -c CONTIGS.db \ + -C merens \ + -o OUTPUT.fa \ + --hmm-source Bacteria_71 \ + --gene-names Ribosomal_L27,Ribosomal_L28,Ribosomal_L3 \ + --return-best-hit \ + --defline-format "{bin_name}_{gene_callers_id}" +``` + +``` +>E_facealis_1130 +>S_epidermidis_2339 +>E_facealis_3080 +>P_rhinitidis_3313 +>E_facealis_3633 +>S_epidermidis_4151 +>S_aureus_6123 +>P_rhinitidis_6253 +``` + +--- + +``` +anvi-get-sequences-for-hmm-hits -p PROFILE.db \ + -c CONTIGS.db \ + -C merens \ + -o OUTPUT.fa \ + --hmm-source Bacteria_71 \ + --gene-names Ribosomal_L27,Ribosomal_L28,Ribosomal_L3 \ + --return-best-hit \ + --defline-format "{bin_name}_{source}_{gene_callers_id}" +``` + +``` +>E_facealis_Bacteria_71_1130 +>S_epidermidis_Bacteria_71_2339 +>E_facealis_Bacteria_71_3080 +>P_rhinitidis_Bacteria_71_3313 +``` + +--- + +``` +anvi-get-sequences-for-hmm-hits -p PROFILE.db \ + -c CONTIGS.db \ + -C merens \ + -o OUTPUT.fa \ + --hmm-source Bacteria_71 \ + --gene-names Ribosomal_L27,Ribosomal_L28,Ribosomal_L3 \ + --return-best-hit \ + --defline-format "{gene_name}_{gene_callers_id} source:{source}|contig:{contig_name}|start:{start}|stop:{stop}" +``` + +``` +>Ribosomal_L27_1130 source:Bacteria_71|contig:Day17a_QCcontig2|start:86504|stop:86792 +>Ribosomal_L3_2339 source:Bacteria_71|contig:Day17a_QCcontig7|start:19929|stop:20592 +>Ribosomal_L3_3080 source:Bacteria_71|contig:Day17a_QCcontig16|start:229935|stop:230565 +>Ribosomal_L3_3313 source:Bacteria_71|contig:Day17a_QCcontig21|start:54724|stop:55357 +``` + +--- + +Please note that anvi'o will not check whether your defline format will result in FASTA entries with identical deflines. + + ### Get HMM hits independently aligned and concatenated The resulting file can be used for phylogenomics analyses via %(anvi-gen-phylogenomic-tree)s or through more sophisticated tools for curating alignments and computing trees. diff --git a/anvio/hmmops.py b/anvio/hmmops.py index 1bb205a5a..fb1003ca6 100644 --- a/anvio/hmmops.py +++ b/anvio/hmmops.py @@ -25,7 +25,7 @@ class SequencesForHMMHits: - def __init__(self, contigs_db_path, sources=set([]), split_names_of_interest=set([]), init=True, run=run, progress=progress, bin_name=None): + def __init__(self, contigs_db_path, sources=set([]), split_names_of_interest=set([]), init=True, run=run, progress=progress, bin_name=None, defline_format=None): self.run = run self.progress = progress @@ -45,6 +45,28 @@ def __init__(self, contigs_db_path, sources=set([]), split_names_of_interest=set self.genes_in_contigs = {} self.splits_in_contigs = {} + self.defline_data_dict = {'gene_name': None, + 'gene_callers_id': None, + 'contig_name': None, + 'gene_unique_id': None, + 'bin_name': None, + 'source': None, + 'e_value': None, + 'start': None, + 'stop': None, + 'length': None} + + + if defline_format: + self.defline_format = defline_format + else: + self.defline_format = ("{gene_name}___{gene_unique_id} bin_id:{bin_name}|source:{source}|" + "e_value:{e_value}|contig:{contig_name}|gene_callers_id:{gene_callers_id}|" + "start:{start}|stop:{stop}|length:{length}") + + # an immediate check if the defline format is acceptable + utils.get_f_string_evaluated_by_dict(self.defline_format, self.defline_data_dict) + if contigs_db_path: self.init_dicts(contigs_db_path, split_names_of_interest) self.initialized = True @@ -782,9 +804,22 @@ def filter_hmm_sequences_dict_for_bins_that_lack_more_than_N_genes(self, hmm_seq def get_FASTA_header_and_sequence_for_gene_unique_id(self, hmm_sequences_dict_for_splits, gene_unique_id): - entry = hmm_sequences_dict_for_splits[gene_unique_id] - header = '%s___%s ' % (entry['gene_name'], gene_unique_id) + '|'.join(['%s:%s' % (k, str(entry[k])) for k in ['bin_id', 'source', 'e_value', 'contig', 'gene_callers_id', 'start', 'stop', 'length']]) + e = hmm_sequences_dict_for_splits[gene_unique_id] + + self.defline_data_dict = {'gene_name': e['gene_name'], + 'gene_callers_id': e['gene_callers_id'], + 'contig_name': e['contig'], + 'gene_unique_id': gene_unique_id, + 'bin_name': e['bin_id'], + 'source': e['source'], + 'e_value': e['e_value'], + 'start': e['start'], + 'stop': e['stop'], + 'length': e['length']} + + header = utils.get_f_string_evaluated_by_dict(self.defline_format, self.defline_data_dict) sequence = hmm_sequences_dict_for_splits[gene_unique_id]['sequence'] + return (header, sequence) @@ -946,18 +981,15 @@ def __store_individual_hmm_sequences_into_FASTA(self, hmm_sequences_dict_for_spl for gene_id in genes_aligned: hmm_sequences_dict_for_splits[gene_id]['sequence'] = genes_aligned[gene_id] - f = open(output_file_path, 'w') + with open(output_file_path, 'w') as f: + for gene_unique_id in hmm_sequences_dict_for_splits: + header, sequence = self.get_FASTA_header_and_sequence_for_gene_unique_id(hmm_sequences_dict_for_splits, gene_unique_id) - for gene_unique_id in hmm_sequences_dict_for_splits: - header, sequence = self.get_FASTA_header_and_sequence_for_gene_unique_id(hmm_sequences_dict_for_splits, gene_unique_id) + if wrap: + sequence = textwrap.fill(sequence, wrap, break_on_hyphens=False) - if wrap: - sequence = textwrap.fill(sequence, wrap, break_on_hyphens=False) - - f.write('>%s\n' % header) - f.write('%s\n' % sequence) - - f.close() + f.write('>%s\n' % header) + f.write('%s\n' % sequence) def store_hmm_sequences_into_FASTA(self, hmm_sequences_dict_for_splits, output_file_path, wrap=120, concatenate_genes=False, partition_file_path=None, separator=None, genes_order=None, align_with=None, just_do_it=False): diff --git a/anvio/tests/run_component_tests_for_metagenomics.sh b/anvio/tests/run_component_tests_for_metagenomics.sh index 80beadd3b..cdf763f1a 100755 --- a/anvio/tests/run_component_tests_for_metagenomics.sh +++ b/anvio/tests/run_component_tests_for_metagenomics.sh @@ -643,10 +643,18 @@ anvi-get-codon-frequencies -c $output_dir/CONTIGS.db \ --no-progress SHOW_FILE $output_dir/CODON_frequencies_for_the_contigs_db.txt -INFO "Getting back the sequence for gene call 3" +INFO "Learning about the available defline variables" anvi-get-sequences-for-gene-calls -c $output_dir/CONTIGS.db \ --gene-caller-ids 3 \ -o $output_dir/Sequence_for_gene_caller_id_3.fa \ + --list-defline-variables \ + --no-progress + +INFO "Getting back the sequence for gene call 3 with a specific defline template" +anvi-get-sequences-for-gene-calls -c $output_dir/CONTIGS.db \ + --gene-caller-ids 3 \ + -o $output_dir/Sequence_for_gene_caller_id_3.fa \ + --defline-format "{contigs_db_project_name}_{contig_name}_{gene_caller_id}" \ --no-progress INFO "Getting back the AA sequence for gene call 3" @@ -697,12 +705,13 @@ anvi-get-sequences-for-hmm-hits -c $output_dir/CONTIGS.db \ -L \ --no-progress -INFO "Get DNA sequences for HMM hits for a bin in a collection" +INFO "Get DNA sequences for HMM hits for a bin in a collection with a defline format" anvi-get-sequences-for-hmm-hits -p $output_dir/SAMPLES-MERGED/PROFILE.db \ -c $output_dir/CONTIGS.db \ -C CONCOCT \ -b Bin_1 \ -o $output_dir/hmm_hits_sequences_in_Bin_1.txt \ + --defline-format "{contig_name}_{gene_callers_id}" --no-progress INFO "Get DNA sequences for all ABC transporter hits defined in an HMM source" diff --git a/anvio/utils.py b/anvio/utils.py index f1632d8f3..937a90c6b 100644 --- a/anvio/utils.py +++ b/anvio/utils.py @@ -1755,6 +1755,43 @@ def get_cmd_line(): return ' '.join(c_argv) +def get_f_string_evaluated_by_dict(f_string, d): + """A function to evaluate the contents of an f-string given a dictionary. + + This simple function enables the following, even when the variables in the f-string + are not defined in a given context, but appear as keys in a dictionary: + + >>> d = {'bar': 'apple', 'foo': 'pear', 'num': 5} + >>> f_string = "{num}_{bar}_or_{foo}" + >>> print(f"{get_f_string_evaluated_by_dict(f_string, d)}") + "5_apple_or_pear" + + This functionality enables to receive a user-defined f-string from the commandline, + and interpret it into a meaningful string using a dictionary. This is similar to the + following use from earlier days of Python, but it doesn't bother the user to know + about variable types and deal with an annoying syntax: + + >>> d = {'bar': 'apple', 'foo': 'pear', 'num': 5} + >>> print("%(num)d_%(bar)s_or_%(foo)s" % d) + "5_apple_or_pear" + """ + + stringlets = [p.split('}') for p in f_string.split('{')] + + if any([len(s) == 1 or len(s[0]) == 0 for s in stringlets[1:]]): + raise ConfigError("Your f-string syntax is not working for anvi'o :/ Perhaps you " + "forgot to open or close a curly bracket?") + + unrecognized_vars = [s[0] for s in stringlets[1:] if s[0] not in d] + if len(unrecognized_vars): + raise ConfigError(f"Some of the variables in your f-string does not occur in the source " + f"dictionary :/ Here is the list of those that are not matching to anything: " + f"{', '.join(unrecognized_vars)}. In the meantime, these are the known keys: " + f"{', '.join(d.keys())}.") + + return stringlets[0][0] + ''.join([f"{d[s[0]]}{s[1]}" for s in stringlets[1:]]) + + def get_time_to_date(local_time, fmt='%Y-%m-%d %H:%M:%S'): try: local_time = float(local_time) diff --git a/bin/anvi-get-sequences-for-gene-calls b/bin/anvi-get-sequences-for-gene-calls index 697c11419..c5e6860e4 100755 --- a/bin/anvi-get-sequences-for-gene-calls +++ b/bin/anvi-get-sequences-for-gene-calls @@ -45,6 +45,7 @@ def export_from_contigs(args): gene_caller_ids_list=gene_caller_ids, output_file_path=args.output_file, simple_headers=not args.report_extended_deflines, + list_defline_variables=args.list_defline_variables, wrap=args.wrap ) @@ -62,7 +63,8 @@ def export_from_contigs(args): else: if args.annotation_source: raise ConfigError("A functional annotation source is only relevant for the GFF output format.") - c.get_sequences_for_gene_callers_ids(**func_kwargs, report_aa_sequences=args.get_aa_sequences, flank_length=args.flank_length, output_file_path_external_gene_calls=args.external_gene_calls) + c.get_sequences_for_gene_callers_ids(**func_kwargs, report_aa_sequences=args.get_aa_sequences, flank_length=args.flank_length, + output_file_path_external_gene_calls=args.external_gene_calls, defline_format=args.defline_format) def export_from_genomes_storage(genomes_storage_db_path, output_file_path): @@ -103,10 +105,8 @@ if __name__ == '__main__': groupA = parser.add_argument_group('OPTION #1: GET SEQUENCES FROM A CONTIGS DB') groupA.add_argument(*anvio.A('contigs-db'), **anvio.K('contigs-db', {'required': False})) groupA.add_argument(*anvio.A('gene-caller-ids'), **anvio.K('gene-caller-ids')) - groupA.add_argument(*anvio.A('flank-length'), **anvio.K('flank-length')) groupA.add_argument(*anvio.A('delimiter'), **anvio.K('delimiter')) - groupA.add_argument(*anvio.A('report-extended-deflines'), **anvio.K('report-extended-deflines')) - groupA.add_argument(*anvio.A('wrap'), **anvio.K('wrap')) + groupA.add_argument(*anvio.A('flank-length'), **anvio.K('flank-length')) groupA.add_argument(*anvio.A('get-aa-sequences'), **anvio.K('get-aa-sequences')) groupA.add_argument(*anvio.A('external-gene-calls'), **anvio.K('external-gene-calls', {"help": ("An optional external gene calls file path that precisely " @@ -123,8 +123,14 @@ if __name__ == '__main__': groupC.add_argument(*anvio.A('genomes-storage'), **anvio.K('genomes-storage', {'required': False})) groupC.add_argument(*anvio.A('genomes-names'), **anvio.K('genomes-names')) - groupD = parser.add_argument_group('OPTIONS COMMON TO ALL INPUTS') - groupD.add_argument(*anvio.A('output-file'), **anvio.K('output-file', {'required': True})) + groupD = parser.add_argument_group('FASTA OUTPUT FORMATTING OPTIONS') + groupD.add_argument(*anvio.A('report-extended-deflines'), **anvio.K('report-extended-deflines')) + groupD.add_argument(*anvio.A('wrap'), **anvio.K('wrap')) + groupD.add_argument(*anvio.A('list-defline-variables'), **anvio.K('list-defline-variables')) + groupD.add_argument(*anvio.A('defline-format'), **anvio.K('defline-format')) + + groupE = parser.add_argument_group('OPTIONS COMMON TO ALL INPUTS') + groupE.add_argument(*anvio.A('output-file'), **anvio.K('output-file', {'required': True})) args = parser.get_args(parser) diff --git a/bin/anvi-get-sequences-for-hmm-hits b/bin/anvi-get-sequences-for-hmm-hits index d36d13f69..2f9bb3f0c 100755 --- a/bin/anvi-get-sequences-for-hmm-hits +++ b/bin/anvi-get-sequences-for-hmm-hits @@ -76,6 +76,19 @@ def main(args): if not (args.internal_genomes or args.external_genomes or args.contigs_db or args.profile_db): raise ConfigError("You gotta give this program some input files :/ Come on.") + if args.list_defline_variables: + defline_data_dict = hmmops.SequencesForHMMHits(None).defline_data_dict + defline_format = hmmops.SequencesForHMMHits(None).defline_format + + run.warning(f"Here are the variables you can use to provide a user-defined defline template: ") + for key in defline_data_dict.keys(): + run.info_single("{%s}" % key) + run.info_single("Remember, by default, anvi'o will use the following template to format the deflines of " + "FASTA files it produces whenever possible.", level=0, nl_before=1, nl_after=1, mc='red') + + run.info_single("%s" % defline_format, level=0, nl_after=1, cut_after=0) + sys.exit() + if args.list_hmm_sources or args.list_available_gene_names: if args.contigs_db: if args.list_hmm_sources: @@ -153,7 +166,8 @@ def main(args): contigs_db_name = os.path.basename(args.contigs_db[:-3]) splits_dict = {contigs_db_name: list(contigs_db.splits_basic_info.keys())} - s = hmmops.SequencesForHMMHits(args.contigs_db, sources = hmm_sources) + defline_format = args.defline_format if args.defline_format else None + s = hmmops.SequencesForHMMHits(args.contigs_db, sources=hmm_sources, defline_format=defline_format) CHK = lambda: exec('raise ConfigError("Your selections returned an empty list of genes to work with :/")') if not len(hmm_sequences_dict) else None @@ -311,6 +325,8 @@ if __name__ == '__main__': a nice name..") groupE.add_argument(*anvio.A('output-file'), **anvio.K('output-file')) groupE.add_argument(*anvio.A('no-wrap'), **anvio.K('no-wrap')) + groupE.add_argument(*anvio.A('list-defline-variables'), **anvio.K('list-defline-variables')) + groupE.add_argument(*anvio.A('defline-format'), **anvio.K('defline-format', {'default': None})) groupF = parser.add_argument_group('THE ALPHABET', "The sequences are reported in DNA alphabet, but you can also get them\ translated just like all the other cool kids.")