Skip to content

Commit

Permalink
Merge pull request #2332 from merenlab/pretty-defline-formatting
Browse files Browse the repository at this point in the history
Advanced defline formatting
  • Loading branch information
meren authored Sep 25, 2024
2 parents aa73ca2 + 2e67baa commit c023873
Show file tree
Hide file tree
Showing 9 changed files with 497 additions and 34 deletions.
20 changes: 20 additions & 0 deletions anvio/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2541,6 +2541,26 @@ def TABULATE(table, header, numalign="right", max_width=0):
"and internal anvi'o heuristics control whether or not indels should be reported, but with this "
"flag all indels are reported."}
),
'list-defline-variables': (
['--list-defline-variables'],
{'default': False,
'action': 'store_true',
'help': "When declared, anvi'o will list the variable names that can be used to construct deflines in "
"FASTA outputs from the user-defined `--defline-format` strings."}
),
'defline-format': (
['--defline-format'],
{'default': '{gene_caller_id}',
'metavar': "F-STRING",
'help': "Proivide a defline template for anvi'o to use when generating the FASTA output. The way this "
"works is actually quite simple: first you learn about all the options that exist using the "
"`--list-defline-variables`, and then use them to create your template. Available variables "
"should be listed within curly brackets, which will be evaluated in contex. Anything outside "
"of curly brackets will be kept as is. For instance, if you would like your defline to have "
"the gene caller ID after the contig name in which it occurs, you can use this template: "
"'{contig_name}_{gene_caller_id}', and your defline will look like '>XXX_182'. See more "
"examples in online help."}
),
'report-extended-deflines': (
['--report-extended-deflines'],
{'default': False,
Expand Down
72 changes: 64 additions & 8 deletions anvio/dbops.py
Original file line number Diff line number Diff line change
Expand Up @@ -1120,10 +1120,42 @@ def get_gene_amino_acid_sequence(self, gene_caller_ids):
return sequences


def get_sequences_for_gene_callers_ids(self, gene_caller_ids_list=[], output_file_path=None, reverse_complement_if_necessary=True, include_aa_sequences=False, flank_length=0,
output_file_path_external_gene_calls=None, simple_headers=False, report_aa_sequences=False, wrap=120, rna_alphabet=False):
def get_sequences_for_gene_callers_ids(self, gene_caller_ids_list=[], output_file_path=None, reverse_complement_if_necessary=True,
include_aa_sequences=False, flank_length=0, output_file_path_external_gene_calls=None,
simple_headers=False, list_defline_variables=False, defline_format='{gene_caller_id}',
report_aa_sequences=False, wrap=120, rna_alphabet=False):

##################################################################################################
#
# DEFLIINE FORMATTING REPORTING RELATED PRE-CHECKS
#
##################################################################################################
# available options to determine deflines through user-provided f-strings. the dictionary is
# populated below, and if you make any changes here, please don't forget to update it there too:
defline_data_dict = {'gene_caller_id': None,
'contig_name': None,
'start': None,
'stop': None,
'direction': None,
'length': None,
'contigs_db_project_name': None}

# if the user needs to see the list, show the list and quit
if list_defline_variables:
self.run.warning(f"Here are the variables you can use to provide a user-defined defline template: ")
for key in defline_data_dict.keys():
self.run.info_single("{%s}" % key)
self.run.info_single("Remember, by default, anvi'o will only use '{gene_caller_id}' to format the deflines of "
"FASTA files it produces.", level=0, nl_before=1, nl_after=1, mc='red')

sys.exit()

##################################################################################################
#
# BUNCH OF SANITY CHECKS BEFORE WE GET INTO BUSINESS
#
##################################################################################################

# bunch of sanity checks below
if not isinstance(gene_caller_ids_list, list):
raise ConfigError("Gene caller's ids must be of type 'list'")

Expand Down Expand Up @@ -1166,6 +1198,20 @@ def get_sequences_for_gene_callers_ids(self, gene_caller_ids_list=[], output_fil
"also ask FASTA file headers for gene sequences to be not simple. External gene calls file and the FASTA "
"file must match, and anvi'o will have to take care of it without your supervision.")

# if we came all the way down here without a defline format, let's set one up:
if not defline_format:
defline_format = "{gene_caller_id}"

# we will also check if the `defline_format` is composed of variables that are defined in
# the `defline_data_dict` which is filled later
utils.get_f_string_evaluated_by_dict(defline_format, defline_data_dict)

##################################################################################################
#
# BUSINESS TIME
#
##################################################################################################

# finally getting our sequences initialized. please NOTE that we do it only if there are no
# contig sequences available OR if the gene caller ids of interest is not represented among
# those that were previously initialized.
Expand Down Expand Up @@ -1255,6 +1301,17 @@ def get_sequences_for_gene_callers_ids(self, gene_caller_ids_list=[], output_fil
else:
gene_call['aa_sequence'] = None

# let's populate the dictionary that holds all the information that could be used to report
# gene FASTA files. if you change anything in this dictionary, please don't forget to
# update the list of variables where it is first defined in this function.
defline_data_dict = {'gene_caller_id': gene_callers_id,
'contig_name': gene_call['contig'],
'start': gene_call['start'],
'stop': gene_call['stop'],
'direction': gene_call['direction'],
'length': gene_call['length'],
'contigs_db_project_name': self.a_meta['project_name_str']}

if output_file_path_external_gene_calls:
# if the user is asking for an external gene calls file, the FASTA file for sequences
# should not start with digits and we also need to set the contig name in sequences
Expand All @@ -1266,10 +1323,9 @@ def get_sequences_for_gene_callers_ids(self, gene_caller_ids_list=[], output_fil
gene_call['start'] = 0
gene_call['stop'] = gene_call['length']
else:
if simple_headers:
gene_call['header'] = '%d' % (gene_callers_id)
else:
gene_call['header'] = '%d ' % (gene_callers_id) + ';'.join(['%s:%s' % (k, str(gene_call[k])) for k in ['contig', 'start', 'stop', 'direction', 'rev_compd', 'length']])
gene_call['header'] = utils.get_f_string_evaluated_by_dict(defline_format, defline_data_dict)
if not simple_headers:
gene_call['header'] += gene_call['header'] + ' ' + ';'.join(['%s:%s' % (k, str(gene_call[k])) for k in ['contig', 'start', 'stop', 'direction', 'rev_compd', 'length']])

# adding the updated gene call to our sequences dict.
sequences_dict[gene_callers_id] = gene_call
Expand Down Expand Up @@ -4247,7 +4303,7 @@ def init(self):

# set a project name for the contigs database without any funny
# characters to make sure it can be used programmatically later.
self.meta['project_name_str'] = self.meta['project_name'].translate({ord(c): "_" for c in "\"'!@#$%^&*()[]{};:,./<>?\|`~-=_+ "}).replace('__', '_') \
self.meta['project_name_str'] = self.meta['project_name'].strip().translate({ord(c): "_" for c in "\"'!@#$%^&*()[]{};:,./<>?\|`~-=_+ "}).replace('__', '_').strip('_') \
if self.meta['project_name'] else '___'.join(['UNKNOWN', self.meta['contigs_db_hash']])

if 'creation_date' not in self.meta:
Expand Down
92 changes: 88 additions & 4 deletions anvio/docs/programs/anvi-get-sequences-for-gene-calls.md
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
This program allows you to **export the sequences of your gene calls** from a %(contigs-db)s or %(genomes-storage-db)s in the form of a %(genes-fasta)s.
This program allows you to **export the sequences of your gene calls** from a %(contigs-db)s or %(genomes-storage-db)s in the form of a %(genes-fasta)s.

If you want other information about your gene calls from a %(contigs-db)s, you can run %(anvi-export-gene-calls)s (which outputs a %(gene-calls-txt)s) or get the coverage and detection information with %(anvi-export-gene-coverage-and-detection)s.

Expand All @@ -11,7 +11,7 @@ anvi-get-sequences-for-gene-calls -c %(contigs-db)s \
-o path/to/output
{{ codestop }}

This is create a %(genes-fasta)s that contains every gene in your contigs database. If you only want a specific subset of genes, you can run the following:
This is create a %(genes-fasta)s that contains every gene in your contigs database. If you only want a specific subset of genes, you can run the following:

{{ codestart }}
anvi-get-sequences-for-gene-calls -c %(contigs-db)s \
Expand All @@ -20,7 +20,91 @@ anvi-get-sequences-for-gene-calls -c %(contigs-db)s \
--delimiter ,
{{ codestop }}

Now the resulting %(genes-fasta)s will contain only those three genes.
Now the resulting %(genes-fasta)s will contain only those three genes.

Please note that this program allows you to format the deflines of the resulting FASTA file to a great extent. For this, it uses a set of previously-defined variables you can use to define a template of your liking. You can learn about the available variables, you can include the following flag in your command:

{{ codestart }}
anvi-get-sequences-for-gene-calls -c %(contigs-db)s \
-o path/to/output \
--list-defline-variables
{{ codestop }}

which will give you an output similar to this:

```
WARNING
===============================================
Here are the variables you can use to provide a user-defined defline template:
* {gene_caller_id}
* {contig_name}
* {start}
* {stop}
* {direction}
* {length}
* {contigs_db_project_name}
Remember, by default, anvi'o will only use '{gene_caller_id}' to format the
deflines of FASTA files it produces.
```

Now you can use the `--defline-format` parameter with any combination of these variables to control the output FASTA file deflines. Here are some examples using the %(contigs-db)s file included in the Infant Gut Dataset, and the contents of the resulting FASTA file:

```
anvi-get-sequences-for-gene-calls -c INFANT-GUT-TUTORIAL/CONTIGS.db \
--gene-caller-ids 77 \
-o 77.fa \
--defline-format "{gene_caller_id}"
```

```
>77
ATGAGTAATTTATTGCGAGCAGAAGGCGTGTCTTATCAAGTAAATGGTCGTAGCATTCTTTCTGATATTGATTTGTCATTTGAAACAGGCAGCAATACAACAATTGTTGGTCCTTCAGGT
AGCGGGAAAAGTACATTTTTAAAAATTTTATCTTCATTATTAAGTCCTACAGAAGGCGAAATTTTTTATCAAGAAGCGCCAATTACTACAATGCCAATCGAAACATACCGCCAAAAGGTT
TCTTATTGTTTTCAGCAGCCAACTTTATTTGGTGAAACCGTGTATGATAATTTGTTATTTCCATTTACCGTCAGACAAGAAGCGTTTAATCAGGAAAAAGTCGTGGCATTACTCCAACAA
GTGAAATTGCCCGCTGCTTATCTTGAAAAGAAAATAGCCGAACTCTCTGGTGGTGAGCGACAACGGGTTGCTTTGCTACGAAACATTATTTTTGTACCAGATGTTTTATTATTAGACGAA
GTTACAACGGGATTAGATGAAGAAAGCAAACAGATTGTCAATCAATTGTTAAACCAATTAAACAAAGAGCAAGGAGTCACGCTGGTTCGTGTCACGCATGATACCGAAGAAATTCAGCAA
GCACAGCAAGTGATTCGTATTGTAGCAGGAAAGGTGGCGCCGACAGATGGATTTAGCAGTTAA
```

The simple option shown above also is the default defline format anvi'o uses. Here is a more sophisticated example:

```
anvi-get-sequences-for-gene-calls -c INFANT-GUT-TUTORIAL/CONTIGS.db \
--gene-caller-ids 77 \
-o 77.fa \
--defline-format "{contigs_db_project_name}_{contig_name}_{gene_caller_id}"
```

```
>Infant_Gut_Contigs_from_Sharon_et_al_Day17a_QCcontig1_77
ATGAGTAATTTATTGCGAGCAGAAGGCGTGTCTTATCAAGTAAATGGTCGTAGCATTCTTTCTGATATTGATTTGTCATTTGAAACAGGCAGCAATACAACAATTGTTGGTCCTTCAGGT
AGCGGGAAAAGTACATTTTTAAAAATTTTATCTTCATTATTAAGTCCTACAGAAGGCGAAATTTTTTATCAAGAAGCGCCAATTACTACAATGCCAATCGAAACATACCGCCAAAAGGTT
TCTTATTGTTTTCAGCAGCCAACTTTATTTGGTGAAACCGTGTATGATAATTTGTTATTTCCATTTACCGTCAGACAAGAAGCGTTTAATCAGGAAAAAGTCGTGGCATTACTCCAACAA
GTGAAATTGCCCGCTGCTTATCTTGAAAAGAAAATAGCCGAACTCTCTGGTGGTGAGCGACAACGGGTTGCTTTGCTACGAAACATTATTTTTGTACCAGATGTTTTATTATTAGACGAA
GTTACAACGGGATTAGATGAAGAAAGCAAACAGATTGTCAATCAATTGTTAAACCAATTAAACAAAGAGCAAGGAGTCACGCTGGTTCGTGTCACGCATGATACCGAAGAAATTCAGCAA
GCACAGCAAGTGATTCGTATTGTAGCAGGAAAGGTGGCGCCGACAGATGGATTTAGCAGTTAA
```

And finally, here is an example that is quite comprehensive in its request from anvi'o:

```
anvi-get-sequences-for-gene-calls -c INFANT-GUT-TUTORIAL/CONTIGS.db \
--gene-caller-ids 77 \
-o 77.fa \
--defline-format "{gene_caller_id} contig_name:{contig_name}|gene_start:{start}|gene_stop:{stop}|gene_length:{length}|gene_direction:{direction}"
```

```
>77 contig_name:Day17a_QCcontig1|gene_start:67276|gene_stop:67939|gene_length:663|gene_direction:f
ATGAGTAATTTATTGCGAGCAGAAGGCGTGTCTTATCAAGTAAATGGTCGTAGCATTCTTTCTGATATTGATTTGTCATTTGAAACAGGCAGCAATACAACAATTGTTGGTCCTTCAGGT
AGCGGGAAAAGTACATTTTTAAAAATTTTATCTTCATTATTAAGTCCTACAGAAGGCGAAATTTTTTATCAAGAAGCGCCAATTACTACAATGCCAATCGAAACATACCGCCAAAAGGTT
TCTTATTGTTTTCAGCAGCCAACTTTATTTGGTGAAACCGTGTATGATAATTTGTTATTTCCATTTACCGTCAGACAAGAAGCGTTTAATCAGGAAAAAGTCGTGGCATTACTCCAACAA
GTGAAATTGCCCGCTGCTTATCTTGAAAAGAAAATAGCCGAACTCTCTGGTGGTGAGCGACAACGGGTTGCTTTGCTACGAAACATTATTTTTGTACCAGATGTTTTATTATTAGACGAA
GTTACAACGGGATTAGATGAAGAAAGCAAACAGATTGTCAATCAATTGTTAAACCAATTAAACAAAGAGCAAGGAGTCACGCTGGTTCGTGTCACGCATGATACCGAAGAAATTCAGCAA
GCACAGCAAGTGATTCGTATTGTAGCAGGAAAGGTGGCGCCGACAGATGGATTTAGCAGTTAA
```

You also have the option to report the output in [gff3 format](https://github.com/The-Sequence-Ontology/Specifications/blob/master/gff3.md), report extended deflines for each gene, or report amino acid sequences instead of nucleotide sequences.

Expand All @@ -33,6 +117,6 @@ anvi-get-sequences-for-gene-calls -g %(genomes-storage-db)s \
-o path/to/output
{{ codestop }}

This will create a %(genes-fasta)s that contains every gene in your genomes storage database. To focus on only a subset of the genomes contained in your database, use the flag `--genome-names`. You can provide a comma-delimited list of genome names or a flat text file that contains one genome per line. Alternatively, you could provide a list of gene-caller-ids as specified above.
This will create a %(genes-fasta)s that contains every gene in your genomes storage database. To focus on only a subset of the genomes contained in your database, use the flag `--genome-names`. You can provide a comma-delimited list of genome names or a flat text file that contains one genome per line. Alternatively, you could provide a list of gene-caller-ids as specified above.

You also have the option to report the output in [gff3 format](https://github.com/The-Sequence-Ontology/Specifications/blob/master/gff3.md), report extended deflines for each gene, or report amino acid sequences instead of nucleotide sequences.
Loading

0 comments on commit c023873

Please sign in to comment.