diff --git a/FirstBuildChecker.pyc b/FirstBuildChecker.pyc index f5f2a20..4cb5808 100755 Binary files a/FirstBuildChecker.pyc and b/FirstBuildChecker.pyc differ diff --git a/README b/README deleted file mode 100644 index 0581bc0..0000000 --- a/README +++ /dev/null @@ -1,154 +0,0 @@ -MitoFinder version 1.0.1 22/03/2019 -Author : RĂ©mi ALLIO - -Mitofinder is a pipeline to assemble mitochondrial genomes and extract mitochondrial genes from trimmed -sequencing read data. - -# Requirements - -This software is suitable for all linux-like systems with gcc installed (Unfortunately, not MAC and Windows). - -# Installation guide for MitoFinder - -## Download mitofinder_vX.tar.gz at [www.github.com] - -$ tar -zxf mitofinder_vX.tar.gz -$ cd mitofinder_vX -$ ./install.sh - -$ PATH/TO/MITOFINDER_VX/mitofinder -h - -## Add mitofinder to your path -> linux - -$ cd mitofinder_vX/ -$ p=$(pwd) -$ echo -e "\n#Path to mitofinder \nexport PATH=$PATH:$p" >> ~/.bashrc -$ source ~/.bashrc - -# How to use MitoFinder -TIP: use mitofinder --example to print usual examples of use - -First, you can choose the assembler using the following options: --- megahit (default: faster) --- metaspades (recommended: a bit slower but more efficient (see associated paper). WARNING: Not compatible with single-end reads) --- idba - -### For mitochondrial genome assembly - -## Trimmed paired-end reads -$ mitofinder -j [jobname] -1 [left_reads.fastq.gz] -2 [right_reads.fastq.gz] -r [genbank_reference.gb] -o [genetic_code] -p [threads] -m [memory] - -## Trimmed single-end reads -$ mitofinder -j [jobname] -s [SE_reads.fastq.gz] -r [genbank_reference.gb] -o [genetic_code] -p [threads] -m [memory] - -## MitoFinder can be used with your own assembly (one or several contig.s in fasta format) -$ mitofinder -j [jobname] -a [assembly.fasta] -r [genbank_reference.gb] -o [genetic_code] -p [threads] -m [memory] - -### Restart -Use the same command line. -WARNING: If you want to make the assembly again (for example because it failed) you have to remove the result assembly directory. If not, MitoFinder will skip the assembly step. - -# OUTPUT - -## Result folder - -Mitofinder returns several files for each mitochondrial contig found: -- [job_name]_partial_mito_1.fasta containing a mitochondrial contig -- [job_name]_partial_mito_1.gff containing the final annotation for a given contig -- [job_name]_partial_mito_1.arwen containing the result of the tRNA annotation returned by the Arwen software. -- [job_name]_partial_mito_1.gb containing the final annotation for a given contig (option --out_gb) -- [job_name]_final_genes.fasta containing the final genes selected from all contigs by MitoFinder - - -## UCE annotation -MitoFinder starts by assembling both mitochondrial and nuclear reads. It is only in a second step that mitochondrial contigs are identified and extracted. -MitoFinder thus provides UCE contigs already assembled and the annotation can be done from the following file: -- [job_name]link.scafSeq containing all assembled contigs from raw reads. - -To do so, we recommend the PHYLUCE pipeline, which is specifically designed to annotate ultraconserved elements (Faircloth 2015; Tutorial: https://phyluce.readthedocs.io/en/latest/tutorial-one.html#finding-uce-loci). -You can thus use the file [job_name]link.scafSeq and start the pipeline at the "Finding UCE" step. - -# Help -usage: mitofinder [-h] [--megahit] [--idba] [--metaspades] [-j PROCESSNAME] - [-1 PE1] [-2 PE2] [-s SE] [-a ASSEMBLY] [-m MEM] - [-l SHORTESTCONTIG] [-p PROCESSORSTOUSE] [-r REFSEQFILE] - [-e BLASTEVAL] [--out_gb] - [--blastidentitynucl BLASTIDENTITYNUCL] - [--blastidentityprot BLASTIDENTITYPROT] - [--blastsize ALIGNCUTOFF] [--circularsize CIRCULARSIZE] - [--circularoffset CIRCULAROFFSET] [-cove COVECUTOFF] - [-o ORGANISMTYPE] [-v] [--example] - -Mitofinder is a pipeline to assemble and annotate mitochondrial DNA from -trimmed sequencing reads. - -optional arguments: - -h, --help show this help message and exit - --megahit Use Megahit for assembly. (Default) - --idba Use IDBA-UD for assembly. - --metaspades Use MetaSPAdes for assembly. - -j PROCESSNAME, --jobname PROCESSNAME - Job name to be used throughout the process - -1 PE1, --Paired-end1 PE1 - File with forward paired-end reads - -2 PE2, --Paired-end2 PE2 - File with reverse paired-end reads - -s SE, --Single-end SE - File with single-end reads - -a ASSEMBLY, --assembly ASSEMBLY - File with your own assembly - -m MEM, --max-memory MEM - max memory to use in Go (Megahit or MetaSPAdes) - -l SHORTESTCONTIG, --length SHORTESTCONTIG - Shortest contig length to be used in scaffolding. - Default = 100 - -p PROCESSORSTOUSE, --processors PROCESSORSTOUSE - Number of threads Mitofinder will use at most. - -r REFSEQFILE, --refseq REFSEQFILE - Reference mitochondrial genome in GenBank format - (.gb). - -e BLASTEVAL, --blaste BLASTEVAL - e-value of blast program used for contig - identification and annotation. Default = 0.000001 - --out_gb Create annotation output file in GenBank format. - --blastidentitynucl BLASTIDENTITYNUCL - Nucleotide identity percentage for a hit to be - retained. Default = 50 - --blastidentityprot BLASTIDENTITYPROT - Amino acid identity percentage for a hit to be - retained. Default = 40 - --blastsize ALIGNCUTOFF - Percentage of overlap in blast best hit to be - retained. Default = 30 - --circularsize CIRCULARSIZE - Size to consider when checking for circularization. - Default = 45 - --circularoffset CIRCULAROFFSET - Offset from start and finish to consider when looking - for circularization. Default = 200 - -cove COVECUTOFF, --covecutoff COVECUTOFF - Cove cutoff for tRNAscan-SE. Default = 7 - -o ORGANISMTYPE, --organism ORGANISMTYPE - Organism genetic code following NCBI table (integer): - 1. The Standard Code 2. The Vertebrate Mitochondrial - Code 3. The Yeast Mitochondrial Code 4. The Mold, - Protozoan, and Coelenterate Mitochondrial Code and the - Mycoplasma/Spiroplasma Code 5. The Invertebrate - Mitochondrial Code 6. The Ciliate, Dasycladacean and - Hexamita Nuclear Code 9. The Echinoderm and Flatworm - Mitochondrial Code 10. The Euplotid Nuclear Code 11. - The Bacterial, Archaeal and Plant Plastid Code 12. The - Alternative Yeast Nuclear Code 13. The Ascidian - Mitochondrial Code 14. The Alternative Flatworm - Mitochondrial Code 16. Chlorophycean Mitochondrial - Code 21. Trematode Mitochondrial Code 22. Scenedesmus - obliquus Mitochondrial Code 23. Thraustochytrium - Mitochondrial Code 24. Pterobranchia Mitochondrial - Code 25. Candidate Division SR1 and Gracilibacteria - Code - -v, --version Version 1.0.2 - --example Print getting started examples - -# To cite Mitofinder - -Allio R., Schomaker-Bastos A., Romiguier J., Prosdocimi F., Nabholz B. & Delsuc F. (2019). Efficient automated extraction of mitogenomic data in target enrichment phylogenomics with MitoFinder. Molecular Ecology Resources (submitted). diff --git a/README.md b/README.md index e8a50e6..c645828 100755 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# MitoFinder version 1.0.2 06/01/2019 +# MitoFinder version 1.1 Allio, R., Schomaker-Bastos, A., Romiguier, J., Prosdocimi, F., Nabholz, B., & Delsuc, F.
@@ -16,14 +16,12 @@ This software is suitable for all linux-like systems with gcc installed (Unfortu 1. [Installation guide for MitoFinder](#installation-guide-for-mitofinder) 2. [How to use MitoFinder](#how-to-use-mitofinder) -3. [INPUTS](#inputs) -4. [OUTPUTS](#outputs) -5. [Test case](#test-case) -6. [Restart](#restart) -7. [UCE annotation](#uce-annotation) -8. [Associated publication](#associated-publication) -9. [How to get reference mitochondrial genomes from ncbi](#how-to-get-reference-mitochondrial-genomes-from-ncbi) -10. [Detailed options](#detailed-options) +3. [Detailed options](#detailed-options) +4. [INPUTS](#inputs) +5. [OUTPUTS](#outputs) +6. [UCE annotation](#uce-annotation) +7. [Associated publication](#associated-publication) +8. [How to get reference mitochondrial genomes from ncbi](#how-to-get-reference-mitochondrial-genomes-from-ncbi) # Installation guide for MitoFinder @@ -71,7 +69,7 @@ First, you can choose the assembler using the following options: ## Mitochondrial genome assembly -**TIP**: use mitofinder --example to print usual examples of use +TIP: use mitofinder --example to print usual examples of use ### Trimmed paired-end reads ```shell @@ -88,74 +86,17 @@ mitofinder -j [jobname] -s [SE_reads.fastq.gz] -r [genbank_reference.gb] -o [gen mitofinder -j [jobname] -a [assembly.fasta] -r [genbank_reference.gb] -o [genetic_code] -p [threads] -m [memory] ``` -# INPUTS - -Mitofinder needs several files to run depending on the method you have choosen (see above): -- [x] **Reference_file.gb** containing at least one mitochondrial genome of reference extracted from [NCBI](https://www.ncbi.nlm.nih.gov/) -- [ ] **left_reads.fastq.gz** containing the left reads of paired-end sequencing -- [ ] **right_reads.fastq.gz** containing the right reads of paired-end sequencing -- [ ] **SE_reads.fastq.gz** containing the reads of single-end sequencing -- [ ] **assembly.fasta** containing the assembly on which MitoFinder have to find and annotate mitochondrial contig.s - -**TIP 1**: If you hesitate between two or more mitogenomes to use as references, you can put them together in the same reference file. MitoFinder will use all of them to find mitochondrial signal and will choose the best matching one for each gene during the annotation step. -**TIP 2**: If you want to assemble several mitogenomes for which you have different references, you can group them together in the same reference file. This allows to use MitoFinder in a loop and does not affect the result because MitoFinder will select the best matching reference for the annotation step. -**TIP 3**: If you have a large number of reads, from a whole genome sequencing for example, you can pre-filter mitochondrial reads using [mirabait](http://manpages.ubuntu.com/manpages/xenial/man1/mirabait.1.html). This will considerably reduce the computation time. However, this can be done only if you have a reference mitogenome for a closely related species because MIRA uses mapping to select reads and this method is rather stringent. If you don't have the mitogenome of a closely related species you can just use a random subset of your read data (~ 15/20 Millions of reads) to reduce computation time and assemble the mitogenome. Of course, in doing so, you could lose some of the information if the mitogenome coverage is unsufficient. - -# OUTPUTS -### Result folder - -Mitofinder returns several files for each mitochondrial contig found: -- [x] **[job_name]_partial_mito_1.fasta** containing a mitochondrial contig -- [x] **[job_name]_partial_mito_1_genes.fasta** containing the annoted genes for a given contig -- [x] **[job_name]_partial_mito_1.gff** containing the final annotation for a given contig -- [x] **[job_name]_partial_mito_1.arwen** containing the result of the tRNA annotation returned by the Arwen software -- [x] **[job_name]_partial_mito_1.gb** containing the final annotation for a given contig (option --out_gb) -- [x] **[job_name]_final_genes.fasta** containing the final genes selected from all contigs by MitoFinder - -# Test case -A test case is available in the testcase directory. - -To run it: -```shell -cd /PATH/TO/MITOFINDER/testcase/ - -mitofinder -j test -s test.fastq -r test_reference.gb -o 5 -p 5 -m 5 -``` - -# Restart +### Restart Use the same command line. WARNING: If you want to make the assembly again (for example because it failed) you have to remove the result assembly directory. If not, MitoFinder will skip the assembly step. -# UCE annotation -MitoFinder starts by assembling both mitochondrial and nuclear reads. It is only in a second step that mitochondrial contigs are identified and extracted. -MitoFinder thus provides UCE contigs already assembled and the annotation can be done from the following file: -- **[job_name]link.scafSeq** containing all assembled contigs from raw reads. - -To do so, we recommend the PHYLUCE pipeline, which is specifically designed to annotate ultraconserved elements (Faircloth 2015; Tutorial: https://phyluce.readthedocs.io/en/latest/tutorial-one.html#finding-uce-loci). -You can thus use the file **[job_name]link.scafSeq** and start the pipeline at the **"Finding UCE"** step. - -# Associated publication - -Allio, R., Schomaker-Bastos, A., Romiguier, J., Prosdocimi, F., Nabholz, B., & Delsuc, F. (2019). MitoFinder: efficient automated large-scale extraction of mitogenomic data in target enrichment phylogenomics. BioRxiv, 685412. https://doi.org/10.1101/685412 - -# HOW TO GET REFERENCE MITOCHONDRIAL GENOMES FROM NCBI - -1. Go to [NCBI](https://www.ncbi.nlm.nih.gov/) -2. Select "Nucleotide" in the search bar -3. Search for mitochondrion genomes: -- [x] RefSeq (if available) -- [x] Sequence length from 12000 to 20000 -4. Download complete record in GenBank format - -![](/image/NCBI.png) - # Detailed options ```shell usage: mitofinder [-h] [--megahit] [--idba] [--metaspades] [-j PROCESSNAME] [-1 PE1] [-2 PE2] [-s SE] [-a ASSEMBLY] [-m MEM] [-l SHORTESTCONTIG] [-p PROCESSORSTOUSE] [-r REFSEQFILE] - [-e BLASTEVAL] [--out_gb] + [-e BLASTEVAL] [-n NWALK] [--out_gb] [--blastidentitynucl BLASTIDENTITYNUCL] [--blastidentityprot BLASTIDENTITYPROT] [--blastsize ALIGNCUTOFF] [--circularsize CIRCULARSIZE] @@ -163,7 +104,7 @@ usage: mitofinder [-h] [--megahit] [--idba] [--metaspades] [-j PROCESSNAME] [-o ORGANISMTYPE] [-v] [--example] Mitofinder is a pipeline to assemble and annotate mitochondrial DNA from -cleaned sequencing datas. +trimmed sequencing reads. optional arguments: -h, --help show this help message and exit @@ -171,7 +112,7 @@ optional arguments: --idba Use IDBA-UD for assembly. --metaspades Use MetaSPAdes for assembly. -j PROCESSNAME, --jobname PROCESSNAME - Job name to be used throughout the project + Sequence ID to be used throughout the process -1 PE1, --Paired-end1 PE1 File with forward paired-end reads -2 PE2, --Paired-end2 PE2 @@ -181,29 +122,33 @@ optional arguments: -a ASSEMBLY, --assembly ASSEMBLY File with your own assembly -m MEM, --max-memory MEM - max memory to use in Go (megahit or metaspades) + max memory to use in Go (Megahit or MetaSPAdes) -l SHORTESTCONTIG, --length SHORTESTCONTIG Shortest contig length to be used in scaffolding. Default = 100 -p PROCESSORSTOUSE, --processors PROCESSORSTOUSE Number of threads Mitofinder will use at most. -r REFSEQFILE, --refseq REFSEQFILE - Reference mitochondrial genome.s in GenBank format - (.gb). You can put several reference mitgenomes - together. + Reference mitochondrial genome in GenBank format + (.gb). -e BLASTEVAL, --blaste BLASTEVAL - e-Value for blast program for contig identification - and annotation. Default = 0.000001 - --out_gb Create annotation output with genbank format. + e-value of blast program used for contig + identification and annotation. Default = 0.000001 + -n NWALK, --nwalk NWALK + Maximum number of codon steps to be tested on each + size of the gene to find the start and stop codon + during the annotation step. Default = 20 (60 bases) + --out_gb Do not create annotation output file in GenBank + format. --blastidentitynucl BLASTIDENTITYNUCL - Nucleotide percentage of identity or a hit to be - considered good. Default = 50 + Nucleotide identity percentage for a hit to be + retained. Default = 50 --blastidentityprot BLASTIDENTITYPROT - Amino acid percentage of identity or a hit to be - considered good. Default = 40 + Amino acid identity percentage for a hit to be + retained. Default = 40 --blastsize ALIGNCUTOFF - Percentage of covered span in blast best hit to be - considered good. Default = 30 + Percentage of overlap in blast best hit to be + retained. Default = 30 --circularsize CIRCULARSIZE Size to consider when checking for circularization. Default = 45 @@ -213,8 +158,7 @@ optional arguments: -cove COVECUTOFF, --covecutoff COVECUTOFF Cove cutoff for tRNAscan-SE. Default = 7 -o ORGANISMTYPE, --organism ORGANISMTYPE - What should the genome checking and annotation - consider as genetic code type. NCBI table (integer): + Organism genetic code following NCBI table (integer): 1. The Standard Code 2. The Vertebrate Mitochondrial Code 3. The Yeast Mitochondrial Code 4. The Mold, Protozoan, and Coelenterate Mitochondrial Code and the @@ -231,9 +175,57 @@ optional arguments: Mitochondrial Code 24. Pterobranchia Mitochondrial Code 25. Candidate Division SR1 and Gracilibacteria Code - -v, --version Version 1.0.2 - --example Print usual examples of use + -v, --version Version 1.1 + --example Print getting started examples ``` +# INPUTS + +Mitofinder needs several files to run depending on the method you have choosen (see above): +- [x] **Reference_file.gb** containing at least one mitochondrial genome of reference extracted from [NCBI](https://www.ncbi.nlm.nih.gov/) +- [ ] **left_reads.fastq.gz** containing the left reads of paired-end sequencing +- [ ] **right_reads.fastq.gz** containing the right reads of paired-end sequencing +- [ ] **SE_reads.fastq.gz** containing the reads of single-end sequencing +- [ ] **assembly.fasta** containing the assembly on which MitoFinder have to find and annotate mitochondrial contig.s + +# OUTPUTS +### Result folder + +Mitofinder returns several files for each mitochondrial contig found: +- [x] **[job_name]_final_genes.fasta** containing the final genes selected from all contigs by MitoFinder +- [x] **[job_name]_mtDNA_contig.fasta** containing a mitochondrial contig +- [x] **[job_name]_mtDNA_contig_final.gff** containing the final annotation for a given contig +- [x] **[job_name]_mtDNA_contig_final.tbl** containing the final annotation for a given contig (Genbank submission format) +- [x] **[job_name]_mtDNA_contig.gb** containing the final annotation for a given contig +- [x] **[job_name]_mtDNA_contig_genes.fasta** containing the annoted genes for a given contig +- [x] **[job_name]_mtDNA_contig.png** schematic representation of the annotation of the mtDNA contig + + +# UCE annotation +MitoFinder starts by assembling both mitochondrial and nuclear reads. It is only in a second step that mitochondrial contigs are identified and extracted. +MitoFinder thus provides UCE contigs already assembled and the annotation can be done from the following file: +- **[job_name]link.scafSeq** containing all assembled contigs from raw reads. + +To do so, we recommend the PHYLUCE pipeline, which is specifically designed to annotate ultraconserved elements (Faircloth 2015; Tutorial: https://phyluce.readthedocs.io/en/latest/tutorial-one.html#finding-uce-loci). +You can thus use the file **[job_name]link.scafSeq** and start the pipeline at the **"Finding UCE"** step. + +# Associated publication + +Allio, R., Schomaker-Bastos, A., Romiguier, J., Prosdocimi, F., Nabholz, B., & Delsuc, F. (2019). MitoFinder: efficient automated large-scale extraction of mitogenomic data in target enrichment phylogenomics. BioRxiv, 685412. https://doi.org/10.1101/685412 + + +# HOW TO GET REFERENCE MITOCHONDRIAL GENOMES FROM NCBI + +1. Go to [NCBI](https://www.ncbi.nlm.nih.gov/) +2. Select "Nucleotide" in the search bar +3. Search for mitochondrion genomes: +- [x] RefSeq (if available) +- [x] Sequence length from 12000 to 20000 +4. Download complete record in GenBank format + +![](/image/NCBI.png) + + + diff --git a/circularizationCheck.pyc b/circularizationCheck.pyc index ad7fa99..9658cec 100755 Binary files a/circularizationCheck.pyc and b/circularizationCheck.pyc differ diff --git a/genbankOutput.py b/genbankOutput.py index 2b18007..47f8a40 100755 --- a/genbankOutput.py +++ b/genbankOutput.py @@ -31,7 +31,7 @@ from Bio.Data import CodonTable from decimal import Decimal -def genbankOutput(resultGbFile, resultFile, listOfFeaturesToOutput, buildCloroplast = False, dLoopSize = 800): +def genbankOutput(resultGbFile, resultFile, listOfFeaturesToOutput, buildCloroplast = False, dLoopSize = 800, nWalk = 20): ''' Creates a genbank file based on a fasta file given (resultfile) and a list of features that the genbank file should present (listoffeaturestooutput) @@ -68,21 +68,24 @@ def genbankOutput(resultGbFile, resultFile, listOfFeaturesToOutput, buildCloropl main_start_pos = SeqFeature.ExactPosition(thisFeatureAlignment.startBase) main_end_pos = SeqFeature.ExactPosition(thisFeatureAlignment.endBase) - + if main_feature_type == "gene": - codonDiff = ((main_end_pos - (main_start_pos + 1)) % 3) + codonDiff = ((main_end_pos - main_start_pos+1) % 3) if codonDiff == 2: main_end_pos += 1 elif codonDiff == 1: main_end_pos -= 1 + #print main_start_pos + #print main_end_pos + #print thisFeatureAlignment.frame + # 2. Use the locations do define a FeatureLocation if thisFeatureAlignment.frame < 0: strandToOutput = -1 else: strandToOutput = 1 - main_feature_location = SeqFeature.FeatureLocation(main_start_pos,main_end_pos,strand=strandToOutput) - + main_feature_location = SeqFeature.FeatureLocation(main_start_pos-1,main_end_pos,strand=strandToOutput) # 3. Create a SeqFeature main_feature = SeqFeature.SeqFeature(main_feature_location,type=main_feature_type, qualifiers=main_feature_qualifiers) ''' @@ -103,11 +106,10 @@ def genbankOutput(resultGbFile, resultFile, listOfFeaturesToOutput, buildCloropl lastFeatureAlignment = thisFeatureAlignment ''' - # 4. Append your newly created SeqFeature to your SeqRecord if main_feature_type == "gene": cds_qualifiers = dict(main_feature_qualifiers) - coding_dna = Seq(str(finalResults.seq[thisFeatureAlignment.startBase:thisFeatureAlignment.endBase]), IUPAC.unambiguous_dna) + coding_dna = Seq(str(finalResults.seq[thisFeatureAlignment.startBase-1:thisFeatureAlignment.endBase]), IUPAC.unambiguous_dna) if strandToOutput == -1: coding_dna = coding_dna.reverse_complement() translationTable = thisFeatureAlignment.translationTable @@ -120,8 +122,8 @@ def genbankOutput(resultGbFile, resultFile, listOfFeaturesToOutput, buildCloropl listOfStartCodons.append(startCodonTranslation) startCodons = tuple(listOfStartCodons) #need to make it a tuple so that startswith works with it stopCodons = ('*','$','#','+') - nWalkStart = 20 - nWalkStop = 20 + nWalkStart = nWalk + nWalkStop = nWalk ''' For genes in the -1 strand, we look for the stop codons at the start and the start codons at the end! ''' @@ -130,8 +132,8 @@ def genbankOutput(resultGbFile, resultFile, listOfFeaturesToOutput, buildCloropl tempStopCodons = stopCodons startCodons = tempStopCodons stopCodons = tempStartCodons - nWalkStart = 20 - nWalkStop = 20 + nWalkStart = nWalk + nWalkStop = nWalk try: ''' @@ -153,33 +155,45 @@ def genbankOutput(resultGbFile, resultFile, listOfFeaturesToOutput, buildCloropl and not coding_dna_TranslationBackward.startswith(startCodons) and n < nWalkStart and startBase - (3*(n+1)) >= 0: try: n += 1 - coding_dna_Backward = Seq(str(finalResults.seq[startBase - (3*n):endBase]), IUPAC.unambiguous_dna) + coding_dna_Backward = Seq(str(finalResults.seq[startBase - (3*n) - 1:endBase]), IUPAC.unambiguous_dna) if strandToOutput == -1: coding_dna_Backward = coding_dna_Backward.reverse_complement() coding_dna_TranslationBackward = coding_dna_Backward.translate(table=translationTable) - coding_dna_Forward = Seq(str(finalResults.seq[startBase + (3*n):endBase]), IUPAC.unambiguous_dna) + coding_dna_Forward = Seq(str(finalResults.seq[startBase -1 - (3*n):endBase]), IUPAC.unambiguous_dna) if strandToOutput == -1: + coding_dna_Forward = Seq(str(finalResults.seq[startBase -1 + (3*n):endBase]), IUPAC.unambiguous_dna) coding_dna_Forward = coding_dna_Forward.reverse_complement() coding_dna_TranslationForward = coding_dna_Forward.translate(table=translationTable) if strandToOutput == -1: coding_dna_TranslationBackward = coding_dna_TranslationBackward[::-1] coding_dna_TranslationForward = coding_dna_TranslationForward[::-1] + except: pass else: - if coding_dna_TranslationBackward.startswith(startCodons): + if coding_dna_TranslationForward.startswith(startCodons): + main_start_pos = SeqFeature.ExactPosition(startBase - (3*n)) + if strandToOutput == -1: + SeqFeature.ExactPosition(startBase + (3*n)) + #startBase += (3*n) #backup + #thisFeatureAlignment.startBase = startBase #backup + thisFeatureAlignment.startBase = main_start_pos #update + main_feature_location = SeqFeature.FeatureLocation(main_start_pos-1,main_end_pos,strand=strandToOutput) + elif coding_dna_TranslationBackward.startswith(startCodons): main_start_pos = SeqFeature.ExactPosition(startBase - (3*n)) - startBase += (3*n) - thisFeatureAlignment.startBase = startBase - main_feature_location = SeqFeature.FeatureLocation(main_start_pos,main_end_pos,strand=strandToOutput) - elif coding_dna_TranslationForward.startswith(startCodons): - main_start_pos = SeqFeature.ExactPosition(startBase + (3*n)) - startBase += (3*n) - thisFeatureAlignment.startBase = startBase - main_feature_location = SeqFeature.FeatureLocation(main_start_pos,main_end_pos,strand=strandToOutput) + #startBase += (3*n) #backup + #thisFeatureAlignment.startBase = startBase #backup + thisFeatureAlignment.startBase = main_start_pos #update + main_feature_location = SeqFeature.FeatureLocation(main_start_pos-1,main_end_pos,strand=strandToOutput) except: pass - + + ''' + Updating coding_dna with (new) coordinates + ''' + coding_dna = Seq(str(finalResults.seq[thisFeatureAlignment.startBase-1:thisFeatureAlignment.endBase]), IUPAC.unambiguous_dna) + if strandToOutput == -1: + coding_dna = coding_dna.reverse_complement() ''' Making sure it ends with * (stop codon) ''' @@ -199,16 +213,15 @@ def genbankOutput(resultGbFile, resultFile, listOfFeaturesToOutput, buildCloropl and not coding_dna_TranslationBackward.endswith(stopCodons) and n < nWalkStop and endBase + (3*(n+1)) <= len(finalResults): try: n += 1 - coding_dna_Backward = Seq(str(finalResults.seq[startBase:endBase - (3*n)]), IUPAC.unambiguous_dna) + coding_dna_Backward = Seq(str(finalResults.seq[startBase -1 :endBase - (3*n)]), IUPAC.unambiguous_dna) if strandToOutput == -1: + coding_dna_Backward = Seq(str(finalResults.seq[startBase -1 :endBase + (3*n)]), IUPAC.unambiguous_dna) coding_dna_Backward = coding_dna_Backward.reverse_complement() coding_dna_TranslationBackward = coding_dna_Backward.translate(table=translationTable) - - coding_dna_Forward = Seq(str(finalResults.seq[startBase:endBase + (3*n)]), IUPAC.unambiguous_dna) + coding_dna_Forward = Seq(str(finalResults.seq[startBase - 1:endBase + (3*n)]), IUPAC.unambiguous_dna) if strandToOutput == -1: coding_dna_Forward = coding_dna_Forward.reverse_complement() coding_dna_TranslationForward = coding_dna_Forward.translate(table=translationTable) - if strandToOutput == -1: coding_dna_TranslationBackward = coding_dna_TranslationBackward[::-1] coding_dna_TranslationForward = coding_dna_TranslationForward[::-1] @@ -217,18 +230,23 @@ def genbankOutput(resultGbFile, resultFile, listOfFeaturesToOutput, buildCloropl else: if coding_dna_TranslationBackward.endswith(stopCodons): main_end_pos = SeqFeature.ExactPosition(endBase - (3 * n)) - endBase -= (3 * (n-1)) - thisFeatureAlignment.endBase = endBase - main_feature_location = SeqFeature.FeatureLocation(main_start_pos,main_end_pos,strand=strandToOutput) + if strandToOutput == -1: + main_end_pos = SeqFeature.ExactPosition(endBase + (3 * n)) + #endBase -= (3 * (n-1)) #backup + #thisFeatureAlignment.endBase = endBase #backup + thisFeatureAlignment.endBase = main_end_pos #update + main_feature_location = SeqFeature.FeatureLocation(main_start_pos-1,main_end_pos,strand=strandToOutput) elif coding_dna_TranslationForward.endswith(stopCodons): main_end_pos = SeqFeature.ExactPosition(endBase + (3 * n)) - endBase += (3 * (n-1)) - thisFeatureAlignment.endBase = endBase - main_feature_location = SeqFeature.FeatureLocation(main_start_pos,main_end_pos,strand=strandToOutput) + #endBase += (3 * (n-1)) #backup + #thisFeatureAlignment.endBase = endBase #backup + thisFeatureAlignment.endBase = main_end_pos #update + main_feature_location = SeqFeature.FeatureLocation(main_start_pos-1,main_end_pos,strand=strandToOutput) except: pass + + coding_dna = Seq(str(finalResults.seq[thisFeatureAlignment.startBase -1 :thisFeatureAlignment.endBase]),IUPAC.unambiguous_dna) - coding_dna = Seq(str(finalResults.seq[thisFeatureAlignment.startBase:thisFeatureAlignment.endBase]),IUPAC.unambiguous_dna) if strandToOutput == 1: coding_dna_Translation = coding_dna.translate(table=translationTable) else: diff --git a/genbankOutput.pyc b/genbankOutput.pyc index 7bb8869..15a6663 100755 Binary files a/genbankOutput.pyc and b/genbankOutput.pyc differ diff --git a/geneChecker.pyc b/geneChecker.pyc index 56f8fd2..61b14d8 100755 Binary files a/geneChecker.pyc and b/geneChecker.pyc differ diff --git a/geneChecker_fasta.py b/geneChecker_fasta.py index e1105fc..3375196 100755 --- a/geneChecker_fasta.py +++ b/geneChecker_fasta.py @@ -145,12 +145,12 @@ def geneCheck(fastaReference, resultFile, cutoffEquality_prot, cutoffEquality_nu for hsp in qhit.hsps: #hsp object checking, this contains the alignment info featureName = qhit.id if float(str(hsp.ident_num)+".00")/float(str(hsp.aln_span)+".00")*100 >= float(cutoffEquality_prot): - if hsp.aln_span*3+3 >= alignCutOff: + if hsp.aln_span*3 >= alignCutOff: if featureName in listOfImportantFeatures: targetFeature = listOfImportantFeatures[featureName] - startBase = min(hsp.query_range[0],hsp.query_range[1]) + startBase = min(hsp.query_range[0],hsp.query_range[1])+1 endBase = max(hsp.query_range[0],hsp.query_range[1]) - alignLen = endBase - startBase + alignLen = (endBase-startBase)+1 if featureName in listOfPresentFeatures: mainFeatureName = featureName mainFeatureFound = listOfPresentFeatures[mainFeatureName] @@ -171,7 +171,7 @@ def geneCheck(fastaReference, resultFile, cutoffEquality_prot, cutoffEquality_nu alignment.frame = featureFrame alignment.startBase = startBase alignment.endBase = endBase - alignment.seqFound = refSeq.seq[startBase:endBase] + alignment.seqFound = refSeq.seq[startBase-1:endBase] listOfPresentFeatures[featureName] = (listOfImportantFeatures[qhit.id], alignment, featureFrame <= -1) else: @@ -184,13 +184,14 @@ def geneCheck(fastaReference, resultFile, cutoffEquality_prot, cutoffEquality_nu alignment.frame = featureFrame alignment.startBase = startBase alignment.endBase = endBase - alignment.seqFound = refSeq.seq[startBase:endBase] + alignment.seqFound = refSeq.seq[(startBase-1):endBase] listOfPresentFeatures[featureName] = (listOfImportantFeatures[qhit.id], alignment, featureFrame <= -1) if alignLen >= (len(targetFeature*3)+3) * 0.99: #if we've already built a lot, dont even bother with finding splits listOfCompleteGenes.append(featureName) break + #exit() #copying the blast result in order for this info to be assessed later if the user desires shutil.copyfile("important_features.blast.xml", out_blast+"_ref.cds.blast.xml") os.remove("important_features.blast.xml") @@ -275,12 +276,13 @@ def geneCheck(fastaReference, resultFile, cutoffEquality_prot, cutoffEquality_nu alignment.seqFound = refSeq.seq[startBase:endBase] listOfPresentFeatures[featureName] = (listOfImportantFeatures[featureName], alignment, featureFrame == -1) break + shutil.copyfile("important_features.blast.xml", out_blast+"_ref.blast.xml") shutil.copyfile("important_features.fasta", out_blast+"_ref.fasta") os.remove("important_features.blast.xml") os.remove("important_features.fasta") - + return (listOfPresentFeatures, listOfImportantFeatures, listOfSplits, listOfCompleteGenes) def createImageOfAnnotation(sequenceObject, outputFile): @@ -308,7 +310,7 @@ def createImageOfAnnotation(sequenceObject, outputFile): for gbkFeature in sequenceObject.features: if gbkFeature.type == 'tRNA' or gbkFeature.type == 'CDS' or gbkFeature.type == 'rRNA' or gbkFeature.type == 'D-loop': - featureLen = gbkFeature.location.end - gbkFeature.location.start + featureLen = (gbkFeature.location.end - gbkFeature.location.start) + 1 featureRelativeSize = horizontalSize * featureLen / len(sequenceObject.seq) featureRelativeStart = (horizontalSize * gbkFeature.location.start / len(sequenceObject.seq)) + 1 @@ -388,6 +390,7 @@ def createImageOfAnnotation(sequenceObject, outputFile): percent_equality_prot=sys.argv[8] percent_equality_nucl=sys.argv[9] genbank=sys.argv[10] + nWalk=sys.argv[11] if sys.argv[1] == '-h' or sys.argv[1] == '--help': print 'Usage: genbank_reference fasta_file output_file organism_type(integer, default=2) alignCutOff(float, default=45) coveCutOff(7)' print 'Only the first, second, and third arguments are required.' @@ -490,7 +493,7 @@ def createImageOfAnnotation(sequenceObject, outputFile): listOfFeaturesToOutput.sort() print 'Total features found after Arwen: ',len(listOfFeaturesToOutput) - finalResults = genbankOutput.genbankOutput(outputFile, resultFile, listOfFeaturesToOutput, False, 900) + finalResults = genbankOutput.genbankOutput(outputFile, resultFile, listOfFeaturesToOutput, False, 900, nWalk) with open(outputFile, "w") as outputResult: count = SeqIO.write(finalResults, outputResult, "genbank") diff --git a/mitofinder b/mitofinder index de66b11..a289171 100755 --- a/mitofinder +++ b/mitofinder @@ -1,5 +1,5 @@ #!/usr/bin/python -#Version: 1.0.2 +#Version: 1.1 #Authors: Allio Remi & Schomaker-Bastos Alex #ISEM - CNRS - LAMPADA - IBQM - UFRJ @@ -65,7 +65,7 @@ if __name__ == "__main__": parser.add_argument('--metaspades', help='Use MetaSPAdes for assembly. ', default=False, dest='metaspades', action='store_true') # parser.add_argument('--annotate', help='Use Mitofinder to annotate a given mitochondrial (-c) ', dest='Annotate', action='store_true') - parser.add_argument('-j', '--jobname', help = 'Job name to be used throughout the process', default="MitoFinder_run", dest='processName') + parser.add_argument('-j', '--jobname', help = 'Sequence ID to be used throughout the process', default="", dest='processName') parser.add_argument('-1', '--Paired-end1', help='File with forward paired-end reads', default="", dest='PE1') parser.add_argument('-2', '--Paired-end2', help='File with reverse paired-end reads', default="", dest='PE2') parser.add_argument('-s', '--Single-end', help='File with single-end reads', default="", dest='SE') @@ -77,11 +77,13 @@ if __name__ == "__main__": default=100, dest='shortestContig') parser.add_argument('-p', '--processors', help='Number of threads Mitofinder will use at most.', type=int, default=4, dest='processorsToUse') - parser.add_argument('-r', '--refseq', help='Reference mitochondrial genome.s in GenBank format (.gb). You can put several reference mitgenomes together.', + parser.add_argument('-r', '--refseq', help='Reference mitochondrial genome in GenBank format (.gb).', default="", dest='refSeqFile') parser.add_argument('-e', '--blaste', help='e-value of blast program used for contig identification and annotation. Default = 0.000001', type=float, default=0.000001, dest='blasteVal') - parser.add_argument('--out_gb', help='Create annotation output file in GenBank format.', default=False, dest='genbk', action='store_true') + parser.add_argument('-n', '--nwalk', help='Maximum number of codon steps to be tested on each size of the gene to find the start and stop codon during the annotation step. Default = 20 (60 bases)', type=int, + default=20, dest='nWalk') + parser.add_argument('--out_gb', help='Do not create annotation output file in GenBank format.', default=True, dest='genbk', action='store_false') parser.add_argument('--blastidentitynucl', help='Nucleotide identity percentage for a hit to be retained. Default = 50', default=50, type=float, dest='blastIdentityNucl') parser.add_argument('--blastidentityprot', help='Amino acid identity percentage for a hit to be retained. Default = 40', @@ -114,7 +116,7 @@ if __name__ == "__main__": 24. Pterobranchia Mitochondrial Code\ 25. Candidate Division SR1 and Gracilibacteria Code", type=int, default=1, dest='organismType') - parser.add_argument('-v', '--version', help="Version 1.0.2", default=False, dest='versionCheck', action='store_true') + parser.add_argument('-v', '--version', help="Version 1.1", default=False, dest='versionCheck', action='store_true') parser.add_argument('--example', help="Print getting started examples", default=False, dest='example', action='store_true') args = parser.parse_args() @@ -128,10 +130,20 @@ if __name__ == "__main__": exit() if args.versionCheck == True: - print "MitoFinder version 1.0.2" + print "MitoFinder version 1.1" exit() - - Logfile=args.processName+".log" + + if args.PE1 == "" and args.PE2 == "" and args.SE == "" and args.Assembly == "" : + print "\nERROR: Read or assembly files are not specified.\n Please, use -1 -2 -s or -a option to specify input data." + exit() + if args.refSeqFile == "": + print "\nERROR: Reference file is required (-r option)" + exit() + if args.processName == "": + print "\nERROR: SeqID is required (-j option)" + exit() + + Logfile=args.processName+"_MitoFinder.log" Logfile=os.path.join(initial_path,Logfile) logfile=open(Logfile,"w") logfile.write('Command line: %s' % ' '.join(sys.argv)+"\n") @@ -145,15 +157,7 @@ if __name__ == "__main__": print 'Now running MitoFinder ...\n' print "Start time : "+datetime.now().strftime("%Y-%m-%d %H:%M:%S")+'\n' - if args.PE1 == "" and args.PE2 == "" and args.SE == "" and args.Assembly == "" : - print "\nERROR: Read or assembly files are not specified.\n Please, use -1 -2 -s or -a option to specify input data." - logfile.write("\nERROR: Read or assembly files are not specified.\n Please, use -1 -2 -s or -a option to specify input data.\n") - exit() - if args.refSeqFile == "": - print "ERROR: Reference file is required (-r option)" - logfile.write("ERROR: Reference file is required (-r option)") - exit() - + print "Job name = "+args.processName args.refSeqFile=os.path.join(initial_path,args.refSeqFile) @@ -240,10 +244,6 @@ if __name__ == "__main__": logfile.write("\n") if pathToMegahitFolder.lower() == 'default' and args.idba == False and args.metaspades == False: pathToMegahitFolder = os.path.join(module_dir, 'megahit/') - command = 'make -C '+pathToMegahitFolder - args1 = shlex.split(command) - make = Popen(args1, stdout=open("log","w")) - make.wait() logfile.write('WARNING: Megahit is still set in its default folder. Change it in the config file if you encounter problems running this script.\n') print 'WARNING: Megahit is still set in its default folder. Change it in the config file if you encounter problems running this script.' @@ -795,8 +795,8 @@ if __name__ == "__main__": #creating some stat file: print "\n" - print "Creating summary statistics for "+args.processName+"_best_contig.fasta" - logfile.write("\n\n"+"Creating summary statistics for "+args.processName+"_best_contig.fasta\n") + print "Creating summary statistics for the mtDNA contig" + logfile.write("\n\n"+"Creating summary statistics for the mtDNA contig\n") finalResults = SeqIO.read(open(resultFile, 'rU'), "fasta", generic_dna) finalStatsFile = open(pathOfFinalResults + args.processName + '.stats', 'w') @@ -809,19 +809,25 @@ if __name__ == "__main__": else: finalStatsFile.write("Circularization: No\n") - shutil.copyfile(resultFile, pathOfFinalResults+"/"+args.processName+"_best_contig.fasta") + command = module_dir+"/rename_fasta_seqID.py " + args.processName + " " + resultFile + " " + pathOfFinalResults+"/"+args.processName+"_mtDNA_contig.fasta" + " " + str(1) + args1 = shlex.split(command) + rename = Popen(args1, stdout=open(os.devnull, 'wb')) + rename.wait() + + os.remove(resultFile) + #shutil.copyfile(resultFile, pathOfFinalResults+"/"+args.processName+"_mtDNA_contig.fasta") # Annotating with gene_checker print "" - print "Annotating mitochondrial contigs" + print "Annotating mitochondrial contig" print "" logfile.write("\nAnnotating\n\n") if recordCount > 1: #if more than 1 ref - if os.path.isfile(pathtowork+'/ref_for_best_contig.fasta') == True: - os.remove(pathtowork+'/ref_for_best_contig.fasta') + if os.path.isfile(pathtowork+'/ref_for_mtDNA_contig.fasta') == True: + os.remove(pathtowork+'/ref_for_mtDNA_contig.fasta') for line in open(pathtowork+"/genes_list"): if line.rstrip() != "rrnL" and line.rstrip() != "rrnS": @@ -831,7 +837,7 @@ if __name__ == "__main__": formatDB = Popen(args1, stdout=open(os.devnull, 'wb')) formatDB.wait() with open(pathtowork+'/'+gene+'_blast_out.txt','w') as BlastResultGene: - command = blastFolder+"/blastx -db " + "ref_" + gene+ "_database.fasta" + " -query "+ resultFile + " -evalue " + str(blasteVal) + " -outfmt 6" + " -query_gencode " + str(args.organismType) + " -seg no" + command = blastFolder+"/blastx -db " + "ref_" + gene+ "_database.fasta" + " -query "+ pathOfFinalResults+"/"+args.processName+"_mtDNA_contig.fasta" + " -evalue " + str(blasteVal) + " -outfmt 6" + " -query_gencode " + str(args.organismType) + " -seg no" args1 = shlex.split(command) blast = Popen(args1, stdout=BlastResultGene) blast.wait() @@ -842,7 +848,7 @@ if __name__ == "__main__": formatDB = Popen(args1, stdout=open(os.devnull, 'wb')) formatDB.wait() with open(pathtowork+'/'+gene+'_blast_out.txt','w') as BlastResultGene: - command = blastFolder+"/blastn -db " + "ref_" + gene+ "_database.fasta"+ " -query "+ resultFile + " -evalue " + str(blasteVal) + " -outfmt 6 -perc_identity " + str(args.blastIdentityNucl) + " -dust no" + command = blastFolder+"/blastn -db " + "ref_" + gene+ "_database.fasta"+ " -query "+ pathOfFinalResults+"/"+args.processName+"_mtDNA_contig.fasta" + " -evalue " + str(blasteVal) + " -outfmt 6 -perc_identity " + str(args.blastIdentityNucl) + " -dust no" args1 = shlex.split(command) blast = Popen(args1, stdout=BlastResultGene) blast.wait() @@ -861,7 +867,7 @@ if __name__ == "__main__": dico_query[testedGene]=query bestScore=score - refFile=open(pathtowork+'/ref_for_best_contig.fasta','a') + refFile=open(pathtowork+'/ref_for_mtDNA_contig.fasta','a') for cle, valeur in dico_query.items(): for name, seq in read_fasta(open(pathtowork+"/ref_"+gene+ "_database.fasta")): if name.replace(">","") == valeur: @@ -869,26 +875,26 @@ if __name__ == "__main__": refFile.close() - command = module_dir+"/geneChecker_fasta.py " + pathtowork + "/ref_for_best_contig.fasta" + " " + pathOfFinalResults + "/" + args.processName+"_best_contig.fasta" + " " + args.processName+"_mito.gb" + " " + str(args.organismType) + " " + str(args.aligncutoff) + " " + str(args.coveCutOff) + " " + str(blasteVal) + " " + str(args.blastIdentityProt) + " " + str(args.blastIdentityNucl) + " " + str(args.genbk) + command = module_dir+"/geneChecker_fasta.py " + pathtowork + "/ref_for_mtDNA_contig.fasta" + " " + pathOfFinalResults + "/" + args.processName+"_mtDNA_contig.fasta" + " " + args.processName+"_mtDNA_contig.gb" + " " + str(args.organismType) + " " + str(args.aligncutoff) + " " + str(args.coveCutOff) + " " + str(blasteVal) + " " + str(args.blastIdentityProt) + " " + str(args.blastIdentityNucl) + " " + str(args.genbk)+ " " + str(args.nWalk) args1 = shlex.split(command) fifthStep = Popen(args1, cwd=pathOfFinalResults, stdout=open('geneChecker.log', 'a'), stderr=open('geneChecker_erreur.log', 'a')) fifthStep.wait() else: - best_ref=open(pathtowork + "/ref_for_best_contig.fasta","w") + best_ref=open(pathtowork + "/ref_for_mtDNA_contig.fasta","w") for line in open(pathtowork+"/genes_list"): gene=line.rstrip() for name, seq in read_fasta(open(pathtowork+"/ref_"+gene+ "_database.fasta")): best_ref.write(name+"\n"+seq+"\n") best_ref.close() - command = module_dir+"/geneChecker_fasta.py " + pathtowork + "/ref_for_best_contig.fasta" + " " + pathOfFinalResults + "/" + args.processName+"_best_contig.fasta" + " " + args.processName+"_mito.gb" + " " + str(args.organismType) + " " + str(args.aligncutoff) + " " + str(args.coveCutOff) + " " + str(blasteVal) + " " + str(args.blastIdentityProt) + " " + str(args.blastIdentityNucl) + " " + str(args.genbk) + command = module_dir+"/geneChecker_fasta.py " + pathtowork + "/ref_for_mtDNA_contig.fasta" + " " + pathOfFinalResults + "/" + args.processName+"_mtDNA_contig.fasta" + " " + args.processName+"_mtDNA_contig.gb" + " " + str(args.organismType) + " " + str(args.aligncutoff) + " " + str(args.coveCutOff) + " " + str(blasteVal) + " " + str(args.blastIdentityProt) + " " + str(args.blastIdentityNucl) + " " + str(args.genbk)+ " " + str(args.nWalk) args1 = shlex.split(command) fifthStep = Popen(args1, cwd=pathOfFinalResults, stdout=open('geneChecker.log', 'w'), stderr=open('geneChecker_erreur.log', 'w')) fifthStep.wait() - test_arwen=pathOfFinalResults + "/" + args.processName+"_best_contig.arwen" + test_arwen=pathOfFinalResults + "/" + args.processName+"_mtDNA_contig.arwen" if os.path.isfile(test_arwen) == True: print "tRNA annotation with Arwen run well.\n" logfile.write("tRNA annotation with Arwen run well.\n\n") @@ -896,7 +902,7 @@ if __name__ == "__main__": print "ERROR: tRNA annotation failed.\nPlease check "+ pathtowork + "/geneChecker_erreur.log or geneChecker.log to see what happened\nAborting\n" logfile.write("ERROR: tRNA annotation failed.\nPlease check "+ pathtowork + "/geneChecker_erreur.log or geneChecker.log to see what happened\nAborting\n\n") exit() - test_gene_checker=pathOfFinalResults+args.processName+"_mito.gb" + test_gene_checker=pathOfFinalResults+args.processName+"_mtDNA_contig.gb" if os.path.isfile(test_gene_checker) == True: print "Annotation completed\n" logfile.write("Annotation completed\n\n") @@ -985,7 +991,7 @@ if __name__ == "__main__": for line in open(pathtowork+"/"+'contig_list.txt','r'): pathOfResult = pathtowork+"/"+args.processName+'_contig_'+str(c)+'.fasta' - resultFile = args.processName + '_partial_mito_'+str(c)+'.fasta' + resultFile = args.processName + '_mtDNA_contig_'+str(c)+'.fasta' with open(resultFile, "w") as outputResult: #create draft file to be checked and annotated finalResults = SeqIO.read(open(pathOfResult, 'rU'), "fasta", generic_dna) @@ -999,32 +1005,38 @@ if __name__ == "__main__": #creating some stat file: print "\n" - print "Creating summary statistics for Partial_mito_"+str(c) + print "Creating summary statistics for mtDNA contig "+str(c) print "" - logfile.write("\n\n"+"Creating summary statistics for Partial_mito_"+str(c)+"\n\n") + logfile.write("\n\n"+"Creating summary statistics for mtDNA contig "+str(c)+"\n\n") finalResults = SeqIO.read(open(resultFile, 'rU'), "fasta", generic_dna) - finalStatsFile = open(pathOfFinalResults + args.processName + '_partial_mito_'+str(c)+'.stats', 'w') + finalStatsFile = open(pathOfFinalResults + args.processName + '_mtDNA_contig_'+str(c)+'.stats', 'w') finalStatsFile.write('Statistics for contig '+str(c)+':\n\n') finalStatsFile.write('Length: ' + str(len(finalResults.seq)) + "\n") finalStatsFile.write('GC content: ' + ("{0:.2f}".format(SeqUtils.GC(finalResults.seq))) + '%\n') finalStatsFile.write("Circularization: No\n") - shutil.copyfile(resultFile, pathOfFinalResults+"/"+args.processName+"_partial_mito_"+str(c)+".fasta") + command = module_dir+"/rename_fasta_seqID.py " + args.processName + " " + resultFile + " " + pathOfFinalResults+"/"+args.processName+"_mtDNA_contig_"+str(c)+".fasta" + " " + str(c) + args1 = shlex.split(command) + rename = Popen(args1, stdout=open(os.devnull, 'wb')) + rename.wait() + + os.remove(resultFile) + #shutil.copyfile(resultFile, pathOfFinalResults+"/"+args.processName+"_mtDNA_contig_"+str(c)+".fasta") # creating best ref file for annotation - command = blastFolder+"/makeblastdb -in " + pathOfFinalResults+"/"+args.processName+"_partial_mito_"+str(c)+".fasta" + " -dbtype nucl" #need to formatdb refseq first + command = blastFolder+"/makeblastdb -in " + pathOfFinalResults+"/"+args.processName+"_mtDNA_contig_"+str(c)+".fasta" + " -dbtype nucl" #need to formatdb refseq first args1 = shlex.split(command) formatDB = Popen(args1, stdout=open(os.devnull, 'wb')) formatDB.wait() if recordCount > 1: #if more than 1 ref - print "Looking for best reference genes for Partial_mito_"+str(c) + print "Looking for best reference genes for mtDNA contig "+str(c) print "" - logfile.write("Looking for best reference genes for Partial_mito_"+str(c)+"\n\n") + logfile.write("Looking for best reference genes for mtDNA contig "+str(c)+"\n\n") if os.path.isfile(pathtowork+'/ref_for_contig_'+str(c)+'.fasta') == True: os.remove(pathtowork+'/ref_for_contig_'+str(c)+'.fasta') @@ -1037,7 +1049,7 @@ if __name__ == "__main__": formatDB = Popen(args1, stdout=open(os.devnull, 'wb')) formatDB.wait() with open(pathtowork+'/'+gene+'_blast_out.txt','w') as BlastResultGene: - command = blastFolder+"/blastx -db " + "ref_" + gene+ "_database.fasta" + " -query "+ pathOfFinalResults+"/"+args.processName+"_partial_mito_"+str(c)+".fasta" + " -evalue " + str(blasteVal) + " -outfmt 6" + " -query_gencode " + str(args.organismType) + " -seg no" + command = blastFolder+"/blastx -db " + "ref_" + gene+ "_database.fasta" + " -query "+ pathOfFinalResults+"/"+args.processName+"_mtDNA_contig_"+str(c)+".fasta" + " -evalue " + str(blasteVal) + " -outfmt 6" + " -query_gencode " + str(args.organismType) + " -seg no" args1 = shlex.split(command) blast = Popen(args1, stdout=BlastResultGene) blast.wait() @@ -1048,7 +1060,7 @@ if __name__ == "__main__": formatDB = Popen(args1, stdout=open(os.devnull, 'wb')) formatDB.wait() with open(pathtowork+'/'+gene+'_blast_out.txt','w') as BlastResultGene: - command = blastFolder+"/blastn -db " + "ref_" + gene+ "_database.fasta"+ " -query "+ pathOfFinalResults+"/"+args.processName+"_partial_mito_"+str(c)+".fasta" + " -evalue " + str(blasteVal) + " -outfmt 6 -perc_identity " + str(args.blastIdentityNucl) + " -dust no" + command = blastFolder+"/blastn -db " + "ref_" + gene+ "_database.fasta"+ " -query "+ pathOfFinalResults+"/"+args.processName+"_mtDNA_contig_"+str(c)+".fasta" + " -evalue " + str(blasteVal) + " -outfmt 6 -perc_identity " + str(args.blastIdentityNucl) + " -dust no" args1 = shlex.split(command) blast = Popen(args1, stdout=BlastResultGene) blast.wait() @@ -1075,9 +1087,9 @@ if __name__ == "__main__": refFile.close() - print "Annotating Partial_mito_"+str(c) + print "Annotating mtDNA contig "+str(c) print "" - logfile.write("Annotating Partial_mito_"+str(c)+"\n\n") + logfile.write("Annotating mtDNA contig "+str(c)+"\n\n") # Annotating with gene_checker """QRY_dico={} @@ -1102,14 +1114,14 @@ if __name__ == "__main__": tmp.write("LOCUS"+x) tmp.close() - command = module_dir+"/geneChecker.py " + "../best_query.gb" + " " + resultFile + " " + args.processName+"_partial_mito_"+str(c)+".gb" + " " + str(args.organismType) + " " + str(args.aligncutoff) + " " + str(args.coveCutOff) + command = module_dir+"/geneChecker.py " + "../best_query.gb" + " " + resultFile + " " + args.processName+"_mtDNA_contig_"+str(c)+".gb" + " " + str(args.organismType) + " " + str(args.aligncutoff) + " " + str(args.coveCutOff) args1 = shlex.split(command) fifthStep = Popen(args1, cwd=pathOfFinalResults, stdout=open('geneChecker.log', 'a'), stderr=open('geneChecker_erreur.log', 'a')) fifthStep.wait() """ if recordCount > 1: - command = module_dir+"/geneChecker_fasta.py " + pathtowork + "/ref_for_contig_" + str(c) + ".fasta" + " " + pathOfFinalResults+"/"+args.processName+"_partial_mito_"+str(c)+".fasta" + " " + args.processName+"_partial_mito_"+str(c)+".gb" + " " + str(args.organismType) + " " + str(args.aligncutoff) + " " + str(args.coveCutOff) + " " + str(blasteVal) + " " + str(args.blastIdentityProt) + " " + str(args.blastIdentityNucl) + " " + str(args.genbk) + command = module_dir+"/geneChecker_fasta.py " + pathtowork + "/ref_for_contig_" + str(c) + ".fasta" + " " + pathOfFinalResults+"/"+args.processName+"_mtDNA_contig_"+str(c)+".fasta" + " " + args.processName+"_mtDNA_contig_"+str(c)+".gb" + " " + str(args.organismType) + " " + str(args.aligncutoff) + " " + str(args.coveCutOff) + " " + str(blasteVal) + " " + str(args.blastIdentityProt) + " " + str(args.blastIdentityNucl) + " " + str(args.genbk)+ " " + str(args.nWalk) args1 = shlex.split(command) fifthStep = Popen(args1, cwd=pathOfFinalResults, stdout=open('geneChecker.log', 'a'), stderr=open('geneChecker_erreur.log', 'a')) fifthStep.wait() @@ -1123,28 +1135,28 @@ if __name__ == "__main__": best_ref.write(name+"\n"+seq+"\n") best_ref.close() - command = module_dir+"/geneChecker_fasta.py " + pathtowork + "/ref_for_contigs.fasta" + " " + pathOfFinalResults+"/"+args.processName+"_partial_mito_"+str(c)+".fasta" + " " + args.processName+"_partial_mito_"+str(c)+".gb" + " " + str(args.organismType) + " " + str(args.aligncutoff) + " " + str(args.coveCutOff) + " " + str(blasteVal) + " " + str(args.blastIdentityProt) + " " + str(args.blastIdentityNucl) + " " + str(args.genbk) + command = module_dir+"/geneChecker_fasta.py " + pathtowork + "/ref_for_contigs.fasta" + " " + pathOfFinalResults+"/"+args.processName+"_mtDNA_contig_"+str(c)+".fasta" + " " + args.processName+"_mtDNA_contig_"+str(c)+".gb" + " " + str(args.organismType) + " " + str(args.aligncutoff) + " " + str(args.coveCutOff) + " " + str(blasteVal) + " " + str(args.blastIdentityProt) + " " + str(args.blastIdentityNucl) + " " + str(args.genbk)+ " " + str(args.nWalk) args1 = shlex.split(command) fifthStep = Popen(args1, cwd=pathOfFinalResults, stdout=open('geneChecker.log', 'w'), stderr=open('geneChecker_erreur.log', 'w')) fifthStep.wait() - test_arwen=pathOfFinalResults+"/"+args.processName+"_partial_mito_"+str(c)+".arwen" + test_arwen=pathOfFinalResults+"/"+args.processName+"_mtDNA_contig_"+str(c)+".arwen" if os.path.isfile(test_arwen) == True: - print "tRNA annotation with Arwen run well for partial mito "+str(c)+".\n" - logfile.write("tRNA annotation with Arwen run well for partial mito "+str(c)+".\n"+"\n") + print "tRNA annotation with Arwen run well for mtDNA contig "+str(c)+".\n" + logfile.write("tRNA annotation with Arwen run well for mtDNA contig "+str(c)+".\n"+"\n") else: print "ERROR: tRNA annotation with Arwen failed\nPlease check "+ pathtowork + "/geneChecker_erreur.log or geneChecker.log to see what happened\nAborting\n" logfile.write("ERROR: tRNA annotation with Arwen failed\nPlease check "+ pathtowork + "/geneChecker_erreur.log or geneChecker.log to see what happened\nAborting\n\n") exit() - test_gene_checker=pathOfFinalResults+args.processName+"_partial_mito_"+str(c)+".gb" + test_gene_checker=pathOfFinalResults+args.processName+"_mtDNA_contig_"+str(c)+".gb" if os.path.isfile(test_gene_checker) == True: print "Annotation completed\n" logfile.write("Annotation completed\n"+"\n") else: - print "ERROR: Gene annotation failed for partial mito "+str(c)+".\nPlease check "+ pathtowork + "/geneChecker_error.log to see what happened\nAborting\n" - logfile.write("ERROR: Gene annotation failed for partial mito "+str(c)+".\nPlease check "+ pathtowork + "/geneChecker_error.log to see what happened\nAborting\n"+"\n") + print "ERROR: Gene annotation failed for mtDNA contig "+str(c)+".\nPlease check "+ pathtowork + "/geneChecker_error.log to see what happened\nAborting\n" + logfile.write("ERROR: Gene annotation failed for mtDNA contig "+str(c)+".\nPlease check "+ pathtowork + "/geneChecker_error.log to see what happened\nAborting\n"+"\n") exit() c+=1 @@ -1153,8 +1165,8 @@ if __name__ == "__main__": else: if recordCount > 1: #if more than 1 ref - if os.path.isfile(pathtowork+'/ref_for_best_contig.fasta') == True: - os.remove(pathtowork+'/ref_for_best_contig.fasta') + if os.path.isfile(pathtowork+'/ref_for_mtDNA_contig.fasta') == True: + os.remove(pathtowork+'/ref_for_mtDNA_contig.fasta') for line in open(pathtowork+"/genes_list"): if line.rstrip() != "rrnL" and line.rstrip() != "rrnS": @@ -1194,7 +1206,7 @@ if __name__ == "__main__": dico_query[testedGene]=query bestScore=score - refFile=open(pathtowork+'/ref_for_best_contig.fasta','a') + refFile=open(pathtowork+'/ref_for_mtDNA_contig.fasta','a') for cle, valeur in dico_query.items(): for name, seq in read_fasta(open(pathtowork+"/ref_"+gene+ "_database.fasta")): if name.replace(">","") == valeur: @@ -1202,7 +1214,7 @@ if __name__ == "__main__": refFile.close() - command = module_dir+"/geneChecker_fasta.py " + pathtowork + "/ref_for_best_contig.fasta" + " " + args.CONTIG + " " + args.processName+"_mito.gb" + " " + str(args.organismType) + " " + str(args.aligncutoff) + " " + str(args.coveCutOff) + " " + str(blasteVal) + " " + str(args.blastIdentityProt) + " " + str(args.blastIdentityNucl) + " " + str(args.genbk) + command = module_dir+"/geneChecker_fasta.py " + pathtowork + "/ref_for_mtDNA_contig.fasta" + " " + args.CONTIG + " " + args.processName+"_mtDNA_contig.gb" + " " + str(args.organismType) + " " + str(args.aligncutoff) + " " + str(args.coveCutOff) + " " + str(blasteVal) + " " + str(args.blastIdentityProt) + " " + str(args.blastIdentityNucl) + " " + str(args.genbk)+ " " + str(args.nWalk) args1 = shlex.split(command) fifthStep = Popen(args1, cwd=pathOfFinalResults, stdout=open('geneChecker.log', 'a'), stderr=open('geneChecker_erreur.log', 'a')) fifthStep.wait() @@ -1211,14 +1223,14 @@ if __name__ == "__main__": print "Running Annotation\n" logfile.write("Running Annotation\n"+"\n") - best_ref=open(pathtowork + "/ref_for_best_contig.fasta","w") + best_ref=open(pathtowork + "/ref_for_mtDNA_contig.fasta","w") for line in open(pathtowork+"/genes_list"): gene=line.rstrip() for name, seq in read_fasta(open(pathtowork+"/ref_"+gene+ "_database.fasta")): best_ref.write(name+"\n"+seq+"\n") best_ref.close() - command = module_dir+"/geneChecker_fasta.py " + pathtowork + "/ref_for_best_contig.fasta" + " " + args.CONTIG + " " + args.processName+"_mito.gb" + " " + str(args.organismType) + " " + str(args.aligncutoff) + " " + str(args.coveCutOff) + " " + str(blasteVal) + " " + str(args.blastIdentityProt) + " " + str(args.blastIdentityNucl) + " " + str(args.genbk) + command = module_dir+"/geneChecker_fasta.py " + pathtowork + "/ref_for_mtDNA_contig.fasta" + " " + args.CONTIG + " " + args.processName+"_mtDNA_contig.gb" + " " + str(args.organismType) + " " + str(args.aligncutoff) + " " + str(args.coveCutOff) + " " + str(blasteVal) + " " + str(args.blastIdentityProt) + " " + str(args.blastIdentityNucl) + " " + str(args.genbk)+ " " + str(args.nWalk) args1 = shlex.split(command) fifthStep = Popen(args1, cwd=pathOfFinalResults,stdout=open('geneChecker.log', 'w'), stderr=open('geneChecker_erreur.log', 'w')) fifthStep.wait() @@ -1232,7 +1244,7 @@ if __name__ == "__main__": print "ERROR: tRNA annotation with Arwen failed\nPlease check "+ pathtowork + "/geneChecker_erreur.log or geneChecker.log to see what happened\nAborting\n" logfile.write( "ERROR: tRNA annotation with Arwen failed\nPlease check "+ pathtowork + "/geneChecker_erreur.log or geneChecker.log to see what happened\nAborting\n"+"\n") exit() - test_gene_checker=pathOfFinalResults+args.processName+"_mito.gb" + test_gene_checker=pathOfFinalResults+args.processName+"_mtDNA_contig.gb" if os.path.isfile(test_gene_checker) == True: print "Annotation is done\n" logfile.write("Annotation is done\n"+"\n") @@ -1275,11 +1287,19 @@ if __name__ == "__main__": logfile.write("\n") #sort gff - for f in glob.glob(pathOfFinalResults+"/*_raw.gff"): - command = module_dir+"/sort_gff.py " + f - args1 = shlex.split(command) - sort_gff = Popen(args1, cwd=pathOfFinalResults, stdout=open('geneChecker.log', 'a'), stderr=open('geneChecker_erreur.log', 'a')) - sort_gff.wait() + pathlist = glob.glob(pathOfFinalResults+"/*.gb") + if len(pathlist) == 1 : + for f in glob.glob(pathOfFinalResults+"/*_raw.gff"): + command = module_dir+"/sort_gff.py " + f + " " + args.processName +".1 " + str(args.organismType) + args1 = shlex.split(command) + sort_gff = Popen(args1, cwd=pathOfFinalResults, stdout=open('geneChecker.log', 'a'), stderr=open('geneChecker_erreur.log', 'a')) + sort_gff.wait() + else: + for f in glob.glob(pathOfFinalResults+"/*_raw.gff"): + command = module_dir+"/sort_gff.py " + f + " " + args.processName +"."+ f.split("_raw")[0][-1] + " " + str(args.organismType) + args1 = shlex.split(command) + sort_gff = Popen(args1, cwd=pathOfFinalResults, stdout=open('geneChecker.log', 'a'), stderr=open('geneChecker_erreur.log', 'a')) + sort_gff.wait() #check genes (doublon ?) @@ -1339,7 +1359,9 @@ if __name__ == "__main__": os.remove(f) shutil.copy(pathOfFinalResults+"Arwen.log", tmpfiles+"/") os.remove(pathOfFinalResults+"Arwen.log") - + for f in glob.glob(pathOfFinalResults+"*_raw.gff"): + shutil.copy(f, tmpfiles+"/") + os.remove(f) for f in glob.glob(pathtowork+"/*blast*"): shutil.copy(f, tmpfiles+"/") os.remove(f) diff --git a/rename_fasta_seqID.py b/rename_fasta_seqID.py new file mode 100755 index 0000000..2ecb989 --- /dev/null +++ b/rename_fasta_seqID.py @@ -0,0 +1,23 @@ +#! /usr/bin/env python + +import sys +import os.path + +def read_fasta(fp): + name, seq = None, [] + for line in fp: + line = line.rstrip() + if line.startswith(">"): + if name: yield (name, ''.join(seq)) + name, seq = line, [] + else: + seq.append(line) + if name: yield (name, ''.join(seq)) + +seqID=sys.argv[1] +fasta = open(sys.argv[2]) +fout = open(sys.argv[3], 'w') +c=sys.argv[4] + +for name, seq in read_fasta(fasta): + fout.write(">"+seqID+"."+str(c)+"\n"+seq+"\n") diff --git a/runIDBA.pyc b/runIDBA.pyc index cb78398..9c95e44 100755 Binary files a/runIDBA.pyc and b/runIDBA.pyc differ diff --git a/runMegahit.pyc b/runMegahit.pyc index 409f2bc..8af34e8 100755 Binary files a/runMegahit.pyc and b/runMegahit.pyc differ diff --git a/runMetaspades.pyc b/runMetaspades.pyc index 56d017e..e810c5e 100755 Binary files a/runMetaspades.pyc and b/runMetaspades.pyc differ diff --git a/sort_gff.py b/sort_gff.py index ed76419..cb99c02 100755 --- a/sort_gff.py +++ b/sort_gff.py @@ -5,13 +5,29 @@ import operator import collections +def read_fasta(fp): + name, seq = None, [] + for line in fp: + line = line.rstrip() + if line.startswith(">"): + if name: yield (name, ''.join(seq)) + name, seq = line, [] + else: + seq.append(line) + if name: yield (name, ''.join(seq)) + file1=open(sys.argv[1]) +fasta=open(sys.argv[1].split("_raw.gff")[0]+".fasta") +seqID=sys.argv[2] dico={} dicotrna={} dicof={} c=0 +for name, seq in read_fasta(fasta): + length=len(seq) + for line in open(sys.argv[1]): dico[line.rstrip().split("\t")[8]]=line.rstrip() @@ -45,14 +61,399 @@ sorted_x = sorted(dicof.items(), key=operator.itemgetter(0)) sorted_dict = collections.OrderedDict(sorted_x) -fout=sys.argv[1].split("_raw.gff")[0]+"_filtered.gff" -fout=open(fout, "w") +gout=sys.argv[1].split("_raw.gff")[0]+"_final.gff" +gout=open(gout, "w") +gout.write(seqID+"\t"+"mitofinder\tsource\t1\t"+str(length)+"\t.\t+\t.\tName=source "+seqID.replace("_"," ")+"\n") +tout=sys.argv[1].split("_raw.gff")[0]+"_final.tbl" +tout=open(tout, "w") +tout.write(">Feature "+seqID+"\n") +#tout.write("1\t"+str(length)+"\tREFERENCE\n") +#tout.write("\t\t\tMitoFinder\txxxxxx\n") for k, v in sorted_dict.items(): if not dicotrna.has_key(v.split("\t")[8]): - fout.write(v+"\n") - + col1=seqID + col2=v.split("\t")[1] + col3=v.split("\t")[2] + col4=v.split("\t")[3] + col5=v.split("\t")[4] + col6=v.split("\t")[5] + col7=v.split("\t")[6] + col8=v.split("\t")[7] + col9=v.split("\t")[8].rstrip() + if "COX1" == col9: + size=int(col5)-int(col4)+1 + if size%3 == 0: + ext="" + tout.write(col4+"\t"+col5+"\t"+"gene\n") + tout.write("\t\t\tgene\t"+col9+"\n") + tout.write(col4+"\t"+col5+"\t"+"CDS\n") + tout.write("\t\t\tproduct\tcytochrome c oxidase subunit I\n") + tout.write("\t\t\ttransl_table\t"+sys.argv[3]+"\n") + else: + ext=" Note: Not a multiple of 3" + if col7 == "+": + tout.write(col4+"\t"+col5+">"+"\t"+"gene\n") + tout.write("\t\t\tgene\t"+col9+"\n") + tout.write(col4+"\t"+col5+">"+"\t"+"CDS\n") + tout.write("\t\t\tproduct\tcytochrome c oxidase subunit I\n") + tout.write("\t\t\ttransl_table\t"+sys.argv[3]+"\n") + tout.write("\t\t\tNote\tPartial sequence\n") + if col7 == "-": + tout.write("<"+col4+"\t"+col5+"\t"+"gene\n") + tout.write("\t\t\tgene\t"+col9+"\n") + tout.write("<"+col4+"\t"+col5+"\t"+"CDS\n") + tout.write("\t\t\tproduct\tcytochrome c oxidase subunit I\n") + tout.write("\t\t\ttransl_table\t"+sys.argv[3]+"\n") + tout.write("\t\t\tNote\tPartial sequence\n") + gout.write(col1+"\t"+col2+"\t"+"gene"+"\t"+col4+"\t"+col5+"\t"+col6+"\t"+col7+"\t"+col8+"\t"+"Name="+col9+" gene"+ext+"\n") + gout.write(col1+"\t"+col2+"\t"+"CDS"+"\t"+col4+"\t"+col5+"\t"+col6+"\t"+col7+"\t"+col8+"\t"+"Name="+col9+" CDS"+ext+"\n") + + if "COX2" == col9: + size=int(col5)-int(col4)+1 + if size%3 == 0: + ext="" + tout.write(col4+"\t"+col5+"\t"+"gene\n") + tout.write("\t\t\tgene\t"+col9+"\n") + tout.write(col4+"\t"+col5+"\t"+"CDS\n") + tout.write("\t\t\tproduct\tcytochrome c oxidase subunit II\n") + tout.write("\t\t\ttransl_table\t"+sys.argv[3]+"\n") + else: + ext=" Note: Not a multiple of 3" + if col7 == "+": + tout.write(col4+"\t"+col5+">"+"\t"+"gene\n") + tout.write("\t\t\tgene\t"+col9+"\n") + tout.write(col4+"\t"+col5+">"+"\t"+"CDS\n") + tout.write("\t\t\tproduct\tcytochrome c oxidase subunit II\n") + tout.write("\t\t\ttransl_table\t"+sys.argv[3]+"\n") + tout.write("\t\t\tNote\tPartial sequence\n") + if col7 == "-": + tout.write("<"+col4+"\t"+col5+"\t"+"gene\n") + tout.write("\t\t\tgene\t"+col9+"\n") + tout.write("<"+col4+"\t"+col5+"\t"+"CDS\n") + tout.write("\t\t\tproduct\tcytochrome c oxidase subunit II\n") + tout.write("\t\t\ttransl_table\t"+sys.argv[3]+"\n") + tout.write("\t\t\tNote\tPartial sequence\n") + gout.write(col1+"\t"+col2+"\t"+"gene"+"\t"+col4+"\t"+col5+"\t"+col6+"\t"+col7+"\t"+col8+"\t"+"Name="+col9+" gene"+ext+"\n") + gout.write(col1+"\t"+col2+"\t"+"CDS"+"\t"+col4+"\t"+col5+"\t"+col6+"\t"+col7+"\t"+col8+"\t"+"Name="+col9+" CDS"+ext+"\n") + + if "COX3" == col9: + size=int(col5)-int(col4)+1 + if size%3 == 0: + ext="" + tout.write(col4+"\t"+col5+"\t"+"gene\n") + tout.write("\t\t\tgene\t"+col9+"\n") + tout.write(col4+"\t"+col5+"\t"+"CDS\n") + tout.write("\t\t\tproduct\tcytochrome c oxidase subunit III\n") + tout.write("\t\t\ttransl_table\t"+sys.argv[3]+"\n") + else: + ext=" Note: Not a multiple of 3" + if col7 == "+": + tout.write(col4+"\t"+col5+">"+"\t"+"gene\n") + tout.write("\t\t\tgene\t"+col9+"\n") + tout.write(col4+"\t"+col5+">"+"\t"+"CDS\n") + tout.write("\t\t\tproduct\tcytochrome c oxidase subunit III\n") + tout.write("\t\t\ttransl_table\t"+sys.argv[3]+"\n") + tout.write("\t\t\tNote\tPartial sequence\n") + if col7 == "-": + tout.write("<"+col4+"\t"+col5+"\t"+"gene\n") + tout.write("\t\t\tgene\t"+col9+"\n") + tout.write("<"+col4+"\t"+col5+"\t"+"CDS\n") + tout.write("\t\t\tproduct\tcytochrome c oxidase subunit III\n") + tout.write("\t\t\ttransl_table\t"+sys.argv[3]+"\n") + tout.write("\t\t\tNote\tPartial sequence\n") + gout.write(col1+"\t"+col2+"\t"+"gene"+"\t"+col4+"\t"+col5+"\t"+col6+"\t"+col7+"\t"+col8+"\t"+"Name="+col9+" gene"+ext+"\n") + gout.write(col1+"\t"+col2+"\t"+"CDS"+"\t"+col4+"\t"+col5+"\t"+col6+"\t"+col7+"\t"+col8+"\t"+"Name="+col9+" CDS"+ext+"\n") + if "ND1" == col9: + size=int(col5)-int(col4)+1 + if size%3 == 0: + ext="" + tout.write(col4+"\t"+col5+"\t"+"gene\n") + tout.write("\t\t\tgene\t"+col9+"\n") + tout.write(col4+"\t"+col5+"\t"+"CDS\n") + tout.write("\t\t\tproduct\tNADH dehydrogenase subunit 1\n") + tout.write("\t\t\ttransl_table\t"+sys.argv[3]+"\n") + else: + ext=" Note: Not a multiple of 3" + if col7 == "+": + tout.write(col4+"\t"+col5+">"+"\t"+"gene\n") + tout.write("\t\t\tgene\t"+col9+"\n") + tout.write(col4+"\t"+col5+">"+"\t"+"CDS\n") + tout.write("\t\t\tproduct\tNADH dehydrogenase subunit 1\n") + tout.write("\t\t\ttransl_table\t"+sys.argv[3]+"\n") + tout.write("\t\t\tNote\tPartial sequence\n") + if col7 == "-": + tout.write("<"+col4+"\t"+col5+"\t"+"gene\n") + tout.write("\t\t\tgene\t"+col9+"\n") + tout.write("<"+col4+"\t"+col5+"\t"+"CDS\n") + tout.write("\t\t\tproduct\tNADH dehydrogenase subunit 1\n") + tout.write("\t\t\ttransl_table\t"+sys.argv[3]+"\n") + tout.write("\t\t\tNote\tPartial sequence\n") + gout.write(col1+"\t"+col2+"\t"+"gene"+"\t"+col4+"\t"+col5+"\t"+col6+"\t"+col7+"\t"+col8+"\t"+"Name="+col9+" gene"+ext+"\n") + gout.write(col1+"\t"+col2+"\t"+"CDS"+"\t"+col4+"\t"+col5+"\t"+col6+"\t"+col7+"\t"+col8+"\t"+"Name="+col9+" CDS"+ext+"\n") + + if "ND2" == col9: + size=int(col5)-int(col4)+1 + if size%3 == 0: + ext="" + tout.write(col4+"\t"+col5+"\t"+"gene\n") + tout.write("\t\t\tgene\t"+col9+"\n") + tout.write(col4+"\t"+col5+"\t"+"CDS\n") + tout.write("\t\t\tproduct\tNADH dehydrogenase subunit 2\n") + tout.write("\t\t\ttransl_table\t"+sys.argv[3]+"\n") + else: + ext=" Note: Not a multiple of 3" + if col7 == "+": + tout.write(col4+"\t"+col5+">"+"\t"+"gene\n") + tout.write("\t\t\tgene\t"+col9+"\n") + tout.write(col4+"\t"+col5+">"+"\t"+"CDS\n") + tout.write("\t\t\tproduct\tNADH dehydrogenase subunit 2\n") + tout.write("\t\t\ttransl_table\t"+sys.argv[3]+"\n") + tout.write("\t\t\tNote\tPartial sequence\n") + if col7 == "-": + tout.write("<"+col4+"\t"+col5+"\t"+"gene\n") + tout.write("\t\t\tgene\t"+col9+"\n") + tout.write("<"+col4+"\t"+col5+"\t"+"CDS\n") + tout.write("\t\t\tproduct\tNADH dehydrogenase subunit 2\n") + tout.write("\t\t\ttransl_table\t"+sys.argv[3]+"\n") + tout.write("\t\t\tNote\tPartial sequence\n") + gout.write(col1+"\t"+col2+"\t"+"gene"+"\t"+col4+"\t"+col5+"\t"+col6+"\t"+col7+"\t"+col8+"\t"+"Name="+col9+" gene"+ext+"\n") + gout.write(col1+"\t"+col2+"\t"+"CDS"+"\t"+col4+"\t"+col5+"\t"+col6+"\t"+col7+"\t"+col8+"\t"+"Name="+col9+" CDS"+ext+"\n") + + if "ND3" == col9: + size=int(col5)-int(col4)+1 + if size%3 == 0: + ext="" + tout.write(col4+"\t"+col5+"\t"+"gene\n") + tout.write("\t\t\tgene\t"+col9+"\n") + tout.write(col4+"\t"+col5+"\t"+"CDS\n") + tout.write("\t\t\tproduct\tNADH dehydrogenase subunit 3\n") + tout.write("\t\t\ttransl_table\t"+sys.argv[3]+"\n") + else: + ext=" Note: Not a multiple of 3" + if col7 == "+": + tout.write(col4+"\t"+col5+">"+"\t"+"gene\n") + tout.write("\t\t\tgene\t"+col9+"\n") + tout.write(col4+"\t"+col5+">"+"\t"+"CDS\n") + tout.write("\t\t\tproduct\tNADH dehydrogenase subunit 3\n") + tout.write("\t\t\ttransl_table\t"+sys.argv[3]+"\n") + tout.write("\t\t\tNote\tPartial sequence\n") + if col7 == "-": + tout.write("<"+col4+"\t"+col5+"\t"+"gene\n") + tout.write("\t\t\tgene\t"+col9+"\n") + tout.write("<"+col4+"\t"+col5+"\t"+"CDS\n") + tout.write("\t\t\tproduct\tNADH dehydrogenase subunit 3\n") + tout.write("\t\t\ttransl_table\t"+sys.argv[3]+"\n") + tout.write("\t\t\tNote\tPartial sequence\n") + gout.write(col1+"\t"+col2+"\t"+"gene"+"\t"+col4+"\t"+col5+"\t"+col6+"\t"+col7+"\t"+col8+"\t"+"Name="+col9+" gene"+ext+"\n") + gout.write(col1+"\t"+col2+"\t"+"CDS"+"\t"+col4+"\t"+col5+"\t"+col6+"\t"+col7+"\t"+col8+"\t"+"Name="+col9+" CDS"+ext+"\n") + + if "ND4" == col9: + size=int(col5)-int(col4)+1 + if size%3 == 0: + ext="" + tout.write(col4+"\t"+col5+"\t"+"gene\n") + tout.write("\t\t\tgene\t"+col9+"\n") + tout.write(col4+"\t"+col5+"\t"+"CDS\n") + tout.write("\t\t\tproduct\tNADH dehydrogenase subunit 4\n") + tout.write("\t\t\ttransl_table\t"+sys.argv[3]+"\n") + else: + ext=" Note: Not a multiple of 3" + if col7 == "+": + tout.write(col4+"\t"+col5+">"+"\t"+"gene\n") + tout.write("\t\t\tgene\t"+col9+"\n") + tout.write(col4+"\t"+col5+">"+"\t"+"CDS\n") + tout.write("\t\t\tproduct\tNADH dehydrogenase subunit 4\n") + tout.write("\t\t\ttransl_table\t"+sys.argv[3]+"\n") + tout.write("\t\t\tNote\tPartial sequence\n") + if col7 == "-": + tout.write("<"+col4+"\t"+col5+"\t"+"gene\n") + tout.write("\t\t\tgene\t"+col9+"\n") + tout.write("<"+col4+"\t"+col5+"\t"+"CDS\n") + tout.write("\t\t\tproduct\tNADH dehydrogenase subunit 4\n") + tout.write("\t\t\ttransl_table\t"+sys.argv[3]+"\n") + tout.write("\t\t\tNote\tPartial sequence\n") + gout.write(col1+"\t"+col2+"\t"+"gene"+"\t"+col4+"\t"+col5+"\t"+col6+"\t"+col7+"\t"+col8+"\t"+"Name="+col9+" gene"+ext+"\n") + gout.write(col1+"\t"+col2+"\t"+"CDS"+"\t"+col4+"\t"+col5+"\t"+col6+"\t"+col7+"\t"+col8+"\t"+"Name="+col9+" CDS"+ext+"\n") + + if "ND4L" == col9: + size=int(col5)-int(col4)+1 + if size%3 == 0: + ext="" + tout.write(col4+"\t"+col5+"\t"+"gene\n") + tout.write("\t\t\tgene\t"+col9+"\n") + tout.write(col4+"\t"+col5+"\t"+"CDS\n") + tout.write("\t\t\tproduct\tNADH dehydrogenase subunit 4L\n") + tout.write("\t\t\ttransl_table\t"+sys.argv[3]+"\n") + else: + ext=" Note: Not a multiple of 3" + if col7 == "+": + tout.write(col4+"\t"+col5+">"+"\t"+"gene\n") + tout.write("\t\t\tgene\t"+col9+"\n") + tout.write(col4+"\t"+col5+">"+"\t"+"CDS\n") + tout.write("\t\t\tproduct\tNADH dehydrogenase subunit 4L\n") + tout.write("\t\t\ttransl_table\t"+sys.argv[3]+"\n") + tout.write("\t\t\tNote\tPartial sequence\n") + if col7 == "-": + tout.write("<"+col4+"\t"+col5+"\t"+"gene\n") + tout.write("\t\t\tgene\t"+col9+"\n") + tout.write("<"+col4+"\t"+col5+"\t"+"CDS\n") + tout.write("\t\t\tproduct\tNADH dehydrogenase subunit 4L\n") + tout.write("\t\t\ttransl_table\t"+sys.argv[3]+"\n") + tout.write("\t\t\tNote\tPartial sequence\n") + gout.write(col1+"\t"+col2+"\t"+"gene"+"\t"+col4+"\t"+col5+"\t"+col6+"\t"+col7+"\t"+col8+"\t"+"Name="+col9+" gene"+ext+"\n") + gout.write(col1+"\t"+col2+"\t"+"CDS"+"\t"+col4+"\t"+col5+"\t"+col6+"\t"+col7+"\t"+col8+"\t"+"Name="+col9+" CDS"+ext+"\n") + + if "ND5" == col9: + size=int(col5)-int(col4)+1 + if size%3 == 0: + ext="" + tout.write(col4+"\t"+col5+"\t"+"gene\n") + tout.write("\t\t\tgene\t"+col9+"\n") + tout.write(col4+"\t"+col5+"\t"+"CDS\n") + tout.write("\t\t\tproduct\tNADH dehydrogenase subunit 5\n") + tout.write("\t\t\ttransl_table\t"+sys.argv[3]+"\n") + else: + ext=" Note: Not a multiple of 3" + if col7 == "+": + tout.write(col4+"\t"+col5+">"+"\t"+"gene\n") + tout.write("\t\t\tgene\t"+col9+"\n") + tout.write(col4+"\t"+col5+">"+"\t"+"CDS\n") + tout.write("\t\t\tproduct\tNADH dehydrogenase subunit 5\n") + tout.write("\t\t\ttransl_table\t"+sys.argv[3]+"\n") + tout.write("\t\t\tNote\tPartial sequence\n") + if col7 == "-": + tout.write("<"+col4+"\t"+col5+"\t"+"gene\n") + tout.write("\t\t\tgene\t"+col9+"\n") + tout.write("<"+col4+"\t"+col5+"\t"+"CDS\n") + tout.write("\t\t\tproduct\tNADH dehydrogenase subunit 5\n") + tout.write("\t\t\ttransl_table\t"+sys.argv[3]+"\n") + tout.write("\t\t\tNote\tPartial sequence\n") + gout.write(col1+"\t"+col2+"\t"+"gene"+"\t"+col4+"\t"+col5+"\t"+col6+"\t"+col7+"\t"+col8+"\t"+"Name="+col9+" gene"+ext+"\n") + gout.write(col1+"\t"+col2+"\t"+"CDS"+"\t"+col4+"\t"+col5+"\t"+col6+"\t"+col7+"\t"+col8+"\t"+"Name="+col9+" CDS"+ext+"\n") + + if "CYTB" == col9: + size=int(col5)-int(col4)+1 + if size%3 == 0: + ext="" + tout.write(col4+"\t"+col5+"\t"+"gene\n") + tout.write("\t\t\tgene\t"+col9+"\n") + tout.write(col4+"\t"+col5+"\t"+"CDS\n") + tout.write("\t\t\tproduct\tcytochrome b\n") + tout.write("\t\t\ttransl_table\t"+sys.argv[3]+"\n") + else: + ext=" Note: Not a multiple of 3" + if col7 == "+": + tout.write(col4+"\t"+col5+">"+"\t"+"gene\n") + tout.write("\t\t\tgene\t"+col9+"\n") + tout.write(col4+"\t"+col5+">"+"\t"+"CDS\n") + tout.write("\t\t\tproduct\tcytochrome b\n") + tout.write("\t\t\ttransl_table\t"+sys.argv[3]+"\n") + tout.write("\t\t\tNote\tPartial sequence\n") + if col7 == "-": + tout.write("<"+col4+"\t"+col5+"\t"+"gene\n") + tout.write("\t\t\tgene\t"+col9+"\n") + tout.write("<"+col4+"\t"+col5+"\t"+"CDS\n") + tout.write("\t\t\tproduct\tcytochrome b\n") + tout.write("\t\t\ttransl_table\t"+sys.argv[3]+"\n") + tout.write("\t\t\tNote\tPartial sequence\n") + gout.write(col1+"\t"+col2+"\t"+"gene"+"\t"+col4+"\t"+col5+"\t"+col6+"\t"+col7+"\t"+col8+"\t"+"Name="+col9+" gene"+ext+"\n") + gout.write(col1+"\t"+col2+"\t"+"CDS"+"\t"+col4+"\t"+col5+"\t"+col6+"\t"+col7+"\t"+col8+"\t"+"Name="+col9+" CDS"+ext+"\n") + + if "ATP8" == col9: + size=int(col5)-int(col4)+1 + if size%3 == 0: + ext="" + tout.write(col4+"\t"+col5+"\t"+"gene\n") + tout.write("\t\t\tgene\t"+col9+"\n") + tout.write(col4+"\t"+col5+"\t"+"CDS\n") + tout.write("\t\t\tproduct\tATP synthase F0 subunit 8\n") + tout.write("\t\t\ttransl_table\t"+sys.argv[3]+"\n") + else: + ext=" Note: Not a multiple of 3" + if col7 == "+": + tout.write(col4+"\t"+col5+">"+"\t"+"gene\n") + tout.write("\t\t\tgene\t"+col9+"\n") + tout.write(col4+"\t"+col5+">"+"\t"+"CDS\n") + tout.write("\t\t\tproduct\tATP synthase F0 subunit 8\n") + tout.write("\t\t\ttransl_table\t"+sys.argv[3]+"\n") + tout.write("\t\t\tNote\tPartial sequence\n") + if col7 == "-": + tout.write("<"+col4+"\t"+col5+"\t"+"gene\n") + tout.write("\t\t\tgene\t"+col9+"\n") + tout.write("<"+col4+"\t"+col5+"\t"+"CDS\n") + tout.write("\t\t\tproduct\tATP synthase F0 subunit 8\n") + tout.write("\t\t\ttransl_table\t"+sys.argv[3]+"\n") + tout.write("\t\t\tNote\tPartial sequence\n") + gout.write(col1+"\t"+col2+"\t"+"gene"+"\t"+col4+"\t"+col5+"\t"+col6+"\t"+col7+"\t"+col8+"\t"+"Name="+col9+" gene"+ext+"\n") + gout.write(col1+"\t"+col2+"\t"+"CDS"+"\t"+col4+"\t"+col5+"\t"+col6+"\t"+col7+"\t"+col8+"\t"+"Name="+col9+" CDS"+ext+"\n") + + if "ATP6" == col9: + size=int(col5)-int(col4)+1 + if size%3 == 0: + ext="" + tout.write(col4+"\t"+col5+"\t"+"gene\n") + tout.write("\t\t\tgene\t"+col9+"\n") + tout.write(col4+"\t"+col5+"\t"+"CDS\n") + tout.write("\t\t\tproduct\tATP synthase F0 subunit 6\n") + tout.write("\t\t\ttransl_table\t"+sys.argv[3]+"\n") + else: + ext=" Note: Not a multiple of 3" + if col7 == "+": + tout.write(col4+"\t"+col5+">"+"\t"+"gene\n") + tout.write("\t\t\tgene\t"+col9+"\n") + tout.write(col4+"\t"+col5+">"+"\t"+"CDS\n") + tout.write("\t\t\tproduct\tATP synthase F0 subunit 6\n") + tout.write("\t\t\ttransl_table\t"+sys.argv[3]+"\n") + tout.write("\t\t\tNote\tPartial sequence\n") + if col7 == "-": + tout.write("<"+col4+"\t"+col5+"\t"+"gene\n") + tout.write("\t\t\tgene\t"+col9+"\n") + tout.write("<"+col4+"\t"+col5+"\t"+"CDS\n") + tout.write("\t\t\tproduct\tATP synthase F0 subunit 6\n") + tout.write("\t\t\ttransl_table\t"+sys.argv[3]+"\n") + tout.write("\t\t\tNote\tPartial sequence\n") + gout.write(col1+"\t"+col2+"\t"+"gene"+"\t"+col4+"\t"+col5+"\t"+col6+"\t"+col7+"\t"+col8+"\t"+"Name="+col9+" gene"+ext+"\n") + gout.write(col1+"\t"+col2+"\t"+"CDS"+"\t"+col4+"\t"+col5+"\t"+col6+"\t"+col7+"\t"+col8+"\t"+"Name="+col9+" CDS"+ext+"\n") + + if "tRNA" in col9: + size=int(col5)-int(col4)+1 + if size <= 120: + tout.write(col4+"\t"+col5+"\t"+"gene\n") + tout.write("\t\t\tgene\t"+col9+"\n") + tout.write(col4+"\t"+col5+"\t"+"tRNA\n") + tout.write("\t\t\ttRNA\t"+col9+"\n") + gout.write(col1+"\t"+col2+"\t"+"gene"+"\t"+col4+"\t"+col5+"\t"+col6+"\t"+col7+"\t"+col8+"\t"+"Name="+col9+" gene"+"\n") + gout.write(col1+"\t"+col2+"\t"+"tRNA"+"\t"+col4+"\t"+col5+"\t"+col6+"\t"+col7+"\t"+col8+"\t"+"Name="+col9+" "+"\n") + if "trn" in col9: + size=int(col5)-int(col4)+1 + if size <= 120: + tout.write(col4+"\t"+col5+"\t"+"gene\n") + tout.write("\t\t\tgene\t"+col9+"\n") + tout.write(col4+"\t"+col5+"\t"+"tRNA\n") + tout.write("\t\t\ttRNA\t"+col9+"\n") + gout.write(col1+"\t"+col2+"\t"+"gene"+"\t"+col4+"\t"+col5+"\t"+col6+"\t"+col7+"\t"+col8+"\t"+"Name="+col9+" gene"+"\n") + gout.write(col1+"\t"+col2+"\t"+"tRNA"+"\t"+col4+"\t"+col5+"\t"+col6+"\t"+col7+"\t"+col8+"\t"+"Name="+col9+" tRNA"+"\n") + if "rrnL" in col9: + tout.write(col4+"\t"+col5+"\t"+"gene\n") + tout.write("\t\t\tgene\t"+col9+"\n") + tout.write(col4+"\t"+col5+"\t"+"rRNA\n") + tout.write("\t\t\trRNA\t"+col9+"\n") + tout.write("\t\t\tproduct\tlarge subunit ribosomal RNA\n") + gout.write(col1+"\t"+col2+"\t"+"gene"+"\t"+col4+"\t"+col5+"\t"+col6+"\t"+col7+"\t"+col8+"\t"+"Name="+col9+" gene"+"\n") + gout.write(col1+"\t"+col2+"\t"+"rRNA"+"\t"+col4+"\t"+col5+"\t"+col6+"\t"+col7+"\t"+col8+"\t"+"Name="+col9+" rRNA"+"\n") + if "rrnS" in col9: + tout.write(col4+"\t"+col5+"\t"+"gene\n") + tout.write("\t\t\tgene\t"+col9+"\n") + tout.write(col4+"\t"+col5+"\t"+"rRNA\n") + tout.write("\t\t\trRNA\t"+col9+"\n") + tout.write("\t\t\tproduct\tsmall subunit ribosomal RNA\n") + gout.write(col1+"\t"+col2+"\t"+"gene"+"\t"+col4+"\t"+col5+"\t"+col6+"\t"+col7+"\t"+col8+"\t"+"Name="+col9+" gene"+"\n") + gout.write(col1+"\t"+col2+"\t"+"rRNA"+"\t"+col4+"\t"+col5+"\t"+col6+"\t"+col7+"\t"+col8+"\t"+"Name="+col9+" rRNA"+"\n") +gout.close() +tout.close() + diff --git a/tRNAscanChecker.pyc b/tRNAscanChecker.pyc index d47678a..00e3f29 100755 Binary files a/tRNAscanChecker.pyc and b/tRNAscanChecker.pyc differ