From db3c9c8fb5605b36171b9d727ce5c6cf601b78a1 Mon Sep 17 00:00:00 2001 From: ChocoParrot Date: Wed, 8 Sep 2021 11:15:57 +0800 Subject: [PATCH] [factor] initial commit --- .gitignore | 110 ++++++++++++++++++ LICENSE | 22 ++++ README.md | 0 environment.yml | 3 + orffinder/__init__.py | 1 + orffinder/orffinder.py | 230 +++++++++++++++++++++++++++++++++++++ orffinder/tests/gene.fasta | 20 ++++ orffinder/tests/test.py | 9 ++ setup.py | 21 ++++ 9 files changed, 416 insertions(+) create mode 100644 .gitignore create mode 100644 LICENSE create mode 100644 README.md create mode 100644 environment.yml create mode 100644 orffinder/__init__.py create mode 100644 orffinder/orffinder.py create mode 100644 orffinder/tests/gene.fasta create mode 100644 orffinder/tests/test.py create mode 100644 setup.py diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..1bac219 --- /dev/null +++ b/.gitignore @@ -0,0 +1,110 @@ +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# pyenv +.python-version + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ + +#VSCode +.vscode + +# OSX +.DS_Store diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..9944c79 --- /dev/null +++ b/LICENSE @@ -0,0 +1,22 @@ + +MIT License + +Copyright (c) 2021 Chokyotager + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
import bisect

# Complement of every upper-case IUPAC nucleotide code (U is read like T).
_COMPLEMENT = str.maketrans("ACGTURYKMBDHVSWN", "TGCAAYRMKVHDBSWN")


def __reformSequence(sequence):
    """Wrap a plain string in a Biopython ``Seq``; return anything else as is.

    Kept so existing references to this helper keep working; the public
    functions below normalise their input through ``_strandTexts`` instead.
    """
    if isinstance(sequence, str):
        # Imported lazily: Biopython is only needed when Seq objects are built.
        from Bio.Seq import Seq

        sequence = Seq(sequence)

    return sequence


def _strandTexts(sequence):
    """Return ``(forward, reverse_complement)`` as upper-case strings.

    ``sequence`` may be a plain string, a ``Seq``, or any record-like object
    exposing the nucleotides through a ``.seq`` attribute (e.g. ``SeqRecord``).
    """
    forward = str(getattr(sequence, "seq", sequence)).upper()
    # The "-" strand must be read 5'->3', i.e. complemented AND reversed.
    return forward, forward.translate(_COMPLEMENT)[::-1]


def _codonSet(codons):
    """Normalise a codon collection (or a single codon string) to a frozenset."""
    if isinstance(codons, str):
        codons = (codons,)

    return frozenset(str(codon).upper() for codon in codons)


def _scanStrand(text, start_codons, stop_codons):
    """Locate start and stop codons on one strand.

    Returns ``(starts, stops)`` where ``starts`` is a list of
    ``(position, frame)`` in ascending position order and ``stops`` maps each
    frame (1-3) to an ascending list of stop positions.  A start position is
    the 1-based index of the codon's first base; a stop position is the
    1-based index one past the codon's last base.
    """
    starts = []
    stops = {1: [], 2: [], 3: []}

    for offset in range(len(text) - 2):
        codon = text[offset:offset + 3]
        frame = offset % 3 + 1

        if codon in start_codons:
            starts.append((offset + 1, frame))

        if codon in stop_codons:
            stops[frame].append(offset + 4)

    return starts, stops


def _forwardExtent(orf, sequence_length):
    """Return the inclusive 1-based ``(low, high)`` forward-strand span of an
    ORF whose ``start``/``end`` are still expressed on its own strand."""
    if orf["sense"] == "+":
        return orf["start"], orf["end"] - 1

    return sequence_length - orf["end"] + 2, sequence_length - orf["start"] + 1


def getORFs(sequence, minimum_length=75, start_codons=("ATG",), stop_codons=("TAA", "TAG", "TGA"), remove_nested=False, trim_trailing=False):
    """
    Returns the loci of discovered ORFs as a list of dictionaries.

    sequence: sequence as a string, Biopython Seq, or SeqRecord.
    minimum_length: minimum size of ORF in nucleotides (stop codon included). Default: 75
    start_codons: recognised 3-base-pair codons for initialisation. Default: ("ATG",)
    stop_codons: recognised 3-base-pair codons for termination. Default: ("TAA", "TAG", "TGA")
    remove_nested: remove all ORFs completely encased in another. Default: False
    trim_trailing: remove ORFs at the edge of the sequence that do not have a defined stop codon. Default: False

    Each dictionary holds:
      start, end: 1-based forward-strand coordinates. "start" is the first
                  base of the start codon; "end" is one step past the last
                  base of the ORF in reading direction, so for "-" ORFs
                  start > end and abs(end - start) == length always.
      frame:      reading frame (1-3) counted from the 5' end of the strand
                  the ORF was found on.
      sense:      "+" for the given strand, "-" for its reverse complement.
      length:     number of nucleotides in the ORF.
      trailing:   True when the ORF runs off the sequence without a stop codon.
      index:      1-based rank; ORFs are ordered longest first.
    """

    forward, reverse = _strandTexts(sequence)
    sequence_length = len(forward)

    starts = _codonSet(start_codons)
    stops = _codonSet(stop_codons)

    candidates = []
    stops_by_frame = {}

    for sense, text in (("+", forward), ("-", reverse)):

        strand_starts, strand_stops = _scanStrand(text, starts, stops)
        candidates.extend((position, frame, sense) for position, frame in strand_starts)

        for frame, positions in strand_stops.items():
            stops_by_frame[(sense, frame)] = positions

    # Stable sort: on equal positions the "+" candidate stays ahead of "-".
    candidates.sort(key=lambda candidate: candidate[0])

    claimed = set()
    orfs = []

    for position, frame, sense in candidates:

        frame_stops = stops_by_frame[(sense, frame)]
        slot = bisect.bisect_right(frame_stops, position)

        if slot < len(frame_stops):

            # An earlier in-frame start already owns this stop codon, so this
            # start merely sits inside that longer ORF.
            if (sense, frame, slot) in claimed:
                continue

            claimed.add((sense, frame, slot))
            end = frame_stops[slot]
            trailing = False

        elif trim_trailing:
            continue

        else:
            # No stop codon downstream: the ORF runs to the end of the strand.
            end = sequence_length + 1
            trailing = True

        length = end - position

        if length >= minimum_length:
            orfs.append({"start": position, "end": end, "frame": frame, "sense": sense, "length": length, "trailing": trailing})

    # Reorder by length (stable, so equal lengths keep discovery order)
    orfs.sort(key=lambda orf: orf["length"], reverse=True)

    if remove_nested:

        extents = [_forwardExtent(orf, sequence_length) for orf in orfs]

        # Trailing ORFs never count as enclosing: their real extent is unknown.
        # They cannot be enclosed either, because they touch the sequence edge.
        enclosing = [extent for orf, extent in zip(orfs, extents) if not orf["trailing"]]

        orfs = [
            orf
            for orf, (low, high) in zip(orfs, extents)
            if not any(other_low < low and other_high > high for other_low, other_high in enclosing)
        ]

    for index, orf in enumerate(orfs, start=1):

        orf["index"] = index

        if orf["sense"] == "-":
            # Mirror reverse-complement coordinates back onto the forward strand.
            orf["start"] = sequence_length - orf["start"] + 1
            orf["end"] = sequence_length - orf["end"] + 1

    return orfs


def _orfTexts(sequence, **kwargs):
    """Run ``getORFs`` and return ``(loci, texts)`` where ``texts[i]`` is the
    nucleotide string of ``loci[i]`` read 5'->3' on the ORF's own strand."""
    forward, reverse = _strandTexts(sequence)
    sequence_length = len(forward)

    loci = getORFs(sequence, **kwargs)
    texts = []

    for locus in loci:

        if locus["sense"] == "+":
            texts.append(forward[locus["start"] - 1 : locus["end"] - 1])

        else:
            texts.append(reverse[sequence_length - locus["start"] : sequence_length - locus["end"]])

    return loci, texts


def getORFNucleotides(sequence, return_loci=False, **kwargs):
    """
    Returns the nucleotide sequence of every discovered ORF as Biopython Seq objects.

    sequence: sequence as a string, Biopython Seq, or SeqRecord.
    return_loci: return the loci (with an added "nucleotide" key) instead of the bare sequences.
    minimum_length: minimum size of ORF in nucleotides.
    start_codons: recognised 3-base-pair codons for initialisation. Default: ("ATG",)
    stop_codons: recognised 3-base-pair codons for termination. Default: ("TAA", "TAG", "TGA")
    remove_nested: remove all ORFs completely encased in another. Default: False
    trim_trailing: remove ORFs at the edge of the sequence that do not have a defined stop codon. Default: False
    """

    from Bio.Seq import Seq

    loci, texts = _orfTexts(sequence, **kwargs)

    for locus, text in zip(loci, texts):
        locus["nucleotide"] = Seq(text)

    if return_loci:
        return loci

    return [locus["nucleotide"] for locus in loci]


def getORFProteins(sequence, translation_table=1, return_loci=False, **kwargs):
    """
    Returns the translated protein of every discovered ORF as Biopython Seq objects.

    sequence: sequence as a string, Biopython Seq, or SeqRecord.
    translation_table: NCBI genetic code (id or name) used for translation. Default: 1
    return_loci: return the loci (with an added "protein" key) instead of the bare proteins.
    minimum_length: minimum size of ORF in nucleotides.
    start_codons: recognised 3-base-pair codons for initialisation. Default: ("ATG",)
    stop_codons: recognised 3-base-pair codons for termination. Default: ("TAA", "TAG", "TGA")
    remove_nested: remove all ORFs completely encased in another. Default: False
    trim_trailing: remove ORFs at the edge of the sequence that do not have a defined stop codon. Default: False
    """

    from Bio.Seq import Seq

    loci, texts = _orfTexts(sequence, **kwargs)

    for locus, text in zip(loci, texts):

        # Trailing ORFs may end mid-codon; translate whole codons only.
        usable = len(text) - len(text) % 3
        locus["protein"] = Seq(text[:usable]).translate(table=translation_table)

    if return_loci:
        return loci

    return [locus["protein"] for locus in loci]
+TGTCAAGCTGGTCACATGGTATGACAATGAGTTCGGTTACAGCAACCGTGTATGTGACCTGATGGCACAC +ATGGCCTCCAAGGAGTAGATGTGACCCCTTTGCTGTTTCTTTTTTTTGATACGCGACCATTCTCCCATCT +GGTTGAATGTTTGCACCACGTGCCTGGAAGGAAATTACATGCTTAAATTGAAGACCAATATTATTTTTAT +ATACTCTGTTCTGTTTCGTGTGTGAGGTTAAAAATAAATGTTGACTTCAAAGGCTTTTCTGTCTGTTAAC +AACTTGCGATGGAATAAAAGTCCTCTGTTTGTGAGAAATGAAAAAAAAAAAAAAAAAAAAAAAAAAAAA diff --git a/orffinder/tests/test.py b/orffinder/tests/test.py new file mode 100644 index 0000000..e3567c4 --- /dev/null +++ b/orffinder/tests/test.py @@ -0,0 +1,9 @@ +from Bio import SeqIO +from Bio.Seq import Seq + +import orffinder + +sequence = list(SeqIO.parse("gene.fasta", "fasta"))[0] + +out = orffinder.getORFProteins(sequence, minimum_length=75, remove_nested=True) +print(out) diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..4053337 --- /dev/null +++ b/setup.py @@ -0,0 +1,21 @@ +import setuptools + +with open("README.md", "r") as fh: + long_description = fh.read() + +setuptools.setup( + name="orffinder", + version="1.0.0", + author="ChocoParrot", + author_email="lachocoparrot@gmail.com", + description="ORFFinder API.", + long_description=long_description, + long_description_content_type="text/markdown", + url="https://github.com/Chokyotager/ORFFinder", + packages=setuptools.find_packages(), + classifiers=( + "Programming Language :: Python :: 3", + "License :: OSI Approved :: MIT License", + "Operating System :: OS Independent", + ), +)