[factor] initial commit

Chokyotager · Sep 8, 2021 · db3c9c8 · db3c9c8
commit db3c9c8
Show file tree

Hide file tree

Showing 9 changed files with 416 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,110 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# pyenv
+.python-version
+
+# celery beat schedule file
+celerybeat-schedule
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+
+# VSCode
+.vscode
+
+# OSX
+.DS_Store
diff --git a/LICENSE b/LICENSE
@@ -0,0 +1,22 @@
+
+MIT License
+
+Copyright (c) 2021 Chokyotager
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/README.md b/README.md
diff --git a/environment.yml b/environment.yml
@@ -0,0 +1,3 @@
+# Conda environment definition; Biopython is the only dependency listed.
+name: orffinder_env
+dependencies:
+  - biopython
diff --git a/orffinder/__init__.py b/orffinder/__init__.py
@@ -0,0 +1 @@
+# Package name string, exposed as the `name` attribute of the orffinder package.
+name="orffinder"
diff --git a/orffinder/orffinder.py b/orffinder/orffinder.py
@@ -0,0 +1,230 @@
+from Bio import SeqIO
+from Bio.Seq import Seq
+
+def __reformSequence (sequence):
+
+    if isinstance(sequence, str):
+
+        sequence = Seq(sequence)
+
+    return sequence
+
+def getORFs (sequence, minimum_length=75, start_codons=["ATG"], stop_codons=["TAA", "TAG", "TGA"], remove_nested=False, trim_trailing=False):
+
+    """
+    Returns the loci of discovered ORFs in a dictionary format.
+
+    sequence: sequence in Biopython Seq or String format.
+    minimum_length: minimum size of ORF in nucleotides.
+    start_codons: recognised 3-base-pair codons for initialisation. Default: ["ATG"]
+    stop_codons: recognised 3-base pair condons for termination. Default: ["TAA", "TAG", "TGA"]
+    remove_nested: remove all ORFs completely encased in another. Default: False
+    trim_trailing: remove ORFs are the edge of the sequence that do not have a defined stop codon. Default: False
+    """
+
+    sequence = __reformSequence(sequence)
+
+    def findSense (sequence, sense="+", start_codons=["ATG"], stop_codons=["TAA", "TAG", "TGA"]):
+
+        start_codon_positions = list()
+        stop_codon_positions = list()
+
+        # Iterate through frames
+        for frame in range(3):
+
+            for i in range(frame, len(sequence), 3):
+
+                if sequence[i : i + 3] in start_codons:
+                    start_codon_positions.append({"position": i + 1, "frame": frame + 1, "sense": sense})
+
+                if sequence[i : i + 3] in stop_codons:
+                    stop_codon_positions.append({"position": i + 4, "frame": frame + 1, "sense": sense})
+
+        return start_codon_positions, stop_codon_positions
+
+    sequence_length = len(sequence)
+    forward = str(sequence.seq).upper()
+    reverse = str(sequence.reverse_complement().seq)[::-1].upper()
+
+    forward_start, forward_stop = findSense(forward, "+")
+    reverse_start, reverse_stop = findSense(reverse, "-")
+
+    all_starts = forward_start + reverse_start
+    all_stops = forward_stop + reverse_stop
+
+    all_starts.sort(key=lambda x: x["position"], reverse=False)
+    all_stops.sort(key=lambda x: x["position"], reverse=False)
+
+    for stop_codon in all_stops:
+
+        stop_codon["occupied"] = False
+
+    orfs = list()
+
+    # Corroborate search strategy
+    for start_codon in all_starts:
+
+        position = start_codon["position"]
+        frame = start_codon["frame"]
+        sense = start_codon["sense"]
+
+        warp_case = True
+
+        for stop_codon in all_stops:
+
+            right_frame = stop_codon["frame"] == frame
+            right_sense = stop_codon["sense"] == sense
+
+            length = stop_codon["position"] - position
+            right_length = length >= minimum_length
+
+            if right_frame and right_sense and length > 0:
+
+                warp_case = False
+
+                if stop_codon["occupied"]:
+                    break
+
+                # Registered ORF
+                if right_length:
+                    orfs.append({"start": position, "end": stop_codon["position"], "frame": frame, "sense": sense, "length": length, "trailing": False})
+
+                stop_codon["occupied"] = True
+                break
+
+        if warp_case and not trim_trailing:
+
+            length = sequence_length - position
+            right_length = length >= minimum_length
+
+            if right_length:
+                orfs.append({"start": position, "end": -1, "frame": frame, "sense": sense, "length": length, "trailing": True})
+
+    # Reorder by length
+    orfs.sort(key=lambda x: x["length"], reverse=True)
+
+    # Remove nested
+    if remove_nested:
+
+        unnested_orfs = list()
+
+        for orf_1 in orfs:
+
+            appendable = True
+
+            for orf_2 in orfs:
+
+                if orf_2["start"] < orf_1["start"] and orf_2["end"] > orf_1["end"] and orf_1["end"] != -1:
+                    appendable = False
+                    break
+
+            if appendable:
+
+                unnested_orfs.append(orf_1)
+
+        orfs = unnested_orfs
+
+    for i in range(len(orfs)):
+
+        orf = orfs[i]
+        orf["index"] = i + 1
+
+        if orf["sense"] == "-":
+            orf["start"] = sequence_length - orf["start"]
+
+            if orf["end"] == -1:
+                orf["end"] = 0
+
+            else:
+                orf["end"] = sequence_length - orf["end"]
+
+        elif orf["end"] == -1:
+            orf["end"] = sequence_length
+
+    return orfs
+
+def getORFNucleotides (sequence, return_loci=False, **kwargs):
+
+    """
+    Returns the loci of discovered ORFs in a dictionary format.
+
+    sequence: sequence in Biopython Seq or String format.
+    return_loci: return the loci together with the nucleotide sequences.
+    minimum_length: minimum size of ORF in nucleotides.
+    start_codons: recognised 3-base-pair codons for initialisation. Default: ["ATG"]
+    stop_codons: recognised 3-base pair condons for termination. Default: ["TAA", "TAG", "TGA"]
+    remove_nested: remove all ORFs completely encased in another. Default: False
+    trim_trailing: remove ORFs are the edge of the sequence that do not have a defined stop codon. Default: False
+    """
+
+    sequence = __reformSequence(sequence)
+
+    loci = getORFs(sequence, **kwargs)
+
+    sequence_length = len(sequence)
+    forward = str(sequence.seq).upper()
+    reverse = str(sequence.reverse_complement().seq)[::-1].upper()
+
+    nucleotides = list()
+
+    for locus in loci:
+
+        if locus["sense"] == "+":
+            locus["nucleotide"] = Seq(forward[locus["start"] - 1 : locus["end"] - 1])
+
+        else:
+            locus["nucleotide"] = Seq(reverse[sequence_length - locus["start"] - 1 : sequence_length - locus["end"] - 1])
+
+        nucleotides.append(locus["nucleotide"])
+
+    if return_loci:
+
+        return loci
+
+    else:
+
+        return nucleotides
+
+def getORFProteins (sequence, translation_table=1, return_loci=False, **kwargs):
+
+    """
+    Returns the loci of discovered ORFs in a dictionary format.
+
+    sequence: sequence in Biopython Seq or String format.
+    return_loci: return the loci together with the protein sequences.
+    minimum_length: minimum size of ORF in nucleotides.
+    start_codons: recognised 3-base-pair codons for initialisation. Default: ["ATG"]
+    stop_codons: recognised 3-base pair condons for termination. Default: ["TAA", "TAG", "TGA"]
+    remove_nested: remove all ORFs completely encased in another. Default: False
+    trim_trailing: remove ORFs are the edge of the sequence that do not have a defined stop codon. Default: False
+    """
+
+    sequence = __reformSequence(sequence)
+
+    loci = getORFs(sequence, **kwargs)
+
+    sequence_length = len(sequence)
+    forward = str(sequence.seq).upper()
+    reverse = str(sequence.reverse_complement().seq)[::-1].upper()
+
+    proteins = list()
+
+    for locus in loci:
+
+        difference = locus["length"] % 3
+
+        if locus["sense"] == "+":
+            locus["protein"] = Seq(forward[locus["start"] - 1 : locus["end"] - 1 - difference]).translate()
+
+        else:
+            locus["protein"] = Seq(reverse[sequence_length - locus["start"] - 1 : sequence_length - locus["end"] - 1  - difference]).translate()
+
+        proteins.append(locus["protein"])
+
+    if return_loci:
+
+        return loci
+
+    else:
+
+        return proteins
diff --git a/orffinder/tests/gene.fasta b/orffinder/tests/gene.fasta
@@ -0,0 +1,20 @@
+>NM_001115114.1 Danio rerio glyceraldehyde-3-phosphate dehydrogenase (gapdh), mRNA
+ACTCACACCAAGTGTCAGGACGAACAGAGGCTTCTCACAAACGAGGACACAACCAAATCAGGCATAATGG
+TTAAAGTTGGTATTAACGGATTCGGTCGCATTGGCCGTCTGGTGACCCGTGCTGCTTTCTTGACCAAGAA
+AGTGGAGATCGTGGCCATCAATGACCCATTCATTGACCTTGATTACATGGTTTACATGTTCCAGTACGAC
+TCCACCCATGGAAAGTACAAGGGTGAGGTTAAGGCAGAAGGCGGCAAACTGGTCATTGATGGTCATGCAA
+TCACAGTCTATAGCGAGAGGGACCCAGCCAACATTAAGTGGGGTGATGCAGGTGCTACTTATGTTGTGGA
+GTCTACTGGTGTCTTCACTACTATTGAGAAGGCTTCTGCTCACATTAAGGGTGGTGCAAAGAGAGTCATC
+ATCTCTGCCCCAAGTGCAGATGCCCCCATGTTTGTCATGGGTGTCAACCATGAGAAATATGACAACTCTC
+TCACAGTTGTAAGCAATGCCTCCTGCACCACCAACTGCCTGGCTCCTTTGGCAAAGGTCATCAATGATAA
+CTTTGTCATCGTTGAAGGTCTTATGAGCACTGTTCATGCCATCACAGCAACACAGAAGACCGTTGATGGG
+CCCTCTGGGAAGCTGTGGAGGGATGGCCGTGGTGCCAGTCAGAACATCATCCCAGCCTCCACTGGGGCTG
+CCAAGGCTGTAGGCAAAGTAATTCCTGAGCTCAATGGCAAGCTTACTGGTATGGCCTTCCGTGTCCCCAC
+CCCCAATGTCTCTGTTGTGGATCTGACAGTCCGTCTTGAGAAACCTGCCAAGTATGATGAGATCAAGAAA
+GTCGTCAAGGCTGCAGCTGATGGGCCCATGAAAGGAATTCTGGGATACACGGAGCACCAGGTTGTGTCCA
+CTGACTTCAATGGGGATTGCCGTTCATCCATCTTTGACGCTGGTGCTGGTATTGCTCTCAACGATCACTT
+TGTCAAGCTGGTCACATGGTATGACAATGAGTTCGGTTACAGCAACCGTGTATGTGACCTGATGGCACAC
+ATGGCCTCCAAGGAGTAGATGTGACCCCTTTGCTGTTTCTTTTTTTTGATACGCGACCATTCTCCCATCT
+GGTTGAATGTTTGCACCACGTGCCTGGAAGGAAATTACATGCTTAAATTGAAGACCAATATTATTTTTAT
+ATACTCTGTTCTGTTTCGTGTGTGAGGTTAAAAATAAATGTTGACTTCAAAGGCTTTTCTGTCTGTTAAC
+AACTTGCGATGGAATAAAAGTCCTCTGTTTGTGAGAAATGAAAAAAAAAAAAAAAAAAAAAAAAAAAAA