Skip to content

Commit

Permalink
add tests to to_html: visualize N within mutations
Browse files Browse the repository at this point in the history
  • Loading branch information
akikuno committed Jun 8, 2023
1 parent 1d0a1f2 commit 7825101
Show file tree
Hide file tree
Showing 3 changed files with 106 additions and 66 deletions.
6 changes: 4 additions & 2 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,16 @@

setuptools.setup(
name="cstag",
version="0.4.0",
version="0.4.1",
author="Akihiro Kuno",
author_email="[email protected]",
description="Python module to manipulate the minimap2's CS tag",
long_description=long_description,
long_description_content_type="text/markdown",
url="https://github.com/akikuno/cstag",
packages=setuptools.find_packages(where="src",),
packages=setuptools.find_packages(
where="src",
),
package_dir={"": "src"},
classifiers=[
"Programming Language :: Python :: 3.7",
Expand Down
107 changes: 55 additions & 52 deletions src/cstag/to_html.py
Original file line number Diff line number Diff line change
@@ -1,24 +1,6 @@
import re


def to_html(CSTAG: str, DESCRIPTION: str = "") -> None:
"""Output HTML file showing a sequence with mutations colored
Args:
CSTAG (str): cs tag in the **long** format
DESCRIPTION (str): (optional) header information in the output file
Return:
HTML file (*OUTPUT_FILE_NAME.html*)
Example:
>>> import cstag
>>> CSTAG = "cs:Z:=AC+GGG=T-ACGT*at~gt10cg=GNNN"
>>> DESCRIPTION = "Example"
>>> cstag_html = cstag.to_html(CSTAG, DESCRIPTION)
https://user-images.githubusercontent.com/15861316/158910398-67f480d2-8742-412a-b528-40e545c46513.png
"""
if not re.search(r"[ACGT]", CSTAG):
raise Exception("Error: cs tag must be a long format")

html_header = """<!DOCTYPE html>
HTML_HEADER = """<!DOCTYPE html>
<html>
<head>
<style>
Expand Down Expand Up @@ -79,45 +61,50 @@ def to_html(CSTAG: str, DESCRIPTION: str = "") -> None:
</style>
</head>
<body>
"""
"""

html_legend = """
<p class = "p_legend">
Labels:
<span class="Ins">Insertion</span>
<span class="Del">Deletion</span>
<span class="Sub">Substitution</span>
<span class="Splice">Splicing</span>
<span class="Unknown">Unknown</span>
</p>
<hr>
"""
HTML_LEGEND = """
<p class = "p_legend">
Labels:
<span class="Ins">Insertion</span>
<span class="Del">Deletion</span>
<span class="Sub">Substitution</span>
<span class="Splice">Splicing</span>
<span class="Unknown">Unknown</span>
</p>
<hr>
"""

html_footer = """
</body>
</html>
"""
HTML_FOOTER = """
</body>
</html>
"""

description = DESCRIPTION
if description:
description = f"<h1>{description}</h1>"

cs = CSTAG.replace("cs:Z:", "")
if cs.startswith("N"):
cs = "=" + cs
list_cs = re.split(r"([-+*~=])", cs)[1:]
list_cs = [i + j for i, j in zip(list_cs[0::2], list_cs[1::2])]
def validate_cstag(cstag: str) -> None:
    """Reject cs tags that are not written in the long format.

    The check is a heuristic: the tag is accepted as soon as it contains at
    least one uppercase ``A``, ``C``, ``G``, ``T`` or ``N``.

    Args:
        cstag (str): cs tag to check (with or without the ``cs:Z:`` prefix)

    Raises:
        ValueError: if ``cstag`` contains none of the characters ``ACGTN``.
            ``ValueError`` is a subclass of ``Exception``, so callers that
            catch ``Exception`` keep working.
    """
    if not re.search(r"[ACGTN]", cstag):
        raise ValueError("Error: cs tag must be a long format")


def process_cstag(cstag: str) -> str:
cstag = cstag.replace("cs:Z:", "")
cstag_split_n = re.split(r"(N+)", cstag)
cs_mark_n = "".join(["@" + cs if cs.startswith("N") else cs for cs in cstag_split_n])
list_cs = re.split(r"([-+*~=@])", cs_mark_n)
list_cs = [l for l in list_cs if l != ""]
list_cs = [i + j for i, j in zip(list_cs[0::2], list_cs[1::2])]
html_cs = []
idx = 0
while idx < len(list_cs):
cs = list_cs[idx]
if cs[0] == "=":
cs = re.sub(r"(N+)", r"<span class='Unknown'>\1</span>", cs)
if cs.startswith("="):
html_cs.append(cs[1:])
elif cs.startswith("@"):
cs = re.sub(r"(N+)", r"<span class='Unknown'>\1</span>", cs[1:])
html_cs.append(cs)
elif cs[0] == "*":
html_cs.append(f"<span class='Sub'>{cs[2].upper()}")
while idx < len(list_cs) - 1 and list_cs[idx+1].startswith("*"):
while idx < len(list_cs) - 1 and list_cs[idx + 1].startswith("*"):
html_cs.append(f"{list_cs[idx+1][2].upper()}")
idx += 1
html_cs.append("</span>")
Expand All @@ -131,17 +118,33 @@ def to_html(CSTAG: str, DESCRIPTION: str = "") -> None:
right = cs[-2:].upper()
html_cs.append(f"<span class='Splice'>{left + splice + right}</span>")
idx += 1

html_cs = "".join(html_cs)
html_cs = f"<p class='p_seq'>{html_cs}</p>"
return f"<p class='p_seq'>{html_cs}</p>"


def to_html(cstag: str, description: str = "") -> str:
    """Output HTML string showing a sequence with mutations colored

    Args:
        cstag (str): cs tag in the **long** format
        description (str): (optional) header information in the output string

    Return:
        HTML string

    Raises:
        Exception: propagated from ``validate_cstag`` when ``cstag`` is not
            in the long format.

    Example:
        >>> import cstag
        >>> cs_tag = "cs:Z:=AC+GGG=T-ACGT*at~gt10cg=GNNN"
        >>> description = "Example"
        >>> html_string = cstag.to_html(cs_tag, description)
    """
    validate_cstag(cstag)
    # NOTE: the description is embedded verbatim inside <h1>; no HTML escaping is applied.
    description_str = f"<h1>{description}</h1>" if description else ""
    html_cs = process_cstag(cstag)
    # The pieces are joined with newlines, so the sequence paragraph sits on its own line.
    report = "\n".join(
        [
            HTML_HEADER,
            description_str,
            HTML_LEGEND,
            html_cs,
            HTML_FOOTER,
        ]
    )
    return report
59 changes: 47 additions & 12 deletions tests/test_to_html.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,9 @@


def test_html():
cs = "cs:Z:=AC+GGG=T-ACGT*at~gt10cg=GNNN"
cstag = "cs:Z:=AC+GGG=T-ACGT*at~gt10cg=GNNN"
description = "Example"
cs_html = to_html(cs, description)
cs_html = to_html(cstag, description)
test = [h for h in cs_html.split("\n") if h.count("<p class='p_seq'>")]
test = test[0].split()
answer = Path("tests", "data", "to_html", "report.html").read_text().split("\n")
Expand All @@ -15,9 +15,9 @@ def test_html():


def test_html_repeat_substitution():
cs = "cs:Z:=A*at*ag=A"
cstag = "cs:Z:=A*at*ag=A"
description = "Example"
cs_html = to_html(cs, description)
cs_html = to_html(cstag, description)
test = [h for h in cs_html.split("\n") if h.count("<p class='p_seq'>")]
test = test[0].split()
answer = Path("tests", "data", "to_html", "report_substitution.html").read_text().split("\n")
Expand All @@ -27,9 +27,9 @@ def test_html_repeat_substitution():


def test_html_repeat_substitution_start():
cs = "cs:Z:*at*ag=A"
cstag = "cs:Z:*at*ag=A"
description = "Example"
cs_html = to_html(cs, description)
cs_html = to_html(cstag, description)
test = [h for h in cs_html.split("\n") if h.count("<p class='p_seq'>")]
test = test[0].split()
answer = Path("tests", "data", "to_html", "report_substitution_start.html").read_text().split("\n")
Expand All @@ -39,9 +39,9 @@ def test_html_repeat_substitution_start():


def test_html_repeat_substitution_end():
cs = "cs:Z:=A*at*ag"
cstag = "cs:Z:=A*at*ag"
description = "Example"
cs_html = to_html(cs, description)
cs_html = to_html(cstag, description)
test = [h for h in cs_html.split("\n") if h.count("<p class='p_seq'>")]
test = test[0].split()
answer = Path("tests", "data", "to_html", "report_substitution_end.html").read_text().split("\n")
Expand All @@ -51,12 +51,47 @@ def test_html_repeat_substitution_end():


def test_html_start_from_N():
    """A tag that begins with Ns renders them as a leading Unknown span."""
    report = to_html("cs:Z:NNN=AC+GGG=T-ACGT*at~gt10cg=GNNN", "Example")
    seq_lines = [line for line in report.split("\n") if line.count("<p class='p_seq'>")]
    # Compare whitespace-separated tokens of the sequence paragraph.
    observed = seq_lines[0].split()
    expected = [
        "<p",
        "class='p_seq'><span",
        "class='Unknown'>NNN</span>AC<span",
        "class='Ins'>GGG</span>T<span",
        "class='Del'>ACGT</span><span",
        "class='Sub'>T</span><span",
        "class='Splice'>GT----------CG</span>G<span",
        "class='Unknown'>NNN</span></p>",
    ]
    assert observed == expected


def test_html_deletion_plus_N():
    """Ns directly following a deletion get their own Unknown span."""
    report = to_html("cs:Z:=T-ACGTNNN=G", "Example")
    seq_lines = [line for line in report.split("\n") if line.count("<p class='p_seq'>")]
    expected = "<p class='p_seq'>T<span class='Del'>ACGT</span><span class='Unknown'>NNN</span>G</p>"
    assert seq_lines[0] == expected


def test_html_N_within_deletions():
    """Ns sandwiched between two deletions split them into separate Del spans."""
    report = to_html("cs:Z:=T-ACGTNNN-ACGT=G", "Example")
    seq_lines = [line for line in report.split("\n") if line.count("<p class='p_seq'>")]
    observed = seq_lines[0]
    expected = "<p class='p_seq'>T<span class='Del'>ACGT</span><span class='Unknown'>NNN</span><span class='Del'>ACGT</span>G</p>"
    assert observed == expected


def test_html_N_within_insertions():
    """Ns sandwiched between two insertions split them into separate Ins spans."""
    report = to_html("cs:Z:=T+ACGTNNN+ACGT=G", "Example")
    seq_lines = [line for line in report.split("\n") if line.count("<p class='p_seq'>")]
    observed = seq_lines[0]
    expected = "<p class='p_seq'>T<span class='Ins'>ACGT</span><span class='Unknown'>NNN</span><span class='Ins'>ACGT</span>G</p>"
    assert observed == expected

0 comments on commit 7825101

Please sign in to comment.