-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #15 from akikuno:develop-v0.6.0
Develop-v0.6.0
- Loading branch information
Showing
6 changed files
with
225 additions
and
36 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,16 +1,17 @@ | ||
[build-system] | ||
requires = ["setuptools", "wheel"] | ||
build-backend = "setuptools.build_meta" | ||
requires = ["poetry-core>=1.0.0"] | ||
build-backend = "poetry.core.masonry.api" | ||
|
||
[project] | ||
[tool.poetry] | ||
name = "cstag" | ||
version = "0.5.1" | ||
version = "0.6.0" | ||
description = "Python module to manipulate the minimap2's CS tag" | ||
authors = [{ name = "Akihiro Kuno", email = "[email protected]" }] | ||
requires-python = ">=3.7" | ||
readme = { file = "README.md", content-type = "text/markdown" } | ||
license = { file = "LICENSE" } | ||
|
||
authors = ["Akihiro Kuno <[email protected]>"] | ||
homepage = "https://github.com/akikuno/cstag" | ||
repository = "https://github.com/akikuno/cstag" | ||
documentation = "https://akikuno.github.io/cstag/cstag" | ||
readme = "README.md" | ||
license = "MIT" | ||
classifiers = [ | ||
"Programming Language :: Python :: 3", | ||
"License :: OSI Approved :: MIT License", | ||
|
@@ -19,11 +20,5 @@ classifiers = [ | |
"Topic :: Scientific/Engineering :: Bio-Informatics", | ||
] | ||
|
||
[project.urls] | ||
homepage = "https://github.com/akikuno/cstag" | ||
repository = "https://github.com/akikuno/cstag" | ||
documentation = "https://akikuno.github.io/cstag/cstag" | ||
|
||
|
||
[tool.setuptools] | ||
packages.find = { where = ["src"] } | ||
[tool.poetry.dependencies] | ||
python = "^3.7" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,102 @@ | ||
from __future__ import annotations | ||
|
||
from cstag.split import split | ||
from collections import deque | ||
|
||
from cstag.utils.validator import validate_long_format | ||
|
||
|
||
def find_ref_for_insertion(cs_tag_split: list[str], idx: int) -> str | None:
    """Return the upper-cased base that precedes the insertion at ``idx``.

    The tokens before ``idx`` are scanned from right to left. The first
    token that starts with ``=`` or ``-`` contributes its last character,
    and the first token that starts with ``*`` contributes the character
    right after the ``*``. Any other token is skipped.

    Args:
        cs_tag_split: CS tag tokens, one operation per element.
        idx: Index of the insertion token inside ``cs_tag_split``.

    Returns:
        The upper-cased base, or ``None`` when no preceding token starts
        with ``=``, ``-`` or ``*``.
    """
    for cursor in range(idx - 1, -1, -1):
        token = cs_tag_split[cursor]
        operator = token[0]
        if operator == "*":
            # "*<ref><alt>": the character after the operator is the reference base.
            return token[1].upper()
        if operator in ("=", "-"):
            return token[-1].upper()
    return None
|
||
|
||
def find_ref_for_deletion(cs_tag_split: list[str], idx: int) -> str:
    """Build the upper-cased REF string for the deletion token at ``idx``.

    The result is the deleted bases (everything after the leading ``-``)
    prefixed with one anchor base taken from the nearest preceding token
    that starts with ``=`` (its last character) or ``*`` (the character
    right after the ``*``). Other tokens are skipped while searching.

    Args:
        cs_tag_split: CS tag tokens, one operation per element.
        idx: Index of the deletion token inside ``cs_tag_split``.

    Returns:
        Anchor base followed by the deleted bases; only the deleted bases
        when no preceding ``=`` or ``*`` token exists.
    """
    deleted_bases = cs_tag_split[idx][1:].upper()
    anchor = ""
    for cursor in range(idx - 1, -1, -1):
        token = cs_tag_split[cursor]
        if token.startswith("="):
            anchor = token[-1].upper()
            break
        if token.startswith("*"):
            anchor = token[1].upper()
            break
    return anchor + deleted_bases
|
||
|
||
def get_variant_annotations(cs_tag_split: list[str], position: int) -> list[tuple[int, str, str]]:
    """Translate CS tag tokens into ``(POS, REF, ALT)`` tuples.

    ``pos`` tracks the reference coordinate of the next unconsumed base and
    is advanced by every operation that consumes reference bases:

    * ``=``  match: advances by the number of matched bases.
    * ``*``  substitution: reported at ``pos``, advances by one.
    * ``+``  insertion: reported at ``pos - 1`` (the anchor base); does not
      consume reference bases.
    * ``-``  deletion: reported at ``pos - 1`` (the anchor base), then
      advances by the number of deleted bases.
    * ``~``  splice (``~<2 bases><intron length><2 bases>``): nothing is
      reported, but ``pos`` advances by the intron length.

    Args:
        cs_tag_split: CS tag tokens, one operation per element.
        position: Reference coordinate of the first base covered by the tag.

    Returns:
        One ``(POS, REF, ALT)`` tuple per substitution, insertion and
        deletion, in the order the tokens appear.
    """
    variant_annotations = []
    pos = position
    for idx, cs in enumerate(cs_tag_split):
        if cs.startswith("="):
            pos += len(cs) - 1
        elif cs.startswith("*"):
            ref, alt = cs[1].upper(), cs[2].upper()
            variant_annotations.append((pos, ref, alt))
            pos += 1
        elif cs.startswith("+"):
            # NOTE: find_ref_for_insertion returns None when nothing precedes
            # the insertion; the concatenation below then raises TypeError.
            ref = find_ref_for_insertion(cs_tag_split, idx)
            alt = ref + cs[1:].upper()
            variant_annotations.append((pos - 1, ref, alt))
        elif cs.startswith("-"):
            ref = find_ref_for_deletion(cs_tag_split, idx)
            variant_annotations.append((pos - 1, ref, ref[0]))
            # Deleted bases exist on the reference, so later variants must be
            # shifted past them.
            pos += len(cs) - 1
        elif cs.startswith("~"):
            # The intron spans reference bases; skip over it so downstream
            # positions stay in reference coordinates. Tokens without a
            # numeric length are ignored, as before.
            intron_length = cs[3:-2]
            if intron_length.isdecimal():
                pos += int(intron_length)

    return variant_annotations


###########################################################
# main
###########################################################


def to_vcf(cs_tag: str, chrom: str, pos: int) -> str:
    """
    Convert a CS tag to VCF (Variant Call Format) string.

    The tag is checked with ``validate_long_format`` and tokenised with
    ``split`` before the variants are collected. The output consists of the
    two VCF header lines followed by one tab-separated record per variant,
    joined with newlines (no trailing newline).

    Args:
        cs_tag (str): The CS tag representing the sequence alignment.
        chrom (str): The chromosome name.
        pos (int): The starting position for the sequence.

    Returns:
        str: The VCF-formatted string.

    Example:
        Columns are separated by tabs in the real output.

        >>> import cstag
        >>> cs_tag = "=AC*gt=T-gg=C+tt=A"
        >>> chrom = "chr1"
        >>> pos = 1
        >>> print(cstag.to_vcf(cs_tag, chrom, pos))
        ##fileformat=VCFv4.2
        #CHROM POS ID REF ALT QUAL FILTER INFO
        chr1 3 . G T . . .
        chr1 4 . TGG T . . .
        chr1 7 . C CTT . . .
    """

    validate_long_format(cs_tag)

    cs_tag_split = split(cs_tag)

    # Call POS, REF, ALT
    variants = get_variant_annotations(cs_tag_split, pos)

    # Write VCF
    vcf = [
        "##fileformat=VCFv4.2",
        "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO",
    ]
    vcf.extend(
        f"{chrom}\t{variant_pos}\t.\t{ref}\t{alt}\t.\t.\t."
        for variant_pos, ref, alt in variants
    )

    return "\n".join(vcf)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,56 @@ | ||
from src.cstag.to_vcf import find_ref_for_insertion, find_ref_for_deletion, get_variant_annotations, to_vcf | ||
|
||
|
||
# Tests for find_ref_for_insertion
def test_find_ref_for_insertion():
    """Check the anchor base found for insertions at various token indices."""
    tokens = ["=ACGT", "*ga", "+a"]

    # Nothing precedes index 0, so no anchor base can be found.
    assert find_ref_for_insertion(tokens, 0) is None

    # Each case: (cs_tag_split, idx, expected anchor base)
    cases = [
        (tokens, 1, "T"),
        (tokens, 2, "G"),
        (["=AC", "=GT", "-g", "+a"], 3, "G"),
    ]
    for cs_tag_split, idx, expected in cases:
        assert find_ref_for_insertion(cs_tag_split, idx) == expected
|
||
|
||
# Tests for find_ref_for_deletion
def test_find_ref_for_deletion():
    """Check the REF string (anchor base + deleted bases) built for deletions."""
    # Each case: (cs_tag_split, idx, expected REF)
    cases = [
        (["=AC", "-g"], 1, "CG"),
        (["=ACGT", "*ga", "-a"], 2, "GA"),
        (["=ACGT", "*ga", "-ac"], 2, "GAC"),
        # The insertion between the match and the deletion is skipped.
        (["=AC", "=GT", "+a", "-a"], 3, "TA"),
    ]
    for cs_tag_split, idx, expected in cases:
        assert find_ref_for_deletion(cs_tag_split, idx) == expected
|
||
|
||
# Tests for get_variant_annotations
def test_get_variant_annotations():
    """Check the (POS, REF, ALT) tuples produced for representative token lists."""
    # Each case: (cs_tag_split, start position, expected annotations)
    single_mutation = [
        (["=AC", "*ga", "=AC"], 1, [(3, "G", "A")]),
        (["=AC", "+a", "=AC"], 1, [(2, "C", "CA")]),
        (["=AC", "-a", "=AC"], 1, [(2, "CA", "C")]),
    ]
    double_mutations = [
        (["=AC", "*ga", "=AC", "*ct"], 1, [(3, "G", "A"), (6, "C", "T")]),
        (["=AC", "+a", "=AC", "+aa"], 1, [(2, "C", "CA"), (4, "C", "CAA")]),
        # NOTE(review): POS 4 for the second deletion assumes the first
        # deletion does not advance the reference coordinate; counting the
        # deleted base would give 5 - confirm which is intended.
        (["=AC", "-a", "=AC", "-aa"], 1, [(2, "CA", "C"), (4, "CAA", "C")]),
    ]
    combinations = [
        (["=ACGT", "*ga", "+a"], 1, [(5, "G", "A"), (5, "G", "GA")]),
        (["=ACGT", "*ga", "-a"], 1, [(5, "G", "A"), (5, "GA", "G")]),
        (["=ACGT", "*ga", "-ac"], 1, [(5, "G", "A"), (5, "GAC", "G")]),
    ]
    shifted_start = [
        (["=AC", "*ga", "=AC", "*ct"], 10, [(12, "G", "A"), (15, "C", "T")]),
    ]

    all_cases = single_mutation + double_mutations + combinations + shifted_start
    for cs_tag_split, position, expected in all_cases:
        assert get_variant_annotations(cs_tag_split, position) == expected
|
||
|
||
# Tests for to_vcf
def test_to_vcf():
    """Check the full VCF text produced by to_vcf for two small CS tags.

    The expected strings spell every column separator as an explicit ``\\t``
    so they match the tab-delimited records to_vcf builds and cannot be
    silently altered by editors or tools that rewrite whitespace.
    """
    cs_tag1 = "=AC*gt=T-gg=C+tt=A"
    chrom1 = "chr1"
    pos1 = 1
    # NOTE(review): POS 5 for the insertion assumes the 2-base deletion does
    # not advance the reference coordinate; counting the deleted bases would
    # give 7 - confirm which is intended.
    expected_output1 = (
        "##fileformat=VCFv4.2\n"
        "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n"
        "chr1\t3\t.\tG\tT\t.\t.\t.\n"
        "chr1\t4\t.\tTGG\tT\t.\t.\t.\n"
        "chr1\t5\t.\tC\tCTT\t.\t.\t."
    )
    assert to_vcf(cs_tag1, chrom1, pos1) == expected_output1

    cs_tag2 = "=AC*ga"
    chrom2 = "2"
    pos2 = 1
    expected_output2 = (
        "##fileformat=VCFv4.2\n"
        "#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\n"
        "2\t3\t.\tG\tA\t.\t.\t."
    )
    assert to_vcf(cs_tag2, chrom2, pos2) == expected_output2