Skip to content

Commit

Permalink
Merge pull request #14 from akikuno:develop-v0.5.1
Browse files Browse the repository at this point in the history
Develop-v0.5.1
  • Loading branch information
akikuno authored Sep 2, 2023
2 parents 9323e81 + d08c665 commit cfcaced
Show file tree
Hide file tree
Showing 20 changed files with 510 additions and 232 deletions.
10 changes: 5 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ seq = "ACGTACGTACGTACG"
cstag.call(cigar, md, seq)
# => :2*ag:5-ag:4+ac~nn3nn:1

cstag.call(cigar, md, seq, is_long=True)
cstag.call(cigar, md, seq, long=True)
# => =AC*ag=TACGT-ag=ACGT+ac~nn3nn=G
```

Expand Down Expand Up @@ -107,7 +107,7 @@ import cstag

cs = "=ACGT*ac+gg-cc=T"
cstag.split(cs)
# => ['', '=ACGT', '*ac', '+gg', '-cc', '=T']
# => ['=ACGT', '*ac', '+gg', '-cc', '=T']
```

### Reverse Complement of a CS Tag
Expand All @@ -127,11 +127,11 @@ cstag.revcomp(cs)
import cstag
from pathlib import Path

cs = "=AC+GGG=T-ACGT*at~gt10cg=GNNN"
cs_tag = "=AC+GGG=T-ACGT*at~gt10cg=GNNN"
description = "Example"

cstag_html = cstag.to_html(cs, description)
Path("report.html").write_text(cstag_html)
cs_tag_html = cstag.to_html(cs_tag, description)
Path("report.html").write_text(cs_tag_html)
# => Output "report.html"
```
The resulting `report.html` looks like this :point_down:
Expand Down
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,10 @@ build-backend = "setuptools.build_meta"

[project]
name = "cstag"
version = "0.5.0"
version = "0.5.1"
description = "Python module to manipulate the minimap2's CS tag"
authors = [{ name = "Akihiro Kuno", email = "[email protected]" }]
requires-python = ">=3.7"
readme = { file = "README.md", content-type = "text/markdown" }
license = { file = "LICENSE" }

Expand Down
6 changes: 4 additions & 2 deletions src/cstag/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,11 @@
.. include:: ../../README.md
"""

from cstag.call import call
from cstag.shorten import shorten
from cstag.lengthen import lengthen
from cstag.consensus import consensus
from cstag.to_html import to_html
from cstag.mask import mask
from cstag.call import call
from cstag.split import split
from cstag.revcomp import revcomp
from cstag.to_html import to_html
9 changes: 4 additions & 5 deletions src/cstag/call.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
from __future__ import annotations

import re
from cstag.shorten import shorten


Expand Down Expand Up @@ -157,15 +156,15 @@ def add_prefix(cs_tag: str) -> str:
###########################################################


def call(cigar: str, md: str, seq: str, is_long: bool = False, prefix: bool = False) -> str:
def call(cigar: str, md: str, seq: str, long: bool = False, prefix: bool = False) -> str:
"""
Generate a cs tag based on CIGAR, MD, and SEQ information.
Args:
cigar (str): CIGAR string representing the alignment.
md (str): MD tag representing mismatching positions/base.
seq (str): The sequence of the read.
is_long (bool, optional): Whether to return the cs tag in long format. Defaults to False.
long (bool, optional): Whether to return the cs tag in long format. Defaults to False.
prefix (bool, optional): Whether to add the prefix 'cs:Z:' to the cs tag. Defaults to False
Returns:
Expand All @@ -176,12 +175,12 @@ def call(cigar: str, md: str, seq: str, is_long: bool = False, prefix: bool = Fa
>>> cigar = "8M2D4M2I3N1M"
>>> md = "2A5^AG7"
>>> seq = "ACGTACGTACGTACG"
>>> cstag.call(cigar, md, seq, is_long=True)
>>> cstag.call(cigar, md, seq, long=True)
'=AC*ag=TACGT-ag=ACGT+ac~nn3nn=G'
"""
cigar, seq = trim_clips(cigar, seq)
cs_tag = generate_cs_long(cigar, md, seq)
if is_long is False:
if long is False:
cs_tag = shorten(cs_tag)
if prefix is True:
cs_tag = add_prefix(cs_tag)
Expand Down
190 changes: 142 additions & 48 deletions src/cstag/consensus.py
Original file line number Diff line number Diff line change
@@ -1,68 +1,162 @@
from __future__ import annotations

import re
from itertools import chain
from collections import deque, Counter

from cstag.utils.validator import validate_long_format


def extract_softclips(cigars: list[str]) -> list[int]:
"""
Extract the length of softclips from each cigars string.
def consensus(CSTAG: list, CIGAR: list, POS: list) -> str:
"""generate consensus of cs tags
Args:
CSTAG (list): cs tags in the **long** format
CIGAR (list): CIGAR strings (6th column in SAM file)
POS (list): 1-based leftmost mapping position (4th column in SAM file)
Return:
str: a consensus of cs tag in the **long** format
Example:
>>> import cstag
>>> cs_list = ["cs:Z:=ACGT", "cs:Z:=AC*gt=T", "cs:Z:=C*gt=T", "cs:Z:=C*gt=T", "cs:Z:=ACT+ccc=T"]
>>> cigar_list = ["4M","4M","1S3M", "3M", "3M3I1M"]
>>> pos_list = [6,6,6,7,6]
>>> cstag.consensus(cs_list, cigar_list, pos)
cs:Z:=AC*gt*T
cigars (list[str]): list of cigars strings.
Returns:
list[int]: list of softclip lengths for each cigars string.
"""
if not (len(CSTAG) == len(CIGAR) == len(POS)):
raise Exception("Error: Element numbers of each argument must be the same")
softclip_lengths = []
for cigar in cigars:
# Check if the cigars string starts with a softclip (e.g., "4S3M")
if re.match(r"^[0-9]+S", cigar):
# Extract the length of the softclip using regex and convert it to an integer
softclip_length = int(re.sub(r"^([0-9]+)S.*", r"\1", cigar))
else:
# No softclip, so the length is 0
softclip_length = 0
softclip_lengths.append(softclip_length)
return softclip_lengths

if not all(re.search(r"[ACGT]", cs) for cs in CSTAG):
raise Exception("Error: cs tag must be a long format")

pos_min = min(POS)
pos = [pos - pos_min for pos in POS]
def calculate_read_starts(positions: list[int], cigars: list[str]) -> list[int]:
"""
Calculate the start positions of each read based on positions and cigars strings.
softclips = [re.sub(r"^([0-9]+)S.*", r"\1", cigar) for cigar in CIGAR]
softclips = [int(s) if s.isdigit() else 0 for s in softclips]
Args:
positions (list[int]): 1-based leftmost mapping positions.
cigars (list[str]): cigars strings indicating the mapping of each read.
starts = [p + s for p, s in zip(pos, softclips)]
Returns:
list[int]: Calculated start positions for each read.
"""
pos_min = min(positions)
pos_offsets = [pos - pos_min for pos in positions]
softclips = extract_softclips(cigars)
starts = [p + s for p, s in zip(pos_offsets, softclips)]
return starts


def split_cs_tags(cs_tags: list[str]) -> list[deque[str]]:
"""
Split and process each cs tag in cs_tags.
Args:
cs_tags (list[str]): list of cs tags in the long format.
Returns:
list[deque[str]]: list of processed cs tags as deque objects.
"""
cs_tags_splitted = []
for cs_tag in cs_tags:
# Remove the prefix "cs:Z:" if present
cs_tag = cs_tag.replace("cs:Z:", "")
# Split the cs tag using special symbols (-, *, ~, =)
split_tags = re.split(r"([-*~=])", cs_tag)[1:]
# Combine the symbol with the corresponding sequence
combined_tags = [symbol + seq for symbol, seq in zip(split_tags[0::2], split_tags[1::2])]
# Remove the "=" symbols, as they are not needed for further processing
cleaned_tags = [tag.replace("=", "") for tag in combined_tags]
# Further split the tags by the base letters (A, C, G, T)
further_split_tags = [re.split(r"(?=[ACGT])", tag) for tag in cleaned_tags]
# Remove any empty strings generated by the split
non_empty_tags = [[elem for elem in tag if elem] for tag in further_split_tags]
# Flatten the list of lists into a single list
flat_tags = list(chain.from_iterable(non_empty_tags))
cs_tags_splitted.append(deque(flat_tags))
return cs_tags_splitted


def normalize_read_lengths(cs_list: list[deque[str]], starts: list[int]) -> list[deque[str]]:
"""
Normalize the lengths of each read in cs_list based on their starts positions.
cs_list = []
for cs in CSTAG:
cs = cs.replace("cs:Z:", "")
cs = re.split(r"([-*~=])", cs)[1:]
cs = [i + j for i, j in zip(cs[0::2], cs[1::2])]
cs = [c.replace("=", "") for c in cs]
cs = [re.split(r"(?=[ACGT])", c) for c in cs]
cs = [list(filter(None, c)) for c in cs]
cs = list(chain.from_iterable(cs))
cs_list.append(deque(cs))
Args:
cs_list (list[deque[str]]): list of deques representing the reads.
starts (list[int]): Starting positions of each read.
Returns:
list[deque[str]]: list of deques representing the reads, now normalized to the same length.
"""
cs_maxlen = max(len(cs) + start for cs, start in zip(cs_list, starts))
for i, (cs, start) in enumerate(zip(cs_list, starts)):
if start:

for i, start in enumerate(starts):
if start > 0:
cs_list[i].extendleft(["N"] * start)
if len(cs_list[i]) < cs_maxlen:
cs_list[i].extend(["N"] * (cs_maxlen - len(cs_list[i])))

def get_consensus(cs: tuple) -> str:
"""
When it is multimodal, return the first **mutated** mode encountered
"""
mostcommon = Counter(cs).most_common(1)
if len(mostcommon) == 1:
return mostcommon[0][0]
for key, val in mostcommon:
if not re.search(r"[ACGT]", key):
return key

cs_consensus = [get_consensus(cs) for cs in list(zip(*cs_list))]
return cs_list


def get_consensus(cs_list: list[deque[str]]) -> str:
cs_consensus = []
for cs in zip(*cs_list):
# Get the most common cs tag(s)
most_common_tags = Counter(cs).most_common()

# If there's a unique most common tag, return it
most_common_tag, _ = most_common_tags[0]
if len(most_common_tags) == 1 or most_common_tags[0][1] != most_common_tags[1][1]:
cs_consensus.append(most_common_tag)
continue
# If the most common tag is not unique (multimodal), return the first *mutated* mode
for tag, _ in most_common_tags:
if not re.search(r"[ACGT]", tag):
cs_consensus.append(tag)

cs_consensus = "".join(cs_consensus)
# Append "=" to [ACGTN]
return re.sub(r"([ACGTN]+)", r"=\1", cs_consensus)


###########################################################
# main
###########################################################


def consensus(cs_tags: list[str], cigars: list[str], positions: list[int], prefix: bool = False) -> str:
"""generate consensus of cs tags
Args:
cs_tags (list): cs tags in the **long** format
cigars (list): cigars strings (6th column in SAM file)
positions (list): 1-based leftmost mapping position (4th column in SAM file)
prefix (bool, optional): Whether to add the prefix 'cs:Z:' to the cs tag. Defaults to False
Return:
str: a consensus of cs tag in the **long** format
Example:
>>> import cstag
>>> cs_tags = ["=ACGT", "=AC*gt=T", "=C*gt=T", "=C*gt=T", "=ACT+ccc=T"]
>>> cigars = ["4M","4M","1S3M", "3M", "3M3I1M"]
>>> positions = [6,6,6,7,6]
>>> cstag.consensus(cs_tags, cigars, positions)
=AC*gt=T
"""
if not (len(cs_tags) == len(cigars) == len(positions) > 0):
raise ValueError("Element numbers of each argument must be the same")

for cs_tag in cs_tags:
validate_long_format(cs_tag)

# Calculate the starts positions for each read
starts = calculate_read_starts(positions, cigars)

cs_list = split_cs_tags(cs_tags)

# Normalize the lengths of each read
cs_list = normalize_read_lengths(cs_list, starts)

cs_consensus = get_consensus(cs_list)

return "cs:Z:" + re.sub(r"([ACGTN]+)", r"=\1", cs_consensus)
return f"cs:Z:{cs_consensus}" if prefix else cs_consensus
12 changes: 5 additions & 7 deletions src/cstag/lengthen.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
from __future__ import annotations

import re
from cstag.utils.validator import validate_short_format


def lengthen(cs_tag: str, cigar: str, seq: str, prefix: bool = False) -> str:
Expand All @@ -20,9 +23,7 @@ def lengthen(cs_tag: str, cigar: str, seq: str, prefix: bool = False) -> str:
>>> cstag.lengthen(cs, cigar, seq)
=ACGT*ag=CGT
"""

if re.search(r"[ACGT]", cs_tag):
raise Exception("Error: input cs tag is not short format")
validate_short_format(cs_tag)

cs_tag_split = re.split(r"([-+*~:])", cs_tag.replace("cs:Z:", ""))[1:]
cs_tag_split = [i + j for i, j in zip(cs_tag_split[0::2], cs_tag_split[1::2])]
Expand All @@ -46,7 +47,4 @@ def lengthen(cs_tag: str, cigar: str, seq: str, prefix: bool = False) -> str:
idx += len(cs) - 1
cslong = "".join(cslong).replace(":", "=")

if prefix is True:
return "cs:Z:" + cslong
else:
return cslong
return f"cs:Z:{cslong}" if prefix else cslong
25 changes: 14 additions & 11 deletions src/cstag/mask.py
Original file line number Diff line number Diff line change
@@ -1,29 +1,31 @@
from __future__ import annotations

import re

from cstag.utils.validator import validate_long_format, validate_threshold


def mask(cs_tag: str, cigar: str, qual: str, threshold: int = 10):
def mask(cs_tag: str, cigar: str, qual: str, threshold: int = 10, prefix: bool = False) -> str:
"""Mask low-quality bases to 'N'
Args:
cs_tag (str): cs tag in the **long** format
cigar (str): cigar strings (6th column in SAM file)
qual (str): ASCII of Phred-scaled base quaiity+33 (11th column in SAM file)
threshold (int): optional: Phred Quality Score (defalt = 10). The low-quality bases are defined as 'less than or equal to the threshold'
threshold (int, optional): Phred Quality Score (defalt = 10). The low-quality bases are defined as 'less than or equal to the threshold'
prefix (bool, optional): Whether to add the prefix 'cs:Z:' to the cs tag. Defaults to False
Return:
str: Masked cs tag
Example:
>>> import cstag
>>> cs_tag = "cs:Z:=ACGT*ac+gg-cc=T"
>>> cs_tag = "=ACGT*ac+gg-cc=T"
>>> cigar = "5M2I2D1M"
>>> qual = "AA!!!!AA"
>>> cstag.mask(cs_tag, qual)
cs:Z:=ACNN*an+ng-cc=T
=ACNN*an+ng-cc=T
"""
if not re.search(r"[ACGT]", cs_tag):
raise Exception("Error: cs tag must be a long format")
if not isinstance(threshold, int):
raise Exception("Error: threshold must be an integer")
if not 0 <= threshold <= 40:
raise Exception("Error: threshold must be within a range between 0 to 40")

validate_long_format(cs_tag)
validate_threshold(threshold)

mask_symbols = [chr(th + 33) for th in range(threshold + 1)]
mask_symbols = set(mask_symbols)
Expand All @@ -50,5 +52,6 @@ def mask(cs_tag: str, cigar: str, qual: str, threshold: int = 10):
cs[i + 1] = "N" if cs[0] == "=" else "n"
idx += i + 1
cs_masked.append("".join(cs))
cs_masked = "".join(cs_masked)

return "cs:Z:" + "".join(cs_masked)
return f"cs:Z:{cs_masked}" if prefix else cs_masked
Loading

0 comments on commit cfcaced

Please sign in to comment.