Merge pull request #14 from akikuno:develop-v0.5.1

Develop-v0.5.1
akikuno · Sep 2, 2023 · cfcaced · cfcaced
2 parents 9323e81 + d08c665
commit cfcaced
Show file tree

Hide file tree

Showing 20 changed files with 510 additions and 232 deletions.
diff --git a/README.md b/README.md
@@ -49,7 +49,7 @@ seq = "ACGTACGTACGTACG"
 cstag.call(cigar, md, seq)
 # => :2*ag:5-ag:4+ac~nn3nn:1
 
-cstag.call(cigar, md, seq, is_long=True)
+cstag.call(cigar, md, seq, long=True)
 # => =AC*ag=TACGT-ag=ACGT+ac~nn3nn=G
 ```
 
@@ -107,7 +107,7 @@ import cstag
 
 cs = "=ACGT*ac+gg-cc=T"
 cstag.split(cs)
-# => ['', '=ACGT', '*ac', '+gg', '-cc', '=T']
+# => ['=ACGT', '*ac', '+gg', '-cc', '=T']
 ```
 
 ### Reverse Complement of a CS Tag
@@ -127,11 +127,11 @@ cstag.revcomp(cs)
 import cstag
 from pathlib import Path
 
-cs = "=AC+GGG=T-ACGT*at~gt10cg=GNNN"
+cs_tag = "=AC+GGG=T-ACGT*at~gt10cg=GNNN"
 description = "Example"
 
-cstag_html = cstag.to_html(cs, description)
-Path("report.html").write_text(cstag_html)
+cs_tag_html = cstag.to_html(cs_tag, description)
+Path("report.html").write_text(cs_tag_html)
 # => Output "report.html"
 ```
 The resulting `report.html` looks like this :point_down:

diff --git a/pyproject.toml b/pyproject.toml
@@ -4,9 +4,10 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "cstag"
-version = "0.5.0"
+version = "0.5.1"
 description = "Python module to manipulate the minimap2's CS tag"
 authors = [{ name = "Akihiro Kuno", email = "[email protected]" }]
+requires-python = ">=3.7"
 readme = { file = "README.md", content-type = "text/markdown" }
 license = { file = "LICENSE" }
 

diff --git a/src/cstag/__init__.py b/src/cstag/__init__.py
@@ -2,9 +2,11 @@
 .. include:: ../../README.md
 """
 
+from cstag.call import call
 from cstag.shorten import shorten
 from cstag.lengthen import lengthen
 from cstag.consensus import consensus
-from cstag.to_html import to_html
 from cstag.mask import mask
-from cstag.call import call
+from cstag.split import split
+from cstag.revcomp import revcomp
+from cstag.to_html import to_html
diff --git a/src/cstag/call.py b/src/cstag/call.py
@@ -1,6 +1,5 @@
 from __future__ import annotations
 
-import re
 from cstag.shorten import shorten
 
 
@@ -157,15 +156,15 @@ def add_prefix(cs_tag: str) -> str:
 ###########################################################
 
 
-def call(cigar: str, md: str, seq: str, is_long: bool = False, prefix: bool = False) -> str:
+def call(cigar: str, md: str, seq: str, long: bool = False, prefix: bool = False) -> str:
     """
     Generate a cs tag based on CIGAR, MD, and SEQ information.
 
     Args:
         cigar (str): CIGAR string representing the alignment.
         md (str): MD tag representing mismatching positions/base.
         seq (str): The sequence of the read.
-        is_long (bool, optional): Whether to return the cs tag in long format. Defaults to False.
+        long (bool, optional): Whether to return the cs tag in long format. Defaults to False.
         prefix (bool, optional): Whether to add the prefix 'cs:Z:' to the cs tag. Defaults to False
 
     Returns:
@@ -176,12 +175,12 @@ def call(cigar: str, md: str, seq: str, is_long: bool = False, prefix: bool = Fa
         >>> cigar = "8M2D4M2I3N1M"
         >>> md = "2A5^AG7"
         >>> seq = "ACGTACGTACGTACG"
-        >>> cstag.call(cigar, md, seq, is_long=True)
+        >>> cstag.call(cigar, md, seq, long=True)
         '=AC*ag=TACGT-ag=ACGT+ac~nn3nn=G'
     """
     cigar, seq = trim_clips(cigar, seq)
     cs_tag = generate_cs_long(cigar, md, seq)
-    if is_long is False:
+    if long is False:
         cs_tag = shorten(cs_tag)
     if prefix is True:
         cs_tag = add_prefix(cs_tag)

diff --git a/src/cstag/consensus.py b/src/cstag/consensus.py
@@ -1,68 +1,162 @@
+from __future__ import annotations
+
 import re
 from itertools import chain
 from collections import deque, Counter
 
+from cstag.utils.validator import validate_long_format
+
+
+def extract_softclips(cigars: list[str]) -> list[int]:
+    """
+    Extract the length of softclips from each cigars string.
 
-def consensus(CSTAG: list, CIGAR: list, POS: list) -> str:
-    """generate consensus of cs tags
     Args:
-        CSTAG (list): cs tags in the **long** format
-        CIGAR (list): CIGAR strings (6th column in SAM file)
-        POS (list): 1-based leftmost mapping position (4th column in SAM file)
-    Return:
-        str: a consensus of cs tag in the **long** format
-    Example:
-        >>> import cstag
-        >>> cs_list = ["cs:Z:=ACGT", "cs:Z:=AC*gt=T", "cs:Z:=C*gt=T", "cs:Z:=C*gt=T", "cs:Z:=ACT+ccc=T"]
-        >>> cigar_list = ["4M","4M","1S3M", "3M", "3M3I1M"]
-        >>> pos_list = [6,6,6,7,6]
-        >>> cstag.consensus(cs_list, cigar_list, pos)
-        cs:Z:=AC*gt*T
+        cigars (list[str]): list of cigars strings.
+
+    Returns:
+        list[int]: list of softclip lengths for each cigars string.
     """
-    if not (len(CSTAG) == len(CIGAR) == len(POS)):
-        raise Exception("Error: Element numbers of each argument must be the same")
+    softclip_lengths = []
+    for cigar in cigars:
+        # Check if the cigars string starts with a softclip (e.g., "4S3M")
+        if re.match(r"^[0-9]+S", cigar):
+            # Extract the length of the softclip using regex and convert it to an integer
+            softclip_length = int(re.sub(r"^([0-9]+)S.*", r"\1", cigar))
+        else:
+            # No softclip, so the length is 0
+            softclip_length = 0
+        softclip_lengths.append(softclip_length)
+    return softclip_lengths
 
-    if not all(re.search(r"[ACGT]", cs) for cs in CSTAG):
-        raise Exception("Error: cs tag must be a long format")
 
-    pos_min = min(POS)
-    pos = [pos - pos_min for pos in POS]
+def calculate_read_starts(positions: list[int], cigars: list[str]) -> list[int]:
+    """
+    Calculate the start positions of each read based on positions and cigars strings.
 
-    softclips = [re.sub(r"^([0-9]+)S.*", r"\1", cigar) for cigar in CIGAR]
-    softclips = [int(s) if s.isdigit() else 0 for s in softclips]
+    Args:
+        positions (list[int]): 1-based leftmost mapping positions.
+        cigars (list[str]): cigars strings indicating the mapping of each read.
 
-    starts = [p + s for p, s in zip(pos, softclips)]
+    Returns:
+        list[int]: Calculated start positions for each read.
+    """
+    pos_min = min(positions)
+    pos_offsets = [pos - pos_min for pos in positions]
+    softclips = extract_softclips(cigars)
+    starts = [p + s for p, s in zip(pos_offsets, softclips)]
+    return starts
+
+
+def split_cs_tags(cs_tags: list[str]) -> list[deque[str]]:
+    """
+    Split and process each cs tag in cs_tags.
+
+    Args:
+        cs_tags (list[str]): list of cs tags in the long format.
+
+    Returns:
+        list[deque[str]]: list of processed cs tags as deque objects.
+    """
+    cs_tags_splitted = []
+    for cs_tag in cs_tags:
+        # Remove the prefix "cs:Z:" if present
+        cs_tag = cs_tag.replace("cs:Z:", "")
+        # Split the cs tag using special symbols (-, *, ~, =)
+        split_tags = re.split(r"([-*~=])", cs_tag)[1:]
+        # Combine the symbol with the corresponding sequence
+        combined_tags = [symbol + seq for symbol, seq in zip(split_tags[0::2], split_tags[1::2])]
+        # Remove the "=" symbols, as they are not needed for further processing
+        cleaned_tags = [tag.replace("=", "") for tag in combined_tags]
+        # Further split the tags by the base letters (A, C, G, T)
+        further_split_tags = [re.split(r"(?=[ACGT])", tag) for tag in cleaned_tags]
+        # Remove any empty strings generated by the split
+        non_empty_tags = [[elem for elem in tag if elem] for tag in further_split_tags]
+        # Flatten the list of lists into a single list
+        flat_tags = list(chain.from_iterable(non_empty_tags))
+        cs_tags_splitted.append(deque(flat_tags))
+    return cs_tags_splitted
+
+
+def normalize_read_lengths(cs_list: list[deque[str]], starts: list[int]) -> list[deque[str]]:
+    """
+    Normalize the lengths of each read in cs_list based on their starts positions.
 
-    cs_list = []
-    for cs in CSTAG:
-        cs = cs.replace("cs:Z:", "")
-        cs = re.split(r"([-*~=])", cs)[1:]
-        cs = [i + j for i, j in zip(cs[0::2], cs[1::2])]
-        cs = [c.replace("=", "") for c in cs]
-        cs = [re.split(r"(?=[ACGT])", c) for c in cs]
-        cs = [list(filter(None, c)) for c in cs]
-        cs = list(chain.from_iterable(cs))
-        cs_list.append(deque(cs))
+    Args:
+        cs_list (list[deque[str]]): list of deques representing the reads.
+        starts (list[int]): Starting positions of each read.
 
+    Returns:
+        list[deque[str]]: list of deques representing the reads, now normalized to the same length.
+    """
     cs_maxlen = max(len(cs) + start for cs, start in zip(cs_list, starts))
-    for i, (cs, start) in enumerate(zip(cs_list, starts)):
-        if start:
+
+    for i, start in enumerate(starts):
+        if start > 0:
             cs_list[i].extendleft(["N"] * start)
         if len(cs_list[i]) < cs_maxlen:
             cs_list[i].extend(["N"] * (cs_maxlen - len(cs_list[i])))
 
-    def get_consensus(cs: tuple) -> str:
-        """
-        When it is multimodal, return the first **mutated** mode encountered
-        """
-        mostcommon = Counter(cs).most_common(1)
-        if len(mostcommon) == 1:
-            return mostcommon[0][0]
-        for key, val in mostcommon:
-            if not re.search(r"[ACGT]", key):
-                return key
-
-    cs_consensus = [get_consensus(cs) for cs in list(zip(*cs_list))]
+    return cs_list
+
+
+def get_consensus(cs_list: list[deque[str]]) -> str:
+    cs_consensus = []
+    for cs in zip(*cs_list):
+        # Get the most common cs tag(s)
+        most_common_tags = Counter(cs).most_common()
+
+        # If there's a unique most common tag, return it
+        most_common_tag, _ = most_common_tags[0]
+        if len(most_common_tags) == 1 or most_common_tags[0][1] != most_common_tags[1][1]:
+            cs_consensus.append(most_common_tag)
+            continue
+        # If the most common tag is not unique (multimodal), return the first *mutated* mode
+        for tag, _ in most_common_tags:
+            if not re.search(r"[ACGT]", tag):
+                cs_consensus.append(tag)
+
     cs_consensus = "".join(cs_consensus)
+    # Append "=" to [ACGTN]
+    return re.sub(r"([ACGTN]+)", r"=\1", cs_consensus)
+
+
+###########################################################
+# main
+###########################################################
+
+
+def consensus(cs_tags: list[str], cigars: list[str], positions: list[int], prefix: bool = False) -> str:
+    """generate consensus of cs tags
+    Args:
+        cs_tags (list): cs tags in the **long** format
+        cigars (list): cigars strings (6th column in SAM file)
+        positions (list): 1-based leftmost mapping position (4th column in SAM file)
+        prefix (bool, optional): Whether to add the prefix 'cs:Z:' to the cs tag. Defaults to False
+    Return:
+        str: a consensus of cs tag in the **long** format
+    Example:
+        >>> import cstag
+        >>> cs_tags = ["=ACGT", "=AC*gt=T", "=C*gt=T", "=C*gt=T", "=ACT+ccc=T"]
+        >>> cigars = ["4M","4M","1S3M", "3M", "3M3I1M"]
+        >>> positions = [6,6,6,7,6]
+        >>> cstag.consensus(cs_tags, cigars, positions)
+        =AC*gt=T
+    """
+    if not (len(cs_tags) == len(cigars) == len(positions) > 0):
+        raise ValueError("Element numbers of each argument must be the same")
+
+    for cs_tag in cs_tags:
+        validate_long_format(cs_tag)
+
+    # Calculate the starts positions for each read
+    starts = calculate_read_starts(positions, cigars)
+
+    cs_list = split_cs_tags(cs_tags)
+
+    # Normalize the lengths of each read
+    cs_list = normalize_read_lengths(cs_list, starts)
+
+    cs_consensus = get_consensus(cs_list)
 
-    return "cs:Z:" + re.sub(r"([ACGTN]+)", r"=\1", cs_consensus)
+    return f"cs:Z:{cs_consensus}" if prefix else cs_consensus
diff --git a/src/cstag/lengthen.py b/src/cstag/lengthen.py
@@ -1,4 +1,7 @@
+from __future__ import annotations
+
 import re
+from cstag.utils.validator import validate_short_format
 
 
 def lengthen(cs_tag: str, cigar: str, seq: str, prefix: bool = False) -> str:
@@ -20,9 +23,7 @@ def lengthen(cs_tag: str, cigar: str, seq: str, prefix: bool = False) -> str:
         >>> cstag.lengthen(cs, cigar, seq)
         =ACGT*ag=CGT
     """
-
-    if re.search(r"[ACGT]", cs_tag):
-        raise Exception("Error: input cs tag is not short format")
+    validate_short_format(cs_tag)
 
     cs_tag_split = re.split(r"([-+*~:])", cs_tag.replace("cs:Z:", ""))[1:]
     cs_tag_split = [i + j for i, j in zip(cs_tag_split[0::2], cs_tag_split[1::2])]
@@ -46,7 +47,4 @@ def lengthen(cs_tag: str, cigar: str, seq: str, prefix: bool = False) -> str:
             idx += len(cs) - 1
     cslong = "".join(cslong).replace(":", "=")
 
-    if prefix is True:
-        return "cs:Z:" + cslong
-    else:
-        return cslong
+    return f"cs:Z:{cslong}" if prefix else cslong
diff --git a/src/cstag/mask.py b/src/cstag/mask.py
@@ -1,29 +1,31 @@
+from __future__ import annotations
+
 import re
 
+from cstag.utils.validator import validate_long_format, validate_threshold
+
 
-def mask(cs_tag: str, cigar: str, qual: str, threshold: int = 10):
+def mask(cs_tag: str, cigar: str, qual: str, threshold: int = 10, prefix: bool = False) -> str:
     """Mask low-quality bases to 'N'
     Args:
         cs_tag (str): cs tag in the **long** format
         cigar (str): cigar strings (6th column in SAM file)
         qual (str): ASCII of Phred-scaled base quaiity+33 (11th column in SAM file)
-        threshold (int): optional: Phred Quality Score (defalt = 10). The low-quality bases are defined as 'less than or equal to the threshold'
+        threshold (int, optional): Phred Quality Score (defalt = 10). The low-quality bases are defined as 'less than or equal to the threshold'
+        prefix (bool, optional): Whether to add the prefix 'cs:Z:' to the cs tag. Defaults to False
     Return:
         str: Masked cs tag
     Example:
         >>> import cstag
-        >>> cs_tag = "cs:Z:=ACGT*ac+gg-cc=T"
+        >>> cs_tag = "=ACGT*ac+gg-cc=T"
         >>> cigar = "5M2I2D1M"
         >>> qual = "AA!!!!AA"
         >>> cstag.mask(cs_tag, qual)
-        cs:Z:=ACNN*an+ng-cc=T
+        =ACNN*an+ng-cc=T
     """
-    if not re.search(r"[ACGT]", cs_tag):
-        raise Exception("Error: cs tag must be a long format")
-    if not isinstance(threshold, int):
-        raise Exception("Error: threshold must be an integer")
-    if not 0 <= threshold <= 40:
-        raise Exception("Error: threshold must be within a range between 0 to 40")
+
+    validate_long_format(cs_tag)
+    validate_threshold(threshold)
 
     mask_symbols = [chr(th + 33) for th in range(threshold + 1)]
     mask_symbols = set(mask_symbols)
@@ -50,5 +52,6 @@ def mask(cs_tag: str, cigar: str, qual: str, threshold: int = 10):
                     cs[i + 1] = "N" if cs[0] == "=" else "n"
             idx += i + 1
         cs_masked.append("".join(cs))
+    cs_masked = "".join(cs_masked)
 
-    return "cs:Z:" + "".join(cs_masked)
+    return f"cs:Z:{cs_masked}" if prefix else cs_masked