Merge pull request #17 from akikuno/develop-v0.6.2

Develop v0.6.2
akikuno · Sep 14, 2023 · 07cee1a · 07cee1a
2 parents e07c8e5 + 93feaf8
commit 07cee1a
Show file tree

Hide file tree

Showing 18 changed files with 703 additions and 186 deletions.
diff --git a/.github/workflows/deploy_ghpages.yml b/.github/workflows/deploy_ghpages.yml
@@ -19,7 +19,7 @@ jobs:
       - name: Build
         run: |
           python -m pip install --upgrade pip
-          pip install pdoc3
+          python -m pip install pdoc3
           pdoc --html --output-dir tmp --force src/cstag
 
       - name: Deploy

diff --git a/README.md b/README.md
@@ -7,21 +7,25 @@
 
 # cstag
 
-`cstag` is a Python library designed for handling and manipulating [minimap2's CS tags](https://github.com/lh3/minimap2#cs).
+`cstag` is a Python library tailored for the manipulation and handling of [minimap2's CS tags](https://github.com/lh3/minimap2#cs).
 
-## 🌟Features
+
+## 🌟 Features
 
 - `cstag.call()`: Generate a CS tag
-- `cstag.shorten()`: Convert a CS tag from long to short format
-- `cstag.lengthen()`: Convert a CS tag from short to long format
-- `cstag.consensus()`: Generate a consensus cs tag from multiple cs tags
-- `cstag.mask()`: Mask low-quality bases in a CS tag
-- `cstag.split()`: Split a CS tag
-- `cstag.revcomp()`: Converts a CS tag into its reverse complement.
-- `cstag.to_vcf()`: Output VCF
-- `cstag.to_html()`: Output HTML
+- `cstag.shorten()`: Convert a CS tag from its long to short format
+- `cstag.lengthen()`: Convert a CS tag from its short to long format
+- `cstag.consensus()`: Create a consensus CS tag from multiple CS tags
+- `cstag.mask()`: Mask low-quality bases within a CS tag
+- `cstag.split()`: Break down a CS tag into its constituent parts
+- `cstag.revcomp()`: Convert a CS tag to its reverse complement
+- `cstag.to_sequence()`: Reconstruct a reference subsequence from the alignment
+- `cstag.to_vcf()`: Generate a VCF file
+- `cstag.to_html()`: Produce an HTML representation
+
+For comprehensive documentation, please visit [our docs](https://akikuno.github.io/cstag/cstag/).  
+To add CS tags to SAM/BAM files, check out [`cstag-cli`](https://github.com/akikuno/cstag-cli).  
 
-Visit the [documentation](https://akikuno.github.io/cstag/cstag/) for more details.
 
 ## 🛠 Installation
 
@@ -37,9 +41,10 @@ Using [Bioconda](https://anaconda.org/bioconda/cstag):
 conda install -c bioconda cstag
 ```
 
-## 💡Usage
+## 💡 Usage
+
+### Generating CS Tags
 
-### Generate CS Tags
 ```python
 import cstag
 
@@ -54,28 +59,28 @@ cstag.call(cigar, md, seq, long=True)
 # =AC*ag=TACGT-ag=ACGT+ac~nn3nn=G
 ```
 
-### Shorten or Lengthen CS Tags
+### Shortening or Lengthening CS Tags
 
 ```python
 import cstag
 
 # Convert a CS tag from long to short
-cs = "=ACGT*ag=CGT"
+cs_tag = "=ACGT*ag=CGT"
 
-cstag.shorten(cs)
+cstag.shorten(cs_tag)
 # :4*ag:3
 
 
 # Convert a CS tag from short to long
-cs = ":4*ag:3"
+cs_tag = ":4*ag:3"
 cigar = "8M"
 seq = "ACGTACGT"
 
-cstag.lengthen(cs, cigar, seq)
+cstag.lengthen(cs_tag, cigar, seq)
 # =ACGT*ag=CGT
 ```
 
-### Generate a Consensus
+### Creating a Consensus
 
 ```python
 import cstag
@@ -87,26 +92,26 @@ cstag.consensus(cs_tags, positions)
 # =AC*gt*T
 ```
 
-### Mask Low-Quality Bases
+### Masking Low-Quality Bases
 
 ```python
 import cstag
 
-cs = "=ACGT*ac+gg-cc=T"
+cs_tag = "=ACGT*ac+gg-cc=T"
 cigar = "5M2I2D1M"
 qual = "AA!!!!AA"
 phred_threshold = 10
-cstag.mask(cs, cigar, qual, phred_threshold)
+cstag.mask(cs_tag, cigar, qual, phred_threshold)
 # =ACNN*an+ng-cc=T
 ```
 
-### Split a CS Tag
+### Splitting a CS Tag
 
 ```python
 import cstag
 
-cs = "=ACGT*ac+gg-cc=T"
-cstag.split(cs)
+cs_tag = "=ACGT*ac+gg-cc=T"
+cstag.split(cs_tag)
 # ['=ACGT', '*ac', '+gg', '-cc', '=T']
 ```
 
@@ -115,19 +120,28 @@ cstag.split(cs)
 ```python
 import cstag
 
-cs = "=ACGT*ac+gg-cc=T"
-cstag.revcomp(cs)
+cs_tag = "=ACGT*ac+gg-cc=T"
+cstag.revcomp(cs_tag)
 # =A-gg+cc*tg=ACGT
 ```
 
-### Generate VCF Report
+### Reconstructing the Reference Subsequence
+
+```python
+import cstag
+cs_tag = "=AC*gt=T-gg=C+tt=A"
+cstag.to_sequence(cs_tag)
+# ACTTCTTA
+```
+
+### Generating a VCF Report
 
 ```python
 import cstag
 cs_tag = "=AC*gt=T-gg=C+tt=A"
 chrom = "chr1"
 pos = 1
-print(cstag.to_vcf(cstag, chrom, pos))
+print(cstag.to_vcf(cs_tag, chrom, pos))
 """
 ##fileformat=VCFv4.2
 #CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO
@@ -137,24 +151,46 @@ chr1	5	.	C	CTT	.	.	.
 """
 ```
 
-### Generate HTML Report
+The multiple CS tags enable reporting of the variant allele frequency (VAF).
+
+```python
+import cstag
+cs_tags = ["=ACGT", "=AC*gt=T", "=C*gt=T", "=ACGT", "=AC*gt=T"]
+chroms = ["chr1", "chr1", "chr1", "chr2", "chr2"]
+positions = [2, 2, 3, 10, 100]
+print(cstag.to_vcf(cs_tags, chroms, positions))
+"""
+##fileformat=VCFv4.2
+##INFO=<ID=DP,Number=1,Type=Integer,Description="Total Depth">
+##INFO=<ID=RD,Number=1,Type=Integer,Description="Depth of Ref allele">
+##INFO=<ID=AD,Number=1,Type=Integer,Description="Depth of Alt allele">
+##INFO=<ID=VAF,Number=1,Type=Float,Description="Variant allele frequency (AD/DP)">
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO
+chr1	4	.	G	T	.	.	DP=3;RD=1;AD=2;VAF=0.667
+chr2	102	.	G	T	.	.	DP=1;RD=0;AD=1;VAF=1.0
+"""
+```
+
+### Generating an HTML Report
 
 ```python
 import cstag
 from pathlib import Path
 
-cs_tag = "=AC+GGG=T-ACGT*at~gt10cg=GNNN"
+cs_tag = "=AC+ggg=T-acgt*at~gt10ag=GNNN"
 description = "Example"
 
 cs_tag_html = cstag.to_html(cs_tag, description)
 Path("report.html").write_text(cs_tag_html)
 # Output "report.html"
 ```
-The resulting `report.html` looks like this :point_down:
 
-<img width="414" alt="example_report" src="https://user-images.githubusercontent.com/15861316/158910398-67f480d2-8742-412a-b528-40e545c46513.png">
+You can visualize mutations indicated by the CS tag using the generated `report.html` file as shown below:
+
+<img width="511" alt="image" src="https://user-images.githubusercontent.com/15861316/265405607-a3cc1b76-f6a2-441d-b282-6f2dc06fc03d.png">
+
 
-## 📣Feedback
+## 📣 Feedback and Support
 
-For questions, bug reports, or any other inquiries, feel free to reach out!  
-Please use [GitHub Issues](https://github.com/akikuno/cstag/issues) for reporting.
+For questions, bug reports, or other forms of feedback, we'd love to hear from you!  
+Please use [GitHub Issues](https://github.com/akikuno/cstag/issues) for all reporting purposes.  
diff --git a/pyproject.toml b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "poetry.core.masonry.api"
 
 [tool.poetry]
 name = "cstag"
-version = "0.6.1"
+version = "0.6.2"
 description = "Python module to manipulate the minimap2's CS tag"
 authors = ["Akihiro Kuno <[email protected]>"]
 homepage = "https://github.com/akikuno/cstag"

diff --git a/src/cstag/__init__.py b/src/cstag/__init__.py
@@ -11,3 +11,4 @@
 from cstag.revcomp import revcomp
 from cstag.to_html import to_html
 from cstag.to_vcf import to_vcf
+from cstag.to_sequence import to_sequence
diff --git a/src/cstag/consensus.py b/src/cstag/consensus.py
@@ -4,24 +4,24 @@
 from itertools import chain
 from collections import deque, Counter
 
-from cstag.utils.validator import validate_long_format
+from cstag.utils.validator import validate_cs_tag, validate_long_format
 
 
-def split_cs_tags(cs_tags: list[str]) -> list[deque[str]]:
+def split_cs_tags(cs_tags: list[str]) -> list[list[str]]:
     """
-    Split and process each cs tag in cs_tags.
+    Split and process each CS tag in cs_tags.
 
     Args:
-        cs_tags (list[str]): list of cs tags in the long format.
+        cs_tags (list[str]): list of CS tags in the long format.
 
     Returns:
-        list[deque[str]]: list of processed cs tags as deque objects.
+        list[list[str]]: list of processed CS tags.
     """
     cs_tags_splitted = []
     for cs_tag in cs_tags:
         # Remove the prefix "cs:Z:" if present
         cs_tag = cs_tag.replace("cs:Z:", "")
-        # Split the cs tag using special symbols (-, *, ~, =)
+        # Split the CS tag using special symbols (-, *, ~, =)
         split_tags = re.split(r"([-*~=])", cs_tag)[1:]
         # Combine the symbol with the corresponding sequence
         combined_tags = [symbol + seq for symbol, seq in zip(split_tags[0::2], split_tags[1::2])]
@@ -33,36 +33,47 @@ def split_cs_tags(cs_tags: list[str]) -> list[deque[str]]:
         non_empty_tags = [[elem for elem in tag if elem] for tag in further_split_tags]
         # Flatten the list of lists into a single list
         flat_tags = list(chain.from_iterable(non_empty_tags))
-        cs_tags_splitted.append(deque(flat_tags))
+        cs_tags_splitted.append(flat_tags)
     return cs_tags_splitted
 
 
-def normalize_read_lengths(cs_list: list[deque[str]], starts: list[int]) -> list[deque[str]]:
+def normalize_positions(positions: list[int]) -> list[int]:
     """
-    Normalize the lengths of each read in cs_list based on their starts positions.
+    Normalize the positions in the given list by shifting them so that the minimum position becomes zero.
+    """
+    pos_min = min(positions)
+    return [pos - pos_min for pos in positions]
+
+
+def normalize_read_lengths(cs_tags: list[str], positions: list[int]) -> list[list[str]]:
+    """
+    Normalize the lengths of each read in cs_tags based on their starts positions.
 
     Args:
-        cs_list (list[deque[str]]): list of deques representing the reads.
-        starts (list[int]): Starting positions of each read.
+        cs_tags (list[str]): list of CS tags.
+        positions (list[int]): Starting positions of each read.
 
     Returns:
-        list[deque[str]]: list of deques representing the reads, now normalized to the same length.
+        list[list[str]]: list of lists representing the reads, now normalized to the same length.
     """
-    cs_maxlen = max(len(cs) + start for cs, start in zip(cs_list, starts))
+    cs_tags_split = split_cs_tags(cs_tags)
+    cs_tags_deque = [deque(cs) for cs in cs_tags_split]
+    positions_normalized = normalize_positions(positions)
+    cs_maxlen = max(len(cs) + pos for cs, pos in zip(cs_tags_deque, positions_normalized))
 
-    for i, start in enumerate(starts):
-        if start > 0:
-            cs_list[i].extendleft(["N"] * start)
-        if len(cs_list[i]) < cs_maxlen:
-            cs_list[i].extend(["N"] * (cs_maxlen - len(cs_list[i])))
+    for i, pos in enumerate(positions_normalized):
+        if pos > 0:
+            cs_tags_deque[i].extendleft(["N"] * pos)
+        if len(cs_tags_deque[i]) < cs_maxlen:
+            cs_tags_deque[i].extend(["N"] * (cs_maxlen - len(cs_tags_deque[i])))
+    cs_tags = [list(cs) for cs in cs_tags_deque]
+    return cs_tags
 
-    return cs_list
 
-
-def get_consensus(cs_list: list[deque[str]]) -> str:
+def get_consensus(cs_tags: list[list[str]]) -> str:
     cs_consensus = []
-    for cs in zip(*cs_list):
-        # Get the most common cs tag(s)
+    for cs in zip(*cs_tags):
+        # Get the most common CS tag(s)
         most_common_tags = Counter(cs).most_common()
 
         # If there's a unique most common tag, return it
@@ -86,13 +97,13 @@ def get_consensus(cs_list: list[deque[str]]) -> str:
 
 
 def consensus(cs_tags: list[str], positions: list[int], prefix: bool = False) -> str:
-    """generate consensus of cs tags
+    """generate consensus of CS tags
     Args:
-        cs_tags (list): cs tags in the **long** format
+        cs_tags (list): CS tags in the **long** format
         positions (list): 1-based leftmost mapping position (4th column in SAM file)
-        prefix (bool, optional): Whether to add the prefix 'cs:Z:' to the cs tag. Defaults to False
+        prefix (bool, optional): Whether to add the prefix 'cs:Z:' to the CS tag. Defaults to False
     Return:
-        str: a consensus of cs tag in the **long** format
+        str: a consensus of CS tag in the **long** format
     Example:
         >>> import cstag
         >>> cs_tags = ["=ACGT", "=AC*gt=T", "=C*gt=T", "=C*gt=T", "=ACT+ccc=T"]
@@ -104,13 +115,10 @@ def consensus(cs_tags: list[str], positions: list[int], prefix: bool = False) ->
         raise ValueError("Element numbers of each argument must be the same")
 
     for cs_tag in cs_tags:
+        validate_cs_tag(cs_tag)
         validate_long_format(cs_tag)
 
-    cs_tag_split = split_cs_tags(cs_tags)
-
-    positions_zero_indexed = [pos - 1 for pos in positions]
-
-    cs_tags_normalized_length = normalize_read_lengths(cs_tag_split, positions_zero_indexed)
+    cs_tags_normalized_length = normalize_read_lengths(cs_tags, positions)
 
     cs_consensus = get_consensus(cs_tags_normalized_length)
 

diff --git a/src/cstag/lengthen.py b/src/cstag/lengthen.py
@@ -1,7 +1,7 @@
 from __future__ import annotations
 
 import re
-from cstag.utils.validator import validate_short_format
+from cstag.utils.validator import validate_cs_tag, validate_short_format
 
 
 def lengthen(cs_tag: str, cigar: str, seq: str, prefix: bool = False) -> str:
@@ -23,6 +23,7 @@ def lengthen(cs_tag: str, cigar: str, seq: str, prefix: bool = False) -> str:
         >>> cstag.lengthen(cs, cigar, seq)
         =ACGT*ag=CGT
     """
+    validate_cs_tag(cs_tag)
     validate_short_format(cs_tag)
 
     cs_tag_split = re.split(r"([-+*~:])", cs_tag.replace("cs:Z:", ""))[1:]

diff --git a/src/cstag/mask.py b/src/cstag/mask.py
@@ -2,7 +2,7 @@
 
 import re
 
-from cstag.utils.validator import validate_long_format, validate_threshold
+from cstag.utils.validator import validate_cs_tag, validate_long_format, validate_threshold
 
 
 def mask(cs_tag: str, cigar: str, qual: str, threshold: int = 10, prefix: bool = False) -> str:
@@ -23,7 +23,7 @@ def mask(cs_tag: str, cigar: str, qual: str, threshold: int = 10, prefix: bool =
         >>> cstag.mask(cs_tag, qual)
         =ACNN*an+ng-cc=T
     """
-
+    validate_cs_tag(cs_tag)
     validate_long_format(cs_tag)
     validate_threshold(threshold)