1

CaiBiotech · Jun 30, 2023 · 5d00439 · 5d00439
1 parent 8c438e0
commit 5d00439
Show file tree

Hide file tree

Showing 10 changed files with 918,919 additions and 0 deletions.
diff --git a/example/ET_D.filted.indel.vcf b/example/ET_D.filted.indel.vcf
diff --git a/example/ET_GD.filted.indel.vcf b/example/ET_GD.filted.indel.vcf
diff --git a/example/ET_V1.filted.indel.vcf b/example/ET_V1.filted.indel.vcf
diff --git a/example/ET_V2.filted.indel.vcf b/example/ET_V2.filted.indel.vcf
diff --git a/example/genome.fa b/example/genome.fa
diff --git a/example/genome.fa.fai b/example/genome.fa.fai
diff --git a/example/strain1_markers.csv b/example/strain1_markers.csv
diff --git a/example/strain2_markers.csv b/example/strain2_markers.csv
diff --git a/indel_marker_detector.py b/indel_marker_detector.py
@@ -0,0 +1,110 @@
+import pysam
+import sys
+import re
+import csv
+import argparse
+from itertools import groupby
+
+def read_indel_file(file_path, min_indel_length=10):
+    indels = set()
+    vcf_in = pysam.VariantFile(file_path, "r")
+    for rec in vcf_in.fetch():
+        for alt in rec.alts:
+            indel_length = abs(len(alt) - len(rec.ref))
+            if indel_length >= min_indel_length:
+                indel_seq = alt if len(alt) > len(rec.ref) else rec.ref
+                indels.add((rec.chrom, rec.pos, indel_seq))
+    return indels
+
+def get_unique_indel_list(strain1_indels, strain2_indels, genome_file, marker_length_min, marker_length_max):
+    unique_indels = strain1_indels.symmetric_difference(strain2_indels)
+    genome = pysam.FastaFile(genome_file)
+
+    marker_list = []
+    for indel_chrom, indel_pos, indel_seq in unique_indels:
+        # Skip if indel sequence is a palindrome or has a repeated subsequence occupying more than 50% of the sequence
+        if is_palindrome(indel_seq) or is_simple_repeat(indel_seq):
+            continue
+
+        found_marker = False
+        for marker_length in range(marker_length_min, marker_length_max + 1):
+            start = indel_pos - marker_length // 2
+            end = indel_pos + marker_length // 2
+
+            # Ensure start is not negative
+            start = max(0, start)
+
+            seq = genome.fetch(region=indel_chrom, start=start, end=end)
+
+            if 'N' in seq:  # Skip sequences with 'N'
+                continue
+
+            if not is_palindrome(seq) and not is_simple_repeat(seq):
+                strain_with_indel = "strain1" if (indel_chrom, indel_pos, indel_seq) in strain1_indels else "strain2"
+                marker_list.append((indel_chrom, start, end, seq, strain_with_indel, indel_seq))
+                found_marker = True
+                break
+        if not found_marker:
+            print(f"No suitable marker found for indel at {indel_chrom}:{indel_pos}")
+    return marker_list
+
+def is_palindrome(seq):
+    seq_len = len(seq)
+    for i in range(seq_len // 2):
+        if seq[i] != seq[seq_len - 1 - i]:
+            return False
+    return True
+
+def is_simple_repeat(seq):
+    seq_len = len(seq)
+    min_subseq_len = 3  # Minimum length of a subsequence to be considered
+    max_same_char_stretch = max(len(list(g)) for _, g in groupby(seq))  # Length of the longest stretch of the same character
+    if max_same_char_stretch > seq_len * 0.5:  # If the longest stretch of the same character occupies more than 50% of the sequence
+        return True
+
+    for repeat_len in range(min_subseq_len, seq_len // 2 + 1):
+        # Check for repeated subsequences
+        for i in range(seq_len - repeat_len + 1):
+            subseq = seq[i:i+repeat_len]
+            if seq.count(subseq) * len(subseq) > seq_len * 0.5:  # If the repeated subsequence occupies more than 50% of the sequence
+                return True
+    return False
+
+def main(indel_files1, indel_files2, genome_file):
+    strain1_indels = set.intersection(*(read_indel_file(file) for file in indel_files1))
+    strain2_indels = set.intersection(*(read_indel_file(file) for file in indel_files2))
+    marker_list = get_unique_indel_list(strain1_indels, strain2_indels, genome_file, 300, 400)
+
+    strain1_markers = []
+    strain2_markers = []
+    for marker in marker_list:
+        chrom, start, end, seq, strain_with_indel, indel_seq = marker  # Corrected here
+        indel_pos = (start + end) // 2
+        marker_info = (chrom, start, end, indel_pos, seq, indel_seq)  # Corrected here
+        if strain_with_indel == "strain1":
+            strain1_markers.append(marker_info)
+        else:
+            strain2_markers.append(marker_info)
+
+    # Save strain1 markers to a CSV file
+    with open('strain1_markers.csv', 'w', newline='') as csvfile:
+        csvwriter = csv.writer(csvfile)
+        csvwriter.writerow(['Chromosome', 'Marker Start', 'Marker End', 'Indel Position', 'Marker Sequence', 'Indel Sequence'])
+        for marker in strain1_markers:
+            csvwriter.writerow(marker)
+
+    # Save strain2 markers to a CSV file
+    with open('strain2_markers.csv', 'w', newline='') as csvfile:
+        csvwriter = csv.writer(csvfile)
+        csvwriter.writerow(['Chromosome', 'Marker Start', 'Marker End', 'Indel Position', 'Marker Sequence', 'Indel Sequence'])
+        for marker in strain2_markers:
+            csvwriter.writerow(marker)
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description='This script detects unique indel markers from multiple VCF files for two different strains.')
+    parser.add_argument("--strain1", nargs='+', required=True, help="Specify one or more VCF files for strain 1.")
+    parser.add_argument("--strain2", nargs='+', required=True, help="Specify one or more VCF files for strain 2.")
+    parser.add_argument("--genome", required=True, help="Specify the genome file.")
+    args = parser.parse_args()
+
+    main(args.strain1, args.strain2, args.genome)
diff --git a/readme b/readme
@@ -0,0 +1,32 @@
+# Indel Marker Detector
+This Python script detects unique indel markers from multiple VCF files for two different strains. It uses the `pysam` library to read VCF files and the genome file, and the `argparse` library to handle command-line arguments.
+
+## Requirements
+- Python 3
+- pysam library
+
+## Usage
+Run the script with the following command:
+
+python indel_marker_detector.py --strain1 indel_file1.vcf.gz indel_file2.vcf.gz --strain2 indel_file3.vcf.gz indel_file4.vcf.gz --genome genome.fa
+
+## Command-Line Arguments
+- `--strain1`: Specify one or more VCF files for strain 1.
+- `--strain2`: Specify one or more VCF files for strain 2.
+- `--genome`: Specify the genome file.
+
+## Output
+The script outputs two CSV files, `strain1_markers.csv` and `strain2_markers.csv`, which contain the detected indel markers for strain 1 and strain 2, respectively. Each row in the CSV files represents a marker and includes the following columns: Chromosome, Marker Start, Marker End, Indel Position, Marker Sequence, Indel Sequence.
+
+## Here is a brief description of the script and its command-line arguments:
+usage: indel_marker_detector.py --strain1 VCF_FILES [VCF_FILES ...] --strain2 VCF_FILES [VCF_FILES ...] --genome GENOME_FILE
+
+This script detects unique indel markers from multiple VCF files for two different strains.
+
+optional arguments:
+  --strain1 VCF_FILES [VCF_FILES ...]
+                        Specify one or more VCF files for strain 1.
+  --strain2 VCF_FILES [VCF_FILES ...]
+                        Specify one or more VCF files for strain 2.
+  --genome GENOME_FILE
+                        Specify the genome file.