new (and improved tools) for single-cell discovery mode

HorvathLab · Mar 5, 2024 · b07b295 · b07b295
1 parent 8c05ee4
commit b07b295
Show file tree

Hide file tree

Showing 12 changed files with 1,137 additions and 79 deletions.
diff --git a/ReadCounts/src/varLoci.py b/ReadCounts/src/varLoci.py
@@ -10,20 +10,75 @@
 except NameError:
     pass
 
-from variantloci import VariantLociByPileup, VariantLociByFetch
+from collections import defaultdict
+from variantloci import VariantLociByFetch
+
+opt = defaultdict(lambda: False)
+while len(sys.argv) > 1 and sys.argv[1] in ("-v","-h"):
+   opt[sys.argv.pop(1)[1]] = True
+
+if len(sys.argv) <= 1 or opt['h']:
+    print("""
+Usage:
+  varLoci [ options ] <bamfile> [ parameters ]
+
+Parameters (with defaults):
+  maxedits=5             # Max. edits for acceptable read alignment
+  minbasequal=25         # Min. base quality score
+  mindist=5              # Min. number of bases between variants
+  minmappingqual=60      # Min. mapping quality for reads
+  minvarcnt=5            # Min. number of reads with variant
+  region=                # Restriction analysis to chrom. region
+  outfile=               # Place output in file, otherwise standard out
+
+Options:
+  -v 	Verbose
+  -h	Help
+""".strip())
+    sys.exit(0)
+
+verbose = opt['v']
 
 bamfilename = sys.argv[1]
-region = None
-minreads = 1
-if len(sys.argv) > 2:
-    minreads = int(sys.argv[2])
-if len(sys.argv) > 3:
-    region = sys.argv[3]
-    if region.strip() in ("", "-", "None"):
-        region = None
-
-# scan = VariantLociByPileup(bamfilename,region=region,minreads=minreads)
-scan = VariantLociByFetch(bamfilename,region=region,minreads=minreads)
+kwargs = dict(region = None, minvarcnt = 5, maxedits = 5, minbasequal = 25, minmappingqual = 60, mindist = 5, outfile="")
+for kv in sys.argv[2:]:
+    badsplit = False
+    try:
+        k,v = map(str.strip,kv.split('='))
+    except ValueError:
+        badsplit = True
+    if badsplit or not k or k not in kwargs:
+        raise RuntimeError("Bad parameter setting: "+kv)
+    try:
+        v = eval(v)
+    except:
+        pass
+    kwargs[k] = v
+
+if verbose:
+    print("Parameters:",file=sys.stderr)
+    for k,v in sorted(kwargs.items()):
+        if k in ('cellbarcodes','umibarcodes'):
+            print("  %s=%s"%(k,v._name),file=sys.stderr)
+        else:
+            if v == None:
+                print("  %s="%(k,),file=sys.stderr)
+            else:
+                print("  %s=%s"%(k,v),file=sys.stderr)
+    print(file=sys.stderr)
+
+outfile = kwargs.get('outfile')
+if 'outfile' in kwargs:
+    del kwargs['outfile']
+
+scan = VariantLociByFetch(bamfilename,**kwargs)
+if not outfile:
+    outfile = sys.stdout
+else:
+    outfile = open(outfile,'wt')
+print("CHROM","POS","REF","ALT",sep='\t',file=outfile)
 for chrom,pos,refnuc,altnuc,freq in scan.loci():
-    # print(chrom,pos,refnuc,altnuc,freq.get(refnuc,0),freq.get(altnuc,0),sep='\t')
     print(chrom,pos,refnuc,altnuc,sep='\t')
+
+if outfile != sys.stdout:
+    outfile.close()
diff --git a/SCReadCounts/data/dlexons.sh b/SCReadCounts/data/dlexons.sh
@@ -0,0 +1,24 @@
+#!/bin/sh
+if [ "$1" == "-h" ]; then
+  echo "Usage: $0 [ <EMBL-Release> ]" 1>&2
+  exit 1
+fi
+if [ "$1" == "" ]; then
+  REL1="current_gtf"
+else
+  REL1="release-$1/gtf"
+fi
+FN=`wget -q -O - "https://ftp.ensembl.org/pub/$REL1/homo_sapiens/" | sed -n 's/^.*href="\(.*.[0-9][0-9]*.gtf.gz\)".*$/\1/p'`
+REL2=`echo "$FN" | tr '.'  ' ' | awk '{print $(NF-2)}'` 
+ASS=`echo "$FN" | tr '.'  ' ' | awk '{print $(NF-3)}'` 
+URL="https://ftp.ensembl.org/pub/$REL1/homo_sapiens/Homo_sapiens.$ASS.$REL2.gtf.gz"
+echo "Downloading Homo_sapiens.$ASS.$REL2.gtf.gz" 1>&2
+wget -q -O - "$URL" | gunzip -c | \
+  grep -v '^#' | \
+  awk '$3 == "exon" {print $1,$4,$5}' | \
+  awk '$1 ~ /^([12]?[0-9]|X|Y|MT)$/' | \
+  sed -e 's/^X /100 /' -e 's/^Y /101 /' -e 's/^MT /102 /' | \
+  sort -k1n,1 -k2n,2 -k3n,3 | \
+  sed -e 's/^100 /X /' -e 's/^101 /Y /' -e 's/^102 /MT /' | \
+  tr ' ' '\t' | \
+  uniq 
diff --git a/SCReadCounts/data/singlecell.sh b/SCReadCounts/data/singlecell.sh
@@ -7,6 +7,10 @@ rm -f singlecell2-all-output.{tsv,*.matrix.tsv}
 
 set -x
 
+PREFIX="../bin/"
+# PREFIX="conda run -n HorvathLab --live-stream "
+SCRC="${PREFIX}scReadCounts"
+
 #
 # Equivalent to running these three commands...
 #
@@ -18,22 +22,22 @@ set -x
 #
 # readCountsMatrix -c "singlecell-output.tsv" -M VAF -m 5 -o "singlecell-output.vaf-m5.matrix.tsv"
 #
-../bin/scReadCounts -r singlecell_chr17.bam -s singlecell_222_5_chr17.txt -m 3 -o singlecell-output.tsv
+$SCRC -r singlecell_chr17.bam -s singlecell_222_5_chr17.txt -m 3 -o singlecell-output.tsv
 
 #
 # Regenerate the VAF matrix (different output name: singlecell-output.vaf-m5.matrix.tsv)
 #
-../bin/scReadCounts -r singlecell_chr17.bam -s singlecell_222_5_chr17.txt -m 5 -o singlecell-output.tsv
+$SCRC -r singlecell_chr17.bam -s singlecell_222_5_chr17.txt -m 5 -o singlecell-output.tsv
 
 #
 # STARsolo example
 #
-../bin/scReadCounts -r singlecell2_117.bam -s singlecell2_117_snvs.txt -m 5 -t 10 \
-                    -C STARsolo -U STARsolo -b singlecell2_117_barcodes.tsv -o singlecell2-output.tsv
+$SCRC -r singlecell2_117.bam -s singlecell2_117_snvs.txt -m 5 -t 10 \
+      -C STARsolo -U STARsolo -b singlecell2_117_barcodes.tsv -o singlecell2-output.tsv
 
 #
 # Override accept list for barcodes, accept all barcodes, directional output
 #
-../bin/scReadCounts -r singlecell2_117.bam -s singlecell2_117_snvs.txt -m 5 -t 10 \
-                    -C STARsolo -U STARsolo -b None -D -o singlecell2-all-output.tsv
+$SCRC -r singlecell2_117.bam -s singlecell2_117_snvs.txt -m 5 -t 10 \
+      -C STARsolo -U STARsolo -b None -D -o singlecell2-all-output.tsv
 
diff --git a/SCReadCounts/src/release.py b/SCReadCounts/src/release.py
@@ -1,7 +1,7 @@
 #!/bin/env python
 RELEASE = "SCReadCounts"
-VERSION = '1.3.2'
-PROGRAMS = 'readCountsMatrix.py scReadCounts.py readCounts.py varLoci.py scVarLoci.py'
+VERSION = '1.4.0'
+PROGRAMS = 'readCountsMatrix.py scReadCounts.py readCounts.py varLoci.py scVarLoci.py rcio.py scCountLoci.py exonicFilter.py'
 INCLUDES = 'common ReadCounts'
 if __name__ == '__main__':
     import sys

diff --git a/SCReadCounts/src/scCountLoci.py b/SCReadCounts/src/scCountLoci.py
@@ -26,21 +26,22 @@
 if len(sys.argv) <= 1 or opt['h']:
     print("""
 Usage:
-  scCountLoci [ options ] <bamfile> <snvlocifile> [ parameters ]
+  scCountLoci [ options ] <bamfile> [ <snvlocifile> | - ] [ parameters ]
 
 Parameters (with defaults):
   acceptlist=            # File of cell-barcodes to consider
   cellbarcodes=STARsolo  # Name of cell-barcode extraction strategy
-  maxedits=3             # Max. edits for acceptable read alignment
+  maxedits=5             # Max. edits for acceptable read alignment
   minbasequal=25         # Min. base quality score
   mincells=3             # Min. number of cells for cell and variant constraints
   minmappingqual=60      # Min. mapping quality for reads
-  minreadspercell=3      # Min. number of reads per cell
+  minreadspercell=0      # Min. number of reads per cell
   minumipercell=3        # Min. number of UMIs per cell
-  minvarreadspercell=3   # Min. number of variant reads per cell
+  minvarreadspercell=0   # Min. number of variant reads per cell
   minvarumipercell=3     # Min. number of variant UMIs per cell
   outfile=               # Place output in compact binary file
   outputumis=True        # Output UMI counts, not reads
+  outputcells=False      # Output cell counts, not UMI or read counts per cell
   umibarcodes=STARsolo   # Name of UMI extraction strategy
 
 Options:
@@ -53,17 +54,27 @@
 
 bf = sys.argv[1]
 sf = sys.argv[2]
-kwargs = {'minbasequal': 25, 'minmappingqual': 60, 'gapsize': 1000, 'maxedits': 3, 
+kwargs = {'minbasequal': 25, 'minmappingqual': 60, 'gapsize': 1000, 'maxedits': 5, 
           'cellbarcodes': 'STARsolo', 'mincells': 3, 'minvarumipercell': 3, 'minumipercell': 3,
-          'outfile': "", 'acceptlist': "", 'minvarreadspercell': 3, 'minreadspercell': 3, 'outputumis': True}
+          'acceptlist': "", 'minvarreadspercell': 0, 'minreadspercell': 0, 
+          'outputumis': True, 'outputcells': False, 'outfile': "", 'umibarcodes': None, }
 for kv in sys.argv[3:]:
-    k,v = kv.split('=')
+    badsplit = False
+    try:
+        k,v = map(str.strip,kv.split('='))
+    except ValueError:
+        badsplit = True
+    if badsplit or not k or k not in kwargs:
+        raise RuntimeError("Bad parameter setting: "+kv)
     try:
        v = eval(v)
     except:
        pass
     kwargs[k] = v
-snvs = [ l.split()[:4] for l in open(sf) ]
+if sf == "-":
+    snvs = [ l.split()[:4] for l in sys.stdin if not l.startswith('CHR') ]
+else:
+    snvs = [ l.split()[:4] for l in open(sf) if not l.startswith('CHR') ]
 
 groupFactory = ReadGroupFactory()
 groupOptions = [t[1] for t in groupFactory.list(type="CellBarcode")] + [""]
@@ -76,7 +87,7 @@
 for s,n,d in sorted(groupFactory.list(type="UMI")):
     umiMap[n] = s
 
-if not 'umibarcodes' in kwargs and not kwargs.get('umibarcodes',None):
+if not kwargs.get('umibarcodes',None):
     kwargs['umibarcodes'] = kwargs['cellbarcodes']
 if kwargs.get('acceptlist') != None:
     if kwargs.get('acceptlist') in (None,"","None","-"):
@@ -88,6 +99,7 @@
 kwargs['cellbarcodes'] = groupFactory.get(groupMap[kwargs['cellbarcodes']],readgroupparam)
 kwargs['umibarcodes'] = groupFactory.get(umiMap[kwargs['umibarcodes']])
 outputumis = kwargs.get('outputumis',True)
+outputcells = kwargs.get('outputcells',False)
 
 if verbose:
     print("Parameters:",file=sys.stderr)
@@ -103,17 +115,30 @@
     del kwargs['outfile']
 if 'outputumis' in kwargs:
     del kwargs['outputumis']
+if 'outputcells' in kwargs:
+    del kwargs['outputcells']
 
 start = time.time()
 locifreq = SCSelectedLociByFetch(bf,snvs,**kwargs)
-if not outfile:
-    writer = scrctxtwriter()
+if not outputcells:
+    if not outfile:
+        writer = scrctxtwriter()
+    else:
+        assert(outfile.endswith('.brc'))
+        writer = scrcbinwriter(outfile)
 else:
-    assert(outfile.endswith('.brc'))
-    writer = scrcbinwriter(outfile)
+    if not outfile:
+        writer = sys.stdout
+    else:
+        writer = open(outfile,'wt')
+    writer.write("\t".join(["CHROM","POS","REF","ALT","SNVCELLS","CELLS"])+"\n")
 for chr,pos,refalt,freq in locifreq.loci():
     ref = refalt[0]
     for alt in refalt[1:]:
+        cellcnt = 0
+        varcellcnt = 0
+        refcellcnt = 0
+        bothcellcnt = 0
         for cb in freq:
             allumi = set()
             vals = dict()
@@ -128,14 +153,36 @@
                 continue
             if totalreads < kwargs.get('minreadspercell'):
                 continue
+            cellcnt += 1
+            varbad = False
             if vals[alt] < kwargs.get('minvarumipercell'):
+                varbad = True
+            elif vals1[alt] < kwargs.get('minvarreadspercell'):
+                varbad = True
+            if not varbad:
+                varcellcnt += 1
+            refbad = False
+            if vals[ref] < kwargs.get('minvarumipercell'):
+                refbad = True
+            elif vals1[ref] < kwargs.get('minvarreadspercell'):
+                refbad = True
+            if not refbad:
+                refcellcnt += 1
+            if not refbad and not varbad:
+                bothcellcnt += 1
+            if varbad:
                 continue
-            if vals1[alt] < kwargs.get('minvarreadspercell'):
-                continue
-            if outputumis:
-                writer.writecounts(chr,pos,ref,alt,None,cb,vals[ref],vals[alt])
-            else:
-                writer.writecounts(chr,pos,ref,alt,None,cb,vals1[ref],vals1[alt])
-writer.close()
+            if not outputcells:
+                if outputumis:
+                    writer.writecounts(chr,pos,ref,alt,None,cb,vals[ref],vals[alt])
+                else:
+                    writer.writecounts(chr,pos,ref,alt,None,cb,vals1[ref],vals1[alt])
+        if outputcells:
+            writer.write("\t".join(map(str,[chr,pos,ref,alt,varcellcnt,cellcnt]))+"\n")
+if not outputcells:
+    writer.close()
+else:
+    if writer != sys.stdout:
+        writer.close()
 elapsed = time.time() - start
 # print("Elapsed: %d:%02d.%02d"%(int(elapsed//60),int(elapsed%60),int(round(100*(elapsed-int(elapsed))))))