Skip to content

Commit

Permalink
new (and improved tools) for single-cell discovery mode
Browse files Browse the repository at this point in the history
  • Loading branch information
edwardsnj committed Mar 5, 2024
1 parent 8c05ee4 commit b07b295
Show file tree
Hide file tree
Showing 12 changed files with 1,137 additions and 79 deletions.
81 changes: 68 additions & 13 deletions ReadCounts/src/varLoci.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,20 +10,75 @@
except NameError:
pass

from variantloci import VariantLociByPileup, VariantLociByFetch
from collections import defaultdict
from variantloci import VariantLociByFetch

opt = defaultdict(lambda: False)
while len(sys.argv) > 1 and sys.argv[1] in ("-v","-h"):
opt[sys.argv.pop(1)[1]] = True

if len(sys.argv) <= 1 or opt['h']:
print("""
Usage:
varLoci [ options ] <bamfile> [ parameters ]
Parameters (with defaults):
maxedits=5 # Max. edits for acceptable read alignment
minbasequal=25 # Min. base quality score
mindist=5 # Min. number of bases between variants
minmappingqual=60 # Min. mapping quality for reads
minvarcnt=5 # Min. number of reads with variant
region= # Restriction analysis to chrom. region
outfile= # Place output in file, otherwise standard out
Options:
-v Verbose
-h Help
""".strip())
sys.exit(0)

verbose = opt['v']

bamfilename = sys.argv[1]
region = None
minreads = 1
if len(sys.argv) > 2:
minreads = int(sys.argv[2])
if len(sys.argv) > 3:
region = sys.argv[3]
if region.strip() in ("", "-", "None"):
region = None

# scan = VariantLociByPileup(bamfilename,region=region,minreads=minreads)
scan = VariantLociByFetch(bamfilename,region=region,minreads=minreads)
kwargs = dict(region = None, minvarcnt = 5, maxedits = 5, minbasequal = 25, minmappingqual = 60, mindist = 5, outfile="")
for kv in sys.argv[2:]:
badsplit = False
try:
k,v = map(str.strip,kv.split('='))
except ValueError:
badsplit = True
if badsplit or not k or k not in kwargs:
raise RuntimeError("Bad parameter setting: "+kv)
try:
v = eval(v)
except:
pass
kwargs[k] = v

if verbose:
print("Parameters:",file=sys.stderr)
for k,v in sorted(kwargs.items()):
if k in ('cellbarcodes','umibarcodes'):
print(" %s=%s"%(k,v._name),file=sys.stderr)
else:
if v == None:
print(" %s="%(k,),file=sys.stderr)
else:
print(" %s=%s"%(k,v),file=sys.stderr)
print(file=sys.stderr)

outfile = kwargs.get('outfile')
if 'outfile' in kwargs:
del kwargs['outfile']

scan = VariantLociByFetch(bamfilename,**kwargs)
if not outfile:
outfile = sys.stdout
else:
outfile = open(outfile,'wt')
print("CHROM","POS","REF","ALT",sep='\t',file=outfile)
for chrom,pos,refnuc,altnuc,freq in scan.loci():
# print(chrom,pos,refnuc,altnuc,freq.get(refnuc,0),freq.get(altnuc,0),sep='\t')
print(chrom,pos,refnuc,altnuc,sep='\t')

if outfile != sys.stdout:
outfile.close()
24 changes: 24 additions & 0 deletions SCReadCounts/data/dlexons.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
#!/bin/sh
if [ "$1" == "-h" ]; then
echo "Usage: $0 [ <EMBL-Release> ]" 1>&2
exit 1
fi
if [ "$1" == "" ]; then
REL1="current_gtf"
else
REL1="release-$1/gtf"
fi
FN=`wget -q -O - "https://ftp.ensembl.org/pub/$REL1/homo_sapiens/" | sed -n 's/^.*href="\(.*.[0-9][0-9]*.gtf.gz\)".*$/\1/p'`
REL2=`echo "$FN" | tr '.' ' ' | awk '{print $(NF-2)}'`
ASS=`echo "$FN" | tr '.' ' ' | awk '{print $(NF-3)}'`
URL="https://ftp.ensembl.org/pub/$REL1/homo_sapiens/Homo_sapiens.$ASS.$REL2.gtf.gz"
echo "Downloading Homo_sapiens.$ASS.$REL2.gtf.gz" 1>&2
wget -q -O - "$URL" | gunzip -c | \
grep -v '^#' | \
awk '$3 == "exon" {print $1,$4,$5}' | \
awk '$1 ~ /^([12]?[0-9]|X|Y|MT)$/' | \
sed -e 's/^X /100 /' -e 's/^Y /101 /' -e 's/^MT /102 /' | \
sort -k1n,1 -k2n,2 -k3n,3 | \
sed -e 's/^100 /X /' -e 's/^101 /Y /' -e 's/^102 /MT /' | \
tr ' ' '\t' | \
uniq
16 changes: 10 additions & 6 deletions SCReadCounts/data/singlecell.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,10 @@ rm -f singlecell2-all-output.{tsv,*.matrix.tsv}

set -x

PREFIX="../bin/"
# PREFIX="conda run -n HorvathLab --live-stream "
SCRC="${PREFIX}scReadCounts"

#
# Equivalent to running these three commands...
#
Expand All @@ -18,22 +22,22 @@ set -x
#
# readCountsMatrix -c "singlecell-output.tsv" -M VAF -m 5 -o "singlecell-output.vaf-m5.matrix.tsv"
#
../bin/scReadCounts -r singlecell_chr17.bam -s singlecell_222_5_chr17.txt -m 3 -o singlecell-output.tsv
$SCRC -r singlecell_chr17.bam -s singlecell_222_5_chr17.txt -m 3 -o singlecell-output.tsv

#
# Regenerate the VAF matrix (different output name: singlecell-output.vaf-m5.matrix.tsv)
#
../bin/scReadCounts -r singlecell_chr17.bam -s singlecell_222_5_chr17.txt -m 5 -o singlecell-output.tsv
$SCRC -r singlecell_chr17.bam -s singlecell_222_5_chr17.txt -m 5 -o singlecell-output.tsv

#
# STARsolo example
#
../bin/scReadCounts -r singlecell2_117.bam -s singlecell2_117_snvs.txt -m 5 -t 10 \
-C STARsolo -U STARsolo -b singlecell2_117_barcodes.tsv -o singlecell2-output.tsv
$SCRC -r singlecell2_117.bam -s singlecell2_117_snvs.txt -m 5 -t 10 \
-C STARsolo -U STARsolo -b singlecell2_117_barcodes.tsv -o singlecell2-output.tsv

#
# Override accept list for barcodes, accept all barcodes, directional output
#
../bin/scReadCounts -r singlecell2_117.bam -s singlecell2_117_snvs.txt -m 5 -t 10 \
-C STARsolo -U STARsolo -b None -D -o singlecell2-all-output.tsv
$SCRC -r singlecell2_117.bam -s singlecell2_117_snvs.txt -m 5 -t 10 \
-C STARsolo -U STARsolo -b None -D -o singlecell2-all-output.tsv

4 changes: 2 additions & 2 deletions SCReadCounts/src/release.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/bin/env python
RELEASE = "SCReadCounts"
VERSION = '1.3.2'
PROGRAMS = 'readCountsMatrix.py scReadCounts.py readCounts.py varLoci.py scVarLoci.py'
VERSION = '1.4.0'
PROGRAMS = 'readCountsMatrix.py scReadCounts.py readCounts.py varLoci.py scVarLoci.py rcio.py scCountLoci.py exonicFilter.py'
INCLUDES = 'common ReadCounts'
if __name__ == '__main__':
import sys
Expand Down
87 changes: 67 additions & 20 deletions SCReadCounts/src/scCountLoci.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,21 +26,22 @@
if len(sys.argv) <= 1 or opt['h']:
print("""
Usage:
scCountLoci [ options ] <bamfile> <snvlocifile> [ parameters ]
scCountLoci [ options ] <bamfile> [ <snvlocifile> | - ] [ parameters ]
Parameters (with defaults):
acceptlist= # File of cell-barcodes to consider
cellbarcodes=STARsolo # Name of cell-barcode extraction strategy
maxedits=3 # Max. edits for acceptable read alignment
maxedits=5 # Max. edits for acceptable read alignment
minbasequal=25 # Min. base quality score
mincells=3 # Min. number of cells for cell and variant constraints
minmappingqual=60 # Min. mapping quality for reads
minreadspercell=3 # Min. number of reads per cell
minreadspercell=0 # Min. number of reads per cell
minumipercell=3 # Min. number of UMIs per cell
minvarreadspercell=3 # Min. number of variant reads per cell
minvarreadspercell=0 # Min. number of variant reads per cell
minvarumipercell=3 # Min. number of variant UMIs per cell
outfile= # Place output in compact binary file
outputumis=True # Output UMI counts, not reads
outputcells=False # Output cell counts, not UMI or read counts per cell
umibarcodes=STARsolo # Name of UMI extraction strategy
Options:
Expand All @@ -53,17 +54,27 @@

bf = sys.argv[1]
sf = sys.argv[2]
kwargs = {'minbasequal': 25, 'minmappingqual': 60, 'gapsize': 1000, 'maxedits': 3,
kwargs = {'minbasequal': 25, 'minmappingqual': 60, 'gapsize': 1000, 'maxedits': 5,
'cellbarcodes': 'STARsolo', 'mincells': 3, 'minvarumipercell': 3, 'minumipercell': 3,
'outfile': "", 'acceptlist': "", 'minvarreadspercell': 3, 'minreadspercell': 3, 'outputumis': True}
'acceptlist': "", 'minvarreadspercell': 0, 'minreadspercell': 0,
'outputumis': True, 'outputcells': False, 'outfile': "", 'umibarcodes': None, }
for kv in sys.argv[3:]:
k,v = kv.split('=')
badsplit = False
try:
k,v = map(str.strip,kv.split('='))
except ValueError:
badsplit = True
if badsplit or not k or k not in kwargs:
raise RuntimeError("Bad parameter setting: "+kv)
try:
v = eval(v)
except:
pass
kwargs[k] = v
snvs = [ l.split()[:4] for l in open(sf) ]
if sf == "-":
snvs = [ l.split()[:4] for l in sys.stdin if not l.startswith('CHR') ]
else:
snvs = [ l.split()[:4] for l in open(sf) if not l.startswith('CHR') ]

groupFactory = ReadGroupFactory()
groupOptions = [t[1] for t in groupFactory.list(type="CellBarcode")] + [""]
Expand All @@ -76,7 +87,7 @@
for s,n,d in sorted(groupFactory.list(type="UMI")):
umiMap[n] = s

if not 'umibarcodes' in kwargs and not kwargs.get('umibarcodes',None):
if not kwargs.get('umibarcodes',None):
kwargs['umibarcodes'] = kwargs['cellbarcodes']
if kwargs.get('acceptlist') != None:
if kwargs.get('acceptlist') in (None,"","None","-"):
Expand All @@ -88,6 +99,7 @@
kwargs['cellbarcodes'] = groupFactory.get(groupMap[kwargs['cellbarcodes']],readgroupparam)
kwargs['umibarcodes'] = groupFactory.get(umiMap[kwargs['umibarcodes']])
outputumis = kwargs.get('outputumis',True)
outputcells = kwargs.get('outputcells',False)

if verbose:
print("Parameters:",file=sys.stderr)
Expand All @@ -103,17 +115,30 @@
del kwargs['outfile']
if 'outputumis' in kwargs:
del kwargs['outputumis']
if 'outputcells' in kwargs:
del kwargs['outputcells']

start = time.time()
locifreq = SCSelectedLociByFetch(bf,snvs,**kwargs)
if not outfile:
writer = scrctxtwriter()
if not outputcells:
if not outfile:
writer = scrctxtwriter()
else:
assert(outfile.endswith('.brc'))
writer = scrcbinwriter(outfile)
else:
assert(outfile.endswith('.brc'))
writer = scrcbinwriter(outfile)
if not outfile:
writer = sys.stdout
else:
writer = open(outfile,'wt')
writer.write("\t".join(["CHROM","POS","REF","ALT","SNVCELLS","CELLS"])+"\n")
for chr,pos,refalt,freq in locifreq.loci():
ref = refalt[0]
for alt in refalt[1:]:
cellcnt = 0
varcellcnt = 0
refcellcnt = 0
bothcellcnt = 0
for cb in freq:
allumi = set()
vals = dict()
Expand All @@ -128,14 +153,36 @@
continue
if totalreads < kwargs.get('minreadspercell'):
continue
cellcnt += 1
varbad = False
if vals[alt] < kwargs.get('minvarumipercell'):
varbad = True
elif vals1[alt] < kwargs.get('minvarreadspercell'):
varbad = True
if not varbad:
varcellcnt += 1
refbad = False
if vals[ref] < kwargs.get('minvarumipercell'):
refbad = True
elif vals1[ref] < kwargs.get('minvarreadspercell'):
refbad = True
if not refbad:
refcellcnt += 1
if not refbad and not varbad:
bothcellcnt += 1
if varbad:
continue
if vals1[alt] < kwargs.get('minvarreadspercell'):
continue
if outputumis:
writer.writecounts(chr,pos,ref,alt,None,cb,vals[ref],vals[alt])
else:
writer.writecounts(chr,pos,ref,alt,None,cb,vals1[ref],vals1[alt])
writer.close()
if not outputcells:
if outputumis:
writer.writecounts(chr,pos,ref,alt,None,cb,vals[ref],vals[alt])
else:
writer.writecounts(chr,pos,ref,alt,None,cb,vals1[ref],vals1[alt])
if outputcells:
writer.write("\t".join(map(str,[chr,pos,ref,alt,varcellcnt,cellcnt]))+"\n")
if not outputcells:
writer.close()
else:
if writer != sys.stdout:
writer.close()
elapsed = time.time() - start
# print("Elapsed: %d:%02d.%02d"%(int(elapsed//60),int(elapsed%60),int(round(100*(elapsed-int(elapsed))))))
Loading

0 comments on commit b07b295

Please sign in to comment.