-
Notifications
You must be signed in to change notification settings - Fork 0
/
collapse.py
115 lines (104 loc) · 5.08 KB
/
collapse.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
#! /usr/bin/python3.4
import sys
import numpy as np
from numpy import count_nonzero
# The purpose of this program is to identify and correct cell barcodes that are generated by incomplete extension
# during oligonucleotide synthesis. Cell barcodes should be 12 nts long, but incomplete extension can result in cell
# barcodes that are 11 nts long, causing the apparent terminal nucleotide of the corresponding UMI to be 'T' due to
# premature synthesis of oligo(dT). The program identifies these cell barcodes by quantifying the fractional use of "T"
# at the terminal nucleotide of the UMI. It the collapses the four resulting incorrect 12 nt cell barcodes to a single
# corrected 11 nt cell barcode. The program also corrects the UMI sequence.
addresscts_INFILE = sys.argv[1] # input file containing UMI-collapsed reads
collapse_OUTFILE = sys.argv[2] # output file with corrected cell barcodes
trackbcs_OUTFILE = sys.argv[3] # output file tracking altered barcode sequences
technology = sys.argv[4] # 1 if 10x v2 library, 0 if drop-seq library
if technology == 'DropSeqv1':
umipos = 7
cellbcpos = 11
cellbclen = 12
elif technology in ['10xv2','CiteSeq5v2']:
umipos = 9
cellbcpos = 15
cellbclen = 16
elif technology in ['DropSeqv2','PearSeq']:
umipos = 8
cellbcpos = 11
cellbclen = 12
elif technology in ['10xv3','CiteSeq3v3','CiteSeqTSB']:
umipos = 11
cellbcpos = 15
cellbclen = 16
cellbcs = {} # dictionary with cell barcode sequence keys and addresscts_INFILE line values
cellbcsT = {} # dictionary with cell barcode sequence keys and number of UMI terminal nts that are 'T' as values
with open(addresscts_INFILE) as f:
for line in f:
llist = line.split()
cellbc = llist[0] # cell barcode
if cellbc.find('N') == -1 and cellbc.find('GGGGG') == -1: # discard cell barcodes containing 'N' or G5
if cellbc in cellbcs.keys():
cellbcs[cellbc].append(line)
if llist[1][umipos] == 'T': # if the terminal nt of the UMI is 'T'
cellbcsT[cellbc]+=1
else:
cellbcs[cellbc]=[line]
if llist[1][umipos] == 'T':
cellbcsT[cellbc]=1
else:
cellbcsT[cellbc]=0
print(len(cellbcs.keys()))
alteredbcs = [] # list of tuples of old and corrected barcodes altered by the program (old,new)
for cellbc in cellbcsT.keys():
if cellbc in cellbcs.keys():
N = len(cellbcs[cellbc])
if N > 19: # only consider barcodes with at least 20 apparent molecules for accurate quantification of terminal 'T' nts
fracT = float(cellbcsT[cellbc])/float(N)
if fracT > 0.9: # if over 90% of UMI terminal nts are 'T'
newcellbc = cellbc[0:cellbcpos] # corrected cell barcode is the first 11 nts of the old one
oldcellbcA = newcellbc+'A' # incorrect cell barcode will have four possible variants, one for each nt
oldcellbcG = newcellbc+'G'
oldcellbcT = newcellbc+'T'
oldcellbcC = newcellbc+'C'
newtot = []
if oldcellbcA in cellbcs: # consider 'A' terminated incorrect cell barcode
fracTbcA = float(cellbcsT[oldcellbcA])/float(len(cellbcs[oldcellbcA]))
if fracTbcA > 0.9: # collapse if over 90% of UMI terminal nts are 'T'
newtot.extend(cellbcs[oldcellbcA]) # grab all the address lines for the old barcode
del cellbcs[oldcellbcA] # delete entry for old barcode sequence
alteredbcs.append((oldcellbcA,newcellbc)) # track modified barcode sequence
if oldcellbcG in cellbcs:
fracTbcG = float(cellbcsT[oldcellbcG])/float(len(cellbcs[oldcellbcG]))
if fracTbcG > 0.9:
newtot.extend(cellbcs[oldcellbcG])
del cellbcs[oldcellbcG]
alteredbcs.append((oldcellbcG,newcellbc))
if oldcellbcT in cellbcs:
fracTbcT = float(cellbcsT[oldcellbcT])/float(len(cellbcs[oldcellbcT]))
if fracTbcT > 0.9:
newtot.extend(cellbcs[oldcellbcT])
del cellbcs[oldcellbcT]
alteredbcs.append((oldcellbcT,newcellbc))
if oldcellbcC in cellbcs:
fracTbcC = float(cellbcsT[oldcellbcC])/float(len(cellbcs[oldcellbcC]))
if fracTbcC > 0.9:
newtot.extend(cellbcs[oldcellbcC])
del cellbcs[oldcellbcC]
alteredbcs.append((oldcellbcC,newcellbc))
cellbcs[newcellbc] = newtot # collapse all old barcode address lines to new barcode
print(len(cellbcs.keys()))
collapse_OUTPUT = open(collapse_OUTFILE,'w')
for cellbc in cellbcs.keys():
if len(cellbc) == cellbclen: # if barcode is uncorrected (12 or 16 nts), output address lines
for line in cellbcs[cellbc]:
collapse_OUTPUT.write(line)
else: # if barcode is corrected (11 nts), correct barcode sequence and UMI sequence in each address line and output
for line in cellbcs[cellbc]:
llist = line.split() # cell barcode is first 11 nts of old cell barcode, last nt of old cell barcode is first nt of UMI, first 7 nts of old UMI is last 7 nts of new UMI
newline = llist[0][0:cellbcpos]+'\t'+llist[0][cellbcpos]+llist[1][0:umipos]+'\t'+llist[2]+'\t'+llist[3]+'\n'
collapse_OUTPUT.write(newline)
collapse_OUTPUT.close()
trackbcs_OUTPUT = open(trackbcs_OUTFILE,'w')
for alteredbc in alteredbcs: # output old and new sequences of altered barcodes to a file
bc_old = alteredbc[0]
bc_new = alteredbc[1]
trackbcs_OUTPUT.write('%(bc_old)s\t%(bc_new)s\n' % vars())
trackbcs_OUTPUT.close()