collapse.py

#! /usr/bin/python3.4
import sys
import numpy as np
from numpy import count_nonzero

# The purpose of this program is to identify and correct cell barcodes that are generated by incomplete extension
# during oligonucleotide synthesis. Cell barcodes should be 12 nts long, but incomplete extension can result in cell
# barcodes that are 11 nts long, causing the apparent terminal nucleotide of the corresponding UMI to be 'T' due to
# premature synthesis of oligo(dT). The program identifies these cell barcodes by quantifying the fractional use of "T"
# at the terminal nucleotide of the UMI.  It the collapses the four resulting incorrect 12 nt cell barcodes to a single
# corrected 11 nt cell barcode. The program also corrects the UMI sequence.

addresscts_INFILE = sys.argv[1]  # input file containing UMI-collapsed reads
collapse_OUTFILE = sys.argv[2]   # output file with corrected cell barcodes
trackbcs_OUTFILE = sys.argv[3]   # output file tracking altered barcode sequences
technology = sys.argv[4]	 # 1 if 10x v2 library, 0 if drop-seq library

if technology == 'DropSeqv1':
	umipos = 7
	cellbcpos = 11
	cellbclen = 12
elif technology in ['10xv2','CiteSeq5v2']:
	umipos = 9
	cellbcpos = 15	
	cellbclen = 16
elif technology in ['DropSeqv2','PearSeq']:
	umipos = 8
	cellbcpos = 11
	cellbclen = 12
elif technology in ['10xv3','CiteSeq3v3','CiteSeqTSB']:
	umipos = 11
	cellbcpos = 15
	cellbclen = 16

cellbcs = {}  # dictionary with cell barcode sequence keys and addresscts_INFILE line values
cellbcsT = {} # dictionary with cell barcode sequence keys and number of UMI terminal nts that are 'T' as values
with open(addresscts_INFILE) as f:
	for line in f:
		llist = line.split()
		cellbc = llist[0] # cell barcode
		if cellbc.find('N') == -1 and cellbc.find('GGGGG') == -1: # discard cell barcodes containing 'N' or G5
			if cellbc in cellbcs.keys():
				cellbcs[cellbc].append(line)
				if llist[1][umipos] == 'T':  # if the terminal nt of the UMI is 'T'
					cellbcsT[cellbc]+=1
			else:
				cellbcs[cellbc]=[line]
				if llist[1][umipos] == 'T':
					cellbcsT[cellbc]=1
				else:
					cellbcsT[cellbc]=0
print(len(cellbcs.keys()))
alteredbcs = [] # list of tuples of old and corrected barcodes altered by the program (old,new)
for cellbc in cellbcsT.keys():
	if cellbc in cellbcs.keys():
		N = len(cellbcs[cellbc])
		if N > 19: # only consider barcodes with at least 20 apparent molecules for accurate quantification of terminal 'T' nts
			fracT = float(cellbcsT[cellbc])/float(N)
			if fracT > 0.9:  # if over 90% of UMI terminal nts are 'T'
				newcellbc = cellbc[0:cellbcpos]  # corrected cell barcode is the first 11 nts of the old one
				oldcellbcA = newcellbc+'A'  # incorrect cell barcode will have four possible variants, one for each nt
				oldcellbcG = newcellbc+'G'
				oldcellbcT = newcellbc+'T'
				oldcellbcC = newcellbc+'C'
				newtot = []
				if oldcellbcA in cellbcs: # consider 'A' terminated incorrect cell barcode
					fracTbcA = float(cellbcsT[oldcellbcA])/float(len(cellbcs[oldcellbcA]))
					if fracTbcA > 0.9: # collapse if over 90% of UMI terminal nts are 'T'
						newtot.extend(cellbcs[oldcellbcA]) # grab all the address lines for the old barcode
						del cellbcs[oldcellbcA]  # delete entry for old barcode sequence
						alteredbcs.append((oldcellbcA,newcellbc))  # track modified barcode sequence
				if oldcellbcG in cellbcs:
					fracTbcG = float(cellbcsT[oldcellbcG])/float(len(cellbcs[oldcellbcG]))
					if fracTbcG > 0.9:
						newtot.extend(cellbcs[oldcellbcG])
						del cellbcs[oldcellbcG]
						alteredbcs.append((oldcellbcG,newcellbc))
				if oldcellbcT in cellbcs:
					fracTbcT = float(cellbcsT[oldcellbcT])/float(len(cellbcs[oldcellbcT]))
					if fracTbcT > 0.9:
						newtot.extend(cellbcs[oldcellbcT])
						del cellbcs[oldcellbcT]
						alteredbcs.append((oldcellbcT,newcellbc))
				if oldcellbcC in cellbcs:
					fracTbcC = float(cellbcsT[oldcellbcC])/float(len(cellbcs[oldcellbcC]))
					if fracTbcC > 0.9:
						newtot.extend(cellbcs[oldcellbcC])
						del cellbcs[oldcellbcC]
						alteredbcs.append((oldcellbcC,newcellbc))
				cellbcs[newcellbc] = newtot # collapse all old barcode address lines to new barcode

print(len(cellbcs.keys()))
collapse_OUTPUT = open(collapse_OUTFILE,'w')
for cellbc in cellbcs.keys():
	if len(cellbc) == cellbclen: # if barcode is uncorrected (12 or 16 nts), output address lines
		for line in cellbcs[cellbc]:
			collapse_OUTPUT.write(line)
	else:  # if barcode is corrected (11 nts), correct barcode sequence and UMI sequence in each address line and output
		for line in cellbcs[cellbc]:
			llist = line.split() # cell barcode is first 11 nts of old cell barcode, last nt of old cell barcode is first nt of UMI, first 7 nts of old UMI is last 7 nts of new UMI
			newline = llist[0][0:cellbcpos]+'\t'+llist[0][cellbcpos]+llist[1][0:umipos]+'\t'+llist[2]+'\t'+llist[3]+'\n'
			collapse_OUTPUT.write(newline)
	
collapse_OUTPUT.close()

trackbcs_OUTPUT = open(trackbcs_OUTFILE,'w')
for alteredbc in alteredbcs: # output old and new sequences of altered barcodes to a file
	bc_old = alteredbc[0]
	bc_new = alteredbc[1]
	trackbcs_OUTPUT.write('%(bc_old)s\t%(bc_new)s\n' % vars())
trackbcs_OUTPUT.close()