generated from akshala/peregrine
-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathbTADlinks.py
70 lines (58 loc) · 2.4 KB
/
bTADlinks.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
from collections import defaultdict
import argparse
import os
parser = argparse.ArgumentParser()
parser.add_argument('target_path')
def TADlinks(gene_file, enhancer_tad_file, output_file):
'''
Trim down to a file showing just the gene>enhancer>cell>assay
'''
tadgenes = defaultdict(dict)
with open(gene_file, 'r') as tss_file:
for line in tss_file:
chr, start, end, geneID, one, two, three, tad, tissue = line.strip().split('\t')
cell = tissue.split('|')[1]
tad = tad + '\t' + cell
tadgenes[tad][geneID] = 1
tadenh = defaultdict(dict)
with open(enhancer_tad_file, 'r') as enh_file:
for line in enh_file:
chr, start, end, enhID, one, two, three, tad, tissue = line.strip().split('\t')
cell = tissue.split('|')[1]
tad = tad + '\t' + cell
tadenh[enhID][tad] = 1
n = len(tadenh)
batch_size = 1000
start_idx = 0
count = 1
while start_idx < n:
end_idx = min(start_idx + batch_size, n)
count += 1
TADlinks_write_to_file(tadenh, tadgenes, output_file, start_idx, end_idx)
start_idx = end_idx
def TADlinks_write_to_file(tadenh, tadgenes, output_file, start_idx, end_idx):
with open(output_file, 'a') as out:
for enh in tadenh.keys():
for tad in list(tadenh[enh].keys())[start_idx:end_idx]:
if tad in tadgenes:
for gene in tadgenes[tad].keys():
panthID = panther_mapping.get(gene, '')
cell = tad.split('\t')[1]
if enh and panthID and cell:
out.write(f"{enh}\t{panthID}\t{cell}\t3\n")
if __name__ == "__main__":
args = parser.parse_args()
target_path = args.target_path
os.makedirs(target_path, exist_ok=True)
panther_mapping = {}
with open('pantherGeneList.txt', 'r') as pantherGene_file:
for line in pantherGene_file:
line_split = line.strip().split('\t')
panth = line_split[0]
ensg = line_split[1]
panther_mapping[ensg] = panth
chromosomes = ['chr' + str(x) for x in range(1, 23)]
chromosomes.append('chrX')
chromosomes.append('chrY')
for chr in chromosomes:
TADlinks(os.path.join(target_path, chr+'TSSbTAD'), os.path.join(target_path, chr+'enhancersbTAD'), os.path.join(target_path, chr+'linksbTAD'))