-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathGWAS_enrichment.py
119 lines (92 loc) · 4.29 KB
/
GWAS_enrichment.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
# headers
import pandas as pd
import numpy as np
from pandas import DataFrame
## Problem shooting -- p value need to match bags, generate p value file
#read in the SNP bag file
bagSNP_file = open("C:\\Users\\User\\Desktop\\eQTL_project\\bagging\\finished.Bagging.NoMissing\\bagSNP.chr1.print.txt", 'r').read()
# split the strings of the bagSNP by spaces and newlines
bagSNP = []
bagidx = []
bag_number = 1
for line in range(len(bagSNP_file.splitlines())):
temp = (bagSNP_file.splitlines()[line])
words = temp.split()
idx = 0
for idx in range(len(words)):
bagSNP.append(words[idx])
bagidx.append(bag_number)
print (bag_number)
bag_number += 1
# For chr1, bqg_number should be 93418
# Create a Pandas dataframe table
# this table have each row of bagSNPs and index as their bag numbers
bag_df = pd.DataFrame({"SNPs" : bagSNP, "bag index" : bagidx})
bag_df.sort_values(by="SNPs")
bag_df
#import GWAS file
GWAS_file = open("C:\\Users\\User\\Desktop\\eQTL_project\\python_data\\HEIGHT_hg19_Pvalue", 'r')
# read the GWAS line by line
for line in GWAS_file:
lines = GWAS_file.readlines()
GWAS_SNP = []
GWAS_Pval = []
count = 1
for line in lines:
p = line.split()
if (p[0].split(":")[0] == '1'): ## Need to change the number if we bag GWAS with different chromosome !!!
GWAS_SNP.append(p[0].split(":")[1])
GWAS_Pval.append(p[1])
count += 1
# for HEIGHT GWAS, there should be 2481248 lines, for chromosome 1, there should be 188142 lines
# already check that it doesn't contain chr 1X in the list p
# Create a Pandas dataframe table
# this table have each row of GWASSNPs and index as their p values
GWAS_df = pd.DataFrame({"SNPs" : GWAS_SNP, "P Value" : GWAS_Pval})
GWAS_df = GWAS_df.sort_values(by="SNPs")
## create a dataframe for storing the mapped results
# Create a Pandas dataframe table
# this table have each row of SNPs and index as their bag index
GWASbag_df = pd.merge(bag_df, GWAS_df, on="SNPs") # 161068 rows for chr1 and HEIGHT
#GWASbag_df = GWASbag_df[~GWASbag_df["P Value"].str.contains("NA")] #160951
#GWASbag_df = GWASbag_df[~GWASbag_df["P Value"].str.contains("NaN")]
GWASbag_df.dropna() ## Need to reset the bag index
GWASbag_df = GWASbag_df.sort_values(by = "bag index")
GWASbag_df = GWASbag_df.reset_index(drop=True) # reset the index of rows, not index of bags
# Create a dataframe table for storing (bag index - representative p value) information
GWAS_bag_amounts = len(GWASbag_df["bag index"].unique()) # 44708 bags for chr1 and HEIGHT
Mapped_GWAS_SNP_amounts = len(GWASbag_df["SNPs"])
Missed_GWAS_SNP_amounts = len(GWAS_df["SNPs"]) - Mapped_GWAS_SNP_amounts
bag_number # original eQTL bag numbers
# Solve the jump number of bag index problem
newbagidx = []
idx = 1
newbagidx.append(idx)
for j in range (1, len(GWASbag_df)): # len(GWASbag_df) = 161068 for HEIGHT chr1 ## 1~161067
if (GWASbag_df["bag index"] [j] != GWASbag_df["bag index"] [j - 1]):
idx += 1
newbagidx.append(idx)
else:
newbagidx.append(idx)
GWASbag_df = pd.DataFrame({"SNPs" : GWASbag_df["SNPs"], "P Value" : GWASbag_df["P Value"], "bag index" : newbagidx})
# for each bag index in merged GWASbag_df , find the smallest p value, assign to the new GWAS_bag_result_df dataframe
GWASbagidx = []
smallestPval = []
for idx in range(GWAS_bag_amounts):
df = (GWASbag_df[GWASbag_df["bag index"] == idx + 1])
smallestPval.append(df["P Value"].min())
GWASbagidx.append(idx + 1)
GWAS_bag_result_df = pd.DataFrame({"bag index": GWASbagidx, "P Value": smallestPval})
GWAS_bag_result_df
# space delimited output
GWAS_bag_result_df['P Value'].to_csv('C:\\Users\\User\\Desktop\\eQTL_project\\python_data\\%s_bagPval_Chr1.txt'%"HEIGHT_hg19"
, header=None, index=None, mode='w')
outFile = open('C:\\Users\\User\\Desktop\\eQTL_project\\python_data\\%s_bagSNP_Chr1.txt'% "HEIGHT_hg19", 'w')
for bagidx in range(GWAS_bag_amounts):
test = GWASbag_df[GWASbag_df["bag index"] == (bagidx + 1)].sort_values(by = "P Value")
a = test.groupby("bag index")["SNPs"].apply(tuple)
for result in a:
result = ' '.join(result)
outFile.write(result + '\n')
GWAS_file.close()
outFile.close()