-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathGetNullSample.R
115 lines (83 loc) · 4.36 KB
/
GetNullSample.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
source('GetNullSample_Utillity_Functions.R')
#
# Iteration through all datasets
#
for( dataset_index in 1:500 ){
#
# Load the original pop data (simulated under alternative)
#
load( paste0('Dataset', dataset_index, '/pop_data.RData') )
#
# Get the null population
#
pop_data <- Get_the_null_population(pop_data = pop_data, n_pop_case_ind = 155)
#
# Get the null sample
#
sample_data <- Sample_from_population(pop_data = pop_data, nCase = 50, nCon = 50)
#
# Generate the permutation index for null sample
#
nperm = 1000
ccLabel = sample_data$Genos$ccStatus
permute_indx = matrix(NA, nrow = nperm, ncol = length(ccLabel) )
for(i in 1:nperm){
permute_indx[i,] = sample( x = 1:length(ccLabel), replace = FALSE)
}
#
# Make null directory and save the null population and sample
#
system( paste0('mkdir Dataset' , dataset_index, '/', 'null') )
save( pop_data, file = paste0('Dataset', dataset_index, '/null/pop_data.RData') )
save( sample_data, file = paste0('Dataset', dataset_index, '/null/sample_data.RData') )
save( permute_indx, file = paste0('Dataset', dataset_index, '/null/permute_indx.RData') )
# Optional step
#
# We only run this stage as it saves us time later when we pick a dataset as example dataset and we want
# to present it. So this stage is totally optional.
#
# We can save two tables here:
#
# Table1. Case/Control and cSNV information in the sample
# Table2. Case/Control and cSNV information in the population
#
#
###########################################################################################################################
# Table1. Case/Control and cSNV information
Variants = pop_data$Variants
cSNV = pop_data$cSNV
D_case_haplotypes = Find_Sequences_of_Individuals(sample_data$CaseIND, PM = pop_data$Population.Mapping)
D_control_haplotypes = Find_Sequences_of_Individuals(sample_data$ControlIND, PM = pop_data$Population.Mapping)
derived.allele.frequency = rowSums( Variants ) / ncol(Variants)
derived.allele.percentage = derived.allele.frequency * 100
cSNV_HapMat = Variants[cSNV, c(D_case_haplotypes , D_control_haplotypes) ]
dim(cSNV_HapMat)
t = data.frame(
cSNV = cSNV,
daf = derived.allele.percentage[cSNV],
Carrier_Case_Haplo = rowSums(cSNV_HapMat[,1:100]),
Carrier_Control_Haplo = rowSums(cSNV_HapMat[,101:200])
)
rownames(t) = NULL
t = t[order(t$daf , decreasing = TRUE),]
write.table(t, file = paste0('Dataset', dataset_index, '/null/Table1_cSNV_null_sample_info.txt'), sep = ",", quote = FALSE, row.names = F)
rm(t)
##############################################################################################################################
##############################################################################################################################
# Table2. Case/Control and cSNV information in the population
pop_D_case_haplotypes = Find_Sequences_of_Individuals(ind_list = pop_data$DISCRETE[[1]], PM = pop_data$Population.Mapping)
pop_D_control_haplotypes = Find_Sequences_of_Individuals(ind_list = pop_data$DISCRETE[[2]], PM = pop_data$Population.Mapping)
pop_cSNV_HapMat = Variants[cSNV, c(pop_D_case_haplotypes , pop_D_control_haplotypes) ]
t = data.frame(
cSNV = cSNV,
daf = derived.allele.percentage[cSNV],
Carrier_Case_Haplo = rowSums(pop_cSNV_HapMat[,1:length(pop_D_case_haplotypes) ]),
Carrier_Control_Haplo = rowSums(pop_cSNV_HapMat[,(length(pop_D_case_haplotypes)+1):ncol(pop_cSNV_HapMat) ])
)
rownames(t) = NULL
t = t[order(t$daf , decreasing = TRUE),]
write.table(t, file = paste0('Dataset', dataset_index, '/null/Table2_cSNV_null_population_info.txt'), sep = ",", quote = FALSE, row.names = F)
rm(t)
##############################################################################################################################
print( paste0("Currently at dataset ", dataset_index , " out of 500") )
}