-
Notifications
You must be signed in to change notification settings - Fork 0
/
Script_07_annotate_X_CpGs.Rmd
148 lines (115 loc) · 4.38 KB
/
Script_07_annotate_X_CpGs.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
---
title: "annotate_450k_X_CpGs"
author: "Yunfeng"
date: "1/16/2023"
output: html_document
---
#Script information
This script was written to annotate X-chromosome CpGs of the Illumina Infinium 450K array with CGI features and XCI status.
#CGI feature annotation
CGI-centric annotation was split in 3 regions:
1) CGIs, as annotated by UCSC (hg19).
2) Shores: 2 kb regions flanking the CGIs (both up- and downstream).
3) Non-CGI: none of the above.
Schematic representation of the CGI-centric annotation:
----------------------|-------|-------|-------|----------------------
Non-CGI | Shore | CGI | Shore | Non-CGI
| 2 kb | | 2 kb |
----------------------|-------|-------|-------|----------------------
# XCI status annotation
XCI status annotation were based on reliable meta-status calls[1], which integrated three published studies and resulted in genes classified into three categories: subject to XCI, escape XCI and variably escape XCI.
[1] Balaton BP, Brown CJ: Contribution of genetic and epigenetic changes to escape from X-chromosome inactivation. Epigenetics Chromatin 2021, 14:30.
```{r}
#set library path.
.libPaths("~/researchdrive/yliu/Rlibs")
```
```{r}
#Load all necessary libraries.
library(EnsDb.Hsapiens.v75)
library(rtracklayer)
library(FDb.InfiniumMethylation.hg19)
library(AnnotationHub)
library(ph525x)
library(data.table)
```
```{r}
#Load data.
load(file ="./Output/Output_01_chrX.RData")
#Make a GRanges object containing features of all 450K CpGs.
ann450K <- features(FDb.InfiniumMethylation.hg19)
#Select only the X chromosome CpGs which were measured in the BIOS data.
mvalues_chrX<-cbind(mvalues_Xfemales,mvalues_Xmales)
dim(mvalues_chrX) # 9777 4031
ann450K_chrX <- ann450K[rownames(mvalues_chrX)]
#Remove metadata and strand information, then sort the object.
ann450K_chrX <- ann450K_chrX[,NULL]
strand(ann450K_chrX) <- "*"
ann450K_chrX <- sort(sortSeqlevels(ann450K_chrX))
#Add the chromosome names to the metadata.
ann450K_chrX$Chromosome <- factor(seqnames(ann450K_chrX))
ann450K_chrX
```
```{r}
# Open ensembl database.
edb75 <- EnsDb.Hsapiens.v75
# Only keep protein-coding genes.
GeneEns <- genes(edb75, filter = GeneBiotypeFilter("protein_coding"))
GeneEns <- keepStandardChromosomes(GeneEns, pruning.mode = "coarse")
GeneEns <- sort(sortSeqlevels(GeneEns))
values(GeneEns)[,c(2,4)] <- NULL
GeneEns
# keep protein-coding genes on X chromosome
seqlevels(GeneEns) <- gsub("X", "chrX", seqlevels(GeneEns))
GeneEns_chrX <- keepSeqlevels(GeneEns,"chrX", pruning.mode = "coarse")
```
```{r}
#Change the genome nomenclature of GeneEns_chrX from GRCh37 to hg19
genome(GeneEns_chrX)<-genome(ann450K_chrX)
ann450K_chrX
GeneEns_chrX
```
```{r}
#Annotate CGIs
#Retrieve CpG islands from the UCSC genome browser.
mySession = browserSession("UCSC")
genome(mySession) <- "hg19"
CpGislands.raw <- getTable(ucscTableQuery(mySession, track="CpG Islands",table="cpgIslandExt"))
CGI.gr <- GRanges(CpGislands.raw$chrom, IRanges(CpGislands.raw$chromStart+1, CpGislands.raw$chromEnd), name=CpGislands.raw$name)
CGI.gr <- keepStandardChromosomes(CGI.gr, pruning.mode = "coarse")
#Add CGI annotation (also including shores (within 2000 bp from CGI)).
#Shores start
shores1 <- CGI.gr
shores2 <- CGI.gr
start(shores1) <- start(CGI.gr) - 2000
end(shores1) <- start(CGI.gr)
#Shores end
end(shores2) <- end(CGI.gr) + 2000
start(shores2) <- end(CGI.gr)
shores <- c(shores1, shores2)
#Add the CGI annotations to the 450K CpGs.
#Non-CGI
ann450K_chrX$CGI_Feature <- rep("non-CGI")
#Shore
OL.sho <- as.matrix(findOverlaps(shores,ann450K_chrX))[,2]
OL.sho <- unique(OL.sho)
ann450K_chrX[OL.sho,]$CGI_Feature <- "Shore"
#CGI
OL.cgi <- as.matrix(findOverlaps(CGI.gr, ann450K_chrX))[,2]
OL.cgi <- unique(OL.cgi)
ann450K_chrX[OL.cgi,]$CGI_Feature <- "CGI"
#Turn CGI features into a factor variable.
ann450K_chrX$CGI_Feature <- factor(ann450K_chrX$CGI_Feature, levels = c("CGI", "Shore", "non-CGI"))
ann450K_chrX
```
```{r}
# XCI annotation based on meta calls
load(file="Input_07_TSS.XCI.Status.meta.calls.RData")
XCI.annotation<-Annotation.TSS.meta.calls[names(ann450K_chrX),]
ann450K_chrX$XCI.Status.meta.calls<-XCI.annotation$TSS.Status
ann450K_chrX$Distance.meta.calls<-XCI.annotation$Distance
```
```{r}
#save the files.
save(GeneEns_chrX, file = "Output/Output_07/Output_07_ensembl_genes_chrX.RData")
save(ann450K_chrX, file = "Output/Output_07/Output_07_450K_chrX_annotation.RData")
```