add lab notes

ocbe-uio · Jan 4, 2024 · ee9081d · ee9081d
1 parent b02671b
commit ee9081d
Show file tree

Hide file tree

Showing 19 changed files with 2,228 additions and 7 deletions.
diff --git a/docs/lab/lab_day2_pca.html b/docs/lab/lab_day2_pca.html
diff --git a/docs/lab/lab_day2_pca_files/figure-html/pca-nci-keeppc-1.png b/docs/lab/lab_day2_pca_files/figure-html/pca-nci-keeppc-1.png
diff --git a/docs/lab/lab_day2_pca_files/figure-html/pca-nci-plot2-1.png b/docs/lab/lab_day2_pca_files/figure-html/pca-nci-plot2-1.png
diff --git a/docs/lab/lab_day2_pca_files/figure-html/pca-nci-plotpc-1.png b/docs/lab/lab_day2_pca_files/figure-html/pca-nci-plotpc-1.png
diff --git a/docs/lab/lab_day3_pca.html b/docs/lab/lab_day3_pca.html
diff --git a/docs/lab/lab_day3_pca_files/figure-html/pca-nci-keeppc-1.png b/docs/lab/lab_day3_pca_files/figure-html/pca-nci-keeppc-1.png
diff --git a/docs/lab/lab_day3_pca_files/figure-html/pca-nci-plot2-1.png b/docs/lab/lab_day3_pca_files/figure-html/pca-nci-plot2-1.png
diff --git a/docs/lab/lab_day3_pca_files/figure-html/pca-nci-plotpc-1.png b/docs/lab/lab_day3_pca_files/figure-html/pca-nci-plotpc-1.png
diff --git a/docs/lab/lab_day4_clustering.html b/docs/lab/lab_day4_clustering.html
diff --git a/docs/lab/lab_day4_clustering_files/figure-html/pca-nci-keeppc-1.png b/docs/lab/lab_day4_clustering_files/figure-html/pca-nci-keeppc-1.png
diff --git a/docs/lab/lab_day4_clustering_files/figure-html/pca-nci-plot2-1.png b/docs/lab/lab_day4_clustering_files/figure-html/pca-nci-plot2-1.png
diff --git a/docs/lab/lab_day4_clustering_files/figure-html/pca-nci-plotpc-1.png b/docs/lab/lab_day4_clustering_files/figure-html/pca-nci-plotpc-1.png
diff --git a/docs/lab/overview.html b/docs/lab/overview.html
@@ -154,6 +154,7 @@ <h2 id="toc-title">On this page</h2>
 
   <ul>
   <li><a href="#datasets" id="toc-datasets" class="nav-link active" data-scroll-target="#datasets">Datasets</a></li>
+  <li><a href="#lab-notes-and-r-scripts" id="toc-lab-notes-and-r-scripts" class="nav-link" data-scroll-target="#lab-notes-and-r-scripts">Lab notes and R scripts</a></li>
   </ul>
 </nav>
     </div>
@@ -188,7 +189,7 @@ <h2 class="anchored" data-anchor-id="datasets">Datasets</h2>
 <p>day 3 (PCA)</p>
 <ul>
 <li>pima.txt</li>
-<li>NCI60</li>
+<li>NCI60 (done)</li>
 <li>USarrest</li>
 <li>Food</li>
 <li>CH10Ex11</li>
@@ -200,6 +201,40 @@ <h2 class="anchored" data-anchor-id="datasets">Datasets</h2>
 <li>NCI60</li>
 <li>CH10Ex11</li>
 </ul>
+</section>
+<section id="lab-notes-and-r-scripts" class="level2">
+<h2 class="anchored" data-anchor-id="lab-notes-and-r-scripts">Lab notes and R scripts</h2>
+<table class="table">
+<thead>
+<tr class="header">
+<th style="text-align: center;">Day</th>
+<th style="text-align: center;">Lab notes</th>
+<th style="text-align: center;">R script</th>
+</tr>
+</thead>
+<tbody>
+<tr class="odd">
+<td style="text-align: center;">Day 1</td>
+<td style="text-align: center;"><a href="">Day 1: Introduction to R</a></td>
+<td style="text-align: center;"></td>
+</tr>
+<tr class="even">
+<td style="text-align: center;">Day 2</td>
+<td style="text-align: center;"><a href="">Day 2: Multiple testing</a></td>
+<td style="text-align: center;"></td>
+</tr>
+<tr class="odd">
+<td style="text-align: center;">Day 3</td>
+<td style="text-align: center;"><a href="../lab/lab_day3_pca.html">Day 3: Principal Component Analysis</a></td>
+<td style="text-align: center;"></td>
+</tr>
+<tr class="even">
+<td style="text-align: center;">Day 4</td>
+<td style="text-align: center;"><a href="../lab/lab_day4_clustering.html">Day 4: Clustering</a></td>
+<td style="text-align: center;"></td>
+</tr>
+</tbody>
+</table>
 
 
 <!-- -->
@@ -510,7 +545,7 @@ <h2 class="anchored" data-anchor-id="datasets">Datasets</h2>
 <span id="cb1-17"><a href="#cb1-17" aria-hidden="true" tabindex="-1"></a>day 3 (PCA)</span>
 <span id="cb1-18"><a href="#cb1-18" aria-hidden="true" tabindex="-1"></a></span>
 <span id="cb1-19"><a href="#cb1-19" aria-hidden="true" tabindex="-1"></a><span class="ss">- </span>pima.txt</span>
-<span id="cb1-20"><a href="#cb1-20" aria-hidden="true" tabindex="-1"></a><span class="ss">- </span>NCI60</span>
+<span id="cb1-20"><a href="#cb1-20" aria-hidden="true" tabindex="-1"></a><span class="ss">- </span>NCI60 (done)</span>
 <span id="cb1-21"><a href="#cb1-21" aria-hidden="true" tabindex="-1"></a><span class="ss">- </span>USarrest</span>
 <span id="cb1-22"><a href="#cb1-22" aria-hidden="true" tabindex="-1"></a><span class="ss">- </span>Food</span>
 <span id="cb1-23"><a href="#cb1-23" aria-hidden="true" tabindex="-1"></a><span class="ss">- </span>CH10Ex11</span>
@@ -523,9 +558,16 @@ <h2 class="anchored" data-anchor-id="datasets">Datasets</h2>
 <span id="cb1-30"><a href="#cb1-30" aria-hidden="true" tabindex="-1"></a><span class="ss">- </span>CH10Ex11</span>
 <span id="cb1-31"><a href="#cb1-31" aria-hidden="true" tabindex="-1"></a></span>
 <span id="cb1-32"><a href="#cb1-32" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-33"><a href="#cb1-33" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-33"><a href="#cb1-33" aria-hidden="true" tabindex="-1"></a><span class="fu">## Lab notes and R scripts</span></span>
 <span id="cb1-34"><a href="#cb1-34" aria-hidden="true" tabindex="-1"></a></span>
-<span id="cb1-35"><a href="#cb1-35" aria-hidden="true" tabindex="-1"></a></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
+<span id="cb1-35"><a href="#cb1-35" aria-hidden="true" tabindex="-1"></a>|     Day      |                 Lab notes                |  R script  |</span>
+<span id="cb1-36"><a href="#cb1-36" aria-hidden="true" tabindex="-1"></a>|:-------------:|:------------------------------------:|:-----------------:|</span>
+<span id="cb1-37"><a href="#cb1-37" aria-hidden="true" tabindex="-1"></a>| Day 1   | <span class="co">[</span><span class="ot">Day 1: Introduction to R</span><span class="co">]()</span>  |   |</span>
+<span id="cb1-38"><a href="#cb1-38" aria-hidden="true" tabindex="-1"></a>| Day 2  | <span class="co">[</span><span class="ot">Day 2: Multiple testing</span><span class="co">]()</span>  |   |</span>
+<span id="cb1-39"><a href="#cb1-39" aria-hidden="true" tabindex="-1"></a>| Day 3  | <span class="co">[</span><span class="ot">Day 3: Principal Component Analysis</span><span class="co">](lab_day3_pca.qmd)</span> | |</span>
+<span id="cb1-40"><a href="#cb1-40" aria-hidden="true" tabindex="-1"></a>| Day 4  | <span class="co">[</span><span class="ot">Day 4: Clustering</span><span class="co">](lab_day4_clustering.qmd)</span> | |</span>
+<span id="cb1-41"><a href="#cb1-41" aria-hidden="true" tabindex="-1"></a></span>
+<span id="cb1-42"><a href="#cb1-42" aria-hidden="true" tabindex="-1"></a></span></code><button title="Copy to Clipboard" class="code-copy-button"><i class="bi"></i></button></pre></div>
 </div></div></div></div></div>
 </div> <!-- /content -->
 

diff --git a/docs/search.json b/docs/search.json
@@ -123,7 +123,7 @@
     "href": "lab/overview.html",
     "title": "R Lab: Overview",
     "section": "",
-    "text": "day 2 (MT)\n\nbrainshake (t-test)\nNCI60 (t-test, multiple testing)\nCh10Ex11\n\nday 3 (PCA)\n\npima.txt\nNCI60\nUSarrest\nFood\nCH10Ex11\n\nday 4 (clustering)\n\niris\nbirds\nNCI60\nCH10Ex11"
+    "text": "day 2 (MT)\n\nbrainshake (t-test)\nNCI60 (t-test, multiple testing)\nCh10Ex11\n\nday 3 (PCA)\n\npima.txt\nNCI60 (done)\nUSarrest\nFood\nCH10Ex11\n\nday 4 (clustering)\n\niris\nbirds\nNCI60\nCH10Ex11"
   },
   {
     "objectID": "lab/overview.html#a-cancer-modeling-example",
@@ -165,6 +165,55 @@
     "href": "lab/overview.html#datasets",
     "title": "R Lab: Overview",
     "section": "",
-    "text": "day 2 (MT)\n\nbrainshake (t-test)\nNCI60 (t-test, multiple testing)\nCh10Ex11\n\nday 3 (PCA)\n\npima.txt\nNCI60\nUSarrest\nFood\nCH10Ex11\n\nday 4 (clustering)\n\niris\nbirds\nNCI60\nCH10Ex11"
+    "text": "day 2 (MT)\n\nbrainshake (t-test)\nNCI60 (t-test, multiple testing)\nCh10Ex11\n\nday 3 (PCA)\n\npima.txt\nNCI60 (done)\nUSarrest\nFood\nCH10Ex11\n\nday 4 (clustering)\n\niris\nbirds\nNCI60\nCH10Ex11"
+  },
+  {
+    "objectID": "lab/lab_day2_pca.html",
+    "href": "lab/lab_day2_pca.html",
+    "title": "R Lab (day 2): PCA",
+    "section": "",
+    "text": "Datasets\nR script"
+  },
+  {
+    "objectID": "lab/lab_day2_pca.html#nci60",
+    "href": "lab/lab_day2_pca.html#nci60",
+    "title": "R Lab (day 2): PCA",
+    "section": "NCI60",
+    "text": "NCI60\n\nlibrary(ISLR)\nnci.labs &lt;- NCI60$labs # Sample labels (tissue type)\nnci.data &lt;- NCI60$data # Gene expression data set\n\n\n# what if I would like to compute the mean of each gene within each tissue type?\ntissue.means &lt;- apply(nci.data, 2, function(x){tapply(x, nci.labs, mean)})\ndim(tissue.means)\n\n[1]   14 6830\n\ntable(nci.labs)\n\nnci.labs\n     BREAST         CNS       COLON K562A-repro K562B-repro    LEUKEMIA \n          7           5           7           1           1           6 \nMCF7A-repro MCF7D-repro    MELANOMA       NSCLC     OVARIAN    PROSTATE \n          1           1           8           9           6           2 \n      RENAL     UNKNOWN \n          9           1 \n\n\nCompute the PC\n\n# PCA analysis after scaling the variables to standard deviation one:\npr.out &lt;- prcomp(nci.data, scale=TRUE)\n\nPrint the summary output\n\nsummary(pr.out)\n\n\npr.var &lt;- pr.out$sdev^2\npve &lt;- pr.var/sum(pr.var)\npve &lt;- 100*pve\n\npar(mfrow=c(1,2))\nplot(pve,  type=\"o\", ylab=\"PVE\", xlab=\"Principal Component\", col=\"blue\")\nplot(cumsum(pve), type=\"o\", ylab=\"Cumulative PVE\", xlab=\"Principal Component\", col=\"brown3\")\n\n\n\n\n\nHow many PCs to keep?\n\nmysel80 &lt;- which(cumsum(pve) &gt; 80)[1] # explains 80% of the variability\nmysel70 &lt;- which(cumsum(pve) &gt; 70)[1] # explains 70% of the variability\n\npar(mfrow=c(1,2)) # plot contains two smaller plots next to each other\nplot(pve,  type=\"o\", ylab=\"PVE\", xlab=\"Principal Component\", col=\"blue\")\nabline(v = mysel80)\nabline(v = mysel70, col=3)\nplot(cumsum(pve), type=\"o\", ylab=\"Cumulative PVE\", xlab=\"Principal Component\", col=\"brown3\")\nabline(v = mysel80)\nabline(h = 80)\nabline(v = mysel70, col=3)\nabline(h = 70, col=3)\n\n\n\n\nIf we decide to only keep the principal components that explains 70% of the variance, we end up with 24 components, which we can further analyse to better understand the relationships between the 
variables. For simplicity we only look at the first few components.\nWe plot the first few principal component score vectors, to visualize the results. The observations (cell lines) corresponding to a given cancer type will be plotted in the same colour.\n\nCols=function(vec){\n  cols=rainbow(length(unique(vec)))\n  return(cols[as.numeric(as.factor(vec))])\n}\n\n# Plot the first vs second and first vs third principal component score vectors,\n# with colors associated to labels (using the Cols() helper function)\npar(mfrow=c(1,2))\nplot(pr.out$x[,1:2], col=Cols(nci.labs), pch=19,xlab=\"PC 1\",ylab=\" PC 2\")\nplot(pr.out$x[,c(1,3)], col=Cols(nci.labs), pch=19,xlab=\"PC 1\",ylab=\" PC 3\")\nlegend('topleft', col=rainbow(length(unique(nci.labs))), legend=unique(nci.labs), bty='n', lwd=2, cex=.6)"
+  },
+  {
+    "objectID": "lab/overview.html#lab-notes-and-r-scripts",
+    "href": "lab/overview.html#lab-notes-and-r-scripts",
+    "title": "R Lab: Overview",
+    "section": "Lab notes and R scripts",
+    "text": "Lab notes and R scripts\n\n\n\nDay\nLab notes\nR script\n\n\n\n\nDay 1\nDay 1: Introduction to R\n\n\n\nDay 2\nDay 2: Multiple testing\n\n\n\nDay 3\nDay 3: Principal Component Analysis\n\n\n\nDay 4\nDay 4: Clustering"
+  },
+  {
+    "objectID": "lab/lab_day3_pca.html",
+    "href": "lab/lab_day3_pca.html",
+    "title": "R Lab (day 3): Principal Component Analysis",
+    "section": "",
+    "text": "Datasets\nR script"
+  },
+  {
+    "objectID": "lab/lab_day3_pca.html#nci60",
+    "href": "lab/lab_day3_pca.html#nci60",
+    "title": "R Lab (day 3): Principal Component Analysis",
+    "section": "NCI60",
+    "text": "NCI60\n\nlibrary(ISLR)\nnci.labs &lt;- NCI60$labs # Sample labels (tissue type)\nnci.data &lt;- NCI60$data # Gene expression data set\n\n\n# what if I would like to compute the mean of each gene within each tissue type?\ntissue.means &lt;- apply(nci.data, 2, function(x){tapply(x, nci.labs, mean)})\ndim(tissue.means)\n\n[1]   14 6830\n\ntable(nci.labs)\n\nnci.labs\n     BREAST         CNS       COLON K562A-repro K562B-repro    LEUKEMIA \n          7           5           7           1           1           6 \nMCF7A-repro MCF7D-repro    MELANOMA       NSCLC     OVARIAN    PROSTATE \n          1           1           8           9           6           2 \n      RENAL     UNKNOWN \n          9           1 \n\n\nCompute the PC\n\n# PCA analysis after scaling the variables to standard deviation one:\npr.out &lt;- prcomp(nci.data, scale=TRUE)\n\nPrint the summary output\n\nsummary(pr.out)\n\n\npr.var &lt;- pr.out$sdev^2\npve &lt;- pr.var/sum(pr.var)\npve &lt;- 100*pve\n\npar(mfrow=c(1,2))\nplot(pve,  type=\"o\", ylab=\"PVE\", xlab=\"Principal Component\", col=\"blue\")\nplot(cumsum(pve), type=\"o\", ylab=\"Cumulative PVE\", xlab=\"Principal Component\", col=\"brown3\")\n\n\n\n\n\nHow many PCs to keep?\n\nmysel80 &lt;- which(cumsum(pve) &gt; 80)[1] # explains 80% of the variability\nmysel70 &lt;- which(cumsum(pve) &gt; 70)[1] # explains 70% of the variability\n\npar(mfrow=c(1,2)) # plot contains two smaller plots next to each other\nplot(pve,  type=\"o\", ylab=\"PVE\", xlab=\"Principal Component\", col=\"blue\")\nabline(v = mysel80)\nabline(v = mysel70, col=3)\nplot(cumsum(pve), type=\"o\", ylab=\"Cumulative PVE\", xlab=\"Principal Component\", col=\"brown3\")\nabline(v = mysel80)\nabline(h = 80)\nabline(v = mysel70, col=3)\nabline(h = 70, col=3)\n\n\n\n\nIf we decide to only keep the principal components that explains 70% of the variance, we end up with 24 components, which we can further analyse to better understand the relationships between the 
variables. For simplicity we only look at the first few components.\nWe plot the first few principal component score vectors, to visualize the results. The observations (cell lines) corresponding to a given cancer type will be plotted in the same colour.\n\nCols=function(vec){\n  cols=rainbow(length(unique(vec)))\n  return(cols[as.numeric(as.factor(vec))])\n}\n\n# Plot the first vs second and first vs third principal component score vectors,\n# with colors associated to labels (using the Cols() helper function)\npar(mfrow=c(1,2))\nplot(pr.out$x[,1:2], col=Cols(nci.labs), pch=19,xlab=\"PC 1\",ylab=\" PC 2\")\nplot(pr.out$x[,c(1,3)], col=Cols(nci.labs), pch=19,xlab=\"PC 1\",ylab=\" PC 3\")\nlegend('topleft', col=rainbow(length(unique(nci.labs))), legend=unique(nci.labs), bty='n', lwd=2, cex=.6)"
+  },
+  {
+    "objectID": "lab/lab_day4_clustering.html",
+    "href": "lab/lab_day4_clustering.html",
+    "title": "R Lab (day 4): Clustering",
+    "section": "",
+    "text": "Datasets\nR script"
+  },
+  {
+    "objectID": "lab/lab_day4_clustering.html#nci60",
+    "href": "lab/lab_day4_clustering.html#nci60",
+    "title": "R Lab (day 4): Clustering",
+    "section": "NCI60",
+    "text": "NCI60\n\nlibrary(ISLR)\nnci.labs &lt;- NCI60$labs # Sample labels (tissue type)\nnci.data &lt;- NCI60$data # Gene expression data set"
   }
 ]
diff --git a/lab/.DS_Store b/lab/.DS_Store
diff --git a/lab/code/lab_pca.R b/lab/code/lab_pca.R
@@ -0,0 +1,113 @@
+# code for PCA
+
+
+# NCI60  ----
+## Now we move to a typical large-scale biological data set
+## (this is partly based on Lab 10.3 in James et al., 2013).
+
+## We have already seen the NCI60 cancer cell line microarray data set,
+## consisting of 6830 gene expression measurements on 64 cancer cell lines.
+## It is available in the R package ISLR, which is the compendium R package
+## to the book by James et al. (2013).
+
+library(ISLR)
+nci.labs <- NCI60$labs # Sample labels (tissue type)
+nci.data <- NCI60$data # Gene expression data set
+# what if I would like to compute the mean of each gene within each tissue type?
+tissue.means <- apply(nci.data, 2, function(x){tapply(x, nci.labs, mean)})
+dim(tissue.means)
+table(nci.labs)
+
+
+## NB: important to always check if the variables are on the columns before doing a PCA
+## (they cannot be on the rows!) If they are *not* on the columns we can
+## transpose the dataset with the function t(), i.e. you would run the code:
+## nci.data <- t(nci.data)
+
+
+## PCA on the NCI60 Data
+
+## First, let us perform the PCA analysis after scaling the variables (genes)
+## to have standard deviation one.
+
+# PCA analysis after scaling the variables to standard deviation one.
+# The prcomp() argument is spelled `scale.` (with a trailing dot); `scale=`
+# only works through partial argument matching, so the full name is used here.
+pr.out <- prcomp(nci.data, scale. = TRUE)
+
+## Lastly, we calculate the proportion of variance explained (PVE), and visualise
+## it via a scree plot. In addition, we also plot the cumulative proportion of variance
+## explained, cumsum(pve), which will reach 100% when all principal components are added up.
+
+# Proportion of variance explained (PVE), as reported by summary():
+summary(pr.out)
+
+
+
+# Compute the proportion of variance explained (PVE) by hand, in percent,
+# then draw a scree plot next to the cumulative PVE curve, cumsum(pve).
+pr.var <- (pr.out$sdev)^2
+pve <- pr.var / sum(pr.var)
+pve <- pve * 100
+
+par(mfrow = c(1, 2))
+plot(
+  pve,
+  type = "o", col = "blue",
+  xlab = "Principal Component", ylab = "PVE"
+)
+plot(
+  cumsum(pve),
+  type = "o", col = "brown3",
+  xlab = "Principal Component", ylab = "Cumulative PVE"
+)
+
+# How many principal components are needed for a good dimension reduction
+# that still retains most of the variability in the data set?
+
+# Index of the first component at which the cumulative PVE exceeds the cut-off.
+mysel80 <- which(80 < cumsum(pve))[1] # explains 80% of the variability
+mysel70 <- which(70 < cumsum(pve))[1] # explains 70% of the variability
+
+par(mfrow = c(1, 2)) # two panels side by side
+# Left panel: scree plot with both cut-offs marked (the 70% one in colour 3).
+plot(
+  pve,
+  type = "o", col = "blue",
+  xlab = "Principal Component", ylab = "PVE"
+)
+abline(v = mysel80)
+abline(v = mysel70, col = 3)
+# Right panel: cumulative PVE with the cut-offs and their percentage levels.
+plot(
+  cumsum(pve),
+  type = "o", col = "brown3",
+  xlab = "Principal Component", ylab = "Cumulative PVE"
+)
+abline(v = mysel80)
+abline(h = 80)
+abline(v = mysel70, col = 3)
+abline(h = 70, col = 3)
+
+## If we decide to only keep the principal components that explain 70% of the variance,
+## we end up with 24 components, which we can further analyse to better understand the
+## relationships between the variables. For simplicity we only look at the first few components.
+## We plot the first few principal component score vectors, to visualize the results.
+## The observations (cell lines) corresponding to a given cancer type
+## will be plotted in the same colour.
+
+# Helper for the next plot: returns one colour per element of `vec`, so that
+# elements carrying the same label (e.g. in nci.labs) get the same colour.
+Cols <- function(vec) {
+  palette_per_label <- rainbow(length(unique(vec)))
+  label_index <- as.numeric(as.factor(vec))
+  palette_per_label[label_index]
+}
+
+# Plot the first vs second and first vs third principal component score vectors,
+# with colors associated to labels (using the Cols() helper function)
+par(mfrow = c(1, 2))
+plot(pr.out$x[, 1:2], col = Cols(nci.labs), pch = 19, xlab = "PC 1", ylab = " PC 2")
+plot(pr.out$x[, c(1, 3)], col = Cols(nci.labs), pch = 19, xlab = "PC 1", ylab = " PC 3")
+# Cols() picks colours by sorted factor level, so the legend labels must be
+# listed in that same order. unique() returns labels in order of first
+# appearance, which would pair the labels with the wrong colours.
+legend(
+  "topleft",
+  col = rainbow(length(unique(nci.labs))),
+  legend = levels(as.factor(nci.labs)),
+  bty = "n", lwd = 2, cex = .6
+)
+
+
+
+
+
+# Ch10Ex11 ----
+
+
+## Consider again the gene expression data set "Ch10Ex11.csv"
+## (which can also be found on the book website, www.StatLearning.com)
+## that consists of 40 tissue samples with measurements on 1,000 genes.
+## The first 20 samples are from healthy patients,
+## while the second 20 are from a diseased group.
+
+## 1. Load in the data using read.csv(). You will need to select header=FALSE.
+##    Alternatively: load in the data using "Import dataset" in the upper right window,
+##    and click "no" on the "Heading" option.
+## 2. Perform a PCA of these data and visualize the results. Note: remember to check
+##    if the variables (genes) are on the columns in the dataset before running the PCA.
+##    If they are not: use t() to transpose the dataset.
+
+
+