diff --git a/01-helpers.R b/01-helpers.R new file mode 100644 index 00000000..ec062993 --- /dev/null +++ b/01-helpers.R @@ -0,0 +1,7 @@ +r_version_string <- function() { + paste0(R.version$major, ".", R.version$minor) +} + +r_version_string.patch_x <- function() { + gsub(".$", "x", r_version_string()) +} diff --git a/01-setup.md b/01-setup.md new file mode 100644 index 00000000..194a5427 --- /dev/null +++ b/01-setup.md @@ -0,0 +1,100 @@ +--- +source: Rmd +title: Introduction and setup +teaching: XX +exercises: XX +--- + + + +::::::::::::::::::::::::::::::::::::::: objectives + +- Ensure that participants are using the correct version of R to reproduce exactly the contents of this lesson. +- Download the example files for this lesson. + +:::::::::::::::::::::::::::::::::::::::::::::::::: + +:::::::::::::::::::::::::::::::::::::::: questions + +- Am I using the correct version of R for this lesson? +- Why does my version of R matter? +- How do I obtain the files that are used in this lesson? + +:::::::::::::::::::::::::::::::::::::::::::::::::: + +## Version of R + +This lesson was developed and tested with R version 4.4.1 (2024-06-14). + +Take a moment to launch RStudio and verify that you are using R version 4.4.x, with `x` being any patch version, e.g. 4.4.1. + + +``` r +R.version.string +``` + +``` output +[1] "R version 4.4.1 (2024-06-14)" +``` + +This is important because Bioconductor uses the version of R running in the current session to determine the version of Bioconductor packages that can be installed in the R library associated with the current R session. +Using a different version of R while following this lesson may lead to unexpected results. + +## Download files + +Several episodes in this lesson rely on example files that participants need to download. + +Run the code below to programmatically create a folder called `data` in the current working directory, and download the lesson files in that folder.
+ + +``` r +dir.create("data", showWarnings = FALSE) +download.file( + url = "https://raw.githubusercontent.com/Bioconductor/bioconductor-teaching/master/data/TrimmomaticAdapters/TruSeq3-PE-2.fa", + destfile = "data/TruSeq3-PE-2.fa" +) +download.file( + url = "https://raw.githubusercontent.com/Bioconductor/bioconductor-teaching/master/data/ActbGtf/actb.gtf", + destfile = "data/actb.gtf" +) +download.file( + url = "https://raw.githubusercontent.com/Bioconductor/bioconductor-teaching/master/data/ActbOrf/actb_orfs.fasta", + destfile = "data/actb_orfs.fasta" +) +download.file( + url = "https://raw.githubusercontent.com/Bioconductor/bioconductor-teaching/devel/data/SummarizedExperiment/counts.csv", + destfile = "data/counts.csv" +) +download.file( + url = "https://raw.githubusercontent.com/Bioconductor/bioconductor-teaching/devel/data/SummarizedExperiment/gene_metadata.csv", + destfile = "data/gene_metadata.csv" +) +download.file( + url = "https://raw.githubusercontent.com/Bioconductor/bioconductor-teaching/devel/data/SummarizedExperiment/sample_metadata.csv", + destfile = "data/sample_metadata.csv" +) +``` + +::::::::::::::::::::::::::::::::::::::::: callout + +## Note + +Ideally, participants might want to create a new [RStudio project][external-rstudio-project] and download the lesson files in a sub-directory of that project. + +Using an RStudio project sets the working directory to the root directory of that project. +As a consequence, code is executed relative to that root directory, often avoiding the need for using absolute file paths to import/export data from/to files. + + +:::::::::::::::::::::::::::::::::::::::::::::::::: + +[external-rstudio-project]: https://support.rstudio.com/hc/en-us/articles/200526207-Using-Projects + + +:::::::::::::::::::::::::::::::::::::::: keypoints + +- Participants will only be able to install the version of Bioconductor packages described in this lesson and reproduce their exact outputs if they use the correct version of R. 
+- The files used in this lesson should be downloaded in a local path that is easily accessible from an R session. + +:::::::::::::::::::::::::::::::::::::::::::::::::: + + diff --git a/02-introduction-to-bioconductor.md b/02-introduction-to-bioconductor.md new file mode 100644 index 00000000..c53e3a78 --- /dev/null +++ b/02-introduction-to-bioconductor.md @@ -0,0 +1,357 @@ +--- +source: Rmd +title: Introduction to Bioconductor +teaching: XX +exercises: XX +--- + + + +::::::::::::::::::::::::::::::::::::::: objectives + +- Describe the Bioconductor project globally. +- Gain a global view of the Bioconductor project in the R ecosystem. +- Identify sources of information to watch for future updates about the Bioconductor project. + +:::::::::::::::::::::::::::::::::::::::::::::::::: + +:::::::::::::::::::::::::::::::::::::::: questions + +- What does the Bioconductor project comprise? +- How does the Bioconductor project relate to the CRAN repository? +- How can I learn to use Bioconductor packages effectively? +- How do I join and communicate with the Bioconductor community? + +:::::::::::::::::::::::::::::::::::::::::::::::::: + + + +## What is Bioconductor? + +### A brief history of Bioconductor + +The Bioconductor project was started in the Fall of 2001, as an initiative for the collaborative creation of extensible software for computational biology and bioinformatics (Gentleman, Carey, Bates, Bolstad, Dettling, Dudoit, Ellis, Gautier, Ge, Gentry, Hornik, Hothorn, Huber, Iacus, Irizarry, Leisch, Li, Maechler, Rossini, Sawitzki, Smith, Smyth, Tierney, Yang, and Zhang, 2004). +From the very start, the stated mission of the project was to develop tools for the statistical analysis and comprehension of large datasets and technological artifacts in rigorously and robustly designed experiments. +Beyond statistical analyses, the interpretation of statistical results is supported by packages providing biological context, visualization, and reproducibility. 
+ +Over the years, software packages contributed to the Bioconductor project have reflected the evolution and emergence of several high-throughput technologies, from microarrays to single-cell genomics, through many variations of sequencing experiments (e.g., RNA-seq, ChIP-seq, DNA-seq), analyses (e.g., sequence variation, copy number variation, single nucleotide polymorphisms), and data modalities (e.g., flow cytometry, proteomics, microscopy and image analysis). + +Crucially, the project has not only released software packages implementing novel statistical tests and methodologies, but also produced a diverse range of package types granting access to databases of molecular annotations and experimental datasets. + +The Bioconductor project culminates in an annual conference in North America in the summer, while regional conferences offer great opportunities for networking in Europe, Asia, and North America. +The project is committed to promoting a diverse and inclusive community, including a [Code of Conduct][bioc-code-of-conduct] enforced by a Code of Conduct committee. + +![](fig/bioc-timeline.svg){alt='Timeline of major Bioconductor milestones alongside technological advancements.'} + +**Timeline of major Bioconductor milestones alongside technological advancements.** +Above the timeline, the figure marks the first occurrence of major events. +Within the timeline, the names of packages providing core infrastructure indicate their release dates. +Below the timeline, major technological advancements contextualise the evolution of the Bioconductor project over time. + +### A scientific project + +The original publication describes the aims and methods of the project at its inception (Gentleman, Carey, Bates et al., 2004).
+ +Huber, Carey, Gentleman, Anders, Carlson, Carvalho, Bravo, Davis, Gatto, Girke, Gottardo, Hahne, Hansen, Irizarry, Lawrence, Love, MacDonald, Obenchain, Oles, Pages, Reyes, Shannon, Smyth, Tenenbaum, Waldron, and Morgan (2015) illustrate the progression of the project, including descriptions of core infrastructure and case studies, from the perspective of both users and developers. + +Amezquita, Lun, Becht, Carey, Carpp, Geistlinger, Marini, Rue-Albrecht, Risso, Soneson, Waldron, Pages, Smith, Huber, Morgan, Gottardo, and Hicks (2020) reviewed further developments of the project in the wake of single-cell genomics technologies. + +Many more publications and book chapters cite the Bioconductor project, with recent examples listed on the [Bioconductor website][bioc-publications]. + +### A package repository + +#### Overview and relationship to CRAN + +Undoubtedly, software packages are the best-known aspect of the Bioconductor project. +Since its inception in 2001, the repository has grown over time to host thousands of packages. + +The Bioconductor project has extended the preexisting CRAN repository -- much larger and general-purpose in scope -- to comprise R packages primarily catering for bioinformatics and computational biology analyses. + +::::::::::::::::::::::::::::::::::::::::: callout + +### Going further + +The [Discussion][discuss-cran] article of this lesson includes a section discussing the relationship of Bioconductor and CRAN in further detail. + +:::::::::::::::::::::::::::::::::::::::::::::::::: + +#### The Bioconductor release cycle + +The Bioconductor project also extended the packaging infrastructure of the CRAN repository to better support the deployment and management of packages at the user level (Gentleman, Carey, Bates et al., 2004).
+In particular, the Bioconductor project features a 6-month release cycle (typically around April and October), which sees a snapshot of the current version of all packages in the Bioconductor repository earmarked for a specific version of R. +R itself is released on an annual basis (typically around April), meaning that for each release of R, two compatible releases of Bioconductor packages are available. + +As such, Bioconductor package developers are required to always use the version of R that will be associated with the next release of the Bioconductor project. +This means using the development version of R between October and April, and the release version of R between April and October. + +Crucially, the strict Bioconductor release cycle prevents users from installing temporally distant versions of packages that were very likely never tested together. +This practice reflects the development cycle of packages of both CRAN and Bioconductor, where contemporaneous packages are regularly tested by automated systems to ensure that the latest software updates in package dependencies do not break downstream packages, or to prompt the maintainers of those packages to update their own software as a consequence. + +Prior to each Bioconductor release, packages that do not pass the required suites of automated tests are deprecated and subsequently removed from the repository. +This ensures that each Bioconductor release provides a suite of packages that are mutually compatible, traceable, and guaranteed to function for the associated version of R. + +![](fig/bioc-release-cycle.svg){alt='Timeline of release dates for selected Bioconductor and R versions.'} + +**Timeline of release dates for selected Bioconductor and R versions.** +The upper section of the timeline indicates versions and approximate release dates for the R project. +The lower section of the timeline indicates versions and release dates for the Bioconductor project. +Source: [Bioconductor][bioc-release-dates].
+ +#### Package types + +Packages are broadly divided into four major categories: + +- software +- annotation data +- experiment data +- workflows + +[Software packages][glossary-software-package] themselves can be subdivided into packages that provide infrastructure (i.e., classes) to store and access data, +and packages that provide methodological tools to process data stored in those data structures. +This separation of structure and analysis is at the core of the Bioconductor project, +encouraging developers of new methodological software packages to thoughtfully re-use existing data containers where possible, +and reducing the cognitive burden imposed on users who can more easily experiment with alternative workflows without the need to learn and convert between different data structures. + +[Annotation data packages][glossary-annotation-package] provide self-contained databases of diverse genomic annotations (e.g., gene identifiers, biological pathways). +Different collections of annotation packages can be found in the Bioconductor project. +They are identifiable by their respective naming pattern, and the information that they contain. +For instance, the so-called `OrgDb` packages (e.g., the *[org.Hs.eg.db](https://bioconductor.org/packages/3.19/org.Hs.eg.db)* package) provide information mapping different types of gene identifiers and pathway databases; +the so-called `EnsDb` packages (e.g., *[EnsDb.Hsapiens.v86](https://bioconductor.org/packages/3.19/EnsDb.Hsapiens.v86)*) encapsulate individual versions of the Ensembl annotations in Bioconductor packages; +and the so-called `TxDb` packages (e.g., *[TxDb.Hsapiens.UCSC.hg38.knownGene](https://bioconductor.org/packages/3.19/TxDb.Hsapiens.UCSC.hg38.knownGene)*) encapsulate individual versions of UCSC gene annotation tables.
+ +[Experiment data packages][glossary-experiment-package] provide self-contained datasets that are often used by software package developers to demonstrate the use of their package on well-known standard datasets in their [package vignettes][glossary-vignette]. + +Finally, [workflow packages][glossary-workflow-package] exclusively provide collections of vignettes that demonstrate the combined usage of several other packages as a coherent workflow, but do not provide any new source code or functionality themselves. + +::::::::::::::::::::::::::::::::::::::: challenge + +### Challenge: The Bioconductor website + +The Bioconductor website is accessible at [https://bioconductor.org/](https://bioconductor.org/). + +Browse the website to find information answering the following questions: + +1. How many packages does the current release of the Bioconductor project include? +2. How many packages of each type does this number include? + +::::::::::::::: solution + +### Solution + +The following solution includes numbers that were valid at the time of writing (Bioconductor release 3.13); +numbers will inevitably be different for future releases of the Bioconductor project. + +1. On the page [https://bioconductor.org/](https://bioconductor.org/), in the section "Install", we can read: + +> Discover 2042 software packages available in Bioconductor release 3.13. + +2. On the page [https://bioconductor.org/](https://bioconductor.org/), in the section "News", click on the link that reads "Bioconductor Bioc `X.Y` Released" (`X.Y` being the version of the current Bioconductor release when you go through this exercise yourself). + On the linked page, we can read: + +> We are pleased to announce Bioconductor 3.13, consisting of 2042 software packages, 406 experiment data packages, 965 annotation packages, and 29 workflows. 
+> +> There are 133 new software packages, 22 new data experiment packages, 7 new annotation packages, 1 new workflow, no new books, and many updates and improvements to existing packages; Bioconductor 3.13 is compatible with R 4.1.0, and is supported on Linux, 32- and 64-bit Windows, and macOS 10.14.6 Mojave or higher. This release will include an updated Bioconductor Docker containers. + +::::::::::::::::::::::::: + +:::::::::::::::::::::::::::::::::::::::::::::::::: + +#### Package classification using biocViews + +The Bioconductor project uses [biocViews][glossary-biocviews], a set of terms from a controlled vocabulary, to classify Bioconductor packages and facilitate their discovery by thematic search on the [Bioconductor website][biocviews-site]. + +Each Bioconductor package is tagged with a small set of terms chosen from the available controlled vocabulary, to describe the type and functionality of the package. +Terms are initially selected by the package authors, and subsequently refined during package review or updates to the controlled vocabulary. + +::::::::::::::::::::::::::::::::::::::: challenge + +### Challenge + +Visit the listing of all packages on the Bioconductor [biocViews][biocviews-site] web page. +Use the "Autocomplete biocViews search" box in the upper left to filter packages by category and explore the graph of software packages by expanding and contracting individual terms. + +1. What biocView terms can be used to identify packages that have been tagged for RNA sequencing analysis? ChIP-seq? Epigenetics? Variant annotation? Proteomics? Single-cell genomics? +2. In the `RNASeq` category, two very popular packages are *[DESeq2](https://bioconductor.org/packages/3.19/DESeq2)* and *[edgeR](https://bioconductor.org/packages/3.19/edgeR)*. + Which one is more popular in terms of download statistics (i.e., lower rank)? + +::::::::::::::: solution + +### Solution + +1. 
`RNASeq`, `ChIPSeq`, `Epigenetics`, `VariantAnnotation`, `Proteomics`, `SingleCell`. +2. For Bioconductor release `3.14`, *[DESeq2](https://bioconductor.org/packages/3.19/DESeq2)* and *[edgeR](https://bioconductor.org/packages/3.19/edgeR)* are listed at ranks 23 and 28 respectively. + In other words, the two packages are among the most frequently downloaded packages in the Bioconductor project, in this instance with a small advantage in favour of *[edgeR](https://bioconductor.org/packages/3.19/edgeR)*. + + + +::::::::::::::::::::::::: + +:::::::::::::::::::::::::::::::::::::::::::::::::: + +::::::::::::::::::::::::::::::::::::::::: callout + +### Going further + +The Bioconductor package *[biocViews](https://bioconductor.org/packages/3.19/biocViews)* is used to support and manage the infrastructure of the controlled vocabulary. +It can also be used to programmatically inspect and subset the list of terms available using their relationship as a graph. + +Furthermore, the *[BiocPkgTools](https://bioconductor.org/packages/3.19/BiocPkgTools)* package can be used to browse packages under different [biocViews][glossary-biocviews] (Su, Carey, Shepherd, Ritchie, Morgan, and Davis, 2019). + + +:::::::::::::::::::::::::::::::::::::::::::::::::: + +#### Package interoperability + +At the core of the Bioconductor philosophy is the notion of interoperability. +Namely, the capacity of packages to operate on the same data structures. +Importantly, interoperability benefits both users and developers. + +Users can more easily write arbitrarily complex workflows that combine multiple packages. +With packages operating on the same data structure, users can maximize their attention to the practical steps of their workflow, and minimize time spent in often complex and error-prone conversions between different data structures specific to each package.
+Comparative benchmarks are also easier to implement and can be evaluated more fairly +when competing software packages offering similar functionality operate on inputs and outputs stored in the same data structures. + +Similarly, developers of new packages can focus on the implementation of novel functionality, borrowing existing data structures that offer robust and trusted infrastructure for storage, verification, and indexing of information. + +Ultimately, the figure below illustrates how many different Bioconductor packages - as well as base R packages - can be combined to carry out a diverse range of analyses, from importing sequencing data into an R session, to the annotation, integration and visualization of data and results. + +![](fig/bioc-sequencing-ecosystem.svg){alt='Sequencing Ecosystem.'} + +**Sequencing Ecosystem** +Major data processing steps (blue) and relevant software packages (pink) are listed in the context of archetypal workflows for various types of genomics analyses. +The sequential relation of workflow steps and software packages illustrates the importance of interoperability between software packages in order to assemble complete end-to-end workflows. + +### Conferences, courses and workshops + +The Bioconductor community regularly organizes a number of events throughout the year and across the world. For example: + +- The annual BioC summer conference in North America +- Regional conferences in winter (e.g., BioC Europe, BioC Asia) +- Summer schools (e.g., CSAMA) +- Online meetings open to all community members (e.g., Bioconductor Developers Forum) + +Course materials are regularly uploaded to the [Bioconductor website][bioc-course-materials] following each of those events. +In particular, [online books][bioc-books] are being developed and maintained by community members.
+ +The Bioconductor [YouTube channel][bioc-youtube] is used to publish video recordings of conference presentations including talks and workshops, as well as editions of the regular Bioconductor developers forum (link needed). + +::::::::::::::::::::::::::::::::::::::::: callout + +### Contribute! + +It would be great to illustrate a typical cycle of conferences over a year, e.g. + +- BioC conference in North America around late July +- EuroBioC conference in Europe around December +- BioCAsia conference in Asia around November + + +:::::::::::::::::::::::::::::::::::::::::::::::::: + +### Online communication channels + +#### Support site + +The Bioconductor [support site][bioc-support-site] provides a platform where users and developers can communicate freely (following the Bioconductor [Code of Conduct][bioc-code-of-conduct]) to discuss issues on a range of subjects, ranging from packages to conceptual questions about best practices. + +#### Slack workspace + +The Bioconductor [Slack workspace][bioc-slack] is an open space that all community members are welcome to join (for free) and use for rapid interactions. +Currently, the "Pro" pricing plan kindly supported by core funding provides: + +- Unlimited message archive +- Unlimited apps +- Group video calls with screen sharing +- Work securely with other organizations using Slack Connect + +A wide range of channels have been created to discuss a range of subjects, and community members can freely join the discussion on those channels or create new ones to discuss new subjects. + +Important announcements are posted on the `#general` channel. + +::::::::::::::::::::::::::::::::::::::::: callout + +### Note + +Users are encouraged to use the Bioconductor [support site][bioc-support-site] to raise issues that are relevant to the wider community. +The Slack workspace is often most useful for live discussions, and widely subscribed channels (e.g. `#general`) should be used with moderation.
+ +:::::::::::::::::::::::::::::::::::::::::::::::::: + +#### Developer Mailing List + +The [bioc-devel@r-project.org](mailto:bioc-devel@r-project.org) mailing list is used for communication between package developers, and announcements from the Bioconductor core team. + +### A scientific and technical community + +- [Scientific Advisory Board (SAB)][bioc-scientific-advisory-board]. Meets annually. External and internal leaders in the field who act as project advisors. No term limits. +- [Technical Advisory Board (TAB)][bioc-technical-advisory-board]. Meets monthly to consider technical aspects of core infrastructure and scientific direction of the project. 15 members, 3-year term. Annual open-to-all elections to rotate members. Current officers are Vince Carey (chair), Levi Waldron (vice chair), Charlotte Soneson (secretary). +- [Community Advisory Board (CAB)][bioc-community-advisory-board]. Meets monthly to consider community outreach, events, education and training. 15 members, 3-year term. Annual open-to-all elections to rotate members. Current officers are Aedin Culhane (chair), Matt Ritchie (co-chair), Lori Kern (secretary). +- [Code of Conduct committee][bioc-code-of-conduct] + +::::::::::::::::::::::::::::::::::::::::: callout + +### Note + +At least one member sits on both the TAB and the CAB to act as the liaison and ensure communication between the two boards. + +:::::::::::::::::::::::::::::::::::::::::::::::::: + +## References + +[1] R. A. Amezquita, A. T. L. Lun, E. Becht, et al. "Orchestrating +single-cell analysis with Bioconductor". In: _Nat Methods_ 17.2 (2020), +pp. 137-145. ISSN: 1548-7105 (Electronic) 1548-7091 (Linking). DOI: +10.1038/s41592-019-0654-x. +. + +[2] R. C. Gentleman, V. J. Carey, D. M. Bates, et al. "Bioconductor: +open software development for computational biology and +bioinformatics". In: _Genome Biol_ 5.10 (2004), p. R80. ISSN: 1474-760X +(Electronic) 1474-7596 (Linking). DOI: 10.1186/gb-2004-5-10-r80. +. + +[3] W. Huber, V. J. Carey, R. Gentleman, et al.
"Orchestrating +high-throughput genomic analysis with Bioconductor". In: _Nat Methods_ +12.2 (2015), pp. 115-21. ISSN: 1548-7105 (Electronic) 1548-7091 +(Linking). DOI: 10.1038/nmeth.3252. +. + +[4] S. Su, V. Carey, L. Shepherd, et al. "BiocPkgTools: Toolkit for +mining the Bioconductor package ecosystem [version 1; peer review: 2 +approved, 1 approved with reservations] ". In: _F1000Research_ 8.752 +(2019). DOI: 10.12688/f1000research.19410.1. + +[bioc-code-of-conduct]: https://www.bioconductor.org/about/code-of-conduct/ +[bioc-publications]: https://www.bioconductor.org/help/publications/ +[discuss-cran]: discuss.html#bioconductor-and-CRAN +[bioc-release-dates]: https://bioconductor.org/about/release-announcements/ +[glossary-software-package]: reference.html#software-package +[glossary-annotation-package]: reference.html#annotationdata-package +[glossary-experiment-package]: reference.html#experimentdata-package +[glossary-vignette]: reference.html#vignette +[glossary-workflow-package]: reference.html#workflow-package +[glossary-biocviews]: reference.html#biocviews +[biocviews-site]: https://www.bioconductor.org/packages/release/BiocViews.html +[bioc-course-materials]: https://bioconductor.org/help/course-materials/ +[bioc-books]: https://www.bioconductor.org/books/release/ +[bioc-youtube]: https://www.youtube.com/user/bioconductor +[bioc-support-site]: https://support.bioconductor.org/ +[bioc-slack]: https://bioc-community.herokuapp.com/ +[bioc-scientific-advisory-board]: https://bioconductor.org/about/scientific-advisory-board/ +[bioc-technical-advisory-board]: https://bioconductor.org/about/technical-advisory-board/ +[bioc-community-advisory-board]: https://bioconductor.org/about/community-advisory-board/ + + +:::::::::::::::::::::::::::::::::::::::: keypoints + +- R packages are but one aspect of the Bioconductor project. +- The Bioconductor project extends and complements the CRAN repository. 
+- Different types of packages provide not only software, but also annotations and experimental data, and demonstrate the use of multiple packages in integrated workflows. +- Interoperability between Bioconductor packages facilitates the writing of integrated workflows and minimizes the cognitive burden on users. +- Educational materials from courses and conferences are archived and accessible on the Bioconductor website and YouTube channel. +- Different channels of communication enable community members to converse and help each other, both as users and package developers. +- The Bioconductor project is governed by scientific, technical, and community advisory boards, as well as a Code of Conduct committee. + +:::::::::::::::::::::::::::::::::::::::::::::::::: + + diff --git a/03-installing-bioconductor.md b/03-installing-bioconductor.md new file mode 100644 index 00000000..626e2f5a --- /dev/null +++ b/03-installing-bioconductor.md @@ -0,0 +1,394 @@ +--- +source: Rmd +title: Installing Bioconductor packages +teaching: XX +exercises: XX +--- + + + +::::::::::::::::::::::::::::::::::::::: objectives + +- Install BiocManager. +- Install Bioconductor packages. + +:::::::::::::::::::::::::::::::::::::::::::::::::: + +:::::::::::::::::::::::::::::::::::::::: questions + +- How do I install Bioconductor packages? +- How do I check if newer versions of my installed packages are available? +- How do I update Bioconductor packages? +- How do I find out the name of packages available from the Bioconductor repositories? + +:::::::::::::::::::::::::::::::::::::::::::::::::: + +## BiocManager + +The *[BiocManager](https://bioconductor.org/packages/3.19/BiocManager)* package is the entry point into the Bioconductor package repository. +Technically, this is the only Bioconductor package distributed on the CRAN repository. + +It provides functions to safely install Bioconductor packages and check for available updates.
+ +Once the package is installed, the function `BiocManager::install()` can be used to install packages from the Bioconductor repository. +The function is also capable of installing packages from other repositories (e.g., CRAN), if those packages are not found in the Bioconductor repository first. + +![](fig/bioc-install.svg){alt='The package BiocManager is available from the CRAN repository and used to install packages from the Bioconductor repository.'} + +**The package BiocManager is available from the CRAN repository and used to install packages from the Bioconductor repository.** +The function `install.packages()` from the base R package `utils` can be used to install the *[BiocManager](https://bioconductor.org/packages/3.19/BiocManager)* package distributed on the CRAN repository. +In turn, the function `BiocManager::install()` can be used to install packages available on the Bioconductor repository. +Notably, the `BiocManager::install()` function will fall back on the CRAN repository if a package cannot be found in the Bioconductor repository. + +Install the package using the code below. + + +``` r +install.packages("BiocManager") +``` + +::::::::::::::::::::::::::::::::::::::::: callout + +### Going further + +A number of packages that are not part of the base R installation also provide functions to install packages from various repositories. +For instance: + +- `devtools::install()` +- `remotes::install_bioc()` +- `remotes::install_bitbucket()` +- `remotes::install_cran()` +- `remotes::install_dev()` +- `remotes::install_github()` +- `remotes::install_gitlab()` +- `remotes::install_git()` +- `remotes::install_local()` +- `remotes::install_svn()` +- `remotes::install_url()` +- `renv::install()` + +Those functions are beyond the scope of this lesson, and should be used with caution and adequate knowledge of their specific behaviors. 
+The general recommendation is to use `BiocManager::install()` over any other installation mechanism because it ensures proper versioning of Bioconductor packages. + + +:::::::::::::::::::::::::::::::::::::::::::::::::: + +## Bioconductor releases and current version + +Once the *[BiocManager](https://bioconductor.org/packages/3.19/BiocManager)* package is installed, the `BiocManager::version()` function displays the version (i.e., release) of the Bioconductor project that is currently active in the R session. + + +``` r +BiocManager::version() +``` + +``` output +[1] '3.19' +``` + +Using the correct version of R and Bioconductor packages is a key aspect of reproducibility. +The *[BiocManager](https://bioconductor.org/packages/3.19/BiocManager)* package uses the version of R running in the current session to determine the version of Bioconductor packages that can be installed in the current R library. + +The Bioconductor project produces two releases each year, one around April and another one around October. +The April release of Bioconductor coincides with the annual release of R. +The October release of Bioconductor continues to use the same version of R for that annual cycle (i.e., until the next release, in April). + +![](fig/bioc-release-cycle.svg){alt='Timeline of release dates for selected Bioconductor and R versions.'} + +**Timeline of release dates for selected Bioconductor and R versions.** +The upper section of the timeline indicates versions and approximate release dates for the R project. +The lower section of the timeline indicates versions and release dates for the Bioconductor project. +Source: [Bioconductor][bioc-release-dates]. + +During each 6-month cycle of package development, Bioconductor tests packages for compatibility with the version of R that will be available for the next release cycle.
+Then, each time a new Bioconductor release is produced, the version of every package in the Bioconductor repository is incremented, including the package *[BiocVersion](https://bioconductor.org/packages/3.19/BiocVersion)* which determines the version of the Bioconductor project. + + +``` r +packageVersion("BiocVersion") +``` + +``` output +[1] '3.19.1' +``` + +This is the case for every package, even those which have not been updated at all since the previous release. +That new version of each package is earmarked for the corresponding version of R; +in other words, that version of the package can only be installed and accessed in an R session that uses the correct version of R. +This version increment is essential to associate each version of a Bioconductor package with a unique release of the Bioconductor project. + +Following the April release, this means that users must install the new version of R to access the newly released versions of Bioconductor packages. + +Instead, in October, users can continue to use the same version of R to access the newly released version of Bioconductor packages. +However, to update an R library from the April release to the October release of Bioconductor, users need to call the function `BiocManager::install()` specifying the correct version of Bioconductor as the `version` option, for instance: + + +``` r +BiocManager::install(version = "3.14") +``` + +This needs to be done only once, as the *[BiocVersion](https://bioconductor.org/packages/3.19/BiocVersion)* package will be updated to the corresponding version, indicating the version of Bioconductor in use in this R library. + +::::::::::::::::::::::::::::::::::::::::: callout + +### Going further + +The [Discussion][discuss-release-cycle] article of this lesson includes a section discussing the release cycle of the Bioconductor project.
+ +:::::::::::::::::::::::::::::::::::::::::::::::::: + +## Check for updates + +The `BiocManager::valid()` function inspects the version of packages currently installed in the user library, and checks whether a new version is available for any of them on the Bioconductor repository. + +If everything is up-to-date, the function will simply print `TRUE`. + + +``` r +BiocManager::valid() +``` + +``` warning +Warning: 1 packages out-of-date; 0 packages too new +``` + +``` output + +* sessionInfo() + +R version 4.4.1 (2024-06-14) +Platform: x86_64-pc-linux-gnu +Running under: Ubuntu 22.04.4 LTS + +Matrix products: default +BLAS: /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.10.0 +LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0 + +locale: + [1] LC_CTYPE=C.UTF-8 LC_NUMERIC=C LC_TIME=C.UTF-8 + [4] LC_COLLATE=C.UTF-8 LC_MONETARY=C.UTF-8 LC_MESSAGES=C.UTF-8 + [7] LC_PAPER=C.UTF-8 LC_NAME=C LC_ADDRESS=C +[10] LC_TELEPHONE=C LC_MEASUREMENT=C.UTF-8 LC_IDENTIFICATION=C + +time zone: UTC +tzcode source: system (glibc) + +attached base packages: +[1] stats graphics grDevices utils datasets methods base + +other attached packages: +[1] BiocStyle_2.32.1 + +loaded via a namespace (and not attached): + [1] BiocManager_1.30.23 compiler_4.4.1 fastmap_1.2.0 + [4] cli_3.6.3 htmltools_0.5.8.1 tools_4.4.1 + [7] yaml_2.3.10 rmarkdown_2.27 knitr_1.48 +[10] digest_0.6.36 xfun_0.46 rlang_1.1.4 +[13] renv_1.0.7 evaluate_0.24.0 + +Bioconductor version '3.19' + + * 1 packages out-of-date + * 0 packages too new + +create a valid installation with + + BiocManager::install("withr", update = TRUE, ask = FALSE, force = TRUE) + +more details: BiocManager::valid()$too_new, BiocManager::valid()$out_of_date +``` + +Conveniently, if any package can be updated, the function generates and displays the command needed to update those packages. +Users simply need to copy-paste and run that command in their R console. 
+ +::::::::::::::::::::::::::::::::::::::::: callout + +### Example of out-of-date package library + +In the example below, the `BiocManager::valid()` function did not return `TRUE`. +Instead, it includes information about the active user session, and displays the exact call to `BiocManager::install()` that the user should run to replace all the outdated packages detected in the user library with the latest version available in CRAN or Bioconductor. + +``` +> BiocManager::valid() + +* sessionInfo() + +R version 4.1.0 (2021-05-18) +Platform: x86_64-apple-darwin17.0 (64-bit) +Running under: macOS Big Sur 11.6 + +Matrix products: default +LAPACK: /Library/Frameworks/R.framework/Versions/4.1/Resources/lib/libRlapack.dylib + +locale: +[1] en_GB.UTF-8/en_GB.UTF-8/en_GB.UTF-8/C/en_GB.UTF-8/en_GB.UTF-8 + +attached base packages: +[1] stats graphics grDevices datasets utils methods base + +loaded via a namespace (and not attached): +[1] BiocManager_1.30.16 compiler_4.1.0 tools_4.1.0 renv_0.14.0 + +Bioconductor version '3.13' + + * 18 packages out-of-date + * 0 packages too new + +create a valid installation with + + BiocManager::install(c( + "cpp11", "data.table", "digest", "hms", "knitr", "lifecycle", "matrixStats", "mime", "pillar", "RCurl", + "readr", "remotes", "S4Vectors", "shiny", "shinyWidgets", "tidyr", "tinytex", "XML" + ), update = TRUE, ask = FALSE) + +more details: BiocManager::valid()$too_new, BiocManager::valid()$out_of_date + +Warning message: +18 packages out-of-date; 0 packages too new +``` + +Specifically, in this example, the message tells the user to run the following command to bring their installation up to date: + +``` + BiocManager::install(c( + "cpp11", "data.table", "digest", "hms", "knitr", "lifecycle", "matrixStats", "mime", "pillar", "RCurl", + "readr", "remotes", "S4Vectors", "shiny", "shinyWidgets", "tidyr", "tinytex", "XML" + ), update = TRUE, ask = FALSE) +``` + +:::::::::::::::::::::::::::::::::::::::::::::::::: + +## Exploring the package 
repository + +The Bioconductor [biocViews][glossary-biocviews], demonstrated in the earlier episode [Introduction to Bioconductor][crossref-intro-biocviews], are a great way to discover new packages by thematically browsing the hierarchical classification of Bioconductor packages. + +In addition, the `BiocManager::available()` function returns the complete list of package names that can be installed from the Bioconductor and CRAN repositories. +For instance, the total number of packages that could be installed using *[BiocManager](https://bioconductor.org/packages/3.19/BiocManager)* is: + + +``` r +length(BiocManager::available()) +``` + +``` output +[1] 24742 +``` + +Specifically, the union of current Bioconductor repositories and other repositories on the search path can be displayed as follows. + + +``` r +BiocManager::repositories() +``` + +``` output + BioCsoft + "https://bioconductor.org/packages/3.19/bioc" + BioCann +"https://bioconductor.org/packages/3.19/data/annotation" + BioCexp +"https://bioconductor.org/packages/3.19/data/experiment" + BioCworkflows + "https://bioconductor.org/packages/3.19/workflows" + BioCbooks + "https://bioconductor.org/packages/3.19/books" + carpentries + "https://carpentries.r-universe.dev" + carpentries_archive + "https://carpentries.github.io/drat" + CRAN + "https://cran.rstudio.com" +``` + +Each repository URL can be accessed in a web browser, displaying the full list of packages available from that repository. +For instance, navigate to [https://bioconductor.org/packages/3.14/bioc](https://bioconductor.org/packages/3.14/bioc). + +::::::::::::::::::::::::::::::::::::::::: callout + +### Going further + +The function `BiocManager::repositories()` can be combined with the base function `available.packages()` to query packages available specifically from any package repository, e.g. the Bioconductor [software package][glossary-software-package] repository.
+ +``` +> db = available.packages(repos = BiocManager::repositories()["BioCsoft"]) +> dim(db) +[1] 1948 17 +> head(rownames(db)) +[1] "a4" "a4Base" "a4Classif" "a4Core" "a4Preproc" +[6] "a4Reporting" +``` + +:::::::::::::::::::::::::::::::::::::::::::::::::: + +Conveniently, `BiocManager::available()` includes a `pattern=` argument, particularly useful to navigate annotation resources (the original use case motivating it). +For instance, a range of [Annotation data packages][glossary-annotation-package] available for the mouse model organism can be listed as follows. + + +``` r +BiocManager::available(pattern = "*Mmusculus") +``` + +``` output + [1] "BSgenome.Mmusculus.UCSC.mm10" "BSgenome.Mmusculus.UCSC.mm10.masked" + [3] "BSgenome.Mmusculus.UCSC.mm39" "BSgenome.Mmusculus.UCSC.mm8" + [5] "BSgenome.Mmusculus.UCSC.mm8.masked" "BSgenome.Mmusculus.UCSC.mm9" + [7] "BSgenome.Mmusculus.UCSC.mm9.masked" "EnsDb.Mmusculus.v75" + [9] "EnsDb.Mmusculus.v79" "PWMEnrich.Mmusculus.background" +[11] "TxDb.Mmusculus.UCSC.mm10.ensGene" "TxDb.Mmusculus.UCSC.mm10.knownGene" +[13] "TxDb.Mmusculus.UCSC.mm39.knownGene" "TxDb.Mmusculus.UCSC.mm39.refGene" +[15] "TxDb.Mmusculus.UCSC.mm9.knownGene" +``` + +## Installing packages + +The `BiocManager::install()` function is used to install or update packages. + +The function takes a character vector of package names, and attempts to install them from the Bioconductor repository. + + +``` r +BiocManager::install(c("S4Vectors", "BiocGenerics")) +``` + +However, if any package cannot be found in the Bioconductor repository, the function also searches for those packages in repositories listed in the global option `repos`. + +::::::::::::::::::::::::::::::::::::::::: callout + +### Contribute ! + +Add an example of a non-Bioconductor package that can be installed using BiocManager. +Preferably, a package that will be used later in this lesson.
+ + +:::::::::::::::::::::::::::::::::::::::::::::::::: + +## Uninstalling packages + +Bioconductor packages can be removed from the R library like any other R package, using the base R function `remove.packages()`. +In essence, this function simply removes installed packages and updates index information as necessary. +As a result, it will not be possible to attach the package to a session or browse the documentation of that package anymore. + + +``` r +remove.packages("S4Vectors") +``` + +[bioc-release-dates]: https://bioconductor.org/about/release-announcements/ +[discuss-release-cycle]: discuss.html#the-bioconductor-release-cycle +[glossary-biocviews]: reference.html#biocviews +[crossref-intro-biocviews]: https://carpentries-incubator.github.io/bioc-project/02-introduction-to-bioconductor/index.html#package-classification-using-biocviews +[glossary-software-package]: reference.html#software-package +[glossary-annotation-package]: reference.html#annotationdata-package + + +:::::::::::::::::::::::::::::::::::::::: keypoints + +- The BiocManager package is available from the CRAN repository. +- `BiocManager::install()` is used to install and update Bioconductor packages (but also from CRAN and GitHub). +- `BiocManager::valid()` is used to check for available package updates. +- `BiocManager::version()` reports the version of Bioconductor currently installed. +- `BiocManager::install()` can also be used to update an entire R library to a specific version of Bioconductor. + +:::::::::::::::::::::::::::::::::::::::::::::::::: + + diff --git a/04-getting-help.md b/04-getting-help.md new file mode 100644 index 00000000..8a3c2a72 --- /dev/null +++ b/04-getting-help.md @@ -0,0 +1,252 @@ +--- +source: Rmd +title: Getting help +teaching: XX +exercises: XX +--- + + + +::::::::::::::::::::::::::::::::::::::: objectives + +- Identify online resources for help. +- Access package documentation. 
+ +:::::::::::::::::::::::::::::::::::::::::::::::::: + +:::::::::::::::::::::::::::::::::::::::: questions + +- Where can I find help online? +- Where can I ask questions to package developers and other users? +- Where can I find documentation for a specific package? +- Where can I learn best practices to combine multiple packages into a coherent workflow? + +:::::::::::::::::::::::::::::::::::::::::::::::::: + +## Getting help with Bioconductor packages + +Help about Bioconductor packages and best practices is available in several places. +Often, the best source of help depends on the situation at hand: + +- Are you trying to identify the best package for a particular task? +- Are you trying to use a package for the very first time? +- Are you unsure about best practices to use and combine multiple packages and functions in a sensible workflow for a particular type of analysis? +- Is a function throwing an error when you apply it to your data? +- Do you have questions about the theory or methodology implemented in a particular package or function? + +In the next sections, we describe different sources of help available to Bioconductor users, and situations where each of them is most useful. + +## The Bioconductor website + +The main [Bioconductor website][bioc-website] provides a host of resources, all freely available without even the need to install R or any Bioconductor package. + +In particular, the [biocViews][biocviews-site] page is a great way to thematically explore the collection of packages and identify Bioconductor packages providing a certain functionality. + +Furthermore, the website also collects materials from [courses and conferences][bioc-courses-conferences-materials], including presentations, video recordings, and teaching materials. + +By nature, individual presentations and training materials are often tied to a specific version of Bioconductor packages.
+As such, they provide a snapshot of best practices at a particular point in time, and may become outdated over time after successive Bioconductor releases. +Thus, it is important to check that the version of packages demonstrated in teaching materials matches that in the user's R library. +Alternatively, when referring to materials that employ package versions different from those in the user's library, it is important to carefully interpret any discrepancy between the expected and actual results. + +## Package landing pages + +Each package accepted in the Bioconductor project is granted a landing page on the main Bioconductor website, e.g. *[S4Vectors](https://bioconductor.org/packages/3.19/S4Vectors)*. + +Package landing pages contain useful information that can be consulted without the need to install the package itself. +This is particularly useful while browsing the Bioconductor repository in search of packages suitable for a specific task. + +On the landing page, prospective users can find a short description of the package functionality, and links to [package vignettes][glossary-vignette]. +Package vignettes available on the Bioconductor website are written by developers to demonstrate how the functions available in the package are meant to be used and combined into a complete workflow. +Often, vignettes use standard data sets preprocessed and freely available from public repositories, including [ExperimentData][glossary-experiment-package] packages or the Bioconductor *[ExperimentHub](https://bioconductor.org/packages/3.19/ExperimentHub)*. + +In the "Details" section of the landing page, many packages provide a field labelled "BugReports". +That field provides a URL that users can visit to report bugs to the package developer(s). + +::::::::::::::::::::::::::::::::::::::::: callout + +### Note + +It can be difficult to distinguish actual software bugs from unwitting mistakes made by users not fully familiar with the package yet.
+Later in this episode, we provide advice for reporting bugs and including sufficient information to receive the fastest and most helpful responses. + +When in doubt, the [Bioconductor support site][bioc-support-site] can also be a great place to discuss individual experiences and share knowledge about packages and best practices. + + +:::::::::::::::::::::::::::::::::::::::::::::::::: + +Additionally, the landing page provides many other pieces of information, from daily build reports indicating whether the package passed all tests on a range of operating systems, to software dependencies indicating the number of other Bioconductor packages that must be installed before the package itself can be installed and used on the user's system. + +::::::::::::::::::::::::::::::::::::::::: callout + +### Going further + +Each package has a landing page for each release of Bioconductor since the package was added to the repository, e.g.: + +- [https://www.bioconductor.org/packages/3.14/bioc/html/BiocVersion.html](https://www.bioconductor.org/packages/3.14/bioc/html/BiocVersion.html) +- [https://www.bioconductor.org/packages/3.13/bioc/html/BiocVersion.html](https://www.bioconductor.org/packages/3.13/bioc/html/BiocVersion.html) +- [https://www.bioconductor.org/packages/3.12/bioc/html/BiocVersion.html](https://www.bioconductor.org/packages/3.12/bioc/html/BiocVersion.html) + +In the URL of a package landing page, we can replace the version number by the word "release" or "devel" to access the landing page of the latest stable release or development version, respectively.
+ +- [https://www.bioconductor.org/packages/release/bioc/html/BiocPkgTools.html](https://www.bioconductor.org/packages/release/bioc/html/BiocPkgTools.html) +- [https://www.bioconductor.org/packages/devel/bioc/html/BiocPkgTools.html](https://www.bioconductor.org/packages/devel/bioc/html/BiocPkgTools.html) + + +:::::::::::::::::::::::::::::::::::::::::::::::::: + +## Package vignettes + +Each Bioconductor package is required to include at least one vignette. +Many packages have more than one vignette, often separating core functionality from specific use cases. + +As we noted earlier in this episode, vignettes are available from [package landing pages][self-package-landing-page] on the Bioconductor website. +However, the landing page only links to the documentation of the most recent version of the package for each version of Bioconductor. +This may be a different version from the one that is installed in the user's R library and used in the R session. + +When Bioconductor packages are installed in the user's R library, the vignettes associated with that particular version of the package are also installed on the user's computer. +Those locally installed vignettes are the gold standard reference for the version of the package that is currently installed in the R library and used in the R session. +They can be accessed using the function `browseVignettes()`, for instance: + + +``` r +browseVignettes("BiocManager") +``` + +Specifically, the function `browseVignettes()` opens a local web page in the user's default web browser, listing all the vignettes available for the requested package. +Each vignette is available in three formats: + +- precompiled, in PDF or HTML format +- source, in Sweave or R markdown format +- as an R script + +The precompiled format is often the most comfortable format to read, as the PDF and HTML formats allow the contents of the documents to be previewed in one integrated view.
+This includes plain text explanations, as well as code and their outputs, both figures and console messages. + +## Package help pages + +Bioconductor requires every user-facing package function to be documented in one of the package help pages (often referred to as "man pages", after the name of the package sub-directory where they are stored). + +Help pages can be accessed using the `help()` function or the question mark symbol `?`. + + +``` r +help(topic = "install", package = "BiocManager") +?BiocManager::install +``` + +Help pages for Bioconductor packages generally follow the same rules as CRAN packages when it comes to formatting and essential contents. +However, Bioconductor also requires that most of the man pages documenting exported objects must have runnable examples. +Runnable examples are particularly helpful to demonstrate the usage of individual functions on small data sets immediately available to users - either artificially simulated on the fly or programmatically imported from public data repositories. + +In particular, runnable examples demonstrate the usage of functions in ideal cases, showcasing how the inputs of the function should be formatted, and what information will be available to users in the outputs of the function. +Running those examples and comparing the example inputs and outputs with the user's own data often provide significant insights into the transformations that are needed before applying the function to the user's own data, and how the outputs of the function can be interpreted and interacted with. + +## The Bioconductor support site + +The [Bioconductor support site][bioc-support-site] provides a platform for the community of users and developers to ask questions, and help each other through the doubts and challenges related to Bioconductor packages and analytical workflows. + +The support site can be freely browsed without an account to search and read the many questions that were already asked and answered.
+However, posting a new question does require an account on the platform. +Signing up to the platform is straightforward using an email address or Open Authorization from a number of trusted providers. + +A system of upvoting allows the most popular answers to feature more prominently at the top of each page. +Furthermore, the original poster retains the right to mark one answer as the one that resolved their issue. + +Separately, a system of points granted to each user for providing answers either popular or accepted by the original poster highlights the most active and trusted contributors on the platform. + +![](fig/bioc-support-site.png){alt='The Bioconductor support site.'} + +**The Bioconductor support site.** +The Bioconductor support site tracks questions and answers posted by registered users. +The platform can be freely browsed and searched by non-registered users. + +## Workflow packages + +Bioconductor workflow packages are special in the way that they are only expected to contain vignettes, without any additional code or functionality of their own. +Instead, the vignettes of workflow packages exclusively import functionality from other packages, and demonstrate how to combine functions from those packages into an integrated workflow that users are likely to face in their day-to-day work. + +Like regular vignettes, data is typically fetched from publicly available sources, including Bioconductor [ExperimentData][glossary-experiment-package] packages or the Bioconductor *[ExperimentHub](https://bioconductor.org/packages/3.19/ExperimentHub)*. +Those freely available standard data sets allow users to interactively reproduce outputs while they read and follow along the vignette. + +Workflow packages can be browsed in a dedicated section of the [biocViews page][biocviews-workflow]. + +## Slack workspace + +The [Bioconductor Slack workspace][bioc-slack] was created in 2016. 
+ +The workspace can be freely joined using this [Heroku app][bioc-slack-heroku] to generate invitations for individual email addresses. + +The Bioconductor Slack workspace is a lively online platform for official announcements by the [Bioconductor Core Team][bioc-core-team] (e.g., Bioconductor release, conferences \& events), as well as informal discussions between groups of users subscribed to thematic channels, and direct messages between community members. + +The workspace features a large number of channels dedicated to particular topics and areas of interest in the community. +Those channels range from active fields of research (e.g., single-cell genomics), to time-limited events (e.g., conferences), but also community outreach (e.g., diversity and representation). + +Private channels also exist for governance (e.g., event organisation, advisory boards). + +::::::::::::::::::::::::::::::::::::::::: callout + +### Note + +The Slack workspace allows for rapid day-to-day communication and discussion with fellow community members through channels and direct messages. + +However, popular channels can reach up to hundreds of users in many different time zones and should be used with parsimony and mindfulness. +Conversely, direct messages and private channels are limited to the users invited in the discussion, and any outcome relevant to the community then needs to be re-posted in a public channel. + +As a consequence, the [Bioconductor support site][bioc-support-site] remains the preferred way to publicly ask questions of interest to the community, in a way that both the question, discussion, and answers are easily searchable and indexed by major search engines. + + +:::::::::::::::::::::::::::::::::::::::::::::::::: + +## How to efficiently ask for help + +Most community members provide help voluntarily, on their own spare time and without any form of compensation. 
+As such, it is important to ask questions as clearly as possible, providing relevant information, to help those volunteers identify the source of the issue as rapidly as possible, thus saving time for both the helper and the original poster. + +Depending on the question, some key information includes: + +- Operating system +- Version of R +- Version of Bioconductor +- Version of individual packages installed in the R library +- Version of individual packages attached to the current session +- Third-party software, libraries, and compilers installed on the user's system +- Source of packages installed in the R library (e.g., Bioconductor, CRAN, GitHub) +- Code executed in the R session leading up to the issue +- Global options active in the session (accessible using `options()`) + +When the issue relates to code being run and producing unexpected outputs, it is paramount to include sufficient information for others to reproduce the issue on their own computer. +Indeed, many issues require a live R session to properly investigate the source of the issue, test fixes, or provide workarounds and advice. + +Crucially, when providing code as part of your post, it is important that this code be executable by readers, including data that are processed by the code. +Often, the code itself may look correct, while the issue relates to the interaction between the code and a particular data set. +If sharing sensitive data is not an option, then the issue should be reformulated and presented using a data set publicly available on the internet, or including code to generate simulated data randomly generated in a reproducible way (e.g., `set.seed()`). + +One option is to use the package *[reprex](https://CRAN.R-project.org/package=reprex)* to collate the code and outputs that describe the issue into formatted text that is easy to post on many online forums, including the [Bioconductor support site][bioc-support-site].
+ +Finally, the [Bioconductor support site][bioc-support-site] is the preferred platform to post questions related to Bioconductor packages. +This is because questions are visible to the entire community, including many experienced Bioconductor users who regularly answer those questions, and other users who can find answers to questions that were already posted and resolved by the time they run into the issue themselves. + +[bioc-website]: https://www.bioconductor.org/ +[biocviews-site]: https://www.bioconductor.org/packages/release/BiocViews.html +[bioc-courses-conferences-materials]: https://bioconductor.org/help/course-materials/ +[glossary-vignette]: reference.html#vignette +[glossary-experiment-package]: reference.html#experimentdata-package +[bioc-support-site]: https://support.bioconductor.org/ +[self-package-landing-page]: ..html#package-landing-pages +[biocviews-workflow]: https://www.bioconductor.org/packages/release/BiocViews.html#___Workflow +[bioc-slack]: https://community-bioc.slack.com/ +[bioc-slack-heroku]: https://bioc-community.herokuapp.com/ +[bioc-core-team]: https://www.bioconductor.org/about/core-team/ + + +:::::::::::::::::::::::::::::::::::::::: keypoints + +- The `browseVignettes()` function is recommended to access the vignette(s) installed with each package. +- Vignettes can also be accessed on the Bioconductor website, but beware of differences between package versions! +- The Bioconductor main website contains general information, package documentation, and course materials. +- The Bioconductor support site is the recommended place to contact developers and ask questions. 
+ +:::::::::::::::::::::::::::::::::::::::::::::::::: + + diff --git a/05-s4.md b/05-s4.md new file mode 100644 index 00000000..dc28f6b0 --- /dev/null +++ b/05-s4.md @@ -0,0 +1,556 @@ +--- +source: Rmd +title: S4 classes in Bioconductor +teaching: XX +exercises: XX +--- + + + +::::::::::::::::::::::::::::::::::::::: objectives + +- Explain what S4 classes, generics, and methods are. +- Identify S4 classes at the core of the Bioconductor package infrastructure. +- Create various S4 objects and apply relevant S4 methods. + +:::::::::::::::::::::::::::::::::::::::::::::::::: + +:::::::::::::::::::::::::::::::::::::::: questions + +- What is the S4 class system? +- How does Bioconductor use S4 classes? +- How is the Bioconductor `DataFrame` different from the base `data.frame`? + +:::::::::::::::::::::::::::::::::::::::::::::::::: + +## Install packages + +Before we can proceed into the following sections, we install some Bioconductor packages that we will need. +First, we check that the *[BiocManager](https://bioconductor.org/packages/3.19/BiocManager)* package is installed before trying to use it; otherwise we install it. +Then we use the `BiocManager::install()` function to install the necessary packages. + + +``` r +if (!requireNamespace("BiocManager", quietly = TRUE)) + install.packages("BiocManager") + +BiocManager::install("S4Vectors") +``` + +:::::::::::::::::::::::::::::::::::::::::: prereq + +### For Instructors + +The first part of this episode may look somewhat heavy on the theory. +Do not be tempted to go into excessive details about the inner workings of the S4 class system (e.g., no need to mention the function `new()`, or to demonstrate a concrete example of code creating a class). 
+Instead, the caption of the first figure demonstrates how to progressively talk through the figure, introducing technical terms in simple sentences, building up to the method dispatch concept that is core to the S4 class system, and the source of much confusion in novice users. + + +:::::::::::::::::::::::::::::::::::::::::::::::::: + +## S4 classes and methods + +### The methods package + +The S4 class system is implemented in the base package [methods][r-package-methods]. +As such, the concept is not specific to the Bioconductor project and can be found in various independent packages as well. +The subject is thoroughly documented in the online book [Advanced R][book-advanced-r-s4], by Hadley Wickham. +Most Bioconductor users will never need to get overly familiar with the intricacies of the S4 class system. +Rather, the key to an efficient use of packages in the Bioconductor project relies on a sufficient understanding of key motivations for using the S4 class system, as well as best practices for user-facing functionality, including classes, generics, and methods. +In the following sections of this episode, we focus on the essential functionality and user experience of S4 classes and methods in the context of the Bioconductor project. + +On one side, [S4 classes][glossary-s4-class] provide data structures capable of storing arbitrarily complex information in computational objects that can be assigned to variable names in an R session. +On the other side, [S4 generics and methods][glossary-s4-method] define functions that may be applied to process those objects. + +Over the years, the Bioconductor project has used the S4 class system to develop a number of classes and methods capable of storing and processing data for most biological assays, including raw and processed assay data, experimental metadata for individual features and samples, as well as other assay-specific information as relevant. 
+Gaining familiarity with the standard S4 classes commonly used throughout Bioconductor packages is a key step in building up confidence in users wishing to follow best practices while developing analytical workflows. + +![](fig/bioc-s4.svg){alt='S4 classes, generics, and methods.'} + +**S4 classes, generics, and methods.** +On the left, two example classes named `S4Class1` and `S4Class2` demonstrate the concept of inheritance. +The class `S4Class1` contains two slots named `SlotName1` and `SlotName2` for storing data. +Those two slots are restricted to store objects of type `SlotType1` and `SlotType2`, respectively. +The class also defines validity rules that check the integrity of data each time an object is updated. +The class `S4Class2` inherits all the slots and validity rules from the class `S4Class1`, in addition to defining a new slot named `SlotName3` and new validity rules. +Example code illustrates how objects of each class are typically created using constructor functions named identically to the corresponding class. +On the right, one generic function and two methods demonstrate the concept of polymorphism and the process of S4 method dispatch. +The generic function `S4Generic1()` defines the name of the function, as well as its arguments. +However, it does not provide any implementation of that function. +Instead, two methods are defined, each providing a distinct implementation of the generic function for a particular class of input. +Namely, the first method defines an implementation of `S4Generic1()` if an object of class `S4Class1` is given as argument `x`, while the second method provides a different implementation of `S4Generic1()` if an object of class `S4Class2` is given as argument `x`.
+When the generic function `S4Generic1()` is called, a process called [method dispatch][glossary-s4-dispatch] takes place, whereby the appropriate implementation of the `S4Generic1()` method is called according to the class of the object passed to the argument `x`. + +### Slots and validity + +In contrast to the S3 class system available directly in base R (not described in this lesson), the S4 class system provides a much stricter definition of classes and methods for object-oriented programming (OOP) in R. +Like many programming languages that implement the OOP model, S4 classes are used to represent real-world entities as computational objects that store information inside one or more internal components called *slots*. +The class definition declares the type of data that may be stored in each slot; an error will be thrown if one attempts to store unsuitable data. +Moreover, the class definition can also include code that checks the validity of data stored in an object, beyond their type. +For instance, while a slot of type `numeric` could be used to store a person's age, a validity method could check that the value stored is, in fact, positive. + +### Inheritance + +One of the core pillars of the OOP model is the possibility to develop new classes that inherit and extend the functionality of existing classes. +The S4 class system implements this paradigm. + +The definition of a new S4 class can declare the names of other classes to inherit from. +The new class will contain all the slots of the parent classes, in addition to any new slot added in the definition of the new class itself. + +The new class definition can also define new validity checks, which are added to any validity check implemented in each of the parent classes. + +### Generics and methods + +While classes define the data structures that store information, generics and methods define the functions that can be applied to objects instantiated from those classes. 
+ +S4 generic functions are used to declare the name of functions that are expected to behave differently depending on the class of objects that are given as some of their essential arguments. +In turn, S4 methods are used to define the distinct implementations of a generic function for each particular combination of inputs. + +When a generic function is called and given an S4 object, a process called [method dispatch][glossary-s4-dispatch] takes place, whereby the class of the object is used to determine the appropriate method to execute. + +## The S4Vectors package + +The *[S4Vectors](https://bioconductor.org/packages/3.19/S4Vectors)* package defines the `Vector` and `List` virtual classes and a set of generic functions that extend the semantics of ordinary vectors and lists in R. +Using the S4 class system, package developers can easily implement vector-like or list-like objects as concrete subclasses of `Vector` or `List`. + +Virtual classes -- such as `Vector` and `List` -- cannot be instantiated into objects themselves. +Instead, those virtual classes provide core functionality inherited by all the concrete classes that are derived from them. + +In addition, a few low-level concrete subclasses of general interest (e.g. `DataFrame`, `Rle`, and `Hits`) are implemented in the *[S4Vectors](https://bioconductor.org/packages/3.19/S4Vectors)* package itself, and many more are implemented in other packages throughout the Bioconductor project (e.g., *[IRanges](https://bioconductor.org/packages/3.19/IRanges)*). + +Attach the package to the current R session as follows. + + +``` r +library(S4Vectors) +``` + +::::::::::::::::::::::::::::::::::::::::: callout + +### Note + +The package startup messages printed in the console indicate that the *[S4Vectors](https://bioconductor.org/packages/3.19/S4Vectors)* package masks a number of functions from the [base][r-package-base] package when the package is attached to the session. 
+This means that the *[S4Vectors](https://bioconductor.org/packages/3.19/S4Vectors)* package includes an implementation of those functions, and that -- being the latest package attached to the R session -- its own implementation of those functions will be found first on the R search path and used instead of their original implementation in the [base][r-package-base] package. + +In many cases, masked functions can be used as before without any issue. +Occasionally, it may be necessary to disambiguate calls to masked function using the package name as well as the function name, e.g. `base::anyDuplicated()`. + + +:::::::::::::::::::::::::::::::::::::::::::::::::: + +## The DataFrame class + +### An extension to the concept of rectangular data + +The `DataFrame` class implemented in the *[S4Vectors](https://bioconductor.org/packages/3.19/S4Vectors)* package extends the concept of rectangular data familiar to users of the `data.frame` class in base R, or `tibble` in the tidyverse. +Specifically, the `DataFrame` supports the storage of any type of object (with `length` and `[` methods) as columns. + +On the whole, the `DataFrame` class provides a formal definition of an S4 class that behaves very similarly to `data.frame`, in terms of construction, subsetting, splitting, combining, etc. + +The `DataFrame()` constructor function should be used to create new objects, comparably to the `data.frame()` equivalent in base R. +The help page for the function, accessible as `?DataFrame`, can be consulted for more information. + + +``` r +DF1 <- DataFrame( + Integers = c(1L, 2L, 3L), + Letters = c("A", "B", "C"), + Floats = c(1.2, 2.3, 3.4) +) +DF1 +``` + +``` output +DataFrame with 3 rows and 3 columns + Integers Letters Floats + +1 1 A 1.2 +2 2 B 2.3 +3 3 C 3.4 +``` + +In fact, `DataFrame` objects can be easily converted to equivalent `data.frame` objects. 
+ + +``` r +df1 <- as.data.frame(DF1) +df1 +``` + +``` output + Integers Letters Floats +1 1 A 1.2 +2 2 B 2.3 +3 3 C 3.4 +``` + +Vice versa, we can also convert `data.frame` objects to `DataFrame` using the `as()` function. + + +``` r +as(df1, "DataFrame") +``` + +``` output +DataFrame with 3 rows and 3 columns + Integers Letters Floats + +1 1 A 1.2 +2 2 B 2.3 +3 3 C 3.4 +``` + +### Differences with the base data.frame + +The most notable exceptions have to do with handling of row names. +First, row names are optional. This means calling `rownames(x)` will return `NULL` if there are no row names. + + +``` r +rownames(DF1) +``` + +``` output +NULL +``` + +This is different from `data.frame`, where `rownames(x)` returns the equivalent of `as.character(seq_len(nrow(x)))`. + + +``` r +rownames(df1) +``` + +``` output +[1] "1" "2" "3" +``` + +However, returning `NULL` informs, for example, combination functions that no row names are desired (they are often a luxury when dealing with large data). + +Furthermore, row names of `DataFrame` objects are not required to be unique, in contrast to the `data.frame` in base R. +Row names are a frequent source of controversy in R, as they can be used to uniquely identify and index observations in rectangular format, without storing that information explicitly in a dedicated column. +When set, row names can be used to subset rectangular data using the `[` operator. +Meanwhile, non-unique row names defeat that purpose and can lead to unexpected results, as only the first occurrence of each selected row name is extracted. +Instead, the tidyverse `tibble` removed the ability to set row names altogether, forcing users to store every bit of information explicitly in dedicated columns, while providing functions dedicated to efficiently filtering rows in rectangular data, without the need for the `[` operator. 
+ + +``` r +DF2 <- DataFrame( + Integers = c(1L, 2L, 3L), + Letters = c("A", "B", "C"), + Floats = c(1.2, 2.3, 3.4), + row.names = c("name1", "name1", "name2") +) +DF2 +``` + +``` output +DataFrame with 3 rows and 3 columns + Integers Letters Floats + +name1 1 A 1.2 +name1 2 B 2.3 +name2 3 C 3.4 +``` + +::::::::::::::::::::::::::::::::::::::: challenge + +### Challenge + +Using the example above, what does `DF2["name1", ]` return? Why? + +::::::::::::::: solution + +### Solution + +``` +> DF2["name1", ] +DataFrame with 1 row and 3 columns + Integers Letters Floats + +name1 1 A 1.2 +``` + +Only the first occurrence of a row matching the row name `name1` is returned. + +In this case, row names do not have a particular meaning, making it difficult to justify the need for them. +Instead, users could extract all the rows matching the row name `name1` more explicitly as follows: `DF2[rownames(DF2) == "name1", ]`. + +Users should be mindful of the motivation for using row names in any given situation: what they represent, and how they should be used during the analysis. + + + +::::::::::::::::::::::::: + +:::::::::::::::::::::::::::::::::::::::::::::::::: + +Finally, row names in `DataFrame` do not support partial matching during subsetting, in contrast to `data.frame`. +The stricter behaviour of `DataFrame` prevents the unexpected results often faced by unsuspecting users. + + +``` r +DF3 <- DataFrame( + Integers = c(1L, 2L, 3L), + Letters = c("A", "B", "C"), + Floats = c(1.2, 2.3, 3.4), + row.names = c("alpha", "beta", "gamma") +) +df3 <- as.data.frame(DF3) +``` + +::::::::::::::::::::::::::::::::::::::: challenge + +### Challenge + +Using the examples above, what are the outputs of `DF3["a", ]` and `df3["a", ]`? +Why are they different? 
+ +::::::::::::::: solution + +### Solution + +``` +> DF3["a", ] +DataFrame with 1 row and 3 columns + Integers Letters Floats + + NA NA NA +> df3["a", ] + Integers Letters Floats +alpha 1 A 1.2 +``` + +The `DataFrame` object did not perform partial row name matching, and thus did not match any row and returned a `DataFrame` full of `NA` values. +Instead, the `data.frame` object performed partial row name matching, matched the requested `"a"` to the `"alpha"` row name, and returned the corresponding row as a new `data.frame` object. + + + +::::::::::::::::::::::::: + +:::::::::::::::::::::::::::::::::::::::::::::::::: + +### Indexing + +Just like a regular `data.frame`, columns can be accessed using `$`, `[`, and `[[`. +Each operator has a different purpose, and the most appropriate one will often depend on what you are trying to achieve. + +For example, the dollar operator `$` can be used to extract a single column by name. +That will often be a vector, but it may depend on the nature of the data in that column. +This operator can be quite convenient in an interactive R session, as it will offer autocompletion among available column names. + + +``` r +DF3$Integers +``` + +``` output +[1] 1 2 3 +``` + +Similarly, the double bracket operator `[[` can also be used to extract a single column. +It is more flexible than `$` as it can handle both character names and integer indices. + + +``` r +DF3[["Letters"]] +``` + +``` output +[1] "A" "B" "C" +``` + +``` r +DF3[[2]] +``` + +``` output +[1] "A" "B" "C" +``` + +The operator `[` is most convenient when it comes to selecting simultaneously on rows and columns, or controlling whether a single-column selection should be returned as a `DataFrame` or a `vector`. 
+ + +``` r +DF3[2:3, "Letters", drop=FALSE] +``` + +``` output +DataFrame with 2 rows and 1 column + Letters + +beta B +gamma C +``` + +### Metadata columns + +One of the most notable novel functionalities in `DataFrame` relative to the base `data.frame` is the capacity to hold metadata on the columns in another `DataFrame`. + +![](fig/dataframe-mcols.svg){alt='Metadata columns.'} + +**Metadata columns.** +Metadata columns are illustrated in the context of a `DataFrame` object. +On the left, a `DataFrame` object called `DF` is created with columns named `A` and `B`. +On the right, the metadata columns for `DF` are accessed using `mcols(DF)`. +In this example, two metadata columns are created with names `meta1` and `meta2`. +Metadata columns are stored as a `DataFrame` that contains one row for each column in the parent `DataFrame`. + +The metadata columns are accessed using the function `mcols()`. +If no metadata column is defined, `mcols()` simply returns `NULL`. + + +``` r +DF4 <- DataFrame( + Integers = c(1L, 2L, 3L), + Letters = c("A", "B", "C"), + Floats = c(1.2, 2.3, 3.4), + row.names = c("alpha", "beta", "gamma") +) +mcols(DF4) +``` + +``` output +NULL +``` + +The function `mcols()` can also be used to add, edit, or remove metadata columns. 
+For instance, we can initialise metadata columns as a `DataFrame` of two columns: + +- one column indicating the type of value stored in the corresponding column +- one column indicating the number of distinct values observed in the corresponding column + + +``` r +mcols(DF4) <- DataFrame( + Type = sapply(DF4, typeof), + Distinct = sapply(DF4, function(x) { length(unique(x)) } ) +) +mcols(DF4) +``` + +``` output +DataFrame with 3 rows and 2 columns + Type Distinct + +Integers integer 3 +Letters character 3 +Floats double 3 +``` + +::::::::::::::::::::::::::::::::::::::::: callout + +### Note + +The row names of the metadata columns are automatically set to match the column names of the parent `DataFrame`, clearly indicating the pairing between columns and metadata. + + +:::::::::::::::::::::::::::::::::::::::::::::::::: + +## Run-length encoding (RLE) + +### An extension to the concept of vector + +Similarly to the `DataFrame` class implemented in the *[S4Vectors](https://bioconductor.org/packages/3.19/S4Vectors)* package, +the `Rle` class provides an S4 extension to the `rle()` function from the base package. +Specifically, the `Rle` class supports the storage of atomic vectors in a run-length encoding format. + +![](fig/rle.svg){alt='Run-length encoding.'} + +**Run-length encoding.** +The concept of run-length encoding is demonstrated here using the example of a sequence of nucleic acids. +Before encoding, each nucleotide at each position in the sequence is explicitly stored in memory. +During the encoding, consecutive runs of identical nucleotides are collapsed into two pieces of information: the identity of the nucleotide and the length of the run. + +Run-length encoding can dramatically reduce the memory footprint of vectors that contain frequent runs of identical information. 
+For instance, a compelling application of run-length encoding is the representation of genomic coverage in sequencing experiments, where large genomic regions devoid of any mapped reads result in long runs of `0` values. +Storing each individual value would be highly inefficient from the standpoint of memory usage. +Instead, the run-length encoding process collapses such runs of redundant information from arbitrarily long runs of identical information to two values: the repeated value itself, and the number of times that it is repeated. + + +``` r +v1 <- c(0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 2, 1, 0, 0, 0, 0, 0) +rle1 <- Rle(v1) +rle1 +``` + +``` output +numeric-Rle of length 17 with 7 runs + Lengths: 7 1 1 1 1 1 5 + Values : 0 1 2 3 2 1 0 +``` + +### Indexing + +Just like a regular `vector`, `Rle` objects can be indexed using `[`. + + +``` r +rle1[2:4] +``` + +``` output +numeric-Rle of length 3 with 1 run + Lengths: 3 + Values : 0 +``` + +### Usage + +As vector-like objects, `Rle` objects can also be stored as columns of `DataFrame` objects, alongside other vector-like objects. + + +``` r +v2 <- c(rep(1, 5), rep(2, 5)) +rle2 <- Rle(v2) +DF5 <- DataFrame( + vector = v2, + rle = rle2, + equal = v2 == rle2 +) +DF5 +``` + +``` output +DataFrame with 10 rows and 3 columns + vector rle equal + +1 1 1 TRUE +2 1 1 TRUE +3 1 1 TRUE +4 1 1 TRUE +5 1 1 TRUE +6 2 2 TRUE +7 2 2 TRUE +8 2 2 TRUE +9 2 2 TRUE +10 2 2 TRUE +``` + +::::::::::::::::::::::::::::::::::::::::: callout + +### Going further + +A number of standard operations with `Rle` objects are documented in the help page of the `Rle` class, accessible as `?Rle`, and in the vignettes of the *[S4Vectors](https://bioconductor.org/packages/3.19/S4Vectors)* package, accessible using `browseVignettes("S4Vectors")`. 
+ + +:::::::::::::::::::::::::::::::::::::::::::::::::: + +[r-package-methods]: https://stat.ethz.ch/R-manual/R-devel/library/methods/html/00Index.html +[book-advanced-r-s4]: https://adv-r.had.co.nz/S4.html +[glossary-s4-class]: reference.html#s4-class +[glossary-s4-method]: reference.html#s4-method +[glossary-s4-dispatch]: reference.html#s4-method-dispatch +[r-package-base]: https://stat.ethz.ch/R-manual/R-devel/library/base/html/00Index.html + + +:::::::::::::::::::::::::::::::::::::::: keypoints + +- S4 classes store information in slots, and check the validity of the information every time an object is updated. +- To ensure the continued integrity of S4 objects, users should not access slots directly, but use dedicated functions instead. +- S4 generics invoke different implementations of the method depending on the class of the object that they are given. +- The S4 class `DataFrame` extends the functionality of base `data.frame`, for instance with the capacity to hold information about each column in metadata columns. +- The S4 class `Rle` extends the functionality of the base `vector`, for instance with the capacity to encode repetitive vectors in a memory-efficient format. + +:::::::::::::::::::::::::::::::::::::::::::::::::: + + diff --git a/06-biological-sequences.md b/06-biological-sequences.md new file mode 100644 index 00000000..905a2df0 --- /dev/null +++ b/06-biological-sequences.md @@ -0,0 +1,1210 @@ +--- +source: Rmd +title: Working with biological sequences +teaching: XX +exercises: XX +--- + + + +::::::::::::::::::::::::::::::::::::::: objectives + +- Explain how biological sequences are represented in the Bioconductor project. +- Identify Bioconductor packages and methods available to process biological sequences. + +:::::::::::::::::::::::::::::::::::::::::::::::::: + +:::::::::::::::::::::::::::::::::::::::: questions + +- What is the recommended way to represent biological sequences in Bioconductor? 
+- Which Bioconductor packages provide methods to efficiently process biological sequences? + +:::::::::::::::::::::::::::::::::::::::::::::::::: + + + +## Install packages + +Before we can proceed into the following sections, we install some Bioconductor packages that we will need. +First, we check that the *[BiocManager](https://bioconductor.org/packages/3.19/BiocManager)* package is installed before trying to use it; otherwise we install it. +Then we use the `BiocManager::install()` function to install the necessary packages. + + +``` r +if (!requireNamespace("BiocManager", quietly = TRUE)) + install.packages("BiocManager") + +BiocManager::install("Biostrings") +``` + +## The Biostrings package and classes + +### Why do we need classes for biological sequences? + +Biological sequences are arguably some of the simplest biological entities to +represent computationally. +Examples include nucleic acid sequences (e.g., DNA, RNA) and protein sequences +composed of amino acids. + +That is because alphabets have been designed and agreed upon to represent +individual monomers using character symbols. + +For instance, using the alphabet for amino acids, the reference sequence +for the [Actin, alpha skeletal muscle protein](https://www.uniprot.org/uniprot/P68133#sequences) is represented as +follows. + + +``` output +[1] "MCDEDETTALVCDNGSGLVKAGFAGDDAPRAVFPSIVGRPRHQGVMVGMGQKDSYVGDEAQSKRGILTLKYPIEHGIITNWDDMEKIWHHTFYNELRVAPEEHPTLLTEAPLNPKANREKMTQIMFETFNVPAMYVAIQAVLSLYASGRTTGIVLDSGDGVTHNVPIYEGYALPHAIMRLDLAGRDLTDYLMKILTERGYSFVTTAEREIVRDIKEKLCYVALDFENEMATAASSSSLEKSYELPDGQVITIGNERFRCPETLFQPSFIGMESAGIHETTYNSIMKCDIDIRKDLYANNVMSGGTTMYPGIADRMQKEITALAPSTMKIKIIAPPERKYSVWIGGSILASLSTFQQMWITKQEYDEAGPSIVHRKCF" +``` + +However, a major limitation of regular character vectors is that they do not +check the validity of the sequences that they contain. 
+Practically, it is possible to store meaningless sequences of symbols in +character strings, including symbols that are not part of the official alphabet +for the relevant type of polymer. +In those cases, the burden of checking the validity of sequences falls on the +programs that process them; otherwise, those programs may run into errors when +they unexpectedly encounter invalid symbols in a sequence. + +Instead, [S4 classes][glossary-s4-class] -- demonstrated in the earlier episode [The S4 class system][crossref-s4] -- provide a way to label objects as distinct "DNA", "RNA", or "protein" varieties of biological sequences. +This label is an extremely powerful way to inform programs on the set of character symbols they can expect in the sequence, but also the range of computational operations that can be applied to those sequences. +For instance, a function designed to translate nucleic acid sequences into the corresponding amino acid sequence should only be allowed to run on sequences that represent nucleic acids. + +::::::::::::::::::::::::::::::::::::::: challenge + +### Challenge + +Can you tell whether this character string is a valid DNA sequence? + +``` +AATTGGCCRGGCCAATT +``` + +::::::::::::::: solution + +### Solution + +Yes, this is a valid DNA sequence using ambiguity codes defined in the [IUPAC][external-iupac] notation. +In this case, `A`, `T`, `C`, and `G` represent the four standard types of +nucleotides, while the `R` symbol acts like a regular expression representing +either of the two purine nucleotide bases, `A` and `G`. + +::::::::::::::::::::::::: + +:::::::::::::::::::::::::::::::::::::::::::::::::: + +## The Biostrings package + +### Overview + +In the Bioconductor project, the *[Biostrings](https://bioconductor.org/packages/3.19/Biostrings)* package +implements S4 classes to represent biological sequences as S4 objects, e.g. 
+`DNAString` for sequences of nucleotides in deoxyribonucleic acid polymers, and +`AAString` for sequences of amino acids in protein polymers. +Those S4 classes provide memory-efficient containers for character strings, +automatic validity-checking functionality for each class of biological +molecules, and methods implementing various string matching algorithms and other +utilities for fast manipulation and processing of large biological sequences or +sets of sequences. + +A short presentation of the basic classes defined in the +*[Biostrings](https://bioconductor.org/packages/3.19/Biostrings)* package is available in one of the package +vignettes, accessible as `vignette("Biostrings2Classes")`, while more detailed +information is provided in the other package vignettes, accessible as +`browseVignettes("Biostrings")`. + +### First steps + +To get started, we load the package. + + +``` r +library(Biostrings) +``` + +With the package loaded and attached to the session, we have access to the +package functions. +Those include functions that let us create new objects of the classes defined +in the package. +For instance, we can create an object that represents a DNA sequence, using the +`DNAString()` constructor function. +Without assigning the output to an object, we let the resulting object be +printed in the console. + + +``` r +DNAString("ATCG") +``` + +``` output +4-letter DNAString object +seq: ATCG +``` + +Notably, DNA sequences may only contain the symbols `A`, `T`, `C`, and `G`, to +represent the four DNA nucleotide bases, the symbol `N` as a placeholder for an +unknown or unspecified base, and a restricted set of additional symbols with +special meaning defined in the +[IUPAC Extended Genetic Alphabet][iupac-alphabet]. +Notice that the constructor function does not let us create objects that contain +invalid characters, e.g. `Z`. 
+ + +``` r +DNAString("ATCGZ") +``` + +``` error +Error in .Call2("new_XString_from_CHARACTER", class(x0), string, start, : key 90 (char 'Z') not in lookup table +``` + +Specifically, the [IUPAC Extended Genetic Alphabet][iupac-alphabet] defines +ambiguity codes that represent sets of nucleotides, in a way similar to regular +expressions. +The `IUPAC_CODE_MAP` named character vector contains the mapping from the IUPAC +nucleotide ambiguity codes to their meaning. + + +``` r +IUPAC_CODE_MAP +``` + +``` output + A C G T M R W S Y K V + "A" "C" "G" "T" "AC" "AG" "AT" "CG" "CT" "GT" "ACG" + H D B N + "ACT" "AGT" "CGT" "ACGT" +``` + +Any of those nucleotide codes are allowed in the sequence of a `DNAString` +object. +For instance, the symbol `M` represents either of the two nucleotides `A` or `C` +at a given position in a nucleic acid sequence. + + +``` r +DNAString("ATCGM") +``` + +``` output +5-letter DNAString object +seq: ATCGM +``` + +In particular, pattern matching methods implemented in the +*[Biostrings](https://bioconductor.org/packages/3.19/Biostrings)* package recognize the meaning of ambiguity +codes for each class of biological sequence, allowing them to efficiently match +motifs queried by users without the need to design elaborate regular +expressions. +For instance, the method `matchPattern()` takes a `pattern=` and a `subject=` +argument, and returns a `Views` object that reports and displays any match of +the `pattern` expression at any position in the `subject` sequence. + +Note that the default option `fixed = TRUE` instructs the method to match the +query exactly -- i.e., ignore ambiguity codes -- which in this case does not report +any exact match. + + +``` r +dna1 <- DNAString("ATCGCTTTGA") +matchPattern("GM", dna1, fixed = TRUE) +``` + +``` output +Views on a 10-letter DNAString subject +subject: ATCGCTTTGA +views: NONE +``` + +Instead, to indicate that the pattern includes some ambiguity code, the argument +`fixed` must be set to `FALSE`. 
+ + +``` r +matchPattern("GM", dna1, fixed = FALSE) +``` + +``` output +Views on a 10-letter DNAString subject +subject: ATCGCTTTGA +views: + start end width + [1] 4 5 2 [GC] + [2] 9 10 2 [GA] +``` + +In this particular example, two views describe matches of the pattern in the +subject sequence. +Specifically, the pattern `GM` first matched the sequence `GC` spanning +positions 4 to 5 in the subject sequence, and then also matched the sequence +`GA` from positions 9 to 10. + +Similarly to the method `matchPattern()`, the method `countPattern()` can be +applied to simply count the number of matches of the `pattern` in the `subject` +sequence. +And again, the option `fixed` controls whether to respect ambiguity codes, or +match them exactly. + +::::::::::::::::::::::::::::::::::::::: challenge + +### Challenge + +How many hits does the following code return? Why? + +``` +dna2 <- DNAString("TGATTGCTTGGTTGMTT") +countPattern("GM", dna2, fixed = FALSE) +``` + +::::::::::::::: solution + +### Solution + +The method `countPattern()` reports 3 hits, because the option +`fixed = FALSE` allows the pattern `GM` to match `GA`, `GC`, and `GM` +sequences, due to the use of the ambiguity code `M` in the `pattern`. + +::::::::::::::::::::::::: + +:::::::::::::::::::::::::::::::::::::::::::::::::: + +### Importing biological strings from files + +In practice, users rarely type the strings representing biological sequences themselves. +Most of the time, biological strings are imported from files, either downloaded from public repositories or generated locally using bioinformatics programs. + +For instance, we can load the set of adapter sequences for the [TruSeq™ DNA PCR-Free whole-genome sequencing library preparation][external-truseq] kit from a file that we downloaded during the lesson setup. +Since adapter sequences are nucleic acid sequences, we must use the function `readDNAStringSet()`. 
+ + +``` r +truseq_adapters <- readDNAStringSet(filepath = "data/TruSeq3-PE-2.fa") +truseq_adapters +``` + +``` output +DNAStringSet object of length 6: + width seq names +[1] 34 TACACTCTTTCCCTACACGACGCTCTTCCGATCT PrefixPE/1 +[2] 34 GTGACTGGAGTTCAGACGTGTGCTCTTCCGATCT PrefixPE/2 +[3] 34 TACACTCTTTCCCTACACGACGCTCTTCCGATCT PE1 +[4] 34 AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTA PE1_rc +[5] 34 GTGACTGGAGTTCAGACGTGTGCTCTTCCGATCT PE2 +[6] 34 AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC PE2_rc +``` + +::::::::::::::::::::::::::::::::::::::::: callout + +### Going further + +The help page of the function `readDNAStringSet()` -- accessible using +`help(readDNAStringSet)` -- documents related functions designed to import +other types of biological sequences, e.g `readRNAStringSet()`, +`readAAStringSet()`. + +:::::::::::::::::::::::::::::::::::::::::::::::::: + +### Operations on biological strings + +#### Computing the frequency of symbols + +The *[Biostrings](https://bioconductor.org/packages/3.19/Biostrings)* package provides several functions to +process and manipulate classes of biological strings. +For example, we have come across `matchPattern()` and `countPattern()` earlier +in this episode. + +Another example of a method that can be applied to biological strings is +`letterFrequency()`, to compute the frequency of letters in a biological +sequence. + + +``` r +letterFrequency(truseq_adapters, letters = DNA_ALPHABET) +``` + +``` output + A C G T M R W S Y K V H D B N - + . 
+[1,] 6 14 3 11 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +[2,] 5 8 10 11 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +[3,] 6 14 3 11 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +[4,] 11 3 14 6 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +[5,] 5 8 10 11 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +[6,] 11 10 8 5 0 0 0 0 0 0 0 0 0 0 0 0 0 0 +``` + +The output is a matrix with one row for each sequence in the `DNAStringSet` +object, and one column for each symbol in the alphabet of deoxyribonucleic +acids, provided by the *[Biostrings](https://bioconductor.org/packages/3.19/Biostrings)* package in a +built-in object called `DNA_ALPHABET`. + +### Amino acid sequences + +Similarly to the `DNAString` and `DNAStringSet` classes, the classes `AAString` and `AAStringSet` allow efficient storage and manipulation of a long amino acid sequence, or a set thereof. + +Similarly to built-in objects for the DNA alphabet, the built-in objects `AA_ALPHABET`, `AA_STANDARD` and `AA_PROTEINOGENIC` describe different subsets of the alphabet of valid symbols for amino acid sequences. + +For instance, the `AA_ALPHABET` object describes the set of symbols in the full amino acid alphabet. + + +``` r +AA_ALPHABET +``` + +``` output + [1] "A" "R" "N" "D" "C" "Q" "E" "G" "H" "I" "L" "K" "M" "F" "P" "S" "T" "W" "Y" +[20] "V" "U" "O" "B" "J" "Z" "X" "*" "-" "+" "." +``` + +::::::::::::::::::::::::::::::::::::::: challenge + +### Challenge + +Use base R code to identify the two symbols present in the `AA_PROTEINOGENIC` +alphabet object that are absent from the `AA_STANDARD` alphabet object. +What do those two symbols represent? + +::::::::::::::: solution + +### Solution + +``` +> setdiff(AA_PROTEINOGENIC, AA_STANDARD) +[1] "U" "O" +``` + +The symbols `U` and `O` represent selenocysteine and pyrrolysine, respectively. +Those two amino acids are in some species coded for by codons that are usually interpreted as stop codons. 
+As such, they are not included in the alphabet of "standard" amino acids, and an alphabet of "proteinogenic" amino acids was defined to acknowledge the special biology of those amino acids. +Either of those alphabets may be used to determine the validity of an amino acid sequence, depending on its biological nature. + +::::::::::::::::::::::::: + +:::::::::::::::::::::::::::::::::::::::::::::::::: + +### Translating nucleotide sequences + +One of the key motivations for the use of [S4 classes][glossary-s4-class] and the object-oriented programming (OOP) model is the infrastructure of S4 generics and methods. +As described in the earlier episode [The S4 class system][crossref-s4], generics provide a mechanism for defining and applying distinct implementations of the same generic function name, according to the nature of the input object(s) provided to the function call. + +For instance, the *[Biostrings](https://bioconductor.org/packages/3.19/Biostrings)* package provides multiple implementations of a generic called `translate()`, for translating DNA or RNA sequences into amino acid sequences. +The set of input objects supported by the generic `translate()` can be listed using the function `showMethods()`, from the base R package *[methods](https://stat.ethz.ch/R-manual/R-devel/library/methods/html/00Index.html)*. + + +``` r +showMethods("translate") +``` + +``` output +Function: translate (package Biostrings) +x="DNAString" +x="DNAStringSet" +x="MaskedDNAString" +x="MaskedRNAString" +x="RNAString" +x="RNAStringSet" +``` + +In the output above, we see that the generic function `translate()` includes methods capable of handling objects representing DNA and RNA sequences in the `DNAString` and `RNAString` classes, respectively; +but also lists of DNA and RNA sequences in objects of class `DNAStringSet` and `RNAStringSet`, as well as other classes capable of storing DNA and RNA sequences. 
+ +To demonstrate the use of the `translate()` method, we first load a set of open +reading frames (ORFs) identified by the +[NIH Open Reading Frame Finder][orf-finder] +for the *Homo sapiens* actin beta (ACTB) mRNA (RefSeq: NM\_001101), +using the standard genetic code, a minimal ORF length of 75 nucleotides, +and starting with the "ATG" start codon only. + + +``` r +actb_orf_nih <- readDNAStringSet("data/actb_orfs.fasta") +actb_orf_nih +``` + +``` output +DNAStringSet object of length 13: + width seq names + [1] 222 ATGCCCACCATCACGCCCTGGTG...CGGGGCGGACGCGGTCTCGGCG gi|1519311456|ref... + [2] 1128 ATGGATGATGATATCGCCGCGCT...CGTCCACCGCAAATGCTTCTAG gi|1519311456|ref... + [3] 126 ATGATGATATCGCCGCGCTCGTC...CGCCCCAGGCACCAGGGCGTGA gi|1519311456|ref... + [4] 90 ATGTCGTCCCAGTTGGTGACGAT...CTGGGCCTCGTCGCCCACATAG gi|1519311456|ref... + [5] 225 ATGGGCACAGTGTGGGTGACCCC...AGCCACACGCAGCTCATTGTAG gi|1519311456|ref... + ... ... ... + [9] 342 ATGAGATTGGCATGGCTTTATTT...ATGTAATGCAAAATTTTTTTAA gi|1519311456|ref... +[10] 168 ATGGCTTTATTTGTTTTTTTTGT...TTGCACATTGTTGTTTTTTTAA gi|1519311456|ref... +[11] 111 ATGACTATTAAAAAAACAACAAT...CCTTCACCGTTCCAGTTTTTAA gi|1519311456|ref... +[12] 105 ATGCAAAATTTTTTTAATCTTCG...CCTTTTTTGTCCCCCAACTTGA gi|1519311456|ref... +[13] 135 ATGATGAGCCTTCGTGCCCCCCC...TGACTTGAGACCAGTTGAATAA gi|1519311456|ref... +``` + +Having imported the nucleotide sequences as a `DNAStringSet` object, we can +apply the `translate()` method to that object to produce the amino acid +sequence that results from the translation process for each nucleotide sequence. + + +``` r +actb_aa <- translate(actb_orf_nih) +actb_aa +``` + +``` output +AAStringSet object of length 13: + width seq names + [1] 74 MPTITPWCLGRPTMEGKTARGAS...VWTGGGSAKARLCARGADAVSA gi|1519311456|ref... + [2] 376 MDDDIAALVVDNGSGMCKAGFAG...MWISKQEYDESGPSIVHRKCF* gi|1519311456|ref... + [3] 42 MMISPRSSSTTAPACARPASRATMPPGPSSPPSWGAPGTRA* gi|1519311456|ref... + [4] 30 MSSQLVTMPCSMGYFRVRMPLLLWASSPT* gi|1519311456|ref... 
+ [5] 75 MGTVWVTPSPESITMPVVRPEAY...GFRGASVSSTGCSSGATRSSL* gi|1519311456|ref... + ... ... ... + [9] 114 MRLAWLYLFFLFCFGFFFFFGLT...QVHTGEVIALLSCKLCNAKFF* gi|1519311456|ref... +[10] 56 MALFVFFVLFWFFFFFWLDSGFK...ERASPKVHNVAEDFDCTLLFF* gi|1519311456|ref... +[11] 37 MTIKKTTMCNQSPRPHCELWGMLAPTDCCHLHRSSF* gi|1519311456|ref... +[12] 35 MQNFFNLRLNTFLFCFILNDEPSCPPFPLFCPPT* gi|1519311456|ref... +[13] 45 MMSLRAPPSPFFVPQLEMYEGFWSPWEWVEAARAYLYTDLRPVE* gi|1519311456|ref... +``` + +In the example above, all amino acid sequences visible start with the typical +methionine amino acid encoded by the "ATG" start codon. +We also see that all but one of the amino acid sequences visible end with the +`*` symbol, which indicates that the translation process ended on a stop codon. +In contrast, the first open reading frame above reached the end of the +nucleotide sequence without encountering a stop codon. + +Conveniently, the number of amino acids in each sequence is stated under the +header `width`. + +::::::::::::::::::::::::::::::::::::::: challenge + +### Challenge + +Extract the length of each amino acid sequence above as an integer vector. +What is the length of the longest amino acid sequence translated from any of +those open reading frames? + +Compare your result with the sequence information on the UniProt page for +ACTB ([https://www.uniprot.org/uniprot/P60709#sequences](https://www.uniprot.org/uniprot/P60709#sequences)). + +::::::::::::::: solution + +### Solution + +``` +width(actb_aa) +# or +max(width(actb_aa)) +``` + +The longest translated sequence contains 376 amino acids. + +The UniProt page reports a sequence of 375 amino acids. +However, the UniProt amino acid sequence does not comprise any symbol to +represent the stop codon. +That difference aside, the UniProt amino acid sequence is identical to the +sequence that was produced by the `translate()` method. 
+ +::::::::::::::::::::::::: + +:::::::::::::::::::::::::::::::::::::::::::::::::: + +## The BSgenome package + +### Overview + +In the Bioconductor project, the *[BSgenome](https://bioconductor.org/packages/3.19/BSgenome)* package +provides software infrastructure for efficient representation of full genomes +and their single-nucleotide polymorphisms. + +The *[BSgenome](https://bioconductor.org/packages/3.19/BSgenome)* package itself does not contain any +genome sequence, but provides functionality to access genome sequences +available in other Bioconductor packages, as we demonstrate in the next section. + +### First steps + +To get started, we load the package. + + +``` r +library(BSgenome) +``` + +With the package loaded and attached to the session, we have access to the +package functions. + +In particular, the function `BSgenome::available.genomes()` can be used to +display the names of Bioconductor packages that contain genome sequences. + + +``` r +available.genomes() +``` + +``` output +'getOption("repos")' replaces Bioconductor standard repositories, see +'help("repositories", package = "BiocManager")' for details. 
+Replacement repositories: + BioCsoft: https://bioconductor.org/packages/3.19/bioc + BioCann: https://bioconductor.org/packages/3.19/data/annotation + BioCexp: https://bioconductor.org/packages/3.19/data/experiment + BioCworkflows: https://bioconductor.org/packages/3.19/workflows + BioCbooks: https://bioconductor.org/packages/3.19/books + CRAN: https://cran.rstudio.com +``` + +``` output + [1] "BSgenome.Alyrata.JGI.v1" + [2] "BSgenome.Amellifera.BeeBase.assembly4" + [3] "BSgenome.Amellifera.NCBI.AmelHAv3.1" + [4] "BSgenome.Amellifera.UCSC.apiMel2" + [5] "BSgenome.Amellifera.UCSC.apiMel2.masked" + [6] "BSgenome.Aofficinalis.NCBI.V1" + [7] "BSgenome.Athaliana.TAIR.04232008" + [8] "BSgenome.Athaliana.TAIR.TAIR9" + [9] "BSgenome.Btaurus.UCSC.bosTau3" + [10] "BSgenome.Btaurus.UCSC.bosTau3.masked" + [11] "BSgenome.Btaurus.UCSC.bosTau4" + [12] "BSgenome.Btaurus.UCSC.bosTau4.masked" + [13] "BSgenome.Btaurus.UCSC.bosTau6" + [14] "BSgenome.Btaurus.UCSC.bosTau6.masked" + [15] "BSgenome.Btaurus.UCSC.bosTau8" + [16] "BSgenome.Btaurus.UCSC.bosTau9" + [17] "BSgenome.Btaurus.UCSC.bosTau9.masked" + [18] "BSgenome.Carietinum.NCBI.v1" + [19] "BSgenome.Celegans.UCSC.ce10" + [20] "BSgenome.Celegans.UCSC.ce11" + [21] "BSgenome.Celegans.UCSC.ce2" + [22] "BSgenome.Celegans.UCSC.ce6" + [23] "BSgenome.Cfamiliaris.UCSC.canFam2" + [24] "BSgenome.Cfamiliaris.UCSC.canFam2.masked" + [25] "BSgenome.Cfamiliaris.UCSC.canFam3" + [26] "BSgenome.Cfamiliaris.UCSC.canFam3.masked" + [27] "BSgenome.Cjacchus.UCSC.calJac3" + [28] "BSgenome.Cjacchus.UCSC.calJac4" + [29] "BSgenome.CneoformansVarGrubiiKN99.NCBI.ASM221672v1" + [30] "BSgenome.Creinhardtii.JGI.v5.6" + [31] "BSgenome.Dmelanogaster.UCSC.dm2" + [32] "BSgenome.Dmelanogaster.UCSC.dm2.masked" + [33] "BSgenome.Dmelanogaster.UCSC.dm3" + [34] "BSgenome.Dmelanogaster.UCSC.dm3.masked" + [35] "BSgenome.Dmelanogaster.UCSC.dm6" + [36] "BSgenome.Drerio.UCSC.danRer10" + [37] "BSgenome.Drerio.UCSC.danRer11" + [38] "BSgenome.Drerio.UCSC.danRer5" + [39] 
"BSgenome.Drerio.UCSC.danRer5.masked" + [40] "BSgenome.Drerio.UCSC.danRer6" + [41] "BSgenome.Drerio.UCSC.danRer6.masked" + [42] "BSgenome.Drerio.UCSC.danRer7" + [43] "BSgenome.Drerio.UCSC.danRer7.masked" + [44] "BSgenome.Dvirilis.Ensembl.dvircaf1" + [45] "BSgenome.Ecoli.NCBI.20080805" + [46] "BSgenome.Gaculeatus.UCSC.gasAcu1" + [47] "BSgenome.Gaculeatus.UCSC.gasAcu1.masked" + [48] "BSgenome.Ggallus.UCSC.galGal3" + [49] "BSgenome.Ggallus.UCSC.galGal3.masked" + [50] "BSgenome.Ggallus.UCSC.galGal4" + [51] "BSgenome.Ggallus.UCSC.galGal4.masked" + [52] "BSgenome.Ggallus.UCSC.galGal5" + [53] "BSgenome.Ggallus.UCSC.galGal6" + [54] "BSgenome.Gmax.NCBI.Gmv40" + [55] "BSgenome.Hsapiens.1000genomes.hs37d5" + [56] "BSgenome.Hsapiens.NCBI.GRCh38" + [57] "BSgenome.Hsapiens.NCBI.T2T.CHM13v2.0" + [58] "BSgenome.Hsapiens.UCSC.hg17" + [59] "BSgenome.Hsapiens.UCSC.hg17.masked" + [60] "BSgenome.Hsapiens.UCSC.hg18" + [61] "BSgenome.Hsapiens.UCSC.hg18.masked" + [62] "BSgenome.Hsapiens.UCSC.hg19" + [63] "BSgenome.Hsapiens.UCSC.hg19.masked" + [64] "BSgenome.Hsapiens.UCSC.hg38" + [65] "BSgenome.Hsapiens.UCSC.hg38.dbSNP151.major" + [66] "BSgenome.Hsapiens.UCSC.hg38.dbSNP151.minor" + [67] "BSgenome.Hsapiens.UCSC.hg38.masked" + [68] "BSgenome.Hsapiens.UCSC.hs1" + [69] "BSgenome.Mdomestica.UCSC.monDom5" + [70] "BSgenome.Mfascicularis.NCBI.5.0" + [71] "BSgenome.Mfascicularis.NCBI.6.0" + [72] "BSgenome.Mfuro.UCSC.musFur1" + [73] "BSgenome.Mmulatta.UCSC.rheMac10" + [74] "BSgenome.Mmulatta.UCSC.rheMac2" + [75] "BSgenome.Mmulatta.UCSC.rheMac2.masked" + [76] "BSgenome.Mmulatta.UCSC.rheMac3" + [77] "BSgenome.Mmulatta.UCSC.rheMac3.masked" + [78] "BSgenome.Mmulatta.UCSC.rheMac8" + [79] "BSgenome.Mmusculus.UCSC.mm10" + [80] "BSgenome.Mmusculus.UCSC.mm10.masked" + [81] "BSgenome.Mmusculus.UCSC.mm39" + [82] "BSgenome.Mmusculus.UCSC.mm8" + [83] "BSgenome.Mmusculus.UCSC.mm8.masked" + [84] "BSgenome.Mmusculus.UCSC.mm9" + [85] "BSgenome.Mmusculus.UCSC.mm9.masked" + [86] "BSgenome.Osativa.MSU.MSU7" + [87] 
"BSgenome.Ppaniscus.UCSC.panPan1" + [88] "BSgenome.Ppaniscus.UCSC.panPan2" + [89] "BSgenome.Ptroglodytes.UCSC.panTro2" + [90] "BSgenome.Ptroglodytes.UCSC.panTro2.masked" + [91] "BSgenome.Ptroglodytes.UCSC.panTro3" + [92] "BSgenome.Ptroglodytes.UCSC.panTro3.masked" + [93] "BSgenome.Ptroglodytes.UCSC.panTro5" + [94] "BSgenome.Ptroglodytes.UCSC.panTro6" + [95] "BSgenome.Rnorvegicus.UCSC.rn4" + [96] "BSgenome.Rnorvegicus.UCSC.rn4.masked" + [97] "BSgenome.Rnorvegicus.UCSC.rn5" + [98] "BSgenome.Rnorvegicus.UCSC.rn5.masked" + [99] "BSgenome.Rnorvegicus.UCSC.rn6" +[100] "BSgenome.Rnorvegicus.UCSC.rn7" +[101] "BSgenome.Scerevisiae.UCSC.sacCer1" +[102] "BSgenome.Scerevisiae.UCSC.sacCer2" +[103] "BSgenome.Scerevisiae.UCSC.sacCer3" +[104] "BSgenome.Sscrofa.UCSC.susScr11" +[105] "BSgenome.Sscrofa.UCSC.susScr3" +[106] "BSgenome.Sscrofa.UCSC.susScr3.masked" +[107] "BSgenome.Tgondii.ToxoDB.7.0" +[108] "BSgenome.Tguttata.UCSC.taeGut1" +[109] "BSgenome.Tguttata.UCSC.taeGut1.masked" +[110] "BSgenome.Tguttata.UCSC.taeGut2" +[111] "BSgenome.Vvinifera.URGI.IGGP12Xv0" +[112] "BSgenome.Vvinifera.URGI.IGGP12Xv2" +[113] "BSgenome.Vvinifera.URGI.IGGP8X" +``` + +### Installing BSgenome packages + +To use one of the available genomes, the corresponding package must be installed +first. +For instance, the example below demonstrates how the data package +*[BSgenome.Hsapiens.UCSC.hg38.masked](https://bioconductor.org/packages/3.19/BSgenome.Hsapiens.UCSC.hg38.masked)* can be installed +using the function `BiocManager::install()` that we have seen before. + + +``` r +BiocManager::install("BSgenome.Hsapiens.UCSC.hg38.masked") +``` + +### Using BSgenome packages + +Once installed, BSgenome packages can be loaded like any other R package, +using the `library()` function. + + +``` r +library(BSgenome.Hsapiens.UCSC.hg38.masked) +``` + +Each BSgenome package contains an object that is named identically to the +package and contains the genome sequence. 
+ +Having loaded the package +*[BSgenome.Hsapiens.UCSC.hg38.masked](https://bioconductor.org/packages/3.19/BSgenome.Hsapiens.UCSC.hg38.masked)* above, we can +display the BSgenome object as follows. + + +``` r +BSgenome.Hsapiens.UCSC.hg38.masked +``` + +``` output +| BSgenome object for Human +| - organism: Homo sapiens +| - provider: UCSC +| - genome: hg38 +| - release date: 2023/01/31 +| - 711 sequence(s): +| chr1 chr2 chr3 +| chr4 chr5 chr6 +| chr7 chr8 chr9 +| chr10 chr11 chr12 +| chr13 chr14 chr15 +| ... ... ... +| chr19_KV575256v1_alt chr19_KV575257v1_alt chr19_KV575258v1_alt +| chr19_KV575259v1_alt chr19_KV575260v1_alt chr19_MU273387v1_alt +| chr22_KN196485v1_alt chr22_KN196486v1_alt chr22_KQ458387v1_alt +| chr22_KQ458388v1_alt chr22_KQ759761v1_alt chrX_KV766199v1_alt +| chrX_MU273395v1_alt chrX_MU273396v1_alt chrX_MU273397v1_alt +| +| Tips: call 'seqnames()' on the object to get all the sequence names, call +| 'seqinfo()' to get the full sequence info, use the '$' or '[[' operator to +| access a given sequence, see '?BSgenome' for more information. +``` + +Given the length and the complexity of the object name, it is common practice +to assign a copy of BSgenome objects to a new object simply called `genome`. + + +``` r +genome <- BSgenome.Hsapiens.UCSC.hg38.masked +``` + +### Using BSgenome objects + +When printing BSgenome objects in the console (see above), some helpful tips +are displayed under the object itself, hinting at functions commonly used to +access information in the object. + +For instance, the function `seqnames()` can be used to get the list of sequence +names (i.e., chromosomes and contigs) present in the object. 
+ + +``` r +seqnames(genome) +``` + +``` output + [1] "chr1" "chr2" + [3] "chr3" "chr4" + [5] "chr5" "chr6" + [7] "chr7" "chr8" + [9] "chr9" "chr10" + [11] "chr11" "chr12" + [13] "chr13" "chr14" + [15] "chr15" "chr16" + [17] "chr17" "chr18" + [19] "chr19" "chr20" + [21] "chr21" "chr22" + [23] "chrX" "chrY" + [25] "chrM" "chr1_GL383518v1_alt" + [27] "chr1_GL383519v1_alt" "chr1_GL383520v2_alt" + [29] "chr1_KI270759v1_alt" "chr1_KI270760v1_alt" + [31] "chr1_KI270761v1_alt" "chr1_KI270762v1_alt" + [33] "chr1_KI270763v1_alt" "chr1_KI270764v1_alt" + [35] "chr1_KI270765v1_alt" "chr1_KI270766v1_alt" + [37] "chr1_KI270892v1_alt" "chr2_GL383521v1_alt" + [39] "chr2_GL383522v1_alt" "chr2_GL582966v2_alt" + [41] "chr2_KI270767v1_alt" "chr2_KI270768v1_alt" + [43] "chr2_KI270769v1_alt" "chr2_KI270770v1_alt" + [45] "chr2_KI270771v1_alt" "chr2_KI270772v1_alt" + [47] "chr2_KI270773v1_alt" "chr2_KI270774v1_alt" + [49] "chr2_KI270775v1_alt" "chr2_KI270776v1_alt" + [51] "chr2_KI270893v1_alt" "chr2_KI270894v1_alt" + [53] "chr3_GL383526v1_alt" "chr3_JH636055v2_alt" + [55] "chr3_KI270777v1_alt" "chr3_KI270778v1_alt" + [57] "chr3_KI270779v1_alt" "chr3_KI270780v1_alt" + [59] "chr3_KI270781v1_alt" "chr3_KI270782v1_alt" + [61] "chr3_KI270783v1_alt" "chr3_KI270784v1_alt" + [63] "chr3_KI270895v1_alt" "chr3_KI270924v1_alt" + [65] "chr3_KI270934v1_alt" "chr3_KI270935v1_alt" + [67] "chr3_KI270936v1_alt" "chr3_KI270937v1_alt" + [69] "chr4_GL000257v2_alt" "chr4_GL383527v1_alt" + [71] "chr4_GL383528v1_alt" "chr4_KI270785v1_alt" + [73] "chr4_KI270786v1_alt" "chr4_KI270787v1_alt" + [75] "chr4_KI270788v1_alt" "chr4_KI270789v1_alt" + [77] "chr4_KI270790v1_alt" "chr4_KI270896v1_alt" + [79] "chr4_KI270925v1_alt" "chr5_GL339449v2_alt" + [81] "chr5_GL383530v1_alt" "chr5_GL383531v1_alt" + [83] "chr5_GL383532v1_alt" "chr5_GL949742v1_alt" + [85] "chr5_KI270791v1_alt" "chr5_KI270792v1_alt" + [87] "chr5_KI270793v1_alt" "chr5_KI270794v1_alt" + [89] "chr5_KI270795v1_alt" "chr5_KI270796v1_alt" + [91] 
"chr5_KI270897v1_alt" "chr5_KI270898v1_alt" + [93] "chr6_GL000250v2_alt" "chr6_GL000251v2_alt" + [95] "chr6_GL000252v2_alt" "chr6_GL000253v2_alt" + [97] "chr6_GL000254v2_alt" "chr6_GL000255v2_alt" + [99] "chr6_GL000256v2_alt" "chr6_GL383533v1_alt" +[101] "chr6_KB021644v2_alt" "chr6_KI270758v1_alt" +[103] "chr6_KI270797v1_alt" "chr6_KI270798v1_alt" +[105] "chr6_KI270799v1_alt" "chr6_KI270800v1_alt" +[107] "chr6_KI270801v1_alt" "chr6_KI270802v1_alt" +[109] "chr7_GL383534v2_alt" "chr7_KI270803v1_alt" +[111] "chr7_KI270804v1_alt" "chr7_KI270805v1_alt" +[113] "chr7_KI270806v1_alt" "chr7_KI270807v1_alt" +[115] "chr7_KI270808v1_alt" "chr7_KI270809v1_alt" +[117] "chr7_KI270899v1_alt" "chr8_KI270810v1_alt" +[119] "chr8_KI270811v1_alt" "chr8_KI270812v1_alt" +[121] "chr8_KI270813v1_alt" "chr8_KI270814v1_alt" +[123] "chr8_KI270815v1_alt" "chr8_KI270816v1_alt" +[125] "chr8_KI270817v1_alt" "chr8_KI270818v1_alt" +[127] "chr8_KI270819v1_alt" "chr8_KI270820v1_alt" +[129] "chr8_KI270821v1_alt" "chr8_KI270822v1_alt" +[131] "chr8_KI270900v1_alt" "chr8_KI270901v1_alt" +[133] "chr8_KI270926v1_alt" "chr9_GL383539v1_alt" +[135] "chr9_GL383540v1_alt" "chr9_GL383541v1_alt" +[137] "chr9_GL383542v1_alt" "chr9_KI270823v1_alt" +[139] "chr10_GL383545v1_alt" "chr10_GL383546v1_alt" +[141] "chr10_KI270824v1_alt" "chr10_KI270825v1_alt" +[143] "chr11_GL383547v1_alt" "chr11_JH159136v1_alt" +[145] "chr11_JH159137v1_alt" "chr11_KI270826v1_alt" +[147] "chr11_KI270827v1_alt" "chr11_KI270829v1_alt" +[149] "chr11_KI270830v1_alt" "chr11_KI270831v1_alt" +[151] "chr11_KI270832v1_alt" "chr11_KI270902v1_alt" +[153] "chr11_KI270903v1_alt" "chr11_KI270927v1_alt" +[155] "chr12_GL383549v1_alt" "chr12_GL383550v2_alt" +[157] "chr12_GL383551v1_alt" "chr12_GL383552v1_alt" +[159] "chr12_GL383553v2_alt" "chr12_GL877875v1_alt" +[161] "chr12_GL877876v1_alt" "chr12_KI270833v1_alt" +[163] "chr12_KI270834v1_alt" "chr12_KI270835v1_alt" +[165] "chr12_KI270836v1_alt" "chr12_KI270837v1_alt" +[167] "chr12_KI270904v1_alt" 
"chr13_KI270838v1_alt" +[169] "chr13_KI270839v1_alt" "chr13_KI270840v1_alt" +[171] "chr13_KI270841v1_alt" "chr13_KI270842v1_alt" +[173] "chr13_KI270843v1_alt" "chr14_KI270844v1_alt" +[175] "chr14_KI270845v1_alt" "chr14_KI270846v1_alt" +[177] "chr14_KI270847v1_alt" "chr15_GL383554v1_alt" +[179] "chr15_GL383555v2_alt" "chr15_KI270848v1_alt" +[181] "chr15_KI270849v1_alt" "chr15_KI270850v1_alt" +[183] "chr15_KI270851v1_alt" "chr15_KI270852v1_alt" +[185] "chr15_KI270905v1_alt" "chr15_KI270906v1_alt" +[187] "chr16_GL383556v1_alt" "chr16_GL383557v1_alt" +[189] "chr16_KI270853v1_alt" "chr16_KI270854v1_alt" +[191] "chr16_KI270855v1_alt" "chr16_KI270856v1_alt" +[193] "chr17_GL000258v2_alt" "chr17_GL383563v3_alt" +[195] "chr17_GL383564v2_alt" "chr17_GL383565v1_alt" +[197] "chr17_GL383566v1_alt" "chr17_JH159146v1_alt" +[199] "chr17_JH159147v1_alt" "chr17_JH159148v1_alt" +[201] "chr17_KI270857v1_alt" "chr17_KI270858v1_alt" +[203] "chr17_KI270859v1_alt" "chr17_KI270860v1_alt" +[205] "chr17_KI270861v1_alt" "chr17_KI270862v1_alt" +[207] "chr17_KI270907v1_alt" "chr17_KI270908v1_alt" +[209] "chr17_KI270909v1_alt" "chr17_KI270910v1_alt" +[211] "chr18_GL383567v1_alt" "chr18_GL383568v1_alt" +[213] "chr18_GL383569v1_alt" "chr18_GL383570v1_alt" +[215] "chr18_GL383571v1_alt" "chr18_GL383572v1_alt" +[217] "chr18_KI270863v1_alt" "chr18_KI270864v1_alt" +[219] "chr18_KI270911v1_alt" "chr18_KI270912v1_alt" +[221] "chr19_GL000209v2_alt" "chr19_GL383573v1_alt" +[223] "chr19_GL383574v1_alt" "chr19_GL383575v2_alt" +[225] "chr19_GL383576v1_alt" "chr19_GL949746v1_alt" +[227] "chr19_GL949747v2_alt" "chr19_GL949748v2_alt" +[229] "chr19_GL949749v2_alt" "chr19_GL949750v2_alt" +[231] "chr19_GL949751v2_alt" "chr19_GL949752v1_alt" +[233] "chr19_GL949753v2_alt" "chr19_KI270865v1_alt" +[235] "chr19_KI270866v1_alt" "chr19_KI270867v1_alt" +[237] "chr19_KI270868v1_alt" "chr19_KI270882v1_alt" +[239] "chr19_KI270883v1_alt" "chr19_KI270884v1_alt" +[241] "chr19_KI270885v1_alt" "chr19_KI270886v1_alt" +[243] 
"chr19_KI270887v1_alt" "chr19_KI270888v1_alt" +[245] "chr19_KI270889v1_alt" "chr19_KI270890v1_alt" +[247] "chr19_KI270891v1_alt" "chr19_KI270914v1_alt" +[249] "chr19_KI270915v1_alt" "chr19_KI270916v1_alt" +[251] "chr19_KI270917v1_alt" "chr19_KI270918v1_alt" +[253] "chr19_KI270919v1_alt" "chr19_KI270920v1_alt" +[255] "chr19_KI270921v1_alt" "chr19_KI270922v1_alt" +[257] "chr19_KI270923v1_alt" "chr19_KI270929v1_alt" +[259] "chr19_KI270930v1_alt" "chr19_KI270931v1_alt" +[261] "chr19_KI270932v1_alt" "chr19_KI270933v1_alt" +[263] "chr19_KI270938v1_alt" "chr20_GL383577v2_alt" +[265] "chr20_KI270869v1_alt" "chr20_KI270870v1_alt" +[267] "chr20_KI270871v1_alt" "chr21_GL383578v2_alt" +[269] "chr21_GL383579v2_alt" "chr21_GL383580v2_alt" +[271] "chr21_GL383581v2_alt" "chr21_KI270872v1_alt" +[273] "chr21_KI270873v1_alt" "chr21_KI270874v1_alt" +[275] "chr22_GL383582v2_alt" "chr22_GL383583v2_alt" +[277] "chr22_KB663609v1_alt" "chr22_KI270875v1_alt" +[279] "chr22_KI270876v1_alt" "chr22_KI270877v1_alt" +[281] "chr22_KI270878v1_alt" "chr22_KI270879v1_alt" +[283] "chr22_KI270928v1_alt" "chrX_KI270880v1_alt" +[285] "chrX_KI270881v1_alt" "chrX_KI270913v1_alt" +[287] "chr1_KI270706v1_random" "chr1_KI270707v1_random" +[289] "chr1_KI270708v1_random" "chr1_KI270709v1_random" +[291] "chr1_KI270710v1_random" "chr1_KI270711v1_random" +[293] "chr1_KI270712v1_random" "chr1_KI270713v1_random" +[295] "chr1_KI270714v1_random" "chr2_KI270715v1_random" +[297] "chr2_KI270716v1_random" "chr3_GL000221v1_random" +[299] "chr4_GL000008v2_random" "chr5_GL000208v1_random" +[301] "chr9_KI270717v1_random" "chr9_KI270718v1_random" +[303] "chr9_KI270719v1_random" "chr9_KI270720v1_random" +[305] "chr11_KI270721v1_random" "chr14_GL000009v2_random" +[307] "chr14_GL000194v1_random" "chr14_GL000225v1_random" +[309] "chr14_KI270722v1_random" "chr14_KI270723v1_random" +[311] "chr14_KI270724v1_random" "chr14_KI270725v1_random" +[313] "chr14_KI270726v1_random" "chr15_KI270727v1_random" +[315] "chr16_KI270728v1_random" 
"chr17_GL000205v2_random" +[317] "chr17_KI270729v1_random" "chr17_KI270730v1_random" +[319] "chr22_KI270731v1_random" "chr22_KI270732v1_random" +[321] "chr22_KI270733v1_random" "chr22_KI270734v1_random" +[323] "chr22_KI270735v1_random" "chr22_KI270736v1_random" +[325] "chr22_KI270737v1_random" "chr22_KI270738v1_random" +[327] "chr22_KI270739v1_random" "chrY_KI270740v1_random" +[329] "chrUn_GL000195v1" "chrUn_GL000213v1" +[331] "chrUn_GL000214v1" "chrUn_GL000216v2" +[333] "chrUn_GL000218v1" "chrUn_GL000219v1" +[335] "chrUn_GL000220v1" "chrUn_GL000224v1" +[337] "chrUn_GL000226v1" "chrUn_KI270302v1" +[339] "chrUn_KI270303v1" "chrUn_KI270304v1" +[341] "chrUn_KI270305v1" "chrUn_KI270310v1" +[343] "chrUn_KI270311v1" "chrUn_KI270312v1" +[345] "chrUn_KI270315v1" "chrUn_KI270316v1" +[347] "chrUn_KI270317v1" "chrUn_KI270320v1" +[349] "chrUn_KI270322v1" "chrUn_KI270329v1" +[351] "chrUn_KI270330v1" "chrUn_KI270333v1" +[353] "chrUn_KI270334v1" "chrUn_KI270335v1" +[355] "chrUn_KI270336v1" "chrUn_KI270337v1" +[357] "chrUn_KI270338v1" "chrUn_KI270340v1" +[359] "chrUn_KI270362v1" "chrUn_KI270363v1" +[361] "chrUn_KI270364v1" "chrUn_KI270366v1" +[363] "chrUn_KI270371v1" "chrUn_KI270372v1" +[365] "chrUn_KI270373v1" "chrUn_KI270374v1" +[367] "chrUn_KI270375v1" "chrUn_KI270376v1" +[369] "chrUn_KI270378v1" "chrUn_KI270379v1" +[371] "chrUn_KI270381v1" "chrUn_KI270382v1" +[373] "chrUn_KI270383v1" "chrUn_KI270384v1" +[375] "chrUn_KI270385v1" "chrUn_KI270386v1" +[377] "chrUn_KI270387v1" "chrUn_KI270388v1" +[379] "chrUn_KI270389v1" "chrUn_KI270390v1" +[381] "chrUn_KI270391v1" "chrUn_KI270392v1" +[383] "chrUn_KI270393v1" "chrUn_KI270394v1" +[385] "chrUn_KI270395v1" "chrUn_KI270396v1" +[387] "chrUn_KI270411v1" "chrUn_KI270412v1" +[389] "chrUn_KI270414v1" "chrUn_KI270417v1" +[391] "chrUn_KI270418v1" "chrUn_KI270419v1" +[393] "chrUn_KI270420v1" "chrUn_KI270422v1" +[395] "chrUn_KI270423v1" "chrUn_KI270424v1" +[397] "chrUn_KI270425v1" "chrUn_KI270429v1" +[399] "chrUn_KI270435v1" "chrUn_KI270438v1" 
+[401] "chrUn_KI270442v1" "chrUn_KI270448v1" +[403] "chrUn_KI270465v1" "chrUn_KI270466v1" +[405] "chrUn_KI270467v1" "chrUn_KI270468v1" +[407] "chrUn_KI270507v1" "chrUn_KI270508v1" +[409] "chrUn_KI270509v1" "chrUn_KI270510v1" +[411] "chrUn_KI270511v1" "chrUn_KI270512v1" +[413] "chrUn_KI270515v1" "chrUn_KI270516v1" +[415] "chrUn_KI270517v1" "chrUn_KI270518v1" +[417] "chrUn_KI270519v1" "chrUn_KI270521v1" +[419] "chrUn_KI270522v1" "chrUn_KI270528v1" +[421] "chrUn_KI270529v1" "chrUn_KI270530v1" +[423] "chrUn_KI270538v1" "chrUn_KI270539v1" +[425] "chrUn_KI270544v1" "chrUn_KI270548v1" +[427] "chrUn_KI270579v1" "chrUn_KI270580v1" +[429] "chrUn_KI270581v1" "chrUn_KI270582v1" +[431] "chrUn_KI270583v1" "chrUn_KI270584v1" +[433] "chrUn_KI270587v1" "chrUn_KI270588v1" +[435] "chrUn_KI270589v1" "chrUn_KI270590v1" +[437] "chrUn_KI270591v1" "chrUn_KI270593v1" +[439] "chrUn_KI270741v1" "chrUn_KI270742v1" +[441] "chrUn_KI270743v1" "chrUn_KI270744v1" +[443] "chrUn_KI270745v1" "chrUn_KI270746v1" +[445] "chrUn_KI270747v1" "chrUn_KI270748v1" +[447] "chrUn_KI270749v1" "chrUn_KI270750v1" +[449] "chrUn_KI270751v1" "chrUn_KI270752v1" +[451] "chrUn_KI270753v1" "chrUn_KI270754v1" +[453] "chrUn_KI270755v1" "chrUn_KI270756v1" +[455] "chrUn_KI270757v1" "chr1_KN196472v1_fix" +[457] "chr1_KN196473v1_fix" "chr1_KN196474v1_fix" +[459] "chr1_KN538360v1_fix" "chr1_KN538361v1_fix" +[461] "chr1_KQ031383v1_fix" "chr1_KZ208906v1_fix" +[463] "chr1_KZ559100v1_fix" "chr1_MU273333v1_fix" +[465] "chr1_MU273334v1_fix" "chr1_MU273335v1_fix" +[467] "chr1_MU273336v1_fix" "chr2_KN538362v1_fix" +[469] "chr2_KN538363v1_fix" "chr2_KQ031384v1_fix" +[471] "chr2_ML143341v1_fix" "chr2_ML143342v1_fix" +[473] "chr2_MU273341v1_fix" "chr2_MU273342v1_fix" +[475] "chr2_MU273343v1_fix" "chr2_MU273344v1_fix" +[477] "chr2_MU273345v1_fix" "chr3_KN196475v1_fix" +[479] "chr3_KN196476v1_fix" "chr3_KN538364v1_fix" +[481] "chr3_KQ031385v1_fix" "chr3_KQ031386v1_fix" +[483] "chr3_KV766192v1_fix" "chr3_KZ559104v1_fix" +[485] 
"chr3_MU273346v1_fix" "chr3_MU273347v1_fix" +[487] "chr3_MU273348v1_fix" "chr4_KQ983257v1_fix" +[489] "chr4_ML143344v1_fix" "chr4_ML143345v1_fix" +[491] "chr4_ML143346v1_fix" "chr4_ML143347v1_fix" +[493] "chr4_ML143348v1_fix" "chr4_ML143349v1_fix" +[495] "chr4_MU273350v1_fix" "chr4_MU273351v1_fix" +[497] "chr5_KV575244v1_fix" "chr5_ML143350v1_fix" +[499] "chr5_MU273352v1_fix" "chr5_MU273353v1_fix" +[501] "chr5_MU273354v1_fix" "chr5_MU273355v1_fix" +[503] "chr6_KN196478v1_fix" "chr6_KQ031387v1_fix" +[505] "chr6_KQ090016v1_fix" "chr6_KV766194v1_fix" +[507] "chr6_KZ208911v1_fix" "chr6_ML143351v1_fix" +[509] "chr7_KQ031388v1_fix" "chr7_KV880764v1_fix" +[511] "chr7_KV880765v1_fix" "chr7_KZ208912v1_fix" +[513] "chr7_ML143352v1_fix" "chr8_KV880766v1_fix" +[515] "chr8_KV880767v1_fix" "chr8_KZ208914v1_fix" +[517] "chr8_KZ208915v1_fix" "chr8_MU273359v1_fix" +[519] "chr8_MU273360v1_fix" "chr8_MU273361v1_fix" +[521] "chr8_MU273362v1_fix" "chr8_MU273363v1_fix" +[523] "chr9_KN196479v1_fix" "chr9_ML143353v1_fix" +[525] "chr9_MU273364v1_fix" "chr9_MU273365v1_fix" +[527] "chr9_MU273366v1_fix" "chr10_KN196480v1_fix" +[529] "chr10_KN538365v1_fix" "chr10_KN538366v1_fix" +[531] "chr10_KN538367v1_fix" "chr10_KQ090021v1_fix" +[533] "chr10_ML143354v1_fix" "chr10_ML143355v1_fix" +[535] "chr10_MU273367v1_fix" "chr11_KN196481v1_fix" +[537] "chr11_KQ090022v1_fix" "chr11_KQ759759v1_fix" +[539] "chr11_KQ759759v2_fix" "chr11_KV766195v1_fix" +[541] "chr11_KZ559108v1_fix" "chr11_KZ559109v1_fix" +[543] "chr11_ML143356v1_fix" "chr11_ML143357v1_fix" +[545] "chr11_ML143358v1_fix" "chr11_ML143359v1_fix" +[547] "chr11_ML143360v1_fix" "chr11_MU273369v1_fix" +[549] "chr11_MU273370v1_fix" "chr11_MU273371v1_fix" +[551] "chr12_KN196482v1_fix" "chr12_KN538369v1_fix" +[553] "chr12_KN538370v1_fix" "chr12_KQ759760v1_fix" +[555] "chr12_KZ208916v1_fix" "chr12_KZ208917v1_fix" +[557] "chr12_ML143361v1_fix" "chr12_ML143362v1_fix" +[559] "chr12_MU273372v1_fix" "chr13_KN196483v1_fix" +[561] "chr13_KN538371v1_fix" 
"chr13_KN538372v1_fix" +[563] "chr13_KN538373v1_fix" "chr13_ML143363v1_fix" +[565] "chr13_ML143364v1_fix" "chr13_ML143365v1_fix" +[567] "chr13_ML143366v1_fix" "chr14_KZ208920v1_fix" +[569] "chr14_ML143367v1_fix" "chr14_MU273373v1_fix" +[571] "chr15_KN538374v1_fix" "chr15_ML143369v1_fix" +[573] "chr15_ML143370v1_fix" "chr15_ML143371v1_fix" +[575] "chr15_ML143372v1_fix" "chr15_MU273374v1_fix" +[577] "chr16_KV880768v1_fix" "chr16_KZ559113v1_fix" +[579] "chr16_ML143373v1_fix" "chr16_MU273376v1_fix" +[581] "chr16_MU273377v1_fix" "chr17_KV575245v1_fix" +[583] "chr17_KV766196v1_fix" "chr17_ML143374v1_fix" +[585] "chr17_ML143375v1_fix" "chr17_MU273379v1_fix" +[587] "chr17_MU273380v1_fix" "chr17_MU273381v1_fix" +[589] "chr17_MU273382v1_fix" "chr17_MU273383v1_fix" +[591] "chr18_KQ090028v1_fix" "chr18_KZ208922v1_fix" +[593] "chr18_KZ559115v1_fix" "chr19_KN196484v1_fix" +[595] "chr19_KQ458386v1_fix" "chr19_ML143376v1_fix" +[597] "chr19_MU273384v1_fix" "chr19_MU273385v1_fix" +[599] "chr19_MU273386v1_fix" "chr20_MU273388v1_fix" +[601] "chr20_MU273389v1_fix" "chr21_ML143377v1_fix" +[603] "chr21_MU273390v1_fix" "chr21_MU273391v1_fix" +[605] "chr21_MU273392v1_fix" "chr22_KQ759762v1_fix" +[607] "chr22_KQ759762v2_fix" "chr22_ML143378v1_fix" +[609] "chr22_ML143379v1_fix" "chr22_ML143380v1_fix" +[611] "chrX_ML143381v1_fix" "chrX_ML143382v1_fix" +[613] "chrX_ML143383v1_fix" "chrX_ML143384v1_fix" +[615] "chrX_ML143385v1_fix" "chrX_MU273393v1_fix" +[617] "chrX_MU273394v1_fix" "chrY_KN196487v1_fix" +[619] "chrY_KZ208923v1_fix" "chrY_KZ208924v1_fix" +[621] "chrY_MU273398v1_fix" "chr1_KQ458382v1_alt" +[623] "chr1_KQ458383v1_alt" "chr1_KQ458384v1_alt" +[625] "chr1_KQ983255v1_alt" "chr1_KV880763v1_alt" +[627] "chr1_KZ208904v1_alt" "chr1_KZ208905v1_alt" +[629] "chr1_MU273330v1_alt" "chr1_MU273331v1_alt" +[631] "chr1_MU273332v1_alt" "chr2_KQ983256v1_alt" +[633] "chr2_KZ208907v1_alt" "chr2_KZ208908v1_alt" +[635] "chr2_MU273337v1_alt" "chr2_MU273338v1_alt" +[637] "chr2_MU273339v1_alt" 
"chr2_MU273340v1_alt" +[639] "chr3_KZ208909v1_alt" "chr3_KZ559101v1_alt" +[641] "chr3_KZ559102v1_alt" "chr3_KZ559103v1_alt" +[643] "chr3_KZ559105v1_alt" "chr3_ML143343v1_alt" +[645] "chr4_KQ090013v1_alt" "chr4_KQ090014v1_alt" +[647] "chr4_KQ090015v1_alt" "chr4_KQ983258v1_alt" +[649] "chr4_KV766193v1_alt" "chr4_MU273349v1_alt" +[651] "chr5_KN196477v1_alt" "chr5_KV575243v1_alt" +[653] "chr5_KZ208910v1_alt" "chr5_MU273356v1_alt" +[655] "chr6_KQ090017v1_alt" "chr6_MU273357v1_alt" +[657] "chr7_KZ208913v1_alt" "chr7_KZ559106v1_alt" +[659] "chr7_MU273358v1_alt" "chr8_KZ559107v1_alt" +[661] "chr9_KQ090018v1_alt" "chr9_KQ090019v1_alt" +[663] "chr10_KQ090020v1_alt" "chr11_KN538368v1_alt" +[665] "chr11_KZ559110v1_alt" "chr11_KZ559111v1_alt" +[667] "chr11_MU273368v1_alt" "chr12_KQ090023v1_alt" +[669] "chr12_KZ208918v1_alt" "chr12_KZ559112v1_alt" +[671] "chr13_KQ090024v1_alt" "chr13_KQ090025v1_alt" +[673] "chr14_KZ208919v1_alt" "chr14_ML143368v1_alt" +[675] "chr15_KQ031389v1_alt" "chr15_MU273375v1_alt" +[677] "chr16_KQ031390v1_alt" "chr16_KQ090026v1_alt" +[679] "chr16_KQ090027v1_alt" "chr16_KZ208921v1_alt" +[681] "chr17_KV766197v1_alt" "chr17_KV766198v1_alt" +[683] "chr17_KZ559114v1_alt" "chr17_MU273378v1_alt" +[685] "chr18_KQ458385v1_alt" "chr18_KZ559116v1_alt" +[687] "chr19_KV575246v1_alt" "chr19_KV575247v1_alt" +[689] "chr19_KV575248v1_alt" "chr19_KV575249v1_alt" +[691] "chr19_KV575250v1_alt" "chr19_KV575251v1_alt" +[693] "chr19_KV575252v1_alt" "chr19_KV575253v1_alt" +[695] "chr19_KV575254v1_alt" "chr19_KV575255v1_alt" +[697] "chr19_KV575256v1_alt" "chr19_KV575257v1_alt" +[699] "chr19_KV575258v1_alt" "chr19_KV575259v1_alt" +[701] "chr19_KV575260v1_alt" "chr19_MU273387v1_alt" +[703] "chr22_KN196485v1_alt" "chr22_KN196486v1_alt" +[705] "chr22_KQ458387v1_alt" "chr22_KQ458388v1_alt" +[707] "chr22_KQ759761v1_alt" "chrX_KV766199v1_alt" +[709] "chrX_MU273395v1_alt" "chrX_MU273396v1_alt" +[711] "chrX_MU273397v1_alt" +``` + +Similarly, the function `seqinfo()` can be used to get the 
full sequence +information stored in the object. + + +``` r +seqinfo(genome) +``` + +``` output +Seqinfo object with 711 sequences (1 circular) from hg38 genome: + seqnames seqlengths isCircular genome + chr1 248956422 FALSE hg38 + chr2 242193529 FALSE hg38 + chr3 198295559 FALSE hg38 + chr4 190214555 FALSE hg38 + chr5 181538259 FALSE hg38 + ... ... ... ... + chr22_KQ759761v1_alt 145162 FALSE hg38 + chrX_KV766199v1_alt 188004 FALSE hg38 + chrX_MU273395v1_alt 619716 FALSE hg38 + chrX_MU273396v1_alt 294119 FALSE hg38 + chrX_MU273397v1_alt 330493 FALSE hg38 +``` + +Finally, the nature of BSgenome objects being akin to a list of sequences, +the operators `$` and `[[]]` can both be used to extract individual sequences +from the BSgenome object. + + +``` r +genome$chr1 +``` + +``` output +248956422-letter MaskedDNAString object (# for masking) +seq: ####################################...#################################### +masks: + maskedwidth maskedratio active names desc +1 18470101 7.419010e-02 TRUE AGAPS assembly gaps +2 5309 2.132502e-05 TRUE AMB intra-contig ambiguities +3 119060341 4.782377e-01 FALSE RM RepeatMasker +4 1647959 6.619468e-03 FALSE TRF Tandem Repeats Finder [period<=12] +all masks together: + maskedwidth maskedratio + 137685771 0.5530517 +all active masks together: + maskedwidth maskedratio + 18475410 0.07421142 +``` + +For instance, we can extract the sequence of the Y chromosome and assign it +to a new object `chrY`. + + +``` r +chrY <- genome[["chrY"]] +``` + +### Using genome sequences + +From this point, genome sequences can be treated very much like biological +strings (e.g. `DNAString`) described earlier, in the +*[Biostrings](https://bioconductor.org/packages/3.19/Biostrings)* package. + +For instance, the function `countPattern()` can be used to count the number of +occurrences of a given pattern in a given genome sequence. 
+ + +``` r +countPattern(pattern = "CANNTG", subject = chrY, fixed = FALSE) +``` + +``` output +[1] 141609 +``` + +::::::::::::::::::::::::::::::::::::::::: callout + +### Note + +In the example above, the argument `fixed = FALSE` is used to indicate that the +pattern contains [IUPAC ambiguity codes][external-iupac]. + +:::::::::::::::::::::::::::::::::::::::::::::::::: + +[glossary-s4-class]: reference.html#s4-class +[crossref-s4]: 05-s4.html +[external-iupac]: https://en.wikipedia.org/wiki/Nucleic_acid_notation#IUPAC_notation +[iupac-alphabet]: https://www.bioinformatics.org/sms/iupac.html +[external-truseq]: https://emea.illumina.com/products/by-type/sequencing-kits/library-prep-kits.html +[orf-finder]: https://www.ncbi.nlm.nih.gov/orffinder/ + + +:::::::::::::::::::::::::::::::::::::::: keypoints + +- The `Biostrings` package defines classes to represent sequences of nucleotides and amino acids. +- The `Biostrings` package also defines methods to efficiently process biological sequences. +- The `BSgenome` package provides genome sequences for a range of model organisms immediately available as Bioconductor objects. + +:::::::::::::::::::::::::::::::::::::::::::::::::: + + diff --git a/07-genomic-ranges.md b/07-genomic-ranges.md new file mode 100644 index 00000000..564a67d4 --- /dev/null +++ b/07-genomic-ranges.md @@ -0,0 +1,1065 @@ +--- +source: Rmd +title: Working with genomic ranges +teaching: XX +exercises: XX +--- + + + +::::::::::::::::::::::::::::::::::::::: objectives + +- Explain how genomic coordinates and intervals are represented in the Bioconductor project. +- Identify Bioconductor packages and methods available to process ranges of genomic coordinates. + +:::::::::::::::::::::::::::::::::::::::::::::::::: + +:::::::::::::::::::::::::::::::::::::::: questions + +- What is the recommended way to represent coordinates on a genomic scale in Bioconductor? +- What Bioconductor packages provide methods to efficiently process genomic ranges? 
+- How can I import/export sets of genomic coordinates from/to various genomic file formats? + +:::::::::::::::::::::::::::::::::::::::::::::::::: + + + + + +## Install packages + +Before we can proceed into the following sections, we install some Bioconductor packages that we will need. +First, we check that the *[BiocManager](https://bioconductor.org/packages/3.19/BiocManager)* package is installed before trying to use it; otherwise we install it. +Then we use the `BiocManager::install()` function to install the necessary packages. + + +``` r +if (!requireNamespace("BiocManager", quietly = TRUE)) + install.packages("BiocManager") + +BiocManager::install("GenomicRanges") +``` + +## The GenomicRanges package and classes + +### Why do we need classes for genomic ranges? + +In the era of genomics, many observations are reported as ranges of +coordinates - i.e., intervals - on a genomic scale. +Depending on the nature of the assay, those genomic ranges may represent genes, +transcripts, exons, single nucleotide polymorphisms (SNPs), transcription factor +binding sites, or peaks from next-generation sequencing assays such as ChIP-seq +or ATAC-seq. + +Genomic ranges tie those observations of assayed values (e.g., gene +expression) to a physical location in the genome of an organism. +For instance, those genomic ranges can be used to query physical proximity or +overlap between assayed features and databases of known regulatory regions. + +Often, the final genomic ranges used for reporting measurements are the result +of combinations and operations on sets of genomic ranges in databases of +known genomic features. +For instance, in RNA-sequencing, next-generation sequencing reads are often +counted within individual exons, and those counts are subsequently aggregated +across all the exons of each gene. +Separately, promoters are frequently defined as regions of arbitrary width, +partly upstream and/or downstream of known transcription start sites (TSS). 
+ +Importantly, genomic ranges do not necessarily need to span multiple +coordinates. +The notion of range is meant in the mathematical way, and single-nucleotide +genomic ranges (e.g., SNPs) can be described as opening and closing at the same +coordinate (or at the next coordinate, in the case of a right-open interval). + +For many organisms, the genetic material is split into a number of separate +nucleic acid molecules (e.g., chromosomes, plasmids). +As such, genomic ranges are described by the name of the sequence and +the numeric interval of coordinates on that sequence. + +![](fig/bioc-genomicranges.svg){alt='Example uses of the GenomicRanges algebra.'} + +**Example uses of the GenomicRanges algebra.** +Adapted from Huber, Carey, Gentleman, Anders, Carlson, Carvalho, Bravo, Davis, Gatto, Girke, Gottardo, Hahne, Hansen, Irizarry, Lawrence, Love, MacDonald, Obenchain, Oles, Pages, Reyes, Shannon, Smyth, Tenenbaum, Waldron, and Morgan (2015). +The figure illustrates the example of a gene model that comprises two +transcripts, and the definition of various genomic ranges relative to that +gene model. +For instance - in this specific illustration - unspliced transcripts summarise +the entire range of coordinates from the start of the first exon to the end of +the last exon; while the gene region is defined as the set of coordinates +included in at least one exon of one transcript. + +### A brief introduction to intervals + +Intervals are described in mathematical terms using a start and an end position +on an axis of continuous coordinates. +The interval then comprises all the real numbers between those two coordinates, +and the width of each interval can be computed from the difference between the +coordinates of the start and end positions. + +Generally speaking, the start and end position can be any rational number, +including floating-point numbers. 
+However, in genomics, integer coordinates are typically used to represent the +location of monomers (e.g., nucleotide, amino acid) in the sequence of polymers +(e.g., nucleic acid, protein). + +You may come across packages, databases, and programming languages that use +different rules to define intervals. +In R, indexing is 1-based (meaning that the first position in a sequence is +1\), which contrasts with Python, which is 0-based (the index of the first +position in a sequence is 0). +Similarly, reference files in the UCSC Genome Browser are 0-based, while +those of the Ensembl Genome Browser are 1-based. + +The definition of intervals in a shared coordinate system allows calculations +such as the distance between two intervals - generally calculated as the +distance between the two closest edges of those intervals -, and the +identification of overlapping intervals. + +![](fig/intervals.svg){alt='Example of intervals.'} + +**Example of intervals.** +Three intervals named A, B, and C, are represented. +Interval A starts at position 5 and ends at position 9, for a width of 4 units; +interval B starts at position 1 and ends at position 3, for a width of 2 units; +interval C starts at position 3 and ends at position 6, for a width of 3 units. +Intervals A and C overlap, from coordinates 5 to 6; +while intervals B and C meet at coordinate 3, but do not strictly overlap each +other. + +### A brief introduction to genomic ranges + +Genomic ranges essentially extend the notion of mathematical intervals on +sets of biological sequences (e.g., chromosomes). +In other words, genomic ranges combine the name of the biological sequence on +which they are located with the integer range of coordinates that the +genomic ranges span in that sequence. +This is key to distinguish genomic features that span an overlapping range of +coordinates on different biological sequences. 
+ +Furthermore, the double-stranded nature of DNA sequences also adds the notion of +strandedness to genomic ranges. +If known, the strand information of genomic features is a key piece of +information that should be tracked, so that it may be used for downstream +analyses. +For instance, genomic ranges spanning a common range of coordinates on opposite +strands of the same DNA sequence may not be considered to overlap (e.g., for the +purpose of strand-specific next-generation sequencing assays). + +Genomic ranges are *closed* intervals - the start and end positions are included +in the interval; in the example of nucleic acids, the start position indicates +the first nucleotide in the interval, and the end position indicates the last +nucleotide in the interval. + +![](fig/genomic-intervals.svg){alt='Example of genomic intervals.'} + +**Example of genomic ranges.** +Genomic ranges are defined by the name of the biological sequence in which they +are located (here, "chr1"), and the positions of start and end in that sequence. +Here, numeric positions are not explicitly shown, but implied by the sequence of +nucleic acids and the arrow indicating coordinates increasing from the left to +the right. +In this example, genomic ranges can be used to describe individual exons, +with metadata grouping those exons into transcripts and genes. +Furthermore, the strandedness of exons, transcripts, and genes is an important +piece of information to precisely describe the location of each genomic range +in the double-stranded DNA polymer. + +## The GenomicRanges package + +### Overview + +The *[GenomicRanges](https://bioconductor.org/packages/3.19/GenomicRanges)* package implements +[S4 classes][glossary-s4-class] to represent genomic ranges as S4 objects. + +Specifically, the `GRanges` class is designed to store a set of intervals +including the name of the sequence where features are located as well as the +range of integer coordinates spanned by the feature in that sequence. 
+ +More generally, the `IRanges` class is designed to store a set of intervals +over a range of integer coordinates, without the notion of sequence names. +As such, a `GRanges` object is merely the combination of an `IRanges` object and +a vector of sequence names. + +Those S4 classes provide automatic validity-checking functionality, +and a range of methods implementing common operations on integer intervals +and genomic ranges, +from the calculation of distance between pairs of intervals to the +identification of overlapping genomic ranges. + +A short presentation of the basic classes defined in the +*[GenomicRanges](https://bioconductor.org/packages/3.19/GenomicRanges)* package is available in one of the +package vignettes, accessible as `vignette("GenomicRangesIntroduction")`, +while more detailed information is provided in the other package vignettes, +accessible as `browseVignettes("GenomicRanges")`. + +### First steps + +To get started, we load the package. + + +``` r +library(GenomicRanges) +``` + +### The IRanges class + +While the genomic space of many organisms is subdivided into multiple sequences +(e.g., chromosomes), many operations on genomic ranges take place within +individual sequences, where only integer positions matter. +The `IRanges` class provides a container for such "simple" ranges that are +defined by two out of three pieces of information: + +- the start position of the range +- the width of the range +- the end position of the range + +The `IRanges()` constructor function accepts those three pieces of information +in the arguments `start=`, `width=`, and `end=`. 
+For instance, we create two integer ranges from their start position and width: + +- one range starts at position 10 and has width 10 +- one range starts at position 15 and has width 5 + + +``` r +demo_iranges <- IRanges(start = c(10, 15), width = c(10, 5)) +demo_iranges +``` + +``` output +IRanges object with 2 ranges and 0 metadata columns: + start end width + + [1] 10 19 10 + [2] 15 19 5 +``` + +We note how the object displays not only the *start* and *width* information +that we requested for each range, but also the *end* position that is naturally +computed from the other two pieces of information. + +::::::::::::::::::::::::::::::::::::::: challenge + +### Challenge + +Create the same two ranges as above, using the arguments `start=` and `end=` +of the `IRanges()` constructor function. + +::::::::::::::: solution + +### Solution + + +``` r +IRanges(start = c(10, 15), end = c(19, 19)) +``` + +``` output +IRanges object with 2 ranges and 0 metadata columns: + start end width + + [1] 10 19 10 + [2] 15 19 5 +``` + +::::::::::::::::::::::::: + +:::::::::::::::::::::::::::::::::::::::::::::::::: + +The start and end positions as well as the width of every interval can be +extracted as numeric vector using the functions `start()`, `end()` and +`width()`, respectively. + + +``` r +start(demo_iranges) +``` + +``` output +[1] 10 15 +``` + +``` r +end(demo_iranges) +``` + +``` output +[1] 19 19 +``` + +``` r +width(demo_iranges) +``` + +``` output +[1] 10 5 +``` + +Objects of the `IRanges` family extend the `Vector` class, and are handled as +unidimensional vectors in terms of indexing. +As such, individual ranges can be extracted by integer index like any +regular vector. 
+ + +``` r +demo_iranges[1] +``` + +``` output +IRanges object with 1 range and 0 metadata columns: + start end width + + [1] 10 19 10 +``` + +### Metadata on IRanges + +The `IRanges` class can accommodate metadata information on each range, +including names - passed to the `names=` argument - and miscellaneous metadata +passed as named vectors. + +For instance, we create two ranges named "A" and "B". +Furthermore, we define metadata fields to store an example of character +values and numeric values, respectively. +Both the names and the values of the metadata fields are completely arbitrary +in this example. + + +``` r +demo_with_metadata <- IRanges( + start = c(10, 15), + end = c(19, 19), + names = c("A", "B"), + character_metadata = c("control", "target"), + numeric_metadata = c(100, 200) +) +demo_with_metadata +``` + +``` output +IRanges object with 2 ranges and 2 metadata columns: + start end width | character_metadata numeric_metadata + | + A 10 19 10 | control 100 + B 15 19 5 | target 200 +``` + +The metadata columns can be extracted as a `DataFrame` using the function `mcols()` (short for "metadata columns"). + + +``` r +mcols(demo_with_metadata) +``` + +``` output +DataFrame with 2 rows and 2 columns + character_metadata numeric_metadata + +A control 100 +B target 200 +``` + +The character vector of names can be extracted using the function `names()`. + + +``` r +names(demo_with_metadata) +``` + +``` output +[1] "A" "B" +``` + +Similarly to named vector of base data types, individual ranges can be extracted +by name. + + +``` r +demo_with_metadata["A"] +``` + +``` output +IRanges object with 1 range and 2 metadata columns: + start end width | character_metadata numeric_metadata + | + A 10 19 10 | control 100 +``` + +### Operations on IRanges + +`IRanges` provide the basis for most operations on ranges of numerical coordinates. 
+ +For instance, given two sets of ranges - a query set and a subject set - the +`findOverlaps()` function can be used to find out which pairs of ranges in the +two sets overlap with each other. + + +``` r +query_iranges <- IRanges( + start = c(8, 16), + end = c(14, 18) +) +overlaps_iranges <- findOverlaps(query = query_iranges, subject = demo_iranges) +overlaps_iranges +``` + +``` output +Hits object with 3 hits and 0 metadata columns: + queryHits subjectHits + + [1] 1 1 + [2] 2 1 + [3] 2 2 + ------- + queryLength: 2 / subjectLength: 2 +``` + +The results are returned in the form of a `Hits` object, which we have not +introduced yet. +A `Hits` object is visualised as a table that comprises two integer +columns named `queryHits` and `subjectHits`. +Each row in the table reports an overlap between one range in the query set +and one range in the subject set, with the integer value in each column +indicating the index of the range in each set involved in the overlap. + +In this example, we confirm that the first range in the query set overlaps the +first range in the subject set; while the second range in the query set overlaps +both ranges in the subject set. + +::::::::::::::::::::::::::::::::::::::::: callout + +### Going further + +For downstream use, the two components can be extracted from `Hits` objects +using their names, respectively: + + +``` r +queryHits(overlaps_iranges) +``` + +``` output +[1] 1 2 2 +``` + +``` r +subjectHits(overlaps_iranges) +``` + +``` output +[1] 1 1 2 +``` + +While displayed as a table, `Hits` objects are actually handled like vectors. 
+Individual hits between one query range and one subject range can be extracted +by their index: + + +``` r +overlaps_iranges[1] +``` + +``` output +Hits object with 1 hit and 0 metadata columns: + queryHits subjectHits + + [1] 1 1 + ------- + queryLength: 2 / subjectLength: 2 +``` + +:::::::::::::::::::::::::::::::::::::::::::::::::: + +### The GRanges class + +Having defined integer ranges, the only additional information necessary to +define genomic ranges is the name of the genomic sequence on which each range is +located. + +For instance, we define two genomic ranges, as follows: + +- one genomic range on chromosome 1 (abbreviated "chr1"), from position 10 to 25 +- one genomic range on chromosome 2 (abbreviated "chr2"), from position 20 to 35 + +To do so, we use the `GRanges()` constructor function. +We provide the sequence names as a character vector to the argument `seqnames=`, +and we provide both the start and end position to the argument `ranges=` +as an `IRanges` object. + + +``` r +demo_granges <- GRanges( + seqnames = c("chr1", "chr2"), + ranges = IRanges( + start = c(10, 20), + end = c(25, 35)) +) +demo_granges +``` + +``` output +GRanges object with 2 ranges and 0 metadata columns: + seqnames ranges strand + + [1] chr1 10-25 * + [2] chr2 20-35 * + ------- + seqinfo: 2 sequences from an unspecified genome; no seqlengths +``` + +In the console, the object displays the sequence names in the `seqnames` +component, and the ranges in the form `start-end` in the `ranges` component. +Furthermore, the example above also demonstrates that `GRanges` objects possess a +component called `strand`; the symbol `*` indicates unstranded genomic ranges, +as we have not provided that information. 
+ +The strand information can be supplied to the `strand=` argument, for instance: + + +``` r +demo_granges2 <- GRanges( + seqnames = c("chr1", "chr2"), + ranges = IRanges( + start = c(10, 20), + end = c(25, 35)), + strand = c("+", "-") +) +demo_granges2 +``` + +``` output +GRanges object with 2 ranges and 0 metadata columns: + seqnames ranges strand + + [1] chr1 10-25 + + [2] chr2 20-35 - + ------- + seqinfo: 2 sequences from an unspecified genome; no seqlengths +``` + +Finally, the example above also demonstrates that `GRanges` objects include a +component called `seqinfo`, which can be used to store information +about each sequence that may be represented in the `seqnames` component. +In the latest example above, we have not provided any information about any +sequence. +As such, the `seqinfo` component was automatically populated with the names +of the sequences that we used to create the object, while the remaining +pieces of information were left unspecified, as `NA`. + + +``` r +seqinfo(demo_granges2) +``` + +``` output +Seqinfo object with 2 sequences from an unspecified genome; no seqlengths: + seqnames seqlengths isCircular genome + chr1 NA NA + chr2 NA NA +``` + +The example above reveals that information about sequences includes not only +their respective name and length, but also whether they represent a circular +polymer (e.g., plasmid), and the name of the genome that they are part of. 
+ +This information can be provided directly to the constructor when the object +is created, or edited on an existing object using the `seqinfo()` accessor and +the `Seqinfo()` constructor: + + +``` r +seqinfo(demo_granges2) <- Seqinfo( + seqnames = c("chr1", "chr2"), + seqlengths = c(1234, 5678), + isCircular = c(FALSE, TRUE), + genome = c("homo_sapiens", "homo_sapiens") +) +demo_granges2 +``` + +``` output +GRanges object with 2 ranges and 0 metadata columns: + seqnames ranges strand + + [1] chr1 10-25 + + [2] chr2 20-35 - + ------- + seqinfo: 2 sequences (1 circular) from homo_sapiens genome +``` + +The start and end positions of the individual ranges as well as the width of +every interval can be extracted as numeric vector using the functions `start()`, +`end()` and `width()`, respectively. + + +``` r +start(demo_granges2) +``` + +``` output +[1] 10 20 +``` + +``` r +end(demo_granges2) +``` + +``` output +[1] 25 35 +``` + +``` r +width(demo_granges2) +``` + +``` output +[1] 16 16 +``` + +The sequence names and strand information can be extracted using the functions +`seqnames()` and `strand()`, respectively. + + +``` r +seqnames(demo_granges2) +``` + +``` output +factor-Rle of length 2 with 2 runs + Lengths: 1 1 + Values : chr1 chr2 +Levels(2): chr1 chr2 +``` + +``` r +strand(demo_granges2) +``` + +``` output +factor-Rle of length 2 with 2 runs + Lengths: 1 1 + Values : + - +Levels(3): + - * +``` + +### Metadata on GRanges + +Similarly to `IRanges`, metadata can be passed directly to the `GRanges` +constructor function. 
+For instance: + + +``` r +demo_granges3 <- GRanges( + seqnames = c("chr1", "chr2"), + ranges = IRanges( + start = c(10, 20), + end = c(25, 35)), + metadata1 = c("control", "target"), + metadata2 = c(1, 2) +) +demo_granges3 +``` + +``` output +GRanges object with 2 ranges and 2 metadata columns: + seqnames ranges strand | metadata1 metadata2 + | + [1] chr1 10-25 * | control 1 + [2] chr2 20-35 * | target 2 + ------- + seqinfo: 2 sequences from an unspecified genome; no seqlengths +``` + +### Importing genomic ranges from files + +Frequently, large collections of genomic ranges are imported from files rather +than described in manually written code. +In particular, genome-wide annotations of known gene features are distributed +as files on websites such as the [Ensembl FTP][ensembl-ftp] and the +[UCSC Genome Data][ucsc-genome-data] sites. + +Various file formats are commonly used to store genomic ranges in bioinformatics +workflows. +For instance, the BED (Browser Extensible Data) format is commonly found in +Chromatin Immunoprecipitation Sequencing (ChIP-Seq), while GTF +(Gene Transfer Format, GTF2.2) is the *de facto* standard file format to +describe genomic features such as exons, transcripts, and genes. + +In the following example, we import the gene model for Actin Beta (ACTB) from +a small GTF file as a set of genomic ranges. +The example file represents a subset of a GTF file for the *Homo sapiens* +species, downloaded from the [Ensembl FTP][ensembl-ftp] site. +The original file contains more than 3 million lines and 22 metadata fields, +from which a subset was extracted into a smaller file for this lesson. 
+ +In particular, we use the `import()` generic +defined in the *[BiocIO](https://bioconductor.org/packages/3.19/BiocIO)* package - with methods +implemented in the *[rtracklayer](https://bioconductor.org/packages/3.19/rtracklayer)* package - as a +versatile function that is capable of recognising common file extensions and +associating them with the appropriate method for parsing each particular file +format. + + +``` r +library(rtracklayer) +``` + +``` warning +Warning: replacing previous import 'S4Arrays::makeNindexFromArrayViewport' by +'DelayedArray::makeNindexFromArrayViewport' when loading 'SummarizedExperiment' +``` + +``` r +actb_gtf_data <- rtracklayer::import("data/actb.gtf") +actb_gtf_data +``` + +``` output +GRanges object with 267 ranges and 7 metadata columns: + seqnames ranges strand | source type score + | + [1] 7 5526409-5563902 - | rtracklayer gene NA + [2] 7 5526409-5530601 - | rtracklayer transcript NA + [3] 7 5530542-5530601 - | rtracklayer exon NA + [4] 7 5529535-5529684 - | rtracklayer exon NA + [5] 7 5529535-5529657 - | rtracklayer CDS NA + ... ... ... ... . ... ... ... + [263] 7 5540676-5540771 - | rtracklayer five_prime_utr NA + [264] 7 5529658-5529663 - | rtracklayer five_prime_utr NA + [265] 7 5561852-5562716 - | rtracklayer transcript NA + [266] 7 5562390-5562716 - | rtracklayer exon NA + [267] 7 5561852-5561949 - | rtracklayer exon NA + phase gene_id gene_name transcript_id + + [1] ENSG00000075624 ACTB + [2] ENSG00000075624 ACTB ENST00000674681 + [3] ENSG00000075624 ACTB ENST00000674681 + [4] ENSG00000075624 ACTB ENST00000674681 + [5] ENSG00000075624 ACTB ENST00000674681 + ... ... ... ... ... 
+ [263] ENSG00000075624 ACTB ENST00000414620 + [264] ENSG00000075624 ACTB ENST00000414620 + [265] ENSG00000075624 ACTB ENST00000646584 + [266] ENSG00000075624 ACTB ENST00000646584 + [267] ENSG00000075624 ACTB ENST00000646584 + ------- + seqinfo: 1 sequence from an unspecified genome; no seqlengths +``` + +::::::::::::::::::::::::::::::::::::::::: callout + +### Going further + +Individual methods for parsing specific file formats can be invoked directly. +For instance, in this case, the GTF file format being identical to the GFF +version 2 file format, we could have directly invoked the function +`rtracklayer::import.gff2()` with the exact same effect. + +Refer to the documentation of the *[rtracklayer](https://bioconductor.org/packages/3.19/rtracklayer)* +package for the full list of methods available. + + +:::::::::::::::::::::::::::::::::::::::::::::::::: + +In the example above, the contents of the GTF file were imported into a +`GRanges` object. For each entry in the file, the sequence name, start and end +position, and strand information were used to populate the dedicated components +of the object, while all other pieces of information are stored as separate +columns of metadata. + +From here on, this `GRanges` object can be manipulated just like any of the +other `GRanges` objects that we have created earlier in this episode. + +### Operations on GRanges and the GRangesList class + +As we have demonstrated so far, `GRanges` objects can be manually defined +or imported from files. +Those often represent genomic regions of interest, and databases of known +genomic features, respectively. +Either way, a number of operations are commonly applied to `GRanges` objects +throughout bioinformatics workflows. + +#### Subset + +For instance, the `subset()` method is extremely convenient to extract a set +of genomic ranges matching a condition on any component, including sequence +name, start and end position, strand, or any metadata field. 
+In the example below, we extract all the records of type `transcript` that start +at position `5527147`. + + +``` r +subset(actb_gtf_data, type == "transcript" & start == 5527147) +``` + +``` output +GRanges object with 5 ranges and 7 metadata columns: + seqnames ranges strand | source type score + | + [1] 7 5527147-5529949 - | rtracklayer transcript NA + [2] 7 5527147-5530581 - | rtracklayer transcript NA + [3] 7 5527147-5530604 - | rtracklayer transcript NA + [4] 7 5527147-5530604 - | rtracklayer transcript NA + [5] 7 5527147-5530604 - | rtracklayer transcript NA + phase gene_id gene_name transcript_id + + [1] ENSG00000075624 ACTB ENST00000642480 + [2] ENSG00000075624 ACTB ENST00000676397 + [3] ENSG00000075624 ACTB ENST00000676319 + [4] ENSG00000075624 ACTB ENST00000676189 + [5] ENSG00000075624 ACTB ENST00000473257 + ------- + seqinfo: 1 sequence from an unspecified genome; no seqlengths +``` + +#### Split + +Separately, the `split()` method is useful to divide a set of genomic ranges +initially stored in a single `GRanges` object into groups that are stored +in a named list of `GRanges` objects. +Conveniently, the `GRangesList` class provides a container for efficiently +displaying and processing lists of `GRanges` objects. + +In the example below, we first extract the subset of entries that represent +exons, before separating those exons by transcript identifier, yielding +the result as a `GRangesList` object. 
+ + +``` r +actb_exons <- subset(actb_gtf_data, type == "exon") +actb_exons_by_transcript <- split(actb_exons, actb_exons$transcript_id) +actb_exons_by_transcript +``` + +``` output +GRangesList object of length 23: +$ENST00000414620 +GRanges object with 4 ranges and 7 metadata columns: + seqnames ranges strand | source type score + | + [1] 7 5562574-5562790 - | rtracklayer exon NA + [2] 7 5540676-5540771 - | rtracklayer exon NA + [3] 7 5529535-5529663 - | rtracklayer exon NA + [4] 7 5529282-5529400 - | rtracklayer exon NA + phase gene_id gene_name transcript_id + + [1] ENSG00000075624 ACTB ENST00000414620 + [2] ENSG00000075624 ACTB ENST00000414620 + [3] ENSG00000075624 ACTB ENST00000414620 + [4] ENSG00000075624 ACTB ENST00000414620 + ------- + seqinfo: 1 sequence from an unspecified genome; no seqlengths + +$ENST00000417101 +GRanges object with 3 ranges and 7 metadata columns: + seqnames ranges strand | source type score + | + [1] 7 5529806-5529982 - | rtracklayer exon NA + [2] 7 5529535-5529663 - | rtracklayer exon NA + [3] 7 5529235-5529400 - | rtracklayer exon NA + phase gene_id gene_name transcript_id + + [1] ENSG00000075624 ACTB ENST00000417101 + [2] ENSG00000075624 ACTB ENST00000417101 + [3] ENSG00000075624 ACTB ENST00000417101 + ------- + seqinfo: 1 sequence from an unspecified genome; no seqlengths + +$ENST00000425660 +GRanges object with 7 ranges and 7 metadata columns: + seqnames ranges strand | source type score + | + [1] 7 5530524-5530601 - | rtracklayer exon NA + [2] 7 5529535-5529663 - | rtracklayer exon NA + [3] 7 5529161-5529400 - | rtracklayer exon NA + [4] 7 5529019-5529059 - | rtracklayer exon NA + [5] 7 5528281-5528719 - | rtracklayer exon NA + [6] 7 5528004-5528185 - | rtracklayer exon NA + [7] 7 5527156-5527891 - | rtracklayer exon NA + phase gene_id gene_name transcript_id + + [1] ENSG00000075624 ACTB ENST00000425660 + [2] ENSG00000075624 ACTB ENST00000425660 + [3] ENSG00000075624 ACTB ENST00000425660 + [4] ENSG00000075624 ACTB 
ENST00000425660 + [5] ENSG00000075624 ACTB ENST00000425660 + [6] ENSG00000075624 ACTB ENST00000425660 + [7] ENSG00000075624 ACTB ENST00000425660 + ------- + seqinfo: 1 sequence from an unspecified genome; no seqlengths + +... +<20 more elements> +``` + +When printing the object above in the console, the first line confirms the +class of the object as `GRangesList`, while each named `GRanges` in that list +is introduced by the dollar sign and the name of that item, just like regular +named lists in base R. + +#### Length + +By nature, many of the methods applicable to `list` objects can be directly +applied to `GRangesList` objects. +For instance, the `lengths()` function can be used on `GRangesList` to display +the length of each `GRanges` object in the list as an integer vector. + +In the latest example above, we can compute the number of exons in each transcript +as the length of each `GRanges` object within the `GRangesList`: + + +``` r +lengths(actb_exons_by_transcript) +``` + +``` output +ENST00000414620 ENST00000417101 ENST00000425660 ENST00000432588 ENST00000443528 + 4 3 7 5 3 +ENST00000462494 ENST00000464611 ENST00000473257 ENST00000477812 ENST00000480301 + 5 3 5 5 2 +ENST00000484841 ENST00000493945 ENST00000642480 ENST00000645025 ENST00000645576 + 5 6 5 4 5 +ENST00000646584 ENST00000646664 ENST00000647275 ENST00000674681 ENST00000675515 + 2 6 3 6 6 +ENST00000676189 ENST00000676319 ENST00000676397 + 6 3 6 +``` + +::::::::::::::::::::::::::::::::::::::: challenge + +### Challenge + +Importantly, the function `lengths()` (with a final `s`) demonstrated above +is different from the function `length()` (without `s`). +The former is meant to be used on list objects, returning a vector giving the +length of each element in the list; while the latter returns a single numeric +scalar giving the length of the list itself (i.e., the number of elements +in the list). 
+ +What does `length(actb_exons_by_transcript)` return, and what does this +number represent biologically? + +::::::::::::::: solution + +### Solution + + +``` r +length(actb_exons_by_transcript) +``` + +``` output +[1] 23 +``` + +This code returns the single integer value `23`, which is the number +of `GRanges` in the `GRangesList` object and the number of transcripts for +the gene ACTB. + +::::::::::::::::::::::::: + +:::::::::::::::::::::::::::::::::::::::::::::::::: + +### Subset by overlap + +Possibly one of the most common operations when working with genomic ranges +is to subset arbitrarily large collections of genomic ranges to those located +in a specific region of the genome; for instance, when visualising information +as tracks in a genome browser. + +To demonstrate, we manually define a new `GRanges` representing a region of +interest that we will use to extract all of the genomic ranges imported earlier +from the GTF file which overlap that region of interest. + + +``` r +region_of_interest <- GRanges( + seqnames = "7", + ranges = IRanges(start = 5525830, end = 5531239) +) +actb_in_region <- subsetByOverlaps(x = actb_gtf_data, ranges = region_of_interest) +actb_in_region +``` + +``` output +GRanges object with 256 ranges and 7 metadata columns: + seqnames ranges strand | source type score + | + [1] 7 5526409-5563902 - | rtracklayer gene NA + [2] 7 5526409-5530601 - | rtracklayer transcript NA + [3] 7 5530542-5530601 - | rtracklayer exon NA + [4] 7 5529535-5529684 - | rtracklayer exon NA + [5] 7 5529535-5529657 - | rtracklayer CDS NA + ... ... ... ... . ... ... ... 
+ [252] 7 5529535-5529657 - | rtracklayer CDS NA + [253] 7 5529655-5529657 - | rtracklayer start_codon NA + [254] 7 5529282-5529400 - | rtracklayer exon NA + [255] 7 5529282-5529400 - | rtracklayer CDS NA + [256] 7 5529658-5529663 - | rtracklayer five_prime_utr NA + phase gene_id gene_name transcript_id + + [1] ENSG00000075624 ACTB + [2] ENSG00000075624 ACTB ENST00000674681 + [3] ENSG00000075624 ACTB ENST00000674681 + [4] ENSG00000075624 ACTB ENST00000674681 + [5] ENSG00000075624 ACTB ENST00000674681 + ... ... ... ... ... + [252] ENSG00000075624 ACTB ENST00000414620 + [253] ENSG00000075624 ACTB ENST00000414620 + [254] ENSG00000075624 ACTB ENST00000414620 + [255] ENSG00000075624 ACTB ENST00000414620 + [256] ENSG00000075624 ACTB ENST00000414620 + ------- + seqinfo: 1 sequence from an unspecified genome; no seqlengths +``` + +Like the `subset()` method, the `subsetByOverlaps()` method returns a new +`GRanges` object. +We can visually compare the information printed in the object +(256 ranges in the new subsetted object, relative to 267 ranges in the original +object), or we can programmatically compare the length of the two objects +to check whether the new `GRanges` object is any smaller than the original +`GRanges` object: + + +``` r +length(actb_in_region) - length(actb_gtf_data) +``` + +``` output +[1] -11 +``` + +In the example above, we learn that the new `GRanges` object has 11 records less +than the original `GRanges` object. + +::::::::::::::::::::::::::::::::::::::::: callout + +### Going further + +Many more methods exist to operate on `GRanges` and `GRangesList` objects +than what could be demonstrated here. + +You can find the full list of functions defined in the `GenomicRanges` package +on the index page of the package documentation, accessible using +`help(package="GenomicRanges")`. +You can also find more examples and use cases in the package vignettes, accessible using +`browseVignettes("GenomicRanges")`. 
+ +:::::::::::::::::::::::::::::::::::::::::::::::::: + +[glossary-s4-class]: reference.html#s4-class +[ensembl-ftp]: https://www.ensembl.org/info/data/ftp/ +[ucsc-genome-data]: https://hgdownload.soe.ucsc.edu/downloads.html + + +:::::::::::::::::::::::::::::::::::::::: keypoints + +- The `GenomicRanges` package defines classes to represent ranges of coordinates on a genomic scale. +- The `GenomicRanges` package also defines methods to efficiently process genomic ranges. +- The `rtracklayer` package provides functions to import and export genomic ranges from and to common genomic file formats. + +:::::::::::::::::::::::::::::::::::::::::::::::::: + + diff --git a/08-annotations.md b/08-annotations.md new file mode 100644 index 00000000..c6e29d2e --- /dev/null +++ b/08-annotations.md @@ -0,0 +1,818 @@ +--- +source: Rmd +title: Working with annotations +teaching: XX +exercises: XX +--- + +--- + + + +::::::::::::::::::::::::::::::::::::::: objectives + +- Explain how gene annotations are managed in the Bioconductor project. +- Identify Bioconductor packages and methods available to fetch and use gene annotations. + +:::::::::::::::::::::::::::::::::::::::::::::::::: + +:::::::::::::::::::::::::::::::::::::::: questions + +- Which Bioconductor packages provide methods to efficiently fetch and use gene annotations? +- How can I use gene annotation packages to convert between different gene identifiers? + +:::::::::::::::::::::::::::::::::::::::::::::::::: + + + + + +## Install packages + +Before we can proceed into the following sections, we install some Bioconductor +packages that we will need. +First, we check that the *[BiocManager](https://bioconductor.org/packages/3.19/BiocManager)* package is +installed before trying to use it; otherwise we install it. +Then we use the `BiocManager::install()` function to install the necessary +packages. 
+ + +``` r +if (!requireNamespace("BiocManager", quietly = TRUE)) + install.packages("BiocManager") + +BiocManager::install(c("biomaRt", "org.Hs.eg.db")) +``` + +## Overview + +Packages dedicated to querying gene annotations exist in the 'Software' and +'Annotation' categories of the Bioconductor [biocViews][biocviews], according to +their nature. + +In the 'Software' section, we find packages that do not actually contain gene +annotations, but rather dynamically _query_ them from online resources +(e.g., [Ensembl BioMart][biomart-ensembl]). +One such Bioconductor package is *[biomaRt](https://bioconductor.org/packages/3.19/biomaRt)*. + +Instead, in the 'Annotation' section, we find packages that do contain +annotations. +Examples include *[org.Hs.eg.db](https://bioconductor.org/packages/3.19/org.Hs.eg.db)*, +*[EnsDb.Hsapiens.v86](https://bioconductor.org/packages/3.19/EnsDb.Hsapiens.v86)*, +and *[TxDb.Hsapiens.UCSC.hg38.knownGene](https://bioconductor.org/packages/3.19/TxDb.Hsapiens.UCSC.hg38.knownGene)*. + +In this episode, we will demonstrate the two approaches: + +* Querying annotations from the Ensembl BioMart API using the *[biomaRt](https://bioconductor.org/packages/3.19/biomaRt)* package. +* Querying annotations from the *[org.Hs.eg.db](https://bioconductor.org/packages/3.19/org.Hs.eg.db)* annotation package. + +## Online resources or Bioconductor annotation packages? + +### Accessing the latest information + +Bioconductor's 6-month release cycle implies that packages available from the +latest stable release branch will not be updated for six months +(only bugfixes are allowed, not functional updates). +As a result, annotation packages may contain information that is out-of-date +by up to six months. + +Instead, independent online resources often have different policies driving +the release of updated information. +Some databases are frequently updated, while others may not have been updated +in years. 
+ +Accessing up-to-date information must also be balanced with reproducibility. +Having downloaded the 'latest' information at one point in time is no good if +one hasn't recorded at least *when* that information was downloaded. + +### Storage requirements + +By nature, Bioconductor annotation packages are larger than software packages. +Just like any other R package, annotation packages must be installed on the +user's computer before they can be used. +This can rapidly add up to using an amount of disk space that is not +negligible. + +Conversely, online resources are generally accessed programmatically, and +generally only require users to record code to replicate analyses reproducibly. + +### Internet connectivity + +When using online resources, it is often a good idea to write annotations +downloaded from online resources to a local file, +and refer to that local file during analyses. + +If online resources were to become unavailable for any reason +(e.g., downtime, loss of internet connection), analyses that use local files +can carry on while those that rely on those online resources cannot. + +In contrast, Bioconductor annotation packages only require internet connectivity +at the time of installation. +Once installed, they do not require internet connectivity, as they rely on +information stored locally. + +### Reproducibility + +Bioconductor annotation packages are naturally versioned, meaning that users +can confidently report the version of the package used in their analysis. +Just like software packages, users control if and when annotation packages +should be updated on their computer. + +Online resources have different policies to facilitate reproducible analyses. +Some online resources keep archived versions of their annotations, allowing +users to consistently access the same information over time. 
+When this is not the case, it may be necessary to download a copy of the +annotation at one point in time, and carefully keep that copy throughout +the lifetime of the project to ensure the use of a consistent set of +annotations. + +### Consistency + +As we will see in the practical examples of this episode, +Bioconductor annotation packages generally re-use a consistent set of data +structures. +This allows users familiar with one annotation package to rapidly get started +with others. + +Independent online resources often organise their data in different ways, +which requires users to write custom code to access, retrieve, and process +their respective data. + +## Querying annotations from Ensembl BioMart + +### The Ensembl BioMart + +[Ensembl BioMart][biomart-ensembl] is a robust data mining tool designed to +facilitate access to the vast array of biological data available through the +Ensembl project. + +The [BioMart web interface][biomart-ensembl] enables researchers to efficiently +query and retrieve data on genes, proteins, and other genomic features +across multiple species. +It allows users to filter, sort, and export data based on various attributes +such as gene IDs, chromosomal locations, and functional annotations. + +### The Bioconductor `biomaRt` package + +*[biomaRt](https://bioconductor.org/packages/3.19/biomaRt)* is a Bioconductor software package that +enables retrieval of large amounts of data from Ensembl BioMart tables +directly from an R session where those annotations can be used. + +Let us first load the package: + + +``` r +library(biomaRt) +``` + +### Listing available marts + +Ensembl BioMart organises its diverse biological information into four databases +also known as 'marts' or 'biomarts'. +Each mart focuses on a different type of data. + +Users must select the mart that corresponds to the type of data they are interested +in before they can query any information from it. 
+ +The function `listMarts()` can be used to display the names of those marts. +This is convenient as users do not need to memorise the name of the marts, +and the function will also return an updated list of names if any mart is +renamed, added, or removed. + + +``` r +listMarts() +``` + +``` output + biomart version +1 ENSEMBL_MART_ENSEMBL Ensembl Genes 112 +2 ENSEMBL_MART_MOUSE Mouse strains 112 +3 ENSEMBL_MART_SNP Ensembl Variation 112 +4 ENSEMBL_MART_FUNCGEN Ensembl Regulation 112 +``` + +In this demonstration, we will use the biomart called `ENSEMBL_MART_ENSEMBL`, +which contains the Ensembl gene set. + +Notably, the `version` column also indicates the version of the biomart. +The Ensembl BioMart is updated regularly (multiple times per year). +By default, *[biomaRt](https://bioconductor.org/packages/3.19/biomaRt)* functions access the latest +version of each biomart. +This is not ideal for reproducibility. + +Thankfully, Ensembl BioMart archives past versions of its marts in a way that +is accessible both programmatically, and on its website. + +The function `listEnsemblArchives()` can be used to display all the versions of +Ensembl BioMart accessible. 
+ + +``` r +listEnsemblArchives() +``` + +``` output + name date url version +1 Ensembl GRCh37 Feb 2014 https://grch37.ensembl.org GRCh37 +2 Ensembl 112 May 2024 https://may2024.archive.ensembl.org 112 +3 Ensembl 111 Jan 2024 https://jan2024.archive.ensembl.org 111 +4 Ensembl 110 Jul 2023 https://jul2023.archive.ensembl.org 110 +5 Ensembl 109 Feb 2023 https://feb2023.archive.ensembl.org 109 +6 Ensembl 108 Oct 2022 https://oct2022.archive.ensembl.org 108 +7 Ensembl 107 Jul 2022 https://jul2022.archive.ensembl.org 107 +8 Ensembl 106 Apr 2022 https://apr2022.archive.ensembl.org 106 +9 Ensembl 105 Dec 2021 https://dec2021.archive.ensembl.org 105 +10 Ensembl 104 May 2021 https://may2021.archive.ensembl.org 104 +11 Ensembl 103 Feb 2021 https://feb2021.archive.ensembl.org 103 +12 Ensembl 102 Nov 2020 https://nov2020.archive.ensembl.org 102 +13 Ensembl 101 Aug 2020 https://aug2020.archive.ensembl.org 101 +14 Ensembl 100 Apr 2020 https://apr2020.archive.ensembl.org 100 +15 Ensembl 99 Jan 2020 https://jan2020.archive.ensembl.org 99 +16 Ensembl 98 Sep 2019 https://sep2019.archive.ensembl.org 98 +17 Ensembl 97 Jul 2019 https://jul2019.archive.ensembl.org 97 +18 Ensembl 80 May 2015 https://may2015.archive.ensembl.org 80 +19 Ensembl 77 Oct 2014 https://oct2014.archive.ensembl.org 77 +20 Ensembl 75 Feb 2014 https://feb2014.archive.ensembl.org 75 +21 Ensembl 54 May 2009 https://may2009.archive.ensembl.org 54 + current_release +1 +2 * +3 +4 +5 +6 +7 +8 +9 +10 +11 +12 +13 +14 +15 +16 +17 +18 +19 +20 +21 +``` + +In the output above, the key piece of information is the `url` column, which +provides the URL that *[biomaRt](https://bioconductor.org/packages/3.19/biomaRt)* functions will need to +access data from the corresponding snapshot of the Ensembl BioMart. + +At the time of writing, the current release is Ensembl 112, so let us use +the corresponding url `https://may2024.archive.ensembl.org` to ensure +reproducible results no matter when this lesson is delivered. 
+ +### Connecting to a biomart + +The two pieces of information collected above -- the name of a biomart +and the URL of a snapshot -- is all that is needed to connect to a BioMart +database reproducibly. + +The function `useMart()` can then be used to create a connection. +The connection is traditionally stored in an object called `mart`, +to be reused in subsequent steps for querying information from the online mart. + + +``` r +mart <- useMart(biomart = "ENSEMBL_MART_ENSEMBL", host = "https://may2024.archive.ensembl.org") +``` + +### Listing available data sets + +Each biomart contains a number of data sets. + +The function `listDatasets()` can be used to display information about those +data sets. +This is convenient as users do not need to memorise the name of the data sets, +and the information returned by the function includes a short description of +each data set, as well as its version. + +In the example below, we restrict the output table to the first few rows, +as the full table comprises 214 rows. + + +``` r +head(listDatasets(mart)) +``` + +``` output + dataset description +1 abrachyrhynchus_gene_ensembl Pink-footed goose genes (ASM259213v1) +2 acalliptera_gene_ensembl Eastern happy genes (fAstCal1.3) +3 acarolinensis_gene_ensembl Green anole genes (AnoCar2.0v2) +4 acchrysaetos_gene_ensembl Golden eagle genes (bAquChr1.2) +5 acitrinellus_gene_ensembl Midas cichlid genes (Midas_v5) +6 amelanoleuca_gene_ensembl Giant panda genes (ASM200744v2) + version +1 ASM259213v1 +2 fAstCal1.3 +3 AnoCar2.0v2 +4 bAquChr1.2 +5 Midas_v5 +6 ASM200744v2 +``` + +In the output above, the key piece of information is the `dataset` column, which +provides the identifier that *[biomaRt](https://bioconductor.org/packages/3.19/biomaRt)* functions will +need to access data from the corresponding biomart table. + +In this demonstration, we will use the Ensembl gene set for Homo sapiens, +which is not visible in the output above. 
+ +Given the number of data sets available, +let us programmatically filter the table of information using pattern matching +rather than searching the table manually: + + +``` r +subset(listDatasets(mart), grepl("sapiens", dataset)) +``` + +``` output + dataset description version +80 hsapiens_gene_ensembl Human genes (GRCh38.p14) GRCh38.p14 +``` + +From the output above, we identify the desired data set identifier as +`hsapiens_gene_ensembl`. + +### Connecting to a data set + +Having chosen the data set that we want to use, we need to call the function +`useMart()` again, this time specifying the selected data set. + +Typically, one would copy paste the previous call to `useMart()` and edit as +needed. +It is also common practice to replace the `mart` object with the new connection. + + +``` r +mart <- useMart( + biomart = "ENSEMBL_MART_ENSEMBL", + dataset = "hsapiens_gene_ensembl", + host = "https://may2024.archive.ensembl.org") +``` + +### Listing information available in a data set + +BioMart tables contain many pieces of information also known as 'attributes'. +So many, in fact, that they have been grouped into categories also known as +'pages'. + +The function `listAttributes()` can be used to display information about +those attributes. +This is convenient as users do not need to memorise the name of the attributes, +and the information returned by the function includes a short description of +each attribute, as well as its page categorisation. + +In the example below, we restrict the output table to the first few rows, +as the full table comprises 3157 rows. 
+ + +``` r +head(listAttributes(mart)) +``` + +``` output + name description page +1 ensembl_gene_id Gene stable ID feature_page +2 ensembl_gene_id_version Gene stable ID version feature_page +3 ensembl_transcript_id Transcript stable ID feature_page +4 ensembl_transcript_id_version Transcript stable ID version feature_page +5 ensembl_peptide_id Protein stable ID feature_page +6 ensembl_peptide_id_version Protein stable ID version feature_page +``` + +In the output above, the key piece of information is the `name` column, which +provides the identifier that *[biomaRt](https://bioconductor.org/packages/3.19/biomaRt)* functions will +need to query that information from the corresponding biomart data set. + +The choice of attributes to query now depends on what it is we wish to achieve. + +For instance, let us imagine that we have a set of gene identifiers, +for which we wish to query: + +* The gene symbol +* The name of the chromosome where the gene is located +* The start and end position of the gene on that chromosome +* The strand on which the gene is encoded + +Users would often manually explore the full table of attributes to identify +the ones they wish to include in their query. +It is also possible to programmatically filter the table of attributes, +based on experience and intuition, to narrow down the search: + + +``` r +subset(listAttributes(mart), grepl("position", name) & grepl("feature", page)) +``` + +``` output + name description page +10 start_position Gene start (bp) feature_page +11 end_position Gene end (bp) feature_page +``` + +### Querying information from a BioMart table + +We now have all the information that we need to perform the actual query: + +* A connection to a BioMart data set +* The list of attributes available in that data set + +The function `getBM()` is the main *[biomaRt](https://bioconductor.org/packages/3.19/biomaRt)* query +function. 
+Given a set of filters and corresponding values, it retrieves the attributes +requested by the user from the BioMart data set it is connected to. + +In the example below, we manually create a vector of arbitrary gene identifiers +for our query. +In practice, the query will often originate from an earlier analysis +(e.g., differential gene expression). + +The example below also queries attributes that we have not introduced yet. +In the previous section, we described how one may search the table of attributes +returned by `listAttributes()` to identify attributes to include in their query. + + +``` r +query_gene_ids <- c( + "ENSG00000133101", + "ENSG00000145386", + "ENSG00000134057", + "ENSG00000157456", + "ENSG00000147082" +) +getBM( + attributes = c( + "ensembl_gene_id", + "hgnc_symbol", + "chromosome_name", + "start_position", + "end_position", + "strand" + ), + filters = "ensembl_gene_id", + values = query_gene_ids, + mart = mart +) +``` + +``` output + ensembl_gene_id hgnc_symbol chromosome_name start_position end_position +1 ENSG00000133101 CCNA1 13 36431520 36442870 +2 ENSG00000134057 CCNB1 5 69167135 69178245 +3 ENSG00000145386 CCNA2 4 121816444 121823883 +4 ENSG00000147082 CCNB3 X 50202713 50351914 +5 ENSG00000157456 CCNB2 15 59105126 59125045 + strand +1 1 +2 1 +3 -1 +4 1 +5 1 +``` + +Note that we also included the filtering attribute `ensembl_gene_id` to the +attributes retrieved from the data set. +This is key to reliably match the newly retrieved attributes to those used +in the query. + +## Querying annotations from annotation packages + +### Families of annotation packages + +To balance the need for comprehensive information +while maintaining reasonable package sizes, +Bioconductor annotation packages are organised by release, data type, and +species. + +The major families of Bioconductor annotation packages are: + +* `OrgDb` packages provide mapping between various types of gene identifiers + and pathway information. 
+* `EnsDb` packages provide individual releases of Ensembl annotations. +* `TxDb` packages provide individual releases of UCSC annotations. + +All those families of annotations derive from the `AnnotationDb` base class +defined in the *[AnnotationDbi](https://bioconductor.org/packages/3.19/AnnotationDbi)* package. +As a result, any of those annotation packages can be accessed using the same set +of R functions, as demonstrated in the following sections. + +### Using an OrgDb package + +In this example, we will use the *[org.Hs.eg.db](https://bioconductor.org/packages/3.19/org.Hs.eg.db)* +package to demonstrate the use of gene annotations for the human species. + +Let us first load the package: + + +``` r +library(org.Hs.eg.db) +``` + +Each `OrgDb` package contains an object named identically to the package itself. +That object contains the annotations that the package is meant to disseminate. + +Aside from querying information, the whole object can be called to print +information about the annotations it contains, including the date at which +the snapshots of annotations that it contains were made. 
+ + +``` r +org.Hs.eg.db +``` + +``` output +OrgDb object: +| DBSCHEMAVERSION: 2.1 +| Db type: OrgDb +| Supporting package: AnnotationDbi +| DBSCHEMA: HUMAN_DB +| ORGANISM: Homo sapiens +| SPECIES: Human +| EGSOURCEDATE: 2024-Mar12 +| EGSOURCENAME: Entrez Gene +| EGSOURCEURL: ftp://ftp.ncbi.nlm.nih.gov/gene/DATA +| CENTRALID: EG +| TAXID: 9606 +| GOSOURCENAME: Gene Ontology +| GOSOURCEURL: http://current.geneontology.org/ontology/go-basic.obo +| GOSOURCEDATE: 2024-01-17 +| GOEGSOURCEDATE: 2024-Mar12 +| GOEGSOURCENAME: Entrez Gene +| GOEGSOURCEURL: ftp://ftp.ncbi.nlm.nih.gov/gene/DATA +| KEGGSOURCENAME: KEGG GENOME +| KEGGSOURCEURL: ftp://ftp.genome.jp/pub/kegg/genomes +| KEGGSOURCEDATE: 2011-Mar15 +| GPSOURCENAME: UCSC Genome Bioinformatics (Homo sapiens) +| GPSOURCEURL: +| GPSOURCEDATE: 2024-Feb29 +| ENSOURCEDATE: 2023-Nov22 +| ENSOURCENAME: Ensembl +| ENSOURCEURL: ftp://ftp.ensembl.org/pub/current_fasta +| UPSOURCENAME: Uniprot +| UPSOURCEURL: http://www.UniProt.org/ +| UPSOURCEDATE: Thu Apr 18 21:39:39 2024 +``` + +``` output + +Please see: help('select') for usage information +``` + +That same object is the one that needs to be supplied to +*[AnnotationDbi](https://bioconductor.org/packages/3.19/AnnotationDbi)* functions for running queries and +retrieving annotations. + +### Listing information available in an annotation package + +The function `columns()` can be used to display the annotations available +in the object. + +Here, the word 'column' refers to columns of tables used to store information in +database, the very same concept as 'attributes' in BioMart. +In other words, columns represent all the types of annotations that may be +retrieved from the object. + +This is convenient as users do not need to memorise the names of the columns of +annotations available in the package. 
+ + +``` r +columns(org.Hs.eg.db) +``` + +``` output + [1] "ACCNUM" "ALIAS" "ENSEMBL" "ENSEMBLPROT" "ENSEMBLTRANS" + [6] "ENTREZID" "ENZYME" "EVIDENCE" "EVIDENCEALL" "GENENAME" +[11] "GENETYPE" "GO" "GOALL" "IPI" "MAP" +[16] "OMIM" "ONTOLOGY" "ONTOLOGYALL" "PATH" "PFAM" +[21] "PMID" "PROSITE" "REFSEQ" "SYMBOL" "UCSCKG" +[26] "UNIPROT" +``` + +### Listing keys and key types + +In database terminology, *keys* are the values by which information may be +queried from a database table. + +Information being organised in columns, *key types* are the names of the columns +in which the key values are stored. + +Given the variable number of columns in database tables, some tables may allow +information to be queried by more than one key. +As a result, it is crucial to specify both the keys and the type of key as part +of the query. + +The function `keytypes()` can be used to display the names of the columns that +may be used to query information from the object. + + +``` r +keytypes(org.Hs.eg.db) +``` + +``` output + [1] "ACCNUM" "ALIAS" "ENSEMBL" "ENSEMBLPROT" "ENSEMBLTRANS" + [6] "ENTREZID" "ENZYME" "EVIDENCE" "EVIDENCEALL" "GENENAME" +[11] "GENETYPE" "GO" "GOALL" "IPI" "MAP" +[16] "OMIM" "ONTOLOGY" "ONTOLOGYALL" "PATH" "PFAM" +[21] "PMID" "PROSITE" "REFSEQ" "SYMBOL" "UCSCKG" +[26] "UNIPROT" +``` + +The function `keys()` can be used to display all the possible values for a given +key type. + +It is generally better practice to specify the type of key being queried +(to avoid ambiguity), although database tables typically have a 'primary key' +used if users do not specify a type themselves. + +In the example below, we restrict the list of gene symbol keys to the first few +values, +as the full set comprises 193279 +values. 
+ + +``` r +head(keys(org.Hs.eg.db, keytype = "SYMBOL")) +``` + +``` output +[1] "A1BG" "A2M" "A2MP1" "NAT1" "NAT2" "NATP" +``` + +### Querying information from an annotation package + +The function `select()` is the main *[AnnotationDbi](https://bioconductor.org/packages/3.19/AnnotationDbi)* +query function. +Given an `AnnotationDb` object, key values, and columns +(and optionally the type of key supplied if not the primary key), +it retrieves the columns requested by the user from the annotation object. + +In the example below, we re-use the vector of arbitrary gene identifiers +used in the BioMart example a few sections above. + +As you can see from the output of the `columns()` function, the annotation +object does not contain some of the attributes that we queried in the BioMart +example. +In this case, let us query: + +* the gene symbol +* the gene name +* the gene type + + +``` r +select( + x = org.Hs.eg.db, + keys = query_gene_ids, + columns = c( + "SYMBOL", + "GENENAME", + "GENETYPE" + ), + keytype = "ENSEMBL" +) +``` + +``` output +'select()' returned 1:1 mapping between keys and columns +``` + +``` output + ENSEMBL SYMBOL GENENAME GENETYPE +1 ENSG00000133101 CCNA1 cyclin A1 protein-coding +2 ENSG00000145386 CCNA2 cyclin A2 protein-coding +3 ENSG00000134057 CCNB1 cyclin B1 protein-coding +4 ENSG00000157456 CCNB2 cyclin B2 protein-coding +5 ENSG00000147082 CCNB3 cyclin B3 protein-coding +``` + +One small but notable difference with *[biomaRt](https://bioconductor.org/packages/3.19/biomaRt)* is that +the output of `select()` automatically contains the column that corresponds to +the key type used in the query. +In other words, there is no need to specify the key type(s) again in the +column(s) to retrieve. + +### Vectorized 1:1 mapping + +It is sometimes possible for annotations to display 1-to-many relationships. +For instance, individual genes typically have a unique Ensembl gene identifier, +while they may be known under multiple gene name aliases. 
+ +The `select()` function demonstrated in the previous section automatically +returns *all* values in the columns requested, for the key specified. +This is possible thanks to the tabular format in which annotations are returned; +rows are added, repeating values as necessary to display them on the same row +as every other value they are associated with. + +In some cases, that behaviour is not desirable. +Instead, users may wish to retrieve a single value for each key that they input. +One common scenario arises during differential gene expression (DGE), +where gene identifiers are used to uniquely identify genes throughout the +analysis, while gene symbols are added to the final table of DGE statistics, +to provide more readable human-friendly gene identifiers. +However, it is not desirable to duplicate rows of DGE statistics, and thus +only a single gene symbol is required to annotate each gene. + +The function `mapIds()` can be used for this purpose. +A major difference between the functions `mapIds()` and `select()` +is their arguments `column` (singular) and `columns` (plural), respectively. +The function `mapIds()` accepts a single column name and returns a named +character vector where names are the input query values, and values are the +corresponding values in the requested column. + +To deal with 1-to-many relationships, the function `mapIds()` has an argument +`multiVals` which can be used to specify how the function should handle multiple +values. +The default is to take the first value and ignore any other value. + +In the example below, we query the gene symbol for a set of Ensembl gene +identifiers. 
+ + +``` r +mapIds( + x = org.Hs.eg.db, + keys = query_gene_ids, + column = "SYMBOL", + keytype = "ENSEMBL" +) +``` + +``` output +'select()' returned 1:1 mapping between keys and columns +``` + +``` output +ENSG00000133101 ENSG00000145386 ENSG00000134057 ENSG00000157456 ENSG00000147082 + "CCNA1" "CCNA2" "CCNB1" "CCNB2" "CCNB3" +``` + +::::::::::::::::::::::::::::::::::::::: challenge + +### Challenge + +Load the packages *[EnsDb.Hsapiens.v86](https://bioconductor.org/packages/3.19/EnsDb.Hsapiens.v86)* and +*[TxDb.Hsapiens.UCSC.hg38.knownGene](https://bioconductor.org/packages/3.19/TxDb.Hsapiens.UCSC.hg38.knownGene)*. +Then, display the columns of annotations available in those packages. + +::::::::::::::: solution + +### Solution + + +``` r +library(EnsDb.Hsapiens.v86) +columns(EnsDb.Hsapiens.v86) +``` + +``` output + [1] "ENTREZID" "EXONID" "EXONIDX" + [4] "EXONSEQEND" "EXONSEQSTART" "GENEBIOTYPE" + [7] "GENEID" "GENENAME" "GENESEQEND" +[10] "GENESEQSTART" "INTERPROACCESSION" "ISCIRCULAR" +[13] "PROTDOMEND" "PROTDOMSTART" "PROTEINDOMAINID" +[16] "PROTEINDOMAINSOURCE" "PROTEINID" "PROTEINSEQUENCE" +[19] "SEQCOORDSYSTEM" "SEQLENGTH" "SEQNAME" +[22] "SEQSTRAND" "SYMBOL" "TXBIOTYPE" +[25] "TXCDSSEQEND" "TXCDSSEQSTART" "TXID" +[28] "TXNAME" "TXSEQEND" "TXSEQSTART" +[31] "UNIPROTDB" "UNIPROTID" "UNIPROTMAPPINGTYPE" +``` + + +``` r +library(TxDb.Hsapiens.UCSC.hg38.knownGene) +columns(TxDb.Hsapiens.UCSC.hg38.knownGene) +``` + +``` output + [1] "CDSCHROM" "CDSEND" "CDSID" "CDSNAME" "CDSPHASE" + [6] "CDSSTART" "CDSSTRAND" "EXONCHROM" "EXONEND" "EXONID" +[11] "EXONNAME" "EXONRANK" "EXONSTART" "EXONSTRAND" "GENEID" +[16] "TXCHROM" "TXEND" "TXID" "TXNAME" "TXSTART" +[21] "TXSTRAND" "TXTYPE" +``` + +::::::::::::::::::::::::: + +:::::::::::::::::::::::::::::::::::::::::::::::::: + +:::::::::::::::::::::::::::::::::::::::: keypoints + +- Bioconductor provides a wide range of annotation packages. 
+- Some Bioconductor software packages can be used to programmatically access online resources. +- Users should carefully choose their source of annotations based on their needs and expectations. + +:::::::::::::::::::::::::::::::::::::::::::::::::: + +[biocviews]: https://www.bioconductor.org/packages/release/BiocViews.html +[biomart-ensembl]: https://www.ensembl.org/biomart/martview diff --git a/09-summarizedexperiment.md b/09-summarizedexperiment.md new file mode 100644 index 00000000..61b1a17f --- /dev/null +++ b/09-summarizedexperiment.md @@ -0,0 +1,521 @@ +--- +source: Rmd +title: The SummarizedExperiment class +teaching: XX +exercises: XX +--- + +--- + + + +::::::::::::::::::::::::::::::::::::::: objectives + +- Describe how both experimental data and metadata can be stored in a single object. +- Explain why it is crucial to keep data and metadata synchronised throughout analyses. + +:::::::::::::::::::::::::::::::::::::::::::::::::: + +:::::::::::::::::::::::::::::::::::::::: questions + +- How is information organized in SummarizedExperiment objects? +- How can that information be added, edited, and accessed? + +:::::::::::::::::::::::::::::::::::::::::::::::::: + + + + + +## Install packages + +Before we can proceed into the following sections, we install some Bioconductor +packages that we will need. +First, we check that the *[BiocManager](https://bioconductor.org/packages/3.19/BiocManager)* package is +installed before trying to use it; otherwise we install it. +Then we use the `BiocManager::install()` function to install the necessary +packages. 
+ + +``` r +if (!requireNamespace("BiocManager", quietly = TRUE)) + install.packages("BiocManager") + +BiocManager::install(c("SummarizedExperiment")) +``` + +## Motivation + +Experiments are multifaceted data sets typically composed of at least two key +pieces of information necessary for any analysis: + +- Assay data, typically a matrix representing measurements of a set of features + in a set of samples + (e.g., RNA-sequencing). +- Sample metadata, typically a `data.frame` of metadata representing information + about samples + (e.g., treatment group). + +All those pieces of information must be kept synchronised -- same samples, +same order -- for downstream analyses to accurately process the information +and produce reliable results. + +It is also very common for analytical workflows to analyse subsets of samples or +identify outliers that need to be removed to allow for more accurate downstream +analyses. +In such cases, all aspects of the experiments must be subsetted to the same set +of samples -- in the same order -- to preserve consistency in the data set +and correct results. + +The `SummarizedExperiment` -- implemented in the +*[SummarizedExperiment](https://bioconductor.org/packages/3.19/SummarizedExperiment)* package -- provides a container +that accommodates those essential aspects of individual experiments into a +single object that coordinates data and metadata during subsetting and reordering +operations. +Its flexibility in accommodating many biological data types +and comprehensive set of features make it a popular data +structure re-used throughout the Bioconductor project and a key part of the Bioconductor +ecosystem. +For instance, familiarity with the `SummarizedExperiment` is a prerequisite +for working with the *[DESeq2](https://bioconductor.org/packages/3.19/DESeq2)* package for differential +expression analysis, and the *[SingleCellExperiment](https://bioconductor.org/packages/3.19/SingleCellExperiment)* +extension class for single-cell analyses. 
+ +## Class structure + +`SummarizedExperiment` is a matrix-like container where rows represent features of +interest (e.g. genes, transcripts, exons, etc.) and columns represent samples. + +The objects can contain one or more assays, each represented by a matrix-like +object, as long as they all have the same dimensions. + +Information about the features is stored in a `DataFrame` object, +nested within the `SummarizedExperiment` object, +and accessible using the function `rowData()`. +Each row of the `DataFrame` provides information on the feature in the +corresponding row of the `SummarizedExperiment` object. +That information may include annotations independent of the experiment +(e.g., gene identifier) as well as quality control metrics computed from +assay data during workflows. + +Similarly, information about the samples is stored in another `DataFrame` object, +also nested within the `SummarizedExperiment` object, +and accessible using the function `colData()`. + +The following graphic displays the class geometry and highlights the +vertical (column) and horizontal (row) relationships. +It was obtained from the vignette of the +*[SummarizedExperiment](https://bioconductor.org/packages/3.19/SummarizedExperiment)* package. + +![](fig/summarizedexperiment.svg){alt='Schematic representation of the SummarizedExperiment class.'} + +## Creating a SummarizedExperiment object + +Let us first load the package: + + +``` r +library(SummarizedExperiment) +``` + +Then, let us import assay data from a file that we downloaded during the lesson +setup. + +The file is a simple text file in which +the first column contains made-up feature identifiers +and all other columns contain simulated data for made-up samples. +As such, we can use the base R function `read.csv()` to parse the file +into a `data.frame` object.
+ +In the example below, we indicate that the row names can be found in the first +column, so that the function immediately sets the row names accordingly +in the output object. +Had we not specified it, the function would have parsed that column as a regular column +and left the row names to the default integer indexing. + + +``` r +count_data <- read.csv("data/counts.csv", row.names = 1) +count_data +``` + +``` output + sample_1 sample_2 sample_3 sample_4 +gene_1 109 84 91 105 +gene_2 111 97 98 108 +gene_3 89 121 105 99 +gene_4 105 109 122 101 +gene_5 82 97 112 83 +gene_6 89 96 90 116 +gene_7 121 95 88 106 +gene_8 101 101 86 103 +gene_9 91 119 89 87 +gene_10 81 111 81 118 +gene_11 93 118 93 99 +gene_12 103 111 116 103 +gene_13 89 126 103 100 +gene_14 101 107 111 79 +gene_15 96 91 103 108 +gene_16 110 102 128 103 +gene_17 95 106 118 100 +gene_18 99 115 114 102 +gene_19 114 105 94 118 +gene_20 110 88 99 102 +gene_21 116 95 94 105 +gene_22 114 96 107 91 +gene_23 97 120 93 90 +gene_24 91 84 118 97 +gene_25 99 106 97 110 +``` + +One assay data matrix is enough to create a `SummarizedExperiment` object, +although without sample metadata, only unsupervised analyses -- that do not +require information about the samples -- are possible. + +In the example below, we create a `SummarizedExperiment` object in which we +store the matrix of count data under the name 'counts'. +Note that the argument 'assays=' (plural) can accept more than one assay +-- as discussed above -- which is why we encapsulate our only assay matrix +in a named `list` that also gives us the opportunity to assign a name to the +assay. +Naming assays becomes crucial during workflows that contain multiple assays, +in order to identify and retrieve individual assays unambiguously. + + +``` r +se <- SummarizedExperiment( + assays = list(counts = count_data) +) +se +``` + +``` output +class: SummarizedExperiment +dim: 25 4 +metadata(0): +assays(1): counts +rownames(25): gene_1 gene_2 ...
gene_24 gene_25 +rowData names(0): +colnames(4): sample_1 sample_2 sample_3 sample_4 +colData names(0): +``` + +In the output above, the summary view of the object reminds us that the assay +-- and thus the overall `SummarizedExperiment` object -- contains information +for 25 features in 4 samples, +it contains a single assay named 'counts', +the features seem to be named from 'gene_1' to 'gene_25' +(only the first and last ones are shown), +and the samples are named from `sample_1` to `sample_4`. +The object does not contain any row metadata or column metadata. + +To create a more comprehensive `SummarizedExperiment` object, +let us import gene metadata and sample metadata from another two files +that we downloaded during the lesson setup. + +The files are formatted similarly to the count data, +so we again use the base R function `read.csv()` to parse them into `data.frame` +objects. + + +``` r +sample_metadata <- read.csv("data/sample_metadata.csv", row.names = 1) +sample_metadata +``` + +``` output + condition batch +sample_1 A 1 +sample_2 A 2 +sample_3 B 1 +sample_4 B 2 +``` + + +``` r +gene_metadata <- read.csv("data/gene_metadata.csv", row.names = 1) +gene_metadata +``` + +``` output + chromosome +gene_1 4 +gene_2 4 +gene_3 5 +gene_4 4 +gene_5 5 +gene_6 1 +gene_7 2 +gene_8 1 +gene_9 3 +gene_10 1 +gene_11 1 +gene_12 5 +gene_13 5 +gene_14 1 +gene_15 3 +gene_16 4 +gene_17 2 +gene_18 5 +gene_19 1 +gene_20 3 +gene_21 5 +gene_22 5 +gene_23 1 +gene_24 4 +gene_25 5 +``` + +We can re-create the `SummarizedExperiment` object, this time including +the gene and sample metadata: + + +``` r +se <- SummarizedExperiment( + assays = list(counts = count_data), + colData = sample_metadata, + rowData = gene_metadata +) +se +``` + +``` output +class: SummarizedExperiment +dim: 25 4 +metadata(0): +assays(1): counts +rownames(25): gene_1 gene_2 ...
gene_24 gene_25 +rowData names(1): chromosome +colnames(4): sample_1 sample_2 sample_3 sample_4 +colData names(2): condition batch +``` + +Comparing the output above with the previous 'assay-only' version of the +`SummarizedExperiment` object, we can see that the `rowData` and `colData` +components now contain 1 and 2 metadata columns, respectively. + +## Accessing information + +A number of functions give access to the various components of +`SummarizedExperiment` objects. + +The `assays()` function returns the list of assays stored in the object. +The output is always a `List`, even if the object contains a single assay. + + +``` r +assays(se) +``` + +``` output +List of length 1 +names(1): counts +``` + +The `assayNames()` function returns a character vector of the assay names. +This is most useful when the object contains larger numbers of assays, +as the `assays()` function (see above) may not display all of them. +Knowing the names of the various assays is key to accessing any individual +assay. + + +``` r +assayNames(se) +``` + +``` output +[1] "counts" +``` + +The `assay()` function can be used to retrieve a single assay from the object. +For this, the function should be given the name or the integer position of the +desired assay. +If unspecified, the function automatically returns the first assay in the +object. + + +``` r +head(assay(se, "counts")) +``` + +``` output + sample_1 sample_2 sample_3 sample_4 +gene_1 109 84 91 105 +gene_2 111 97 98 108 +gene_3 89 121 105 99 +gene_4 105 109 122 101 +gene_5 82 97 112 83 +gene_6 89 96 90 116 +``` + +The `colData()` and `rowData()` functions can be used to retrieve +sample metadata and feature metadata, respectively. + + +``` r +colData(se) +``` + +``` output +DataFrame with 4 rows and 2 columns + condition batch + +sample_1 A 1 +sample_2 A 2 +sample_3 B 1 +sample_4 B 2 +``` + + +``` r +rowData(se) +``` + +``` output +DataFrame with 25 rows and 1 column + chromosome + +gene_1 4 +gene_2 4 +gene_3 5 +gene_4 4 +gene_5 5 +...
... +gene_21 5 +gene_22 5 +gene_23 1 +gene_24 4 +gene_25 5 +``` + +Separately, the `$` operator can be used to access a single column of sample +metadata. +A useful feature of this operator is the autocompletion that is triggered +automatically in RStudio or using the tabulation key in terminal applications. + + +``` r +se$batch +``` + +``` output +[1] 1 2 1 2 +``` + +Notably, there is no operator for accessing a single column of feature metadata. +For this, users need to first access the full `DataFrame` returned by +`rowData()` before accessing a column using the standard `$` or `[[` operators, +e.g. + + +``` r +rowData(se)[["chromosome"]] +``` + +``` output + [1] 4 4 5 4 5 1 2 1 3 1 1 5 5 1 3 4 2 5 1 3 5 5 1 4 5 +``` + +### Adding and editing information + +Information can be added to `SummarizedExperiment` objects after their creation. +In fact, this is the basis for workflows that compute normalised assay values -- +adding those to the list of assays -- and +quality control metrics for either features or samples -- adding those to the +`rowData` and `colData` components, as appropriate -- progressively growing +the amount of information stored within the overall object. + +Most of the functions for accessing information, described in the previous +section, have a counterpart function for adding new values or editing existing +ones. +Note that editing is merely the result of adding values under a name already in +use, which has the effect of replacing existing values. + +In the example below, we add an assay named 'logcounts' which is the result +of applying a log-transformation to the 'counts' assay after adding a +pseudocount of one: + + +``` r +assay(se, "logcounts") <- log1p(assay(se, "counts")) +se +``` + +``` output +class: SummarizedExperiment +dim: 25 4 +metadata(0): +assays(2): counts logcounts +rownames(25): gene_1 gene_2 ...
gene_24 gene_25 +rowData names(1): chromosome +colnames(4): sample_1 sample_2 sample_3 sample_4 +colData names(2): condition batch +``` + +In the output above, we see that the object now contains two assays: +the 'counts' assay included in the object when it was first created, +and the 'logcounts' assay added just now. + +Similarly, the `colData()` and `rowData()` functions -- as well as the `$` +operator -- can be used to add and edit values in the corresponding components. + +In the example below, we compute the sum of counts for each sample, +and store the result in the sample metadata table under the new name +'sum_counts'. + + +``` r +colData(se)[["sum_counts"]] <- colSums(assay(se, "counts")) +colData(se) +``` + +``` output +DataFrame with 4 rows and 3 columns + condition batch sum_counts + +sample_1 A 1 2506 +sample_2 A 2 2600 +sample_3 B 1 2550 +sample_4 B 2 2533 +``` + +In this next example, we compute the average count for each feature, +and store the result in the feature metadata table under the new name +'mean_counts'. + + +``` r +rowData(se)[["mean_counts"]] <- rowMeans(assay(se, "counts")) +rowData(se) +``` + +``` output +DataFrame with 25 rows and 2 columns + chromosome mean_counts + +gene_1 4 97.25 +gene_2 4 103.50 +gene_3 5 103.50 +gene_4 4 109.25 +gene_5 5 93.50 +... ... ... +gene_21 5 102.5 +gene_22 5 102.0 +gene_23 1 100.0 +gene_24 4 97.5 +gene_25 5 103.0 +``` + +:::::::::::::::::::::::::::::::::::::::: keypoints + +- The `SummarizedExperiment` class provides a single container for storing both + assay data and metadata. +- Assay data and metadata are kept synchronised through subsetting and + reordering operations. +- A comprehensive set of functions is available to access, add, and edit + information stored in the various components of the `SummarizedExperiment` + objects.
+ +:::::::::::::::::::::::::::::::::::::::::::::::::: +[biocviews]: https://www.bioconductor.org/packages/release/BiocViews.html +[biomart-ensembl]: https://www.ensembl.org/biomart/martview diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 00000000..f19b8049 --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,13 @@ +--- +title: "Contributor Code of Conduct" +--- + +As contributors and maintainers of this project, +we pledge to follow the [The Carpentries Code of Conduct][coc]. + +Instances of abusive, harassing, or otherwise unacceptable behavior +may be reported by following our [reporting guidelines][coc-reporting]. + + +[coc-reporting]: https://docs.carpentries.org/topic_folders/policies/incident-reporting.html +[coc]: https://docs.carpentries.org/topic_folders/policies/code-of-conduct.html diff --git a/LICENSE.md b/LICENSE.md new file mode 100644 index 00000000..b6f5f2ce --- /dev/null +++ b/LICENSE.md @@ -0,0 +1,82 @@ +--- +title: "Licenses" +--- + +## Instructional Material + +All Software Carpentry, Data Carpentry, and Library Carpentry instructional material is +made available under the [Creative Commons Attribution +license][cc-by-human]. The following is a human-readable summary of +(and not a substitute for) the [full legal text of the CC BY 4.0 +license][cc-by-legal]. + +You are free: + +* to **Share**---copy and redistribute the material in any medium or format +* to **Adapt**---remix, transform, and build upon the material + +for any purpose, even commercially. + +The licensor cannot revoke these freedoms as long as you follow the +license terms. + +Under the following terms: + +* **Attribution**---You must give appropriate credit (mentioning that + your work is derived from work that is Copyright © Software + Carpentry and, where practical, linking to + http://software-carpentry.org/), provide a [link to the + license][cc-by-human], and indicate if changes were made. 
You may do + so in any reasonable manner, but not in any way that suggests the + licensor endorses you or your use. + +**No additional restrictions**---You may not apply legal terms or +technological measures that legally restrict others from doing +anything the license permits. With the understanding that: + +Notices: + +* You do not have to comply with the license for elements of the + material in the public domain or where your use is permitted by an + applicable exception or limitation. +* No warranties are given. The license may not give you all of the + permissions necessary for your intended use. For example, other + rights such as publicity, privacy, or moral rights may limit how you + use the material. + +## Software + +Except where otherwise noted, the example programs and other software +provided by Software Carpentry and Data Carpentry are made available under the +[OSI][osi]-approved +[MIT license][mit-license]. + +Permission is hereby granted, free of charge, to any person obtaining +a copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be +included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION +WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +## Trademark + +"Software Carpentry" and "Data Carpentry" and their respective logos +are registered trademarks of [Community Initiatives][ci]. + +[cc-by-human]: https://creativecommons.org/licenses/by/4.0/ +[cc-by-legal]: https://creativecommons.org/licenses/by/4.0/legalcode +[mit-license]: https://opensource.org/licenses/mit-license.html +[ci]: http://communityin.org/ +[osi]: https://opensource.org diff --git a/config.yaml b/config.yaml new file mode 100644 index 00000000..9135522a --- /dev/null +++ b/config.yaml @@ -0,0 +1,87 @@ +#------------------------------------------------------------ +# Values for this lesson. +#------------------------------------------------------------ + +# Which carpentry is this (swc, dc, lc, or cp)? +# swc: Software Carpentry +# dc: Data Carpentry +# lc: Library Carpentry +# cp: Carpentries (to use for instructor training for instance) +# incubator: The Carpentries Incubator +carpentry: 'incubator' + +# Overall title for pages. +title: 'The Bioconductor project' + +# Date the lesson was created (YYYY-MM-DD, this is empty by default) +created: '2020-09-14' + +# Comma-separated list of keywords for the lesson +keywords: 'software, data, lesson, The Carpentries' + +# Life cycle stage of the lesson +# possible values: pre-alpha, alpha, beta, stable +life_cycle: 'pre-alpha' + +# License of the lesson +license: 'CC-BY 4.0' + +# Link to the source repository for this lesson +source: 'https://github.com/carpentries-incubator/bioc-project' + +# Default branch of your lesson +branch: 'main' + +# Who to contact if there are any issues +contact: 'team@carpentries.org' + +# Navigation ------------------------------------------------ +# +# Use the following menu items to specify the order of +# individual pages in each dropdown section. Leave blank to +# include all pages in the folder. 
+# +# Example ------------- +# +# episodes: +# - introduction.md +# - first-steps.md +# +# learners: +# - setup.md +# +# instructors: +# - instructor-notes.md +# +# profiles: +# - one-learner.md +# - another-learner.md + +# Order of episodes in your lesson +episodes: +- 01-setup.Rmd +- 02-introduction-to-bioconductor.Rmd +- 03-installing-bioconductor.Rmd +- 04-getting-help.Rmd +- 05-s4.Rmd +- 06-biological-sequences.Rmd +- 07-genomic-ranges.Rmd +- 08-annotations.Rmd +- 09-summarizedexperiment.Rmd + +# Information for Learners +learners: + +# Information for Instructors +instructors: + +# Learner Profiles +profiles: + +# Customisation --------------------------------------------- +# +# This space below is where custom yaml items (e.g. pinning +# sandpaper and varnish versions) should live + + +url: 'https://carpentries-incubator.github.io/bioc-project' diff --git a/data/TruSeq3-PE-2.fa b/data/TruSeq3-PE-2.fa new file mode 100644 index 00000000..2e385464 --- /dev/null +++ b/data/TruSeq3-PE-2.fa @@ -0,0 +1,12 @@ +>PrefixPE/1 +TACACTCTTTCCCTACACGACGCTCTTCCGATCT +>PrefixPE/2 +GTGACTGGAGTTCAGACGTGTGCTCTTCCGATCT +>PE1 +TACACTCTTTCCCTACACGACGCTCTTCCGATCT +>PE1_rc +AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTA +>PE2 +GTGACTGGAGTTCAGACGTGTGCTCTTCCGATCT +>PE2_rc +AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC diff --git a/data/actb.gtf b/data/actb.gtf new file mode 100644 index 00000000..eb20b76f --- /dev/null +++ b/data/actb.gtf @@ -0,0 +1,270 @@ +##gff-version 2 +##source-version rtracklayer 1.54.0 +##date 2022-04-26 +7 rtracklayer gene 5526409 5563902 . - . gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer transcript 5526409 5530601 . - . transcript_id "ENST00000674681"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer exon 5530542 5530601 . - . transcript_id "ENST00000674681"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer exon 5529535 5529684 . - . transcript_id "ENST00000674681"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer CDS 5529535 5529657 . 
- . transcript_id "ENST00000674681"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer start_codon 5529655 5529657 . - . transcript_id "ENST00000674681"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer exon 5529161 5529400 . - . transcript_id "ENST00000674681"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer CDS 5529161 5529400 . - . transcript_id "ENST00000674681"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer exon 5528281 5528719 . - . transcript_id "ENST00000674681"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer CDS 5528281 5528719 . - . transcript_id "ENST00000674681"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer exon 5528004 5528185 . - . transcript_id "ENST00000674681"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer CDS 5528004 5528185 . - . transcript_id "ENST00000674681"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer exon 5526409 5527891 . - . transcript_id "ENST00000674681"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer CDS 5527751 5527891 . - . transcript_id "ENST00000674681"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer stop_codon 5527748 5527750 . - . transcript_id "ENST00000674681"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer five_prime_utr 5530542 5530601 . - . transcript_id "ENST00000674681"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer five_prime_utr 5529658 5529684 . - . transcript_id "ENST00000674681"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer three_prime_utr 5526409 5527747 . - . transcript_id "ENST00000674681"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer transcript 5527147 5529949 . - . transcript_id "ENST00000642480"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer exon 5529535 5529949 . - . transcript_id "ENST00000642480"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer CDS 5529535 5529657 . - . 
transcript_id "ENST00000642480"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer start_codon 5529655 5529657 . - . transcript_id "ENST00000642480"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer exon 5529161 5529400 . - . transcript_id "ENST00000642480"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer CDS 5529161 5529400 . - . transcript_id "ENST00000642480"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer exon 5528281 5528719 . - . transcript_id "ENST00000642480"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer CDS 5528281 5528719 . - . transcript_id "ENST00000642480"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer exon 5528004 5528185 . - . transcript_id "ENST00000642480"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer CDS 5528004 5528185 . - . transcript_id "ENST00000642480"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer exon 5527147 5527891 . - . transcript_id "ENST00000642480"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer CDS 5527751 5527891 . - . transcript_id "ENST00000642480"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer stop_codon 5527748 5527750 . - . transcript_id "ENST00000642480"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer five_prime_utr 5529658 5529949 . - . transcript_id "ENST00000642480"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer three_prime_utr 5527147 5527747 . - . transcript_id "ENST00000642480"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer transcript 5527147 5530581 . - . transcript_id "ENST00000676397"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer exon 5530524 5530581 . - . transcript_id "ENST00000676397"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer exon 5529535 5529663 . - . transcript_id "ENST00000676397"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer CDS 5529535 5529657 . - . 
transcript_id "ENST00000676397"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer start_codon 5529655 5529657 . - . transcript_id "ENST00000676397"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer exon 5529161 5529400 . - . transcript_id "ENST00000676397"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer CDS 5529161 5529400 . - . transcript_id "ENST00000676397"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer exon 5528281 5528719 . - . transcript_id "ENST00000676397"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer CDS 5528281 5528719 . - . transcript_id "ENST00000676397"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer exon 5527969 5528185 . - . transcript_id "ENST00000676397"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer CDS 5527986 5528185 . - . transcript_id "ENST00000676397"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer stop_codon 5527983 5527985 . - . transcript_id "ENST00000676397"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer exon 5527147 5527867 . - . transcript_id "ENST00000676397"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer five_prime_utr 5530524 5530581 . - . transcript_id "ENST00000676397"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer five_prime_utr 5529658 5529663 . - . transcript_id "ENST00000676397"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer three_prime_utr 5527969 5527982 . - . transcript_id "ENST00000676397"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer three_prime_utr 5527147 5527867 . - . transcript_id "ENST00000676397"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer transcript 5527147 5530604 . - . transcript_id "ENST00000676319"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer exon 5530524 5530604 . - . transcript_id "ENST00000676319"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer exon 5529571 5529663 . - . 
transcript_id "ENST00000676319"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer CDS 5529571 5529657 . - . transcript_id "ENST00000676319"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer start_codon 5529655 5529657 . - . transcript_id "ENST00000676319"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer exon 5527147 5527783 . - . transcript_id "ENST00000676319"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer CDS 5527751 5527783 . - . transcript_id "ENST00000676319"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer stop_codon 5527748 5527750 . - . transcript_id "ENST00000676319"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer five_prime_utr 5530524 5530604 . - . transcript_id "ENST00000676319"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer five_prime_utr 5529658 5529663 . - . transcript_id "ENST00000676319"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer three_prime_utr 5527147 5527747 . - . transcript_id "ENST00000676319"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer transcript 5527147 5530604 . - . transcript_id "ENST00000676189"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer exon 5530524 5530604 . - . transcript_id "ENST00000676189"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer exon 5529535 5529663 . - . transcript_id "ENST00000676189"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer CDS 5529535 5529657 . - . transcript_id "ENST00000676189"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer start_codon 5529655 5529657 . - . transcript_id "ENST00000676189"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer exon 5529150 5529400 . - . transcript_id "ENST00000676189"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer CDS 5529150 5529400 . - . transcript_id "ENST00000676189"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer exon 5528281 5528707 . - . 
transcript_id "ENST00000676189"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer CDS 5528629 5528707 . - . transcript_id "ENST00000676189"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer stop_codon 5528626 5528628 . - . transcript_id "ENST00000676189"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer exon 5528004 5528185 . - . transcript_id "ENST00000676189"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer exon 5527147 5527891 . - . transcript_id "ENST00000676189"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer five_prime_utr 5530524 5530604 . - . transcript_id "ENST00000676189"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer five_prime_utr 5529658 5529663 . - . transcript_id "ENST00000676189"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer three_prime_utr 5528281 5528625 . - . transcript_id "ENST00000676189"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer three_prime_utr 5528004 5528185 . - . transcript_id "ENST00000676189"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer three_prime_utr 5527147 5527891 . - . transcript_id "ENST00000676189"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer transcript 5527147 5530604 . - . transcript_id "ENST00000473257"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer exon 5530524 5530604 . - . transcript_id "ENST00000473257"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer exon 5529161 5529400 . - . transcript_id "ENST00000473257"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer CDS 5529161 5529394 . - . transcript_id "ENST00000473257"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer start_codon 5529392 5529394 . - . transcript_id "ENST00000473257"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer exon 5528281 5528719 . - . transcript_id "ENST00000473257"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer CDS 5528281 5528719 . - . 
transcript_id "ENST00000473257"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer exon 5528004 5528185 . - . transcript_id "ENST00000473257"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer CDS 5528004 5528185 . - . transcript_id "ENST00000473257"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer exon 5527147 5527891 . - . transcript_id "ENST00000473257"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer CDS 5527751 5527891 . - . transcript_id "ENST00000473257"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer stop_codon 5527748 5527750 . - . transcript_id "ENST00000473257"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer five_prime_utr 5530524 5530604 . - . transcript_id "ENST00000473257"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer five_prime_utr 5529395 5529400 . - . transcript_id "ENST00000473257"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer three_prime_utr 5527147 5527747 . - . transcript_id "ENST00000473257"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer transcript 5527148 5530601 . - . transcript_id "ENST00000646664"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer exon 5530524 5530601 . - . transcript_id "ENST00000646664"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer exon 5529535 5529663 . - . transcript_id "ENST00000646664"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer CDS 5529535 5529657 . - . transcript_id "ENST00000646664"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer start_codon 5529655 5529657 . - . transcript_id "ENST00000646664"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer exon 5529161 5529400 . - . transcript_id "ENST00000646664"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer CDS 5529161 5529400 . - . transcript_id "ENST00000646664"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer exon 5528281 5528719 . - . 
transcript_id "ENST00000646664"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer CDS 5528281 5528719 . - . transcript_id "ENST00000646664"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer exon 5528004 5528185 . - . transcript_id "ENST00000646664"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer CDS 5528004 5528185 . - . transcript_id "ENST00000646664"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer exon 5527148 5527891 . - . transcript_id "ENST00000646664"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer CDS 5527751 5527891 . - . transcript_id "ENST00000646664"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer stop_codon 5527748 5527750 . - . transcript_id "ENST00000646664"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer five_prime_utr 5530524 5530601 . - . transcript_id "ENST00000646664"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer five_prime_utr 5529658 5529663 . - . transcript_id "ENST00000646664"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer three_prime_utr 5527148 5527747 . - . transcript_id "ENST00000646664"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer transcript 5527151 5528098 . - . transcript_id "ENST00000464611"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer exon 5528004 5528098 . - . transcript_id "ENST00000464611"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer CDS 5528004 5528098 . - . transcript_id "ENST00000464611"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer exon 5527733 5527891 . - . transcript_id "ENST00000464611"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer CDS 5527751 5527891 . - . transcript_id "ENST00000464611"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer stop_codon 5527748 5527750 . - . transcript_id "ENST00000464611"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer exon 5527151 5527611 . - . 
transcript_id "ENST00000464611"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer three_prime_utr 5527733 5527747 . - . transcript_id "ENST00000464611"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer three_prime_utr 5527151 5527611 . - . transcript_id "ENST00000464611"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer transcript 5527152 5530590 . - . transcript_id "ENST00000477812"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer exon 5530524 5530590 . - . transcript_id "ENST00000477812"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer exon 5529161 5529663 . - . transcript_id "ENST00000477812"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer exon 5528281 5529059 . - . transcript_id "ENST00000477812"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer exon 5528004 5528185 . - . transcript_id "ENST00000477812"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer exon 5527152 5527891 . - . transcript_id "ENST00000477812"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer transcript 5527152 5563902 . - . transcript_id "ENST00000675515"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer exon 5563714 5563902 . - . transcript_id "ENST00000675515"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer exon 5529535 5529663 . - . transcript_id "ENST00000675515"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer CDS 5529535 5529657 . - . transcript_id "ENST00000675515"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer start_codon 5529655 5529657 . - . transcript_id "ENST00000675515"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer exon 5529161 5529400 . - . transcript_id "ENST00000675515"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer CDS 5529161 5529400 . - . transcript_id "ENST00000675515"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer exon 5528281 5528719 . - . 
transcript_id "ENST00000675515"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer CDS 5528281 5528719 . - . transcript_id "ENST00000675515"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer exon 5528004 5528185 . - . transcript_id "ENST00000675515"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer CDS 5528004 5528185 . - . transcript_id "ENST00000675515"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer exon 5527152 5527891 . - . transcript_id "ENST00000675515"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer CDS 5527751 5527891 . - . transcript_id "ENST00000675515"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer stop_codon 5527748 5527750 . - . transcript_id "ENST00000675515"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer five_prime_utr 5563714 5563902 . - . transcript_id "ENST00000675515"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer five_prime_utr 5529658 5529663 . - . transcript_id "ENST00000675515"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer three_prime_utr 5527152 5527747 . - . transcript_id "ENST00000675515"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer transcript 5527156 5530601 . - . transcript_id "ENST00000425660"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer exon 5530524 5530601 . - . transcript_id "ENST00000425660"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer exon 5529535 5529663 . - . transcript_id "ENST00000425660"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer CDS 5529535 5529657 . - . transcript_id "ENST00000425660"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer start_codon 5529655 5529657 . - . transcript_id "ENST00000425660"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer exon 5529161 5529400 . - . transcript_id "ENST00000425660"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer CDS 5529161 5529400 . - . 
transcript_id "ENST00000425660"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer exon 5529019 5529059 . - . transcript_id "ENST00000425660"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer CDS 5529048 5529059 . - . transcript_id "ENST00000425660"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer stop_codon 5529045 5529047 . - . transcript_id "ENST00000425660"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer exon 5528281 5528719 . - . transcript_id "ENST00000425660"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer exon 5528004 5528185 . - . transcript_id "ENST00000425660"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer exon 5527156 5527891 . - . transcript_id "ENST00000425660"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer five_prime_utr 5530524 5530601 . - . transcript_id "ENST00000425660"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer five_prime_utr 5529658 5529663 . - . transcript_id "ENST00000425660"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer three_prime_utr 5529019 5529044 . - . transcript_id "ENST00000425660"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer three_prime_utr 5528281 5528719 . - . transcript_id "ENST00000425660"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer three_prime_utr 5528004 5528185 . - . transcript_id "ENST00000425660"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer three_prime_utr 5527156 5527891 . - . transcript_id "ENST00000425660"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer transcript 5527156 5530601 . - . transcript_id "ENST00000462494"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer exon 5530524 5530601 . - . transcript_id "ENST00000462494"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer exon 5529535 5529663 . - . transcript_id "ENST00000462494"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer exon 5528281 5529400 . - . 
transcript_id "ENST00000462494"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer exon 5528004 5528185 . - . transcript_id "ENST00000462494"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer exon 5527156 5527891 . - . transcript_id "ENST00000462494"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer transcript 5527536 5530602 . - . transcript_id "ENST00000493945"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer exon 5530524 5530602 . - . transcript_id "ENST00000493945"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer exon 5529535 5529733 . - . transcript_id "ENST00000493945"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer CDS 5529535 5529657 . - . transcript_id "ENST00000493945"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer start_codon 5529655 5529657 . - . transcript_id "ENST00000493945"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer exon 5529161 5529400 . - . transcript_id "ENST00000493945"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer CDS 5529161 5529400 . - . transcript_id "ENST00000493945"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer exon 5528281 5528719 . - . transcript_id "ENST00000493945"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer CDS 5528281 5528719 . - . transcript_id "ENST00000493945"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer exon 5528004 5528185 . - . transcript_id "ENST00000493945"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer CDS 5528004 5528185 . - . transcript_id "ENST00000493945"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer exon 5527536 5527891 . - . transcript_id "ENST00000493945"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer CDS 5527751 5527891 . - . transcript_id "ENST00000493945"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer stop_codon 5527748 5527750 . - . 
transcript_id "ENST00000493945"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer five_prime_utr 5530524 5530602 . - . transcript_id "ENST00000493945"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer five_prime_utr 5529658 5529733 . - . transcript_id "ENST00000493945"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer three_prime_utr 5527536 5527747 . - . transcript_id "ENST00000493945"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer transcript 5527798 5530627 . - . transcript_id "ENST00000432588"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer exon 5530524 5530627 . - . transcript_id "ENST00000432588"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer exon 5529535 5529663 . - . transcript_id "ENST00000432588"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer CDS 5529535 5529657 . - . transcript_id "ENST00000432588"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer start_codon 5529655 5529657 . - . transcript_id "ENST00000432588"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer exon 5529161 5529400 . - . transcript_id "ENST00000432588"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer CDS 5529161 5529400 . - . transcript_id "ENST00000432588"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer exon 5528281 5528719 . - . transcript_id "ENST00000432588"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer CDS 5528281 5528719 . - . transcript_id "ENST00000432588"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer exon 5527798 5528185 . - . transcript_id "ENST00000432588"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer CDS 5527986 5528185 . - . transcript_id "ENST00000432588"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer stop_codon 5527983 5527985 . - . transcript_id "ENST00000432588"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer five_prime_utr 5530524 5530627 . - . 
transcript_id "ENST00000432588"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer five_prime_utr 5529658 5529663 . - . transcript_id "ENST00000432588"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer three_prime_utr 5527798 5527982 . - . transcript_id "ENST00000432588"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer transcript 5528011 5530597 . - . transcript_id "ENST00000645576"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer exon 5530524 5530597 . - . transcript_id "ENST00000645576"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer exon 5529535 5529663 . - . transcript_id "ENST00000645576"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer CDS 5529535 5529657 . - . transcript_id "ENST00000645576"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer start_codon 5529655 5529657 . - . transcript_id "ENST00000645576"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer exon 5529161 5529400 . - . transcript_id "ENST00000645576"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer CDS 5529161 5529400 . - . transcript_id "ENST00000645576"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer exon 5528281 5528671 . - . transcript_id "ENST00000645576"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer CDS 5528281 5528671 . - . transcript_id "ENST00000645576"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer exon 5528011 5528185 . - . transcript_id "ENST00000645576"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer CDS 5528011 5528185 . - . transcript_id "ENST00000645576"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer five_prime_utr 5530524 5530597 . - . transcript_id "ENST00000645576"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer five_prime_utr 5529658 5529663 . - . transcript_id "ENST00000645576"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer transcript 5528150 5530604 . - . 
transcript_id "ENST00000647275"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer exon 5530524 5530604 . - . transcript_id "ENST00000647275"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer exon 5528281 5528719 . - . transcript_id "ENST00000647275"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer CDS 5528281 5528716 . - . transcript_id "ENST00000647275"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer start_codon 5528714 5528716 . - . transcript_id "ENST00000647275"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer exon 5528150 5528185 . - . transcript_id "ENST00000647275"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer CDS 5528150 5528185 . - . transcript_id "ENST00000647275"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer five_prime_utr 5530524 5530604 . - . transcript_id "ENST00000647275"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer five_prime_utr 5528717 5528719 . - . transcript_id "ENST00000647275"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer transcript 5528717 5530601 . - . transcript_id "ENST00000484841"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer exon 5530524 5530601 . - . transcript_id "ENST00000484841"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer exon 5529535 5529733 . - . transcript_id "ENST00000484841"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer exon 5529161 5529400 . - . transcript_id "ENST00000484841"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer exon 5529019 5529059 . - . transcript_id "ENST00000484841"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer exon 5528717 5528719 . - . transcript_id "ENST00000484841"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer transcript 5528934 5530600 . - . transcript_id "ENST00000645025"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer exon 5530524 5530600 . - . 
transcript_id "ENST00000645025"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer exon 5529535 5529663 . - . transcript_id "ENST00000645025"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer exon 5529161 5529400 . - . transcript_id "ENST00000645025"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer exon 5528934 5529059 . - . transcript_id "ENST00000645025"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer transcript 5529067 5530583 . - . transcript_id "ENST00000480301"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer exon 5530524 5530583 . - . transcript_id "ENST00000480301"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer exon 5529067 5529663 . - . transcript_id "ENST00000480301"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer transcript 5529216 5562828 . - . transcript_id "ENST00000443528"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer exon 5562574 5562828 . - . transcript_id "ENST00000443528"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer exon 5529535 5529663 . - . transcript_id "ENST00000443528"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer CDS 5529535 5529657 . - . transcript_id "ENST00000443528"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer start_codon 5529655 5529657 . - . transcript_id "ENST00000443528"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer exon 5529216 5529400 . - . transcript_id "ENST00000443528"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer CDS 5529216 5529400 . - . transcript_id "ENST00000443528"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer five_prime_utr 5562574 5562828 . - . transcript_id "ENST00000443528"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer five_prime_utr 5529658 5529663 . - . transcript_id "ENST00000443528"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer transcript 5529235 5529982 . - . 
transcript_id "ENST00000417101"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer exon 5529806 5529982 . - . transcript_id "ENST00000417101"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer exon 5529535 5529663 . - . transcript_id "ENST00000417101"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer CDS 5529535 5529657 . - . transcript_id "ENST00000417101"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer start_codon 5529655 5529657 . - . transcript_id "ENST00000417101"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer exon 5529235 5529400 . - . transcript_id "ENST00000417101"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer CDS 5529235 5529400 . - . transcript_id "ENST00000417101"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer five_prime_utr 5529806 5529982 . - . transcript_id "ENST00000417101"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer five_prime_utr 5529658 5529663 . - . transcript_id "ENST00000417101"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer transcript 5529282 5562790 . - . transcript_id "ENST00000414620"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer exon 5562574 5562790 . - . transcript_id "ENST00000414620"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer exon 5540676 5540771 . - . transcript_id "ENST00000414620"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer exon 5529535 5529663 . - . transcript_id "ENST00000414620"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer CDS 5529535 5529657 . - . transcript_id "ENST00000414620"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer start_codon 5529655 5529657 . - . transcript_id "ENST00000414620"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer exon 5529282 5529400 . - . transcript_id "ENST00000414620"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer CDS 5529282 5529400 . - . 
transcript_id "ENST00000414620"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer five_prime_utr 5562574 5562790 . - . transcript_id "ENST00000414620"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer five_prime_utr 5540676 5540771 . - . transcript_id "ENST00000414620"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer five_prime_utr 5529658 5529663 . - . transcript_id "ENST00000414620"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer transcript 5561852 5562716 . - . transcript_id "ENST00000646584"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer exon 5562390 5562716 . - . transcript_id "ENST00000646584"; gene_id "ENSG00000075624"; gene_name "ACTB" +7 rtracklayer exon 5561852 5561949 . - . transcript_id "ENST00000646584"; gene_id "ENSG00000075624"; gene_name "ACTB" diff --git a/data/actb_orfs.fasta b/data/actb_orfs.fasta new file mode 100644 index 00000000..f96380f6 --- /dev/null +++ b/data/actb_orfs.fasta @@ -0,0 +1,66 @@ +>gi|1519311456|ref|NM_001101.5|:c224-3 ORF12 CDS +ATGCCCACCATCACGCCCTGGTGCCTGGGGCGCCCCACGATGGAGGGGAAGACGGCCCGGGGGGCATCGT +CGCCCGCGAAGCCGGCCTTGCACATGCCGGAGCCGTTGTCGACGACGAGCGCGGCGATATCATCATCCAT +GGTGAGCTGGCGGCGGGTGTGGACGGGCGGCGGATCGGCAAAGGCGAGGCTCTGTGCTCGCGGGGCGGAC +GCGGTCTCGGCG +>gi|1519311456|ref|NM_001101.5|:85-1212 ORF1 CDS +ATGGATGATGATATCGCCGCGCTCGTCGTCGACAACGGCTCCGGCATGTGCAAGGCCGGCTTCGCGGGCG +ACGATGCCCCCCGGGCCGTCTTCCCCTCCATCGTGGGGCGCCCCAGGCACCAGGGCGTGATGGTGGGCAT +GGGTCAGAAGGATTCCTATGTGGGCGACGAGGCCCAGAGCAAGAGAGGCATCCTCACCCTGAAGTACCCC +ATCGAGCACGGCATCGTCACCAACTGGGACGACATGGAGAAAATCTGGCACCACACCTTCTACAATGAGC +TGCGTGTGGCTCCCGAGGAGCACCCCGTGCTGCTGACCGAGGCCCCCCTGAACCCCAAGGCCAACCGCGA +GAAGATGACCCAGATCATGTTTGAGACCTTCAACACCCCAGCCATGTACGTTGCTATCCAGGCTGTGCTA +TCCCTGTACGCCTCTGGCCGTACCACTGGCATCGTGATGGACTCCGGTGACGGGGTCACCCACACTGTGC +CCATCTACGAGGGGTATGCCCTCCCCCATGCCATCCTGCGTCTGGACCTGGCTGGCCGGGACCTGACTGA +CTACCTCATGAAGATCCTCACCGAGCGCGGCTACAGCTTCACCACCACGGCCGAGCGGGAAATCGTGCGT 
+GACATTAAGGAGAAGCTGTGCTACGTCGCCCTGGACTTCGAGCAAGAGATGGCCACGGCTGCTTCCAGCT +CCTCCCTGGAGAAGAGCTACGAGCTGCCTGACGGCCAGGTCATCACCATTGGCAATGAGCGGTTCCGCTG +CCCTGAGGCACTCTTCCAGCCTTCCTTCCTGGGCATGGAGTCCTGTGGCATCCACGAAACTACCTTCAAC +TCCATCATGAAGTGTGACGTGGACATCCGCAAAGACCTGTACGCCAACACAGTGCTGTCTGGCGGCACCA +CCATGTACCCTGGCATTGCCGACAGGATGCAGAAGGAGATCACTGCCCTGGCACCCAGCACAATGAAGAT +CAAGATCATTGCTCCTCCTGAGCGCAAGTACTCCGTGTGGATCGGCGGCTCCATCCTGGCCTCGCTGTCC +ACCTTCCAGCAGATGTGGATCAGCAAGCAGGAGTATGACGAGTCCGGCCCCTCCATCGTCCACCGCAAAT +GCTTCTAG +>gi|1519311456|ref|NM_001101.5|:89-214 ORF4 CDS +ATGATGATATCGCCGCGCTCGTCGTCGACAACGGCTCCGGCATGTGCAAGGCCGGCTTCGCGGGCGACGA +TGCCCCCCGGGCCGTCTTCCCCTCCATCGTGGGGCGCCCCAGGCACCAGGGCGTGA +>gi|1519311456|ref|NM_001101.5|:c329-240 ORF11 CDS +ATGTCGTCCCAGTTGGTGACGATGCCGTGCTCGATGGGGTACTTCAGGGTGAGGATGCCTCTCTTGCTCT +GGGCCTCGTCGCCCACATAG +>gi|1519311456|ref|NM_001101.5|:c578-354 ORF10 CDS +ATGGGCACAGTGTGGGTGACCCCGTCACCGGAGTCCATCACGATGCCAGTGGTACGGCCAGAGGCGTACA +GGGATAGCACAGCCTGGATAGCAACGTACATGGCTGGGGTGTTGAAGGTCTCAAACATGATCTGGGTCAT +CTTCTCGCGGTTGGCCTTGGGGTTCAGGGGGGCCTCGGTCAGCAGCACGGGGTGCTCCTCGGGAGCCACA +CGCAGCTCATTGTAG +>gi|1519311456|ref|NM_001101.5|:c603-502 ORF8 CDS +ATGGGGGAGGGCATACCCCTCGTAGATGGGCACAGTGTGGGTGACCCCGTCACCGGAGTCCATCACGATG +CCAGTGGTACGGCCAGAGGCGTACAGGGATAG +>gi|1519311456|ref|NM_001101.5|:c1190-801 ORF9 CDS +ATGGAGGGGCCGGACTCGTCATACTCCTGCTTGCTGATCCACATCTGCTGGAAGGTGGACAGCGAGGCCA +GGATGGAGCCGCCGATCCACACGGAGTACTTGCGCTCAGGAGGAGCAATGATCTTGATCTTCATTGTGCT +GGGTGCCAGGGCAGTGATCTCCTTCTGCATCCTGTCGGCAATGCCAGGGTACATGGTGGTGCCGCCAGAC +AGCACTGTGTTGGCGTACAGGTCTTTGCGGATGTCCACGTCACACTTCATGATGGAGTTGAAGGTAGTTT +CGTGGATGCCACAGGACTCCATGCCCAGGAAGGAAGGCTGGAAGAGTGCCTCAGGGCAGCGGAACCGCTC +ATTGCCAATGGTGATGACCTGGCCGTCAGGCAGCTCGTAG +>gi|1519311456|ref|NM_001101.5|:839-934 ORF5 CDS +ATGAGCGGTTCCGCTGCCCTGAGGCACTCTTCCAGCCTTCCTTCCTGGGCATGGAGTCCTGTGGCATCCA +CGAAACTACCTTCAACTCCATCATGA +>gi|1519311456|ref|NM_001101.5|:1277-1618 ORF6 CDS 
+ATGAGATTGGCATGGCTTTATTTGTTTTTTTTGTTTTGTTTTGGTTTTTTTTTTTTTTTTGGCTTGACTC +AGGATTTAAAAACTGGAACGGTGAAGGTGACAGCAGTCGGTTGGAGCGAGCATCCCCCAAAGTTCACAAT +GTGGCCGAGGACTTTGATTGCACATTGTTGTTTTTTTAATAGTCATTCCAAATATGAGATGCGTTGTTAC +AGGAAGTCCCTTGCCATCCTAAAAGCCACCCCACTTCTCTCTAAGGAGAATGGCCCAGTCCTCTCCCAAG +TCCACACAGGGGAGGTGATAGCATTGCTTTCGTGTAAATTATGTAATGCAAAATTTTTTTAA +>gi|1519311456|ref|NM_001101.5|:1288-1455 ORF2 CDS +ATGGCTTTATTTGTTTTTTTTGTTTTGTTTTGGTTTTTTTTTTTTTTTTGGCTTGACTCAGGATTTAAAA +ACTGGAACGGTGAAGGTGACAGCAGTCGGTTGGAGCGAGCATCCCCCAAAGTTCACAATGTGGCCGAGGA +CTTTGATTGCACATTGTTGTTTTTTTAA +>gi|1519311456|ref|NM_001101.5|:c1462-1352 ORF13 CDS +ATGACTATTAAAAAAACAACAATGTGCAATCAAAGTCCTCGGCCACATTGTGAACTTTGGGGGATGCTCG +CTCCAACCGACTGCTGTCACCTTCACCGTTCCAGTTTTTAA +>gi|1519311456|ref|NM_001101.5|:1602-1706 ORF7 CDS +ATGCAAAATTTTTTTAATCTTCGCCTTAATACTTTTTTATTTTGTTTTATTTTGAATGATGAGCCTTCGT +GCCCCCCCTTCCCCCTTTTTTGTCCCCCAACTTGA +>gi|1519311456|ref|NM_001101.5|:1657-1791 ORF3 CDS +ATGATGAGCCTTCGTGCCCCCCCTTCCCCCTTTTTTGTCCCCCAACTTGAGATGTATGAAGGCTTTTGGT +CTCCCTGGGAGTGGGTGGAGGCAGCCAGGGCTTACCTGTACACTGACTTGAGACCAGTTGAATAA diff --git a/data/counts.csv b/data/counts.csv new file mode 100644 index 00000000..2ba0e396 --- /dev/null +++ b/data/counts.csv @@ -0,0 +1,26 @@ +feature,sample_1,sample_2,sample_3,sample_4 +gene_1,109,84,91,105 +gene_2,111,97,98,108 +gene_3,89,121,105,99 +gene_4,105,109,122,101 +gene_5,82,97,112,83 +gene_6,89,96,90,116 +gene_7,121,95,88,106 +gene_8,101,101,86,103 +gene_9,91,119,89,87 +gene_10,81,111,81,118 +gene_11,93,118,93,99 +gene_12,103,111,116,103 +gene_13,89,126,103,100 +gene_14,101,107,111,79 +gene_15,96,91,103,108 +gene_16,110,102,128,103 +gene_17,95,106,118,100 +gene_18,99,115,114,102 +gene_19,114,105,94,118 +gene_20,110,88,99,102 +gene_21,116,95,94,105 +gene_22,114,96,107,91 +gene_23,97,120,93,90 +gene_24,91,84,118,97 +gene_25,99,106,97,110 diff --git a/data/gene_metadata.csv b/data/gene_metadata.csv new file mode 100644 index 00000000..fac7206f --- /dev/null +++ 
b/data/gene_metadata.csv @@ -0,0 +1,26 @@ +id,chromosome +gene_1,4 +gene_2,4 +gene_3,5 +gene_4,4 +gene_5,5 +gene_6,1 +gene_7,2 +gene_8,1 +gene_9,3 +gene_10,1 +gene_11,1 +gene_12,5 +gene_13,5 +gene_14,1 +gene_15,3 +gene_16,4 +gene_17,2 +gene_18,5 +gene_19,1 +gene_20,3 +gene_21,5 +gene_22,5 +gene_23,1 +gene_24,4 +gene_25,5 diff --git a/data/sample_metadata.csv b/data/sample_metadata.csv new file mode 100644 index 00000000..3fc1faea --- /dev/null +++ b/data/sample_metadata.csv @@ -0,0 +1,5 @@ +id,condition,batch +sample_1,A,1 +sample_2,A,2 +sample_3,B,1 +sample_4,B,2 diff --git a/discuss.md b/discuss.md new file mode 100644 index 00000000..73d642bc --- /dev/null +++ b/discuss.md @@ -0,0 +1,108 @@ +--- +title: Discussion +--- + +## Bioconductor and CRAN + +[Bioconductor][bioc-website] and [CRAN][cran-website] are two well-established repositories of R packages that are often compared in terms of practices with regard to package submission, review, and release cycles. + +R packages can be obtained and installed from a number of repositories online, CRAN and Bioconductor being the two most prominent and well-established repositories. +Repositories often have different guidelines and procedures to manage and check the quality of packages that are distributed on their platforms. +This can be a source of considerable confusion for users, when searching for new packages and assessing their popularity and reliability before investing time in using them. + +CRAN tends to host more general-purpose R packages, hosting more than 17,854 packages (July 2021; [source][cran-packages]). +It is also the oldest and main repository of R packages, established in 1997, and initially hosting 12 packages ([reference][cran-first-release]). + +Rather, Bioconductor focuses on R packages related to bioinformatics tasks and workflows. 
+Bioconductor distinguishes between different types of packages, including 2,042 software packages, 406 experiment data packages, 965 annotation packages, and 29 workflows (July 2021; [source][bioc-packages]). + +Both repositories implement an initial review process for new package submissions. +However, the two review processes operate in noticeably different manners, including in the way packages are evaluated after acceptance and deprecated when they consistently fail regular automated checks without action from package maintainers. +In particular, the release cycle of the Bioconductor project creates natural opportunities to regularly identify packages that consistently fail automated checks, triggering their deprecation during the following release cycle, and removal in the subsequent one. + +Packages from Bioconductor and CRAN can be installed and used side-by-side. +They are all R packages and can often interoperate on standard R structures (e.g., `vector`, `matrix`, `data.frame`). +However, one common issue and challenge stems from the different implementations of S3 and S4 class and generic systems. +For instance, the `r BiocStyle::Biocpkg("AnnotationDbi")` and `r BiocStyle::CRANpkg("dplyr")` packages implement two versions of the `select()` method, using the S4 and S3 systems, respectively. +When both packages are attached to the same R session, the `select()` method defined by the first package attached can only be accessed using the full syntax `package::function()` (e.g., `AnnotationDbi::select()`), as the method implemented by the package attached last masks any other definition of the method in the active R session. +To date, there is no general solution to this issue; users must learn from experience to circumvent it by using the full syntax to access individual methods explicitly. + +The Bioconductor project labels and classifies packages using [biocViews][biocviews-site]. 
+This controlled vocabulary provides a convenient solution to effectively browse and filter thematically-related packages. +The CRAN repository does not provide any comparable functionality, instead relying on search engines and word-of-mouth to identify packages suitable for specific tasks. + +## Bioconductor package versions + +### Format + +Bioconductor packages use the standard format `MAJOR.MINOR.PATCH` to version packages (e.g., `1.13.2`). +The version number is stored in the `DESCRIPTION` file that is part of every R package. +Each of `MAJOR`, `MINOR`, and `PATCH` is an integer that is incremented to mark a new release of the corresponding package, following different rules for each field. + +### Major version + +Candidate packages in development should set the `MAJOR` field to `0` while the package is submitted for review. +When the package is accepted, `MAJOR` is automatically incremented by `1`, to mark its entry into the Bioconductor repository. + +Following acceptance, `MAJOR` often remains the same for the lifetime of the package. +It should only be incremented by package developers to mark breaking changes that require the full attention of users who may need to update their workflow accordingly. + +However, developers should never update the `MAJOR` field itself. +Instead, they should set `MINOR` to `99`. +During the preparation of the next Bioconductor release, this will automatically trigger an increment of `MAJOR` by `1` and reset `MINOR` to `0`. +For instance, a package at version `0.99.15` during the Bioconductor review process will appear at version `1.0.0` when accepted and added to the next release of the Bioconductor package repository. + +### Minor version + +The `MINOR` field is automatically updated every 6 months as part of the Bioconductor release process to mark the version of each package that will feature in the upcoming release. 
+Simultaneously, when the `MINOR` field is incremented for a new release cycle, the `PATCH` field is reset to `0`. +For instance, if a package was at version `1.3.5` for Bioconductor release `3.13`, it would be incremented to version `1.4.0` at the start of Bioconductor release `3.14`. + +As described above, developers can set `MINOR` directly to `99` - skipping all values in-between - if they wish to trigger an increment of `MAJOR` for their package in the next release of Bioconductor. + +### Patch version + +The `PATCH` field is the field that package developers use most frequently to release updates within a release cycle. +Importantly, updates to packages are not deployed to users until the package version is incremented. +This is crucial to ensure that users cannot end up with installations of a package that share the same version number but contain different source code. +For instance, a package at version `1.3.5` would be incremented to version `1.3.6` to make a new version of the package available to users on the Bioconductor repository. + +## The Bioconductor release cycle + +### Release branches + +Bioconductor uses the [Git][git-website] version control system to manage its package repository. +For each new Bioconductor release (i.e., version), a new branch is created in the [Bioconductor Git repository][git-bioconductor]; those are referred to as *release* branches. +Release branches allow users to install stable versions of packages that were tested together for a given version of Bioconductor, itself earmarked for a specific version of R. + +Work on the *release* branches is restricted. +Older *release* branches are entirely frozen, meaning that no further update is allowed on those branches. +When users request a package for a given version of Bioconductor, they receive the latest version of the package on the corresponding release branch. + +Only the latest release branch allows updates from package maintainers, but those are restricted to critical bug fixes. 
+This means that for each 6-month release cycle, users can expect packages on the latest release branch to be reasonably stable. + +### Devel branches + +Meanwhile, the main branch of the Git repository (historically called `master`) is referred to as the *devel* branch. + +The *devel* branch allows developers to continue updating the packages as frequently as they wish, without affecting users or disrupting workflows. +Typically, packages on the *devel* branch are mainly used by other developers and the Bioconductor build system, to run tests using the latest code of every package in the Bioconductor repository, and to prepare the next stable release of the project. +However, users can also access packages on the *devel* branch using `BiocManager::install(version = ...)` with `version` set to one minor version greater than the latest Bioconductor *release* version (e.g., if the latest release is `3.13`, then devel is `3.14`). + +### Transition between devel and release - the release process + +After a new release branch is created, the version of every single package on the *devel* branch is incremented, to prepare the version of the package that will feature in the next Bioconductor stable release. +This includes the `r BiocStyle::Biocpkg("BiocVersion")` package, which marks the value of the next version of Bioconductor. 
+ + + +[bioc-website]: https://bioconductor.org +[cran-website]: https://cran.r-project.org +[cran-packages]: https://cran.r-project.org/web/packages/index.html +[cran-first-release]: https://stat.ethz.ch/pipermail/r-announce/1997/000001.html +[bioc-packages]: https://bioconductor.org/news/bioc_3_13_release/ +[biocviews-site]: https://www.bioconductor.org/packages/release/BiocViews.html + + + diff --git a/download_data.R b/download_data.R new file mode 100644 index 00000000..64d07483 --- /dev/null +++ b/download_data.R @@ -0,0 +1,26 @@ +dir.create("data", showWarnings = FALSE) + +download.file( + url = "https://raw.githubusercontent.com/Bioconductor/bioconductor-teaching/master/data/TrimmomaticAdapters/TruSeq3-PE-2.fa", + destfile = "data/TruSeq3-PE-2.fa" +) +download.file( + url = "https://raw.githubusercontent.com/Bioconductor/bioconductor-teaching/master/data/ActbGtf/actb.gtf", + destfile = "data/actb.gtf" +) +download.file( + url = "https://raw.githubusercontent.com/Bioconductor/bioconductor-teaching/master/data/ActbOrf/actb_orfs.fasta", + destfile = "data/actb_orfs.fasta" +) +download.file( + url = "https://raw.githubusercontent.com/Bioconductor/bioconductor-teaching/devel/data/SummarizedExperiment/counts.csv", + destfile = "data/counts.csv" +) +download.file( + url = "https://raw.githubusercontent.com/Bioconductor/bioconductor-teaching/devel/data/SummarizedExperiment/gene_metadata.csv", + destfile = "data/gene_metadata.csv" +) +download.file( + url = "https://raw.githubusercontent.com/Bioconductor/bioconductor-teaching/devel/data/SummarizedExperiment/sample_metadata.csv", + destfile = "data/sample_metadata.csv" +) diff --git a/fig/bioc-genomicranges.svg b/fig/bioc-genomicranges.svg new file mode 100644 index 00000000..a752f673 --- /dev/null +++ b/fig/bioc-genomicranges.svg @@ -0,0 +1,196 @@ + + + + + + Gene model + + + + + + + + + + + + + + + + + + + + + + + + + + + Unsplicedtranscripts + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
Gene region + + + + + + + + + + + + + Disjoint bins + + + + + + + + + + + + + + Promoter + + + + + + + + + + + + + + + + + + + + + + + + + + + Introns + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/fig/bioc-install.svg b/fig/bioc-install.svg new file mode 100644 index 00000000..c679e864 --- /dev/null +++ b/fig/bioc-install.svg @@ -0,0 +1,104 @@ + + + + + +utils::install.packages() +CRAN + + + + + + + + + + + + + + + + + + + + + + + + +BiocManager +... +... + + +BiocManager::install() +CRAN +BiocManager +... +... + +Bioconductor +Biobase +... +... + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/fig/bioc-release-cycle.svg b/fig/bioc-release-cycle.svg new file mode 100644 index 00000000..5acc4a52 --- /dev/null +++ b/fig/bioc-release-cycle.svg @@ -0,0 +1,64 @@ + + + + + + + + + + + +R + + + +April 28, +2020 +October 28, +2020 +Bioconductor + + +4.0 +3.11 +3.12 + +4.1 + + +3.13 +... +May 20, +2021 +... +2021 +April +2020 +May +2021 +... +2022 + +... + + + + + + diff --git a/fig/bioc-s4.svg b/fig/bioc-s4.svg new file mode 100644 index 00000000..1259db1e --- /dev/null +++ b/fig/bioc-s4.svg @@ -0,0 +1,94 @@ + + + + + + + + + + +S4Class1 +SlotName1 +SlotName2 +SlotType1 +SlotType2 + + +S4Class2 +SlotName1 +SlotName2 +SlotType1 +SlotType2 + + +contains: S4Class1 +SlotName3 +SlotType3 + + + + + + + + +inheritance + +Validity rules for S4Class1 + +Validity rules for S4Class1 +Validity rules for S4Class2 + +S4Generic1() +function(x, y, ...) + + +signature:- x + + + + + + + + +method dispatch + +Method 1 +function(x, y, ...) { # action 1} + + +Method 2 + + + + + + + + + + +> S4Generic1(x = object1) + +signature:- x: S4Class1 +function(x, y, ...) { # action 2} + + +signature:- x: S4Class2 +> object1 <- S4Class1(...) +> object2 <- S4Class2(...) 
+ +> S4Generic1(x = object2) + diff --git a/fig/bioc-sequencing-ecosystem.svg b/fig/bioc-sequencing-ecosystem.svg new file mode 100644 index 00000000..041e4e06 --- /dev/null +++ b/fig/bioc-sequencing-ecosystem.svg @@ -0,0 +1,210 @@ + + + + + + +Sequencing +Alignment +Reduction + + + FASTQ + + + + BAM + + + + ShortReads, Biostrings, qrqc, ... + + + + Rsamtools, GenomicAlignments, ... + + + + Differential expression(genes, transcripts) + + + + edgeR, DESeq2,DEXSeq, SGSeq, ... + + + + Peaks (.bed, .wig) + + + + rtracklayer + + + + Annotation;Differential binding + + + + ChIPpeakAnno, DiffBind,ChIPseeker, csaw, ... + + + + Variants (.vcf) + + + + VariantAnnotation,VariantTools, h5vc,... + + + + Effect predictions; GWAS + + + + ChIPpeakAnno, DiffBind,ChIPseeker, csaw, ... + +... +... + + + IRanges,GenomicRanges,GenomicAlignments,... + + + + AnnotationDbi,GenomicFeatures,org.*, TxDb*,biomaRt, PSICQUIC,KEGGREST, ... + + + + Gviz, ggbio, epivisr,rtracklayer, SRAdb, ... + + + + + + + + + + + + + + + + + + +Analysis +Integration & Visualization + + + + + + + + + + + + + + + + + + + Counts (.csv) + + + + + + + + + + + +... +... + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Base R + + GenomicAlignments,Rsubread, ... 
+ + diff --git a/fig/bioc-support-site.png b/fig/bioc-support-site.png new file mode 100644 index 00000000..17697ec4 Binary files /dev/null and b/fig/bioc-support-site.png differ diff --git a/fig/bioc-timeline.svg b/fig/bioc-timeline.svg new file mode 100644 index 00000000..64e012f3 --- /dev/null +++ b/fig/bioc-timeline.svg @@ -0,0 +1,104 @@ + + + + + + + + + 2000 + + + + 2020 + + + + 2010 + + + + 2030 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +First commit2001 +Microarrays +RNA-sequencing +DNA-sequencing +Single-cell RNA-sequencing +Single-cell multimodal omics +GenomicRanges +Biobase +BiocGenerics +rtracklayer +VariantAnnotation +Rsamtools +Biostrings +ShortRead +Spectra +MSnbase +SummarizedExperiment +SingleCellExperiment +RaggedExperiment +GSEABase +MultiAssayExperiment +BiocSet +First conference? +First workshop? +First package? +Website? +Support website? +Slack workspace? +Scientific Advisory Board? +Technical Advisory Board? +Community Advisory Board2020 + + + + +First sticker? + diff --git a/fig/dataframe-mcols.svg b/fig/dataframe-mcols.svg new file mode 100644 index 00000000..24943af9 --- /dev/null +++ b/fig/dataframe-mcols.svg @@ -0,0 +1,43 @@ + + + + + + + + + + + + + + + +A + +B + +A + +B + +meta1 + + + +meta2 +> DF = DataFrame( A = ..., B =..., row.names = c("A", "B")) +> mcols(DF) = DataFrame( meta1 = ..., meta2 =...) + + + diff --git a/fig/genomic-intervals.svg b/fig/genomic-intervals.svg new file mode 100644 index 00000000..68f1a9aa --- /dev/null +++ b/fig/genomic-intervals.svg @@ -0,0 +1,94 @@ + + + + + + + + + + + + +...ATCGCTCGCTCGCTCGCTGCGGCTCTCTATAAATAATCTCTATATAGATATATAAAAGTAG... 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +chr1 +exon1 +exon2 +exon3 +exon4 +exon6 +exon5 +e9 +exon8 +exon7 +transcript1 +transcript2 +transcript3 +transcript4 +gene1 +gene2 + + + diff --git a/fig/intervals.svg b/fig/intervals.svg new file mode 100644 index 00000000..94a51d59 --- /dev/null +++ b/fig/intervals.svg @@ -0,0 +1,54 @@ + + + + + + + + + + + + + + + + + + + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 10 + 8 + 9 + 10 + + + A + + + + B + + + + C + + ... + width = 2 + width = 3 + width = 4 + + diff --git a/fig/rle.svg b/fig/rle.svg new file mode 100644 index 00000000..31015742 --- /dev/null +++ b/fig/rle.svg @@ -0,0 +1,136 @@ + + + + +Run-length encoding + + + + + + + + + + + A + + T + + G + + C + + N + + N + + N + + N + + N + + N + + N + + N + + A + + A + + G + + C + + N + + N + + N + + N + + N + + N + + N + + N + + T + + G + + C + + N + + N + + + + A + + T + + G + + C + + N + + A + + G + + C + + N + + 1 + + 1 + + 1 + + 1 + + 8 + + 2 + + 1 + + 1 + + 10 + + T + + G + + C + + 1 + + 1 + + 1 + + diff --git a/fig/summarizedexperiment.svg b/fig/summarizedexperiment.svg new file mode 100644 index 00000000..c3e62e2b --- /dev/null +++ b/fig/summarizedexperiment.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/files/bibliography.bib b/files/bibliography.bib new file mode 100644 index 00000000..f0272cd8 --- /dev/null +++ b/files/bibliography.bib @@ -0,0 +1,55 @@ +@article{Amezquita2020, + author = {Amezquita, R. A. and Lun, A. T. L. and Becht, E. and Carey, V. J. and Carpp, L. N. and Geistlinger, L. and Marini, F. and Rue-Albrecht, K. and Risso, D. and Soneson, C. and Waldron, L. and Pages, H. and Smith, M. L. and Huber, W. and Morgan, M. and Gottardo, R. and Hicks, S. 
C.}, + title = {Orchestrating single-cell analysis with Bioconductor}, + journal = {Nat Methods}, + volume = {17}, + number = {2}, + pages = {137-145}, + ISSN = {1548-7105 (Electronic) +1548-7091 (Linking)}, + DOI = {10.1038/s41592-019-0654-x}, + url = {https://www.ncbi.nlm.nih.gov/pubmed/31792435}, + year = {2020}, + type = {Journal Article} +} + +@article{Gentleman2004, + author = {Gentleman, R. C. and Carey, V. J. and Bates, D. M. and Bolstad, B. and Dettling, M. and Dudoit, S. and Ellis, B. and Gautier, L. and Ge, Y. and Gentry, J. and Hornik, K. and Hothorn, T. and Huber, W. and Iacus, S. and Irizarry, R. and Leisch, F. and Li, C. and Maechler, M. and Rossini, A. J. and Sawitzki, G. and Smith, C. and Smyth, G. and Tierney, L. and Yang, J. Y. and Zhang, J.}, + title = {Bioconductor: open software development for computational biology and bioinformatics}, + journal = {Genome Biol}, + volume = {5}, + number = {10}, + pages = {R80}, + ISSN = {1474-760X (Electronic) +1474-7596 (Linking)}, + DOI = {10.1186/gb-2004-5-10-r80}, + url = {https://www.ncbi.nlm.nih.gov/pubmed/15461798}, + year = {2004}, + type = {Journal Article} +} + +@article{Huber2015, + author = {Huber, W. and Carey, V. J. and Gentleman, R. and Anders, S. and Carlson, M. and Carvalho, B. S. and Bravo, H. C. and Davis, S. and Gatto, L. and Girke, T. and Gottardo, R. and Hahne, F. and Hansen, K. D. and Irizarry, R. A. and Lawrence, M. and Love, M. I. and MacDonald, J. and Obenchain, V. and Oles, A. K. and Pages, H. and Reyes, A. and Shannon, P. and Smyth, G. K. and Tenenbaum, D. and Waldron, L. 
and Morgan, M.}, + title = {Orchestrating high-throughput genomic analysis with Bioconductor}, + journal = {Nat Methods}, + volume = {12}, + number = {2}, + pages = {115-21}, + ISSN = {1548-7105 (Electronic) +1548-7091 (Linking)}, + DOI = {10.1038/nmeth.3252}, + url = {https://www.ncbi.nlm.nih.gov/pubmed/25633503}, + year = {2015}, + type = {Journal Article} +} + +@Article{BiocPkgTools2019, +AUTHOR = { Su, S and Carey, VJ and Shepherd, L and Ritchie, M and Morgan, MT and Davis, S}, +TITLE = {BiocPkgTools: Toolkit for mining the Bioconductor package ecosystem [version 1; peer review: 2 approved, 1 approved with reservations] +}, +JOURNAL = {F1000Research}, +VOLUME = {8}, +YEAR = {2019}, +NUMBER = {752}, +DOI = {10.12688/f1000research.19410.1} +} \ No newline at end of file diff --git a/index.md b/index.md new file mode 100644 index 00000000..b9b21e46 --- /dev/null +++ b/index.md @@ -0,0 +1,42 @@ +--- +permalink: index.html +site: sandpaper::sandpaper_site +--- + +This lesson provides an introduction to the Bioconductor project. + +A good understanding of the Bioconductor project is the foundation to efficiently use Bioconductor packages for the analysis and visualization of -omics data using R and RStudio. + +We download and install R packages from Bioconductor and other repositories to write workflows and perform analyses. +In order to do so, we first identify packages that are available and relevant to our analysis, and we learn from their documentation the best practices to use them as their authors intended it. +For reproducibility, it is also important to identify and track versions of packages used to perform each analysis. + +Sometimes, we encounter bugs in packages that we use. +While it is possible to report bugs to the authors and wait for issue to be fixed, +packages hosted on public repositories offer the chance to inspect the code and contribute or propose fixes ourselves. 
+In addition to being a great opportunity to develop coding skills, community contributors are very often recognized and credited for their contributions! + +In this lesson, you will learn: + +- To describe the Bioconductor project beyond software packages. +- To navigate the Bioconductor website to find packages for a particular task. +- To install and update Bioconductor packages. +- To open a package vignette and practice running through the examples that they contain. +- To identify standard classes and methods re-used across Bioconductor packages. +- To modify code and contribute to existing Bioconductor packages. +- Best practices to get help from packages developers and peers. + + + +{% comment %} This is a comment in Liquid {% endcomment %} + +:::::::::::::::::::::::::::::::::::::::::: prereq + +## Prerequisites + +- Learning objectives of the [Introduction to data analysis with R and Bioconductor][lesson-intro-r-bioconductor] workshop. + + +:::::::::::::::::::::::::::::::::::::::::::::::::: + +[lesson-intro-r-bioconductor]: https://carpentries-incubator.github.io/bioc-intro/index.html diff --git a/instructor-notes.md b/instructor-notes.md new file mode 100644 index 00000000..7e32cb3c --- /dev/null +++ b/instructor-notes.md @@ -0,0 +1,19 @@ +--- +title: Instructor Notes +--- + +## Workshop Structure + +[Instructors, please add notes on your experience with the workshop structure here.] + +## Technical tips and tricks + +[Instructors, please add notes on your experience with technical tips and tricks here.] + +## Common problems + +[Instructors, please add notes on your experience with common problems here.] + + + + diff --git a/learner-profiles.md b/learner-profiles.md new file mode 100644 index 00000000..434e335a --- /dev/null +++ b/learner-profiles.md @@ -0,0 +1,5 @@ +--- +title: FIXME +--- + +This is a placeholder file. Please add content here. 
diff --git a/md5sum.txt b/md5sum.txt new file mode 100644 index 00000000..ab18d077 --- /dev/null +++ b/md5sum.txt @@ -0,0 +1,20 @@ +"file" "checksum" "built" "date" +"CODE_OF_CONDUCT.md" "c93c83c630db2fe2462240bf72552548" "site/built/CODE_OF_CONDUCT.md" "2024-07-31" +"LICENSE.md" "afaf427b4223952624dcb6d8ded53ec0" "site/built/LICENSE.md" "2024-07-31" +"config.yaml" "865bc02946e1c148f8b2374313178780" "site/built/config.yaml" "2024-08-01" +"index.md" "6cf23d25cc71edcff0cdd1f6d60039da" "site/built/index.md" "2024-07-31" +"episodes/01-setup.Rmd" "19f06a730c60c7a4a87c26ec49f3c092" "site/built/01-setup.md" "2024-08-01" +"episodes/02-introduction-to-bioconductor.Rmd" "95db83520fedb68fb63fc660fb583ff5" "site/built/02-introduction-to-bioconductor.md" "2024-07-31" +"episodes/03-installing-bioconductor.Rmd" "54bbda53c9e93e35c098c510ebdbd12a" "site/built/03-installing-bioconductor.md" "2024-07-31" +"episodes/04-getting-help.Rmd" "ef95b83e2cf1db86786c9b8ff971684b" "site/built/04-getting-help.md" "2024-07-31" +"episodes/05-s4.Rmd" "ba95aa3057423bedb306e6191bbcb696" "site/built/05-s4.md" "2024-08-01" +"episodes/06-biological-sequences.Rmd" "8d8b215474e33ad9dc43e38fd49afc02" "site/built/06-biological-sequences.md" "2024-08-01" +"episodes/07-genomic-ranges.Rmd" "a88d289c2db16b377d3af09a9c896086" "site/built/07-genomic-ranges.md" "2024-07-31" +"episodes/08-annotations.Rmd" "896912973e4501db019d3f5ad6777a93" "site/built/08-annotations.md" "2024-08-01" +"episodes/09-summarizedexperiment.Rmd" "44e35429366ba2e222900d576bd2a392" "site/built/09-summarizedexperiment.md" "2024-08-01" +"instructors/instructor-notes.md" "a848c3b904c16f0840e7ec385de58d7d" "site/built/instructor-notes.md" "2024-07-31" +"learners/discuss.md" "90557a59d285aa7ce2a8adfc5799b1a8" "site/built/discuss.md" "2024-07-31" +"learners/reference.md" "c0ac274d8350e94fbaef3e28b458c7f4" "site/built/reference.md" "2024-07-31" +"learners/setup.md" "9c765f91a75f7c623db5cc3f3a2f8006" "site/built/setup.md" "2024-07-31" 
+"profiles/learner-profiles.md" "60b93493cf1da06dfd63255d73854461" "site/built/learner-profiles.md" "2024-07-31" +"renv/profiles/lesson-requirements/renv.lock" "0a47f041a605a26ae453cbd30ad32aba" "site/built/renv.lock" "2024-07-31" diff --git a/reference.md b/reference.md new file mode 100644 index 00000000..75d74a3d --- /dev/null +++ b/reference.md @@ -0,0 +1,65 @@ +--- +{} +--- + +## Glossary + +{:auto\_ids} + +AnnotationData package +: Type of Bioconductor package that provides databases of molecular annotations (e.g., genes, proteins, pathways). + +biocViews +: Directed acyclic graphs of terms from a controlled vocabulary, used to categorize R packages in the Bioconductor repository. +The `biocViews` can be browsed on the [Bioconductor website][biocviews-site]. + +ExperimentData package +: Type of Bioconductor package that provides experimental datasets, immediately available as standard Bioconductor objects. +This type of package is often used in [package vignettes](#vignette), to conveniently import data used to demonstrate the functionality of other packages as well as larger workflows. +Experiment data packages can be explored on the [biocViews page][bioc-experimentdata]. + +S4 class +: R has three object-oriented programming (OOP) systems: S3, S4 and R6 (or Reference Classes). +S4 is system that defines formal classes, using an implementation that is stricter than the S3 class system. +Classes define the conceptual structure of S4 objects, while S4 objects represent practical instances of their class. See [S4 object](#s4-object). + +S4 class slot +: Slots can be seen as parts, elements, properties, or attributes of S4 objects. +Each slot is defined by its name and the data type that it may contain. + +S4 generic +: Template function for [S4 methods](#s4-method) that defines the arguments considered for [S4 method dispatch](s4-method-dispatch). 
+ +S4 method +: Instance of an [S4 generic](#s4-generic) for a particular combination of classes across the arguments considered for [S4 method dispatch](s4-method-dispatch). + +S4 method dispatch +: Mechanism allowing R to identify and call the implementation of an [S4 generic](#s4-generic) R function according to the class of object(s) given as argument(s). + +S4 object +: S4 objects are instances of S4 classes, in the same way that an actual car is an instance of the definition of a car that one would find in a dictionary. + +Software package +: Type of Bioconductor package that provides implementations of methodologies for processing experimental data. + +Vignette +: Document(s) in PDF or HTML format, distributed and installed alongside package code, +providing long-form documentation that demonstrates the use of the package functionality in the context of an example workflow. +Vignettes typically use standard datasets obtained from an [ExperimentData package](#experimentdata-package) or the [*ExperimentHub*](https://bioconductor.org/packages/ExperimentHub/) package. + +Workflow package +: Type of Bioconductor package that exclusively provides vignettes used to demonstrate the use of multiple Bioconductor packages in the context of a large workflow. + +## Web resources + +*[Bioconductor website][bioconductor-website]* +: The official Bioconductor website. 
+ + + +[biocviews-site]: https://www.bioconductor.org/packages/release/BiocViews.html +[bioc-experimentdata]: https://www.bioconductor.org/packages/release/BiocViews.html#___ExperimentData +[bioconductor-website]: https://bioconductor.org/ + + + diff --git a/renv.lock b/renv.lock new file mode 100644 index 00000000..be99cc58 --- /dev/null +++ b/renv.lock @@ -0,0 +1,1786 @@ +{ + "R": { + "Version": "4.4.1", + "Repositories": [ + { + "Name": "BioCsoft", + "URL": "https://bioconductor.org/packages/3.19/bioc" + }, + { + "Name": "BioCann", + "URL": "https://bioconductor.org/packages/3.19/data/annotation" + }, + { + "Name": "BioCexp", + "URL": "https://bioconductor.org/packages/3.19/data/experiment" + }, + { + "Name": "BioCworkflows", + "URL": "https://bioconductor.org/packages/3.19/workflows" + }, + { + "Name": "BioCbooks", + "URL": "https://bioconductor.org/packages/3.19/books" + }, + { + "Name": "carpentries", + "URL": "https://carpentries.r-universe.dev" + }, + { + "Name": "carpentries_archive", + "URL": "https://carpentries.github.io/drat" + }, + { + "Name": "CRAN", + "URL": "https://cran.rstudio.com" + } + ] + }, + "Bioconductor": { + "Version": "3.19" + }, + "Packages": { + "AnnotationDbi": { + "Package": "AnnotationDbi", + "Version": "1.66.0", + "Source": "Bioconductor", + "Repository": "Bioconductor 3.19", + "Requirements": [ + "Biobase", + "BiocGenerics", + "DBI", + "IRanges", + "KEGGREST", + "R", + "RSQLite", + "S4Vectors", + "methods", + "stats", + "stats4" + ], + "Hash": "b7df9c597fb5533fc8248d73b8c703ac" + }, + "AnnotationFilter": { + "Package": "AnnotationFilter", + "Version": "1.28.0", + "Source": "Bioconductor", + "Repository": "Bioconductor 3.19", + "Requirements": [ + "GenomicRanges", + "R", + "lazyeval", + "methods", + "utils" + ], + "Hash": "24e809470aef6d81b25003d775b2fb56" + }, + "BH": { + "Package": "BH", + "Version": "1.84.0-0", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "a8235afbcd6316e6e91433ea47661013" + }, + "BSgenome": { + 
"Package": "BSgenome", + "Version": "1.72.0", + "Source": "Bioconductor", + "Repository": "Bioconductor 3.19", + "Requirements": [ + "BiocGenerics", + "BiocIO", + "Biostrings", + "GenomeInfoDb", + "GenomicRanges", + "IRanges", + "R", + "Rsamtools", + "S4Vectors", + "XVector", + "matrixStats", + "methods", + "rtracklayer", + "stats", + "utils" + ], + "Hash": "9e00bf24b78d10f32cb8e1dceb5f87ff" + }, + "BSgenome.Hsapiens.UCSC.hg38": { + "Package": "BSgenome.Hsapiens.UCSC.hg38", + "Version": "1.4.5", + "Source": "Bioconductor", + "Requirements": [ + "BSgenome", + "GenomeInfoDb", + "R" + ], + "Hash": "0ba28cc20a4f8629fbb30d0bf1a133ac" + }, + "BSgenome.Hsapiens.UCSC.hg38.masked": { + "Package": "BSgenome.Hsapiens.UCSC.hg38.masked", + "Version": "1.4.5", + "Source": "Bioconductor", + "Requirements": [ + "BSgenome", + "BSgenome.Hsapiens.UCSC.hg38", + "GenomeInfoDb", + "R" + ], + "Hash": "f165a75d7b8ad3c36692070e50d43a09" + }, + "Biobase": { + "Package": "Biobase", + "Version": "2.64.0", + "Source": "Bioconductor", + "Repository": "Bioconductor 3.19", + "Requirements": [ + "BiocGenerics", + "R", + "methods", + "utils" + ], + "Hash": "9bc4cabd3bfda461409172213d932813" + }, + "BiocFileCache": { + "Package": "BiocFileCache", + "Version": "2.12.0", + "Source": "Bioconductor", + "Repository": "Bioconductor 3.19", + "Requirements": [ + "DBI", + "R", + "RSQLite", + "curl", + "dbplyr", + "dplyr", + "filelock", + "httr", + "methods", + "stats", + "utils" + ], + "Hash": "9c3414bcfae204d56080dd0f0a220136" + }, + "BiocGenerics": { + "Package": "BiocGenerics", + "Version": "0.50.0", + "Source": "Bioconductor", + "Repository": "Bioconductor 3.19", + "Requirements": [ + "R", + "graphics", + "methods", + "stats", + "utils" + ], + "Hash": "ef32d07aafdd12f24c5827374ae3590d" + }, + "BiocIO": { + "Package": "BiocIO", + "Version": "1.14.0", + "Source": "Bioconductor", + "Repository": "Bioconductor 3.19", + "Requirements": [ + "BiocGenerics", + "R", + "S4Vectors", + "methods", + "tools" + ], + 
"Hash": "f97a7ef01d364cf20d1946d43a3d526f" + }, + "BiocManager": { + "Package": "BiocManager", + "Version": "1.30.23", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "utils" + ], + "Hash": "47e968dfe563c1b22c2e20a067ec21d5" + }, + "BiocParallel": { + "Package": "BiocParallel", + "Version": "1.38.0", + "Source": "Bioconductor", + "Repository": "Bioconductor 3.19", + "Requirements": [ + "BH", + "R", + "codetools", + "cpp11", + "futile.logger", + "methods", + "parallel", + "snow", + "stats", + "utils" + ], + "Hash": "7b6e79f86e3d1c23f62c5e2052e848d4" + }, + "BiocStyle": { + "Package": "BiocStyle", + "Version": "2.32.1", + "Source": "Bioconductor", + "Repository": "Bioconductor 3.19", + "Requirements": [ + "BiocManager", + "bookdown", + "knitr", + "rmarkdown", + "stats", + "utils", + "yaml" + ], + "Hash": "beadb5ac6d6b64dc6153cb300dd063ef" + }, + "BiocVersion": { + "Package": "BiocVersion", + "Version": "3.19.1", + "Source": "Bioconductor", + "Repository": "Bioconductor 3.19", + "Requirements": [ + "R" + ], + "Hash": "b892e27fc9659a4c8f8787d34c37b8b2" + }, + "Biostrings": { + "Package": "Biostrings", + "Version": "2.72.1", + "Source": "Bioconductor", + "Repository": "Bioconductor 3.19", + "Requirements": [ + "BiocGenerics", + "GenomeInfoDb", + "IRanges", + "R", + "S4Vectors", + "XVector", + "crayon", + "grDevices", + "methods", + "stats", + "utils" + ], + "Hash": "886ff0ed958d6f839ed2e0d01f6853b3" + }, + "DBI": { + "Package": "DBI", + "Version": "1.2.3", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "methods" + ], + "Hash": "065ae649b05f1ff66bb0c793107508f5" + }, + "DelayedArray": { + "Package": "DelayedArray", + "Version": "0.30.1", + "Source": "Bioconductor", + "Repository": "Bioconductor 3.19", + "Requirements": [ + "BiocGenerics", + "IRanges", + "Matrix", + "MatrixGenerics", + "R", + "S4Arrays", + "S4Vectors", + "SparseArray", + "methods", + "stats", + "stats4" + ], + "Hash": 
"395472c65cd9d606a1a345687102f299" + }, + "EnsDb.Hsapiens.v86": { + "Package": "EnsDb.Hsapiens.v86", + "Version": "2.99.0", + "Source": "Bioconductor", + "Requirements": [ + "ensembldb" + ], + "Hash": "626af36a8d6de3c44779ff2a073952e6" + }, + "GenomeInfoDb": { + "Package": "GenomeInfoDb", + "Version": "1.40.1", + "Source": "Bioconductor", + "Repository": "Bioconductor 3.19", + "Requirements": [ + "BiocGenerics", + "GenomeInfoDbData", + "IRanges", + "R", + "S4Vectors", + "UCSC.utils", + "methods", + "stats", + "stats4", + "utils" + ], + "Hash": "171e9becd9bb948b9e64eb3759208c94" + }, + "GenomeInfoDbData": { + "Package": "GenomeInfoDbData", + "Version": "1.2.12", + "Source": "Bioconductor", + "Requirements": [ + "R" + ], + "Hash": "c3c792a7b7f2677be56e8632c5b7543d" + }, + "GenomicAlignments": { + "Package": "GenomicAlignments", + "Version": "1.40.0", + "Source": "Bioconductor", + "Repository": "Bioconductor 3.19", + "Requirements": [ + "BiocGenerics", + "BiocParallel", + "Biostrings", + "GenomeInfoDb", + "GenomicRanges", + "IRanges", + "R", + "Rsamtools", + "S4Vectors", + "SummarizedExperiment", + "methods", + "stats", + "utils" + ], + "Hash": "e539709764587c581b31e446dc84d7b8" + }, + "GenomicFeatures": { + "Package": "GenomicFeatures", + "Version": "1.56.0", + "Source": "Bioconductor", + "Repository": "Bioconductor 3.19", + "Requirements": [ + "AnnotationDbi", + "BiocGenerics", + "Biostrings", + "DBI", + "GenomeInfoDb", + "GenomicRanges", + "IRanges", + "R", + "S4Vectors", + "XVector", + "methods", + "rtracklayer", + "stats", + "utils" + ], + "Hash": "0d19619d13b06b9dea85993ce7f09c52" + }, + "GenomicRanges": { + "Package": "GenomicRanges", + "Version": "1.56.1", + "Source": "Bioconductor", + "Repository": "Bioconductor 3.19", + "Requirements": [ + "BiocGenerics", + "GenomeInfoDb", + "IRanges", + "R", + "S4Vectors", + "XVector", + "methods", + "stats", + "stats4", + "utils" + ], + "Hash": "a3c822ef3c124828e25e7a9611beeb50" + }, + "IRanges": { + "Package": "IRanges", 
+ "Version": "2.38.1", + "Source": "Bioconductor", + "Repository": "Bioconductor 3.19", + "Requirements": [ + "BiocGenerics", + "R", + "S4Vectors", + "methods", + "stats", + "stats4", + "utils" + ], + "Hash": "066f3c5d6b022ed62c91ce49e4d8f619" + }, + "KEGGREST": { + "Package": "KEGGREST", + "Version": "1.44.1", + "Source": "Bioconductor", + "Repository": "Bioconductor 3.19", + "Requirements": [ + "Biostrings", + "R", + "httr", + "methods", + "png" + ], + "Hash": "017f19c09477c0473073518db9076ac1" + }, + "Matrix": { + "Package": "Matrix", + "Version": "1.7-0", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "grDevices", + "graphics", + "grid", + "lattice", + "methods", + "stats", + "utils" + ], + "Hash": "1920b2f11133b12350024297d8a4ff4a" + }, + "MatrixGenerics": { + "Package": "MatrixGenerics", + "Version": "1.16.0", + "Source": "Bioconductor", + "Repository": "Bioconductor 3.19", + "Requirements": [ + "matrixStats", + "methods" + ], + "Hash": "152dbbcde6a9a7c7f3beef79b68cd76a" + }, + "ProtGenerics": { + "Package": "ProtGenerics", + "Version": "1.36.0", + "Source": "Bioconductor", + "Repository": "Bioconductor 3.19", + "Requirements": [ + "methods" + ], + "Hash": "a3737c10efc865abfa9d204ca8735b74" + }, + "R6": { + "Package": "R6", + "Version": "2.5.1", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R" + ], + "Hash": "470851b6d5d0ac559e9d01bb352b4021" + }, + "RCurl": { + "Package": "RCurl", + "Version": "1.98-1.16", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "bitops", + "methods" + ], + "Hash": "ddbdf53d15b47be4407ede6914f56fbb" + }, + "RSQLite": { + "Package": "RSQLite", + "Version": "2.3.7", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "DBI", + "R", + "bit64", + "blob", + "cpp11", + "memoise", + "methods", + "pkgconfig", + "plogr", + "rlang" + ], + "Hash": "46b45a4dd7bb0e0f4e3fc22245817240" + }, + "Rcpp": { + "Package": "Rcpp", + "Version": 
"1.0.13", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "methods", + "utils" + ], + "Hash": "f27411eb6d9c3dada5edd444b8416675" + }, + "RefManageR": { + "Package": "RefManageR", + "Version": "1.4.0", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "bibtex", + "httr", + "jsonlite", + "lubridate", + "methods", + "plyr", + "stringr", + "tools", + "utils", + "xml2" + ], + "Hash": "d22e94d9088f55cc112e722888afcc70" + }, + "Rhtslib": { + "Package": "Rhtslib", + "Version": "3.0.0", + "Source": "Bioconductor", + "Repository": "Bioconductor 3.19", + "Requirements": [ + "tools", + "zlibbioc" + ], + "Hash": "5d6514cd44a0106581e3310f3972a82e" + }, + "Rsamtools": { + "Package": "Rsamtools", + "Version": "2.20.0", + "Source": "Bioconductor", + "Repository": "Bioconductor 3.19", + "Requirements": [ + "BiocGenerics", + "BiocParallel", + "Biostrings", + "GenomeInfoDb", + "GenomicRanges", + "IRanges", + "R", + "Rhtslib", + "S4Vectors", + "XVector", + "bitops", + "methods", + "stats", + "utils", + "zlibbioc" + ], + "Hash": "9762f24dcbdbd1626173c516bb64792c" + }, + "S4Arrays": { + "Package": "S4Arrays", + "Version": "1.4.1", + "Source": "Bioconductor", + "Repository": "Bioconductor 3.19", + "Requirements": [ + "BiocGenerics", + "IRanges", + "Matrix", + "R", + "S4Vectors", + "abind", + "crayon", + "methods", + "stats" + ], + "Hash": "deeed4802c5132e88f24a432a1caf5e0" + }, + "S4Vectors": { + "Package": "S4Vectors", + "Version": "0.42.1", + "Source": "Bioconductor", + "Repository": "Bioconductor 3.19", + "Requirements": [ + "BiocGenerics", + "R", + "methods", + "stats", + "stats4", + "utils" + ], + "Hash": "86398fc7c5f6be4ba29fe23ed08c2da6" + }, + "SparseArray": { + "Package": "SparseArray", + "Version": "1.4.8", + "Source": "Bioconductor", + "Repository": "Bioconductor 3.19", + "Requirements": [ + "BiocGenerics", + "IRanges", + "Matrix", + "MatrixGenerics", + "R", + "S4Arrays", + "S4Vectors", + "XVector", + "matrixStats", + 
"methods", + "stats", + "utils" + ], + "Hash": "97f70ff11c14edd379ee2429228cbb60" + }, + "SummarizedExperiment": { + "Package": "SummarizedExperiment", + "Version": "1.34.0", + "Source": "Bioconductor", + "Repository": "Bioconductor 3.19", + "Requirements": [ + "Biobase", + "BiocGenerics", + "DelayedArray", + "GenomeInfoDb", + "GenomicRanges", + "IRanges", + "Matrix", + "MatrixGenerics", + "R", + "S4Arrays", + "S4Vectors", + "methods", + "stats", + "tools", + "utils" + ], + "Hash": "2f6c8cc972ed6aee07c96e3dff729d15" + }, + "TxDb.Hsapiens.UCSC.hg38.knownGene": { + "Package": "TxDb.Hsapiens.UCSC.hg38.knownGene", + "Version": "3.18.0", + "Source": "Bioconductor", + "Requirements": [ + "AnnotationDbi", + "GenomicFeatures" + ], + "Hash": "1e9c96dc273bbcb3b33974ad233ee0f3" + }, + "UCSC.utils": { + "Package": "UCSC.utils", + "Version": "1.0.0", + "Source": "Bioconductor", + "Repository": "Bioconductor 3.19", + "Requirements": [ + "S4Vectors", + "httr", + "jsonlite", + "methods", + "stats" + ], + "Hash": "83d45b690bffd09d1980c224ef329f5b" + }, + "XML": { + "Package": "XML", + "Version": "3.99-0.17", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "methods", + "utils" + ], + "Hash": "bc2a8a1139d8d4bd9c46086708945124" + }, + "XVector": { + "Package": "XVector", + "Version": "0.44.0", + "Source": "Bioconductor", + "Repository": "Bioconductor 3.19", + "Requirements": [ + "BiocGenerics", + "IRanges", + "R", + "S4Vectors", + "methods", + "tools", + "utils", + "zlibbioc" + ], + "Hash": "4245b9938ac74c0dbddbebbec6036ab4" + }, + "abind": { + "Package": "abind", + "Version": "1.4-5", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "methods", + "utils" + ], + "Hash": "4f57884290cc75ab22f4af9e9d4ca862" + }, + "askpass": { + "Package": "askpass", + "Version": "1.2.0", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "sys" + ], + "Hash": "cad6cf7f1d5f6e906700b9d3e718c796" + }, + "backports": { + 
"Package": "backports", + "Version": "1.5.0", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R" + ], + "Hash": "e1e1b9d75c37401117b636b7ae50827a" + }, + "base64enc": { + "Package": "base64enc", + "Version": "0.1-3", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R" + ], + "Hash": "543776ae6848fde2f48ff3816d0628bc" + }, + "bibtex": { + "Package": "bibtex", + "Version": "0.5.1", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "backports", + "utils" + ], + "Hash": "a704d52e87822191b42c715c568f96dd" + }, + "biomaRt": { + "Package": "biomaRt", + "Version": "2.60.1", + "Source": "Bioconductor", + "Repository": "Bioconductor 3.19", + "Requirements": [ + "AnnotationDbi", + "BiocFileCache", + "digest", + "httr2", + "methods", + "progress", + "rappdirs", + "stringr", + "utils", + "xml2" + ], + "Hash": "e53d495b9e6ecd5394acad1d53c3fa22" + }, + "bit": { + "Package": "bit", + "Version": "4.0.5", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R" + ], + "Hash": "d242abec29412ce988848d0294b208fd" + }, + "bit64": { + "Package": "bit64", + "Version": "4.0.5", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "bit", + "methods", + "stats", + "utils" + ], + "Hash": "9fe98599ca456d6552421db0d6772d8f" + }, + "bitops": { + "Package": "bitops", + "Version": "1.0-8", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "da69e6b6f8feebec0827205aad3fdbd8" + }, + "blob": { + "Package": "blob", + "Version": "1.2.4", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "methods", + "rlang", + "vctrs" + ], + "Hash": "40415719b5a479b87949f3aa0aee737c" + }, + "bookdown": { + "Package": "bookdown", + "Version": "0.40", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "htmltools", + "jquerylib", + "knitr", + "rmarkdown", + "tinytex", + "xfun", + "yaml" + ], + "Hash": "896a79478a50c78fb035a37148638f4e" + }, + 
"bslib": { + "Package": "bslib", + "Version": "0.8.0", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "base64enc", + "cachem", + "fastmap", + "grDevices", + "htmltools", + "jquerylib", + "jsonlite", + "lifecycle", + "memoise", + "mime", + "rlang", + "sass" + ], + "Hash": "b299c6741ca9746fb227debcb0f9fb6c" + }, + "cachem": { + "Package": "cachem", + "Version": "1.1.0", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "fastmap", + "rlang" + ], + "Hash": "cd9a672193789068eb5a2aad65a0dedf" + }, + "cli": { + "Package": "cli", + "Version": "3.6.3", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "utils" + ], + "Hash": "b21916dd77a27642b447374a5d30ecf3" + }, + "codetools": { + "Package": "codetools", + "Version": "0.2-20", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R" + ], + "Hash": "61e097f35917d342622f21cdc79c256e" + }, + "cpp11": { + "Package": "cpp11", + "Version": "0.4.7", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R" + ], + "Hash": "5a295d7d963cc5035284dcdbaf334f4e" + }, + "crayon": { + "Package": "crayon", + "Version": "1.5.3", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "grDevices", + "methods", + "utils" + ], + "Hash": "859d96e65ef198fd43e82b9628d593ef" + }, + "curl": { + "Package": "curl", + "Version": "5.2.1", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R" + ], + "Hash": "411ca2c03b1ce5f548345d2fc2685f7a" + }, + "dbplyr": { + "Package": "dbplyr", + "Version": "2.5.0", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "DBI", + "R", + "R6", + "blob", + "cli", + "dplyr", + "glue", + "lifecycle", + "magrittr", + "methods", + "pillar", + "purrr", + "rlang", + "tibble", + "tidyr", + "tidyselect", + "utils", + "vctrs", + "withr" + ], + "Hash": "39b2e002522bfd258039ee4e889e0fd1" + }, + "digest": { + "Package": "digest", + "Version": "0.6.36", + 
"Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "utils" + ], + "Hash": "fd6824ad91ede64151e93af67df6376b" + }, + "dplyr": { + "Package": "dplyr", + "Version": "1.1.4", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "R6", + "cli", + "generics", + "glue", + "lifecycle", + "magrittr", + "methods", + "pillar", + "rlang", + "tibble", + "tidyselect", + "utils", + "vctrs" + ], + "Hash": "fedd9d00c2944ff00a0e2696ccf048ec" + }, + "ensembldb": { + "Package": "ensembldb", + "Version": "2.28.0", + "Source": "Bioconductor", + "Repository": "Bioconductor 3.19", + "Requirements": [ + "AnnotationDbi", + "AnnotationFilter", + "Biobase", + "BiocGenerics", + "Biostrings", + "DBI", + "GenomeInfoDb", + "GenomicFeatures", + "GenomicRanges", + "IRanges", + "ProtGenerics", + "R", + "RSQLite", + "Rsamtools", + "S4Vectors", + "curl", + "methods", + "rtracklayer" + ], + "Hash": "f9a5e52468ec832a839c012e15c41c15" + }, + "evaluate": { + "Package": "evaluate", + "Version": "0.24.0", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "methods" + ], + "Hash": "a1066cbc05caee9a4bf6d90f194ff4da" + }, + "fansi": { + "Package": "fansi", + "Version": "1.0.6", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "grDevices", + "utils" + ], + "Hash": "962174cf2aeb5b9eea581522286a911f" + }, + "fastmap": { + "Package": "fastmap", + "Version": "1.2.0", + "Source": "Repository", + "Repository": "RSPM", + "Hash": "aa5e1cd11c2d15497494c5292d7ffcc8" + }, + "filelock": { + "Package": "filelock", + "Version": "1.0.3", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R" + ], + "Hash": "192053c276525c8495ccfd523aa8f2d1" + }, + "fontawesome": { + "Package": "fontawesome", + "Version": "0.5.2", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "htmltools", + "rlang" + ], + "Hash": "c2efdd5f0bcd1ea861c2d4e2a883a67d" + }, + "formatR": { + "Package": 
"formatR", + "Version": "1.14", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R" + ], + "Hash": "63cb26d12517c7863f5abb006c5e0f25" + }, + "fs": { + "Package": "fs", + "Version": "1.6.4", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "methods" + ], + "Hash": "15aeb8c27f5ea5161f9f6a641fafd93a" + }, + "futile.logger": { + "Package": "futile.logger", + "Version": "1.4.3", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "futile.options", + "lambda.r", + "utils" + ], + "Hash": "99f0ace8c05ec7d3683d27083c4f1e7e" + }, + "futile.options": { + "Package": "futile.options", + "Version": "1.0.1", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R" + ], + "Hash": "0d9bf02413ddc2bbe8da9ce369dcdd2b" + }, + "generics": { + "Package": "generics", + "Version": "0.1.3", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "methods" + ], + "Hash": "15e9634c0fcd294799e9b2e929ed1b86" + }, + "glue": { + "Package": "glue", + "Version": "1.7.0", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "methods" + ], + "Hash": "e0b3a53876554bd45879e596cdb10a52" + }, + "highr": { + "Package": "highr", + "Version": "0.11", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "xfun" + ], + "Hash": "d65ba49117ca223614f71b60d85b8ab7" + }, + "hms": { + "Package": "hms", + "Version": "1.1.3", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "lifecycle", + "methods", + "pkgconfig", + "rlang", + "vctrs" + ], + "Hash": "b59377caa7ed00fa41808342002138f9" + }, + "htmltools": { + "Package": "htmltools", + "Version": "0.5.8.1", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "base64enc", + "digest", + "fastmap", + "grDevices", + "rlang", + "utils" + ], + "Hash": "81d371a9cc60640e74e4ab6ac46dcedc" + }, + "httr": { + "Package": "httr", + "Version": "1.4.7", + "Source": 
"Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "R6", + "curl", + "jsonlite", + "mime", + "openssl" + ], + "Hash": "ac107251d9d9fd72f0ca8049988f1d7f" + }, + "httr2": { + "Package": "httr2", + "Version": "1.0.2", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "R6", + "cli", + "curl", + "glue", + "lifecycle", + "magrittr", + "openssl", + "rappdirs", + "rlang", + "vctrs", + "withr" + ], + "Hash": "320c8fe23fcb25a6690ef7bdb6a3a705" + }, + "jquerylib": { + "Package": "jquerylib", + "Version": "0.1.4", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "htmltools" + ], + "Hash": "5aab57a3bd297eee1c1d862735972182" + }, + "jsonlite": { + "Package": "jsonlite", + "Version": "1.8.8", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "methods" + ], + "Hash": "e1b9c55281c5adc4dd113652d9e26768" + }, + "knitr": { + "Package": "knitr", + "Version": "1.48", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "evaluate", + "highr", + "methods", + "tools", + "xfun", + "yaml" + ], + "Hash": "acf380f300c721da9fde7df115a5f86f" + }, + "lambda.r": { + "Package": "lambda.r", + "Version": "1.2.4", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "formatR" + ], + "Hash": "b1e925c4b9ffeb901bacf812cbe9a6ad" + }, + "lattice": { + "Package": "lattice", + "Version": "0.22-6", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "grDevices", + "graphics", + "grid", + "stats", + "utils" + ], + "Hash": "cc5ac1ba4c238c7ca9fa6a87ca11a7e2" + }, + "lazyeval": { + "Package": "lazyeval", + "Version": "0.2.2", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R" + ], + "Hash": "d908914ae53b04d4c0c0fd72ecc35370" + }, + "lifecycle": { + "Package": "lifecycle", + "Version": "1.0.4", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "cli", + "glue", + "rlang" + ], + "Hash": 
"b8552d117e1b808b09a832f589b79035" + }, + "lubridate": { + "Package": "lubridate", + "Version": "1.9.3", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "generics", + "methods", + "timechange" + ], + "Hash": "680ad542fbcf801442c83a6ac5a2126c" + }, + "magrittr": { + "Package": "magrittr", + "Version": "2.0.3", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R" + ], + "Hash": "7ce2733a9826b3aeb1775d56fd305472" + }, + "matrixStats": { + "Package": "matrixStats", + "Version": "1.3.0", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R" + ], + "Hash": "4b3ea27a19d669c0405b38134d89a9d1" + }, + "memoise": { + "Package": "memoise", + "Version": "2.0.1", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "cachem", + "rlang" + ], + "Hash": "e2817ccf4a065c5d9d7f2cfbe7c1d78c" + }, + "mime": { + "Package": "mime", + "Version": "0.12", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "tools" + ], + "Hash": "18e9c28c1d3ca1560ce30658b22ce104" + }, + "openssl": { + "Package": "openssl", + "Version": "2.2.0", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "askpass" + ], + "Hash": "2bcca3848e4734eb3b16103bc9aa4b8e" + }, + "org.Hs.eg.db": { + "Package": "org.Hs.eg.db", + "Version": "3.19.1", + "Source": "Bioconductor", + "Requirements": [ + "AnnotationDbi", + "R", + "methods" + ], + "Hash": "1ac8a004ad2e4f6489dadf3a2ffeb638" + }, + "pillar": { + "Package": "pillar", + "Version": "1.9.0", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "cli", + "fansi", + "glue", + "lifecycle", + "rlang", + "utf8", + "utils", + "vctrs" + ], + "Hash": "15da5a8412f317beeee6175fbc76f4bb" + }, + "pkgconfig": { + "Package": "pkgconfig", + "Version": "2.0.3", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "utils" + ], + "Hash": "01f28d4278f15c76cddbea05899c5d6f" + }, + "plogr": { + "Package": "plogr", + "Version": 
"0.2.0", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "09eb987710984fc2905c7129c7d85e65" + }, + "plyr": { + "Package": "plyr", + "Version": "1.8.9", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "Rcpp" + ], + "Hash": "6b8177fd19982f0020743fadbfdbd933" + }, + "png": { + "Package": "png", + "Version": "0.1-8", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R" + ], + "Hash": "bd54ba8a0a5faded999a7aab6e46b374" + }, + "prettyunits": { + "Package": "prettyunits", + "Version": "1.2.0", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R" + ], + "Hash": "6b01fc98b1e86c4f705ce9dcfd2f57c7" + }, + "progress": { + "Package": "progress", + "Version": "1.2.3", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "R6", + "crayon", + "hms", + "prettyunits" + ], + "Hash": "f4625e061cb2865f111b47ff163a5ca6" + }, + "purrr": { + "Package": "purrr", + "Version": "1.0.2", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "cli", + "lifecycle", + "magrittr", + "rlang", + "vctrs" + ], + "Hash": "1cba04a4e9414bdefc9dcaa99649a8dc" + }, + "rappdirs": { + "Package": "rappdirs", + "Version": "0.3.3", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R" + ], + "Hash": "5e3c5dc0b071b21fa128676560dbe94d" + }, + "renv": { + "Package": "renv", + "Version": "1.0.7", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "utils" + ], + "Hash": "397b7b2a265bc5a7a06852524dabae20" + }, + "restfulr": { + "Package": "restfulr", + "Version": "0.0.15", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "RCurl", + "S4Vectors", + "XML", + "methods", + "rjson", + "yaml" + ], + "Hash": "44651c1e68eda9d462610aca9f15a815" + }, + "rjson": { + "Package": "rjson", + "Version": "0.2.21", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R" + ], + "Hash": 
"f9da75e6444e95a1baf8ca24909d63b9" + }, + "rlang": { + "Package": "rlang", + "Version": "1.1.4", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "utils" + ], + "Hash": "3eec01f8b1dee337674b2e34ab1f9bc1" + }, + "rmarkdown": { + "Package": "rmarkdown", + "Version": "2.27", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "bslib", + "evaluate", + "fontawesome", + "htmltools", + "jquerylib", + "jsonlite", + "knitr", + "methods", + "tinytex", + "tools", + "utils", + "xfun", + "yaml" + ], + "Hash": "27f9502e1cdbfa195f94e03b0f517484" + }, + "rtracklayer": { + "Package": "rtracklayer", + "Version": "1.64.0", + "Source": "Bioconductor", + "Repository": "Bioconductor 3.19", + "Requirements": [ + "BiocGenerics", + "BiocIO", + "Biostrings", + "GenomeInfoDb", + "GenomicAlignments", + "GenomicRanges", + "IRanges", + "R", + "Rsamtools", + "S4Vectors", + "XML", + "XVector", + "curl", + "httr", + "methods", + "restfulr", + "tools", + "zlibbioc" + ], + "Hash": "3d6f004fce582bd7d68e2e18d44abbc1" + }, + "sass": { + "Package": "sass", + "Version": "0.4.9", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R6", + "fs", + "htmltools", + "rappdirs", + "rlang" + ], + "Hash": "d53dbfddf695303ea4ad66f86e99b95d" + }, + "snow": { + "Package": "snow", + "Version": "0.4-4", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "utils" + ], + "Hash": "40b74690debd20c57d93d8c246b305d4" + }, + "stringi": { + "Package": "stringi", + "Version": "1.8.4", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "stats", + "tools", + "utils" + ], + "Hash": "39e1144fd75428983dc3f63aa53dfa91" + }, + "stringr": { + "Package": "stringr", + "Version": "1.5.1", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "cli", + "glue", + "lifecycle", + "magrittr", + "rlang", + "stringi", + "vctrs" + ], + "Hash": "960e2ae9e09656611e0b8214ad543207" + }, + "sys": 
{ + "Package": "sys", + "Version": "3.4.2", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "3a1be13d68d47a8cd0bfd74739ca1555" + }, + "tibble": { + "Package": "tibble", + "Version": "3.2.1", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "fansi", + "lifecycle", + "magrittr", + "methods", + "pillar", + "pkgconfig", + "rlang", + "utils", + "vctrs" + ], + "Hash": "a84e2cc86d07289b3b6f5069df7a004c" + }, + "tidyr": { + "Package": "tidyr", + "Version": "1.3.1", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "cli", + "cpp11", + "dplyr", + "glue", + "lifecycle", + "magrittr", + "purrr", + "rlang", + "stringr", + "tibble", + "tidyselect", + "utils", + "vctrs" + ], + "Hash": "915fb7ce036c22a6a33b5a8adb712eb1" + }, + "tidyselect": { + "Package": "tidyselect", + "Version": "1.2.1", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "cli", + "glue", + "lifecycle", + "rlang", + "vctrs", + "withr" + ], + "Hash": "829f27b9c4919c16b593794a6344d6c0" + }, + "timechange": { + "Package": "timechange", + "Version": "0.3.0", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "cpp11" + ], + "Hash": "c5f3c201b931cd6474d17d8700ccb1c8" + }, + "tinytex": { + "Package": "tinytex", + "Version": "0.52", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "xfun" + ], + "Hash": "cfbad971a71f0e27cec22e544a08bc3b" + }, + "utf8": { + "Package": "utf8", + "Version": "1.2.4", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R" + ], + "Hash": "62b65c52671e6665f803ff02954446e9" + }, + "vctrs": { + "Package": "vctrs", + "Version": "0.6.5", + "Source": "Repository", + "Repository": "RSPM", + "Requirements": [ + "R", + "cli", + "glue", + "lifecycle", + "rlang" + ], + "Hash": "c03fa420630029418f7e6da3667aac4a" + }, + "withr": { + "Package": "withr", + "Version": "3.0.0", + "Source": "Repository", + "Repository": "RSPM", + 
"Requirements": [ + "R", + "grDevices", + "graphics" + ], + "Hash": "d31b6c62c10dcf11ec530ca6b0dd5d35" + }, + "xfun": { + "Package": "xfun", + "Version": "0.46", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "grDevices", + "stats", + "tools" + ], + "Hash": "00ce32f398db0415dde61abfef11300c" + }, + "xml2": { + "Package": "xml2", + "Version": "1.3.6", + "Source": "Repository", + "Repository": "CRAN", + "Requirements": [ + "R", + "cli", + "methods", + "rlang" + ], + "Hash": "1d0336142f4cd25d8d23cd3ba7a8fb61" + }, + "yaml": { + "Package": "yaml", + "Version": "2.3.10", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "51dab85c6c98e50a18d7551e9d49f76c" + }, + "zlibbioc": { + "Package": "zlibbioc", + "Version": "1.50.0", + "Source": "Bioconductor", + "Repository": "Bioconductor 3.19", + "Hash": "3db02e3c460e1c852365df117a2b441b" + } + } +} diff --git a/setup.md b/setup.md new file mode 100644 index 00000000..49b9192b --- /dev/null +++ b/setup.md @@ -0,0 +1,20 @@ +--- +title: Setup +--- + +Ensure that you have the most recent versions of R and RStudio installed on your computer. +For detailed instructions on how to do this, you can refer to the section "If you already have R and RStudio installed" +in the [Introduction to R](https://carpentries-incubator.github.io/bioc-intro/#r-and-rstudio) +episode of the [Introduction to data analysis with R and Bioconductor](https://carpentries-incubator.github.io/bioc-intro) lesson. + +Additionally, you will need to install the following packages, which are used throughout the lesson. + +```r +install.packages(c("BiocManager", "remotes")) +BiocManager::install(c( + "S4Vectors", "Biostrings", "BSgenome", + "BSgenome.Hsapiens.UCSC.hg38.masked", + "GenomicRanges", "rtracklayer", "biomaRt")) +``` + +*If you are attending a workshop, please complete all of the above before the workshop. Should you need help, an instructor will be available 30 minutes before the workshop commences to assist.*