forked from jennybc/happy-git-with-r
-
Notifications
You must be signed in to change notification settings - Fork 5
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Start porting content from "Excuse Me"
- Loading branch information
Showing
6 changed files
with
244 additions
and
16 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,10 +1,194 @@ | ||
@Book{xie2015, | ||
title = {Dynamic Documents with {R} and knitr}, | ||
@Book{knitr-book, | ||
title = {Dynamic Documents with {R} and knitr}, | ||
author = {Yihui Xie}, | ||
publisher = {Chapman and Hall/CRC}, | ||
address = {Boca Raton, Florida}, | ||
year = {2015}, | ||
edition = {2nd}, | ||
note = {ISBN 978-1498716963}, | ||
url = {http://yihui.name/knitr/}, | ||
} | ||
|
||
@Article{Ram2013, | ||
author="Ram, Karthik", | ||
title="Git can facilitate greater reproducibility and increased transparency in science", | ||
journal="Source Code for Biology and Medicine", | ||
year="2013", | ||
volume="8", | ||
number="1", | ||
pages="7", | ||
abstract="Reproducibility is the hallmark of good science. Maintaining a high degree of transparency in scientific reporting is essential not just for gaining trust and credibility within the scientific community but also for facilitating the development of new ideas. Sharing data and computer code associated with publications is becoming increasingly common, motivated partly in response to data deposition requirements from journals and mandates from funders. Despite this increase in transparency, it is still difficult to reproduce or build upon the findings of most scientific publications without access to a more complete workflow.", | ||
issn="1751-0473", | ||
doi="10.1186/1751-0473-8-7", | ||
url="http://dx.doi.org/10.1186/1751-0473-8-7" | ||
} | ||
|
||
@article{good-enough, | ||
author = {Greg Wilson and | ||
Jennifer Bryan and | ||
Karen Cranston and | ||
Justin Kitzes and | ||
Lex Nederbragt and | ||
Tracy K. Teal}, | ||
title = {Good Enough Practices in Scientific Computing}, | ||
journal = {CoRR}, | ||
volume = {abs/1609.00037}, | ||
year = {2016}, | ||
url = {http://arxiv.org/abs/1609.00037}, | ||
timestamp = {Mon, 03 Oct 2016 17:51:10 +0200}, | ||
biburl = {http://dblp.uni-trier.de/rec/bib/journals/corr/WilsonBCKNT16}, | ||
bibsource = {dblp computer science bibliography, http://dblp.org} | ||
} | ||
|
||
@misc{git-for-humans, | ||
Author = "Alice Bartlett", | ||
Title = "Git for Humans", | ||
Institution = "Financial Times, London", | ||
Howpublished = "Talk at UX Brighton", | ||
Year = "2016", | ||
Url = "https://speakerdeck.com/alicebartlett/git-for-humans", | ||
Abstract = "This talk will explore a tool that most developers couldn't live without. We'll look at the way it helps developers tell the story of their project, and how non-technical people can get in on the action too." | ||
} | ||
|
||
@Manual{rmd-pkg, | ||
title = {rmarkdown: Dynamic Documents for R}, | ||
author = {JJ Allaire and Joe Cheng and Yihui Xie and Jonathan McPherson and Winston Chang and Jeff Allen and Hadley Wickham and Aron Atkins and Rob Hyndman and Ruben Arslan}, | ||
year = {2017}, | ||
note = {R package version 1.5.9000}, | ||
url = {http://rmarkdown.rstudio.com}, | ||
} | ||
|
||
@Manual{knitr-pkg, | ||
title = {knitr: A General-Purpose Package for Dynamic Report | ||
Generation in R}, | ||
author = {Yihui Xie}, | ||
year = {2017}, | ||
note = {R package version 1.16}, | ||
url = {http://yihui.name/knitr/}, | ||
} | ||
|
||
|
||
@article{ten-simple-rules-git, | ||
author = {Yasset Perez-Riverol and | ||
Laurent Gatto and | ||
Rui Wang and | ||
Timo Sachsenberg and | ||
Julian Uszkoreit and | ||
Felipe da Veiga Leprevost and | ||
Christian Fufezan and | ||
Tobias Ternent and | ||
Stephen J. Eglen and | ||
Daniel S. Katz and | ||
Tom J. Pollard and | ||
Alexander Konovalov and | ||
Robert M. Flight and | ||
Kai Blin and | ||
Juan Antonio Vizcaíno}, | ||
journal = {PLOS Computational Biology}, | ||
publisher = {Public Library of Science}, | ||
title = {Ten Simple Rules for Taking Advantage of Git and GitHub}, | ||
year = {2016}, | ||
month = jul, | ||
volume = {12}, | ||
url = {https://doi.org/10.1371/journal.pcbi.1004947}, | ||
pages = {1--11}, | ||
abstract = {}, | ||
number = {7}, | ||
doi = {10.1371/journal.pcbi.1004947} | ||
} | ||
|
||
@Manual{bookdown-pkg, | ||
title = {bookdown: Authoring Books and Technical Documents with R Markdown}, | ||
author = {Yihui Xie}, | ||
year = {2016}, | ||
note = {R package version 0.3}, | ||
url = {https://github.com/rstudio/bookdown}, | ||
} | ||
|
||
@Book{bookdown-book, | ||
title = {bookdown: Authoring Books and Technical Documents with {R} Markdown}, | ||
author = {Yihui Xie}, | ||
publisher = {Chapman and Hall/CRC}, | ||
address = {Boca Raton, Florida}, | ||
year = {2017}, | ||
note = {ISBN 978-1138700109}, | ||
url = {https://github.com/rstudio/bookdown}, | ||
} | ||
|
||
@manual{git, | ||
title = {Git}, | ||
url = {https://git-scm.com} | ||
} | ||
|
||
@manual{github, | ||
title = {GitHub}, | ||
url = {https://github.com} | ||
} | ||
|
||
@manual{rstudio, | ||
title = {RStudio Integrated Development Environment}, | ||
url = {https://www.rstudio.com/products/rstudio} | ||
} | ||
|
||
@manual{r, | ||
title = {R: A Language and Environment for Statistical Computing}, | ||
author = {{R Core Team}}, | ||
organization = {R Foundation for Statistical Computing}, | ||
address = {Vienna, Austria}, | ||
year = {2017}, | ||
url = {https://www.R-project.org} | ||
} | ||
|
||
@misc{donoho, | ||
author = {David Donoho}, | ||
title = {50 years of Data Science}, | ||
institution = {Stanford University}, | ||
howpublished = {Version 1.00}, | ||
month = {September}, | ||
year = {2015}, | ||
url = {http://courses.csail.mit.edu/18.337/2015/docs/50YearsDataScience.pdf} | ||
} | ||
|
||
@article{cetinkaya-rundel-dss-2017, | ||
title = {Infrastructure and tools for teaching computing throughout the statistical curriculum}, | ||
author = {Cetinkaya-Rundel, Mine and Rundel, Colin W}, | ||
year = 2017, | ||
month = aug, | ||
keywords = {R markdown, git / github, reproducibility, data science, workflow, R language, Continuous integration, RStudio, teaching, curriculum}, | ||
abstract = { | ||
Modern statistics is fundamentally a computational discipline, but too often this fact is not reflected in our statistics curricula. With the rise of big data and data science it has become increasingly clear that students both want, expect, and need explicit training in this area of the discipline. Additionally, recent curricular guidelines clearly state that working with data requires extensive computing skills and that statistics students should be fluent in accessing, manipulating, analyzing, and modeling with professional statistical analysis software. Much has been written in the statistics education literature about pedagogical tools and approaches to provide a practical computational foundation for students. This article discusses the computational infrastructure and toolkit choices to allow for these pedagogical innovations while minimizing frustration and improving adoption for both our students and instructors. | ||
}, | ||
volume = 5, | ||
pages = {e3181v1}, | ||
journal = {PeerJ Preprints}, | ||
issn = {2167-9843}, | ||
url = {https://doi.org/10.7287/peerj.preprints.3181v1}, | ||
doi = {10.7287/peerj.preprints.3181v1} | ||
} | ||
|
||
@article {fisher, | ||
author = {Fisher, R. A.}, | ||
title = {The Use of Multiple Measurements in Taxonomic Problems}, | ||
journal = {Annals of Eugenics}, | ||
volume = {7}, | ||
number = {2}, | ||
publisher = {Blackwell Publishing Ltd}, | ||
issn = {2050-1439}, | ||
url = {http://dx.doi.org/10.1111/j.1469-1809.1936.tb02137.x}, | ||
doi = {10.1111/j.1469-1809.1936.tb02137.x}, | ||
pages = {179--188}, | ||
year = {1936}, | ||
} | ||
|
||
@article{anderson, | ||
ISSN = {00266493}, | ||
URL = {http://www.jstor.org/stable/2394164}, | ||
author = {Edgar Anderson}, | ||
journal = {Annals of the Missouri Botanical Garden}, | ||
number = {3}, | ||
pages = {457--509}, | ||
publisher = {Missouri Botanical Garden Press}, | ||
title = {The Species Problem in Iris}, | ||
volume = {23}, | ||
year = {1936} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
# Repo, commit, diff, tag {#git-basics} | ||
|
||
## Repos or repositories | ||
|
||
Git is a version control system whose original purpose was to help groups of | ||
developers work collaboratively on big software projects. Git manages the | ||
evolution of a set of files -- called a __repository__ or __repo__ -- in a highly structured way. Historically, these files would have consisted of source code and the instructions for how to build an application from its source. | ||
|
||
Git has been re-purposed by the data science community [@Ram2013; | ||
@git-for-humans; @ten-simple-rules-git]. We use it to manage the motley collection of files that make up typical data analytical projects, which consist of data, figures, reports, and, yes, some source code. | ||
|
||
For new or existing projects, we recommend that you: | ||
|
||
* Dedicate a local directory or folder to it. | ||
* Make it an RStudio Project. *Optional but recommended; obviously only applies to projects involving R and users of RStudio.* | ||
* Make it a Git repository. | ||
|
||
This setup happens once per project and can happen at project inception or at any later point. Chances are your existing projects each already live in a dedicated directory. Making such a directory an RStudio Project and Git repository boils down to allowing those applications to leave notes for themselves in hidden files or directories. The project is still a regular directory on your computer, that you can locate, name, move, and generally interact with as you wish. You don't have to handle it with special gloves! | ||
|
||
The daily workflow is probably not dramatically different from what you do currently. You work in the usual way, writing R scripts or authoring reports in LaTeX or R Markdown. But instead of only *saving* individual files, periodically you make a __commit__, which takes a snapshot of all the files in the entire project. If you have ever versioned a file [by adding your initials or the date](http://www.phdcomics.com/comics/archive.php?comicid=1531), you have effectively made a commit, albeit only for a single file. It is a version that is significant to you and that you might want to inspect or revert to later. Periodically, you push commits to GitHub. This is like sharing a document with colleagues on DropBox or sending it out as an email attachment. By pushing to GitHub, you make your work and all your accumulated progress accessible to others. | ||
|
||
This is a moderate change to your normal, daily workflow. It feels weird at first, but quickly becomes second nature. In [STAT 545](http://stat545.com) students are required to submit all coursework via GitHub, starting in week one. Most have never seen Git before and do not identify as programmers. It is a major topic in class and office hours for the first two weeks. Then we practically never discuss it again. | ||
|
||
## Commits, diffs, and tags | ||
|
||
We now connect the fundamental concepts of Git to the data science workflow: | ||
|
||
* repository | ||
* commit | ||
* diff | ||
|
||
Recall that a repository or repo is just a directory of files that Git manages holistically. A commit functions like a snapshot of all the files in the repo, at a specific moment. Under the hood, that is not exactly how Git implements things. Although mental models don't have to be accurate in order to be useful, in this case it helps to align the two. | ||
|
||
```{r commit-diff-sha-tag, echo = FALSE, out.width = "100%", fig.cap="\\label{fig:commit-diff-sha-tag}Partial commit history for our iris example, highlighting diffs, commit messages, SHAs, and tags."} | ||
knitr::include_graphics("img/commit-diff-sha-tag.png") | ||
``` | ||
|
||
Figure \ref{fig:commit-diff-sha-tag} is a look at a fictional analysis of the iris data, focusing on the evolution of a script, `iris.R`. Consider version A of this file and a modified version, version B. Assume that version A was part of one Git commit and version B was part of the next commit. The set of differences between A and B is called a "diff" and Git users contemplate diffs a lot. Diff inspection is how you re-explain to yourself how version A differs from version B. Diff inspection is not limited to adjacent commits. You can inspect the diffs between any two commits. | ||
|
||
In fact, Git's notion of any specific version of `iris.R` is as an accumulation of diffs. If you go back far enough, you find the commit where the file was created in the first place. Every later version is stored by Git as that initial version, plus all the intervening diffs in the history that affect the file. We'll set these internal details aside now, but understanding the importance of these deltas will make Git's operations less baffling in the long run. | ||
|
||
So, by looking at diffs, it's easy to see how two snapshots differ, but what about the why? | ||
|
||
Every time you make a commit you must also write a short __commit message__. Ideally, this conveys the motivation for the change. Remember, the diff will show the content. When you revisit a project after a break or need to digest recent changes made by a colleague, looking at the __history__, by reading commit messages and skimming through diffs, is an extremely efficient way to get up to speed. Figure \ref{fig:commit-diff-sha-tag} shows the messages associated with the last three commits. | ||
|
||
Every commit needs some sort of nickname, so you can identify it. Git does this automatically, assigning each commit what is called a SHA, a seemingly random string of 40 letters and numbers (it is not, in fact, random but is a SHA-1 checksum hash of the commit). Though you will be exposed to these, you don't have to handle them directly very often and, when you do, usually the first 7 characters suffice. The commit messages in Figure \ref{fig:commit-diff-sha-tag} are prefixed by such truncated SHAs. You can also designate certain snapshots as special with a __tag__, which is a name of your choosing. In a software project, it is typical to tag a release with its version, e.g., "v1.0.3". For a manuscript or analytical project, you might tag the version submitted to a journal or transmitted to external collaborators. Figure \ref{fig:commit-diff-sha-tag} shows a tag, "draft-01", associated with the last commit. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,13 +1,11 @@ | ||
# (PART) Basic Git concepts {-} | ||
# (PART) Git fundamentals {-} | ||
|
||
# Some Git fundamentals {#git-intro .unnumbered} | ||
# Some Git basics {#git-intro .unnumbered} | ||
|
||
We've told you shockingly little about Git so far! To a large extent, this is by design. There are lots of [general Git resources](#resources) out there that do a great job explaining the core concepts and terminology. Happy Git is designed to fill what we see as some gaps: | ||
We've told you shockingly little about Git so far! This is by design. | ||
|
||
* A "batteries included" guide for early success with basic workflows. | ||
* Advice on how to use Git/GitHub in data science, which differs from pure software development in several aspects. | ||
* Special considerations relevant to people who use R and RStudio. | ||
|
||
In live workshops, we also intoduce the most important basic ideas in the context of guided activities. | ||
We find that actual usage, in the course of your work, is the most effective way to build up a useful mental model for Git. In live workshops, we strive to introduce the most important basic ideas in the context of our guided activities. Self-learners can achieve the same by working through the "batteries included" guides in the previous sections. | ||
|
||
This part collects anything we've written about core Git concepts, is a work in progress, and should be used to complement [external Git resources](#resources). | ||
However, building on this early success, now is the perfect time to explicitly define some Git vocabulary. We also want to help you link Git concepts to data science tasks and projects. | ||
|
||
This part collects anything we've written about core Git concepts. It is a work in progress and is conceived as a complement to the many excellent [external resources for Git](#resources), which we have no desire to re-invent. |
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters