Skip to content

Commit

Permalink
Starting porting content from "Excuse Me"
Browse files Browse the repository at this point in the history
  • Loading branch information
jennybc committed Jan 8, 2019
1 parent 025b639 commit ef8e2e2
Show file tree
Hide file tree
Showing 6 changed files with 244 additions and 16 deletions.
2 changes: 1 addition & 1 deletion _bookdown.yml
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ rmd_files: [
"usage-r-script-and-github.Rmd",

"git-intro.Rmd",
## TODO: commit, diff
"git-basics.Rmd",
"git-commands.Rmd",
"git-branches.Rmd",
"git-remotes.Rmd",
Expand Down
194 changes: 189 additions & 5 deletions book.bib
Original file line number Diff line number Diff line change
@@ -1,10 +1,194 @@
% Book: Yihui Xie, Dynamic Documents with R and knitr (2nd edition, 2015).
@Book{knitr-book,
  title     = {Dynamic Documents with {R} and knitr},
  author    = {Yihui Xie},
  publisher = {Chapman and Hall/CRC},
  address   = {Boca Raton, Florida},
  year      = {2015},
  edition   = {2nd},
  note      = {ISBN 978-1498716963},
  url       = {http://yihui.name/knitr/},
}

% Journal article: Ram (2013) on Git for reproducibility and transparency in science.
@article{Ram2013,
  author   = {Ram, Karthik},
  title    = {Git can facilitate greater reproducibility and increased transparency in science},
  journal  = {Source Code for Biology and Medicine},
  year     = {2013},
  volume   = {8},
  number   = {1},
  pages    = {7},
  issn     = {1751-0473},
  doi      = {10.1186/1751-0473-8-7},
  url      = {http://dx.doi.org/10.1186/1751-0473-8-7},
  abstract = {Reproducibility is the hallmark of good science. Maintaining a high degree of transparency in scientific reporting is essential not just for gaining trust and credibility within the scientific community but also for facilitating the development of new ideas. Sharing data and computer code associated with publications is becoming increasingly common, motivated partly in response to data deposition requirements from journals and mandates from funders. Despite this increase in transparency, it is still difficult to reproduce or build upon the findings of most scientific publications without access to a more complete workflow.}
}

% Article (DBLP export of the arXiv record): Good Enough Practices in Scientific Computing.
@article{good-enough,
  author    = {Wilson, Greg and Bryan, Jennifer and Cranston, Karen and Kitzes, Justin and Nederbragt, Lex and Teal, Tracy K.},
  title     = {Good Enough Practices in Scientific Computing},
  journal   = {CoRR},
  volume    = {abs/1609.00037},
  year      = {2016},
  url       = {http://arxiv.org/abs/1609.00037},
  timestamp = {Mon, 03 Oct 2016 17:51:10 +0200},
  biburl    = {http://dblp.uni-trier.de/rec/bib/journals/corr/WilsonBCKNT16},
  bibsource = {dblp computer science bibliography, http://dblp.org}
}

% Talk: Alice Bartlett, Git for Humans (UX Brighton, 2016).
@misc{git-for-humans,
  author       = {Alice Bartlett},
  title        = {Git for Humans},
  institution  = {Financial Times, London},
  howpublished = {Talk at UX Brighton},
  year         = {2016},
  url          = {https://speakerdeck.com/alicebartlett/git-for-humans},
  abstract     = {This talk will explore a tool that most developers couldn't live without. We'll look at the way it helps developers tell the story of their project, and how non-technical people can get in on the action too.}
}

% Software manual: the rmarkdown R package.
% The language name R is brace-protected so sentence-casing styles keep it upper case.
@Manual{rmd-pkg,
  title  = {rmarkdown: Dynamic Documents for {R}},
  author = {JJ Allaire and Joe Cheng and Yihui Xie and Jonathan McPherson and Winston Chang and Jeff Allen and Hadley Wickham and Aron Atkins and Rob Hyndman and Ruben Arslan},
  year   = {2017},
  note   = {R package version 1.5.9000},
  url    = {http://rmarkdown.rstudio.com},
}

% Software manual: the knitr R package.
% The language name R is brace-protected so sentence-casing styles keep it upper case.
@Manual{knitr-pkg,
  title  = {knitr: A General-Purpose Package for Dynamic Report Generation in {R}},
  author = {Yihui Xie},
  year   = {2017},
  note   = {R package version 1.16},
  url    = {http://yihui.name/knitr/},
}


% Journal article: Ten Simple Rules for Taking Advantage of Git and GitHub (PLOS Comput Biol, 2016).
% Product names are brace-protected; month uses the standard macro; pages use an en-dash range.
@article{ten-simple-rules-git,
  author    = {Yasset Perez-Riverol and
               Laurent Gatto and
               Rui Wang and
               Timo Sachsenberg and
               Julian Uszkoreit and
               Felipe da Veiga Leprevost and
               Christian Fufezan and
               Tobias Ternent and
               Stephen J. Eglen and
               Daniel S. Katz and
               Tom J. Pollard and
               Alexander Konovalov and
               Robert M. Flight and
               Kai Blin and
               Juan Antonio Vizcaíno},
  title     = {Ten Simple Rules for Taking Advantage of {Git} and {GitHub}},
  journal   = {PLOS Computational Biology},
  publisher = {Public Library of Science},
  year      = {2016},
  month     = jul,
  volume    = {12},
  number    = {7},
  pages     = {1--11},
  doi       = {10.1371/journal.pcbi.1004947},
  url       = {https://doi.org/10.1371/journal.pcbi.1004947}
}

% Software manual: the bookdown R package.
% The language name R is brace-protected so sentence-casing styles keep it upper case.
@Manual{bookdown-pkg,
  title  = {bookdown: Authoring Books and Technical Documents with {R} Markdown},
  author = {Yihui Xie},
  year   = {2016},
  note   = {R package version 0.3},
  url    = {https://github.com/rstudio/bookdown},
}

% Book: Yihui Xie, bookdown: Authoring Books and Technical Documents with R Markdown (2017).
@book{bookdown-book,
  author    = {Yihui Xie},
  title     = {bookdown: Authoring Books and Technical Documents with {R} Markdown},
  publisher = {Chapman and Hall/CRC},
  address   = {Boca Raton, Florida},
  year      = {2017},
  note      = {ISBN 978-1138700109},
  url       = {https://github.com/rstudio/bookdown},
}

% Software: Git (project home page only).
@manual{git,
  title = {Git},
  url   = {https://git-scm.com},
}

% Service: GitHub (home page only).
% The name is brace-protected so sentence-casing styles do not render it as "Github".
@manual{github,
  title = {{GitHub}},
  url   = {https://github.com}
}

% Software: the RStudio IDE (product page only).
% The product name is brace-protected so sentence-casing styles do not render it as "Rstudio".
@manual{rstudio,
  title = {{RStudio} Integrated Development Environment},
  url   = {https://www.rstudio.com/products/rstudio}
}

% Software manual: the R language, with the R Core Team as corporate author.
@manual{r,
  author       = {{R Core Team}},
  title        = {R: A Language and Environment for Statistical Computing},
  organization = {R Foundation for Statistical Computing},
  address      = {Vienna, Austria},
  year         = {2017},
  url          = {https://www.R-project.org},
}

% Essay: David Donoho, 50 years of Data Science (Version 1.00, September 2015).
% Only the PDF link is kept as url; the edition, ISBN note and knitr url
% previously present belonged to the knitr book entry, not to this essay.
@misc{donoho,
  author       = {David Donoho},
  title        = {50 years of Data Science},
  institution  = {Stanford University},
  howpublished = {Version 1.00},
  month        = sep,
  year         = {2015},
  url          = {http://courses.csail.mit.edu/18.337/2015/docs/50YearsDataScience.pdf}
}

% Preprint: Cetinkaya-Rundel and Rundel (2017), PeerJ Preprints, on teaching infrastructure for statistical computing.
@article{cetinkaya-rundel-dss-2017,
  title    = {Infrastructure and tools for teaching computing throughout the statistical curriculum},
  author   = {Cetinkaya-Rundel, Mine and Rundel, Colin W},
  year     = 2017,
  month    = aug,
  keywords = {R markdown, git / github, reproducibility, data science, workflow, R language, Continuous integration, RStudio, teaching, curriculum},
  abstract = {Modern statistics is fundamentally a computational discipline, but too often this fact is not reflected in our statistics curricula. With the rise of big data and data science it has become increasingly clear that students both want, expect, and need explicit training in this area of the discipline. Additionally, recent curricular guidelines clearly state that working with data requires extensive computing skills and that statistics students should be fluent in accessing, manipulating, analyzing, and modeling with professional statistical analysis software. Much has been written in the statistics education literature about pedagogical tools and approaches to provide a practical computational foundation for students. This article discusses the computational infrastructure and toolkit choices to allow for these pedagogical innovations while minimizing frustration and improving adoption for both our students and instructors.},
  volume   = 5,
  pages    = {e3181v1},
  journal  = {PeerJ Preprints},
  issn     = {2167-9843},
  url      = {https://doi.org/10.7287/peerj.preprints.3181v1},
  doi      = {10.7287/peerj.preprints.3181v1}
}

% Journal article: Fisher (1936), the paper that popularised the iris data.
% Author and title are stored in normal case rather than the publisher's all-caps export,
% so that bibliography styles can apply their own casing.
@article{fisher,
  author    = {Fisher, R. A.},
  title     = {The Use of Multiple Measurements in Taxonomic Problems},
  journal   = {Annals of Eugenics},
  volume    = {7},
  number    = {2},
  publisher = {Blackwell Publishing Ltd},
  issn      = {2050-1439},
  url       = {http://dx.doi.org/10.1111/j.1469-1809.1936.tb02137.x},
  doi       = {10.1111/j.1469-1809.1936.tb02137.x},
  pages     = {179--188},
  year      = {1936},
}

% Journal article: Anderson (1936), The Species Problem in Iris.
% The genus name is brace-protected against down-casing; pages use an en-dash range.
@article{anderson,
  author    = {Edgar Anderson},
  title     = {The Species Problem in {Iris}},
  journal   = {Annals of the Missouri Botanical Garden},
  publisher = {Missouri Botanical Garden Press},
  volume    = {23},
  number    = {3},
  pages     = {457--509},
  year      = {1936},
  issn      = {00266493},
  url       = {http://www.jstor.org/stable/2394164}
}
46 changes: 46 additions & 0 deletions git-basics.Rmd
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# Repo, commit, diff, tag {#git-basics}

## Repos or repositories

Git is a version control system whose original purpose was to help groups of
developers work collaboratively on big software projects. Git manages the
evolution of a set of files -- called a __repository__ or __repo__ -- in a highly structured way. Historically, these files would have consisted of source code and the instructions for how to build an application from its source.

Git has been re-purposed by the data science community [@Ram2013;
@git-for-humans; @ten-simple-rules-git]. We use it to manage the motley collection of files that make up typical data analytical projects, which consist of data, figures, reports, and, yes, some source code.

For new or existing projects, we recommend that you:

* Dedicate a local directory or folder to it.
* Make it an RStudio Project. *Optional but recommended; obviously only applies to projects involving R and users of RStudio.*
* Make it a Git repository.

This setup happens once per project and can happen at project inception or at any later point. Chances are your existing projects each already live in a dedicated directory. Making such a directory an RStudio Project and Git repository boils down to allowing those applications to leave notes for themselves in hidden files or directories. The project is still a regular directory on your computer that you can locate, name, move, and generally interact with as you wish. You don't have to handle it with special gloves!

The daily workflow is probably not dramatically different from what you do currently. You work in the usual way, writing R scripts or authoring reports in LaTeX or R Markdown. But instead of only *saving* individual files, periodically you make a __commit__, which takes a snapshot of all the files in the entire project. If you have ever versioned a file [by adding your initials or the date](http://www.phdcomics.com/comics/archive.php?comicid=1531), you have effectively made a commit, albeit only for a single file. It is a version that is significant to you and that you might want to inspect or revert to later. Periodically, you push commits to GitHub. This is like sharing a document with colleagues on Dropbox or sending it out as an email attachment. By pushing to GitHub, you make your work and all your accumulated progress accessible to others.

This is a moderate change to your normal, daily workflow. It feels weird at first, but quickly becomes second nature. In [STAT 545](http://stat545.com) students are required to submit all coursework via GitHub, starting in week one. Most have never seen Git before and do not identify as programmers. It is a major topic in class and office hours for the first two weeks. Then we practically never discuss it again.

## Commits, diffs, and tags

We now connect the fundamental concepts of Git to the data science workflow:

* repository
* commit
* diff

Recall that a repository or repo is just a directory of files that Git manages holistically. A commit functions like a snapshot of all the files in the repo, at a specific moment. Under the hood, that is not exactly how Git implements things. Although mental models don't have to be accurate in order to be useful, in this case it helps to align the two.

```{r commit-diff-sha-tag, echo = FALSE, out.width = "100%", fig.cap = "Partial commit history for our iris example, highlighting diffs, commit messages, SHAs, and tags."}
# bookdown derives the figure label fig:commit-diff-sha-tag from this chunk's
# label, so the caption carries no hand-written LaTeX label (which would be
# duplicated in PDF output and is not understood by HTML output).
knitr::include_graphics("img/commit-diff-sha-tag.png")
```

Figure \@ref(fig:commit-diff-sha-tag) is a look at a fictional analysis of the iris data, focusing on the evolution of a script, `iris.R`. Consider version A of this file and a modified version, version B. Assume that version A was part of one Git commit and version B was part of the next commit. The set of differences between A and B is called a "diff" and Git users contemplate diffs a lot. Diff inspection is how you re-explain to yourself how version A differs from version B. Diff inspection is not limited to adjacent commits. You can inspect the diffs between any two commits.

In fact, Git's notion of any specific version of `iris.R` is as an accumulation of diffs. If you go back far enough, you find the commit where the file was created in the first place. Every later version is stored by Git as that initial version, plus all the intervening diffs in the history that affect the file. We'll set these internal details aside now, but understanding the importance of these deltas will make Git's operations less baffling in the long run.

So, by looking at diffs, it's easy to see how two snapshots differ, but what about the why?

Every time you make a commit you must also write a short __commit message__. Ideally, this conveys the motivation for the change. Remember, the diff will show the content. When you revisit a project after a break or need to digest recent changes made by a colleague, looking at the __history__, by reading commit messages and skimming through diffs, is an extremely efficient way to get up to speed. Figure \@ref(fig:commit-diff-sha-tag) shows the messages associated with the last three commits.

Every commit needs some sort of nickname, so you can identify it. Git does this automatically, assigning each commit what is called a SHA, a seemingly random string of 40 letters and numbers (it is not, in fact, random but is a SHA-1 checksum hash of the commit). Though you will be exposed to these, you don't have to handle them directly very often and, when you do, usually the first 7 characters suffice. The commit messages in Figure \@ref(fig:commit-diff-sha-tag) are prefixed by such truncated SHAs. You can also designate certain snapshots as special with a __tag__, which is a name of your choosing. In a software project, it is typical to tag a release with its version, e.g., "v1.0.3". For a manuscript or analytical project, you might tag the version submitted to a journal or transmitted to external collaborators. Figure \@ref(fig:commit-diff-sha-tag) shows a tag, "draft-01", associated with the last commit.
16 changes: 7 additions & 9 deletions git-intro.Rmd
Original file line number Diff line number Diff line change
@@ -1,13 +1,11 @@
# (PART) Basic Git concepts {-}
# (PART) Git fundamentals {-}

# Some Git fundamentals {#git-intro .unnumbered}
# Some Git basics {#git-intro .unnumbered}

We've told you shockingly little about Git so far! To a large extent, this is by design. There are lots of [general Git resources](#resources) out there that do a great job explaining the core concepts and terminology. Happy Git is designed to fill what we see as some gaps:
We've told you shockingly little about Git so far! This is by design.

* A "batteries included" guide for early success with basic workflows.
* Advice on how to use Git/GitHub in data science, which differs from pure software development in several aspects.
* Special considerations relevant to people who use R and RStudio.

In live workshops, we also intoduce the most important basic ideas in the context of guided activities.
We find that actual usage, in the course of your work, is the most effective way to build up a useful mental model for Git. In live workshops, we strive to introduce the most important basic ideas in the context of our guided activities. Self-learners can achieve the same by working through the "batteries included" guides in the previous sections.

This part collects anything we've written about core Git concepts, is a work in progress, and should be used to complement [external Git resources](#resources).
However, building on this early success, now is the perfect time to explicitly define some Git vocabulary. We also want to help you link Git concepts to data science tasks and projects.

This part collects anything we've written about core Git concepts. It is a work in progress and is conceived as a complement to the many excellent [external resources for Git](#resources), which we have no desire to re-invent.
Binary file added img/commit-diff-sha-tag.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
2 changes: 1 addition & 1 deletion notes-bookdown-cheat-sheet.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ knitr::kable(
)
```

You can write citations, too. For example, we are using the **bookdown** package [@R-bookdown] in this sample book, which was built on top of R Markdown and **knitr** [@xie2015].
You can write citations, too. For example, we are using the **bookdown** package [@R-bookdown] in this sample book, which was built on top of R Markdown and **knitr** [@knitr-book].

## How the square bracket links work

Expand Down

0 comments on commit ef8e2e2

Please sign in to comment.