Merge pull request #39 from sailthru/vignette

Adds a vignette and increased tidyjson verb documentation, closes #39 and #38
sailthru · Apr 21, 2015 · eeb1f72 · eeb1f72
2 parents 3a89d85 + d45e964
commit eeb1f72
Show file tree

Hide file tree

Showing 19 changed files with 847 additions and 23 deletions.
diff --git a/.gitignore b/.gitignore
@@ -2,3 +2,4 @@
 .RData
 .Rhistory
 *.swp
+inst/doc
diff --git a/R/append_values.r b/R/append_values.r
@@ -1,11 +1,25 @@
-#' Append keys to a new column
+#' Appends all values with a specified type as a new column
+#' 
+#' The append_values_X functions let you take any remaining JSON and add it as
+#' a column X (for X in "string", "number", "logical") insofar as it is of the
+#' JSON type specified.
+#'
+#' Any values that do not conform to the type specified will be NA in the resulting
+#' column. This includes other scalar types (e.g., numbers or logicals if you are
+#' using append_values_string) and *also* any rows where the JSON is still an
+#' object or an array.
 #' 
 #' @name append_values
 #' @param x a tbl_json object
 #' @param column.name the column.name to append the values into the data.frame
 #'   under
 #' @param force parameter that determines if the variable type should be computed or not
 #'        if force is FALSE, then the function may take more memory
+#' @examples
+#' library(magrittr)  # for %>%
+#' '{"first": "bob", "last": "jones"}' %>% 
+#'   gather_keys() %>%
+#'   append_values_string()
 NULL
 
 #' Creates the append_values_* functions

diff --git a/R/data-worldbank.r b/R/data-worldbank.r
@@ -19,7 +19,7 @@
 #'     name = jstring("project_name"), # Spread name 
 #'     region = jstring("regionname")  # Spread region
 #'   ) %>% 
-#'   enter_object("sector") %>%              # Enter the 'sector' object
+#'   enter_object("majorsector_percent") %>% # Enter the 'majorsector_percent' object
 #'   gather_array("sector.index") %>%        # Gather the array
 #'   spread_values(sector = jstring("Name")) # Spread the sector name
 #' 

diff --git a/R/enter_object.r b/R/enter_object.r
@@ -1,8 +1,27 @@
-#' Selects an object by key and filters rows to just those with matching keys
+#' Dive into a specific object "key"
+#' 
+#' JSON can contain nested objects, such as {"key1": {"key2": [1, 2, 3]}}. The
+#' function enter_object() can be used to access the array nested under "key1"
+#' and "key2". After using enter_object(), all further tidyjson calls happen 
+#' inside the referenced object (all other JSON data outside the object 
+#' is discarded). If the object doesn't exist for a given row / index, then that 
+#' data.frame row will be discarded.
+#' 
+#' This is useful when you want to limit your data to just information found in
+#' a specific key. Use the ... to specify a sequence of keys that you want to
+#' enter into. Keep in mind that any rows with JSON that do not contain the key
+#' will be discarded by this function.
 #' 
 #' @param x a tbl_json object
 #' @param ... path to filter
 #' @export
+#' @examples
+#' library(magrittr)  # for %>%
+#' c('{"name": "bob", "children": ["sally", "george"]}', '{"name": "anne"}') %>%
+#'   spread_values(parent.name = jstring("name")) %>%
+#'   enter_object("children") %>% 
+#'   gather_array %>% 
+#'   append_values_string("children")
 enter_object <- function(x, ...) {
 
   if (!is.tbl_json(x)) x <- as.tbl_json(x)

diff --git a/R/gather_array.r b/R/gather_array.r
@@ -1,10 +1,31 @@
-#' Expands a tbl_json to span the indices of a JSON array
+#' Stack a JSON array
+#'
+#' Given a JSON array, such as [1, 2, 3], gather_array will "stack" the array in 
+#' the tbl_json data.frame, by replicating each row of the data.frame by the
+#' length of the corresponding JSON array. A new column (by default called 
+#' "array.index") will be added to keep track of the referenced position in the
+#' array for each row of the resulting data.frame.
+#' 
+#' JSON can contain arrays of data, which can be simple vectors (fixed or varying 
+#' length integer, character or logical vectors). But they also often contain 
+#' lists of other objects (like a list of purchases for a user). The function 
+#' gather_array() takes JSON arrays and duplicates the rows in the data.frame to 
+#' correspond to the indices of the array, and puts the elements of 
+#' the array into the JSON attribute. This is equivalent to “stacking” the array 
+#' in the data.frame, and lets you continue to manipulate the remaining JSON 
+#' in the elements of the array. For simple arrays, use append_values_* to 
+#' capture all of the values of the array. For more complex arrays (where the
+#' values are themselves objects or arrays), continue using other tidyjson
+#' functions to structure the data as needed.
 #' 
 #' @param x a tbl_json whose JSON attribute should always be an array
 #' @param column.name the name to give to the array index column created
 #' @return a tbl_json with a new column (column.name) that captures the array
 #'   index and JSON attribute extracted from the array
 #' @export
+#' @examples
+#' library(magrittr)  # for %>%  
+#' '[1, "a", {"k": "v"}]' %>% gather_array %>% json_types
 gather_array <- function(x, column.name = "array.index") {
 
   if (!is.tbl_json(x)) x <- as.tbl_json(x)

diff --git a/R/gather_keys.r b/R/gather_keys.r
@@ -1,10 +1,22 @@
-#' Gathers every key from the top level of the json and stacks them 
-#' 
+#' Stack a JSON {"key": value} object
+#'
+#' Given a JSON key value structure, like {"key1": 1, "key2": 2}, the 
+#' gather_keys() function duplicates the rows of the tbl_json data.frame for
+#' every key, adds a new column (default name "key") to capture the key names,
+#' and then dives into the JSON values to enable further manipulation with
+#' downstream tidyjson functions.
+#'
+#' This allows you to *enter into* the keys of the objects just like `gather_array`
+#' lets you enter elements of the array.
+#'
 #' @param x a tbl_json whose JSON attribute should always be an object
 #' @param column.name the name to give to the column of key names created
 #' @return a tbl_json with a new column (column.name) that captures the keys
 #'   and JSON attribute of the associated value data
 #' @export
+#' @examples
+#' library(magrittr)  # for %>% 
+#' '{"name": "bob", "age": 32}' %>% gather_keys %>% json_types
 gather_keys <- function(x, column.name = "key") {
 
   if (!is.tbl_json(x)) x <- as.tbl_json(x)

diff --git a/R/json_lengths.r b/R/json_lengths.r
@@ -1,9 +1,19 @@
-#' Add a column that tells the 'length' of the data in the root of the JSON
+#' Add a column that contains the length of the JSON data
+#' 
+#' When investigating JSON data it can be helpful to identify the lengths of the
+#' JSON objects or arrays, especially when they are 'ragged' across documents. The
+#' json_lengths() function adds a column (default name "length") that contains
+#' the 'length' of the JSON associated with each row. For objects, this will
+#' be equal to the number of keys. For arrays, this will be equal to the length
+#' of the array. All scalar values will be of length 1.
 #' 
 #' @param x a tbl_json object
 #' @param column.name the name to specify for the length column
 #' @return a tbl_json object with column.name column that tells the length
 #' @export
+#' @examples 
+#' library(magrittr)  # for %>% 
+#' c('[1, 2, 3]', '{"k1": 1, "k2": 2}', '1', '{}') %>% json_lengths
 json_lengths <- function(x, column.name = "length") {
 
   if (!is.tbl_json(x)) x <- as.tbl_json(x)

diff --git a/R/json_types.r b/R/json_types.r
@@ -1,9 +1,21 @@
 #' Add a column that tells the 'type' of the data in the root of the JSON
-#' 
+#'
+#' The function json_types() inspects the JSON associated with 
+#' each row of the tbl_json data.frame, and adds a new column ("type" by 
+#' default) that identifies the type according to the 
+#' JSON standard at http://json.org/.
+#'
+#' This is particularly useful for inspecting your JSON data types, and can be added
+#' after gather_array() (or gather_keys()) to inspect the types of the elements
+#' (or values) in arrays (or objects).
+#'
 #' @param x a tbl_json object
 #' @param column.name the name to specify for the type column
 #' @return a tbl_json object with column.name column that tells the type
 #' @export
+#' @examples 
+#' library(magrittr)  # for %>%
+#' c('{"a": 1}', '[1, 2]', '"a"', '1', 'true', 'null') %>% json_types
 json_types <- function(x, column.name = "type") {
 
   if (!is.tbl_json(x)) x <- as.tbl_json(x)

diff --git a/R/spread_values.r b/R/spread_values.r
@@ -1,9 +1,23 @@
-#' Extracts values from JSON refereced by a sequence of keys
+#' Create new columns with JSON values
+#' 
+#' The spread_values() function lets you dive into (potentially nested) JSON 
+#' objects and extract specific values. spread_values() takes jstring(),
+#' jnumber() or jlogical() named function calls as arguments in order to specify
+#' the type of the data that should be captured at each desired key location.
+#' These values can be of varying types at varying depths.
+#' 
 #' @param x tbl_json object
 #' @param ... column=value list where 'column' will be the column name created
 #'   and 'value' must be a call to jstring(), jnumber() or jlogical() specifying
 #'   the path to get the value (and the type implicit in the function name) 
 #' @export
+#' @examples 
+#' library(magrittr)  # for %>%
+#' '{"name": {"first": "bob", "last": "jones"}, "age": 32}' %>%
+#'   spread_values(
+#'     first.name = jstring("name", "first"), 
+#'     age = jnumber("age")
+#'   )
 spread_values <- function(x, ...) {
 
   if (!is.tbl_json(x)) x <- as.tbl_json(x)

diff --git a/data/worldbank.rda b/data/worldbank.rda
diff --git a/man/append_values.Rd b/man/append_values.Rd
@@ -5,7 +5,7 @@
 \alias{append_values_logical}
 \alias{append_values_number}
 \alias{append_values_string}
-\title{Append keys to a new column}
+\title{Appends all values with a specified type as a new column}
 \usage{
 append_values_string(x, column.name = type, force = TRUE)
 
@@ -23,6 +23,20 @@ under}
 if force is FALSE, then the function may take more memory}
 }
 \description{
-Append keys to a new column
+The append_values_X functions let you take any remaining JSON and add it as
+a column X (for X in "string", "number", "logical") insofar as it is of the
+JSON type specified.
+}
+\details{
+Any values that do not conform to the type specified will be NA in the resulting
+column. This includes other scalar types (e.g., numbers or logicals if you are
+using append_values_string) and *also* any rows where the JSON is still an
+object or an array.
+}
+\examples{
+library(magrittr)  # for \%>\%
+'{"first": "bob", "last": "jones"}' \%>\%
+  gather_keys() \%>\%
+  append_values_string()
 }
 
diff --git a/man/enter_object.Rd b/man/enter_object.Rd
@@ -2,7 +2,7 @@
 % Please edit documentation in R/enter_object.r
 \name{enter_object}
 \alias{enter_object}
-\title{Selects an object by key and filters rows to just those with matching keys}
+\title{Dive into a specific object "key"}
 \usage{
 enter_object(x, ...)
 }
@@ -12,6 +12,25 @@ enter_object(x, ...)
 \item{...}{path to filter}
 }
 \description{
-Selects an object by key and filters rows to just those with matching keys
+JSON can contain nested objects, such as {"key1": {"key2": [1, 2, 3]}}. The
+function enter_object() can be used to access the array nested under "key1"
+and "key2". After using enter_object(), all further tidyjson calls happen
+inside the referenced object (all other JSON data outside the object
+is discarded). If the object doesn't exist for a given row / index, then that
+data.frame row will be discarded.
+}
+\details{
+This is useful when you want to limit your data to just information found in
+a specific key. Use the ... to specify a sequence of keys that you want to
+enter into. Keep in mind that any rows with JSON that do not contain the key
+will be discarded by this function.
+}
+\examples{
+library(magrittr)  # for \%>\%
+c('{"name": "bob", "children": ["sally", "george"]}', '{"name": "anne"}') \%>\%
+  spread_values(parent.name = jstring("name")) \%>\%
+  enter_object("children") \%>\%
+  gather_array \%>\%
+  append_values_string("children")
 }
 
diff --git a/man/gather_array.Rd b/man/gather_array.Rd
@@ -2,7 +2,7 @@
 % Please edit documentation in R/gather_array.r
 \name{gather_array}
 \alias{gather_array}
-\title{Expands a tbl_json to span the indices of a JSON array}
+\title{Stack a JSON array}
 \usage{
 gather_array(x, column.name = "array.index")
 }
@@ -16,6 +16,27 @@ a tbl_json with a new column (column.name) that captures the array
   index and JSON attribute extracted from the array
 }
 \description{
-Expands a tbl_json to span the indices of a JSON array
+Given a JSON array, such as [1, 2, 3], gather_array will "stack" the array in
+the tbl_json data.frame, by replicating each row of the data.frame by the
+length of the corresponding JSON array. A new column (by default called
+"array.index") will be added to keep track of the referenced position in the
+array for each row of the resulting data.frame.
+}
+\details{
+JSON can contain arrays of data, which can be simple vectors (fixed or varying
+length integer, character or logical vectors). But they also often contain
+lists of other objects (like a list of purchases for a user). The function
+gather_array() takes JSON arrays and duplicates the rows in the data.frame to
+correspond to the indices of the array, and puts the elements of
+the array into the JSON attribute. This is equivalent to “stacking” the array
+in the data.frame, and lets you continue to manipulate the remaining JSON
+in the elements of the array. For simple arrays, use append_values_* to
+capture all of the values of the array. For more complex arrays (where the
+values are themselves objects or arrays), continue using other tidyjson
+functions to structure the data as needed.
+}
+\examples{
+library(magrittr)  # for \%>\%
+'[1, "a", {"k": "v"}]' \%>\% gather_array \%>\% json_types
 }
 
diff --git a/man/gather_keys.Rd b/man/gather_keys.Rd
@@ -2,7 +2,7 @@
 % Please edit documentation in R/gather_keys.r
 \name{gather_keys}
 \alias{gather_keys}
-\title{Gathers every key from the top level of the json and stacks them}
+\title{Stack a JSON {"key": value} object}
 \usage{
 gather_keys(x, column.name = "key")
 }
@@ -16,6 +16,18 @@ a tbl_json with a new column (column.name) that captures the keys
   and JSON attribute of the associated value data
 }
 \description{
-Gathers every key from the top level of the json and stacks them
+Given a JSON key value structure, like {"key1": 1, "key2": 2}, the
+gather_keys() function duplicates the rows of the tbl_json data.frame for
+every key, adds a new column (default name "key") to capture the key names,
+and then dives into the JSON values to enable further manipulation with
+downstream tidyjson functions.
+}
+\details{
+This allows you to *enter into* the keys of the objects just like `gather_array`
+lets you enter elements of the array.
+}
+\examples{
+library(magrittr)  # for \%>\%
+'{"name": "bob", "age": 32}' \%>\% gather_keys \%>\% json_types
 }
 
diff --git a/man/json_lengths.Rd b/man/json_lengths.Rd
@@ -2,7 +2,7 @@
 % Please edit documentation in R/json_lengths.r
 \name{json_lengths}
 \alias{json_lengths}
-\title{Add a column that tells the 'length' of the data in the root of the JSON}
+\title{Add a column that contains the length of the JSON data}
 \usage{
 json_lengths(x, column.name = "length")
 }
@@ -15,6 +15,15 @@ json_lengths(x, column.name = "length")
 a tbl_json object with column.name column that tells the length
 }
 \description{
-Add a column that tells the 'length' of the data in the root of the JSON
+When investigating JSON data it can be helpful to identify the lengths of the
+JSON objects or arrays, especially when they are 'ragged' across documents. The
+json_lengths() function adds a column (default name "length") that contains
+the 'length' of the JSON associated with each row. For objects, this will
+be equal to the number of keys. For arrays, this will be equal to the length
+of the array. All scalar values will be of length 1.
+}
+\examples{
+library(magrittr)  # for \%>\%
+c('[1, 2, 3]', '{"k1": 1, "k2": 2}', '1', '{}') \%>\% json_lengths
 }
 
diff --git a/man/json_types.Rd b/man/json_types.Rd
@@ -15,6 +15,18 @@ json_types(x, column.name = "type")
 a tbl_json object with column.name column that tells the type
 }
 \description{
-Add a column that tells the 'type' of the data in the root of the JSON
+The function json_types() inspects the JSON associated with
+each row of the tbl_json data.frame, and adds a new column ("type" by
+default) that identifies the type according to the
+JSON standard at http://json.org/.
+}
+\details{
+This is particularly useful for inspecting your JSON data types, and can be added
+after gather_array() (or gather_keys()) to inspect the types of the elements
+(or values) in arrays (or objects).
+}
+\examples{
+library(magrittr)  # for \%>\%
+c('{"a": 1}', '[1, 2]', '"a"', '1', 'true', 'null') \%>\% json_types
 }
-Original file line number
+Diff line change
@@ Expand Up / @@ -2,3 +2,4 @@ @@
     .RData
     .Rhistory
     *.swp
+    inst/doc