From c997375360c76ee25458b5e6abd1ff5ecb781a1c Mon Sep 17 00:00:00 2001 From: SidharthMacherla Date: Tue, 8 Sep 2020 10:45:36 +1200 Subject: [PATCH] v1.2.0 updated. Awaiting CRAN confirmation --- DESCRIPTION | 6 +-- NAMESPACE | 1 + NEWS.md | 10 ++-- R/buildDistr.R | 22 +++++++-- R/buildNum.R | 68 ++++++++++++++++++++++++++ man/buildDistr.Rd | 4 +- man/buildNum.Rd | 44 +++++++++++++++++ vignettes/introduction_to_conjurer.Rmd | 40 +++++++++++---- 8 files changed, 174 insertions(+), 21 deletions(-) create mode 100644 R/buildNum.R create mode 100644 man/buildNum.Rd diff --git a/DESCRIPTION b/DESCRIPTION index 5d53dc1..946604e 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,14 +1,14 @@ Package: conjurer Type: Package Title: A Parametric Method for Generating Synthetic Data -Version: 1.1.1 -Date: 2020-03-22 +Version: 1.2.0 +Date: 2020-09-06 Authors@R: person("Sidharth", "Macherla", email = "msidharthrasik@gmail.com", role = c("aut", "cre"), comment = c(ORCID = "0000-0002-4825-2026")) Description: Builds synthetic data applicable across multiple domains. This package also provides flexibility to control data distribution to make it relevant to many industry examples. Depends: R (>= 2.10) License: MIT + file LICENSE -URL: https://github.com/SidharthMacherla/conjurer +URL: https://www.foyi.co.nz/posts/documentation/documentationconjurer/ BugReports: https://github.com/SidharthMacherla/conjurer/issues Encoding: UTF-8 LazyData: TRUE diff --git a/NAMESPACE b/NAMESPACE index b45d4c5..a0593e2 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -2,6 +2,7 @@ export(buildCust) export(buildNames) +export(buildNum) export(buildPareto) export(buildProd) export(genTrans) diff --git a/NEWS.md b/NEWS.md index db613e6..98b75f0 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,7 +1,9 @@ -# conjurer 1.1.1 (2020-03-21) -## Bug fixes -* Bug: Generating names based on custom training data doesn’t work #22 +#### conjurer 1.2.0 (2020-09-06) +* Added new function buildNum. 
-# conjurer 1.1.0 (2020-03-13) +#### conjurer 1.1.1 (2020-03-21) +##### Bug fixes +* Bug: Generating names based on custom training data doesn’t work #22 +#### conjurer 1.1.0 (2020-03-13) * Added new function buildNames. diff --git a/R/buildDistr.R b/R/buildDistr.R index 5836f2b..4b7a12e 100644 --- a/R/buildDistr.R +++ b/R/buildDistr.R @@ -4,6 +4,7 @@ #' @param en A number. This defines the ending value of the number of data points. #' @param cycles A string. This defines the cyclicality of data distribution. #' @param trend A number. This defines the trend of data distribution i.e if the data has a positive slope or a negative slope. +#' @param n A numeric. This specifies the number of values to be generated. It should be non-zero natural number. This parameter is currently used by the function \code{\link{buildNum}}. #' @details A parametric method is used to build data distribution. The data distribution function uses the formulation of #' \deqn{sin(a*x) + cos(b*x) + c} #' Where, @@ -20,12 +21,14 @@ #' #'Finally, the constant 'c' is the intercept part of the formulation and primarily serves as a way to ensure that the data distribution has a positive 'y' axis component. This value is randomly generated between 2 and 5. #' @return A data frame with data distribution is returned. -buildDistr <- function(st, en, cycles, trend) +buildDistr <- function(st, en, cycles, trend, n) { #handle missing arguments st <- missingArgHandler(st,1) en <- missingArgHandler(en,12) cycles <- missingArgHandler(cycles, "y") + trend <- missingArgHandler(trend, 1) + n <- missingArgHandler(n,100) if(cycles == "y") { @@ -39,14 +42,25 @@ buildDistr <- function(st, en, cycles, trend) { a <- 3 b <- 0.25 + }else if(cycles == "n") #Here, "n" is numeric. 
This is used to generate other distributions (continuous/discrete) + { + #randomize trend value within +- 20% range + coeffs <- seq(from = 0.8, to =1.2, by =0.001) + randomCoeff <- sample(coeffs, 1, replace = T) } #generate intercept as a random int between 2 and 5 c <- sample(2:5,1) - trend <- missingArgHandler(trend, 1) - x <- seq(st,en,by=(en-st)/(en-1)) - if(trend == 1 && cycles != "m") + + if(cycles == "n") + { + x <- seq(from = (pi/2), to = (3*pi/2), by = pi/(n-1)) + y <- sin((randomCoeff*trend)*x) + c + percentY <- (y - min(y))/((max(y)-min(y))) + distr <- ((en-st)*percentY)+st + + }else if(trend == 1 && cycles != "m") { distr <- sin(a*x) + cos(b*x) + c }else if(trend == 1 && cycles == "m") diff --git a/R/buildNum.R b/R/buildNum.R new file mode 100644 index 0000000..0fc3b16 --- /dev/null +++ b/R/buildNum.R @@ -0,0 +1,68 @@ +#' Build Numeric Data +#' @param n A number. This specifies the number of values to be generated. +#' @param st A number. This defines the starting value of the number of data points. +#' @param en A number. This defines the ending value of the number of data points. +#' @param disp A number between \eqn{-(pi/2)} and \eqn{(pi/2)}. This defines the dispersion of the distribution. +#' @param outliers A number. This signifies the presence of outliers. If set to value 1, then outliers are generated randomly. If set to value 0, then no outliers are generated. The presence of outliers is a very common occurrence and hence setting the outliers to 1 is recommended. However, there are instances where outliers are not needed. For example, if the objective of data generation is solely for visualization purposes then outliers may not be needed. +#' @return A dataframe +#' @details This function helps in generating numeric data such as age, height, weight etc. This function could be used along with other functions such as \code{\link{buildCust}} to make it more meaningful. 
The data distribution function uses the formulation of +#' \deqn{sin((r*a)*x) + c} +#' Where, +#' \enumerate{ +#' \item r is the random value such that \eqn{0.8 <= r <= 1.2}. This adds \eqn{+/-} 20\% randomness to the parameter \eqn{a}. +#' \item a is the parameter such that, \eqn{-(pi/2) <= a <= (pi/2)}. +#' \item x is a variable such that, \eqn{(pi/2) <= x <= (3*pi/2)}. +#' \item c is a constant such that \eqn{2 <= c <= 5}. +#' } +#' +#'The key component of this function is \eqn{disp}. This helps in controlling the dispersion of the distribution. Let us assume that one would like to generate age of people in years. Furthermore, let us assume that the range of the age is between 23 and 80. If \eqn{disp = 1}, then the function will generate more data with a negative slope i.e more people with age closer to 23 than 80. If \eqn{disp = -1} is used, then the opposite will be true. However, if one would like to generate data that is visually similar to normal distribution i.e more people in the middle age group and less towards 23 or 80, then \eqn{disp = 0.5} could be used. +#' +#'It is recommended to firstly plot the code and inspect visually to check which distribution is needed. +#' +#' @examples +#' age <- buildNum(n = 10, st = 23, en = 80, disp = 0.5, outliers = 1) +#' plot(age) #visualize the resulting distribution + +#' @export + +buildNum <- function(n, st, en, disp, outliers) +{ + #handle missing arguments + outliers <- missingArgHandler(outliers, 1) + disp <- missingArgHandler(disp, 1) + + #Exception handling for n + if(missing(n)) + { + stop("'n' is missing. Please specify the number of values to generate") + }else if(n < 10) + { + warning("'n' is too small. Recommended 'n' value is atleast 10.") + } + #Ensure that 'n' is a whole number + if((n)%%1 != 0) + { + warning(sprintf("'n' must be a whole number. 
Note that %s is rounded off to %s.", n, round(n))) + n <- round(n) #apply the rounding announced by the warning so that buildDistr receives a whole number + } + + #Exception handling for disp + if(disp < -(pi/2) || disp > (pi/2)) + { + stop("The value of 'disp' must be between -(pi/2) and (pi/2)") + } + + #Build distributions by calling internal function buildDistr + distr <- buildDistr(st = st, en = en, cycles = "n", trend = disp, n = n) + + #Add outliers + if(outliers == 1) + { + distr <- (buildOutliers(distr)) + }else if(outliers == 0) + { + distr <- distr + } + + return(distr) +} diff --git a/man/buildDistr.Rd b/man/buildDistr.Rd index 5016e8f..8c04478 100644 --- a/man/buildDistr.Rd +++ b/man/buildDistr.Rd @@ -4,7 +4,7 @@ \alias{buildDistr} \title{Build Data Distribution} \usage{ -buildDistr(st, en, cycles, trend) +buildDistr(st, en, cycles, trend, n) } \arguments{ \item{st}{A number. This defines the starting value of the number of data points.} @@ -14,6 +14,8 @@ buildDistr(st, en, cycles, trend) \item{cycles}{A string. This defines the cyclicality of data distribution.} \item{trend}{A number. This defines the trend of data distribution i.e if the data has a positive slope or a negative slope.} + +\item{n}{A numeric. This specifies the number of values to be generated. It should be non-zero natural number. This parameter is currently used by the function \code{\link{buildNum}}.} } \value{ A data frame with data distribution is returned. diff --git a/man/buildNum.Rd b/man/buildNum.Rd new file mode 100644 index 0000000..41c49a8 --- /dev/null +++ b/man/buildNum.Rd @@ -0,0 +1,44 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/buildNum.R +\name{buildNum} +\alias{buildNum} +\title{Build Numeric Data} +\usage{ +buildNum(n, st, en, disp, outliers) +} +\arguments{ +\item{n}{A number. This specifies the number of values to be generated.} + +\item{st}{A number. This defines the starting value of the number of data points.} + +\item{en}{A number. 
This defines the ending value of the number of data points.} + +\item{disp}{A number between \eqn{-(pi/2)} and \eqn{(pi/2)}. This defines the dispersion of the distribution.} + +\item{outliers}{A number. This signifies the presence of outliers. If set to value 1, then outliers are generated randomly. If set to value 0, then no outliers are generated. The presence of outliers is a very common occurrence and hence setting the outliers to 1 is recommended. However, there are instances where outliers are not needed. For example, if the objective of data generation is solely for visualization purposes then outliers may not be needed.} +} +\value{ +A dataframe +} +\description{ +Build Numeric Data +} +\details{ +This function helps in generating numeric data such as age, height, weight etc. This function could be used along with other functions such as \code{\link{buildCust}} to make it more meaningful. The data distribution function uses the formulation of +\deqn{sin((r*a)*x) + c} +Where, +\enumerate{ +\item r is the random value such that \eqn{0.8 <= r <= 1.2}. This adds \eqn{+/-} 20\% randomness to the parameter \eqn{a}. +\item a is the parameter such that, \eqn{-(pi/2) <= a <= (pi/2)}. +\item x is a variable such that, \eqn{(pi/2) <= x <= (3*pi/2)}. +\item c is a constant such that \eqn{2 <= c <= 5}. +} + +The key component of this function is \eqn{disp}. This helps in controlling the dispersion of the distribution. Let us assume that one would like to generate age of people in years. Furthermore, let us assume that the range of the age is between 23 and 80. If \eqn{disp = 1}, then the function will generate more data with a negative slope i.e more people with age closer to 23 than 80. If \eqn{disp = -1} is used, then the opposite will be true. However, if one would like to generate data that is visually similar to normal distribution i.e more people in the middle age group and less towards 23 or 80, then \eqn{disp = 0.5} could be used. 
+ +It is recommended to firstly plot the code and inspect visually to check which distribution is needed. +} +\examples{ +age <- buildNum(n = 10, st = 23, en = 80, disp = 0.5, outliers = 1) +plot(age) #visualize the resulting distribution +} diff --git a/vignettes/introduction_to_conjurer.Rmd b/vignettes/introduction_to_conjurer.Rmd index a45953e..6007ee9 100644 --- a/vignettes/introduction_to_conjurer.Rmd +++ b/vignettes/introduction_to_conjurer.Rmd @@ -21,6 +21,7 @@ knitr::opts_chunk$set( ## 1.Overview ### 1.1 Background & Motivation Data science applications need data to prototype and demonstrate to potential clients. For such purposes, using production data is a possibility. However, it is not always feasible due to legal and/or ethical considerations[@SynDataNeed]. This resulted in a need for generating synthetic data. This need is the key motivator for the package **conjurer**. +This package is under constant development and the author would update the documentation regularly at [foyi digital library](https://www.foyi.co.nz/posts/documentation/documentationconjurer/) ### 1.2 Need for package conjurer Data across multiple domains are known to exhibit some form of seasonality, cyclicality and trend. Although there are synthetic data generation packages currently available, they focus primarily on synthetic versions of microdata containing confidential information or for machine learning purposes. There is a need for a more generic synthetic data generation package that helps for multiple purposes such as forecasting, customer segmentation, insight generation etc. This package **conjurer** helps in generating such synthetic data. @@ -61,7 +62,24 @@ customer2name <- cbind(customers, custNames) print(head(customer2name)) ``` -### 2.2 Build products +#### 2.2.3 Build customer age +A list of customer ages for the 100 customer IDs can be generated in the following way. 
+```{r, eval=TRUE, echo=TRUE, results='markup'} +custAge <- as.data.frame(round(buildNum(n = 10, st = 23, en = 80, disp = 0.5, outliers = 1))) + +#set column heading +colnames(custAge) <- c("customerAge") +print(head(custAge)) +``` +#### 2.2.4 Assign customer age to customer ID +Let us assign customer ages to customer IDs. This is a random one to one mapping using the following code. +```{r, eval=TRUE, echo=TRUE, results='markup'} +customer2age <- cbind(customers, custAge) +#set column heading +print(head(customer2age)) +``` + +### 2.3 Build products The next step is building some products. A product is identified by a product ID. Similar to a customer ID, a product ID is also an alphanumeric with prefix "sku" which signifies a stock keeping unit. This prefix is followed by a numeric ranging from 1 and extending to the number of products provided as the argument within the function. For example, if there are 10 products, then the product ID will range from sku01 to sku10. This ensures that the product ID is always of the same length. Besides product ID, the product price range must be specified. Let us build a group of products using the following code. For simplicity, let us assume that there are 10 products and the price range for them is from 5 dollars to 50 dollars. Products are built using the function buildProd. This function takes 3 arguments as given below. @@ -74,7 +92,7 @@ products <- buildProd(numOfProd = 10, minPrice = 5, maxPrice = 50) print(head(products)) ``` -### 2.3 Build transactions +### 2.4 Build transactions Now that a group of customer IDs and Products are built, the next step is to build transactions. Transactions are built using the function genTrans. This function takes 5 arguments. The details of them are as follows. + **cylces**. This represents the cyclicality of data. 
It can take the following values @@ -98,11 +116,11 @@ TxnAggregated <- aggregate(transactions$transactionID, by = list(transactions$da plot(TxnAggregated, type = "l", ann = FALSE) ``` -### 2.4 Build final data +### 2.5 Build final data Bringing customers, products and transactions together is the final step of generating synthetic data. This process entails 3 steps as given below. -#### 2.4.1 Allocate customers to transactions +#### 2.5.1 Allocate customers to transactions The allocation of transactions is achieved with the help of buildPareto function. This function takes 3 arguments as detailed below. + **factor1** and **factor2**. These are factors to be mapped to each other. As the name suggests, they must be of data type factor. @@ -120,7 +138,7 @@ names(customer2transaction) <- c('transactionID', 'customer') print(head(customer2transaction)) ``` -#### 2.4.2 Allocate products to transactions +#### 2.5.2 Allocate products to transactions Now, using similar step as mentioned above, allocate transactions to products using following code. ```{r, eval=TRUE, echo=TRUE, results='markup'} product2transaction <- buildPareto(products$SKU,transactions$transactionID,pareto = c(70,30)) @@ -130,7 +148,7 @@ names(product2transaction) <- c('transactionID', 'SKU') print(head(product2transaction)) ``` -#### 2.4.3 Combine customers and transactions data +#### 2.5.3 Combine customers and transactions data The following code brings together transactions, products and customers into one dataframe. ```{r, eval=TRUE, echo=TRUE, results='markup'} df1 <- merge(x = customer2transaction, y = product2transaction, by = "transactionID") @@ -141,10 +159,14 @@ df2 <- merge(x = df1, y = transactions, by = "transactionID", all.x = TRUE) print(head(df2)) ``` -#### 2.4.3 Final data -We can add additional data such as customer name using the code below. +#### 2.5.4 Final data +We can add additional data such as customer name, product price using the code below. 
```{r, eval=TRUE, echo=TRUE, results='markup'} -dfFinal <- merge(x = df2, y = customer2name, by.x = "customer", by.y = "customers", all.x = TRUE) +df3 <- merge(x = df2, y = customer2name, by.x = "customer", by.y = "customers", all.x = TRUE) +df4 <- merge(x = df3, y = customer2age, by.x = "customer", by.y = "customers", all.x = TRUE) +df5 <- merge(x = df4, y = products, by = "SKU", all.x = TRUE) +dfFinal <- df5[,c("dayNum", "mthNum", "customer", "customerName", "customerAge", "SKU", "Price", "transactionID")] + #inspect the output print(head(dfFinal))