Merge pull request #26 from SidharthMacherla/develop_v1.2.0

v1.2.0 updated.
SidharthMacherla · Sep 8, 2020 · 2c507f2 · 2c507f2
2 parents c02ec4c + 5f263fd
commit 2c507f2
Show file tree

Hide file tree

Showing 8 changed files with 168 additions and 24 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,14 +1,14 @@
 Package: conjurer
 Type: Package
 Title: A Parametric Method for Generating Synthetic Data
-Version: 1.1.1
-Date: 2020-03-22
+Version: 1.2.0
+Date: 2020-09-06
 Authors@R: person("Sidharth", "Macherla", email = "[email protected]",
                   role = c("aut", "cre"), comment = c(ORCID = "0000-0002-4825-2026"))
 Description: Builds synthetic data applicable across multiple domains. This package also provides flexibility to control data distribution to make it relevant to many industry examples.
 Depends: R (>= 2.10)
 License: MIT + file LICENSE
-URL: https://github.com/SidharthMacherla/conjurer
+URL: https://www.foyi.co.nz/posts/documentation/documentationconjurer/
 BugReports: https://github.com/SidharthMacherla/conjurer/issues
 Encoding: UTF-8
 LazyData: TRUE

diff --git a/NAMESPACE b/NAMESPACE
@@ -2,6 +2,7 @@
 
 export(buildCust)
 export(buildNames)
+export(buildNum)
 export(buildPareto)
 export(buildProd)
 export(genTrans)
diff --git a/NEWS.md b/NEWS.md
@@ -1,7 +0,0 @@
-## conjurer 1.1.1 (2020-03-21)
-### Bug fixes
-  * Bug: Generating names based on custom training data doesn’t work #22
-
-## conjurer 1.1.0 (2020-03-13)
-
-  * Added new function buildNames.

diff --git a/R/buildDistr.R b/R/buildDistr.R
@@ -4,6 +4,7 @@
 #' @param en A number. This defines the ending value of the number of data points.
 #' @param cycles A string. This defines the cyclicality of data distribution.
 #' @param trend A number. This defines the trend of data distribution i.e if the data has a positive slope or a negative slope.
+#' @param n A numeric. This specifies the number of values to be generated. It should be a non-zero natural number. This parameter is currently used by the function \code{\link{buildNum}}.
 #' @details A parametric method is used to build data distribution. The data distribution function uses the formulation of
 #' \deqn{sin(a*x) + cos(b*x) + c}
 #' Where,
@@ -20,12 +21,14 @@
 #'
 #'Finally, the constant 'c' is the intercept part of the formulation and primarily serves as a way to ensure that the data distribution has a positive 'y' axis component. This value is randomly generated between 2 and 5.
 #' @return A data frame with data distribution is returned.
-buildDistr <- function(st, en, cycles, trend)
+buildDistr <- function(st, en, cycles, trend, n)
 {
   #handle missing arguments
   st <- missingArgHandler(st,1)
   en <- missingArgHandler(en,12)
   cycles <- missingArgHandler(cycles, "y")
+  trend <- missingArgHandler(trend, 1)
+  n <- missingArgHandler(n,100)
 
   if(cycles == "y")
   {
@@ -39,14 +42,25 @@ buildDistr <- function(st, en, cycles, trend)
   {
     a <- 3
     b <- 0.25
+  }else if(cycles == "n") #Here, "n" is numeric. This is used to generate other distributions (continuous/discrete)
+  {
+    #randomize trend value within +- 20% range
+    coeffs <- seq(from = 0.8, to =1.2, by =0.001)
+    randomCoeff <- sample(coeffs, 1, replace = T)
   }
 
   #generate intercept as a random int between 2 and 5
   c <- sample(2:5,1)
-  trend <- missingArgHandler(trend, 1)
-
   x <- seq(st,en,by=(en-st)/(en-1))
-  if(trend == 1 && cycles != "m")
+
+  if(cycles == "n")
+  {
+    x <- seq(from = (pi/2), to = (3*pi/2), by = pi/(n-1))
+    y <- sin((randomCoeff*trend)*x) + c
+    percentY <- (y - min(y))/((max(y)-min(y)))
+    distr <- ((en-st)*percentY)+st
+
+  }else if(trend == 1 && cycles != "m")
   {
     distr <- sin(a*x) + cos(b*x) + c
   }else if(trend == 1 && cycles == "m")

diff --git a/R/buildNum.R b/R/buildNum.R
@@ -0,0 +1,68 @@
+#' Build Numeric Data
+#' @param n A number. This specifies the number of values to be generated. It should be a whole number; a fractional value triggers a warning and is rounded off. A value of at least 10 is recommended.
+#' @param st A number. This defines the starting value of the number of data points.
+#' @param en A number. This defines the ending value of the number of data points.
+#' @param disp A number between \eqn{-(pi/2)} and \eqn{(pi/2)}. This defines the dispersion of the distribution.
+#' @param outliers A number. This signifies the presence of outliers. If set to value 1, then outliers are generated randomly. If set to value 0, then no outliers are generated. The presence of outliers is a very common occurrence and hence setting the outliers to 1 is recommended. However, there are instances where outliers are not needed. For example, if the objective of data generation is solely for visualization purposes then outliers may not be needed.
+#' @return A dataframe
+#' @details This function helps in generating numeric data such as age, height, weight etc. This function could be used along with other functions such as \code{\link{buildCust}} to make it more meaningful. The data distribution function uses the formulation of
+#' \deqn{sin((r*a)*x) + c}
+#' Where,
+#' \enumerate{
+#' \item r is the random value such that \eqn{0.8 <= r <= 1.2}. This adds \eqn{+/-} 20\% randomness to the parameter \eqn{a}.
+#' \item a is the parameter such that, \eqn{-(pi/2) <= a <= (pi/2)}.
+#' \item x is a variable such that, \eqn{(pi/2) <= x <= (3*pi/2)}.
+#' \item c is a constant such that \eqn{2 <= c <= 5}.
+#' }
+#'
+#'The key component of this function is \eqn{disp}. This helps in controlling the dispersion of the distribution. Let us assume that one would like to generate age of people in years. Furthermore, let us assume that the range of the age is between 23 and 80. If \eqn{disp = 1}, then the function will generate more data with a negative slope i.e more people with age closer to 23 than 80. If \eqn{disp = -1} is used, then the opposite will be true. However, if one would like to generate data that is visually similar to normal distribution i.e more people in the middle age group and less towards 23 or 80, then \eqn{disp = 0.5} could be used.
+#'
+#'It is recommended to first plot the generated data and inspect it visually to check which value of \eqn{disp} gives the distribution that is needed.
+#'
+#' @examples
+#' age <- buildNum(n = 10, st = 23, en = 80, disp = 0.5, outliers = 1)
+#' plot(age) #visualize the resulting distribution
+#' @export
+buildNum <- function(n, st, en, disp, outliers)
+{
+  #handle missing arguments
+  #(missingArgHandler is an internal helper; presumably it substitutes the given default when the argument is missing)
+  outliers <- missingArgHandler(outliers, 1)
+  disp <- missingArgHandler(disp, 1)

+  #Exception handling for n
+  if(missing(n))
+  {
+    stop("'n' is missing. Please specify the number of values to generate")
+  }else if(n < 10)
+  {
+    #small values of 'n' are allowed but flagged, hence a warning rather than an error
+    warning("'n' is too small. Recommended 'n' value is atleast 10.")
+  }
+  #Ensure that 'n' is a whole number
+  if((n)%%1 != 0)
+  {
+    warning(sprintf("'n' must be a whole number. Note that %s is rounded off to %s.", n, round(n)))
+    #apply the rounding announced by the warning so that buildDistr receives a whole number
+    n <- round(n)
+  }

+  #Exception handling for disp
+  #'disp' is a single value, so the scalar operator || is used instead of the vectorised |
+  if(disp < -(pi/2) || disp > (pi/2))
+  {
+    stop("The value of 'disp' must be between -(pi/2) and (pi/2)")
+  }

+  #Build distributions by calling internal function buildDistr
+  #'st' and 'en' are passed through untouched; buildDistr applies its own handling for them
+  distr <- buildDistr(st = st, en = en, cycles = "n", trend = disp, n = n)

+  #Add outliers only when requested; any other value returns the distribution unchanged
+  if(outliers == 1)
+  {
+    distr <- buildOutliers(distr)
+  }

+  return(distr)
+}
diff --git a/man/buildDistr.Rd b/man/buildDistr.Rd
diff --git a/man/buildNum.Rd b/man/buildNum.Rd
diff --git a/vignettes/introduction_to_conjurer.Rmd b/vignettes/introduction_to_conjurer.Rmd
@@ -21,6 +21,7 @@ knitr::opts_chunk$set(
 ## 1.Overview
 ### 1.1 Background & Motivation
 Data science applications need data to prototype and demonstrate to potential clients. For such purposes, using production data is a possibility. However, it is not always feasible due to legal and/or ethical considerations[@SynDataNeed]. This resulted in a need for generating synthetic data. This need is the key motivator for the package **conjurer**.
+This package is under active development, and the author updates the documentation regularly at the [foyi digital library](https://www.foyi.co.nz/posts/documentation/documentationconjurer/).
 
 ### 1.2 Need for package conjurer
 Data across multiple domains are known to exhibit some form of seasonality, cyclicality and trend. Although there are synthetic data generation packages currently available, they focus primarily on synthetic versions of microdata containing confidential information or for machine learning purposes. There is a need for a more generic synthetic data generation package that helps for multiple purposes such as forecasting, customer segmentation, insight generation etc. This package **conjurer** helps in generating such synthetic data.
@@ -61,7 +62,24 @@ customer2name <- cbind(customers, custNames)
 print(head(customer2name))
 ```
 
-### 2.2 Build products
+#### 2.2.3 Build customer age
+A list of customer ages for the 100 customer IDs can be generated in the following way.
+```{r, eval=TRUE, echo=TRUE, results='markup'}
+#generate one age per customer: the text above refers to 100 customer IDs, so n = 100 keeps the later cbind() from recycling ages
+custAge <- as.data.frame(round(buildNum(n = 100, st = 23, en = 80, disp = 0.5, outliers = 1)))
+#set column heading
+colnames(custAge) <- c("customerAge")
+print(head(custAge))
+```
+#### 2.2.4 Assign customer age to customer ID
+Let us assign customer ages to customer IDs. This is a random one to one mapping using the following code.
+```{r, eval=TRUE, echo=TRUE, results='markup'}
+customer2age <- cbind(customers, custAge)
+#inspect the first few rows of the customer ID to age mapping
+print(head(customer2age))
+```
+
+### 2.3 Build products
 The next step is building some products. A product is identified by a product ID. Similar to a customer ID, a product ID is also an alphanumeric with prefix "sku" which signifies a stock keeping unit. This prefix is followed by a numeric ranging from 1 and extending to the number of products provided as the argument within the function. For example, if there are 10 products, then the product ID will range from sku01 to sku10. This ensures that the product ID is always of the same length.
   Besides product ID, the product price range must be specified. Let us build a group of products using the following code. For simplicity, let us assume that there are 10 products and the price range for them is from 5 dollars to 50 dollars. Products are built using the function buildProd. This function takes 3 arguments as given below.
 
@@ -74,7 +92,7 @@ products <- buildProd(numOfProd = 10, minPrice = 5, maxPrice = 50)
 print(head(products))
 ```
 
-### 2.3 Build transactions
+### 2.4 Build transactions
 Now that a group of customer IDs and Products are built, the next step is to build transactions. Transactions are built using the function genTrans. This function takes 5 arguments. The details of them are as follows.
 
   + **cycles**. This represents the cyclicality of data. It can take the following values
@@ -98,11 +116,11 @@ TxnAggregated <- aggregate(transactions$transactionID, by = list(transactions$da
 plot(TxnAggregated, type = "l", ann = FALSE)
 ```
 
-### 2.4 Build final data 
+### 2.5 Build final data 
 Bringing customers, products and transactions together is the final step of generating synthetic data. This process entails 3 steps as given below. 
 
 
-#### 2.4.1 Allocate customers to transactions
+#### 2.5.1 Allocate customers to transactions
 The allocation of transactions is achieved with the help of buildPareto function. This function takes 3 arguments as detailed below.
 
   + **factor1** and **factor2**. These are factors to be mapped to each other. As the name suggests, they must be of data type factor.
@@ -120,7 +138,7 @@ names(customer2transaction) <- c('transactionID', 'customer')
 print(head(customer2transaction))
 ```
 
-#### 2.4.2 Allocate products to transactions
+#### 2.5.2 Allocate products to transactions
 Now, using similar step as mentioned above, allocate transactions to products using following code.
 ```{r, eval=TRUE, echo=TRUE, results='markup'}
 product2transaction <- buildPareto(products$SKU,transactions$transactionID,pareto = c(70,30))
@@ -130,7 +148,7 @@ names(product2transaction) <- c('transactionID', 'SKU')
 print(head(product2transaction))
 ```
 
-#### 2.4.3 Combine customers and transactions data
+#### 2.5.3 Combine customers and transactions data
 The following code brings together transactions, products and customers into one dataframe.
 ```{r, eval=TRUE, echo=TRUE, results='markup'}
 df1 <- merge(x = customer2transaction, y = product2transaction, by = "transactionID")
@@ -141,10 +159,14 @@ df2 <- merge(x = df1, y = transactions, by = "transactionID", all.x = TRUE)
 print(head(df2))
 ```
 
-#### 2.4.3 Final data
-We can add additional data such as customer name using the code below.
+#### 2.5.4 Final data
+We can add additional data such as customer name, customer age and product price using the code below.
 ```{r, eval=TRUE, echo=TRUE, results='markup'}
-dfFinal <- merge(x = df2, y = customer2name, by.x = "customer", by.y = "customers", all.x = TRUE)
+df3 <- merge(x = df2, y = customer2name, by.x = "customer", by.y = "customers", all.x = TRUE)
+df4 <- merge(x = df3, y = customer2age, by.x = "customer", by.y = "customers", all.x = TRUE)
+df5 <- merge(x = df4, y = products, by = "SKU", all.x = TRUE)
+dfFinal <- df5[,c("dayNum", "mthNum", "customer", "customerName", "customerAge", "SKU", "Price", "transactionID")]
+
 
 #inspect the output
 print(head(dfFinal))