From d5873268d25628e911c78a5cfe20ae048523ec0d Mon Sep 17 00:00:00 2001 From: humoroussmile <77305282+humoroussmile@users.noreply.github.com> Date: Sun, 21 Jul 2024 23:37:54 -0600 Subject: [PATCH] deployed with small changes --- _book/search.json | 68 +++++++++++++---------------------------------- va_cs.qmd | 12 ++++----- 2 files changed, 25 insertions(+), 55 deletions(-) diff --git a/_book/search.json b/_book/search.json index f13229c..af7130c 100644 --- a/_book/search.json +++ b/_book/search.json @@ -2,7 +2,7 @@ { "objectID": "index.html", "href": "index.html", - "title": "VA Summer Conference Case Study", + "title": "Summer Conference Case Study", "section": "", "text": "Welcome\nAs part of an ongoing staffing crisis, you have been asked by your director to reproduce a report that was produced under the last research director, who is now on a beach, enjoying her retirement, and unreachable. You asked your only analyst to try to reproduce the report, but they have not yet been able to, and there doesn’t seem to be any record of how the numbers were produced. The report establishes recidivism rates for several key programs and provides staffing levels for those programs. Your director wants to make the case that the DOC needs staff to implement successful programs. This is in response to a group of hostile legislators (a mix of far-right fiscal conservatives who do not believe in prison programs and far-left “de-funders” who think your budget is too big). They are running a bill to reduce your overall budget and eliminate some existing staff vacancies, with the argument that if you have survived for the past two years with your current staffing levels, you must be doing okay.", "crumbs": [ @@ -12,9 +12,9 @@ { "objectID": "index.html#case-study-narrative", "href": "index.html#case-study-narrative", - "title": "VA Summer Conference Case Study", + "title": "Summer Conference Case Study", "section": "Case Study Narrative", - "text": "Case Study Narrative\n\nScenario\nAmidst a swirling staffing crisis at the Gotham Department of Corrections (GDOC), you find yourself in a peculiar predicament. Your director, the fearsome Evelyn “Firebrand” Summers, has thrust upon you the Herculean task of resurrecting a report forged under the previous Research Director Miranda “Mermaid” Shelsky. Dr. Shelsky, now retired and sipping mojitos on a secluded beach in Fiji, left behind a report detailing recidivism rates and the indispensable staffing needs for DOC’s prized programs.\nThe report establishes recidivism rates for several key supervision/reentry programs for high risk people and provides staffing levels for these programs. Director Summers is depending on you to build a case to keep the necessary staff to implement these programs successfully.\nYou ask your only analyst to try to reproduce the report, yet there mysteriously doesn’t seem to be any record of how the original report’s numbers were produced. Your analyst has spent 3 days and cannot seem to figure out the formula. The analyst is also now 3 days behind on other tasks that have been assigned and it is time to move on.\nMeanwhile, the ever-persistent Center Wing Coalition (CWC) are crafting a bill to counteract all of the progress your agency has been making in your recidivism-reduction efforts. The CWC argue that, if your agency has survived for the past two years with current staffing levels, you must be doing okay and are advocating to cut your overall allocated FTE. Meanwhile, your staff is working overtime through lunch, and you are certain you saw at least two team members browsing on Indeed last week.\nYou’re in a tight bind, and time is ticking. Not to mention, your best friend has planned a getaway next month in Las Vegas. You paid for this trip over six months ago and you have no intention of missing it. However, you have no desire to be sitting in the Blue Man Group show thinking about this data…", + "text": "Case Study Narrative\n\nGDOC Recidivism Analysis\nAmidst a swirling staffing crisis at the Gotham Department of Corrections (GDOC), you find yourself in a peculiar predicament. Your director, the fearsome Evelyn “Firebrand” Summers, has thrust upon you the Herculean task of resurrecting a report forged under the previous Research Director Miranda “Mermaid” Shelsky. Dr. Shelsky, now retired and sipping mojitos on a secluded beach in Fiji, left behind a report detailing recidivism rates and the indispensable staffing needs for DOC’s prized programs.\nThe report establishes recidivism rates for several key supervision/reentry programs for high risk people and provides staffing levels for these programs. Director Summers is depending on you to build a case to keep the necessary staff to implement these programs successfully.\nYou ask your only analyst to try to reproduce the report, yet there mysteriously doesn’t seem to be any record of how the original report’s numbers were produced. Your analyst has spent 3 days and cannot seem to figure out the formula. The analyst is also now 3 days behind on other tasks that have been assigned and it is time to move on.\nMeanwhile, the ever-persistent Center Wing Coalition (CWC) are crafting a bill to counteract all of the progress your agency has been making in your recidivism-reduction efforts. The CWC argue that, if your agency has survived for the past two years with current staffing levels, you must be doing okay and are advocating to cut your overall allocated FTE. Meanwhile, your staff is working overtime through lunch, and you are certain you saw at least two team members browsing on Indeed last week.\nYou’re in a tight bind, and time is ticking. Not to mention, your best friend has planned a getaway next month in Las Vegas. You paid for this trip over six months ago and you have no intention of missing it. However, you have no desire to be sitting in the Blue Man Group show thinking about this data…", "crumbs": [ "1  Welcome" ] @@ -22,9 +22,9 @@ { "objectID": "index.html#about-virtual-academy", "href": "index.html#about-virtual-academy", - "title": "VA Summer Conference Case Study", + "title": "Summer Conference Case Study", "section": "About Virtual Academy", - "text": "About Virtual Academy\nThe Advancing Data in Corrections initiative includes a Virtual Academy, which builds data literacy and analytic capabilities within departments of corrections nationwide to further data-informed planning and decision-making. Through their participation in the Virtual Academy, corrections agency staff can access the program’s self-paced learning materials, community forum, peer-to-peer networks, and technical assistance at no cost.\nFor more information about the program and the tools, services, and opportunies available, visit the Virtual Academy website.", + "text": "About Virtual Academy\nThe Advancing Data in Corrections initiative includes a Virtual Academy, which builds data literacy and analytic capabilities within departments of corrections nationwide to further data-informed planning and decision-making. Through their participation in the Virtual Academy, corrections agency staff can access the program’s self-paced learning materials, community forum, peer-to-peer networks, and technical assistance at no cost.\nFor more information about the program and the tools, services, and opportunities available, visit the Virtual Academy website.", "crumbs": [ "1  Welcome" ] @@ -54,77 +54,47 @@ "href": "va_cs.html#data-exploration", "title": "GDOC Recidivism Analysis", "section": "Data Exploration", - "text": "Data Exploration\nLet’s take a look at our roster! The name of our data is roster. Let’s take a peek under the hood and see what variables/columns we have, and a quick summary of what they all look like. We know that our DOC has 10 programs.\n\n\nCode\n#what's in our data\nsummary(roster)\n#> client_names age datestart programs \n#> Length:1000 Min. :14.00 Min. :2023-05-09 Length:1000 \n#> Class :character 1st Qu.:25.00 1st Qu.:2023-09-03 Class :character \n#> Mode :character Median :29.00 Median :2023-11-10 Mode :character \n#> Mean :28.84 Mean :2023-12-06 \n#> 3rd Qu.:32.00 3rd Qu.:2024-03-13 \n#> Max. :47.00 Max. :2024-07-17 \n#> ret \n#> Min. :0.000 \n#> 1st Qu.:0.000 \n#> Median :0.000 \n#> Mean :0.339 \n#> 3rd Qu.:1.000 \n#> Max. :1.000\nnumcol <- ncol(roster)\nprint(numcol)\n#> [1] 5\n\n\nInteresting! It appears we have 5 variables in the roster data! What else do we see going on here? Let’s just lift the hood a tiny bit more.\n\n\nCode\n#peek under the hood\nhead(roster)\n\n\n\n \n\n\n\nFascinating! It looks like person-level data! Let’s learn more about some of the potentially relevant variables.\n\n\nCode\n#distribution of returns\nhist(roster$ret, breaks=20)\n\n\n\n\n\n\n\n\n\nCode\n\n#distribution of age\nhist(roster$age)\n\n\n\n\n\n\n\n\n\nCode\n\n#distribution of dates\nhist(roster$datestart, breaks=10)", + "text": "Data Exploration\nLet’s take a look at our roster! The name of our data is roster. Let’s take a peek under the hood and see what variables/columns we have, and a quick summary of what they all look like. We know that our DOC has 10 programs.\n\n\nCode\n#what's in our data\nsummary(roster)\n#> client_names age datestart programs \n#> Length:55 Min. :19.00 Min. :1991-08-22 Length:55 \n#> Class :character 1st Qu.:25.00 1st Qu.:2021-09-14 Class :character \n#> Mode :character Median :29.00 Median :2021-11-30 Mode :character \n#> Mean :28.71 Mean :2021-01-05 \n#> 3rd Qu.:32.00 3rd Qu.:2022-02-04 \n#> Max. :41.00 Max. :2022-07-09 \n#> ret \n#> Min. :-1.0000 \n#> 1st Qu.: 0.0000 \n#> Median : 0.0000 \n#> Mean : 0.5091 \n#> 3rd Qu.: 1.0000 \n#> Max. :11.0000\nnumcol <- ncol(roster)\nprint(numcol)\n#> [1] 5\n\n\nInteresting! It appears we have 5 variables in the roster data! What else do we see going on here? Let’s just lift the hood a tiny bit more.\n\n\nCode\n#peek under the hood\nhead(roster)\n\n\n\n \n\n\n\nFascinating! It looks like person-level data! Let’s learn more about some of the potentially relevant variables.\n\n\nCode\n#distribution of returns\nhist(roster$ret, breaks=20)\n\n\n\n\n\n\n\n\n\nCode\n\n#distribution of age\nhist(roster$age)\n\n\n\n\n\n\n\n\n\nCode\n\n#distribution of dates\nhist(roster$datestart, breaks=10)", "crumbs": [ "2  GDOC Recidivism Analysis" ] }, - { - "objectID": "va_cs.html#reporting-results", - "href": "va_cs.html#reporting-results", - "title": "GDOC Recidivism Analysis", - "section": "Reporting Results", - "text": "Reporting Results\n\nData Visualizations\nWe have to get out those results now! Let’s combine the program staff and recidivism rates dataframes so we can print out a table!\n\n\nCode\ntabout <- inner_join(roster2, staff2, by = (\"programs_clean\")) |>\n select(-c(clients_served_all, clients_served_year)) |>\n ungroup() |>\n #if any years are missing, fill in\n complete(year, nesting(programs_clean,num_staff),\n fill = list(recid_rate_all = NA, recid_rate_year = NA)\n ) |>\n #correct missing values for recid_rate_all since this is the overall recidivism rate across multiple years\n group_by(programs_clean) |>\n fill(recid_rate_all, .direction = \"updown\")\n#verify join was successful\nanti_join(roster2, staff2, by = (\"programs_clean\"))\n\n\n\n \n\n\n\nCode\n\n#overall recidivism rates\ntabout |> \n filter(year==date1) |>\n select(-c(year, recid_rate_year)) |>\n kable(format = \"html\", caption = \"Overall recidivism rates by program\")\n\n\n\nOverall recidivism rates by program\n\n\nprograms_clean\nnum_staff\nrecid_rate_all\n\n\n\n\nam\n2\n0.35\n\n\nbcs\n4\n0.19\n\n\nbrave\n8\n0.28\n\n\nchallenge\n4\n0.49\n\n\ndbt\n15\n0.30\n\n\nmhsd\n6\n0.56\n\n\nrdap\n5\n0.20\n\n\nresolve\n8\n0.35\n\n\nsotrt\n11\n0.41\n\n\nstages\n17\n0.60\n\n\n\n\n\n\n\nCode\n\n#overall recidivism rates\ntabout |> \n select(-c(recid_rate_all)) |>\n kable(format = \"html\", caption = \"Recidivism rates by program by year\")\n\n\n\nRecidivism rates by program by year\n\n\nyear\nprograms_clean\nnum_staff\nrecid_rate_year\n\n\n\n\n2023\nam\n2\n0.34\n\n\n2023\nbcs\n4\n0.18\n\n\n2023\nbrave\n8\n0.38\n\n\n2023\nchallenge\n4\n0.37\n\n\n2023\ndbt\n15\n0.27\n\n\n2023\nmhsd\n6\n0.53\n\n\n2023\nrdap\n5\n0.26\n\n\n2023\nresolve\n8\n0.30\n\n\n2023\nsotrt\n11\n0.39\n\n\n2023\nstages\n17\n0.56\n\n\n2024\nam\n2\n0.35\n\n\n2024\nbcs\n4\n0.20\n\n\n2024\nbrave\n8\n0.15\n\n\n2024\nchallenge\n4\n0.67\n\n\n2024\ndbt\n15\n0.33\n\n\n2024\nmhsd\n6\n0.61\n\n\n2024\nrdap\n5\n0.12\n\n\n2024\nresolve\n8\n0.43\n\n\n2024\nsotrt\n11\n0.45\n\n\n2024\nstages\n17\n0.65\n\n\n\n\n\n\n\nBut what if we want a nice data visualization of it all?\n\n\nCode\n#basic bar chart of overall recidivism rate by program\nggplot(tabout |>\n filter(year == date1)\n ,aes(x=programs_clean, y=recid_rate_all)) +\n geom_bar(stat=\"identity\")\n\n\n\n\n\n\n\n\n\nCode\n\n#basic bar chart of recidivism rate by year by program\nggplot(tabout,aes(x=programs_clean, y=recid_rate_year,fill=year)) +\n geom_bar(position=\"dodge\", stat=\"identity\")\n\n\n\n\n\n\n\n\n\nNicer!!!\n\n\nCode\n#build bar chart of recidivism rates across programs\n#information to plot, pick dates\ndates <- as.numeric(c(date1,date2)) #what years of data do you want to plot?\n\n#custom title header of plot\ntitledates <- ifelse(length(dates)>=2 & date1 != date2, paste0(date1,\" - \",date2),\n ifelse((dates==date1 | dates==date2) & ALL.BY, as.character(dates),\n ifelse(!ALL.BY, date1, \"\")))\n\n#which years/programs are missing data?\nprg.NA <- tabout |> \n filter(is.na(recid_rate_year)) |>\n pull(programs_clean)\n\n#plot it! this will plot recidivism rates with overlaid staffing text\nrr <- ggplot(tabout |> \n filter(if(ALL.BY) year %in% dates else year == date2) |>\n mutate(recid_rate = case_when(ALL.BY ~ recid_rate_year,\n !ALL.BY ~ recid_rate_all))\n ,aes(x=programs_clean, y=recid_rate, fill=year)) +\n geom_bar(position = \"dodge\",stat = \"identity\") +\n geom_text(aes(label=ifelse(year==dates[2],paste(num_staff,\"staff\"),\"\")), vjust=-0.3, color = staffc) +\n scale_fill_manual(values=c(date1c,date2c)) +\n ylim(0,1) +\n ylab(\"Recidivism Rate\") +\n xlab(\"EBBR Programs\") +\n ggtitle(paste0(\"Recidivism Rates across EBBR programs\\n\",titledates)) +\n theme_classic() +\n #remove legend if plotting overall (not by year)\n {if(!ALL.BY) theme(legend.position=\"none\")}+\n #only print caption if a program is missing data\n {if(length(prg.NA)!=0) labs(caption = capture.output(cat(\"The following programs were missing data in some years:\", unique(toupper(prg.NA)), sep=\" \")))}+\n theme(plot.caption=element_text(hjust=0))\n\n#display\nrr\n\n\n\n\n\n\n\n\n\n\n\nCWC Damned Lies and Statistics\nOh no!! An advocacy group just publicly published that EBBR programs’ recidivism rates are at an all time high of 38.8% with a report that claims to have used your DOC’s reported data on EBBR program recidivism rates! Find out what’s going on, and fast!\n\n\nCode\n#manage the data to produce recidivism rates\ntabout2 <- inner_join(roster2, staff2, by = (\"programs_clean\")) |>\n ungroup() |>\n #if any years are missing, fill in\n complete(year, nesting(programs_clean,num_staff),\n fill = list(recid_rate_all = NA, recid_rate_year = NA)\n ) |>\n #correct missing values for recid_rate_all since this is the overall recidivism rate across multiple years\n group_by(programs_clean) |>\n fill(c(recid_rate_all,clients_served_all), .direction = \"updown\")\n#verify join was successful\nanti_join(roster2, staff2, by = (\"programs_clean\"))\n\n\n\n \n\n\n\nCode\n\n#calculate average recidivism rate across programs from all years\nunw.a <- round(mean(tabout2$recid_rate_all,na.rm=TRUE),2)\nprint(paste0(unw.a*100,\"%\"))\n#> [1] \"37%\"\n\n#calculate average recidivism rate across programs from year 1\nunw.d1 <- round(mean(tabout2[which(tabout2$year==date1),]$recid_rate_year,na.rm=TRUE),2)\nprint(paste0(unw.d1*100,\"%\"))\n#> [1] \"36%\"\n\n#calculate average recidivism rate across programs from year 2\nunw.d2 <- round(mean(tabout2[which(tabout2$year==date2),]$recid_rate_year,na.rm=TRUE),2)\nprint(paste0(unw.d2*100,\"%\"))\n#> [1] \"40%\"\n\n\nWell a recidivism rate of 37% is much better, but it still doesn’t match what the advocacy group is reporting! Keep investigating!\n\n\nCode\n#programs to remove\nrm.pgms <- c(\"bcs\", \"brave\", \"sotrt\", \"mhsd\", \"resolve\")\n\n\n\n\nCode\n#remove 5 of the 10 programs because the advocacy group was sneaky\nadv <- tabout |>\n filter(!(programs_clean %in% rm.pgms) &\n year == date1) #dates repeat the same information, so just pick one date to average over\n#calculate ADVOCACY rate, which will be inserted into document text\nadv_rate <- round(mean(adv$recid_rate_all,na.rm=TRUE)*100,1)\nprint(adv_rate)\n#> [1] 38.8\n\n\n\n\nData-Informed Reporting\nAlright! There’s the number the advocacy group reported! But what’s missing?\n\n\nCode\n#manage the data to produce recidivism rates\n#total clients served (all years)\ntotal.a <- sum(tabout2[which(tabout2$year==date1),]$clients_served_all, na.rm=TRUE)\ntabout2.wgt <- tabout2 |>\n filter(year==date1) |>\n mutate(recid_rate_all_w = clients_served_all*recid_rate_all)\nw.a <- round(sum(tabout2.wgt$recid_rate_all_w)/total.a,2)\nprint(paste0(w.a*100,\"%\"))\n#> [1] \"34%\"\n\n#total clients served (all years), JUST 5 PROGRAMS!!!\ntotal.a5 <- sum(tabout2[which(tabout2$year==date1 & !tabout2$programs_clean %in% rm.pgms),]$clients_served_all, na.rm=TRUE)\ntabout2.wgt5 <- tabout2 |>\n filter(!(programs_clean %in% rm.pgms) & year==date1) |>\n mutate(recid_rate_all_w = clients_served_all*recid_rate_all)\nw.a5 <- round(sum(tabout2.wgt5$recid_rate_all_w)/total.a5,2)\nprint(paste0(w.a5*100,\"%\"))\n#> [1] \"35%\"\n\n\nAlright! If we just weight our data then we see that the average overall recidivism rate across the five programs that the advocacy group highlighted is only 35%. Great work!\nNow let’s report it through some fancy data visualization work.\n\n\nCode\n#which weights to plot\nw <- if(ALL.BY) c(w.d1,w.d2) else w.a\nunw <- if(ALL.BY) c(unw.d1,unw.d2) else unw.a\n\n#for positioning\nlast <- tail(tabout$programs_clean,n=1)\n\n#add recidivism weight averages to plot\nrr + geom_hline(yintercept=w[1], linetype = \"dashed\", color = hlinew1, size = 1) +\n {if(ALL.BY) geom_hline(yintercept=w[2], linetype = \"dashed\", color = hlinew1, size = 1)}+\n geom_hline(yintercept=unw[1], linetype = \"dashed\", color = hlinew2, size = 1) +\n {if(ALL.BY) geom_hline(yintercept=unw[2], linetype = \"dashed\", color = hlinew2, size = 1)}+\n geom_label_repel(aes(label=lab, y=0.25), fill=hlinew1,\n data = tabout |>\n filter(if(ALL.BY) year %in% dates else year == date2) |>\n mutate(recid_rate = case_when( ALL.BY ~ recid_rate_year,\n !ALL.BY ~ recid_rate_all),\n lab = case_when( ALL.BY & date1 != date2 ~ paste0(date1,\" Weighted avg: \",w[1],\"\\n\",date2,\" Weighted avg: \",w[2]),\n !ALL.BY & date1 != date2 ~ paste0(date1,\"-\",date2,\"\\nWeighted avg: \",w[1]),\n ALL.BY & date1 == date2 ~ paste0(date1,\"\\nWeighted avg: \",w[1]))) |>\n filter(programs_clean == last & year == date2),\n nudge_x = -4,\n min.segment.length = Inf) + \n geom_label_repel(aes(label=lab, y=0.5), fill=hlinew2,\n data = tabout |>\n filter(if(ALL.BY) year %in% dates else year == date2) |>\n mutate(recid_rate = case_when( ALL.BY ~ recid_rate_year,\n !ALL.BY ~ recid_rate_all),\n lab = case_when( ALL.BY & date1 != date2 ~ paste0(date1,\" Unweighted avg: \",unw[1],\"\\n\",date2,\" Unweighted avg: \",unw[2]),\n !ALL.BY & date1 != date2 ~ paste0(date1,\"-\",date2,\"\\nUnweighted avg: \",unw[1]),\n ALL.BY & date1 == date2 ~ paste0(date1,\"\\nUnweighted avg: \",unw[1]))) |>\n filter(programs_clean == last & year == date2),\n nudge_x = -8,\n min.segment.length = Inf\n )", - "crumbs": [ - "2  GDOC Recidivism Analysis" - ] - }, - { - "objectID": "va_cs.html#duplicates", - "href": "va_cs.html#duplicates", - "title": "Advancing Data in Corrections: Virtual Academy Case Study", - "section": "Duplicates", - "text": "Duplicates\nWhat other pieces of information might be relevant to what we need to know about the data? How about since it appears to be person-level that we check to make sure that there aren’t any duplicates?\n\n\nCode\n#are there any duplicates?\nroster[duplicated(roster) | duplicated(roster, fromLast=TRUE),]\n\n#how many duplicates?\ndupct <- length(unique(\n roster[duplicated(roster) | duplicated(roster, fromLast=TRUE),]\n ))\nprint(dupct)\n#> [1] client_names age datestart programs ret \n#> <0 rows> (or 0-length row.names)\n#> [1] 5\n\n\nWow! It looks like we have 5 duplicate observations/rows in our data! Let’s remove them and keep exploring!\n\n\nCode\n#deduplicate across all columns\nroster.nodup <- roster[!duplicated(roster), ]\n\n#check for dups again\nroster.nodup[duplicated(roster.nodup) | duplicated(roster.nodup, fromLast=TRUE),]\n#> [1] client_names age datestart programs ret \n#> <0 rows> (or 0-length row.names)\n\n\nAlright! No more duplicates! No let’s take a closer look at our 10 programs.\n\n\nCode\n#count total number of programs\nprgnum <- n_distinct(roster.nodup$programs)\nprint(prgnum)\n#> [1] 14\n\n\nHm - it looks like there are more than 10 programs; 14 programs to be exact. Let’s see what’s going on here.\n\n\nCode\n#list out the actual program names\nroster.nodup |>\n count(programs)\n#> programs n\n#> 1 am 101\n#> 2 bcs 102\n#> 3 bgs 31\n#> 4 brave 90\n#> 5 brv 22\n#> 6 challenge 65\n#> 7 dbt 99\n#> 8 fbt 32\n#> 9 mhsd 66\n#> 10 rdap 84\n#> 11 rdp 43\n#> 12 resolve 94\n#> 13 sotrt 109\n#> 14 stages 62", - "crumbs": [ - "2  Advancing Data in Corrections: Virtual Academy Case Study" - ] - }, { "objectID": "va_cs.html#cleaning", "href": "va_cs.html#cleaning", "title": "GDOC Recidivism Analysis", "section": "Cleaning", - "text": "Cleaning\n\nDuplicates\nWhat other pieces of information might be relevant to what we need to know about the data? How about since it appears to be person-level that we check to make sure that there aren’t any duplicates?\n\n\nCode\n#are there any duplicates?\nroster[duplicated(roster) | duplicated(roster, fromLast=TRUE),]\n\n\n\n \n\n\n\nCode\n\n#how many duplicates?\ndupct <- length(unique(\n roster[duplicated(roster) | duplicated(roster, fromLast=TRUE),]\n ))\nprint(dupct)\n#> [1] 5\n\n\nWow! It looks like we have 5 duplicate observations/rows in our data! Let’s remove them and keep exploring!\n\n\nCode\n#deduplicate across all columns\nroster.nodup <- roster[!duplicated(roster), ]\n\n#check for dups again\nroster.nodup[duplicated(roster.nodup) | duplicated(roster.nodup, fromLast=TRUE),]\n\n\n\n \n\n\n\nAlright! No more duplicates! No let’s take a closer look at our 10 programs.\n\n\nCode\n#count total number of programs\nprgnum <- n_distinct(roster.nodup$programs)\nprint(prgnum)\n#> [1] 14\n\n\nHm - it looks like there are more than 10 programs; 14 programs to be exact. Let’s see what’s going on here.\n\n\nCode\n#list out the actual program names\nroster.nodup |>\n count(programs)\n\n\n\n \n\n\n\n\n\nRecoding\nAh! It looks like there are some errors in your data across programs, ret, and datestart! Clean them up so you can accurately report all EBRR programs and their associated recidivism rates. We’ll probably have to make some assumptions on our data.\n\n\nCode\n\n#check out weird date values\nroster.nodup |>\n count(year(datestart))\n\n\n\n \n\n\n\nCode\n\n#check out weird return values\nroster.nodup |>\n count(ret)\n\n\n\n \n\n\n\nCode\n\n#clean program names\nroster.clean <- roster.nodup |>\n #fix programs, dates, and returns\n mutate(programs_clean = case_when(programs == \"bgs\" ~ \"bcs\",\n programs == \"fbt\" ~ \"dbt\",\n programs == \"brv\" ~ \"brave\",\n programs == \"rdp\" ~ \"rdap\",\n TRUE ~ programs),\n \n #dates - assume that wrong dates occurred in 2021\n datestart = case_when(year(datestart) < 2021 ~ `year<-`(datestart, 2021),\n TRUE ~ datestart),\n \n #returns - assume that wrong returns should all be =1\n ret = case_when(ret == -1 ~ 1,\n ret == 2 ~ 1,\n ret == 11 ~ 1,\n TRUE ~ ret)\n )\n\n#check cleaned program names\nroster.clean |>\n count(programs_clean,programs)\n\n\n\n \n\n\n\nCode\n\n#check date values\nroster.clean |>\n count(year(datestart))\n\n\n\n \n\n\n\nCode\n\n#check out weird return values\nroster.clean |>\n count(ret)\n\n\n\n \n\n\n\nMuch better! 10 programs as expected for our DOC, and cleaned dates and returns! Now let’s see what’s going on with this returns variable when cross-tabbed with our 10 programs.\n\n\nCode\n#recidivism by program count\nroster.clean |>\n count(programs_clean,ret)\n\n\n\n \n\n\n\nLet’s calculate recidivism rates for all of our programs! It looks like we have data across two years from the datestart column, from 2023-05-09 to 2024-07-17, so perhaps we should calculate recidivism rates overall and by year!\n\n\nCode\n##create dataset of numerators and denominators\n#recidivism rates overall\nroster2.1 <- roster.clean |>\n mutate(year = factor(year(datestart))) |>\n select(-c(client_names,age,programs)) |>\n group_by(programs_clean) |>\n count(ret) |>\n mutate(clients_served = sum(n),\n recid_rate_all = case_when(ret == 1 ~ round(n/clients_served,2),\n ret == 0 ~ 0)\n ) |>\n group_by(programs_clean) |>\n slice(n()) |>\n select(-c(ret, n)) |>\n rename(clients_served_all = clients_served)\n#print out recidivism rates overall years\nroster2.1 |>\n select(programs_clean,recid_rate_all) |>\n kable(format=\"html\")\n\n\n\n\n\nprograms_clean\nrecid_rate_all\n\n\n\n\nam\n0.35\n\n\nbcs\n0.19\n\n\nbrave\n0.28\n\n\nchallenge\n0.49\n\n\ndbt\n0.30\n\n\nmhsd\n0.56\n\n\nrdap\n0.20\n\n\nresolve\n0.35\n\n\nsotrt\n0.41\n\n\nstages\n0.60\n\n\n\n\n\n\n\nCode\n\n#recidivism rates by year\nroster2.2 <- roster.clean |>\n mutate(year = factor(year(datestart))) |>\n select(-c(client_names,age,programs)) |>\n group_by(programs_clean,year) |>\n count(ret,year) |>\n mutate(clients_served = sum(n),\n recid_rate_year= case_when(ret == 1 ~ round(n/clients_served,2),\n ret == 0 ~ 0)\n ) |>\n group_by(programs_clean,year) |>\n slice(n()) |>\n select(-c(ret, n)) |>\n rename(clients_served_year = clients_served)\n#print out recidivism rates by year\nroster2.2 |>\n arrange(year,programs_clean) |>\n select(year,programs_clean,recid_rate_year) |>\n kable(format=\"html\")\n\n\n\n\n\nyear\nprograms_clean\nrecid_rate_year\n\n\n\n\n2023\nam\n0.34\n\n\n2023\nbcs\n0.18\n\n\n2023\nbrave\n0.38\n\n\n2023\nchallenge\n0.37\n\n\n2023\ndbt\n0.27\n\n\n2023\nmhsd\n0.53\n\n\n2023\nrdap\n0.26\n\n\n2023\nresolve\n0.30\n\n\n2023\nsotrt\n0.39\n\n\n2023\nstages\n0.56\n\n\n2024\nam\n0.35\n\n\n2024\nbcs\n0.20\n\n\n2024\nbrave\n0.15\n\n\n2024\nchallenge\n0.67\n\n\n2024\ndbt\n0.33\n\n\n2024\nmhsd\n0.61\n\n\n2024\nrdap\n0.12\n\n\n2024\nresolve\n0.43\n\n\n2024\nsotrt\n0.45\n\n\n2024\nstages\n0.65\n\n\n\n\n\n\n\nCode\n\n#combine data by programs to capture recidivism rates overall and by year\nroster2 <- inner_join(roster2.1, roster2.2, by = \"programs_clean\")\n#verify that join did not lose any observations\nanti_join(roster2.1, roster2.2, by = \"programs_clean\")\n\n\n\n \n\n\n\nLet’s take a look at our program staffing! Our DOC captures 10 Evidence Based Recidivism Reduction (EBRR) programs listed by the Federal Bureau of Prisons and the individuals involved.The name of our data is staff.\n\n\nCode\n#what's in our data\nsummary(staff)\n#> staff prg \n#> Length:80 Length:80 \n#> Class :character Class :character \n#> Mode :character Mode :character\n\nnumcolst <- ncol(staff)\nprint(numcolst)\n#> [1] 2\n\n\nIt appears we only have 2 variables in the program staffing data. Let’s keep exploring!\n\n\nCode\n#take a peek\nhead(staff)\n\n\n\n \n\n\n\nWell, well, well - person-level data again! Why don’t we check for duplicates just in case?\n\n\nCode\n#are there any duplicates?\nstaff[duplicated(staff) | duplicated(staff, fromLast=TRUE),]\n\n\n\n \n\n\n\nPhew! No duplicates. That was a close one. Now let’s take a closer look at our 10 programs covered by these staff.\n\n\nCode\n#count total number of programs\nprgnum.stf <- n_distinct(staff$prg)\nprint(prgnum.stf)\n#> [1] 13\n\n\nWell - it looks like yet again we have data with more than 10 programs; 13 to be exact. Let’s see what’s going on here.\n\n\nCode\n#list out the actual program names\nstaff |>\n count(prg)\n\n\n\n \n\n\n\nIt looks like there are some errors in your data (again!?)! Clean them up so you can accurately report all EBRR programs and their associated program staff.\n\n\nCode\n#clean program names\nstaff.clean <- staff |>\n mutate(programs_clean = case_when(prg == \"resol\" ~ \"resolve\",\n prg == \"brv\" ~ \"brave\",\n prg == \"stg\" ~ \"stages\",\n TRUE ~ prg))\n\n#check cleaned program names\nstaff.clean |>\n count(programs_clean,prg)\n\n\n\n \n\n\n\nMuch better! Now let’s see how many staff we have by program!\n\n\nCode\n#create dataset of staff counts\nstaff2 <- staff.clean |>\n select(-c(staff,prg)) |>\n count(programs_clean) |>\n rename(num_staff = n)\n\n#print staffing\nprint(staff2)\n#> programs_clean num_staff\n#> 1 am 2\n#> 2 bcs 4\n#> 3 brave 8\n#> 4 challenge 4\n#> 5 dbt 15\n#> 6 mhsd 6\n#> 7 rdap 5\n#> 8 resolve 8\n#> 9 sotrt 11\n#> 10 stages 17\n\n#histogram of staffing\nhist(staff2$num_staff)", + "text": "Cleaning\n\nDuplicates\nWhat other pieces of information might be relevant to what we need to know about the data? How about since it appears to be person-level that we check to make sure that there aren’t any duplicates?\n\n\nCode\n#are there any duplicates?\nroster[duplicated(roster) | duplicated(roster, fromLast=TRUE),]\n\n\n\n \n\n\n\nCode\n\n#how many duplicates?\ndupct <- length(unique(\n roster[duplicated(roster) | duplicated(roster, fromLast=TRUE),]\n ))\nprint(dupct)\n#> [1] 5\n\n\nWow! It looks like we have 5 duplicate observations/rows in our data! Let’s remove them and keep exploring!\n\n\nCode\n#deduplicate across all columns\nroster.nodup <- roster[!duplicated(roster), ]\n\n#check for dups again\nroster.nodup[duplicated(roster.nodup) | duplicated(roster.nodup, fromLast=TRUE),]\n\n\n\n \n\n\n\nAlright! No more duplicates! No let’s take a closer look at our 10 programs.\n\n\nCode\n#count total number of programs\nprgnum <- n_distinct(roster.nodup$programs)\nprint(prgnum)\n#> [1] 14\n\n\nHm - it looks like there are more than 10 programs; 14 programs to be exact. Let’s see what’s going on here.\n\n\nCode\n#list out the actual program names\nroster.nodup |>\n count(programs)\n\n\n\n \n\n\n\n\n\nRecoding\nAh! It looks like there are some errors in your data across programs, ret, and datestart! Clean them up so you can accurately report all EBRR programs and their associated recidivism rates. We’ll probably have to make some assumptions on our data.\n\n\nCode\n\n#check out weird date values\nroster.nodup |>\n count(year(datestart))\n\n\n\n \n\n\n\nCode\n\n#check out weird return values\nroster.nodup |>\n count(ret)\n\n\n\n \n\n\n\nCode\n\n#clean program names\nroster.clean <- roster.nodup |>\n #fix programs, dates, and returns\n mutate(programs_clean = case_when(programs == \"bgs\" ~ \"bcs\",\n programs == \"fbt\" ~ \"dbt\",\n programs == \"brv\" ~ \"brave\",\n programs == \"rdp\" ~ \"rdap\",\n TRUE ~ programs),\n \n #dates - assume that wrong dates occurred in 2021\n datestart = case_when(year(datestart) < 2021 ~ `year<-`(datestart, 2021),\n TRUE ~ datestart),\n \n #returns - assume that wrong returns should all be =1\n ret = case_when(ret == -1 ~ 1,\n ret == 2 ~ 1,\n ret == 11 ~ 1,\n TRUE ~ ret)\n )\n\n#check cleaned program names\nroster.clean |>\n count(programs_clean,programs)\n\n\n\n \n\n\n\nCode\n\n#check date values\nroster.clean |>\n count(year(datestart))\n\n\n\n \n\n\n\nCode\n\n#check out weird return values\nroster.clean |>\n count(ret)\n\n\n\n \n\n\n\nMuch better! 10 programs as expected for our DOC, and cleaned dates and returns! Now let’s see what’s going on with this returns variable when cross-tabbed with our 10 programs.\n\n\nCode\n#recidivism by program count\nroster.clean |>\n count(programs_clean,ret)\n\n\n\n \n\n\n\nLet’s calculate recidivism rates for all of our programs! It looks like we have data across two years from the datestart column, from 2021-05-08 to 2022-07-09, so perhaps we should calculate recidivism rates overall and by year!\n\n\nCode\n##create dataset of numerators and denominators\n#recidivism rates overall\nroster2.1 <- roster.clean |>\n mutate(year = factor(year(datestart))) |>\n select(-c(client_names,age,programs)) |>\n group_by(programs_clean) |>\n count(ret) |>\n mutate(clients_served = sum(n),\n recid_rate_all = case_when(ret == 1 ~ round(n/clients_served,2),\n ret == 0 ~ 0)\n ) |>\n group_by(programs_clean) |>\n slice(n()) |>\n select(-c(ret, n)) |>\n rename(clients_served_all = clients_served)\n#print out recidivism rates overall years\nroster2.1 |>\n select(programs_clean,recid_rate_all) |>\n kable(format=\"html\")\n\n\n\n\n\nprograms_clean\nrecid_rate_all\n\n\n\n\nam\n1.00\n\n\nbcs\n0.27\n\n\nbrave\n0.17\n\n\nchallenge\n0.67\n\n\ndbt\n0.50\n\n\nmhsd\n0.00\n\n\nrdap\n0.43\n\n\nresolve\n0.29\n\n\nsotrt\n0.00\n\n\nstages\n0.75\n\n\n\n\n\n\n\nCode\n\n#recidivism rates by year\nroster2.2 <- roster.clean |>\n mutate(year = factor(year(datestart))) |>\n select(-c(client_names,age,programs)) |>\n group_by(programs_clean,year) |>\n count(ret,year) |>\n mutate(clients_served = sum(n),\n recid_rate_year= case_when(ret == 1 ~ round(n/clients_served,2),\n ret == 0 ~ 0)\n ) |>\n group_by(programs_clean,year) |>\n slice(n()) |>\n select(-c(ret, n)) |>\n rename(clients_served_year = clients_served)\n#print out recidivism rates by year\nroster2.2 |>\n arrange(year,programs_clean) |>\n select(year,programs_clean,recid_rate_year) |>\n kable(format=\"html\")\n\n\n\n\n\nyear\nprograms_clean\nrecid_rate_year\n\n\n\n\n2021\nbcs\n0.29\n\n\n2021\nbrave\n0.33\n\n\n2021\nchallenge\n0.50\n\n\n2021\ndbt\n0.00\n\n\n2021\nmhsd\n0.00\n\n\n2021\nrdap\n0.40\n\n\n2021\nresolve\n0.50\n\n\n2021\nsotrt\n0.00\n\n\n2021\nstages\n1.00\n\n\n2022\nam\n1.00\n\n\n2022\nbcs\n0.25\n\n\n2022\nbrave\n0.00\n\n\n2022\nchallenge\n1.00\n\n\n2022\ndbt\n0.60\n\n\n2022\nrdap\n0.50\n\n\n2022\nresolve\n0.00\n\n\n2022\nstages\n0.50\n\n\n\n\n\n\n\nCode\n\n#combine data by programs to capture recidivism rates overall and by year\nroster2 <- inner_join(roster2.1, roster2.2, by = \"programs_clean\")\n#verify that join did not lose any observations\nanti_join(roster2.1, roster2.2, by = \"programs_clean\")\n\n\n\n \n\n\n\nLet’s take a look at our program staffing! Our DOC captures 10 Evidence Based Recidivism Reduction (EBRR) programs listed by the Federal Bureau of Prisons and the individuals involved.The name of our data is staff.\n\n\nCode\n#what's in our data\nsummary(staff)\n#> staff prg \n#> Length:80 Length:80 \n#> Class :character Class :character \n#> Mode :character Mode :character\n\nnumcolst <- ncol(staff)\nprint(numcolst)\n#> [1] 2\n\n\nIt appears we only have 2 variables in the program staffing data. Let’s keep exploring!\n\n\nCode\n#take a peek\nhead(staff)\n\n\n\n \n\n\n\nWell, well, well - person-level data again! Why don’t we check for duplicates just in case?\n\n\nCode\n#are there any duplicates?\nstaff[duplicated(staff) | duplicated(staff, fromLast=TRUE),]\n\n\n\n \n\n\n\nPhew! No duplicates. That was a close one. Now let’s take a closer look at our 10 programs covered by these staff.\n\n\nCode\n#count total number of programs\nprgnum.stf <- n_distinct(staff$prg)\nprint(prgnum.stf)\n#> [1] 13\n\n\nWell - it looks like yet again we have data with more than 10 programs; 13 to be exact. Let’s see what’s going on here.\n\n\nCode\n#list out the actual program names\nstaff |>\n count(prg)\n\n\n\n \n\n\n\nIt looks like there are some errors in your data (again!?)! Clean them up so you can accurately report all EBRR programs and their associated program staff.\n\n\nCode\n#clean program names\nstaff.clean <- staff |>\n mutate(programs_clean = case_when(prg == \"resol\" ~ \"resolve\",\n prg == \"brv\" ~ \"brave\",\n prg == \"stg\" ~ \"stages\",\n TRUE ~ prg))\n\n#check cleaned program names\nstaff.clean |>\n count(programs_clean,prg)\n\n\n\n \n\n\n\nMuch better! Now let’s see how many staff we have by program!\n\n\nCode\n#create dataset of staff counts\nstaff2 <- staff.clean |>\n select(-c(staff,prg)) |>\n count(programs_clean) |>\n rename(num_staff = n)\n\n#print staffing\nprint(staff2)\n#> programs_clean num_staff\n#> 1 am 2\n#> 2 bcs 4\n#> 3 brave 8\n#> 4 challenge 4\n#> 5 dbt 15\n#> 6 mhsd 6\n#> 7 rdap 5\n#> 8 resolve 8\n#> 9 sotrt 11\n#> 10 stages 17\n\n#histogram of staffing\nhist(staff2$num_staff)", "crumbs": [ "2  GDOC Recidivism Analysis" ] }, { - "objectID": "va_cs.html#recoding", - "href": "va_cs.html#recoding", - "title": "Advancing Data in Corrections: Virtual Academy Case Study", - "section": "Recoding", - "text": "Recoding\nAh! It looks like there are some errors in your data across programs, ret, and datestart! Clean them up so you can accurately report all EBRR programs and their associated recidivism rates. We’ll probably have to make some assumptions on our data.\n\n\nCode\n\n#check out weird date values\nroster.nodup |>\n count(year(datestart))\n\n#check out weird return values\nroster.nodup |>\n count(ret)\n\n#clean program names\nroster.clean <- roster.nodup |>\n #fix programs, dates, and returns\n mutate(programs_clean = case_when(programs == \"bgs\" ~ \"bcs\",\n programs == \"fbt\" ~ \"dbt\",\n programs == \"brv\" ~ \"brave\",\n programs == \"rdp\" ~ \"rdap\",\n TRUE ~ programs),\n \n #dates - assume that wrong dates occurred in 2021\n datestart = case_when(year(datestart) < 2021 ~ `year<-`(datestart, 2021),\n TRUE ~ datestart),\n \n #returns - assume that wrong returns should all be =1\n ret = case_when(ret == -1 ~ 1,\n ret == 2 ~ 1,\n ret == 11 ~ 1,\n TRUE ~ ret)\n )\n\n#check cleaned program names\nroster.clean |>\n count(programs_clean,programs)\n\n#check date values\nroster.clean |>\n count(year(datestart))\n\n#check out weird return values\nroster.clean |>\n count(ret)\n#> year(datestart) n\n#> 1 2023 500\n#> 2 2024 500\n#> ret n\n#> 1 0 661\n#> 2 1 339\n#> programs_clean programs n\n#> 1 am am 101\n#> 2 bcs bcs 102\n#> 3 bcs bgs 31\n#> 4 brave brave 90\n#> 5 brave brv 22\n#> 6 challenge challenge 65\n#> 7 dbt dbt 99\n#> 8 dbt fbt 32\n#> 9 mhsd mhsd 66\n#> 10 rdap rdap 84\n#> 11 rdap rdp 43\n#> 12 resolve resolve 94\n#> 13 sotrt sotrt 109\n#> 14 stages stages 62\n#> year(datestart) n\n#> 1 2023 500\n#> 2 2024 500\n#> ret n\n#> 1 0 661\n#> 2 1 339\n\n\nMuch better! 10 programs as expected for our DOC, and cleaned dates and returns! Now let’s see what’s going on with this returns variable when cross-tabbed with our 10 programs.\n\n\nCode\n#recidivism by program count\nroster.clean |>\n count(programs_clean,ret)\n#> programs_clean ret n\n#> 1 am 0 66\n#> 2 am 1 35\n#> 3 bcs 0 108\n#> 4 bcs 1 25\n#> 5 brave 0 81\n#> 6 brave 1 31\n#> 7 challenge 0 33\n#> 8 challenge 1 32\n#> 9 dbt 0 92\n#> 10 dbt 1 39\n#> 11 mhsd 0 29\n#> 12 mhsd 1 37\n#> 13 rdap 0 102\n#> 14 rdap 1 25\n#> 15 resolve 0 61\n#> 16 resolve 1 33\n#> 17 sotrt 0 64\n#> 18 sotrt 1 45\n#> 19 stages 0 25\n#> 20 stages 1 37\n\n\nLet’s calculate recidivism rates for all of our programs! It looks like we have data across two years from the datestart column, from 2023-01-12 to 2024-12-28, so perhaps we should calculate recidivism rates overall and by year!\n\n\nCode\n##create dataset of numerators and denominators\n#recidivism rates overall\nroster2.1 <- roster.clean |>\n mutate(year = factor(year(datestart))) |>\n select(-c(client_names,age,programs)) |>\n group_by(programs_clean) |>\n count(ret) |>\n mutate(clients_served = sum(n),\n recid_rate_all = case_when(ret == 1 ~ round(n/clients_served,2),\n ret == 0 ~ 0)\n ) |>\n group_by(programs_clean) |>\n slice(n()) |>\n select(-c(ret, n)) |>\n rename(clients_served_all = clients_served)\n#print out recidivism rates overall years\nroster2.1 |>\n select(programs_clean,recid_rate_all) |>\n kable(format=\"html\")\n\n#recidivism rates by year\nroster2.2 <- roster.clean |>\n mutate(year = factor(year(datestart))) |>\n select(-c(client_names,age,programs)) |>\n group_by(programs_clean,year) |>\n count(ret,year) |>\n mutate(clients_served = sum(n),\n recid_rate_year= case_when(ret == 1 ~ round(n/clients_served,2),\n ret == 0 ~ 0)\n ) |>\n group_by(programs_clean,year) |>\n slice(n()) |>\n select(-c(ret, n)) |>\n rename(clients_served_year = clients_served)\n#print out recidivism rates by year\nroster2.2 |>\n arrange(year,programs_clean) |>\n select(year,programs_clean,recid_rate_year) |>\n kable(format=\"html\")\n\n#combine data by programs to capture recidivism rates overall and by year\nroster2 <- inner_join(roster2.1, roster2.2, by = \"programs_clean\")\n#verify that join did not lose any observations\nanti_join(roster2.1, roster2.2, by = \"programs_clean\")\n\n\n\n\n\nprograms_clean\nrecid_rate_all\n\n\n\n\nam\n0.35\n\n\nbcs\n0.19\n\n\nbrave\n0.28\n\n\nchallenge\n0.49\n\n\ndbt\n0.30\n\n\nmhsd\n0.56\n\n\nrdap\n0.20\n\n\nresolve\n0.35\n\n\nsotrt\n0.41\n\n\nstages\n0.60\n\n\n\n\n\n\n\n\n\nyear\nprograms_clean\nrecid_rate_year\n\n\n\n\n2023\nam\n0.33\n\n\n2023\nbcs\n0.20\n\n\n2023\nbrave\n0.30\n\n\n2023\nchallenge\n0.48\n\n\n2023\ndbt\n0.34\n\n\n2023\nmhsd\n0.62\n\n\n2023\nrdap\n0.24\n\n\n2023\nresolve\n0.37\n\n\n2023\nsotrt\n0.36\n\n\n2023\nstages\n0.62\n\n\n2024\nam\n0.37\n\n\n2024\nbcs\n0.18\n\n\n2024\nbrave\n0.24\n\n\n2024\nchallenge\n0.50\n\n\n2024\ndbt\n0.26\n\n\n2024\nmhsd\n0.48\n\n\n2024\nrdap\n0.16\n\n\n2024\nresolve\n0.33\n\n\n2024\nsotrt\n0.45\n\n\n2024\nstages\n0.57\n\n\n\n\n\n\n#> # A tibble: 0 × 3\n#> # Groups: programs_clean [0]\n#> # ℹ 3 variables: programs_clean <chr>, clients_served_all <int>,\n#> # recid_rate_all <dbl>\n\nLet’s take a look at our program staffing! Our DOC captures 10 Evidence Based Recidivism Reduction (EBRR) programs listed by the Federal Bureau of Prisons and the individuals involved.The name of our data is staff.\n\n\nCode\n#what's in our data\nsummary(staff)\n\nnumcolst <- ncol(staff)\nprint(numcolst)\n#> staff prg \n#> Length:80 Length:80 \n#> Class :character Class :character \n#> Mode :character Mode :character \n#> [1] 2\n\n\nIt appears we only have 2 variables in the program staffing data. Let’s keep exploring!\n\n\nCode\n#take a peek\nhead(staff)\n#> staff prg\n#> 1 Mrs. Skyler Sawayn MD brave\n#> 2 Telly Mitchell dbt\n#> 3 Watson O'Conner stages\n#> 4 Ruthie Wolff brave\n#> 5 Efrain Koch DDS resolve\n#> 6 Miss. Kristan Flatley brave\n\n\nWell, well, well - person-level data again! Why don’t we check for duplicates just in case?\n\n\nCode\n#are there any duplicates?\nstaff[duplicated(staff) | duplicated(staff, fromLast=TRUE),]\n#> [1] staff prg \n#> <0 rows> (or 0-length row.names)\n\n\nPhew! No duplicates. That was a close one. Now let’s take a closer look at our 10 programs covered by these staff.\n\n\nCode\n#count total number of programs\nprgnum.stf <- n_distinct(staff$prg)\nprint(prgnum.stf)\n#> [1] 13\n\n\nWell - it looks like yet again we have data with more than 10 programs; 13 to be exact. Let’s see what’s going on here.\n\n\nCode\n#list out the actual program names\nstaff |>\n count(prg)\n#> prg n\n#> 1 am 2\n#> 2 bcs 4\n#> 3 brave 6\n#> 4 brv 2\n#> 5 challenge 4\n#> 6 dbt 15\n#> 7 mhsd 6\n#> 8 rdap 5\n#> 9 resol 2\n#> 10 resolve 6\n#> 11 sotrt 11\n#> 12 stages 12\n#> 13 stg 5\n\n\nIt looks like there are some errors in your data (again!?)! Clean them up so you can accurately report all EBRR programs and their associated program staff.\n\n\nCode\n#clean program names\nstaff.clean <- staff |>\n mutate(programs_clean = case_when(prg == \"resol\" ~ \"resolve\",\n prg == \"brv\" ~ \"brave\",\n prg == \"stg\" ~ \"stages\",\n TRUE ~ prg))\n\n#check cleaned program names\nstaff.clean |>\n count(programs_clean,prg)\n#> programs_clean prg n\n#> 1 am am 2\n#> 2 bcs bcs 4\n#> 3 brave brave 6\n#> 4 brave brv 2\n#> 5 challenge challenge 4\n#> 6 dbt dbt 15\n#> 7 mhsd mhsd 6\n#> 8 rdap rdap 5\n#> 9 resolve resol 2\n#> 10 resolve resolve 6\n#> 11 sotrt sotrt 11\n#> 12 stages stages 12\n#> 13 stages stg 5\n\n\nMuch better! Now let’s see how many staff we have by program!\n\n\nCode\n#create dataset of staff counts\nstaff2 <- staff.clean |>\n select(-c(staff,prg)) |>\n count(programs_clean) |>\n rename(num_staff = n)\n\n#print staffing\nprint(staff2)\n\n#histogram of staffing\nhist(staff2$num_staff)\n\n\n\n\n\n\n\n\n#> programs_clean num_staff\n#> 1 am 2\n#> 2 bcs 4\n#> 3 brave 8\n#> 4 challenge 4\n#> 5 dbt 15\n#> 6 mhsd 6\n#> 7 rdap 5\n#> 8 resolve 8\n#> 9 sotrt 11\n#> 10 stages 17", - "crumbs": [ - "2  Advancing Data in Corrections: Virtual Academy Case Study" - ] - }, - { - "objectID": "va_cs.html#r-session", - "href": "va_cs.html#r-session", + "objectID": "va_cs.html#reporting-results", + "href": "va_cs.html#reporting-results", "title": "GDOC Recidivism Analysis", - "section": "R Session", - "text": "R Session\n\n\nCode\n#for reproducibility\nsi <- sessioninfo::session_info()\nsi$packages$library <- NULL\nsi$platform$pandoc <- NULL\nsi\n#> ─ Session info ───────────────────────────────────────────────────────────────\n#> setting value\n#> version R version 4.4.1 (2024-06-14 ucrt)\n#> os Windows 10 x64 (build 19045)\n#> system x86_64, mingw32\n#> ui RTerm\n#> language (EN)\n#> collate English_United States.utf8\n#> ctype English_United States.utf8\n#> tz America/Denver\n#> date 2024-07-21\n#> \n#> ─ Packages ───────────────────────────────────────────────────────────────────\n#> package * version date (UTC) lib source\n#> charlatan * 0.5.1 2023-09-13 [] CRAN (R 4.4.1)\n#> cli 3.6.3 2024-06-21 [] CRAN (R 4.4.1)\n#> colorspace 2.1-0 2023-01-23 [] CRAN (R 4.4.1)\n#> digest 0.6.36 2024-06-23 [] CRAN (R 4.4.1)\n#> dplyr * 1.1.4 2023-11-17 [] CRAN (R 4.4.1)\n#> evaluate 0.24.0 2024-06-10 [] CRAN (R 4.4.1)\n#> fansi 1.0.6 2023-12-08 [] CRAN (R 4.4.1)\n#> farver 2.1.2 2024-05-13 [] CRAN (R 4.4.1)\n#> fastmap 1.2.0 2024-05-15 [] CRAN (R 4.4.1)\n#> forcats * 1.0.0 2023-01-29 [] CRAN (R 4.4.1)\n#> generics 0.1.3 2022-07-05 [] CRAN (R 4.4.1)\n#> ggplot2 * 3.5.1 2024-04-23 [] CRAN (R 4.4.1)\n#> ggrepel * 0.9.5 2024-01-10 [] CRAN (R 4.4.1)\n#> glue 1.7.0 2024-01-09 [] CRAN (R 4.4.1)\n#> gtable 0.3.5 2024-04-22 [] CRAN (R 4.4.1)\n#> highr 0.11 2024-05-26 [] CRAN (R 4.4.1)\n#> hms 1.1.3 2023-03-21 [] CRAN (R 4.4.1)\n#> htmltools 0.5.8.1 2024-04-04 [] CRAN (R 4.4.1)\n#> jsonlite 1.8.8 2023-12-04 [] CRAN (R 4.4.1)\n#> knitr * 1.48 2024-07-07 [] CRAN (R 4.4.1)\n#> labeling 0.4.3 2023-08-29 [] CRAN (R 4.4.0)\n#> lifecycle 1.0.4 2023-11-07 [] CRAN (R 4.4.1)\n#> lubridate * 1.9.3 2023-09-27 [] CRAN (R 4.4.1)\n#> magrittr 2.0.3 2022-03-30 [] CRAN (R 4.4.1)\n#> munsell 0.5.1 2024-04-01 [] CRAN (R 4.4.1)\n#> pillar 1.9.0 2023-03-22 [] CRAN (R 4.4.1)\n#> pkgconfig 2.0.3 2019-09-22 [] CRAN (R 4.4.1)\n#> purrr * 1.0.2 2023-08-10 [] CRAN (R 4.4.1)\n#> R6 2.5.1 2021-08-19 [] CRAN (R 4.4.1)\n#> Rcpp 1.0.13 2024-07-17 [] CRAN (R 4.4.1)\n#> readr * 2.1.5 2024-01-10 [] CRAN (R 4.4.1)\n#> rlang 1.1.4 2024-06-04 [] CRAN (R 4.4.1)\n#> rmarkdown 2.27 2024-05-17 [] CRAN (R 4.4.1)\n#> rstudioapi 0.16.0 2024-03-24 [] CRAN (R 4.4.1)\n#> scales 1.3.0 2023-11-28 [] CRAN (R 4.4.1)\n#> sessioninfo 1.2.2 2021-12-06 [] CRAN (R 4.4.1)\n#> stringi 1.8.4 2024-05-06 [] CRAN (R 4.4.0)\n#> stringr * 1.5.1 2023-11-14 [] CRAN (R 4.4.1)\n#> tibble * 3.2.1 2023-03-20 [] CRAN (R 4.4.1)\n#> tidyr * 1.3.1 2024-01-24 [] CRAN (R 4.4.1)\n#> tidyselect 1.2.1 2024-03-11 [] CRAN (R 4.4.1)\n#> tidyverse * 2.0.0 2023-02-22 [] CRAN (R 4.4.1)\n#> timechange 0.3.0 2024-01-18 [] CRAN (R 4.4.1)\n#> tzdb 0.4.0 2023-05-12 [] CRAN (R 4.4.1)\n#> utf8 1.2.4 2023-10-22 [] CRAN (R 4.4.1)\n#> vctrs 0.6.5 2023-12-01 [] CRAN (R 4.4.1)\n#> whisker 0.4.1 2022-12-05 [] CRAN (R 4.4.1)\n#> withr 3.0.0 2024-01-16 [] CRAN (R 4.4.1)\n#> xfun 0.46 2024-07-18 [] CRAN (R 4.4.1)\n#> yaml 2.3.9 2024-07-05 [] CRAN (R 4.4.1)\n#> \n#> \n#> ──────────────────────────────────────────────────────────────────────────────", + "section": "Reporting Results", + "text": "Reporting Results\n\nData Visualizations\nWe have to get out those results now! Let’s combine the program staff and recidivism rates dataframes so we can print out a table!\n\n\nCode\ntabout <- inner_join(roster2, staff2, by = (\"programs_clean\")) |>\n select(-c(clients_served_all, clients_served_year)) |>\n ungroup() |>\n #if any years are missing, fill in\n complete(year, nesting(programs_clean,num_staff),\n fill = list(recid_rate_all = NA, recid_rate_year = NA)\n ) |>\n #correct missing values for recid_rate_all since this is the overall recidivism rate across multiple years\n group_by(programs_clean) |>\n fill(recid_rate_all, .direction = \"updown\")\n#verify join was successful\nanti_join(roster2, staff2, by = (\"programs_clean\"))\n\n\n\n \n\n\n\nCode\n\n#overall recidivism rates\ntabout |> \n filter(year==date1) |>\n select(-c(year, recid_rate_year)) |>\n kable(format = \"html\", caption = \"Overall recidivism rates by program\")\n\n\n\nOverall recidivism rates by program\n\n\nprograms_clean\nnum_staff\nrecid_rate_all\n\n\n\n\nam\n2\n1.00\n\n\nbcs\n4\n0.27\n\n\nbrave\n8\n0.17\n\n\nchallenge\n4\n0.67\n\n\ndbt\n15\n0.50\n\n\nmhsd\n6\n0.00\n\n\nrdap\n5\n0.43\n\n\nresolve\n8\n0.29\n\n\nsotrt\n11\n0.00\n\n\nstages\n17\n0.75\n\n\n\n\n\n\n\nCode\n\n#overall recidivism rates\ntabout |> \n select(-c(recid_rate_all)) |>\n kable(format = \"html\", caption = \"Recidivism rates by program by year\")\n\n\n\nRecidivism rates by program by year\n\n\nyear\nprograms_clean\nnum_staff\nrecid_rate_year\n\n\n\n\n2021\nam\n2\nNA\n\n\n2021\nbcs\n4\n0.29\n\n\n2021\nbrave\n8\n0.33\n\n\n2021\nchallenge\n4\n0.50\n\n\n2021\ndbt\n15\n0.00\n\n\n2021\nmhsd\n6\n0.00\n\n\n2021\nrdap\n5\n0.40\n\n\n2021\nresolve\n8\n0.50\n\n\n2021\nsotrt\n11\n0.00\n\n\n2021\nstages\n17\n1.00\n\n\n2022\nam\n2\n1.00\n\n\n2022\nbcs\n4\n0.25\n\n\n2022\nbrave\n8\n0.00\n\n\n2022\nchallenge\n4\n1.00\n\n\n2022\ndbt\n15\n0.60\n\n\n2022\nmhsd\n6\nNA\n\n\n2022\nrdap\n5\n0.50\n\n\n2022\nresolve\n8\n0.00\n\n\n2022\nsotrt\n11\nNA\n\n\n2022\nstages\n17\n0.50\n\n\n\n\n\n\n\nBut what if we want a nice data visualization of it all?\n\n\nCode\n#basic bar chart of overall recidivism rate by program\nggplot(tabout |>\n filter(year == date1)\n ,aes(x=programs_clean, y=recid_rate_all)) +\n geom_bar(stat=\"identity\")\n\n\n\n\n\n\n\n\n\nCode\n\n#basic bar chart of recidivism rate by year by program\nggplot(tabout,aes(x=programs_clean, y=recid_rate_year,fill=year)) +\n geom_bar(position=\"dodge\", stat=\"identity\")\n\n\n\n\n\n\n\n\n\nNicer!!!\n\n\nCode\n#build bar chart of recidivism rates across programs\n#information to plot, pick dates\ndates <- as.numeric(c(date1,date2)) #what years of data do you want to plot?\n\n#custom title header of plot\ntitledates <- ifelse(length(dates)>=2 & date1 != date2, paste0(date1,\" - \",date2),\n ifelse((dates==date1 | dates==date2) & ALL.BY, as.character(dates),\n ifelse(!ALL.BY, date1, \"\")))\n\n#which years/programs are missing data?\nprg.NA <- tabout |> \n filter(is.na(recid_rate_year)) |>\n pull(programs_clean)\n\n#plot it! this will plot recidivism rates with overlaid staffing text\nrr <- ggplot(tabout |> \n filter(if(ALL.BY) year %in% dates else year == date2) |>\n mutate(recid_rate = case_when(ALL.BY ~ recid_rate_year,\n !ALL.BY ~ recid_rate_all))\n ,aes(x=programs_clean, y=recid_rate, fill=year)) +\n geom_bar(position = \"dodge\",stat = \"identity\") +\n geom_text(aes(label=ifelse(year==dates[2],paste(num_staff,\"staff\"),\"\")), vjust=-0.3, color = staffc) +\n scale_fill_manual(values=c(date1c,date2c)) +\n ylim(0,1) +\n ylab(\"Recidivism Rate\") +\n xlab(\"EBBR Programs\") +\n ggtitle(paste0(\"Recidivism Rates across EBBR programs\\n\",titledates)) +\n theme_classic() +\n #remove legend if plotting overall (not by year)\n {if(!ALL.BY) theme(legend.position=\"none\")}+\n #only print caption if a program is missing data\n {if(length(prg.NA)!=0) labs(caption = capture.output(cat(\"The following programs were missing data in some years:\", unique(toupper(prg.NA)), sep=\" \")))}+\n theme(plot.caption=element_text(hjust=0))\n\n#display\nrr\n\n\n\n\n\n\n\n\n\n\n\nCWC Damned Lies and Statistics\nOh no!! An advocacy group just publicly published that EBBR programs’ recidivism rates are at an all time high of 67% with a report that claims to have used your DOC’s reported data on EBBR program recidivism rates! Find out what’s going on, and fast!\n\n\nCode\n#manage the data to produce recidivism rates\ntabout2 <- inner_join(roster2, staff2, by = (\"programs_clean\")) |>\n ungroup() |>\n #if any years are missing, fill in\n complete(year, nesting(programs_clean,num_staff),\n fill = list(recid_rate_all = NA, recid_rate_year = NA)\n ) |>\n #correct missing values for recid_rate_all since this is the overall recidivism rate across multiple years\n group_by(programs_clean) |>\n fill(c(recid_rate_all,clients_served_all), .direction = \"updown\")\n#verify join was successful\nanti_join(roster2, staff2, by = (\"programs_clean\"))\n\n\n\n \n\n\n\nCode\n\n#calculate average recidivism rate across programs from all years\nunw.a <- round(mean(tabout2$recid_rate_all,na.rm=TRUE),2)\nprint(paste0(unw.a*100,\"%\"))\n#> [1] \"41%\"\n\n#calculate average recidivism rate across programs from year 1\nunw.d1 <- round(mean(tabout2[which(tabout2$year==date1),]$recid_rate_year,na.rm=TRUE),2)\nprint(paste0(unw.d1*100,\"%\"))\n#> [1] \"34%\"\n\n#calculate average recidivism rate across programs from year 2\nunw.d2 <- round(mean(tabout2[which(tabout2$year==date2),]$recid_rate_year,na.rm=TRUE),2)\nprint(paste0(unw.d2*100,\"%\"))\n#> [1] \"48%\"\n\n\nWell a recidivism rate of 41% is much better, but it still doesn’t match what the advocacy group is reporting! Keep investigating!\n\n\nCode\n#programs to remove\nrm.pgms <- c(\"bcs\", \"brave\", \"sotrt\", \"mhsd\", \"resolve\")\n\n\n\n\nCode\n#remove 5 of the 10 programs because the advocacy group was sneaky\nadv <- tabout |>\n filter(!(programs_clean %in% rm.pgms) &\n year == date1) #dates repeat the same information, so just pick one date to average over\n#calculate ADVOCACY rate, which will be inserted into document text\nadv_rate <- round(mean(adv$recid_rate_all,na.rm=TRUE)*100,1)\nprint(adv_rate)\n#> [1] 67\n\n\n\n\nData-Informed Reporting\nAlright! There’s the number the advocacy group reported! But what’s missing?\n\n\nCode\n#manage the data to produce recidivism rates\n#total clients served (all years)\ntotal.a <- sum(tabout2[which(tabout2$year==date1),]$clients_served_all, na.rm=TRUE)\ntabout2.wgt <- tabout2 |>\n filter(year==date1) |>\n mutate(recid_rate_all_w = clients_served_all*recid_rate_all)\nw.a <- round(sum(tabout2.wgt$recid_rate_all_w)/total.a,2)\nprint(paste0(w.a*100,\"%\"))\n#> [1] \"36%\"\n\n#total clients served (all years), JUST 5 PROGRAMS!!!\ntotal.a5 <- sum(tabout2[which(tabout2$year==date1 & !tabout2$programs_clean %in% rm.pgms),]$clients_served_all, na.rm=TRUE)\ntabout2.wgt5 <- tabout2 |>\n filter(!(programs_clean %in% rm.pgms) & year==date1) |>\n mutate(recid_rate_all_w = clients_served_all*recid_rate_all)\nw.a5 <- round(sum(tabout2.wgt5$recid_rate_all_w)/total.a5,2)\nprint(paste0(w.a5*100,\"%\"))\n#> [1] \"57%\"\n\n\nAlright! If we just weight our data then we see that the average overall recidivism rate across the five programs that the advocacy group highlighted is only 57%. Great work!\nNow let’s report it through some fancy data visualization work.\n\n\nCode\n#which weights to plot\nw <- if(ALL.BY) c(w.d1,w.d2) else w.a\nunw <- if(ALL.BY) c(unw.d1,unw.d2) else unw.a\n\n#for positioning\nlast <- tail(tabout$programs_clean,n=1)\n\n#add recidivism weight averages to plot\nrr + geom_hline(yintercept=w[1], linetype = \"dashed\", color = hlinew1, size = 1) +\n {if(ALL.BY) geom_hline(yintercept=w[2], linetype = \"dashed\", color = hlinew1, size = 1)}+\n geom_hline(yintercept=unw[1], linetype = \"dashed\", color = hlinew2, size = 1) +\n {if(ALL.BY) geom_hline(yintercept=unw[2], linetype = \"dashed\", color = hlinew2, size = 1)}+\n geom_label_repel(aes(label=lab, y=0.25), fill=hlinew1,\n data = tabout |>\n filter(if(ALL.BY) year %in% dates else year == date2) |>\n mutate(recid_rate = case_when( ALL.BY ~ recid_rate_year,\n !ALL.BY ~ recid_rate_all),\n lab = case_when( ALL.BY & date1 != date2 ~ paste0(date1,\" Weighted avg: \",w[1],\"\\n\",date2,\" Weighted avg: \",w[2]),\n !ALL.BY & date1 != date2 ~ paste0(date1,\"-\",date2,\"\\nWeighted avg: \",w[1]),\n ALL.BY & date1 == date2 ~ paste0(date1,\"\\nWeighted avg: \",w[1]))) |>\n filter(programs_clean == last & year == date2),\n nudge_x = -4,\n min.segment.length = Inf) + \n geom_label_repel(aes(label=lab, y=0.5), fill=hlinew2,\n data = tabout |>\n filter(if(ALL.BY) year %in% dates else year == date2) |>\n mutate(recid_rate = case_when( ALL.BY ~ recid_rate_year,\n !ALL.BY ~ recid_rate_all),\n lab = case_when( ALL.BY & date1 != date2 ~ paste0(date1,\" Unweighted avg: \",unw[1],\"\\n\",date2,\" Unweighted avg: \",unw[2]),\n !ALL.BY & date1 != date2 ~ paste0(date1,\"-\",date2,\"\\nUnweighted avg: \",unw[1]),\n ALL.BY & date1 == date2 ~ paste0(date1,\"\\nUnweighted avg: \",unw[1]))) |>\n filter(programs_clean == last & year == date2),\n nudge_x = -8,\n min.segment.length = Inf\n )", "crumbs": [ "2  GDOC Recidivism Analysis" ] }, { - "objectID": "va_cs.html#more-data-viz", - "href": "va_cs.html#more-data-viz", + "objectID": "va_cs.html#more-cool-data-viz", + "href": "va_cs.html#more-cool-data-viz", "title": "GDOC Recidivism Analysis", - "section": "More Data Viz!", - "text": "More Data Viz!\n\n\nCode\n#this code will run if plotting data for multiple years, otherwise nothing will be produced (i.e., ALL.BY <- T)\n\n#manipulate data for plotting\ntabout.date1 <- tabout |>\n filter(year==date1) |>\n select(c(recid_rate_year, programs_clean)) |>\n rename(recid_rate_date1 = recid_rate_year)\ntabout.date2 <- tabout |>\n filter(year==date2) |>\n select(c(recid_rate_year, programs_clean)) |>\n rename(recid_rate_date2 = recid_rate_year)\ntabout.dates <- inner_join(tabout.date1, tabout.date2, by = \"programs_clean\")\n\n#plot!\ngg_dot <- tabout.dates |>\n # rearrange the factor levels for discipline by rates for women\n arrange(recid_rate_date1) |>\n mutate(discipline = fct_inorder(programs_clean)) |>\n \n ggplot() +\n # remove axes and superfluous grids\n theme_classic() +\n theme(axis.title = element_blank(),\n axis.ticks.y = element_blank(),\n axis.line = element_blank()) +\n \n # add a dummy point for scaling purposes\n geom_point(aes(x = 0.7, y = programs_clean), \n size = 0, col = \"white\") + \n \n # add the horizontal programs_clean lines\n geom_hline(yintercept = 1:length(tabout.dates$programs_clean), col = \"grey80\") +\n \n # add a point for each date1 recidivism rate\n geom_point(aes(x = recid_rate_date1, y = programs_clean), \n size = 11, col = date1c) +\n\n # add a point for each date2 recidivism rate\n geom_point(aes(x = recid_rate_date2, y = programs_clean),\n size = 11, col = date2c) + \n\n # add the text (%) for each date2 recidivism rate\n geom_text(aes(x = recid_rate_date2, y = programs_clean, \n label = paste0(round(recid_rate_date2, 1))),\n col = \"black\") +\n\n # add the text (%) for each date1 recidivism rate\n geom_text(aes(x = recid_rate_date1, y = programs_clean, \n label = paste0(round(recid_rate_date1, 1))),\n col = \"white\") +\n\n # add a label above the first two points\n geom_text(aes(x = x, y = y, label = label, col = label),\n data.frame(x = c(tabout.dates[which(tabout.dates$programs_clean==\"stages\"),]$recid_rate_date1,\n tabout.dates[which(tabout.dates$programs_clean==\"stages\"),]$recid_rate_date2), \n y = length(tabout.dates$programs_clean) + 1, \n label = c(as.factor(date1), as.factor(date2))), size = 6) +\n scale_color_manual(values = c(date1c, date2c), guide = \"none\") +\n \n # manually specify the x-axis\n scale_x_continuous(breaks = c(0, 0.25, 0.5, 0.75, 1), \n labels = c(\"0\",\"0.25\", \"0.50\", \"0.75\", \"1\")) +\n # manually set the spacing above and below the plot\n scale_y_discrete(expand = c(0.2, 0)) \n\n#add titles/captions\ngg_dot + ggtitle(paste0(\"Recidivism Rates across EBBR programs\\n\",titledates))+\n #only print caption if a program is missing data\n {if(length(prg.NA)!=0) labs(caption = capture.output(cat(\"The following programs were missing data in some years:\", unique(toupper(prg.NA)), sep=\" \")))}+\n theme(plot.caption=element_text(hjust=0))", + "section": "More Cool Data Viz!", + "text": "More Cool Data Viz!\nLet’s prepare our data to do some really fun data viz!\n\n\nCode\n#this code will run if plotting data for multiple years, otherwise nothing will be produced (i.e., ALL.BY <- T)\n\n#manipulate data for plotting\ntabout.date1 <- tabout |>\n filter(year==date1) |>\n select(c(recid_rate_year, programs_clean, recid_rate_all)) |>\n rename(recid_rate_date1 = recid_rate_year)\ntabout.date2 <- tabout |>\n filter(year==date2) |>\n select(c(recid_rate_year, programs_clean)) |>\n rename(recid_rate_date2 = recid_rate_year)\ntabout.dates <- inner_join(tabout.date1, tabout.date2, by = \"programs_clean\") |>\n select(programs_clean, recid_rate_date1, recid_rate_date2, recid_rate_all)\n\nhead(tabout.dates)\n\n\n\n \n\n\n\n\nPlotting Overall\nWhat are some more engaging ways we can plot recidivism rates for leadership and our stakeholders overall for these programs?\n\n\nCode\n#make some really cool horizontal floating dot charts!\n#overwrite value of rates to overall if ALL.BY\n{if(!ALL.BY) tabout.dates$recid_rate_date1 <- tabout.dates$recid_rate_all}\n\n#plot two years or one year depending on ALL.BY setting\n{if(ALL.BY) plotit <- c(tabout.dates[which(tabout.dates$programs_clean==\"stages\"),]$recid_rate_date1, tabout.dates[which(tabout.dates$programs_clean==\"stages\"),]$recid_rate_date2) else plotit <- tabout.dates[which(tabout.dates$programs_clean==\"stages\"),]$recid_rate_date1}\n\n#remove label legend if by year\n{if(ALL.BY) titledates2 <- c(as.factor(date1),as.factor(date2)) else titledates2 <- \"\"}\n\n#plot!\ngg_dot <- tabout.dates |>\n # rearrange the factor levels for programs by rates for date1\n arrange(recid_rate_date1) |>\n mutate(discipline = fct_inorder(programs_clean)) |>\n \n ggplot() +\n # remove axes and superfluous grids\n theme_classic() +\n theme(axis.title = element_blank(),\n axis.ticks.y = element_blank(),\n axis.line = element_blank()) +\n \n # add a dummy point for scaling purposes\n geom_point(aes(x = 0.7, y = programs_clean), \n size = 0, col = \"white\") + \n \n # add the horizontal programs_clean lines\n geom_hline(yintercept = 1:length(tabout.dates$programs_clean), col = \"grey80\") +\n \n # add a point for each date1 recidivism rate\n geom_point(aes(x = recid_rate_date1, y = programs_clean), \n size = 11, col = date1c) +\n\n # add a point for each date2 recidivism rate\n {if(ALL.BY) geom_point(aes(x = recid_rate_date2, y = programs_clean),size = 11, col = date2c)} + \n\n # round each date2 recidivism rate\n {if(ALL.BY) geom_text(aes(x = recid_rate_date2, y = programs_clean, label = paste0(round(recid_rate_date2, 2))), col = \"black\")} +\n\n # round each date1 recidivism rate\n geom_text(aes(x = recid_rate_date1, y = programs_clean, \n label = paste0(round(recid_rate_date1, 2))),\n col = \"white\") +\n\n # add a label above the first two points\n geom_text(aes(x = x, y = y, label = label, col = label),\n data.frame(x = plotit, \n y = length(tabout.dates$programs_clean) + 1, \n label = titledates2), size = 6) +\n scale_color_manual(values = c(date1c, date2c), guide = \"none\") +\n \n # manually specify the x-axis\n scale_x_continuous(breaks = c(0, 0.25, 0.5, 0.75, 1), \n labels = c(\"0\",\"0.25\", \"0.50\", \"0.75\", \"1\")) +\n # manually set the spacing above and below the plot\n scale_y_discrete(expand = c(0.2, 0)) \n\n#add titles/captions\ngg_dot + \n {if (ALL.BY) ggtitle(\"Recidivism Rates across EBBR programs\\n\") else ggtitle(paste0(\"Recidivism Rates across EBBR programs\\n\",titledates))} +\n #only print caption if a program is missing data\n {if(length(prg.NA)!=0) labs(caption = capture.output(cat(\"The following programs were missing data in some years:\", unique(toupper(prg.NA)), sep=\" \")))}+\n theme(plot.caption=element_text(hjust=0))\n\n\n\n\nCode\n##horizontal lollipop chart\nggplot(tabout, aes(x=programs_clean, y=recid_rate_all)) +\n geom_segment( aes(x=programs_clean, xend=programs_clean, y=0, yend=recid_rate_all), color=date1c) +\n geom_point( color=staffc, size=4, alpha=0.6) +\n theme_light() +\n coord_flip() +\n xlab(\"EBBR Programs\") +\n ylab(\"Recidivism Rate\") +\n theme(\n panel.grid.major.y = element_blank(),\n panel.border = element_blank(),\n axis.ticks.y = element_blank()\n ) + \n ggtitle(paste0(\"Recidivism Rates across EBBR programs\\n\",titledates)) +\n theme(plot.caption=element_text(hjust=0)) +\n #only print caption if a program is missing data\n {if(length(prg.NA)!=0) labs(caption = capture.output(cat(\"The following programs were missing data in some years:\", unique(toupper(prg.NA)), sep=\" \")))}\n\n\n\n\n\n\n\n\n\nCode\n\n##horizontal lollipop chart w/weighted average\nggplot(tabout, aes(x=programs_clean, y=recid_rate_all)) +\n geom_segment(aes(x=programs_clean, xend=programs_clean, y=w.a, yend=recid_rate_all), color=date1c) +\n geom_point(color=staffc, size=4, alpha=0.6) +\n geom_hline(yintercept=w.a, linetype = \"dashed\", color = hlinew1, size = 1) +\n geom_label(aes(label=paste0(\"Weighted avg: \",w.a), x=w.a, vjust = -10, hjust = 1.5), fill=hlinew1,\n data = tabout |>\n filter(programs_clean == last & year == date2)) +\n theme_light() +\n coord_flip() +\n xlab(\"EBBR Programs\") +\n ylab(\"Recidivism Rate\") +\n theme(\n panel.grid.major.y = element_blank(),\n panel.border = element_blank(),\n axis.ticks.y = element_blank()\n ) + \n ggtitle(paste0(\"Recidivism Rates across EBBR programs\\n\",titledates)) +\n #only print caption if a program is missing data\n {if(length(prg.NA)!=0) labs(caption = capture.output(cat(\"The following programs were missing data in some years:\", unique(toupper(prg.NA)), sep=\" \")))}+\n theme(plot.caption=element_text(hjust=0))\n\n\n\n\n\n\n\n\n\n\n\nPlotting by Year\nWhat about displaying these rates by year?\n\n\nCode\n#make some really cool horizontal floating dot charts!\n#overwrite value of rates to overall if ALL.BY\n{if(!ALL.BY) tabout.dates$recid_rate_date1 <- tabout.dates$recid_rate_all}\n\n#plot two years or one year depending on ALL.BY setting\n{if(ALL.BY) plotit <- c(tabout.dates[which(tabout.dates$programs_clean==\"stages\"),]$recid_rate_date1, tabout.dates[which(tabout.dates$programs_clean==\"stages\"),]$recid_rate_date2) else plotit <- tabout.dates[which(tabout.dates$programs_clean==\"stages\"),]$recid_rate_date1}\n\n#remove label legend if by year\n{if(ALL.BY) titledates2 <- c(as.factor(date1),as.factor(date2)) else titledates2 <- \"\"}\n\n#plot!\ngg_dot <- tabout.dates |>\n # rearrange the factor levels for programs by rates for date1\n arrange(recid_rate_date1) |>\n mutate(discipline = fct_inorder(programs_clean)) |>\n \n ggplot() +\n # remove axes and superfluous grids\n theme_classic() +\n theme(axis.title = element_blank(),\n axis.ticks.y = element_blank(),\n axis.line = element_blank()) +\n \n # add a dummy point for scaling purposes\n geom_point(aes(x = 0.7, y = programs_clean), \n size = 0, col = \"white\") + \n \n # add the horizontal programs_clean lines\n geom_hline(yintercept = 1:length(tabout.dates$programs_clean), col = \"grey80\") +\n \n # add a point for each date1 recidivism rate\n geom_point(aes(x = recid_rate_date1, y = programs_clean), \n size = 11, col = date1c) +\n\n # add a point for each date2 recidivism rate\n {if(ALL.BY) geom_point(aes(x = recid_rate_date2, y = programs_clean),size = 11, col = date2c)} + \n\n # round each date2 recidivism rate\n {if(ALL.BY) geom_text(aes(x = recid_rate_date2, y = programs_clean, label = paste0(round(recid_rate_date2, 2))), col = \"black\")} +\n\n # round each date1 recidivism rate\n geom_text(aes(x = recid_rate_date1, y = programs_clean, \n label = paste0(round(recid_rate_date1, 2))),\n col = \"white\") +\n\n # add a label above the first two points\n geom_text(aes(x = x, y = y, label = label, col = label),\n data.frame(x = plotit, \n y = length(tabout.dates$programs_clean) + 1, \n label = titledates2), size = 6) +\n scale_color_manual(values = c(date1c, date2c), guide = \"none\") +\n \n # manually specify the x-axis\n scale_x_continuous(breaks = c(0, 0.25, 0.5, 0.75, 1), \n labels = c(\"0\",\"0.25\", \"0.50\", \"0.75\", \"1\")) +\n # manually set the spacing above and below the plot\n scale_y_discrete(expand = c(0.2, 0)) \n\n#add titles/captions\ngg_dot + \n {if (ALL.BY) ggtitle(\"Recidivism Rates across EBBR programs\\n\") else ggtitle(paste0(\"Recidivism Rates across EBBR programs\\n\",titledates))} +\n #only print caption if a program is missing data\n {if(length(prg.NA)!=0) labs(caption = capture.output(cat(\"The following programs were missing data in some years:\", unique(toupper(prg.NA)), sep=\" \")))}+\n theme(plot.caption=element_text(hjust=0))\n#> Warning: Removed 1 row containing missing values or values outside the scale range\n#> (`geom_point()`).\n#> Warning: Removed 2 rows containing missing values or values outside the scale range\n#> (`geom_point()`).\n#> Warning: Removed 2 rows containing missing values or values outside the scale range\n#> (`geom_text()`).\n#> Warning: Removed 1 row containing missing values or values outside the scale range\n#> (`geom_text()`).\n\n\n\n\n\n\n\n\n\n\n\nCode\n#plot!\ngg_line <- tabout.dates |>\n # add a variable for when rates are higher in date1 than in date2 (for colours)\n mutate(date1high = recid_rate_date1 > recid_rate_date2) |>\n ggplot() +\n # add a line segment that goes from date1 to date2 for each program\n geom_segment(aes(x = 1, xend = 2, \n y = recid_rate_date1, yend = recid_rate_date2,\n group = programs_clean,\n col = date1high), \n size = 1.2) +\n # set the colors\n scale_color_manual(values = c(date1c, date2c), guide = \"none\") +\n # remove all axis stuff\n theme_classic() + \n theme(axis.line = element_blank(),\n axis.text = element_blank(),\n axis.title = element_blank(),\n axis.ticks = element_blank()) +\n # add vertical lines that act as axis for date1\n geom_segment(x = 1, \n xend = 1, \n y = min(tabout.dates$recid_rate_date1, na.rm=T) - 0.1,\n yend = max(tabout.dates$recid_rate_date1, na.rm=T) + 0.125,\n col = \"grey70\", size = 0.5) +\n # add vertical lines that act as axis for date2\n geom_segment(x = 2, \n xend = 2, \n y = min(tabout.dates$recid_rate_date1, na.rm=T) - 0.1,\n yend = max(tabout.dates$recid_rate_date1, na.rm=T) + 0.125,\n col = \"grey70\", size = 0.5) +\n # add the labels above their axes\n geom_text(aes(x = x, y = y, label = label),\n data = data.frame(x = 1:2, \n y = max(tabout.dates$recid_rate_date2, na.rm=T) + 0.05,\n label = c(date1, date2)),\n col = \"grey30\",\n size = 6) +\n # add the label and rate for each program next the date1 axis\n geom_text_repel(aes(x = 1 - 0.03, \n y = recid_rate_date1, \n label = paste0(programs_clean, \", \", round(recid_rate_date1, 2))),\n force_pull = 0,\n nudge_y = 0.05, nudge_x = -0.075,\n direction = \"y\",\n hjust = 1,\n segment.size = 0.2,\n max.iter = 1e4, max.time = 1) +\n # add the rate next to each point on the date2 axis\n geom_text(aes(x = 2 + 0.08, \n y = recid_rate_date2, \n label = paste0(round(recid_rate_date2, 2))),\n col = \"grey30\") +\n # set the limits of the x-axis so that the labels are not cut off\n scale_x_continuous(limits = c(0.5, 2.1)) + \n \n # add the white outline for the points at each rate for date1\n geom_point(aes(x = 1, \n y = recid_rate_date1), size = 4.5,\n col = \"white\") +\n # add the white outline for the points at each rate for date2\n geom_point(aes(x = 2, \n y = recid_rate_date2), size = 4.5,\n col = \"white\") +\n \n # add the actual points at each rate for date1\n geom_point(aes(x = 1, \n y = recid_rate_date1), size = 4,\n col = \"grey60\") +\n # add the actual points at each rate for date2\n geom_point(aes(x = 2, \n y = recid_rate_date2), size = 4,\n col = \"grey60\") \n \ngg_line +\n ggtitle(\"Recidivism Rates across EBBR programs\\n\") +\n #only print caption if a program is missing data\n {if(length(prg.NA)!=0) labs(caption = capture.output(cat(\"The following programs were missing data in some years:\", unique(toupper(prg.NA)), sep=\" \")))}+\n theme(plot.caption=element_text(hjust=0))", "crumbs": [ "2  GDOC Recidivism Analysis" ] }, { - "objectID": "va_cs.html#more-cool-data-viz", - "href": "va_cs.html#more-cool-data-viz", + "objectID": "va_cs.html#r-session", + "href": "va_cs.html#r-session", "title": "GDOC Recidivism Analysis", - "section": "More Cool Data Viz!", - "text": "More Cool Data Viz!\nLet’s prepare our data to do some really fun data viz!\n\n\nCode\n#this code will run if plotting data for multiple years, otherwise nothing will be produced (i.e., ALL.BY <- T)\n\n#manipulate data for plotting\ntabout.date1 <- tabout |>\n filter(year==date1) |>\n select(c(recid_rate_year, programs_clean, recid_rate_all)) |>\n rename(recid_rate_date1 = recid_rate_year)\ntabout.date2 <- tabout |>\n filter(year==date2) |>\n select(c(recid_rate_year, programs_clean)) |>\n rename(recid_rate_date2 = recid_rate_year)\ntabout.dates <- inner_join(tabout.date1, tabout.date2, by = \"programs_clean\") |>\n select(programs_clean, recid_rate_date1, recid_rate_date2, recid_rate_all)\n\nhead(tabout.dates)\n\n\n\n \n\n\n\n\nPlotting Overall\nWhat are some more engaging ways we can plot recidivism rates for leadership and our stakeholders overall for these programs?\n\n\nCode\n#overwrite value of rates to overall if ALL.By\n{if(!ALL.BY) tabout.dates$recid_rate_date1 <- tabout.dates$recid_rate_all}\n\n#plot two years or one year depending on ALL.BY setting\n{if(ALL.BY) plotit <- c(tabout.dates[which(tabout.dates$programs_clean==\"stages\"),]$recid_rate_date1, tabout.dates[which(tabout.dates$programs_clean==\"stages\"),]$recid_rate_date2) else plotit <- tabout.dates[which(tabout.dates$programs_clean==\"stages\"),]$recid_rate_date1}\n\n#remove label legend if by year\n{if(ALL.BY) titledates2 <- c(as.factor(date1),as.factor(date2)) else titledates2 <- \"\"}\n\n#plot!\ngg_dot <- tabout.dates |>\n # rearrange the factor levels for programs by rates for date1\n arrange(recid_rate_date1) |>\n mutate(discipline = fct_inorder(programs_clean)) |>\n \n ggplot() +\n # remove axes and superfluous grids\n theme_classic() +\n theme(axis.title = element_blank(),\n axis.ticks.y = element_blank(),\n axis.line = element_blank()) +\n \n # add a dummy point for scaling purposes\n geom_point(aes(x = 0.7, y = programs_clean), \n size = 0, col = \"white\") + \n \n # add the horizontal programs_clean lines\n geom_hline(yintercept = 1:length(tabout.dates$programs_clean), col = \"grey80\") +\n \n # add a point for each date1 recidivism rate\n geom_point(aes(x = recid_rate_date1, y = programs_clean), \n size = 11, col = date1c) +\n\n # add a point for each date2 recidivism rate\n {if(ALL.BY) geom_point(aes(x = recid_rate_date2, y = programs_clean),size = 11, col = date2c)} + \n\n # round each date2 recidivism rate\n {if(ALL.BY) geom_text(aes(x = recid_rate_date2, y = programs_clean, label = paste0(round(recid_rate_date2, 2))), col = \"black\")} +\n\n # round each date1 recidivism rate\n geom_text(aes(x = recid_rate_date1, y = programs_clean, \n label = paste0(round(recid_rate_date1, 2))),\n col = \"white\") +\n\n # add a label above the first two points\n geom_text(aes(x = x, y = y, label = label, col = label),\n data.frame(x = plotit, \n y = length(tabout.dates$programs_clean) + 1, \n label = titledates2), size = 6) +\n scale_color_manual(values = c(date1c, date2c), guide = \"none\") +\n \n # manually specify the x-axis\n scale_x_continuous(breaks = c(0, 0.25, 0.5, 0.75, 1), \n labels = c(\"0\",\"0.25\", \"0.50\", \"0.75\", \"1\")) +\n # manually set the spacing above and below the plot\n scale_y_discrete(expand = c(0.2, 0)) \n\n#add titles/captions\ngg_dot + \n {if (ALL.BY) ggtitle(\"Recidivism Rates across EBBR programs\\n\") else ggtitle(paste0(\"Recidivism Rates across EBBR programs\\n\",titledates))} +\n #only print caption if a program is missing data\n {if(length(prg.NA)!=0) labs(caption = capture.output(cat(\"The following programs were missing data in some years:\", unique(toupper(prg.NA)), sep=\" \")))}+\n theme(plot.caption=element_text(hjust=0))\n\n\n\n\nCode\n##horizontal lollipop chart\nggplot(tabout, aes(x=programs_clean, y=recid_rate_all)) +\n geom_segment( aes(x=programs_clean, xend=programs_clean, y=0, yend=recid_rate_all), color=date1c) +\n geom_point( color=staffc, size=4, alpha=0.6) +\n theme_light() +\n coord_flip() +\n xlab(\"EBBR Programs\") +\n ylab(\"Recidivism Rate\") +\n theme(\n panel.grid.major.y = element_blank(),\n panel.border = element_blank(),\n axis.ticks.y = element_blank()\n ) + \n ggtitle(paste0(\"Recidivism Rates across EBBR programs\\n\",titledates)) +\n theme(plot.caption=element_text(hjust=0)) +\n #only print caption if a program is missing data\n {if(length(prg.NA)!=0) labs(caption = capture.output(cat(\"The following programs were missing data in some years:\", unique(toupper(prg.NA)), sep=\" \")))}\n\n##horizontal lollipop chart w/weighted average\nggplot(tabout, aes(x=programs_clean, y=recid_rate_all)) +\n geom_segment(aes(x=programs_clean, xend=programs_clean, y=w.a, yend=recid_rate_all), color=date1c) +\n geom_point(color=staffc, size=4, alpha=0.6) +\n geom_hline(yintercept=w.a, linetype = \"dashed\", color = hlinew1, size = 1) +\n geom_label(aes(label=paste0(\"Weighted avg: \",w.a), x=w.a, vjust = -10, hjust = 1.5), fill=hlinew1,\n data = tabout |>\n filter(programs_clean == last & year == date2)) +\n theme_light() +\n coord_flip() +\n xlab(\"EBBR Programs\") +\n ylab(\"Recidivism Rate\") +\n theme(\n panel.grid.major.y = element_blank(),\n panel.border = element_blank(),\n axis.ticks.y = element_blank()\n ) + \n ggtitle(paste0(\"Recidivism Rates across EBBR programs\\n\",titledates)) +\n #only print caption if a program is missing data\n {if(length(prg.NA)!=0) labs(caption = capture.output(cat(\"The following programs were missing data in some years:\", unique(toupper(prg.NA)), sep=\" \")))}+\n theme(plot.caption=element_text(hjust=0))\n\n\n\n\nPlotting by Year\nWhat about displaying these rates by year?\n\n\nCode\n#overwrite value of rates to overall if ALL.By\n{if(!ALL.BY) tabout.dates$recid_rate_date1 <- tabout.dates$recid_rate_all}\n\n#plot two years or one year depending on ALL.BY setting\n{if(ALL.BY) plotit <- c(tabout.dates[which(tabout.dates$programs_clean==\"stages\"),]$recid_rate_date1, tabout.dates[which(tabout.dates$programs_clean==\"stages\"),]$recid_rate_date2) else plotit <- tabout.dates[which(tabout.dates$programs_clean==\"stages\"),]$recid_rate_date1}\n\n#remove label legend if by year\n{if(ALL.BY) titledates2 <- c(as.factor(date1),as.factor(date2)) else titledates2 <- \"\"}\n\n#plot!\ngg_dot <- tabout.dates |>\n # rearrange the factor levels for programs by rates for date1\n arrange(recid_rate_date1) |>\n mutate(discipline = fct_inorder(programs_clean)) |>\n \n ggplot() +\n # remove axes and superfluous grids\n theme_classic() +\n theme(axis.title = element_blank(),\n axis.ticks.y = element_blank(),\n axis.line = element_blank()) +\n \n # add a dummy point for scaling purposes\n geom_point(aes(x = 0.7, y = programs_clean), \n size = 0, col = \"white\") + \n \n # add the horizontal programs_clean lines\n geom_hline(yintercept = 1:length(tabout.dates$programs_clean), col = \"grey80\") +\n \n # add a point for each date1 recidivism rate\n geom_point(aes(x = recid_rate_date1, y = programs_clean), \n size = 11, col = date1c) +\n\n # add a point for each date2 recidivism rate\n {if(ALL.BY) geom_point(aes(x = recid_rate_date2, y = programs_clean),size = 11, col = date2c)} + \n\n # round each date2 recidivism rate\n {if(ALL.BY) geom_text(aes(x = recid_rate_date2, y = programs_clean, label = paste0(round(recid_rate_date2, 2))), col = \"black\")} +\n\n # round each date1 recidivism rate\n geom_text(aes(x = recid_rate_date1, y = programs_clean, \n label = paste0(round(recid_rate_date1, 2))),\n col = \"white\") +\n\n # add a label above the first two points\n geom_text(aes(x = x, y = y, label = label, col = label),\n data.frame(x = plotit, \n y = length(tabout.dates$programs_clean) + 1, \n label = titledates2), size = 6) +\n scale_color_manual(values = c(date1c, date2c), guide = \"none\") +\n \n # manually specify the x-axis\n scale_x_continuous(breaks = c(0, 0.25, 0.5, 0.75, 1), \n labels = c(\"0\",\"0.25\", \"0.50\", \"0.75\", \"1\")) +\n # manually set the spacing above and below the plot\n scale_y_discrete(expand = c(0.2, 0)) \n\n#add titles/captions\ngg_dot + \n {if (ALL.BY) ggtitle(\"Recidivism Rates across EBBR programs\\n\") else ggtitle(paste0(\"Recidivism Rates across EBBR programs\\n\",titledates))} +\n #only print caption if a program is missing data\n {if(length(prg.NA)!=0) labs(caption = capture.output(cat(\"The following programs were missing data in some years:\", unique(toupper(prg.NA)), sep=\" \")))}+\n theme(plot.caption=element_text(hjust=0))\n\n\n\n\n\n\n\n\n\n\n\nCode\n#plot!\ngg_line <- tabout.dates |>\n # add a variable for when rates are higher in date1 than in date2 (for colours)\n mutate(date1high = recid_rate_date1 > recid_rate_date2) |>\n ggplot() +\n # add a line segment that goes from date1 to date2 for each program\n geom_segment(aes(x = 1, xend = 2, \n y = recid_rate_date1, yend = recid_rate_date2,\n group = programs_clean,\n col = date1high), \n size = 1.2) +\n # set the colors\n scale_color_manual(values = c(date1c, date2c), guide = \"none\") +\n # remove all axis stuff\n theme_classic() + \n theme(axis.line = element_blank(),\n axis.text = element_blank(),\n axis.title = element_blank(),\n axis.ticks = element_blank()) +\n # add vertical lines that act as axis for date1\n geom_segment(x = 1, \n xend = 1, \n y = min(tabout.dates$recid_rate_date1, na.rm=T) - 0.1,\n yend = max(tabout.dates$recid_rate_date1, na.rm=T) + 0.125,\n col = \"grey70\", size = 0.5) +\n # add vertical lines that act as axis for date2\n geom_segment(x = 2, \n xend = 2, \n y = min(tabout.dates$recid_rate_date1, na.rm=T) - 0.1,\n yend = max(tabout.dates$recid_rate_date1, na.rm=T) + 0.125,\n col = \"grey70\", size = 0.5) +\n # add the labels above their axes\n geom_text(aes(x = x, y = y, label = label),\n data = data.frame(x = 1:2, \n y = max(tabout.dates$recid_rate_date2, na.rm=T) + 0.05,\n label = c(date1, date2)),\n col = \"grey30\",\n size = 6) +\n # add the label and rate for each program next the date1 axis\n geom_text_repel(aes(x = 1 - 0.03, \n y = recid_rate_date1, \n label = paste0(programs_clean, \", \", round(recid_rate_date1, 2))),\n force_pull = 0,\n nudge_y = 0.05, nudge_x = -0.075,\n direction = \"y\",\n hjust = 1,\n segment.size = 0.2,\n max.iter = 1e4, max.time = 1) +\n # add the rate next to each point on the date2 axis\n geom_text(aes(x = 2 + 0.08, \n y = recid_rate_date2, \n label = paste0(round(recid_rate_date2, 2))),\n col = \"grey30\") +\n # set the limits of the x-axis so that the labels are not cut off\n scale_x_continuous(limits = c(0.5, 2.1)) + \n \n # add the white outline for the points at each rate for date1\n geom_point(aes(x = 1, \n y = recid_rate_date1), size = 4.5,\n col = \"white\") +\n # add the white outline for the points at each rate for date2\n geom_point(aes(x = 2, \n y = recid_rate_date2), size = 4.5,\n col = \"white\") +\n \n # add the actual points at each rate for date1\n geom_point(aes(x = 1, \n y = recid_rate_date1), size = 4,\n col = \"grey60\") +\n # add the actual points at each rate for date2\n geom_point(aes(x = 2, \n y = recid_rate_date2), size = 4,\n col = \"grey60\") \n \ngg_line +\n ggtitle(\"Recidivism Rates across EBBR programs\\n\") +\n #only print caption if a program is missing data\n {if(length(prg.NA)!=0) labs(caption = capture.output(cat(\"The following programs were missing data in some years:\", unique(toupper(prg.NA)), sep=\" \")))}+\n theme(plot.caption=element_text(hjust=0))", + "section": "R Session", + "text": "R Session\n\n\nCode\n#for reproducibility\nsi <- sessioninfo::session_info()\nsi$packages$library <- NULL\nsi$platform$pandoc <- NULL\nsi\n#> ─ Session info ───────────────────────────────────────────────────────────────\n#> setting value\n#> version R version 4.4.1 (2024-06-14 ucrt)\n#> os Windows 10 x64 (build 19045)\n#> system x86_64, mingw32\n#> ui RTerm\n#> language (EN)\n#> collate English_United States.utf8\n#> ctype English_United States.utf8\n#> tz America/Denver\n#> date 2024-07-21\n#> \n#> ─ Packages ───────────────────────────────────────────────────────────────────\n#> package * version date (UTC) lib source\n#> charlatan * 0.5.1 2023-09-13 [] CRAN (R 4.4.1)\n#> cli 3.6.3 2024-06-21 [] CRAN (R 4.4.1)\n#> colorspace 2.1-0 2023-01-23 [] CRAN (R 4.4.1)\n#> digest 0.6.36 2024-06-23 [] CRAN (R 4.4.1)\n#> dplyr * 1.1.4 2023-11-17 [] CRAN (R 4.4.1)\n#> evaluate 0.24.0 2024-06-10 [] CRAN (R 4.4.1)\n#> fansi 1.0.6 2023-12-08 [] CRAN (R 4.4.1)\n#> farver 2.1.2 2024-05-13 [] CRAN (R 4.4.1)\n#> fastmap 1.2.0 2024-05-15 [] CRAN (R 4.4.1)\n#> forcats * 1.0.0 2023-01-29 [] CRAN (R 4.4.1)\n#> generics 0.1.3 2022-07-05 [] CRAN (R 4.4.1)\n#> ggplot2 * 3.5.1 2024-04-23 [] CRAN (R 4.4.1)\n#> ggrepel * 0.9.5 2024-01-10 [] CRAN (R 4.4.1)\n#> glue 1.7.0 2024-01-09 [] CRAN (R 4.4.1)\n#> gtable 0.3.5 2024-04-22 [] CRAN (R 4.4.1)\n#> highr 0.11 2024-05-26 [] CRAN (R 4.4.1)\n#> hms 1.1.3 2023-03-21 [] CRAN (R 4.4.1)\n#> htmltools 0.5.8.1 2024-04-04 [] CRAN (R 4.4.1)\n#> jsonlite 1.8.8 2023-12-04 [] CRAN (R 4.4.1)\n#> knitr * 1.48 2024-07-07 [] CRAN (R 4.4.1)\n#> labeling 0.4.3 2023-08-29 [] CRAN (R 4.4.0)\n#> lifecycle 1.0.4 2023-11-07 [] CRAN (R 4.4.1)\n#> lubridate * 1.9.3 2023-09-27 [] CRAN (R 4.4.1)\n#> magrittr 2.0.3 2022-03-30 [] CRAN (R 4.4.1)\n#> munsell 0.5.1 2024-04-01 [] CRAN (R 4.4.1)\n#> pillar 1.9.0 2023-03-22 [] CRAN (R 4.4.1)\n#> pkgconfig 2.0.3 2019-09-22 [] CRAN (R 4.4.1)\n#> purrr * 1.0.2 2023-08-10 [] CRAN (R 4.4.1)\n#> R6 2.5.1 2021-08-19 [] CRAN (R 4.4.1)\n#> Rcpp 1.0.13 2024-07-17 [] CRAN (R 4.4.1)\n#> readr * 2.1.5 2024-01-10 [] CRAN (R 4.4.1)\n#> rlang 1.1.4 2024-06-04 [] CRAN (R 4.4.1)\n#> rmarkdown 2.27 2024-05-17 [] CRAN (R 4.4.1)\n#> rstudioapi 0.16.0 2024-03-24 [] CRAN (R 4.4.1)\n#> scales 1.3.0 2023-11-28 [] CRAN (R 4.4.1)\n#> sessioninfo 1.2.2 2021-12-06 [] CRAN (R 4.4.1)\n#> stringi 1.8.4 2024-05-06 [] CRAN (R 4.4.0)\n#> stringr * 1.5.1 2023-11-14 [] CRAN (R 4.4.1)\n#> tibble * 3.2.1 2023-03-20 [] CRAN (R 4.4.1)\n#> tidyr * 1.3.1 2024-01-24 [] CRAN (R 4.4.1)\n#> tidyselect 1.2.1 2024-03-11 [] CRAN (R 4.4.1)\n#> tidyverse * 2.0.0 2023-02-22 [] CRAN (R 4.4.1)\n#> timechange 0.3.0 2024-01-18 [] CRAN (R 4.4.1)\n#> tzdb 0.4.0 2023-05-12 [] CRAN (R 4.4.1)\n#> utf8 1.2.4 2023-10-22 [] CRAN (R 4.4.1)\n#> vctrs 0.6.5 2023-12-01 [] CRAN (R 4.4.1)\n#> whisker 0.4.1 2022-12-05 [] CRAN (R 4.4.1)\n#> withr 3.0.0 2024-01-16 [] CRAN (R 4.4.1)\n#> xfun 0.46 2024-07-18 [] CRAN (R 4.4.1)\n#> yaml 2.3.9 2024-07-05 [] CRAN (R 4.4.1)\n#> \n#> \n#> ──────────────────────────────────────────────────────────────────────────────", "crumbs": [ "2  GDOC Recidivism Analysis" ] diff --git a/va_cs.qmd b/va_cs.qmd index c40e8d0..03cc636 100644 --- a/va_cs.qmd +++ b/va_cs.qmd @@ -164,9 +164,9 @@ write.csv(staff,"staff.csv", row.names = FALSE) ```{r toggle, echo=FALSE} #ANALYSIS TOGGLE ###################################### -roster <- roster.update #options: roster; roster.update -date1 <- date3 #options: date1 (within roster); date3 (within roster.update) -date2 <- date4 #options: date2 (within roster); date4 (within roster.update) +roster <- roster #options: roster; roster.update +date1 <- date1 #options: date1 (within roster); date3 (within roster.update) +date2 <- date2 #options: date2 (within roster); date4 (within roster.update) ALL.BY <- T #change to F if you want to plot recidivism rate over two years TOTAL (T if you want to stratify by year) #colors @@ -693,8 +693,8 @@ head(tabout.dates) ```{r dataviz4, echo=FALSE, warning=FALSE, message=FALSE, include=FALSE} #| code-fold: true - -#overwrite value of rates to overall if ALL.By +#make some really cool horizontal floating dot charts! +#overwrite value of rates to overall if ALL.BY {if(!ALL.BY) tabout.dates$recid_rate_date1 <- tabout.dates$recid_rate_all} #plot two years or one year depending on ALL.BY setting @@ -769,7 +769,7 @@ What are some more engaging ways we can plot recidivism rates for leadership and #| ref-label: 'dataviz4' ``` -```{r dataviz6, warning=FALSE, message=FALSE, eval=!ALL.BY} +```{r dataviz6, warning=FALSE, message=FALSE} #| code-fold: true ##horizontal lollipop chart