-
Notifications
You must be signed in to change notification settings - Fork 0
/
README.RMD
130 lines (96 loc) · 3.03 KB
/
README.RMD
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
---
title: "TokenLink"
output: github_document
date: '2022-05-05'
editor_options:
  chunk_output_type: console
---
# TokenLink
Link two datasets using the tokens (words) they have in common.
```{r setup, include=FALSE}
# Keep warnings and messages out of the rendered README.
knitr::opts_chunk$set(message = FALSE, warning = FALSE)
```
# Install
```{r Install, eval = FALSE}
# Install TokenLink from its GitHub repository (needs the devtools package).
devtools::install_github(repo = "csps-efpc/TokenLink")
```
# Example Basic Usage
## Load Libraries and Data from Internet
```{r load_data}
# NOTE(review): sourcing R/tokenify.R only works when knitting from the
# package root and looks redundant next to library(TokenLink) -- confirm
# before removing.
source('R/tokenify.R')
library(tidyr)
library(dplyr)
library(readr)
library(magrittr)
library(TokenLink)

# Canonical locations of the two data sets, kept for reference (presumably
# the targets of the short links that are actually downloaded below):
#   CEO departures:
#     https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-04-27/departures.csv
#   Alberta food industry:
#     https://open.alberta.ca/dataset/a2b1fc9b-aac4-4718-8645-b0466ca5ec57/resource/3da9a7f9-bd34-48c0-841f-19c856b551ad/download/foodindustry.csv
ceo_url <- "https://tinyurl.com/2p8etjr6"
alb_url <- "https://tinyurl.com/2p8ap4ad"

# Load both data sets from the internet.
dat_ceo <- readr::read_csv(ceo_url)
dat_alb <- readr::read_csv(alb_url)
```
### CEO Resignation data from [Tidy Tuesday](https://github.com/rfordatascience/tidytuesday/blob/master/data/2021/2021-04-27/readme.md)
```{r echo=FALSE}
# Render three randomly sampled rows of dat_ceo, keeping only the company
# and executive name columns.
dat_ceo |>
  dplyr::sample_n(3) |>
  dplyr::select(coname, exec_fullname) |>
  knitr::kable(floating.environment = "sidewaystable")
```
### Data from [open.alberta.ca](https://open.alberta.ca/dataset/a2b1fc9b-aac4-4718-8645-b0466ca5ec57)
```{r echo=FALSE}
# Render three randomly sampled rows of dat_alb, keeping only the company
# name and address columns.
dat_alb |>
  dplyr::sample_n(3) |>
  dplyr::select(companyName, address, town, province) |>
  knitr::kable(floating.environment = "sidewaystable")
```
# Show Tokenization
```{r basic_tokenization}
# Tokenize the `coname` column of dat_ceo using the 'company_name' token type.
dat_ceo_tokes <-
  dat_ceo |>
  tokenize_ations(col_nms = 'coname', token_types = 'company_name')

# Show all tokens of three randomly chosen rows: nest the tokens per
# row_name, sample three of the nested rows, then expand them again.
dat_ceo_tokes |>
  magrittr::extract2('tokens') |>
  group_by(row_name) |>
  nest() |>
  ungroup() |>
  sample_n(3) |>
  # tidyr >= 1.0.0 requires `cols`; `data` is the list-column nest() creates.
  unnest(cols = c(data)) |>
  knitr::kable(caption = 'Tokens')
```
# Count Tokenization
```{r count_tokens}
# Number of rows taken from each of the head, a random sample and the tail.
nsamp <- 4

# Preview the token counts: first rows, random rows and last rows combined,
# then sorted by descending count.
dat_ceo_tokes |>
  magrittr::extract2("token_counts") |>
  (\(counts) {
    bind_rows(
      head(counts, nsamp),
      sample_n(counts, nsamp),
      tail(counts, nsamp)
    )
  })() |>
  arrange(desc(n)) |>
  knitr::kable(caption = "Token Counts")
```
# Create a t_dat Object
```{r basic_linking}
# Build the t_dat object from the two data sets; the company name lives in
# `coname` on the x side and in `companyName` on the y side.
t_dat <- token_links(
  dat_x = dat_ceo,
  dat_y = dat_alb,
  args_x = list(col_nms = "coname"),
  args_y = list(col_nms = "companyName"),
  token_types = "company_name",
  token_index = "",
  suffix = c("ceo", "alb")
)

# Preview the combined token table: first, random and last `nsamp` rows.
t_dat |>
  extract2("tokens_all") |>
  (\(tokens) {
    bind_rows(
      head(tokens, nsamp),
      sample_n(tokens, nsamp),
      tail(tokens, nsamp)
    )
  })() |>
  knitr::kable(caption = "All Tokens")
```
# Find Posteriors
```{r find_posterior}
# Replace t_dat with the result of find_posterior().
t_dat <- find_posterior(t_dat)

# Preview the evidence table (first, random and last `nsamp` rows), sorted
# by descending posterior.
t_dat$all_evidence |>
  (\(evidence) {
    bind_rows(
      head(evidence, nsamp),
      sample_n(evidence, nsamp),
      tail(evidence, nsamp)
    )
  })() |>
  arrange(desc(posterior)) |>
  knitr::kable()
```
# Visual Comparison of Results
```{r joind_result}
# Join the linked records, carrying the listed link columns along, and
# preview the first, random and last `nsamp` rows by descending posterior.
t_dat |>
  joined_results(
    include_row_numbers = TRUE,
    link_col_nms = c("posterior", "tokens_in_favour", "tokens_against")
  ) |>
  (\(joined) {
    bind_rows(
      head(joined, nsamp),
      sample_n(joined, nsamp),
      tail(joined, nsamp)
    )
  })() |>
  arrange(desc(posterior)) |>
  knitr::kable()
```