-
-
Notifications
You must be signed in to change notification settings - Fork 12
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
4bdf171
commit dd89202
Showing
2 changed files
with
152 additions
and
4 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,149 @@ | ||
#!/usr/bin/env Rscript | ||
|
||
# Install/Attach necessary packages to session | ||
pkgs <- c('tidyverse','tidytext','tokenizers','DiagrammeR','glue') | ||
xfun::pkg_attach2(pkgs) | ||
|
||
#===============================================================#
#################### Environmental Variables ####################
# Root directory of the vault to analyze. Set the VAULT_PATH environment
# variable to point the script at another vault; when it is unset or empty
# the original hard-coded location is used.
vault_path <- Sys.getenv("VAULT_PATH", unset = "")
if (!nzchar(vault_path)) {
  vault_path <- '/Users/bryanjenks/Documents/Knowledge'
}
#===============================================================#
|
||
#===============================================================#
####################    Global Variables     ####################
# Placeholders only: each of these is reassigned further down the script.
# `aggregate_dataframe` is the accumulator that analyze() appends words to.
aggregate_dataframe <- tibble(word = character(0))
chart_string <- ""
top_5_words <- 0
total_file_count <- 0
total_word_count <- 0
#===============================================================#
|
||
# The actual processing of the text.
#
# Reads every file in `files`, strips wiki-link brackets, newlines and
# underscore runs, tokenizes the text into lowercase words and appends those
# words to the global `aggregate_dataframe`.
#
# files: character vector of file paths, resolved against the current
#        working directory.
# Returns (invisibly) the updated global `aggregate_dataframe`.
analyze <- function(files) {
  # Nothing to read: leave the global accumulator exactly as it is
  if (length(files) == 0) {
    return(invisible(aggregate_dataframe))
  }

  # Iterate over the `files` argument (not the global `fileNames`) so the
  # function processes what the caller actually passed in.
  per_file_words <- lapply(files, function(file) {
    read_file(file) %>%
      # remove `[[` and `]]` so the link words are recognized as text too
      gsub("\\[\\[", "", .) %>%
      gsub("\\]\\]", "", .) %>%
      gsub("\\n", " ", .) %>%
      gsub("\\_+", "", .) %>%
      # tokenize the words
      tokenize_words(strip_punct = TRUE,
                     strip_numeric = TRUE,
                     simplify = TRUE,
                     lowercase = TRUE) %>%
      # Make the words into a one-column data frame
      unlist() %>%
      as_tibble()
  })

  # Bind everything in a single rbind() call instead of growing the data
  # frame once per file, which copies the whole accumulator every iteration.
  aggregate_dataframe <<- do.call(
    rbind,
    c(list(aggregate_dataframe), per_file_words)
  )
  invisible(aggregate_dataframe)
}
|
||
# Load files from vault
setwd(vault_path) # Sorry Jenny Bryan!
# `pattern` is a regular expression, not a glob: anchor on a literal ".md"
# extension so names that merely contain "md" (e.g. "cmd.txt") are skipped.
# The returned paths are relative to vault_path, which is why the working
# directory is switched above before the files are read.
fileNames <- list.files(vault_path, recursive = TRUE, pattern = "\\.md$")

# Let's start analyzing!
analyze(fileNames)
|
||
# Tally how often each word occurs, most frequent first
distinct_words <- count(
  aggregate_dataframe,
  word = value,
  name = "count",
  sort = TRUE
)

# Drop every word that also appears in the `stop_words` table
stop_words_removed <- anti_join(distinct_words, stop_words, by = "word")
|
||
|
||
|
||
# 'bing' is better for binary sentiment
# but 'nrc' produces nicer visuals 🤷
sentiments <- get_sentiments("nrc")

# Count how many vault words fall under each sentiment. The raw aggregate
# column of words (with dupes) is used, so a repeated word counts every time.
word_sentiments <- aggregate_dataframe %>%
  # Add sentiments to the data frame
  left_join(sentiments, by = c('value' = 'word')) %>%
  # Any missing sentiment words are removed
  filter(!is.na(sentiment)) %>%
  # Select only the columns we need
  select(sentiment) %>%
  # count frequency of sentiments
  count(sentiment)

# Look each sentiment up by name rather than by row position: count() only
# returns rows for sentiments that actually occur, so a single absent
# sentiment would otherwise shift every later value onto the wrong label.
sentiment_names <- c("anger", "anticipation", "disgust", "fear", "joy",
                     "negative", "positive", "sadness", "surprise", "trust")
nums <- word_sentiments[[2]][match(sentiment_names, word_sentiments[[1]])]
# Sentiments that never occur in the vault count as zero
nums[is.na(nums)] <- 0L

# Assign each sentiment its value count (order follows sentiment_names)
anger <- nums[1]
anticipation <- nums[2]
disgust <- nums[3]
fear <- nums[4]
joy <- nums[5]
negative <- nums[6]
positive <- nums[7]
sadness <- nums[8]
surprise <- nums[9]
trust <- nums[10]
|
||
#======================================================#
# GLOBAL VAR -- Total number of words collected from the vault
total_word_count <- nrow(aggregate_dataframe)

# GLOBAL VAR -- Number of files that were analyzed
total_file_count <- length(fileNames)

# GLOBAL VAR -- Rows with the 5 highest counts once stop words are removed,
# split into parallel vectors of words and their frequencies
top_5_words <- slice_max(stop_words_removed, count, n = 5)
word_5 <- top_5_words[["word"]]
count_5 <- top_5_words[["count"]]

# GLOBAL VAR -- Mermaid pie chart built from the per-sentiment counts
chart_string <- glue("```mermaid
pie title Vault Sentiment
\"anger\": {anger}
\"anticipation\": {anticipation}
\"disgust\": {disgust}
\"fear\": {fear}
\"joy\": {joy}
\"negative\": {negative}
\"positive\": {positive}
\"sadness\": {sadness}
\"surprise\": {surprise}
\"trust\": {trust}
```")
#======================================================#
|
||
# Create the presentation string of what actually gets written to the file
# and displayed in Obsidian: file/word totals, a table of the top 5 words
# and the sentiment chart.
output_string <- glue("
# Vault Analysis
## Stats
**File Count:** {total_file_count}
**Word Count:** {total_word_count}
**Top 5 Words:**
| Word | Frequency |
|:----|:-----------|
| {word_5[1]} | {count_5[1]} |
| {word_5[2]} | {count_5[2]} |
| {word_5[3]} | {count_5[3]} |
| {word_5[4]} | {count_5[4]} |
| {word_5[5]} | {count_5[5]} |
## Visualization
{chart_string}
")

# Hand writeLines() the path instead of a manually managed connection: it
# opens the file and closes it again itself, so the connection is released
# even if the write fails. The path is relative to the working directory.
writeLines(output_string, "Vault Stats.md")