From b82c40cb03807787f8cc46e0f08d9a6265579798 Mon Sep 17 00:00:00 2001 From: antheajeanne <94174603+antheajeanne@users.noreply.github.com> Date: Wed, 9 Nov 2022 14:28:50 +0100 Subject: [PATCH] Update spacy_update.R changed path and adapted script slightly --- files/analysis/ner/spacy_update.R | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/files/analysis/ner/spacy_update.R b/files/analysis/ner/spacy_update.R index 832953b..6dca218 100644 --- a/files/analysis/ner/spacy_update.R +++ b/files/analysis/ner/spacy_update.R @@ -4,7 +4,7 @@ library(foreign) library(dplyr) library(quanteda) library(readtext) -mypath <- "/Users/antheaalberto/switchdrive/RISE/Veranstaltungen/20220301_BGSH/Rheinschifffahrt Showcase/transcriptions/txt" +mypath <- "/Users/antheaalberto/Documents/GitHub/rheinschifffahrt-showcase/files/transcriptions/txt" setwd(mypath) ## create a list of all .txt-files in directory and read them into the environment @@ -22,10 +22,11 @@ rownames(rhein_txt) <- rhein_txt$image spacy_initialize(model = "de_core_news_sm") parsed_txt <- spacy_parse(rhein_txt$text) head(parsed_txt) +tail(parsed_txt) # OCR has not worked properly everywhere, but that need not concern us for now # extract named entities rhein_entity <- entity_extract(parsed_txt) -#persons <- rhein_entity[rhein_entity$entity_type=="PER",] +persons <- rhein_entity[rhein_entity$entity_type=="PER",] ## creating a link (e.g. 
0002.jpg) to merge it to images and metadata rhein_entity$link <- gsub("text","", rhein_entity$doc_id) @@ -35,5 +36,7 @@ rhein_entity$link <- ifelse(rhein_entity$link < 10, paste0("000", rhein_entity$l paste0("0", rhein_entity$link))) rhein_entity$link <- paste0(rhein_entity$link, ".jpg") -setwd("/Users/antheaalberto/switchdrive/RISE/Veranstaltungen/20220301_BGSH/Rheinschifffahrt Showcase") -write.csv(rhein_entity, file = "persons.csv", fileEncoding = "UTF-8") +#setwd("/Users/antheaalberto/switchdrive/RISE/Veranstaltungen/20220301_BGSH/Rheinschifffahrt Showcase") +#write.csv(rhein_entity, file = "persons.csv", fileEncoding = "UTF-8") +# The above lines write rhein_entity to persons.csv (note: rhein_entity, not the persons subset) +# They are commented out because the file has already been saved