Skip to content

Commit

Permalink
Merge pull request MHindermann#4 from RISE-UNIBAS/master
Browse files Browse the repository at this point in the history
Sync
  • Loading branch information
MHindermann authored Nov 9, 2022
2 parents 1dbb0a8 + 16e82f4 commit 48537b0
Showing 1 changed file with 7 additions and 4 deletions.
11 changes: 7 additions & 4 deletions files/analysis/ner/spacy_update.R
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ library(foreign)
library(dplyr)
library(quanteda)
library(readtext)
mypath <- "/Users/antheaalberto/switchdrive/RISE/Veranstaltungen/20220301_BGSH/Rheinschifffahrt Showcase/transcriptions/txt"
mypath <- "/Users/antheaalberto/Documents/GitHub/rheinschifffahrt-showcase/files/transcriptions/txt"
setwd(mypath)

## create a list of all .txt-files in directory and read them into the environment
Expand All @@ -22,10 +22,11 @@ rownames(rhein_txt) <- rhein_txt$image
spacy_initialize(model = "de_core_news_sm")
parsed_txt <- spacy_parse(rhein_txt$text)
head(parsed_txt)
tail(parsed_txt) # OCR has not worked properly everywhere, but that need not concern us for now

# extract named entities
rhein_entity <- entity_extract(parsed_txt)
#persons <- rhein_entity[rhein_entity$entity_type=="PER",]
persons <- rhein_entity[rhein_entity$entity_type=="PER",]

## creating a link (e.g. 0002.jpg) to merge it to images and metadata
rhein_entity$link <- gsub("text","", rhein_entity$doc_id)
Expand All @@ -35,5 +36,7 @@ rhein_entity$link <- ifelse(rhein_entity$link < 10, paste0("000", rhein_entity$l
paste0("0", rhein_entity$link)))
rhein_entity$link <- paste0(rhein_entity$link, ".jpg")

setwd("/Users/antheaalberto/switchdrive/RISE/Veranstaltungen/20220301_BGSH/Rheinschifffahrt Showcase")
write.csv(rhein_entity, file = "persons.csv", fileEncoding = "UTF-8")
#setwd("/Users/antheaalberto/switchdrive/RISE/Veranstaltungen/20220301_BGSH/Rheinschifffahrt Showcase")
#write.csv(rhein_entity, file = "persons.csv", fileEncoding = "UTF-8")
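# The two lines above write the extracted entities to "persons.csv".
# NOTE: as written they export rhein_entity (apparently all entity types),
# not the `persons` subset; pass `persons` instead if only persons are wanted.
# They are commented out because the file has already been saved earlier.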

0 comments on commit 48537b0

Please sign in to comment.