-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathHelfrich-1916-orthoprofile.R
123 lines (106 loc) · 5.83 KB
/
Helfrich-1916-orthoprofile.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
library(tidyverse)
library(stringi)
library(qlcData)
# Load the word list from "helfrich_wl.rds" and add one row-number ID per
# orthography column (`form`, `variant`, `crossref_form`). A row whose
# corresponding column is NA gets the ID 0 instead of its row number.
# The fallback is written as `0L` so that both branches of `if_else()` are
# integer, like `row_number()`: a double `0` is rejected by dplyr versions
# that require identical types, and otherwise turns the IDs into doubles.
helfrich_wl <- readr::read_rds("helfrich_wl.rds") |>
  mutate(
    ortho_id_form = if_else(is.na(form), 0L, row_number()),
    ortho_id_variant = if_else(is.na(variant), 0L, row_number()),
    ortho_id_xref = if_else(is.na(crossref_form), 0L, row_number())
  )
# dir.create("ortho")
# Create profile ====
## form column ====
# helfrich_wl |>
# filter(!is.na(form)) |>
# pull(form) |>
# qlcData::write.profile(normalize = "NFC", editing = TRUE, info = FALSE,
# file.out = "ortho/_01-h1916-form_profile-skeleton.tsv")
## variant column ====
# helfrich_wl |>
# filter(!is.na(variant)) |>
# pull(variant) |>
# qlcData::write.profile(normalize = "NFC", editing = TRUE, info = FALSE,
# file.out = "ortho/_02-h1916-variant_profile-skeleton.tsv")
## cross-reference column ====
# helfrich_wl |>
# filter(!is.na(crossref_form)) |>
# pull(crossref_form) |>
# qlcData::write.profile(normalize = "NFC", editing = TRUE, info = FALSE,
# file.out = "ortho/_03-h1916-xref_profile-skeleton.tsv")
# transliterate ====
# read_tsv("https://raw.githubusercontent.com/engganolang/enolex/main/ortho/_12-helfrich1916_profile-skeleton-ipa.tsv") |>
# write_tsv("ortho/_00-ortho-ipa.tsv", na = "") # save the IPA profile
## form ====
# Tokenise and transliterate the `form` column with the form profile; the
# transliteration is taken from the profile's "Replacement" column.
h1916 <- helfrich_wl[["form"]] |>
  qlcData::tokenize(
    profile = "ortho/_01-h1916-form_profile-skeleton.tsv",
    transliterate = "Replacement",
    method = "global",
    ordering = NULL, # cf. Moran & Cysouw (2018: 112-114)
    regex = TRUE,
    normalize = "NFC",
    sep.replace = "#",
    file.out = "ortho/_04-h1916-form"
  )
# Keep the transliterated strings under their own name.
h1916_form_common_tokenised <- h1916[["strings"]][["transliterated"]]
# Show the `missing` element of the result for manual inspection.
h1916[["missing"]]
### form IPA ====
# Tokenise the `form` column against the IPA profile, transliterating through
# its "Phoneme" column.
h1916_form_ipa_tokenised <- helfrich_wl[["form"]] |>
  qlcData::tokenize(
    profile = "ortho/_00-ortho-ipa.tsv",
    transliterate = "Phoneme",
    method = "global",
    ordering = NULL, # cf. Moran & Cysouw (2018: 112-114)
    regex = TRUE,
    normalize = "NFC",
    sep.replace = "#",
    file.out = "ortho/_04-h1916-form-ipa"
  )
# Show the `missing` and `errors` elements while the full result is still
# available under this name.
h1916_form_ipa_tokenised[["missing"]]
h1916_form_ipa_tokenised[["errors"]]
# From here on the name holds only the transliterated strings.
h1916_form_ipa_tokenised <-
  h1916_form_ipa_tokenised[["strings"]][["transliterated"]]
## variant ====
# Tokenise and transliterate the `variant` column; the transliteration is
# taken from the profile's "Replacement" column.
# NOTE(review): this reads the form profile (_01-...), not a variant-specific
# profile -- confirm that sharing one profile across columns is intended.
h1916_variant <- qlcData::tokenize(
  helfrich_wl$variant,
  profile = "ortho/_01-h1916-form_profile-skeleton.tsv",
  file.out = "ortho/_04-h1916-variant",
  method = "global",
  transliterate = "Replacement",
  ordering = NULL, # cf. Moran & Cysouw (2018: 112-114)
  normalize = "NFC",
  sep.replace = "#",
  regex = TRUE
)
h1916_variant_common_tokenised <- h1916_variant$strings$transliterated
# Show the `missing` element, as is done for `form` (`h1916$missing`), so that
# variant graphemes absent from the profile do not go unnoticed.
h1916_variant$missing
### variant IPA ====
# Tokenise the `variant` column against the IPA profile, transliterating
# through its "Phoneme" column.
h1916_variant_ipa_tokenised <- helfrich_wl[["variant"]] |>
  qlcData::tokenize(
    profile = "ortho/_00-ortho-ipa.tsv",
    transliterate = "Phoneme",
    method = "global",
    ordering = NULL, # cf. Moran & Cysouw (2018: 112-114)
    regex = TRUE,
    normalize = "NFC",
    sep.replace = "#",
    file.out = "ortho/_04-h1916-variant-ipa"
  )
# Show the `missing` and `errors` elements while the full result is still
# available under this name.
h1916_variant_ipa_tokenised[["missing"]]
h1916_variant_ipa_tokenised[["errors"]]
# From here on the name holds only the transliterated strings.
h1916_variant_ipa_tokenised <-
  h1916_variant_ipa_tokenised[["strings"]][["transliterated"]]
## cross-ref ====
# Tokenise and transliterate the `crossref_form` column; the transliteration
# is taken from the profile's "Replacement" column.
# NOTE(review): this reads the form profile (_01-...), not a cross-reference
# profile -- confirm that sharing one profile across columns is intended.
h1916_xref <- qlcData::tokenize(
  helfrich_wl$crossref_form,
  profile = "ortho/_01-h1916-form_profile-skeleton.tsv",
  file.out = "ortho/_04-h1916-xref",
  method = "global",
  transliterate = "Replacement",
  ordering = NULL, # cf. Moran & Cysouw (2018: 112-114)
  normalize = "NFC",
  sep.replace = "#",
  regex = TRUE
)
h1916_xref_common_tokenised <- h1916_xref$strings$transliterated
# Show the `missing` element, as is done for `form` (`h1916$missing`), so that
# cross-reference graphemes absent from the profile do not go unnoticed.
h1916_xref$missing
### cross-ref IPA ====
# Tokenise the `crossref_form` column against the IPA profile, transliterating
# through its "Phoneme" column.
h1916_xref_ipa_tokenised <- helfrich_wl[["crossref_form"]] |>
  qlcData::tokenize(
    profile = "ortho/_00-ortho-ipa.tsv",
    transliterate = "Phoneme",
    method = "global",
    ordering = NULL, # cf. Moran & Cysouw (2018: 112-114)
    regex = TRUE,
    normalize = "NFC",
    sep.replace = "#",
    file.out = "ortho/_04-h1916-xref-ipa"
  )
# Show the `missing` and `errors` elements while the full result is still
# available under this name.
h1916_xref_ipa_tokenised[["missing"]]
h1916_xref_ipa_tokenised[["errors"]]
# From here on the name holds only the transliterated strings.
h1916_xref_ipa_tokenised <-
  h1916_xref_ipa_tokenised[["strings"]][["transliterated"]]