forked from caltechlibrary/irdmtools
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrun_migration.bash
executable file
·337 lines (276 loc) · 7.74 KB
/
run_migration.bash
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
#!/bin/bash
#
# Migration reads a replica of the EPrints repository database
# from the localhost (this could be changed to point at production)
# and updates a local dataset collection with records harvested using
# eprint2rdm.
#
#
# usage writes the manual page for this script (Markdown with a YAML
# front matter block) to standard out.
#
# The program name shown in the page is taken from the basename of $0.
# APP_NAME is assigned without `local`, so it remains set in the
# caller's scope after usage returns.
#
function usage() {
  APP_NAME=$(basename "$0")
  # The here-document delimiter is unquoted so ${APP_NAME} is expanded.
  cat <<EOT
---
title: "${APP_NAME} (1) user manual"
pubDate: 2023-02-28
author: "R. S. Doiel"
---
# NAME
${APP_NAME}
# SYNOPSIS
${APP_NAME} [OPTIONS] [setup|export|import] [keys|key_list|full]
# DESCRIPTION
${APP_NAME} can harvest records from an EPrints repository based on
settings in the environment (e.g. REPO_ID, EPRINT_USER, EPRINT_PASSWORD,
C_NAME). ${APP_NAME} will generate datasets for each record
EPrints repository harvested. It can also import the harvested records
into Invenio RDM.
If you are missing configuration ${APP_NAME} will prompt and create
the necessary configuration based on the repository name you're
harvesting (e.g. caltechauthors configuration file would be
'caltechauthors.env').
If "full" is passed as a parameter it will harvest EPrint records
created since 2008.
# OPTIONS
-h,-help,--help
: Display this help page
setup
: configure the export and import working directory
export
: export an EPrints repository into a dataset collection
import
: import the dataset collection into Invenio-RDM
keys
: works with export, harvest all keys from the EPrints repository
key_list
: works with export and import, the filename containing the ids to harvest
full
: export a list of all keys available from EPrints, then export or import the collections from those keys.
# ENVIRONMENT
The following environment variables are relied on.
EPRINT_HOST
: The hostname used for the EPrints repository, e.g. authors.library.caltech.edu
EPRINT_USER
: This is the username to access the EPrints repository with eprint2rdm
EPRINT_PASSWORD
: This is the password to access the EPrints repository with eprint2rdm
C_NAME
: The dataset collection to save the harvested EPrints content in.
# EXAMPLE
Set things up
~~~
${APP_NAME} setup
~~~
Harvest all records in repositories defined in environment.
~~~
${APP_NAME} full
~~~
EOT
}
#
# setup_dataset_collection makes sure the dataset collection directory
# named by C_NAME exists. When it is missing, a MySQL database named
# "<basename of C_NAME without .ds>_ds" is created and the collection
# is initialised against it with `dataset init`.
#
# Globals read:    C_NAME, DB_USER, DB_PASSWORD
# Globals written: DB_USER, DB_PASSWORD (only when they are empty and
#                  have to be prompted for)
# Exits with status 1 when C_NAME is empty, when `dataset init` fails,
# when the collection directory is still missing afterwards, or when
# `dataset count` cannot read the new collection.
#
function setup_dataset_collection() {
  local db_name conn conn_redacted

  if [ "$C_NAME" = "" ]; then
    echo "C_NAME is not set, cannot set up a dataset collection, aborting" >&2
    exit 1
  fi
  if [ ! -d "$C_NAME" ]; then
    echo "Need to create $C_NAME using MySQL JSON store"
    if [ "$DB_USER" = "" ]; then
      read -r -p 'MySQL DB username: ' DB_USER
    fi
    if [ "$DB_PASSWORD" = "" ]; then
      echo -n 'MySQL DB user password: '
      read -r -s DB_PASSWORD
      # The silent read does not echo the user's newline, so emit one.
      echo ''
    fi
    db_name="$(basename "${C_NAME}" .ds)_ds"
    conn="mysql://${DB_USER}:${DB_PASSWORD}@/${db_name}"
    # Only the redacted form is ever printed so the password does not
    # end up on the terminal or in captured logs.
    conn_redacted="mysql://${DB_USER}:********@/${db_name}"
    echo "Creating MySQL database $db_name"
    echo "with $conn_redacted"
    # A failure here is reported but not fatal: `dataset init` below is
    # the step whose failure aborts the run.
    if ! mysql -e "CREATE DATABASE IF NOT EXISTS ${db_name};"; then
      echo "WARNING: could not create MySQL database ${db_name}" >&2
    fi
    echo "Initializing $C_NAME using MySQL JSON Store"
    if ! dataset init "${C_NAME}" "${conn}"; then
      cat <<EOT
Failed:
dataset init "${C_NAME}" "${conn_redacted}"
EOT
      exit 1
    fi
    if [ ! -d "$C_NAME" ]; then
      echo "Something went wrong creating $C_NAME, aborting"
      exit 1
    fi
    if ! dataset count "$C_NAME"; then
      echo "Something went wrong access $C_NAME, aborting"
      exit 1
    fi
  fi
  echo "Using $C_NAME"
}
#
# do_eprints_export harvests EPrints records into the dataset
# collection C_NAME using eprint2rdm.
#
# Arguments:
#   $1 - "true" to first fetch every EPrint id from EPRINT_HOST into
#        "${REPO_ID}_all_ids.txt" and harvest that list
#   $2 - filename of the key list to harvest (ignored when $1 is "true")
# Globals read: REPO_ID, EPRINT_HOST, C_NAME
# Exits with status 1 when the id listing fails, when no readable key
# list is available, or when the harvest itself fails.
#
function do_eprints_export() {
  local full="$1"
  local key_list="$2"
  local all_ids="${REPO_ID}_all_ids.txt"

  if [ "$full" = "true" ]; then
    echo "eprint2rdm -all-ids $EPRINT_HOST >${all_ids}"
    key_list="$all_ids"
    if eprint2rdm -all-ids "$EPRINT_HOST" >"$all_ids"; then
      # NOTE need a trailing new line for while loops
      echo '' >>"$all_ids"
    else
      echo "eprint2rdm -all-ids ${EPRINT_HOST} failed, aborting" >&2
      exit 1
    fi
  fi
  if [ "$key_list" = "" ] || [ ! -f "$key_list" ]; then
    echo 'Missing key list for export'
    exit 1
  fi
  echo "eprint2rdm respecting resources, people and groups"
  # Abort only when the harvest FAILS; a successful harvest must fall
  # through so that a following import step can still run.
  if ! eprint2rdm -id-list "$key_list" -harvest "${C_NAME}" \
    -resource-map resource_types.csv \
    -contributor-map contributor_types.csv \
    "${EPRINT_HOST}"; then
    echo "eprint2rdm harvest of ${key_list} into ${C_NAME} failed, aborting" >&2
    exit 1
  fi
}
#
# do_rdm_import runs migrate_records.py over a list of EPrint ids held
# in the dataset collection C_NAME.
#
# Arguments:
#   $1 - "true" to use "${REPO_ID}_all_ids.txt" as the key list
#   $2 - filename of the key list (ignored when $1 is "true")
# Globals read: REPO_ID, C_NAME
# When the key list file does not exist it is generated with
# `dataset keys`. An empty key list is reported and skipped.
# Exits with status 1 when no key list name is available, when the keys
# cannot be listed, or when migrate_records.py fails.
#
function do_rdm_import() {
  local full="$1"
  local key_list="$2"
  local key_count

  if [ "$full" = "true" ]; then
    echo 'Setting up to import all ids'
    key_list="${REPO_ID}_all_ids.txt"
    if [ -f "$key_list" ]; then
      echo "WARNING: using existing ids in ${key_list}"
    fi
  fi
  if [ "$key_list" = "" ]; then
    echo 'Missing key list name'
    exit 1
  fi
  # NOTE: need to get path to dataset collection from settings.json
  if [ ! -f "$key_list" ]; then
    echo "Getting all keys in ${C_NAME}, saving in ${key_list}"
    if ! dataset keys "$C_NAME" >"$key_list"; then
      # Do not leave a partial or empty list behind; a later run would
      # find the file and reuse it instead of regenerating the keys.
      rm -f -- "$key_list"
      echo "Failed to get keys from ${C_NAME}, aborting" >&2
      exit 1
    fi
  fi
  # Read via stdin so wc prints only the number, then drop the padding
  # some wc implementations add.
  key_count="$(wc -l <"$key_list")"
  key_count="${key_count//[[:space:]]/}"
  echo "${key_count} keys found in ${key_list}"
  echo ""
  # -s (exists and is non-empty) rather than -f, so that the "appears
  # to be empty" branch is actually reachable.
  if [ -s "$key_list" ]; then
    echo "running migrate_records.py -exit_on_error -eprintids ${key_list} ${C_NAME}"
    if ! python3 migrate_records.py -exit_on_error -eprintids "$key_list" "$C_NAME"; then
      echo "Failed to process ${key_list} using ${C_NAME}"
      exit 1
    fi
  else
    echo "${key_list} appears to be empty for $C_NAME"
  fi
}
#
# retrieve_csv_files downloads people.csv and groups.csv from
# feeds.library.caltech.edu into the current directory. A file that is
# already present is left alone.
#
# Each file is fetched to a temporary name and renamed only when curl
# succeeds, and curl runs with -f so an HTTP error page is never saved
# as the CSV. Without this a failed download would leave a bad file
# that the existence check then skips on every later run.
#
# Returns 0 when both files are present afterwards, 1 when a download
# failed (a warning is written to standard error).
#
function retrieve_csv_files() {
  local name url part
  local status=0

  for name in people groups; do
    if [ -f "${name}.csv" ]; then
      continue
    fi
    url="https://feeds.library.caltech.edu/${name}/${name}.csv"
    part="${name}.csv.$$.part"
    if curl -f -L -o "$part" "$url"; then
      mv -- "$part" "${name}.csv"
    else
      echo "WARNING: failed to download ${url}" >&2
      rm -f -- "$part"
      status=1
    fi
  done
  return "$status"
}
#
# Main processing
#
# Defaults for the run; each flag is switched to "true" by the matching
# command line word, and KEY_LIST takes any word that is not recognised.
KEY_LIST=""
IMPORT_IRDM="false"
EXPORT_EPRINTS="false"
SETUP="false"
FULL="false"
for ARG in "$@"; do
  case "${ARG}" in
    import) IMPORT_IRDM="true" ;;
    export) EXPORT_EPRINTS="true" ;;
    setup) SETUP="true" ;;
    full) FULL="true" ;;
    -h | -help | --help | help)
      # Help short-circuits everything else on the command line.
      usage
      exit 0
      ;;
    *)
      # Anything else is taken to be the key list filename; when
      # several such words are given the last one wins.
      KEY_LIST="${ARG}"
      ;;
  esac
done
# Work out which repository is being migrated; the id names the
# "<REPO_ID>.env" configuration file.
if [ "$REPO_ID" = "" ]; then
  read -r -p 'What is the repository id to migrate? ' REPO_ID
  export REPO_ID
fi
# Missing core settings force the interactive setup below.
if [ "$REPO_ID" = "" ] || [ "$C_NAME" = "" ] || [ "$EPRINT_HOST" = "" ]; then
  SETUP="true"
fi
if [ "${SETUP}" = "true" ] && [ ! -f "${REPO_ID}.env" ]; then
  # APP_NAME is otherwise only assigned inside usage(), which has not
  # run at this point, so derive it here for the messages below.
  APP_NAME="${APP_NAME:-$(basename "$0")}"

  echo 'Setup EPrints REST API access'
  echo '----------------------------------'
  # A blank answer keeps the repository id obtained above instead of
  # producing a configuration file called ".env".
  read -r -p 'What is the repo id (e.g. caltechauthors)? ' answer
  REPO_ID="${answer:-$REPO_ID}"
  if [ "$REPO_ID" = "" ]; then
    echo 'A repository id is required, aborting' >&2
    exit 1
  fi
  read -r -p 'What is the EPrints hostname? ' EPRINT_HOST
  read -r -p 'What is the EPrints username? ' EPRINT_USER
  echo -n 'What is the EPrints password? (will not be shown)'
  read -r -s EPRINT_PASSWORD
  # Silent reads do not echo the user's newline, so emit one.
  echo ''
  echo 'Setup dataset collection to hold exported records'
  echo '--------------------------------------------------'
  read -r -p 'What is the dataset collection name? ' C_NAME
  read -r -p 'What is the DB_USER for SQL Store? ' DB_USER
  echo -n 'What is the DB_PASSWORD for SQL Store? (will not be shown)'
  read -r -s DB_PASSWORD
  echo ''
  echo 'Setup RDM access'
  read -r -p 'What is the RDM_URL? ' RDM_URL
  echo -n 'What is the RDM Access Token? (will not be shown)'
  # A blank answer keeps any RDMTOK already present in the environment.
  answer=""
  read -r -s answer
  echo ''
  RDMTOK="${answer:-$RDMTOK}"
  unset answer

  # Write the configuration. The file holds credentials, so it is
  # created under a restrictive umask rather than tightened afterwards.
  # Values are written with printf %q so that characters such as $, "
  # or backticks in a password survive being sourced by bash.
  (
    umask 077
    {
      printf '%s\n' \
        '#!/bin/sh' \
        '#' \
        "# Setup for ${REPO_ID}" \
        "# This will be sourced from the environment by ${APP_NAME}" \
        '#'
      printf 'REPO_ID=%q\n' "${REPO_ID}"
      printf '%s\n' '# EPrints REST API Access'
      printf 'EPRINT_HOST=%q\n' "${EPRINT_HOST}"
      printf 'EPRINT_USER=%q\n' "${EPRINT_USER}"
      printf 'EPRINT_PASSWORD=%q\n' "${EPRINT_PASSWORD}"
      printf '%s\n' '# Dataset collection setup'
      printf 'C_NAME=%q\n' "${C_NAME}"
      printf 'DB_USER=%q\n' "${DB_USER}"
      printf 'DB_PASSWORD=%q\n' "${DB_PASSWORD}"
      printf '%s\n' '# RDM API Access'
      printf 'RDM_URL=%q\n' "${RDM_URL}"
      printf 'RDMTOK=%q\n' "${RDMTOK}"
      printf 'export %s\n' \
        REPO_ID \
        EPRINT_HOST \
        EPRINT_USER \
        EPRINT_PASSWORD \
        C_NAME \
        DB_USER \
        DB_PASSWORD \
        RDM_URL \
        RDMTOK
    } >"${REPO_ID}.env"
  )
  chmod 600 "${REPO_ID}.env"
  printf '%s\n' \
    "Wrote ${REPO_ID}.env configuration file" \
    'If REPO_ID environment is set then this' \
    "file will control how ${APP_NAME} runs."
fi
# The configuration file for the repository must exist by now; it
# supplies the settings used by every step below.
[ -f "${REPO_ID}.env" ] || {
  echo "Can't find ${REPO_ID}.env, aborting"
  exit 1
}
# shellcheck disable=SC1090
. "${REPO_ID}.env"

# Preparation that happens on every run, whatever was requested.
retrieve_csv_files
setup_dataset_collection

# Stop here unless an export or an import was asked for.
if ! { [ "${EXPORT_EPRINTS}" = "true" ] || [ "${IMPORT_IRDM}" = "true" ]; }; then
  echo "Nothing left to do."
  exit 0
fi

echo "Starting eprint export=${EXPORT_EPRINTS}, rdm import=${IMPORT_IRDM} $(date)"
# Export always runs before import when both were requested.
case "${EXPORT_EPRINTS}" in
  true) do_eprints_export "${FULL}" "${KEY_LIST}" ;;
esac
case "${IMPORT_IRDM}" in
  true) do_rdm_import "${FULL}" "${KEY_LIST}" ;;
esac
echo "Completed eprint export=${EXPORT_EPRINTS}, rdm import=${IMPORT_IRDM} $(date)"