From 2c582ebc1d3c72c7ce1a2b00270492df95065975 Mon Sep 17 00:00:00 2001 From: Jeanette Clark Date: Fri, 13 Sep 2024 23:53:55 +0000 Subject: [PATCH 01/14] fix bug where if no results are found an error is thrown trying to rbind empty results --- R/citation_search.R | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/R/citation_search.R b/R/citation_search.R index ab17f77..aebcec9 100644 --- a/R/citation_search.R +++ b/R/citation_search.R @@ -30,7 +30,13 @@ citation_search <- function(identifiers, }) # Combine the resulting data frames and return the result df - result <- dplyr::bind_rows(result_df_list) + if (all(sapply(result_df_list, function(x) nrow(x) == 0))) { + # If all data frames are empty, create an empty data frame with the same structure + result <- data.frame() + } else { + # Otherwise, bind the rows + result <- dplyr::bind_rows(result_df_list) + } return(result) } From c0aff73444ac0856e352ce5e602d24815c40b370 Mon Sep 17 00:00:00 2001 From: Jeanette Clark Date: Fri, 13 Sep 2024 17:14:56 -0700 Subject: [PATCH 02/14] initial commit of helm stuff Dockerfile and script are working correctly in isolation, neext need to connect to helm templates --- helm/.helmignore | 23 ++++++++ helm/Chart.yaml | 24 ++++++++ helm/Dockerfile | 31 ++++++++++ helm/scripts/search.R | 69 +++++++++++++++++++++++ helm/templates/NOTES.txt | 22 ++++++++ helm/templates/_helpers.tpl | 62 ++++++++++++++++++++ helm/templates/cron-job.yaml | 34 +++++++++++ helm/templates/serviceaccount.yaml | 12 ++++ helm/templates/tests/test-connection.yaml | 15 +++++ helm/values.yaml | 65 +++++++++++++++++++++ 10 files changed, 357 insertions(+) create mode 100644 helm/.helmignore create mode 100644 helm/Chart.yaml create mode 100644 helm/Dockerfile create mode 100644 helm/scripts/search.R create mode 100644 helm/templates/NOTES.txt create mode 100644 helm/templates/_helpers.tpl create mode 100644 helm/templates/cron-job.yaml create mode 100644 helm/templates/serviceaccount.yaml create mode 100644 helm/templates/tests/test-connection.yaml create mode 100644 helm/values.yaml diff --git a/helm/.helmignore b/helm/.helmignore new file mode 100644 index 0000000..0e8a0eb --- /dev/null +++ b/helm/.helmignore @@ -0,0 +1,23 @@ +# Patterns to ignore when building packages. +# This supports shell glob matching, relative path matching, and +# negation (prefixed with !). Only one pattern per line. +.DS_Store +# Common VCS dirs +.git/ +.gitignore +.bzr/ +.bzrignore +.hg/ +.hgignore +.svn/ +# Common backup files +*.swp +*.bak +*.tmp +*.orig +*~ +# Various IDEs +.project +.idea/ +*.tmproj +.vscode/ diff --git a/helm/Chart.yaml b/helm/Chart.yaml new file mode 100644 index 0000000..9318f95 --- /dev/null +++ b/helm/Chart.yaml @@ -0,0 +1,24 @@ +apiVersion: v2 +name: scythe +description: A Helm chart for Kubernetes + +# A chart can be either an 'application' or a 'library' chart. +# +# Application charts are a collection of templates that can be packaged into versioned archives +# to be deployed. +# +# Library charts provide useful utilities or functions for the chart developer. They're included as +# a dependency of application charts to inject those utilities and functions into the rendering +# pipeline. Library charts do not define any templates and therefore cannot be deployed. +type: application + +# This is the chart version. This version number should be incremented each time you make changes +# to the chart and its templates, including the app version. +# Versions are expected to follow Semantic Versioning (https://semver.org/) +version: 0.1.0 + +# This is the version number of the application being deployed. This version number should be +# incremented each time you make changes to the application. Versions are not expected to +# follow Semantic Versioning. They should reflect the version the application is using. +# It is recommended to use it with quotes. +appVersion: "1.0.0" diff --git a/helm/Dockerfile b/helm/Dockerfile new file mode 100644 index 0000000..50042aa --- /dev/null +++ b/helm/Dockerfile @@ -0,0 +1,31 @@ +FROM rocker/r-ver:4 + +RUN groupadd -r scythe && useradd -r -g scythe scythe +RUN mkdir -p /var/data/scythe && chown scythe.scythe /var/data/scythe + +RUN apt-get update && apt-get install -y \ + librdf-dev \ + libxml2-dev \ + libfontconfig1-dev \ + libssl-dev \ + libcurl4-gnutls-dev \ + libsodium-dev \ + libfribidi-dev \ + libgit2-dev \ + libharfbuzz-dev \ + libfreetype6-dev \ + libpng-dev \ + libtiff5-dev \ + libjpeg-dev \ + && rm -rf /var/lib/apt/lists/* + +RUN Rscript -e "install.packages(c('devtools', 'dataone', 'tidyr', 'redland', 'jsonlite', 'lubridate'))" +RUN Rscript -e "devtools::install_github('dataoneorg/scythe@v1.0.0', dependencies = TRUE)" + +USER scythe:scythe + +WORKDIR /var/data/scythe + +COPY ./scripts/search.R . + +CMD ["Rscript", "--vanilla", "search.R", "urn:node:ARCTIC"] \ No newline at end of file diff --git a/helm/scripts/search.R b/helm/scripts/search.R new file mode 100644 index 0000000..b42b2f1 --- /dev/null +++ b/helm/scripts/search.R @@ -0,0 +1,69 @@ +# search.R +suppressPackageStartupMessages(library(dplyr)) +library(tidyr) +library(scythe) +library(dataone) +library(jsonlite) +suppressPackageStartupMessages(library(lubridate)) + +sources <- c("plos", "xdd", "scopus", "springer") +nodes <- commandArgs(trailingOnly = TRUE) + +get_node_dois <- function(node_id) { + mn <- getMNode(CNode("PROD"), node_id) + queryParamList <- list(q="id:doi*", + fl="id", + start ="0", + rows = "10") + result <- query(mn, solrQuery=queryParamList, as="data.frame", parse=FALSE) + return(result$id) +} + +get_metrics_citations <- function(from = as.POSIXct("2000-01-01"), to = as.POSIXct(Sys.Date())){ + + from <- as.Date(from); to <- as.Date(to) + from_q <- paste(stringr::str_pad(month(from), 2, side = "left", pad = "0"), + stringr::str_pad(day(from), 2, side = "left", pad = "0"), + stringr::str_pad(year(from), 2, side = "left", pad = "0"), + sep = "/") + + to_q <- paste(stringr::str_pad(month(to), 2, side = "left", pad = "0"), + stringr::str_pad(day(to), 2, side = "left", pad = "0"), + stringr::str_pad(year(to), 2, side = "left", pad = "0"), + sep = "/") + + d <- fromJSON(paste0('https://logproc-stage-ucsb-1.test.dataone.org/metrics?q={%22metricsPage%22:{%22total%22:0,%22start%22:0,%22count%22:0},%22metrics%22:[%22citations%22],%22filterBy%22:[{%22filterType%22:%22repository%22,%22values%22:[%22urn:node:ARCTIC%22],%22interpretAs%22:%22list%22},{%22filterType%22:%22month%22,%22values%22:[%22', from_q,'%22,%22', to_q, '%22],%22interpretAs%22:%22range%22}],%22groupBy%22:[%22month%22]}')) + + output_json <- d$resultDetails$citations # pulls citation info + output_df <- as.data.frame(do.call(rbind, output_json), row.names = FALSE) # binds nested cit info into dataframe + + output_df <- output_df %>% + unnest_longer(target_id) %>% + unnest_longer(source_id) + + return(output_df) +} + +dois <- c() +for (node in nodes){ + node_dois <- get_node_dois(node) + dois <- c(dois, node_dois) +} + +# set up file to write to +today <- format(Sys.Date(), "%Y%m%d") +fp <- paste0("scythe-citations-", today, ".json") + +found_citations <- citation_search(dois, sources) + +if (is.null(found_citations) || nrow(found_citations) == 0){ + writeLines("No citations found.", fp) +} else { + existing_citations <- get_metrics_citations() + new_citations <- anti_join(found_citations, existing_citations, by = c("dataset_id" = "target_id", "article_id" = "source_id")) + if (nrow(new_citations) > 0) { + write_citation_pairs(new_citations, fp) + } else { + writeLines("No citations found.", fp) + } +} \ No newline at end of file diff --git a/helm/templates/NOTES.txt b/helm/templates/NOTES.txt new file mode 100644 index 0000000..a5c5c5e --- /dev/null +++ b/helm/templates/NOTES.txt @@ -0,0 +1,22 @@ +1. Get the application URL by running these commands: +{{- if .Values.ingress.enabled }} +{{- range $host := .Values.ingress.hosts }} + {{- range .paths }} + http{{ if $.Values.ingress.tls }}s{{ end }}://{{ $host.host }}{{ .path }} + {{- end }} +{{- end }} +{{- else if contains "NodePort" .Values.service.type }} + export NODE_PORT=$(kubectl get --namespace {{ .Release.Namespace }} -o jsonpath="{.spec.ports[0].nodePort}" services {{ include "scythe.fullname" . }}) + export NODE_IP=$(kubectl get nodes --namespace {{ .Release.Namespace }} -o jsonpath="{.items[0].status.addresses[0].address}") + echo http://$NODE_IP:$NODE_PORT +{{- else if contains "LoadBalancer" .Values.service.type }} + NOTE: It may take a few minutes for the LoadBalancer IP to be available. + You can watch the status of by running 'kubectl get --namespace {{ .Release.Namespace }} svc -w {{ include "scythe.fullname" . }}' + export SERVICE_IP=$(kubectl get svc --namespace {{ .Release.Namespace }} {{ include "scythe.fullname" . }} --template "{{"{{ range (index .status.loadBalancer.ingress 0) }}{{.}}{{ end }}"}}") + echo http://$SERVICE_IP:{{ .Values.service.port }} +{{- else if contains "ClusterIP" .Values.service.type }} + export POD_NAME=$(kubectl get pods --namespace {{ .Release.Namespace }} -l "app.kubernetes.io/name={{ include "scythe.name" . }},app.kubernetes.io/instance={{ .Release.Name }}" -o jsonpath="{.items[0].metadata.name}") + export CONTAINER_PORT=$(kubectl get pod --namespace {{ .Release.Namespace }} $POD_NAME -o jsonpath="{.spec.containers[0].ports[0].containerPort}") + echo "Visit http://127.0.0.1:8080 to use your application" + kubectl --namespace {{ .Release.Namespace }} port-forward $POD_NAME 8080:$CONTAINER_PORT +{{- end }} diff --git a/helm/templates/_helpers.tpl b/helm/templates/_helpers.tpl new file mode 100644 index 0000000..161c0d7 --- /dev/null +++ b/helm/templates/_helpers.tpl @@ -0,0 +1,62 @@ +{{/* +Expand the name of the chart. +*/}} +{{- define "scythe.name" -}} +{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Create a default fully qualified app name. +We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). +If release name contains chart name it will be used as a full name. +*/}} +{{- define "scythe.fullname" -}} +{{- if .Values.fullnameOverride }} +{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- $name := default .Chart.Name .Values.nameOverride }} +{{- if contains $name .Release.Name }} +{{- .Release.Name | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }} +{{- end }} +{{- end }} +{{- end }} + +{{/* +Create chart name and version as used by the chart label. +*/}} +{{- define "scythe.chart" -}} +{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* +Common labels +*/}} +{{- define "scythe.labels" -}} +helm.sh/chart: {{ include "scythe.chart" . }} +{{ include "scythe.selectorLabels" . }} +{{- if .Chart.AppVersion }} +app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} +{{- end }} +app.kubernetes.io/managed-by: {{ .Release.Service }} +{{- end }} + +{{/* +Selector labels +*/}} +{{- define "scythe.selectorLabels" -}} +app.kubernetes.io/name: {{ include "scythe.name" . }} +app.kubernetes.io/instance: {{ .Release.Name }} +{{- end }} + +{{/* +Create the name of the service account to use +*/}} +{{- define "scythe.serviceAccountName" -}} +{{- if .Values.serviceAccount.create }} +{{- default (include "scythe.fullname" .) .Values.serviceAccount.name }} +{{- else }} +{{- default "default" .Values.serviceAccount.name }} +{{- end }} +{{- end }} diff --git a/helm/templates/cron-job.yaml b/helm/templates/cron-job.yaml new file mode 100644 index 0000000..2c7ab24 --- /dev/null +++ b/helm/templates/cron-job.yaml @@ -0,0 +1,34 @@ +apiVersion: batch/v1beta1 +kind: CronJob +metadata: + name: {{ .Values.cronjob.name }} +spec: + schedule: {{ .Values.cronjob.schedule | quote }} + successfulJobsHistoryLimit: 5 + failedJobsHistoryLimit: 5 + jobTemplate: + spec: + template: + spec: + containers: + - name: {{ .Chart.Name }} + image: {{ .Values.image.repository }}:{{ .Values.image.tag }} + imagePullPolicy: {{ .Values.image.pullPolicy }} + env: + - name: ENV + value: {{ .Values.env }} + command: + - /bin/sh + - -c + - | + {{ .Values.cronjob.command }} + volumeMounts: + - name: {{ .Values.persistence.claimName }} + mountPath: {{ .Values.persistence.mountPath }} + readOnly: true + volumes: + - name: {{ .Values.persistence.claimName }} + persistentVolumeClaim: + claimName: {{ .Values.persistence.claimName }} + readOnly: false + restartPolicy: OnFailure \ No newline at end of file diff --git a/helm/templates/serviceaccount.yaml b/helm/templates/serviceaccount.yaml new file mode 100644 index 0000000..6d1c954 --- /dev/null +++ b/helm/templates/serviceaccount.yaml @@ -0,0 +1,12 @@ +{{- if .Values.serviceAccount.create -}} +apiVersion: v1 +kind: ServiceAccount +metadata: + name: {{ include "scythe.serviceAccountName" . }} + labels: + {{- include "scythe.labels" . | nindent 4 }} + {{- with .Values.serviceAccount.annotations }} + annotations: + {{- toYaml . | nindent 4 }} + {{- end }} +{{- end }} diff --git a/helm/templates/tests/test-connection.yaml b/helm/templates/tests/test-connection.yaml new file mode 100644 index 0000000..39e17ac --- /dev/null +++ b/helm/templates/tests/test-connection.yaml @@ -0,0 +1,15 @@ +apiVersion: v1 +kind: Pod +metadata: + name: "{{ include "scythe.fullname" . }}-test-connection" + labels: + {{- include "scythe.labels" . | nindent 4 }} + annotations: + "helm.sh/hook": test +spec: + containers: + - name: wget + image: busybox + command: ['wget'] + args: ['{{ include "scythe.fullname" . }}:{{ .Values.service.port }}'] + restartPolicy: Never diff --git a/helm/values.yaml b/helm/values.yaml new file mode 100644 index 0000000..86de802 --- /dev/null +++ b/helm/values.yaml @@ -0,0 +1,65 @@ +# Default values for scythe. +# This is a YAML-formatted file. +# Declare variables to be passed into your templates. + +replicaCount: 1 + +image: + repository: nginx + pullPolicy: IfNotPresent + # Overrides the image tag whose default is the chart appVersion. + tag: "" + +imagePullSecrets: [] +nameOverride: "" +fullnameOverride: "" + +persistence: + enabled: true + claimName: scythe-results + mountPath: /var/data/scythe + +cronjob: + name: scythe + schedule: "*/30 * * * *" + command: Rscript -e get_citations.R + +serviceAccount: + # Specifies whether a service account should be created + create: false + # Annotations to add to the service account + annotations: {} + # The name of the service account to use. + # If not set and create is true, a name is generated using the fullname template + name: "" + +podAnnotations: {} + +podSecurityContext: {} + # fsGroup: 2000 + +securityContext: {} + # capabilities: + # drop: + # - ALL + # readOnlyRootFilesystem: true + # runAsNonRoot: true + # runAsUser: 1000 + +resources: {} + # We usually recommend not to specify default resources and to leave this as a conscious + # choice for the user. This also increases chances charts run on environments with little + # resources, such as Minikube. If you do want to specify resources, uncomment the following + # lines, adjust them as necessary, and remove the curly braces after 'resources:'. + # limits: + # cpu: 100m + # memory: 128Mi + # requests: + # cpu: 100m + # memory: 128Mi + +nodeSelector: {} + +tolerations: [] + +affinity: {} From 5f15347354b7b36483e8bebe09375e6640f3be5c Mon Sep 17 00:00:00 2001 From: Jeanette Clark Date: Tue, 17 Sep 2024 16:51:36 +0000 Subject: [PATCH 03/14] return empty data frame if no token is set this ensures consistent return behavior amongst various scenarious --- R/citation_search_scopus.R | 18 ++++++++++-------- R/citation_search_springer.R | 10 +++++++++- 2 files changed, 19 insertions(+), 9 deletions(-) diff --git a/R/citation_search_scopus.R b/R/citation_search_scopus.R index ae7036a..af6951a 100644 --- a/R/citation_search_scopus.R +++ b/R/citation_search_scopus.R @@ -19,11 +19,20 @@ citation_search_scopus <- function(identifiers) { report_est_wait(length(identifiers), wait_seconds) key <- scythe_get_key("scopus") + + # initialize df for storing results in orderly fashion + scopus_results <- data.frame( + article_id = character(), + article_title = character(), + dataset_id = character(), + source = character() + ) + if (is.na(key)) { warning( "Skipping Scopus search due to missing API key. Set an API key using scythe_set_key() to include Scopus results." ) - return() + return(scopus_results) } identifiers_enc <- utils::URLencode(identifiers, reserved = TRUE) @@ -40,13 +49,6 @@ citation_search_scopus <- function(identifiers) { )) } - # initialize df for storing results in orderly fashion - scopus_results <- data.frame( - article_id = character(), - article_title = character(), - dataset_id = character(), - source = character() - ) # extract relevant information from raw results for (i in 1:length(results)) { diff --git a/R/citation_search_springer.R b/R/citation_search_springer.R index 9fefad7..dfaf27c 100644 --- a/R/citation_search_springer.R +++ b/R/citation_search_springer.R @@ -22,13 +22,21 @@ citation_search_springer <- function(identifiers) { report_est_wait(length(identifiers), wait_seconds) identifiers <- check_identifiers(identifiers) + + # initialize df for storing results in orderly fashion + springer_results <- data.frame( + article_id = character(), + article_title = character(), + dataset_id = character(), + source = character() + ) key <- scythe_get_key("springer") if (is.na(key)) { warning( "Skipping Springer search due to missing API key. Set an API key using scythe_set_key() to include Springer results." ) - return() + return(springer_results) } identifiers_enc <- utils::URLencode(identifiers, reserved = TRUE) From 6ea399756c6a3ce5cfda74c604f47535616def24 Mon Sep 17 00:00:00 2001 From: Jeanette Clark Date: Tue, 17 Sep 2024 13:37:38 -0700 Subject: [PATCH 04/14] update scythe dependency, change script location --- helm/Dockerfile | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/helm/Dockerfile b/helm/Dockerfile index 50042aa..a792f3c 100644 --- a/helm/Dockerfile +++ b/helm/Dockerfile @@ -2,6 +2,8 @@ FROM rocker/r-ver:4 RUN groupadd -r scythe && useradd -r -g scythe scythe RUN mkdir -p /var/data/scythe && chown scythe.scythe /var/data/scythe +RUN mkdir -p /apps/scythe && chown scythe.scythe /apps/scythe + RUN apt-get update && apt-get install -y \ librdf-dev \ @@ -20,12 +22,10 @@ RUN apt-get update && apt-get install -y \ && rm -rf /var/lib/apt/lists/* RUN Rscript -e "install.packages(c('devtools', 'dataone', 'tidyr', 'redland', 'jsonlite', 'lubridate'))" -RUN Rscript -e "devtools::install_github('dataoneorg/scythe@v1.0.0', dependencies = TRUE)" +RUN Rscript -e "devtools::install_github('dataoneorg/scythe@89d52978', dependencies = TRUE)" USER scythe:scythe WORKDIR /var/data/scythe -COPY ./scripts/search.R . - -CMD ["Rscript", "--vanilla", "search.R", "urn:node:ARCTIC"] \ No newline at end of file +COPY ./scripts/search.R /apps/scythe/ \ No newline at end of file From 48fcdda24770d453de760a3c31e2e54476cb5b46 Mon Sep 17 00:00:00 2001 From: Jeanette Clark Date: Tue, 17 Sep 2024 13:38:22 -0700 Subject: [PATCH 05/14] update helm config to pull correct image and run cronjob --- helm/templates/NOTES.txt | 14 -------------- helm/templates/cron-job.yaml | 19 +++++++++++++------ helm/templates/tests/test-connection.yaml | 2 +- helm/values.yaml | 16 +++++++++++----- 4 files changed, 25 insertions(+), 26 deletions(-) diff --git a/helm/templates/NOTES.txt b/helm/templates/NOTES.txt index a5c5c5e..0dc54d5 100644 --- a/helm/templates/NOTES.txt +++ b/helm/templates/NOTES.txt @@ -5,18 +5,4 @@ http{{ if $.Values.ingress.tls }}s{{ end }}://{{ $host.host }}{{ .path }} {{- end }} {{- end }} -{{- else if contains "NodePort" .Values.service.type }} - export NODE_PORT=$(kubectl get --namespace {{ .Release.Namespace }} -o jsonpath="{.spec.ports[0].nodePort}" services {{ include "scythe.fullname" . }}) - export NODE_IP=$(kubectl get nodes --namespace {{ .Release.Namespace }} -o jsonpath="{.items[0].status.addresses[0].address}") - echo http://$NODE_IP:$NODE_PORT -{{- else if contains "LoadBalancer" .Values.service.type }} - NOTE: It may take a few minutes for the LoadBalancer IP to be available. - You can watch the status of by running 'kubectl get --namespace {{ .Release.Namespace }} svc -w {{ include "scythe.fullname" . }}' - export SERVICE_IP=$(kubectl get svc --namespace {{ .Release.Namespace }} {{ include "scythe.fullname" . }} --template "{{"{{ range (index .status.loadBalancer.ingress 0) }}{{.}}{{ end }}"}}") - echo http://$SERVICE_IP:{{ .Values.service.port }} -{{- else if contains "ClusterIP" .Values.service.type }} - export POD_NAME=$(kubectl get pods --namespace {{ .Release.Namespace }} -l "app.kubernetes.io/name={{ include "scythe.name" . }},app.kubernetes.io/instance={{ .Release.Name }}" -o jsonpath="{.items[0].metadata.name}") - export CONTAINER_PORT=$(kubectl get pod --namespace {{ .Release.Namespace }} $POD_NAME -o jsonpath="{.spec.containers[0].ports[0].containerPort}") - echo "Visit http://127.0.0.1:8080 to use your application" - kubectl --namespace {{ .Release.Namespace }} port-forward $POD_NAME 8080:$CONTAINER_PORT {{- end }} diff --git a/helm/templates/cron-job.yaml b/helm/templates/cron-job.yaml index 2c7ab24..54174a0 100644 --- a/helm/templates/cron-job.yaml +++ b/helm/templates/cron-job.yaml @@ -1,4 +1,4 @@ -apiVersion: batch/v1beta1 +apiVersion: batch/v1 kind: CronJob metadata: name: {{ .Values.cronjob.name }} @@ -15,17 +15,24 @@ spec: image: {{ .Values.image.repository }}:{{ .Values.image.tag }} imagePullPolicy: {{ .Values.image.pullPolicy }} env: - - name: ENV - value: {{ .Values.env }} + - name: springer + valueFrom: + secretKeyRef: + name: api-keys + key: springer + - name: scopus + valueFrom: + secretKeyRef: + name: api-keys + key: scopus command: - /bin/sh - -c - - | - {{ .Values.cronjob.command }} + - {{ .Values.cronjob.command }} {{- range .Values.cronjob.nodes }} {{ . }} {{ end }} volumeMounts: - name: {{ .Values.persistence.claimName }} mountPath: {{ .Values.persistence.mountPath }} - readOnly: true + readOnly: false volumes: - name: {{ .Values.persistence.claimName }} persistentVolumeClaim: diff --git a/helm/templates/tests/test-connection.yaml b/helm/templates/tests/test-connection.yaml index 39e17ac..41e0f83 100644 --- a/helm/templates/tests/test-connection.yaml +++ b/helm/templates/tests/test-connection.yaml @@ -11,5 +11,5 @@ spec: - name: wget image: busybox command: ['wget'] - args: ['{{ include "scythe.fullname" . }}:{{ .Values.service.port }}'] + args: ['{{ include "scythe.fullname" . }}:8080'] restartPolicy: Never diff --git a/helm/values.yaml b/helm/values.yaml index 86de802..81308f5 100644 --- a/helm/values.yaml +++ b/helm/values.yaml @@ -5,10 +5,10 @@ replicaCount: 1 image: - repository: nginx - pullPolicy: IfNotPresent + repository: ghcr.io/dataoneorg/scythe + pullPolicy: Always # Overrides the image tag whose default is the chart appVersion. - tag: "" + tag: dev imagePullSecrets: [] nameOverride: "" @@ -21,8 +21,11 @@ persistence: cronjob: name: scythe - schedule: "*/30 * * * *" - command: Rscript -e get_citations.R + schedule: "* * 1 * *" + command: Rscript --vanilla /apps/scythe/search.R + nodes: + - urn:node:ARCTIC + - urn:node:ESS_DIVE serviceAccount: # Specifies whether a service account should be created @@ -33,6 +36,9 @@ serviceAccount: # If not set and create is true, a name is generated using the fullname template name: "" +ingress: + enabled: false + podAnnotations: {} podSecurityContext: {} From 578a76aad2bde980916b401bb66d841559f995a6 Mon Sep 17 00:00:00 2001 From: Jeanette Clark Date: Tue, 17 Sep 2024 13:38:39 -0700 Subject: [PATCH 06/14] add logging output and search sids for DOIs too --- helm/scripts/search.R | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/helm/scripts/search.R b/helm/scripts/search.R index b42b2f1..2c08ae6 100644 --- a/helm/scripts/search.R +++ b/helm/scripts/search.R @@ -11,12 +11,14 @@ nodes <- commandArgs(trailingOnly = TRUE) get_node_dois <- function(node_id) { mn <- getMNode(CNode("PROD"), node_id) - queryParamList <- list(q="id:doi*", - fl="id", + queryParamList <- list(q="id:doi* OR seriesId:doi*", + fl="id, seriesId", start ="0", rows = "10") result <- query(mn, solrQuery=queryParamList, as="data.frame", parse=FALSE) - return(result$id) + pids <- c(result$id, result$seriesId) + dois <- grep("doi:", pids, value = TRUE) + return(dois) } get_metrics_citations <- function(from = as.POSIXct("2000-01-01"), to = as.POSIXct(Sys.Date())){ @@ -46,15 +48,18 @@ get_metrics_citations <- function(from = as.POSIXct("2000-01-01"), to = as.POSIX dois <- c() for (node in nodes){ + message(paste("Gathering DOIs for: ", node)) node_dois <- get_node_dois(node) dois <- c(dois, node_dois) -} +} +dois_unique <- unique(dois) # set up file to write to today <- format(Sys.Date(), "%Y%m%d") -fp <- paste0("scythe-citations-", today, ".json") +fp <- paste0("scythe-citations-", today, ".csv") -found_citations <- citation_search(dois, sources) +message("Beginning citations search.") +found_citations <- citation_search(dois_unique, sources) if (is.null(found_citations) || nrow(found_citations) == 0){ writeLines("No citations found.", fp) @@ -62,8 +67,8 @@ if (is.null(found_citations) || nrow(found_citations) == 0){ existing_citations <- get_metrics_citations() new_citations <- anti_join(found_citations, existing_citations, by = c("dataset_id" = "target_id", "article_id" = "source_id")) if (nrow(new_citations) > 0) { - write_citation_pairs(new_citations, fp) + write.csv(new_citations, fp, row_names = FALSE) } else { - writeLines("No citations found.", fp) + writeLines("No new citations found.", fp) } } \ No newline at end of file From bbafe770190a56bc228e486fda893d63ae09e58e Mon Sep 17 00:00:00 2001 From: Jeanette Clark Date: Tue, 17 Sep 2024 13:38:45 -0700 Subject: [PATCH 07/14] add a readme --- helm/README.md | 51 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) create mode 100644 helm/README.md diff --git a/helm/README.md b/helm/README.md new file mode 100644 index 0000000..eba7392 --- /dev/null +++ b/helm/README.md @@ -0,0 +1,51 @@ +# Scythe Helm Chart + +This Helm chart deploys a CronJob that does a citation search on a set of DataONE member nodes using `scythe`. + +## Search Script and Container + +`scripts/search.R` is copied into the Dockerfile and run in the CronJob. It takes the node identifiers +listed in the `values.yaml` file as input. DOIs (either identifiers or series identifiers) are retrieved from each node, +then passed through `scythe::citation_search`, which searches for citations in PLOS, Springer, Scopus, and xDD. Citations +already in the metrics service are removed, and the citations are written to a csv. This table can be passed to `scythe::write_citation_pairs` to create the JSON file needed for ingest into the metrics system. + +## CronJob + +In `values.yaml`, key fields to configure are: + +- **`cronjob.schedule`**: Schedule for the CronJob (in cron format). +- **`cronjob.nodes`**: A set of node identifiers to be passed to the R script. + +## API Keys + +For instructions on obtaining an API key, see README.md at the package level. + +Keys are made accessible to the deployment using Kubernetes secrets. To set API keys, run: + +``` +kubectl create secret generic -n scythe api-keys \ + --from-literal=springer={key} \ + --from-literal=scopus={key} +``` + +## Persistent Storage + +This Helm chart uses a dynamic PVC using CephFS to save results from the `scythe` run. An example configuration file is shown below. +For more information on CephFS on the cluster see [k8s-cluster docs](https://github.com/DataONEorg/k8s-cluster/blob/main/storage/Ceph/Ceph-CSI-CephFS.md#provisioning-dynamic-cephfs-volumes). + +``` +--- +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: scythe-results +spec: + accessModes: + - ReadWriteMany + resources: + requests: + storage: 10Gi + storageClassName: csi-cephfs-sc +``` + +To create the PVC, run `kubectl apply -f pvc.yaml -n scythe`. This should only be done once. \ No newline at end of file From 066ab39bfcb5efe8991617fbfe049ee4f4a56d89 Mon Sep 17 00:00:00 2001 From: Jeanette Clark Date: Tue, 17 Sep 2024 14:48:51 -0700 Subject: [PATCH 08/14] update grant --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 5ab7f9a..0a985f4 100644 --- a/README.md +++ b/README.md @@ -66,7 +66,7 @@ keyring::key_get("springer", keyring = "scythe") ## Acknowledgments Work on this package was supported by: -- NSF-PLR grant #1546024 to M. B. Jones, S. Baker-Yeboah, J. Dozier, M. Schildhauer, and A. Budden +- NSF-PLR grant #2042102 to M. B. Jones, A. Budden, M. Schildhauer, and J. Dozier [![nceas_footer](https://live-ncea-ucsb-edu-v01.pantheonsite.io/sites/default/files/2020-03/NCEAS-full%20logo-4C.png)](https://www.nceas.ucsb.edu) From 2c68825d93249c324f7273a8edfca1af308d080f Mon Sep 17 00:00:00 2001 From: Jeanette Clark Date: Tue, 17 Sep 2024 16:16:42 -0700 Subject: [PATCH 09/14] update chart to accept named arguments in cmd line invocation of R script --- helm/Chart.yaml | 2 +- helm/Dockerfile | 4 +- helm/README.md | 3 +- helm/scripts/search.R | 87 ++++++++++++++++++++---------------- helm/templates/cron-job.yaml | 6 ++- helm/values.yaml | 8 ++-- 6 files changed, 63 insertions(+), 47 deletions(-) diff --git a/helm/Chart.yaml b/helm/Chart.yaml index 9318f95..56e9554 100644 --- a/helm/Chart.yaml +++ b/helm/Chart.yaml @@ -21,4 +21,4 @@ version: 0.1.0 # incremented each time you make changes to the application. Versions are not expected to # follow Semantic Versioning. They should reflect the version the application is using. # It is recommended to use it with quotes. -appVersion: "1.0.0" +appVersion: "1.1.0" diff --git a/helm/Dockerfile b/helm/Dockerfile index a792f3c..7419b97 100644 --- a/helm/Dockerfile +++ b/helm/Dockerfile @@ -21,8 +21,8 @@ RUN apt-get update && apt-get install -y \ libjpeg-dev \ && rm -rf /var/lib/apt/lists/* -RUN Rscript -e "install.packages(c('devtools', 'dataone', 'tidyr', 'redland', 'jsonlite', 'lubridate'))" -RUN Rscript -e "devtools::install_github('dataoneorg/scythe@89d52978', dependencies = TRUE)" +RUN Rscript -e "install.packages(c('remotes', 'dataone', 'tidyr', 'redland', 'jsonlite', 'lubridate', 'optparse'))" +RUN Rscript -e "remotes::install_github('dataoneorg/scythe@89d52978')" USER scythe:scythe diff --git a/helm/README.md b/helm/README.md index eba7392..88a5433 100644 --- a/helm/README.md +++ b/helm/README.md @@ -14,7 +14,8 @@ already in the metrics service are removed, and the citations are written to a c In `values.yaml`, key fields to configure are: - **`cronjob.schedule`**: Schedule for the CronJob (in cron format). -- **`cronjob.nodes`**: A set of node identifiers to be passed to the R script. +- **`cronjob.nodes`**: A set of node identifiers to be passed to the R script, as a comma separated list +- **`cronjob.rows`**: Optional number of rows to return per node when getting DOIs. Leave empty to return all identifiers ## API Keys diff --git a/helm/scripts/search.R b/helm/scripts/search.R index 2c08ae6..0daace3 100644 --- a/helm/scripts/search.R +++ b/helm/scripts/search.R @@ -1,74 +1,85 @@ -# search.R +#!/usr/bin/env Rscript suppressPackageStartupMessages(library(dplyr)) library(tidyr) library(scythe) library(dataone) library(jsonlite) suppressPackageStartupMessages(library(lubridate)) +library(optparse) -sources <- c("plos", "xdd", "scopus", "springer") -nodes <- commandArgs(trailingOnly = TRUE) +main <- function(){ + + option_list <- list( + make_option(c("-r", "--rows"), type="integer", default=100000, + help="Number of rows to return from query [default %default]"), + make_option(c("-n", "--nodes"), type="character", help="Comma separated list of nodes to query") + ) + + # parse command-line arguments + parser <- OptionParser(option_list=option_list) + opts <- parse_args(parser) + + num_rows <- opts$rows + nodes <- strsplit(opts$nodes, ",", fixed = TRUE)[[1]] + + sources <- c("plos", "xdd", "scopus", "springer") + + dois <- c() + for (node in nodes){ + message(paste("Gathering DOIs for: ", node)) + node_dois <- get_node_dois(node, num_rows) + dois <- c(dois, node_dois) + } + dois_unique <- unique(dois) + + # set up file to write to + today <- format(Sys.Date(), "%Y%m%d") + fp <- paste0("scythe-citations-", today, ".csv") + + message("Beginning citations search.") + found_citations <- citation_search(dois_unique, sources) + + if (is.null(found_citations) || nrow(found_citations) == 0){ + writeLines("No citations found.", fp) + } else { + existing_citations <- get_metrics_citations() + new_citations <- anti_join(found_citations, existing_citations, by = c("dataset_id" = "target_id", "article_id" = "source_id")) + if (nrow(new_citations) > 0) { + write.csv(new_citations, fp, row_names = FALSE) + } else { + writeLines("No new citations found.", fp) + } + } +} -get_node_dois <- function(node_id) { +get_node_dois <- function(node_id, num_rows) { mn <- getMNode(CNode("PROD"), node_id) queryParamList <- list(q="id:doi* OR seriesId:doi*", fl="id, seriesId", start ="0", - rows = "10") + rows = num_rows) result <- query(mn, solrQuery=queryParamList, as="data.frame", parse=FALSE) pids <- c(result$id, result$seriesId) dois <- grep("doi:", pids, value = TRUE) return(dois) } - get_metrics_citations <- function(from = as.POSIXct("2000-01-01"), to = as.POSIXct(Sys.Date())){ - from <- as.Date(from); to <- as.Date(to) from_q <- paste(stringr::str_pad(month(from), 2, side = "left", pad = "0"), stringr::str_pad(day(from), 2, side = "left", pad = "0"), stringr::str_pad(year(from), 2, side = "left", pad = "0"), sep = "/") - to_q <- paste(stringr::str_pad(month(to), 2, side = "left", pad = "0"), stringr::str_pad(day(to), 2, side = "left", pad = "0"), stringr::str_pad(year(to), 2, side = "left", pad = "0"), sep = "/") - d <- fromJSON(paste0('https://logproc-stage-ucsb-1.test.dataone.org/metrics?q={%22metricsPage%22:{%22total%22:0,%22start%22:0,%22count%22:0},%22metrics%22:[%22citations%22],%22filterBy%22:[{%22filterType%22:%22repository%22,%22values%22:[%22urn:node:ARCTIC%22],%22interpretAs%22:%22list%22},{%22filterType%22:%22month%22,%22values%22:[%22', from_q,'%22,%22', to_q, '%22],%22interpretAs%22:%22range%22}],%22groupBy%22:[%22month%22]}')) - output_json <- d$resultDetails$citations # pulls citation info output_df <- as.data.frame(do.call(rbind, output_json), row.names = FALSE) # binds nested cit info into dataframe - output_df <- output_df %>% unnest_longer(target_id) %>% unnest_longer(source_id) - return(output_df) } -dois <- c() -for (node in nodes){ - message(paste("Gathering DOIs for: ", node)) - node_dois <- get_node_dois(node) - dois <- c(dois, node_dois) -} -dois_unique <- unique(dois) - -# set up file to write to -today <- format(Sys.Date(), "%Y%m%d") -fp <- paste0("scythe-citations-", today, ".csv") - -message("Beginning citations search.") -found_citations <- citation_search(dois_unique, sources) - -if (is.null(found_citations) || nrow(found_citations) == 0){ - writeLines("No citations found.", fp) -} else { - existing_citations <- get_metrics_citations() - new_citations <- anti_join(found_citations, existing_citations, by = c("dataset_id" = "target_id", "article_id" = "source_id")) - if (nrow(new_citations) > 0) { - write.csv(new_citations, fp, row_names = FALSE) - } else { - writeLines("No new citations found.", fp) - } -} \ No newline at end of file +main() diff --git a/helm/templates/cron-job.yaml b/helm/templates/cron-job.yaml index 54174a0..79de389 100644 --- a/helm/templates/cron-job.yaml +++ b/helm/templates/cron-job.yaml @@ -28,7 +28,11 @@ spec: command: - /bin/sh - -c - - {{ .Values.cronjob.command }} {{- range .Values.cronjob.nodes }} {{ . }} {{ end }} + - {{- if .Values.cronjob.rows }} + {{ .Values.cronjob.command }} -r {{ .Values.cronjob.rows }} -n {{ .Values.cronjob.nodes }} + {{- else }} + {{ .Values.cronjob.command }} -n {{ .Values.cronjob.nodes }} + {{- end }} volumeMounts: - name: {{ .Values.persistence.claimName }} mountPath: {{ .Values.persistence.mountPath }} diff --git a/helm/values.yaml b/helm/values.yaml index 81308f5..033b68a 100644 --- a/helm/values.yaml +++ b/helm/values.yaml @@ -21,11 +21,11 @@ persistence: cronjob: name: scythe - schedule: "* * 1 * *" + schedule: "0 12 1 * *" command: Rscript --vanilla /apps/scythe/search.R - nodes: - - urn:node:ARCTIC - - urn:node:ESS_DIVE + nodes: 'urn:node:ARCTIC,urn:node:ESS_DIVE' + # leave blank if returning all rows + rows: serviceAccount: # Specifies whether a service account should be created From 704d52ec708827a0a58cbaa0f1dca6b4e2b7d806 Mon Sep 17 00:00:00 2001 From: Jeanette Clark Date: Tue, 17 Sep 2024 16:23:41 -0700 Subject: [PATCH 10/14] update scythe version in dockerfile --- helm/Dockerfile | 2 +- helm/values.yaml | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/helm/Dockerfile b/helm/Dockerfile index 7419b97..3c9b11a 100644 --- a/helm/Dockerfile +++ b/helm/Dockerfile @@ -22,7 +22,7 @@ RUN apt-get update && apt-get install -y \ && rm -rf /var/lib/apt/lists/* RUN Rscript -e "install.packages(c('remotes', 'dataone', 'tidyr', 'redland', 'jsonlite', 'lubridate', 'optparse'))" -RUN Rscript -e "remotes::install_github('dataoneorg/scythe@89d52978')" +RUN Rscript -e "remotes::install_github('dataoneorg/scythe@v1.1.0')" USER scythe:scythe diff --git a/helm/values.yaml b/helm/values.yaml index 033b68a..b0de420 100644 --- a/helm/values.yaml +++ b/helm/values.yaml @@ -7,8 +7,6 @@ replicaCount: 1 image: repository: ghcr.io/dataoneorg/scythe pullPolicy: Always - # Overrides the image tag whose default is the chart appVersion. - tag: dev imagePullSecrets: [] nameOverride: "" From 4193dfba0b76d7992eea466860afb114a53a086e Mon Sep 17 00:00:00 2001 From: Jeanette Clark Date: Tue, 17 Sep 2024 17:01:01 -0700 Subject: [PATCH 11/14] keep both grants in readme --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 0a985f4..08ce1b9 100644 --- a/README.md +++ b/README.md @@ -66,6 +66,7 @@ keyring::key_get("springer", keyring = "scythe") ## Acknowledgments Work on this package was supported by: +- NSF-PLR grant #1546024 to M. B. Jones, S. Baker-Yeboah, J. Dozier, M. Schildhauer, and A. Budden - NSF-PLR grant #2042102 to M. B. Jones, A. Budden, M. Schildhauer, and J. Dozier [![nceas_footer](https://live-ncea-ucsb-edu-v01.pantheonsite.io/sites/default/files/2020-03/NCEAS-full%20logo-4C.png)](https://www.nceas.ucsb.edu) From c929974fb9a6dd46c8aec84f8cef82b6db673c17 Mon Sep 17 00:00:00 2001 From: Jeanette Clark Date: Tue, 17 Sep 2024 17:01:22 -0700 Subject: [PATCH 12/14] make chart version same as app version --- helm/Chart.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/helm/Chart.yaml b/helm/Chart.yaml index 56e9554..889c2df 100644 --- a/helm/Chart.yaml +++ b/helm/Chart.yaml @@ -15,7 +15,7 @@ type: application # This is the chart version. This version number should be incremented each time you make changes # to the chart and its templates, including the app version. # Versions are expected to follow Semantic Versioning (https://semver.org/) -version: 0.1.0 +version: 1.1.0 # This is the version number of the application being deployed. This version number should be # incremented each time you make changes to the application. Versions are not expected to From 37c24b276d4ee0c5976bef65a5aae3cafbaf691a Mon Sep 17 00:00:00 2001 From: Jeanette Clark Date: Wed, 18 Sep 2024 09:46:40 -0700 Subject: [PATCH 13/14] increment version --- DESCRIPTION | 2 +- README.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 4ee6e2b..61e8fc7 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: scythe Title: Harvest and register data package citations -Version: 1.0.0 +Version: 1.1.0 Authors@R: c( person("Jeanette", "Clark", role = c("aut", "cre"), email = "jclark@nceas.ucsb.edu", comment=c(ORCID = "0000-0003-4703-1974")), person("Matthew B.", "Jones", role = "aut", email = "jones@nceas.ucsb.edu", comment=c(ORCID = "0000-0003-0077-4738")), diff --git a/README.md b/README.md index 08ce1b9..de0a761 100644 --- a/README.md +++ b/README.md @@ -16,7 +16,7 @@ including Scopus, PLOS, Springer, and XDD. ### Released version ``` -remotes::install_github("DataONEorg/scythe@v1.0.0") +remotes::install_github("DataONEorg/scythe@v1.1.0") ``` The *scythe* R package should be available for use at this point. From b8ba920d8c9f28f2aeb435720430a88f7bd65fd1 Mon Sep 17 00:00:00 2001 From: Jeanette Clark Date: Wed, 18 Sep 2024 11:45:34 -0700 Subject: [PATCH 14/14] update version and add default image tag --- helm/Chart.yaml | 4 ++-- helm/templates/cron-job.yaml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/helm/Chart.yaml b/helm/Chart.yaml index 889c2df..048695d 100644 --- a/helm/Chart.yaml +++ b/helm/Chart.yaml @@ -15,10 +15,10 @@ type: application # This is the chart version. This version number should be incremented each time you make changes # to the chart and its templates, including the app version. # Versions are expected to follow Semantic Versioning (https://semver.org/) -version: 1.1.0 +version: "v1.1.0" # This is the version number of the application being deployed. This version number should be # incremented each time you make changes to the application. Versions are not expected to # follow Semantic Versioning. They should reflect the version the application is using. # It is recommended to use it with quotes. -appVersion: "1.1.0" +appVersion: "v1.1.0" diff --git a/helm/templates/cron-job.yaml b/helm/templates/cron-job.yaml index 79de389..2973aae 100644 --- a/helm/templates/cron-job.yaml +++ b/helm/templates/cron-job.yaml @@ -12,7 +12,7 @@ spec: spec: containers: - name: {{ .Chart.Name }} - image: {{ .Values.image.repository }}:{{ .Values.image.tag }} + image: {{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }} imagePullPolicy: {{ .Values.image.pullPolicy }} env: - name: springer