Skip to content

Commit

Permalink
Merge pull request #76 from subugoe/ta_match
Browse files Browse the repository at this point in the history
Refactor TA matching to avoid confusion about the main institution
  • Loading branch information
njahn82 authored Aug 27, 2024
2 parents 3903f4e + c01f907 commit 39b49d0
Show file tree
Hide file tree
Showing 9 changed files with 108 additions and 122 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Package: hoaddata
Type: Package
Title: Data about hybrid open access journal publishing
Version: 0.2.98
Version: 0.2.99
Authors@R:
person("Najko", "Jahn", , "[email protected]",
role = c("aut", "cre"),
Expand Down
Binary file modified data/cc_articles.rda
Binary file not shown.
Binary file modified data/cr_md.rda
Binary file not shown.
Binary file modified data/cr_upw.rda
Binary file not shown.
Binary file modified data/jct_oalex_venues.rda
Binary file not shown.
Binary file modified data/jn_aff.rda
Binary file not shown.
Binary file modified data/jn_ind.rda
Binary file not shown.
93 changes: 45 additions & 48 deletions inst/sql/jct_inst_enriched.sql
Original file line number Diff line number Diff line change
@@ -1,49 +1,46 @@
-- This query is designed to identify institutions participating in transformative agreements.
-- JCT does not cover all associated institutions, eg. university hospitals and MPG institutes.
-- Here, we include associated institutions to ensure comprehensive coverage

-- Common Table Expression (CTE) - matching:
WITH matching AS (
-- Part 1: Retrieve data from 'oalex_inst' to include associated institutions
(
SELECT
esac_id, -- ESAC TA ID
oalex_inst.ror AS ror_main, -- ROR identifier for the main institution
inst.ror AS ror -- ROR identifier for associated institutions
FROM
`subugoe-collaborative.hoaddata.jct_inst` AS jct_inst
INNER JOIN
`subugoe-collaborative.openalex.institutions` as oalex_inst
ON
jct_inst.ror_id = oalex_inst.ror
INNER JOIN
UNNEST(oalex_inst.associated_institutions) as inst
ORDER BY
esac_id
)
WITH
obtain_associated_ror_ids AS (
-- Part 1: Retrieve data from OpenAlex institution table to include associated institutions
SELECT
esac_id, -- ESAC TA ID
jct_inst.ror_id AS ror_jct, -- ROR identifier from JCT
inst.ror AS ror_associated -- ROR identifier for associated institutions from OpenAlex
FROM
`subugoe-collaborative.hoaddata.jct_inst` AS jct_inst
LEFT JOIN
`subugoe-collaborative.openalex.institutions` AS oalex_inst
ON
jct_inst.ror_id = oalex_inst.ror
LEFT JOIN
UNNEST(oalex_inst.associated_institutions) AS inst
ORDER BY
esac_id
),
create_matching_table AS (
SELECT
esac_id,
'ror_jct' AS ror_type,
ror_jct AS ror
FROM
obtain_associated_ror_ids
UNION ALL
-- Part 2: Retrieve data from 'jct_inst' for the main institutions
SELECT
esac_id, -- ESAC TA ID
ror_id AS ror_main, -- ROR identifier for the main institution
ror_id AS ror -- ROR identifier for the main institution (no associated institutions)
FROM
`subugoe-collaborative.hoaddata.jct_inst` AS jct_inst
ORDER BY
esac_id, ror_main
)

-- Main Query:
-- Select data from the 'matching' CTE and join it with 'jct_inst' to retrieve additional details.
SELECT
DISTINCT matching.*, -- Data from the 'matching' CTE
start_date, -- Start date of participation in transformative agreement
end_date -- End date of participation in transformative agreement
FROM
matching
INNER JOIN
`subugoe-collaborative.hoaddata.jct_inst` AS jct_inst
ON
matching.esac_id = jct_inst.esac_id
ORDER BY
esac_id, ror_main
SELECT
esac_id,
'ror_associated' AS ror_type,
ror_associated AS ror
FROM
obtain_associated_ror_ids
)
SELECT
DISTINCT create_matching_table.*,
start_date,
end_date
FROM
create_matching_table
INNER JOIN
`subugoe-collaborative.hoaddata.jct_inst` AS jct_inst
ON
create_matching_table.esac_id = jct_inst.esac_id
ORDER BY
esac_id,
ror
135 changes: 62 additions & 73 deletions inst/sql/ta_oa_inst.sql
Original file line number Diff line number Diff line change
@@ -1,84 +1,73 @@
-- Publication statistics for institutions participating in transformative agreements (TA)
-- The resulting table allows us to determine the impact TAs have on the open access publication activity of participating institutions.
-- Combine ESAC TA data with institution information
WITH esac_journals AS (
SELECT DISTINCT
hybrid_jns.issn_l AS matching_issn,
hybrid_jns.esac_id,
esac_publisher,
start_date,
EXTRACT(YEAR FROM start_date) AS start_year,
end_date,
EXTRACT(YEAR FROM end_date) AS end_year,
issn_l,
jct_inst.ror,
jct_inst.ror_type,
oalex.id AS oalex_inst_id
FROM `subugoe-collaborative.hoaddata.jct_hybrid_jns` AS hybrid_jns
-- Join with participating institutions
INNER JOIN `subugoe-collaborative.hoaddata.jct_inst_enriched` AS jct_inst
ON jct_inst.esac_id = hybrid_jns.esac_id
-- Match with OpenAlex institutions
INNER JOIN `subugoe-collaborative.openalex.institutions` AS oalex
ON jct_inst.ror = oalex.ror
),

WITH
-- ESAC TA / institutions matching table
esac_journals AS (
SELECT
DISTINCT hybrid_jns.issn_l AS matching_issn, -- Extracting ISSN
hybrid_jns.esac_id, -- ESAC ID of the TA
esac_publisher, -- Publisher information
start_date, -- Agreement start date
EXTRACT(YEAR FROM start_date) AS start_year, -- Extracting the year from start_date
end_date, -- Agreement end date
EXTRACT(YEAR FROM end_date) AS end_year, -- Extracting the year from end_date
issn_l, -- ISSN of the journal
jct_inst.ror, -- ROR ID of the institution
jct_inst.ror_main, -- Main ROR ID of the institution
oalex.id AS oalex_inst_id -- OpenAlex institution ID
FROM
`subugoe-collaborative.hoaddata.jct_hybrid_jns` AS hybrid_jns
-- Join with participating institutions
INNER JOIN
`subugoe-collaborative.hoaddata.jct_inst_enriched` AS jct_inst
ON
jct_inst.esac_id = hybrid_jns.esac_id
-- OpenAlex / ROR Matching
INNER JOIN
`subugoe-collaborative.openalex.institutions` AS oalex
ON
jct_inst.ror = oalex.ror
),
-- Gather publication data for institutions per year and link to TA
inst_per_year AS (
SELECT DISTINCT
esac_journals.*,
oalex_inst.doi,
oalex_inst.cr_year,
oa.cc,
cr.issued,
esac_journals.ror as ror_matched,
esac_journals.ror_type as ror_type_,
-- Determine if publication is within TA date range
CASE
WHEN (DATE(cr.issued) BETWEEN DATE(start_date) AND DATE(end_date)) THEN TRUE
ELSE FALSE
END AS ta,
-- Check if publication has a CC license
CASE
WHEN oa.cc IS NOT NULL THEN TRUE
ELSE FALSE
END AS has_cc
FROM esac_journals
-- Join with OpenAlex institution data
INNER JOIN `subugoe-collaborative.hoaddata.cr_openalex_inst_full` AS oalex_inst
ON esac_journals.oalex_inst_id = oalex_inst.id
AND oalex_inst.issn_l = esac_journals.matching_issn
-- Left join to include publications without CC licenses
LEFT JOIN `subugoe-collaborative.hoaddata.cc_openalex_inst` AS oa
ON oalex_inst.doi = oa.doi
-- Join with Crossref data for publication dates
INNER JOIN `subugoe-collaborative.cr_instant.snapshot` AS cr
ON oalex_inst.doi = cr.doi
)

-- Publications per year, institution, and agreement
inst_per_year AS (
SELECT
DISTINCT esac_journals.*, -- Include columns from the previous CTE
oalex_inst.doi, -- DOI of the publication
oalex_inst.cr_year, -- Publication year
cc, -- CC license information
cr.issued, -- Date of publication
esac_journals.ror AS ror_rel, -- ROR ID related to the journal
esac_journals.ror_main AS ror_id, -- Main ROR ID related to the journal
CASE
WHEN (DATE(cr.issued) BETWEEN DATE(start_date) AND DATE(end_date)) THEN TRUE -- Check if publication is within the agreement's date range
ELSE FALSE
END AS ta, -- Flag indicating if the publication is within the agreement
CASE
WHEN cc IS NOT NULL THEN TRUE -- Check if the publication has a CC license
ELSE FALSE
END AS has_cc -- Flag indicating if the publication has a CC license
FROM
esac_journals
INNER JOIN
`subugoe-collaborative.hoaddata.cr_openalex_inst_full` AS oalex_inst
ON
esac_journals.oalex_inst_id = oalex_inst.id
AND oalex_inst.issn_l = esac_journals.matching_issn
LEFT JOIN
`subugoe-collaborative.hoaddata.cc_openalex_inst` AS oa
ON
oalex_inst.doi = oa.doi
INNER JOIN
`subugoe-collaborative.cr_instant.snapshot` AS cr
ON
oalex_inst.doi = cr.doi
)

-- Selecting relevant columns for the final result
SELECT
-- Final query to select relevant columns for analysis
SELECT DISTINCT
doi,
cr_year,
matching_issn AS issn_l,
esac_id AS ta_journal_portfolio,
esac_publisher,
ta AS ta_active,
cc,
ror_rel AS ror_matching,
ror_id AS ror_main
FROM
inst_per_year
-- Ordering the result set
ror_matched as ror,
ror_type_ as ror_type
FROM inst_per_year
-- Order results by TA journal portfolio, publication year (descending) and DOI
ORDER BY
ta_journal_portfolio,
cr_year DESC;
cr_year DESC,
doi

0 comments on commit 39b49d0

Please sign in to comment.