From f9021d947824fea6f7ef14ca3e9582ca0e6d8f56 Mon Sep 17 00:00:00 2001 From: seanfobbe <70438644+SeanFobbe@users.noreply.github.com> Date: Thu, 16 Dec 2021 19:46:00 +0100 Subject: [PATCH] main commit --- CD-ICJ_Source_CodebookCreation.R | 1008 ++++ CD-ICJ_Source_Config.csv | 18 + CD-ICJ_Source_CorpusCreation.R | 4762 +++++++++++++++++ CD-ICJ_Source_FullCompilation.R | 22 + buttons/MIT0-blue.pdf | Bin 0 -> 7210 bytes buttons/cc-zero.png | Bin 0 -> 6447 bytes data/CD-ICJ_Source_AdvisoryRequestCoding.csv | 9 + data/CD-ICJ_Source_CaseNames.csv | 180 + data/CD-ICJ_Source_CountryCodes.csv | 106 + data/CD-ICJ_Source_Stages_Filenames.csv | 144 + ...CD-ICJ_Source_UnlabelledFilesHandcoded.csv | 22 + functions/f.boxplot.body.R | 18 + functions/f.boxplot.outliers.R | 10 + functions/f.dopar.multihashes.R | 81 + functions/f.dopar.pagenums.R | 46 + functions/f.dopar.pdfextract.R | 68 + functions/f.dopar.pdfocr.R | 86 + functions/f.fast.freqtable.R | 121 + functions/f.hyphen.remove.R | 34 + functions/f.lingsummarize.iterator.R | 116 + functions/f.linkextract.R | 18 + functions/f.selectpdflinks.R | 19 + functions/f.special.replace.R | 19 + functions/f.token.processor.R | 17 + tex/CD-ICJ_Source_TEX_Author.tex | 5 + tex/CD-ICJ_Source_TEX_CodebookTitle.tex | 90 + tex/CD-ICJ_Source_TEX_CompilationTitle.tex | 90 + tex/CD-ICJ_Source_TEX_Preamble_EN.tex | 115 + 28 files changed, 7224 insertions(+) create mode 100644 CD-ICJ_Source_CodebookCreation.R create mode 100644 CD-ICJ_Source_Config.csv create mode 100644 CD-ICJ_Source_CorpusCreation.R create mode 100644 CD-ICJ_Source_FullCompilation.R create mode 100644 buttons/MIT0-blue.pdf create mode 100644 buttons/cc-zero.png create mode 100644 data/CD-ICJ_Source_AdvisoryRequestCoding.csv create mode 100644 data/CD-ICJ_Source_CaseNames.csv create mode 100644 data/CD-ICJ_Source_CountryCodes.csv create mode 100644 data/CD-ICJ_Source_Stages_Filenames.csv create mode 100644 data/CD-ICJ_Source_UnlabelledFilesHandcoded.csv create mode 100644 functions/f.boxplot.body.R create mode 100644 functions/f.boxplot.outliers.R create mode 100644 functions/f.dopar.multihashes.R create mode 100644 functions/f.dopar.pagenums.R create mode 100644 functions/f.dopar.pdfextract.R create mode 100644 functions/f.dopar.pdfocr.R create mode 100644 functions/f.fast.freqtable.R create mode 100644 functions/f.hyphen.remove.R create mode 100644 functions/f.lingsummarize.iterator.R create mode 100644 functions/f.linkextract.R create mode 100644 functions/f.selectpdflinks.R create mode 100644 functions/f.special.replace.R create mode 100644 functions/f.token.processor.R create mode 100644 tex/CD-ICJ_Source_TEX_Author.tex create mode 100644 tex/CD-ICJ_Source_TEX_CodebookTitle.tex create mode 100644 tex/CD-ICJ_Source_TEX_CompilationTitle.tex create mode 100644 tex/CD-ICJ_Source_TEX_Preamble_EN.tex diff --git a/CD-ICJ_Source_CodebookCreation.R b/CD-ICJ_Source_CodebookCreation.R new file mode 100644 index 0000000..105c1c4 --- /dev/null +++ b/CD-ICJ_Source_CodebookCreation.R @@ -0,0 +1,1008 @@ +#'--- +#'title: "Codebook | Corpus of Decisions: International Court of Justice (CD-ICJ)" +#'author: Anonymized for Peer Review +#'geometry: margin=3cm +#'papersize: a4 +#'fontsize: 11pt +#'output: +#' pdf_document: +#' keep_tex: true +#' toc: true +#' toc_depth: 3 +#' number_sections: true +#' pandoc_args: --listings +#' includes: +#' in_header: tex/CD-ICJ_Source_TEX_Preamble_EN.tex +#' before_body: [tex/CD-ICJ_Source_TEX_Author.tex,tex/CD-ICJ_Source_TEX_Definitions.tex,tex/CD-ICJ_Source_TEX_CodebookTitle.tex] 
+#'bibliography: packages.bib +#'nocite: '@*' +#' --- + +#'\newpage + +#+ echo = FALSE +knitr::opts_chunk$set(fig.pos = "center", + echo = FALSE, + warning = FALSE, + message = FALSE) + + + + +############################ +### Packages +############################ + +#+ +library(knitr) # Scientific Reporting +library(kableExtra) # Enhanced Knitr Tables +library(magick) # Required for cropping when compiling PDF +library(parallel) # Base R Parallelization +library(data.table) # Advanced Data Handling + +setDTthreads(threads = detectCores()) + + +############################ +### Preamble +############################ + +datashort <- "CD-ICJ" + + + +files.zip <- list.files(pattern = "\\.zip") + +datestamp <- unique(tstrsplit(files.zip, + split = "_")[[2]]) + + + +prefix.en <- paste0("ANALYSIS/", + datashort, + "_EN_01_FrequencyTable_var-") + +prefix.fr <- paste0("ANALYSIS/", + datashort, + "_FR_01_FrequencyTable_var-") + + +############################ +### Read Tables: Frequency +############################ + +table.doctype.en <- fread(paste0(prefix.en, "doctype.csv"))[,-3] +table.doctype.fr <- fread(paste0(prefix.fr, "doctype.csv"))[,-3] + +table.opinion.en <- fread(paste0(prefix.en, "opinion.csv"))[,-3] +table.opinion.fr <- fread(paste0(prefix.fr, "opinion.csv"))[,-3] + +table.year.en <- fread(paste0(prefix.en, "year.csv"))[,-3] +table.year.fr <- fread(paste0(prefix.fr, "year.csv"))[,-3] + +table.applicant.en <- fread(paste0(prefix.en, "applicant.csv"))[,-3] +table.applicant.fr <- fread(paste0(prefix.fr, "applicant.csv"))[,-3] + +table.respondent.en <- fread(paste0(prefix.en, "respondent.csv"))[,-3] +table.respondent.fr <- fread(paste0(prefix.fr, "respondent.csv"))[,-3] + + +############################## +### Read Tables: Entity Codes +############################## + +table.countrycodes <- fread("data/CD-ICJ_Source_CountryCodes.csv") +table.advcodes <- fread("data/CD-ICJ_Source_AdvisoryRequestCoding.csv") + +############################ +### Read Tables: Linguistic +############################ + +stats.ling.en <- fread("ANALYSIS/CD-ICJ_EN_00_CorpusStatistics_Summaries_Linguistic.csv") +stats.ling.fr <- fread("ANALYSIS/CD-ICJ_FR_00_CorpusStatistics_Summaries_Linguistic.csv") + + +############################ +### Read Metadata +############################ + +meta.zip.en <- paste(datashort, + datestamp, + "EN_CSV_BEST_META.zip", + sep = "_") + +meta.best.en <- fread(cmd = paste("unzip -cq", + meta.zip.en)) + + +############################ +### Read Hash File +############################ + +hashfile <- paste(datashort, + datestamp, + "CryptographicHashes.csv", + sep = "_") + + +############################ +### Begin Text +############################ + + +#'# Introduction + +#'The \textbf{\icj\ (ICJ)} is the primary judicial organ of the United Nations and one of the most consequential courts in international law. +#' +#' Called the \enquote{World Court} by many, it is the only international court with general thematic jurisdiction. While critics occasionally note the lack of compulsory jurisdiction and sharply limited access to the Court,\footnote{Only States may be party to proceedings in contentious jurisdiction and only certain bodies of international organizations may request advisory opinions.} its opinions continue to have an outsize influence on the modern interpretation, codification and wider development of international law. 
Every international legal textbook covers the workings and decisions of the Court *in extenso* and participation in international moot courts, such as the Philip C. Jessup Moot Court, without regular reference to and citation of the \icj 's decisions, is unthinkable. +#' +#'The \textbf{\datatitle\ (\datashort)} collects and presents for the first time in human- and machine-readable form all published decisions of the \icj . Among these are judgments, advisory opinions and orders, as well as their respective appended minority opinions (declarations, separate opinions and dissenting opinions). +#' +#' +#' This data set is designed to be complementary to and fully compatible with the \emph{Corpus of Decisions: Permanent Court of International Justice (CD-PCIJ)}, which is also available open access.\footnote{Corpus of Decisions: Permanent Court of International Justice (CD-PCIJ). <\url{https://doi.org/10.5281/zenodo.3840480}>.} +#' +#' +#' The quantitative analysis of international legal data is still in its infancy, a situation which is exacerbated by the lack of high-quality empirical data. Most advanced data sets are held in commercial databases and are therefore not easily available to academic researchers, journalists and the general public. With this data set I hope to contribute to a more systematic and empirical view of the international legal system. In an international community founded on the rule of law the activities of the judiciary must be public, transparent and defensible. In the 21st century this requires quantitative scientific review of decisions and actions. +#' +#' Design, construction and compilation of this data set are based on the principles of general availability through freedom from copyright (public domain status), strict transparency and full scientific reproducibility. The *FAIR Guiding Principles for Scientific Data Management and Stewardship* (Findable, Accessible, Interoperable and Reusable) inspire both the design and the manner of publication.\footnote{Wilkinson, M., Dumontier, M., Aalbersberg, I. et al. The FAIR Guiding Principles for Scientific Data Management and Stewardship. Sci Data 3, 160018 (2016). <\url{https://doi.org/10.1038/sdata.2016.18}>.} + + + + + + + +#+ +#'# Reading Files + +#' The data are published in open, interoperable and widely used formats (CSV, TXT, PDF). They can be used with all modern programming languages (e.g. Python or R) and graphical interfaces. The PDF collections are intended to facilitate traditional legal research. +#' +#' **Important:** Missing values are always coded as \enquote{NA}. + +#+ +#'## CSV Files +#' +#' Working with the CSV files is recommended. CSV\footnote{The CSV format is defined in RFC 4180: <\url{https://tools.ietf.org/html/rfc4180}>.} is an open and simple machine-readable tabular data format. In this data set values are separated by commas. Each column is a variable and each row is a document. Variables are explained in detail in section \ref{variables}. + +#' +#' To read \textbf{CSV} files into R I strongly recommend using the fast file reader **fread()** from the **data.table** package (available on CRAN). 
The file can be read into \textbf{R} like so: + +#+ eval = FALSE, echo = TRUE +library(data.table) +icj.en <- fread("filename.csv") + + +#'## TXT Files +#'The \textbf{TXT} files, including metadata, can be read into \textbf{R} with the package \textbf{readtext} (available on CRAN) thus: + +#+ eval = FALSE, echo = TRUE +library(readtext) +icj.en <- readtext("EN_TXT_BEST_FULL/*.txt", + docvarsfrom = "filenames", + docvarnames = c("court", + "caseno", + "shortname", + "applicant", + "respondent", + "date", + "doctype", + "collision", + "opinion", + "language"), + dvsep = "_", + encoding = "UTF-8") + + + + +#+ +#'# Data Set Design + + +#'## Description of Data Set + +#'The \textbf{\datatitle\ (\datashort)} collects and structures in human- and machine-readable form all published decisions of the \icj . Among these are judgments, advisory opinions and orders, as well as their respective appended minority opinions (declarations, separate opinions and dissenting opinions). +#' + +#' It consists of a CSV file of the full data set, a CSV file with the metadata only, individual TXT files for each document and PDF files with an enhanced text layer generated by the LSTM neural network engine of the optical character recognition software (OCR) \emph{Tesseract}. +#' +#' Additionally, the raw PDF files and some intermediate stages of refinement are included to allow for easier replication of results and for production use in the event that even higher quality methods of optical character recognition (OCR) can be applied to the documents in the future. + + +#+ +#'## Complementarity + +#' This data set is intended to be complementary to and fully compatible with the \emph{Corpus of Decisions: Permanent Court of International Justice (CD-PCIJ)}, which is also available open access.\footnote{Corpus of Decisions: Permanent Court of International Justice (CD-PCIJ). <\url{https://doi.org/10.5281/zenodo.3840480}>.} +#' + +#+ +#'## Table of Sources + + +#'\begin{centering} +#'\begin{longtable}{P{5cm}p{9cm}} + +#'\toprule +#' Data Source & Citation \\ + +#'\midrule + +#' Primary Data Source & \url{https://www.icj-cij.org}\\ +#' Source Code & \url{\softwareversionurldoi}\\ +#' Country Codes & \url{\softwareversionurldoi}\\ +#' Entity Codes & \url{\softwareversionurldoi}\\ +#' Cases Names and Parties & \url{\softwareversionurldoi}\\ + +#'\bottomrule + +#'\end{longtable} +#'\end{centering} + + + + + + +#+ +#'## Data Collection +#' Data were collected with the explicit consent of the Registry of the \icj . All documents were downloaded via TLS-encrypted connections and cryptographically signed after data processing was complete. The data set collects all decisions and appended opinions issued by the \icj\ that were published on the official website of the \icj\ on \version . + + +#+ +#'## Source Code and Compilation Report +#' +#' The full Source Code for the creation of this data set, the resulting Compilation Report and this Codebook are published open access and permanently archived in the scientific repository of CERN. +#' +#' With every compilation of the full data set an extensive **Compilation Report** is created in a professionally layouted PDF format (comparable to this Codebook). The Compilation Report includes the Source Code, comments and explanations of design decisions, relevant computational results, exact timestamps and a table of contents with clickable internal hyperlinks to each section. The Compilation Report is published under the same DOI as the Source Code. 
+#' +#' For details of the construction and validation of the data set please refer to the Compilation Report. + + +#+ +#'## Limitations +#'Users should bear in mind certain limitations: + +#'\begin{enumerate} +#' \item The data set contains only those documents which were published by the ICJ and have been made available by the ICJ on its official website (\emph{publication bias}). +#' \item While Tesseract yields high-quality OCR results, current OCR technology is not perfect and minor errors must be expected (\emph{OCR bias}). +#' \item Automatic language detection is not foolproof and some bilingual documents marked as monolingual may have gone undetected (\emph{language mismatch}). +#' \item Lengthy quotations in languages other than the language indicated in the metadata may further confound analyses (\emph{language blurring}). +#'\end{enumerate} + + + +#+ +#'## Public Domain Status +#' +#'According to written communication between the author and the Registry of the \icj\ the original documents are not subject to copyright. +#' +#' To ensure the widest possible distribution and to promote the international rule of law I waive any copyright to the data set under a \textbf{Creative Commons CC0 1.0 Universal (CC0 1.0) Public Domain Dedication}. For details of the license please refer to the CC0 copyright notice at the beginning of this Codebook or visit the Creative Commons website for the full terms of the license.\footnote{\url{https://creativecommons.org/publicdomain/zero/1.0/legalcode}} + + +#'\newpage +#+ +#'## Quality Assurance + +#' Dozens of automated tests were conducted to ensure the quality of the data and metadata, for example: +#' +#' \begin{enumerate} +#'\item Auto-detection of language via analysis of n-gram patterns with the \emph{textcat} package for R. +#'\item Strict validation of variable types via \emph{regular expressions}. +#'\item Construction of frequency tables for (almost) every variable followed by human review to detect anomalies. +#'\item Creation of visualizations for many common descriptive analyses. +#'\end{enumerate} +#' +#'For results of each test and more information on the construction of the data set please refer to the Compilation Report or the \enquote{ANALYSIS} archive included with the data set. + + + + + + + +#' \begin{sidewaysfigure} +#'\includegraphics{ANALYSIS/CD-ICJ_Workflow_1.pdf} +#' \caption{Workflow Schematic Part 1: Download, Labelling, Conversion and Sorting of Documents} +#' \end{sidewaysfigure} + + + + +#' \begin{sidewaysfigure} +#'\includegraphics{ANALYSIS/CD-ICJ_Workflow_2.pdf} +#' \caption{Workflow Schematic Part 2: Ingestion, Pre-Processing, Analysis and Creation of CSV Files} +#' \end{sidewaysfigure} + + + + + +#+ +#'# Variants and Primary Target Audiences + +#'The data set is provided in two language versions (English and French), as well as several differently processed variants geared towards specific target audiences. +#' +#' A reduced PDF variant of the data set containing only majority opinions is intended to assist practitioners. +#' +#' \medskip + +#'\begin{centering} +#'\begin{longtable}{p{4cm}p{10cm}} + +#'\toprule + +#'Variant & Target Audience and Description \\ + +#'\midrule +#'\endhead + +#'PDF\_BEST & \textbf{Traditional Legal Research (recommended).} A synthesis of all born-digital documents issued by the ICJ combined with older scanned documents (prior to 2005) which were given a new and enhanced text layer created with an advanced LSTM neural network machine learning engine. 
Its main advantages are vastly improved local searches in individual documents via Ctrl+F and copy/pasting without the need for extensive manual revisions. Researchers with slow internet connections should consider using the \enquote{TXT\_BEST} variant, as this still provides a reasonable visual approximation of the original documents, but offers the advantage of drastically reduced file size. A reduced PDF variant of the data set containing only majority opinions is available to assist practitioners.\\ +#'CSV\_BEST & \textbf{Quantitative Research (recommended).} A structured representation of the full data set within a single comma-delimited file. Includes the full complement of metadata described in the Codebook. The \enquote{FULL} sub-variant includes the full text of the decisions, whereas the sub-variant \enquote{META} only contains the metadata.\\ +#'TXT\_BEST & \textbf{Quantitative Research.} A synthesis of TXT files created by combining the extracted text of all born-digital documents issued by the ICJ (2005 and later) and the OCR texts from older scanned documents (prior to 2005) generated with an advanced LSTM neural network machine learning engine. R users should strongly consider using the package \emph{readtext} to read them into R with the filename metadata intact.\\ +#'ANALYSIS & \textbf{Quantitative Research.} This archive contains almost all of the machine-readable analysis output generated during the data set creation process to facilitate further analysis (CSV for tables, PDF and PNG for plots). Minor analysis results are documented only in the Compilation Report.\\ +#'TXT\_EXTRACTED & \textbf{Replication Research and Creation of New Data Sets.} TXT files containing the extracted text layer from all original documents as published by the ICJ. The quality of the original OCR text for older documents is poor and this variant should not be used for statistical analysis. Documents dated 2005 or later were born-digital and can be used for all purposes.\\ +#'TXT\_TESSERACT & \textbf{Replication Research and Creation of New Data Sets.} TXT files containing the OCR text generated with an advanced LSTM neural network machine learning engine for documents predating 2005. Fully included in the BEST variant, but provided separately for reasons of transparency.\\ +#'PDF\_ORIGINAL & \textbf{Replication Research and Creation of New Data Sets.} The original documents with the original text layer. Only recommended for researchers who wish to replicate the machine-readable files or who wish to create a new and improved data set. Not recommended for traditional research, as the quality of the original OCR text layer is quite poor.\\ +#'PDF\_ENHANCED & \textbf{Replication Research and Creation of New Data Sets.} Scanned documents of opinions rendered before 2005 which were given a new and enhanced text layer generated with an advanced LSTM neural network machine learning engine. Fully included in the BEST variant, but provided separately for reasons of transparency.\\ + + +#'\bottomrule + +#'\end{longtable} +#'\end{centering} + + + +#+ +#'\newpage + + + +#+ +#'# Variables + + + +#+ +#'## General Remarks + +#' \begin{itemize} +#' +#' \item Missing values are always coded as \enquote{NA}. +#' +#' \item All Strings are encoded in UTF-8. +#' +#' \item A significant part of the metadata was included with the files downloaded from the Court's website. 
+#' +#' \item The variables \enquote{shortname}, \enquote{applicant}, \enquote{respondent}, \enquote{stage}, \enquote{applicant\_region}, \enquote{applicant\_subregion}, \enquote{respondent\_region} and \enquote{respondent\_subregion} were coded manually by the author of the data set and added automatically at compilation time. Country codes conform to the ISO 3166 Alpha-3 standard and geographical classifications to the M49 standard used by the UN Statistics Division. +#' +#' \item The variable \enquote{fullname} is coded according to case headings as published on the ICJ website. Includes the full names of the parties in parentheses. Introductory phrases such as \enquote{Case concerning...} are omitted. +#' +#' \item The variables \enquote{nchars}, \enquote{ntokens}, \enquote{ntypes}, \enquote{nsentences} and \enquote{year} were calculated automatically based on the content and metadata of each document. +#' +#' \item The variables \enquote{version}, \enquote{doi\_concept}, \enquote{doi\_version} and \enquote{license} were added automatically during the data set creation process to document provenance and to comply with FAIR Data Principles F1, F3 and R1.1. +#' +#' \end{itemize} + + +#'\vspace{1cm} + +#+ +#'## Structure of TXT File Names + +#'\begin{verbatim} +#'[court]_[caseno]_[shortname]_[applicant]_[respondent]_[date]_[doctype]_ +#'[collision]_[stage]_[opinion]_[language] +#'\end{verbatim} + +#'\vspace{1cm} + +#'\subsection{Example TXT File Name} + +#'\begin{verbatim} +#' ICJ_001_CorfuChannel_GBR_ALB_1949-04-09_JUD_01_ME_05_EN.txt +#'\end{verbatim} + + +#'\newpage +#+ +#'## Structure of CSV Metadata + +str(meta.best.en) + +#'\newpage +#+ +#'## Detailed Description of Variables + + +#'\begin{centering} +#'\begin{longtable}{p{3.5cm}p{2cm}p{9cm}} + + + +#'\toprule + +#'Variable & Type & Details\\ + +#'\midrule + +#'\endhead + +#' doc\_id & String & (CSV only) The name of the imported TXT file.\\ +#' text & String & (CSV only) The full content of the imported TXT file.\\ +#' court & String & The variable only takes the value \enquote{ICJ}, which stands for \enquote{\icj}. It is generally only useful if combined with the CD-PCIJ or other data sets.\\ +#' caseno & Integer & The case number assigned by the ICJ. The same case may span multiple case numbers, i.e. the Interpretation or Revision stages have different case numbers than the original judgment. To analyze all stages of a case I recommend a pattern search on the variable \enquote{shortname}. Note: case number 2 is unassigned and there are no documents for case number 2 available on the ICJ website.\\ +#' shortname & String & Short name of the case. This was custom-created by the author based on the original title. Short names include well-known components (e.g. \enquote{Nicaragua}) to facilitate quick local searches and try to be as faithful to the full title as possible. For requests concerning interpretation or revision of a judgment the shortname is followed by \enquote{Interpretation} or \enquote{Revision}.\\ +#' fullname & String & (CSV only) Full name of the case as published on the ICJ website. Includes the full names of the Parties. Introductory phrases such as \enquote{Case concerning...} are omitted.\\ +#' applicant & String & The unique identifier of the applicant. In contentious proceedings this is the three-letter (Alpha-3) country code as per the ISO 3166-1 standard. Table \ref{tab:countrycodes} contains an explanation of all country codes used in the data set. 
Please note that reserved country codes are in use for historical entities (e.g. the Soviet Union). For advisory proceedings this variable refers to the entity which requested an advisory opinion. Table \ref{tab:entities} explains the detailed advisory coding decisions.\\ +#' respondent & String & The unique identifier of the respondent. In contentious proceedings this is the three-letter (Alpha-3) country code as per the ISO 3166-1 standard. Table \ref{tab:countrycodes} contains an explanation of all country codes used in the data set. Please note that reserved country codes are in use for historical entities (e.g. the Soviet Union). Advisory proceedings do not have a respondent and therefore always take the value \enquote{NA}.\\ +#' applicant\_region & String & (CSV only) The geographical region of the applicant according to the UN M49 standard. Please refer to table \ref{tab:countrycodes} for details and exceptions. Geographical information is only available for countries, not for UN bodies or international organizations.\\ +#' respondent\_region & String & (CSV only) The geographical region of the respondent according to the UN M49 standard. Please refer to table \ref{tab:countrycodes} for details and exceptions. Geographical information is only available for countries, not for UN bodies or international organizations.\\ +#' applicant\_subregion & String & (CSV only) The geographical subregion of the applicant according to the UN M49 standard. Please refer to table \ref{tab:countrycodes} for details and exceptions. Geographical information is only available for countries, not for UN bodies or international organizations.\\ +#' respondent\_subregion & String & (CSV only) The geographical subregion of the respondent according to the UN M49 standard. Please refer to table \ref{tab:countrycodes} for details and exceptions. Geographical information is only available for countries, not for UN bodies or international organizations.\\ +#' date & ISO Date & The date of the document in the format YYYY-MM-DD (ISO-8601).\\ +#' doctype & String & A three-letter code indicating the type of document. Possible values are \enquote{JUD} (judgments in contentious jurisdiction), \enquote{ADV} (advisory opinions) and \enquote{ORD} (orders in all types of jurisidiction).\\ +#' collision & Integer & In rare instances the \icj\ issued several decisions of the same type in the same proceedings on the same day. Most documents take the value \enquote{01}. If documents with otherwise identical metadata would be issued, the value is incremented.\\ +#' stage & String & The stage of proceedings in contentious jurisdiction, coded based on the title page (primary), or a close reading of the findings (secondary). Possible values are \enquote{PO} (preliminary objections), \enquote{ME} (merits), \enquote{IN} (intervention) and \enquote{CO} (compensation). Please note that the ICJ is very inconsistent in how it classifies admissibility; it can occur in the same document either together with a decision on jurisdiction or a decision on the merits. I have chosen to code pure admissibility decisions as \enquote{ME} (e.g. Second Phase of Nottebohm). In general all of the above types of decisions can occur in the same document. I therefore do not recommend this variable for computational analysis unless great care is taken to understand its limitations. Currently only judgments are coded, orders will be added in the future.\\ +#' opinion & Integer & A sequential number assigned to each opinion. 
Majority opinions are always coded \enquote{00}. Minority opinions begin with \enquote{01} and ascend to the maximum number of minority opinions.\\ +#' language & String & The language of the document as a two-letter ISO 639-1 code. This data set contains documents in the languages English (\enquote{EN}) and French (\enquote{FR}).\\ +#' year & Integer & (CSV only) The year the document was issued. The format is YYYY.\\ +#' minority & Integer & (CSV only) This variable indicates whether the document is a majority (0) or minority (1) opinion.\\ +#' nchars & Integer & (CSV only) The number of characters in a given document.\\ +#' ntokens & Integer & (CSV only) The number of tokens (an arbitrary character sequence bounded by whitespace) in a given document. This metric can vary significantly depending on tokenizer and parameters used. This count was generated based on plain tokenization with no further pre-processing (e.g. stopword removal, removal of numbers, lowercasing) applied. Analysts should use this number not as an exact figure, but as an estimate of the order of magnitude of a given document's length. If in doubt, perform an independent calculation with the software of your choice.\\ +#' ntypes & Integer & (CSV only) The number of \emph{unique} tokens. This metric can vary significantly depending on tokenizer and parameters used. This count was generated based on plain tokenization with no further pre-processing (e.g. stopword removal, removal of numbers, lowercasing) applied. Analysts should use this number not as an exact figure, but as an estimate of the order of magnitude of a given document's length. If in doubt, perform an independent calculation with the software of your choice.\\ +#' nsentences & Integer & (CSV only) The number of sentences in a given document. The rules for detecting sentence boundaries are very complex and are described in \enquote{Unicode Standard Annex No 29}. This metric can vary significantly depending on tokenizer and parameters used. This count was generated based on plain tokenization with no further pre-processing (e.g. stopword removal, removal of numbers, lowercasing) applied. Analysts should use this number not as an exact figure, but as an estimate of the order of magnitude of a given document's length. If in doubt, perform an independent calculation with the software of your choice.\\ +#' version & ISO Date & (CSV only) The version of the data set as a date in long form as per ISO-8601. The version represents the date on which the data set creation process was begun and the data was acquired from the website of the Court.\\ +#' doi\_concept & String & (CSV only) The Digital Object Identifier (DOI) for the \emph{concept} of the data set. Resolving this DOI via www.doi.org allows researchers to always acquire the \emph{latest version} of the data set. The DOI is a persistent identifier suitable for stable long-term citation. Principle F1 of the FAIR Data Principles (\enquote{data are assigned globally unique and persistent identifiers}) recommends the documentation of each data set with a persistent identifier and Principle F3 its inclusion with the metadata. Even if the CSV data set is transmitted without the accompanying Codebook this allows researchers to establish provenance of the data.\\ +#' doi\_version & String & (CSV only) The Digital Object Identifier (DOI) for the \emph{specific version} of the data set. Resolving this DOI via www.doi.org allows researchers to always acquire this \emph{specific version} of the data set. 
The DOI is a persistent identifier suitable for stable long-term citation. Principle F1 of the FAIR Data Principles (\enquote{data are assigned globally unique and persistent identifiers}) recommends the documentation of each data set with a persistent identifier and Principle F3 its inclusion with the metadata. Even if the CSV data set is transmitted without the accompanying Codebook this allows researchers to establish provenance of the data.\\ +#' license & String & (CSV only) The license of the data set. In this data set the value is always \enquote{Creative Commons Zero 1.0 Universal}. Ensures compliance with FAIR data principle R1.1 (\enquote{clear and accessible data usage license}).\\ + +#'\bottomrule + + +#'\end{longtable} +#'\end{centering} + + + + +#'\newpage + +#'# Applicant and Respondent Codes + +#+ +#'## Contentious Jurisdiction: States +#' +#'\label{tab:countrycodes} +#' +#'Applicants and Respondents in contentious jurisdiction are coded according to the uppercase three-letter (Alpha-3) country codes described in the ISO 3166-1 standard. The codes are taken from the version of the standard which was valid on 4 November 2020. The table below only includes those codes which are used in the data set. The regions and subregions assigned to States generally follow the UN Standard Country or Area Codes for Statistics Use, 1999 (Revision 4), also known as the M49 standard. +#' +#'Please note that where States have ceased to exist (Soviet Union, Yugoslavia, Serbia and Montenegro, Czechoslovakia) their historical three-letter country codes from ISO 3166-1 are used. These are not part of the current ISO 3166-1 standard, but have been transitionally reserved by the ISO 3166 Maintenance Agency to ensure backwards compatibility. The four-letter ISO 3166-3 standard (\enquote{Code for formerly used names of countries}) is not used in this data set. The regions and subregions for Yugoslavia and Czechoslovakia are taken from M49 revision 2 (1982). The Soviet Union is coded as \enquote{Europe/Eastern Europe} (the M49 standard considers the SUN its own region). Serbia and Montenegro was never included in the M49 standard and has been assigned the same region and subregion as Yugoslavia. + +#'\bigskip + +#'\ra{1.2} + +kable(table.countrycodes, + format = "latex", + align = 'p{1.5cm}p{4cm}p{2cm}p{6cm}', + booktabs = TRUE, + longtable = TRUE, + col.names = c("ISO-3", + "Name", + "Region", + "Sub-Region")) %>% kable_styling(latex_options = "repeat_header") + + + +#'\newpage +#+ +#'## Advisory Jurisdiction: Entities +#' +#'\label{tab:entities} +#' +#'Entities who requested an advisory opinion from the \icj\ are not Applicants in the strict sense, but have been coded under this variable to reduce clutter. I have tried to choose widely used codes for each entity. +#' +#'Note that the \emph{International Maritime Organization (IMO)} was known as the \enquote{Inter-Governmental Maritime Consultative Organization} at the time it requested the advisory opinon. I have coded it with the modern \enquote{IMO}, as the organization only underwent a change of name and its legal continuity is not in doubt. +#' +#'I was unable to discover a well-known acronym for the \emph{Committee on Applications for Review of Administrative Tribunal Judgements} and custom-coded it as \enquote{CARAT}. 
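+#'
+#' As a minimal illustration only: the entity codes below appear in the \enquote{applicant} variable and can be combined with the \enquote{doctype} and \enquote{minority} variables of the CSV variant to tabulate advisory proceedings by requesting entity. The sketch assumes the CSV file has already been read with \textbf{fread()} as described in the section on reading files; \enquote{filename.csv} is again only a placeholder.
+
+#+ eval = FALSE, echo = TRUE
+library(data.table)
+icj.en <- fread("filename.csv") # placeholder; see the section on reading CSV files
+
+# Advisory proceedings: majority opinions only, counted by requesting entity
+icj.en[doctype == "ADV" & minority == 0, .N, keyby = applicant]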
+ +#'\bigskip + +kable(table.advcodes, + format = "latex", + align = c("p{3cm}", + "p{11cm}"), + booktabs = TRUE, + longtable = TRUE, + col.names = c("Code", + "Entity")) + + + + + +#'\newpage +#+ +#'# Linguistic Metrics + +#+ +#'## Explanation of Metrics + +#' To better communicate the scope of the corpus and its constituent documents I provide a number of classic linguistic metrics and visualize their distributions: +#' +#' +#' \medskip +#' +#'\begin{centering} +#'\begin{longtable}{P{3.5cm}p{10.5cm}} + +#'\toprule + +#'Metric & Definition\\ + +#'\midrule + +#' Characters & Characters roughly correspond to graphemes, the smallest functional unit in a writing system. The word \enquote{judge} is composed of 5 characters, for example.\\ +#' Tokens & An arbitrary character sequence delimited by whitespace on both sides, e.g. it roughly corresponds to the notion of a \enquote{word}. However, due to its strictly syntactical definition it might also include arbitrary sequences of numbers or special characters.\\ +#' Types & Unique tokens. If, for example, the token \enquote{human} appeared one hundred times in a given document, it would be counted as only one type. \\ +#' Sentences & Corresponds approximately to the colloquial definition of a sentence. The exact rules for determining sentence boundaries are very complex and may be reviewed in \enquote{Unicode Standard: Annex No 29}.\\ + +#'\bottomrule + +#'\end{longtable} +#'\end{centering} +#' + +#'\bigskip + +#+ +#'## Summary Statistics + +newnames <- c("Metric", + "Total", + "Min", + "Quart1", + "Median", + "Mean", + "Quart3", + "Max") + + +setnames(stats.ling.en, newnames) +setnames(stats.ling.fr, newnames) + +#'### English + +kable(stats.ling.en, + digits = 2, + format.args = list(big.mark = ","), + format = "latex", + booktabs = TRUE, + longtable = TRUE) + +#'### French + +kable(stats.ling.fr, + digits = 2, + format.args = list(big.mark = ","), + format = "latex", + booktabs = TRUE, + longtable = TRUE) + + + +#'\newpage +#'## Explanation of Diagrams + +#+ +#'### Distributions of Document Length + +#'The diagrams in Section \ref{doclength} are combined violin and box plots. They are especially useful in visualizing distributions of quantitative variables. Their interpretation is fairly straightforward: the greater the area under the curve for a given range, the more frequent the values are in this range. The thick center line of the box indicates the median, the outer lines of the box the first and third quartiles. Whiskers extend outwards to 1.5 times the inter-quartile range (IQR). Outliers beyond 1.5 times IQR are shown as individual points. +#' +#' Please note that the x-axis is logarithmically scaled, i.e. in powers of 10. It therefore increases in a non-linear fashion. Additional sub-markings are included to assist with interpretation. + + +#+ +#'### Most Frequent Tokens + +#' A token is defined as any character sequence delimited by whitespace on both sides, e.g. it roughly corresponds to the notion of a \enquote{word}. However, due to the strictly syntactical definition tokens might also include arbitrary sequences of numbers or special characters. +#' +#' The charts in Sections \ref{toptokens-en} and \ref{toptokens-fr} show the 50 most frequent tokens for each language, weighted by both term frequency (TF) and term frequency/inverse document frequency (TF-IDF). Sequences of numbers, special symbols and a general list of frequent words for English and French (\enquote{stopwords}) were removed prior to constructing the list. 
For details of the calculations, please refer to the Compilation Report and/or the Source Code. +#' +#' The term frequency $\text{tf}_{td}$ is calculated as the raw count of the number of times a term $t$ appears in a document $d$. +#' +#' The term frequency/inverse document frequency $\text{tf-idf}_{td}$ for a term $t$ in a document $d$ is calculated as follows, with $N$ the total number of documents in a corpus and $\text{df}_{t}$ being the number of documents in the corpus in which the term $t$ appears: +#' +#'$$\text{tf-idf}_{td} = \text{tf}_{td} \times \text{log}_{10}\left(\frac{N}{\text{df}_{t}}\right)$$ + +#+ +#'### Tokens over Time +#' The charts in Section \ref{tokenperyear} show the total output of the \icj\ for each year as the sum total of the tokens of all published decisions (judgments, advisory opinions, orders, appended opinions). These charts may give a rough estimate of the activity of the \icj , although they should be interpreted with caution, as duplicate and highly similar opinions were not removed for this simple analysis. Please refer to Section \ref{docsim} for the scope of identical and near-identical documents in the corpus. + + +#+ +#'\newpage +#'## Distributions of Document Length + + +#' \label{doclength} + + + +#+ +#'### English +#' ![](ANALYSIS/CD-ICJ_EN_10_Distributions_LinguisticMetrics-1.pdf) + + +#+ +#'### French +#' ![](ANALYSIS/CD-ICJ_FR_10_Distributions_LinguisticMetrics-1.pdf) + + + + + +#+ +#'## Most Frequent Tokens (English) +#'\label{toptokens-en} + +#+ +#'### Term Frequency Weighting (TF) +#' ![](ANALYSIS/CD-ICJ_EN_13_Top50Tokens_TF-Weighting_Scatter-1.pdf) + +#+ +#'### Term Frequency/Inverse Document Frequency Weighting (TF-IDF) +#' ![](ANALYSIS/CD-ICJ_EN_14_Top50Tokens_TFIDF-Weighting_Scatter-1.pdf) + +#+ +#'## Most Frequent Tokens (French) +#'\label{toptokens-fr} + +#+ +#'### Term Frequency Weighting (TF) +#' ![](ANALYSIS/CD-ICJ_FR_13_Top50Tokens_TF-Weighting_Scatter-1.pdf) + +#+ +#'### Term Frequency/Inverse Document Frequency Weighting (TF-IDF) +#' ![](ANALYSIS/CD-ICJ_FR_14_Top50Tokens_TFIDF-Weighting_Scatter-1.pdf) + + + +#+ +#'\newpage +#'## Tokens over Time + +#'\label{tokenperyear} + + +#+ +#'### English +#' ![](ANALYSIS/CD-ICJ_EN_05_TokensPerYear-1.pdf) + +#+ +#'### French +#' ![](ANALYSIS/CD-ICJ_FR_05_TokensPerYear-1.pdf) + + + +#+ +#'# Document Similarity +#' +#' \label{docsim} + +#+ +#'## English +#' ![](ANALYSIS/CD-ICJ_EN_19_DocumentSimilarity_Correlation-1.pdf) +#' +#'## French +#' ![](ANALYSIS/CD-ICJ_FR_19_DocumentSimilarity_Correlation-1.pdf) + +#+ +#'## Comment +#' Analysts are advised that the CD-ICJ contains a non-negligible number of highly similar to near-identical documents. This is due to the Court's long-standing practice of issuing formally different decisions for each Applicant-Respondent pair in the course of the same proceedings. A prime example of such proceedings are the *Use of Force* cases, for which the judgments are identical in content, but differ only in the names of the Parties across more than half a dozen different judgments. +#' +#' The above figures plot the number of files to be excluded as a function of correlation similarity based on a document-unigram matrix (with the removal of numbers, special symbols and stopwords, as well as lowercasing). 
Analysts who wish to qualitatively review this computational approach will find the IDs of presumed duplicates, together with the relevant value of correlation similarity, stored as CSV files in the \enquote{ANALYSIS} archive published with the data set (item 17). These document IDs can also easily be read into statistical software and excluded directly from analyses without having to perform one's own similarity analysis. I do, however, recommend double-checking the IDs for false positives. The document pairings and similarity scores are included in a different CSV file (also item 17). +#' +#' The choice of similarity algorithm, the threshold for marking a document as duplicate and the question of whether duplicate documents should be removed at all should be decided with respect to individual analyses. My goal is to document the Court's output as faithfully as possible and provide analysts with fair warning, as well as the opportunity to make their own choices. Please note that the manner of de-duplication will substantially affect analytical results and should be made after careful consideration of both methodology and the data. + + +#' + +#+ +#'\newpage +#+ +#'# Metadata Frequency Tables +#' +#' \ra{1.3} +#' +#+ +#'## By Year + +#+ +#'### English + +#'\vspace{0.3cm} +#' ![](ANALYSIS/CD-ICJ_EN_04_Barplot_Year-1.pdf) +#'\vspace{0.3cm} + +kable(table.year.en, + format = "latex", + align = 'P{3cm}', + booktabs = TRUE, + longtable = TRUE, + col.names = c("Year", + "Documents", + "% Total", + "% Cumulative")) %>% kable_styling(latex_options = "repeat_header") + +#+ +#'\newpage +#+ +#'### French + +#'\vspace{0.3cm} +#' ![](ANALYSIS/CD-ICJ_FR_04_Barplot_Year-1.pdf) +#'\vspace{0.3cm} + +kable(table.year.fr, + format = "latex", + align = 'P{3cm}', + booktabs = TRUE, + longtable = TRUE, + col.names = c("Year", + "Documents", + "% Total", + "% Cumulative")) %>% kable_styling(latex_options = "repeat_header") + + +#+ +#'\newpage +#+ +#'## By Document Type + +#+ +#'### English + +#'\vspace{0.3cm} +#' ![](ANALYSIS/CD-ICJ_EN_02_Barplot_Doctype-1.pdf) +#'\vspace{1cm} + +kable(table.doctype.en, + format = "latex", + align = 'P{3cm}', + booktabs = TRUE, + longtable = TRUE, + col.names = c("DocType", + "Documents", + "% Total", + "% Cumulative")) %>% kable_styling(latex_options = "repeat_header") + +#+ +#'### French + +#'\vspace{0.3cm} +#' ![](ANALYSIS/CD-ICJ_FR_02_Barplot_Doctype-1.pdf) +#'\vspace{1cm} + +kable(table.doctype.fr, + format = "latex", + align = 'P{3cm}', + booktabs = TRUE, + longtable = TRUE, + col.names = c("DocType", + "Documents", + "% Total", + "% Cumulative")) %>% kable_styling(latex_options = "repeat_header") + + +#'\ra{1.1} +#+ +#'\newpage +#+ +#'## By Opinion Number + +#+ +#'### English + +#' ![](ANALYSIS/CD-ICJ_EN_03_Barplot_Opinion-1.pdf) + + +kable(table.opinion.en, + format = "latex", + align = 'P{3cm}', + booktabs = TRUE, + longtable = TRUE, + col.names = c("Opinion Number", + "Documents", + "% Total", + "% Cumulative")) %>% kable_styling(latex_options = "repeat_header") + +#+ +#'\newpage +#+ +#'### French + +#'\vspace{0.1cm} +#' ![](ANALYSIS/CD-ICJ_FR_03_Barplot_Opinion-1.pdf) +#'\vspace{0.1cm} + +kable(table.opinion.fr, + format = "latex", + align = 'P{3cm}', + booktabs = TRUE, + longtable = TRUE, + col.names = c("Opinion Number", + "Documents", + "% Total", + "% Cumulative")) %>% kable_styling(latex_options = "repeat_header") + + +#'\ra{1.3} + +#+ +#'\newpage +#+ +#'## By Applicant + +#+ +#'### English + +kable(table.applicant.en, + format = "latex", + align = 'P{3cm}', + 
booktabs = TRUE, + longtable = TRUE, + col.names = c("Applicant", + "Documents", + "% Total", + "% Cumulative")) %>% kable_styling(latex_options = "repeat_header") + +#+ +#'\newpage +#+ +#'### French + +kable(table.applicant.fr, + format = "latex", + align = 'P{3cm}', + booktabs = TRUE, + longtable = TRUE, + col.names = c("Applicant", + "Documents", + "% Total", + "% Cumulative")) %>% kable_styling(latex_options = "repeat_header") + + +#+ +#'\newpage +#+ +#'## By Respondent + +#+ +#'### English + +kable(table.respondent.en, + format = "latex", + align = 'P{3cm}', + booktabs = TRUE, + longtable = TRUE, + col.names = c("Respondent", + "Documents", + "% Total", + "% Cumulative")) %>% kable_styling(latex_options = "repeat_header") + +#+ +#'\newpage +#+ +#'### French + +kable(table.respondent.fr, + format = "latex", + align = 'P{3cm}', + booktabs = TRUE, + longtable = TRUE, + col.names = c("Respondent", + "Documents", + "% Total", + "% Cumulative")) %>% kable_styling(latex_options = "repeat_header") + + + + + + + + + + + + +#'\newpage +#+ +#'# Verification of Cryptographic Signatures +#' This Codebook automatically verifies the SHA3-512 cryptographic signatures (\enquote{hashes}) of all ZIP archives during its compilation. SHA3-512 hashes are calculated via system call to the OpenSSL library on Linux systems. +#' +#' A successful check is indicated by \enquote{Signature verified!}. A failed check will print the line \enquote{ERROR!} + + + +#+ echo = TRUE + +# Function: Test SHA3-Hashes +sha3test <- function(filename, sig){ + sig.new <- system2("openssl", + paste("sha3-512", filename), + stdout = TRUE) + sig.new <- gsub("^.*\\= ", "", sig.new) + if (sig == sig.new){ + return("Signature verified!") + }else{ + return("ERROR!") + } +} + + +# Import Original Signatures +input <- fread(hashfile) +filename <- input$filename +sha3.512 <- input$sha3.512 + + +# Verify Signatures +sha3.512.result <- mcmapply(sha3test, filename, sha3.512, USE.NAMES = FALSE) + + +# Print Results +testresult <- data.table(filename, sha3.512.result) + +kable(testresult, + format = "latex", + align = c("l", "r"), + booktabs = TRUE, + col.names = c("File", + "Result")) + + + + + + +#+ +#'# Changelog +#' The Changelog documents changes made to the data set. Versions are named according to the day on which the data creation process began. 
+#' \vspace{1cm} + +#'\ra{1.5} +#'\begin{centering} +#'\begin{longtable}{p{3cm}p{11cm}} + +#'\toprule + +#'Version & Notes\\ + +#'\midrule + + +#' \version & Initial Release\\ + +#'\bottomrule + +#'\end{longtable} +#'\end{centering} + + + +#+ +#'# Strict Replication Parameters + +system2("openssl", "version", stdout = TRUE) + + +sessionInfo() + + + +#'\newpage +#+ +#'# References diff --git a/CD-ICJ_Source_Config.csv b/CD-ICJ_Source_Config.csv new file mode 100644 index 0000000..bf125aa --- /dev/null +++ b/CD-ICJ_Source_Config.csv @@ -0,0 +1,18 @@ +key,value +datatitle,Corpus of Decisions: International Court of Justice +datashort,CD-ICJ +doi.data.concept,10.5281/zenodo.3826444 +doi.data.version,10.5281/zenodo.3826445 +doi.software.concept,10.5281/zenodo.3977176 +doi.software.version,10.5281/zenodo.3977177 +license,Creative Commons Zero 1.0 Universal +caseno.begin,1 +caseno.end,181 +caseno.exclude,2 +mode.debug.toggle,FALSE +mode.debug.sample,3 +ocr.dpi,300 +plot.format,pdf png +plot.dpi,300 +fig.align,center +freq.var.ignore,date doc_id text diff --git a/CD-ICJ_Source_CorpusCreation.R b/CD-ICJ_Source_CorpusCreation.R new file mode 100644 index 0000000..0892542 --- /dev/null +++ b/CD-ICJ_Source_CorpusCreation.R @@ -0,0 +1,4762 @@ +#'--- +#'title: "Compilation Report | Corpus of Decisions: International Court of Justice (CD-ICJ)" +#'author: Anonymized for Peer Review +#'geometry: margin=3cm +#'papersize: a4 +#'fontsize: 11pt +#'output: +#' pdf_document: +#' keep_tex: true +#' toc: true +#' toc_depth: 3 +#' number_sections: true +#' pandoc_args: --listings +#' includes: +#' in_header: tex/CD-ICJ_Source_TEX_Preamble_EN.tex +#' before_body: [tex/CD-ICJ_Source_TEX_Author.tex,tex/CD-ICJ_Source_TEX_Definitions.tex,tex/CD-ICJ_Source_TEX_CompilationTitle.tex] +#'bibliography: packages.bib +#'nocite: '@*' +#'--- + + + +#'\newpage +#+ +#'# Introduction +#' +#+ +#'## Overview +#' This R script downloads and processes the full set of decisions and appended opinions rendered by the International Court of Justice (ICJ) as published on its website (https://www.icj-cij.org) into a rich and structured human- and machine-readable data set. It is the basis for the \textbf{\datatitle\ (\datashort )}. +#' +#' All data sets created with this script will always be hosted permanently open access and freely available at Zenodo, the scientific repository of CERN. Each version is uniquely identified with a persistent Digitial Object Identifier (DOI), the \emph{Version DOI}. 
The newest version of the data set will always be available via the link of the \emph{Concept DOI}: \dataconcepturldoi
+
+
+
+#+
+#'## Functionality
+#'
+#' This script will produce 21 ZIP archives:
+#'
+#'* 2 archives of CSV files containing the full machine-readable data set (English/French)
+#'* 2 archives of CSV files containing the full machine-readable metadata (English/French)
+#'* 2 archives of TXT files containing all machine-readable texts with a reduced set of metadata encoded in the filenames (English/French)
+#'* 2 archives of PDF files containing all human-readable texts with enhanced OCR (English/French)
+#'* 2 archives of PDF files containing all human-readable majority opinions with enhanced OCR (English/French)
+#'* 2 archives of PDF files of documents dated 2004 and earlier containing monolingual documents with enhanced OCR (English/French)
+#'* 2 archives of PDF files as originally published by the ICJ (English/French)
+#'* 2 archives of TXT files containing text as generated by Tesseract for documents dated 2004 or earlier (English/French)
+#'* 2 archives of TXT files containing extracted text from the original documents (English/French)
+#'* 1 archive of PDF files that were unlabelled on the website (intended for replication and review only)
+#'* 1 archive of analysis data and diagrams
+#'* 1 archive containing all source files
+#'
+#' The integrity and veracity of each ZIP archive is documented with cryptographically secure hash signatures (SHA2-256 and SHA3-512). Hashes are stored in a separate CSV file created during the data set compilation process.
+#'
+#' Please refer to the Codebook regarding the relative merits of each variant. Unless you have very specific needs you should only use the variants denoted \enquote{BEST} for serious work.
+#'
+
+
+#'\newpage
+#+
+#'## System Requirements
+#'
+#' You must have **R** and all **R packages** listed under the heading \enquote{Load Packages} installed.
+#'
+#' You must have the system dependencies **tesseract** and **imagemagick** (on Fedora Linux; names may differ on other Linux distributions) installed for the OCR pipeline to work.
+#'
+#' Due to the use of Fork Clusters and system commands the script as published will (probably) only run on Fedora Linux. The specific version of Fedora used is documented as part of the session information at the end of this script. With adjustments it may also work on other distributions.
+#'
+#' Parallelization will automatically be customized to your machine by detecting the maximum number of cores. A full run of this script takes approximately 11 hours on a machine with a Ryzen 3700X CPU using 16 threads, 64 GB DDR4 RAM and a fast SSD.
+#'
+#' You must have the **openssl** system library installed for signature generation. If you prefer not to generate signatures this part of the script can be removed without affecting other parts, but a missing signature CSV file will result in non-fatal errors during Codebook compilation.
+#'
+#'
+#' Optional code to compile a high-quality PDF report adhering to standards of strict reproducibility is included. This requires the R packages **rmarkdown** and **magick**, an installation of \LaTeX\ and all the packages specified in the TEX Preamble file.
+#'
+
+
+
+#+
+#'## Compilation
+#'
+#' All comments are in **roxygen2-style** markup for use with **spin()** or **render()** from the **rmarkdown** package. Compiling the scripts will produce the full data set, high-quality PDF reports and save all diagrams to disk.
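+#'
+#' As a sketch only: an individual script can also be rendered on its own with **render()**, which spins the roxygen2 comments and then knits the result to PDF. This assumes that all required input files are already present in the working directory; the full compilation routine described below is the recommended route.
+
+#+ eval = FALSE
+library(rmarkdown)
+render("CD-ICJ_Source_CorpusCreation.R")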
+#' +#' Both scripts can be executed as ordinary R scripts without any of the markdown and report generation elements. The Corpus creation script will also produce the full data set. No diagrams or reports will be saved to disk in this scenario. +#' +#' To compile the full data set, a Compilation Report and the Codebook, copy all files provided in the Source ZIP Archive into an empty (!) folder and run the following command in an R session: + +#+ eval = FALSE + +source("CD-ICJ_Source_FullCompilation.R") + + + + + +#'\newpage +#'# Preamble + +#+ +#'## Datestamp +#' This datestamp will be applied to all output files. It is set at the beginning of the script so it will be held constant for all output even if long runtime breaks the date barrier. + +datestamp <- Sys.Date() +print(datestamp) + + +#'## Date and Time (Begin) +begin.script <- Sys.time() +print(begin.script) + + +#+ +#'## Load Packages + +library(httr) # HTTP Tools +library(rvest) # Web Scraping +library(mgsub) # Vectorized Gsub +library(stringr) # String Manipulation +library(pdftools) # PDF utilities +library(fs) # File Operations +library(knitr) # Scientific Reporting +library(kableExtra) # Enhanced Knitr Tables +library(magick) # Required for cropping when compiling PDF +library(DiagrammeR) # Graph/Network Visualization +library(DiagrammeRsvg) # Export DiagrammeR Graphs as SVG +library(rsvg) # Render SVG to PDF +library(ggplot2) # Advanced Plotting +library(scales) # Rescaling of Plots +library(viridis) # Viridis Color Palette +library(RColorBrewer) # ColorBrewer Palette +library(readtext) # Read TXT Files +library(quanteda) # Advanced Text Analytics +library(quanteda.textstats) # Text Statistics Tools +library(quanteda.textplots) # Specialized Plots for Text Statistics +library(textcat) # Classify Text Language +library(data.table) # Advanced Data Handling +library(doParallel) # Parallelization + + + + +#'## Load Additional Functions +#' **Note:** Each custom function will be printed in full prior to its first use in order to enhance readability. All custom functions are prefixed with \enquote{f.} for clarity. + +source("functions/f.boxplot.body.R") +source("functions/f.boxplot.outliers.R") +source("functions/f.dopar.multihashes.R") +source("functions/f.dopar.pagenums.R") +source("functions/f.dopar.pdfextract.R") +source("functions/f.dopar.pdfocr.R") +source("functions/f.fast.freqtable.R") +source("functions/f.hyphen.remove.R") +source("functions/f.lingsummarize.iterator.R") +source("functions/f.linkextract.R") +source("functions/f.selectpdflinks.R") +source("functions/f.special.replace.R") +source("functions/f.token.processor.R") + + + +#'# Parameters + +#+ +#'## Read Configuration File +#' All configuration options are set in a separate configuration file that is read here. They should only be changed in that file! +#' +#' The configuration is read, printed, re-written to a temporary file and re-read to achieve transposition with correct column classes, something fread() cannot do directly. This procedure allows for a source CSV file that is easier to edit and easier to access within R. 
+ +config <- fread("CD-ICJ_Source_Config.csv") + +kable(config, + format = "latex", + align = c("p{5cm}", + "p{9cm}"), + booktabs = TRUE, + col.names = c("Key", + "Value")) + +temp <- transpose(config, + make.names = "key") + +fwrite(temp, + "temp.csv") + +config <- fread("temp.csv") + +unlink("temp.csv") + + + + +#+ +#'## Name of Data Set + +datashort <- config$datashort +print(datashort) + + +#'## DOI of Data Set Concept + +doi.concept <- config$doi.data.concept +print(doi.concept) + + +#'## DOI of Specific Version + +doi.version <- config$doi.data.version +print(doi.version) + + +#'## License +license <- config$license +print(license) + + + +#'## Output Directory +#' The directory name must include a terminating slash! +outputdir <- paste0(getwd(), + "/ANALYSIS/") + + +#'## Scope: Case Numbers +#' These variables define the scope of cases (by ordinal number) to be compiled into the data set. +#' +#' Case number 2 appears to be unassigned. There is no information available on the ICJ website. It is therefore always excluded. +#' +#' The variable for the final case number --- caseno.end --- must be set manually. + +caseno.begin <- config$caseno.begin +caseno.end <- config$caseno.end +caseno.exclude <- config$caseno.exclude + +print(caseno.begin) +print(caseno.end) +print(caseno.exclude) + + +#'## Debugging Mode +#' The debugging mode will reduce the number of documents compiled significantly. The full complement of cases takes approximately 11 hours to process with 16 threads on a Ryzen 3700X. The reduced complement captures a variety of cases with key characteristics that are useful in testing all features. Testing should always include cases 116 and 146 or an error will occur. +#' +#' In addition to the mandatory test cases debugging mode will draw two random samples of size *debug.sample*, one from older and one from more recent cases of the ICJ. + + +mode.debug.toggle <- config$mode.debug.toggle +mode.debug.sample <- config$mode.debug.sample + +print(mode.debug.toggle) +print(mode.debug.sample) + + + +#'## DPI for OCR +#' This is the resolution at which PDF files will be converted to TIFF during the OCR step. DPI values will significantly affect the quality of text ouput and file size. Higher DPI requires more RAM, means higher quality text and greater PDF file size. A value of 300 is recommended. + +ocr.dpi <- config$ocr.dpi +print(ocr.dpi) + + + + +#'## Frequency Tables: Ignored Variables + +#' This is a character vector of variable names that will be ignored in the construction of frequency tables. +#' +#' It is a good idea to add variables to this list that are unlikely to produce useful frequency tables. This is often the case for variables with a very large proportion of unique values. Use this option judiciously, as frequency tables are useful for detecting anomalies in the metadata. 
+ + +freq.var.ignore <- unlist(tstrsplit(config$freq.var.ignore, + split = " ")) + +print(freq.var.ignore) + + + + + + +#'## Knitr Options + +#+ +#'### Image Output File Formats + +plot.format <- unlist(tstrsplit(config$plot.format, + split = " ")) + +print(plot.format) + + +#'### DPI for Raster Graphics + +plot.dpi <- config$plot.dpi +print(plot.dpi) + + + +#'### Alignment of Diagrams in Report + +fig.align <- config$fig.align +print(fig.align) + + + +#'### Set Knitr Options +knitr::opts_chunk$set(fig.path = outputdir, + dev = plot.format, + dpi = plot.dpi, + fig.align = fig.align) + + + + + +#'## LaTeX Configuration + +#+ +#'### Construct LaTeX Definitions + +latexdefs <- c("%===========================\n% Definitions\n%===========================", + "\n% NOTE: This file was created automatically during the compilation process.\n", + "\n%-----Version-----", + paste0("\\newcommand{\\version}{", + datestamp, + "}"), + "\n%-----Titles-----", + paste0("\\newcommand{\\datatitle}{", + config$datatitle, + "}"), + paste0("\\newcommand{\\datashort}{", + config$datashort, + "}"), + paste0("\\newcommand{\\softwaretitle}{Source Code for the \\enquote{", + config$datatitle, + "}}"), + paste0("\\newcommand{\\softwareshort}{", + config$datashort, + "-Source}"), + "\n%-----Data DOIs-----", + paste0("\\newcommand{\\dataconceptdoi}{", + config$doi.data.concept, + "}"), + paste0("\\newcommand{\\dataversiondoi}{", + config$doi.data.version, + "}"), + paste0("\\newcommand{\\dataconcepturldoi}{https://doi.org/", + config$doi.data.concept, + "}"), + paste0("\\newcommand{\\dataversionurldoi}{https://doi.org/", + config$doi.data.version, + "}"), + "\n%-----Software DOIs-----", + paste0("\\newcommand{\\softwareconceptdoi}{", + config$doi.software.concept, + "}"), + paste0("\\newcommand{\\softwareversiondoi}{", + config$doi.software.version, + "}"), + + paste0("\\newcommand{\\softwareconcepturldoi}{https://doi.org/", + config$doi.software.concept, + "}"), + paste0("\\newcommand{\\softwareversionurldoi}{https://doi.org/", + config$doi.software.version, + "}")) + + + +#'\newpage +#'### Write LaTeX Definitions + +writeLines(latexdefs, + "tex/CD-ICJ_Source_TEX_Definitions.tex") + + + + +#'## Write Package Citations +write_bib(c(.packages()), + "packages.bib") + + + + + +#'# Parallelization +#' Parallelization is used for many tasks in this script, e.g. for accelerating the conversion from PDF to TXT, OCR, analysis with **quanteda** and with **data.table**. The maximum number of cores will automatically be detected and used. +#' +#' The download of decisions from the ICJ website is not parallelized to ensure respectful use of the Court's bandwidth. +#' +#' The use of **fork clusters** is significantly more efficient than PSOCK clusters, although it restricts use of this script to Linux systems. + +#+ +#'## Detect Number of Logical Cores +#' This will detect the maximum number of threads (= logical cores) available on the system. + +fullCores <- detectCores() +print(fullCores) + +#'## Set Number of OCR Control Cores +#' **Note:** Reduced number of control cores for OCR, as Tesseract calls up to four threads by itself. 
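+
+#' As a worked example: on the 16-thread machine mentioned in the Debugging Mode
+#' section this yields round(16 / 4) + 1 = 5 parallel OCR jobs, each of which may
+#' call up to four Tesseract threads, i.e. roughly 20 threads in the worst case.
+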
+ocrCores <- round((fullCores / 4)) + 1 +print(ocrCores) + +#'## Data.table +setDTthreads(threads = fullCores) + +#'## Quanteda +quanteda_options(threads = fullCores) + + + + + + +#'# Create Directories + +#+ +#'## Define Set of Data Directories + +dirset <- c("EN_PDF_ORIGINAL_FULL", + "FR_PDF_ORIGINAL_FULL", + "EN_PDF_ENHANCED_max2004", + "FR_PDF_ENHANCED_max2004", + "EN_PDF_BEST_FULL", + "FR_PDF_BEST_FULL", + "EN_PDF_BEST_MajorityOpinions", + "FR_PDF_BEST_MajorityOpinions", + "EN_TXT_BEST_FULL", + "FR_TXT_BEST_FULL", + "EN_TXT_TESSERACT_max2004", + "FR_TXT_TESSERACT_max2004", + "EN_TXT_EXTRACTED_FULL", + "FR_TXT_EXTRACTED_FULL") + + +#'## Create Data Directories + +for (dir in dirset){ + dir.create(dir) + } + + + +#'## Create Output Directory +dir.create(outputdir) + + + + +#'## Create Directory for Unlabelled Files + +dir.unlabelled <- paste(datashort, + datestamp, + "UnlabelledFiles", + sep = "_") + +dir.create(dir.unlabelled) + + + + + + + +#'# Visualize Corpus Creation Process + +#+ +#'## Workflow Part 1 + + +workflow1 <- " +digraph workflow { + + # a 'graph' statement + graph [layout = dot, overlap = false] + + # Legend + + subgraph cluster1{ + peripheries=1 + 9991 [label = 'Data Nodes', shape = 'ellipse', fontsize = 22] + 9992 [label = 'Action Nodes', shape = 'box', fontsize = 22] +} + + + # Data Nodes + + node[shape = 'ellipse', fontsize = 22] + + 100 [label = 'www.icj-cij.org'] + 101 [label = 'Links to Raw PDF Files'] + 102 [label = 'Unlabelled Files'] + 103 [label = 'Labelling Information'] + 104 [label = 'Labelled PDF Files'] + 105 [label = 'Handcoded Case Names'] + + 106 [label = 'EN_PDF_ORIGINAL_FULL'] + 107 [label = 'EN_TXT_EXTRACTED'] + 108 [label = 'EN_TXT_TESSERACT_max2004'] + 109 [label = 'EN_PDF_ENHANCED_Max2004'] + 110 [label = 'EN_TXT_BEST'] + 111 [label = 'EN_PDF_BEST_FULL'] + 112 [label = 'EN_PDF_BEST_MajorityOpinions'] + + 113 [label = 'FR_PDF_ORIGINAL_FULL'] + 114 [label = 'FR_TXT_EXTRACTED'] + 115 [label = 'FR_TXT_TESSERACT_max2004'] + 116 [label = 'FR_PDF_ENHANCED_Max2004'] + 117 [label = 'FR_TXT_BEST'] + 118 [label = 'FR_PDF_BEST_FULL'] + 119 [label = 'FR_PDF_BEST_MajorityOpinions'] + + + # Action Nodes + + node[shape = 'box', fontsize = 22] + + 200 [label = 'Extract Links from HTML'] + 201 [label = 'Detect Unlabelled Files'] + 202 [label = 'Download Unlabelled Files'] + 203 [label = 'Handcoding of Labels'] + 204 [label = 'Apply Labelling'] + 205 [label = 'Strict REGEX Validation: ICJ File Name Schema'] + 206 [label = 'Download Module'] + 207 [label = 'File Split Module'] + 208 [label = 'Filename Enhancement Module'] + 209 [label = 'Strict REGEX Validation: Codebook File Name Schema'] + 210 [label = 'Detect Missing Language Counterparts'] + 211 [label = 'Text Extraction Module'] + 212 [label = 'Tesseract OCR Module'] + 213 [label = 'Create Majority Variant'] + + + # Edge Statements + 100 -> 200 -> 101 -> 201 -> 202 -> 102 + 102 -> 203 -> 103 + {101, 103} -> 204 -> 205 -> 206 -> 104 -> 207 -> 208 -> 209 -> {106,113} -> 210 -> {211, 212} + 105 -> 208 + 211 -> {107, 114} + 212 -> {108, 109, 115, 116} + {107, 108} -> 110 + {106, 109} -> 111 + {114, 115} -> 117 + {113, 115} -> 118 + 111 -> 213 -> 112 + 118 -> 213 -> 119 + +} +" + + + +grViz(workflow1) %>% export_svg %>% charToRaw %>% rsvg_pdf("ANALYSIS/CD-ICJ_Workflow_1.pdf") +grViz(workflow1) %>% export_svg %>% charToRaw %>% rsvg_png("ANALYSIS/CD-ICJ_Workflow_1.png") + + + +#' \begin{sidewaysfigure} +#'\includegraphics{ANALYSIS/CD-ICJ_Workflow_1.pdf} +#' \caption{Workflow Part 1: Download, Labelling, Conversion and 
Sorting of Documents} +#' \end{sidewaysfigure} + + + + + + + + + + + + + + + +#+ +#'## Workflow Part 2 + + +workflow2 <- " +digraph workflow { + + # Graph statement + graph [layout = dot, overlap = false] + + # Data Nodes + + node[shape = 'ellipse', fontsize = 22] + + 100 [label = 'EN_TXT_BEST'] + 101 [label = 'FR_TXT_BEST'] + 102 [label = 'EN_TXT_EXTRACTED'] + 103 [label = 'FR_TXT_EXTRACTED'] + + 104 [label = 'EN_CSV_BEST_FULL'] + 105 [label = 'FR_CSV_BEST_FULL'] + 106 [label = 'EN_CSV_BEST_META'] + 107 [label = 'FR_CSV_BEST_META'] + + 108 [label = 'ANALYSIS'] + 109 [label = 'Frequency Tables'] + + + # Action Nodes + + node[shape = 'box', fontsize = 22] + + 200 [label = 'OCR Quality Control Module'] + 201 [label = 'Clean Texts'] + 202 [label = 'Language Purity Module'] + 203 [label = 'Add Metadata'] + 204 [label = 'Calculate Frequency Tables'] + 205 [label = 'Visualize Frequency Tables'] + 206 [label = 'Calculate and Add Summary Statistics'] + 207 [label = 'Calculate Token Frequencies'] + 208 [label = 'Calculate Document Similarity'] + 209 [label = 'Write CSV Files'] + + + # Edge Statements + + {100, 101, 102, 103} -> 200 + {100, 101} -> 201 -> 202 -> 203 + 203 -> 204 -> 109 -> 205 + 203 -> 206 -> 209 + 203 -> {207, 208} + {109, 204, 205, 206, 207, 208} -> 108 + 209 -> {104, 105, 106, 107} + + +} +" + +grViz(workflow2) %>% export_svg %>% charToRaw %>% rsvg_pdf("ANALYSIS/CD-ICJ_Workflow_2.pdf") +grViz(workflow2) %>% export_svg %>% charToRaw %>% rsvg_png("ANALYSIS/CD-ICJ_Workflow_2.png") + + +#' \begin{sidewaysfigure} +#'\includegraphics{ANALYSIS/CD-ICJ_Workflow_2.pdf} +#' \caption{Workflow Part 2: Ingestion, Pre-Processing, Analysis and Creation of CSV Files} +#' \end{sidewaysfigure} + + + + + +#+ +#'# Prepare Download + +#+ +#'## Define Download Scope + +caseno.full <- setdiff(caseno.begin:caseno.end, + caseno.exclude) + + +#'## Debugging Mode --- Reduced Scope + +if(mode.debug.toggle == TRUE){ + caseno.full <- c(sample(3:41, + mode.debug.sample), + 116, + 146, + 152, + sample(153:caseno.end, + mode.debug.sample)) + caseno.full <- sort(caseno.full) + } + + + +#'## Show Function: f.linkextract +print(f.linkextract) + +#'## Show Function: f.selectpdflinks +print(f.selectpdflinks) + + +#'## Prepare Empty Link List +links.list <- vector("list", + caseno.end) + + +#'## Acquire Download Links + +for (caseno in caseno.full) { + + URL.JUD <- sprintf("https://www.icj-cij.org/en/case/%d/judgments", + caseno) + + volatile <- f.linkextract(URL.JUD) + links.jud <- f.selectpdflinks(volatile) + + + URL.ORD <- sprintf("https://www.icj-cij.org/en/case/%d/orders", + caseno) + + volatile <- f.linkextract(URL.ORD) + links.ord <- f.selectpdflinks(volatile) + + + URL.ADV <- sprintf("https://www.icj-cij.org/en/case/%d/advisory-opinions", + caseno) + + volatile <- f.linkextract(URL.ADV) + links.adv <- f.selectpdflinks(volatile) + + + links.list[[caseno]] <- c(links.jud, + links.ord, + links.adv) + print(caseno) + + Sys.sleep(runif(1, 0.5, 1.5)) + +} + + +#'## Clean Links + +links <- unlist(links.list) + +links.unique <- unique(links) + +links.download <- paste0("https://www.icj-cij.org", + links.unique) + + + +#'## Remove Specific Links +#' **Note 1:** All files related to the advisory opinion in Case 146 are bilingual, even the supposedly monolingual variants. This removes the monolingual variants without replacement. True monolingual variants will be generated via splitting the bilingual variants at a later stage. 
+#' +#' **Note 2:** The French files for cases 89, 125 and 156 are in fact mislabelled English variants. No French variants of the document are available on the website and even the bilingual variants are in fact entirely in English. + +f1 <- "(089-19990629-ORD-01-00-FR)" +f2 <- "(125-20040709-ORD-01-00-FR)" +f3 <- "(146-20120201-ADV-01-00)" +f4 <- "(156-20150422-ORD-01-01-FR)" + +links.download <- grep(paste(f1, f2, f3, f4, sep = "|"), + links.download, + invert = TRUE, + value = TRUE) + + + +#'## Add Specific Links +#' All files related to the advisory opinion in Case 146 are bilingual, even the supposedly monolingual variants. This adds the official bilingual advisory opinion and adds the bilingual appended opinions which were not included in the original link list. These files will be split into monolingual variants at a later stage of the script. + + +links.download <- c(links.download, + "https://www.icj-cij.org/public/files/case-related/146/146-20120201-ADV-01-00-BI.pdf", + "https://www.icj-cij.org/public/files/case-related/146/146-20120201-ADV-01-01-BI.pdf", + "https://www.icj-cij.org/public/files/case-related/146/146-20120201-ADV-01-02-BI.pdf") + + + + + + +#'# Labelling Module +#' Almost two dozen ICJ documents are unlabelled, i.e. they are provided with a computer-generated number only. Their filenames encode no semantic information. This module corrects the filenames and applies the standard naming scheme employed by the ICJ. + +#+ +#'## List Unlabelled Files + +unlabelled.temp <- grep("EN|FR|BI", + links.unique, + invert = TRUE, + value = TRUE) + +unlabelled.out <- data.table(sort(unlabelled.temp), + sort(unlabelled.temp)) + +print(unlabelled.temp) + + +#'## Write to Disk + +fwrite(unlabelled.out, + paste0(dir.unlabelled, + "/", + datashort, + "_", + datestamp, + "_", + "UnlabelledFiles.csv")) + + + +#'## Download Unlabelled Files +#' This is to prepare manual inspection and coding of unlabelled files. + +#+ +#'### Prepare + +unlabelled.download.url <- paste0("https://www.icj-cij.org", + unlabelled.temp) + +unlabelled.download.name <- gsub("\\/", "\\_", + unlabelled.temp) + +unlabelled.download.name <- sub("\\_", "", + unlabelled.download.name) + +dt <- data.table(unlabelled.download.url, + unlabelled.download.name) + + +#'### Number of Unlabelled Files to Download +dt[,.N] + + +#'### Timestamp (Unlabelled Download Begin) + +begin.download <- Sys.time() +print(begin.download) + + +#'### Execute Download +#' **Note:** There is no download retry for this section, as these files are always inspected manually. 
+ +for (i in sample(dt[,.N])){ + download.file(dt$unlabelled.download.url[i], + dt$unlabelled.download.name[i]) + Sys.sleep(runif(1, 0.5, 1.5)) + } + + +#'### Timestamp (Unlabelled Download End) + +end.download <- Sys.time() +print(end.download) + +#'### Duration (Download) + +end.download - begin.download + + + + + +#'## Download Result + +#+ +#'### Number of Files to Download +download.expected.N <- dt[,.N] +print(download.expected.N) + +#'### Number of Files Successfully Downloaded +files.pdf <- list.files(pattern = "\\.pdf", + ignore.case = TRUE) + +download.success.N <- length(files.pdf) +print(download.success.N) + +#'### Number of Missing Files +missing.N <- download.expected.N - download.success.N +print(missing.N) + +#'### Names of Missing Files +missing.names <- setdiff(dt$unlabelled.download.name, + files.pdf) +print(missing.names) + + + +#'## Store Unlabelled Files +file_move(files.pdf, + dir.unlabelled) + + + + +#'## Manual Coding + +#+ +######################################### +### HANDCODING OF UNLABELLED FILES +########################################## + + + +#'## Read in Corrected Labels + +unlabelled.in <- fread("data/CD-ICJ_Source_UnlabelledFilesHandcoded.csv", + header = TRUE) + + +#'## Apply Correct Labels to Link List + +links.corrected <- mgsub(links.download, + unlabelled.in$old, + unlabelled.in$new) + + +#'## REGEX VALIDATION 1: Strictly Validate Links against ICJ Naming Scheme +#' Test strict compliance of proposed download names with naming scheme used by ICJ. The result of a successful test should be an empty character vector! + +#+ +#'### Execute Validation + +regex.test1 <- grep(paste0("^[0-9]{3}", # var: caseno + "-", + "[0-9]{8}", # var: date + "-", + "(JUD|ADV|ORD)", # var: doctype + "-", + "[0-9]{2}", # var: collision + "-", + "[0-9]{2}", # var: opinion + "-", + "(EN|FR|BI)", # var: language + ".pdf$"), # file extension, + basename(links.corrected), + invert = TRUE, + value = TRUE) + + + + + +#'### Results of Validation +print(regex.test1) + +#'### Stop Script on Failure +if (length(regex.test1) != 0){ + stop("REGEX VALIDATION 1 FAILED: LINKS NOT IN COMPLIANCE WITH ICJ SCHEMA!") + } + + + +#'## Detect Duplicate Filenames +links.corrected[duplicated(links.corrected)] + + +#'## Detect Missing Counterparts for each Language Version + +linknames.en <- grep("EN.pdf", + links.corrected, + value=TRUE) + +linknames.fr <- grep("FR.pdf", + links.corrected, + value=TRUE) + + +#'## Difference in Number of Files +length(linknames.en) - length(linknames.fr) + + +#'## Show Missing French Documents +linknames.fr.temp <- gsub("FR", + "EN", + linknames.fr) + +frenchmissing <- setdiff(linknames.en, + linknames.fr.temp) + +frenchmissing <- gsub("EN", + "FR", + frenchmissing) + +print(frenchmissing) + + +#'## Show Missing English Documents +linknames.en.temp <- gsub("EN", + "FR", + linknames.en) + +englishmissing <- setdiff(linknames.fr, + linknames.en.temp) + +englishmissing <- gsub("FR", + "EN", + englishmissing) + +print(englishmissing) + + + + + + + + +#'# Download Module + +#+ +#'## Prepare Download Table + +dt <- data.table(links.download, + basename(links.corrected)) + +setnames(dt, + new = c("links.download", + "names.download")) + + +#'## Timestamp (Download Begin) +begin.download <- Sys.time() +print(begin.download) + + +#'## Execute Download (All Files) + +for (i in sample(dt[,.N])){ + + download.file(dt$links.download[i], + dt$names.download[i]) + + Sys.sleep(runif(1, 0.5, 1.5)) + +} + + +#'## Timestamp (Download End) +end.download <- Sys.time() 
+print(end.download) + + +#'## Duration (Download) +end.download - begin.download + + + + +#'## Debugging Mode --- Delete Random Files +#' This section deletes random files to test the result calculations and retry mode. + +if (mode.debug.toggle == TRUE){ + + files.pdf <- list.files(pattern = "\\.pdf") + + unlink(sample(files.pdf, 5)) + +} + + + + + + +#'## Download Result + +#+ +#'### Number of Files to Download +download.expected.N <- dt[,.N] +print(download.expected.N) + +#'### Number of Files Successfully Downloaded +files.pdf <- list.files(pattern = "\\.pdf", + ignore.case = TRUE) + +download.success.N <- length(files.pdf) +print(download.success.N) + +#'### Number of Missing Files +missing.N <- download.expected.N - download.success.N +print(missing.N) + +#'### Names of Missing Files +missing.names <- setdiff(dt$names.download, + files.pdf) +print(missing.names) + + + + +#'## Timestamp (Retry Download Begin) +begin.download <- Sys.time() +print(begin.download) + + +#'## Retry Download + +if(missing.N > 0){ + + dt.retry <- dt[names.download %in% missing.names] + + for (i in 1:dt.retry[,.N]){ + + response <- GET(dt.retry$links.download[i]) + + Sys.sleep(runif(1, 0.25, 0.75)) + + if (response$headers$"content-type" == "application/pdf" & response$status_code == 200){ + tryCatch({download.file(url = dt.retry$links.download[i], destfile = dt.retry$names.download[i]) + }, + error=function(cond) { + return(NA)} + ) + }else{ + print(paste0(dt.retry$names.download[i], " : no PDF available")) + } + + Sys.sleep(runif(1, 0.5, 1.5)) + } +} + + +#'## Timestamp (Retry Download End) +end.download <- Sys.time() +print(end.download) + +#'## Duration (Retry Download) +end.download - begin.download + + + +#'## Retry Result + +files.pdf <- list.files(pattern = "\\.pdf", + ignore.case = TRUE) + + +#'### Successful during Retry + +retry.success.names <- files.pdf[files.pdf %in% missing.names] +print(retry.success.names) + +#'### Missing after Retry + +retry.missing.names <- setdiff(retry.success.names, + missing.names) +print(retry.missing.names) + + + +#'## Final Download Result + +#+ +#'### Number of Files to Download +download.expected.N <- dt[,.N] +print(download.expected.N) + +#'### Number of Files Successfully Downloaded +files.pdf <- list.files(pattern = "\\.pdf", + ignore.case = TRUE) + +download.success.N <- length(files.pdf) +print(download.success.N) + +#'### Number of Missing Files +missing.N <- download.expected.N - download.success.N +print(missing.N) + +#'### Names of Missing Files +missing.names <- setdiff(dt$names.download, + files.pdf) +print(missing.names) + + + + +#'# File Split Module + +#'## Armed Activities Order +#' Note: this file contains the correct French document, but also an appended opinion in English, which is already correctly located in another file. Therefore the appended opinion is simply removed from the file. + +filename <- "116-20161206-ORD-01-00-FR.pdf" + +file.temp <- paste0(filename, + "-temp") + +file.rename(filename, file.temp) + +pdf_subset(file.temp, 1:5, filename) + +unlink(file.temp) + + + + +#'## Case 146 +#' **Note:** The files for the Advisory Opinion and appended opinions of Case 146 are all bilingual, including the supposedly monolingual versions. These need to be split into their component language versions. English is assumed to be on even pages for the majority opinion and on odd pages for the appended opinions. Both processes are looped in case further documents in need of splitting are discovered. 
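+
+#' The page arithmetic below can be read as a simple even/odd split. A simplified
+#' sketch of the same logic (pdf_length() is the pdftools function already used
+#' in this script):
+
+#+ eval = FALSE
+n.pages    <- pdf_length("146-20120201-ADV-01-00-BI.pdf")
+pages.even <- seq(2, n.pages, by = 2)   # English pages of the majority opinion
+pages.odd  <- seq(1, n.pages, by = 2)   # French pages of the majority opinion
+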
+ + +#+ +#'### English on Even Pages + +even.english <- c("146-20120201-ADV-01-00-BI.pdf") + +for (file in even.english){ + temp1 <- seq(1, pdf_length(file), 1) + + even <- temp1[lapply(seq(1, max(temp1), 1), "%%", 2) == 0] + even.name <- gsub("BI\\.pdf", + "EN\\.pdf", + file) + pdf_subset(file, + pages = even, + output = even.name) + + odd <- temp1[lapply(seq(1, max(temp1), 1), "%%", 2) != 0] + odd.name <- gsub("BI\\.pdf", + "FR\\.pdf", + file) + pdf_subset(file, + pages = odd, + output = odd.name) +} + + +#'### English on Odd Pages + +odd.english <- c("146-20120201-ADV-01-01-BI.pdf", + "146-20120201-ADV-01-02-BI.pdf") + +for (file in odd.english){ + temp1 <- seq(1, pdf_length(file), 1) + + even <- temp1[lapply(seq(1, max(temp1), 1), "%%", 2) == 0] + even.name <- gsub("BI\\.pdf", + "FR\\.pdf", + file) + pdf_subset(file, + pages = even, + output = even.name) + + odd <- temp1[lapply(seq(1, max(temp1), 1), "%%", 2) != 0] + odd.name <- gsub("BI\\.pdf", + "EN\\.pdf", + file) + pdf_subset(file, + pages = odd, + output = odd.name) +} + + +#'### Delete Bilingual Files +unlink(even.english) +unlink(odd.english) + + + + + + + +#'# Filename Enhancement Module +#' This module applies a number of enhancements to the filenames: +#' \begin{itemize} +#' \item Better separators +#' \item Case names +#' \item Applicant ISO codes +#' \item Respondent ISO codes +#' \item Stage of proceedings +#' \end{itemize} + + +filenames.original <- list.files(pattern = "\\.pdf") + +#'## Enhance Syntax + +filenames.enhanced1 <- gsub(paste0("([0-9]{3})", # var: caseno + "-", + "([0-9]{4})([0-9]{2})([0-9]{2})", # var: date + "-", + "([A-Z]{3})", # var: doctype + "-", + "([0-9]{2})", # var: collision + "-", + "([0-9]{2})", # var: opinion + "-", + "([A-Z]{2})"), # var: language + "\\1_\\2-\\3-\\4_\\5_\\6_\\7_\\8", + filenames.original) + + + + + +#+ +#'## Manual Coding + + +########## HAND CODING #################### +### - CASENAMES +### - Applicant Codes +### - Respondent Codes +### - Stage of Proceedings +############################################ + + + +#+ +#'## Read Hand Coded Data +casenames <- fread("data/CD-ICJ_Source_CaseNames.csv", + header = TRUE) + + +#'## Add Hand Coded Data to Filenames +#' Case names, Applicant codes and Respondent codes have been hand coded and are added in this step. + +caseno.pad <- formatC(casenames$caseno, + width = 3, + flag = "0") + +case.header <- paste0("ICJ_", + caseno.pad, + "_", + casenames$casename_short, + "_") + +filenames.enhanced2 <- mgsub(filenames.enhanced1, + paste0("^", + caseno.pad, + "\\_"), + case.header) + + + +#'## Add Stage of Proceedings + +stage <- fread("data/CD-ICJ_Source_Stages_Filenames.csv") + + + +files <- list.files("CD-ICJ_2021-07-12_EN_TXT_BEST_FULL/EN_TXT_BEST_FULL/") + +filenames.enhanced3 <- mgsub(filenames.enhanced2, + stage$old, + stage$new) + +filenames.enhanced3 <- gsub("([0-9]{4}-[0-9]{2}-[0-9]{2}_[A-Z]{3}_[0-9]{2})(_[0-9]{2})", + "\\1_NA\\2", + filenames.enhanced3) + + + + + +#'\newpage +#'## REGEX VALIDATION 2: Strictly Validate Naming Scheme against Codebook Schema +#' Test strict compliance with variable types described in Codebook. The result should be an empty character vector! 
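+
+#' For orientation, a compliant file name follows the pattern
+#' `court_caseno_shortname_applicant_respondent_date_doctype_collision_stage_opinion_language`,
+#' for example (hypothetical entry; short name, collision and stage codes are
+#' invented for illustration):
+#'
+#' `ICJ_118_Croatia-Genocide_HRV_SRB_2015-02-03_JUD_01_ME_00_EN.pdf`
+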
+ + +#+ +#'### Execute Validation + +regex.test2 <- grep(paste0("^ICJ", # var: court + "_", + "[0-9]{3}", # var: caseno + "_", + "[A-Za-z0-9\\-]*", # var: shortname + "_", + "[A-Z\\-]*", # var: applicant + "_", + "[A-Z\\-]*", # var: respondent + "_", + "[0-9]{4}-[0-9]{2}-[0-9]{2}", # var: date + "_", + "(JUD|ADV|ORD)", # var: doctype + "_", + "[0-9]{2}", # var: collision + "_", + "(NA|PO|ME|IN|CO)", # var: stage + "_", + "[0-9]{2}", # var: opinion + "_", + "(EN|FR)", # var: language + ".pdf$"), # file extension + filenames.enhanced3, + value = TRUE, + invert = TRUE) + + + +#'### Results of Validation +print(regex.test2) + + +#'### Stop Script on Failure + +if (length(regex.test2) != 0){ + stop("REGEX VALIDATION 2 FAILED: FILE NAMES NOT IN COMPLIANCE WITH CODEBOOK SCHEMA!") + } + + + + +#'## Execute Rename +#+ results = "hide" + +file.rename(filenames.original, + filenames.enhanced3) + + + + +#'# Detect Missing Counterparts for each Language Variant + +files.en <- list.files(pattern = "EN\\.pdf") +files.fr <- list.files(pattern = "FR\\.pdf") + + +#'## Difference between French and English File Lists +abs(length(files.en) - length(files.fr)) + + +#'## Show Missing French Documents +files.fr.temp <- gsub("FR\\.pdf", + "EN\\.pdf", + files.fr) + +frenchmissing <- setdiff(files.en, + files.fr.temp) + +frenchmissing <- gsub("EN\\.pdf", + "FR\\.pdf", + frenchmissing) + +print(frenchmissing) + + +#'## Show Missing English Documents +files.en.temp <- gsub("EN\\.pdf", + "FR\\.pdf", + files.en) + +englishmissing <- setdiff(files.fr, + files.en.temp) + +englishmissing <- gsub("FR\\.pdf", + "EN\\.pdf", + englishmissing) + +print(englishmissing) + + + + + + + + +#'# Text Extraction Module + + +#'## Define Set of Files to Process +files.pdf <- list.files(pattern = "\\.pdf$", + ignore.case = TRUE) + + +#'## Number of Files to Process +length(files.pdf) + + +#'## Show Function: f.dopar.pagenums +#+ results = "asis" +print(f.dopar.pagenums) + + +#'## Count Pages +f.dopar.pagenums(files.pdf, + sum = TRUE, + threads = fullCores) + + +#'## Show Function: f.dopar.pdfextract +#+ results = "asis" +print(f.dopar.pdfextract) + + +#'## Extract Text +result <- f.dopar.pdfextract(files.pdf, + threads = fullCores) + + + + + +#'## Copy and Move EXTRACTED TXT Files +#' This step copies all extracted TXT files from 2005 and later, which are assumed to be born-digital, to the BEST variant TXT folder. It further moves all TXT files to the "EXTRACTED" folder. + +#+ results = "hide" +txt.best.en <- list.files(pattern = "_(200[5-9]|201[0-9]|202[0-5])-.*EN\\.txt") +txt.best.fr <- list.files(pattern = "_(200[5-9]|201[0-9]|202[0-5])-.*FR\\.txt") + +file_copy(txt.best.en, + "EN_TXT_BEST_FULL") +file_copy(txt.best.fr, + "FR_TXT_BEST_FULL") + +txt.extracted.en <- list.files(pattern = "EN\\.txt") +txt.extracted.fr <- list.files(pattern = "FR\\.txt") + +file_move(txt.extracted.en, + "EN_TXT_EXTRACTED_FULL") +file_move(txt.extracted.fr, + "FR_TXT_EXTRACTED_FULL") + + + + + + + +#'# Tesseract OCR Module + +#+ +#'## Mark Files for OCR +#' Only files which were published in 2004 or earlier are marked for optical character recognition (OCR) processing. Files from 2005 onwards are assumed to be born-digital and of perfect quality. 
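+
+#' The year filter used below, `_(19[4-8][0-9]|199[0-9]|200[0-4])-`, matches
+#' decision dates from 1940 to 2004 as they appear in the enhanced file names.
+#' A small sanity check (illustrative file names only):
+
+#+ eval = FALSE
+grepl("_(19[4-8][0-9]|199[0-9]|200[0-4])-",
+      c("ICJ_001_X_Y_Z_1948-03-25_JUD_01_PO_00_EN.pdf",    # TRUE: pre-2005, needs OCR
+        "ICJ_001_X_Y_Z_2012-01-01_JUD_01_ME_00_EN.pdf"))   # FALSE: born-digital
+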
+ +#+ results = "hide" +files.pdf.en <- list.files(pattern = "EN\\.pdf") +files.pdf.fr <- list.files(pattern = "FR\\.pdf") + +files.ocr.en <- list.files(pattern = "_(19[4-8][0-9]|199[0-9]|200[0-4])-.*EN\\.pdf") +files.ocr.fr <- list.files(pattern = "_(19[4-8][0-9]|199[0-9]|200[0-4])-.*FR\\.pdf") + + +#'## Copy and Move Born-Digital Files + +files.pdf.best.en <- setdiff(files.pdf.en, + files.ocr.en) + +files.pdf.best.fr <- setdiff(files.pdf.fr, + files.ocr.fr) + +file_copy(files.pdf.best.en, + "EN_PDF_BEST_FULL") +file_copy(files.pdf.best.fr, + "FR_PDF_BEST_FULL") + + +file_move(files.pdf.best.en, + "EN_PDF_ORIGINAL_FULL") +file_move(files.pdf.best.fr, + "FR_PDF_ORIGINAL_FULL") + + + + +#'## Show Function: f.dopar.pdfocr +#+ results = "asis" +print(f.dopar.pdfocr) + + +#'## English + +#+ +#'### Number of English Documents to Process +length(files.ocr.en) + +#'### Number of English Pages to Process +f.dopar.pagenums(files.ocr.en, + sum = TRUE, + threads = fullCores) + + + +#'### Run OCR on English Documents +#' **Note:** Training data is set to include both English and French. Lengthy quotations in a non-dominant language are common in international law. Order in language setting matters and for English documents "eng" is set as the primary training data. + +result <- f.dopar.pdfocr(files.ocr.en, + dpi = ocr.dpi, + lang = "eng+fra", + output = "pdf txt", + jobs = ocrCores) + + + + +#'## French + + +#+ +#'### Number of French Documents to Process +length(files.ocr.fr) + + + +#'### Number of French Pages to Process +f.dopar.pagenums(files.ocr.fr, + sum = TRUE, + threads = fullCores) + + + +#'### Run OCR on French Documents +#' **Note:** Training data is set to include both French and English. Lengthy quotations in a non-dominant language are common in international law. Order in language setting matters and for French documents "fra" is set as the primary training data. 
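+
+#' Both OCR passes assume that the English and French Tesseract language packs
+#' ("eng", "fra") are installed. Assuming Tesseract is available as a system
+#' binary, the installed packs can be listed as a quick check:
+
+#+ eval = FALSE
+system("tesseract --list-langs")
+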
+ +result <- f.dopar.pdfocr(files.ocr.fr, + dpi = ocr.dpi, + lang = "fra+eng", + output = "pdf txt", + jobs = ocrCores) + + + + + + +#'## Rename Files + +#+ results = "hide" +files.pdf <- list.files(pattern = "\\.pdf$") + +files.pdf.enhanced <- gsub("_TESSERACT.pdf", + "_ENHANCED.pdf", + files.pdf) + +file.rename(files.pdf, + files.pdf.enhanced) + + +#+ results = "hide" +files.txt <- list.files(pattern = "\\.txt$") + +files.txt.new <- gsub("_TESSERACT.txt", + ".txt", + files.txt) + +file.rename(files.txt, + files.txt.new) + + + + +#'## Copy and Move TXT Files + +files.ocr.txt.en <- list.files(pattern = "EN\\.txt") +files.ocr.txt.fr <- list.files(pattern = "FR\\.txt") + +file_copy(files.ocr.txt.en, + "EN_TXT_BEST_FULL") +file_copy(files.ocr.txt.fr, + "FR_TXT_BEST_FULL") + +file_move(files.ocr.txt.en, + "EN_TXT_TESSERACT_max2004") +file_move(files.ocr.txt.fr, + "FR_TXT_TESSERACT_max2004") + + + + +#'## Copy and Move PDF Files + +files.ocr.pdf.enhanced.en <- list.files(pattern = "EN_ENHANCED\\.pdf") +files.ocr.pdf.enhanced.fr <- list.files(pattern = "FR_ENHANCED\\.pdf") + +files.ocr.pdf.original.en <- list.files(pattern = "EN\\.pdf") +files.ocr.pdf.original.fr <- list.files(pattern = "FR\\.pdf") + + +file_copy(files.ocr.pdf.enhanced.en, + "EN_PDF_BEST_FULL") +file_copy(files.ocr.pdf.enhanced.fr, + "FR_PDF_BEST_FULL") + +file_move(files.ocr.pdf.enhanced.en, + "EN_PDF_ENHANCED_max2004") +file_move(files.ocr.pdf.enhanced.fr, + "FR_PDF_ENHANCED_max2004") + +file_move(files.ocr.pdf.original.en, + "EN_PDF_ORIGINAL_FULL") +file_move(files.ocr.pdf.original.fr, + "FR_PDF_ORIGINAL_FULL") + + + + + + + +#'# Create Majority-Only Variant + +majonly.en <- list.files("EN_PDF_BEST_FULL", + full.names = TRUE, + pattern = "00_EN") + +majonly.fr <- list.files("FR_PDF_BEST_FULL", + full.names = TRUE, + pattern = "00_FR") + +file_copy(majonly.en, + "EN_PDF_BEST_MajorityOpinions") +file_copy(majonly.fr, + "FR_PDF_BEST_MajorityOpinions") + + + + + + + +#'# Read in TXT Files + +#'## Define Variable Names + +names.variables <- c("court", + "caseno", + "shortname", + "applicant", + "respondent", + "date", + "doctype", + "collision", + "stage", + "opinion", + "language") + + + +#'## BEST Variants + +#'### English + +data.best.en <- readtext("EN_TXT_BEST_FULL/*.txt", + docvarsfrom = "filenames", + docvarnames = names.variables, + dvsep = "_", + encoding = "UTF-8") + +#'### French + +data.best.fr <- readtext("FR_TXT_BEST_FULL/*.txt", + docvarsfrom = "filenames", + docvarnames = names.variables, + dvsep = "_", + encoding = "UTF-8") + + +#'## EXTRACTED Variants + +#'### English + +data.extracted.en <- readtext("EN_TXT_EXTRACTED_FULL/*.txt", + docvarsfrom = "filenames", + docvarnames = names.variables, + dvsep = "_", + encoding = "UTF-8") + + + +#'### French + +data.extracted.fr <- readtext("FR_TXT_EXTRACTED_FULL/*.txt", + docvarsfrom = "filenames", + docvarnames = names.variables, + dvsep = "_", + encoding = "UTF-8") + + +#'## Convert to Data Tables + +setDT(data.best.en) +setDT(data.best.fr) +setDT(data.extracted.en) +setDT(data.extracted.fr) + + + +#'# Clean Texts + + +#+ +#'## Remove Hyphenation across Linebreaks +#' Hyphenation across linebreaks is a serious issue in longer texts. Hyphenated words are often not recognized as a single token by standard tokenization. The result is two unique and non-expressive tokens instead of a single, expressive token. This section removes these hyphenations. 
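+
+#' For instance, a word broken as \enquote{inter-} at the end of one line and
+#' \enquote{national} at the start of the next would otherwise be tokenized as
+#' two fragments instead of the single token \enquote{international}. A
+#' simplified stand-in for the logic (the actual implementation is
+#' f.hyphen.remove, shown below):
+
+#+ eval = FALSE
+gsub("-\\s*\\n\\s*", "", "inter-\nnational")   # returns "international"
+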
+ + +#+ +#'### Show Function: f.hyphen.remove + +print(f.hyphen.remove) + +#'### Execute Function + +data.best.en[, text := lapply(.(text), f.hyphen.remove)] +data.best.fr[, text := lapply(.(text), f.hyphen.remove)] + +data.extracted.en[, text := lapply(.(text), f.hyphen.remove)] +data.extracted.fr[, text := lapply(.(text), f.hyphen.remove)] + + +#'## Replace Special Characters +#' This section replaces special characters with their closest equivalents in the Latin alphabet, as some R functions have difficulties processing the originals. These characters usually occur due to OCR mistakes. + +#+ +#'### Show Function: f.special.replace + +print(f.special.replace) + +#'### Execute Function + +data.best.en[, text := lapply(.(text), f.special.replace)] +data.best.fr[, text := lapply(.(text), f.special.replace)] + +data.extracted.en[, text := lapply(.(text), f.special.replace)] +data.extracted.fr[, text := lapply(.(text), f.special.replace)] + + + + + + + +#'# OCR Quality Control Module +#' This module measures the quality of the new Tesseract-generated OCR text against the OCR text provided by the ICJ, which was extracted from the original documents. +#' +#' Only documents from 2004 or earlier will be compared. This provides a more accurate measurement of the relative quality of the different OCR processes than if born-digital documents were to be included. + + +#+ +#'## Create Corpora + +corpus.en.b <- corpus(data.best.en) +corpus.en.e <- corpus(data.extracted.en) + +corpus.fr.b <- corpus(data.best.fr) +corpus.fr.e <- corpus(data.extracted.fr) + + +#'## Subset to 2004 and earlier + +corpus.en.b.2004 <- corpus_subset(corpus.en.b, date < 2005) +corpus.en.e.2004 <- corpus_subset(corpus.en.e, date < 2005) + +corpus.fr.b.2004 <- corpus_subset(corpus.fr.b, date < 2005) +corpus.fr.e.2004 <- corpus_subset(corpus.fr.e, date < 2005) + + +#'## Show Function: f.token.processor + +print(f.token.processor) + +#'## Tokenize + +quanteda_options(tokens_locale = "en") # Set Locale for Tokenization + +tokens.en.b.2004 <- f.token.processor(corpus.en.b.2004) +tokens.en.e.2004 <- f.token.processor(corpus.en.e.2004) + +quanteda_options(tokens_locale = "fr") # Set Locale for Tokenization + +tokens.fr.b.2004 <- f.token.processor(corpus.fr.b.2004) +tokens.fr.e.2004 <- f.token.processor(corpus.fr.e.2004) + + +#'## Create Document-Feature-Matrices + +dfm.en.b.2004 <- dfm(tokens.en.b.2004) +dfm.en.e.2004 <- dfm(tokens.en.e.2004) + +dfm.fr.b.2004 <- dfm(tokens.fr.b.2004) +dfm.fr.e.2004 <- dfm(tokens.fr.e.2004) + + + +#'## Features Reduction +#' **Note:** This is the number of features which have been saved by using advanced OCR in comparison to the OCR used by the ICJ. + + +feat.languages <- c("English", + "French") + +feat.extracted <- c(nfeat(dfm.en.e.2004), + nfeat(dfm.fr.e.2004)) + + +feat.tesseract <- c(nfeat(dfm.en.b.2004), + nfeat(dfm.fr.b.2004)) + + + +feat.reduction.abs <- feat.extracted - feat.tesseract + +feat.reduction.rel.pct <- (1 - (feat.tesseract / feat.extracted)) * 100 + + +dt.ocrquality <- data.table(feat.languages, + feat.extracted, + feat.tesseract, + feat.reduction.abs, + paste(round(feat.reduction.rel.pct, 2), "%")) + + + +kable(dt.ocrquality, + format = "latex", + align = "r", + booktabs = TRUE, + col.names = c("Language", + "Extracted Features", + "Tesseract Features", + "Difference (abs)", + "Difference (pct)")) + + + +#'# Language Purity Module +#' This module analyzes the n-gram patterns of each document with **textcat** to detect the most likely language. Only English and French are considered. 
This is to ensure maximum monolinguality of documents, which is an advantage in Natural Language Processing. + + + +#+ +#'## Limit Detection to English and French + +lang.profiles <- TC_byte_profiles[names(TC_byte_profiles) %in% c("english", + "french")] + + +#'## Automatic Language Detection + +data.best.en$textcat <- textcat(data.best.en$text, + p = lang.profiles) + +data.best.fr$textcat <- textcat(data.best.fr$text, + p = lang.profiles) + + +#'## Detected Languages + +#' **Note:** Should only read 'english' +unique(data.best.en$textcat) + +#' **Note:** Should only read 'french' +unique(data.best.fr$textcat) + + + +#'## Show Mismatches +#' Print files which failed to match the language specified in metadata. + +langtest.fail.en <- data.best.en[textcat != "english", .(doc_id, textcat)] +print(langtest.fail.en) + +langtest.fail.fr <- data.best.fr[textcat != "french", .(doc_id, textcat)] +print(langtest.fail.fr) + + +#'## Final Note: Human Review of Mismatches +#' All documents flagged by textcat were reviewed and appropriate remedies devised. Some files were deleted from the corpus if no authentic language variant could be found. Monolingual files for case 146 are now generated from the bilingual originals. See the download section for details. + + + + + +#+ +#'# Add and Delete Variables + +#+ +#'## Delete Textcat Classifications + +data.best.en$textcat <- NULL +data.best.fr$textcat <- NULL + + +#'## Add Variable "year" + +data.best.en$year <- year(data.best.en$date) +data.best.fr$year <- year(data.best.fr$date) + + +#'## Add Variable "minority" +#' "0" indicates a majority opinion, "1" a minority opinion. + +data.best.en$minority <- (data.best.en$opinion != 0) * 1 +data.best.fr$minority <- (data.best.fr$opinion != 0) * 1 + + +#'## Add Variable "fullname" + +#+ +#'### Read Hand Coded Data +casenames <- fread("data/CD-ICJ_Source_CaseNames.csv", + header = TRUE) + + +#'### Create Variable + +data.best.en$fullname <- casenames$casename_full[match(data.best.en$caseno, + casenames$caseno)] + +data.best.fr$fullname <- casenames$casename_full[match(data.best.fr$caseno, + casenames$caseno)] + + + + + +#'## Add Variable "applicant_region" + + + +#+ +#'### Read Hand Coded Data + +countrycodes <- fread("data/CD-ICJ_Source_CountryCodes.csv") + + + + + +#'### Merge Regions for English Version + +applicant_region <- data.best.en$applicant + +applicant_region <- gsub("CARAT|ECOSOC|IFAD|IMO|UNESCO|UNGA|UNSC|WHO", + "NA", + applicant_region) + +applicant_region <- gsub("-", + "|", + applicant_region) + + +applicant_region <- mgsub(applicant_region, + countrycodes$ISO3, + countrycodes$region) + +data.best.en$applicant_region <- applicant_region + + + +#'### Merge Regions for French Version + +applicant_region <- data.best.fr$applicant + +applicant_region <- gsub("CARAT|ECOSOC|IFAD|IMO|UNESCO|UNGA|UNSC|WHO", + "NA", + applicant_region) + +applicant_region <- gsub("-", + "|", + applicant_region) + + +applicant_region <- mgsub(applicant_region, + countrycodes$ISO3, + countrycodes$region) + +data.best.fr$applicant_region <- applicant_region + + + +#'## Add Variable "respondent_region" + + + +#+ +#'### Read Hand Coded Data + +countrycodes <- fread("data/CD-ICJ_Source_CountryCodes.csv") + + +#'### Merge Regions for English Version + +respondent_region <- data.best.en$respondent + +respondent_region <- gsub("-", + "|", + respondent_region) + +respondent_region <- mgsub(respondent_region, + countrycodes$ISO3, + countrycodes$region) + +data.best.en$respondent_region <- respondent_region + + + +#'### Merge Regions 
for French Version + + +respondent_region <- data.best.fr$respondent + +respondent_region <- gsub("-", + "|", + respondent_region) + +respondent_region <- mgsub(respondent_region, + countrycodes$ISO3, + countrycodes$region) + +data.best.fr$respondent_region <- respondent_region + + + + + +#'## Add Variable "applicant_subregion" + + + +#+ +#'### Read Hand Coded Data + +countrycodes <- fread("data/CD-ICJ_Source_CountryCodes.csv") + + +#'### Merge Subregions for English Version + +applicant_subregion <- data.best.en$applicant + +applicant_subregion <- gsub("CARAT|ECOSOC|IFAD|IMO|UNESCO|UNGA|UNSC|WHO", + "NA", + applicant_subregion) + +applicant_subregion <- gsub("-", + "|", + applicant_subregion) + + +applicant_subregion <- mgsub(applicant_subregion, + countrycodes$ISO3, + countrycodes$subregion) + +data.best.en$applicant_subregion <- applicant_subregion + + + +#'### Merge Subregions for French Version + +applicant_subregion <- data.best.fr$applicant + +applicant_subregion <- gsub("CARAT|ECOSOC|IFAD|IMO|UNECO|UNGA|UNSC|WHO", + "NA", + applicant_subregion) + +applicant_subregion <- gsub("-", + "|", + applicant_subregion) + + +applicant_subregion <- mgsub(applicant_subregion, + countrycodes$ISO3, + countrycodes$subregion) + +data.best.fr$applicant_subregion <- applicant_subregion + + + +#'## Add Variable "respondent_subregion" + + + +#+ +#'### Read Hand Coded Data + +countrycodes <- fread("data/CD-ICJ_Source_CountryCodes.csv") + + +#'### Merge Subregions for English Version + +respondent_subregion <- data.best.en$respondent + +respondent_subregion <- gsub("-", + "|", + respondent_subregion) + +respondent_subregion <- mgsub(respondent_subregion, + countrycodes$ISO3, + countrycodes$subregion) + +data.best.en$respondent_subregion <- respondent_subregion + + + +#'### Merge Subregions for French Version + + +respondent_subregion <- data.best.fr$respondent + +respondent_subregion <- gsub("-", + "|", + respondent_subregion) + +respondent_subregion <- mgsub(respondent_subregion, + countrycodes$ISO3, + countrycodes$subregion) + +data.best.fr$respondent_subregion <- respondent_subregion + + + + + + +#'## Add Variable "doi_concept" + +data.best.en$doi_concept <- rep(doi.concept, + data.best.en[,.N]) + +data.best.fr$doi_concept <- rep(doi.concept, + data.best.fr[,.N]) + +#'## Add Variable "doi_version" + +data.best.en$doi_version <- rep(doi.version, + data.best.en[,.N]) + +data.best.fr$doi_version <- rep(doi.version, + data.best.fr[,.N]) + + +#'## Add Variable "version" + +data.best.en$version <- as.character(rep(datestamp, + data.best.en[,.N])) + +data.best.fr$version <- as.character(rep(datestamp, + data.best.fr[,.N])) + + +#'## Add Variable "license" + +data.best.en$license <- as.character(rep(license, + data.best.en[,.N])) + +data.best.fr$license <- as.character(rep(license, + data.best.fr[,.N])) + + + + + + + +#'# Frequency Tables +#' Frequency tables are a very useful tool for checking the plausibility of categorical variables and detecting anomalies in the data. This section will calculate frequency tables for all variables of interest. 
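+
+#' Conceptually, each frequency table is a count of documents per value of a
+#' categorical variable. A minimal sketch of the underlying count for a single
+#' variable (the full tables below are produced by f.fast.freqtable):
+
+#+ eval = FALSE
+data.best.en[, .N, by = doctype][order(-N)]
+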
+ +#+ +#'## Show Function: f.fast.freqtable + +#+ results = "asis" +print(f.fast.freqtable) + + +#+ +#'## English Corpus + +#+ +#'### Variables to Ignore +print(freq.var.ignore) + +#'### Variables to Analyze +varlist <- names(data.best.en) + +varlist <- setdiff(varlist, + freq.var.ignore) + +print(varlist) + + +#'### Construct Frequency Tables + +prefix <- paste0(datashort, + "_EN_01_FrequencyTable_var-") + + +#+ results = "asis" +f.fast.freqtable(data.best.en, + varlist = varlist, + sumrow = TRUE, + output.list = FALSE, + output.kable = TRUE, + output.csv = TRUE, + outputdir = outputdir, + prefix = prefix, + align = c("p{5cm}", + rep("r", 4))) + + + + +#'\newpage +#'## French Corpus + +#+ +#'### Variables to Ignore +print(freq.var.ignore) + +#'### Variables to Analyze + +varlist <- names(data.best.fr) + +varlist <- setdiff(varlist, + freq.var.ignore) + +print(varlist) + + + + +#'### Construct Frequency Tables + +prefix <- paste0(datashort, + "_FR_01_FrequencyTable_var-") + + +#+ results = "asis" +f.fast.freqtable(data.best.fr, + varlist = varlist, + sumrow = TRUE, + output.list = FALSE, + output.kable = TRUE, + output.csv = TRUE, + outputdir = outputdir, + prefix = prefix, + align = c("p{5cm}", + rep("r", 4))) + + + + + + + + + + + +#'# Visualize Frequency Tables + +#+ +#'## Load Tables + +prefix.en <- paste0("ANALYSIS/", + datashort, + "_EN_01_FrequencyTable_var-") + +prefix.fr <- paste0("ANALYSIS/", + datashort, + "_FR_01_FrequencyTable_var-") + + +table.en.doctype <- fread(paste0(prefix.en, + "doctype.csv")) + +table.en.opinion <- fread(paste0(prefix.en, + "opinion.csv")) + +table.en.year <- fread(paste0(prefix.en, + "year.csv")) + + +table.fr.doctype <- fread(paste0(prefix.fr, + "doctype.csv")) + +table.fr.opinion <- fread(paste0(prefix.fr, + "opinion.csv")) + +table.fr.year <- fread(paste0(prefix.fr, + "year.csv")) + + + + + + + +#'\newpage +#'## Doctype + +#+ +#'### English + +freqtable <- table.en.doctype[-.N] + + +#+ CD-ICJ_EN_02_Barplot_Doctype, fig.height = 6, fig.width = 9 +ggplot(data = freqtable) + + geom_bar(aes(x = reorder(doctype, + -N), + y = N), + stat = "identity", + fill = "black", + color = "black", + width = 0.4) + + theme_bw() + + labs( + title = paste(datashort, + "| EN | Version", + datestamp, + "| Documents per Document Type"), + caption = paste("DOI:", + doi.version), + x = "Document Type", + y = "Documents" + )+ + theme( + text = element_text(size = 14), + plot.title = element_text(size = 14, + face = "bold"), + legend.position = "none", + plot.margin = margin(10, 20, 10, 10) + ) + + +#'\newpage +#'### French + +freqtable <- table.fr.doctype[-.N] + + +#+ CD-ICJ_FR_02_Barplot_Doctype, fig.height = 6, fig.width = 9 +ggplot(data = freqtable) + + geom_bar(aes(x = reorder(doctype, + -N), + y = N), + stat = "identity", + fill = "black", + color = "black", + width = 0.4) + + theme_bw() + + labs( + title = paste(datashort, + "| FR | Version", + datestamp, + "| Documents per Document Type"), + caption = paste("DOI:", + doi.version), + x = "Document Type", + y = "Documents" + )+ + theme( + text = element_text(size = 14), + plot.title = element_text(size = 14, + face = "bold"), + legend.position = "none", + plot.margin = margin(10, 20, 10, 10) + ) + + +#'\newpage +#'## Opinion + +#+ +#'### English + +freqtable <- table.en.opinion[-.N] + +#+ CD-ICJ_EN_03_Barplot_Opinion, fig.height = 6, fig.width = 9 +ggplot(data = freqtable) + + geom_bar(aes(x = reorder(opinion, + -N), + y = N), + stat = "identity", + fill = "black", + color = "black") + + theme_bw() + + labs( + title = 
paste(datashort, + "| EN | Version", + datestamp, + "| Documents per Opinion Number"), + caption = paste("DOI:", + doi.version), + x = "Opinion Number", + y = "Documents" + )+ + theme( + text = element_text(size = 14), + plot.title = element_text(size = 14, + face = "bold"), + legend.position = "none", + plot.margin = margin(10, 20, 10, 10) + ) + + +#'\newpage +#'### French + +freqtable <- table.fr.opinion[-.N] + +#+ CD-ICJ_FR_03_Barplot_Opinion, fig.height = 6, fig.width = 9 +ggplot(data = freqtable) + + geom_bar(aes(x = reorder(opinion, -N), + y = N), + stat = "identity", + fill = "black", + color = "black") + + theme_bw() + + labs( + title = paste(datashort, + "| FR | Version", + datestamp, + "| Documents per Opinion Number"), + caption = paste("DOI:", + doi.version), + x = "Opinon Number", + y = "Documents" + )+ + theme( + text = element_text(size = 14), + plot.title = element_text(size = 14, + face = "bold"), + legend.position = "none", + plot.margin = margin(10, 20, 10, 10) + ) + + + +#'\newpage +#'## Year + +#+ +#'### English + +freqtable <- table.en.year[-.N][,lapply(.SD, as.numeric)] + +#+ CD-ICJ_EN_04_Barplot_Year, fig.height = 6, fig.width = 9 +ggplot(data = freqtable) + + geom_bar(aes(x = year, + y = N), + stat = "identity", + fill = "black") + + theme_bw() + + labs( + title = paste(datashort, + "| EN | Version", + datestamp, + "| Documents per Year"), + caption = paste("DOI:", + doi.version), + x = "Year", + y = "Documents" + )+ + theme( + text = element_text(size = 16), + plot.title = element_text(size = 16, + face = "bold"), + legend.position = "none", + plot.margin = margin(10, 20, 10, 10) + ) + + + +#'\newpage +#'### French + +freqtable <- table.fr.year[-.N][,lapply(.SD, as.numeric)] + +#+ CD-ICJ_FR_04_Barplot_Year, fig.height = 6, fig.width = 9 +ggplot(data = freqtable) + + geom_bar(aes(x = year, + y = N), + stat = "identity", + fill = "black") + + theme_bw() + + labs( + title = paste(datashort, + "| FR | Version", + datestamp, + "| Documents per Year"), + caption = paste("DOI:", + doi.version), + x = "Year", + y = "Documents" + )+ + theme( + text = element_text(size = 16), + plot.title = element_text(size = 16, + face = "bold"), + legend.position = "none", + plot.margin = margin(10, 20, 10, 10) + ) + + + + + + +#'# Summary Statistics + + +#+ +#'## Linguistic Metrics +#' For the text of each document the number of characters, tokens, types and sentences will be calculated. 
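+
+#' For a single document these metrics correspond roughly to the following
+#' quanteda and base R calls (a sketch only; the actual computation is performed
+#' by f.lingsummarize.iterator, shown below):
+
+#+ eval = FALSE
+txt.example <- data.best.en$text[1]
+nchar(txt.example)                               # characters
+ntoken(tokens(txt.example))                      # tokens
+ntype(tokens(txt.example))                       # types
+ntoken(tokens(txt.example, what = "sentence"))   # sentences
+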
+ + +#+ +#'### Show Function: f.lingsummarize.iterator +#+ results = "asis" +print(f.lingsummarize.iterator) + + +#'### Calculate Linguistic Metrics + +quanteda_options(tokens_locale = "en") # Set Locale for Tokenization + +summary.corpus.en <- f.lingsummarize.iterator(data.best.en, + threads = fullCores, + chunksize = 1) + + +quanteda_options(tokens_locale = "fr") # Set Locale for Tokenization + +summary.corpus.fr <- f.lingsummarize.iterator(data.best.fr, + threads = fullCores, + chunksize = 1) + + +#'### Add Linguistic Metrics to Full Corpora + +data.best.en <- cbind(data.best.en, + summary.corpus.en) + +data.best.fr <- cbind(data.best.fr, + summary.corpus.fr) + + +#'### Create Metadata-only Variants + +meta.best.en <- data.best.en[, !"text"] +meta.best.fr <- data.best.fr[, !"text"] + + + + + +#+ +#'### Calculate Summaries: English + +dt.summary.ling <- meta.best.en[, lapply(.SD, + function(x)unclass(summary(x))), + .SDcols = c("nchars", + "ntokens", + "ntypes", + "nsentences")] + + +dt.sums.ling <- meta.best.en[, + lapply(.SD, sum), + .SDcols = c("nchars", + "ntokens", + "ntypes", + "nsentences")] + +quanteda_options(tokens_locale = "en") # Set Locale for Tokenization + +tokens.temp <- tokens(corpus(data.best.en), + what = "word", + remove_punct = FALSE, + remove_symbols = FALSE, + remove_numbers = FALSE, + remove_url = FALSE, + remove_separators = TRUE, + split_hyphens = FALSE, + include_docvars = FALSE, + padding = FALSE + ) + +dt.sums.ling$ntypes <- nfeat(dfm(tokens.temp)) + + + + +dt.stats.ling <- rbind(dt.sums.ling, + dt.summary.ling) + +dt.stats.ling <- transpose(dt.stats.ling, + keep.names = "names") + +setnames(dt.stats.ling, c("Variable", + "Total", + "Min", + "Quart1", + "Median", + "Mean", + "Quart3", + "Max")) + +#'\newpage +#'### Show Summaries: English + +kable(dt.stats.ling, + format.args = list(big.mark = ","), + format = "latex", + booktabs = TRUE) + + +#'### Write Summaries to Disk: English + +fwrite(dt.stats.ling, + paste0(outputdir, + datashort, + "_EN_00_CorpusStatistics_Summaries_Linguistic.csv"), + na = "NA") + + + + +#'\newpage +#'### Calculate Summaries: French + +dt.summary.ling <- meta.best.fr[, lapply(.SD, + function(x)unclass(summary(x))), + .SDcols = c("nchars", + "ntokens", + "ntypes", + "nsentences")] + + +dt.sums.ling <- meta.best.fr[, + lapply(.SD, sum), + .SDcols = c("nchars", + "ntokens", + "ntypes", + "nsentences")] + + +quanteda_options(tokens_locale = "fr") # Set Locale for Tokenization + +tokens.temp <- tokens(corpus(data.best.fr), + what = "word", + remove_punct = FALSE, + remove_symbols = FALSE, + remove_numbers = FALSE, + remove_url = FALSE, + remove_separators = TRUE, + split_hyphens = FALSE, + include_docvars = FALSE, + padding = FALSE + ) + +dt.sums.ling$ntypes <- nfeat(dfm(tokens.temp)) + + + + +dt.stats.ling <- rbind(dt.sums.ling, + dt.summary.ling) + +dt.stats.ling <- transpose(dt.stats.ling, + keep.names = "names") + +setnames(dt.stats.ling, c("Variable", + "Total", + "Min", + "Quart1", + "Median", + "Mean", + "Quart3", + "Max")) + + +#'\newpage +#'### Show Summaries: French + +kable(dt.stats.ling, + format.args = list(big.mark = ","), + format = "latex", + booktabs = TRUE) + + +#'### Write Summaries to Disk: French + +fwrite(dt.stats.ling, + paste0(outputdir, + datashort, + "_FR_00_CorpusStatistics_Summaries_Linguistic.csv"), + na = "NA") + + + + + + + + + + +#'\newpage +#'## Distributions + +#+ +#'### Tokens per Year: English + +tokens.year.en <- meta.best.en[, + sum(ntokens), + by = "year"] + + + +#+ CD-ICJ_EN_05_TokensPerYear, 
fig.height = 6, fig.width = 9 +print( + ggplot(data = tokens.year.en, + aes(x = year, + y = V1))+ + geom_bar(stat = "identity", + fill = "black")+ + scale_y_continuous(labels = comma)+ + theme_bw()+ + labs( + title = paste(datashort, + "| EN | Version", + datestamp, + "| Number of Tokens per Year"), + caption = paste("DOI:", + doi.version), + x = "Year", + y = "Tokens" + )+ + theme( + text = element_text(size = 14), + plot.title = element_text(size = 14, + face = "bold") + ) +) + + + + + + +#'\newpage +#'### Tokens per Year: French + +tokens.year.fr <- meta.best.fr[, + sum(ntokens), + by = "year"] + + +#+ CD-ICJ_FR_05_TokensPerYear, fig.height = 6, fig.width = 9 +print( + ggplot(data = tokens.year.fr, + aes(x = year, + y = V1))+ + geom_bar(stat = "identity", + fill = "black")+ + scale_y_continuous(labels = comma)+ + theme_bw()+ + labs( + title = paste(datashort, + "| FR | Version", + datestamp, + "| Number of Tokens per Year"), + caption = paste("DOI:", + doi.version), + x = "Year", + y = "Tokens" + )+ + theme( + text = element_text(size = 14), + plot.title = element_text(size = 14, + face = "bold") + ) +) + + + + + + +#'\newpage +#+ +#'### Density: Characters + +#+ CD-ICJ_EN_06_Density_Characters, fig.height = 6, fig.width = 9 +ggplot(data = meta.best.en) + + geom_density(aes(x = nchars), + fill = "black") + + scale_x_log10(breaks = trans_breaks("log10", function(x) 10^x), + labels = trans_format("log10", math_format(10^.x)))+ + annotation_logticks(sides = "b")+ + coord_cartesian(xlim = c(1, 10^6))+ + theme_bw() + + labs( + title = paste(datashort, + "| EN | Version", + datestamp, + "| Distribution of Document Length (Characters)"), + caption = paste("DOI:", + doi.version), + x = "Characters", + y = "Density" + )+ + theme( + text = element_text(size = 14), + plot.title = element_text(size = 14, + face = "bold"), + legend.position = "none", + plot.margin = margin(10, 20, 10, 10) + ) + + + + +#'\newpage +#+ CD-ICJ_FR_06_Density_Characters, fig.height = 6, fig.width = 9 +ggplot(data = meta.best.fr) + + geom_density(aes(x = nchars), + fill = "black") + + scale_x_log10(breaks = trans_breaks("log10", function(x) 10^x), + labels = trans_format("log10", math_format(10^.x)))+ + annotation_logticks(sides = "b")+ + coord_cartesian(xlim = c(1, 10^6))+ + theme_bw() + + labs( + title = paste(datashort, + "| FR | Version", + datestamp, + "| Distribution of Document Length (Characters)"), + caption = paste("DOI:", + doi.version), + x = "Characters", + y = "Density" + )+ + theme( + text = element_text(size = 14), + plot.title = element_text(size = 14, + face = "bold"), + legend.position = "none", + plot.margin = margin(10, 20, 10, 10) + ) + + + + +#'\newpage +#'### Density: Tokens + +#+ CD-ICJ_EN_07_Density_Tokens, fig.height = 6, fig.width = 9 +ggplot(data = meta.best.en) + + geom_density(aes(x = ntokens), + fill = "black") + + scale_x_log10(breaks = trans_breaks("log10", function(x) 10^x), + labels = trans_format("log10", math_format(10^.x)))+ + annotation_logticks(sides = "b")+ + coord_cartesian(xlim = c(1, 10^6))+ + theme_bw() + + labs( + title = paste(datashort, + "| EN | Version", + datestamp, + "| Distribution of Document Length (Tokens)"), + caption = paste("DOI:", + doi.version), + x = "Tokens", + y = "Density" + )+ + theme( + text = element_text(size = 14), + plot.title = element_text(size = 14, + face = "bold"), + legend.position = "none", + plot.margin = margin(10, 20, 10, 10) + ) + + + + +#'\newpage +#+ CD-ICJ_FR_07_Density_Tokens, fig.height = 6, fig.width = 9 +ggplot(data = meta.best.fr) + 
+ geom_density(aes(x = ntokens), + fill = "black") + + scale_x_log10(breaks = trans_breaks("log10", function(x) 10^x), + labels = trans_format("log10", math_format(10^.x)))+ + annotation_logticks(sides = "b")+ + coord_cartesian(xlim = c(1, 10^6))+ + theme_bw() + + labs( + title = paste(datashort, + "| FR | Version", + datestamp, + "| Distribution of Document Length (Tokens)"), + caption = paste("DOI:", + doi.version), + x = "Tokens", + y = "Density" + )+ + theme( + text = element_text(size = 14), + plot.title = element_text(size = 14, + face = "bold"), + legend.position = "none", + plot.margin = margin(10, 20, 10, 10) + ) + + + + + +#'\newpage +#'### Density: Types + +#+ CD-ICJ_EN_08_Density_Types, fig.height = 6, fig.width = 9 +ggplot(data = meta.best.en) + + geom_density(aes(x = ntypes), + fill = "black") + + scale_x_log10(breaks = trans_breaks("log10", function(x) 10^x), + labels = trans_format("log10", math_format(10^.x)))+ + annotation_logticks(sides = "b")+ + coord_cartesian(xlim = c(1, 10^6))+ + theme_bw() + + labs( + title = paste(datashort, + "| EN | Version", + datestamp, + "| Distribution of Document Length (Types)"), + caption = paste("DOI:", + doi.version), + x = "Types", + y = "Density" + )+ + theme( + text = element_text(size = 14), + plot.title = element_text(size = 14, + face = "bold"), + legend.position = "none", + plot.margin = margin(10, 20, 10, 10) + ) + + + +#'\newpage +#+ CD-ICJ_FR_08_Density_Types, fig.height = 6, fig.width = 9 +ggplot(data = meta.best.fr) + + geom_density(aes(x = ntypes), + fill = "black") + + scale_x_log10(breaks = trans_breaks("log10", function(x) 10^x), + labels = trans_format("log10", math_format(10^.x)))+ + annotation_logticks(sides = "b")+ + coord_cartesian(xlim = c(1, 10^6))+ + theme_bw() + + labs( + title = paste(datashort, + "| FR | Version", + datestamp, + "| Distribution of Document Length (Types)"), + caption = paste("DOI:", + doi.version), + x = "Types", + y = "Density" + )+ + theme( + text = element_text(size = 14), + plot.title = element_text(size = 14, + face = "bold"), + legend.position = "none", + plot.margin = margin(10, 20, 10, 10) + ) + + + + +#'\newpage +#'### Density: Sentences + +#+ CD-ICJ_EN_09_Density_Sentences, fig.height = 6, fig.width = 9 +ggplot(data = meta.best.en) + + geom_density(aes(x = nsentences), + fill = "black") + + scale_x_log10(breaks = trans_breaks("log10", function(x) 10^x), + labels = trans_format("log10", math_format(10^.x)))+ + annotation_logticks(sides = "b")+ + coord_cartesian(xlim = c(1, 10^6))+ + theme_bw() + + labs( + title = paste(datashort, + "| EN | Version", + datestamp, + "| Distribution of Document Length (Sentences)"), + caption = paste("DOI:", + doi.version), + x = "Sentences", + y = "Density" + )+ + theme( + text = element_text(size = 14), + plot.title = element_text(size = 14, + face = "bold"), + legend.position = "none", + plot.margin = margin(10, 20, 10, 10) + ) + + + + +#'\newpage +#+ CD-ICJ_FR_09_Density_Sentences, fig.height = 6, fig.width = 9 +ggplot(data = meta.best.fr) + + geom_density(aes(x = nsentences), + fill = "black") + + scale_x_log10(breaks = trans_breaks("log10", function(x) 10^x), + labels = trans_format("log10", math_format(10^.x)))+ + annotation_logticks(sides = "b")+ + coord_cartesian(xlim = c(1, 10^6))+ + theme_bw() + + labs( + title = paste(datashort, + "| FR | Version", + datestamp, + "| Distribution of Document Length (Sentences)"), + caption = paste("DOI:", + doi.version), + x = "Sentences", + y = "Density" + )+ + theme( + text = element_text(size = 14), + 
plot.title = element_text(size = 14, + face = "bold"), + legend.position = "none", + plot.margin = margin(10, 20, 10, 10) + ) + + + +#'\newpage +#'### All Distributions of Linguistic Metrics +#' When plotting a boxplot on a logarithmic scale the standard geom_boxplot() function from ggplot2 incorrectly performs the statistical transformation first before calculating the boxplot statistics. While median and quartiles are based on ordinal position the inter-quartile range differs depending on when statistical transformation is performed. +#' +#' Solutions are based on this SO question: https://stackoverflow.com/questions/38753628/ggplot-boxplot-length-of-whiskers-with-logarithmic-axis + +print(f.boxplot.body) +print(f.boxplot.outliers) + + + +dt.allmetrics.en <- melt(summary.corpus.en, + measure.vars = rev(c("nchars", + "ntokens", + "ntypes", + "nsentences"))) + +#'\newpage +#+ CD-ICJ_EN_10_Distributions_LinguisticMetrics, fig.height = 10, fig.width = 8.3 +ggplot(dt.allmetrics.en, aes(x = value, + y = variable, + fill = variable))+ + geom_violin()+ + stat_summary(fun.data = f.boxplot.body, + geom = "errorbar", + width = 0.1) + + stat_summary(fun.data = f.boxplot.body, + geom = "boxplot", + width = 0.1) + + stat_summary(fun.data = f.boxplot.outliers, + geom = "point", + size = 0.5, + alpha = 0.1)+ + scale_x_log10(breaks = trans_breaks("log10", function(x) 10^x), + labels = trans_format("log10", math_format(10^.x)))+ + annotation_logticks(sides = "b")+ + coord_cartesian(xlim = c(1, 10^6))+ + scale_y_discrete(labels = rev(c("Characters", + "Tokens", + "Types", + "Sentences")))+ + theme_bw() + + scale_fill_viridis_d(begin = 0.35)+ + labs( + title = paste(datashort, + "| EN | Version", + datestamp, + "| Distributions of Document Length"), + caption = paste("DOI:", + doi.version), + x = "Value", + y = "Linguistic Metric" + )+ + theme( + text = element_text(size = 14), + plot.title = element_text(size = 14, + face = "bold"), + legend.position = "none", + plot.margin = margin(10, 20, 10, 10) + ) + + + + +#'\newpage + +dt.allmetrics.fr <- melt(summary.corpus.fr, + measure.vars = rev(c("nchars", + "ntokens", + "ntypes", + "nsentences"))) + +#+ CD-ICJ_FR_10_Distributions_LinguisticMetrics, fig.height = 10, fig.width = 8.3 +ggplot(dt.allmetrics.fr, aes(x = value, + y = variable, + fill = variable)) + + geom_violin()+ + stat_summary(fun.data = f.boxplot.body, + geom = "errorbar", + width = 0.1) + + stat_summary(fun.data = f.boxplot.body, + geom = "boxplot", + width = 0.1) + + stat_summary(fun.data = f.boxplot.outliers, + geom = "point", + size = 0.5, + alpha = 0.1)+ + scale_x_log10(breaks = trans_breaks("log10", function(x) 10^x), + labels = trans_format("log10", math_format(10^.x)))+ + annotation_logticks(sides = "b")+ + coord_cartesian(xlim = c(1, 10^6))+ + scale_y_discrete(labels = rev(c("Characters", + "Tokens", + "Types", + "Sentences")))+ + theme_bw() + + scale_fill_viridis_d(begin = 0.35)+ + labs( + title = paste(datashort, + "| FR | Version", + datestamp, + "| Distributions of Document Length"), + caption = paste("DOI:", + doi.version), + x = "Value", + y = "Linguistic Metric" + )+ + theme( + text = element_text(size = 14), + plot.title = element_text(size = 14, + face = "bold"), + legend.position = "none", + plot.margin = margin(10, 20, 10, 10) + ) + + + + + + + + + +#'\newpage +#'## Number of Majority Opinions + +#+ +#'### English + +dt.maj.disaggregated <- meta.best.en[opinion == 0, + .N, + keyby = "doctype"] + +sumrow <- data.table("Total", + sum(dt.maj.disaggregated$N)) + 
+dt.maj.disaggregated <- rbind(dt.maj.disaggregated, + sumrow, + use.names = FALSE) + + + +kable(dt.maj.disaggregated, + format = "latex", + booktabs = TRUE, + longtable = TRUE) + + +fwrite(dt.maj.disaggregated, + paste0(outputdir, + datashort, + "_EN_00_CorpusStatistics_Summaries_Majority.csv"), + na = "NA") + + + +#'\newpage +#'### French + +dt.maj.disaggregated <- meta.best.fr[opinion == 0, + .N, + keyby = "doctype"] + +sumrow <- data.table("Total", + sum(dt.maj.disaggregated$N)) + +dt.maj.disaggregated <- rbind(dt.maj.disaggregated, + sumrow, + use.names = FALSE) + + +kable(dt.maj.disaggregated, + format = "latex", + booktabs = TRUE, + longtable = TRUE) + + +fwrite(dt.maj.disaggregated, + paste0(outputdir, + datashort, + "_FR_00_CorpusStatistics_Summaries_Majority.csv"), + na = "NA") + + + +#'\newpage +#'## Number of Minority Opinions + +#+ +#'### English + +dt.min.disaggregated <- meta.best.en[opinion > 0, + .N, + keyby = "doctype"] + +sumrow <- data.table("Total", + sum(dt.min.disaggregated$N)) + +dt.min.disaggregated <- rbind(dt.min.disaggregated, + sumrow, + use.names = FALSE) + + + +kable(dt.min.disaggregated, + format = "latex", + booktabs = TRUE, + longtable = TRUE) + + +fwrite(dt.min.disaggregated, + paste0(outputdir, + datashort, + "_EN_00_CorpusStatistics_Summaries_Minority.csv"), + na = "NA") + + + + + +#'\newpage +#'### French + +dt.min.disaggregated <- meta.best.fr[opinion > 0, + .N, + keyby = "doctype"] + +sumrow <- data.table("Total", + sum(dt.min.disaggregated$N)) + +dt.min.disaggregated <- rbind(dt.min.disaggregated, + sumrow, + use.names = FALSE) + + +kable(dt.min.disaggregated, + format = "latex", + booktabs = TRUE, + longtable = TRUE) + + +fwrite(dt.min.disaggregated, + paste0(outputdir, + datashort, + "_FR_00_CorpusStatistics_Summaries_Minority.csv"), + na = "NA") + + + + + + + +#'## Year Range + +summary(meta.best.en$year) # English +summary(meta.best.fr$year) # French + + + +#'## Date Range + +meta.best.en$date <- as.Date(meta.best.en$date) +meta.best.fr$date <- as.Date(meta.best.fr$date) + +summary(meta.best.en$date) # English +summary(meta.best.fr$date) # French + + + + +#'# Test and Sort Variable Names + +#+ +#'## Semantic Sorting of Variable Names +#' This step ensures that all variable names documented in the Codebook are present in the data set and sorted according to the order in the Codebook. Where variables are missing in the data or undocumented variables are present this step will throw an error. 
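+
+#' The sketch below is an illustration only (it is not executed and is not part
+#' of the pipeline) of how such a consistency check can be made explicit: both
+#' directions of the comparison must be empty before the columns are sorted.
+#' The vector "varnames.codebook" is an assumed stand-in for the variable names
+#' documented in the Codebook, in Codebook order.
+
+#+ eval = FALSE
+varnames.codebook <- c("doc_id", "text", "court", "caseno", "shortname",
+                       "fullname", "applicant", "respondent",
+                       "applicant_region", "respondent_region",
+                       "applicant_subregion", "respondent_subregion",
+                       "date", "doctype", "collision", "stage", "opinion",
+                       "language", "year", "minority", "nchars", "ntokens",
+                       "ntypes", "nsentences", "version", "doi_concept",
+                       "doi_version", "license")
+
+# Documented in the Codebook but missing from the data
+missing.vars <- setdiff(varnames.codebook, names(data.best.en))
+
+# Present in the data but not documented in the Codebook
+undocumented.vars <- setdiff(names(data.best.en), varnames.codebook)
+
+if (length(missing.vars) > 0 || length(undocumented.vars) > 0){
+    stop("Variable names do not match the Codebook.")
+}
+
+setcolorder(data.best.en, varnames.codebook)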
+ +#+ +#'### Sort Variables: Full Data Set + + +setcolorder(data.best.en, # English + c("doc_id", + "text", + "court", + "caseno", + "shortname", + "fullname", + "applicant", + "respondent", + "applicant_region", + "respondent_region", + "applicant_subregion", + "respondent_subregion", + "date", + "doctype", + "collision", + "stage", + "opinion", + "language", + "year", + "minority", + "nchars", + "ntokens", + "ntypes", + "nsentences", + "version", + "doi_concept", + "doi_version", + "license")) + + +#'\newpage + + +setcolorder(data.best.fr, # French + c("doc_id", + "text", + "court", + "caseno", + "shortname", + "fullname", + "applicant", + "respondent", + "applicant_region", + "respondent_region", + "applicant_subregion", + "respondent_subregion", + "date", + "doctype", + "collision", + "stage", + "opinion", + "language", + "year", + "minority", + "nchars", + "ntokens", + "ntypes", + "nsentences", + "version", + "doi_concept", + "doi_version", + "license")) + + +#'\newpage +#+ +#'### Sort Variables: Metadata + +setcolorder(meta.best.en, # English + c("doc_id", + "court", + "caseno", + "shortname", + "fullname", + "applicant", + "respondent", + "applicant_region", + "respondent_region", + "applicant_subregion", + "respondent_subregion", + "date", + "doctype", + "collision", + "stage", + "opinion", + "language", + "year", + "minority", + "nchars", + "ntokens", + "ntypes", + "nsentences", + "version", + "doi_concept", + "doi_version", + "license")) + + +#'\newpage + + +setcolorder(meta.best.fr, # French + c("doc_id", + "court", + "caseno", + "shortname", + "fullname", + "applicant", + "respondent", + "applicant_region", + "respondent_region", + "applicant_subregion", + "respondent_subregion", + "date", + "doctype", + "collision", + "stage", + "opinion", + "language", + "year", + "minority", + "nchars", + "ntokens", + "ntypes", + "nsentences", + "version", + "doi_concept", + "doi_version", + "license")) + + + + + + +#'\newpage +#'## Number of Variables: Full Data Set + +length(data.best.en) # English +length(data.best.fr) # French + +#'## Number of Variables: Metadata + +length(meta.best.en) # English +length(meta.best.fr) # French + + +#'## List All Variables: Full Data Set +#' "doc_id" is the filename, "text" is the extracted plaintext, third variable onwards are the metadata variables ("docvars"). 
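+
+#' The block below is a minimal illustration of this convention (not executed,
+#' shown for orientation only): quanteda's corpus() constructor reads the
+#' "doc_id" and "text" columns by default and retains every remaining column as
+#' a document-level variable.
+
+#+ eval = FALSE
+corpus.sketch <- quanteda::corpus(data.best.en,
+                                  docid_field = "doc_id",
+                                  text_field = "text")
+
+# All variables from the third column onwards are available as docvars
+head(quanteda::docvars(corpus.sketch))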
+ +names(data.best.en) # English +names(data.best.fr) # French + + +#'## List All Variables: Metadata + +names(meta.best.en) # English +names(meta.best.fr) # French + + + + + + + + +#'# Calculate Detailed Token Frequencies + + +#+ +#'## Create Corpora +corpus.en.b <- corpus(data.best.en) +corpus.fr.b <- corpus(data.best.fr) + + +#'## Process Tokens + +quanteda_options(tokens_locale = "en") # Set Locale for Tokenization +tokens.en <- f.token.processor(corpus.en.b) + +quanteda_options(tokens_locale = "fr") # Set Locale for Tokenization +tokens.fr <- f.token.processor(corpus.fr.b) + + +#'## Construct Document-Feature-Matrices + +dfm.en <- dfm(tokens.en) +dfm.fr <- dfm(tokens.fr) + +dfm.tfidf.en <- dfm_tfidf(dfm.en) +dfm.tfidf.fr <- dfm_tfidf(dfm.fr) + + + + +#'## Most Frequent Tokens | TF Weighting | Tables + +#+ +#'### English + +tstat.en <- textstat_frequency(dfm.en, + n = 100) + +fwrite(tstat.en, paste0(outputdir, + datashort, + "_EN_11_Top100Tokens_TF-Weighting.csv")) + +kable(tstat.en, + format = "latex", + align = "r", + booktabs = TRUE, + longtable = TRUE, + col.names = c("Feature", + "Frequency", + "Rank", + "Docfreq", + "Group")) %>% kable_styling(latex_options = "repeat_header") + + + + + +#'### French + +tstat.fr <- textstat_frequency(dfm.fr, + n = 100) + +fwrite(tstat.fr, paste0(outputdir, + datashort, + "_FR_11_Top100Tokens_TF-Weighting.csv")) + +kable(tstat.fr, + format = "latex", + align = "r", + booktabs = TRUE, + longtable = TRUE, + col.names = c("Feature", + "Frequency", + "Rank", + "Docfreq", + "Group")) %>% kable_styling(latex_options = "repeat_header") + + + + +#'## Most Frequent Tokens | TFIDF Weighting | Tables + +#+ +#'### English + +tstat.tfidf.en <- textstat_frequency(dfm.tfidf.en, + n = 100, + force = TRUE) + +fwrite(tstat.en, paste0(outputdir, + datashort, + "_EN_12_Top100Tokens_TFIDF-Weighting.csv")) + +kable(tstat.tfidf.en, + format = "latex", + align = "r", + booktabs = TRUE, + longtable = TRUE, + col.names = c("Feature", + "Weight", + "Rank", + "Docfreq", + "Group")) %>% kable_styling(latex_options = "repeat_header") + + + +#'### French + +tstat.tfidf.fr <- textstat_frequency(dfm.tfidf.fr, + n = 100, + force = TRUE) + +fwrite(tstat.fr, paste0(outputdir, + datashort, + "_FR_12_Top100Tokens_TFIDF-Weighting.csv")) + +kable(tstat.tfidf.fr, + format = "latex", + align = "r", + booktabs = TRUE, + longtable = TRUE, + col.names = c("Feature", + "Weight", + "Rank", + "Docfreq", + "Group")) %>% kable_styling(latex_options = "repeat_header") + + + + + +#'\newpage +#'## Most Frequent Tokens | TF Weighting | Scatterplots + +#+ +#'### English + + +#+ CD-ICJ_EN_13_Top50Tokens_TF-Weighting_Scatter, fig.height = 9, fig.width = 7 +print( + ggplot(data = tstat.en[1:50, ], + aes(x = reorder(feature, + frequency), + y = frequency))+ + geom_point()+ + coord_flip()+ + labs( + title = paste(datashort, + "| EN | Version", + datestamp, + "| Top 50 Tokens | Term Frequency"), + caption = paste("DOI:", + doi.version), + x = "Feature", + y = "Frequency" + )+ + theme_bw()+ + theme( + text = element_text(size = 14), + plot.title = element_text(size = 12, + face = "bold") + ) +) + + + + +#'\newpage +#+ +#'### French + +#+ CD-ICJ_FR_13_Top50Tokens_TF-Weighting_Scatter, fig.height = 9, fig.width = 7 +print( + ggplot(data = tstat.fr[1:50, ], + aes(x = reorder(feature, + frequency), + y = frequency))+ + geom_point()+ + coord_flip()+ + theme_bw()+ + labs( + title = paste(datashort, + "| FR | Version", + datestamp, + "| Top 50 Tokens | Term Frequency"), + caption = paste("DOI:", + doi.version), + x 
= "Feature", + y = "Frequency" + )+ + theme( + text = element_text(size = 14), + plot.title = element_text(size = 12, + face = "bold") + ) +) + + + + +#'\newpage +#'## Most Frequent Tokens | TFIDF Weighting | Scatterplots + +#+ +#'### English + +#+ CD-ICJ_EN_14_Top50Tokens_TFIDF-Weighting_Scatter, fig.height = 9, fig.width = 7 +print( + ggplot(data = tstat.tfidf.en[1:50, ], + aes(x = reorder(feature, + frequency), + y = frequency))+ + geom_point()+ + coord_flip()+ + theme_bw()+ + labs( + title = paste(datashort, + "| EN | Version", + datestamp, + "| Top 50 Tokens | TF-IDF"), + caption = paste("DOI:", + doi.version), + x = "Feature", + y = "Weight" + )+ + theme( + text = element_text(size = 14), + plot.title = element_text(size = 12, + face = "bold") + ) +) + + + + +#'\newpage +#+ +#'### French + +#+ CD-ICJ_FR_14_Top50Tokens_TFIDF-Weighting_Scatter, fig.height = 9, fig.width = 7 +print( + ggplot(data = tstat.tfidf.fr[1:50, ], + aes(x = reorder(feature, + frequency), + y = frequency)) + + geom_point() + + coord_flip() + + labs( + title = paste(datashort, + "| FR | Version", + datestamp, + "| Top 50 Tokens | TF-IDF"), + caption = paste("DOI:", + doi.version), + x = "Feature", + y = "Weight" + )+ + theme_bw()+ + theme( + text = element_text(size = 14), + plot.title = element_text(size = 12, + face = "bold") + ) +) + + + + + +#'\newpage +#'## Most Frequent Tokens | TF Weighting | Wordclouds + +#+ +#'### English + +#+ CD-ICJ_EN_15_Top100Tokens_TF-Weighting_Cloud, fig.height = 7, fig.width = 7 +textplot_wordcloud(dfm.en, + max_words = 100, + min_size = 1, + max_size = 5, + random_order = FALSE, + rotation = 0, + color = brewer.pal(8, "Dark2")) + +#'\newpage +#+ +#'### French + + +#+ CD-ICJ_FR_15_Top100Tokens_TF-Weighting_Cloud, fig.height = 7, fig.width = 7 +textplot_wordcloud(dfm.fr, + max_words = 100, + min_size = 1, + max_size = 5, + random_order = FALSE, + rotation = 0, + color = brewer.pal(8, "Dark2")) + + +#'\newpage +#'## Most Frequent Tokens | TFIDF Weighting | Wordclouds + + +#+ +#'### English + +#+ CD-ICJ_EN_16_Top100Tokens_TFIDF-Weighting_Cloud, fig.height = 7, fig.width = 7 +textplot_wordcloud(dfm.tfidf.en, + max_words = 100, + min_size = 1, + max_size = 2, + random_order = FALSE, + rotation = 0, + color = brewer.pal(8, "Dark2")) + +#'\newpage +#+ +#'### French + + +#+ CD-ICJ_FR_16_Top100Tokens_TFIDF-Weighting_Cloud, fig.height = 7, fig.width = 7 +textplot_wordcloud(dfm.tfidf.fr, + max_words = 100, + min_size = 1, + max_size = 2, + random_order = FALSE, + rotation = 0, + color = brewer.pal(8, "Dark2")) + + + + + + + + +#'# Document Similarity +#' This analysis computes the correlation similarity for all documents in each corpus, plots the number of documents to drop as a function of the correlation similarity threshold and outputs the document IDs for specific threshold values. +#' +#' The similarity test uses the standard pre-processed unigram document-feature matrix created by the f.token.processor function for the analyses of detailed token frequencies, i.e. it includes removal of numbers, special characters, stopwords (English/French) and lowercasing. I investigated other pre-processing workflows without the removal of features or lowercasing, as well as bigrams and trigrams, but, based on a qualitative assessment of the results, these performed no better or even worse than the standard workflow. Further research will be required to provide a definitive recommendation on how to deduplicate the corpus. 
+#' +#' I intentionally do not correct for length, as the analysis focuses on detecting duplicates and near-duplicates, not topical similarity. + + +#+ +#'## Set Ranges +#' +#' **Note:** These ranges should cover most use cases. + +threshold.range <- seq(0.8, 1, 0.005) + +threshold.N <- length(threshold.range) + +print(threshold.range) + + +print.range <- seq(0.8, 0.99, 0.01) + +print(print.range) + +#'\newpage +#+ +#'## English + + +#+ +#'### Calculate Similarity + +sim <- textstat_simil(dfm.en, + margin = "documents", + method = "correlation") + +sim.dt <- as.data.table(sim) + + + +#'### Create Empty Lists + +list.ndrop <- vector("list", + threshold.N) + +list.drop.ids <- vector("list", + threshold.N) + +list.pair.ids <- vector("list", + threshold.N) + + +#'### Build Tables + +for (i in 1:threshold.N){ + + threshold <- threshold.range[i] + + pair.ids <- sim.dt[correlation > threshold] + + list.pair.ids[[i]] <- pair.ids + + drop.ids <- sim.dt[correlation > threshold, + .(unique(document1))][order(V1)] + + list.drop.ids[[i]] <- drop.ids + + ndrop <- drop.ids[,.N] + + list.ndrop[[i]] <- data.table(threshold, + ndrop) +} + + +dt.ndrop <- rbindlist(list.ndrop) + + +#'### IDs of Paired Documents Above Threshold +#' IDs of document pairs, with one of them to drop, as function of correlation similarity. + +for (i in print.range){ + + index <- match(i, threshold.range) + + fwrite(list.pair.ids[[index]], + paste0(outputdir, + datashort, + "_EN_17_DocumentSimilarity_Correlation_PairedDocIDs_", + str_pad(threshold.range[index], + width = 5, + side = "right", + pad = "0"), + ".csv")) +} + + + +#'### IDs of Duplicate Documents per Threshold +#' IDs of Documents to drop as function of correlation similarity. + +for (i in print.range){ + + index <- match(i, threshold.range) + + fwrite(list.drop.ids[[index]], + paste0(outputdir, + datashort, + "_EN_17_DocumentSimilarity_Correlation_DuplicateDocIDs_", + str_pad(threshold.range[index], + width = 5, + side = "right", + pad = "0"), + ".csv")) +} + + + +#'### Count of Duplicate Documents per Threshold +#' Number of Documents to drop as function of correlation similarity. 
+ +kable(dt.ndrop, + format = "latex", + align = "r", + booktabs = TRUE, + longtable = TRUE, + col.names = c("Threshold", + "Number to Drop")) %>% kable_styling(latex_options = "repeat_header") + +fwrite(dt.ndrop, + paste0(outputdir, + datashort, + "_EN_18_DocumentSimilarity_Correlation_Table.csv")) + + + + +#'\newpage +#+ CD-ICJ_EN_19_DocumentSimilarity_Correlation, fig.height = 6, fig.width = 9 +print( + ggplot(data = dt.ndrop, + aes(x = threshold, + y = ndrop))+ + geom_line()+ + geom_point()+ + labs( + title = paste(datashort, + "| EN | Version", + datestamp, + "| Document Similarity (Correlation)"), + caption = paste("DOI:", + doi.version), + x = "Correlation Similarity Threshold", + y = "Number of Documents Above Threshold" + )+ + scale_x_continuous(breaks = seq(0.8, 1, 0.02))+ + theme_bw()+ + theme( + text = element_text(size = 14), + plot.title = element_text(size = 14, + face = "bold"), + legend.position = "bottom", + legend.direction = "vertical" + ) +) + + + + + + + +#'\newpage +#'## French + + +#'### Calculate Similarity + +sim <- textstat_simil(dfm.fr, + margin = "documents", + method = "correlation") + +sim.dt <- as.data.table(sim) + + + +#'### Create Empty Lists + +list.ndrop <- vector("list", + threshold.N) + +list.drop.ids <- vector("list", + threshold.N) + +list.pair.ids <- vector("list", + threshold.N) + + +#'### Build Tables + +for (i in 1:threshold.N){ + + threshold <- threshold.range[i] + + pair.ids <- sim.dt[correlation > threshold] + + list.pair.ids[[i]] <- pair.ids + + drop.ids <- sim.dt[correlation > threshold, + .(unique(document1))][order(V1)] + + list.drop.ids[[i]] <- drop.ids + + ndrop <- drop.ids[,.N] + + list.ndrop[[i]] <- data.table(threshold, + ndrop) +} + +dt.ndrop <- rbindlist(list.ndrop) + + + +#'### IDs of Paired Documents Above Threshold +#' IDs of document pairs, with one of them to drop, as function of correlation similarity. + +for (i in print.range){ + + index <- match(i, threshold.range) + + fwrite(list.pair.ids[[index]], + paste0(outputdir, + datashort, + "_FR_17_DocumentSimilarity_Correlation_PairedDocIDs_", + str_pad(threshold.range[index], + width = 5, + side = "right", + pad = "0"), + ".csv")) +} + + +#'### IDs of Duplicate Documents per Threshold +#' IDs of Documents to drop as function of correlation similarity. + +for (i in print.range){ + + index <- match(i, threshold.range) + + fwrite(list.drop.ids[[index]], + paste0(outputdir, + datashort, + "_FR_17_DocumentSimilarity_Correlation_DuplicateDocIDs_", + str_pad(threshold.range[index], + width = 5, + side = "right", + pad = "0"), + ".csv")) + +} + + + +#'### Count of Duplicate Documents per Threshold +#' Number of Documents to drop as function of correlation similarity. 
+ +kable(dt.ndrop, + format = "latex", + align = "r", + booktabs = TRUE, + longtable = TRUE, + col.names = c("Threshold", + "Number to Drop")) %>% kable_styling(latex_options = "repeat_header") + +fwrite(dt.ndrop, + paste0(outputdir, + datashort, + "_FR_18_DocumentSimilarity_Correlation_Table.csv")) + + + +#'\newpage +#+ CD-ICJ_FR_19_DocumentSimilarity_Correlation, fig.height = 6, fig.width = 9 +print( + ggplot(data = dt.ndrop, + aes(x = threshold, + y = ndrop))+ + geom_line()+ + geom_point()+ + labs( + title = paste(datashort, + "| FR | Version", + datestamp, + "| Document Similarity (Correlation)"), + caption = paste("DOI:", + doi.version), + x = "Correlation Similarity Threshold", + y = "Number of Documents Above Threshold" + )+ + scale_x_continuous(breaks = seq(0.8, 1, 0.02))+ + theme_bw()+ + theme( + text = element_text(size = 14), + plot.title = element_text(size = 14, + face = "bold"), + legend.position="bottom", + legend.direction = "vertical" + ) +) + + + + + + + + + +#+ +#'# Create CSV Files + +#+ +#'## Full Data Set + +csvname.full.en <- paste(datashort, + datestamp, + "EN_CSV_BEST_FULL.csv", + sep = "_") + +csvname.full.fr <- paste(datashort, + datestamp, + "FR_CSV_BEST_FULL.csv", + sep = "_") + + +fwrite(data.best.en, + csvname.full.en, + na = "NA") + +fwrite(data.best.fr, + csvname.full.fr, + na = "NA") + + + +#'## Metadata Only +#' These files are the same as the full data set, minus the "text" variable. + +csvname.meta.en <- paste(datashort, + datestamp, + "EN_CSV_BEST_META.csv", + sep = "_") + +csvname.meta.fr <- paste(datashort, + datestamp, + "FR_CSV_BEST_META.csv", + sep = "_") + + +fwrite(meta.best.en, + csvname.meta.en, + na = "NA") + +fwrite(meta.best.fr, + csvname.meta.fr, + na = "NA") + + + + + +#'# Final File Count per Folder + +dir.table <- as.data.table(dirset)[, { + filecount <- lapply(dirset, + function(x){length(list.files(x))}) + list(dirset, filecount) +}] + + +kable(dir.table, + format = "latex", + align = "r", + booktabs = TRUE, + longtable = TRUE, + linesep = "", + col.names = c("Directory", + "Filecount")) + + + +#'# File Size Distribution + +#'## English + +#'### Corpus Object in RAM + +print(object.size(data.best.en), + humanReadable = TRUE, + units = "MB") + + +#'### Create Data Table of Filenames + +best <- list.files("EN_PDF_BEST_FULL", + full.names = TRUE) + +original <- list.files("EN_PDF_ORIGINAL_FULL", + full.names = TRUE) + +MB <- file.size(best) / 10^6 + +dt1 <- data.table(MB, + rep("BEST", + length(MB))) + + +MB <- file.size(original) / 10^6 + +dt2 <- data.table(MB, rep("ORIGINAL", + length(MB))) + + +dt <- rbind(dt1, + dt2) + +setnames(dt, + "V2", + "variant") + + +#'### Total Size Comparison + +kable(dt[, + .(MB_total = sum(MB)), + keyby = variant], + format = "latex", + align = "r", + booktabs = TRUE, + longtable = TRUE) + + + + +#'### Analyze Files Larger than 10 MB + +# Summarize +summary(dt[MB > 10]$MB) + + + +# Space required by large files + +kable(dt[MB > 10, + .(total = sum(MB)), + keyby = variant], + format = "latex", + align = "r", + booktabs = TRUE, + longtable = TRUE) + + +# Show Individual Large File Sizes + +kable(dt[MB > 10][order(MB)], + format = "latex", + align = "r", + booktabs = TRUE, + longtable = TRUE) + + +#'\newpage +#'### Plot Density Distribution for Files 10MB or Less +dt.plot <- dt[MB <= 10] + + +#+ CD-ICJ_EN_20_FileSizesDensity_Less10MB, fig.height = 6, fig.width = 9 +print( + ggplot(data = dt.plot, + aes(x = MB, + group = variant, + fill = variant))+ + geom_density()+ + theme_bw()+ + facet_wrap(~variant, + 
ncol = 2) + + labs( + title = paste(datashort, + "| EN | Version", + datestamp, + "| Distribution of File Sizes up to 10 MB"), + caption = paste("DOI:", + doi.version), + x = "File Size in MB", + y = "Density" + )+ + scale_fill_viridis(end = 0.35, discrete = TRUE) + + scale_color_viridis(end = 0.35, discrete = TRUE) + + scale_x_continuous(breaks = seq(0, 10, 2))+ + theme( + text = element_text(size= 14), + plot.title = element_text(size = 14, + face = "bold"), + legend.position = "none", + panel.spacing = unit(0.1, + "lines"), + axis.ticks.x = element_blank() + ) +) + +#'\newpage +#'## French + +#'### Corpus Object in RAM + +print(object.size(data.best.en), + humanReadable = TRUE, + units = "MB") + + +#'### Create Data Table of filenames + +best <- list.files("FR_PDF_BEST_FULL", + full.names = TRUE) + +original <- list.files("FR_PDF_ORIGINAL_FULL", + full.names = TRUE) + + +MB <- file.size(best) / 10^6 + +dt1 <- data.table(MB, + rep("BEST", + length(MB))) + + + +MB <- file.size(original) / 10^6 + +dt2 <- data.table(MB, + rep("ORIGINAL", + length(MB))) + +dt <- rbind(dt1, + dt2) + +setnames(dt, + "V2", + "variant") + + + +#'### Total Size Comparison + +kable(dt[, + .(MB_total = sum(MB)), + keyby = variant], + format = "latex", + align = "r", + booktabs = TRUE, + longtable = TRUE) + + +#'### Analyze Files Larger than 10 MB + +summary(dt[MB > 10]$MB) + + + +# Space required by large files + +kable(dt[MB > 10, + .(total = sum(MB)), + keyby = variant], + format = "latex", + align = "r", + booktabs = TRUE, + longtable = TRUE) + + +# Show Individual Large File Sizes + +kable(dt[MB > 10][order(MB)], + format = "latex", + align = "r", + booktabs = TRUE, + longtable = TRUE) + + + + +#'\newpage +#'### Plot Density Distribution for Files 10MB or Less + +dt.plot <- dt[MB <= 10] + +#+ CD-ICJ_FR_20_FileSizesDensity_Less10MB, fig.height = 6, fig.width = 9 +print( + ggplot(data = dt.plot, + aes(x = MB, + group = variant, + fill = variant)) + + geom_density() + + theme_bw() + + facet_wrap(~variant, + ncol=2) + + labs( + title = paste(datashort, + "| FR | Version", + datestamp, + "| Distribution of File Sizes up to 10 MB"), + caption = paste("DOI:", + doi.version), + x = "File Size in MB", + y = "Density" + )+ + scale_fill_viridis(end = 0.35, discrete = TRUE) + + scale_color_viridis(end = 0.35, discrete = TRUE) + + scale_x_continuous(breaks = seq(0, 10, 2))+ + theme( + text = element_text(size = 14), + plot.title = element_text(size = 14, + face = "bold"), + legend.position = "none", + panel.spacing = unit(0.1, + "lines"), + axis.ticks.x = element_blank() + ) +) + + + + +#'# Create ZIP Archives + +#+ +#'## ZIP CSV Files + +csv.zip.name.full.en <- gsub(".csv", + "", + csvname.full.en) + +csv.zip.name.full.fr <- gsub(".csv", + "", + csvname.full.fr) + +csv.zip.name.meta.en <- gsub(".csv", + "", + csvname.meta.en) + +csv.zip.name.meta.fr <- gsub(".csv", + "", + csvname.meta.fr) + + +#+ results = 'hide' +zip(csv.zip.name.full.fr, + csvname.full.fr) + +zip(csv.zip.name.full.en, + csvname.full.en) + +zip(csv.zip.name.meta.fr, + csvname.meta.fr) + +zip(csv.zip.name.meta.en, + csvname.meta.en) + + +#'## ZIP Data Directories + +#' **Note:** Vector of Directories was created at the beginning of the script. 
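+
+#' A minimal sketch of the kind of vector assumed here is given below for
+#' orientation (illustration only, not executed). The authoritative definition
+#' is created at the beginning of this script; only directories that also
+#' appear elsewhere in this report are listed.
+
+#+ eval = FALSE
+dirset <- c("EN_PDF_BEST_FULL",
+            "EN_PDF_ORIGINAL_FULL",
+            "FR_PDF_BEST_FULL",
+            "FR_PDF_ORIGINAL_FULL")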
+ +for (dir in dirset){ + zip(paste(datashort, + datestamp, + dir, + sep = "_"), + dir) +} + + +#'\newpage +#'## ZIP ANALYSIS Directory + +zip(paste(datashort, + datestamp, + "EN-FR", + basename(outputdir), + sep = "_"), + basename(outputdir)) + + + +#'## ZIP Unlabelled Files Directory + +zip(dir.unlabelled, + dir.unlabelled) + + +#'## ZIP Source Files + +files.source <- c(list.files(pattern = "Source"), + "data", + "functions", + "buttons") + + +files.source <- grep("spin", + files.source, + value = TRUE, + ignore.case = TRUE, + invert = TRUE) + +zip(paste(datashort, + datestamp, + "Source_Files.zip", + sep = "_"), + files.source) + + + + +#'# Delete CSV and Directories +#' The metadata CSV files are retained for Codebook generation. + +#+ +#'## Delete CSVs + +unlink(csvname.full.fr) +unlink(csvname.full.en) +unlink(csvname.meta.fr) +unlink(csvname.meta.en) + + + +#'## Delete Data Directories +for (dir in dirset){ + unlink(dir, + recursive = TRUE) +} + +unlink(dir.unlabelled, + recursive = TRUE) + + + +#'# Cryptography Module +#' This module computes two types of hashes for every ZIP archive: SHA2-256 and SHA3-512. These are proof of the authenticity and integrity of data and document that the files are the result of this source code. The SHA-2 and SHA-3 family of algorithms are highly resistant to collision and pre-imaging attacks in reasonable scenarios and can therefore be considered secure according to current public cryptographic research. SHA3 hashes with an output length of 512 bit may even provide sufficient security when attacked with quantum cryptanalysis based on Grover's algorithm. + +#+ +#'## Create Set of ZIP Archives +files.zip <- list.files(pattern = "\\.zip$", + ignore.case = TRUE) + + + +#'## Show Function: f.dopar.multihashes +#+ results = "asis" +print(f.dopar.multihashes) + + +#'## Compute Hashes +multihashes <- f.dopar.multihashes(files.zip) + + +#'## Convert to Data Table +setDT(multihashes) + + + +#'## Add Index +multihashes$index <- seq_len(multihashes[,.N]) + +#'\newpage +#'## Save to Disk +fwrite(multihashes, + paste(datashort, + datestamp, + "CryptographicHashes.csv", + sep = "_"), + na = "NA") + + +#'## Add Whitespace to Enable Automatic Linebreak +#' This is only used for display and will be discarded after printing to the Compilation Report. 
+ +multihashes$sha3.512 <- paste(substr(multihashes$sha3.512, 1, 64), + substr(multihashes$sha3.512, 65, 128)) + + +#'\newpage +#'## Print to Report + +kable(multihashes[,.(index,filename)], + format = "latex", + align = c("p{1cm}", + "p{13cm}"), + booktabs = TRUE, + longtable = TRUE) + + +#'\newpage +kable(multihashes[,.(index,sha2.256)], + format = "latex", + align = c("c", + "p{13cm}"), + booktabs = TRUE, + longtable = TRUE) + + +#'\newpage +kable(multihashes[,.(index,sha3.512)], + format = "latex", + align = c("c", + "p{13cm}"), + booktabs = TRUE, + longtable = TRUE) + + + + + + +#'# Finalize + + +#+ +#'## Datestamp +print(datestamp) + + +#'## Date and Time (Begin) +print(begin.script) + + +#'## Date and Time (End) +end.script <- Sys.time() +print(end.script) + + +#'## Script Runtime +print(end.script - begin.script) + + +#'## Warnings +warnings() + + + +#'# Strict Replication Parameters +sessionInfo() + +system2("openssl", + "version", + stdout = TRUE) + +system2("tesseract", + "-v", + stdout = TRUE) + +system2("convert", + "--version", + stdout = TRUE) + + +print(quanteda_options()) + + + +#+ +#'# References diff --git a/CD-ICJ_Source_FullCompilation.R b/CD-ICJ_Source_FullCompilation.R new file mode 100644 index 0000000..696f8ee --- /dev/null +++ b/CD-ICJ_Source_FullCompilation.R @@ -0,0 +1,22 @@ +#'# Load Package +library(rmarkdown) + +#+ +#'# Data Set +#' To compile the full data set and generate a PDF report, copy all files provided in the Source ZIP Archive into an empty (!) folder and use the command below from within an R session: + +rmarkdown::render(input = "CD-ICJ_Source_CorpusCreation.R", + output_file = paste0("CD-ICJ_", + Sys.Date(), + "_CompilationReport.pdf"), + envir = new.env()) + +#+ +#'# Codebook +#' To compile the Codebook, after you have run the Corpus Creation script, use the command below from within an R session: + +rmarkdown::render(input = "CD-ICJ_Source_CodebookCreation.R", + output_file = paste0("CD-ICJ_", + Sys.Date(), + "_Codebook.pdf"), + envir = new.env()) diff --git a/buttons/MIT0-blue.pdf b/buttons/MIT0-blue.pdf new file mode 100644 index 0000000000000000000000000000000000000000..cf9fde94174a5a74456f15bc096e46ca2abae4e1 GIT binary patch literal 7210 zcmcIpc|26@+izbc@svH`lw>y>Gj@-i?0b=Aj4?C9Ff+;${R;KSlBHzHmLd@2wMki3gUpE|`ph>_}oCz=$6&Q_9CE(m)KG&LJ zp14*aSk6xb!uZF6_$3|=%`0@_@QPbDDTecFlM>Zzb!UDc8jy6t7*Fp_WY@*%?^nx# zE&BRG?hit6aF-WPB;|Oc%9^11qh9pq~f#-4)@}7v{x%y2Cy}4p$ zs3UG7y2JDC*3uovvq@480`EOPcXK^?OpELZ%9g2=IyN-Dc_3(r>-MQ#y(9abpShhq zP+H<8cW$ad=pjdS*Ze;Yx0?4@-_sGucMaXLp&cb#U~t_wtLf-Rvxozg zBZyd`*nuyb?mm-LM<6%&#hSs6Y8d~>c$laV6v&Z3ttbCAz3^(#G&Q3zcKa)?Bb1l6 zOQPcqtUk#<#%{vK9XNkPcE)$~biFYSZ(*~@v2M_7x-teYHtOgc%sY1K4g1&n9vqC| z;k+C^@Cgb$6m94ij5it~2l1g1dNnPKL}>ZYbWc5?pk6A@gXRfo!uzg}^#D56o3Jv{ z0B30uPLlBiGaYq^xEuvD0*&HL#S>@%lmRoS+L!&32gfTLMcdm0PbX767}`)@wr%O#~jaidVpJ#lz2qrXIMl~18M(^h?1R=3PtADv|eSlD2E!BtRlnEx1I zS!S;U8i1nDr~r_$COA+T{a?-3q2gAG0~89^p!|Q@&q(7R99X+smfJ6qS$U96f9k&w zomurqjh$Qxcsir}%)OoHjLoJ^p}I3D&;w&0bKj}|++s*@_c3d4wc4Q-;05J>wbd-s zesY!@4x`mD+m4C~lgr#Z%;%K?TDP1=LV}tZ8~5A=>tJt#CXol#?O9RLZXdTm328ZadFr@z019&Qea0~l(Uy$=dJ zLp;>-84qfD=70;%jmCH@FZU_r$_oXOyHZk&X3kjh-%5%(6A8#*(WqadG-B=VW1$tN z>V2l@+pJ=dV*lU^U;2*SJN0y*1r=p#nVgN@c}O8xc3)a({lAC*$!Dq6e~4ck26W{^D%X?Ywet zBgvh9dP1a0uozN~<@rrV}#X~Kp_8uF3*Obh;b73-ifL(r9Jl077ecU#DUnVy~+2>Vd zU8;0_p;GNdbd{4cXWDBgYON~ zo35=ABUTSP($3{en#A(JQ|N{K7DZ7DW7atjI+%%b$gj%->Kz$x?H3^ z)o(##OweB-j{imEm|GogRwtsSlc?j;=k!w$8?KGG{sIBEA~A#%zxD3Hf85zT6k+tNqSVDw1G2Z^Gm>LbbMRq5&t9|%E!*p 
zW9a9TkKfx5RG)P*Cfc(Qh^MY*KZYkc*t_iZ+K4t0JjzXCE!Z|FDBZg4*g|>mLg^l% zMUTVvcr#b~P0~jF77NWUGA1}EiGBloBc1KM(NdCg6?xp_&z>m_yb4r7)4soc@`iPg z%N}_+qCcY33DYHqyoS^aJzhp?rHtVqCRy09LXX|!nT#RaX6354OJG`C~3;VTXF_VC>d4RR@ zAkj{+H7U_^(;PbsB#}qDv`Qc8FOJtY`k!=<{OYG#r&<~RhOFtc!EH-XL9VC2P4f0J z_4}^6a@>+AWPu3sGw=LcNakuM&-0X*jlR8KKP5ZDhr?=qSHbVoff23#3se3xb z0kJF&wJ`@=MlQ$jp8xQpuZ>dpV$Kdj!tUn0u2hP6k@8-b&PvoF%HO+&cgK(6`SHn- zz}6Z^ss5C49=KH*Sv%?&d7W1po}-nrzQ@*}B_cY^vTwX$nQ0dSW)rEW1OZJ5l%4LNzN@DHG zpV}??<}z>ci%Uu-VQ$KCV0{)_n-pWkTQvYG4 zlO^t@{m&h@mbrWDCUGW4oz%Y`ctJUu&V8XYD@>Vd*SF|Ar50@SZ2J`0M6B+;K+*_LLsknEa`S}D>O3G83TsD{+1t8GKFDv|OCS8ZbO&JQ zjJqYn|s{5Y=RpkKymtn}UeL5FFUC8qnUbZ-D$WHitLQM9_kp4xuCIMv^?619=f2XvuF`6NCcNLwOsRl5Um~}JAofW5>_V+! z()@jLn=H=UI^WLRI>eAG=cIRxPD%Ycdtf@L-Kjmz>Q;@@om+cU|2jc&<`tc-rN`tM zju24^&%Dg}zg*ml)K`*9;Z*I2bR`C9oGX5Ue3G0UHB#e0BWNyDu+_ zI+jke=MLp|0ho(TeMywrj6T)B1W$j|^B*W0vYlHnZt^qFpK^HgkfOX<;9FDi$uJ2@ zljPC~i651TqBpD*o-88Vvm%`M%&$DNufFEuj+M$1iJwW4J77ibe%7mP{hnjuxSB|T zod_~vIzj(4U$$wH=uKrweBos1i}P9{0k_L_QuB&Nxxbv;mksae{Q?WPTVMBadSh`! z9jP?mxA^=Glbrd>Ty25J)m2zW+3DDTiznT(cI66l4Y0q|wI%QDeZy{PktOva6a6mU zEo*mQj!#+g{m&e>MGle~tbSV`pM<+}@rQrh5}?&+a=YE@@$iW&ZcE6x_U3ok=T6Da z7YbrqQ3-V&_w%{~2K4IV_yZPU1NVe2j@U}w;Yy9JwQ&;YJ^YH@E{;1p!DvuTvQosU zI3=?pdq6(|tMnh3t8+J>6H)TY_49Y@zQx4T$HdCi-LmxkLXb&>3PFu(g*j3iucz>Y@g`2~IK-u`+P9Z(vEO z6?H+Mc@tdzAikB0_f5-wv!@X; zQ=wlc@4RbAD-U>G?QS;9O(6w{d~4?2Ww|%n)=rr{`K@$G!&1xJVU{~DQ`3feND8^u ztq04(xb)y-o#^bR)ab8ltzL_#DHe*xd%7>pENLhGD>R!rjJGmtszu)6AC9?PuXRiH z5==mF>V4>;(O*A?YXy@$g@fT%17Z~>kNSdJlAd$cRA^7ZVOMKkr`8Q!uT&iwR40a(J`Xl} z)phLLOzKGejOppPAR6q9$-E3(#oXX2L%E#EQ0|I5H5As@_z-mIqJflx(SVSmtJgPy zYu~*8PBEMJ-msK58C{vr5t#k_RdB3G%!pnHd&4|}t&THNdagbr)JJur4#z9=dH;|q z*o63k`e0OL3n#1iev_^VHCqkchhl}}ihwQW?3mQm@2rjvW#x0k1I0y3ePLy|MdE=^ zraoCaOE?2>KfgZturhDQkLHMEMRh81i{eB7k;uOu`cHB3En0C<#24;VVnVQ`UAHG= zJNu_ae0};ODnio|ie#|3u*$cFM#aZ)vFv;fd;nH^vS*ulIc5W&L8)t<1PqQX9&Usg zO6R`Q_?R@=8s_oUOu?Q$RA*XRF#ey@YB)Q5@$pj|K8}8=6%294%kQiDc1itcPQlx+ zgVc!1nX0>qxV&=q94m!kaX~AEf^)nQ2T6{`vcUy?L4x9vwxiLF9C3b=*VN0@lhq&1 zb@$kO%~f}oN(pt;%XX07Ik+7r3o|a6y3B6eGZkDpWGWf?=B_B0mgyano0FZeU1n1O zAzxFbsDV2LEaR=ibmDJW#OM5F8lQo%#dYvlXii13kHF5{M3|?$owtkALRLji2(v^D zG*)ef&E0?dy)NVBrMit-@|r#guZ{3- z@EUw27w(M@m^21Sb!TWh~qcRbqtl8n{ z_AtjSVy1`C!NWUZj_W$J9Twkmokvb!b39MlbJcT!pRxmCJ5P(^HY~tYLVUKuDU;$p zJUK+AXExPZudO`g*>C7SX(&nF=_%S}*u{}s+H~7--ZAv=cUsBlf7#xuds$*`#_h(P z+aLD7W?~wsK@zMYsNV3SWDU zw2N-^s0LWH=1zxPZL1NTh!juPZKua-3A*;M>Hg~+weQ@HW5W|CfW#wq|6!ZAZ;b!c zQ%2YJuUF|i==V9TFq*ZKK-+|?*5k9f3_O^u6B~CDkZi2)o5-mvM!Vv!Y0T<1w3qN3 zwNQDd&TjmBcKncn`(e2Nhwt=uO2?1a;k6pgJzR&x#pe{dig4QX*T44ZJv~v8+Ay0n z-y@~Y+oI*FC%M;JaR)spA!Dh5>@MA%JsK&~Qfa4ioI@P{B_xHZ{B z&4Wf>Auu#slbz|LpLibxQ~rLVF-!^u4Q;&t-Hif)U}%M`EEbRhbF_9fhJmK7rTqFV zD+4)!VMG_=ps>&eXWkeHie0{CSCD29aH3dxkfHAzU@?H)wK&ZR#`rTQ5TMPhJbf|) z@?=0eE2uBU8^X8OK)9fmpGc1hmE!D;2S2Ka;&Egu1wcw8G13S?ltia{(iGqzkPlLc zRGcS?jHgLcs6;Vl8Eb%_Tp-9<6Z!}ss;Phis~3qxpb;`ivknf4FTokH<}D6={Fka*E)GfdDJ?+7=+Avd2%SQa(>exF9T-%R~iz9TCYO}1%~an zd2%xA{6NZK*69G#0ap31Yh}^cb@F7<^6PAt#mcUWfh-og&Ss1(1PA`A1A{@Y(Ltx; zz|KNtfQsg1KLV6X2oE)*Q0M^U91}%{>v_0Ppbss}rB0*csB}j5F&H@+n6R*xu{P{~ E00ecuw*UYD literal 0 HcmV?d00001 diff --git a/buttons/cc-zero.png b/buttons/cc-zero.png new file mode 100644 index 0000000000000000000000000000000000000000..4ff09a0bb26016cef37b018a23d191be5a1cd0c9 GIT binary patch literal 6447 zcmaKxc{o(z+s6@Q$(pR$w<%kQ$RK0L7`tMuSw;%kCfiI}M2szF82iXFl`J6>V~uQM zPs}J=_T6BVq~)D{zdzphulJvG-`8{Q>ptJ-zOM6J-}589`$9ba69Xq_L=zJ2fycrJqsUiv!CX#lwbrQSteQ7xyM$l$#RS&(Nl&+nx&mS 
zXk75Qn1Hh&1eLGOA~r;}a`8dL>rOXbKReZY9MzKj=KFqxx=m!PG<$4_R6A|CxAlH| zwo&IzL*1Y6mpu%3meD;He5}Sf9XJ1qNEw~Ol9!Dv<=7eJ3Q=6eR~4WA)o|T$mRG;o zHXXCqEi9&J%*Q(a=;p)U;fL#=UY_S#)`Ra4d9-mqJpQq_I5T4Ow*~d*7sy+f|8RMuE8iCvfkOQ4~-J1S(<%2B7?ChRpP? zPcLnFDP(>;5ctRFze`X+Z`3OpepX{Xe;Lhx0j&6MM}7a=@a6uK`+sxUGYpl+aM^$B zUq9*r<+Iev6+RR_{QZqOZ}a|Pk)ZOiuwVj%^drDLxbhc zul)h_7n`>4w|+Fwx3?efG^5}rNUY8qM{R%qpoZLx`5f71Twa-TkQx#(Kg%*@Q{ z=H}+e2;SiM{x2YOV0n3Yl~>B}{vY~?QNAI~#_0D!w9Tyw!wRr$MnPW=Jhsy;^o^A? zqH7Fu6%`YK2Fq%Sc5;%Mn#6ATJqQgA6`=}IIH_^ zw6Eg9G=k=`iIdGRH^c2{etr0Buh#_hT-ykKX}4ghVs~0H>iGNi&c=p?@oucNN>VjO zsQu?g#mw;`E#K{UVPWBUea7PH@7L?R%==|fhVW52Mu&vW)(q*RpWilT@-GMTaG-7Q z!15+Zsx)aYLF&QiX<=3`jmn*s&Z^}?t&UvQgs}o>lvD)5pnBljk;KUHt#d8Y9-QJh zb9q0+tuPd7^F{iJ$00+6q7JRY_{}Jw2^WTO53|$98oyrrNUtj>_pS$$Y3D5P>z*Y{ z??NkMD>_g`NpS4g3+o%tzP16)yPw#Gknfa0#Y!y8%P7lP4M;v-LN-(~(d zpJ5JGoB(V3`!2fp?id5E7-gi$uCc2eONUK-S5JO=@{A)CpjKQ%m{jhZ>3t?e)2(#YFvwcqJPD<(DnYqnNNMrcpFT#r0nisLe^Ua zlAop}wCND$28`KF)rW|(zYKJY*b#^QGj(ZD&Vy46v(9pZAIo=tj8-H@r>Gb_i?;?n zL95j5r~%-UP1pJ`UZdLaq5(6ggBWLI?}G2gl{DADlpiQ{<*4(Uq_QCv{T~X`1(eXw zdX(2wE<jM$cUv z_5}AW(xaIxH1U9d08h#NK-e~6Gk=8M`de|)GlOH(hhNk9aaL1=t;premlJ3~Hy|~9 z7H8$+1|wx|E8F&$##(Vyry*oBcZWmN@3N)Kfi;)YRA%5Ih5VPzWw9XJFQxUBr;=FW zJJjwEc*?*8<+|Ju5uLQOmQUP{{iMgl`m@Ph^xtd&f2VasT8r12f%;G1wEy`6mM@j8 zcjX`)c>ZC${HK<^iPJxJz9i(#KiAVyJ5N0&&Yg)-$nmm@Lk2tOR*MRYpK-e~OPt4n zVRPr>o_zO-4ARZ9N#zdc5Mz@5@toXIOvp@d63rl`gz#wV4O%IjeL4#6FWWt$Ff-vS z!wqjb^ahllu-v3+{V6};ORLIl;iR#8o$!`-W{O^TXUYv4JhsF}M#&{O=tFs0 zsd(~ACs0*Y!RDTWRp5iu#YUXbV#Ltzjkq?(3GzPEEAt`Z^xGRNCpOiE9@TwAQ1E*p>kuurJiUo?fZku#ZdgN^S;Jd`x@BV^ z{!(e^ho>aofP2Do+rPiov4uWM$^*0!`5g{`XS~`Z#Z!dD?PYj(PVKfV32;3MxkdwI zrBla#h6rtPDWBf#)#6oL5?XSJR8i}xfg9splnkWYf)siI!nNl5E8kv8m~-F~*@R&% z#-J0rULWB0C8Lx4C}M>{#0eE=&A4ufX%D=`v+ho91V_e*2?*C1(?-?GE%+3qb#JQD z-BqsDB$?;^)bqY7ZY&43wTrM+_<7ix%>lU{;v{I8OiSh=cWB zbad{)#2Zp+D(Px#FN!0*L(1gBxxB+KuLLE?5!o`zV2eom&FLyykM?wQ_phQy?{5f6XTN0Zpi~XDPnuKLB zoiFS-ol&)Ib8B_2>T8XTg+H(JNRjjcA>iCzAQ)@SQKsuh3Q~{*`W3u` z=(76xkM_;CXcA83MrXMt(Dut&Re`VsGqxj$hRmdFrlXDXH#wn8DGIK5*U5%Hc#q{! 
zDvL#UtSg9&k2NJ*({^@t@a=7sLR%k+uJ_Ehs>3^vYi?N_u5ECxDm9|ah~^e6JTh-; zJ$W)|03kR}E)(=l3`H#?)S)EoVDtHvOO0^ht-D#z;g!uHiXkeW)4P+VihCBy%h&S5 z5KAXTcLQ;A3H3Nm<9O*6^X$m2v&EO~+c;^`D{2xSwKF*_y-qlY0Y8Xz&AzNGOvOUh z_O>_zIKm^;5Oz*PE0nH&$Eh9Cu4wzTo1}8C%AH);&w|w;O`xs*DS!Taqky?=w(`Tl z0%fAE1O49k1?JL4ukDhd@KEB#Ayr3+wGo?o{CLS@hw3^%W9K5TgR}?bPXhPz{Z(@b zNvc726C_;Ur29ZTjs1OI+b$ZD%fPt}(r2JO5!SG<-H&gKBk+s2hoVJW??Wt#m(rx- z#NIODY+>atj-|3^mlL}SQIw8w`z)3?X0p7KbLj5Q_A1duL9bd$vEte*jU52ZJrL5B zm1SwhOD&*}jIhC;kK5BLx)tZ|wU%Wryjs3igL60>ZnrZAvdy{&ntUa42nI%t`+vl% zeN~7oDnIV4wE)t=AgG7U?m4}LrB(mjxZ6#Wjr2pHT#=rnoC@(Lvo)VK$WbmZS;n!!WunTLEF?d@mdA!p&d z-tY(NAK7hgn=6tmUdP+l0s2UNB0`&P7+FW0Li0OQz@42&mxm~2uks~z&kUDdnfgq^ zG+8XE;Mi>4-rqMsKbRs;pY%8Inc0ouhe$j9>qwq|P5z|!LK*y$M;bvWh=dSwMa=6fpuAIV!|T~~QJHvgpW&=W9FnpfNWGk41?`HWm`#SI z1)X9j8AM&_N-dmOYOF%u<<RiPQ`&dHBi+^^xU}jkFWg0L1ZTx+%42I)2!9qqH$pziUQ6-UjJPhlcL4F$1mEKiw zqmE!t@%OpFFxO=r215#N{G33f=y#vB=N^1}R%rNcXK zu&3mctS=}d=n>P+<|Wx3Bf>a`>Nci~P~R&dHm9KKXONGU4w!Q{jqJyuaB5?FcXg-@ z)_w7TqF#*a+FXZzu@QNg^p_<@`$y{Z9|MMqzBqSnm@;tz}Az|QRviH(xKOlHiM>7TcMU~nWVD-@t*3QZ;2I|$Gd96T?SczRZ41|^dRYJ zSgZD?_hNBB-;)eTHOBwrozYlQ*aoTi{p{Ifl0tEztFe_5C+ATVaZ%MSq1lM+Hw{)=mPiRGrbo=;(DxSO<#G z@O0H$*Gccq_QI9#0#S{Ut+eI_lt8X7pj-%tZU{5`xFnFd9PI*AtU4yEcpb$lLLD#_ zm!cvOS3i&NbF_09Ycxm#qeA#*U2iAikPY~4Ji^_*N|mMWLgeiC$?ONvXm7a=!EkWq zklJG0!}fHiRQMM?)2>W#?Y2e_(c%D=*E5CMwJs+( zQ2U~#6lgL^uQ$Jc*EvSw*2);4Nn4v*Pn&s4NEUBOZ&&||5mWHIh8pXZ!O7oUahnWN2DCObCYMpqdZU!=;+{B;9ABy3(tI;@=`nwVyp}D%@AG4db8R9=%gGRR|P!8eY2?I0QOu3Jy z5l@BG#~1k|eK8e*cCrZlNyo!G3THNh52JqYcy$Tt^ zUb@4NbH-kB1j$?4bLG~<+yEVJ!s9pm z8y-v0$cA>b)5V3B^Bj^oOS-~F?zWtDH60%J!u zyg54mVx4T?PSJ-s%|0(-Laii0BRI4YQa^o2F9DWp`9epdmelvfSs$4cE-@7~fAW*p z{g-h2Ca%NT+R|SEXGHXPAFsuZufRN2qp4Qjp-Z#b;VCa8p1Yb_Wx6Nl!4OB{8Vz?K zSIszUr!gYV1UGksOsB_}yg6`Tk>Lzy$>C}VqGf+}$g=I}IDTMJO0G;M9hy2KALZ4> z2C&O~pUE3_)6sC4d8jAqYimHdhqCD2bn%=a+h~ul#sv{Svc?0 znQqN@q-L8+*-hHv&cX%mZ&pJZ6}|%ctZx*s<3CCTT``agF!{_cMVRpygj1VwZ(oVb z*M|Dr4Y|S>%67vd`Zk5Aa^5N^Un4;42Y4jaZP zW~b+dKR(!5SorKM#0(a2cwHN*=_EZ?A|EjKaO+$ws_=_xLqzC=ONPNxc#X0B+scl< zvhgb%`f9(}6#M;qR z`?N}`v@DqXioy{OLIs+rYKGQZPg)^+3F=k=nEWsNVJ!G#79c9(GX>m@uM$uZPTJ2-1ke7d2mVYL8s_{Jg z(4!V(y0nUs`Qwy_^sbcr7TGjK6i0594+}758h$`K5-4uGVlIK2Fm1&hrQ1=+h3|ttvx!Z+f=hcUXIQkI(E-?TT2_Q0q1A-q%7m zHP5ei-j4oA_?p|2r9f3)2Rm8F6AoSJvpAS;l`W`AkIvPd1meL=4PEs6WDinAQb6`~|gyk4Yh zw^P4dxMQkF@aab2>YIj42TRIju%t*ahfxB&^TkMoLHL%7v`#Nh5$mR&*ax3~*Q)p6 z#~6ZZ*v92x?ERc1r~Z!8?|EtM%VVrX21Rq~<4YEkiswf#W;3BMmQ1lZHBwE_*ujM3HjB7%+M^zCT%Nt1~Ickk1 zAKnH>z8X}`q*&$a{bF!MwIj?7gLt*Ij9-6jDI&LWtDGyux;PX6skpBS?r5Q?v0QNF z2|xx5d;he%J8N+x4;dMdL7sjJf_r06J+^!QfG_`^{09XubjO@}fmOo6yjabp+0K~6 zfuW6{5Apj8-N~jOP*t~QUU5a*yb_YAEBVl{v9|)KvT~SfW%(DdX zn()4Q22aed(2ukZfxzehU5(FK!0Q^5pEa{MuIX(mblJ?$!hhC9F)i~;i=*{A1-P4z&Kvq2)5 z{{mwj0bei$0M;cT-O1pV#Il}N0E0uiV>zVNW=-Z#+B;AEKFfI$X@I>}ojMGW7a*^Y z%XIn!EOhfdFSlnBJ88Qp{3^eERP=S`np+eGuOx@S$L!aQRpL64*?t`*-j)8K`;lCk z#5V>9*$nLG7Cml4k#}x}zZruig91IIeU5Yk^y-PrhakE0nh`JSBFM4aR#~7E(mS=j zX7BviSTr|LJ9q$l=l7Z>--X;{jgqgFeUvR~kQ9)t=YP=aRJ!TU z+^AS~WfzubV_`VSGvIRK)u}@DT=GrKaMkTaS z`OMjkgU|{OO!!!3i0GJ08&A@eW9!Ufi7cc_Qx!gT3kd<^N_RdsV?ML4QdJ^Su-S1oGv2j4S1cPR4e&&D%u8BJ3)ME~FM zLt*9Gf9$*9eD2a!$EVWlAPGr-c82q3{pa=>UH%uK|KI%ogXv!4jnq4P3kx7N9)`d; z=K{x@?brT;;BD{z7r(Rpof`Pm3Z&XH9>8w)>*qt}lT68%&dF)n`9(4QfG|N!t-;kG H&)EL~wS8Yu literal 0 HcmV?d00001 diff --git a/data/CD-ICJ_Source_AdvisoryRequestCoding.csv b/data/CD-ICJ_Source_AdvisoryRequestCoding.csv new file mode 100644 index 0000000..cd17710 --- /dev/null +++ b/data/CD-ICJ_Source_AdvisoryRequestCoding.csv @@ -0,0 +1,9 @@ +short,full +CARAT,Committee on 
Applications for Review of Administrative Tribunal Judgements +ECOSOC,UN Economic and Social Council +IFAD,International Fund for Agricultural Development +IMO,Inter-Governmental Maritime Consultative Organization +UNECO,"United Nations Educational, Scientific and Cultural Organization" +UNGA,UN General Assembly +UNSC,United Nations Security Council +WHO,World Health Organization diff --git a/data/CD-ICJ_Source_CaseNames.csv b/data/CD-ICJ_Source_CaseNames.csv new file mode 100644 index 0000000..6a91a43 --- /dev/null +++ b/data/CD-ICJ_Source_CaseNames.csv @@ -0,0 +1,180 @@ +,caseno,casename_short,casename_full +1,1,CorfuChannel_GBR_ALB,Corfu Channel (United Kingdom of Great Britain and Northern Ireland v. Albania) +2,2,NA,NA +3,3,AdmissionUN_UNGA_NA,Conditions of Admission of a State to Membership in the United Nations (Article 4 of the Charter) +4,4,ReparationUN_UNGA_NA,Reparation for Injuries Suffered in the Service of the United Nations +5,5,Fisheries_GBR_NOR,Fisheries (United Kingdom v. Norway) +6,6,FrenchNationalsEgypt_FRA_EGY,Protection of French Nationals and Protected Persons in Egypt (France v. Egypt) +7,7,Asylum_COL_PER,Asylum (Colombia v. Peru) +8,8,PeaceTreaties_UNGA_NA,"Interpretation of Peace Treaties with Bulgaria, Hungary and Romania" +9,9,CompetenceAdmissionGA_UNGA_NA,Competence of the General Assembly for the Admission of a State to the United Nations +10,10,StatusSouthWestAfrica_UNGA_NA,International Status of South West Africa +11,11,USNationalsMorocco_FRA_USA,Rights of Nationals of the United States of America in Morocco (France v. United States of America) +12,12,ReservationsGenocideConvention_UNGA_NA,Reservations to the Convention on the Prevention and Punishment of the Crime of Genocide +13,13,Asylum-Interpretation_COL_PER,Request for Interpretation of the Judgment of 20 November 1950 in the Asylum Case (Colombia v. Peru) +14,14,HayaDeLaTorre_COL_PER,Haya de la Torre (Colombia v. Peru) +15,15,Ambatielos_GRC_GBR,Ambatielos (Greece v. United Kingdom) +16,16,AngloIranianOil_GBR_IRN,Anglo-Iranian Oil Co. (United Kingdom v. Iran) +17,17,MinquiersEcrehos_FRA_GBR,Minquiers and Ecrehos (France/United Kingdom) +18,18,Nottebohm_LIE_GTM,Nottebohm (Liechtenstein v. Guatemala) +19,19,MonetaryGold_ITA_FRA-GBR-USA,"Monetary Gold Removed from Rome in 1943 (Italy v. France, United Kingdom of Great Britain and Northern Ireland and United States of America)" +20,20,ElectriciteBeyrouth_FRA_LBN,Electricité de Beyrouth Company (France v. Lebanon) +21,21,CompensationUNAT_UNGA_NA,Effect of Awards of Compensation Made by the United Nations Administrative Tribunal +22,22,TreatmentAirCrew_USA_HUN,Treatment in Hungary of Aircraft and Crew of United States of America (United States of America v. Hungarian People's Republic) +23,23,TreatmentAirCrew_USA_SUN,Treatment in Hungary of Aircraft and Crew of United States of America (United States of America v. Union of Soviet Socialist Republics) +24,24,VotingProcedureSouthWestAfrica_UNGA_NA,Voting Procedure on Questions relating to Reports and Petitions concerning the Territory of South West Africa +25,25,AerialIncident1953_USA_CSK,Aerial Incident of 10 March 1953 (United States of America v. Czechoslovakia) +26,26,Antarctica_GBR_ARG,Antarctica (United Kingdom v. Argentina) +27,27,Antarctica_GBR_CHL,Antarctica (United Kingdom v. Chile) +28,28,AerialIncident1952_USA_SUN,Aerial Incident of 7 October 1952 (United States of America v. Union of Soviet Socialist Republics) +29,29,NorwegianLoans_FRA_NOR,Certain Norwegian Loans (France v. 
Norway) +30,30,ATILO-UNESCO_UNESCO_NA,Judgments of the Administrative Tribunal of the ILO upon Complaints Made against UNESCO +31,31,PetitionersComitteeSouthWestAfrica_UNGA_NA,Admissibility of Hearings of Petitioners by the Committee on South West Africa +32,32,PassageIndianTerritory_PRT_IND,Right of Passage over Indian Territory (Portugal v. India) +33,33,GuardianshipInfantsConvention_NLD_SWE,Application of the Convention of 1902 Governing the Guardianship of Infants (Netherlands v. Sweden) +34,34,Interhandel_CHE_USA,Interhandel (Switzerland v. United States of America) +35,35,AerialIndicent1955_ISR_BGR,Aerial Incident of 27 July 1955 (Israel v. Bulgaria) +36,36,AerialIndicent1955_USA_BGR,Aerial Incident of 27 July 1955 (United States of America v. Bulgaria) +37,37,AerialIndicent1955_GBR_BGR,Aerial Incident of 27 July 1955 (United Kingdom v. Bulgaria) +38,38,SovereigntyFrontierLand_BEL_NLD,Sovereignty over Certain Frontier Land (Belgium/Netherlands) +39,39,ArbitralAwardKingOfSpain_HND_NIC,Arbitral Award Made by the King of Spain on 23 December 1906 (Honduras v. Nicaragua) +40,40,AerialIncidentSept1954_USA_SUN,Aerial Incident of 4 September 1954 (United States of America v. Union of Soviet Socialist Republics) +41,41,BarcelonaTraction1958_BEL_ESP,"Barcelona Traction, Light and Power Company, Limited (Belgium v. Spain)" +42,42,PortBeyrouthSRO_FRA_LBN,"Compagnie du Port, des Quais et des Entrepôts de Beyrouth and Société Radio-Orient (France v. Lebanon)" +43,43,ConstitutionMaritimeSafetyCommittee_IMO_NA,Constitution of the Maritime Safety Committee of the Inter-Governmental Maritime Consultative Organization +44,44,AerialIncidentNov1954_USA_SUN,Aerial Incident of 7 November 1954 (United States of America v. Union of Soviet Socialist Republics) +45,45,TemplePreahVihear_KHM_THA,Temple of Preah Vihear (Cambodia v. Thailand) +46,46,SouthWestAfrica_ETH_ZAF,South West Africa (Ethiopia v. South Africa) +47,47,SouthWestAfrica_LBR_ZAF,South West Africa (Liberia v. South Africa) +48,48,NorthernCameroons_CMR_GBR,Northern Cameroons (Cameroon v. United Kingdom) +49,49,CertainExpensesUN_UNGA_NA,"Certain Expenses of the United Nations (Article 17, paragraph 2, of the Charter)" +50,50,BarcelonaTraction1962_BEL_ESP,"Barcelona Traction, Light and Power Company, Limited (Belgium v. Spain) (New Application: 1962)" +51,51,NorthSeaContinentalShelf_DEU_DNK,North Sea Continental Shelf (Federal Republic of Germany/Denmark) +52,52,NorthSeaContinentalShelf_DEU_NLD,North Sea Continental Shelf (Federal Republic of Germany/Netherlands) +53,53,Namibia_UNSC_NA,Legal Consequences for States of the Continued Presence of South Africa in Namibia (South West Africa) notwithstanding Security Council Resolution 276 (1970) +54,54,ICAOCouncil_IND_PAK,Appeal Relating to the Jurisdiction of the ICAO Council (India v. Pakistan) +55,55,FisheriesJurisdiction_GBR_ISL,Fisheries Jurisdiction (United Kingdom v. Iceland) +56,56,FisheriesJurisdiction_DEU_ISL,Fisheries Jurisdiction (Federal Republic of Germany v. Iceland) +57,57,ReviewJudgment158UNAT_CARAT_NA,Application for Review of Judgment No. 158 of the United Nations Administrative Tribunal +58,58,NuclearTests_AUS_FRA,Nuclear Tests (Australia v. France) +59,59,NuclearTests_NZL_FRA,Nuclear Tests (New Zealand v. France) +60,60,TrialPakistaniPOW_PAK_IND,Trial of Pakistani Prisoners of War (Pakistan v. India) +61,61,WesternSahara_UNGA_NA,Western Sahara +62,62,AegeanSeaContinentalShelf_GRC_TUR,Aegean Sea Continental Shelf (Greece v. 
Turkey) +63,63,ContinentalShelf_TUN_LBY,Continental Shelf (Tunisia/Libyan Arab Jamahiriya) +64,64,USDiplomaticStaffTehran_USA_IRN,United States Diplomatic and Consular Staff in Tehran (United States of America v. Iran) +65,65,WHO-EgyptAgreement_WHO_NA,Interpretation of the Agreement of 25 March 1951 between the WHO and Egypt +66,66,ReviewJudgment273UNAT_CARAT_NA,Application for Review of Judgment No. 273 of the United Nations Administrative Tribunal +67,67,GulfOfMaine_CAN_USA,Delimitation of the Maritime Boundary in the Gulf of Maine Area (Canada/United States of America) +68,68,ContinentalShelf_LBY_MLT,Continental Shelf (Libyan Arab Jamahiriya/Malta) +69,69,FrontierDispute_BFA_MLI,Frontier Dispute (Burkina Faso/Republic of Mali) +70,70,MilitaryParamilitaryActivitiesNicaragua_NIC_USA,Military and Paramilitary Activities in and against Nicaragua (Nicaragua v. United States of America) +71,71,ContinentalShelf-InterpretationRevision_TUN_LBY,Application for Revision and Interpretation of the Judgment of 24 February 1982 in the Case concerning the Continental Shelf (Tunisia/Libyan Arab Jamahiriya) (Tunisia v. Libyan Arab Jamahiriya) +72,72,ReviewJudgment333UNAT_CARAT_NA,Application for Review of Judgment No. 333 of the United Nations Administrative Tribunal +73,73,TransborderArmedActions_NIC_CRI,Border and Transborder Armed Actions (Nicaragua v. Costa Rica) +74,74,TransborderArmedActions_NIC_HND,Border and Transborder Armed Actions (Nicaragua v. Honduras) +75,75,LandIslandMaritimeFrontier_SLV_HND,"Land, Island and Maritime Frontier Dispute (El Salvador/Honduras: Nicaragua intervening)" +76,76,ELSI_GBR_ITA,Elettronica Sicula S.p.A. (ELSI) (United States of America v. Italy) +77,77,ArbitrationUNHQAgreement_UNGA_NA,Applicability of the Obligation to Arbitrate under Section 21 of the United Nations Headquarters Agreement of 26 June 1947 +78,78,MaritimeDelimitation-GreenlandJanMayen_DNK_NOR,Maritime Delimitation in the Area between Greenland and Jan Mayen (Denmark v. Norway) +79,79,AerialIncident1988_IRN_USA,Aerial Incident of 3 July 1988 (Islamic Republic of Iran v. United States of America) +80,80,CertainPhosphateLands_NRU_AUS,Certain Phosphate Lands in Nauru (Nauru v. Australia) +81,81,ConventionPrivilegesImmunitiesUN_ECOSOC_NA,"Applicability of Article VI, Section 22, of the Convention on the Privileges and Immunities of the United Nations" +82,82,ArbitralAward1989_GNB_SEN,Arbitral Award of 31 July 1989 (Guinea-Bissau v. Senegal) +83,83,TerritorialDispute_LBY_TCD,Territorial Dispute (Libyan Arab Jamahiriya/Chad) +84,84,EastTimor_PRT_AUS,East Timor (Portugal v. Australia) +85,85,MaritimeDelimitation_GNB_SEN,Maritime Delimitation between Guinea-Bissau and Senegal (Guinea-Bissau v. Senegal) +86,86,PassageGreatBelt_FIN_DNK,Passage through the Great Belt (Finland v. Denmark) +87,87,MaritimeDelimitation_QAT_BHR,Maritime Delimitation and Territorial Questions between Qatar and Bahrain (Qatar v. Bahrain) +88,88,Lockerbie_LBY_GBR,Questions of Interpretation and Application of the 1971 Montreal Convention arising from the Aerial Incident at Lockerbie (Libyan Arab Jamahiriya v. United Kingdom) +89,89,Lockerbie_LBY_USA,Questions of Interpretation and Application of the 1971 Montreal Convention arising from the Aerial Incident at Lockerbie (Libyan Arab Jamahiriya v. United States of America) +90,90,OilPlatforms_IRN_USA,Oil Platforms (Islamic Republic of Iran v. 
United States of America) +91,91,ApplicationGenocideConvention_BIH_SCG,Application of the Convention on the Prevention and Punishment of the Crime of Genocide (Bosnia and Herzegovina v. Serbia and Montenegro) +92,92,GabcikovoNagymaros_HUN_SVK,Gabčíkovo-Nagymaros Project (Hungary/Slovakia) +93,93,LegalityNuclearWeaponsArmedConflict_WHO_NA,Legality of the Use by a State of Nuclear Weapons in Armed Conflict +94,94,LandMaritimeBoundary_CMR_NGA,Land and Maritime Boundary between Cameroon and Nigeria (Cameroon v. Nigeria: Equatorial Guinea intervening) +95,95,LegalityThreatUseNuclearWeapons_UNGA_NA,Legality of the Threat or Use of Nuclear Weapons +96,96,FisheriesJurisdiction_ESP_CAN,Fisheries Jurisdiction (Spain v. Canada) +97,97,NuclearTests-ExaminationSituation_NZL_FRA,Request for an Examination of the Situation in Accordance with Paragraph 63 of the Court's Judgment of 20 December 1974 in the Nuclear Tests (New Zealand v. France) Case +98,98,KasikiliSedudu_BWA_NAM,Kasikili/Sedudu Island (Botswana/Namibia) +99,99,ViennaConventionConsularRelations_PRY_USA,Vienna Convention on Consular Relations (Paraguay v. United States of America) +100,100,ImmunitySRCommHR_ECOSOC_NA,Difference Relating to Immunity from Legal Process of a Special Rapporteur of the Commission on Human Rights +101,101,LandMaritimeBoundary-Interpretation_CMR_NGA,"Request for Interpretation of the Judgment of 11 June 1998 in the Case concerning the Land and Maritime Boundary between Cameroon and Nigeria (Cameroon v. Nigeria), Preliminary Objections (Nigeria v. Cameroon)" +102,102,SovereigntyPulau_IDN_MYS,Sovereignty over Pulau Ligitan and Pulau Sipadan (Indonesia/Malaysia) +103,103,Diallo_GIN_COD,Ahmadou Sadio Diallo (Republic of Guinea v. Democratic Republic of the Congo) +104,104,LaGrand_DEU_USA,LaGrand (Germany v. United States of America) +105,105,UseOfForce_SCG_BEL,Legality of Use of Force (Serbia and Montenegro v. Belgium) +106,106,UseOfForce_SCG_CAN,Legality of Use of Force (Serbia and Montenegro v. Canada) +107,107,UseOfForce_SCG_FRA,Legality of Use of Force (Serbia and Montenegro v. France) +108,108,UseOfForce_SCG_DEU,Legality of Use of Force (Serbia and Montenegro v. Germany) +109,109,UseOfForce_SCG_ITA,Legality of Use of Force (Serbia and Montenegro v. Italy) +110,110,UseOfForce_SCG_NLD,Legality of Use of Force (Serbia and Montenegro v. Netherlands) +111,111,UseOfForce_SCG_PRT,Legality of Use of Force (Serbia and Montenegro v. Portugal) +112,112,UseOfForce_YUG_ESP,Legality of Use of Force (Yugoslavia v. Spain) +113,113,UseOfForce_SCG_GBR,Legality of Use of Force (Serbia and Montenegro v. United Kingdom) +114,114,UseOfForce_YUG_ESP,Legality of Use of Force (Yugoslavia v. United States of America) +115,115,ArmedActivities_COD_BDI,Armed Activities on the Territory of the Congo (Democratic Republic of the Congo v. Burundi) +116,116,ArmedActivities_COD_UGA,Armed Activities on the Territory of the Congo (Democratic Republic of the Congo v. Uganda) +117,117,ArmedActivities_COD_RWA,Armed Activities on the Territory of the Congo (Democratic Republic of the Congo v. Rwanda) +118,118,ApplicationGenocideConvention_HRV_SRB,Application of the Convention on the Prevention and Punishment of the Crime of Genocide (Croatia v. Serbia) +119,119,AerialIncident1999_PAK_IND,Aerial Incident of 10 August 1999 (Pakistan v. India) +120,120,TerritorialDispute-CaribbeanSea_NIC_HND,Territorial and Maritime Dispute between Nicaragua and Honduras in the Caribbean Sea (Nicaragua v. 
Honduras) +121,121,ArrestWarrant_COD_BEL,Arrest Warrant of 11 April 2000 (Democratic Republic of the Congo v. Belgium) +122,122,ApplicationGenocideConvention-Revision_BIH_YUG,"Application for Revision of the Judgment of 11 July 1996 in the Case concerning Application of the Convention on the Prevention and Punishment of the Crime of Genocide (Bosnia and Herzegovina v. Yugoslavia), Preliminary Objections (Yugoslavia v. Bosnia and Herzegovina)" +123,123,CertainProperty_LIE_DEU,Certain Property (Liechtenstein v. Germany) +124,124,TerritorialDispute_NIC_COL,Territorial and Maritime Dispute (Nicaragua v. Colombia) +125,125,FrontierDispute_BEN_NER,Frontier Dispute (Benin/Niger) +126,126,ArmedActivitiesApp2002_COD_RWA,Armed Activities on the Territory of the Congo (New Application: 2002) (Democratic Republic of the Congo v. Rwanda) +127,127,LandIslandMaritimeFrontier-Revision_SLV_HND,"Application for Revision of the Judgment of 11 September 1992 in the Case concerning the Land, Island and Maritime Frontier Dispute (El Salvador/Honduras: Nicaragua intervening) (El Salvador v. Honduras)" +128,128,Avena_MEX_USA,Avena and Other Mexican Nationals (Mexico v. United States of America) +129,129,CertainCriminalProceedings_COD_FRA,Certain Criminal Proceedings in France (Republic of the Congo v. France) +130,130,PedraBranca_MYS_SGP,"Sovereignty over Pedra Branca/Pulau Batu Puteh, Middle Rocks and South Ledge (Malaysia/Singapore)" +131,131,ConstructionWallOPT_UNGA_NA,Legal Consequences of the Construction of a Wall in the Occupied Palestinian Territory +132,132,MaritimeDelimitation-BlackSea_ROU_UKR,Maritime Delimitation in the Black Sea (Romania v. Ukraine) +133,133,NavigationalRights_CRI_NIC,Dispute regarding Navigational and Related Rights (Costa Rica v. Nicaragua) +134,134,DiplomaticEnvoyUN_DMA_CHE,Status vis-à-vis the Host State of a Diplomatic Envoy to the United Nations (Commonwealth of Dominica v. Switzerland) +135,135,PulpMills_ARG_URY,Pulp Mills on the River Uruguay (Argentina v. Uruguay) +136,136,MutualAssistanceCriminalMatters_DJI_FRA,Certain Questions of Mutual Assistance in Criminal Matters (Djibouti v. France) +137,137,MaritimeDispute_PER_CHL,Maritime Dispute (Peru v. Chile) +138,138,AerialHerbicideSpraying_ECU_COL,Aerial Herbicide Spraying (Ecuador v. Colombia) +139,139,Avena-Interpretation_MEX_USA,Request for Interpretation of the Judgment of 31 March 2004 in the Case concerning Avena and Other Mexican Nationals (Mexico v. United States of America) (Mexico v. United States of America) +140,140,ICERD_GEO_RUS,Application of the International Convention on the Elimination of All Forms of Racial Discrimination (Georgia v. Russian Federation) +141,141,IndependenceDeclarationKosovo_UNGA_NA,Accordance with international law of the unilateral declaration of independence in respect of Kosovo +142,142,InterimAccord1995_MKD_GRC,Application of the Interim Accord of 13 September 1995 (the former Yugoslav Republic of Macedonia v. Greece) +143,143,JurisdictionalImmunities_DEU_ITA,Jurisdictional Immunities of the State (Germany v. Italy: Greece intervening) +144,144,ObligationProsecuteExtradite_BEL_SEN,Questions relating to the Obligation to Prosecute or Extradite (Belgium v. Senegal) +145,145,JudgmentsCivilCommercialMatters_BEL_CHE,Jurisdiction and Enforcement of Judgments in Civil and Commercial Matters (Belgium v. 
Switzerland) +146,146,Judgment2867ATILO-IFAD_IFAD_NA,Judgment No.2867 of the Administrative Tribunal of the International Labour Organization upon a Complaint Filed against the International Fund for Agricultural Development +147,147,DiplomaticRelations_HND_BRA,Certain Questions concerning Diplomatic Relations (Honduras v. Brazil) +148,148,WhalingAntarctic_AUS_JPN,Whaling in the Antarctic (Australia v. Japan: New Zealand intervening) +149,149,FrontierDispute_BFA_NER,Frontier Dispute (Burkina Faso/Niger) +150,150,CertainActivitiesBorderArea_CRI_NIC,Certain Activities Carried Out by Nicaragua in the Border Area (Costa Rica v. Nicaragua) +151,151,TemplePreahVihear-Interpretation_KHM_THA,Request for Interpretation of the Judgment of 15 June 1962 in the Case concerning the Temple of Preah Vihear (Cambodia v. Thailand) (Cambodia v. Thailand) +152,152,SanJuanRiver_NIC_CRI,Construction of a Road in Costa Rica along the San Juan River (Nicaragua v. Costa Rica) +153,153,AccessPacificOcean_BOL_CHL,Obligation to Negotiate Access to the Pacific Ocean (Bolivia v. Chile) +154,154,DelimitationContinentalShelf_NIC_COL,Question of the Delimitation of the Continental Shelf between Nicaragua and Colombia beyond 200 nautical miles from the Nicaraguan Coast (Nicaragua v. Colombia) +155,155,SovereignRightsCaribbeanSea_NIC_COL,Alleged Violations of Sovereign Rights and Maritime Spaces in the Caribbean Sea (Nicaragua v. Colombia) +156,156,CertainDocumentsSeizure_TLS_AUS,Questions relating to the Seizure and Detention of Certain Documents and Data (Timor-Leste v. Australia) +157,157,MaritimeDelimitation-CaribbeanPacific_CRI_NIC,Maritime Delimitation in the Caribbean Sea and the Pacific Ocean (Costa Rica v. Nicaragua) +158,158,NuclearDisarmament_MHL_IND,Obligations concerning Negotiations relating to Cessation of the Nuclear Arms Race and to Nuclear Disarmament (Marshall Islands v. India) +159,159,NuclearDisarmament_MHL_PAK,Obligations concerning Negotiations relating to Cessation of the Nuclear Arms Race and to Nuclear Disarmament (Marshall Islands v. Pakistan) +160,160,NuclearDisarmament_MHL_GBR,Obligations concerning Negotiations relating to Cessation of the Nuclear Arms Race and to Nuclear Disarmament (Marshall Islands v. United Kingdom) +161,161,MaritimeDelimitation-IndianOcean_SOM_KEN,Maritime Delimitation in the Indian Ocean (Somalia v. Kenya) +162,162,SilalaWaters_CHL_BOL,Dispute over the Status and Use of the Waters of the Silala (Chile v. Bolivia) +163,163,ImmunitiesCriminalProceedings_GNQ_FRA,Immunities and Criminal Proceedings (Equatorial Guinea v. France) +164,164,IranianAssets_IRN_USA,Certain Iranian Assets (Islamic Republic of Iran v. United States of America) +165,165,IslaPortillos_CRI_NIC,Land Boundary in the Northern Part of Isla Portillos (Costa Rica v. Nicaragua) +166,166,ConventionTerrorismFinancingCERD_UKR_RUS,Application of the International Convention for the Suppression of the Financing of Terrorism and of the International Convention on the Elimination of All Forms of Racial Discrimination (Ukraine v. Russian Federation) +167,167,PedraBranca-Revision_MYS_SGP,"Application for revision of the Judgment of 23 May 2008 in the case concerning Sovereignty over Pedra Branca/Pulau Batu Puteh, Middle Rocks and South Ledge (Malaysia/Singapore) (Malaysia v. Singapore)" +168,168,Jadhav_IND_PAK,Jadhav (India v. 
Pakistan) +169,169,ChagosArchipelago_UNGA_NA,Legal Consequences of the Separation of the Chagos Archipelago from Mauritius in 1965 +170,170,PedraBranca-Interpretation_MYS_SGP,"Request for Interpretation of the Judgment of 23 May 2008 in the case concerning Sovereignty over Pedra Branca/Pulau Batu Puteh, Middle Rocks and South Ledge (Malaysia/Singapore) (Malaysia v. Singapore)" +171,171,ArbitralAward1899_GUY_VEN,Arbitral Award of 3 October 1899 (Guyana v. Venezuela) +172,172,CERD_QAT_ARE,Application of the International Convention on the Elimination of All Forms of Racial Discrimination (Qatar v. United Arab Emirates) +173,173,ICAOCouncil-CICA_BHR-EGY-SAU-ARE_QAT,"Appeal Relating to the Jurisdiction of the ICAO Council under Article 84 of the Convention on International Civil Aviation (Bahrain, Egypt, Saudi Arabia and United Arab Emirates v. Qatar)" +174,174,ICAOCouncil-IASTA_BHR-EGY-ARE_QAT,"Appeal Relating to the Jurisdiction of the ICAO Council under Article II, Section 2, of the 1944 International Air Services Transit Agreement (Bahrain, Egypt and United Arab Emirates v. Qatar)" +175,175,1955AmityTreaty_IRN_USA,"Alleged violations of the 1955 Treaty of Amity, Economic Relations, and Consular Rights (Islamic Republic of Iran v. United States of America)" +176,176,RelocationEmbassyUSJerusalem_PSE_USA,Relocation of the United States Embassy to Jerusalem (Palestine v. United States of America) +177,177,GuatemalaTerritorialInsularMaritimeClaim_GTM_BLZ,"Guatemala’s Territorial, Insular and Maritime Claim (Guatemala/Belize)" +178,178,ApplicationGenocideConvention_GMB_MMR,Application of the Convention on the Prevention and Punishment of the Crime of Genocide (The Gambia v. Myanmar) +179,179,LandMaritimeDelimitationSovereigntyIslands_GAB_GNQ,Land and Maritime Delimitation and Sovereignty over Islands (Gabon/Equatorial Guinea) diff --git a/data/CD-ICJ_Source_CountryCodes.csv b/data/CD-ICJ_Source_CountryCodes.csv new file mode 100644 index 0000000..72bd6a7 --- /dev/null +++ b/data/CD-ICJ_Source_CountryCodes.csv @@ -0,0 +1,106 @@ +ISO3,name,region,subregion +ALB,Albania,Europe,Southern Europe +ARE,United Arab Emirates,Asia,Western Asia +ARG,Argentina,Americas,Latin America and the Caribbean +AUS,Australia,Oceania,Australia and New Zealand +BDI,Burundi,Africa,Sub-Saharan Africa +BEL,Belgium,Europe,Western Europe +BEN,Benin,Africa,Sub-Saharan Africa +BFA,Burkina Faso,Africa,Sub-Saharan Africa +BGR,Bulgaria,Europe,Eastern Europe +BHR,Bahrain,Asia,Western Asia +BIH,Bosnia and Herzegovina,Europe,Southern Europe +BLZ,Belize,Americas,Latin America and the Caribbean +BOL,Bolivia,Americas,Latin America and the Caribbean +BRA,Brazil,Americas,Latin America and the Caribbean +BWA,Botswana,Africa,Sub-Saharan Africa +CAN,Canada,Americas,Northern America +CHE,Switzerland,Europe,Western Europe +CHL,Chile,Americas,Latin America and the Caribbean +CMR,Cameroon,Africa,Sub-Saharan Africa +COD,Democratic Republic of the Congo,Africa,Sub-Saharan Africa +COL,Colombia,Americas,Latin America and the Caribbean +CRI,Costa Rica,Americas,Latin America and the Caribbean +CSK,Czechoslovakia,Europe,Eastern Europe +DEU,Germany,Europe,Western Europe +DJI,Djibouti,Africa,Sub-Saharan Africa +DMA,Dominica,Americas,Latin America and the Caribbean +DNK,Denmark,Europe,Northern Europe +ECU,Ecuador,Americas,Latin America and the Caribbean +EGY,Egypt,Africa,Northern Africa +ESP,Spain,Europe,Southern Europe +ETH,Ethiopia,Africa,Sub-Saharan Africa +FIN,Finland,Europe,Northern Europe +FRA,France,Europe,Western Europe 
+GAB,Gabon,Africa,Sub-Saharan Africa +GBR,United Kingdom,Europe,Northern Europe +GEO,Georgia,Asia,Western Asia +GIN,Guinea,Africa,Sub-Saharan Africa +GMB,Gambia,Africa,Sub-Saharan Africa +GNB,Guinea-Bissau,Africa,Sub-Saharan Africa +GNQ,Equatorial Guinea,Africa,Sub-Saharan Africa +GRC,Greece,Europe,Southern Europe +GTM,Guatemala,Americas,Latin America and the Caribbean +GUY,Guyana,Americas,Latin America and the Caribbean +HND,Honduras,Americas,Latin America and the Caribbean +HRV,Croatia,Europe,Southern Europe +HUN,Hungary,Europe,Eastern Europe +IDN,Indonesia,Asia,South-eastern Asia +IND,India,Asia,Southern Asia +IRN,Iran,Asia,Southern Asia +ISL,Iceland,Europe,Northern Europe +ISR,Israel,Asia,Western Asia +ITA,Italy,Europe,Southern Europe +JPN,Japan,Asia,Eastern Asia +KEN,Kenia,Africa,Sub-Saharan Africa +KHM,Cambodia,Asia,South-eastern Asia +LBN,Lebanon,Asia,Western Asia +LBR,Liberia,Africa,Sub-Saharan Africa +LBY,Libya,Africa,Northern Africa +LIE,Liechtenstein,Europe,Western Europe +MEX,Mexico,Americas,Latin America and the Caribbean +MHL,Marshall Islands,Oceania,Micronesia +MKD,North Macedonia,Europe,Southern Europe +MLI,Mali,Africa,Sub-Saharan Africa +MLT,Malta,Europe,Southern Europe +MMR,Myanmar,Asia,South-eastern Asia +MYS,Malaysia,Asia,South-eastern Asia +NAM,Namibia,Africa,Sub-Saharan Africa +NER,Niger,Africa,Sub-Saharan Africa +NGA,Nigeria,Africa,Sub-Saharan Africa +NIC,Nicaragua,Americas,Latin America and the Caribbean +NLD,Netherlands,Europe,Western Europe +NOR,Norway,Europe,Northern Europe +NRU,Nauru,Oceania,Micronesia +NZL,New Zealand,Oceania,Australia and New Zealand +PAK,Pakistan,Asia,Southern Asia +PER,Peru,Americas,Latin America and the Caribbean +PRT,Portugal,Europe,Southern Europe +PRY,Paraguay,Americas,Latin America and the Caribbean +PSE,Palestine,Asia,Western Asia +QAT,Qatar,Asia,Western Asia +ROU,Romania,Europe,Eastern Europe +RUS,Russia,Europe,Eastern Europe +RWA,Rwanda,Africa,Sub-Saharan Africa +SAU,Saudi-Arabia,Asia,Western Asia +SCG,Serbia and Montenegro,Europe,Southern Europe +SEN,Senegal,Africa,Sub-Saharan Africa +SGP,Singapore,Asia,South-eastern Asia +SLV,El Salvador,Americas,Latin America and the Caribbean +SOM,Somalia,Africa,Sub-Saharan Africa +SRB,Serbia,Europe,Southern Europe +SUN,Soviet Union,Europe,Eastern Europe +SVK,Slovakia,Europe,Eastern Europe +SWE,Sweden,Europe,Northern Europe +TCD,Chad,Africa,Sub-Saharan Africa +THA,Thailand,Asia,South-eastern Asia +TLS,Timor Leste,Asia,South-eastern Asia +TUN,Tunisia,Africa,Northern Africa +TUR,Turkey,Asia,Western Asia +UGA,Uganda,Africa,Sub-Saharan Africa +UKR,Ukraine,Europe,Eastern Europe +URY,Uruguay,Americas,Latin America and the Caribbean +USA,United States of America,Americas,Northern America +VEN,Venezuela,Americas,Latin America and the Caribbean +YUG,Yugoslavia,Europe,Southern Europe +ZAF,South Africa,Africa,Sub-Saharan Africa diff --git a/data/CD-ICJ_Source_Stages_Filenames.csv b/data/CD-ICJ_Source_Stages_Filenames.csv new file mode 100644 index 0000000..9799208 --- /dev/null +++ b/data/CD-ICJ_Source_Stages_Filenames.csv @@ -0,0 +1,144 @@ +old,new +1948-03-25_JUD_01,1948-03-25_JUD_01_PO +1949-04-09_JUD_01,1949-04-09_JUD_01_ME +1949-12-15_JUD_01,1949-12-15_JUD_01_CO +1951-12-18_JUD_01,1951-12-18_JUD_01_ME +1950-11-20_JUD_01,1950-11-20_JUD_01_ME +1952-08-27_JUD_01,1952-08-27_JUD_01_ME +1950-11-27_JUD_01,1950-11-27_JUD_01_ME +1951-06-13_JUD_01,1951-06-13_JUD_01_ME +1952-07-01_JUD_01,1952-07-01_JUD_01_PO +1953-05-19_JUD_01,1953-05-19_JUD_01_ME +1952-07-22_JUD_01,1952-07-22_JUD_01_PO 
+1953-11-17_JUD_01,1953-11-17_JUD_01_ME +1953-11-18_JUD_01,1953-11-18_JUD_01_PO +1955-04-06_JUD_01,1955-04-06_JUD_01_ME +1954-06-15_JUD_01,1954-06-15_JUD_01_PO +1957-07-06_JUD_01,1957-07-06_JUD_01_PO +1957-11-26_JUD_01,1957-11-26_JUD_01_PO +1960-04-12_JUD_01,1960-04-12_JUD_01_ME +1958-11-28_JUD_01,1958-11-28_JUD_01_ME +1959-03-21_JUD_01,1959-03-21_JUD_01_PO +1959-05-26_JUD_01,1959-05-26_JUD_01_PO +1959-06-20_JUD_01,1959-06-20_JUD_01_ME +1960-11-18_JUD_01,1960-11-18_JUD_01_ME +1961-05-26_JUD_01,1961-05-26_JUD_01_PO +1962-06-15_JUD_01,1962-06-15_JUD_01_ME +1962-12-21_JUD_01,1962-12-21_JUD_01_PO +1966-07-18_JUD_01,1966-07-18_JUD_01_ME +1962-12-21_JUD_01,1962-12-21_JUD_01_PO +1966-07-18_JUD_01,1966-07-18_JUD_01_ME +1963-12-02_JUD_01,1963-12-02_JUD_01_PO +1964-07-24_JUD_01,1964-07-24_JUD_01_PO +1970-02-05_JUD_01,1970-02-05_JUD_01_ME +1969-02-20_JUD_01,1969-02-20_JUD_01_ME +1969-02-20_JUD_01,1969-02-20_JUD_01_ME +1972-08-18_JUD_01,1972-08-18_JUD_01_ME +1973-02-02_JUD_01,1973-02-02_JUD_01_PO +1974-07-25_JUD_01,1974-07-25_JUD_01_ME +1973-02-02_JUD_01,1973-02-02_JUD_01_PO +1974-07-25_JUD_01,1974-07-25_JUD_01_ME +1974-12-20_JUD_01,1974-12-20_JUD_01_ME +1974-12-20_JUD_01,1974-12-20_JUD_01_ME +1978-12-19_JUD_01,1978-12-19_JUD_01_PO +1981-04-14_JUD_01,1981-04-14_JUD_01_IN +1982-02-24_JUD_01,1982-02-24_JUD_01_ME +1980-05-24_JUD_01,1980-05-24_JUD_01_ME +1984-10-12_JUD_01,1984-10-12_JUD_01_ME +1984-03-21_JUD_01,1984-03-21_JUD_01_IN +1985-06-03_JUD_01,1985-06-03_JUD_01_ME +1986-12-22_JUD_01,1986-12-22_JUD_01_ME +1984-11-26_JUD_01,1984-11-26_JUD_01_PO +1986-06-27_JUD_01,1986-06-27_JUD_01_ME +1985-12-10_JUD_01,1985-12-10_JUD_01_ME +1988-12-20_JUD_01,1988-12-20_JUD_01_PO +1990-09-13_JUD_01,1990-09-13_JUD_01_IN +1992-09-11_JUD_01,1992-09-11_JUD_01_ME +1989-07-20_JUD_01,1989-07-20_JUD_01_ME +1993-06-14_JUD_01,1993-06-14_JUD_01_ME +1992-06-26_JUD_01,1992-06-26_JUD_01_PO +1991-11-12_JUD_01,1991-11-12_JUD_01_ME +1994-02-03_JUD_01,1994-02-03_JUD_01_ME +1995-06-30_JUD_01,1995-06-30_JUD_01_PO +1994-07-01_JUD_01,1994-07-01_JUD_01_PO +1995-02-15_JUD_01,1995-02-15_JUD_01_PO +2001-03-16_JUD_01,2001-03-16_JUD_01_ME +1998-02-27_JUD_01,1998-02-27_JUD_01_PO +1998-02-27_JUD_01,1998-02-27_JUD_01_PO +1996-12-12_JUD_01,1996-12-12_JUD_01_PO +2003-11-06_JUD_01,2003-11-06_JUD_01_ME +1996-07-11_JUD_01,1996-07-11_JUD_01_PO +2007-02-26_JUD_01,2007-02-26_JUD_01_ME +1997-09-25_JUD_01,1997-09-25_JUD_01_ME +1998-06-11_JUD_01,1998-06-11_JUD_01_PO +2002-10-10_JUD_01,2002-10-10_JUD_01_ME +1998-12-04_JUD_01,1998-12-04_JUD_01_PO +1999-12-13_JUD_01,1999-12-13_JUD_01_ME +1999-03-25_JUD_01,1999-03-25_JUD_01_PO +2001-10-23_JUD_01,2001-10-23_JUD_01_IN +2002-12-17_JUD_01,2002-12-17_JUD_01_ME +2007-05-24_JUD_01,2007-05-24_JUD_01_PO +2010-11-30_JUD_01,2010-11-30_JUD_01_ME +2012-06-19_JUD_01,2012-06-19_JUD_01_CO +2001-06-27_JUD_01,2001-06-27_JUD_01_ME +2004-12-15_JUD_01,2004-12-15_JUD_01_PO +2004-12-15_JUD_01,2004-12-15_JUD_01_PO +2004-12-15_JUD_01,2004-12-15_JUD_01_PO +2004-12-15_JUD_01,2004-12-15_JUD_01_PO +2004-12-15_JUD_01,2004-12-15_JUD_01_PO +2004-12-15_JUD_01,2004-12-15_JUD_01_PO +2004-12-15_JUD_01,2004-12-15_JUD_01_PO +2004-12-15_JUD_01,2004-12-15_JUD_01_PO +2005-12-19_JUD_01,2005-12-19_JUD_01_ME +2008-11-18_JUD_01,2008-11-18_JUD_01_PO +2015-02-03_JUD_01,2015-02-03_JUD_01_ME +2000-06-21_JUD_01,2000-06-21_JUD_01_PO +2007-10-08_JUD_01,2007-10-08_JUD_01_ME +2002-02-14_JUD_01,2002-02-14_JUD_01_ME +2003-02-03_JUD_01,2003-02-03_JUD_01_ME +2005-02-10_JUD_01,2005-02-10_JUD_01_PO +2007-12-13_JUD_01,2007-12-13_JUD_01_PO 
+2011-05-04_JUD_01,2011-05-04_JUD_01_IN +2011-05-04_JUD_02,2011-05-04_JUD_02_IN +2012-11-19_JUD_01,2012-11-19_JUD_01_ME +2005-07-12_JUD_01,2005-07-12_JUD_01_ME +2006-02-03_JUD_01,2006-02-03_JUD_01_PO +2003-12-18_JUD_01,2003-12-18_JUD_01_ME +2004-03-31_JUD_01,2004-03-31_JUD_01_ME +2008-05-23_JUD_01,2008-05-23_JUD_01_ME +2009-02-03_JUD_01,2009-02-03_JUD_01_ME +2009-07-13_JUD_01,2009-07-13_JUD_01_ME +2010-04-20_JUD_01,2010-04-20_JUD_01_ME +2008-06-04_JUD_01,2008-06-04_JUD_01_ME +2014-01-27_JUD_01,2014-01-27_JUD_01_ME +2009-01-19_JUD_01,2009-01-19_JUD_01_ME +2011-04-01_JUD_01,2011-04-01_JUD_01_PO +2011-12-05_JUD_01,2011-12-05_JUD_01_ME +2012-02-03_JUD_01,2012-02-03_JUD_01_ME +2012-07-20_JUD_01,2012-07-20_JUD_01_ME +2014-03-31_JUD_01,2014-03-31_JUD_01_ME +2013-04-16_JUD_01,2013-04-16_JUD_01_ME +2015-12-16_JUD_01,2015-12-16_JUD_01_ME +2018-02-02_JUD_01,2018-02-02_JUD_01_CO +2013-11-11_JUD_01,2013-11-11_JUD_01_ME +2015-12-16_JUD_01,2015-12-16_JUD_01_ME +2015-09-24_JUD_01,2015-09-24_JUD_01_PO +2018-10-01_JUD_01,2018-10-01_JUD_01_ME +2016-03-17_JUD_01,2016-03-17_JUD_01_PO +2016-03-17_JUD_01,2016-03-17_JUD_01_PO +2018-02-02_JUD_01,2018-02-02_JUD_01_ME +2016-10-05_JUD_01,2016-10-05_JUD_01_PO +2016-10-05_JUD_01,2016-10-05_JUD_01_PO +2016-10-05_JUD_01,2016-10-05_JUD_01_PO +2017-02-02_JUD_01,2017-02-02_JUD_01_PO +2018-06-06_JUD_01,2018-06-06_JUD_01_PO +2020-12-11_JUD_01,2020-12-11_JUD_01_ME +2019-02-13_JUD_01,2019-02-13_JUD_01_PO +2018-02-02_JUD_01,2018-02-02_JUD_01_ME +2019-11-08_JUD_01,2019-11-08_JUD_01_PO +2019-07-17_JUD_01,2019-07-17_JUD_01_ME +2020-12-18_JUD_01,2020-12-18_JUD_01_PO +2021-02-04_JUD_01,2021-02-04_JUD_01_PO +2020-07-14_JUD_01,2020-07-14_JUD_01_ME +2020-07-14_JUD_01,2020-07-14_JUD_01_ME +2021-02-03_JUD_01,2021-02-03_JUD_01_PO diff --git a/data/CD-ICJ_Source_UnlabelledFilesHandcoded.csv b/data/CD-ICJ_Source_UnlabelledFilesHandcoded.csv new file mode 100644 index 0000000..e2782b0 --- /dev/null +++ b/data/CD-ICJ_Source_UnlabelledFilesHandcoded.csv @@ -0,0 +1,22 @@ +,old,new +1,/files/case-related/150/18852.pdf,/files/case-related/150/150-20151216-JUD-01-02-EN.pdf +2,/files/case-related/152/18850.pdf,/files/case-related/152/152-20151216-JUD-01-01-EN.pdf +3,/files/case-related/152/18852.pdf,/files/case-related/152/152-20151216-JUD-01-02-EN.pdf +4,/files/case-related/152/18854.pdf,/files/case-related/152/152-20151216-JUD-01-03-EN.pdf +5,/files/case-related/152/18856.pdf,/files/case-related/152/152-20151216-JUD-01-04-EN.pdf +6,/files/case-related/152/18858.pdf,/files/case-related/152/152-20151216-JUD-01-05-EN.pdf +7,/files/case-related/152/18860.pdf,/files/case-related/152/152-20151216-JUD-01-06-EN.pdf +8,/files/case-related/152/18862.pdf,/files/case-related/152/152-20151216-JUD-01-07-EN.pdf +9,/files/case-related/152/18864.pdf,/files/case-related/152/152-20151216-JUD-01-08-EN.pdf +10,/files/case-related/152/18867.pdf,/files/case-related/152/152-20151216-JUD-01-09-FR.pdf +11,/files/case-related/152/18868.pdf,/files/case-related/152/152-20151216-JUD-01-10-EN.pdf +12,/files/case-related/153/18748.pdf,/files/case-related/153/153-20150924-JUD-01-01-EN.pdf +13,/files/case-related/153/18749.pdf,/files/case-related/153/153-20150924-JUD-01-01-FR.pdf +14,/files/case-related/153/18750.pdf,/files/case-related/153/153-20150924-JUD-01-02-EN.pdf +15,/files/case-related/153/18751.pdf,/files/case-related/153/153-20150924-JUD-01-02-FR.pdf +16,/files/case-related/153/18752.pdf,/files/case-related/153/153-20150924-JUD-01-03-EN.pdf 
+17,/files/case-related/153/18753.pdf,/files/case-related/153/153-20150924-JUD-01-03-FR.pdf +18,/files/case-related/153/18754.pdf,/files/case-related/153/153-20150924-JUD-01-04-EN.pdf +19,/files/case-related/153/18755.pdf,/files/case-related/153/153-20150924-JUD-01-04-FR.pdf +20,/files/case-related/156/18638.pdf,/files/case-related/156/156-20150422-ORD-01-01-EN.pdf +21,/files/case-related/156/18640.pdf,/files/case-related/156/156-20150422-ORD-01-02-EN.pdf diff --git a/functions/f.boxplot.body.R b/functions/f.boxplot.body.R new file mode 100644 index 0000000..88a953b --- /dev/null +++ b/functions/f.boxplot.body.R @@ -0,0 +1,18 @@ +#'## f.boxplot.body: Calculate boxplot body for use with logarithmic axes in ggplot2 +#' When plotting a boxplot on a logarithmic scale ggplot2 incorrectly performs the statistical transformation first before calculating the boxplot statistics. While median and quartiles are based on ordinal position the inter-quartile range differs depending on when statistical transformation is performed. +#' +#' This function calculates the boxplot body for use with ggplot2's stat_summary. Solution is based on this SO question: https://stackoverflow.com/questions/38753628/ggplot-boxplot-length-of-whiskers-with-logarithmic-axis + +f.boxplot.body = function(x) { + + body = log10(boxplot.stats(10^x)[["stats"]]) + + names(body) = c("ymin", + "lower", + "middle", + "upper", + "ymax") + + return(body) + +} diff --git a/functions/f.boxplot.outliers.R b/functions/f.boxplot.outliers.R new file mode 100644 index 0000000..0fa6cc8 --- /dev/null +++ b/functions/f.boxplot.outliers.R @@ -0,0 +1,10 @@ +#'## f.boxplot.outliers: Calculate boxplot outliers for use with logarithmic axes in ggplot2 +#' When plotting a boxplot on a logarithmic scale ggplot2 incorrectly performs the statistical transformation first before calculating the boxplot statistics. While median and quartiles are based on ordinal position the inter-quartile range differs depending on when statistical transformation is performed. +#' +#' This function calculates outliers for use with ggplot2's stat_summary. Solution is based on this SO question: https://stackoverflow.com/questions/38753628/ggplot-boxplot-length-of-whiskers-with-logarithmic-axis + +f.boxplot.outliers = function(x) { + + data.frame(y = log10(boxplot.stats(10^x)[["out"]])) + +} diff --git a/functions/f.dopar.multihashes.R b/functions/f.dopar.multihashes.R new file mode 100644 index 0000000..614b306 --- /dev/null +++ b/functions/f.dopar.multihashes.R @@ -0,0 +1,81 @@ +#'# f.dopar.multihashes +#' This function parallelizes computation of both SHA2-256 and SHA3-512 hashes for an arbitrary number of files. It returns a data frame of file names, SHA2-256 hashes and SHA3-512 hashes. + + +#+ +#'## Required Arguments + + +#' @param x A vector of filenames. Should be located in the working directory. + + +#+ +#'## Required: OpenSSL System Library +#' The function requires the existence of the OpenSSL library on the system. This is because the openssl package for R does not provide SHA 3 capabilities yet. 
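+#'
+#' A minimal sketch of the underlying shell calls, assuming an OpenSSL build with SHA-3 support and a hypothetical file "example.pdf" in the working directory:
+
+# system2("openssl", "sha256 example.pdf", stdout = TRUE)
+# system2("openssl", "sha3-512 example.pdf", stdout = TRUE)
+
+# Each command prints a label of the form "NAME(file)= <hex digest>"; the function below strips the label with gsub("^.*\\= ", "", ...) and keeps only the digest.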
+ +#'# Required Packages + +library(doParallel) + +#'# Requires system libraries + +## openssl + + + + + +f.dopar.multihashes <- function(x, + threads = detectCores()){ + + print(paste("Parallel processing using", threads, "threads.")) + + begin <- Sys.time() + + cl <- makeForkCluster(threads) + registerDoParallel(cl) + + multihashes <- foreach(filename = x, + .errorhandling = 'pass', + .combine = 'rbind') %dopar% { + + sha2.256 <- system2("openssl", + paste("sha256", + filename), + stdout = TRUE) + + sha2.256 <- gsub("^.*\\= ", + "", + sha2.256) + + sha3.512 <- system2("openssl", + paste("sha3-512", + filename), + stdout = TRUE) + + sha3.512 <- gsub("^.*\\= ", + "", + sha3.512) + + out <- data.frame(filename, + sha2.256, + sha3.512) + return(out) + } + stopCluster(cl) + + end <- Sys.time() + duration <- end - begin + + print(paste0("Processed ", + length(x), + " files. Runtime was ", + round(duration, + digits = 2), + " ", + attributes(duration)$units, + ".")) + + return(multihashes) + +} diff --git a/functions/f.dopar.pagenums.R b/functions/f.dopar.pagenums.R new file mode 100644 index 0000000..5243d4b --- /dev/null +++ b/functions/f.dopar.pagenums.R @@ -0,0 +1,46 @@ +#'# f.dopar.pagenums: Parallelized Computation of the length (in pages) of PDF files +#' This function computes the maximum number of pages for each PDF file. Ideally used with sum() to get the total number of pages of all PDF files in a folder. + + +#+ +#'## Required Arguments + + +#' @param x A vector of PDF filenames. Should be located in the working directory. + + +#'## Required Packages + +library(doParallel) +library(pdftools) + +#'## Function + + +f.dopar.pagenums <- function(x, + sum = FALSE, + threads = detectCores()){ + + print(paste("Parallel processing using", threads, "threads.")) + + cl <- makeForkCluster(threads) + registerDoParallel(cl) + + pagenums <- foreach(filename = x, + .combine = 'c', + .errorhandling = 'remove', + .inorder = TRUE) %dopar% { + pdf_length(filename) + } + stopCluster(cl) + + if (sum == TRUE){ + sum.out <- sum(pagenums) + print(paste("Total number of pages:", sum.out)) + return(sum.out) + }else{ + return(pagenums) + } + +} + diff --git a/functions/f.dopar.pdfextract.R b/functions/f.dopar.pdfextract.R new file mode 100644 index 0000000..895a076 --- /dev/null +++ b/functions/f.dopar.pdfextract.R @@ -0,0 +1,68 @@ +#'# f.dopar.pdfextract: Parallelized Extraction of text from PDF files +#' This function parallelizes the extraction of text from each PDF file and saves the results as TXT files. Only the file extension is modified. + + +#+ +#'## Required Arguments + + +#' @param x A vector of PDF filenames. Should be located in the working directory. + + + + +#'## Required Packages + +library(doParallel) +library(pdftools) + +#'## Function + +f.dopar.pdfextract <- function(x, + threads = detectCores()){ + + begin.extract <- Sys.time() + + print(paste("Parallel processing using", threads, "threads. Begin at", begin.extract)) + + + cl <- makeForkCluster(threads) + registerDoParallel(cl) + + newnames <- gsub("\\.pdf", + "\\.txt", + x) + + result <- foreach(i = seq_along(x), + .errorhandling = 'pass') %dopar% { + + ## Extract text layer from PDF + pdf.extracted <- pdf_text(x[i]) + + ## Write TXT to Disk + write.table(pdf.extracted, + newnames[i], + quote = FALSE, + row.names = FALSE, + col.names = FALSE) + } + stopCluster(cl) + + end.extract <- Sys.time() + duration.extract <- end.extract - begin.extract + + print(paste0("Processed ", + length(result), + " files. 
Runtime was ", + round(duration.extract, + digits = 2), + " ", + attributes(duration.extract)$units, + ". Ended at ", + end.extract, ".")) + + return(result) + +} + + diff --git a/functions/f.dopar.pdfocr.R b/functions/f.dopar.pdfocr.R new file mode 100644 index 0000000..f54cdc4 --- /dev/null +++ b/functions/f.dopar.pdfocr.R @@ -0,0 +1,86 @@ +#'# f.dopar.pdfocr: Parallelized Extraction of text from PDF files +#' This function extracts the text from scanned PDF files to separate TXT files and further creates an enhanced PDF version with new OCR text grafted to the scan. It runs in nested parallelization, with tesseract calling up to 3 or 4 threads to process a single PDF file and the number of jobs determines how many PDF files are processed in parallel. Very, very CPU intensive. Will only work on Linux. + + +#+ +#'## Required Arguments + + +#' @param x A vector of PDF filenames. Should be located in the working directory. +#' @param dpi The resolution at which PDF files should be converted. Defaults to 300. +#' @param lang The languages which should be expected during the OCR step, as string. Passed directly to tesseract. Default is "eng" for English. Multiple languages possible, e.g. "eng+fra" for English and French. Order of language matters. +#' @param output The output which should be generated, as string. Passed directly to tesseract. Default is "pdf txt" for PDF and TXT output. +#' @param jobs The number of jobs which should be run in parallel. Tesseract calls up to 4 threads by itself, so it should be somewhere around the full number of cores divided by 4. This is also the default. + + + +#'## Required Packages + +library(doParallel) + +#'## Required System Libraries + +## tesseract +## imagemagick + + +f.dopar.pdfocr <- function(x, + dpi = 300, + lang = "eng", + output = "pdf txt", + jobs = round(detectCores() / 4)){ + + begin.ocr <- Sys.time() + + print(paste("Parallel processing running", jobs, "jobs. Begin at", begin.ocr)) + + cl <- makeForkCluster(jobs) + registerDoParallel(cl) + + + result <- foreach(file = x, + .combine = 'c') %dopar% { + + name.tiff <- gsub("\\.pdf", + "\\.tiff", + file) + + name.out <- gsub("\\.pdf", + "_TESSERACT", + file) + + system2("convert", + paste("-density", + dpi, + "-depth 8 -compress LZW -strip -background white -alpha off", + file, + name.tiff)) + + system2("tesseract", + paste(name.tiff, + name.out, + "-l", + lang, + output)) + + unlink(name.tiff) + } + + stopCluster(cl) + + end.ocr <- Sys.time() + duration.ocr <- end.ocr - begin.ocr + + print(paste0("Processed ", + length(result), + " files. Runtime was ", + round(duration.ocr, + digits = 2), + " ", + attributes(duration.ocr)$units, + ". Ended at ", + end.ocr, ".")) + + return(result) + +} diff --git a/functions/f.fast.freqtable.R b/functions/f.fast.freqtable.R new file mode 100644 index 0000000..3442ab6 --- /dev/null +++ b/functions/f.fast.freqtable.R @@ -0,0 +1,121 @@ + + +#'# f.fast.freqtable: Fast Frequency Tables +#' This function create frequency tables for an arbitrary number of variables. It can return them as a list, write them to an arbitrary folder on disk as CSV files (with an optional prefix and return kable tables that are designed to work well with render() and LaTeX. It is based on data.table and is therefore capable of processing massive data ssets. To show the kable output in render() you must add the Chunk Option "results = 'asis'" when calling the function. +#' + + + +#+ +#'## Required Arguments + + +#' @param x A data.table. 
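+#' Must contain every variable named in varlist.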
+ + +#+ +#'## Optional arguments + + +#' @param varlist Character. An optional character vector of variable names to construct tables for. Defaults to all variables. +#' @param sumrow Logical. Whether to add a summary row. +#' @param output.list Logical. Whether to output the frequency tables as a list. Defaults to TRUE. Returns NULL otherwise. +#' @param output.kable Logical. Whether to return kable tables. Defaults to FALSE. +#' @param output.csv Logical. Whether to write CSV files to disk. Defaults to FALSE. +#' @param outputdir Character. The target directory for writing CSV files. Defaults to the current R working directory. +#' @param prefix A string to be added to each CSV file. Default is not to add a string and just to output the variable name as the name of the CSV file. +#' @param align Alignment of table columns. Passed to kable. Default is "r". Modifications must take into account five columns. + + + +#'## Required Packages + +library(data.table) +library(knitr) +library(kableExtra) + + +#'## Function + +f.fast.freqtable <- function(x, + varlist = names(x), + sumrow = TRUE, + output.list = TRUE, + output.kable = FALSE, + output.csv = FALSE, + outputdir = "./", + prefix = "", + align = "r"){ + + ## Begin List + freqtable.list <- vector("list", length(varlist)) + + ## Calculate Frequency Table + for (i in seq_along(varlist)){ + + varname <- varlist[i] + + freqtable <- x[, .N, keyby=c(paste0(varname))] + + freqtable[, c("exactpercent", + "roundedpercent", + "cumulpercent") := { + exactpercent <- N/sum(N)*100 + roundedpercent <- round(exactpercent, 2) + cumulpercent <- round(cumsum(exactpercent), 2) + list(exactpercent, + roundedpercent, + cumulpercent)}] + + ## Calculate Summary Row + if (sumrow == TRUE){ + colsums <- cbind("Total", + freqtable[, lapply(.SD, function(x){round(sum(x))}), + .SDcols = c("N", + "exactpercent", + "roundedpercent") + ], round(max(freqtable$cumulpercent))) + + colnames(colsums)[c(1,5)] <- c(varname, "cumulpercent") + freqtable <- rbind(freqtable, colsums) + } + + ## Add Frequency Table to List + freqtable.list[[i]] <- freqtable + + ## Write CSV + if (output.csv == TRUE){ + + fwrite(freqtable, + paste0(outputdir, + prefix, + varname, + ".csv"), + na = "NA") + + } + + ## Output Kable + if (output.kable == TRUE){ + + cat("\n------------------------------------------------\n") + cat(paste0("Frequency Table for Variable: ", varname, "\n")) + cat("------------------------------------------------\n") + cat(paste0("\n ", + x[, .N, keyby=c(paste0(varname))][,.N], + " unique value(s) detected.\n\n")) + + + print(kable(freqtable, + format = "latex", + align = align, + booktabs = TRUE, + longtable = TRUE) %>% kable_styling(latex_options = "repeat_header")) + } + } + + ## Return List of Frequency Tables + if (output.list == TRUE){ + return(freqtable.list) + } +} diff --git a/functions/f.hyphen.remove.R b/functions/f.hyphen.remove.R new file mode 100644 index 0000000..320b351 --- /dev/null +++ b/functions/f.hyphen.remove.R @@ -0,0 +1,34 @@ +#'## f.hyphen.remove: Remove Hyphenation across Linebreaks +#' Hyphenation spanning linebreaks is a serious issue for longer texts. Hyphenated words are often not recognized as a single token by standard tokenization. The result is two mostly non-expressive and unique tokens instead of a single and expressive token. The function removes linebreaking hyphenations. 
It does not attempt to cover hyphenation spanning pagebreaks, as there is often confounding header/footer/footnote text in extracted text from PDFs which needs to be uniquely processed for specific corpora. +#' +#' The first REGEX matches regular hyphenation of words. The second REGEX matches compounds (e.g. SARS-CoV-2) broken across lines. + + +#'@param text A character vector of text. + + +# test <- "Ham-\nburg Mei-\n nungsäußerung SARS-CoV-\n2 hat- 2\nte Unsterb- 6\nliche hat- \n 2 te, Unsterb- \n 6 liche" + + + +f.hyphen.remove <- function(text){ + ## Examples: Ham-\nburg, Mei-\n nungsäußerung + text.out <- gsub("([a-zöäüß])-[[:blank:]]*\n[[:blank:]]*([a-zöäüß])", + "\\1\\2", + text) + ## Examples: SARS-CoV-\n2 + text.out <- gsub("([a-zA-ZöäüÖÄÜß])-[[:blank:]]*\n[[:blank:]]*([A-Z0-9ÖÄÜß])", + "\\1-\\2", + text.out) + ## Example: hat- 2\nte, Unsterb- 6\nliche + text.out <- gsub("([a-zöäüß])-[[:blank:]]*[0-9]+[[:blank:]]*\n[[:blank:]]*([a-zöäüß])", + "\\1\\2", + text.out) + + ## Example: hat- \n 2 te, Unsterb- \n 6 liche + text.out <- gsub("([a-zöäüß])-[[:space:]]*[0-9]+[[:blank:]]*([a-zöäüß])", + "\\1\\2", + text.out) + + return(text.out) +} diff --git a/functions/f.lingsummarize.iterator.R b/functions/f.lingsummarize.iterator.R new file mode 100644 index 0000000..20e1ca5 --- /dev/null +++ b/functions/f.lingsummarize.iterator.R @@ -0,0 +1,116 @@ +#'# f.lingsummarize.iterator +#' Iterated parallel computation of characters, tokens, types and sentences for each document of a given data table. Documents must contain text in a "text" variable and document names in a "doc_id" variable. +#' +#' During computation documents are ordered by number of characters (descending) to ensure that long documents are computed first. For corpora with a skewed document length distribution this is significantly faster. The variables "nchars" is also added to the original object. + + +library(quanteda) +library(doParallel) + + + + +f.lingsummarize.iterator <- function(dt, + threads = detectCores(), + chunksize = 1){ + + + begin.dopar <- Sys.time() + + dt <- dt[,.(doc_id, text)] + + nchars <- dt[, lapply(.(text), nchar)] + + print(paste0("Parallel processing using ", + threads, + " threads. Begin at ", + begin.dopar, + ". 
Processing ", + dt[,.N], + " documents with a total length of ", + sum(nchars), + " characters.")) + + + ord <- order(-nchars) + dt <- dt[ord] + + cl <- makeForkCluster(threads) + registerDoParallel(cl) + + + itx <- iter(dt["nchars" > 0], + by = "row", + chunksize = chunksize) + + result.list <- foreach(i = itx, + .errorhandling = 'pass') %dopar% { + + corpus <- corpus(i) + + tokens <- tokens(corpus, + what = "word", + remove_punct = FALSE, + remove_symbols = FALSE, + remove_numbers = FALSE, + remove_url = FALSE, + remove_separators = TRUE, + split_hyphens = FALSE, + include_docvars = FALSE, + padding = FALSE + ) + + ntokens <- unname(ntoken(tokens)) + ntypes <- unname(ntype(tokens)) + nsentences <- unname(nsentence(corpus)) + + temp <- data.table(ntokens, + ntypes, + nsentences) + + return(temp) + } + + stopCluster(cl) + + + end.dopar <- Sys.time() + duration.dopar <- end.dopar - begin.dopar + + result.dt <- rbindlist(result.list) + + summary.corpus <- cbind(nchars[ord], + result.dt) + + setnames(summary.corpus, + "V1", + "nchars") + + + if(dt["nchars" == 0, .N] > 0){ + + dt.charnull <- dt["nchars" == 0] + dt.charnull$text <- NULL + dt.charnull$ntokens <- rep(0, dt.charnull[,.N]) + dt.charnull$ntypes <- rep(0, dt.charnull[,.N]) + dt.charnull$nsentences <- rep(0, dt.charnull[,.N]) + + summary.corpus <- rbind(summary.corpus, + dt.charnull) + } + + + summary.corpus <- summary.corpus[order(ord)] + + + print(paste0("Runtime was ", + round(duration.dopar, + digits = 2), + " ", + attributes(duration.dopar)$units, + ". Ended at ", + end.dopar, ".")) + + return(summary.corpus) + +} diff --git a/functions/f.linkextract.R b/functions/f.linkextract.R new file mode 100644 index 0000000..63a830f --- /dev/null +++ b/functions/f.linkextract.R @@ -0,0 +1,18 @@ +#+ +#'## f.linkextract: Extract Links from HTML +#' This function extracts all links (i.e. href attributes of tags) from an arbitrary HTML document. Returns "NA" if there is an error. +#' + +#' @param URL A valid URL. + +#library(rvest) + +f.linkextract <- function(URL){ + tryCatch({ + read_html(URL) %>% + html_nodes("a")%>% + html_attr('href')}, + error = function(cond) { + return(NA)} + ) +} diff --git a/functions/f.selectpdflinks.R b/functions/f.selectpdflinks.R new file mode 100644 index 0000000..3009cb9 --- /dev/null +++ b/functions/f.selectpdflinks.R @@ -0,0 +1,19 @@ +#'# Select PDF Links + +#' This function extracts from a general set of links from the ICJ website only those links which indicate monolingual case-related documents (by excluding bilingual documents). +#' +#' It is specific to the ICJ website and will not generalize without modification. + + +f.selectpdflinks <- function(links){ + temp <- grep ("case-related", + links, + ignore.case = TRUE, + value = TRUE) + out <- grep ("BI.pdf", + temp, + ignore.case = TRUE, + invert = TRUE, + value = TRUE) + return(out) +} diff --git a/functions/f.special.replace.R b/functions/f.special.replace.R new file mode 100644 index 0000000..6ebb82f --- /dev/null +++ b/functions/f.special.replace.R @@ -0,0 +1,19 @@ +#'# Replace Special Characters +#' This function replaces special characters with their closest equivalents in the Latin alphabet. These characters usually occur due to OCR mistakes. 
+ +f.special.replace <- function(text){ + text.out <- gsub("ff", + "ff", + text) + + text.out <- gsub("fi", + "fi", + text.out) + + + text.out <- gsub("fl", + "fl", + text.out) + + return(text.out) +} diff --git a/functions/f.token.processor.R b/functions/f.token.processor.R new file mode 100644 index 0000000..a50854c --- /dev/null +++ b/functions/f.token.processor.R @@ -0,0 +1,17 @@ +#'# Process Corpus to Tokens +#' This function tokenizes a corpus, removes irrelevant characters, converts to lowercase and removes common stopwords for both English and French. It is intended to simulate a generic and widespread pre-processing workflow in natural language processing. + + +f.token.processor <- function(corpus){ + tokens <- tokens(corpus, + remove_numbers = TRUE, + remove_punct = TRUE, + remove_symbols = TRUE, + remove_separators = TRUE) + tokens <- tokens_tolower(tokens) + tokens <- tokens_remove(tokens, + pattern = c(stopwords("english"), + stopwords("french"))) + return(tokens) + } + diff --git a/tex/CD-ICJ_Source_TEX_Author.tex b/tex/CD-ICJ_Source_TEX_Author.tex new file mode 100644 index 0000000..9eec466 --- /dev/null +++ b/tex/CD-ICJ_Source_TEX_Author.tex @@ -0,0 +1,5 @@ +%=========================== +% Author +%=========================== + +\newcommand{\dataauthor}{Seán Fobbe} \ No newline at end of file diff --git a/tex/CD-ICJ_Source_TEX_CodebookTitle.tex b/tex/CD-ICJ_Source_TEX_CodebookTitle.tex new file mode 100644 index 0000000..7a6c6b9 --- /dev/null +++ b/tex/CD-ICJ_Source_TEX_CodebookTitle.tex @@ -0,0 +1,90 @@ +%=========================== +% Title Page +%=========================== + + +\pagestyle{empty} + +\begin{tikzpicture}[overlay,remember picture] + +\node[align=center] at ($(current page.center)+(0,8.5)$) +{ \scshape \Huge \bfseries Corpus of Decisions \\[2cm] \Huge \bfseries \scshape International Court of Justice \\[1cm] \LARGE \bfseries (\datashort)}; + +\draw[thick] (5.5,-2) -- (9.5,-2); + +\node[align=center] at ($(current page.center)+(0,2)$) +{ \scshape \LARGE Codebook}; + +\node[align=center] at ($(current page.center)+(0,-8)$) +{ \large Version \version}; + +\node[align=center] at ($(current page.center)+(0,-10)$) + {\includegraphics[width=.20\textwidth]{./buttons/cc-zero.png}}; + +\node[align=center] at ($(current page.center)+(0,-12)$) +{ \large DOI: \dataversiondoi}; + +\end{tikzpicture} + +\newpage + +\pagestyle{plain} + + + + + +%=========================== +% Inside Cover +%=========================== + +\newpage +\ra{1.5} + + +\begin{centering} +\begin{longtable}{p{2.5cm}p{12.5cm}} + +\textbf{Title} & \datatitle \\ +\textbf{Abbreviation} & \datashort \\ +\textbf{Author} & \dataauthor\\ +\textbf{Version} & \version \\ +\textbf{Download} & \url{\dataversionurldoi}\\ +\textbf{License} & CC0 1.0 Universal\\ + +\end{longtable} +\end{centering} + + + +\textbf{Citation} + +\emph{\dataauthor} (\the\year ). \datatitle\ (\datashort ). Version \version . Zenodo. DOI: \dataversiondoi . + +\vspace{0.5cm} + +\textbf{Digital Object Identifiers: Concept DOI and Version DOI} + +This data set is uniquely identified via the Digital Object Identifier (DOI) system. DOIs are persistent identifiers that are globally unique and can be resolved as a link by entering a DOI into the web service at \url{www.doi.org}. The DOI given in this document is a \enquote{Version DOI}, which uniquely identifies version \version. Academics and others who wish to enable replication analyses are strongly advised to cite the \emph{version DOI} and the precise version of the data used. 
A \enquote{Concept DOI} is available from the page of the Zenodo record under the heading \enquote{Cite all versions?} and will always resolve to the latest version. + +\vspace{0.5cm} + + + +\textbf{Public Domain Status} + +The full data set and this document are distributed under a \textbf{Creative Commons CC0 1.0 Universal (CC0 1.0) Public Domain Dedication} license. The person who associated a work with this deed has dedicated the work to the public domain by waiving all of his or her rights to the work worldwide under copyright law, including all related and neighboring rights, to the extent allowed by law. + +You can copy, modify, distribute and perform the work, even for commercial purposes, all without asking permission. In no way are the patent or trademark rights of any person affected by CC0, nor are the rights that other persons may have in the work or in how the work is used, such as publicity or privacy rights. Unless expressly stated otherwise, the person who associated a work with this deed makes no warranties about the work, and disclaims liability for all uses of the work, to the fullest extent permitted by applicable law. + +Please see <\url{https://creativecommons.org/publicdomain/zero/1.0/legalcode}> for the full terms of the license. + + +\vspace{0.5cm} + +\textbf{Disclaimer} + +This data set is a personal academic initiative and is not associated with or endorsed by the International Court of Justice or the United Nations. + + +\newpage \ No newline at end of file diff --git a/tex/CD-ICJ_Source_TEX_CompilationTitle.tex b/tex/CD-ICJ_Source_TEX_CompilationTitle.tex new file mode 100644 index 0000000..443263f --- /dev/null +++ b/tex/CD-ICJ_Source_TEX_CompilationTitle.tex @@ -0,0 +1,90 @@ + + +%=========================== +% Title Page +%=========================== + + +\pagestyle{empty} + +\begin{tikzpicture}[overlay,remember picture] + +\node[align=center] at ($(current page.center)+(0,8.5)$) +{ \scshape \Huge \bfseries Corpus of Decisions \\[2cm] \Huge \bfseries \scshape International Court of Justice \\[1cm] \LARGE \bfseries (\softwareshort)}; + +\draw[thick] (5.5,-2) -- (9.5,-2); + +\node[align=center] at ($(current page.center)+(0,2)$) +{ \scshape \LARGE Compilation Report}; + +\node[align=center] at ($(current page.center)+(0,-8)$) +{ \large Version \version}; + +\node[align=center] at ($(current page.center)+(0,-10)$) + {\includegraphics[width=0.18\linewidth]{./buttons/MIT0-blue.pdf}}; + +\node[align=center] at ($(current page.center)+(0,-12)$) +{ \large DOI: \softwareversiondoi}; + +\end{tikzpicture} + +\newpage + +\pagestyle{plain} + + + +%=========================== +% Inside Cover +%=========================== + +\newpage +\ra{1.5} + + +\begin{centering} +\begin{longtable}{p{2.5cm}p{12.5cm}} + +\textbf{Title} & \softwaretitle \\ +\textbf{Abbreviation} & \softwareshort \\ +\textbf{Author} & \dataauthor \\ +\textbf{Version} & \version \\ +\textbf{Download} & \url{\softwareversionurldoi}\\ +\textbf{License} & MIT No Attribution (MIT-0)\\ + +\end{longtable} +\end{centering} + + + +\textbf{Citation} + +\emph{\dataauthor} (\the\year ). \softwaretitle\ (\softwareshort ). Version \version . Zenodo. DOI: \softwareversiondoi . + +\vspace{0.5cm} + +\textbf{Digital Object Identifiers: Concept DOI and Version DOI} + +This data set is uniquely identified via the Digital Object Identifier (DOI) system. DOIs are persistent identifiers that are globally unique and can be resolved as a link by entering a DOI into the web service at \url{www.doi.org}. 
The DOI given in this document is a \emph{Version DOI}, which uniquely identifies version \version. Analysts who wish to enable replication analyses are strongly advised to cite the \emph{Version DOI} and the exact version of the data used. A \emph{Concept DOI} is available from the page of the Zenodo record under the heading \enquote{Cite all versions?} and will always resolve to the latest version. + +\vspace{0.5cm} + + + +\textbf{License: MIT No Attribution (MIT-0)} + +Copyright --- \the\year --- \dataauthor + +Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the \enquote{Software}), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so. + + +THE SOFTWARE IS PROVIDED \enquote{AS IS}, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +\vspace{0.5cm} + +\textbf{Disclaimer} + +This data set is a personal academic initiative and is not associated with or endorsed by the International Court of Justice or the United Nations. + + +\newpage \ No newline at end of file diff --git a/tex/CD-ICJ_Source_TEX_Preamble_EN.tex b/tex/CD-ICJ_Source_TEX_Preamble_EN.tex new file mode 100644 index 0000000..b7192dd --- /dev/null +++ b/tex/CD-ICJ_Source_TEX_Preamble_EN.tex @@ -0,0 +1,115 @@ +%================================= +% Packages +%================================= + +\usepackage[utf8]{inputenc} +\usepackage[T1]{fontenc} +\usepackage{xcolor} +\usepackage{graphicx} +%\graphicspath{ {./analysis/} } +\usepackage{rotating} +\usepackage{tikz} +\usetikzlibrary{calc} +\usepackage{booktabs} +\newcommand{\ra}[1]{\renewcommand{\arraystretch}{#1}} +\usepackage{longtable} +\setlength{\LTcapwidth}{6in} +\usepackage{array} +\newcolumntype{P}[1]{>{\centering\arraybackslash}p{#1}} + +\usepackage{titlesec} +\newcommand{\sectionbreak}{\clearpage} +\titlespacing\section{0pt}{22pt plus 4pt minus 2pt}{12pt plus 4pt minus 2pt} +\titlespacing\subsection{0pt}{18pt plus 4pt minus 2pt}{10pt plus 4pt minus 2pt} + +\usepackage[style=british]{csquotes} +\usepackage[indentfirst=false]{quoting} +\usepackage[hang]{footmisc} +\footnotemargin 1.5em + + +\usepackage{datetime2} + + + + + + +%================================= +% Define Colors +%================================= + +% Palette from http://jfly.iam.u-tokyo.ac.jp/color/ +\definecolor{bluishgreen}{RGB}{0, 158, 115} +\definecolor{blue}{RGB}{0, 115, 178} +\definecolor{vermillion}{RGB}{213, 94, 0} +\definecolor{orange}{RGB}{230, 159, 0} +\definecolor{reddishpurple}{RGB}{204, 121, 167} +\definecolor{darkred}{rgb}{0.55, 0.0, 0.0} + +% Rmarkdown Tango Palette +\definecolor{tangogreen}{HTML}{4e9a06} +\definecolor{tangobrown}{HTML}{8f5902} +\definecolor{tangoblue}{HTML}{204a87} +\definecolor{gray1}{RGB}{250,250,250} +\definecolor{gray2}{RGB}{200,200,200} + + +%================================= +% Listings-Options +%================================= + + +\lstset{ % + language=R, + backgroundcolor=\color{gray1}, + basicstyle=\ttfamily\small, + 
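+  % basicstyle: typewriter font at \small size for every listing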
breakatwhitespace=false, + breaklines=true, + commentstyle=\color{tangobrown}, + deletekeywords={.df, csv}, + extendedchars=true, % lets you use non-ASCII characters; for 8-bits encodings only, does not work with UTF-8 + frame=single, % adds a frame around the code + keepspaces=true, % keeps spaces in text, useful for keeping indentation of code (possibly needs columns=flexible) + keywordstyle=\color{tangoblue}\bfseries, + otherkeywords={*,tokens_remove}, + numbers=none, + rulecolor=\color{black}, + showspaces=false, + showstringspaces=false, + showtabs=false, + stringstyle=\color{tangogreen}, + tabsize=2, + title=\lstname, % show the filename of files included with \lstinputlisting; also try caption instead of title + columns=fullflexible, + literate= + {á}{{\'a}}1 {é}{{\'e}}1 {í}{{\'i}}1 {ó}{{\'o}}1 {ú}{{\'u}}1 + {Á}{{\'A}}1 {É}{{\'E}}1 {Í}{{\'I}}1 {Ó}{{\'O}}1 {Ú}{{\'U}}1 + {à}{{\`a}}1 {è}{{\`e}}1 {ì}{{\`i}}1 {ò}{{\`o}}1 {ù}{{\`u}}1 + {À}{{\`A}}1 {È}{{\'E}}1 {Ì}{{\`I}}1 {Ò}{{\`O}}1 {Ù}{{\`U}}1 + {ä}{{\"a}}1 {ë}{{\"e}}1 {ï}{{\"i}}1 {ö}{{\"o}}1 {ü}{{\"u}}1 + {Ä}{{\"A}}1 {Ë}{{\"E}}1 {Ï}{{\"I}}1 {Ö}{{\"O}}1 {Ü}{{\"U}}1 {β}{{$\beta$}}1 + {â}{{\^a}}1 {ê}{{\^e}}1 {î}{{\^i}}1 {ô}{{\^o}}1 {û}{{\^u}}1 {α}{{$\alpha$}}1 + {Â}{{\^A}}1 {Ê}{{\^E}}1 {Î}{{\^I}}1 {Ô}{{\^O}}1 {Û}{{\^U}}1 {´}{{'}}1 + {Ã}{{\~A}}1 {ã}{{\~a}}1 {Õ}{{\~O}}1 {õ}{{\~o}}1 {ff}{{ff}}1 {fi}{{fi}}1 {fl}{{fl}}1 + {œ}{{\oe}}1 {Œ}{{\OE}}1 {æ}{{\ae}}1 {Æ}{{\AE}}1 {ß}{{\ss}}1 + {ű}{{\H{u}}}1 {Ű}{{\H{U}}}1 {ő}{{\H{o}}}1 {Ő}{{\H{O}}}1 + {ç}{{\c c}}1 {Ç}{{\c C}}1 {ø}{{\o}}1 {å}{{\r a}}1 {Å}{{\r A}}1 + {€}{{\euro}}1 {£}{{\pounds}}1 {«}{{\guillemotleft}}1 + {»}{{\guillemotright}}1 {ñ}{{\~n}}1 {Ñ}{{\~N}}1 {¿}{{?`}}1 {nº}{{n}}1 {©}{{(c)}}1 +} + + +%====================================== +% Shortcuts +%====================================== + +\newcommand{\pcij}{Permanent Court of International Justice} +\newcommand{\icj}{International Court of Justice} + + +%====================================== +% Deactivate Automatic Title Generation +%====================================== + +\AtBeginDocument{\let\maketitle\relax}
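+
+% Usage sketch for the shortcuts defined above (hypothetical body text, not part of this preamble):
+%   ... the \icj\ is the institutional successor to the \pcij\ ...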