From b001e91e5af599e9ef8d4949f81691af844d9471 Mon Sep 17 00:00:00 2001 From: sbilge Date: Tue, 29 Aug 2023 13:06:26 +0000 Subject: [PATCH 01/15] docs from schema --- .sheet_documentation_template.md | 15 ++ .workbook_config.yaml | 79 +++++++++ docs/metadata/worksheets/Analysis.md | 43 +++++ docs/metadata/worksheets/AnalysisProcess.md | 37 ++++ .../worksheets/AnalysisProcessOutputFile.md | 61 +++++++ docs/metadata/worksheets/Biospecimen.md | 67 ++++++++ docs/metadata/worksheets/Condition.md | 61 +++++++ .../worksheets/DataAccessCommittee.md | 25 +++ docs/metadata/worksheets/DataAccessPolicy.md | 55 ++++++ docs/metadata/worksheets/Dataset.md | 37 ++++ docs/metadata/worksheets/Individual.md | 43 +++++ .../worksheets/LibraryPreparationProtocol.md | 91 ++++++++++ docs/metadata/worksheets/Publication.md | 61 +++++++ docs/metadata/worksheets/Sample.md | 67 ++++++++ docs/metadata/worksheets/SampleFile.md | 61 +++++++ .../worksheets/SequencingExperiment.md | 49 ++++++ docs/metadata/worksheets/SequencingProcess.md | 79 +++++++++ .../worksheets/SequencingProcessFile.md | 61 +++++++ .../metadata/worksheets/SequencingProtocol.md | 109 ++++++++++++ docs/metadata/worksheets/Study.md | 43 +++++ docs/metadata/worksheets/StudyFile.md | 61 +++++++ docs/metadata/worksheets/Trio.md | 31 ++++ requirements.txt | 3 + scripts/script_utils/fastapi_app_location.py | 23 --- scripts/update_metadata_docs.py | 158 ++++++++++++++++++ 25 files changed, 1397 insertions(+), 23 deletions(-) create mode 100644 .sheet_documentation_template.md create mode 100644 .workbook_config.yaml create mode 100644 docs/metadata/worksheets/Analysis.md create mode 100644 docs/metadata/worksheets/AnalysisProcess.md create mode 100644 docs/metadata/worksheets/AnalysisProcessOutputFile.md create mode 100644 docs/metadata/worksheets/Biospecimen.md create mode 100644 docs/metadata/worksheets/Condition.md create mode 100644 docs/metadata/worksheets/DataAccessCommittee.md create mode 100644 
docs/metadata/worksheets/DataAccessPolicy.md create mode 100644 docs/metadata/worksheets/Dataset.md create mode 100644 docs/metadata/worksheets/Individual.md create mode 100644 docs/metadata/worksheets/LibraryPreparationProtocol.md create mode 100644 docs/metadata/worksheets/Publication.md create mode 100644 docs/metadata/worksheets/Sample.md create mode 100644 docs/metadata/worksheets/SampleFile.md create mode 100644 docs/metadata/worksheets/SequencingExperiment.md create mode 100644 docs/metadata/worksheets/SequencingProcess.md create mode 100644 docs/metadata/worksheets/SequencingProcessFile.md create mode 100644 docs/metadata/worksheets/SequencingProtocol.md create mode 100644 docs/metadata/worksheets/Study.md create mode 100644 docs/metadata/worksheets/StudyFile.md create mode 100644 docs/metadata/worksheets/Trio.md delete mode 100644 scripts/script_utils/fastapi_app_location.py create mode 100644 scripts/update_metadata_docs.py diff --git a/.sheet_documentation_template.md b/.sheet_documentation_template.md new file mode 100644 index 0000000..bfeed90 --- /dev/null +++ b/.sheet_documentation_template.md @@ -0,0 +1,15 @@ +# {{name}} + +## Description + +{{ description }} + +## Fields + +{% for slot in slots %} +**{{ slot.name }}** : {{ slot.description }} +**alias** : {{ slot.alias }} +**data type** : {{ slot.data_type }} +**required** : {{ slot.required }} + +{% endfor %} diff --git a/.workbook_config.yaml b/.workbook_config.yaml new file mode 100644 index 0000000..5d4ea3e --- /dev/null +++ b/.workbook_config.yaml @@ -0,0 +1,79 @@ +workbooks: + - file_name: ghga_submission_full.xlsx + worksheets: + - Study + - StudyFile + - Sample + - SampleFile + - Condition + - Biospecimen + - Individual + - Trio + - LibraryPreparationProtocol + - SequencingProtocol + - SequencingExperiment + - SequencingProcess + - SequencingProcessFile + - Analysis + - AnalysisProcess + - AnalysisProcessOutputFile + - Dataset + - DataAccessPolicy + - DataAccessCommittee + - Publication + + 
- file_name: ghga_submission_minimal.xlsx + worksheets: + - Study + - StudyFile + - Dataset + - DataAccessPolicy + - DataAccessCommittee + - Publication + + - file_name: ghga_submission_sample.xlsx + worksheets: + - Study + - StudyFile + - Sample + - SampleFile + - Condition + - Dataset + - DataAccessPolicy + - DataAccessCommittee + - Publication + + - file_name: ghga_submission_individual.xlsx + worksheets: + - Study + - StudyFile + - Sample + - SampleFile + - Condition + - Biospecimen + - Individual + - Trio + - Dataset + - DataAccessPolicy + - DataAccessCommittee + - Publication + + - file_name: ghga_submission_seq.xlsx + worksheets: + - Study + - StudyFile + - Sample + - SampleFile + - Condition + - Biospecimen + - Individual + - Trio + - LibraryPreparationProtocol + - SequencingProtocol + - SequencingExperiment + - SequencingProcess + - SequencingProcessFile + - Dataset + - DataAccessPolicy + - DataAccessCommittee + - Publication diff --git a/docs/metadata/worksheets/Analysis.md b/docs/metadata/worksheets/Analysis.md new file mode 100644 index 0000000..f7ac8a7 --- /dev/null +++ b/docs/metadata/worksheets/Analysis.md @@ -0,0 +1,43 @@ +# Analysis + +## Description + +An Analysis is a data transformation that transforms input data to output data. The workflow used to achieve this transformation and the individual steps are also captured. + +## Fields + + +**title** : The title that describes an entity. +**alias** : title +**data type** : string +**required** : None + + +**description** : Describing how an Analysis was carried out. (e.g.: computational tools, settings, etc.). +**alias** : description +**data type** : string +**required** : False + + +**type** : The type of the Analysis. Either Reference Alignment (BAM) or Sequence Variation (VCF) +**alias** : type +**data type** : string +**required** : False + + +**reference_genome** : A published genetic sequence that is used as a reference sequence against which other sequences are compared. 
Reference genome(s) or annotation(s) used for prior analyses (eg: GRCh38.p13). +**alias** : reference_genome +**data type** : string +**required** : True + + +**reference_chromosome** : The reference chromosome used for this Analysis. +**alias** : reference_chromosome +**data type** : string +**required** : True + + +**alias** : The alias for an entity at the time of submission. +**alias** : alias +**data type** : string +**required** : True diff --git a/docs/metadata/worksheets/AnalysisProcess.md b/docs/metadata/worksheets/AnalysisProcess.md new file mode 100644 index 0000000..7c87a1f --- /dev/null +++ b/docs/metadata/worksheets/AnalysisProcess.md @@ -0,0 +1,37 @@ +# AnalysisProcess + +## Description + +None + +## Fields + + +**analysis** : The Analysis the AnalysisProcess was part of +**alias** : analysis +**data type** : Analysis +**required** : True + + +**study_input_files** : The StudyFile associated used as an input for an entity. +**alias** : study_input_files +**data type** : StudyFile +**required** : False + + +**sample_input_files** : The SampleFile associated used as an input for an entity. +**alias** : sample_input_files +**data type** : SampleFile +**required** : False + + +**sequencing_process_input_files** : The SequencingProcessFile associated used as an input for an entity. +**alias** : sequencing_process_input_files +**data type** : SequencingProcessFile +**required** : False + + +**alias** : The alias for an entity at the time of submission. +**alias** : alias +**data type** : string +**required** : True diff --git a/docs/metadata/worksheets/AnalysisProcessOutputFile.md b/docs/metadata/worksheets/AnalysisProcessOutputFile.md new file mode 100644 index 0000000..308a3ab --- /dev/null +++ b/docs/metadata/worksheets/AnalysisProcessOutputFile.md @@ -0,0 +1,61 @@ +# AnalysisProcessOutputFile + +## Description + +A AnalysisProcessOutputFile is a File that is associated as an output file with an AnalysisProcess. 
+ +## Fields + + +**analysis_process** : The AnalysisProcess associated with an entity. +**alias** : analysis_process +**data type** : AnalysisProcess +**required** : True + + +**name** : The given filename. +**alias** : name +**data type** : string +**required** : True + + +**format** : The format of the file: BAM, SAM, CRAM, BAI, etc. +**alias** : format +**data type** : FileFormatEnum +**required** : True + + +**size** : The size of a file in bytes. +**alias** : size +**data type** : integer +**required** : True + + +**checksum** : A computed value which depends on the contents of a block of data and which is transmitted or stored along with the data in order to detect corruption of the data. The receiving system recomputes the checksum based upon the received data and compares this value with the one sent with the data. If the two values are the same, the receiver has some confidence that the data was received correctly. +**alias** : checksum +**data type** : string +**required** : True + + +**forward_or_reverse** : Denotes whether a submitted FASTQ file contains forward (R1) or reverse (R2) reads for paired-end sequencing. The number that identifies each read direction in a paired-end nucleotide sequencing reaction. +**alias** : forward_or_reverse +**data type** : ForwardOrReverseEnum +**required** : False + + +**checksum_type** : The type of algorithm used to generate the checksum of a file. +**alias** : checksum_type +**data type** : string +**required** : True + + +**dataset** : The Dataset associated with an entity. +**alias** : dataset +**data type** : Dataset +**required** : True + + +**alias** : The alias for an entity at the time of submission. 
+**alias** : alias +**data type** : string +**required** : True diff --git a/docs/metadata/worksheets/Biospecimen.md b/docs/metadata/worksheets/Biospecimen.md new file mode 100644 index 0000000..b14be4f --- /dev/null +++ b/docs/metadata/worksheets/Biospecimen.md @@ -0,0 +1,67 @@ +# Biospecimen + +## Description + +A Biospecimen is any natural material taken from a biological entity (usually a human) for testing, diagnostics, treatment, or research purposes. The Biospecimen is linked to the Individual from which the Biospecimen is derived. + +## Fields + + +**name** : The name for an entity. +**alias** : name +**data type** : string +**required** : False + + +**type** : The type of Biospecimen. +**alias** : type +**data type** : string +**required** : False + + +**description** : Description of an entity. +**alias** : description +**data type** : string +**required** : False + + +**isolation** : Method or device employed for collecting/isolating a biospecimen or a sample. +**alias** : isolation +**data type** : string +**required** : False + + +**storage** : Methods by which a biospecimen or a sample is stored (e.g. frozen in liquid nitrogen). +**alias** : storage +**data type** : string +**required** : False + + +**individual** : The Individual entity from which this Biospecimen was derived. +**alias** : individual +**data type** : Individual +**required** : True + + +**age_at_sampling** : Age of an individual. +**alias** : age_at_sampling +**data type** : AgeRangeEnum +**required** : True + + +**vital_status_at_sampling** : Vital Status of an Individual at the point of sampling (eg:'Alive', 'Deceased'). +**alias** : vital_status_at_sampling +**data type** : VitalStatusEnum +**required** : False + + +**tissue** : None +**alias** : tissue +**data type** : string +**required** : True + + +**alias** : The alias for an entity at the time of submission. 
+**alias** : alias +**data type** : string +**required** : True diff --git a/docs/metadata/worksheets/Condition.md b/docs/metadata/worksheets/Condition.md new file mode 100644 index 0000000..c51b111 --- /dev/null +++ b/docs/metadata/worksheets/Condition.md @@ -0,0 +1,61 @@ +# Condition + +## Description + +A condition that is linked to comparable samples. + +## Fields + + +**title** : The title that describes an entity. +**alias** : title +**data type** : string +**required** : None + + +**description** : Description of an entity. +**alias** : description +**data type** : string +**required** : True + + +**name** : The name for an entity. +**alias** : name +**data type** : string +**required** : True + + +**disease_or_healthy** : Whether a condition corresponds to a disease or a healthy state. +**alias** : disease_or_healthy +**data type** : DiseaseOrHealthyEnum +**required** : True + + +**case_control_status** : Whether a condition corresponds to a treatment or a control. +**alias** : case_control_status +**data type** : CaseControlStatusEnum +**required** : True + + +**mutant_or_wildtype** : Whether a condition corresponds to a mutant or a wildtype. +**alias** : mutant_or_wildtype +**data type** : MutantOrWildtypeEnum +**required** : True + + +**study** : The study associated with an entity. +**alias** : study +**data type** : Study +**required** : True + + +**attributes** : Key/value pairs corresponding to an entity. +**alias** : attributes +**data type** : Attribute +**required** : False + + +**alias** : The alias for an entity at the time of submission. 
+**alias** : alias +**data type** : string +**required** : True diff --git a/docs/metadata/worksheets/DataAccessCommittee.md b/docs/metadata/worksheets/DataAccessCommittee.md new file mode 100644 index 0000000..ef28ac3 --- /dev/null +++ b/docs/metadata/worksheets/DataAccessCommittee.md @@ -0,0 +1,25 @@ +# DataAccessCommittee + +## Description + +A group of members that are delegated to grant access to one or more datasets after ensuring the minimum criteria for data sharing has been met, and request for data use does not raise ethical and/or legal concerns. + +## Fields + + +**email** : Email of a person. +**alias** : email +**data type** : string +**required** : True + + +**institute** : The institute a person is affiliated with. +**alias** : institute +**data type** : string +**required** : True + + +**alias** : The alias for an entity at the time of submission. +**alias** : alias +**data type** : string +**required** : True diff --git a/docs/metadata/worksheets/DataAccessPolicy.md b/docs/metadata/worksheets/DataAccessPolicy.md new file mode 100644 index 0000000..670fa1c --- /dev/null +++ b/docs/metadata/worksheets/DataAccessPolicy.md @@ -0,0 +1,55 @@ +# DataAccessPolicy + +## Description + +A Data Access Policy specifies under which circumstances, legal or otherwise, a user can have access to one or more Datasets belonging to one or more Studies. + +## Fields + + +**name** : A name for the Data Access Policy. +**alias** : name +**data type** : string +**required** : True + + +**description** : A short description for the Data Access Policy. +**alias** : description +**data type** : string +**required** : True + + +**policy_text** : The terms of data use and policy verbiage should be captured here. +**alias** : policy_text +**data type** : string +**required** : True + + +**policy_url** : URL for the policy, if available. This is useful if the terms of the policy is made available online at a resolvable URL. 
+**alias** : policy_url +**data type** : string +**required** : False + + +**data_access_committee** : The Data Access Committee linked to this policy. +**alias** : data_access_committee +**data type** : DataAccessCommittee +**required** : True + + +**data_use_permission** : Data use permission associated with a policy. Typically one or more terms from DUO and should be descendants of 'DUO:0000001 data use permission'. +**alias** : data_use_permission +**data type** : DataUsePermissionEnum +**required** : True + + +**data_use_modifiers** : Modifier for Data use permission associated with a policy. Should be descendants of 'DUO:0000017 data use modifier' +**alias** : data_use_modifiers +**data type** : DataUseModifierEnum +**required** : False + + +**alias** : The alias for an entity at the time of submission. +**alias** : alias +**data type** : string +**required** : True diff --git a/docs/metadata/worksheets/Dataset.md b/docs/metadata/worksheets/Dataset.md new file mode 100644 index 0000000..2f99c50 --- /dev/null +++ b/docs/metadata/worksheets/Dataset.md @@ -0,0 +1,37 @@ +# Dataset + +## Description + +A Dataset is a collection of Files that is prepared for distribution and is tied to a Data Access Policy. + +## Fields + + +**title** : A title for the submitted Dataset. +**alias** : title +**data type** : string +**required** : True + + +**description** : Description of an entity. +**alias** : description +**data type** : string +**required** : True + + +**types** : The type of a dataset. +**alias** : types +**data type** : string +**required** : True + + +**data_access_policy** : The Data Access Policy that applies to this Dataset. +**alias** : data_access_policy +**data type** : DataAccessPolicy +**required** : True + + +**alias** : The alias for an entity at the time of submission. 
+**alias** : alias +**data type** : string +**required** : True diff --git a/docs/metadata/worksheets/Individual.md b/docs/metadata/worksheets/Individual.md new file mode 100644 index 0000000..2f6a9d6 --- /dev/null +++ b/docs/metadata/worksheets/Individual.md @@ -0,0 +1,43 @@ +# Individual + +## Description + +An Individual is a Person who is participating in a Study. + +## Fields + + +**sex** : The assemblage of physical properties or qualities by which male is distinguished from female; the physical difference between male and female; the distinguishing peculiarity of male or female. +**alias** : sex +**data type** : IndividualSexEnum +**required** : True + + +**karyotype** : The karyotype of an individual if defined. +**alias** : karyotype +**data type** : KaryotypeEnum +**required** : False + + +**geographical_region** : The geographical region where the Individual is located. Any demarcated area of the Earth; may be determined by both natural and human boundaries. +**alias** : geographical_region +**data type** : string +**required** : False + + +**ancestries** : A person's descent or lineage, from a person or from a population. +**alias** : ancestries +**data type** : string +**required** : False + + +**phenotypic_features** : The Phenotypic Feature entity that is associated with this Biospecimen at the time of retrieval from the organism. Typically, a concept from Human Phenotype Ontology. For example, 'HP:0100244' indicates that the Individual - from_which_the_Biospecimen was extracted from - exhibits_'Fibrosarcoma'_as_one_of_its_phenotype. +**alias** : phenotypic_features +**data type** : string +**required** : False + + +**alias** : The alias for an entity at the time of submission. 
+**alias** : alias +**data type** : string +**required** : True diff --git a/docs/metadata/worksheets/LibraryPreparationProtocol.md b/docs/metadata/worksheets/LibraryPreparationProtocol.md new file mode 100644 index 0000000..ecb337a --- /dev/null +++ b/docs/metadata/worksheets/LibraryPreparationProtocol.md @@ -0,0 +1,91 @@ +# LibraryPreparationProtocol + +## Description + +Information about the library_preparation of a sequencing experiment. + +## Fields + + +**description** : Description about how a sequencing library was prepared (eg: Library construction method). +**alias** : description +**data type** : string +**required** : True + + +**library_name** : A short name identifying the library to potential users. The same name may refer to multiple versions of the same continually updated library. +**alias** : library_name +**data type** : string +**required** : True + + +**library_layout** : Describe whether the library was sequenced in single-end (forward or reverse) or paired-end mode +**alias** : library_layout +**data type** : LibraryPreparationLibraryLayoutEnum +**required** : True + + +**library_type** : Describe the level of omics analysis (eg: Metagenome, transcriptome, etc) +**alias** : library_type +**data type** : LibraryPreparationLibraryTypeEnum +**required** : True + + +**library_selection** : Whether any method was used to select for or against, enrich, or screen the material being sequenced. library_selection method (e.g. random, PCA, cDNA, etc ) +**alias** : library_selection +**data type** : LibraryPreparationLibrarySelectionEnum +**required** : True + + +**library_preparation** : The general method for sequencing library_preparation (e.g. KAPA PCR-free). +**alias** : library_preparation +**data type** : string +**required** : True + + +**library_preparation_kit_retail_name** : A unique identifier for the kit used to construct a genomic library. This may include the vendor name, kit name and kit version (e.g. 
Agilent sure select Human Exome V8, Twist RefSeq Exome, etc.) +**alias** : library_preparation_kit_retail_name +**data type** : LibraryPreparationKitRetailNameEnum +**required** : False + + +**library_preparation_kit_manufacturer** : Manufacturer of library_preparation kit +**alias** : library_preparation_kit_manufacturer +**data type** : string +**required** : False + + +**primer** : The type of primer used for reverse transcription, e.g. 'oligo-dT' or 'random' primer. This allows users to identify content of the cDNA library input e.g. enriched for mRNA. +**alias** : primer +**data type** : PrimerEnum +**required** : False + + +**end_bias** : The end of the cDNA molecule that is preferentially sequenced, e.g. 3/5 prime tag or end, or the full-length transcript. +**alias** : end_bias +**data type** : EndBiasEnum +**required** : False + + +**target_regions** : Subset of genes or specific regions of the genome, which are most likely to be involved in the phenotype under study. +**alias** : target_regions +**data type** : string +**required** : False + + +**rnaseq_strandedness** : The strandedness of the library, whether reads come from both strands of the cDNA or only from the first (antisense) or the second (sense) strand. +**alias** : rnaseq_strandedness +**data type** : LibraryPreparationRNASeqStrandednessEnum +**required** : False + + +**attributes** : One or more attributes that further characterizes this library_preparation Protocol. +**alias** : attributes +**data type** : Attribute +**required** : False + + +**alias** : The alias for an entity at the time of submission. +**alias** : alias +**data type** : string +**required** : True diff --git a/docs/metadata/worksheets/Publication.md b/docs/metadata/worksheets/Publication.md new file mode 100644 index 0000000..6deb294 --- /dev/null +++ b/docs/metadata/worksheets/Publication.md @@ -0,0 +1,61 @@ +# Publication + +## Description + +The Publication entity represents a publication. 
While a publication can be any article that is published, the minimum expectation is that the publication has a valid DOI. + +## Fields + + +**title** : The title for the Publication. +**alias** : title +**data type** : string +**required** : False + + +**abstract** : The study abstract that describes the goals. Can also hold abstract from a publication related to this study. +**alias** : abstract +**data type** : string +**required** : False + + +**author** : The individual who is responsible for the content of a document version. +**alias** : author +**data type** : string +**required** : False + + +**year** : Year in which the paper was published. +**alias** : year +**data type** : integer +**required** : False + + +**journal** : Name of the journal. +**alias** : journal +**data type** : string +**required** : False + + +**doi** : DOI identifier of the Publication. +**alias** : doi +**data type** : string +**required** : True + + +**study** : The Study entity associated with this Publication. +**alias** : study +**data type** : Study +**required** : True + + +**xref** : One or more cross-references for this Publication. +**alias** : xref +**data type** : string +**required** : False + + +**alias** : The alias for an entity at the time of submission. +**alias** : alias +**data type** : string +**required** : True diff --git a/docs/metadata/worksheets/Sample.md b/docs/metadata/worksheets/Sample.md new file mode 100644 index 0000000..d6e0a19 --- /dev/null +++ b/docs/metadata/worksheets/Sample.md @@ -0,0 +1,67 @@ +# Sample + +## Description + +A sample is a limited quantity of something to be used for testing, analysis, inspection, investigation, demonstration, or trial use. A sample is prepared from a Biospecimen (isolate or tissue). + +## Fields + + +**name** : Name of the sample (eg:GHGAS_Blood_Sample1 or GHGAS_PBMC_RNAseq_S1). +**alias** : name +**data type** : string +**required** : True + + +**type** : The type of sample. 
+**alias** : type +**data type** : SampleTypeEnum +**required** : False + + +**description** : Short textual description of the sample (How the sample was collected, sample source, Protocol followed for processing the sample etc). +**alias** : description +**data type** : string +**required** : True + + +**isolation** : Method or device employed for collecting/isolating a biospecimen or a sample. +**alias** : isolation +**data type** : string +**required** : False + + +**storage** : Methods by which a biospecimen or a sample is stored (e.g. frozen in liquid nitrogen). +**alias** : storage +**data type** : string +**required** : False + + +**biospecimen** : The Biospecimen from which this Sample was prepared. +**alias** : biospecimen +**data type** : Biospecimen +**required** : False + + +**condition** : The condition associated with an entity. +**alias** : condition +**data type** : Condition +**required** : True + + +**xref** : One or more cross-references for this Sample. For example, this Sample may have an EBI BioSamples accession or an EGA Sample accession. +**alias** : xref +**data type** : string +**required** : False + + +**attributes** : Key/value pairs corresponding to an entity. +**alias** : attributes +**data type** : Attribute +**required** : False + + +**alias** : The alias for an entity at the time of submission. +**alias** : alias +**data type** : string +**required** : True diff --git a/docs/metadata/worksheets/SampleFile.md b/docs/metadata/worksheets/SampleFile.md new file mode 100644 index 0000000..5e614ea --- /dev/null +++ b/docs/metadata/worksheets/SampleFile.md @@ -0,0 +1,61 @@ +# SampleFile + +## Description + +A SampleFile is a File that is associated with a Sample. + +## Fields + + +**sample** : The sample associated with an entity. +**alias** : sample +**data type** : Sample +**required** : True + + +**name** : The given filename. 
+**alias** : name +**data type** : string +**required** : True + + +**format** : The format of the file: BAM, SAM, CRAM, BAI, etc. +**alias** : format +**data type** : FileFormatEnum +**required** : True + + +**size** : The size of a file in bytes. +**alias** : size +**data type** : integer +**required** : True + + +**checksum** : A computed value which depends on the contents of a block of data and which is transmitted or stored along with the data in order to detect corruption of the data. The receiving system recomputes the checksum based upon the received data and compares this value with the one sent with the data. If the two values are the same, the receiver has some confidence that the data was received correctly. +**alias** : checksum +**data type** : string +**required** : True + + +**forward_or_reverse** : Denotes whether a submitted FASTQ file contains forward (R1) or reverse (R2) reads for paired-end sequencing. The number that identifies each read direction in a paired-end nucleotide sequencing reaction. +**alias** : forward_or_reverse +**data type** : ForwardOrReverseEnum +**required** : False + + +**checksum_type** : The type of algorithm used to generate the checksum of a file. +**alias** : checksum_type +**data type** : string +**required** : True + + +**dataset** : The Dataset associated with an entity. +**alias** : dataset +**data type** : Dataset +**required** : True + + +**alias** : The alias for an entity at the time of submission. 
+**alias** : alias +**data type** : string +**required** : True diff --git a/docs/metadata/worksheets/SequencingExperiment.md b/docs/metadata/worksheets/SequencingExperiment.md new file mode 100644 index 0000000..387d90b --- /dev/null +++ b/docs/metadata/worksheets/SequencingExperiment.md @@ -0,0 +1,49 @@ +# SequencingExperiment + +## Description + +A sequencing experiment is an investigation that consists of a coordinated set of actions and observations designed to generate data with the goal of verifying, falsifying, or establishing the validity of a hypothesis. + +## Fields + + +**title** : Name for the experiment (eg: GHGAE_PBMC_RNAseq). +**alias** : title +**data type** : string +**required** : None + + +**description** : A detailed description of the Experiment. +**alias** : description +**data type** : string +**required** : True + + +**type** : The type of sequencing experiment. +**alias** : type +**data type** : string +**required** : False + + +**sequencing_protocol** : The sequencing protocol associated with an entity. +**alias** : sequencing_protocol +**data type** : SequencingProtocol +**required** : True + + +**library_preparation_protocol** : The library_preparation Protocol associated with an entity. +**alias** : library_preparation_protocol +**data type** : LibraryPreparationProtocol +**required** : True + + +**attributes** : Key/value pairs corresponding to an entity. +**alias** : attributes +**data type** : Attribute +**required** : False + + +**alias** : The alias for an entity at the time of submission. +**alias** : alias +**data type** : string +**required** : True diff --git a/docs/metadata/worksheets/SequencingProcess.md b/docs/metadata/worksheets/SequencingProcess.md new file mode 100644 index 0000000..02d6ff1 --- /dev/null +++ b/docs/metadata/worksheets/SequencingProcess.md @@ -0,0 +1,79 @@ +# SequencingProcess + +## Description + +A sequencing process linking a sample to sequencing output. 
+ +## Fields + + +**title** : The title that describes an entity. +**alias** : title +**data type** : string +**required** : None + + +**description** : Description of an entity. +**alias** : description +**data type** : string +**required** : True + + +**name** : The name for an entity. +**alias** : name +**data type** : string +**required** : True + + +**sequencing_run_id** : Identifier of the sequencing run. Used for batch correction. +**alias** : sequencing_run_id +**data type** : string +**required** : False + + +**sequencing_lane_id** : Identifier of the sequencing lane. Used for batch correction. +**alias** : sequencing_lane_id +**data type** : string +**required** : False + + +**sequencing_machine_id** : Identifier of the sequencing machine. Used for batch correction. +**alias** : sequencing_machine_id +**data type** : string +**required** : False + + +**sequencing_experiment** : The sequencing experiment associated with an entity. +**alias** : sequencing_experiment +**data type** : SequencingExperiment +**required** : True + + +**index_sequence** : A unique nucleotide sequence that is added to a sample during library_preparation to serve as a unique identifier for the sample. +**alias** : index_sequence +**data type** : string +**required** : False + + +**lane_number** : The numerical identifier for the lane or machine unit where a sample was located during nucleotide sequencing. +**alias** : lane_number +**data type** : string +**required** : False + + +**sample** : The sample associated with an entity. +**alias** : sample +**data type** : Sample +**required** : True + + +**attributes** : Key/value pairs corresponding to an entity. +**alias** : attributes +**data type** : Attribute +**required** : False + + +**alias** : The alias for an entity at the time of submission. 
+**alias** : alias +**data type** : string +**required** : True diff --git a/docs/metadata/worksheets/SequencingProcessFile.md b/docs/metadata/worksheets/SequencingProcessFile.md new file mode 100644 index 0000000..ff77cd7 --- /dev/null +++ b/docs/metadata/worksheets/SequencingProcessFile.md @@ -0,0 +1,61 @@ +# SequencingProcessFile + +## Description + +A SequencingProcessFile is a File that is associated with a SequencingProcess. + +## Fields + + +**sequencing_process** : The SequencingProcess associated with an entity. +**alias** : sequencing_process +**data type** : SequencingProcess +**required** : True + + +**name** : The given filename. +**alias** : name +**data type** : string +**required** : True + + +**format** : The format of the file: BAM, SAM, CRAM, BAI, etc. +**alias** : format +**data type** : FileFormatEnum +**required** : True + + +**size** : The size of a file in bytes. +**alias** : size +**data type** : integer +**required** : True + + +**checksum** : A computed value which depends on the contents of a block of data and which is transmitted or stored along with the data in order to detect corruption of the data. The receiving system recomputes the checksum based upon the received data and compares this value with the one sent with the data. If the two values are the same, the receiver has some confidence that the data was received correctly. +**alias** : checksum +**data type** : string +**required** : True + + +**forward_or_reverse** : Denotes whether a submitted FASTQ file contains forward (R1) or reverse (R2) reads for paired-end sequencing. The number that identifies each read direction in a paired-end nucleotide sequencing reaction. +**alias** : forward_or_reverse +**data type** : ForwardOrReverseEnum +**required** : False + + +**checksum_type** : The type of algorithm used to generate the checksum of a file. +**alias** : checksum_type +**data type** : string +**required** : True + + +**dataset** : The Dataset associated with an entity. 
+**alias** : dataset +**data type** : Dataset +**required** : True + + +**alias** : The alias for an entity at the time of submission. +**alias** : alias +**data type** : string +**required** : True diff --git a/docs/metadata/worksheets/SequencingProtocol.md b/docs/metadata/worksheets/SequencingProtocol.md new file mode 100644 index 0000000..7f9f870 --- /dev/null +++ b/docs/metadata/worksheets/SequencingProtocol.md @@ -0,0 +1,109 @@ +# SequencingProtocol + +## Description + +Information about the sequencing of a sample. + +## Fields + + +**description** : Description about the sequencing Protocol (eg: mRNA-seq, Whole exome long-read sequencing etc). +**alias** : description +**data type** : string +**required** : True + + +**type** : Type of the sequencing Protocol (eg: mRNA-seq, Whole exome long-read sequencing etc). +**alias** : type +**data type** : string +**required** : None + + +**instrument_model** : The name and model of the technology platform used to perform sequencing. +**alias** : instrument_model +**data type** : InstrumentModelEnum +**required** : True + + +**sequencing_center** : Center where sample was sequenced. +**alias** : sequencing_center +**data type** : string +**required** : False + + +**sequencing_read_length** : Length of sequencing reads (eg: Long or short or actual number of the read length etc). The number of nucleotides successfully ordered from each side of a nucleic acid fragment obtained after the completion of a sequencing process +**alias** : sequencing_read_length +**data type** : string +**required** : False + + +**target_coverage** : Mean coverage for whole genome sequencing, or mean target coverage for whole exome and targeted sequencing. The number of times a particular locus (site, nucleotide, amplicon, region) was sequenced. +**alias** : target_coverage +**data type** : string +**required** : False + + +**flow_cell_id** : Flow Cell ID (eg: Experiment ID_Cell 1_Lane_1). 
The barcode assigned to a flow cell used in nucleotide sequencing. +**alias** : flow_cell_id +**data type** : string +**required** : False + + +**flow_cell_type** : Type of flow cell used (e.g. S4, S2 for NovaSeq; PromethION, Flongle for Nanopore). Apparatus in the fluidic subsystem where the sheath and sample meet. Can be one of several types: jet-in-air, quartz cuvette, or a hybrid of the two. The sample flows through the center of a fluid column of sheath fluid in the flow cell. +**alias** : flow_cell_type +**data type** : FlowCellTypeEnum +**required** : False + + +**umi_barcode_read** : The type of read that contains the UMI barcode (Eg: index1/index2/read1/read2). +**alias** : umi_barcode_read +**data type** : IndexReadEnum +**required** : False + + +**umi_barcode_offset** : The offset in sequence of the UMI identifying barcode. (E.g. '16'). +**alias** : umi_barcode_offset +**data type** : string +**required** : False + + +**umi_barcode_size** : The size of the UMI identifying barcode (Eg. '10'). +**alias** : umi_barcode_size +**data type** : string +**required** : False + + +**cell_barcode_read** : The type of read that contains the cell barcode (eg: index1/index2/read1/read2). +**alias** : cell_barcode_read +**data type** : IndexReadEnum +**required** : False + + +**cell_barcode_offset** : The offset in sequence of the cell identifying barcode. (Eg. '0'). +**alias** : cell_barcode_offset +**data type** : string +**required** : False + + +**cell_barcode_size** : The size of the cell identifying barcode (E.g. '16'). +**alias** : cell_barcode_size +**data type** : string +**required** : False + + +**sample_barcode_read** : The type of read that contains the sample barcode (eg: index1/index2/read1/read2). +**alias** : sample_barcode_read +**data type** : SampleBarcodeReadEnum +**required** : False + + +**attributes** : One or more attributes that further characterize this Sequencing Protocol.
+**alias** : attributes +**data type** : Attribute +**required** : False + + +**alias** : The alias for an entity at the time of submission. +**alias** : alias +**data type** : string +**required** : True diff --git a/docs/metadata/worksheets/Study.md b/docs/metadata/worksheets/Study.md new file mode 100644 index 0000000..823bfab --- /dev/null +++ b/docs/metadata/worksheets/Study.md @@ -0,0 +1,43 @@ +# Study + +## Description + +Studies are experimental investigations of a particular phenomenon. It involves a detailed examination and analysis of a subject to learn more about the phenomenon being studied. + +## Fields + + +**title** : A comprehensive title for the study. +**alias** : title +**data type** : string +**required** : True + + +**description** : A detailed description (abstract) that describes the goals of this Study. +**alias** : description +**data type** : string +**required** : True + + +**type** : The type of Study. For example, 'Cancer Genomics', 'Epigenetics', 'Exome Sequencing'. +**alias** : type +**data type** : StudyTypeEnum +**required** : True + + +**affiliations** : The Institution(s) associated with an entity. +**alias** : affiliations +**data type** : string +**required** : True + + +**attributes** : Custom key/value pairs that further characterizes the Study. (e.g.: approaches - single-cell,_bulk_etc) +**alias** : attributes +**data type** : Attribute +**required** : False + + +**alias** : The alias for an entity at the time of submission. +**alias** : alias +**data type** : string +**required** : True diff --git a/docs/metadata/worksheets/StudyFile.md b/docs/metadata/worksheets/StudyFile.md new file mode 100644 index 0000000..543c458 --- /dev/null +++ b/docs/metadata/worksheets/StudyFile.md @@ -0,0 +1,61 @@ +# StudyFile + +## Description + +A StudyFile is a File that is associated with a Study. + +## Fields + + +**study** : The study associated with an entity. 
+**alias** : study +**data type** : Study +**required** : True + + +**name** : The given filename. +**alias** : name +**data type** : string +**required** : True + + +**format** : The format of the file: BAM, SAM, CRAM, BAI, etc. +**alias** : format +**data type** : FileFormatEnum +**required** : True + + +**size** : The size of a file in bytes. +**alias** : size +**data type** : integer +**required** : True + + +**checksum** : A computed value which depends on the contents of a block of data and which is transmitted or stored along with the data in order to detect corruption of the data. The receiving system recomputes the checksum based upon the received data and compares this value with the one sent with the data. If the two values are the same, the receiver has some confidence that the data was received correctly. +**alias** : checksum +**data type** : string +**required** : True + + +**forward_or_reverse** : Denotes whether a submitted FASTQ file contains forward (R1) or reverse (R2) reads for paired-end sequencing. The number that identifies each read direction in a paired-end nucleotide sequencing reaction. +**alias** : forward_or_reverse +**data type** : ForwardOrReverseEnum +**required** : False + + +**checksum_type** : The type of algorithm used to generate the checksum of a file. +**alias** : checksum_type +**data type** : string +**required** : True + + +**dataset** : The Dataset associated with an entity. +**alias** : dataset +**data type** : Dataset +**required** : True + + +**alias** : The alias for an entity at the time of submission. +**alias** : alias +**data type** : string +**required** : True diff --git a/docs/metadata/worksheets/Trio.md b/docs/metadata/worksheets/Trio.md new file mode 100644 index 0000000..dc401d6 --- /dev/null +++ b/docs/metadata/worksheets/Trio.md @@ -0,0 +1,31 @@ +# Trio + +## Description + +A trio is defined by three individuals representing an individual and their parents. 
+ +## Fields + + +**mother** : The mother of an individual. +**alias** : mother +**data type** : Individual +**required** : True + + +**father** : The father of an individual. +**alias** : father +**data type** : Individual +**required** : True + + +**child** : The child of two individuals. +**alias** : child +**data type** : Individual +**required** : True + + +**alias** : The alias for an entity at the time of submission. +**alias** : alias +**data type** : string +**required** : True diff --git a/requirements.txt b/requirements.txt index 1a0f626..1d84cb8 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,3 +4,6 @@ pygments markdown pymdown-extensions mkdocs-git-revision-date-localized-plugin +pydantic +jinja2 +linkml-runtime diff --git a/scripts/script_utils/fastapi_app_location.py b/scripts/script_utils/fastapi_app_location.py deleted file mode 100644 index b2ddb1d..0000000 --- a/scripts/script_utils/fastapi_app_location.py +++ /dev/null @@ -1,23 +0,0 @@ -# Copyright 2021 - 2023 Universität Tübingen, DKFZ, EMBL, and Universität zu Köln -# for the German Human Genome-Phenome Archive (GHGA) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# - -"""Used to define the location of the main FastAPI app object.""" - -# flake8: noqa -# pylint: skip-file - -# Please adapt to package structure: -from my_microservice.api.main import app diff --git a/scripts/update_metadata_docs.py b/scripts/update_metadata_docs.py new file mode 100644 index 0000000..d00a25f --- /dev/null +++ b/scripts/update_metadata_docs.py @@ -0,0 +1,158 @@ +# Copyright 2021 - 2023 Universität Tübingen, DKFZ, EMBL, and Universität zu Köln +# for the German Human Genome-Phenome Archive (GHGA) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +"""Script to generate user-facing metadata schema documentation""" + +from pathlib import Path +from typing import Callable + +import requests +import yaml +from jinja2 import Environment, FileSystemLoader +from linkml_runtime.utils.schemaview import SchemaView # type: ignore +from pydantic import BaseModel, Field + +HERE = Path(__file__).parent.resolve() +ROOT = HERE.parent +DOCS_DIR = ROOT / "docs" / "metadata" / "worksheets" +CONFIG_PATH = ROOT / ".workbook_config.yaml" + +TEMPLATE = ".sheet_documentation_template.md" + +SCHEMA_URL = "https://raw.githubusercontent.com/ghga-de/ghga-metadata-schema/main/src/schema/submission.yaml" # pylint: disable=line-too-long + +MAIN_WORKBOOK = "ghga_submission_full.xlsx" + + +class WorkbookConfigurationNotFound(Exception): + """custom error to raise if the workbook config file does not exits in the given path""" + + +class MainSheetNotIdentified(Exception): + """custom error to raise if the config does not have 'ghga_submission_full.xlsx'""" + + +class SchemaNotLoaded(Exception): + """custom error to raise if the schema request return any response code other than 200""" + + +def load_config(config_path=CONFIG_PATH) -> dict: + """Loads config file""" + + try: + with open(config_path, "r", encoding="utf8") as config_file: + return yaml.safe_load(config_file) + except FileNotFoundError as exc: + raise WorkbookConfigurationNotFound( + f"Workbook configuration not found at: {config_path}" + ) from exc + + +def load_schema(schema_url=SCHEMA_URL): + """Loads schema""" + + schema_config = requests.get(schema_url, timeout=3) + if schema_config.status_code == 200: + return SchemaView(schema_config.text) + raise SchemaNotLoaded(f"Schema could not be loaded from {SCHEMA_URL}") + + +def extract_slots_from(schema: SchemaView, sheet_name: str) -> list[dict]: + """Extracts slot information of a given class""" + + return [ + { + "name": definition.name, + "alias": definition.alias, + "description": definition.description, + "data_type": 
definition.range, + "required": definition.required, + } + for definition in schema.class_induced_slots(sheet_name) + ] + + +def generate_workbook( + schema: SchemaView, + sheets: list[str], + get_slots: Callable[[SchemaView, str], list[dict]], +) -> list[dict]: + """Assembles the content of classes and their slots""" + return [ + { + "name": sheet, + "description": schema.get_class(sheet).description, + "slots": get_slots(schema, sheet), + } + for sheet in sheets + ] + + +class WorkbookConfig(BaseModel): + """A workbook configuration""" + + worksheets: list[str] = Field(default_factory=list) + file_name: str + + +class Config(BaseModel): + """Configures multiple workbooks each having a different set of worksheets""" + + workbooks: list[WorkbookConfig] + + @property + def main_workbook(self): + """Extract a workbook configuration based on file_name""" + for sheet in self.workbooks: + if sheet.file_name == MAIN_WORKBOOK: + return sheet + raise MainSheetNotIdentified( + f"No workbook configuration is found for {MAIN_WORKBOOK}" + ) + + +def generate_markdown(content: dict) -> str: + """Generates the markdown text by rendering the content into the template""" + + env = Environment(loader=FileSystemLoader(ROOT)) + template = env.get_template(TEMPLATE) + return template.render(content) + + +def create_doc_file(out_dir: Path, name: str, content: str) -> None: + """Creates a markdown file for a given sheet and content""" + + with open(out_dir / (name + ".md"), mode="w", encoding="utf8") as file: + file.write(content) + + +def main(): + """Patches things together""" + + config = Config.model_validate(load_config()) + if config.main_workbook is None: + raise MainSheetNotIdentified + worksheet_names = config.main_workbook.worksheets + + schema = load_schema() + workbook = generate_workbook(schema, worksheet_names, extract_slots_from) + + for sheet in workbook: + create_doc_file(DOCS_DIR, sheet["name"], generate_markdown(sheet)) + + +if __name__ == "__main__": + main() From 
9a0d7adbf7d3f95e565c7c867b0383b193aa620b Mon Sep 17 00:00:00 2001 From: Karoline Mauer Date: Wed, 30 Aug 2023 11:09:42 +0000 Subject: [PATCH 02/15] update for entity, modules, overview descriptions --- docs/metadata/entities.md | 105 ++++++++++++++++++++++++++++++++------ docs/metadata/modules.md | 16 +++++- docs/metadata/overview.md | 20 ++++++++ 3 files changed, 125 insertions(+), 16 deletions(-) diff --git a/docs/metadata/entities.md b/docs/metadata/entities.md index fce40ab..7e17372 100644 --- a/docs/metadata/entities.md +++ b/docs/metadata/entities.md @@ -1,29 +1,104 @@ -# Entities & Attributes +# **Captured Metadata** -## Dataset +This section provides an overview on what metadata elements are captured with the GHGA Metadata Schema. -## Study +A breakdown of each metadata element described in the different entities will provide more insight on what elements are required for the functionality of GHGA, mandatory properties and recommended or optional information that can be provided by the data submitters. -## File +## **Study** -## Publication +All data deposited at GHGA is subject to a specific study, under which relevant data has been aggregated. A study is an experimental investigation of a particular phenomenon and involves a detailed examination and analysis of a subject to learn more about the phenomenon being studied. A detailed description of a study can guide data requesters to identify the most relevant datasets for their own research. -## Condition +### **Study metadata properties** -## Sample +In order to describe a *Study*, data submitters are required to provide information about the study affiliation(s), title, description and type - i.e.: Cancer Genomic, Epigenomics, etc. - of a study. -## Biospecimen +A study can also be linked to a *Publication*. The *Publication* entity holds the title, id (e.g. DOI of a publication), external reference, abstract, author, year, and journal for a unique publication. 
-## Individual +*Publication* is an optional metadata entity. If it is submitted, its properties become mandatory or optional. -## SequencingProcess +## **Sample** -## LibraryPreparationProtocol +GHGAs *Sample* metadata can be separated into three distinct entities: *Sample*, *Biospecimen* and *Condition*. Both the *Sample* and *Biospecimen* entities provide the data submitter with options to deposit metadata that allows for deeper insight into the characteristics of samples and biospecimen. The *Condition* allows to further define the state of the samples and to group samples within a study accordingly. The following paragraph gives a definition of what a sample, biospecimen or condition is in the context of GHGAs metadata schema. -## SequencingProtocol +A *Sample* is defined as a limited quantity of something to be used for testing, analysis, inspection, investigation, demonstration, or trial use. A sample is prepared from a biospecimen (isolate or tissue). -## SequencingExperiment +A *Biospecimen* is defined in GHGAs metadata as any natural material taken from a biological entity for testing, diagnostics, treatment or research purposes. The *Biospecimen* is linked to the *Individual* entity from which the biospecimen itself has been derived. -## Analysis +A *Condition* describes the state and origin of a sample. It captures actions applied to a sample that were necessary for the specific study in which the sample is used. The *Condition* links the *Sample* to a *Study*. -## AnalysisProcess +### **Sample metadata properties** + +The *Sample* entity requires data submitters to provide the name and description of a sample, as well as the link to the condition. On top of those mandatory properties, a submitter can provide more information on a sample through the type of isolation of the sample, or how it is stored. Further properties held by the sample entity are type (e.g. genomic DNA, single cell RNA or total RNA) and an external reference. 
+ +The *Biospecimen* entity captures only optional information, which reflects the information of the sample entity. These include an alias, a description, isolation, name, type, external ID and storage. Additionally, this entity captures the age at sampling and vital status of the *Biospecimen* donor. + +The *Condition* entity captures information about whether and how a sample was treated and its status (case or control, mutant or wildtype). This information is important to uniquely link samples with the same *Condition* within one *Study*. All properties within the *Condition* are required. + +## **Individual** + +The *Individual* entity within GHGA’s Metadata Schema is aimed at capturing relevant information about a sample’s donor. The content of the individual entity is crucial to identify cohorts of interest and gives valuable insight on the target group of an experiment. Data submitters are asked to provide information such as sex and other phenotypic features that help data requesters to identify a cohort of interest. + +Individuals can be part of a *Trio*. The *Trio* class is a study design often used in studying genetic conditions in a family. It involves the genetic analysis of three individuals within a family unit, usually a child and their biological parents. + +### **Individual metadata properties** +The data submitter is required to provide information about an individual’s sex. Additional information such as phenotypic features, karyotype, geographical region and ethnicity can be submitted. + +## **Sequencing Experiment** + +Omics data is gathered while carrying out an experiment under certain conditions and procedures. Thus, GHGA aims at collecting as much information as possible about the protocols used, which gives data requesters an insight on how an experiment was conducted.
Data submitters are asked to provide a core set of properties to help make the data deposited at GHGA richer, but are welcome to provide all the information that has been generated while carrying out the experiment for the dataset. The insights provided by this collection of information help to make data reusable, one of the main incentives of the FAIR data principles. + +### **Sequencing Experiment metadata properties** +*Sequencing Experiment* metadata elements within GHGA’s Metadata Schema span not only information about the experiment itself but also protocols under which an experiment has been conducted. These include entities for *Sequencing Process*, *Sequencing Protocol* and *Library Preparation Protocol*. + +The *Sequencing Experiment* entity is used to group *Sequencing Processes*, *Library Preparation Protocols* and *Sequencing Protocols* within one *Sequencing Experiment*. Via the *Sequencing Process* and *Condition* entities, it is then linked to *Study*, *Sample* and *File*. + +The *Sequencing Process* captures the technical parameters that were used to generate output from a *Sample* during a *Sequencing Experiment*. + +The *Sequencing Protocol* entity gives a variety of properties that a data submitter can submit. Mandatory properties are the sequencing alias, the description of the sequencing protocol and the provision of the instrument model with which the sequencing was done. Optional properties include the offset, read and size for cell and UMI barcodes, flow cell id and type, read length, target coverage, and the type of sequencing protocol used. + +The *Library Preparation Protocol* entity requires a data submitter to provide the following mandatory properties in order to allow reproducible research: library layout, type, selection and preparation, as well as a name for the protocol, an alias and a thorough description.
Optional properties include information about the kit retail name and manufacturer, the library primer, the RNAseq strandedness, target regions, primer end bias and the type of the protocol. If there is a publicly referable url for the protocol, this can also be submitted. + + +## **File** + +At the core of GHGA is the deposition of raw files that have been generated while carrying out an experiment. These files also have to be annotated with metadata, in order to give data requesters more information on what files have been deposited at GHGA by the data submitter. Therefore this metadata will also be used by the user interface of GHGAs data portal to provide not only information on how many files are contained within a dataset, but also information on file size, file formats, checksums and file types. + +The files deposited at GHGA, and their metadata, will always link to an experiment entity. + +### **File metadata properties** + +The *File* entity requires submitters to provide the name, the alias and the format for a file. During the GHGA Catalog phase, the submitter also has to provide information about the file size, checksum and checksum type. + +## **Analysis** + +GHGA will provide data analysis of the raw files deposited at GHGA by a data submitter. The *Analysis* entity will aim at storing metadata related to the computational analysis of the files that potentially will be run using containers and nf-core pipelines. The information that will be stored in the *Analysis* entity will help to make the analysis data reproducible and reusable with respect to the FAIR data principles. + +### **Analysis metadata properties** + +The data submitter is required to provide an analysis alias, the aliases for the input and output files, as well as the link to a study and the reference genome or chromosome(s) used for the analysis. Optional properties include a description of the analysis and the analysis type. 
Additionally, users can submit an *Analysis Process*, which captures the workflow steps that were performed to analyze data obtained from sequencing experiments. + +## **Dataset** + +GHGA presents its content to potential data requesters and submitters with the *Dataset* entity, which focuses on sharing functionality by describing the contents at a high level. Each dataset is linked to a *Data Access Policy*, which builds the legal basis for the sharing of data. One dataset has links to *Experiment* and / or *Analysis* entities to bundle all relevant data that makes a dataset by the definition of the GHGA Metadata Schema. + +### **Dataset metadata properties** + +The *Dataset* entity is aimed at capturing relevant information about a dataset itself. The data submitter can provide a description and a title for the dataset. The main purpose of this entity is to link a dataset to the related study, experiments, samples, analysis, files and data access policies. These links must be provided on the submission of data, either through automatic linking with respect to the *Data Access Committee*, or by the data submitter. + +All properties captured in the *Dataset* entity are required for the functionality of GHGA and are therefore mandatory. The only exception is the analysis alias, which only needs to be provided if an analysis is to be submitted. A title and description can be indexed by the database in order to make the GHGA Data Portal searchable for a specific dataset. In addition, the links to study, experiment, samples, analysis (if available) and files are necessary to provide a data requester with all relevant data and metadata associated with a dataset. This also ensures reusability in the light of the FAIR Data Principles. + +## **Data Access Policy and Committee** + +Depositing data at GHGA requires a data submitter to provide a *Data Access Committee (DAC)* and *Data Access Policy (DAP)*.
This ensures controlled access to their deposited data and a clear guideline for data requesters to access the data. This includes a defined contact person and a consent-based legal basis for getting access to a dataset. + +### **Data Access Policy and Committee metadata properties** + +The *DAC* entity bundles necessary information that is required to identify the Data Controller of the deposited data. Therefore a name and description for the *DAC*, and the main contact have to be provided upon submission. The information about a contact includes the email address and the associated affiliation. + +A *DAP* is directly linked to the *DAC* and *Dataset* entity, thus providing the condition under which the data deposited at GHGA can be re-used by a data requester. The submitter must provide an alias, name, description and either the policy text for the *DAP* or the URL where the *DAP* is stored. The *DAP* needs to be linked to the *DAC* and *Dataset*. + +To systematically and semantically identify the conditions under which deposited data can be reused, data submitters can optionally provide DUO terms that are used to identify the research purpose under which the data can be requested, e.g. General Research Use (DUO:0000042), research specific restrictions (DUO:0000012). + +## **Submission Spreadsheet** + +The Submission Spreadsheet for GHGA Archive captures the above-mentioned metadata in an ordered fashion. Data submitters are given a predefined set of properties to describe the data, which they are aiming to deposit. Furthermore in the initial phase, data submitters are asked to provide additional information using key-value pairs. That means, metadata properties which are not yet covered within the metadata catalog for GHGA Archive can be provided with a descriptive property title and the corresponding value. Leaving the freedom to the submitters to provide as much information as is being captured in their study. 
Since information submitted through the attributes-property is not controllable by GHGA, attributes are considered restricted metadata and will not be visible in the GHGA Catalog. diff --git a/docs/metadata/modules.md b/docs/metadata/modules.md index a55ecc0..e8fdfc0 100644 --- a/docs/metadata/modules.md +++ b/docs/metadata/modules.md @@ -1 +1,15 @@ -# Modules +# **Modules in the GHGA Metadata Model** + +- **Basic Module**: The Basic Module is the fundamental module in the GHGA Metadata Schema. It covers the minimal amount of information that must be included in a successful submission. + +- **Sample Module**: Every Basic Module can be linked to one or more Sample Modules. This module contains information relating to the sample that was later sequenced in a sequencing experiment. + +- **Phenotype Module**: One Sample Module can have one or more Phenotype Modules. This module can be used when a sample originated from a ‘Biospecimen’ or an ‘Individual’ and thus allows to group several Sample Modules based on the sample origin. In addition, the Phenotype Module captures detailed information about phenotypes or individual demographics. + +- **Sequencing Module**: One Sample Module can also be linked to one or more Sequencing Modules. The Sequencing Module captures information about the ‘Sequencing Process’, such as the sequencing and library preparation protocols. + +- **Data Use Conditions Module**: The Data Use Conditions Module captures in granular detail what restrictions and use conditions are associated with a Data Access Policy. This section also captures the Data Access Committee that enforces the Data Access Policy requirements. + +- **Dataset Module**: The Dataset Module contains the ‘Dataset’ entity, which is a collection of one or more Files from one or more Modules. All Files within the Dataset Module are subject to the Data Access Policy that is captured in the Data Use Conditions Module. 
One Dataset Module can only be linked to one Data Use Conditions Module. + +- **Analysis Module**: A dataset can have one or more Analysis Modules where each Analysis Module links to one or more files as input to the Analysis, one or more files as output to the Analysis, and the ‘Analysis Process’ that captures how the analysis was performed. diff --git a/docs/metadata/overview.md b/docs/metadata/overview.md index b50d7c6..1079335 100644 --- a/docs/metadata/overview.md +++ b/docs/metadata/overview.md @@ -1 +1,21 @@ # The GHGA Metadata Model +## **Glossary** +- **Entity**: An Entity holds characteristics of a real-world object. Example: The Individual entity is described by the information (properties) for sex, year of birth and height. + + - Synonyms: class, table, object + +- **Property**: A Property is a single characteristic that can be used in combination with other characteristics to describe a real-world object. Example: The combination of the properties sex, year of birth and height describe the (real-world object) entity Individual. + + - Synonyms: attribute, element, field, slot + +- **FAIR**: Findable, Accessible, Interoperable, Reusable + +## **Introduction** +The German Human Genome-Phenome Archive (GHGA) provides a nation-wide resource for archiving, accessing and sharing of multi-omics data produced and processed in research and health care initiatives in Germany. GHGA aims to bring these data together and make it easier to find data for secondary use, by adopting and adhering to [FAIR data principles](https://doi.org/10.1038/sdata.2016.18). In order to meet the domain-specific requirements we developed the GHGA Metadata Schema - a schema for representing information pertaining to various aspects of our data. + +This documentation serves as the description and reasoning behind the Metadata Model of GHGA, which encapsulates the metadata schema, its technical implementation, and resources to support submission of metadata. 
The Archive function of GHGA is envisioned to handle a wide variety of omics and research data. The Metadata Model is architecturally flexible and can be expanded with specific fields using domain and technology specific modules. + + +The core of the schema is built such that it can be expanded to accommodate genomic, epigenetic, transcriptomic, clinical, and other forms of medical data. Our initial focus is on research data from the Cancer and Rare Diseases communities. These communities can benefit greatly by improving the exchange of data and associated metadata. + +Furthermore we provide data submitters with a Submission Spreadsheet in order to easily deposit their data within GHGA. From 69a75b923088105e783bfbc84647a978014f62d9 Mon Sep 17 00:00:00 2001 From: sbilge Date: Wed, 30 Aug 2023 13:02:58 +0000 Subject: [PATCH 03/15] enum extraction added --- ... => .sheet_documentation_template.md.jinja | 4 +- docs/metadata/worksheets/Analysis.md | 11 --- docs/metadata/worksheets/AnalysisProcess.md | 9 --- .../worksheets/AnalysisProcessOutputFile.md | 21 +---- docs/metadata/worksheets/Biospecimen.md | 23 +----- docs/metadata/worksheets/Condition.md | 23 +----- .../worksheets/DataAccessCommittee.md | 5 -- docs/metadata/worksheets/DataAccessPolicy.md | 19 +---- docs/metadata/worksheets/Dataset.md | 9 --- docs/metadata/worksheets/Individual.md | 15 +--- .../worksheets/LibraryPreparationProtocol.md | 41 ++-------- docs/metadata/worksheets/Publication.md | 17 ---- docs/metadata/worksheets/Sample.md | 21 +---- docs/metadata/worksheets/SampleFile.md | 21 +---- .../worksheets/SequencingExperiment.md | 13 --- docs/metadata/worksheets/SequencingProcess.md | 23 ------ .../worksheets/SequencingProcessFile.md | 21 +---- .../metadata/worksheets/SequencingProtocol.md | 43 ++-------- docs/metadata/worksheets/Study.md | 13 +-- docs/metadata/worksheets/StudyFile.md | 21 +---- docs/metadata/worksheets/Trio.md | 7 -- scripts/update_metadata_docs.py | 79 +++++++++++-------- 22 files 
changed, 78 insertions(+), 381 deletions(-) rename .sheet_documentation_template.md => .sheet_documentation_template.md.jinja (55%) diff --git a/.sheet_documentation_template.md b/.sheet_documentation_template.md.jinja similarity index 55% rename from .sheet_documentation_template.md rename to .sheet_documentation_template.md.jinja index bfeed90..952e47f 100644 --- a/.sheet_documentation_template.md +++ b/.sheet_documentation_template.md.jinja @@ -8,8 +8,6 @@ {% for slot in slots %} **{{ slot.name }}** : {{ slot.description }} -**alias** : {{ slot.alias }} -**data type** : {{ slot.data_type }} +{% if 'Enum' in slot.data_type.range %}**data type** : Controlled Vocabulary{% else %}**data type** : {{ slot.data_type.range }}{% endif %} **required** : {{ slot.required }} - {% endfor %} diff --git a/docs/metadata/worksheets/Analysis.md b/docs/metadata/worksheets/Analysis.md index f7ac8a7..6d90561 100644 --- a/docs/metadata/worksheets/Analysis.md +++ b/docs/metadata/worksheets/Analysis.md @@ -8,36 +8,25 @@ An Analysis is a data transformation that transforms input data to output data. **title** : The title that describes an entity. -**alias** : title **data type** : string **required** : None - **description** : Describing how an Analysis was carried out. (e.g.: computational tools, settings, etc.). -**alias** : description **data type** : string **required** : False - **type** : The type of the Analysis. Either Reference Alignment (BAM) or Sequence Variation (VCF) -**alias** : type **data type** : string **required** : False - **reference_genome** : A published genetic sequence that is used as a reference sequence against which other sequences are compared. Reference genome(s) or annotation(s) used for prior analyses (eg: GRCh38.p13). -**alias** : reference_genome **data type** : string **required** : True - **reference_chromosome** : The reference chromosome used for this Analysis. 
-**alias** : reference_chromosome **data type** : string **required** : True - **alias** : The alias for an entity at the time of submission. -**alias** : alias **data type** : string **required** : True diff --git a/docs/metadata/worksheets/AnalysisProcess.md b/docs/metadata/worksheets/AnalysisProcess.md index 7c87a1f..8ca4be7 100644 --- a/docs/metadata/worksheets/AnalysisProcess.md +++ b/docs/metadata/worksheets/AnalysisProcess.md @@ -8,30 +8,21 @@ None **analysis** : The Analysis the AnalysisProcess was part of -**alias** : analysis **data type** : Analysis **required** : True - **study_input_files** : The StudyFile associated used as an input for an entity. -**alias** : study_input_files **data type** : StudyFile **required** : False - **sample_input_files** : The SampleFile associated used as an input for an entity. -**alias** : sample_input_files **data type** : SampleFile **required** : False - **sequencing_process_input_files** : The SequencingProcessFile associated used as an input for an entity. -**alias** : sequencing_process_input_files **data type** : SequencingProcessFile **required** : False - **alias** : The alias for an entity at the time of submission. -**alias** : alias **data type** : string **required** : True diff --git a/docs/metadata/worksheets/AnalysisProcessOutputFile.md b/docs/metadata/worksheets/AnalysisProcessOutputFile.md index 308a3ab..73f0103 100644 --- a/docs/metadata/worksheets/AnalysisProcessOutputFile.md +++ b/docs/metadata/worksheets/AnalysisProcessOutputFile.md @@ -8,54 +8,37 @@ A AnalysisProcessOutputFile is a File that is associated as an output file with **analysis_process** : The AnalysisProcess associated with an entity. -**alias** : analysis_process **data type** : AnalysisProcess **required** : True - **name** : The given filename. -**alias** : name **data type** : string **required** : True - **format** : The format of the file: BAM, SAM, CRAM, BAI, etc. 
-**alias** : format -**data type** : FileFormatEnum +**data type** : Controlled Vocabulary **required** : True - **size** : The size of a file in bytes. -**alias** : size **data type** : integer **required** : True - **checksum** : A computed value which depends on the contents of a block of data and which is transmitted or stored along with the data in order to detect corruption of the data. The receiving system recomputes the checksum based upon the received data and compares this value with the one sent with the data. If the two values are the same, the receiver has some confidence that the data was received correctly. -**alias** : checksum **data type** : string **required** : True - **forward_or_reverse** : Denotes whether a submitted FASTQ file contains forward (R1) or reverse (R2) reads for paired-end sequencing. The number that identifies each read direction in a paired-end nucleotide sequencing reaction. -**alias** : forward_or_reverse -**data type** : ForwardOrReverseEnum +**data type** : Controlled Vocabulary **required** : False - **checksum_type** : The type of algorithm used to generate the checksum of a file. -**alias** : checksum_type **data type** : string **required** : True - **dataset** : The Dataset associated with an entity. -**alias** : dataset **data type** : Dataset **required** : True - **alias** : The alias for an entity at the time of submission. -**alias** : alias **data type** : string **required** : True diff --git a/docs/metadata/worksheets/Biospecimen.md b/docs/metadata/worksheets/Biospecimen.md index b14be4f..7bbc1ee 100644 --- a/docs/metadata/worksheets/Biospecimen.md +++ b/docs/metadata/worksheets/Biospecimen.md @@ -8,60 +8,41 @@ A Biospecimen is any natural material taken from a biological entity (usually a **name** : The name for an entity. -**alias** : name **data type** : string **required** : False - **type** : The type of Biospecimen. 
-**alias** : type **data type** : string **required** : False - **description** : Description of an entity. -**alias** : description **data type** : string **required** : False - **isolation** : Method or device employed for collecting/isolating a biospecimen or a sample. -**alias** : isolation **data type** : string **required** : False - **storage** : Methods by which a biospecimen or a sample is stored (e.g. frozen in liquid nitrogen). -**alias** : storage **data type** : string **required** : False - **individual** : The Individual entity from which this Biospecimen was derived. -**alias** : individual **data type** : Individual **required** : True - **age_at_sampling** : Age of an individual. -**alias** : age_at_sampling -**data type** : AgeRangeEnum +**data type** : Controlled Vocabulary **required** : True - **vital_status_at_sampling** : Vital Status of an Individual at the point of sampling (eg:'Alive', 'Deceased'). -**alias** : vital_status_at_sampling -**data type** : VitalStatusEnum +**data type** : Controlled Vocabulary **required** : False - **tissue** : None -**alias** : tissue **data type** : string **required** : True - **alias** : The alias for an entity at the time of submission. -**alias** : alias **data type** : string **required** : True diff --git a/docs/metadata/worksheets/Condition.md b/docs/metadata/worksheets/Condition.md index c51b111..38ac610 100644 --- a/docs/metadata/worksheets/Condition.md +++ b/docs/metadata/worksheets/Condition.md @@ -8,54 +8,37 @@ An condition that is linked to comparable samples. **title** : The title that describes an entity. -**alias** : title **data type** : string **required** : None - **description** : Description of an entity. -**alias** : description **data type** : string **required** : True - **name** : The name for an entity. -**alias** : name **data type** : string **required** : True - **disease_or_healthy** : Whether a condition corresponds to a disease or a healthy state. 
-**alias** : disease_or_healthy -**data type** : DiseaseOrHealthyEnum +**data type** : Controlled Vocabulary **required** : True - **case_control_status** : Whether a condition corresponds to a treatment or a control. -**alias** : case_control_status -**data type** : CaseControlStatusEnum +**data type** : Controlled Vocabulary **required** : True - **mutant_or_wildtype** : Whether a condition corresponds to a mutant or a wildtype. -**alias** : mutant_or_wildtype -**data type** : MutantOrWildtypeEnum +**data type** : Controlled Vocabulary **required** : True - **study** : The study associated with an entity. -**alias** : study **data type** : Study **required** : True - **attributes** : Key/value pairs corresponding to an entity. -**alias** : attributes **data type** : Attribute **required** : False - **alias** : The alias for an entity at the time of submission. -**alias** : alias **data type** : string **required** : True diff --git a/docs/metadata/worksheets/DataAccessCommittee.md b/docs/metadata/worksheets/DataAccessCommittee.md index ef28ac3..ce3e2b8 100644 --- a/docs/metadata/worksheets/DataAccessCommittee.md +++ b/docs/metadata/worksheets/DataAccessCommittee.md @@ -8,18 +8,13 @@ A group of members that are delegated to grant access to one or more datasets af **email** : Email of a person. -**alias** : email **data type** : string **required** : True - **institute** : The institute a person is affiliated with. -**alias** : institute **data type** : string **required** : True - **alias** : The alias for an entity at the time of submission. 
-**alias** : alias **data type** : string **required** : True diff --git a/docs/metadata/worksheets/DataAccessPolicy.md b/docs/metadata/worksheets/DataAccessPolicy.md index 670fa1c..c475d88 100644 --- a/docs/metadata/worksheets/DataAccessPolicy.md +++ b/docs/metadata/worksheets/DataAccessPolicy.md @@ -8,48 +8,33 @@ A Data Access Policy specifies under which circumstances, legal or otherwise, a **name** : A name for the Data Access Policy. -**alias** : name **data type** : string **required** : True - **description** : A short description for the Data Access Policy. -**alias** : description **data type** : string **required** : True - **policy_text** : The terms of data use and policy verbiage should be captured here. -**alias** : policy_text **data type** : string **required** : True - **policy_url** : URL for the policy, if available. This is useful if the terms of the policy is made available online at a resolvable URL. -**alias** : policy_url **data type** : string **required** : False - **data_access_committee** : The Data Access Committee linked to this policy. -**alias** : data_access_committee **data type** : DataAccessCommittee **required** : True - **data_use_permission** : Data use permission associated with a policy. Typically one or more terms from DUO and should be descendants of 'DUO:0000001 data use permission'. -**alias** : data_use_permission -**data type** : DataUsePermissionEnum +**data type** : Controlled Vocabulary **required** : True - **data_use_modifiers** : Modifier for Data use permission associated with a policy. Should be descendants of 'DUO:0000017 data use modifier' -**alias** : data_use_modifiers -**data type** : DataUseModifierEnum +**data type** : Controlled Vocabulary **required** : False - **alias** : The alias for an entity at the time of submission. 
-**alias** : alias **data type** : string **required** : True diff --git a/docs/metadata/worksheets/Dataset.md b/docs/metadata/worksheets/Dataset.md index 2f99c50..522507a 100644 --- a/docs/metadata/worksheets/Dataset.md +++ b/docs/metadata/worksheets/Dataset.md @@ -8,30 +8,21 @@ A Dataset is a collection of Files that is prepared for distribution and is tied **title** : A title for the submitted Dataset. -**alias** : title **data type** : string **required** : True - **description** : Description of an entity. -**alias** : description **data type** : string **required** : True - **types** : The type of a dataset. -**alias** : types **data type** : string **required** : True - **data_access_policy** : The Data Access Policy that applies to this Dataset. -**alias** : data_access_policy **data type** : DataAccessPolicy **required** : True - **alias** : The alias for an entity at the time of submission. -**alias** : alias **data type** : string **required** : True diff --git a/docs/metadata/worksheets/Individual.md b/docs/metadata/worksheets/Individual.md index 2f6a9d6..2f6c1f0 100644 --- a/docs/metadata/worksheets/Individual.md +++ b/docs/metadata/worksheets/Individual.md @@ -8,36 +8,25 @@ An Individual is a Person who is participating in a Study. **sex** : The assemblage of physical properties or qualities by which male is distinguished from female; the physical difference between male and female; the distinguishing peculiarity of male or female. -**alias** : sex -**data type** : IndividualSexEnum +**data type** : Controlled Vocabulary **required** : True - **karyotype** : The karyotype of an individual if defined. -**alias** : karyotype -**data type** : KaryotypeEnum +**data type** : Controlled Vocabulary **required** : False - **geographical_region** : The geographical region where the Individual is located. Any demarcated area of the Earth; may be determined by both natural and human boundaries. 
-**alias** : geographical_region **data type** : string **required** : False - **ancestries** : A person's descent or lineage, from a person or from a population. -**alias** : ancestries **data type** : string **required** : False - **phenotypic_features** : The Phenotypic Feature entity that is associated with this Biospecimen at the time of retrieval from the organism. Typically, a concept from Human Phenotype Ontology. For example, 'HP:0100244' indicates that the Individual - from_which_the_Biospecimen was extracted from - exhibits_'Fibrosarcoma'_as_one_of_its_phenotype. -**alias** : phenotypic_features **data type** : string **required** : False - **alias** : The alias for an entity at the time of submission. -**alias** : alias **data type** : string **required** : True diff --git a/docs/metadata/worksheets/LibraryPreparationProtocol.md b/docs/metadata/worksheets/LibraryPreparationProtocol.md index ecb337a..7751d00 100644 --- a/docs/metadata/worksheets/LibraryPreparationProtocol.md +++ b/docs/metadata/worksheets/LibraryPreparationProtocol.md @@ -8,84 +8,57 @@ Information about the library_preparation of an sequencing experiment. **description** : Description about how a sequencing library was prepared (eg: Library construction method). -**alias** : description **data type** : string **required** : True - **library_name** : A short name identifying the library to potential users. The same name may refer to multiple versions of the same continually updated library. 
-**alias** : library_name **data type** : string **required** : True - **library_layout** : Describe whether the library was sequenced in single-end (forward or reverse) or paired-end mode -**alias** : library_layout -**data type** : LibraryPreparationLibraryLayoutEnum +**data type** : Controlled Vocabulary **required** : True - **library_type** : Describe the level of omics analysis (eg: Metagenome, transcriptome, etc) -**alias** : library_type -**data type** : LibraryPreparationLibraryTypeEnum +**data type** : Controlled Vocabulary **required** : True - **library_selection** : Whether any method was used to select for or against, enrich, or screen the material being sequenced. library_selection method (e.g. random, PCA, cDNA, etc ) -**alias** : library_selection -**data type** : LibraryPreparationLibrarySelectionEnum +**data type** : Controlled Vocabulary **required** : True - **library_preparation** : The general method for sequencing library_preparation (e.g. KAPA PCR-free). -**alias** : library_preparation **data type** : string **required** : True - **library_preparation_kit_retail_name** : A unique identifier for the kit used to construct a genomic library. This may include the vendor name, kit name and kit version (e.g. Agilent sure select Human Exome V8, Twist RefSeq Exome, etc.) -**alias** : library_preparation_kit_retail_name -**data type** : LibraryPreparationKitRetailNameEnum +**data type** : Controlled Vocabulary **required** : False - **library_preparation_kit_manufacturer** : Manufacturer of library_preparation kit -**alias** : library_preparation_kit_manufacturer **data type** : string **required** : False - **primer** : The type of primer used for reverse transcription, e.g. 'oligo-dT' or 'random' primer. This allows users to identify content of the cDNA library input e.g. enriched for mRNA. 
-**alias** : primer -**data type** : PrimerEnum +**data type** : Controlled Vocabulary **required** : False - **end_bias** : The end of the cDNA molecule that is preferentially sequenced, e.g. 3/5 prime tag or end, or the full-length transcript. -**alias** : end_bias -**data type** : EndBiasEnum +**data type** : Controlled Vocabulary **required** : False - **target_regions** : Subset of genes or specific regions of the genome, which are most likely to be involved in the phenotype under study. -**alias** : target_regions **data type** : string **required** : False - **rnaseq_strandedness** : The strandedness of the library, whether reads come from both strands of the cDNA or only from the first (antisense) or the second (sense) strand. -**alias** : rnaseq_strandedness -**data type** : LibraryPreparationRNASeqStrandednessEnum +**data type** : Controlled Vocabulary **required** : False - **attributes** : One or more attributes that further characterizes this library_preparation Protocol. -**alias** : attributes **data type** : Attribute **required** : False - **alias** : The alias for an entity at the time of submission. -**alias** : alias **data type** : string **required** : True diff --git a/docs/metadata/worksheets/Publication.md b/docs/metadata/worksheets/Publication.md index 6deb294..f72ef6d 100644 --- a/docs/metadata/worksheets/Publication.md +++ b/docs/metadata/worksheets/Publication.md @@ -8,54 +8,37 @@ The Publication entity represents a publication. While a publication can be any **title** : The title for the Publication. -**alias** : title **data type** : string **required** : False - **abstract** : The study abstract that describes the goals. Can also hold abstract from a publication related to this study. -**alias** : abstract **data type** : string **required** : False - **author** : The individual who is responsible for the content of a document version. 
-**alias** : author **data type** : string **required** : False - **year** : Year in which the paper was published. -**alias** : year **data type** : integer **required** : False - **journal** : Name of the journal. -**alias** : journal **data type** : string **required** : False - **doi** : DOI identifier of the Publication. -**alias** : doi **data type** : string **required** : True - **study** : The Study entity associated with this Publication. -**alias** : study **data type** : Study **required** : True - **xref** : One or more cross-references for this Publication. -**alias** : xref **data type** : string **required** : False - **alias** : The alias for an entity at the time of submission. -**alias** : alias **data type** : string **required** : True diff --git a/docs/metadata/worksheets/Sample.md b/docs/metadata/worksheets/Sample.md index d6e0a19..cf3fe9e 100644 --- a/docs/metadata/worksheets/Sample.md +++ b/docs/metadata/worksheets/Sample.md @@ -8,60 +8,41 @@ A sample is a limited quantity of something to be used for testing, analysis, in **name** : Name of the sample (eg:GHGAS_Blood_Sample1 or GHGAS_PBMC_RNAseq_S1). -**alias** : name **data type** : string **required** : True - **type** : The type of sample. -**alias** : type -**data type** : SampleTypeEnum +**data type** : Controlled Vocabulary **required** : False - **description** : Short textual description of the sample (How the sample was collected, sample source, Protocol followed for processing the sample etc). -**alias** : description **data type** : string **required** : True - **isolation** : Method or device employed for collecting/isolating a biospecimen or a sample. -**alias** : isolation **data type** : string **required** : False - **storage** : Methods by which a biospecimen or a sample is stored (e.g. frozen in liquid nitrogen). -**alias** : storage **data type** : string **required** : False - **biospecimen** : The Biospecimen from which this Sample was prepared from. 
-**alias** : biospecimen **data type** : Biospecimen **required** : False - **condition** : The condition associated with an entity. -**alias** : condition **data type** : Condition **required** : True - **xref** : One or more cross-references for this Sample. For example, this Sample may have an EBI BioSamples accession or an EGA Sample accession. -**alias** : xref **data type** : string **required** : False - **attributes** : Key/value pairs corresponding to an entity. -**alias** : attributes **data type** : Attribute **required** : False - **alias** : The alias for an entity at the time of submission. -**alias** : alias **data type** : string **required** : True diff --git a/docs/metadata/worksheets/SampleFile.md b/docs/metadata/worksheets/SampleFile.md index 5e614ea..6a73e93 100644 --- a/docs/metadata/worksheets/SampleFile.md +++ b/docs/metadata/worksheets/SampleFile.md @@ -8,54 +8,37 @@ A SampleFile is a File that is associated with a Sample. **sample** : The sample associated with an entity. -**alias** : sample **data type** : Sample **required** : True - **name** : The given filename. -**alias** : name **data type** : string **required** : True - **format** : The format of the file: BAM, SAM, CRAM, BAI, etc. -**alias** : format -**data type** : FileFormatEnum +**data type** : Controlled Vocabulary **required** : True - **size** : The size of a file in bytes. -**alias** : size **data type** : integer **required** : True - **checksum** : A computed value which depends on the contents of a block of data and which is transmitted or stored along with the data in order to detect corruption of the data. The receiving system recomputes the checksum based upon the received data and compares this value with the one sent with the data. If the two values are the same, the receiver has some confidence that the data was received correctly. 
-**alias** : checksum **data type** : string **required** : True - **forward_or_reverse** : Denotes whether a submitted FASTQ file contains forward (R1) or reverse (R2) reads for paired-end sequencing. The number that identifies each read direction in a paired-end nucleotide sequencing reaction. -**alias** : forward_or_reverse -**data type** : ForwardOrReverseEnum +**data type** : Controlled Vocabulary **required** : False - **checksum_type** : The type of algorithm used to generate the checksum of a file. -**alias** : checksum_type **data type** : string **required** : True - **dataset** : The Dataset associated with an entity. -**alias** : dataset **data type** : Dataset **required** : True - **alias** : The alias for an entity at the time of submission. -**alias** : alias **data type** : string **required** : True diff --git a/docs/metadata/worksheets/SequencingExperiment.md b/docs/metadata/worksheets/SequencingExperiment.md index 387d90b..aa76f8f 100644 --- a/docs/metadata/worksheets/SequencingExperiment.md +++ b/docs/metadata/worksheets/SequencingExperiment.md @@ -8,42 +8,29 @@ An sequencing experiment is an investigation that consists of a coordinated set **title** : Name for the experiment (eg: GHGAE_PBMC_RNAseq). -**alias** : title **data type** : string **required** : None - **description** : A detailed description of the Experiment. -**alias** : description **data type** : string **required** : True - **type** : The type of sequencing experiment. -**alias** : type **data type** : string **required** : False - **sequencing_protocol** : The sequencing protocol associated with an entity. -**alias** : sequencing_protocol **data type** : SequencingProtocol **required** : True - **library_preparation_protocol** : The library_preparation Protocol associated with an entity. -**alias** : library_preparation_protocol **data type** : LibraryPreparationProtocol **required** : True - **attributes** : Key/value pairs corresponding to an entity. 
-**alias** : attributes **data type** : Attribute **required** : False - **alias** : The alias for an entity at the time of submission. -**alias** : alias **data type** : string **required** : True diff --git a/docs/metadata/worksheets/SequencingProcess.md b/docs/metadata/worksheets/SequencingProcess.md index 02d6ff1..18d5b95 100644 --- a/docs/metadata/worksheets/SequencingProcess.md +++ b/docs/metadata/worksheets/SequencingProcess.md @@ -8,72 +8,49 @@ A sequencing process linking a sample to sequencing output. **title** : The title that describes an entity. -**alias** : title **data type** : string **required** : None - **description** : Description of an entity. -**alias** : description **data type** : string **required** : True - **name** : The name for an entity. -**alias** : name **data type** : string **required** : True - **sequencing_run_id** : Identifier of the sequencing run. Used for batch correction. -**alias** : sequencing_run_id **data type** : string **required** : False - **sequencing_lane_id** : Identifier of the sequencing lane. Used for batch correction. -**alias** : sequencing_lane_id **data type** : string **required** : False - **sequencing_machine_id** : Identifier of the sequencing machine. Used for batch correction. -**alias** : sequencing_machine_id **data type** : string **required** : False - **sequencing_experiment** : The sequencing experiment associated with an entity. -**alias** : sequencing_experiment **data type** : SequencingExperiment **required** : True - **index_sequence** : A unique nucleotide sequence that is added to a sample during library_preparation to serve as a unique identifier for the sample. -**alias** : index_sequence **data type** : string **required** : False - **lane_number** : The numerical identifier for the lane or machine unit where a sample was located during nucleotide sequencing. -**alias** : lane_number **data type** : string **required** : False - **sample** : The sample associated with an entity. 
-**alias** : sample **data type** : Sample **required** : True - **attributes** : Key/value pairs corresponding to an entity. -**alias** : attributes **data type** : Attribute **required** : False - **alias** : The alias for an entity at the time of submission. -**alias** : alias **data type** : string **required** : True diff --git a/docs/metadata/worksheets/SequencingProcessFile.md b/docs/metadata/worksheets/SequencingProcessFile.md index ff77cd7..f011c9d 100644 --- a/docs/metadata/worksheets/SequencingProcessFile.md +++ b/docs/metadata/worksheets/SequencingProcessFile.md @@ -8,54 +8,37 @@ A SequencingProcessFile is a File that is associated with a SequencingProcess. **sequencing_process** : The SequencingProcess associated with an entity. -**alias** : sequencing_process **data type** : SequencingProcess **required** : True - **name** : The given filename. -**alias** : name **data type** : string **required** : True - **format** : The format of the file: BAM, SAM, CRAM, BAI, etc. -**alias** : format -**data type** : FileFormatEnum +**data type** : Controlled Vocabulary **required** : True - **size** : The size of a file in bytes. -**alias** : size **data type** : integer **required** : True - **checksum** : A computed value which depends on the contents of a block of data and which is transmitted or stored along with the data in order to detect corruption of the data. The receiving system recomputes the checksum based upon the received data and compares this value with the one sent with the data. If the two values are the same, the receiver has some confidence that the data was received correctly. -**alias** : checksum **data type** : string **required** : True - **forward_or_reverse** : Denotes whether a submitted FASTQ file contains forward (R1) or reverse (R2) reads for paired-end sequencing. The number that identifies each read direction in a paired-end nucleotide sequencing reaction. 
-**alias** : forward_or_reverse -**data type** : ForwardOrReverseEnum +**data type** : Controlled Vocabulary **required** : False - **checksum_type** : The type of algorithm used to generate the checksum of a file. -**alias** : checksum_type **data type** : string **required** : True - **dataset** : The Dataset associated with an entity. -**alias** : dataset **data type** : Dataset **required** : True - **alias** : The alias for an entity at the time of submission. -**alias** : alias **data type** : string **required** : True diff --git a/docs/metadata/worksheets/SequencingProtocol.md b/docs/metadata/worksheets/SequencingProtocol.md index 7f9f870..490c70e 100644 --- a/docs/metadata/worksheets/SequencingProtocol.md +++ b/docs/metadata/worksheets/SequencingProtocol.md @@ -8,102 +8,69 @@ Information about the sequencing of a sample. **description** : Description about the sequencing Protocol (eg: mRNA-seq, Whole exome long-read sequencing etc). -**alias** : description **data type** : string **required** : True - **type** : Type of the sequencing Protocol (eg: mRNA-seq, Whole exome long-read sequencing etc). -**alias** : type **data type** : string **required** : None - **instrument_model** : The name and model of the technology platform used to perform sequencing. -**alias** : instrument_model -**data type** : InstrumentModelEnum +**data type** : Controlled Vocabulary **required** : True - **sequencing_center** : Center where sample was sequenced. -**alias** : sequencing_center **data type** : string **required** : False - **sequencing_read_length** : Length of sequencing reads (eg: Long or short or actual number of the read length etc). 
The number of nucleotides successfully ordered from each side of a nucleic acid fragment obtained after the completion of a sequencing process -**alias** : sequencing_read_length **data type** : string **required** : False - **target_coverage** : Mean coverage for whole genome sequencing, or mean target coverage for whole exome and targeted sequencing. The number of times a particular locus (site, nucleotide, amplicon, region) was sequenced. -**alias** : target_coverage **data type** : string **required** : False - **flow_cell_id** : Flow Cell ID (eg: Experiment ID_Cell 1_Lane_1). The barcode assigned to a flow cell used in nucleotide sequencing. -**alias** : flow_cell_id **data type** : string **required** : False - **flow_cell_type** : Type of flow cell used (e.g. S4, S2 for NovaSeq; PromethION, Flongle for Nanopore). Aparatus in the fluidic subsystem where the sheath and sample meet. Can be one of several types; jet-in-air, quartz cuvette, or a hybrid of the two. The sample flows through the center of a fluid column of sheath fluid in the flow cell. -**alias** : flow_cell_type -**data type** : FlowCellTypeEnum +**data type** : Controlled Vocabulary **required** : False - **umi_barcode_read** : The type of read that contains the UMI barcode (Eg: index1/index2/read1/read2). -**alias** : umi_barcode_read -**data type** : IndexReadEnum +**data type** : Controlled Vocabulary **required** : False - **umi_barcode_offset** : The offset in sequence of the UMI identifying barcode. (E.g. '16'). -**alias** : umi_barcode_offset **data type** : string **required** : False - **umi_barcode_size** : The size of the UMI identifying barcode (Eg. '10'). -**alias** : umi_barcode_size **data type** : string **required** : False - **cell_barcode_read** : The type of read that contains the cell barcode (eg: index1/index2/read1/read2). 
-**alias** : cell_barcode_read -**data type** : IndexReadEnum +**data type** : Controlled Vocabulary **required** : False - **cell_barcode_offset** : The offset in sequence of the cell identifying barcode. (Eg. '0'). -**alias** : cell_barcode_offset **data type** : string **required** : False - **cell_barcode_size** : The size of the cell identifying barcode (E.g. '16'). -**alias** : cell_barcode_size **data type** : string **required** : False - **sample_barcode_read** : The type of read that contains the sample barcode (eg: index1/index2/read1/read2). -**alias** : sample_barcode_read -**data type** : SampleBarcodeReadEnum +**data type** : Controlled Vocabulary **required** : False - **attributes** : One or more attributes that further characterizes this Sequencing Protocol. -**alias** : attributes **data type** : Attribute **required** : False - **alias** : The alias for an entity at the time of submission. -**alias** : alias **data type** : string **required** : True diff --git a/docs/metadata/worksheets/Study.md b/docs/metadata/worksheets/Study.md index 823bfab..4bed7f9 100644 --- a/docs/metadata/worksheets/Study.md +++ b/docs/metadata/worksheets/Study.md @@ -8,36 +8,25 @@ Studies are experimental investigations of a particular phenomenon. It involves **title** : A comprehensive title for the study. -**alias** : title **data type** : string **required** : True - **description** : A detailed description (abstract) that describes the goals of this Study. -**alias** : description **data type** : string **required** : True - **type** : The type of Study. For example, 'Cancer Genomics', 'Epigenetics', 'Exome Sequencing'. -**alias** : type -**data type** : StudyTypeEnum +**data type** : Controlled Vocabulary **required** : True - **affiliations** : The Institution(s) associated with an entity. -**alias** : affiliations **data type** : string **required** : True - **attributes** : Custom key/value pairs that further characterizes the Study. 
(e.g.: approaches - single-cell,_bulk_etc) -**alias** : attributes **data type** : Attribute **required** : False - **alias** : The alias for an entity at the time of submission. -**alias** : alias **data type** : string **required** : True diff --git a/docs/metadata/worksheets/StudyFile.md b/docs/metadata/worksheets/StudyFile.md index 543c458..621f907 100644 --- a/docs/metadata/worksheets/StudyFile.md +++ b/docs/metadata/worksheets/StudyFile.md @@ -8,54 +8,37 @@ A StudyFile is a File that is associated with a Study. **study** : The study associated with an entity. -**alias** : study **data type** : Study **required** : True - **name** : The given filename. -**alias** : name **data type** : string **required** : True - **format** : The format of the file: BAM, SAM, CRAM, BAI, etc. -**alias** : format -**data type** : FileFormatEnum +**data type** : Controlled Vocabulary **required** : True - **size** : The size of a file in bytes. -**alias** : size **data type** : integer **required** : True - **checksum** : A computed value which depends on the contents of a block of data and which is transmitted or stored along with the data in order to detect corruption of the data. The receiving system recomputes the checksum based upon the received data and compares this value with the one sent with the data. If the two values are the same, the receiver has some confidence that the data was received correctly. -**alias** : checksum **data type** : string **required** : True - **forward_or_reverse** : Denotes whether a submitted FASTQ file contains forward (R1) or reverse (R2) reads for paired-end sequencing. The number that identifies each read direction in a paired-end nucleotide sequencing reaction. -**alias** : forward_or_reverse -**data type** : ForwardOrReverseEnum +**data type** : Controlled Vocabulary **required** : False - **checksum_type** : The type of algorithm used to generate the checksum of a file. 
-**alias** : checksum_type **data type** : string **required** : True - **dataset** : The Dataset associated with an entity. -**alias** : dataset **data type** : Dataset **required** : True - **alias** : The alias for an entity at the time of submission. -**alias** : alias **data type** : string **required** : True diff --git a/docs/metadata/worksheets/Trio.md b/docs/metadata/worksheets/Trio.md index dc401d6..50f3e75 100644 --- a/docs/metadata/worksheets/Trio.md +++ b/docs/metadata/worksheets/Trio.md @@ -8,24 +8,17 @@ A trio is defined by three individuals representing an individual and their pare **mother** : The mother of an individual. -**alias** : mother **data type** : Individual **required** : True - **father** : The father of an individual. -**alias** : father **data type** : Individual **required** : True - **child** : The child of two individuals. -**alias** : child **data type** : Individual **required** : True - **alias** : The alias for an entity at the time of submission. -**alias** : alias **data type** : string **required** : True diff --git a/scripts/update_metadata_docs.py b/scripts/update_metadata_docs.py index d00a25f..757e8f7 100644 --- a/scripts/update_metadata_docs.py +++ b/scripts/update_metadata_docs.py @@ -17,12 +17,12 @@ """Script to generate user-facing metadata schema documentation""" from pathlib import Path -from typing import Callable +from typing import Callable, Union import requests import yaml from jinja2 import Environment, FileSystemLoader -from linkml_runtime.utils.schemaview import SchemaView # type: ignore +from linkml_runtime.utils.schemaview import SchemaView from pydantic import BaseModel, Field HERE = Path(__file__).parent.resolve() @@ -30,7 +30,7 @@ DOCS_DIR = ROOT / "docs" / "metadata" / "worksheets" CONFIG_PATH = ROOT / ".workbook_config.yaml" -TEMPLATE = ".sheet_documentation_template.md" +TEMPLATE = ".sheet_documentation_template.md.jinja" SCHEMA_URL = 
"https://raw.githubusercontent.com/ghga-de/ghga-metadata-schema/main/src/schema/submission.yaml" # pylint: disable=line-too-long @@ -49,6 +49,29 @@ class SchemaNotLoaded(Exception): """custom error to raise if the schema request return any response code other than 200""" +class WorkbookConfig(BaseModel): + """A workbook configuration""" + + worksheets: list[str] = Field(default_factory=list) + file_name: str + + +class Config(BaseModel): + """Configures multiple workbooks each having a different set of worksheets""" + + workbooks: list[WorkbookConfig] + + @property + def main_workbook(self): + """Extract a workbook configuration based on file_name""" + for sheet in self.workbooks: + if sheet.file_name == MAIN_WORKBOOK: + return sheet + raise MainSheetNotIdentified( + f"No workbook configuration is found for {MAIN_WORKBOOK}" + ) + + def load_config(config_path=CONFIG_PATH) -> dict: """Loads config file""" @@ -64,24 +87,35 @@ def load_config(config_path=CONFIG_PATH) -> dict: def load_schema(schema_url=SCHEMA_URL): """Loads schema""" - schema_config = requests.get(schema_url, timeout=3) + schema_config = requests.get(schema_url, timeout=5) if schema_config.status_code == 200: return SchemaView(schema_config.text) raise SchemaNotLoaded(f"Schema could not be loaded from {SCHEMA_URL}") +def extract_permissible_values(schema: SchemaView, slot_range: Union[str, None]): + """function to get slot range if it is an enum""" + enum = schema.get_enum(slot_range) + if enum: + return enum.permissible_values + return slot_range + + def extract_slots_from(schema: SchemaView, sheet_name: str) -> list[dict]: """Extracts slot information of a given class""" return [ { - "name": definition.name, - "alias": definition.alias, - "description": definition.description, - "data_type": definition.range, - "required": definition.required, + "name": slot.name, + "alias": slot.alias, + "description": slot.description, + "data_type": { + "range": slot.range, + "enum":
extract_permissible_values(schema, slot.range), + }, + "required": slot.required, } - for definition in schema.class_induced_slots(sheet_name) + for slot in schema.class_induced_slots(sheet_name) ] @@ -101,29 +135,6 @@ def generate_workbook( ] -class WorkbookConfig(BaseModel): - """A workbook configuration""" - - worksheets: list[str] = Field(default_factory=list) - file_name: str - - -class Config(BaseModel): - """Configures multiple workbooks each having a different set of worksheets""" - - workbooks: list[WorkbookConfig] - - @property - def main_workbook(self): - """Extract a workbook configuration based on file_name""" - for sheet in self.workbooks: - if sheet.file_name == MAIN_WORKBOOK: - return sheet - raise MainSheetNotIdentified( - f"No workbook configuration is found for {MAIN_WORKBOOK}" - ) - - def generate_markdown(content: dict) -> str: """Generates the markdown text by rendering the content into the template""" @@ -152,6 +163,8 @@ def main(): for sheet in workbook: create_doc_file(DOCS_DIR, sheet["name"], generate_markdown(sheet)) + # print(schema.get_enum("KaryotypeEnum")) + # print(schema.all_enums().keys()) if __name__ == "__main__": From 0b9b6886b8ea911218cf772724e5437a9e7f1991 Mon Sep 17 00:00:00 2001 From: sbilge Date: Thu, 31 Aug 2023 07:38:42 +0000 Subject: [PATCH 04/15] enums added --- .sheet_documentation_template.md.jinja | 19 +- docs/metadata/worksheets/Analysis.md | 40 +-- docs/metadata/worksheets/AnalysisProcess.md | 33 ++- .../worksheets/AnalysisProcessOutputFile.md | 97 ++++-- docs/metadata/worksheets/Biospecimen.md | 97 ++++-- docs/metadata/worksheets/Condition.md | 85 ++++-- .../worksheets/DataAccessCommittee.md | 19 +- docs/metadata/worksheets/DataAccessPolicy.md | 85 ++++-- docs/metadata/worksheets/Dataset.md | 33 ++- docs/metadata/worksheets/Individual.md | 56 ++-- .../worksheets/LibraryPreparationProtocol.md | 278 ++++++++++++++---- docs/metadata/worksheets/Publication.md | 61 ++-- docs/metadata/worksheets/Sample.md | 84 ++++-- 
docs/metadata/worksheets/SampleFile.md | 97 ++++-- .../worksheets/SequencingExperiment.md | 47 +-- docs/metadata/worksheets/SequencingProcess.md | 82 +++--- .../worksheets/SequencingProcessFile.md | 97 ++++-- .../metadata/worksheets/SequencingProtocol.md | 198 +++++++++---- docs/metadata/worksheets/Study.md | 60 ++-- docs/metadata/worksheets/StudyFile.md | 97 ++++-- docs/metadata/worksheets/Trio.md | 26 +- scripts/update_metadata_docs.py | 31 +- 22 files changed, 1196 insertions(+), 526 deletions(-) diff --git a/.sheet_documentation_template.md.jinja b/.sheet_documentation_template.md.jinja index 952e47f..6aef8e9 100644 --- a/.sheet_documentation_template.md.jinja +++ b/.sheet_documentation_template.md.jinja @@ -7,7 +7,20 @@ ## Fields {% for slot in slots %} -**{{ slot.name }}** : {{ slot.description }} -{% if 'Enum' in slot.data_type.range %}**data type** : Controlled Vocabulary{% else %}**data type** : {{ slot.data_type.range }}{% endif %} -**required** : {{ slot.required }} +**{{ slot.name }}** : {{ slot.description }}
+**required** : {{ slot.required }}
+{% if 'Enum' in slot.data_type.range %} +**data type** : Controlled Vocabulary
+{% else %} +**data type** : {{ slot.data_type.range }}
+{% endif %} + +{% if slot.data_type.enum %} +| Permissible Values | Description | +| --- | --- | +{% for val in slot.data_type.enum.permissible_values%} +| `{{ val.name }}` | `{{ val.description }}` | +{% endfor %} {{ '\n' }} +{% endif %} + {% endfor %} diff --git a/docs/metadata/worksheets/Analysis.md b/docs/metadata/worksheets/Analysis.md index 6d90561..afbe295 100644 --- a/docs/metadata/worksheets/Analysis.md +++ b/docs/metadata/worksheets/Analysis.md @@ -6,27 +6,31 @@ An Analysis is a data transformation that transforms input data to output data. ## Fields +**title** : The title that describes an entity.
+**required** : None
+**data type** : string
-**title** : The title that describes an entity. -**data type** : string -**required** : None -**description** : Describing how an Analysis was carried out. (e.g.: computational tools, settings, etc.). -**data type** : string -**required** : False +**description** : Describing how an Analysis was carried out. (e.g.: computational tools, settings, etc.).
+**required** : False
+**data type** : string
-**type** : The type of the Analysis. Either Reference Alignment (BAM) or Sequence Variation (VCF) -**data type** : string -**required** : False -**reference_genome** : A published genetic sequence that is used as a reference sequence against which other sequences are compared. Reference genome(s) or annotation(s) used for prior analyses (eg: GRCh38.p13). -**data type** : string -**required** : True +**type** : The type of the Analysis. Either Reference Alignment (BAM) or Sequence Variation (VCF)
+**required** : False
+**data type** : string
-**reference_chromosome** : The reference chromosome used for this Analysis. -**data type** : string -**required** : True -**alias** : The alias for an entity at the time of submission. -**data type** : string -**required** : True +**reference_genome** : A published genetic sequence that is used as a reference sequence against which other sequences are compared. Reference genome(s) or annotation(s) used for prior analyses (eg: GRCh38.p13).
+**required** : True
+**data type** : string
+ + +**reference_chromosome** : The reference chromosome used for this Analysis.
+**required** : True
+**data type** : string
+ + +**alias** : The alias for an entity at the time of submission.
+**required** : True
+**data type** : string
diff --git a/docs/metadata/worksheets/AnalysisProcess.md b/docs/metadata/worksheets/AnalysisProcess.md index 8ca4be7..dcd4362 100644 --- a/docs/metadata/worksheets/AnalysisProcess.md +++ b/docs/metadata/worksheets/AnalysisProcess.md @@ -6,23 +6,26 @@ None ## Fields +**analysis** : The Analysis the AnalysisProcess was part of
+**required** : True
+**data type** : Analysis
-**analysis** : The Analysis the AnalysisProcess was part of -**data type** : Analysis -**required** : True -**study_input_files** : The StudyFile associated used as an input for an entity. -**data type** : StudyFile -**required** : False +**study_input_files** : The StudyFile associated used as an input for an entity.
+**required** : False
+**data type** : StudyFile
-**sample_input_files** : The SampleFile associated used as an input for an entity. -**data type** : SampleFile -**required** : False -**sequencing_process_input_files** : The SequencingProcessFile associated used as an input for an entity. -**data type** : SequencingProcessFile -**required** : False +**sample_input_files** : The SampleFile associated used as an input for an entity.
+**required** : False
+**data type** : SampleFile
-**alias** : The alias for an entity at the time of submission. -**data type** : string -**required** : True + +**sequencing_process_input_files** : The SequencingProcessFile associated used as an input for an entity.
+**required** : False
+**data type** : SequencingProcessFile
+ + +**alias** : The alias for an entity at the time of submission.
+**required** : True
+**data type** : string
diff --git a/docs/metadata/worksheets/AnalysisProcessOutputFile.md b/docs/metadata/worksheets/AnalysisProcessOutputFile.md index 73f0103..b84179b 100644 --- a/docs/metadata/worksheets/AnalysisProcessOutputFile.md +++ b/docs/metadata/worksheets/AnalysisProcessOutputFile.md @@ -6,39 +6,82 @@ A AnalysisProcessOutputFile is a File that is associated as an output file with ## Fields +**analysis_process** : The AnalysisProcess associated with an entity.
+**required** : True
+**data type** : AnalysisProcess
-**analysis_process** : The AnalysisProcess associated with an entity. -**data type** : AnalysisProcess -**required** : True -**name** : The given filename. -**data type** : string -**required** : True +**name** : The given filename.
+**required** : True
+**data type** : string
-**format** : The format of the file: BAM, SAM, CRAM, BAI, etc. -**data type** : Controlled Vocabulary -**required** : True -**size** : The size of a file in bytes. -**data type** : integer -**required** : True +**format** : The format of the file: BAM, SAM, CRAM, BAI, etc.
+**required** : True
+**data type** : Controlled Vocabulary
-**checksum** : A computed value which depends on the contents of a block of data and which is transmitted or stored along with the data in order to detect corruption of the data. The receiving system recomputes the checksum based upon the received data and compares this value with the one sent with the data. If the two values are the same, the receiver has some confidence that the data was received correctly. -**data type** : string -**required** : True +| Permissible Values | Description | +| --- | --- | +| `AGP` | `None` | +| `BAI` | `None` | +| `BAM` | `None` | +| `BCF` | `None` | +| `BED` | `None` | +| `CRAI` | `None` | +| `CRAM` | `None` | +| `CSV` | `None` | +| `FASTA` | `None` | +| `FASTQ` | `None` | +| `GFF` | `None` | +| `HDF5` | `None` | +| `INFO` | `None` | +| `JSON` | `None` | +| `MD` | `None` | +| `OTHER` | `None` | +| `PED` | `None` | +| `SAM` | `None` | +| `SFF` | `None` | +| `SRF` | `None` | +| `TAB` | `None` | +| `TABIX` | `None` | +| `TSV` | `None` | +| `TXT` | `None` | +| `VCF` | `None` | +| `WIG` | `None` | -**forward_or_reverse** : Denotes whether a submitted FASTQ file contains forward (R1) or reverse (R2) reads for paired-end sequencing. The number that identifies each read direction in a paired-end nucleotide sequencing reaction. -**data type** : Controlled Vocabulary -**required** : False -**checksum_type** : The type of algorithm used to generate the checksum of a file. -**data type** : string -**required** : True -**dataset** : The Dataset associated with an entity. -**data type** : Dataset -**required** : True +**size** : The size of a file in bytes.
+**required** : True
+**data type** : integer
-**alias** : The alias for an entity at the time of submission. -**data type** : string -**required** : True + +**checksum** : A computed value which depends on the contents of a block of data and which is transmitted or stored along with the data in order to detect corruption of the data. The receiving system recomputes the checksum based upon the received data and compares this value with the one sent with the data. If the two values are the same, the receiver has some confidence that the data was received correctly.
+**required** : True
+**data type** : string
+ + +**forward_or_reverse** : Denotes whether a submitted FASTQ file contains forward (R1) or reverse (R2) reads for paired-end sequencing. The number that identifies each read direction in a paired-end nucleotide sequencing reaction.
+**required** : False
+**data type** : Controlled Vocabulary
+ +| Permissible Values | Description | +| --- | --- | +| `FORWARD` | `The reads are forward (R1) reads` | +| `REVERSE` | `The reads are reverse (R2) reads` | + + + +**checksum_type** : The type of algorithm used to generate the checksum of a file.
+**required** : True
+**data type** : string
+ + +**dataset** : The Dataset associated with an entity.
+**required** : True
+**data type** : Dataset
+ + +**alias** : The alias for an entity at the time of submission.
+**required** : True
+**data type** : string
diff --git a/docs/metadata/worksheets/Biospecimen.md b/docs/metadata/worksheets/Biospecimen.md index 7bbc1ee..0d4113b 100644 --- a/docs/metadata/worksheets/Biospecimen.md +++ b/docs/metadata/worksheets/Biospecimen.md @@ -6,43 +6,80 @@ A Biospecimen is any natural material taken from a biological entity (usually a ## Fields +**name** : The name for an entity.
+**required** : False
+**data type** : string
-**name** : The name for an entity. -**data type** : string -**required** : False -**type** : The type of Biospecimen. -**data type** : string -**required** : False +**type** : The type of Biospecimen.
+**required** : False
+**data type** : string
-**description** : Description of an entity. -**data type** : string -**required** : False -**isolation** : Method or device employed for collecting/isolating a biospecimen or a sample. -**data type** : string -**required** : False +**description** : Description of an entity.
+**required** : False
+**data type** : string
-**storage** : Methods by which a biospecimen or a sample is stored (e.g. frozen in liquid nitrogen). -**data type** : string -**required** : False -**individual** : The Individual entity from which this Biospecimen was derived. -**data type** : Individual -**required** : True +**isolation** : Method or device employed for collecting/isolating a biospecimen or a sample.
+**required** : False
+**data type** : string
-**age_at_sampling** : Age of an individual. -**data type** : Controlled Vocabulary -**required** : True -**vital_status_at_sampling** : Vital Status of an Individual at the point of sampling (eg:'Alive', 'Deceased'). -**data type** : Controlled Vocabulary -**required** : False +**storage** : Methods by which a biospecimen or a sample is stored (e.g. frozen in liquid nitrogen).
+**required** : False
+**data type** : string
-**tissue** : None -**data type** : string -**required** : True -**alias** : The alias for an entity at the time of submission. -**data type** : string -**required** : True +**individual** : The Individual entity from which this Biospecimen was derived.
+**required** : True
+**data type** : Individual
+ + +**age_at_sampling** : Age of an individual.
+**required** : True
+**data type** : Controlled Vocabulary
+ +| Permissible Values | Description | +| --- | --- | +| `0_TO_5` | `Age between 0 to 5.` | +| `6_TO_10` | `Age between 6 to 10.` | +| `11_TO_15` | `Age between 11 to 15.` | +| `16_TO_20` | `Age between 16 to 20.` | +| `21_TO_25` | `Age between 21 to 25.` | +| `26_TO_30` | `Age between 26 to 30.` | +| `31_TO_35` | `Age between 31 to 35.` | +| `36_TO_40` | `Age between 36 to 40.` | +| `41_TO_45` | `Age between 41 to 45.` | +| `46_TO_50` | `Age between 46 to 50.` | +| `51_TO_55` | `Age between 51 to 55.` | +| `56_TO_60` | `Age between 56 to 60.` | +| `61_TO_65` | `Age between 61 to 65.` | +| `66_TO_70` | `Age between 66 to 70.` | +| `71_TO_75` | `Age between 71 to 75.` | +| `76_TO_80` | `Age between 76 to 80.` | +| `81_OR_OLDER` | `Age above 80.` | +| `UNKNOWN` | `Age range unknown.` | + + + +**vital_status_at_sampling** : Vital Status of an Individual at the point of sampling (eg:'Alive', 'Deceased').
+**required** : False
+**data type** : Controlled Vocabulary
+ +| Permissible Values | Description | +| --- | --- | +| `ALIVE` | `Showing characteristics of life; displaying signs of life.` | +| `DECEASED` | `The cessation of life.` | +| `UNKNOWN` | `Vital status is unknown.` | + + + +**tissue** : None
+**required** : True
+**data type** : string
+ + +**alias** : The alias for an entity at the time of submission.
+**required** : True
+**data type** : string
diff --git a/docs/metadata/worksheets/Condition.md b/docs/metadata/worksheets/Condition.md index 38ac610..32a5303 100644 --- a/docs/metadata/worksheets/Condition.md +++ b/docs/metadata/worksheets/Condition.md @@ -6,39 +6,70 @@ An condition that is linked to comparable samples. ## Fields +**title** : The title that describes an entity.
+**required** : None
+**data type** : string
-**title** : The title that describes an entity. -**data type** : string -**required** : None -**description** : Description of an entity. -**data type** : string -**required** : True +**description** : Description of an entity.
+**required** : True
+**data type** : string
-**name** : The name for an entity. -**data type** : string -**required** : True -**disease_or_healthy** : Whether a condition corresponds to a disease or a healthy state. -**data type** : Controlled Vocabulary -**required** : True +**name** : The name for an entity.
+**required** : True
+**data type** : string
-**case_control_status** : Whether a condition corresponds to a treatment or a control. -**data type** : Controlled Vocabulary -**required** : True -**mutant_or_wildtype** : Whether a condition corresponds to a mutant or a wildtype. -**data type** : Controlled Vocabulary -**required** : True +**disease_or_healthy** : Whether a condition corresponds to a disease or a healthy state.
+**required** : True
+**data type** : Controlled Vocabulary
-**study** : The study associated with an entity. -**data type** : Study -**required** : True +| Permissible Values | Description | +| --- | --- | +| `DISEASE` | `Disease state.` | +| `HEALTHY` | `Healthy state.` | +| `NOT_APPLICABLE` | `The distinction is not applicable.` | -**attributes** : Key/value pairs corresponding to an entity. -**data type** : Attribute -**required** : False -**alias** : The alias for an entity at the time of submission. -**data type** : string -**required** : True + +**case_control_status** : Whether a condition corresponds to a treatment or a control.
+**required** : True
+**data type** : Controlled Vocabulary
+ +| Permissible Values | Description | +| --- | --- | +| `NEITHER_CASE_OR_CONTROL_STATUS` | `None` | +| `PROBABLE_CASE_STATUS` | `None` | +| `PROBABLE_CONTROL_STATUS` | `None` | +| `TRUE_CASE_STATUS` | `None` | +| `TRUE_CONTROL_STATUS` | `None` | +| `UNABLE_TO_ASSESS_CASE_OR_CONTROL_STATUS` | `None` | + + + +**mutant_or_wildtype** : Whether a condition corresponds to a mutant or a wildtype.
+**required** : True
+**data type** : Controlled Vocabulary
+ +| Permissible Values | Description | +| --- | --- | +| `MUTANT` | `Mutant state.` | +| `WILDTYPE` | `Wildtype state.` | +| `NOT_APPLICABLE` | `The distinction is not applicable.` | + + + +**study** : The study associated with an entity.
+**required** : True
+**data type** : Study
+ + +**attributes** : Key/value pairs corresponding to an entity.
+**required** : False
+**data type** : Attribute
+ + +**alias** : The alias for an entity at the time of submission.
+**required** : True
+**data type** : string
diff --git a/docs/metadata/worksheets/DataAccessCommittee.md b/docs/metadata/worksheets/DataAccessCommittee.md index ce3e2b8..109af58 100644 --- a/docs/metadata/worksheets/DataAccessCommittee.md +++ b/docs/metadata/worksheets/DataAccessCommittee.md @@ -6,15 +6,16 @@ A group of members that are delegated to grant access to one or more datasets af ## Fields +**email** : Email of a person.
+**required** : True
+**data type** : string
-**email** : Email of a person. -**data type** : string -**required** : True -**institute** : The institute a person is affiliated with. -**data type** : string -**required** : True +**institute** : The institute a person is affiliated with.
+**required** : True
+**data type** : string
-**alias** : The alias for an entity at the time of submission. -**data type** : string -**required** : True + +**alias** : The alias for an entity at the time of submission.
+**required** : True
+**data type** : string
diff --git a/docs/metadata/worksheets/DataAccessPolicy.md b/docs/metadata/worksheets/DataAccessPolicy.md index c475d88..447a7b3 100644 --- a/docs/metadata/worksheets/DataAccessPolicy.md +++ b/docs/metadata/worksheets/DataAccessPolicy.md @@ -6,35 +6,72 @@ A Data Access Policy specifies under which circumstances, legal or otherwise, a ## Fields +**name** : A name for the Data Access Policy.
+**required** : True
+**data type** : string
-**name** : A name for the Data Access Policy. -**data type** : string -**required** : True -**description** : A short description for the Data Access Policy. -**data type** : string -**required** : True +**description** : A short description for the Data Access Policy.
+**required** : True
+**data type** : string
-**policy_text** : The terms of data use and policy verbiage should be captured here. -**data type** : string -**required** : True -**policy_url** : URL for the policy, if available. This is useful if the terms of the policy is made available online at a resolvable URL. -**data type** : string -**required** : False +**policy_text** : The terms of data use and policy verbiage should be captured here.
+**required** : True
+**data type** : string
-**data_access_committee** : The Data Access Committee linked to this policy. -**data type** : DataAccessCommittee -**required** : True -**data_use_permission** : Data use permission associated with a policy. Typically one or more terms from DUO and should be descendants of 'DUO:0000001 data use permission'. -**data type** : Controlled Vocabulary -**required** : True +**policy_url** : URL for the policy, if available. This is useful if the terms of the policy is made available online at a resolvable URL.
+**required** : False
+**data type** : string
-**data_use_modifiers** : Modifier for Data use permission associated with a policy. Should be descendants of 'DUO:0000017 data use modifier' -**data type** : Controlled Vocabulary -**required** : False -**alias** : The alias for an entity at the time of submission. -**data type** : string -**required** : True +**data_access_committee** : The Data Access Committee linked to this policy.
+**required** : True
+**data type** : DataAccessCommittee
+ + +**data_use_permission** : Data use permission associated with a policy. Typically one or more terms from DUO and should be descendants of 'DUO:0000001 data use permission'.
+**required** : True
+**data type** : Controlled Vocabulary
+ +| Permissible Values | Description | +| --- | --- | +| `general research use` | `None` | +| `health or medical or biomedical research` | `None` | +| `disease specific research` | `None` | +| `no restriction` | `None` | +| `population origins or ancestry research only` | `None` | + + + +**data_use_modifiers** : Modifier for Data use permission associated with a policy. Should be descendants of 'DUO:0000017 data use modifier'
+**required** : False
+**data type** : Controlled Vocabulary
+ +| Permissible Values | Description | +| --- | --- | +| `clinical care use` | `None` | +| `return to database or resource` | `None` | +| `institution specific restriction` | `None` | +| `project specific restriction` | `None` | +| `user specific restriction` | `None` | +| `time limit on use` | `None` | +| `publication moratorium` | `None` | +| `geographical restriction` | `None` | +| `ethics approval required` | `None` | +| `collaboration required` | `None` | +| `publication required` | `None` | +| `not for profit, non commercial use only` | `None` | +| `non-commercial use only` | `None` | +| `not for profit organisation use only` | `None` | +| `genetic studies only` | `None` | +| `no general methods research` | `None` | +| `research specific restrictions` | `None` | +| `population origins or ancestry research prohibited` | `None` | + + + +**alias** : The alias for an entity at the time of submission.
+**required** : True
+**data type** : string
diff --git a/docs/metadata/worksheets/Dataset.md b/docs/metadata/worksheets/Dataset.md index 522507a..80b6976 100644 --- a/docs/metadata/worksheets/Dataset.md +++ b/docs/metadata/worksheets/Dataset.md @@ -6,23 +6,26 @@ A Dataset is a collection of Files that is prepared for distribution and is tied ## Fields +**title** : A title for the submitted Dataset.
+**required** : True
+**data type** : string
-**title** : A title for the submitted Dataset. -**data type** : string -**required** : True -**description** : Description of an entity. -**data type** : string -**required** : True +**description** : Description of an entity.
+**required** : True
+**data type** : string
-**types** : The type of a dataset. -**data type** : string -**required** : True -**data_access_policy** : The Data Access Policy that applies to this Dataset. -**data type** : DataAccessPolicy -**required** : True +**types** : The type of a dataset.
+**required** : True
+**data type** : string
-**alias** : The alias for an entity at the time of submission. -**data type** : string -**required** : True + +**data_access_policy** : The Data Access Policy that applies to this Dataset.
+**required** : True
+**data type** : DataAccessPolicy
+ + +**alias** : The alias for an entity at the time of submission.
+**required** : True
+**data type** : string
diff --git a/docs/metadata/worksheets/Individual.md b/docs/metadata/worksheets/Individual.md index 2f6c1f0..5d422b1 100644 --- a/docs/metadata/worksheets/Individual.md +++ b/docs/metadata/worksheets/Individual.md @@ -6,27 +6,47 @@ An Individual is a Person who is participating in a Study. ## Fields +**sex** : The assemblage of physical properties or qualities by which male is distinguished from female; the physical difference between male and female; the distinguishing peculiarity of male or female.
+**required** : True
+**data type** : Controlled Vocabulary
-**sex** : The assemblage of physical properties or qualities by which male is distinguished from female; the physical difference between male and female; the distinguishing peculiarity of male or female. -**data type** : Controlled Vocabulary -**required** : True +| Permissible Values | Description | +| --- | --- | +| `FEMALE_SEX_FOR_CLINICAL_USE` | `None` | +| `IMAGING_SEX` | `None` | +| `MALE_SEX_FOR_CLINICAL_USE` | `None` | +| `SPECIFIED_SEX_FOR_CLINICAL_USE` | `None` | +| `UNKNOWN_SEX_FOR_CLINICAL_USE` | `None` | -**karyotype** : The karyotype of an individual if defined. -**data type** : Controlled Vocabulary -**required** : False -**geographical_region** : The geographical region where the Individual is located. Any demarcated area of the Earth; may be determined by both natural and human boundaries. -**data type** : string -**required** : False -**ancestries** : A person's descent or lineage, from a person or from a population. -**data type** : string -**required** : False +**karyotype** : The karyotype of an individual if defined.
+**required** : False
+**data type** : Controlled Vocabulary
-**phenotypic_features** : The Phenotypic Feature entity that is associated with this Biospecimen at the time of retrieval from the organism. Typically, a concept from Human Phenotype Ontology. For example, 'HP:0100244' indicates that the Individual - from_which_the_Biospecimen was extracted from - exhibits_'Fibrosarcoma'_as_one_of_its_phenotype. -**data type** : string -**required** : False +| Permissible Values | Description | +| --- | --- | +| `46_XY` | `None` | +| `46_XX` | `None` | +| `OTHER` | `None` | -**alias** : The alias for an entity at the time of submission. -**data type** : string -**required** : True + + +**geographical_region** : The geographical region where the Individual is located. Any demarcated area of the Earth; may be determined by both natural and human boundaries.
+**required** : False
+**data type** : string
+ + +**ancestries** : A person's descent or lineage, from a person or from a population.
+**required** : False
+**data type** : string
+ + +**phenotypic_features** : The Phenotypic Feature entity that is associated with this Biospecimen at the time of retrieval from the organism. Typically, a concept from Human Phenotype Ontology. For example, 'HP:0100244' indicates that the Individual from which the Biospecimen was extracted exhibits 'Fibrosarcoma' as one of its phenotypes.  
+**required** : False
+**data type** : string
+ + +**alias** : The alias for an entity at the time of submission.
+**required** : True
+**data type** : string
diff --git a/docs/metadata/worksheets/LibraryPreparationProtocol.md b/docs/metadata/worksheets/LibraryPreparationProtocol.md index 7751d00..def4c85 100644 --- a/docs/metadata/worksheets/LibraryPreparationProtocol.md +++ b/docs/metadata/worksheets/LibraryPreparationProtocol.md @@ -6,59 +6,225 @@ Information about the library_preparation of an sequencing experiment. ## Fields - -**description** : Description about how a sequencing library was prepared (eg: Library construction method). -**data type** : string -**required** : True - -**library_name** : A short name identifying the library to potential users. The same name may refer to multiple versions of the same continually updated library. -**data type** : string -**required** : True - -**library_layout** : Describe whether the library was sequenced in single-end (forward or reverse) or paired-end mode -**data type** : Controlled Vocabulary -**required** : True - -**library_type** : Describe the level of omics analysis (eg: Metagenome, transcriptome, etc) -**data type** : Controlled Vocabulary -**required** : True - -**library_selection** : Whether any method was used to select for or against, enrich, or screen the material being sequenced. library_selection method (e.g. random, PCA, cDNA, etc ) -**data type** : Controlled Vocabulary -**required** : True - -**library_preparation** : The general method for sequencing library_preparation (e.g. KAPA PCR-free). -**data type** : string -**required** : True - -**library_preparation_kit_retail_name** : A unique identifier for the kit used to construct a genomic library. This may include the vendor name, kit name and kit version (e.g. Agilent sure select Human Exome V8, Twist RefSeq Exome, etc.) -**data type** : Controlled Vocabulary -**required** : False - -**library_preparation_kit_manufacturer** : Manufacturer of library_preparation kit -**data type** : string -**required** : False - -**primer** : The type of primer used for reverse transcription, e.g. 
'oligo-dT' or 'random' primer. This allows users to identify content of the cDNA library input e.g. enriched for mRNA. -**data type** : Controlled Vocabulary -**required** : False - -**end_bias** : The end of the cDNA molecule that is preferentially sequenced, e.g. 3/5 prime tag or end, or the full-length transcript. -**data type** : Controlled Vocabulary -**required** : False - -**target_regions** : Subset of genes or specific regions of the genome, which are most likely to be involved in the phenotype under study. -**data type** : string -**required** : False - -**rnaseq_strandedness** : The strandedness of the library, whether reads come from both strands of the cDNA or only from the first (antisense) or the second (sense) strand. -**data type** : Controlled Vocabulary -**required** : False - -**attributes** : One or more attributes that further characterizes this library_preparation Protocol. -**data type** : Attribute -**required** : False - -**alias** : The alias for an entity at the time of submission. -**data type** : string -**required** : True +**description** : Description about how a sequencing library was prepared (eg: Library construction method).
+**required** : True
+**data type** : string
+ + +**library_name** : A short name identifying the library to potential users. The same name may refer to multiple versions of the same continually updated library.
+**required** : True
+**data type** : string
+ + +**library_layout** : Describe whether the library was sequenced in single-end (forward or reverse) or paired-end mode
+**required** : True
+**data type** : Controlled Vocabulary
+ +| Permissible Values | Description | +| --- | --- | +| `SE` | `None` | +| `PE` | `None` | + + + +**library_type** : Describe the level of omics analysis (eg: Metagenome, transcriptome, etc)
+**required** : True
+**data type** : Controlled Vocabulary
+ +| Permissible Values | Description | +| --- | --- | +| `WGS` | `None` | +| `WXS` | `None` | +| `WCS` | `None` | +| `TOTAL_RNA` | `None` | +| `M_RNA` | `None` | +| `MI_RNA` | `None` | +| `NC_RNA` | `None` | +| `ATAC` | `None` | +| `METHYLATION` | `None` | +| `CHROMOSOME_CONFORMATION_CAPTURE` | `None` | + + + +**library_selection** : Whether any method was used to select for or against, enrich, or screen the material being sequenced. library_selection method (e.g. random, PCR, cDNA, etc.)  
+**required** : True
+**data type** : Controlled Vocabulary
+ +| Permissible Values | Description | +| --- | --- | +| `5_METHYLCYTIDINE_ANTIBODY_METHOD` | `None` | +| `CAGE_METHOD` | `None` | +| `C_DNA_METHOD` | `None` | +| `CF_H_METHOD` | `None` | +| `CF_M_METHOD` | `None` | +| `CF_S_METHOD` | `None` | +| `CF_T_METHOD` | `None` | +| `CH_IP_SEQ_METHOD` | `None` | +| `D_NASE_METHOD` | `None` | +| `HMPR_METHOD` | `None` | +| `HYBRID_SELECTION_METHOD` | `None` | +| `INVERSE_R_RNA` | `None` | +| `MBD2_PROTEIN_METHYL_CP_G_BINDING_DOMAIN_METHOD` | `None` | +| `MDA` | `None` | +| `MF_METHOD` | `None` | +| `M_NASE_METHOD` | `None` | +| `MSLL_METHOD` | `None` | +| `OLIGO_D_T` | `None` | +| `PADLOCK_PROBES_CAPTURE_METHOD` | `None` | +| `PCR_METHOD` | `None` | +| `POLY_A` | `None` | +| `RACE_METHOD` | `None` | +| `RANDOM_PCR_METHOD` | `None` | +| `RANDOM_METHOD` | `None` | +| `RT_PCR_METHOD` | `None` | +| `REDUCED_REPRESENTATION_METHOD` | `None` | +| `REPEAT_FRACTIONATION` | `None` | +| `RESTRICTION_DIGEST_METHOD` | `None` | +| `SIZE_FRACTIONATION_METHOD` | `None` | +| `UNSPECIFIED` | `None` | +| `OTHER` | `None` | + + + +**library_preparation** : The general method for sequencing library_preparation (e.g. KAPA PCR-free).
+**required** : True
+**data type** : string
+ + +**library_preparation_kit_retail_name** : A unique identifier for the kit used to construct a genomic library. This may include the vendor name, kit name and kit version (e.g. Agilent SureSelect Human Exome V8, Twist RefSeq Exome, etc.)  
+**required** : False
+**data type** : Controlled Vocabulary
+ +| Permissible Values | Description | +| --- | --- | +| `10X_GENOMICS_CHROMIUM_SINGLE_CELL_3_V2` | `None` | +| `10X_GENOMICS_CHROMIUM_SINGLE_CELL_3_V3` | `None` | +| `ACCEL_NGS_2_S_PLUS_DNA_LIBRARY_KIT` | `None` | +| `ACCEL_NGS_METHYL_SEQ_DNA` | `None` | +| `AGILENT_STRAND_SPECIFIC_RNA` | `None` | +| `AGILENT_SURE_SELECT_CUSTOM_ENRICHMENT_KIT` | `None` | +| `AGILENT_SURE_SELECT_V3` | `None` | +| `AGILENT_SURE_SELECT_V4` | `None` | +| `AGILENT_SURE_SELECT_V4_UT_RS` | `None` | +| `AGILENT_SURE_SELECT_V5` | `None` | +| `AGILENT_SURE_SELECT_V5_UT_RS` | `None` | +| `AGILENT_SURE_SELECT_V6` | `None` | +| `AGILENT_SURE_SELECT_V6_ONE` | `None` | +| `AGILENT_SURE_SELECT_V6_UT_RS` | `None` | +| `AGILENT_SURE_SELECT_V7` | `None` | +| `AGILENT_SURE_SELECT_WGS` | `None` | +| `AGILENT_SURE_SELECT_XT_HS_HUMAN_ALL_EXON_V7` | `None` | +| `AGILENT_SURE_SELECT_XT_MOUSE_ALL_EXON` | `None` | +| `AGILENT_XT_HS_SURE_SELECT_CLINICAL_RESEARCH_EXOME_V2` | `None` | +| `AVENIO_CT_DNA_KIT` | `None` | +| `IDT_X_GEN_EXOME_RESEARCH_PANEL` | `None` | +| `ILLUMINA_DNA_PCR_FREE` | `None` | +| `ILLUMINA_NEXTERA_DNA_FLEX` | `None` | +| `ILLUMINA_NEXTERA_EXOME_ENRICHMENT_KIT` | `None` | +| `ILLUMINA_STRANDED_M_RNA_PREP_LIGATION` | `None` | +| `ILLUMINA_TRU_SEQ_CH_IP_SAMPLE_PREPARATION_KIT` | `None` | +| `ILLUMINA_TRU_SEQ_CUSTOM_AMPLICON` | `None` | +| `ILLUMINA_TRU_SEQ_DNA` | `None` | +| `ILLUMINA_TRU_SEQ_NANO_DNA` | `None` | +| `ILLUMINA_TRU_SEQ_NANO_DNA_HT` | `None` | +| `ILLUMINA_TRU_SEQ_NANO_DNA_LT` | `None` | +| `ILLUMINA_TRU_SEQ_NANO_FFPE_DNA` | `None` | +| `ILLUMINA_TRU_SEQ_PCR_FREE` | `None` | +| `ILLUMINA_TRU_SEQ_PCR_FREE_DNA` | `None` | +| `ILLUMINA_TRUSEQ_PCR_FREE_METHYL` | `None` | +| `ILLUMINA_TRU_SEQ_RNA` | `None` | +| `ILLUMINA_TRU_SEQ_SMALL_RNA_KIT` | `None` | +| `ILLUMINA_TRU_SEQ_STRANDED_TOTAL_RNA_KIT` | `None` | +| `ILLUMINA_TRU_SEQ_STRANDED_TOTAL_RNA_LIBRARY_PREP_GLOBIN` | `None` | +| `ILLUMINA_TRU_SEQ_STRANDED_TOTAL_RNA_RIBO_MINUS_GOLD` | `None` | +| `ILLUMINA_VAHTS_TOTAL_RNA` | 
`None` | +| `INFORM_ONCO_PANEL_HG19` | `None` | +| `ION_AMPLI_SEQ_EXOME_KIT` | `None` | +| `KAPA_HIFI_HOT_START_READYMIX` | `None` | +| `KAPA_HYPER_PREP_KIT` | `None` | +| `KAPA_HYPER_PLUS_KIT` | `None` | +| `KAPA_M_RNA_HYPER_PREP_KIT` | `None` | +| `MAGNETIC_METHYLATED_DNA_IMMUNOPRECIPITATION_DIAGNODE` | `None` | +| `NEB_NEXT_CH_IP_SEQ_LIBRARY_PREP_KIT_FOR_ILLUMINA` | `None` | +| `NEB_NEXT_GLOBIN_R_RNA_DEPLETION_KIT_HUMAN_MOUSE_RAT_WITH_BEADS` | `None` | +| `NEB_NEXT_POLY_A_M_RNA_MAGNETIC_ISOLATION_MODULE` | `None` | +| `NEB_NEXT_RNA_ULTRA_II_STRANDED` | `None` | +| `NEBNEXT_ULTRA_DIRECTIONAL_RNA` | `None` | +| `NEB_NEXT_ULTRA_DNA` | `None` | +| `NEB_NEXT_ULTRA_DNA_LIBRARY_PREP_KIT_FOR_ILLUMINA` | `None` | +| `NEB_NEXT_ULTRA_II_DIRECTIONAL_RNA` | `None` | +| `NEB_NEXT_ULTRA_II_DNA_LIBRARY_PREP_KIT_FOR_ILLUMINA` | `None` | +| `NEXTERA_XT_DNA` | `None` | +| `OLIGO_D_T` | `None` | +| `PICO_METHYL_SEQ` | `None` | +| `SMART_SEQ_V4_ULTRA_LOW_INPUT_RNA_KIT` | `None` | +| `SMAR_TER_STRANDED_TOTAL_RNA_SEQ_KIT` | `None` | +| `SMAR_TER_ULTRA_LOW_INPUT_RNA_AND_NEB_NEXT_CH_IP_SEQ` | `None` | +| `SMAR_TER_ULTRA_LOW_INPUT_RNA_V4_AND_NEB_NEXT_CH_IP_SEQ` | `None` | +| `SMAR_TSEQ2_TAG` | `None` | +| `SUPER_SCRIPT_II_RT_BULK` | `None` | +| `SURE_CELL_ATAC_SEQ_LIBRARY_PREP_KIT` | `None` | +| `SURE_SELECT_EUROFINS_ENRICHMENT_CUSTOM_01` | `None` | +| `TAKARA_CLONTECH_SMAR_TER_STRANDED_TOTAL_RNA` | `None` | +| `TAKARA_SMAR_TER_PREP_X_DNA_LIBRARY_KIT_ACTIVE_MOTIF_CUSTOM_INDICES_01` | `None` | +| `TEMPLATE_SWITCHING_RT_ENZYME_MIX_BULK` | `None` | +| `TWIST_HUMAN_CORE_EXOME_PLUS_KIT` | `None` | +| `ULTRALOW_METHYL_SEQ_WITH_TRUE_METHYL_OX_BS_MODULE` | `None` | + + + +**library_preparation_kit_manufacturer** : Manufacturer of library_preparation kit
+**required** : False
+**data type** : string
+ + +**primer** : The type of primer used for reverse transcription, e.g. 'oligo-dT' or 'random' primer. This allows users to identify content of the cDNA library input e.g. enriched for mRNA.
+**required** : False
+**data type** : Controlled Vocabulary
+ +| Permissible Values | Description | +| --- | --- | +| `OLIGO_D_T` | `None` | +| `RANDOM` | `None` | +| `GENE_SPECIFIC` | `None` | +| `OTHER` | `None` | + + + +**end_bias** : The end of the cDNA molecule that is preferentially sequenced, e.g. 3/5 prime tag or end, or the full-length transcript.
+**required** : False
+**data type** : Controlled Vocabulary
+ +| Permissible Values | Description | +| --- | --- | +| `3_PRIME_END` | `None` | +| `5_PRIME_END` | `None` | +| `FULL_LENGTH` | `None` | + + + +**target_regions** : Subset of genes or specific regions of the genome, which are most likely to be involved in the phenotype under study.
+**required** : False
+**data type** : string
+ + +**rnaseq_strandedness** : The strandedness of the library, whether reads come from both strands of the cDNA or only from the first (antisense) or the second (sense) strand.
+**required** : False
+**data type** : Controlled Vocabulary
+ +| Permissible Values | Description | +| --- | --- | +| `SENSE` | `None` | +| `ANTISENSE` | `None` | +| `BOTH` | `None` | + + + +**attributes** : One or more attributes that further characterize this library_preparation Protocol.  
+**required** : False
+**data type** : Attribute
+ + +**alias** : The alias for an entity at the time of submission.
+**required** : True
+**data type** : string
diff --git a/docs/metadata/worksheets/Publication.md b/docs/metadata/worksheets/Publication.md index f72ef6d..d5c60a7 100644 --- a/docs/metadata/worksheets/Publication.md +++ b/docs/metadata/worksheets/Publication.md @@ -6,39 +6,46 @@ The Publication entity represents a publication. While a publication can be any ## Fields +**title** : The title for the Publication.
+**required** : False
+**data type** : string
-**title** : The title for the Publication. -**data type** : string -**required** : False -**abstract** : The study abstract that describes the goals. Can also hold abstract from a publication related to this study. -**data type** : string -**required** : False +**abstract** : The study abstract that describes the goals. Can also hold abstract from a publication related to this study.
+**required** : False
+**data type** : string
-**author** : The individual who is responsible for the content of a document version. -**data type** : string -**required** : False -**year** : Year in which the paper was published. -**data type** : integer -**required** : False +**author** : The individual who is responsible for the content of a document version.
+**required** : False
+**data type** : string
-**journal** : Name of the journal. -**data type** : string -**required** : False -**doi** : DOI identifier of the Publication. -**data type** : string -**required** : True +**year** : Year in which the paper was published.
+**required** : False
+**data type** : integer
-**study** : The Study entity associated with this Publication. -**data type** : Study -**required** : True -**xref** : One or more cross-references for this Publication. -**data type** : string -**required** : False +**journal** : Name of the journal.
+**required** : False
+**data type** : string
-**alias** : The alias for an entity at the time of submission. -**data type** : string -**required** : True + +**doi** : DOI identifier of the Publication.
+**required** : True
+**data type** : string
+ + +**study** : The Study entity associated with this Publication.
+**required** : True
+**data type** : Study
+ + +**xref** : One or more cross-references for this Publication.
+**required** : False
+**data type** : string
+ + +**alias** : The alias for an entity at the time of submission.
+**required** : True
+**data type** : string
diff --git a/docs/metadata/worksheets/Sample.md b/docs/metadata/worksheets/Sample.md index cf3fe9e..b6eb285 100644 --- a/docs/metadata/worksheets/Sample.md +++ b/docs/metadata/worksheets/Sample.md @@ -6,43 +6,67 @@ A sample is a limited quantity of something to be used for testing, analysis, in ## Fields +**name** : Name of the sample (eg:GHGAS_Blood_Sample1 or GHGAS_PBMC_RNAseq_S1).
+**required** : True
+**data type** : string
-**name** : Name of the sample (eg:GHGAS_Blood_Sample1 or GHGAS_PBMC_RNAseq_S1). -**data type** : string -**required** : True -**type** : The type of sample. -**data type** : Controlled Vocabulary -**required** : False +**type** : The type of sample.
+**required** : False
+**data type** : Controlled Vocabulary
-**description** : Short textual description of the sample (How the sample was collected, sample source, Protocol followed for processing the sample etc). -**data type** : string -**required** : True +| Permissible Values | Description | +| --- | --- | +| `CF_DNA` | `None` | +| `DEPLETED_RNA` | `None` | +| `DS_DNA_CH_IP` | `None` | +| `FFPE_DNA` | `None` | +| `FFPE_TOTAL_RNA` | `None` | +| `GENOMIC_DNA` | `None` | +| `PCR_PRODUCTS` | `None` | +| `POLY_A_RNA` | `None` | +| `SINGLE_CELL_DNA` | `None` | +| `SINGLE_CELL_RNA` | `None` | +| `SMALL_RNA` | `None` | +| `TOTAL_RNA` | `None` | -**isolation** : Method or device employed for collecting/isolating a biospecimen or a sample. -**data type** : string -**required** : False -**storage** : Methods by which a biospecimen or a sample is stored (e.g. frozen in liquid nitrogen). -**data type** : string -**required** : False -**biospecimen** : The Biospecimen from which this Sample was prepared from. -**data type** : Biospecimen -**required** : False +**description** : Short textual description of the sample (How the sample was collected, sample source, Protocol followed for processing the sample etc).
+**required** : True
+**data type** : string
-**condition** : The condition associated with an entity. -**data type** : Condition -**required** : True -**xref** : One or more cross-references for this Sample. For example, this Sample may have an EBI BioSamples accession or an EGA Sample accession. -**data type** : string -**required** : False +**isolation** : Method or device employed for collecting/isolating a biospecimen or a sample.
+**required** : False
+**data type** : string
-**attributes** : Key/value pairs corresponding to an entity. -**data type** : Attribute -**required** : False -**alias** : The alias for an entity at the time of submission. -**data type** : string -**required** : True +**storage** : Methods by which a biospecimen or a sample is stored (e.g. frozen in liquid nitrogen).
+**required** : False
+**data type** : string
+ + +**biospecimen** : The Biospecimen from which this Sample was prepared.  
+**required** : False
+**data type** : Biospecimen
+ + +**condition** : The condition associated with an entity.
+**required** : True
+**data type** : Condition
+ + +**xref** : One or more cross-references for this Sample. For example, this Sample may have an EBI BioSamples accession or an EGA Sample accession.
+**required** : False
+**data type** : string
+ + +**attributes** : Key/value pairs corresponding to an entity.
+**required** : False
+**data type** : Attribute
+ + +**alias** : The alias for an entity at the time of submission.
+**required** : True
+**data type** : string
diff --git a/docs/metadata/worksheets/SampleFile.md b/docs/metadata/worksheets/SampleFile.md index 6a73e93..d07232d 100644 --- a/docs/metadata/worksheets/SampleFile.md +++ b/docs/metadata/worksheets/SampleFile.md @@ -6,39 +6,82 @@ A SampleFile is a File that is associated with a Sample. ## Fields +**sample** : The sample associated with an entity.
+**required** : True
+**data type** : Sample
-**sample** : The sample associated with an entity. -**data type** : Sample -**required** : True -**name** : The given filename. -**data type** : string -**required** : True +**name** : The given filename.
+**required** : True
+**data type** : string
-**format** : The format of the file: BAM, SAM, CRAM, BAI, etc. -**data type** : Controlled Vocabulary -**required** : True -**size** : The size of a file in bytes. -**data type** : integer -**required** : True +**format** : The format of the file: BAM, SAM, CRAM, BAI, etc.
+**required** : True
+**data type** : Controlled Vocabulary
-**checksum** : A computed value which depends on the contents of a block of data and which is transmitted or stored along with the data in order to detect corruption of the data. The receiving system recomputes the checksum based upon the received data and compares this value with the one sent with the data. If the two values are the same, the receiver has some confidence that the data was received correctly. -**data type** : string -**required** : True +| Permissible Values | Description | +| --- | --- | +| `AGP` | `None` | +| `BAI` | `None` | +| `BAM` | `None` | +| `BCF` | `None` | +| `BED` | `None` | +| `CRAI` | `None` | +| `CRAM` | `None` | +| `CSV` | `None` | +| `FASTA` | `None` | +| `FASTQ` | `None` | +| `GFF` | `None` | +| `HDF5` | `None` | +| `INFO` | `None` | +| `JSON` | `None` | +| `MD` | `None` | +| `OTHER` | `None` | +| `PED` | `None` | +| `SAM` | `None` | +| `SFF` | `None` | +| `SRF` | `None` | +| `TAB` | `None` | +| `TABIX` | `None` | +| `TSV` | `None` | +| `TXT` | `None` | +| `VCF` | `None` | +| `WIG` | `None` | -**forward_or_reverse** : Denotes whether a submitted FASTQ file contains forward (R1) or reverse (R2) reads for paired-end sequencing. The number that identifies each read direction in a paired-end nucleotide sequencing reaction. -**data type** : Controlled Vocabulary -**required** : False -**checksum_type** : The type of algorithm used to generate the checksum of a file. -**data type** : string -**required** : True -**dataset** : The Dataset associated with an entity. -**data type** : Dataset -**required** : True +**size** : The size of a file in bytes.
+**required** : True
+**data type** : integer
-**alias** : The alias for an entity at the time of submission. -**data type** : string -**required** : True + +**checksum** : A computed value which depends on the contents of a block of data and which is transmitted or stored along with the data in order to detect corruption of the data. The receiving system recomputes the checksum based upon the received data and compares this value with the one sent with the data. If the two values are the same, the receiver has some confidence that the data was received correctly.
+**required** : True
+**data type** : string
+ + +**forward_or_reverse** : Denotes whether a submitted FASTQ file contains forward (R1) or reverse (R2) reads for paired-end sequencing. The number that identifies each read direction in a paired-end nucleotide sequencing reaction.
+**required** : False
+**data type** : Controlled Vocabulary
+ +| Permissible Values | Description | +| --- | --- | +| `FORWARD` | `The reads are forward (R1) reads` | +| `REVERSE` | `The reads are reverse (R2) reads` | + + + +**checksum_type** : The type of algorithm used to generate the checksum of a file.
+**required** : True
+**data type** : string
+ + +**dataset** : The Dataset associated with an entity.
+**required** : True
+**data type** : Dataset
+ + +**alias** : The alias for an entity at the time of submission.
+**required** : True
+**data type** : string
diff --git a/docs/metadata/worksheets/SequencingExperiment.md b/docs/metadata/worksheets/SequencingExperiment.md index aa76f8f..2871a7a 100644 --- a/docs/metadata/worksheets/SequencingExperiment.md +++ b/docs/metadata/worksheets/SequencingExperiment.md @@ -6,31 +6,36 @@ An sequencing experiment is an investigation that consists of a coordinated set ## Fields +**title** : Name for the experiment (eg: GHGAE_PBMC_RNAseq).
+**required** : None
+**data type** : string
-**title** : Name for the experiment (eg: GHGAE_PBMC_RNAseq). -**data type** : string -**required** : None -**description** : A detailed description of the Experiment. -**data type** : string -**required** : True +**description** : A detailed description of the Experiment.
+**required** : True
+**data type** : string
-**type** : The type of sequencing experiment. -**data type** : string -**required** : False -**sequencing_protocol** : The sequencing protocol associated with an entity. -**data type** : SequencingProtocol -**required** : True +**type** : The type of sequencing experiment.
+**required** : False
+**data type** : string
-**library_preparation_protocol** : The library_preparation Protocol associated with an entity. -**data type** : LibraryPreparationProtocol -**required** : True -**attributes** : Key/value pairs corresponding to an entity. -**data type** : Attribute -**required** : False +**sequencing_protocol** : The sequencing protocol associated with an entity.
+**required** : True
+**data type** : SequencingProtocol
-**alias** : The alias for an entity at the time of submission. -**data type** : string -**required** : True + +**library_preparation_protocol** : The library_preparation Protocol associated with an entity.
+**required** : True
+**data type** : LibraryPreparationProtocol
+ + +**attributes** : Key/value pairs corresponding to an entity.
+**required** : False
+**data type** : Attribute
+ + +**alias** : The alias for an entity at the time of submission.
+**required** : True
+**data type** : string
diff --git a/docs/metadata/worksheets/SequencingProcess.md b/docs/metadata/worksheets/SequencingProcess.md index 18d5b95..eaa5382 100644 --- a/docs/metadata/worksheets/SequencingProcess.md +++ b/docs/metadata/worksheets/SequencingProcess.md @@ -6,51 +6,61 @@ A sequencing process linking a sample to sequencing output. ## Fields +**title** : The title that describes an entity.
+**required** : None
+**data type** : string
-**title** : The title that describes an entity. -**data type** : string -**required** : None -**description** : Description of an entity. -**data type** : string -**required** : True +**description** : Description of an entity.
+**required** : True
+**data type** : string
-**name** : The name for an entity. -**data type** : string -**required** : True -**sequencing_run_id** : Identifier of the sequencing run. Used for batch correction. -**data type** : string -**required** : False +**name** : The name for an entity.
+**required** : True
+**data type** : string
-**sequencing_lane_id** : Identifier of the sequencing lane. Used for batch correction. -**data type** : string -**required** : False -**sequencing_machine_id** : Identifier of the sequencing machine. Used for batch correction. -**data type** : string -**required** : False +**sequencing_run_id** : Identifier of the sequencing run. Used for batch correction.
+**required** : False
+**data type** : string
-**sequencing_experiment** : The sequencing experiment associated with an entity. -**data type** : SequencingExperiment -**required** : True -**index_sequence** : A unique nucleotide sequence that is added to a sample during library_preparation to serve as a unique identifier for the sample. -**data type** : string -**required** : False +**sequencing_lane_id** : Identifier of the sequencing lane. Used for batch correction.
+**required** : False
+**data type** : string
-**lane_number** : The numerical identifier for the lane or machine unit where a sample was located during nucleotide sequencing. -**data type** : string -**required** : False -**sample** : The sample associated with an entity. -**data type** : Sample -**required** : True +**sequencing_machine_id** : Identifier of the sequencing machine. Used for batch correction.
+**required** : False
+**data type** : string
-**attributes** : Key/value pairs corresponding to an entity. -**data type** : Attribute -**required** : False -**alias** : The alias for an entity at the time of submission. -**data type** : string -**required** : True +**sequencing_experiment** : The sequencing experiment associated with an entity.
+**required** : True
+**data type** : SequencingExperiment
+ + +**index_sequence** : A unique nucleotide sequence that is added to a sample during library_preparation to serve as a unique identifier for the sample.
+**required** : False
+**data type** : string
+ + +**lane_number** : The numerical identifier for the lane or machine unit where a sample was located during nucleotide sequencing.
+**required** : False
+**data type** : string
+ + +**sample** : The sample associated with an entity.
+**required** : True
+**data type** : Sample
+ + +**attributes** : Key/value pairs corresponding to an entity.
+**required** : False
+**data type** : Attribute
+ + +**alias** : The alias for an entity at the time of submission.
+**required** : True
+**data type** : string
diff --git a/docs/metadata/worksheets/SequencingProcessFile.md b/docs/metadata/worksheets/SequencingProcessFile.md index f011c9d..060bc4a 100644 --- a/docs/metadata/worksheets/SequencingProcessFile.md +++ b/docs/metadata/worksheets/SequencingProcessFile.md @@ -6,39 +6,82 @@ A SequencingProcessFile is a File that is associated with a SequencingProcess. ## Fields +**sequencing_process** : The SequencingProcess associated with an entity.
+**required** : True
+**data type** : SequencingProcess
-**sequencing_process** : The SequencingProcess associated with an entity. -**data type** : SequencingProcess -**required** : True -**name** : The given filename. -**data type** : string -**required** : True +**name** : The given filename.
+**required** : True
+**data type** : string
-**format** : The format of the file: BAM, SAM, CRAM, BAI, etc. -**data type** : Controlled Vocabulary -**required** : True -**size** : The size of a file in bytes. -**data type** : integer -**required** : True +**format** : The format of the file: BAM, SAM, CRAM, BAI, etc.
+**required** : True
+**data type** : Controlled Vocabulary
-**checksum** : A computed value which depends on the contents of a block of data and which is transmitted or stored along with the data in order to detect corruption of the data. The receiving system recomputes the checksum based upon the received data and compares this value with the one sent with the data. If the two values are the same, the receiver has some confidence that the data was received correctly. -**data type** : string -**required** : True +| Permissible Values | Description | +| --- | --- | +| `AGP` | `None` | +| `BAI` | `None` | +| `BAM` | `None` | +| `BCF` | `None` | +| `BED` | `None` | +| `CRAI` | `None` | +| `CRAM` | `None` | +| `CSV` | `None` | +| `FASTA` | `None` | +| `FASTQ` | `None` | +| `GFF` | `None` | +| `HDF5` | `None` | +| `INFO` | `None` | +| `JSON` | `None` | +| `MD` | `None` | +| `OTHER` | `None` | +| `PED` | `None` | +| `SAM` | `None` | +| `SFF` | `None` | +| `SRF` | `None` | +| `TAB` | `None` | +| `TABIX` | `None` | +| `TSV` | `None` | +| `TXT` | `None` | +| `VCF` | `None` | +| `WIG` | `None` | -**forward_or_reverse** : Denotes whether a submitted FASTQ file contains forward (R1) or reverse (R2) reads for paired-end sequencing. The number that identifies each read direction in a paired-end nucleotide sequencing reaction. -**data type** : Controlled Vocabulary -**required** : False -**checksum_type** : The type of algorithm used to generate the checksum of a file. -**data type** : string -**required** : True -**dataset** : The Dataset associated with an entity. -**data type** : Dataset -**required** : True +**size** : The size of a file in bytes.
+**required** : True
+**data type** : integer
-**alias** : The alias for an entity at the time of submission. -**data type** : string -**required** : True + +**checksum** : A computed value which depends on the contents of a block of data and which is transmitted or stored along with the data in order to detect corruption of the data. The receiving system recomputes the checksum based upon the received data and compares this value with the one sent with the data. If the two values are the same, the receiver has some confidence that the data was received correctly.
+**required** : True
+**data type** : string
+ + +**forward_or_reverse** : Denotes whether a submitted FASTQ file contains forward (R1) or reverse (R2) reads for paired-end sequencing. The number that identifies each read direction in a paired-end nucleotide sequencing reaction.
+**required** : False
+**data type** : Controlled Vocabulary
+ +| Permissible Values | Description | +| --- | --- | +| `FORWARD` | `The reads are forward (R1) reads` | +| `REVERSE` | `The reads are reverse (R2) reads` | + + + +**checksum_type** : The type of algorithm used to generate the checksum of a file.
+**required** : True
+**data type** : string
+ + +**dataset** : The Dataset associated with an entity.
+**required** : True
+**data type** : Dataset
+ + +**alias** : The alias for an entity at the time of submission.
+**required** : True
+**data type** : string
diff --git a/docs/metadata/worksheets/SequencingProtocol.md b/docs/metadata/worksheets/SequencingProtocol.md index 490c70e..eacb3aa 100644 --- a/docs/metadata/worksheets/SequencingProtocol.md +++ b/docs/metadata/worksheets/SequencingProtocol.md @@ -6,71 +6,167 @@ Information about the sequencing of a sample. ## Fields +**description** : Description about the sequencing Protocol (eg: mRNA-seq, Whole exome long-read sequencing etc).
+**required** : True
+**data type** : string
+ + +**type** : Type of the sequencing Protocol (eg: mRNA-seq, Whole exome long-read sequencing etc).
+**required** : None
+**data type** : string
+ + +**instrument_model** : The name and model of the technology platform used to perform sequencing.
+**required** : True
+**data type** : Controlled Vocabulary
+ +| Permissible Values | Description | +| --- | --- | +| `ILLUMINA_HI_SCAN` | `None` | +| `ILLUMINA_HI_SEQ_1000` | `None` | +| `ILLUMINA_HI_SEQ_1500` | `None` | +| `ILLUMINA_HI_SEQ_2000` | `None` | +| `ILLUMINA_HI_SEQ_2500` | `None` | +| `ILLUMINA_HI_SEQ_3000` | `None` | +| `ILLUMINA_HI_SEQ_4000` | `None` | +| `ILLUMINA_HI_SEQ_X_FIVE` | `None` | +| `ILLUMINA_HI_SEQ_X_TEN` | `None` | +| `ILLUMINA_HI_SEQ_X` | `None` | +| `ILLUMINA_I_SCAN` | `None` | +| `ILLUMINA_I_SEQ_100` | `None` | +| `ILLUMINA_MINI_SEQ` | `None` | +| `ILLUMINA_MI_SEQ` | `None` | +| `ILLUMINA_MI_SEQ_DX` | `None` | +| `ILLUMINA_MI_SEQ_DX_RESEARCH_MODE` | `None` | +| `ILLUMINA_NEXT_SEQ_500` | `None` | +| `ILLUMINA_NEXT_SEQ_550` | `None` | +| `ILLUMINA_NEXT_SEQ_550_DX` | `None` | +| `ILLUMINA_NEXT_SEQ_550_DX_RESEARCH_MODE` | `None` | +| `ILLUMINA_NEXT_SEQ_1000` | `None` | +| `ILLUMINA_NEXT_SEQ_2000` | `None` | +| `ILLUMINA_NOVA_SEQ_6000` | `None` | +| `ILLUMINA_GENOME_ANALYZER` | `None` | +| `ILLUMINA_GENOME_ANALYZER_II` | `None` | +| `ILLUMINA_GENOME_ANALYZER_IIX` | `None` | +| `ILLUMINA_HI_SCAN_SQ` | `None` | +| `PAC_BIO_REVIO` | `None` | +| `PAC_BIO_ONSO` | `None` | +| `PAC_BIO_SEQUEL_IIE` | `None` | +| `PAC_BIO_SEQUEL_II` | `None` | +| `PAC_BIO_SEQUEL` | `None` | +| `PAC_BIO_RS` | `None` | +| `PAC_BIO_RS_II` | `None` | +| `ONT_MIN_ION` | `None` | +| `ONT_GRID_ION` | `None` | +| `ONT_PROMETH_ION` | `None` | +| `DNBSEQ_G50` | `None` | +| `DNBSEQ_T7` | `None` | +| `DNBSEQ_G400` | `None` | +| `DNBSEQ_G400_FAST` | `None` | +| `ULTIMA_UG_100` | `None` | +| `OTHER` | `None` | -**description** : Description about the sequencing Protocol (eg: mRNA-seq, Whole exome long-read sequencing etc). -**data type** : string -**required** : True -**type** : Type of the sequencing Protocol (eg: mRNA-seq, Whole exome long-read sequencing etc). -**data type** : string -**required** : None -**instrument_model** : The name and model of the technology platform used to perform sequencing. 
-**data type** : Controlled Vocabulary -**required** : True +**sequencing_center** : Center where sample was sequenced.
+**required** : False
+**data type** : string
-**sequencing_center** : Center where sample was sequenced. -**data type** : string -**required** : False -**sequencing_read_length** : Length of sequencing reads (eg: Long or short or actual number of the read length etc). The number of nucleotides successfully ordered from each side of a nucleic acid fragment obtained after the completion of a sequencing process -**data type** : string -**required** : False +**sequencing_read_length** : Length of sequencing reads (eg: Long or short or actual number of the read length etc). The number of nucleotides successfully ordered from each side of a nucleic acid fragment obtained after the completion of a sequencing process
+**required** : False
+**data type** : string
-**target_coverage** : Mean coverage for whole genome sequencing, or mean target coverage for whole exome and targeted sequencing. The number of times a particular locus (site, nucleotide, amplicon, region) was sequenced. -**data type** : string -**required** : False -**flow_cell_id** : Flow Cell ID (eg: Experiment ID_Cell 1_Lane_1). The barcode assigned to a flow cell used in nucleotide sequencing. -**data type** : string -**required** : False +**target_coverage** : Mean coverage for whole genome sequencing, or mean target coverage for whole exome and targeted sequencing. The number of times a particular locus (site, nucleotide, amplicon, region) was sequenced.
+**required** : False
+**data type** : string
-**flow_cell_type** : Type of flow cell used (e.g. S4, S2 for NovaSeq; PromethION, Flongle for Nanopore). Aparatus in the fluidic subsystem where the sheath and sample meet. Can be one of several types; jet-in-air, quartz cuvette, or a hybrid of the two. The sample flows through the center of a fluid column of sheath fluid in the flow cell. -**data type** : Controlled Vocabulary -**required** : False -**umi_barcode_read** : The type of read that contains the UMI barcode (Eg: index1/index2/read1/read2). -**data type** : Controlled Vocabulary -**required** : False +**flow_cell_id** : Flow Cell ID (eg: Experiment ID_Cell 1_Lane_1). The barcode assigned to a flow cell used in nucleotide sequencing.
+**required** : False
+**data type** : string
-**umi_barcode_offset** : The offset in sequence of the UMI identifying barcode. (E.g. '16'). -**data type** : string -**required** : False -**umi_barcode_size** : The size of the UMI identifying barcode (Eg. '10'). -**data type** : string -**required** : False +**flow_cell_type** : Type of flow cell used (e.g. S4, S2 for NovaSeq; PromethION, Flongle for Nanopore). Aparatus in the fluidic subsystem where the sheath and sample meet. Can be one of several types; jet-in-air, quartz cuvette, or a hybrid of the two. The sample flows through the center of a fluid column of sheath fluid in the flow cell.
+**required** : False
+**data type** : Controlled Vocabulary
-**cell_barcode_read** : The type of read that contains the cell barcode (eg: index1/index2/read1/read2). -**data type** : Controlled Vocabulary -**required** : False +| Permissible Values | Description | +| --- | --- | +| `ILLUMINA_NOVA_SEQ_S2` | `None` | +| `ILLUMINA_NOVA_SEQ_S4` | `None` | +| `PROMETH_ION` | `None` | +| `FLONGLE` | `None` | +| `MIN_ION` | `None` | +| `GRID_ION` | `None` | +| `OTHER` | `None` | -**cell_barcode_offset** : The offset in sequence of the cell identifying barcode. (Eg. '0'). -**data type** : string -**required** : False -**cell_barcode_size** : The size of the cell identifying barcode (E.g. '16'). -**data type** : string -**required** : False -**sample_barcode_read** : The type of read that contains the sample barcode (eg: index1/index2/read1/read2). -**data type** : Controlled Vocabulary -**required** : False +**umi_barcode_read** : The type of read that contains the UMI barcode (Eg: index1/index2/read1/read2).
+**required** : False
+**data type** : Controlled Vocabulary
-**attributes** : One or more attributes that further characterizes this Sequencing Protocol. -**data type** : Attribute -**required** : False +| Permissible Values | Description | +| --- | --- | +| `INDEX1` | `None` | +| `INDEX2` | `None` | +| `READ1` | `None` | +| `READ2` | `None` | -**alias** : The alias for an entity at the time of submission. -**data type** : string -**required** : True + + +**umi_barcode_offset** : The offset in sequence of the UMI identifying barcode. (E.g. '16').
+**required** : False
+**data type** : string
+ + +**umi_barcode_size** : The size of the UMI identifying barcode (Eg. '10').
+**required** : False
+**data type** : string
+ + +**cell_barcode_read** : The type of read that contains the cell barcode (eg: index1/index2/read1/read2).
+**required** : False
+**data type** : Controlled Vocabulary
+ +| Permissible Values | Description | +| --- | --- | +| `INDEX1` | `None` | +| `INDEX2` | `None` | +| `READ1` | `None` | +| `READ2` | `None` | + + + +**cell_barcode_offset** : The offset in sequence of the cell identifying barcode. (Eg. '0').
+**required** : False
+**data type** : string
+ + +**cell_barcode_size** : The size of the cell identifying barcode (E.g. '16').
+**required** : False
+**data type** : string
+ + +**sample_barcode_read** : The type of read that contains the sample barcode (eg: index1/index2/read1/read2).
+**required** : False
+**data type** : Controlled Vocabulary
+ +| Permissible Values | Description | +| --- | --- | +| `INDEX1` | `None` | +| `INDEX1_AND_INDEX2` | `None` | +| `OTHER` | `None` | + + + +**attributes** : One or more attributes that further characterizes this Sequencing Protocol.
+**required** : False
+**data type** : Attribute
+ + +**alias** : The alias for an entity at the time of submission.
+**required** : True
+**data type** : string
diff --git a/docs/metadata/worksheets/Study.md b/docs/metadata/worksheets/Study.md index 4bed7f9..2f78a1e 100644 --- a/docs/metadata/worksheets/Study.md +++ b/docs/metadata/worksheets/Study.md @@ -6,27 +6,51 @@ Studies are experimental investigations of a particular phenomenon. It involves ## Fields +**title** : A comprehensive title for the study.
+**required** : True
+**data type** : string
-**title** : A comprehensive title for the study. -**data type** : string -**required** : True -**description** : A detailed description (abstract) that describes the goals of this Study. -**data type** : string -**required** : True +**description** : A detailed description (abstract) that describes the goals of this Study.
+**required** : True
+**data type** : string
-**type** : The type of Study. For example, 'Cancer Genomics', 'Epigenetics', 'Exome Sequencing'. -**data type** : Controlled Vocabulary -**required** : True -**affiliations** : The Institution(s) associated with an entity. -**data type** : string -**required** : True +**type** : The type of Study. For example, 'Cancer Genomics', 'Epigenetics', 'Exome Sequencing'.
+**required** : True
+**data type** : Controlled Vocabulary
-**attributes** : Custom key/value pairs that further characterizes the Study. (e.g.: approaches - single-cell,_bulk_etc) -**data type** : Attribute -**required** : False +| Permissible Values | Description | +| --- | --- | +| `CANCER_GENOMICS` | `None` | +| `EPIGENETICS` | `None` | +| `EXOME_SEQUENCING` | `None` | +| `FORENSIC_GENETICS` | `None` | +| `PALEO_GENOMICS` | `None` | +| `GENE_REGULATION_STUDY` | `None` | +| `METAGENOMICS` | `None` | +| `OTHER` | `None` | +| `POOLED_CLONE_SEQUENCING` | `None` | +| `POPULATION_GENOMICS` | `None` | +| `RNASEQ` | `None` | +| `RESEQUENCING` | `None` | +| `SYNTHETIC_GENOMICS` | `None` | +| `TRANSCRIPTOME_ANALYSIS` | `None` | +| `WHOLE_GENOME_SEQUENCING` | `None` | +| `GWAS` | `None` | -**alias** : The alias for an entity at the time of submission. -**data type** : string -**required** : True + + +**affiliations** : The Institution(s) associated with an entity.
+**required** : True
+**data type** : string
+ + +**attributes** : Custom key/value pairs that further characterizes the Study. (e.g.: approaches - single-cell,_bulk_etc)
+**required** : False
+**data type** : Attribute
+ + +**alias** : The alias for an entity at the time of submission.
+**required** : True
+**data type** : string
diff --git a/docs/metadata/worksheets/StudyFile.md b/docs/metadata/worksheets/StudyFile.md index 621f907..b1d83a8 100644 --- a/docs/metadata/worksheets/StudyFile.md +++ b/docs/metadata/worksheets/StudyFile.md @@ -6,39 +6,82 @@ A StudyFile is a File that is associated with a Study. ## Fields +**study** : The study associated with an entity.
+**required** : True
+**data type** : Study
-**study** : The study associated with an entity. -**data type** : Study -**required** : True -**name** : The given filename. -**data type** : string -**required** : True +**name** : The given filename.
+**required** : True
+**data type** : string
-**format** : The format of the file: BAM, SAM, CRAM, BAI, etc. -**data type** : Controlled Vocabulary -**required** : True -**size** : The size of a file in bytes. -**data type** : integer -**required** : True +**format** : The format of the file: BAM, SAM, CRAM, BAI, etc.
+**required** : True
+**data type** : Controlled Vocabulary
-**checksum** : A computed value which depends on the contents of a block of data and which is transmitted or stored along with the data in order to detect corruption of the data. The receiving system recomputes the checksum based upon the received data and compares this value with the one sent with the data. If the two values are the same, the receiver has some confidence that the data was received correctly. -**data type** : string -**required** : True +| Permissible Values | Description | +| --- | --- | +| `AGP` | `None` | +| `BAI` | `None` | +| `BAM` | `None` | +| `BCF` | `None` | +| `BED` | `None` | +| `CRAI` | `None` | +| `CRAM` | `None` | +| `CSV` | `None` | +| `FASTA` | `None` | +| `FASTQ` | `None` | +| `GFF` | `None` | +| `HDF5` | `None` | +| `INFO` | `None` | +| `JSON` | `None` | +| `MD` | `None` | +| `OTHER` | `None` | +| `PED` | `None` | +| `SAM` | `None` | +| `SFF` | `None` | +| `SRF` | `None` | +| `TAB` | `None` | +| `TABIX` | `None` | +| `TSV` | `None` | +| `TXT` | `None` | +| `VCF` | `None` | +| `WIG` | `None` | -**forward_or_reverse** : Denotes whether a submitted FASTQ file contains forward (R1) or reverse (R2) reads for paired-end sequencing. The number that identifies each read direction in a paired-end nucleotide sequencing reaction. -**data type** : Controlled Vocabulary -**required** : False -**checksum_type** : The type of algorithm used to generate the checksum of a file. -**data type** : string -**required** : True -**dataset** : The Dataset associated with an entity. -**data type** : Dataset -**required** : True +**size** : The size of a file in bytes.
+**required** : True
+**data type** : integer
-**alias** : The alias for an entity at the time of submission. -**data type** : string -**required** : True + +**checksum** : A computed value which depends on the contents of a block of data and which is transmitted or stored along with the data in order to detect corruption of the data. The receiving system recomputes the checksum based upon the received data and compares this value with the one sent with the data. If the two values are the same, the receiver has some confidence that the data was received correctly.
+**required** : True
+**data type** : string
+ + +**forward_or_reverse** : Denotes whether a submitted FASTQ file contains forward (R1) or reverse (R2) reads for paired-end sequencing. The number that identifies each read direction in a paired-end nucleotide sequencing reaction.
+**required** : False
+**data type** : Controlled Vocabulary
+ +| Permissible Values | Description | +| --- | --- | +| `FORWARD` | `The reads are forward (R1) reads` | +| `REVERSE` | `The reads are reverse (R2) reads` | + + + +**checksum_type** : The type of algorithm used to generate the checksum of a file.
+**required** : True
+**data type** : string
+ + +**dataset** : The Dataset associated with an entity.
+**required** : True
+**data type** : Dataset
+ + +**alias** : The alias for an entity at the time of submission.
+**required** : True
+**data type** : string
diff --git a/docs/metadata/worksheets/Trio.md b/docs/metadata/worksheets/Trio.md index 50f3e75..ed4839b 100644 --- a/docs/metadata/worksheets/Trio.md +++ b/docs/metadata/worksheets/Trio.md @@ -6,19 +6,21 @@ A trio is defined by three individuals representing an individual and their pare ## Fields +**mother** : The mother of an individual.
+**required** : True
+**data type** : Individual
-**mother** : The mother of an individual. -**data type** : Individual -**required** : True -**father** : The father of an individual. -**data type** : Individual -**required** : True +**father** : The father of an individual.
+**required** : True
+**data type** : Individual
-**child** : The child of two individuals. -**data type** : Individual -**required** : True -**alias** : The alias for an entity at the time of submission. -**data type** : string -**required** : True +**child** : The child of two individuals.
+**required** : True
+**data type** : Individual
+ + +**alias** : The alias for an entity at the time of submission.
+**required** : True
+**data type** : string
diff --git a/scripts/update_metadata_docs.py b/scripts/update_metadata_docs.py index 757e8f7..c8f9e38 100644 --- a/scripts/update_metadata_docs.py +++ b/scripts/update_metadata_docs.py @@ -94,14 +94,29 @@ def load_schema(schema_url=SCHEMA_URL): def extract_permissible_values(schema: SchemaView, slot_range: Union[str, None]): - """function toe get slot range if it is an enum""" + """enum""" enum = schema.get_enum(slot_range) if enum: - return enum.permissible_values - return slot_range + return { + "enum": enum.name, + "description": enum.description, + "permissible_values": [ + {"name": key, "description": val.description} # type: ignore + if enum.permissible_values + else None + for key, val in enum.permissible_values.items() # type: ignore + ], + } + return None -def extract_slots_from(schema: SchemaView, sheet_name: str) -> list[dict]: +def extract_slots_from( + schema: SchemaView, + sheet_name: str, + add_enum: Callable[ + [SchemaView, Union[str, None]], Union[dict, None] + ] = extract_permissible_values, +) -> list[dict]: """Extracts slot information of a given class""" return [ @@ -111,7 +126,7 @@ def extract_slots_from(schema: SchemaView, sheet_name: str) -> list[dict]: "description": slot.description, "data_type": { "range": slot.range, - "enum": extract_permissible_values(schema, slot.range), + "enum": add_enum(schema, slot.range), }, "required": slot.required, } @@ -138,7 +153,7 @@ def generate_workbook( def generate_markdown(content: dict) -> str: """Generates the markdown text by rendering the content into the template""" - env = Environment(loader=FileSystemLoader(ROOT)) + env = Environment(loader=FileSystemLoader(ROOT), trim_blocks=True) template = env.get_template(TEMPLATE) return template.render(content) @@ -162,9 +177,9 @@ def main(): workbook = generate_workbook(schema, worksheet_names, extract_slots_from) for sheet in workbook: + # print(sheet) + # print("\n\n") create_doc_file(DOCS_DIR, sheet["name"], generate_markdown(sheet)) - # 
print(schema.get_enum("KaryotypeEnum")) - # print(schema.all_enums().keys()) if __name__ == "__main__": From d0aad70251d58a4fd83330459c08efa7e6a27d7a Mon Sep 17 00:00:00 2001 From: sbilge Date: Thu, 31 Aug 2023 13:03:20 +0000 Subject: [PATCH 05/15] minor refactoring --- .sheet_documentation_template.md.jinja | 11 ++- docs/metadata/worksheets/Analysis.md | 26 +++--- docs/metadata/worksheets/AnalysisProcess.md | 23 ++--- .../worksheets/AnalysisProcessOutputFile.md | 47 +++++----- docs/metadata/worksheets/Biospecimen.md | 50 +++++----- docs/metadata/worksheets/Condition.md | 47 ++++++---- .../worksheets/DataAccessCommittee.md | 13 ++- docs/metadata/worksheets/DataAccessPolicy.md | 40 ++++---- docs/metadata/worksheets/Dataset.md | 21 ++--- docs/metadata/worksheets/Individual.md | 32 ++++--- .../worksheets/LibraryPreparationProtocol.md | 80 +++++++++++----- docs/metadata/worksheets/Publication.md | 43 ++++----- docs/metadata/worksheets/Sample.md | 48 +++++----- docs/metadata/worksheets/SampleFile.md | 47 +++++----- .../worksheets/SequencingExperiment.md | 31 +++---- docs/metadata/worksheets/SequencingProcess.md | 56 +++++------- .../worksheets/SequencingProcessFile.md | 47 +++++----- .../metadata/worksheets/SequencingProtocol.md | 91 +++++++++++-------- docs/metadata/worksheets/Study.md | 28 +++--- docs/metadata/worksheets/StudyFile.md | 47 +++++----- docs/metadata/worksheets/Trio.md | 18 ++-- scripts/update_metadata_docs.py | 65 +++++++------ 22 files changed, 490 insertions(+), 421 deletions(-) diff --git a/.sheet_documentation_template.md.jinja b/.sheet_documentation_template.md.jinja index 6aef8e9..a2f79c0 100644 --- a/.sheet_documentation_template.md.jinja +++ b/.sheet_documentation_template.md.jinja @@ -7,20 +7,25 @@ ## Fields {% for slot in slots %} -**{{ slot.name }}** : {{ slot.description }}
+### ***{{ slot.name }}***
+**description** : {{ slot.description }}
**required** : {{ slot.required }}
{% if 'Enum' in slot.data_type.range %} **data type** : Controlled Vocabulary
{% else %} **data type** : {{ slot.data_type.range }}
{% endif %} - {% if slot.data_type.enum %} + +
+ Permissible Values + | Permissible Values | Description | | --- | --- | {% for val in slot.data_type.enum.permissible_values%} | `{{ val.name }}` | `{{ val.description }}` | {% endfor %} {{ '\n' }} -{% endif %} +
+{% endif %} {% endfor %} diff --git a/docs/metadata/worksheets/Analysis.md b/docs/metadata/worksheets/Analysis.md index afbe295..ffbf1fc 100644 --- a/docs/metadata/worksheets/Analysis.md +++ b/docs/metadata/worksheets/Analysis.md @@ -6,31 +6,27 @@ An Analysis is a data transformation that transforms input data to output data. ## Fields -**title** : The title that describes an entity.
+### ***title***
+**description** : The title that describes an entity.
**required** : None
**data type** : string
- - +### ***description***
**description** : Describing how an Analysis was carried out. (e.g.: computational tools, settings, etc.).
**required** : False
**data type** : string
- - -**type** : The type of the Analysis. Either Reference Alignment (BAM) or Sequence Variation (VCF)
+### ***type***
+**description** : The type of the Analysis. Either Reference Alignment (BAM) or Sequence Variation (VCF)
**required** : False
**data type** : string
- - -**reference_genome** : A published genetic sequence that is used as a reference sequence against which other sequences are compared. Reference genome(s) or annotation(s) used for prior analyses (eg: GRCh38.p13).
+### ***reference_genome***
+**description** : A published genetic sequence that is used as a reference sequence against which other sequences are compared. Reference genome(s) or annotation(s) used for prior analyses (eg: GRCh38.p13).
**required** : True
**data type** : string
- - -**reference_chromosome** : The reference chromosome used for this Analysis.
+### ***reference_chromosome***
+**description** : The reference chromosome used for this Analysis.
**required** : True
**data type** : string
- - -**alias** : The alias for an entity at the time of submission.
+### ***alias***
+**description** : The alias for an entity at the time of submission.
**required** : True
**data type** : string
diff --git a/docs/metadata/worksheets/AnalysisProcess.md b/docs/metadata/worksheets/AnalysisProcess.md index dcd4362..1b4815d 100644 --- a/docs/metadata/worksheets/AnalysisProcess.md +++ b/docs/metadata/worksheets/AnalysisProcess.md @@ -6,26 +6,23 @@ None ## Fields -**analysis** : The Analysis the AnalysisProcess was part of
+### ***analysis***
+**description** : The Analysis the AnalysisProcess was part of
**required** : True
**data type** : Analysis
- - -**study_input_files** : The StudyFile associated used as an input for an entity.
+### ***study_input_files***
+**description** : The StudyFile used as an input for an entity.
**required** : False
**data type** : StudyFile
- - -**sample_input_files** : The SampleFile associated used as an input for an entity.
+### ***sample_input_files***
+**description** : The SampleFile used as an input for an entity.
**required** : False
**data type** : SampleFile
- - -**sequencing_process_input_files** : The SequencingProcessFile associated used as an input for an entity.
+### ***sequencing_process_input_files***
+**description** : The SequencingProcessFile used as an input for an entity.
**required** : False
**data type** : SequencingProcessFile
- - -**alias** : The alias for an entity at the time of submission.
+### ***alias***
+**description** : The alias for an entity at the time of submission.
**required** : True
**data type** : string
diff --git a/docs/metadata/worksheets/AnalysisProcessOutputFile.md b/docs/metadata/worksheets/AnalysisProcessOutputFile.md index b84179b..4cb93b2 100644 --- a/docs/metadata/worksheets/AnalysisProcessOutputFile.md +++ b/docs/metadata/worksheets/AnalysisProcessOutputFile.md @@ -6,20 +6,22 @@ A AnalysisProcessOutputFile is a File that is associated as an output file with ## Fields -**analysis_process** : The AnalysisProcess associated with an entity.
+### ***analysis_process***
+**description** : The AnalysisProcess associated with an entity.
**required** : True
**data type** : AnalysisProcess
- - -**name** : The given filename.
+### ***name***
+**description** : The given filename.
**required** : True
**data type** : string
- - -**format** : The format of the file: BAM, SAM, CRAM, BAI, etc.
+### ***format***
+**description** : The format of the file: BAM, SAM, CRAM, BAI, etc.
**required** : True
**data type** : Controlled Vocabulary
+
+ Permissible Values + | Permissible Values | Description | | --- | --- | | `AGP` | `None` | @@ -50,38 +52,41 @@ A AnalysisProcessOutputFile is a File that is associated as an output file with | `WIG` | `None` | +
-**size** : The size of a file in bytes.
+### ***size***
+**description** : The size of a file in bytes.
**required** : True
**data type** : integer
- - -**checksum** : A computed value which depends on the contents of a block of data and which is transmitted or stored along with the data in order to detect corruption of the data. The receiving system recomputes the checksum based upon the received data and compares this value with the one sent with the data. If the two values are the same, the receiver has some confidence that the data was received correctly.
+### ***checksum***
+**description** : A computed value which depends on the contents of a block of data and which is transmitted or stored along with the data in order to detect corruption of the data. The receiving system recomputes the checksum based upon the received data and compares this value with the one sent with the data. If the two values are the same, the receiver has some confidence that the data was received correctly.
**required** : True
**data type** : string
- - -**forward_or_reverse** : Denotes whether a submitted FASTQ file contains forward (R1) or reverse (R2) reads for paired-end sequencing. The number that identifies each read direction in a paired-end nucleotide sequencing reaction.
+### ***forward_or_reverse***
+**description** : Denotes whether a submitted FASTQ file contains forward (R1) or reverse (R2) reads for paired-end sequencing. The number that identifies each read direction in a paired-end nucleotide sequencing reaction.
**required** : False
**data type** : Controlled Vocabulary
+
+ Permissible Values + | Permissible Values | Description | | --- | --- | | `FORWARD` | `The reads are forward (R1) reads` | | `REVERSE` | `The reads are reverse (R2) reads` | +
-**checksum_type** : The type of algorithm used to generate the checksum of a file.
+### ***checksum_type***
+**description** : The type of algorithm used to generate the checksum of a file.
**required** : True
**data type** : string
- - -**dataset** : The Dataset associated with an entity.
+### ***dataset***
+**description** : The Dataset associated with an entity.
**required** : True
**data type** : Dataset
- - -**alias** : The alias for an entity at the time of submission.
+### ***alias***
+**description** : The alias for an entity at the time of submission.
**required** : True
**data type** : string
diff --git a/docs/metadata/worksheets/Biospecimen.md b/docs/metadata/worksheets/Biospecimen.md index 0d4113b..19d309d 100644 --- a/docs/metadata/worksheets/Biospecimen.md +++ b/docs/metadata/worksheets/Biospecimen.md @@ -6,40 +6,38 @@ A Biospecimen is any natural material taken from a biological entity (usually a ## Fields -**name** : The name for an entity.
+### ***name***
+**description** : The name for an entity.
**required** : False
**data type** : string
- - -**type** : The type of Biospecimen.
+### ***type***
+**description** : The type of Biospecimen.
**required** : False
**data type** : string
- - +### ***description***
**description** : Description of an entity.
**required** : False
**data type** : string
- - -**isolation** : Method or device employed for collecting/isolating a biospecimen or a sample.
+### ***isolation***
+**description** : Method or device employed for collecting/isolating a biospecimen or a sample.
**required** : False
**data type** : string
- - -**storage** : Methods by which a biospecimen or a sample is stored (e.g. frozen in liquid nitrogen).
+### ***storage***
+**description** : Methods by which a biospecimen or a sample is stored (e.g. frozen in liquid nitrogen).
**required** : False
**data type** : string
- - -**individual** : The Individual entity from which this Biospecimen was derived.
+### ***individual***
+**description** : The Individual entity from which this Biospecimen was derived.
**required** : True
**data type** : Individual
- - -**age_at_sampling** : Age of an individual.
+### ***age_at_sampling***
+**description** : Age of an individual.
**required** : True
**data type** : Controlled Vocabulary
+
+ Permissible Values + | Permissible Values | Description | | --- | --- | | `0_TO_5` | `Age between 0 to 5.` | @@ -62,11 +60,16 @@ A Biospecimen is any natural material taken from a biological entity (usually a | `UNKNOWN` | `Age range unknown.` | +
-**vital_status_at_sampling** : Vital Status of an Individual at the point of sampling (eg:'Alive', 'Deceased').
+### ***vital_status_at_sampling***
+**description** : Vital Status of an Individual at the point of sampling (eg:'Alive', 'Deceased').
**required** : False
**data type** : Controlled Vocabulary
+
+ Permissible Values + | Permissible Values | Description | | --- | --- | | `ALIVE` | `Showing characteristics of life; displaying signs of life.` | @@ -74,12 +77,13 @@ A Biospecimen is any natural material taken from a biological entity (usually a | `UNKNOWN` | `Vital status is unknown.` | +
-**tissue** : None
+### ***tissue***
+**description** : None
**required** : True
**data type** : string
- - -**alias** : The alias for an entity at the time of submission.
+### ***alias***
+**description** : The alias for an entity at the time of submission.
**required** : True
**data type** : string
diff --git a/docs/metadata/worksheets/Condition.md b/docs/metadata/worksheets/Condition.md index 32a5303..3946db5 100644 --- a/docs/metadata/worksheets/Condition.md +++ b/docs/metadata/worksheets/Condition.md @@ -6,25 +6,26 @@ An condition that is linked to comparable samples. ## Fields -**title** : The title that describes an entity.
+### ***title***
+**description** : The title that describes an entity.
**required** : None
**data type** : string
- - +### ***description***
**description** : Description of an entity.
**required** : True
**data type** : string
- - -**name** : The name for an entity.
+### ***name***
+**description** : The name for an entity.
**required** : True
**data type** : string
- - -**disease_or_healthy** : Whether a condition corresponds to a disease or a healthy state.
+### ***disease_or_healthy***
+**description** : Whether a condition corresponds to a disease or a healthy state.
**required** : True
**data type** : Controlled Vocabulary
+
+ Permissible Values + | Permissible Values | Description | | --- | --- | | `DISEASE` | `Disease state.` | @@ -32,11 +33,16 @@ An condition that is linked to comparable samples. | `NOT_APPLICABLE` | `The distinction is not applicaple.` | +
-**case_control_status** : Whether a condition corresponds to a treatment or a control.
+### ***case_control_status***
+**description** : Whether a condition corresponds to a treatment or a control.
**required** : True
**data type** : Controlled Vocabulary
+
+ Permissible Values + | Permissible Values | Description | | --- | --- | | `NEITHER_CASE_OR_CONTROL_STATUS` | `None` | @@ -47,11 +53,16 @@ An condition that is linked to comparable samples. | `UNABLE_TO_ASSESS_CASE_OR_CONTROL_STATUS` | `None` | +
-**mutant_or_wildtype** : Whether a condition corresponds to a mutant or a wildtype.
+### ***mutant_or_wildtype***
+**description** : Whether a condition corresponds to a mutant or a wildtype.
**required** : True
**data type** : Controlled Vocabulary
+
+ Permissible Values + | Permissible Values | Description | | --- | --- | | `MUTANT` | `Mutant state.` | @@ -59,17 +70,17 @@ An condition that is linked to comparable samples. | `NOT_APPLICABLE` | `The distinction is not applicaple.` | +
-**study** : The study associated with an entity.
+### ***study***
+**description** : The study associated with an entity.
**required** : True
**data type** : Study
- - -**attributes** : Key/value pairs corresponding to an entity.
+### ***attributes***
+**description** : Key/value pairs corresponding to an entity.
**required** : False
**data type** : Attribute
- - -**alias** : The alias for an entity at the time of submission.
+### ***alias***
+**description** : The alias for an entity at the time of submission.
**required** : True
**data type** : string
diff --git a/docs/metadata/worksheets/DataAccessCommittee.md b/docs/metadata/worksheets/DataAccessCommittee.md index 109af58..4e2edf7 100644 --- a/docs/metadata/worksheets/DataAccessCommittee.md +++ b/docs/metadata/worksheets/DataAccessCommittee.md @@ -6,16 +6,15 @@ A group of members that are delegated to grant access to one or more datasets af ## Fields -**email** : Email of a person.
+### ***email***
+**description** : Email of a person.
**required** : True
**data type** : string
- - -**institute** : The institute a person is affiliated with.
+### ***institute***
+**description** : The institute a person is affiliated with.
**required** : True
**data type** : string
- - -**alias** : The alias for an entity at the time of submission.
+### ***alias***
+**description** : The alias for an entity at the time of submission.
**required** : True
**data type** : string
diff --git a/docs/metadata/worksheets/DataAccessPolicy.md b/docs/metadata/worksheets/DataAccessPolicy.md index 447a7b3..2b2ea21 100644 --- a/docs/metadata/worksheets/DataAccessPolicy.md +++ b/docs/metadata/worksheets/DataAccessPolicy.md @@ -6,35 +6,34 @@ A Data Access Policy specifies under which circumstances, legal or otherwise, a ## Fields -**name** : A name for the Data Access Policy.
+### ***name***
+**description** : A name for the Data Access Policy.
**required** : True
**data type** : string
- - +### ***description***
**description** : A short description for the Data Access Policy.
**required** : True
**data type** : string
- - -**policy_text** : The terms of data use and policy verbiage should be captured here.
+### ***policy_text***
+**description** : The terms of data use and policy verbiage should be captured here.
**required** : True
**data type** : string
- - -**policy_url** : URL for the policy, if available. This is useful if the terms of the policy is made available online at a resolvable URL.
+### ***policy_url***
+**description** : URL for the policy, if available. This is useful if the terms of the policy are made available online at a resolvable URL.
**required** : False
**data type** : string
- - -**data_access_committee** : The Data Access Committee linked to this policy.
+### ***data_access_committee***
+**description** : The Data Access Committee linked to this policy.
**required** : True
**data type** : DataAccessCommittee
- - -**data_use_permission** : Data use permission associated with a policy. Typically one or more terms from DUO and should be descendants of 'DUO:0000001 data use permission'.
+### ***data_use_permission***
+**description** : Data use permission associated with a policy. Typically one or more terms from DUO and should be descendants of 'DUO:0000001 data use permission'.
**required** : True
**data type** : Controlled Vocabulary
+
+ Permissible Values + | Permissible Values | Description | | --- | --- | | `general research use` | `None` | @@ -44,11 +43,16 @@ A Data Access Policy specifies under which circumstances, legal or otherwise, a | `population origins or ancestry research only` | `None` | +
-**data_use_modifiers** : Modifier for Data use permission associated with a policy. Should be descendants of 'DUO:0000017 data use modifier'
+### ***data_use_modifiers***
+**description** : Modifier for Data use permission associated with a policy. Should be descendants of 'DUO:0000017 data use modifier'
**required** : False
**data type** : Controlled Vocabulary
+
+ Permissible Values + | Permissible Values | Description | | --- | --- | | `clinical care use` | `None` | @@ -71,7 +75,9 @@ A Data Access Policy specifies under which circumstances, legal or otherwise, a | `population origins or ancestry research prohibited` | `None` | +
-**alias** : The alias for an entity at the time of submission.
+### ***alias***
+**description** : The alias for an entity at the time of submission.
**required** : True
**data type** : string
diff --git a/docs/metadata/worksheets/Dataset.md b/docs/metadata/worksheets/Dataset.md index 80b6976..35b7f3a 100644 --- a/docs/metadata/worksheets/Dataset.md +++ b/docs/metadata/worksheets/Dataset.md @@ -6,26 +6,23 @@ A Dataset is a collection of Files that is prepared for distribution and is tied ## Fields -**title** : A title for the submitted Dataset.
+### ***title***
+**description** : A title for the submitted Dataset.
**required** : True
**data type** : string
- - +### ***description***
**description** : Description of an entity.
**required** : True
**data type** : string
- - -**types** : The type of a dataset.
+### ***types***
+**description** : The type of a dataset.
**required** : True
**data type** : string
- - -**data_access_policy** : The Data Access Policy that applies to this Dataset.
+### ***data_access_policy***
+**description** : The Data Access Policy that applies to this Dataset.
**required** : True
**data type** : DataAccessPolicy
- - -**alias** : The alias for an entity at the time of submission.
+### ***alias***
+**description** : The alias for an entity at the time of submission.
**required** : True
**data type** : string
diff --git a/docs/metadata/worksheets/Individual.md b/docs/metadata/worksheets/Individual.md index 5d422b1..d8bbdc9 100644 --- a/docs/metadata/worksheets/Individual.md +++ b/docs/metadata/worksheets/Individual.md @@ -6,10 +6,14 @@ An Individual is a Person who is participating in a Study. ## Fields -**sex** : The assemblage of physical properties or qualities by which male is distinguished from female; the physical difference between male and female; the distinguishing peculiarity of male or female.
+### ***sex***
+**description** : The assemblage of physical properties or qualities by which male is distinguished from female; the physical difference between male and female; the distinguishing peculiarity of male or female.
**required** : True
**data type** : Controlled Vocabulary
+
+ Permissible Values + | Permissible Values | Description | | --- | --- | | `FEMALE_SEX_FOR_CLINICAL_USE` | `None` | @@ -19,11 +23,16 @@ An Individual is a Person who is participating in a Study. | `UNKNOWN_SEX_FOR_CLINICAL_USE` | `None` | +
-**karyotype** : The karyotype of an individual if defined.
+### ***karyotype***
+**description** : The karyotype of an individual if defined.
**required** : False
**data type** : Controlled Vocabulary
+
+ Permissible Values + | Permissible Values | Description | | --- | --- | | `46_XY` | `None` | @@ -31,22 +40,21 @@ An Individual is a Person who is participating in a Study. | `OTHER` | `None` | +
-**geographical_region** : The geographical region where the Individual is located. Any demarcated area of the Earth; may be determined by both natural and human boundaries.
+### ***geographical_region***
+**description** : The geographical region where the Individual is located. Any demarcated area of the Earth; may be determined by both natural and human boundaries.
**required** : False
**data type** : string
- - -**ancestries** : A person's descent or lineage, from a person or from a population.
+### ***ancestries***
+**description** : A person's descent or lineage, from a person or from a population.
**required** : False
**data type** : string
- - -**phenotypic_features** : The Phenotypic Feature entity that is associated with this Biospecimen at the time of retrieval from the organism. Typically, a concept from Human Phenotype Ontology. For example, 'HP:0100244' indicates that the Individual - from_which_the_Biospecimen was extracted from - exhibits_'Fibrosarcoma'_as_one_of_its_phenotype.
+### ***phenotypic_features***
+**description** : The Phenotypic Feature entity that is associated with this Biospecimen at the time of retrieval from the organism. Typically, a concept from Human Phenotype Ontology. For example, 'HP:0100244' indicates that the Individual from which the Biospecimen was extracted exhibits 'Fibrosarcoma' as one of its phenotypes.
**required** : False
**data type** : string
- - -**alias** : The alias for an entity at the time of submission.
+### ***alias***
+**description** : The alias for an entity at the time of submission.
**required** : True
**data type** : string
diff --git a/docs/metadata/worksheets/LibraryPreparationProtocol.md b/docs/metadata/worksheets/LibraryPreparationProtocol.md index def4c85..9b214a7 100644 --- a/docs/metadata/worksheets/LibraryPreparationProtocol.md +++ b/docs/metadata/worksheets/LibraryPreparationProtocol.md @@ -6,31 +6,38 @@ Information about the library_preparation of an sequencing experiment. ## Fields +### ***description***
**description** : Description about how a sequencing library was prepared (eg: Library construction method).
**required** : True
**data type** : string
- - -**library_name** : A short name identifying the library to potential users. The same name may refer to multiple versions of the same continually updated library.
+### ***library_name***
+**description** : A short name identifying the library to potential users. The same name may refer to multiple versions of the same continually updated library.
**required** : True
**data type** : string
- - -**library_layout** : Describe whether the library was sequenced in single-end (forward or reverse) or paired-end mode
+### ***library_layout***
+**description** : Describe whether the library was sequenced in single-end (forward or reverse) or paired-end mode
**required** : True
**data type** : Controlled Vocabulary
+
+ Permissible Values + | Permissible Values | Description | | --- | --- | | `SE` | `None` | | `PE` | `None` | +
-**library_type** : Describe the level of omics analysis (eg: Metagenome, transcriptome, etc)
+### ***library_type***
+**description** : Describe the level of omics analysis (eg: Metagenome, transcriptome, etc)
**required** : True
**data type** : Controlled Vocabulary
+
+ Permissible Values + | Permissible Values | Description | | --- | --- | | `WGS` | `None` | @@ -45,11 +52,16 @@ Information about the library_preparation of an sequencing experiment. | `CHROMOSOME_CONFORMATION_CAPTURE` | `None` | +
-**library_selection** : Whether any method was used to select for or against, enrich, or screen the material being sequenced. library_selection method (e.g. random, PCA, cDNA, etc )
+### ***library_selection***
+**description** : Whether any method was used to select for or against, enrich, or screen the material being sequenced. The library_selection method (e.g. random, PCA, cDNA, etc.).
**required** : True
**data type** : Controlled Vocabulary
+
+ Permissible Values + | Permissible Values | Description | | --- | --- | | `5_METHYLCYTIDINE_ANTIBODY_METHOD` | `None` | @@ -85,16 +97,20 @@ Information about the library_preparation of an sequencing experiment. | `OTHER` | `None` | +
-**library_preparation** : The general method for sequencing library_preparation (e.g. KAPA PCR-free).
+### ***library_preparation***
+**description** : The general method for sequencing library_preparation (e.g. KAPA PCR-free).
**required** : True
**data type** : string
- - -**library_preparation_kit_retail_name** : A unique identifier for the kit used to construct a genomic library. This may include the vendor name, kit name and kit version (e.g. Agilent sure select Human Exome V8, Twist RefSeq Exome, etc.)
+### ***library_preparation_kit_retail_name***
+**description** : A unique identifier for the kit used to construct a genomic library. This may include the vendor name, kit name and kit version (e.g. Agilent sure select Human Exome V8, Twist RefSeq Exome, etc.)
**required** : False
**data type** : Controlled Vocabulary
+
+ Permissible Values + | Permissible Values | Description | | --- | --- | | `10X_GENOMICS_CHROMIUM_SINGLE_CELL_3_V2` | `None` | @@ -172,16 +188,20 @@ Information about the library_preparation of an sequencing experiment. | `ULTRALOW_METHYL_SEQ_WITH_TRUE_METHYL_OX_BS_MODULE` | `None` | +
-**library_preparation_kit_manufacturer** : Manufacturer of library_preparation kit
+### ***library_preparation_kit_manufacturer***
+**description** : Manufacturer of library_preparation kit
**required** : False
**data type** : string
- - -**primer** : The type of primer used for reverse transcription, e.g. 'oligo-dT' or 'random' primer. This allows users to identify content of the cDNA library input e.g. enriched for mRNA.
+### ***primer***
+**description** : The type of primer used for reverse transcription, e.g. 'oligo-dT' or 'random' primer. This allows users to identify content of the cDNA library input e.g. enriched for mRNA.
**required** : False
**data type** : Controlled Vocabulary
+
+ Permissible Values + | Permissible Values | Description | | --- | --- | | `OLIGO_D_T` | `None` | @@ -190,11 +210,16 @@ Information about the library_preparation of an sequencing experiment. | `OTHER` | `None` | +
-**end_bias** : The end of the cDNA molecule that is preferentially sequenced, e.g. 3/5 prime tag or end, or the full-length transcript.
+### ***end_bias***
+**description** : The end of the cDNA molecule that is preferentially sequenced, e.g. 3/5 prime tag or end, or the full-length transcript.
**required** : False
**data type** : Controlled Vocabulary
+
+ Permissible Values + | Permissible Values | Description | | --- | --- | | `3_PRIME_END` | `None` | @@ -202,16 +227,20 @@ Information about the library_preparation of an sequencing experiment. | `FULL_LENGTH` | `None` | +
-**target_regions** : Subset of genes or specific regions of the genome, which are most likely to be involved in the phenotype under study.
+### ***target_regions***
+**description** : Subset of genes or specific regions of the genome, which are most likely to be involved in the phenotype under study.
**required** : False
**data type** : string
- - -**rnaseq_strandedness** : The strandedness of the library, whether reads come from both strands of the cDNA or only from the first (antisense) or the second (sense) strand.
+### ***rnaseq_strandedness***
+**description** : The strandedness of the library, whether reads come from both strands of the cDNA or only from the first (antisense) or the second (sense) strand.
**required** : False
**data type** : Controlled Vocabulary
+
+ Permissible Values + | Permissible Values | Description | | --- | --- | | `SENSE` | `None` | @@ -219,12 +248,13 @@ Information about the library_preparation of an sequencing experiment. | `BOTH` | `None` | +
-**attributes** : One or more attributes that further characterizes this library_preparation Protocol.
+### ***attributes***
+**description** : One or more attributes that further characterizes this library_preparation Protocol.
**required** : False
**data type** : Attribute
- - -**alias** : The alias for an entity at the time of submission.
+### ***alias***
+**description** : The alias for an entity at the time of submission.
**required** : True
**data type** : string
diff --git a/docs/metadata/worksheets/Publication.md b/docs/metadata/worksheets/Publication.md index d5c60a7..e79f178 100644 --- a/docs/metadata/worksheets/Publication.md +++ b/docs/metadata/worksheets/Publication.md @@ -6,46 +6,39 @@ The Publication entity represents a publication. While a publication can be any ## Fields -**title** : The title for the Publication.
+### ***title***
+**description** : The title for the Publication.
**required** : False
**data type** : string
- - -**abstract** : The study abstract that describes the goals. Can also hold abstract from a publication related to this study.
+### ***abstract***
+**description** : The study abstract that describes the goals. Can also hold abstract from a publication related to this study.
**required** : False
**data type** : string
- - -**author** : The individual who is responsible for the content of a document version.
+### ***author***
+**description** : The individual who is responsible for the content of a document version.
**required** : False
**data type** : string
- - -**year** : Year in which the paper was published.
+### ***year***
+**description** : Year in which the paper was published.
**required** : False
**data type** : integer
- - -**journal** : Name of the journal.
+### ***journal***
+**description** : Name of the journal.
**required** : False
**data type** : string
- - -**doi** : DOI identifier of the Publication.
+### ***doi***
+**description** : DOI identifier of the Publication.
**required** : True
**data type** : string
- - -**study** : The Study entity associated with this Publication.
+### ***study***
+**description** : The Study entity associated with this Publication.
**required** : True
**data type** : Study
- - -**xref** : One or more cross-references for this Publication.
+### ***xref***
+**description** : One or more cross-references for this Publication.
**required** : False
**data type** : string
- - -**alias** : The alias for an entity at the time of submission.
+### ***alias***
+**description** : The alias for an entity at the time of submission.
**required** : True
**data type** : string
diff --git a/docs/metadata/worksheets/Sample.md b/docs/metadata/worksheets/Sample.md index b6eb285..31b7fa4 100644 --- a/docs/metadata/worksheets/Sample.md +++ b/docs/metadata/worksheets/Sample.md @@ -6,15 +6,18 @@ A sample is a limited quantity of something to be used for testing, analysis, in ## Fields -**name** : Name of the sample (eg:GHGAS_Blood_Sample1 or GHGAS_PBMC_RNAseq_S1).
+### ***name***
+**description** : Name of the sample (e.g. GHGAS_Blood_Sample1 or GHGAS_PBMC_RNAseq_S1).
**required** : True
**data type** : string
- - -**type** : The type of sample.
+### ***type***
+**description** : The type of sample.
**required** : False
**data type** : Controlled Vocabulary
+
+ Permissible Values + | Permissible Values | Description | | --- | --- | | `CF_DNA` | `None` | @@ -31,42 +34,37 @@ A sample is a limited quantity of something to be used for testing, analysis, in | `TOTAL_RNA` | `None` | +
+### ***description***
**description** : Short textual description of the sample (How the sample was collected, sample source, Protocol followed for processing the sample etc).
**required** : True
**data type** : string
- - -**isolation** : Method or device employed for collecting/isolating a biospecimen or a sample.
+### ***isolation***
+**description** : Method or device employed for collecting/isolating a biospecimen or a sample.
**required** : False
**data type** : string
- - -**storage** : Methods by which a biospecimen or a sample is stored (e.g. frozen in liquid nitrogen).
+### ***storage***
+**description** : Methods by which a biospecimen or a sample is stored (e.g. frozen in liquid nitrogen).
**required** : False
**data type** : string
- - -**biospecimen** : The Biospecimen from which this Sample was prepared from.
+### ***biospecimen***
+**description** : The Biospecimen from which this Sample was prepared.
**required** : False
**data type** : Biospecimen
- - -**condition** : The condition associated with an entity.
+### ***condition***
+**description** : The condition associated with an entity.
**required** : True
**data type** : Condition
- - -**xref** : One or more cross-references for this Sample. For example, this Sample may have an EBI BioSamples accession or an EGA Sample accession.
+### ***xref***
+**description** : One or more cross-references for this Sample. For example, this Sample may have an EBI BioSamples accession or an EGA Sample accession.
**required** : False
**data type** : string
- - -**attributes** : Key/value pairs corresponding to an entity.
+### ***attributes***
+**description** : Key/value pairs corresponding to an entity.
**required** : False
**data type** : Attribute
- - -**alias** : The alias for an entity at the time of submission.
+### ***alias***
+**description** : The alias for an entity at the time of submission.
**required** : True
**data type** : string
diff --git a/docs/metadata/worksheets/SampleFile.md b/docs/metadata/worksheets/SampleFile.md index d07232d..4cfdc33 100644 --- a/docs/metadata/worksheets/SampleFile.md +++ b/docs/metadata/worksheets/SampleFile.md @@ -6,20 +6,22 @@ A SampleFile is a File that is associated with a Sample. ## Fields -**sample** : The sample associated with an entity.
+### ***sample***
+**description** : The sample associated with an entity.
**required** : True
**data type** : Sample
- - -**name** : The given filename.
+### ***name***
+**description** : The given filename.
**required** : True
**data type** : string
- - -**format** : The format of the file: BAM, SAM, CRAM, BAI, etc.
+### ***format***
+**description** : The format of the file: BAM, SAM, CRAM, BAI, etc.
**required** : True
**data type** : Controlled Vocabulary
+
+ Permissible Values + | Permissible Values | Description | | --- | --- | | `AGP` | `None` | @@ -50,38 +52,41 @@ A SampleFile is a File that is associated with a Sample. | `WIG` | `None` | +
-**size** : The size of a file in bytes.
+### ***size***
+**description** : The size of a file in bytes.
**required** : True
**data type** : integer
- - -**checksum** : A computed value which depends on the contents of a block of data and which is transmitted or stored along with the data in order to detect corruption of the data. The receiving system recomputes the checksum based upon the received data and compares this value with the one sent with the data. If the two values are the same, the receiver has some confidence that the data was received correctly.
+### ***checksum***
+**description** : A computed value which depends on the contents of a block of data and which is transmitted or stored along with the data in order to detect corruption of the data. The receiving system recomputes the checksum based upon the received data and compares this value with the one sent with the data. If the two values are the same, the receiver has some confidence that the data was received correctly.
**required** : True
**data type** : string
- - -**forward_or_reverse** : Denotes whether a submitted FASTQ file contains forward (R1) or reverse (R2) reads for paired-end sequencing. The number that identifies each read direction in a paired-end nucleotide sequencing reaction.
+### ***forward_or_reverse***
+**description** : Denotes whether a submitted FASTQ file contains forward (R1) or reverse (R2) reads for paired-end sequencing. The number that identifies each read direction in a paired-end nucleotide sequencing reaction.
**required** : False
**data type** : Controlled Vocabulary
+
+ Permissible Values + | Permissible Values | Description | | --- | --- | | `FORWARD` | `The reads are forward (R1) reads` | | `REVERSE` | `The reads are reverse (R2) reads` | +
-**checksum_type** : The type of algorithm used to generate the checksum of a file.
+### ***checksum_type***
+**description** : The type of algorithm used to generate the checksum of a file.
**required** : True
**data type** : string
- - -**dataset** : The Dataset associated with an entity.
+### ***dataset***
+**description** : The Dataset associated with an entity.
**required** : True
**data type** : Dataset
- - -**alias** : The alias for an entity at the time of submission.
+### ***alias***
+**description** : The alias for an entity at the time of submission.
**required** : True
**data type** : string
diff --git a/docs/metadata/worksheets/SequencingExperiment.md b/docs/metadata/worksheets/SequencingExperiment.md index 2871a7a..98fb3f7 100644 --- a/docs/metadata/worksheets/SequencingExperiment.md +++ b/docs/metadata/worksheets/SequencingExperiment.md @@ -6,36 +6,31 @@ An sequencing experiment is an investigation that consists of a coordinated set ## Fields -**title** : Name for the experiment (eg: GHGAE_PBMC_RNAseq).
+### ***title***
+**description** : Name for the experiment (eg: GHGAE_PBMC_RNAseq).
**required** : None
**data type** : string
- - +### ***description***
**description** : A detailed description of the Experiment.
**required** : True
**data type** : string
- - -**type** : The type of sequencing experiment.
+### ***type***
+**description** : The type of sequencing experiment.
**required** : False
**data type** : string
- - -**sequencing_protocol** : The sequencing protocol associated with an entity.
+### ***sequencing_protocol***
+**description** : The sequencing protocol associated with an entity.
**required** : True
**data type** : SequencingProtocol
- - -**library_preparation_protocol** : The library_preparation Protocol associated with an entity.
+### ***library_preparation_protocol***
+**description** : The library_preparation Protocol associated with an entity.
**required** : True
**data type** : LibraryPreparationProtocol
- - -**attributes** : Key/value pairs corresponding to an entity.
+### ***attributes***
+**description** : Key/value pairs corresponding to an entity.
**required** : False
**data type** : Attribute
- - -**alias** : The alias for an entity at the time of submission.
+### ***alias***
+**description** : The alias for an entity at the time of submission.
**required** : True
**data type** : string
diff --git a/docs/metadata/worksheets/SequencingProcess.md b/docs/metadata/worksheets/SequencingProcess.md index eaa5382..197b78f 100644 --- a/docs/metadata/worksheets/SequencingProcess.md +++ b/docs/metadata/worksheets/SequencingProcess.md @@ -6,61 +6,51 @@ A sequencing process linking a sample to sequencing output. ## Fields -**title** : The title that describes an entity.
+### ***title***
+**description** : The title that describes an entity.
**required** : None
**data type** : string
- - +### ***description***
**description** : Description of an entity.
**required** : True
**data type** : string
- - -**name** : The name for an entity.
+### ***name***
+**description** : The name for an entity.
**required** : True
**data type** : string
- - -**sequencing_run_id** : Identifier of the sequencing run. Used for batch correction.
+### ***sequencing_run_id***
+**description** : Identifier of the sequencing run. Used for batch correction.
**required** : False
**data type** : string
- - -**sequencing_lane_id** : Identifier of the sequencing lane. Used for batch correction.
+### ***sequencing_lane_id***
+**description** : Identifier of the sequencing lane. Used for batch correction.
**required** : False
**data type** : string
- - -**sequencing_machine_id** : Identifier of the sequencing machine. Used for batch correction.
+### ***sequencing_machine_id***
+**description** : Identifier of the sequencing machine. Used for batch correction.
**required** : False
**data type** : string
- - -**sequencing_experiment** : The sequencing experiment associated with an entity.
+### ***sequencing_experiment***
+**description** : The sequencing experiment associated with an entity.
**required** : True
**data type** : SequencingExperiment
- - -**index_sequence** : A unique nucleotide sequence that is added to a sample during library_preparation to serve as a unique identifier for the sample.
+### ***index_sequence***
+**description** : A unique nucleotide sequence that is added to a sample during library_preparation to serve as a unique identifier for the sample.
**required** : False
**data type** : string
- - -**lane_number** : The numerical identifier for the lane or machine unit where a sample was located during nucleotide sequencing.
+### ***lane_number***
+**description** : The numerical identifier for the lane or machine unit where a sample was located during nucleotide sequencing.
**required** : False
**data type** : string
- - -**sample** : The sample associated with an entity.
+### ***sample***
+**description** : The sample associated with an entity.
**required** : True
**data type** : Sample
- - -**attributes** : Key/value pairs corresponding to an entity.
+### ***attributes***
+**description** : Key/value pairs corresponding to an entity.
**required** : False
**data type** : Attribute
- - -**alias** : The alias for an entity at the time of submission.
+### ***alias***
+**description** : The alias for an entity at the time of submission.
**required** : True
**data type** : string
diff --git a/docs/metadata/worksheets/SequencingProcessFile.md b/docs/metadata/worksheets/SequencingProcessFile.md index 060bc4a..fec5548 100644 --- a/docs/metadata/worksheets/SequencingProcessFile.md +++ b/docs/metadata/worksheets/SequencingProcessFile.md @@ -6,20 +6,22 @@ A SequencingProcessFile is a File that is associated with a SequencingProcess. ## Fields -**sequencing_process** : The SequencingProcess associated with an entity.
+### ***sequencing_process***
+**description** : The SequencingProcess associated with an entity.
**required** : True
**data type** : SequencingProcess
- - -**name** : The given filename.
+### ***name***
+**description** : The given filename.
**required** : True
**data type** : string
- - -**format** : The format of the file: BAM, SAM, CRAM, BAI, etc.
+### ***format***
+**description** : The format of the file: BAM, SAM, CRAM, BAI, etc.
**required** : True
**data type** : Controlled Vocabulary
+
+ Permissible Values + | Permissible Values | Description | | --- | --- | | `AGP` | `None` | @@ -50,38 +52,41 @@ A SequencingProcessFile is a File that is associated with a SequencingProcess. | `WIG` | `None` | +
-**size** : The size of a file in bytes.
+### ***size***
+**description** : The size of a file in bytes.
**required** : True
**data type** : integer
- - -**checksum** : A computed value which depends on the contents of a block of data and which is transmitted or stored along with the data in order to detect corruption of the data. The receiving system recomputes the checksum based upon the received data and compares this value with the one sent with the data. If the two values are the same, the receiver has some confidence that the data was received correctly.
+### ***checksum***
+**description** : A computed value which depends on the contents of a block of data and which is transmitted or stored along with the data in order to detect corruption of the data. The receiving system recomputes the checksum based upon the received data and compares this value with the one sent with the data. If the two values are the same, the receiver has some confidence that the data was received correctly.
**required** : True
**data type** : string
- - -**forward_or_reverse** : Denotes whether a submitted FASTQ file contains forward (R1) or reverse (R2) reads for paired-end sequencing. The number that identifies each read direction in a paired-end nucleotide sequencing reaction.
+### ***forward_or_reverse***
+**description** : Denotes whether a submitted FASTQ file contains forward (R1) or reverse (R2) reads for paired-end sequencing. The number that identifies each read direction in a paired-end nucleotide sequencing reaction.
**required** : False
**data type** : Controlled Vocabulary
+
+ Permissible Values + | Permissible Values | Description | | --- | --- | | `FORWARD` | `The reads are forward (R1) reads` | | `REVERSE` | `The reads are reverse (R2) reads` | +
-**checksum_type** : The type of algorithm used to generate the checksum of a file.
+### ***checksum_type***
+**description** : The type of algorithm used to generate the checksum of a file.
**required** : True
**data type** : string
- - -**dataset** : The Dataset associated with an entity.
+### ***dataset***
+**description** : The Dataset associated with an entity.
**required** : True
**data type** : Dataset
- - -**alias** : The alias for an entity at the time of submission.
+### ***alias***
+**description** : The alias for an entity at the time of submission.
**required** : True
**data type** : string
diff --git a/docs/metadata/worksheets/SequencingProtocol.md b/docs/metadata/worksheets/SequencingProtocol.md index eacb3aa..0bcbc63 100644 --- a/docs/metadata/worksheets/SequencingProtocol.md +++ b/docs/metadata/worksheets/SequencingProtocol.md @@ -6,20 +6,22 @@ Information about the sequencing of a sample. ## Fields +### ***description***
**description** : Description about the sequencing Protocol (eg: mRNA-seq, Whole exome long-read sequencing etc).
**required** : True
**data type** : string
- - -**type** : Type of the sequencing Protocol (eg: mRNA-seq, Whole exome long-read sequencing etc).
+### ***type***
+**description** : Type of the sequencing Protocol (eg: mRNA-seq, Whole exome long-read sequencing etc).
**required** : None
**data type** : string
- - -**instrument_model** : The name and model of the technology platform used to perform sequencing.
+### ***instrument_model***
+**description** : The name and model of the technology platform used to perform sequencing.
**required** : True
**data type** : Controlled Vocabulary
+
+ Permissible Values + | Permissible Values | Description | | --- | --- | | `ILLUMINA_HI_SCAN` | `None` | @@ -67,31 +69,32 @@ Information about the sequencing of a sample. | `OTHER` | `None` | +
-**sequencing_center** : Center where sample was sequenced.
+### ***sequencing_center***
+**description** : Center where sample was sequenced.
**required** : False
**data type** : string
- - -**sequencing_read_length** : Length of sequencing reads (eg: Long or short or actual number of the read length etc). The number of nucleotides successfully ordered from each side of a nucleic acid fragment obtained after the completion of a sequencing process
+### ***sequencing_read_length***
+**description** : Length of sequencing reads (eg: Long or short or actual number of the read length etc). The number of nucleotides successfully ordered from each side of a nucleic acid fragment obtained after the completion of a sequencing process
**required** : False
**data type** : string
- - -**target_coverage** : Mean coverage for whole genome sequencing, or mean target coverage for whole exome and targeted sequencing. The number of times a particular locus (site, nucleotide, amplicon, region) was sequenced.
+### ***target_coverage***
+**description** : Mean coverage for whole genome sequencing, or mean target coverage for whole exome and targeted sequencing. The number of times a particular locus (site, nucleotide, amplicon, region) was sequenced.
**required** : False
**data type** : string
- - -**flow_cell_id** : Flow Cell ID (eg: Experiment ID_Cell 1_Lane_1). The barcode assigned to a flow cell used in nucleotide sequencing.
+### ***flow_cell_id***
+**description** : Flow Cell ID (eg: Experiment ID_Cell 1_Lane_1). The barcode assigned to a flow cell used in nucleotide sequencing.
**required** : False
**data type** : string
- - -**flow_cell_type** : Type of flow cell used (e.g. S4, S2 for NovaSeq; PromethION, Flongle for Nanopore). Aparatus in the fluidic subsystem where the sheath and sample meet. Can be one of several types; jet-in-air, quartz cuvette, or a hybrid of the two. The sample flows through the center of a fluid column of sheath fluid in the flow cell.
+### ***flow_cell_type***
+**description** : Type of flow cell used (e.g. S4, S2 for NovaSeq; PromethION, Flongle for Nanopore). Apparatus in the fluidic subsystem where the sheath and sample meet. Can be one of several types; jet-in-air, quartz cuvette, or a hybrid of the two. The sample flows through the center of a fluid column of sheath fluid in the flow cell.
**required** : False
**data type** : Controlled Vocabulary
+
+ Permissible Values + | Permissible Values | Description | | --- | --- | | `ILLUMINA_NOVA_SEQ_S2` | `None` | @@ -103,11 +106,16 @@ Information about the sequencing of a sample. | `OTHER` | `None` | +
-**umi_barcode_read** : The type of read that contains the UMI barcode (Eg: index1/index2/read1/read2).
+### ***umi_barcode_read***
+**description** : The type of read that contains the UMI barcode (Eg: index1/index2/read1/read2).
**required** : False
**data type** : Controlled Vocabulary
+
+ Permissible Values + | Permissible Values | Description | | --- | --- | | `INDEX1` | `None` | @@ -116,21 +124,24 @@ Information about the sequencing of a sample. | `READ2` | `None` | +
-**umi_barcode_offset** : The offset in sequence of the UMI identifying barcode. (E.g. '16').
+### ***umi_barcode_offset***
+**description** : The offset in sequence of the UMI identifying barcode. (E.g. '16').
**required** : False
**data type** : string
- - -**umi_barcode_size** : The size of the UMI identifying barcode (Eg. '10').
+### ***umi_barcode_size***
+**description** : The size of the UMI identifying barcode (Eg. '10').
**required** : False
**data type** : string
- - -**cell_barcode_read** : The type of read that contains the cell barcode (eg: index1/index2/read1/read2).
+### ***cell_barcode_read***
+**description** : The type of read that contains the cell barcode (eg: index1/index2/read1/read2).
**required** : False
**data type** : Controlled Vocabulary
+
+ Permissible Values + | Permissible Values | Description | | --- | --- | | `INDEX1` | `None` | @@ -139,21 +150,24 @@ Information about the sequencing of a sample. | `READ2` | `None` | +
-**cell_barcode_offset** : The offset in sequence of the cell identifying barcode. (Eg. '0').
+### ***cell_barcode_offset***
+**description** : The offset in sequence of the cell identifying barcode. (Eg. '0').
**required** : False
**data type** : string
- - -**cell_barcode_size** : The size of the cell identifying barcode (E.g. '16').
+### ***cell_barcode_size***
+**description** : The size of the cell identifying barcode (E.g. '16').
**required** : False
**data type** : string
- - -**sample_barcode_read** : The type of read that contains the sample barcode (eg: index1/index2/read1/read2).
+### ***sample_barcode_read***
+**description** : The type of read that contains the sample barcode (eg: index1/index2/read1/read2).
**required** : False
**data type** : Controlled Vocabulary
+
+ Permissible Values + | Permissible Values | Description | | --- | --- | | `INDEX1` | `None` | @@ -161,12 +175,13 @@ Information about the sequencing of a sample. | `OTHER` | `None` | +
-**attributes** : One or more attributes that further characterizes this Sequencing Protocol.
+### ***attributes***
+**description** : One or more attributes that further characterizes this Sequencing Protocol.
**required** : False
**data type** : Attribute
- - -**alias** : The alias for an entity at the time of submission.
+### ***alias***
+**description** : The alias for an entity at the time of submission.
**required** : True
**data type** : string
diff --git a/docs/metadata/worksheets/Study.md b/docs/metadata/worksheets/Study.md index 2f78a1e..0b3b1c6 100644 --- a/docs/metadata/worksheets/Study.md +++ b/docs/metadata/worksheets/Study.md @@ -6,20 +6,22 @@ Studies are experimental investigations of a particular phenomenon. It involves ## Fields -**title** : A comprehensive title for the study.
+### ***title***
+**description** : A comprehensive title for the study.
**required** : True
**data type** : string
- - +### ***description***
**description** : A detailed description (abstract) that describes the goals of this Study.
**required** : True
**data type** : string
- - -**type** : The type of Study. For example, 'Cancer Genomics', 'Epigenetics', 'Exome Sequencing'.
+### ***type***
+**description** : The type of Study. For example, 'Cancer Genomics', 'Epigenetics', 'Exome Sequencing'.
**required** : True
**data type** : Controlled Vocabulary
+
+ Permissible Values + | Permissible Values | Description | | --- | --- | | `CANCER_GENOMICS` | `None` | @@ -40,17 +42,17 @@ Studies are experimental investigations of a particular phenomenon. It involves | `GWAS` | `None` | +
-**affiliations** : The Institution(s) associated with an entity.
+### ***affiliations***
+**description** : The Institution(s) associated with an entity.
**required** : True
**data type** : string
- - -**attributes** : Custom key/value pairs that further characterizes the Study. (e.g.: approaches - single-cell,_bulk_etc)
+### ***attributes***
+**description** : Custom key/value pairs that further characterizes the Study. (e.g.: approaches - single-cell,_bulk_etc)
**required** : False
**data type** : Attribute
- - -**alias** : The alias for an entity at the time of submission.
+### ***alias***
+**description** : The alias for an entity at the time of submission.
**required** : True
**data type** : string
diff --git a/docs/metadata/worksheets/StudyFile.md b/docs/metadata/worksheets/StudyFile.md index b1d83a8..789a5ce 100644 --- a/docs/metadata/worksheets/StudyFile.md +++ b/docs/metadata/worksheets/StudyFile.md @@ -6,20 +6,22 @@ A StudyFile is a File that is associated with a Study. ## Fields -**study** : The study associated with an entity.
+### ***study***
+**description** : The study associated with an entity.
**required** : True
**data type** : Study
- - -**name** : The given filename.
+### ***name***
+**description** : The given filename.
**required** : True
**data type** : string
- - -**format** : The format of the file: BAM, SAM, CRAM, BAI, etc.
+### ***format***
+**description** : The format of the file: BAM, SAM, CRAM, BAI, etc.
**required** : True
**data type** : Controlled Vocabulary
+
+ Permissible Values + | Permissible Values | Description | | --- | --- | | `AGP` | `None` | @@ -50,38 +52,41 @@ A StudyFile is a File that is associated with a Study. | `WIG` | `None` | +
-**size** : The size of a file in bytes.
+### ***size***
+**description** : The size of a file in bytes.
**required** : True
**data type** : integer
- - -**checksum** : A computed value which depends on the contents of a block of data and which is transmitted or stored along with the data in order to detect corruption of the data. The receiving system recomputes the checksum based upon the received data and compares this value with the one sent with the data. If the two values are the same, the receiver has some confidence that the data was received correctly.
+### ***checksum***
+**description** : A computed value which depends on the contents of a block of data and which is transmitted or stored along with the data in order to detect corruption of the data. The receiving system recomputes the checksum based upon the received data and compares this value with the one sent with the data. If the two values are the same, the receiver has some confidence that the data was received correctly.
**required** : True
**data type** : string
- - -**forward_or_reverse** : Denotes whether a submitted FASTQ file contains forward (R1) or reverse (R2) reads for paired-end sequencing. The number that identifies each read direction in a paired-end nucleotide sequencing reaction.
+### ***forward_or_reverse***
+**description** : Denotes whether a submitted FASTQ file contains forward (R1) or reverse (R2) reads for paired-end sequencing. The number that identifies each read direction in a paired-end nucleotide sequencing reaction.
**required** : False
**data type** : Controlled Vocabulary
+
+ Permissible Values + | Permissible Values | Description | | --- | --- | | `FORWARD` | `The reads are forward (R1) reads` | | `REVERSE` | `The reads are reverse (R2) reads` | +
-**checksum_type** : The type of algorithm used to generate the checksum of a file.
+### ***checksum_type***
+**description** : The type of algorithm used to generate the checksum of a file.
**required** : True
**data type** : string
- - -**dataset** : The Dataset associated with an entity.
+### ***dataset***
+**description** : The Dataset associated with an entity.
**required** : True
**data type** : Dataset
- - -**alias** : The alias for an entity at the time of submission.
+### ***alias***
+**description** : The alias for an entity at the time of submission.
**required** : True
**data type** : string
diff --git a/docs/metadata/worksheets/Trio.md b/docs/metadata/worksheets/Trio.md index ed4839b..a7fc52b 100644 --- a/docs/metadata/worksheets/Trio.md +++ b/docs/metadata/worksheets/Trio.md @@ -6,21 +6,19 @@ A trio is defined by three individuals representing an individual and their pare ## Fields -**mother** : The mother of an individual.
+### ***mother***
+**description** : The mother of an individual.
**required** : True
**data type** : Individual
- - -**father** : The father of an individual.
+### ***father***
+**description** : The father of an individual.
**required** : True
**data type** : Individual
- - -**child** : The child of two individuals.
+### ***child***
+**description** : The child of two individuals.
**required** : True
**data type** : Individual
- - -**alias** : The alias for an entity at the time of submission.
+### ***alias***
+**description** : The alias for an entity at the time of submission.
**required** : True
**data type** : string
diff --git a/scripts/update_metadata_docs.py b/scripts/update_metadata_docs.py index c8f9e38..040086d 100644 --- a/scripts/update_metadata_docs.py +++ b/scripts/update_metadata_docs.py @@ -72,25 +72,39 @@ def main_workbook(self): ) -def load_config(config_path=CONFIG_PATH) -> dict: - """Loads config file""" +class IOOperations: + """bundles input operation functions""" - try: - with open(config_path, "r", encoding="utf8") as config_file: - return yaml.safe_load(config_file) - except FileNotFoundError as exc: - raise WorkbookConfigurationNotFound( - f"Workbook configuration not found at: {config_path}" - ) from exc + config_path: Path = CONFIG_PATH + schema_url: str = SCHEMA_URL + output_dir: Path = DOCS_DIR + @property + def load_config(self) -> dict: + """Loads config file""" + + try: + with open(self.config_path, "r", encoding="utf8") as config_file: + return yaml.safe_load(config_file) + except FileNotFoundError as exc: + raise WorkbookConfigurationNotFound( + f"Workbook configuration not found at: {self.config_path}" + ) from exc + + @property + def load_schema(self): + """Loads schema""" + + schema_config = requests.get(self.schema_url, timeout=5) + if schema_config.status_code == 200: + return SchemaView(schema_config.text) + raise SchemaNotLoaded(f"Schema could not be loaded from {self.schema_url}") -def load_schema(schema_url=SCHEMA_URL): - """Loads schema""" + def create_doc_file(self, name: str, content: str) -> None: + """Creates a markdown file for a given sheet and content""" - schema_config = requests.get(schema_url, timeout=5) - if schema_config.status_code == 200: - return SchemaView(schema_config.text) - raise SchemaNotLoaded(f"Schema could not be loaded from {SCHEMA_URL}") + with open(self.output_dir / (name + ".md"), mode="w", encoding="utf8") as file: + file.write(content) def extract_permissible_values(schema: SchemaView, slot_range: Union[str, None]): @@ -122,7 +136,6 @@ def extract_slots_from( return [ { "name": slot.name, - "alias": 
slot.alias, "description": slot.description, "data_type": { "range": slot.range, @@ -158,28 +171,20 @@ def generate_markdown(content: dict) -> str: return template.render(content) -def create_doc_file(out_dir: Path, name: str, content: str) -> None: - """Creates a markdown file for a given sheet and content""" - - with open(out_dir / (name + ".md"), mode="w", encoding="utf8") as file: - file.write(content) - - def main(): """Patches things together""" - - config = Config.model_validate(load_config()) + io_ops = IOOperations() + config = Config.model_validate(io_ops.load_config) if config.main_workbook is None: raise MainSheetNotIdentified worksheet_names = config.main_workbook.worksheets - schema = load_schema() - workbook = generate_workbook(schema, worksheet_names, extract_slots_from) + workbook = generate_workbook( + io_ops.load_schema, worksheet_names, extract_slots_from + ) for sheet in workbook: - # print(sheet) - # print("\n\n") - create_doc_file(DOCS_DIR, sheet["name"], generate_markdown(sheet)) + IOOperations.create_doc_file(io_ops, sheet["name"], generate_markdown(sheet)) if __name__ == "__main__": From b8799a58114855e71ab89d7a82d6b785c96fda31 Mon Sep 17 00:00:00 2001 From: sbilge Date: Fri, 1 Sep 2023 14:24:17 +0000 Subject: [PATCH 06/15] doc page summarizing submission content --- docs/metadata/submission.md | 355 ++++++++++++++++++ .../.sheet_documentation_template.md.jinja | 0 ...submission_documentation_template.md.jinja | 17 + scripts/update_metadata_docs.py | 74 +++- 4 files changed, 426 insertions(+), 20 deletions(-) create mode 100644 docs/metadata/submission.md rename .sheet_documentation_template.md.jinja => docs/templates/.sheet_documentation_template.md.jinja (100%) create mode 100644 docs/templates/.submission_documentation_template.md.jinja diff --git a/docs/metadata/submission.md b/docs/metadata/submission.md new file mode 100644 index 0000000..481de07 --- /dev/null +++ b/docs/metadata/submission.md @@ -0,0 +1,355 @@ +# Submission + + +## 
ghga_submission_full.xlsx
+ +Submission components: +---------------------- + + +Study worksheet: + +* [https://ghga-de.github.io/docs/metadata/overviewworksheets/Study.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/Study.md)
+ + +StudyFile worksheet: + +* [https://ghga-de.github.io/docs/metadata/overviewworksheets/StudyFile.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/StudyFile.md)
+ + +Sample worksheet: + +* [https://ghga-de.github.io/docs/metadata/overviewworksheets/Sample.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/Sample.md)
+ + +SampleFile worksheet: + +* [https://ghga-de.github.io/docs/metadata/overviewworksheets/SampleFile.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/SampleFile.md)
+ + +Condition worksheet: + +* [https://ghga-de.github.io/docs/metadata/overviewworksheets/Condition.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/Condition.md)
+ + +Biospecimen worksheet: + +* [https://ghga-de.github.io/docs/metadata/overviewworksheets/Biospecimen.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/Biospecimen.md)
+ + +Individual worksheet: + +* [https://ghga-de.github.io/docs/metadata/overviewworksheets/Individual.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/Individual.md)
+ + +Trio worksheet: + +* [https://ghga-de.github.io/docs/metadata/overviewworksheets/Trio.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/Trio.md)
+ + +LibraryPreparationProtocol worksheet: + +* [https://ghga-de.github.io/docs/metadata/overviewworksheets/LibraryPreparationProtocol.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/LibraryPreparationProtocol.md)
+ + +SequencingProtocol worksheet: + +* [https://ghga-de.github.io/docs/metadata/overviewworksheets/SequencingProtocol.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/SequencingProtocol.md)
+ + +SequencingExperiment worksheet: + +* [https://ghga-de.github.io/docs/metadata/overviewworksheets/SequencingExperiment.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/SequencingExperiment.md)
+ + +SequencingProcess worksheet: + +* [https://ghga-de.github.io/docs/metadata/overviewworksheets/SequencingProcess.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/SequencingProcess.md)
+ + +SequencingProcessFile worksheet: + +* [https://ghga-de.github.io/docs/metadata/overviewworksheets/SequencingProcessFile.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/SequencingProcessFile.md)
+ + +Analysis worksheet: + +* [https://ghga-de.github.io/docs/metadata/overviewworksheets/Analysis.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/Analysis.md)
+ + +AnalysisProcess worksheet: + +* [https://ghga-de.github.io/docs/metadata/overviewworksheets/AnalysisProcess.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/AnalysisProcess.md)
+ + +AnalysisProcessOutputFile worksheet: + +* [https://ghga-de.github.io/docs/metadata/overviewworksheets/AnalysisProcessOutputFile.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/AnalysisProcessOutputFile.md)
+ + +Dataset worksheet: + +* [https://ghga-de.github.io/docs/metadata/overviewworksheets/Dataset.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/Dataset.md)
+ + +DataAccessPolicy worksheet: + +* [https://ghga-de.github.io/docs/metadata/overviewworksheets/DataAccessPolicy.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/DataAccessPolicy.md)
+ + +DataAccessCommittee worksheet: + +* [https://ghga-de.github.io/docs/metadata/overviewworksheets/DataAccessCommittee.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/DataAccessCommittee.md)
+ + +Publication worksheet: + +* [https://ghga-de.github.io/docs/metadata/overviewworksheets/Publication.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/Publication.md)
+ + + +## ghga_submission_minimal.xlsx
+ +Submission components: +---------------------- + + +Study worksheet: + +* [https://ghga-de.github.io/docs/metadata/overviewworksheets/Study.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/Study.md)
+ + +StudyFile worksheet: + +* [https://ghga-de.github.io/docs/metadata/overviewworksheets/StudyFile.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/StudyFile.md)
+ + +Dataset worksheet: + +* [https://ghga-de.github.io/docs/metadata/overviewworksheets/Dataset.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/Dataset.md)
+ + +DataAccessPolicy worksheet: + +* [https://ghga-de.github.io/docs/metadata/overviewworksheets/DataAccessPolicy.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/DataAccessPolicy.md)
+ + +DataAccessCommittee worksheet: + +* [https://ghga-de.github.io/docs/metadata/overviewworksheets/DataAccessCommittee.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/DataAccessCommittee.md)
+ + +Publication worksheet: + +* [https://ghga-de.github.io/docs/metadata/overviewworksheets/Publication.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/Publication.md)
+ + + +## ghga_submission_sample.xlsx
+ +Submission components: +---------------------- + + +Study worksheet: + +* [https://ghga-de.github.io/docs/metadata/overviewworksheets/Study.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/Study.md)
+ + +StudyFile worksheet: + +* [https://ghga-de.github.io/docs/metadata/overviewworksheets/StudyFile.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/StudyFile.md)
+ + +Sample worksheet: + +* [https://ghga-de.github.io/docs/metadata/overviewworksheets/Sample.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/Sample.md)
+ + +SampleFile worksheet: + +* [https://ghga-de.github.io/docs/metadata/overviewworksheets/SampleFile.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/SampleFile.md)
+ + +Condition worksheet: + +* [https://ghga-de.github.io/docs/metadata/overviewworksheets/Condition.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/Condition.md)
+ + +Dataset worksheet: + +* [https://ghga-de.github.io/docs/metadata/overviewworksheets/Dataset.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/Dataset.md)
+ + +DataAccessPolicy worksheet: + +* [https://ghga-de.github.io/docs/metadata/overviewworksheets/DataAccessPolicy.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/DataAccessPolicy.md)
+ + +DataAccessCommittee worksheet: + +* [https://ghga-de.github.io/docs/metadata/overviewworksheets/DataAccessCommittee.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/DataAccessCommittee.md)
+ + +Publication worksheet: + +* [https://ghga-de.github.io/docs/metadata/overviewworksheets/Publication.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/Publication.md)
+ + + +## ghga_submission_individual.xlsx
+ +Submission components: +---------------------- + + +Study worksheet: + +* [https://ghga-de.github.io/docs/metadata/overviewworksheets/Study.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/Study.md)
+ + +StudyFile worksheet: + +* [https://ghga-de.github.io/docs/metadata/overviewworksheets/StudyFile.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/StudyFile.md)
+ + +Sample worksheet: + +* [https://ghga-de.github.io/docs/metadata/overviewworksheets/Sample.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/Sample.md)
+ + +SampleFile worksheet: + +* [https://ghga-de.github.io/docs/metadata/overviewworksheets/SampleFile.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/SampleFile.md)
+ + +Condition worksheet: + +* [https://ghga-de.github.io/docs/metadata/overviewworksheets/Condition.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/Condition.md)
+ + +Biospecimen worksheet: + +* [https://ghga-de.github.io/docs/metadata/overviewworksheets/Biospecimen.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/Biospecimen.md)
+ + +Individual worksheet: + +* [https://ghga-de.github.io/docs/metadata/overviewworksheets/Individual.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/Individual.md)
+ + +Trio worksheet: + +* [https://ghga-de.github.io/docs/metadata/overviewworksheets/Trio.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/Trio.md)
+ + +Dataset worksheet: + +* [https://ghga-de.github.io/docs/metadata/overviewworksheets/Dataset.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/Dataset.md)
+ + +DataAccessPolicy worksheet: + +* [https://ghga-de.github.io/docs/metadata/overviewworksheets/DataAccessPolicy.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/DataAccessPolicy.md)
+ + +DataAccessCommittee worksheet: + +* [https://ghga-de.github.io/docs/metadata/overviewworksheets/DataAccessCommittee.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/DataAccessCommittee.md)
+ + +Publication worksheet: + +* [https://ghga-de.github.io/docs/metadata/overviewworksheets/Publication.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/Publication.md)
+ + + +## ghga_submission_seq.xlsx
+ +Submission components: +---------------------- + + +Study worksheet: + +* [https://ghga-de.github.io/docs/metadata/overviewworksheets/Study.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/Study.md)
+ + +StudyFile worksheet: + +* [https://ghga-de.github.io/docs/metadata/overviewworksheets/StudyFile.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/StudyFile.md)
+ + +Sample worksheet: + +* [https://ghga-de.github.io/docs/metadata/overviewworksheets/Sample.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/Sample.md)
+ + +SampleFile worksheet: + +* [https://ghga-de.github.io/docs/metadata/overviewworksheets/SampleFile.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/SampleFile.md)
+ + +Condition worksheet: + +* [https://ghga-de.github.io/docs/metadata/overviewworksheets/Condition.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/Condition.md)
+ + +Biospecimen worksheet: + +* [https://ghga-de.github.io/docs/metadata/overviewworksheets/Biospecimen.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/Biospecimen.md)
+ + +Individual worksheet: + +* [https://ghga-de.github.io/docs/metadata/overviewworksheets/Individual.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/Individual.md)
+ + +Trio worksheet: + +* [https://ghga-de.github.io/docs/metadata/overviewworksheets/Trio.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/Trio.md)
+ + +LibraryPreparationProtocol worksheet: + +* [https://ghga-de.github.io/docs/metadata/overviewworksheets/LibraryPreparationProtocol.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/LibraryPreparationProtocol.md)
+ + +SequencingProtocol worksheet: + +* [https://ghga-de.github.io/docs/metadata/overviewworksheets/SequencingProtocol.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/SequencingProtocol.md)
+ + +SequencingExperiment worksheet: + +* [https://ghga-de.github.io/docs/metadata/overviewworksheets/SequencingExperiment.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/SequencingExperiment.md)
+ + +SequencingProcess worksheet: + +* [https://ghga-de.github.io/docs/metadata/overviewworksheets/SequencingProcess.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/SequencingProcess.md)
+ + +SequencingProcessFile worksheet: + +* [https://ghga-de.github.io/docs/metadata/overviewworksheets/SequencingProcessFile.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/SequencingProcessFile.md)
+ + +Dataset worksheet: + +* [https://ghga-de.github.io/docs/metadata/overviewworksheets/Dataset.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/Dataset.md)
+ + +DataAccessPolicy worksheet: + +* [https://ghga-de.github.io/docs/metadata/overviewworksheets/DataAccessPolicy.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/DataAccessPolicy.md)
+ + +DataAccessCommittee worksheet: + +* [https://ghga-de.github.io/docs/metadata/overviewworksheets/DataAccessCommittee.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/DataAccessCommittee.md)
+ + +Publication worksheet: + +* [https://ghga-de.github.io/docs/metadata/overviewworksheets/Publication.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/Publication.md)
diff --git a/.sheet_documentation_template.md.jinja b/docs/templates/.sheet_documentation_template.md.jinja similarity index 100% rename from .sheet_documentation_template.md.jinja rename to docs/templates/.sheet_documentation_template.md.jinja diff --git a/docs/templates/.submission_documentation_template.md.jinja b/docs/templates/.submission_documentation_template.md.jinja new file mode 100644 index 0000000..153687a --- /dev/null +++ b/docs/templates/.submission_documentation_template.md.jinja @@ -0,0 +1,17 @@ +# Submission + +{% for workbook in workbooks %} +## {{ workbook.file_name }}
+ +Submission components: +---------------------- + +{% for sheet in workbook.worksheets %} + +{{ sheet }} worksheet: + +* [https://ghga-de.github.io/docs/metadata/overviewworksheets/{{ sheet }}.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/{{ sheet }}.md)
+ +{% endfor %} + +{% endfor %} diff --git a/scripts/update_metadata_docs.py b/scripts/update_metadata_docs.py index 040086d..e6594a3 100644 --- a/scripts/update_metadata_docs.py +++ b/scripts/update_metadata_docs.py @@ -27,10 +27,11 @@ HERE = Path(__file__).parent.resolve() ROOT = HERE.parent -DOCS_DIR = ROOT / "docs" / "metadata" / "worksheets" +SHEET_DIR = ROOT / "docs" / "metadata" / "worksheets" +SUBMISSION_DIR = ROOT / "docs" / "metadata" CONFIG_PATH = ROOT / ".workbook_config.yaml" +TEMPLATE_DIR = ROOT / "docs" / "templates" -TEMPLATE = ".sheet_documentation_template.md.jinja" SCHEMA_URL = "https://raw.githubusercontent.com/ghga-de/ghga-metadata-schema/main/src/schema/submission.yaml" # pylint: disable=line-too-long @@ -72,12 +73,12 @@ def main_workbook(self): ) -class IOOperations: +class LoadOperations: """bundles input operation functions""" config_path: Path = CONFIG_PATH schema_url: str = SCHEMA_URL - output_dir: Path = DOCS_DIR + template_dir: Path = TEMPLATE_DIR @property def load_config(self) -> dict: @@ -100,13 +101,23 @@ def load_schema(self): return SchemaView(schema_config.text) raise SchemaNotLoaded(f"Schema could not be loaded from {self.schema_url}") - def create_doc_file(self, name: str, content: str) -> None: - """Creates a markdown file for a given sheet and content""" - with open(self.output_dir / (name + ".md"), mode="w", encoding="utf8") as file: +def create_doc_file(output_dir: Path, name: str, content: Union[str, None]) -> None: + """Creates a markdown file for a given sheet and content""" + + if content: + with open(output_dir / (name + ".md"), mode="w", encoding="utf8") as file: file.write(content) +def generate_markdown(load_ops: LoadOperations, template_name: str, content: dict): + """Generates the markdown text by rendering the content into the template""" + + env = Environment(loader=FileSystemLoader(load_ops.template_dir), trim_blocks=True) + template = env.get_template(template_name) + return template.render(content) + + def 
extract_permissible_values(schema: SchemaView, slot_range: Union[str, None]): """enum""" enum = schema.get_enum(slot_range) @@ -163,28 +174,51 @@ def generate_workbook( ] -def generate_markdown(content: dict) -> str: - """Generates the markdown text by rendering the content into the template""" - - env = Environment(loader=FileSystemLoader(ROOT), trim_blocks=True) - template = env.get_template(TEMPLATE) - return template.render(content) - +def generate_sheet_docs(config: Config, load_ops: LoadOperations): + """fn""" -def main(): - """Patches things together""" - io_ops = IOOperations() - config = Config.model_validate(io_ops.load_config) if config.main_workbook is None: raise MainSheetNotIdentified worksheet_names = config.main_workbook.worksheets + # Generate worksheet documentation of the main workbook workbook = generate_workbook( - io_ops.load_schema, worksheet_names, extract_slots_from + load_ops.load_schema, worksheet_names, extract_slots_from ) for sheet in workbook: - IOOperations.create_doc_file(io_ops, sheet["name"], generate_markdown(sheet)) + create_doc_file( + SHEET_DIR, + sheet["name"], + generate_markdown( + load_ops, ".sheet_documentation_template.md.jinja", sheet + ), + ) + + +def generate_submission_doc(config: Config, load_ops: LoadOperations): + """fn""" + + submission_content = config.model_dump() + print(submission_content) + + create_doc_file( + SUBMISSION_DIR, + "submission", + generate_markdown( + load_ops, ".submission_documentation_template.md.jinja", submission_content + ), + ) + + +def main(): + """Patches things together""" + load_ops = LoadOperations() + config = Config.model_validate(load_ops.load_config) + + generate_sheet_docs(config, load_ops) + + generate_submission_doc(config, load_ops) if __name__ == "__main__": From d30cb7a2c62c5ad72ea658e4e6f432f9ff7ac3e2 Mon Sep 17 00:00:00 2001 From: sbilge Date: Fri, 1 Sep 2023 14:31:56 +0000 Subject: [PATCH 07/15] template update --- .description.md | 25 ++++ .design.md | 7 + 
.github/workflows/check_readme.yaml | 16 ++ .gitignore | 1 + .mandatory_files | 8 +- .readme_template.md | 109 ++++++++++++++ .static_files | 7 + readme_generation.md | 47 ++++++ scripts/__init__.py | 17 +++ scripts/update_all.py | 51 +++++++ scripts/update_readme.py | 217 ++++++++++++++++++++++++++++ scripts/update_template_files.py | 2 +- 12 files changed, 501 insertions(+), 6 deletions(-) create mode 100644 .description.md create mode 100644 .design.md create mode 100644 .github/workflows/check_readme.yaml create mode 100644 .readme_template.md create mode 100644 readme_generation.md create mode 100644 scripts/__init__.py create mode 100755 scripts/update_all.py create mode 100755 scripts/update_readme.py diff --git a/.description.md b/.description.md new file mode 100644 index 0000000..de53957 --- /dev/null +++ b/.description.md @@ -0,0 +1,25 @@ + + +This repo is a template for creating a new microservice. + +The directories, files, and their structure herein are recommendations +from the GHGA Dev Team. + +### Naming Conventions +The github repository contains only lowercase letters, numbers, and hyphens "-", +e.g.: `my-microservice` + +The python package (and thus the source repository) contains underscores "_" +instead of hyphens, e.g.: `my_microservice` +However, an abbreviated version is prefered as package name. + +### Adapt to your service +This is just a template and needs some adaption to your specific use case. + +Please search for **"please adapt"** comments. They will indicate all locations +that need modification. Once the adaptions are in place, please remove these # +comments. + +Finally, follow the instructions to generate the README.md described in +[`./readme_generation.md`](./readme_generation.md). Please also adapt this markdown file +by providing an overview of the feature of the package. 
diff --git a/.design.md b/.design.md new file mode 100644 index 0000000..f2dfee4 --- /dev/null +++ b/.design.md @@ -0,0 +1,7 @@ + + +This is a Python-based service following the Triple Hexagonal Architecture pattern. +It uses protocol/provider pairs and dependency injection mechanisms provided by the +[hexkit](https://github.com/ghga-de/hexkit) library. diff --git a/.github/workflows/check_readme.yaml b/.github/workflows/check_readme.yaml new file mode 100644 index 0000000..a052d1e --- /dev/null +++ b/.github/workflows/check_readme.yaml @@ -0,0 +1,16 @@ +name: Check if the readme is up to date. + +on: push + +jobs: + static-code-analysis: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + + - id: common + uses: ghga-de/gh-action-common@v2 + + - name: Check readme + run: | + ./scripts/update_readme.py --check diff --git a/.gitignore b/.gitignore index 231fdfa..28b8a9d 100644 --- a/.gitignore +++ b/.gitignore @@ -50,6 +50,7 @@ coverage.xml *.py,cover .hypothesis/ .pytest_cache/ +prof/ # Translations *.mo diff --git a/.mandatory_files b/.mandatory_files index bf48ac2..d7cf949 100644 --- a/.mandatory_files +++ b/.mandatory_files @@ -10,11 +10,6 @@ .devcontainer/docker-compose.yml .devcontainer/Dockerfile - -docs/README.md - -example_data - tests/__init__.py tests/fixtures/__init__.py @@ -28,3 +23,6 @@ README.md setup.py setup.cfg requirements-dev.txt + +.description.md +.design.md diff --git a/.readme_template.md b/.readme_template.md new file mode 100644 index 0000000..878dbdc --- /dev/null +++ b/.readme_template.md @@ -0,0 +1,109 @@ + +[![tests](https://github.com/ghga-de/$name/actions/workflows/unit_and_int_tests.yaml/badge.svg)](https://github.com/ghga-de/$name/actions/workflows/unit_and_int_tests.yaml) +[![Coverage Status](https://coveralls.io/repos/github/ghga-de/$name/badge.svg?branch=main)](https://coveralls.io/github/ghga-de/$name?branch=main) + +# $title + +$summary + +## Description + +$description + +## Installation +We recommend using the 
provided Docker container. + +A pre-build version is available at [docker hub](https://hub.docker.com/repository/docker/ghga/$name): +```bash +docker pull ghga/$name:$version +``` + +Or you can build the container yourself from the [`./Dockerfile`](./Dockerfile): +```bash +# Execute in the repo's root dir: +docker build -t ghga/$name:$version . +``` + +For production-ready deployment, we recommend using Kubernetes, however, +for simple use cases, you could execute the service using docker +on a single server: +```bash +# The entrypoint is preconfigured: +docker run -p 8080:8080 ghga/$name:$version --help +``` + +If you prefer not to use containers, you may install the service from source: +```bash +# Execute in the repo's root dir: +pip install . + +# To run the service: +$shortname --help +``` + +## Configuration +### Parameters + +The service requires the following configuration parameters: +$config_description + +### Usage: + +A template YAML for configurating the service can be found at +[`./example-config.yaml`](./example-config.yaml). +Please adapt it, rename it to `.$shortname.yaml`, and place it into one of the following locations: +- in the current working directory were you are execute the service (on unix: `./.$shortname.yaml`) +- in your home directory (on unix: `~/.$shortname.yaml`) + +The config yaml will be automatically parsed by the service. + +**Important: If you are using containers, the locations refer to paths within the container.** + +All parameters mentioned in the [`./example-config.yaml`](./example-config.yaml) +could also be set using environment variables or file secrets. + +For naming the environment variables, just prefix the parameter name with `${shortname}_`, +e.g. for the `host` set an environment variable named `${shortname}_host` +(you may use both upper or lower cases, however, it is standard to define all env +variables in upper cases). 
+ +To using file secrets please refer to the +[corresponding section](https://pydantic-docs.helpmanual.io/usage/settings/#secret-support) +of the pydantic documentation. + +$openapi_doc + +## Architecture and Design: +$design_description + +## Development +For setting up the development environment, we rely on the +[devcontainer feature](https://code.visualstudio.com/docs/remote/containers) of vscode +in combination with Docker Compose. + +To use it, you have to have Docker Compose as well as vscode with its "Remote - Containers" +extension (`ms-vscode-remote.remote-containers`) installed. +Then open this repository in vscode and run the command +`Remote-Containers: Reopen in Container` from the vscode "Command Palette". + +This will give you a full-fledged, pre-configured development environment including: +- infrastructural dependencies of the service (databases, etc.) +- all relevant vscode extensions pre-installed +- pre-configured linting and auto-formating +- a pre-configured debugger +- automatic license-header insertion + +Moreover, inside the devcontainer, a convenience commands `dev_install` is available. +It installs the service with all development dependencies, installs pre-commit. + +The installation is performed automatically when you build the devcontainer. However, +if you update dependencies in the [`./setup.cfg`](./setup.cfg) or the +[`./requirements-dev.txt`](./requirements-dev.txt), please run it again. + +## License +This repository is free to use and modify according to the +[Apache 2.0 License](./LICENSE). + +## Readme Generation +This readme is autogenerate, please see [`readme_generation.md`](./readme_generation.md) +for details. 
diff --git a/.static_files b/.static_files index c9b555f..3ebfa18 100644 --- a/.static_files +++ b/.static_files @@ -15,11 +15,14 @@ scripts/script_utils/__init__.py scripts/script_utils/cli.py +scripts/__init__.py +scripts/update_all.py scripts/license_checker.py scripts/get_package_name.py scripts/update_config_docs.py scripts/update_template_files.py scripts/update_openapi_docs.py +scripts/update_readme.py scripts/README.md .github/workflows/check_config_docs.yaml @@ -27,6 +30,7 @@ scripts/README.md .github/workflows/static_code_analysis.yaml .github/workflows/unit_and_int_tests.yaml .github/workflows/check_openapi_spec.yaml +.github/workflows/check_readme.yaml .github/workflows/cd.yaml example_data/README.md @@ -44,3 +48,6 @@ LICENSE requirements.txt requirements-dev-common.txt setup.py + +.readme_template.md +readme_generation.md diff --git a/readme_generation.md b/readme_generation.md new file mode 100644 index 0000000..432153c --- /dev/null +++ b/readme_generation.md @@ -0,0 +1,47 @@ + + +# Readme Generation + +The README file is generated by collecting information from different sources as +outlined in the following. + +- name: The full name of the package is derived from the remote origin Git repository. +- title: A title case representation of the name. +- shortname: An abbreviation of the full name. This is derived from the name mentioned + in the [`./setup.cfg`](`./setup.cfg). +- summary: A short 1-2 sentence summary derived from the description in the + [`./setup.cfg`](`./setup.cfg). +- version: The package version derived from the version specified in the + [`./setup.cfg`](`./setup.cfg). +- description: A markdown-formatted description of the features and use cases of this + service or package. Obtained from the [`./.description.md`](./.description.md). +- design_description: A markdown-formatted description of the overall architecture and + design of the package. Obtained from the [`./.design.md`](./.design.md). 
+- config_description: A markdown-formatted description of all config parameters. + This is autogenerated from the [`./config_schema.json`](./config_schema.json). +- openapi_doc: A markdown-formatted description of the HTTP API. This is autogenerated + and links to the [`./openapi.yaml`](./openapi.yaml). If the openapi.yaml is not + this documentation is empty. + +The [`./.readme_template.md`](./.readme_template.md) serves as a template where the +above variable can be filled in using Pythons `string.Template` utility from the +standard library. + +The [`./scripts/update_readme.py`] script can be used to collect all information and +fill it into the template to generate the README file. diff --git a/scripts/__init__.py b/scripts/__init__.py new file mode 100644 index 0000000..6222ab0 --- /dev/null +++ b/scripts/__init__.py @@ -0,0 +1,17 @@ +# Copyright 2021 - 2023 Universität Tübingen, DKFZ, EMBL, and Universität zu Köln +# for the German Human Genome-Phenome Archive (GHGA) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +"""Scripts and utils used during development or in CI pipelines.""" diff --git a/scripts/update_all.py b/scripts/update_all.py new file mode 100755 index 0000000..78854df --- /dev/null +++ b/scripts/update_all.py @@ -0,0 +1,51 @@ +#!/usr/bin/env python3 + +# Copyright 2021 - 2023 Universität Tübingen, DKFZ, EMBL, and Universität zu Köln +# for the German Human Genome-Phenome Archive (GHGA) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +"""Run all update scripts that are present in the repository in the correct order""" + +try: + from scripts.update_template_files import main as update_template +except ImportError: + pass +else: + print("Pulling in updates from template repository") + update_template() + +try: + from scripts.update_config_docs import main as update_config +except ImportError: + pass +else: + print("Updating config docs") + update_config() + +try: + from scripts.update_openapi_docs import main as update_openapi +except ImportError: + pass +else: + print("Updating OpenAPI docs") + update_openapi() + +try: + from scripts.update_readme import main as update_readme +except ImportError: + pass +else: + print("Updating README") + update_readme() diff --git a/scripts/update_readme.py b/scripts/update_readme.py new file mode 100755 index 0000000..594aedf --- /dev/null +++ b/scripts/update_readme.py @@ -0,0 +1,217 @@ +#!/usr/bin/env python3 + +# Copyright 2021 - 2023 Universität Tübingen, DKFZ, EMBL, and Universität zu Köln +# for the German Human 
Genome-Phenome Archive (GHGA) +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Generate documentation for this package using different sources.""" + +import json +import subprocess # nosec +import sys +from pathlib import Path +from string import Template + +import jsonschema2md +from pydantic import BaseModel, Field +from script_utils.cli import echo_failure, echo_success, run +from setuptools.config.setupcfg import read_configuration +from stringcase import spinalcase, titlecase + +ROOT_DIR = Path(__file__).parent.parent.resolve() +SETUP_CFG_PATH = ROOT_DIR / "setup.cfg" +DESCRIPTION_PATH = ROOT_DIR / ".description.md" +DESIGN_PATH = ROOT_DIR / ".design.md" +README_TEMPLATE_PATH = ROOT_DIR / ".readme_template.md" +CONFIG_SCHEMA_PATH = ROOT_DIR / "config_schema.json" +OPENAPI_YAML_REL_PATH = "./openapi.yaml" +README_PATH = ROOT_DIR / "README.md" + + +class PackageHeader(BaseModel): + """A basic summary of a package.""" + + shortname: str = Field( + ..., + description=( + "The abbreviation of the package name. Is identical to the package name." + ), + ) + version: str = Field(..., description="The version of the package.") + summary: str = Field( + ..., description="A short 1 or 2 sentence summary of the package." 
+ ) + + +class PackageName(BaseModel): + """The name of a package and it's different representations.""" + + name: str = Field(..., description="The full name of the package in spinal case.") + title: str = Field(..., description="The name of the package formatted as title.") + + +class PackageDetails(PackageHeader, PackageName): + """A container for details on a package used to build documentation.""" + + description: str = Field( + ..., description="A markdown-formatted description of the package." + ) + design_description: str = Field( + ..., + description=( + "A markdown-formatted description of overall architecture and design of" + + " the package." + ), + ) + config_description: str = Field( + ..., + description=( + "A markdown-formatted list of all configuration parameters of this package." + ), + ) + openapi_doc: str = Field( + ..., + description=( + "A markdown-formatted description rendering or linking to an OpenAPI" + " specification of the package." + ), + ) + + +def read_package_header() -> PackageHeader: + """Read basic information about the package from the setup.cfg.""" + + setup_config = read_configuration(SETUP_CFG_PATH) + setup_metadata = setup_config["metadata"] + return PackageHeader( + shortname=setup_metadata["name"], + version=setup_metadata["version"], + summary=setup_metadata["description"], + ) + + +def read_package_name() -> PackageName: + """Infer the package name from the name of the git origin.""" + + with subprocess.Popen( + args="basename -s .git `git config --get remote.origin.url`", + cwd=ROOT_DIR, + stdout=subprocess.PIPE, + shell=True, + ) as process: + stdout, _ = process.communicate() + + if not stdout: + raise RuntimeError("The name of the git origin could not be resolved.") + git_origin_name = stdout.decode("utf-8").strip() + + return PackageName( + name=spinalcase(git_origin_name), title=titlecase(git_origin_name) + ) + + +def read_package_description() -> str: + """Read the package description.""" + + return 
DESCRIPTION_PATH.read_text() + + +def read_design_description() -> str: + """Read the design description.""" + + return DESIGN_PATH.read_text() + + +def generate_config_docs() -> str: + """Generate markdown-formatted documentation for the configration parameters + listed in the config schema.""" + + parser = jsonschema2md.Parser( + examples_as_yaml=False, + show_examples="all", + ) + with open(CONFIG_SCHEMA_PATH, "r", encoding="utf-8") as json_file: + config_schema = json.load(json_file) + + md_lines = parser.parse_schema(config_schema) + + # ignore everything before the properites header: + properties_index = md_lines.index("## Properties\n\n") + md_lines = md_lines[properties_index + 1 :] + + return "\n".join(md_lines) + + +def generate_openapi_docs() -> str: + """Generate markdown-formatted documentation linking to or rendering an OpenAPI + specification of the package. If no OpenAPI specification is present, return an + empty string.""" + + open_api_yaml_path = ROOT_DIR / OPENAPI_YAML_REL_PATH + + if not open_api_yaml_path.exists(): + return "" + + return ( + "## HTTP API\n" + + "An OpenAPI specification for this service can be found" + + f" [here]({OPENAPI_YAML_REL_PATH})." 
+ ) + + +def get_package_details() -> PackageDetails: + """Get details required to build documentation for the package.""" + + header = read_package_header() + name = read_package_name() + description = read_package_description() + config_description = generate_config_docs() + return PackageDetails( + **header.dict(), + **name.dict(), + description=description, + config_description=config_description, + design_description=read_design_description(), + openapi_doc=generate_openapi_docs(), + ) + + +def generate_single_readme(*, details: PackageDetails) -> str: + """Generate a single markdown-formatted readme file for the package based on the + provided details.""" + + template_content = README_TEMPLATE_PATH.read_text() + template = Template(template_content) + return template.substitute(details.dict()) + + +def main(check: bool = False) -> None: + """Update the readme markdown.""" + + details = get_package_details() + readme_content = generate_single_readme(details=details) + + if check: + if README_PATH.read_text() != readme_content: + echo_failure("README.md is not up to date.") + sys.exit(1) + echo_success("README.md is up to date.") + return + + README_PATH.write_text(readme_content) + echo_success("Successfully updated README.md.") + + +if __name__ == "__main__": + run(main) diff --git a/scripts/update_template_files.py b/scripts/update_template_files.py index 7ff6e22..952fe2c 100755 --- a/scripts/update_template_files.py +++ b/scripts/update_template_files.py @@ -33,7 +33,7 @@ try: from script_utils.cli import echo_failure, echo_success, run except ImportError: - echo_failure = echo_success = print # type: ignore + echo_failure = echo_success = print def run(main_fn): """Run main function without cli tools (typer).""" From 9b13d76113632b42a0309119b78fab75ed01fc69 Mon Sep 17 00:00:00 2001 From: sbilge Date: Fri, 1 Sep 2023 14:41:27 +0000 Subject: [PATCH 08/15] template adjustments --- .deprecated_files | 2 + .deprecated_files_ignore | 2 + .description.md | 25 
---- .design.md | 7 -- .mandatory_files_ignore | 3 + .readme_template.md | 109 ---------------- .static_files_ignore | 10 +- docs/metadata/submission.md | 5 - readme_generation.md | 47 ------- scripts/get_package_name.py | 46 ------- scripts/update_metadata_docs.py | 1 - scripts/update_readme.py | 217 -------------------------------- 12 files changed, 15 insertions(+), 459 deletions(-) delete mode 100644 .description.md delete mode 100644 .design.md delete mode 100644 .readme_template.md delete mode 100644 readme_generation.md delete mode 100755 scripts/get_package_name.py delete mode 100755 scripts/update_readme.py diff --git a/.deprecated_files b/.deprecated_files index d522224..888a8e8 100644 --- a/.deprecated_files +++ b/.deprecated_files @@ -12,3 +12,5 @@ scripts/check_mandatory_and_static_files.py scripts/update_static_files.py + +docs diff --git a/.deprecated_files_ignore b/.deprecated_files_ignore index b589813..839acf2 100644 --- a/.deprecated_files_ignore +++ b/.deprecated_files_ignore @@ -1,2 +1,4 @@ # Optional list of files which are actually deprecated in the template # but are still allowed to be used in the current repository + +docs diff --git a/.description.md b/.description.md deleted file mode 100644 index de53957..0000000 --- a/.description.md +++ /dev/null @@ -1,25 +0,0 @@ - - -This repo is a template for creating a new microservice. - -The directories, files, and their structure herein are recommendations -from the GHGA Dev Team. - -### Naming Conventions -The github repository contains only lowercase letters, numbers, and hyphens "-", -e.g.: `my-microservice` - -The python package (and thus the source repository) contains underscores "_" -instead of hyphens, e.g.: `my_microservice` -However, an abbreviated version is prefered as package name. - -### Adapt to your service -This is just a template and needs some adaption to your specific use case. - -Please search for **"please adapt"** comments. 
They will indicate all locations -that need modification. Once the adaptions are in place, please remove these # -comments. - -Finally, follow the instructions to generate the README.md described in -[`./readme_generation.md`](./readme_generation.md). Please also adapt this markdown file -by providing an overview of the feature of the package. diff --git a/.design.md b/.design.md deleted file mode 100644 index f2dfee4..0000000 --- a/.design.md +++ /dev/null @@ -1,7 +0,0 @@ - - -This is a Python-based service following the Triple Hexagonal Architecture pattern. -It uses protocol/provider pairs and dependency injection mechanisms provided by the -[hexkit](https://github.com/ghga-de/hexkit) library. diff --git a/.mandatory_files_ignore b/.mandatory_files_ignore index c03be42..5879e3e 100644 --- a/.mandatory_files_ignore +++ b/.mandatory_files_ignore @@ -14,3 +14,6 @@ README.md setup.py setup.cfg requirements-dev.txt + +.description.md +.design.md diff --git a/.readme_template.md b/.readme_template.md deleted file mode 100644 index 878dbdc..0000000 --- a/.readme_template.md +++ /dev/null @@ -1,109 +0,0 @@ - -[![tests](https://github.com/ghga-de/$name/actions/workflows/unit_and_int_tests.yaml/badge.svg)](https://github.com/ghga-de/$name/actions/workflows/unit_and_int_tests.yaml) -[![Coverage Status](https://coveralls.io/repos/github/ghga-de/$name/badge.svg?branch=main)](https://coveralls.io/github/ghga-de/$name?branch=main) - -# $title - -$summary - -## Description - -$description - -## Installation -We recommend using the provided Docker container. - -A pre-build version is available at [docker hub](https://hub.docker.com/repository/docker/ghga/$name): -```bash -docker pull ghga/$name:$version -``` - -Or you can build the container yourself from the [`./Dockerfile`](./Dockerfile): -```bash -# Execute in the repo's root dir: -docker build -t ghga/$name:$version . 
-``` - -For production-ready deployment, we recommend using Kubernetes, however, -for simple use cases, you could execute the service using docker -on a single server: -```bash -# The entrypoint is preconfigured: -docker run -p 8080:8080 ghga/$name:$version --help -``` - -If you prefer not to use containers, you may install the service from source: -```bash -# Execute in the repo's root dir: -pip install . - -# To run the service: -$shortname --help -``` - -## Configuration -### Parameters - -The service requires the following configuration parameters: -$config_description - -### Usage: - -A template YAML for configurating the service can be found at -[`./example-config.yaml`](./example-config.yaml). -Please adapt it, rename it to `.$shortname.yaml`, and place it into one of the following locations: -- in the current working directory were you are execute the service (on unix: `./.$shortname.yaml`) -- in your home directory (on unix: `~/.$shortname.yaml`) - -The config yaml will be automatically parsed by the service. - -**Important: If you are using containers, the locations refer to paths within the container.** - -All parameters mentioned in the [`./example-config.yaml`](./example-config.yaml) -could also be set using environment variables or file secrets. - -For naming the environment variables, just prefix the parameter name with `${shortname}_`, -e.g. for the `host` set an environment variable named `${shortname}_host` -(you may use both upper or lower cases, however, it is standard to define all env -variables in upper cases). - -To using file secrets please refer to the -[corresponding section](https://pydantic-docs.helpmanual.io/usage/settings/#secret-support) -of the pydantic documentation. 
- -$openapi_doc - -## Architecture and Design: -$design_description - -## Development -For setting up the development environment, we rely on the -[devcontainer feature](https://code.visualstudio.com/docs/remote/containers) of vscode -in combination with Docker Compose. - -To use it, you have to have Docker Compose as well as vscode with its "Remote - Containers" -extension (`ms-vscode-remote.remote-containers`) installed. -Then open this repository in vscode and run the command -`Remote-Containers: Reopen in Container` from the vscode "Command Palette". - -This will give you a full-fledged, pre-configured development environment including: -- infrastructural dependencies of the service (databases, etc.) -- all relevant vscode extensions pre-installed -- pre-configured linting and auto-formating -- a pre-configured debugger -- automatic license-header insertion - -Moreover, inside the devcontainer, a convenience commands `dev_install` is available. -It installs the service with all development dependencies, installs pre-commit. - -The installation is performed automatically when you build the devcontainer. However, -if you update dependencies in the [`./setup.cfg`](./setup.cfg) or the -[`./requirements-dev.txt`](./requirements-dev.txt), please run it again. - -## License -This repository is free to use and modify according to the -[Apache 2.0 License](./LICENSE). - -## Readme Generation -This readme is autogenerate, please see [`readme_generation.md`](./readme_generation.md) -for details. 
diff --git a/.static_files_ignore b/.static_files_ignore index 97ddb09..6640c85 100644 --- a/.static_files_ignore +++ b/.static_files_ignore @@ -2,8 +2,6 @@ # but are allowed to have different content in the current repository pytest.ini -scripts/update_config_docs.py -scripts/update_openapi_docs.py requirements.txt example_data/README.md .devcontainer/dev_install @@ -15,3 +13,11 @@ setup.py .github/workflows/static_code_analysis.yaml .github/workflows/cd.yaml .pre-commit-config.yaml + +.readme_template.md +readme_generation.md + +scripts/get_package_name.py +scripts/update_config_docs.py +scripts/update_openapi_docs.py +scripts/update_readme.py diff --git a/docs/metadata/submission.md b/docs/metadata/submission.md index 481de07..8de9fd0 100644 --- a/docs/metadata/submission.md +++ b/docs/metadata/submission.md @@ -1,6 +1,5 @@ # Submission - ## ghga_submission_full.xlsx
Submission components: @@ -107,7 +106,6 @@ Publication worksheet: * [https://ghga-de.github.io/docs/metadata/overviewworksheets/Publication.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/Publication.md)
- ## ghga_submission_minimal.xlsx
Submission components: @@ -144,7 +142,6 @@ Publication worksheet: * [https://ghga-de.github.io/docs/metadata/overviewworksheets/Publication.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/Publication.md)
- ## ghga_submission_sample.xlsx
Submission components: @@ -196,7 +193,6 @@ Publication worksheet: * [https://ghga-de.github.io/docs/metadata/overviewworksheets/Publication.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/Publication.md)
- ## ghga_submission_individual.xlsx
Submission components: @@ -263,7 +259,6 @@ Publication worksheet: * [https://ghga-de.github.io/docs/metadata/overviewworksheets/Publication.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/Publication.md)
- ## ghga_submission_seq.xlsx
Submission components: diff --git a/readme_generation.md b/readme_generation.md deleted file mode 100644 index 432153c..0000000 --- a/readme_generation.md +++ /dev/null @@ -1,47 +0,0 @@ - - -# Readme Generation - -The README file is generated by collecting information from different sources as -outlined in the following. - -- name: The full name of the package is derived from the remote origin Git repository. -- title: A title case representation of the name. -- shortname: An abbreviation of the full name. This is derived from the name mentioned - in the [`./setup.cfg`](`./setup.cfg). -- summary: A short 1-2 sentence summary derived from the description in the - [`./setup.cfg`](`./setup.cfg). -- version: The package version derived from the version specified in the - [`./setup.cfg`](`./setup.cfg). -- description: A markdown-formatted description of the features and use cases of this - service or package. Obtained from the [`./.description.md`](./.description.md). -- design_description: A markdown-formatted description of the overall architecture and - design of the package. Obtained from the [`./.design.md`](./.design.md). -- config_description: A markdown-formatted description of all config parameters. - This is autogenerated from the [`./config_schema.json`](./config_schema.json). -- openapi_doc: A markdown-formatted description of the HTTP API. This is autogenerated - and links to the [`./openapi.yaml`](./openapi.yaml). If the openapi.yaml is not - this documentation is empty. - -The [`./.readme_template.md`](./.readme_template.md) serves as a template where the -above variable can be filled in using Pythons `string.Template` utility from the -standard library. - -The [`./scripts/update_readme.py`] script can be used to collect all information and -fill it into the template to generate the README file. 
diff --git a/scripts/get_package_name.py b/scripts/get_package_name.py deleted file mode 100755 index 7c6b79a..0000000 --- a/scripts/get_package_name.py +++ /dev/null @@ -1,46 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright 2021 - 2023 Universität Tübingen, DKFZ, EMBL, and Universität zu Köln -# for the German Human Genome-Phenome Archive (GHGA) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Extracts the package name from the setup.cfg""" - -from pathlib import Path - -REPO_ROOT_DIR = Path(__file__).parent.parent.resolve() -SETUP_CFG_PATH = REPO_ROOT_DIR / "setup.cfg" -NAME_PREFIX = "name = " - - -def get_package_name() -> str: - """Extracts the package name""" - - with open(SETUP_CFG_PATH, "r", encoding="utf8") as setup_cfg: - for line in setup_cfg.readlines(): - line_stripped = line.strip() - if line_stripped.startswith(NAME_PREFIX): - package_name = line_stripped[len(NAME_PREFIX) :] - return package_name - raise RuntimeError("Could not find package name.") - - -def run(): - """Run this script.""" - package_name = get_package_name() - print(package_name) - - -if __name__ == "__main__": - run() diff --git a/scripts/update_metadata_docs.py b/scripts/update_metadata_docs.py index e6594a3..06355eb 100644 --- a/scripts/update_metadata_docs.py +++ b/scripts/update_metadata_docs.py @@ -200,7 +200,6 @@ def generate_submission_doc(config: Config, load_ops: LoadOperations): """fn""" submission_content = config.model_dump() - print(submission_content) create_doc_file( 
SUBMISSION_DIR, diff --git a/scripts/update_readme.py b/scripts/update_readme.py deleted file mode 100755 index 594aedf..0000000 --- a/scripts/update_readme.py +++ /dev/null @@ -1,217 +0,0 @@ -#!/usr/bin/env python3 - -# Copyright 2021 - 2023 Universität Tübingen, DKFZ, EMBL, and Universität zu Köln -# for the German Human Genome-Phenome Archive (GHGA) -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -"""Generate documentation for this package using different sources.""" - -import json -import subprocess # nosec -import sys -from pathlib import Path -from string import Template - -import jsonschema2md -from pydantic import BaseModel, Field -from script_utils.cli import echo_failure, echo_success, run -from setuptools.config.setupcfg import read_configuration -from stringcase import spinalcase, titlecase - -ROOT_DIR = Path(__file__).parent.parent.resolve() -SETUP_CFG_PATH = ROOT_DIR / "setup.cfg" -DESCRIPTION_PATH = ROOT_DIR / ".description.md" -DESIGN_PATH = ROOT_DIR / ".design.md" -README_TEMPLATE_PATH = ROOT_DIR / ".readme_template.md" -CONFIG_SCHEMA_PATH = ROOT_DIR / "config_schema.json" -OPENAPI_YAML_REL_PATH = "./openapi.yaml" -README_PATH = ROOT_DIR / "README.md" - - -class PackageHeader(BaseModel): - """A basic summary of a package.""" - - shortname: str = Field( - ..., - description=( - "The abbreviation of the package name. Is identical to the package name." 
- ), - ) - version: str = Field(..., description="The version of the package.") - summary: str = Field( - ..., description="A short 1 or 2 sentence summary of the package." - ) - - -class PackageName(BaseModel): - """The name of a package and it's different representations.""" - - name: str = Field(..., description="The full name of the package in spinal case.") - title: str = Field(..., description="The name of the package formatted as title.") - - -class PackageDetails(PackageHeader, PackageName): - """A container for details on a package used to build documentation.""" - - description: str = Field( - ..., description="A markdown-formatted description of the package." - ) - design_description: str = Field( - ..., - description=( - "A markdown-formatted description of overall architecture and design of" - + " the package." - ), - ) - config_description: str = Field( - ..., - description=( - "A markdown-formatted list of all configuration parameters of this package." - ), - ) - openapi_doc: str = Field( - ..., - description=( - "A markdown-formatted description rendering or linking to an OpenAPI" - " specification of the package." 
- ), - ) - - -def read_package_header() -> PackageHeader: - """Read basic information about the package from the setup.cfg.""" - - setup_config = read_configuration(SETUP_CFG_PATH) - setup_metadata = setup_config["metadata"] - return PackageHeader( - shortname=setup_metadata["name"], - version=setup_metadata["version"], - summary=setup_metadata["description"], - ) - - -def read_package_name() -> PackageName: - """Infer the package name from the name of the git origin.""" - - with subprocess.Popen( - args="basename -s .git `git config --get remote.origin.url`", - cwd=ROOT_DIR, - stdout=subprocess.PIPE, - shell=True, - ) as process: - stdout, _ = process.communicate() - - if not stdout: - raise RuntimeError("The name of the git origin could not be resolved.") - git_origin_name = stdout.decode("utf-8").strip() - - return PackageName( - name=spinalcase(git_origin_name), title=titlecase(git_origin_name) - ) - - -def read_package_description() -> str: - """Read the package description.""" - - return DESCRIPTION_PATH.read_text() - - -def read_design_description() -> str: - """Read the design description.""" - - return DESIGN_PATH.read_text() - - -def generate_config_docs() -> str: - """Generate markdown-formatted documentation for the configration parameters - listed in the config schema.""" - - parser = jsonschema2md.Parser( - examples_as_yaml=False, - show_examples="all", - ) - with open(CONFIG_SCHEMA_PATH, "r", encoding="utf-8") as json_file: - config_schema = json.load(json_file) - - md_lines = parser.parse_schema(config_schema) - - # ignore everything before the properites header: - properties_index = md_lines.index("## Properties\n\n") - md_lines = md_lines[properties_index + 1 :] - - return "\n".join(md_lines) - - -def generate_openapi_docs() -> str: - """Generate markdown-formatted documentation linking to or rendering an OpenAPI - specification of the package. 
If no OpenAPI specification is present, return an - empty string.""" - - open_api_yaml_path = ROOT_DIR / OPENAPI_YAML_REL_PATH - - if not open_api_yaml_path.exists(): - return "" - - return ( - "## HTTP API\n" - + "An OpenAPI specification for this service can be found" - + f" [here]({OPENAPI_YAML_REL_PATH})." - ) - - -def get_package_details() -> PackageDetails: - """Get details required to build documentation for the package.""" - - header = read_package_header() - name = read_package_name() - description = read_package_description() - config_description = generate_config_docs() - return PackageDetails( - **header.dict(), - **name.dict(), - description=description, - config_description=config_description, - design_description=read_design_description(), - openapi_doc=generate_openapi_docs(), - ) - - -def generate_single_readme(*, details: PackageDetails) -> str: - """Generate a single markdown-formatted readme file for the package based on the - provided details.""" - - template_content = README_TEMPLATE_PATH.read_text() - template = Template(template_content) - return template.substitute(details.dict()) - - -def main(check: bool = False) -> None: - """Update the readme markdown.""" - - details = get_package_details() - readme_content = generate_single_readme(details=details) - - if check: - if README_PATH.read_text() != readme_content: - echo_failure("README.md is not up to date.") - sys.exit(1) - echo_success("README.md is up to date.") - return - - README_PATH.write_text(readme_content) - echo_success("Successfully updated README.md.") - - -if __name__ == "__main__": - run(main) From cfbe45604074375d35eaad046d78f44af6e5e7c8 Mon Sep 17 00:00:00 2001 From: sbilge Date: Fri, 1 Sep 2023 14:53:13 +0000 Subject: [PATCH 09/15] template adjustments --- .github/workflows/check_readme.yaml | 16 ---------------- .static_files_ignore | 2 ++ 2 files changed, 2 insertions(+), 16 deletions(-) delete mode 100644 .github/workflows/check_readme.yaml diff --git 
a/.github/workflows/check_readme.yaml b/.github/workflows/check_readme.yaml deleted file mode 100644 index a052d1e..0000000 --- a/.github/workflows/check_readme.yaml +++ /dev/null @@ -1,16 +0,0 @@ -name: Check if the readme is up to date. - -on: push - -jobs: - static-code-analysis: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v3 - - - id: common - uses: ghga-de/gh-action-common@v2 - - - name: Check readme - run: | - ./scripts/update_readme.py --check diff --git a/.static_files_ignore b/.static_files_ignore index 6640c85..0c83674 100644 --- a/.static_files_ignore +++ b/.static_files_ignore @@ -11,6 +11,8 @@ setup.py .github/workflows/unit_and_int_tests.yaml .github/workflows/check_openapi_spec.yaml .github/workflows/static_code_analysis.yaml +.github/workflows/check_readme.yaml + .github/workflows/cd.yaml .pre-commit-config.yaml From d0d058422607a83f29feddf94bab76a02475fd0b Mon Sep 17 00:00:00 2001 From: sbilge Date: Fri, 1 Sep 2023 15:00:09 +0000 Subject: [PATCH 10/15] minor template fix --- docs/metadata/submission.md | 128 +++++++++--------- ...submission_documentation_template.md.jinja | 2 +- 2 files changed, 65 insertions(+), 65 deletions(-) diff --git a/docs/metadata/submission.md b/docs/metadata/submission.md index 8de9fd0..37d4e6f 100644 --- a/docs/metadata/submission.md +++ b/docs/metadata/submission.md @@ -8,102 +8,102 @@ Submission components: Study worksheet: -* [https://ghga-de.github.io/docs/metadata/overviewworksheets/Study.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/Study.md)
+* [https://ghga-de.github.io/docs/metadata/worksheets/Study.md](https://ghga-de.github.io/docs/metadata/worksheets/Study.md)
StudyFile worksheet: -* [https://ghga-de.github.io/docs/metadata/overviewworksheets/StudyFile.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/StudyFile.md)
+* [https://ghga-de.github.io/docs/metadata/worksheets/StudyFile.md](https://ghga-de.github.io/docs/metadata/worksheets/StudyFile.md)
Sample worksheet: -* [https://ghga-de.github.io/docs/metadata/overviewworksheets/Sample.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/Sample.md)
+* [https://ghga-de.github.io/docs/metadata/worksheets/Sample.md](https://ghga-de.github.io/docs/metadata/worksheets/Sample.md)
SampleFile worksheet: -* [https://ghga-de.github.io/docs/metadata/overviewworksheets/SampleFile.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/SampleFile.md)
+* [https://ghga-de.github.io/docs/metadata/worksheets/SampleFile.md](https://ghga-de.github.io/docs/metadata/worksheets/SampleFile.md)
Condition worksheet: -* [https://ghga-de.github.io/docs/metadata/overviewworksheets/Condition.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/Condition.md)
+* [https://ghga-de.github.io/docs/metadata/worksheets/Condition.md](https://ghga-de.github.io/docs/metadata/worksheets/Condition.md)
Biospecimen worksheet: -* [https://ghga-de.github.io/docs/metadata/overviewworksheets/Biospecimen.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/Biospecimen.md)
+* [https://ghga-de.github.io/docs/metadata/worksheets/Biospecimen.md](https://ghga-de.github.io/docs/metadata/worksheets/Biospecimen.md)
Individual worksheet: -* [https://ghga-de.github.io/docs/metadata/overviewworksheets/Individual.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/Individual.md)
+* [https://ghga-de.github.io/docs/metadata/worksheets/Individual.md](https://ghga-de.github.io/docs/metadata/worksheets/Individual.md)
Trio worksheet: -* [https://ghga-de.github.io/docs/metadata/overviewworksheets/Trio.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/Trio.md)
+* [https://ghga-de.github.io/docs/metadata/worksheets/Trio.md](https://ghga-de.github.io/docs/metadata/worksheets/Trio.md)
LibraryPreparationProtocol worksheet: -* [https://ghga-de.github.io/docs/metadata/overviewworksheets/LibraryPreparationProtocol.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/LibraryPreparationProtocol.md)
+* [https://ghga-de.github.io/docs/metadata/worksheets/LibraryPreparationProtocol.md](https://ghga-de.github.io/docs/metadata/worksheets/LibraryPreparationProtocol.md)
SequencingProtocol worksheet: -* [https://ghga-de.github.io/docs/metadata/overviewworksheets/SequencingProtocol.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/SequencingProtocol.md)
+* [https://ghga-de.github.io/docs/metadata/worksheets/SequencingProtocol.md](https://ghga-de.github.io/docs/metadata/worksheets/SequencingProtocol.md)
SequencingExperiment worksheet: -* [https://ghga-de.github.io/docs/metadata/overviewworksheets/SequencingExperiment.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/SequencingExperiment.md)
+* [https://ghga-de.github.io/docs/metadata/worksheets/SequencingExperiment.md](https://ghga-de.github.io/docs/metadata/worksheets/SequencingExperiment.md)
SequencingProcess worksheet: -* [https://ghga-de.github.io/docs/metadata/overviewworksheets/SequencingProcess.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/SequencingProcess.md)
+* [https://ghga-de.github.io/docs/metadata/worksheets/SequencingProcess.md](https://ghga-de.github.io/docs/metadata/worksheets/SequencingProcess.md)
SequencingProcessFile worksheet: -* [https://ghga-de.github.io/docs/metadata/overviewworksheets/SequencingProcessFile.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/SequencingProcessFile.md)
+* [https://ghga-de.github.io/docs/metadata/worksheets/SequencingProcessFile.md](https://ghga-de.github.io/docs/metadata/worksheets/SequencingProcessFile.md)
Analysis worksheet: -* [https://ghga-de.github.io/docs/metadata/overviewworksheets/Analysis.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/Analysis.md)
+* [https://ghga-de.github.io/docs/metadata/worksheets/Analysis.md](https://ghga-de.github.io/docs/metadata/worksheets/Analysis.md)
AnalysisProcess worksheet: -* [https://ghga-de.github.io/docs/metadata/overviewworksheets/AnalysisProcess.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/AnalysisProcess.md)
+* [https://ghga-de.github.io/docs/metadata/worksheets/AnalysisProcess.md](https://ghga-de.github.io/docs/metadata/worksheets/AnalysisProcess.md)
AnalysisProcessOutputFile worksheet: -* [https://ghga-de.github.io/docs/metadata/overviewworksheets/AnalysisProcessOutputFile.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/AnalysisProcessOutputFile.md)
+* [https://ghga-de.github.io/docs/metadata/worksheets/AnalysisProcessOutputFile.md](https://ghga-de.github.io/docs/metadata/worksheets/AnalysisProcessOutputFile.md)
Dataset worksheet: -* [https://ghga-de.github.io/docs/metadata/overviewworksheets/Dataset.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/Dataset.md)
+* [https://ghga-de.github.io/docs/metadata/worksheets/Dataset.md](https://ghga-de.github.io/docs/metadata/worksheets/Dataset.md)
DataAccessPolicy worksheet: -* [https://ghga-de.github.io/docs/metadata/overviewworksheets/DataAccessPolicy.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/DataAccessPolicy.md)
+* [https://ghga-de.github.io/docs/metadata/worksheets/DataAccessPolicy.md](https://ghga-de.github.io/docs/metadata/worksheets/DataAccessPolicy.md)
DataAccessCommittee worksheet: -* [https://ghga-de.github.io/docs/metadata/overviewworksheets/DataAccessCommittee.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/DataAccessCommittee.md)
+* [https://ghga-de.github.io/docs/metadata/worksheets/DataAccessCommittee.md](https://ghga-de.github.io/docs/metadata/worksheets/DataAccessCommittee.md)
Publication worksheet: -* [https://ghga-de.github.io/docs/metadata/overviewworksheets/Publication.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/Publication.md)
+* [https://ghga-de.github.io/docs/metadata/worksheets/Publication.md](https://ghga-de.github.io/docs/metadata/worksheets/Publication.md)
## ghga_submission_minimal.xlsx
@@ -114,32 +114,32 @@ Submission components: Study worksheet: -* [https://ghga-de.github.io/docs/metadata/overviewworksheets/Study.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/Study.md)
+* [https://ghga-de.github.io/docs/metadata/worksheets/Study.md](https://ghga-de.github.io/docs/metadata/worksheets/Study.md)
StudyFile worksheet: -* [https://ghga-de.github.io/docs/metadata/overviewworksheets/StudyFile.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/StudyFile.md)
+* [https://ghga-de.github.io/docs/metadata/worksheets/StudyFile.md](https://ghga-de.github.io/docs/metadata/worksheets/StudyFile.md)
Dataset worksheet: -* [https://ghga-de.github.io/docs/metadata/overviewworksheets/Dataset.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/Dataset.md)
+* [https://ghga-de.github.io/docs/metadata/worksheets/Dataset.md](https://ghga-de.github.io/docs/metadata/worksheets/Dataset.md)
DataAccessPolicy worksheet: -* [https://ghga-de.github.io/docs/metadata/overviewworksheets/DataAccessPolicy.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/DataAccessPolicy.md)
+* [https://ghga-de.github.io/docs/metadata/worksheets/DataAccessPolicy.md](https://ghga-de.github.io/docs/metadata/worksheets/DataAccessPolicy.md)
DataAccessCommittee worksheet: -* [https://ghga-de.github.io/docs/metadata/overviewworksheets/DataAccessCommittee.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/DataAccessCommittee.md)
+* [https://ghga-de.github.io/docs/metadata/worksheets/DataAccessCommittee.md](https://ghga-de.github.io/docs/metadata/worksheets/DataAccessCommittee.md)
Publication worksheet: -* [https://ghga-de.github.io/docs/metadata/overviewworksheets/Publication.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/Publication.md)
+* [https://ghga-de.github.io/docs/metadata/worksheets/Publication.md](https://ghga-de.github.io/docs/metadata/worksheets/Publication.md)
## ghga_submission_sample.xlsx
@@ -150,47 +150,47 @@ Submission components: Study worksheet: -* [https://ghga-de.github.io/docs/metadata/overviewworksheets/Study.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/Study.md)
+* [https://ghga-de.github.io/docs/metadata/worksheets/Study.md](https://ghga-de.github.io/docs/metadata/worksheets/Study.md)
StudyFile worksheet: -* [https://ghga-de.github.io/docs/metadata/overviewworksheets/StudyFile.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/StudyFile.md)
+* [https://ghga-de.github.io/docs/metadata/worksheets/StudyFile.md](https://ghga-de.github.io/docs/metadata/worksheets/StudyFile.md)
Sample worksheet: -* [https://ghga-de.github.io/docs/metadata/overviewworksheets/Sample.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/Sample.md)
+* [https://ghga-de.github.io/docs/metadata/worksheets/Sample.md](https://ghga-de.github.io/docs/metadata/worksheets/Sample.md)
SampleFile worksheet: -* [https://ghga-de.github.io/docs/metadata/overviewworksheets/SampleFile.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/SampleFile.md)
+* [https://ghga-de.github.io/docs/metadata/worksheets/SampleFile.md](https://ghga-de.github.io/docs/metadata/worksheets/SampleFile.md)
Condition worksheet: -* [https://ghga-de.github.io/docs/metadata/overviewworksheets/Condition.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/Condition.md)
+* [https://ghga-de.github.io/docs/metadata/worksheets/Condition.md](https://ghga-de.github.io/docs/metadata/worksheets/Condition.md)
Dataset worksheet: -* [https://ghga-de.github.io/docs/metadata/overviewworksheets/Dataset.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/Dataset.md)
+* [https://ghga-de.github.io/docs/metadata/worksheets/Dataset.md](https://ghga-de.github.io/docs/metadata/worksheets/Dataset.md)
DataAccessPolicy worksheet: -* [https://ghga-de.github.io/docs/metadata/overviewworksheets/DataAccessPolicy.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/DataAccessPolicy.md)
+* [https://ghga-de.github.io/docs/metadata/worksheets/DataAccessPolicy.md](https://ghga-de.github.io/docs/metadata/worksheets/DataAccessPolicy.md)
DataAccessCommittee worksheet: -* [https://ghga-de.github.io/docs/metadata/overviewworksheets/DataAccessCommittee.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/DataAccessCommittee.md)
+* [https://ghga-de.github.io/docs/metadata/worksheets/DataAccessCommittee.md](https://ghga-de.github.io/docs/metadata/worksheets/DataAccessCommittee.md)
Publication worksheet: -* [https://ghga-de.github.io/docs/metadata/overviewworksheets/Publication.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/Publication.md)
+* [https://ghga-de.github.io/docs/metadata/worksheets/Publication.md](https://ghga-de.github.io/docs/metadata/worksheets/Publication.md)
## ghga_submission_individual.xlsx
@@ -201,62 +201,62 @@ Submission components: Study worksheet: -* [https://ghga-de.github.io/docs/metadata/overviewworksheets/Study.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/Study.md)
+* [https://ghga-de.github.io/docs/metadata/worksheets/Study.md](https://ghga-de.github.io/docs/metadata/worksheets/Study.md)
StudyFile worksheet: -* [https://ghga-de.github.io/docs/metadata/overviewworksheets/StudyFile.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/StudyFile.md)
+* [https://ghga-de.github.io/docs/metadata/worksheets/StudyFile.md](https://ghga-de.github.io/docs/metadata/worksheets/StudyFile.md)
Sample worksheet: -* [https://ghga-de.github.io/docs/metadata/overviewworksheets/Sample.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/Sample.md)
+* [https://ghga-de.github.io/docs/metadata/worksheets/Sample.md](https://ghga-de.github.io/docs/metadata/worksheets/Sample.md)
SampleFile worksheet: -* [https://ghga-de.github.io/docs/metadata/overviewworksheets/SampleFile.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/SampleFile.md)
+* [https://ghga-de.github.io/docs/metadata/worksheets/SampleFile.md](https://ghga-de.github.io/docs/metadata/worksheets/SampleFile.md)
Condition worksheet: -* [https://ghga-de.github.io/docs/metadata/overviewworksheets/Condition.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/Condition.md)
+* [https://ghga-de.github.io/docs/metadata/worksheets/Condition.md](https://ghga-de.github.io/docs/metadata/worksheets/Condition.md)
Biospecimen worksheet: -* [https://ghga-de.github.io/docs/metadata/overviewworksheets/Biospecimen.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/Biospecimen.md)
+* [https://ghga-de.github.io/docs/metadata/worksheets/Biospecimen.md](https://ghga-de.github.io/docs/metadata/worksheets/Biospecimen.md)
Individual worksheet: -* [https://ghga-de.github.io/docs/metadata/overviewworksheets/Individual.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/Individual.md)
+* [https://ghga-de.github.io/docs/metadata/worksheets/Individual.md](https://ghga-de.github.io/docs/metadata/worksheets/Individual.md)
Trio worksheet: -* [https://ghga-de.github.io/docs/metadata/overviewworksheets/Trio.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/Trio.md)
+* [https://ghga-de.github.io/docs/metadata/worksheets/Trio.md](https://ghga-de.github.io/docs/metadata/worksheets/Trio.md)
Dataset worksheet: -* [https://ghga-de.github.io/docs/metadata/overviewworksheets/Dataset.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/Dataset.md)
+* [https://ghga-de.github.io/docs/metadata/worksheets/Dataset.md](https://ghga-de.github.io/docs/metadata/worksheets/Dataset.md)
DataAccessPolicy worksheet: -* [https://ghga-de.github.io/docs/metadata/overviewworksheets/DataAccessPolicy.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/DataAccessPolicy.md)
+* [https://ghga-de.github.io/docs/metadata/worksheets/DataAccessPolicy.md](https://ghga-de.github.io/docs/metadata/worksheets/DataAccessPolicy.md)
DataAccessCommittee worksheet: -* [https://ghga-de.github.io/docs/metadata/overviewworksheets/DataAccessCommittee.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/DataAccessCommittee.md)
+* [https://ghga-de.github.io/docs/metadata/worksheets/DataAccessCommittee.md](https://ghga-de.github.io/docs/metadata/worksheets/DataAccessCommittee.md)
Publication worksheet: -* [https://ghga-de.github.io/docs/metadata/overviewworksheets/Publication.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/Publication.md)
+* [https://ghga-de.github.io/docs/metadata/worksheets/Publication.md](https://ghga-de.github.io/docs/metadata/worksheets/Publication.md)
## ghga_submission_seq.xlsx
@@ -267,84 +267,84 @@ Submission components: Study worksheet: -* [https://ghga-de.github.io/docs/metadata/overviewworksheets/Study.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/Study.md)
+* [https://ghga-de.github.io/docs/metadata/worksheets/Study.md](https://ghga-de.github.io/docs/metadata/worksheets/Study.md)
StudyFile worksheet: -* [https://ghga-de.github.io/docs/metadata/overviewworksheets/StudyFile.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/StudyFile.md)
+* [https://ghga-de.github.io/docs/metadata/worksheets/StudyFile.md](https://ghga-de.github.io/docs/metadata/worksheets/StudyFile.md)
Sample worksheet: -* [https://ghga-de.github.io/docs/metadata/overviewworksheets/Sample.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/Sample.md)
+* [https://ghga-de.github.io/docs/metadata/worksheets/Sample.md](https://ghga-de.github.io/docs/metadata/worksheets/Sample.md)
SampleFile worksheet: -* [https://ghga-de.github.io/docs/metadata/overviewworksheets/SampleFile.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/SampleFile.md)
+* [https://ghga-de.github.io/docs/metadata/worksheets/SampleFile.md](https://ghga-de.github.io/docs/metadata/worksheets/SampleFile.md)
Condition worksheet: -* [https://ghga-de.github.io/docs/metadata/overviewworksheets/Condition.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/Condition.md)
+* [https://ghga-de.github.io/docs/metadata/worksheets/Condition.md](https://ghga-de.github.io/docs/metadata/worksheets/Condition.md)
Biospecimen worksheet: -* [https://ghga-de.github.io/docs/metadata/overviewworksheets/Biospecimen.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/Biospecimen.md)
+* [https://ghga-de.github.io/docs/metadata/worksheets/Biospecimen.md](https://ghga-de.github.io/docs/metadata/worksheets/Biospecimen.md)
Individual worksheet: -* [https://ghga-de.github.io/docs/metadata/overviewworksheets/Individual.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/Individual.md)
+* [https://ghga-de.github.io/docs/metadata/worksheets/Individual.md](https://ghga-de.github.io/docs/metadata/worksheets/Individual.md)
Trio worksheet: -* [https://ghga-de.github.io/docs/metadata/overviewworksheets/Trio.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/Trio.md)
+* [https://ghga-de.github.io/docs/metadata/worksheets/Trio.md](https://ghga-de.github.io/docs/metadata/worksheets/Trio.md)
LibraryPreparationProtocol worksheet: -* [https://ghga-de.github.io/docs/metadata/overviewworksheets/LibraryPreparationProtocol.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/LibraryPreparationProtocol.md)
+* [https://ghga-de.github.io/docs/metadata/worksheets/LibraryPreparationProtocol.md](https://ghga-de.github.io/docs/metadata/worksheets/LibraryPreparationProtocol.md)
SequencingProtocol worksheet: -* [https://ghga-de.github.io/docs/metadata/overviewworksheets/SequencingProtocol.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/SequencingProtocol.md)
+* [https://ghga-de.github.io/docs/metadata/worksheets/SequencingProtocol.md](https://ghga-de.github.io/docs/metadata/worksheets/SequencingProtocol.md)
SequencingExperiment worksheet: -* [https://ghga-de.github.io/docs/metadata/overviewworksheets/SequencingExperiment.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/SequencingExperiment.md)
+* [https://ghga-de.github.io/docs/metadata/worksheets/SequencingExperiment.md](https://ghga-de.github.io/docs/metadata/worksheets/SequencingExperiment.md)
SequencingProcess worksheet: -* [https://ghga-de.github.io/docs/metadata/overviewworksheets/SequencingProcess.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/SequencingProcess.md)
+* [https://ghga-de.github.io/docs/metadata/worksheets/SequencingProcess.md](https://ghga-de.github.io/docs/metadata/worksheets/SequencingProcess.md)
SequencingProcessFile worksheet: -* [https://ghga-de.github.io/docs/metadata/overviewworksheets/SequencingProcessFile.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/SequencingProcessFile.md)
+* [https://ghga-de.github.io/docs/metadata/worksheets/SequencingProcessFile.md](https://ghga-de.github.io/docs/metadata/worksheets/SequencingProcessFile.md)
Dataset worksheet: -* [https://ghga-de.github.io/docs/metadata/overviewworksheets/Dataset.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/Dataset.md)
+* [https://ghga-de.github.io/docs/metadata/worksheets/Dataset.md](https://ghga-de.github.io/docs/metadata/worksheets/Dataset.md)
DataAccessPolicy worksheet: -* [https://ghga-de.github.io/docs/metadata/overviewworksheets/DataAccessPolicy.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/DataAccessPolicy.md)
+* [https://ghga-de.github.io/docs/metadata/worksheets/DataAccessPolicy.md](https://ghga-de.github.io/docs/metadata/worksheets/DataAccessPolicy.md)
DataAccessCommittee worksheet: -* [https://ghga-de.github.io/docs/metadata/overviewworksheets/DataAccessCommittee.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/DataAccessCommittee.md)
+* [https://ghga-de.github.io/docs/metadata/worksheets/DataAccessCommittee.md](https://ghga-de.github.io/docs/metadata/worksheets/DataAccessCommittee.md)
Publication worksheet: -* [https://ghga-de.github.io/docs/metadata/overviewworksheets/Publication.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/Publication.md)
+* [https://ghga-de.github.io/docs/metadata/worksheets/Publication.md](https://ghga-de.github.io/docs/metadata/worksheets/Publication.md)
diff --git a/docs/templates/.submission_documentation_template.md.jinja b/docs/templates/.submission_documentation_template.md.jinja index 153687a..6baa150 100644 --- a/docs/templates/.submission_documentation_template.md.jinja +++ b/docs/templates/.submission_documentation_template.md.jinja @@ -10,7 +10,7 @@ Submission components: {{ sheet }} worksheet: -* [https://ghga-de.github.io/docs/metadata/overviewworksheets/{{ sheet }}.md](https://ghga-de.github.io/docs/metadata/overviewworksheets/{{ sheet }}.md)
+* [https://ghga-de.github.io/docs/metadata/worksheets/{{ sheet }}.md](https://ghga-de.github.io/docs/metadata/worksheets/{{ sheet }}.md)
{% endfor %} From 03e68823efa38e7784d32f51dcd2f32a7e17447d Mon Sep 17 00:00:00 2001 From: sbilge Date: Fri, 1 Sep 2023 15:11:48 +0000 Subject: [PATCH 11/15] submission doc content goes to submission folder --- docs/metadata/entities.md | 2 +- docs/metadata/modules.md | 15 --------------- docs/metadata/submission/submission.md | 1 + .../submission_sheets.md} | 0 mkdocs.yaml | 2 +- scripts/update_metadata_docs.py | 4 ++-- 6 files changed, 5 insertions(+), 19 deletions(-) delete mode 100644 docs/metadata/modules.md create mode 100644 docs/metadata/submission/submission.md rename docs/metadata/{submission.md => submission/submission_sheets.md} (100%) diff --git a/docs/metadata/entities.md b/docs/metadata/entities.md index 7e17372..c3b3e4e 100644 --- a/docs/metadata/entities.md +++ b/docs/metadata/entities.md @@ -85,7 +85,7 @@ GHGA presents its content to potential data requesters and submitters with the * The *Dataset* entity is aimed at capturing relevant information about a dataset itself. The data submitter can provide a description and a title for the dataset. The main purpose of this entity is to link a dataset to the related study, experiments, samples, analysis, files and data access policies. These links must be provided on the submission of data, either through automatic linking with respect to the *Data Access Committee*, or the data submitter. -All properties captured in the *Dataset* entity are required for the functionality of GHGA and are therefore mandatory. The only exception is the analysis alias, which only needs to be provided if an analysis is to be submitted. A title and description can be indexed by the database in order to make the GHGA Data Portal searchable for a specific dataset. In addition, the links to study, experiment, samples, analysis (if avalaible) and files are necessary to provide a data requester with all relevant data and metadata associated with a dataset. This also ensures reusability in the light of the FAIR Data Principles. 
+All properties captured in the *Dataset* entity are required for the functionality of GHGA and are therefore mandatory. The only exception is the analysis alias, which only needs to be provided if an analysis is to be submitted. A title and description can be indexed by the database in order to make the GHGA Data Portal searchable for a specific dataset. In addition, the links to study, experiment, samples, analysis (if available) and files are necessary to provide a data requester with all relevant data and metadata associated with a dataset. This also ensures reusability in the light of the FAIR Data Principles. ## **Data Access Policy and Committee** diff --git a/docs/metadata/modules.md b/docs/metadata/modules.md deleted file mode 100644 index e8fdfc0..0000000 --- a/docs/metadata/modules.md +++ /dev/null @@ -1,15 +0,0 @@ -# **Modules in the GHGA Metadata Model** - -- **Basic Module**: The Basic Module is the fundamental module in the GHGA Metadata Schema. It covers the minimal amount of information that must be included in a successful submission. - -- **Sample Module**: Every Basic Module can be linked to one or more Sample Modules. This module contains information relating to the sample that was later sequenced in a sequencing experiment. - -- **Phenotype Module**: One Sample Module can have one or more Phenotype Modules. This module can be used when a sample originated from a ‘Biospecimen’ or an ‘Individual’ and thus allows to group several Sample Modules based on the sample origin. In addition, the Phenotype Module captures detailed information about phenotypes or individual demographics. - -- **Sequencing Module**: One Sample Module can also be linked to one or more Sequencing Modules. The Sequencing Module captures information about the ‘Sequencing Process’, such as the sequencing and library preparation protocols. 
- -- **Data Use Conditions Module**: The Data Use Conditions Module captures in granular detail what restrictions and use conditions are associated with a Data Access Policy. This section also captures the Data Access Committee that enforces the Data Access Policy requirements. - -- **Dataset Module**: The Dataset Module contains the ‘Dataset’ entity, which is a collection of one or more Files from one or more Modules. All Files within the Dataset Module are subject to the Data Access Policy that is captured in the Data Use Conditions Module. One Dataset Module can only be linked to one Data Use Conditions Module. - -- **Analysis Module**: A dataset can have one or more Analysis Modules where each Analysis Module links to one or more files as input to the Analysis, one or more files as output to the Analysis, and the ‘Analysis Process’ that captures how the analysis was performed. diff --git a/docs/metadata/submission/submission.md b/docs/metadata/submission/submission.md new file mode 100644 index 0000000..1812664 --- /dev/null +++ b/docs/metadata/submission/submission.md @@ -0,0 +1 @@ +# Guideline - How to do metadata submission diff --git a/docs/metadata/submission.md b/docs/metadata/submission/submission_sheets.md similarity index 100% rename from docs/metadata/submission.md rename to docs/metadata/submission/submission_sheets.md diff --git a/mkdocs.yaml b/mkdocs.yaml index c38455e..8760874 100644 --- a/mkdocs.yaml +++ b/mkdocs.yaml @@ -81,7 +81,7 @@ nav: - "GHGA Metadata Model": - "Overview": metadata/overview.md - metadata/concepts.md - - metadata/modules.md - "Entities & Attributes": metadata/entities.md + - "Metadata Submission": metadata/submission/submission.md - "Tools": - "GHGA Transpiler": transpiler/transpiler.md diff --git a/scripts/update_metadata_docs.py b/scripts/update_metadata_docs.py index 06355eb..f327728 100644 --- a/scripts/update_metadata_docs.py +++ b/scripts/update_metadata_docs.py @@ -28,7 +28,7 @@ HERE = Path(__file__).parent.resolve() 
ROOT = HERE.parent SHEET_DIR = ROOT / "docs" / "metadata" / "worksheets" -SUBMISSION_DIR = ROOT / "docs" / "metadata" +SUBMISSION_DIR = ROOT / "docs" / "metadata" / "submission" CONFIG_PATH = ROOT / ".workbook_config.yaml" TEMPLATE_DIR = ROOT / "docs" / "templates" @@ -203,7 +203,7 @@ def generate_submission_doc(config: Config, load_ops: LoadOperations): create_doc_file( SUBMISSION_DIR, - "submission", + "submission_sheets", generate_markdown( load_ops, ".submission_documentation_template.md.jinja", submission_content ), From 1c4042458616f7c951ebf26ce9e215789aebe10a Mon Sep 17 00:00:00 2001 From: sbilge Date: Fri, 1 Sep 2023 16:27:19 +0000 Subject: [PATCH 12/15] documentation structure changes --- .../submission/ghga_submission_full.xlsx.md | 86 +++++ .../ghga_submission_individual.xlsx.md | 54 +++ .../ghga_submission_minimal.xlsx.md | 30 ++ .../submission/ghga_submission_sample.xlsx.md | 42 +++ .../submission/ghga_submission_seq.xlsx.md | 74 ++++ docs/metadata/submission/submission_sheets.md | 350 ------------------ ...submission_documentation_template.md.jinja | 8 +- mkdocs.yaml | 9 +- scripts/update_metadata_docs.py | 19 +- 9 files changed, 306 insertions(+), 366 deletions(-) create mode 100644 docs/metadata/submission/ghga_submission_full.xlsx.md create mode 100644 docs/metadata/submission/ghga_submission_individual.xlsx.md create mode 100644 docs/metadata/submission/ghga_submission_minimal.xlsx.md create mode 100644 docs/metadata/submission/ghga_submission_sample.xlsx.md create mode 100644 docs/metadata/submission/ghga_submission_seq.xlsx.md delete mode 100644 docs/metadata/submission/submission_sheets.md diff --git a/docs/metadata/submission/ghga_submission_full.xlsx.md b/docs/metadata/submission/ghga_submission_full.xlsx.md new file mode 100644 index 0000000..e226574 --- /dev/null +++ b/docs/metadata/submission/ghga_submission_full.xlsx.md @@ -0,0 +1,86 @@ +# Submission + +## ghga_submission_full.xlsx
+ +Submission components: +---------------------- + + +Study worksheet: +* [https://ghga-de.github.io/docs/metadata/worksheets/Study.md](https://ghga-de.github.io/docs/metadata/worksheets/Study.md)
+ + +StudyFile worksheet: +* [https://ghga-de.github.io/docs/metadata/worksheets/StudyFile.md](https://ghga-de.github.io/docs/metadata/worksheets/StudyFile.md)
+ + +Sample worksheet: +* [https://ghga-de.github.io/docs/metadata/worksheets/Sample.md](https://ghga-de.github.io/docs/metadata/worksheets/Sample.md)
+ + +SampleFile worksheet: +* [https://ghga-de.github.io/docs/metadata/worksheets/SampleFile.md](https://ghga-de.github.io/docs/metadata/worksheets/SampleFile.md)
+ + +Condition worksheet: +* [https://ghga-de.github.io/docs/metadata/worksheets/Condition.md](https://ghga-de.github.io/docs/metadata/worksheets/Condition.md)
+ + +Biospecimen worksheet: +* [https://ghga-de.github.io/docs/metadata/worksheets/Biospecimen.md](https://ghga-de.github.io/docs/metadata/worksheets/Biospecimen.md)
+ + +Individual worksheet: +* [https://ghga-de.github.io/docs/metadata/worksheets/Individual.md](https://ghga-de.github.io/docs/metadata/worksheets/Individual.md)
+ + +Trio worksheet: +* [https://ghga-de.github.io/docs/metadata/worksheets/Trio.md](https://ghga-de.github.io/docs/metadata/worksheets/Trio.md)
+ + +LibraryPreparationProtocol worksheet: +* [https://ghga-de.github.io/docs/metadata/worksheets/LibraryPreparationProtocol.md](https://ghga-de.github.io/docs/metadata/worksheets/LibraryPreparationProtocol.md)
+ + +SequencingProtocol worksheet: +* [https://ghga-de.github.io/docs/metadata/worksheets/SequencingProtocol.md](https://ghga-de.github.io/docs/metadata/worksheets/SequencingProtocol.md)
+ + +SequencingExperiment worksheet: +* [https://ghga-de.github.io/docs/metadata/worksheets/SequencingExperiment.md](https://ghga-de.github.io/docs/metadata/worksheets/SequencingExperiment.md)
+ + +SequencingProcess worksheet: +* [https://ghga-de.github.io/docs/metadata/worksheets/SequencingProcess.md](https://ghga-de.github.io/docs/metadata/worksheets/SequencingProcess.md)
+ + +SequencingProcessFile worksheet: +* [https://ghga-de.github.io/docs/metadata/worksheets/SequencingProcessFile.md](https://ghga-de.github.io/docs/metadata/worksheets/SequencingProcessFile.md)
+ + +Analysis worksheet: +* [https://ghga-de.github.io/docs/metadata/worksheets/Analysis.md](https://ghga-de.github.io/docs/metadata/worksheets/Analysis.md)
+ + +AnalysisProcess worksheet: +* [https://ghga-de.github.io/docs/metadata/worksheets/AnalysisProcess.md](https://ghga-de.github.io/docs/metadata/worksheets/AnalysisProcess.md)
+ + +AnalysisProcessOutputFile worksheet: +* [https://ghga-de.github.io/docs/metadata/worksheets/AnalysisProcessOutputFile.md](https://ghga-de.github.io/docs/metadata/worksheets/AnalysisProcessOutputFile.md)
+ + +Dataset worksheet: +* [https://ghga-de.github.io/docs/metadata/worksheets/Dataset.md](https://ghga-de.github.io/docs/metadata/worksheets/Dataset.md)
+ + +DataAccessPolicy worksheet: +* [https://ghga-de.github.io/docs/metadata/worksheets/DataAccessPolicy.md](https://ghga-de.github.io/docs/metadata/worksheets/DataAccessPolicy.md)
+ + +DataAccessCommittee worksheet: +* [https://ghga-de.github.io/docs/metadata/worksheets/DataAccessCommittee.md](https://ghga-de.github.io/docs/metadata/worksheets/DataAccessCommittee.md)
+ + +Publication worksheet: +* [https://ghga-de.github.io/docs/metadata/worksheets/Publication.md](https://ghga-de.github.io/docs/metadata/worksheets/Publication.md)
diff --git a/docs/metadata/submission/ghga_submission_individual.xlsx.md b/docs/metadata/submission/ghga_submission_individual.xlsx.md new file mode 100644 index 0000000..898ed1d --- /dev/null +++ b/docs/metadata/submission/ghga_submission_individual.xlsx.md @@ -0,0 +1,54 @@ +# Submission + +## ghga_submission_individual.xlsx
+ +Submission components: +---------------------- + + +Study worksheet: +* [https://ghga-de.github.io/docs/metadata/worksheets/Study.md](https://ghga-de.github.io/docs/metadata/worksheets/Study.md)
+ + +StudyFile worksheet: +* [https://ghga-de.github.io/docs/metadata/worksheets/StudyFile.md](https://ghga-de.github.io/docs/metadata/worksheets/StudyFile.md)
+ + +Sample worksheet: +* [https://ghga-de.github.io/docs/metadata/worksheets/Sample.md](https://ghga-de.github.io/docs/metadata/worksheets/Sample.md)
+ + +SampleFile worksheet: +* [https://ghga-de.github.io/docs/metadata/worksheets/SampleFile.md](https://ghga-de.github.io/docs/metadata/worksheets/SampleFile.md)
+ + +Condition worksheet: +* [https://ghga-de.github.io/docs/metadata/worksheets/Condition.md](https://ghga-de.github.io/docs/metadata/worksheets/Condition.md)
+ + +Biospecimen worksheet: +* [https://ghga-de.github.io/docs/metadata/worksheets/Biospecimen.md](https://ghga-de.github.io/docs/metadata/worksheets/Biospecimen.md)
+ + +Individual worksheet: +* [https://ghga-de.github.io/docs/metadata/worksheets/Individual.md](https://ghga-de.github.io/docs/metadata/worksheets/Individual.md)
+ + +Trio worksheet: +* [https://ghga-de.github.io/docs/metadata/worksheets/Trio.md](https://ghga-de.github.io/docs/metadata/worksheets/Trio.md)
+ + +Dataset worksheet: +* [https://ghga-de.github.io/docs/metadata/worksheets/Dataset.md](https://ghga-de.github.io/docs/metadata/worksheets/Dataset.md)
+ + +DataAccessPolicy worksheet: +* [https://ghga-de.github.io/docs/metadata/worksheets/DataAccessPolicy.md](https://ghga-de.github.io/docs/metadata/worksheets/DataAccessPolicy.md)
+ + +DataAccessCommittee worksheet: +* [https://ghga-de.github.io/docs/metadata/worksheets/DataAccessCommittee.md](https://ghga-de.github.io/docs/metadata/worksheets/DataAccessCommittee.md)
+ + +Publication worksheet: +* [https://ghga-de.github.io/docs/metadata/worksheets/Publication.md](https://ghga-de.github.io/docs/metadata/worksheets/Publication.md)
diff --git a/docs/metadata/submission/ghga_submission_minimal.xlsx.md b/docs/metadata/submission/ghga_submission_minimal.xlsx.md new file mode 100644 index 0000000..177b448 --- /dev/null +++ b/docs/metadata/submission/ghga_submission_minimal.xlsx.md @@ -0,0 +1,30 @@ +# Submission + +## ghga_submission_minimal.xlsx
+ +Submission components: +---------------------- + + +Study worksheet: +* [https://ghga-de.github.io/docs/metadata/worksheets/Study.md](https://ghga-de.github.io/docs/metadata/worksheets/Study.md)
+ + +StudyFile worksheet: +* [https://ghga-de.github.io/docs/metadata/worksheets/StudyFile.md](https://ghga-de.github.io/docs/metadata/worksheets/StudyFile.md)
+ + +Dataset worksheet: +* [https://ghga-de.github.io/docs/metadata/worksheets/Dataset.md](https://ghga-de.github.io/docs/metadata/worksheets/Dataset.md)
+ + +DataAccessPolicy worksheet: +* [https://ghga-de.github.io/docs/metadata/worksheets/DataAccessPolicy.md](https://ghga-de.github.io/docs/metadata/worksheets/DataAccessPolicy.md)
+ + +DataAccessCommittee worksheet: +* [https://ghga-de.github.io/docs/metadata/worksheets/DataAccessCommittee.md](https://ghga-de.github.io/docs/metadata/worksheets/DataAccessCommittee.md)
+ + +Publication worksheet: +* [https://ghga-de.github.io/docs/metadata/worksheets/Publication.md](https://ghga-de.github.io/docs/metadata/worksheets/Publication.md)
diff --git a/docs/metadata/submission/ghga_submission_sample.xlsx.md b/docs/metadata/submission/ghga_submission_sample.xlsx.md new file mode 100644 index 0000000..2e08f64 --- /dev/null +++ b/docs/metadata/submission/ghga_submission_sample.xlsx.md @@ -0,0 +1,42 @@ +# Submission + +## ghga_submission_sample.xlsx
+ +Submission components: +---------------------- + + +Study worksheet: +* [https://ghga-de.github.io/docs/metadata/worksheets/Study.md](https://ghga-de.github.io/docs/metadata/worksheets/Study.md)
+ + +StudyFile worksheet: +* [https://ghga-de.github.io/docs/metadata/worksheets/StudyFile.md](https://ghga-de.github.io/docs/metadata/worksheets/StudyFile.md)
+ + +Sample worksheet: +* [https://ghga-de.github.io/docs/metadata/worksheets/Sample.md](https://ghga-de.github.io/docs/metadata/worksheets/Sample.md)
+ + +SampleFile worksheet: +* [https://ghga-de.github.io/docs/metadata/worksheets/SampleFile.md](https://ghga-de.github.io/docs/metadata/worksheets/SampleFile.md)
+ + +Condition worksheet: +* [https://ghga-de.github.io/docs/metadata/worksheets/Condition.md](https://ghga-de.github.io/docs/metadata/worksheets/Condition.md)
+ + +Dataset worksheet: +* [https://ghga-de.github.io/docs/metadata/worksheets/Dataset.md](https://ghga-de.github.io/docs/metadata/worksheets/Dataset.md)
+ + +DataAccessPolicy worksheet: +* [https://ghga-de.github.io/docs/metadata/worksheets/DataAccessPolicy.md](https://ghga-de.github.io/docs/metadata/worksheets/DataAccessPolicy.md)
+ + +DataAccessCommittee worksheet: +* [https://ghga-de.github.io/docs/metadata/worksheets/DataAccessCommittee.md](https://ghga-de.github.io/docs/metadata/worksheets/DataAccessCommittee.md)
+ + +Publication worksheet: +* [https://ghga-de.github.io/docs/metadata/worksheets/Publication.md](https://ghga-de.github.io/docs/metadata/worksheets/Publication.md)
diff --git a/docs/metadata/submission/ghga_submission_seq.xlsx.md b/docs/metadata/submission/ghga_submission_seq.xlsx.md new file mode 100644 index 0000000..855174c --- /dev/null +++ b/docs/metadata/submission/ghga_submission_seq.xlsx.md @@ -0,0 +1,74 @@ +# Submission + +## ghga_submission_seq.xlsx
+ +Submission components: +---------------------- + + +Study worksheet: +* [https://ghga-de.github.io/docs/metadata/worksheets/Study.md](https://ghga-de.github.io/docs/metadata/worksheets/Study.md)
+ + +StudyFile worksheet: +* [https://ghga-de.github.io/docs/metadata/worksheets/StudyFile.md](https://ghga-de.github.io/docs/metadata/worksheets/StudyFile.md)
+ + +Sample worksheet: +* [https://ghga-de.github.io/docs/metadata/worksheets/Sample.md](https://ghga-de.github.io/docs/metadata/worksheets/Sample.md)
+ + +SampleFile worksheet: +* [https://ghga-de.github.io/docs/metadata/worksheets/SampleFile.md](https://ghga-de.github.io/docs/metadata/worksheets/SampleFile.md)
+ + +Condition worksheet: +* [https://ghga-de.github.io/docs/metadata/worksheets/Condition.md](https://ghga-de.github.io/docs/metadata/worksheets/Condition.md)
+ + +Biospecimen worksheet: +* [https://ghga-de.github.io/docs/metadata/worksheets/Biospecimen.md](https://ghga-de.github.io/docs/metadata/worksheets/Biospecimen.md)
+ + +Individual worksheet: +* [https://ghga-de.github.io/docs/metadata/worksheets/Individual.md](https://ghga-de.github.io/docs/metadata/worksheets/Individual.md)
+ + +Trio worksheet: +* [https://ghga-de.github.io/docs/metadata/worksheets/Trio.md](https://ghga-de.github.io/docs/metadata/worksheets/Trio.md)
+ + +LibraryPreparationProtocol worksheet: +* [https://ghga-de.github.io/docs/metadata/worksheets/LibraryPreparationProtocol.md](https://ghga-de.github.io/docs/metadata/worksheets/LibraryPreparationProtocol.md)
+ + +SequencingProtocol worksheet: +* [https://ghga-de.github.io/docs/metadata/worksheets/SequencingProtocol.md](https://ghga-de.github.io/docs/metadata/worksheets/SequencingProtocol.md)
+ + +SequencingExperiment worksheet: +* [https://ghga-de.github.io/docs/metadata/worksheets/SequencingExperiment.md](https://ghga-de.github.io/docs/metadata/worksheets/SequencingExperiment.md)
+ + +SequencingProcess worksheet: +* [https://ghga-de.github.io/docs/metadata/worksheets/SequencingProcess.md](https://ghga-de.github.io/docs/metadata/worksheets/SequencingProcess.md)
+ + +SequencingProcessFile worksheet: +* [https://ghga-de.github.io/docs/metadata/worksheets/SequencingProcessFile.md](https://ghga-de.github.io/docs/metadata/worksheets/SequencingProcessFile.md)
+ + +Dataset worksheet: +* [https://ghga-de.github.io/docs/metadata/worksheets/Dataset.md](https://ghga-de.github.io/docs/metadata/worksheets/Dataset.md)
+ + +DataAccessPolicy worksheet: +* [https://ghga-de.github.io/docs/metadata/worksheets/DataAccessPolicy.md](https://ghga-de.github.io/docs/metadata/worksheets/DataAccessPolicy.md)
+ + +DataAccessCommittee worksheet: +* [https://ghga-de.github.io/docs/metadata/worksheets/DataAccessCommittee.md](https://ghga-de.github.io/docs/metadata/worksheets/DataAccessCommittee.md)
+ + +Publication worksheet: +* [https://ghga-de.github.io/docs/metadata/worksheets/Publication.md](https://ghga-de.github.io/docs/metadata/worksheets/Publication.md)
diff --git a/docs/metadata/submission/submission_sheets.md b/docs/metadata/submission/submission_sheets.md deleted file mode 100644 index 37d4e6f..0000000 --- a/docs/metadata/submission/submission_sheets.md +++ /dev/null @@ -1,350 +0,0 @@ -# Submission - -## ghga_submission_full.xlsx
- -Submission components: ----------------------- - - -Study worksheet: - -* [https://ghga-de.github.io/docs/metadata/worksheets/Study.md](https://ghga-de.github.io/docs/metadata/worksheets/Study.md)
- - -StudyFile worksheet: - -* [https://ghga-de.github.io/docs/metadata/worksheets/StudyFile.md](https://ghga-de.github.io/docs/metadata/worksheets/StudyFile.md)
- - -Sample worksheet: - -* [https://ghga-de.github.io/docs/metadata/worksheets/Sample.md](https://ghga-de.github.io/docs/metadata/worksheets/Sample.md)
- - -SampleFile worksheet: - -* [https://ghga-de.github.io/docs/metadata/worksheets/SampleFile.md](https://ghga-de.github.io/docs/metadata/worksheets/SampleFile.md)
- - -Condition worksheet: - -* [https://ghga-de.github.io/docs/metadata/worksheets/Condition.md](https://ghga-de.github.io/docs/metadata/worksheets/Condition.md)
- - -Biospecimen worksheet: - -* [https://ghga-de.github.io/docs/metadata/worksheets/Biospecimen.md](https://ghga-de.github.io/docs/metadata/worksheets/Biospecimen.md)
- - -Individual worksheet: - -* [https://ghga-de.github.io/docs/metadata/worksheets/Individual.md](https://ghga-de.github.io/docs/metadata/worksheets/Individual.md)
- - -Trio worksheet: - -* [https://ghga-de.github.io/docs/metadata/worksheets/Trio.md](https://ghga-de.github.io/docs/metadata/worksheets/Trio.md)
- - -LibraryPreparationProtocol worksheet: - -* [https://ghga-de.github.io/docs/metadata/worksheets/LibraryPreparationProtocol.md](https://ghga-de.github.io/docs/metadata/worksheets/LibraryPreparationProtocol.md)
- - -SequencingProtocol worksheet: - -* [https://ghga-de.github.io/docs/metadata/worksheets/SequencingProtocol.md](https://ghga-de.github.io/docs/metadata/worksheets/SequencingProtocol.md)
- - -SequencingExperiment worksheet: - -* [https://ghga-de.github.io/docs/metadata/worksheets/SequencingExperiment.md](https://ghga-de.github.io/docs/metadata/worksheets/SequencingExperiment.md)
- - -SequencingProcess worksheet: - -* [https://ghga-de.github.io/docs/metadata/worksheets/SequencingProcess.md](https://ghga-de.github.io/docs/metadata/worksheets/SequencingProcess.md)
- - -SequencingProcessFile worksheet: - -* [https://ghga-de.github.io/docs/metadata/worksheets/SequencingProcessFile.md](https://ghga-de.github.io/docs/metadata/worksheets/SequencingProcessFile.md)
- - -Analysis worksheet: - -* [https://ghga-de.github.io/docs/metadata/worksheets/Analysis.md](https://ghga-de.github.io/docs/metadata/worksheets/Analysis.md)
- - -AnalysisProcess worksheet: - -* [https://ghga-de.github.io/docs/metadata/worksheets/AnalysisProcess.md](https://ghga-de.github.io/docs/metadata/worksheets/AnalysisProcess.md)
- - -AnalysisProcessOutputFile worksheet: - -* [https://ghga-de.github.io/docs/metadata/worksheets/AnalysisProcessOutputFile.md](https://ghga-de.github.io/docs/metadata/worksheets/AnalysisProcessOutputFile.md)
- - -Dataset worksheet: - -* [https://ghga-de.github.io/docs/metadata/worksheets/Dataset.md](https://ghga-de.github.io/docs/metadata/worksheets/Dataset.md)
- - -DataAccessPolicy worksheet: - -* [https://ghga-de.github.io/docs/metadata/worksheets/DataAccessPolicy.md](https://ghga-de.github.io/docs/metadata/worksheets/DataAccessPolicy.md)
- - -DataAccessCommittee worksheet: - -* [https://ghga-de.github.io/docs/metadata/worksheets/DataAccessCommittee.md](https://ghga-de.github.io/docs/metadata/worksheets/DataAccessCommittee.md)
- - -Publication worksheet: - -* [https://ghga-de.github.io/docs/metadata/worksheets/Publication.md](https://ghga-de.github.io/docs/metadata/worksheets/Publication.md)
- - -## ghga_submission_minimal.xlsx
- -Submission components: ----------------------- - - -Study worksheet: - -* [https://ghga-de.github.io/docs/metadata/worksheets/Study.md](https://ghga-de.github.io/docs/metadata/worksheets/Study.md)
- - -StudyFile worksheet: - -* [https://ghga-de.github.io/docs/metadata/worksheets/StudyFile.md](https://ghga-de.github.io/docs/metadata/worksheets/StudyFile.md)
- - -Dataset worksheet: - -* [https://ghga-de.github.io/docs/metadata/worksheets/Dataset.md](https://ghga-de.github.io/docs/metadata/worksheets/Dataset.md)
- - -DataAccessPolicy worksheet: - -* [https://ghga-de.github.io/docs/metadata/worksheets/DataAccessPolicy.md](https://ghga-de.github.io/docs/metadata/worksheets/DataAccessPolicy.md)
- - -DataAccessCommittee worksheet: - -* [https://ghga-de.github.io/docs/metadata/worksheets/DataAccessCommittee.md](https://ghga-de.github.io/docs/metadata/worksheets/DataAccessCommittee.md)
- - -Publication worksheet: - -* [https://ghga-de.github.io/docs/metadata/worksheets/Publication.md](https://ghga-de.github.io/docs/metadata/worksheets/Publication.md)
- - -## ghga_submission_sample.xlsx
- -Submission components: ----------------------- - - -Study worksheet: - -* [https://ghga-de.github.io/docs/metadata/worksheets/Study.md](https://ghga-de.github.io/docs/metadata/worksheets/Study.md)
- - -StudyFile worksheet: - -* [https://ghga-de.github.io/docs/metadata/worksheets/StudyFile.md](https://ghga-de.github.io/docs/metadata/worksheets/StudyFile.md)
- - -Sample worksheet: - -* [https://ghga-de.github.io/docs/metadata/worksheets/Sample.md](https://ghga-de.github.io/docs/metadata/worksheets/Sample.md)
- - -SampleFile worksheet: - -* [https://ghga-de.github.io/docs/metadata/worksheets/SampleFile.md](https://ghga-de.github.io/docs/metadata/worksheets/SampleFile.md)
- - -Condition worksheet: - -* [https://ghga-de.github.io/docs/metadata/worksheets/Condition.md](https://ghga-de.github.io/docs/metadata/worksheets/Condition.md)
- - -Dataset worksheet: - -* [https://ghga-de.github.io/docs/metadata/worksheets/Dataset.md](https://ghga-de.github.io/docs/metadata/worksheets/Dataset.md)
- - -DataAccessPolicy worksheet: - -* [https://ghga-de.github.io/docs/metadata/worksheets/DataAccessPolicy.md](https://ghga-de.github.io/docs/metadata/worksheets/DataAccessPolicy.md)
- - -DataAccessCommittee worksheet: - -* [https://ghga-de.github.io/docs/metadata/worksheets/DataAccessCommittee.md](https://ghga-de.github.io/docs/metadata/worksheets/DataAccessCommittee.md)
- - -Publication worksheet: - -* [https://ghga-de.github.io/docs/metadata/worksheets/Publication.md](https://ghga-de.github.io/docs/metadata/worksheets/Publication.md)
- - -## ghga_submission_individual.xlsx
- -Submission components: ----------------------- - - -Study worksheet: - -* [https://ghga-de.github.io/docs/metadata/worksheets/Study.md](https://ghga-de.github.io/docs/metadata/worksheets/Study.md)
- - -StudyFile worksheet: - -* [https://ghga-de.github.io/docs/metadata/worksheets/StudyFile.md](https://ghga-de.github.io/docs/metadata/worksheets/StudyFile.md)
- - -Sample worksheet: - -* [https://ghga-de.github.io/docs/metadata/worksheets/Sample.md](https://ghga-de.github.io/docs/metadata/worksheets/Sample.md)
- - -SampleFile worksheet: - -* [https://ghga-de.github.io/docs/metadata/worksheets/SampleFile.md](https://ghga-de.github.io/docs/metadata/worksheets/SampleFile.md)
- - -Condition worksheet: - -* [https://ghga-de.github.io/docs/metadata/worksheets/Condition.md](https://ghga-de.github.io/docs/metadata/worksheets/Condition.md)
- - -Biospecimen worksheet: - -* [https://ghga-de.github.io/docs/metadata/worksheets/Biospecimen.md](https://ghga-de.github.io/docs/metadata/worksheets/Biospecimen.md)
- - -Individual worksheet: - -* [https://ghga-de.github.io/docs/metadata/worksheets/Individual.md](https://ghga-de.github.io/docs/metadata/worksheets/Individual.md)
- - -Trio worksheet: - -* [https://ghga-de.github.io/docs/metadata/worksheets/Trio.md](https://ghga-de.github.io/docs/metadata/worksheets/Trio.md)
- - -Dataset worksheet: - -* [https://ghga-de.github.io/docs/metadata/worksheets/Dataset.md](https://ghga-de.github.io/docs/metadata/worksheets/Dataset.md)
- - -DataAccessPolicy worksheet: - -* [https://ghga-de.github.io/docs/metadata/worksheets/DataAccessPolicy.md](https://ghga-de.github.io/docs/metadata/worksheets/DataAccessPolicy.md)
- - -DataAccessCommittee worksheet: - -* [https://ghga-de.github.io/docs/metadata/worksheets/DataAccessCommittee.md](https://ghga-de.github.io/docs/metadata/worksheets/DataAccessCommittee.md)
- - -Publication worksheet: - -* [https://ghga-de.github.io/docs/metadata/worksheets/Publication.md](https://ghga-de.github.io/docs/metadata/worksheets/Publication.md)
- - -## ghga_submission_seq.xlsx
- -Submission components: ----------------------- - - -Study worksheet: - -* [https://ghga-de.github.io/docs/metadata/worksheets/Study.md](https://ghga-de.github.io/docs/metadata/worksheets/Study.md)
- - -StudyFile worksheet: - -* [https://ghga-de.github.io/docs/metadata/worksheets/StudyFile.md](https://ghga-de.github.io/docs/metadata/worksheets/StudyFile.md)
- - -Sample worksheet: - -* [https://ghga-de.github.io/docs/metadata/worksheets/Sample.md](https://ghga-de.github.io/docs/metadata/worksheets/Sample.md)
- - -SampleFile worksheet: - -* [https://ghga-de.github.io/docs/metadata/worksheets/SampleFile.md](https://ghga-de.github.io/docs/metadata/worksheets/SampleFile.md)
- - -Condition worksheet: - -* [https://ghga-de.github.io/docs/metadata/worksheets/Condition.md](https://ghga-de.github.io/docs/metadata/worksheets/Condition.md)
- - -Biospecimen worksheet: - -* [https://ghga-de.github.io/docs/metadata/worksheets/Biospecimen.md](https://ghga-de.github.io/docs/metadata/worksheets/Biospecimen.md)
- - -Individual worksheet: - -* [https://ghga-de.github.io/docs/metadata/worksheets/Individual.md](https://ghga-de.github.io/docs/metadata/worksheets/Individual.md)
- - -Trio worksheet: - -* [https://ghga-de.github.io/docs/metadata/worksheets/Trio.md](https://ghga-de.github.io/docs/metadata/worksheets/Trio.md)
- - -LibraryPreparationProtocol worksheet: - -* [https://ghga-de.github.io/docs/metadata/worksheets/LibraryPreparationProtocol.md](https://ghga-de.github.io/docs/metadata/worksheets/LibraryPreparationProtocol.md)
- - -SequencingProtocol worksheet: - -* [https://ghga-de.github.io/docs/metadata/worksheets/SequencingProtocol.md](https://ghga-de.github.io/docs/metadata/worksheets/SequencingProtocol.md)
- - -SequencingExperiment worksheet: - -* [https://ghga-de.github.io/docs/metadata/worksheets/SequencingExperiment.md](https://ghga-de.github.io/docs/metadata/worksheets/SequencingExperiment.md)
- - -SequencingProcess worksheet: - -* [https://ghga-de.github.io/docs/metadata/worksheets/SequencingProcess.md](https://ghga-de.github.io/docs/metadata/worksheets/SequencingProcess.md)
- - -SequencingProcessFile worksheet: - -* [https://ghga-de.github.io/docs/metadata/worksheets/SequencingProcessFile.md](https://ghga-de.github.io/docs/metadata/worksheets/SequencingProcessFile.md)
- - -Dataset worksheet: - -* [https://ghga-de.github.io/docs/metadata/worksheets/Dataset.md](https://ghga-de.github.io/docs/metadata/worksheets/Dataset.md)
- - -DataAccessPolicy worksheet: - -* [https://ghga-de.github.io/docs/metadata/worksheets/DataAccessPolicy.md](https://ghga-de.github.io/docs/metadata/worksheets/DataAccessPolicy.md)
- - -DataAccessCommittee worksheet: - -* [https://ghga-de.github.io/docs/metadata/worksheets/DataAccessCommittee.md](https://ghga-de.github.io/docs/metadata/worksheets/DataAccessCommittee.md)
- - -Publication worksheet: - -* [https://ghga-de.github.io/docs/metadata/worksheets/Publication.md](https://ghga-de.github.io/docs/metadata/worksheets/Publication.md)
diff --git a/docs/templates/.submission_documentation_template.md.jinja b/docs/templates/.submission_documentation_template.md.jinja index 6baa150..1600a52 100644 --- a/docs/templates/.submission_documentation_template.md.jinja +++ b/docs/templates/.submission_documentation_template.md.jinja @@ -1,17 +1,13 @@ # Submission -{% for workbook in workbooks %} -## {{ workbook.file_name }}
+## {{ file_name }}
Submission components: ---------------------- -{% for sheet in workbook.worksheets %} +{% for sheet in worksheets %} {{ sheet }} worksheet: - * [https://ghga-de.github.io/docs/metadata/worksheets/{{ sheet }}.md](https://ghga-de.github.io/docs/metadata/worksheets/{{ sheet }}.md)
{% endfor %} - -{% endfor %} diff --git a/mkdocs.yaml b/mkdocs.yaml index 8760874..c5c8fc8 100644 --- a/mkdocs.yaml +++ b/mkdocs.yaml @@ -82,6 +82,13 @@ nav: - "Overview": metadata/overview.md - metadata/concepts.md - "Entities & Attributes": metadata/entities.md - - "Metadata Submission": metadata/submission/submission.md + - "Metadata Submission": + - "Submission Overview": metadata/submission/submission.md + - "Submission Spreadsheets": + - "GHGA Full Submission": metadata/submission/ghga_submission_full.xlsx.md + - "GHGA Minimal Submission": metadata/submission/ghga_submission_minimal.xlsx.md + - "GHGA Individual Submission": metadata/submission/ghga_submission_individual.xlsx.md + - "GHGA Sample Submission": metadata/submission/ghga_submission_sample.xlsx.md + - "GHGA Sequence Submission": metadata/submission/ghga_submission_seq.xlsx.md - "Tools": - "GHGA Transpiler": transpiler/transpiler.md diff --git a/scripts/update_metadata_docs.py b/scripts/update_metadata_docs.py index f327728..addd41b 100644 --- a/scripts/update_metadata_docs.py +++ b/scripts/update_metadata_docs.py @@ -199,15 +199,16 @@ def generate_sheet_docs(config: Config, load_ops: LoadOperations): def generate_submission_doc(config: Config, load_ops: LoadOperations): """fn""" - submission_content = config.model_dump() - - create_doc_file( - SUBMISSION_DIR, - "submission_sheets", - generate_markdown( - load_ops, ".submission_documentation_template.md.jinja", submission_content - ), - ) + for workbook in config.workbooks: + create_doc_file( + SUBMISSION_DIR, + workbook.file_name, + generate_markdown( + load_ops, + ".submission_documentation_template.md.jinja", + workbook.model_dump(), + ), + ) def main(): From b076760e9cc0474fc40434f4d4f79174fbb6bf9c Mon Sep 17 00:00:00 2001 From: Karoline Mauer Date: Tue, 26 Sep 2023 12:47:12 +0000 Subject: [PATCH 13/15] fixed dataset entity description --- user_docs/metadata/entities.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git 
a/user_docs/metadata/entities.md b/user_docs/metadata/entities.md index 5180b07..0e3b133 100644 --- a/user_docs/metadata/entities.md +++ b/user_docs/metadata/entities.md @@ -79,13 +79,13 @@ The data submitter is required to provide an analysis alias, the aliases for the ## **Dataset** -GHGA presents its content to potential data requesters and submitters with the *Dataset* entity, which focuses on sharing functionality by describing the contents at a high level. Each dataset is linked to a *Data Access Policy*, which builds the legal basis for the sharing of data. One dataset has links to *Experiment* and / or *Analysis* entities to bundle all relevant data that makes a dataset by the definition of the GHGA Metadata Schema. +GHGA presents its content to potential data requesters and submitters with the *Dataset* entity, which focuses on sharing functionality by describing the contents at a high level. One dataset has links to *Files* and the *Data Access Policy* to bundle all relevant data that makes a dataset by the definition of the GHGA Metadata Schema. ### **Dataset metadata properties** -The *Dataset* entity is aimed at capturing relevant information about a dataset itself. The data submitter can provide a description and a title for the dataset. The main purpose of this entity is to link a dataset to the related study, experiments, samples, analysis, files and data access policies. These links must be provided on the submission of data, either through automatic linking with respect to the *Data Access Committee*, or the data submitter. +The *Dataset* entity is aimed at capturing relevant information about a dataset itself. The data submitter can provide a description and a title for the dataset. The main purpose of this entity is to link a dataset to the related files and *Data Access Policy*. These links must be provided on the submission of data, either through automatic linking with respect to the *Data Access Committee*, or the data submitter. 
-All properties captured in the *Dataset* entity are required for the functionality of GHGA and are therefore mandatory. The only exception is the analysis alias, which only needs to be provided if an analysis is to be submitted. A title and description can be indexed by the database in order to make the GHGA Data Portal searchable for a specific dataset. In addition, the links to study, experiment, samples, analysis (if avalaible) and files are necessary to provide a data requester with all relevant data and metadata associated with a dataset. This also ensures reusability in the light of the FAIR Data Principles. +All properties captured in the *Dataset* entity are required for the functionality of GHGA and are therefore mandatory. A title and description can be indexed by the database in order to make the GHGA Data Portal searchable for a specific dataset. In addition, the links to the *Data Access Policy* (DAP) and files are necessary to provide a data requester with the relevant data associated with a dataset and with the *Data Access Policy*, which forms the legal basis for the sharing of data. This also ensures reusability in the light of the FAIR Data Principles. ## **Data Access Policy and Committee** From efcd8ed18050b2e47a153870296781364f4a3f8a Mon Sep 17 00:00:00 2001 From: sbilge Date: Wed, 24 Jan 2024 15:12:19 +0000 Subject: [PATCH 14/15] minor fixes --- user_docs/validator/validator.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/user_docs/validator/validator.md b/user_docs/validator/validator.md index 3b9e52a..20e3275 100644 --- a/user_docs/validator/validator.md +++ b/user_docs/validator/validator.md @@ -55,13 +55,13 @@ Options: 1. To validate `data.json` against the schema `submission.yaml` and store the validation report into the file `report.json`: ``` -ghga-validator --input schema.json --schema submission.yaml --report report.json +ghga_validator --input data.json --schema submission.yaml --report report.json ``` 2.
To validate with providing the root class `Submission` for validation: ``` -ghga-validator --input schema.json --schema submission.yaml --report report.json --target-class Submission +ghga_validator --input data.json --schema submission.yaml --report report.json --target-class Submission ``` 3. To display help message: ``` -ghga-validator --help +ghga_validator --help ``` From 24b840834d23a8adadc69c246c63abf61e950937 Mon Sep 17 00:00:00 2001 From: sbilge Date: Fri, 9 Feb 2024 20:36:39 +0100 Subject: [PATCH 15/15] template check removed --- .github/workflows/check_template_files.yaml | 21 --------------------- 1 file changed, 21 deletions(-) delete mode 100644 .github/workflows/check_template_files.yaml diff --git a/.github/workflows/check_template_files.yaml b/.github/workflows/check_template_files.yaml deleted file mode 100644 index 9fb5cbf..0000000 --- a/.github/workflows/check_template_files.yaml +++ /dev/null @@ -1,21 +0,0 @@ -name: Check template files - -on: push - -jobs: - check-template-files: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v3 - - name: Set up Python 3.9 - uses: actions/setup-python@v4 - with: - python-version: 3.9 - - name: Check template files - run: | - if [ "${{ github.event.repository.name }}" == "microservice-repository-template" ] - then - echo "Skipping this test as operating on the template repo." - else - ./scripts/update_template_files.py --check - fi