Merge pull request #284 from togoid/main

release 2024-12-23
togoid · Dec 23, 2024 · c4d81b6 · c4d81b6
2 parents d23fc88 + 70f8e49
commit c4d81b6
Show file tree

Hide file tree

Showing 12 changed files with 1,680 additions and 2,067 deletions.
diff --git a/Dockerfile b/Dockerfile
@@ -6,7 +6,7 @@ RUN apt-get update -y && apt-get install -y \
       libjson-perl=4.04000-1 \
       libany-uri-escape-perl=0.01-3 \
       libwww-perl=6.61-1 \
-      curl=7.81.0-1ubuntu1.19 \
+      curl=7.81.0-1ubuntu1.20 \
       wget=1.21.2-2ubuntu1 \
       gawk=1:5.1.0-1build3 \
       python3=3.10.6-1~22.04.1 \

diff --git a/Rakefile b/Rakefile
@@ -550,51 +550,70 @@ namespace :prepare do
   task :bioproject => INPUT_BIOPROJECT_DIR do
     $stderr.puts "## Prepare input files for BioProject"
     download_lock(INPUT_BIOPROJECT_DIR) do
+      updated = false
       input_file = "#{INPUT_BIOPROJECT_DIR}/bioproject.xml"
       input_url  = "https://ftp.ncbi.nlm.nih.gov/bioproject/bioproject.xml"
       if update_input_file?(input_file, input_url)
         download_file(INPUT_BIOPROJECT_DIR, input_url)
         sh "python bin/bioproject_xml2tsv.py #{INPUT_BIOPROJECT_DIR}/bioproject.xml > #{INPUT_BIOPROJECT_DIR}/bioproject.tsv"
+        updated = true
       end
+
+      input_file = "#{INPUT_BIOPROJECT_DIR}/bioproject2biosample.tsv"
+      input_url  = "https://ddbj.nig.ac.jp/public/dblink/bioproject-biosample/bioproject2biosample.tsv"
+      if update_input_file?(input_file, input_url)
+        download_file(INPUT_BIOPROJECT_DIR, input_url)
+        updated = true
+      end
+      updated
     end
   end
 
   desc "Prepare required files for BioSample"
   task :biosample => INPUT_BIOSAMPLE_DIR do
     $stderr.puts "## Prepare input files for BioSample"
     download_lock(INPUT_BIOSAMPLE_DIR) do
+      updated = false
       input_file = "#{INPUT_BIOSAMPLE_DIR}/biosample_set.xml.gz"
       input_url  = "https://ftp.ncbi.nlm.nih.gov/biosample/biosample_set.xml.gz"
       if update_input_file?(input_file, input_url)
         download_file(INPUT_BIOSAMPLE_DIR, input_url)
         sh "gzip -dc #{input_file} > #{INPUT_BIOSAMPLE_DIR}/biosample_set.xml"
         sh "python bin/biosample_xml2tsv.py #{INPUT_BIOSAMPLE_DIR}/biosample_set.xml > #{INPUT_BIOSAMPLE_DIR}/biosample_set.tsv"
+        updated = true
       end
+      updated
     end
   end
 
   desc "Prepare required files for Cellosaurus"
   task :cellosaurus => INPUT_CELLOSAURUS_DIR do
     $stderr.puts "## Prepare input files for Cellosaurus"
     download_lock(INPUT_CELLOSAURUS_DIR) do
+      updated = false
       input_file = "#{INPUT_CELLOSAURUS_DIR}/cellosaurus.txt"
       input_url  = "https://ftp.expasy.org/databases/cellosaurus/cellosaurus.txt"
       if update_input_file?(input_file, input_url)
         download_file(INPUT_CELLOSAURUS_DIR, input_url)
+        updated = true
       end
+      updated
     end
   end
 
   desc "Prepare required files for ClinVar"
   task :clinvar => INPUT_CLINVAR_DIR do
     $stderr.puts "## Prepare input files for ClinVar"
     download_lock(INPUT_CLINVAR_DIR) do
+      updated = false
       input_file = "#{INPUT_CLINVAR_DIR}/variant_summary.txt.gz"
       input_url  = "https://ftp.ncbi.nlm.nih.gov/pub/clinvar/tab_delimited/variant_summary.txt.gz"
       if update_input_file?(input_file, input_url)
         download_file(INPUT_CLINVAR_DIR, input_url)
         sh "gzip -dc #{input_file} > #{INPUT_CLINVAR_DIR}/variant_summary.txt"
+        updated = true
       end
+      updated
     end
   end
 

diff --git a/config/bioproject-biosample/config.yaml b/config/bioproject-biosample/config.yaml
@@ -2,6 +2,7 @@ link:
   file: sample.tsv
   forward: TIO_000094
   reverse: TIO_000095
+  description: "The relationship between BioSample and BioProject is derived from the XML files provided by BioSample (biosample_set.xml) and BioProject (bioproject.xml). However, these files do not comprehensively cover all relations. If you are unable to find direct relations between your input BioSample/BioProject IDs and the corresponding BioProject/BioSample IDs, please consider using the path via SRA Experiment IDs, which may contain the expected relations."
 update:
   frequency: Monthly
-  method: curl -sS https://ddbj.nig.ac.jp/public/dblink/bioproject-biosample/bioproject2biosample.tsv
+  method: awk -F "\t" 'FNR==1{fn++} fn==1{b[$1,$2]=1; print $1 "\t" $2} fn==2&&$2=="ID"{a[$3]=$1} fn==3&&$2=="BioProject ID"&&a[$3]&&!b[a[$3],$1]{print a[$3] "\t" $1}' $TOGOID_ROOT/input/bioproject/bioproject2biosample.tsv $TOGOID_ROOT/input/bioproject/bioproject.tsv $TOGOID_ROOT/input/biosample/biosample_set.tsv
diff --git a/config/biosample-bioproject/config.yaml b/config/biosample-bioproject/config.yaml
@@ -2,6 +2,7 @@ link:
   file: sample.tsv
   forward: TIO_000095
   reverse: TIO_000094
+  description: "The relationship between BioSample and BioProject is derived from the XML files provided by BioSample (biosample_set.xml) and BioProject (bioproject.xml). However, these files do not comprehensively cover all relations. If you are unable to find direct relations between your input BioSample/BioProject IDs and the corresponding BioProject/BioSample IDs, please consider using the path via SRA Experiment IDs, which may contain the expected relations."
 update:
   frequency: Monthly
-  method: awk -F "\t" 'FNR==NR&&$2=="ID"{a[$3]=$1}FNR!=NR&&$2=="BioProject ID"&&a[$3]{print $1 "\t" a[$3]}' $TOGOID_ROOT/input/bioproject/bioproject.tsv $TOGOID_ROOT/input/biosample/biosample_set.tsv
+  method: awk -F "\t" 'FNR==1{fn++} fn==1{b[$2,$1]=1; print $2 "\t" $1} fn==2&&$2=="ID"{a[$3]=$1} fn==3&&$2=="BioProject ID"&&a[$3]&&!b[$1,a[$3]]{print $1 "\t" a[$3]}' $TOGOID_ROOT/input/bioproject/bioproject2biosample.tsv $TOGOID_ROOT/input/bioproject/bioproject.tsv $TOGOID_ROOT/input/biosample/biosample_set.tsv
diff --git a/docs/help.md b/docs/help.md
@@ -1,5 +1,5 @@
 # TogoID ver. 2.0
-Datasets last updated: 2024-12-17
+Datasets last updated: 2024-12-23
 
 ## About
 - [TogoID](https://togoid.dbcls.jp/) is an ID conversion service implementing unique features with an intuitive web interface and an API for programmatic access. TogoID supports datasets from various biological categories such as gene, protein, chemical compound, pathway, disease, etc. TogoID users can perform exploratory multistep conversions to find a path among IDs. To guide the interpretation of biological meanings in the conversions, we crafted an [ontology](https://togoid.dbcls.jp/ontology) that defines the semantics of the dataset relations.
@@ -10,7 +10,7 @@ Datasets last updated: 2024-12-17
 ## Video tutorial
 - [How to use TogoID: an exploratory ID converter to bridge biological datasets](https://youtu.be/gXnvm6Fn4R8)
 
-## Statistics (as of 2024-12-17)
+## Statistics (as of 2024-12-23)
 - Number of target datasets 
     - 105 (from 73 databases)
 - For details on the target DBs and ID examples, please refer to the "DATASETS" tab. 

diff --git a/docs/help_ja.md b/docs/help_ja.md
@@ -1,5 +1,5 @@
 # TogoID ver. 2.0
-Datasets last updated: 2024-12-17
+Datasets last updated: 2024-12-23
 
 ## About
 - [TogoID](https://togoid.dbcls.jp/) は、直感的なインターフェースにより生命科学系データベース(DB)間のつながりを探索的に確認しながらID変換を行うことができるウェブアプリケーションです。同一の実体を指すID間の変換だけでなく、関連する別のカテゴリーのIDへの変換も可能です。また、直接リンクされていないDBのID間でも、他のDBを経由した変換を探索することができます。
@@ -10,7 +10,7 @@ Datasets last updated: 2024-12-17
 ## 動画マニュアル
 - [TogoIDを使って生命科学系データベースのさまざまなIDを探索的に変換する](https://youtu.be/gXnvm6Fn4R8)
 
-## 統計 (2024-12-17)
+## 統計 (2024-12-23)
 - 対象データセット数 
     - 105 (73 のデータベースに由来)
 - 対象DBの詳細やID例については、"DATASETS" タブ からご覧いただけます。 

diff --git a/docs/news.md b/docs/news.md
@@ -1,3 +1,6 @@
+# 2024-12-23
+- Weekly update has been completed.
+
 # 2024-12-17
 - Weekly update has been completed.
 

diff --git a/log/config-summary.tsv b/log/config-summary.tsv
@@ -3,11 +3,11 @@ affy_probeset-ncbigene	FIXME	Probe	Affymetrix probeset	http://identifiers.org/af
 assembly_insdc-bioproject	FIXME	Genome	Assembly INSDC	http://identifiers.org/insdc.gca/	nbdc00476	Project	BioProject	http://identifiers.org/bioproject/	tio:TIO_000078	genome is sequenced in project	is sequenced in	tio:TIO_000079	project determines genome sequence	determines genome sequence	Monthly	"curl -sS https://ddbj.nig.ac.jp/public/dblink/assembly_genome-bp/assembly_genome2bp.tsv"
 assembly_insdc-biosample	FIXME	Genome	Assembly INSDC	http://identifiers.org/insdc.gca/	nbdc01983	Sample	BioSample	http://identifiers.org/biosample/	tio:TIO_000080	genome is sequenced with sample	is sequenced with	tio:TIO_000081	sample is used to sequence genome	is used to sequence	Monthly	"curl -sS https://ddbj.nig.ac.jp/public/dblink/assembly_genome-bs/assembly_genome2bs.tsv"
 assembly_insdc-insdc_master	FIXME	Genome	Assembly INSDC	http://identifiers.org/insdc.gca/	nbdc02567	Project	INSDC master	http://identifiers.org/insdc/	tio:TIO_000078	genome is sequenced in project	is sequenced in	tio:TIO_000079	project determines genome sequence	determines genome sequence	Monthly	"curl -sS https://ddbj.nig.ac.jp/public/dblink/assembly_genome-insdc/assembly_genome2insdc.tsv"
-bioproject-biosample	nbdc00476	Project	BioProject	http://identifiers.org/bioproject/	nbdc01983	Sample	BioSample	http://identifiers.org/biosample/	tio:TIO_000094	project analyzes sample	analyzes	tio:TIO_000095	sample is analyzed in project	is analyzed in	Monthly	"curl -sS https://ddbj.nig.ac.jp/public/dblink/bioproject-biosample/bioproject2biosample.tsv"
+bioproject-biosample	nbdc00476	Project	BioProject	http://identifiers.org/bioproject/	nbdc01983	Sample	BioSample	http://identifiers.org/biosample/	tio:TIO_000094	project analyzes sample	analyzes	tio:TIO_000095	sample is analyzed in project	is analyzed in	Monthly	"awk -F \"\\t\" 'FNR==1{fn++} fn==1{b[$1,$2]=1; print $1 \"\\t\" $2} fn==2&&$2==\"ID\"{a[$3]=$1} fn==3&&$2==\"BioProject ID\"&&a[$3]&&!b[a[$3],$1]{print a[$3] \"\\t\" $1}' $TOGOID_ROOT/input/bioproject/bioproject2biosample.tsv $TOGOID_ROOT/input/bioproject/bioproject.tsv $TOGOID_ROOT/input/biosample/biosample_set.tsv"
 bioproject-geo_series	nbdc00476	Project	BioProject	http://identifiers.org/bioproject/	nbdc00080	Project	GEO series	http://identifiers.org/geo/	tio:TIO_000001	is nearly equivalent to	is nearly equivalent to	tio:TIO_000001	is nearly equivalent to	is nearly equivalent to	Monthly	"awk -F \"\\t\" '$2==\"GEO\"{print $1 \"\\t\" $3}' $TOGOID_ROOT/input/bioproject/bioproject.tsv"
 bioproject-pubmed	nbdc00476	Project	BioProject	http://identifiers.org/bioproject/	nbdc00179	Literature	PubMed	http://rdf.ncbi.nlm.nih.gov/pubmed/	tio:TIO_000032	has reference	has reference	tio:TIO_000033	is reference of	is reference of	Monthly	"awk -F \"\\t\" '$2==\"PubMed\"{print $1 \"\\t\" $3}' $TOGOID_ROOT/input/bioproject/bioproject.tsv"
 bioproject_umbrella-bioproject	nbdc00476	Project	BioProject umbrella	http://identifiers.org/bioproject/	nbdc00476	Project	BioProject	http://identifiers.org/bioproject/	tio:TIO_000106	has member	has member	tio:TIO_000107	is member of	is member of	Monthly	"curl -sS https://ddbj.nig.ac.jp/public/dblink/bioproject_umbrella-bioproject/bioproject_umbrella2bioproject.tsv"
-biosample-bioproject	nbdc01983	Sample	BioSample	http://identifiers.org/biosample/	nbdc00476	Project	BioProject	http://identifiers.org/bioproject/	tio:TIO_000095	sample is analyzed in project	is analyzed in	tio:TIO_000094	project analyzes sample	analyzes	Monthly	"awk -F \"\\t\" 'FNR==NR&&$2==\"ID\"{a[$3]=$1}FNR!=NR&&$2==\"BioProject ID\"&&a[$3]{print $1 \"\\t\" a[$3]}' $TOGOID_ROOT/input/bioproject/bioproject.tsv $TOGOID_ROOT/input/biosample/biosample_set.tsv"
+biosample-bioproject	nbdc01983	Sample	BioSample	http://identifiers.org/biosample/	nbdc00476	Project	BioProject	http://identifiers.org/bioproject/	tio:TIO_000095	sample is analyzed in project	is analyzed in	tio:TIO_000094	project analyzes sample	analyzes	Monthly	"awk -F \"\\t\" 'FNR==1{fn++} fn==1{b[$2,$1]=1; print $2 \"\\t\" $1} fn==2&&$2==\"ID\"{a[$3]=$1} fn==3&&$2==\"BioProject ID\"&&a[$3]&&!b[$1,a[$3]]{print $1 \"\\t\" a[$3]}' $TOGOID_ROOT/input/bioproject/bioproject2biosample.tsv $TOGOID_ROOT/input/bioproject/bioproject.tsv $TOGOID_ROOT/input/biosample/biosample_set.tsv"
 biosample-geo_sample	nbdc01983	Sample	BioSample	http://identifiers.org/biosample/	nbdc00080	Sample	GEO sample	http://identifiers.org/geo/	tio:TIO_000002	is equivalent to	is equivalent to	tio:TIO_000002	is equivalent to	is equivalent to	Daily	"awk -F \"\\t\" '($2==\"GEO ID\"&&$3~/^GSM[0-9]+$/)||($2==\"GEO URL\"&&$3=gensub(/.+acc=/,\"\",\"g\",$3))&&!a[$1,$3]++{print $1 \"\\t\" $3}' $TOGOID_ROOT/input/biosample/biosample_set.tsv"
 cellosaurus-ncit_disease	nbdc02180	CellLine	Cellosaurus	http://www.cellosaurus.org/CVCL_	FIXME	Phenotype	NCIt disease	http://purl.obolibrary.org/obo/NCIT_	tio:TIO_000082	cell line is model of phenotype	is model of	tio:TIO_000083	phenotype is observed in cell line	is observed in	Quarterly	"awk 'BEGIN{RS=\"\\n//\\n\";FS=\"\\n\"}{ncit=\"\";for(i=1;i<=NF;i++){if($i~/^AC/){ac=substr($i,11)}if($i~/^DI   NCIt;/){split($i,b,\"; \");ncit=b[2]}}if(ncit)print ac \"\\t\" ncit}' $TOGOID_ROOT/input/cellosaurus/cellosaurus.txt"
 cellosaurus-orphanet_phenotype	nbdc02180	CellLine	Cellosaurus	http://www.cellosaurus.org/CVCL_	nbdc01422	Phenotype	Orphanet phenotype	http://identifiers.org/orphanet.ordo/Orphanet_	tio:TIO_000082	cell line is model of phenotype	is model of	tio:TIO_000083	phenotype is observed in cell line	is observed in	Quarterly	"awk 'BEGIN{RS=\"\\n//\\n\";FS=\"\\n\"}{ordo=\"\";for(i=1;i<=NF;i++){if($i~/^AC/){ac=substr($i,11)}if($i~/^DI   ORDO;/){split($i,b,\"; \");ordo=substr(b[2],10)}}if(ordo)print ac \"\\t\" ordo}' $TOGOID_ROOT/input/cellosaurus/cellosaurus.txt"