Skip to content

Commit

Permalink
Feature: Create triple store migration benchmarks scripts (#30)
Browse files Browse the repository at this point in the history
* add a script to migrate 4s dump to graph nt files

* add a script to import any nt file into a graph

* add a script to combine the metadata graphs files generation and import

* add scripts that compare triple counts in graph files and in the triple store

* update docker compose to use the default virtuoso image

* add option to run import metadata graphs using docker for testing

* add virtuoso custom scripts

* simplify compare count to not do the benchmarks

* add benchmarking tests

* update virtuoso docker image

* add benchmarks examples documentation
  • Loading branch information
syphax-bouazzouni authored Feb 1, 2025
1 parent dd73691 commit 677a6ca
Show file tree
Hide file tree
Showing 17 changed files with 477 additions and 139 deletions.
2 changes: 2 additions & 0 deletions Gemfile
Original file line number Diff line number Diff line change
Expand Up @@ -52,3 +52,5 @@ group :deployment do
end

gem "binding_of_caller", "~> 1.0"
gem 'net-smtp'
gem 'net-ftp'
32 changes: 28 additions & 4 deletions Gemfile.lock
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ GIT

GIT
remote: https://github.com/ontoportal-lirmm/ontologies_linked_data.git
revision: 6cb18910e322645e3cc3490951d10f19468da52f
revision: 194fcfb9a1c4660dabef738d16f32c210a23c343
branch: development
specs:
ontologies_linked_data (0.0.1)
Expand All @@ -49,7 +49,7 @@ GIT

GIT
remote: https://github.com/ontoportal-lirmm/sparql-client.git
revision: 4364d34e9e4c411f1dd0ea706bf052465bf0b467
revision: d4a226e75eb4aeaaf42720eac4f23f55380a0bd3
branch: development
specs:
sparql-client (3.2.2)
Expand Down Expand Up @@ -84,6 +84,7 @@ GEM
base64 (0.2.0)
bcrypt (3.1.20)
bcrypt_pbkdf (1.1.1)
bcrypt_pbkdf (1.1.1-arm64-darwin)
bigdecimal (3.1.9)
binding_of_caller (1.0.1)
debug_inspector (>= 1.2.0)
Expand All @@ -105,6 +106,7 @@ GEM
connection_pool (2.5.0)
cube-ruby (0.0.3)
dante (0.2.0)
date (3.4.1)
debug_inspector (1.2.0)
declarative (0.0.20)
docile (1.4.1)
Expand Down Expand Up @@ -152,6 +154,8 @@ GEM
google-cloud-env (2.1.1)
faraday (>= 1.0, < 3.a)
google-cloud-errors (1.4.0)
google-protobuf (3.25.3)
google-protobuf (3.25.3-arm64-darwin)
google-protobuf (3.25.3-x86_64-linux)
googleapis-common-protos (1.6.0)
google-protobuf (>= 3.18, < 5.a)
Expand All @@ -166,6 +170,12 @@ GEM
multi_json (~> 1.11)
os (>= 0.9, < 2.0)
signet (>= 0.16, < 2.a)
grpc (1.65.2)
google-protobuf (>= 3.25, < 5.0)
googleapis-common-protos-types (~> 1.0)
grpc (1.65.2-arm64-darwin)
google-protobuf (>= 3.25, < 5.0)
googleapis-common-protos-types (~> 1.0)
grpc (1.65.2-x86_64-linux)
google-protobuf (>= 3.25, < 5.0)
googleapis-common-protos-types (~> 1.0)
Expand Down Expand Up @@ -207,12 +217,19 @@ GEM
redis
multi_json (1.15.0)
mutex_m (0.3.0)
net-ftp (0.3.8)
net-protocol
time
net-http-persistent (4.0.5)
connection_pool (~> 2.2)
net-protocol (0.2.2)
timeout
net-scp (4.1.0)
net-ssh (>= 2.6.5, < 8.0.0)
net-sftp (4.0.0)
net-ssh (>= 5.0.0, < 8.0.0)
net-smtp (0.5.0)
net-protocol
net-ssh (7.3.0)
netrc (0.11.0)
oj (3.16.9)
Expand All @@ -230,7 +247,7 @@ GEM
coderay (~> 1.1)
method_source (~> 1.0)
public_suffix (5.1.1)
rack (3.1.8)
rack (3.1.9)
rack-test (2.2.0)
rack (>= 1.3)
rake (13.2.1)
Expand Down Expand Up @@ -299,6 +316,9 @@ GEM
systemu (2.6.5)
test-unit-minitest (0.9.1)
minitest (~> 4.7)
time (0.4.1)
date
timeout (0.4.3)
trailblazer-option (0.1.2)
tzinfo (2.0.6)
concurrent-ruby (~> 1.0)
Expand All @@ -307,6 +327,8 @@ GEM
macaddr (~> 1.0)

PLATFORMS
arm64-darwin-24
ruby
x86_64-linux

DEPENDENCIES
Expand All @@ -331,6 +353,8 @@ DEPENDENCIES
multi_json
ncbo_annotator!
ncbo_cron!
net-ftp
net-smtp
oj
ontologies_linked_data!
parallel
Expand All @@ -348,4 +372,4 @@ DEPENDENCIES
test-unit-minitest

BUNDLED WITH
2.3.15
2.4.22
60 changes: 60 additions & 0 deletions bin/migrations/count_graph_triples.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
# require 'bundler/setup'
require 'pry'
require 'benchmark'
require 'ncbo_annotator'
require 'ncbo_cron'
require 'ontologies_linked_data'

# Counts the triples stored in one named graph of the triple store and prints
# the count together with the time the COUNT query took.
#
# Command-line arguments (read from ARGV):
#   ARGV[1] - URI of the graph to count (required)
#   ARGV[2] - backend profile: 'ag', 'fs', 'vo' or 'gb' (optional)
#
# NOTE(review): ARGV[0] is never read. Confirm that callers pass a leading
# argument before the graph; otherwise these indices should be 0 and 1.
graph = ARGV[1]
profile = ARGV[2]

if graph.nil?
  puts "Error: Missing arguments. Please provide the graph name."
  exit(1)
end

# Select the backend connection settings for the requested profile. These are
# set before config/config is required below; presumably that file reads them
# (verify against config/config.rb).
case profile
when 'ag'
  # AllegroGraph backend
  ENV['GOO_BACKEND_NAME'] = 'allegrograph'
  ENV['GOO_PORT'] = '10035'
  ENV['GOO_PATH_QUERY'] = '/repositories/ontoportal_test'
  ENV['GOO_PATH_DATA'] = '/repositories/ontoportal_test/statements'
  ENV['GOO_PATH_UPDATE'] = '/repositories/ontoportal_test/statements'
  ENV['COMPOSE_PROFILES'] = 'ag'

when 'fs'
  # 4store backend
  ENV['GOO_PORT'] = '9000'
  ENV['COMPOSE_PROFILES'] = 'fs'

when 'vo'
  # Virtuoso backend
  ENV['GOO_BACKEND_NAME'] = 'virtuoso'
  ENV['GOO_PORT'] = '8890'
  ENV['GOO_PATH_QUERY'] = '/sparql'
  ENV['GOO_PATH_DATA'] = '/sparql'
  ENV['GOO_PATH_UPDATE'] = '/sparql'
  ENV['COMPOSE_PROFILES'] = 'vo'

when 'gb'
  # Graphdb backend
  ENV['GOO_BACKEND_NAME'] = 'graphdb'
  ENV['GOO_PORT'] = '7200'
  ENV['GOO_PATH_QUERY'] = '/repositories/ontoportal'
  ENV['GOO_PATH_DATA'] = '/repositories/ontoportal/statements'
  ENV['GOO_PATH_UPDATE'] = '/repositories/ontoportal/statements'

else
  puts "Will import to default config set in config/config.rb"
end

require_relative '../../config/config'

count = 0
time = Benchmark.realtime do
  # The graph to count is the one given on the command line. The previous code
  # interpolated an undefined `graph_uri`, which raised NameError here.
  rs = Goo.sparql_query_client.query("SELECT (COUNT(?s) as ?count) FROM <#{graph}> WHERE { ?s ?p ?o }")
  first_solution = rs.solutions.first
  # Leave count at 0 when the store returns no solution row.
  count = first_solution[:count].to_i if first_solution
end

puts "Imported triples in #{format('%.4f', time)}s with total count: #{count}"
Empty file.
29 changes: 15 additions & 14 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -50,10 +50,10 @@ services:
ports:
- "9393:9393"

# mgrep-ut:
# image: ontoportal/mgrep-ncbo:0.1
# ports:
# - "55556:55555"
mgrep-ut:
image: ontoportal/mgrep-ncbo:0.1
ports:
- "55556:55555"

redis-ut:
image: redis
Expand Down Expand Up @@ -87,8 +87,7 @@ services:
# volumes:
#- solr_data:/var/solr/data
agraph-ut:
image: franzinc/agraph:v8.1.0
platform: linux/amd64
image: franzinc/agraph:v8.3.0
environment:
- AGRAPH_SUPER_USER=test
- AGRAPH_SUPER_PASSWORD=xyzzy
Expand Down Expand Up @@ -117,12 +116,12 @@ services:
- ag

virtuoso-ut:
image: tenforce/virtuoso:virtuoso7.2.5
platform: linux/amd64
image: openlink/virtuoso-opensource-7:latest
environment:
- SPARQL_UPDATE=true
- VIRT_Parameters_NumberOfBuffers=2450000
- VIRT_Parameters_MaxDirtyBuffers=1837500
- DBA_PASSWORD= dba
- DAV_PASSWORD= dba
- VIRT_Parameters_NumberOfBuffers=680000
- VIRT_Parameters_MaxDirtyBuffers=500000
- VIRT_Parameters_NumOfThreads=100
- VIRT_Parameters_MaxMem=20000000000
- VIRT_Parameters_LogEnable=2
Expand All @@ -132,10 +131,12 @@ services:
profiles:
- vo
ports:
- 1111:1111
- 8890:8890
- "1111:1111" # Standard Virtuoso port
- "8890:8890" # HTTP port for SPARQL endpoint
volumes:
- ./test/data/initdb.d:/opt/virtuoso-opensource/initdb.d
healthcheck:
test: [ "CMD-SHELL", "curl -sf http://localhost:8890/sparql || exit 1" ]
test: [ "CMD-SHELL", "curl -sf http://localhost:8890/sparql || exit 1" ]
start_period: 10s
interval: 60s
timeout: 5s
Expand Down
1 change: 1 addition & 0 deletions mise.toml
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
[tools]
java = "17"
ruby = "2.7.8"
85 changes: 85 additions & 0 deletions test/benchmarks/data_benchs.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
require 'ontologies_linked_data'
# Helpers that time common read operations against the triple store reached
# through Goo, printing the measured durations to stdout.
#
# Every helper takes a `sub` argument. From its use here it must respond to
# `id`, `roots` and `children`; presumably it is an ontology submission
# (verify against callers).
module Benchmarks

  # Runs all benchmarks of this module, one after the other, for `sub`.
  #
  # The concept URIs used for the children and tree benchmarks are fixed ITIS
  # terms.
  #
  # @param sub the object whose graph is benchmarked
  def self.do_all_benchmarks(sub)
    Benchmarks.bench("fetch triples") do
      Benchmarks.paginate_all_triples(sub)
    end

    Benchmarks.bench("get ontology Concept Roots") do
      Benchmarks.ontology_roots(sub)
    end

    Benchmarks.bench("concept children") do
      Benchmarks.concept_children("http://terminologies.gfbio.org/ITIS/Taxa_0", sub)
    end

    Benchmarks.bench("concept path to root") do
      Benchmarks.concept_tree("http://terminologies.gfbio.org/ITIS/Taxa_6007", sub)
    end
  end

  # Times the given block and prints "Time to <label>: <seconds>".
  #
  # @param label [String] text inserted into the printed line
  # @yield the work to measure
  def self.bench(label, &block)
    time = Benchmark.realtime { block.call }
    puts "Time to #{label}: #{time.round(2)}"
  end

  # Replaces the content of the graph named `sub.id` with the triples of a file:
  # the graph is deleted first, then the file is appended.
  #
  # @param sub the object whose `id` names the target graph
  # @param file_path [String] path of the triples file to load
  def self.import_nt_file(sub, file_path)
    Goo.sparql_data_client.delete_graph(sub.id)
    Goo.sparql_data_client.append_triples_no_bnodes(sub.id, file_path, nil)
  end

  # Pages through the triples of the graph named `sub.id`, 10000 per query, and
  # prints the total number fetched.
  #
  # Stops at the first empty page, or after page 99 as a safety cap.
  #
  # @param sub the object whose `id` names the graph to read
  def self.paginate_all_triples(sub)
    page = 1
    pagesize = 10000
    count = 1
    total_count = 0
    while count > 0 && page < 100
      puts "Starting query for page #{page}"
      offset = " OFFSET #{(page - 1) * pagesize}"
      rs = Goo.sparql_query_client.query("SELECT ?s ?p ?o FROM <#{sub.id}> WHERE { ?s ?p ?o } LIMIT #{pagesize} #{offset}")
      count = rs.each_solution.size
      total_count += count
      page += 1
    end
    puts "Total triples: #{total_count}"
  end

  # Times finding the roots of `sub` and then loading them with their unmapped
  # properties; prints both durations plus the root and triple counts.
  #
  # @param sub the object whose roots are fetched
  def self.ontology_roots(sub)
    load_attrs = LinkedData::Models::Class.goo_attrs_to_load([:all])
    roots = []

    find_time = Benchmark.realtime do
      roots = sub.roots(load_attrs)
    end
    puts "Time to find roots: #{find_time.round(2)}"
    Goo.log_debug_file('roots')

    load_time = Benchmark.realtime do
      LinkedData::Models::Class.in(sub).models(roots).include(:unmapped).all
    end
    puts "Time to load roots: #{load_time.round(2)}"
    Goo.log_debug_file('roots')

    puts "Roots count: #{roots.length}"
    puts "Roots total triples: #{roots.map { |r| r.properties.values.flatten.size }.sum}"
  end

  # Fetches the first page (100 items) of children of a concept and prints how
  # many were returned.
  #
  # @param uri [String] URI of the parent concept
  # @param sub the object the concept is looked up in
  def self.concept_children(uri, sub)
    page = 1
    size = 100
    # Look up the concept the caller asked for. The previous code ignored `uri`
    # and always used a hard-coded ITIS term.
    cls = LinkedData::Models::Class.find(RDF::URI.new(uri)).in(sub).first
    ld = LinkedData::Models::Class.goo_attrs_to_load([:all])
    children = sub.children(cls, includes_param: ld, page: page, size: size)
    puts "Children count: #{children.length}"
  end

  # Builds the tree of a concept, starting from the roots of `sub`.
  #
  # @param uri [String] URI of the concept
  # @param sub the object the concept is looked up in
  # @return whatever `cls.tree` returns
  def self.concept_tree(uri, sub)
    # Look up the concept the caller asked for. The previous code ignored `uri`
    # and always used a hard-coded ITIS term.
    cls = LinkedData::Models::Class.find(uri).in(sub).first
    display_attrs = [:prefLabel, :hasChildren, :children, :obsolete, :subClassOf]
    # NOTE(review): :hasChildren and :isInActiveScheme are listed twice; the
    # second :isInActiveScheme may have been meant as another attribute.
    # Kept unchanged pending confirmation.
    extra_include = display_attrs + [:hasChildren, :isInActiveScheme, :isInActiveScheme]

    roots = sub.roots(extra_include)
    # path = cls.path_to_root(roots)
    cls.tree(roots: roots)
  end

end
10 changes: 10 additions & 0 deletions test/benchmarks/examples.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Benchmarks
## Import all AgroPortal metadata
test/benchmarks/import_all_metadata_file.sh ./processed_files gb
ruby test/benchmarks/run_metadata_benchs.rb gb

## Parse INRAETHES and do ontoportal operations
ruby test/benchmarks/parse_and_do_ontoportal_operations.rb INRAETHES fs

## Parse ITIS and do ontoportal operations
ruby test/benchmarks/parse_and_do_ontoportal_operations.rb ITIS fs api_key https://data.biodivportal.gfbio.dev
17 changes: 17 additions & 0 deletions test/benchmarks/import_all_metadata_file.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
#!/usr/bin/env bash
# Starts the OntoPortal services for a backend profile, runs the metadata
# graphs import for the given path, then runs the count comparison script.
#
# Usage: import_all_metadata_file.sh <path to path_graphs_files> <profile>
set -e

graphs_dir=$1
backend_profile=$2
banner="###########################################################################"

# The profile is the last positional argument, so an empty value means the
# call is incomplete.
if [[ -z "$backend_profile" ]]; then
  echo "Usage: $0 <path to path_graphs_files> <profile>"
  exit 1
fi

echo "$banner"
./test/benchmarks/start_ontoportal_services.sh "$backend_profile"
./bin/migrations/import_metadata_graphs_to_store "$graphs_dir" "$backend_profile"
echo 'All metadata graphs imported successfully.'
echo "$banner"

bundle exec ruby bin/migrations/compare_counts.rb "$graphs_dir" "$backend_profile"
Loading

0 comments on commit 677a6ca

Please sign in to comment.