Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature: create triple store migration scripts #29

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -31,3 +31,9 @@ logs/

# Ignore jEnv files
.java-version

processed_files/

queries.txt

graph_comparison.csv
18 changes: 9 additions & 9 deletions Gemfile.lock
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
GIT
remote: https://github.com/ontoportal-lirmm/goo.git
revision: f8ac7b00e8d8b46d1eea04de014175525c1cdd83
revision: 27300f28ca6c656c7e78af65013d88b792a6312f
branch: development
specs:
goo (0.0.2)
Expand Down Expand Up @@ -29,7 +29,7 @@ GIT

GIT
remote: https://github.com/ontoportal-lirmm/ontologies_linked_data.git
revision: e65d887616aaf4ae6f099437223d86515ffdca79
revision: 6cb18910e322645e3cc3490951d10f19468da52f
branch: development
specs:
ontologies_linked_data (0.0.1)
Expand All @@ -49,7 +49,7 @@ GIT

GIT
remote: https://github.com/ontoportal-lirmm/sparql-client.git
revision: 59251e59346c9a69a67c88552ba55a1244eec602
revision: 4364d34e9e4c411f1dd0ea706bf052465bf0b467
branch: development
specs:
sparql-client (3.2.2)
Expand Down Expand Up @@ -101,7 +101,7 @@ GEM
capistrano (~> 3.1)
sshkit (~> 1.3)
coderay (1.1.3)
concurrent-ruby (1.3.4)
concurrent-ruby (1.3.5)
connection_pool (2.5.0)
cube-ruby (0.0.3)
dante (0.2.0)
Expand Down Expand Up @@ -138,7 +138,7 @@ GEM
google-cloud-errors (~> 1.0)
google-apis-analytics_v3 (0.16.0)
google-apis-core (>= 0.15.0, < 2.a)
google-apis-core (0.15.1)
google-apis-core (0.16.0)
addressable (~> 2.5, >= 2.5.1)
googleauth (~> 1.9)
httpclient (>= 2.8.3, < 3.a)
Expand All @@ -157,7 +157,7 @@ GEM
google-protobuf (>= 3.18, < 5.a)
googleapis-common-protos-types (~> 1.7)
grpc (~> 1.41)
googleapis-common-protos-types (1.17.0)
googleapis-common-protos-types (1.18.0)
google-protobuf (>= 3.18, < 5.a)
googleauth (1.11.2)
faraday (>= 1.0, < 3.a)
Expand Down Expand Up @@ -209,7 +209,7 @@ GEM
mutex_m (0.3.0)
net-http-persistent (4.0.5)
connection_pool (~> 2.2)
net-scp (4.0.0)
net-scp (4.1.0)
net-ssh (>= 2.6.5, < 8.0.0)
net-sftp (4.0.0)
net-ssh (>= 5.0.0, < 8.0.0)
Expand Down Expand Up @@ -251,7 +251,7 @@ GEM
rexml (~> 3.2)
redis (5.3.0)
redis-client (>= 0.22.0)
redis-client (0.23.1)
redis-client (0.23.2)
connection_pool
representable (3.2.0)
declarative (< 0.1.0)
Expand Down Expand Up @@ -348,4 +348,4 @@ DEPENDENCIES
test-unit-minitest

BUNDLED WITH
2.3.14
2.3.3
42 changes: 42 additions & 0 deletions bin/migrations/4s-to-graph-files
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
#!/usr/bin/env ruby

require 'fileutils'

# Migrates a 4store dump into per-graph file pairs usable by the import scripts:
# for each source file it produces "<basename>.n3" (the triples, minus the
# metadata header line) and "<basename>.n3.graph" (the graph URI extracted from
# that header) inside <target_folder>/processed_files.
#
# Usage: ruby 4s-to-graph-files <source_folder> <target_folder>
if ARGV.size != 2
  puts "Usage: #{$PROGRAM_NAME} <source_folder> <target_folder>"
  exit 1
end

source_folder = ARGV[0]
target_folder = ARGV[1]
processed_dir = File.join(target_folder, 'processed_files')

# Create the target directory if it doesn't exist
FileUtils.mkdir_p(processed_dir)

# Find all files in the source folder (recursively) and process them
Dir.glob(File.join(source_folder, '**', '*')).select { |file| File.file?(file) }.each do |file|
  puts "Processing file: #{file}"

  # Define the new filename with .n3 extension.
  # BUG FIX: this previously was the literal string "#(unknown).n3" (broken
  # interpolation), so every source file was copied onto the same target file,
  # silently overwriting all but the last one.
  filename = File.basename(file)
  new_file = File.join(processed_dir, "#{filename}.n3")

  # Copy the original file to the target folder with .n3 extension
  FileUtils.cp(file, new_file)
  puts "Copied to: #{new_file}"

  # Extract the first line, strip the "## GRAPH " prefix (assumes the 4store
  # dump always starts with such a header — files without it yield their raw
  # first line), and save it to the companion .graph file.
  graph_file = "#{new_file}.graph"
  first_line = File.open(file, &:readline).sub(/^## GRAPH /, '').strip
  File.write(graph_file, first_line)
  puts "Extracted graph URI to: #{graph_file}"

  # Remove the metadata header line from the copied .n3 file so it contains
  # only triples.
  File.write(new_file, File.readlines(new_file).drop(1).join)
  puts "Removed the first line from: #{new_file}"
end

puts "Migration and extraction complete."
135 changes: 135 additions & 0 deletions bin/migrations/compare_counts.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
require 'open3'
require 'net/http'
require 'json'
require 'cgi'
require 'csv'
require 'pry'
require 'bundler/setup'
require 'benchmark'
require 'ncbo_annotator'
require 'ncbo_cron'
require 'ontologies_linked_data'

# Directory holding the "<name>.n3" / "<name>.n3.graph" pairs produced by the
# 4s-to-graph-files migration script.
PROCESSED_DIR = ARGV[0] || './processed_files'
# Optional backend profile selector; when omitted, config/config.rb defaults apply.
profile = ARGV[1]

# Point Goo at the chosen triple-store backend by overriding its ENV settings
# before config/config.rb is loaded below.
case profile
when 'ag'
# AllegroGraph backend
ENV['GOO_BACKEND_NAME'] = 'allegrograph'
ENV['GOO_PORT'] = '10035'
ENV['GOO_PATH_QUERY'] = '/repositories/ontoportal_test'
ENV['GOO_PATH_DATA'] = '/repositories/ontoportal_test/statements'
ENV['GOO_PATH_UPDATE'] = '/repositories/ontoportal_test/statements'
ENV['COMPOSE_PROFILES'] = 'ag'
when 'fs'
# 4store backend
ENV['GOO_PORT'] = '9000'
ENV['COMPOSE_PROFILES'] = 'fs'
when 'vo'
# Virtuoso backend
ENV['GOO_BACKEND_NAME'] = 'virtuoso'
ENV['GOO_PORT'] = '8890'
ENV['GOO_PATH_QUERY'] = '/sparql'
ENV['GOO_PATH_DATA'] = '/sparql'
ENV['GOO_PATH_UPDATE'] = '/sparql'
ENV['COMPOSE_PROFILES'] = 'vo'
when 'gb'
# Graphdb backend
# NOTE(review): unlike the other branches, this one does not set
# COMPOSE_PROFILES — confirm whether that is intentional or an oversight.
ENV['GOO_BACKEND_NAME'] = 'graphdb'
ENV['GOO_PORT'] = '7200'
ENV['GOO_PATH_QUERY'] = '/repositories/ontoportal'
ENV['GOO_PATH_DATA'] = '/repositories/ontoportal/statements'
ENV['GOO_PATH_UPDATE'] = '/repositories/ontoportal/statements'
else
puts "Will import to default config set in config/config.rb"
end

require_relative '../../config/config'
# Destination CSV for the graph-vs-file comparison report.
OUTPUT_CSV = './graph_comparison.csv'

# Query the triple store for every named graph together with its triple count.
# Prints how many graphs were found and how long the SPARQL query took.
# Returns a Hash mapping graph URI (String) => triple count (Integer).
def get_all_graphs_counts
  solutions = []
  elapsed = Benchmark.realtime do
    sparql = "SELECT DISTINCT ?graph (COUNT(?s) as ?triplesCount) WHERE { GRAPH ?graph { ?s ?p ?o } } GROUP BY ?graph"
    Goo.sparql_query_client.query(sparql).each do |solution|
      solutions << solution
    end
  end
  puts "Found #{solutions.length} graphs in #{format('%.4f', elapsed)}s"

  # Fold the result set into a plain URI => count hash.
  solutions.each_with_object({}) do |solution, counts|
    counts[solution['graph'].to_s] = solution['triplesCount'].to_i
  end
end

# Count the number of lines in a file.
# NOTE(review): the previous comment claimed the first metadata line was
# excluded, but every line is counted — the metadata header is already stripped
# from the .n3 files by the migration script, so counting all lines is correct.
# Streams the file line by line instead of loading it into memory at once
# (triple dumps can be large).
def count_file_lines(file_path)
File.foreach(file_path).count
end

# Build a mapping from graph URI to the name of its exported data file.
# Scans +folder_path+ for "*.graph" files: each contains a graph URI on its
# first line, and its name minus the ".graph" suffix is the matching data file
# (e.g. "onto.n3.graph" -> "onto.n3").
# Returns a Hash of URI (String) => data file name (String), or nil (after
# printing a message) when the folder does not exist.
def build_graphs_file_hash(folder_path = PROCESSED_DIR)
# Ensure the folder path exists
unless Dir.exist?(folder_path)
puts "Folder does not exist: #{folder_path}"
return
end

graphs = {}
# Loop through each file in the folder
Dir.foreach(folder_path) do |filename|
# Skip directories and only process files ending with .graph
if filename.end_with?('.graph')
file_path = File.join(folder_path, filename)
# FIX: File.foreach closes the file handle; the previous
# File.open(...).readlines leaked the descriptor until GC. `.first`
# preserves the old `readlines.first` semantics (nil on empty file).
line = File.foreach(file_path).first
graphs[line.strip] = filename.to_s.gsub('.graph', '')
end
end
graphs
end

# Compare each graph's triple count against the line count of its exported
# data file and write one row per graph to OUTPUT_CSV.
# +graph_triples+ is a Hash of graph URI => triple count (see
# get_all_graphs_counts). Graphs with no matching file are reported as
# "Graph not found" / "File not found" with an "N/A" match status.
def compare_graphs_with_files(graph_triples)
  CSV.open(OUTPUT_CSV, 'w') do |csv|
    # Header row
    csv << ["Graph URI", "Triples in Graph", "Lines in File (excluding metadata)", "Match"]

    files_by_graph = build_graphs_file_hash
    graph_triples.each do |graph_uri, triples_count|
      data_file = files_by_graph[graph_uri]

      if data_file.nil?
        # No .graph file references this URI.
        csv << [graph_uri, triples_count, "Graph not found", "N/A"]
      else
        # Expected data file path derived from the graph URI's file name.
        file_path = "#{PROCESSED_DIR}/#{data_file}"
        if File.exist?(file_path)
          lines_in_file = count_file_lines(file_path)
          match_status = triples_count == lines_in_file ? "Yes" : "No"
          csv << [graph_uri, triples_count, lines_in_file, match_status]
        else
          csv << [graph_uri, triples_count, "File not found", "N/A"]
        end
      end
    end
  end

  puts "Comparison complete. Results saved to #{OUTPUT_CSV}"
end

# Main execution
# Flush Goo's Redis query cache first so the counts come from the triple
# store itself rather than cached query results.
Goo.sparql_query_client.cache.redis_cache.flushdb
puts "Redis cache flushed"

puts "Comparing graph triple counts with file lines and exporting to CSV..."
graph_triples = get_all_graphs_counts
compare_graphs_with_files(graph_triples)
71 changes: 71 additions & 0 deletions bin/migrations/import_metadata_graphs_to_store
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
#!/usr/bin/env ruby

require 'benchmark'
require 'shellwords'

# Imports every "<name>.n3" file in <processed_directory> into the triple
# store, using the graph URI stored in the companion "<name>.n3.graph" file.
# Each import is delegated to bin/migrations/import_nt_file.rb; per-file
# output is appended to ./process_log.log.
#
# Usage: import_metadata_graphs_to_store <processed_directory> [profile] [docker]
# Stop the script at the first error
begin
  # Check if the correct number of arguments are provided
  if ARGV.size < 1
    puts "Usage: #{$PROGRAM_NAME} <processed_directory>"
    exit 1
  end

  # Directory containing .n3 files and their .n3.graph companions
  processed_dir = ARGV[0]
  # Optional profile to use for the import (vo: virtuoso, fs: 4store, gb: GraphDB)
  profile = ARGV[1]

  # When the third argument is "docker", start the backend services first.
  docker = ARGV[2] == "docker"

  if docker
    result = system("./start_ontoportal_services.sh #{profile}")
    unless result
      puts "Error starting services"
      exit 1
    end
  end
  # Check if processed_files directory exists
  unless Dir.exist?(processed_dir)
    puts "Processed files directory #{processed_dir} does not exist!"
    exit 1
  end

  total_time = 0
  import_count = 0
  file_count = 0
  # Loop through all .n3 files in the processed_files directory
  Dir.glob(File.join(processed_dir, '*.n3')).each do |file|
    # Extract the associated .graph file (contains graph URI)
    graph_file = "#{file}.graph"

    # Check if graph file exists
    unless File.exist?(graph_file)
      puts "Graph file #{graph_file} not found. Skipping import of #{file}."
      next
    end

    # Extract the graph URI from the graph file
    graph_uri = File.read(graph_file).strip
    # FIX: count lines in Ruby instead of shelling out to `wc -l #{file}`,
    # which broke on paths containing spaces or shell metacharacters.
    line_count = File.foreach(file).count
    puts "Start importing #{file} into graph <#{graph_uri}> of line count #{line_count}"
    result = false
    time = Benchmark.realtime do
      # FIX: shell-escape the interpolated arguments — file paths and graph
      # URIs may contain characters the shell would otherwise interpret.
      # A nil profile is dropped (compact) to preserve the old argv shape.
      args = [file, graph_uri, profile].compact.map { |a| Shellwords.escape(a) }.join(' ')
      result = system("ruby bin/migrations/import_nt_file.rb #{args} >> ./process_log.log 2>&1")
    end

    file_count += 1
    total_time += time

    if !result
      puts "Error importing #{file} into graph <#{graph_uri}>"
      exit 1
    else
      import_count += 1
      puts "Imported <#{graph_uri}> successfully in #{time.round(2)} seconds"
    end
    puts "#############################################################"
  end
  puts "#{import_count}/#{file_count} files imported in #{total_time.round(2)} seconds"
rescue => e
  puts "Error: #{e.message}"
  exit 1
end
Loading
Loading