Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature: create triple store migration scripts #29

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -31,3 +31,9 @@ logs/

# Ignore jEnv files
.java-version

processed_files/

queries.txt

graph_comparison.csv
18 changes: 9 additions & 9 deletions Gemfile.lock
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
GIT
remote: https://github.com/ontoportal-lirmm/goo.git
revision: f8ac7b00e8d8b46d1eea04de014175525c1cdd83
revision: 27300f28ca6c656c7e78af65013d88b792a6312f
branch: development
specs:
goo (0.0.2)
Expand Down Expand Up @@ -29,7 +29,7 @@ GIT

GIT
remote: https://github.com/ontoportal-lirmm/ontologies_linked_data.git
revision: e65d887616aaf4ae6f099437223d86515ffdca79
revision: 6cb18910e322645e3cc3490951d10f19468da52f
branch: development
specs:
ontologies_linked_data (0.0.1)
Expand All @@ -49,7 +49,7 @@ GIT

GIT
remote: https://github.com/ontoportal-lirmm/sparql-client.git
revision: 59251e59346c9a69a67c88552ba55a1244eec602
revision: 4364d34e9e4c411f1dd0ea706bf052465bf0b467
branch: development
specs:
sparql-client (3.2.2)
Expand Down Expand Up @@ -101,7 +101,7 @@ GEM
capistrano (~> 3.1)
sshkit (~> 1.3)
coderay (1.1.3)
concurrent-ruby (1.3.4)
concurrent-ruby (1.3.5)
connection_pool (2.5.0)
cube-ruby (0.0.3)
dante (0.2.0)
Expand Down Expand Up @@ -138,7 +138,7 @@ GEM
google-cloud-errors (~> 1.0)
google-apis-analytics_v3 (0.16.0)
google-apis-core (>= 0.15.0, < 2.a)
google-apis-core (0.15.1)
google-apis-core (0.16.0)
addressable (~> 2.5, >= 2.5.1)
googleauth (~> 1.9)
httpclient (>= 2.8.3, < 3.a)
Expand All @@ -157,7 +157,7 @@ GEM
google-protobuf (>= 3.18, < 5.a)
googleapis-common-protos-types (~> 1.7)
grpc (~> 1.41)
googleapis-common-protos-types (1.17.0)
googleapis-common-protos-types (1.18.0)
google-protobuf (>= 3.18, < 5.a)
googleauth (1.11.2)
faraday (>= 1.0, < 3.a)
Expand Down Expand Up @@ -209,7 +209,7 @@ GEM
mutex_m (0.3.0)
net-http-persistent (4.0.5)
connection_pool (~> 2.2)
net-scp (4.0.0)
net-scp (4.1.0)
net-ssh (>= 2.6.5, < 8.0.0)
net-sftp (4.0.0)
net-ssh (>= 5.0.0, < 8.0.0)
Expand Down Expand Up @@ -251,7 +251,7 @@ GEM
rexml (~> 3.2)
redis (5.3.0)
redis-client (>= 0.22.0)
redis-client (0.23.1)
redis-client (0.23.2)
connection_pool
representable (3.2.0)
declarative (< 0.1.0)
Expand Down Expand Up @@ -348,4 +348,4 @@ DEPENDENCIES
test-unit-minitest

BUNDLED WITH
2.3.14
2.3.3
42 changes: 42 additions & 0 deletions bin/migrations/4s-to-graph-files
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
#!/usr/bin/env ruby

require 'fileutils'

# Migrates a 4store dump into per-graph file pairs usable by the import scripts:
# for each source file it produces "<basename>.n3" (the triples, minus the
# metadata header line) and "<basename>.n3.graph" (the graph URI extracted from
# that header) inside <target_folder>/processed_files.
#
# Usage: ruby 4s-to-graph-files <source_folder> <target_folder>
if ARGV.size != 2
  puts "Usage: #{$PROGRAM_NAME} <source_folder> <target_folder>"
  exit 1
end

source_folder = ARGV[0]
target_folder = ARGV[1]
processed_dir = File.join(target_folder, 'processed_files')

# Create the target directory if it doesn't exist
FileUtils.mkdir_p(processed_dir)

# Find all files in the source folder (recursively) and process them
Dir.glob(File.join(source_folder, '**', '*')).select { |file| File.file?(file) }.each do |file|
  puts "Processing file: #{file}"

  # Define the new filename with .n3 extension.
  # BUG FIX: this previously was the literal string "#(unknown).n3" (broken
  # interpolation), so every source file was copied onto the same target file,
  # silently overwriting all but the last one.
  filename = File.basename(file)
  new_file = File.join(processed_dir, "#{filename}.n3")

  # Copy the original file to the target folder with .n3 extension
  FileUtils.cp(file, new_file)
  puts "Copied to: #{new_file}"

  # Extract the first line, strip the "## GRAPH " prefix (assumes the 4store
  # dump always starts with such a header — files without it yield their raw
  # first line), and save it to the companion .graph file.
  graph_file = "#{new_file}.graph"
  first_line = File.open(file, &:readline).sub(/^## GRAPH /, '').strip
  File.write(graph_file, first_line)
  puts "Extracted graph URI to: #{graph_file}"

  # Remove the metadata header line from the copied .n3 file so it contains
  # only triples.
  File.write(new_file, File.readlines(new_file).drop(1).join)
  puts "Removed the first line from: #{new_file}"
end

puts "Migration and extraction complete."
135 changes: 135 additions & 0 deletions bin/migrations/compare_counts.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,135 @@
require 'open3'
require 'net/http'
require 'json'
require 'cgi'
require 'csv'
require 'pry'
require 'bundler/setup'
require 'benchmark'
require 'ncbo_annotator'
require 'ncbo_cron'
require 'ontologies_linked_data'

# Directory holding the "<name>.n3" / "<name>.n3.graph" pairs produced by the
# 4s-to-graph-files migration script.
PROCESSED_DIR = ARGV[0] || './processed_files'
# Optional backend profile selector; when omitted, config/config.rb defaults apply.
profile = ARGV[1]

# Point Goo at the chosen triple-store backend by overriding its ENV settings
# before config/config.rb is loaded below.
case profile
when 'ag'
# AllegroGraph backend
ENV['GOO_BACKEND_NAME'] = 'allegrograph'
ENV['GOO_PORT'] = '10035'
ENV['GOO_PATH_QUERY'] = '/repositories/ontoportal_test'
ENV['GOO_PATH_DATA'] = '/repositories/ontoportal_test/statements'
ENV['GOO_PATH_UPDATE'] = '/repositories/ontoportal_test/statements'
ENV['COMPOSE_PROFILES'] = 'ag'
when 'fs'
# 4store backend
ENV['GOO_PORT'] = '9000'
ENV['COMPOSE_PROFILES'] = 'fs'
when 'vo'
# Virtuoso backend
ENV['GOO_BACKEND_NAME'] = 'virtuoso'
ENV['GOO_PORT'] = '8890'
ENV['GOO_PATH_QUERY'] = '/sparql'
ENV['GOO_PATH_DATA'] = '/sparql'
ENV['GOO_PATH_UPDATE'] = '/sparql'
ENV['COMPOSE_PROFILES'] = 'vo'
when 'gb'
# Graphdb backend
# NOTE(review): unlike the other branches, this one does not set
# COMPOSE_PROFILES — confirm whether that is intentional or an oversight.
ENV['GOO_BACKEND_NAME'] = 'graphdb'
ENV['GOO_PORT'] = '7200'
ENV['GOO_PATH_QUERY'] = '/repositories/ontoportal'
ENV['GOO_PATH_DATA'] = '/repositories/ontoportal/statements'
ENV['GOO_PATH_UPDATE'] = '/repositories/ontoportal/statements'
else
puts "Will import to default config set in config/config.rb"
end

require_relative '../../config/config'
# Destination CSV for the graph-vs-file comparison report.
OUTPUT_CSV = './graph_comparison.csv'

# Query the triple store for every named graph together with its triple count.
# Prints how many graphs were found and how long the SPARQL query took.
# Returns a Hash mapping graph URI (String) => triple count (Integer).
def get_all_graphs_counts
  solutions = []
  elapsed = Benchmark.realtime do
    sparql = "SELECT DISTINCT ?graph (COUNT(?s) as ?triplesCount) WHERE { GRAPH ?graph { ?s ?p ?o } } GROUP BY ?graph"
    Goo.sparql_query_client.query(sparql).each do |solution|
      solutions << solution
    end
  end
  puts "Found #{solutions.length} graphs in #{format('%.4f', elapsed)}s"

  # Fold the result set into a plain URI => count hash.
  solutions.each_with_object({}) do |solution, counts|
    counts[solution['graph'].to_s] = solution['triplesCount'].to_i
  end
end

# Count the number of lines in a file.
# NOTE(review): the previous comment claimed the first metadata line was
# excluded, but every line is counted — the metadata header is already stripped
# from the .n3 files by the migration script, so counting all lines is correct.
# Streams the file line by line instead of loading it into memory at once
# (triple dumps can be large).
def count_file_lines(file_path)
File.foreach(file_path).count
end

# Build a mapping from graph URI to the name of its exported data file.
# Scans +folder_path+ for "*.graph" files: each contains a graph URI on its
# first line, and its name minus the ".graph" suffix is the matching data file
# (e.g. "onto.n3.graph" -> "onto.n3").
# Returns a Hash of URI (String) => data file name (String), or nil (after
# printing a message) when the folder does not exist.
def build_graphs_file_hash(folder_path = PROCESSED_DIR)
# Ensure the folder path exists
unless Dir.exist?(folder_path)
puts "Folder does not exist: #{folder_path}"
return
end

graphs = {}
# Loop through each file in the folder
Dir.foreach(folder_path) do |filename|
# Skip directories and only process files ending with .graph
if filename.end_with?('.graph')
file_path = File.join(folder_path, filename)
# FIX: File.foreach closes the file handle; the previous
# File.open(...).readlines leaked the descriptor until GC. `.first`
# preserves the old `readlines.first` semantics (nil on empty file).
line = File.foreach(file_path).first
graphs[line.strip] = filename.to_s.gsub('.graph', '')
end
end
graphs
end

# Compare each graph's triple count against the line count of its exported
# data file and write one row per graph to OUTPUT_CSV.
# +graph_triples+ is a Hash of graph URI => triple count (see
# get_all_graphs_counts). Graphs with no matching file are reported as
# "Graph not found" / "File not found" with an "N/A" match status.
def compare_graphs_with_files(graph_triples)
  CSV.open(OUTPUT_CSV, 'w') do |csv|
    # Header row
    csv << ["Graph URI", "Triples in Graph", "Lines in File (excluding metadata)", "Match"]

    files_by_graph = build_graphs_file_hash
    graph_triples.each do |graph_uri, triples_count|
      data_file = files_by_graph[graph_uri]

      if data_file.nil?
        # No .graph file references this URI.
        csv << [graph_uri, triples_count, "Graph not found", "N/A"]
      else
        # Expected data file path derived from the graph URI's file name.
        file_path = "#{PROCESSED_DIR}/#{data_file}"
        if File.exist?(file_path)
          lines_in_file = count_file_lines(file_path)
          match_status = triples_count == lines_in_file ? "Yes" : "No"
          csv << [graph_uri, triples_count, lines_in_file, match_status]
        else
          csv << [graph_uri, triples_count, "File not found", "N/A"]
        end
      end
    end
  end

  puts "Comparison complete. Results saved to #{OUTPUT_CSV}"
end

# Main execution
# Flush Goo's Redis query cache first so the counts come from the triple
# store itself rather than cached query results.
Goo.sparql_query_client.cache.redis_cache.flushdb
puts "Redis cache flushed"

puts "Comparing graph triple counts with file lines and exporting to CSV..."
graph_triples = get_all_graphs_counts
compare_graphs_with_files(graph_triples)
71 changes: 71 additions & 0 deletions bin/migrations/import_metadata_graphs_to_store
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
#!/usr/bin/env ruby

require 'benchmark'
require 'shellwords'

# Imports every "<name>.n3" file in <processed_directory> into the triple
# store, using the graph URI stored in the companion "<name>.n3.graph" file.
# Each import is delegated to bin/migrations/import_nt_file.rb; per-file
# output is appended to ./process_log.log.
#
# Usage: import_metadata_graphs_to_store <processed_directory> [profile] [docker]
# Stop the script at the first error
begin
  # Check if the correct number of arguments are provided
  if ARGV.size < 1
    puts "Usage: #{$PROGRAM_NAME} <processed_directory>"
    exit 1
  end

  # Directory containing .n3 files and their .n3.graph companions
  processed_dir = ARGV[0]
  # Optional profile to use for the import (vo: virtuoso, fs: 4store, gb: GraphDB)
  profile = ARGV[1]

  # When the third argument is "docker", start the backend services first.
  docker = ARGV[2] == "docker"

  if docker
    result = system("./start_ontoportal_services.sh #{profile}")
    unless result
      puts "Error starting services"
      exit 1
    end
  end
  # Check if processed_files directory exists
  unless Dir.exist?(processed_dir)
    puts "Processed files directory #{processed_dir} does not exist!"
    exit 1
  end

  total_time = 0
  import_count = 0
  file_count = 0
  # Loop through all .n3 files in the processed_files directory
  Dir.glob(File.join(processed_dir, '*.n3')).each do |file|
    # Extract the associated .graph file (contains graph URI)
    graph_file = "#{file}.graph"

    # Check if graph file exists
    unless File.exist?(graph_file)
      puts "Graph file #{graph_file} not found. Skipping import of #{file}."
      next
    end

    # Extract the graph URI from the graph file
    graph_uri = File.read(graph_file).strip
    # FIX: count lines in Ruby instead of shelling out to `wc -l #{file}`,
    # which broke on paths containing spaces or shell metacharacters.
    line_count = File.foreach(file).count
    puts "Start importing #{file} into graph <#{graph_uri}> of line count #{line_count}"
    result = false
    time = Benchmark.realtime do
      # FIX: shell-escape the interpolated arguments — file paths and graph
      # URIs may contain characters the shell would otherwise interpret.
      # A nil profile is dropped (compact) to preserve the old argv shape.
      args = [file, graph_uri, profile].compact.map { |a| Shellwords.escape(a) }.join(' ')
      result = system("ruby bin/migrations/import_nt_file.rb #{args} >> ./process_log.log 2>&1")
    end

    file_count += 1
    total_time += time

    if !result
      puts "Error importing #{file} into graph <#{graph_uri}>"
      exit 1
    else
      import_count += 1
      puts "Imported <#{graph_uri}> successfully in #{time.round(2)} seconds"
    end
    puts "#############################################################"
  end
  puts "#{import_count}/#{file_count} files imported in #{total_time.round(2)} seconds"
rescue => e
  puts "Error: #{e.message}"
  exit 1
end
Loading
Loading