Skip to content

Commit

Permalink
implemented the first pass at bmir-radx/radx-project#37
Browse files Browse the repository at this point in the history
  • Loading branch information
mdorf committed Dec 17, 2023
1 parent 6317dc4 commit a659415
Show file tree
Hide file tree
Showing 8 changed files with 277 additions and 209 deletions.
57 changes: 30 additions & 27 deletions Gemfile.lock
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ GIT

GIT
remote: https://github.com/ncbo/ncbo_annotator.git
revision: ebbb7a3c28ecde49c261290bec34ab082490a271
revision: d7ee80860a0eab9293af81083a0700d099c50263
branch: develop
specs:
ncbo_annotator (0.0.1)
Expand All @@ -26,7 +26,7 @@ GIT

GIT
remote: https://github.com/ncbo/ontologies_linked_data.git
revision: 5600020a8017cb4901e719f577032b0be6a14949
revision: 9487c7f73e68abab097af523d42c1d2e106e614b
branch: develop
specs:
ontologies_linked_data (0.0.1)
Expand Down Expand Up @@ -74,10 +74,11 @@ GEM
activesupport (3.2.22.5)
i18n (~> 0.6, >= 0.6.4)
multi_json (~> 1.0)
addressable (2.8.5)
addressable (2.8.6)
public_suffix (>= 2.0.2, < 6.0)
base64 (0.2.0)
bcrypt (3.1.19)
bcrypt (3.1.20)
bigdecimal (3.1.4)
builder (3.2.4)
coderay (1.1.3)
concurrent-ruby (1.2.2)
Expand All @@ -91,7 +92,7 @@ GEM
htmlentities (~> 4.3.3)
launchy (~> 2.1)
mail (~> 2.6)
faraday (2.7.11)
faraday (2.7.12)
base64
faraday-net_http (>= 2.0, < 3.1)
ruby2_keywords (>= 0.0.4)
Expand Down Expand Up @@ -124,35 +125,36 @@ GEM
retriable (>= 2.0, < 4.a)
rexml
webrick
google-cloud-core (1.6.0)
google-cloud-env (~> 1.0)
google-cloud-core (1.6.1)
google-cloud-env (>= 1.0, < 3.a)
google-cloud-errors (~> 1.0)
google-cloud-env (1.6.0)
faraday (>= 0.17.3, < 3.0)
google-cloud-env (2.1.0)
faraday (>= 1.0, < 3.a)
google-cloud-errors (1.3.1)
google-protobuf (3.25.0)
google-protobuf (3.25.0-x86_64-darwin)
google-protobuf (3.25.0-x86_64-linux)
google-protobuf (3.25.1)
google-protobuf (3.25.1-x86_64-darwin)
google-protobuf (3.25.1-x86_64-linux)
googleapis-common-protos (1.4.0)
google-protobuf (~> 3.14)
googleapis-common-protos-types (~> 1.2)
grpc (~> 1.27)
googleapis-common-protos-types (1.10.0)
googleapis-common-protos-types (1.11.0)
google-protobuf (~> 3.18)
googleauth (1.8.1)
faraday (>= 0.17.3, < 3.a)
googleauth (1.9.1)
faraday (>= 1.0, < 3.a)
google-cloud-env (~> 2.1)
jwt (>= 1.4, < 3.0)
multi_json (~> 1.11)
os (>= 0.9, < 2.0)
signet (>= 0.16, < 2.a)
grpc (1.59.2)
google-protobuf (~> 3.24)
grpc (1.60.0)
google-protobuf (~> 3.25)
googleapis-common-protos-types (~> 1.0)
grpc (1.59.2-x86_64-darwin)
google-protobuf (~> 3.24)
grpc (1.60.0-x86_64-darwin)
google-protobuf (~> 3.25)
googleapis-common-protos-types (~> 1.0)
grpc (1.59.2-x86_64-linux)
google-protobuf (~> 3.24)
grpc (1.60.0-x86_64-linux)
google-protobuf (~> 3.25)
googleapis-common-protos-types (~> 1.0)
htmlentities (4.3.4)
http-accept (1.7.0)
Expand All @@ -161,8 +163,8 @@ GEM
httpclient (2.8.3)
i18n (0.9.5)
concurrent-ruby (~> 1.0)
json (2.6.3)
json_pure (2.6.3)
json (2.7.1)
json_pure (2.7.1)
jwt (2.7.1)
launchy (2.5.2)
addressable (~> 2.8)
Expand All @@ -175,15 +177,16 @@ GEM
method_source (1.0.0)
mime-types (3.5.1)
mime-types-data (~> 3.2015)
mime-types-data (3.2023.1003)
mime-types-data (3.2023.1205)
mini_mime (1.1.5)
minitest (4.7.5)
mlanett-redis-lock (0.2.7)
redis
multi_json (1.15.0)
net-http-persistent (2.9.4)
netrc (0.11.0)
oj (3.16.1)
oj (3.16.3)
bigdecimal (>= 3.0)
omni_logger (0.1.4)
logger
os (1.1.4)
Expand All @@ -193,7 +196,7 @@ GEM
pry (0.14.2)
coderay (~> 1.1)
method_source (~> 1.0)
public_suffix (5.0.3)
public_suffix (5.0.4)
rack (3.0.8)
rack-test (2.1.0)
rack (>= 1.3)
Expand All @@ -202,7 +205,7 @@ GEM
addressable (>= 2.2)
redis (5.0.8)
redis-client (>= 0.17.0)
redis-client (0.18.0)
redis-client (0.19.0)
connection_pool
representable (3.2.0)
declarative (< 0.1.0)
Expand Down
2 changes: 1 addition & 1 deletion bin/ncbo_ontology_pull
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ logger = Logger.new($stdout)
logger.info "Starting ncbo pull"; logger.flush
puller = NcboCron::Models::OntologyPull.new
begin
puller.do_ontology_pull(ontology_acronym, logger: logger , enable_pull_umls:true )
puller.do_ontology_pull(ontology_acronym, logger: logger, enable_pull_umls: true)
rescue StandardError => e
logger.error e.message
logger.flush
Expand Down
185 changes: 185 additions & 0 deletions lib/ncbo_cron/ontology_helper.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,185 @@
require 'logger'

module NcboCron
  module Helpers
    # Shared helpers for pulling a remote ontology file and, when its content
    # has changed, creating and queueing a new submission for processing.
    module OntologyHelper

      # Prefix prepended to submission IDs when stored in the Redis process queue.
      REDIS_SUBMISSION_ID_PREFIX = "sub:".freeze
      # Redis hash key that holds the queue of submissions awaiting processing.
      PROCESS_QUEUE_HOLDER = "parseQueue".freeze
      # Default set of processing actions for a queued submission.
      # :remote_pull is off by default; every other action runs.
      PROCESS_ACTIONS = {
        :process_rdf => true,
        :generate_labels => true,
        :index_search => true,
        :index_properties => true,
        :run_metrics => true,
        :process_annotator => true,
        :diff => true,
        :remote_pull => false
      }.freeze

      # Raised when the remote file at a submission's pullLocation is unreachable.
      # Carries the submission so callers can report which ontology failed.
      class RemoteFileException < StandardError
        attr_reader :submission

        def initialize(submission)
          super
          @submission = submission
        end
      end

      # Pulls the latest remote file for the given ontology and, if its MD5
      # differs from the current submission's file, creates a new submission.
      #
      # @param ontology_acronym [String] acronym of the ontology to pull
      # @param enable_pull_umls [Boolean] allow pulling UMLS ontologies
      # @param umls_download_url [String] optional alternative UMLS download base URL
      # @param logger [Logger, nil] defaults to a STDOUT logger
      # @param add_to_queue [Boolean] enqueue the new submission for processing
      # @return [LinkedData::Models::OntologySubmission, nil] the new submission,
      #   or nil when the remote file content is unchanged
      # @raise [StandardError] when the ontology, its latest submission, or its
      #   pullLocation is missing, or a UMLS pull is attempted while disabled
      # @raise [RemoteFileException] when the remote file does not exist
      def self.do_ontology_pull(ontology_acronym, enable_pull_umls = false, umls_download_url = '', logger = nil,
                                add_to_queue = true)
        logger ||= Logger.new($stdout)
        ont = LinkedData::Models::Ontology.find(ontology_acronym).include(:acronym).first
        raise StandardError, "Ontology #{ontology_acronym} not found" if ont.nil?

        last = ont.latest_submission(status: :any)
        raise StandardError, "No submission found for #{ontology_acronym}" if last.nil?

        last.bring(:hasOntologyLanguage) if last.bring?(:hasOntologyLanguage)
        raise StandardError, "Pull umls not enabled" if !enable_pull_umls && last.hasOntologyLanguage.umls?

        last.bring(:pullLocation) if last.bring?(:pullLocation)
        raise StandardError, "#{ontology_acronym} has no pullLocation" if last.pullLocation.nil?

        last.bring(:uploadFilePath) if last.bring?(:uploadFilePath)

        if last.hasOntologyLanguage.umls? && umls_download_url && !umls_download_url.empty?
          # redirect the download to the alternative UMLS mirror, keeping the original filename
          last.pullLocation = RDF::URI.new(umls_download_url + last.pullLocation.split("/")[-1])
          logger.info("Using alternative download for umls #{last.pullLocation.to_s}")
          logger.flush
        end

        raise RemoteFileException.new(last) unless last.remote_file_exists?(last.pullLocation.to_s)

        logger.info "Checking download for #{ont.acronym}"
        logger.info "Location: #{last.pullLocation.to_s}"; logger.flush
        file, filename = last.download_ontology_file
        new_submission = nil

        begin
          file, md5local, md5remote, new_file_exists = self.new_file_exists?(file, last)

          if new_file_exists
            logger.info "New file found for #{ont.acronym}\nold: #{md5local}\nnew: #{md5remote}"
            logger.flush
            new_submission = self.create_submission(ont, last, file, filename, logger, add_to_queue)
          else
            logger.info "There is no new file found for #{ont.acronym}"
            logger.flush
          end
        ensure
          # release the downloaded file handle even when submission creation raises
          file.close if file && !file.closed?
        end
        new_submission
      end

      # Creates a new submission for +ont+ by cloning the attributes of +sub+
      # and attaching the freshly downloaded +file+. The file is validated with
      # OWLAPI before the submission is saved; on parse failure the file is
      # deleted and the (unsaved) submission is returned.
      #
      # @param ont [LinkedData::Models::Ontology] ontology receiving the submission
      # @param sub [LinkedData::Models::OntologySubmission] submission to clone
      # @param file [File] downloaded ontology file
      # @param filename [String] original name of the downloaded file
      # @param logger [Logger, nil] falls back to a global LOGGER constant, then STDOUT
      # @param add_to_queue [Boolean] enqueue the saved submission for processing
      # @param new_version [String, nil] version to set on the new submission
      # @param new_released [String, nil] release date string; defaults to now
      # @return [LinkedData::Models::OntologySubmission] the new submission
      #   (unsaved when OWLAPI parsing or validation failed)
      def self.create_submission(ont, sub, file, filename, logger = nil, add_to_queue = true, new_version = nil,
                                 new_released = nil)
        logger ||= Kernel.const_defined?("LOGGER") ? Kernel.const_get("LOGGER") : Logger.new(STDOUT)
        new_sub = LinkedData::Models::OntologySubmission.new

        # clone every loaded attribute of the previous submission
        sub.bring_remaining
        sub.loaded_attributes.each do |attr|
          new_sub.send("#{attr}=", sub.send(attr))
        end

        submission_id = ont.next_submission_id
        new_sub.submissionId = submission_id
        file_location = LinkedData::Models::OntologySubmission.copy_file_repository(ont.acronym, submission_id, file, filename)
        new_sub.uploadFilePath = file_location
        new_sub.version = new_version unless new_version.nil?
        new_sub.released = new_released.nil? ? DateTime.now : DateTime.parse(new_released)
        # reset processing state carried over from the cloned submission
        new_sub.submissionStatus = nil
        new_sub.creationDate = nil
        new_sub.missingImports = nil
        new_sub.metrics = nil
        full_file_path = File.expand_path(file_location)

        # check if OWLAPI is able to parse the file before creating a new submission
        owlapi = LinkedData::Parser::OWLAPICommand.new(
          full_file_path,
          File.expand_path(new_sub.data_folder.to_s),
          logger: logger)
        owlapi.disable_reasoner
        parsable = true

        begin
          owlapi.parse
        rescue StandardError => e
          # NOTE(review): was `rescue Exception`, which also swallows signals and
          # SystemExit; narrowed to StandardError
          logger.error("The new file for ontology #{ont.acronym}, submission id: #{submission_id} did not clear OWLAPI: #{e.class}: #{e.message}\n#{e.backtrace.join("\n\t")}")
          logger.error("A new submission has NOT been created.")
          logger.flush
          parsable = false
        end

        if parsable
          if new_sub.valid?
            new_sub.save

            if add_to_queue
              self.queue_submission(new_sub, { all: true })
              logger.info("OntologyPull created a new submission (#{submission_id}) for ontology #{ont.acronym}")
            end
          else
            logger.error("Unable to create a new submission for ontology #{ont.acronym} with id #{submission_id}: #{new_sub.errors}")
            logger.flush
          end
        else
          # delete the bad file so subsequent pulls do not pick it up
          File.delete(full_file_path) if File.exist?(full_file_path)
        end
        new_sub
      end

      # Adds a submission to the Redis processing queue. With {all: true} the
      # full PROCESS_ACTIONS set is queued; otherwise unknown action keys are
      # filtered out (without mutating the caller's hash). Nothing is queued
      # when no recognized actions remain.
      def self.queue_submission(submission, actions = { :all => true })
        redis = Redis.new(:host => NcboCron.settings.redis_host, :port => NcboCron.settings.redis_port)

        if actions[:all]
          actions = PROCESS_ACTIONS.dup
        else
          actions = actions.select { |k, _v| PROCESS_ACTIONS.key?(k) }
        end
        action_str = MultiJson.dump(actions)
        redis.hset(PROCESS_QUEUE_HOLDER, get_prefixed_id(submission.id), action_str) unless actions.empty?
      end

      # @return [String] the submission id prefixed for use as a Redis queue field
      def self.get_prefixed_id(id)
        "#{REDIS_SUBMISSION_ID_PREFIX}#{id}"
      end

      # @return [String] the last path segment of a URI (e.g. a filename)
      def self.last_fragment_of_uri(uri)
        uri.to_s.split("/")[-1]
      end

      # Extracts the ontology acronym from a submission ID URI of the form
      # .../ontologies/<ACRONYM>/submissions/<n>
      def self.acronym_from_submission_id(submissionID)
        submissionID.to_s.split("/")[-3]
      end

      # Compares the MD5 of a freshly downloaded ontology file against the MD5
      # of the submission's existing upload file.
      #
      # @param file [File] open handle on the downloaded file
      # @param last [LinkedData::Models::OntologySubmission] latest submission
      # @return [Array(File, String, String, Boolean)] a re-opened binary handle
      #   on the downloaded file, the local MD5 (nil when no existing file), the
      #   remote MD5, and whether the downloaded content is new
      def self.new_file_exists?(file, last)
        # close the original handle before re-opening in binary mode (avoids a leak)
        file.close unless file.closed?
        file = File.open(file.path, "rb")
        md5remote = Digest::MD5.hexdigest(file.read)
        file.rewind
        md5local = nil

        if last.uploadFilePath && File.exist?(last.uploadFilePath)
          # File.read instead of Kernel#open: Kernel#open spawns a subprocess
          # for paths beginning with "|"
          md5local = Digest::MD5.hexdigest(File.read(last.uploadFilePath))
          new_file_exists = !md5remote.eql?(md5local)
        else
          # there is no existing file, so create a submission with the downloaded one
          new_file_exists = true
        end
        return file, md5local, md5remote, new_file_exists
      end

    end
  end
end
Loading

0 comments on commit a659415

Please sign in to comment.