From b88afb69ca172384b1d600b11991f0716c565dfd Mon Sep 17 00:00:00 2001 From: Jeremy Friesen Date: Wed, 24 May 2023 14:52:04 -0400 Subject: [PATCH] =?UTF-8?q?=F0=9F=8E=81=20Adding=20IiifPrint::DerivativeRo?= =?UTF-8?q?deoService?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As written, this is a draft commit. Its purpose is to share the implementation details and how we might incorporate the DerivativeRodeo into the derivative generation. This draft will break the build as it does not include adding the DerivativeRodeo as a dependency. An ongoing challenge is that Valkyrie and ActiveFedora's Faraday requirements conflict with the DerivativeRodeo. I put this code forward with lots of comments and the beginnings of tests to ensure that we're on the right path. Related to: - https://github.com/scientist-softserv/iiif_print/issues/219 - https://github.com/scientist-softserv/iiif_print/pull/243 --- .../iiif_print/derivative_rodeo_service.rb | 157 ++++++++++++++++++ lib/iiif_print/engine.rb | 1 + .../jobs/child_works_from_pdf_job.rb | 2 + .../derivative_rodeo_service_spec.rb | 85 ++++++++++ 4 files changed, 245 insertions(+) create mode 100644 app/services/iiif_print/derivative_rodeo_service.rb create mode 100644 spec/services/iiif_print/derivative_rodeo_service_spec.rb diff --git a/app/services/iiif_print/derivative_rodeo_service.rb b/app/services/iiif_print/derivative_rodeo_service.rb new file mode 100644 index 00000000..7afe3d85 --- /dev/null +++ b/app/services/iiif_print/derivative_rodeo_service.rb @@ -0,0 +1,157 @@ +module IiifPrint + ## + # This class implements the interface of a Hyrax::DerivativeService. + # + # That means three important methods are: + # + # - {#valid?} + # - {#create_derivatives} + # - {#cleanup_derivatives} + # + # And the object initializes with a FileSet. + # + # It is a companion to {IiifPrint::PluggableDerivativeService}. + # + # @see https://github.com/samvera/hyrax/blob/main/app/services/hyrax/derivative_service.rb Hyrax::DerivativesService + class DerivativeRodeoService + ## + # @!group Class Attributes + # + # @attr parent_work_identifier_property_name [String] the property we use to identify the unique + # identifier of the parent work as it went through the SpaceStone pre-process. + # + # TODO: The default of :aark_id is a quick hack for adventist. By exposing a configuration + # value, my hope is that this becomes easier to configure. + class_attribute :parent_work_identifier_property_name, default: 'aark_id' + + ## + # @attr input_location_adapter_name [String] The name of a derivative rodeo storage location; + # this will must be a registered with the DerivativeRodeo::StorageLocations::BaseLocation. + class_attribute :input_location_adapter_name, default: 's3' + + ## + # @attr named_derivatives_and_generators_by_type [Hash] the named derivative and it's + # associated generator. The "name" is important for Hyrax. The generator is one that + # exists in the DerivativeRodeo. + # + # TODO: Could be nice to have a registry for the DerivativeRodeo::Generators; but that's a + # tomorrow wish. + class_attribute(:named_derivatives_and_generators_by_type, default: { + pdf: { thumbnail: "DerivativeRodeo::Generators::ThumbnailGenerator" } + }) + # @!endgroup Class Attributes + ## + + ## + # This method "hard-codes" some existing assumptions about the input_uri based on + # implementations for Adventist. Those are reasonable assumptions but time will tell how + # reasonable. + # + # @param file_set [FileSet] + # @return [String] + def self.derivative_rodeo_input_uri(file_set:) + return @derivative_rodeo_input_uri if defined?(@derivative_rodeo_input_uri) + + # TODO: URGENT For a child work (e.g. an image split off of a PDF) we will know that the file_set's + # parent is a child, and the rules of the URI for those derivatives are different from the + # original ingested PDF or the original ingested Image. + + # TODO: This logic will work for an attached PDF; but not for each of the split pages of that + # PDF. How to do that? + + # TODO: This is duplicated logic for another service, consider extracting a helper module; + # better yet wouldn't it be nice if Hyrax did this right and proper. + parent = file_set.parent || file_set.member_of.find(&:work?) + raise IiifPrint::DataError, "Parent not found for #{file_set.class} ID=#{file_set.id}" unless parent + + dirname = parent.public_send(parent_work_identifier_property_name) + + # TODO: This is a hack that knows about the inner workings of Hydra::Works, but for + # expendiency, I'm using it. See + # https://github.com/samvera/hydra-works/blob/c9b9dd0cf11de671920ba0a7161db68ccf9b7f6d/lib/hydra/works/services/add_file_to_file_set.rb#L49-L53 + # TODO: Could we get away with filename that is passed in the create_derivatives process? + filename = Hydra::Works::DetermineOriginalName.call(file_set.original_file) + + # TODO: What kinds of exceptions might we raise if the location is not configured? Do we need + # to "validate" it in another step. + location = DerivativeRodeo::StorageLocations::BaseLocation.load_location(input_location_adapter_name) + + # TODO: This is based on the provided output template in + # https://github.com/scientist-softserv/space_stone-serverless/blob/0dbe2b6fa13b9f4bf8b9580ec14d0af5c98e2b00/awslambda/bin/sample_post.rb#L1 + # and is very much hard-coded. We likely want to "store" the template in a common place for + # the application. + # + # s3://s3-antics/:aark_id/:file_name_with_extension + # s3://s3-antics/12345/hello-world.pdf + @derivative_rodeo_input_uri = File.join(location.adapter_prefix, dirname, File.basename(filename)) + end + + def initialize(file_set) + @file_set = file_set + end + + attr_reader :file_set + delegate :uri, :mime_type, to: :file_set + + ## + # @return + # @see https://github.com/samvera/hyrax/blob/426575a9065a5dd3b30f458f5589a0a705ad7be2/app/services/hyrax/file_set_derivatives_service.rb#L18-L20 Hyrax::FileSetDerivativesService#valid? + def valid? + if in_the_rodeo? + Rails.logger.info("🤠🐮 Using the DerivativeRodeo for FileSet ID=#{file_set.id} with mime_type of #{mime_type}") + true + else + Rails.logger.info("Skipping the DerivativeRodeo for FileSet ID=#{file_set.id} with mime_type of #{mime_type}") + false + end + end + + ## + # @api public + # + # The file_set.class.*_mime_types are carried over from Hyrax. + def create_derivatives(filename) + # TODO: Do we need to handle "impending derivatives?" as per {IiifPrint::PluggableDerivativeService}? + case mime_type + when file_set.class.pdf_mime_types + lasso_up_some_derivatives(filename: filename, type: :pdf) + when file_set.class.image_mime_types + lasso_up_some_derivatives(filename: filename, type: :image) + else + # TODO: add more mime types but for now image and PDF are the two we accept. + raise "Unexpected mime_type #{mime_type} for filename #{filename}" + end + end + + private + + def lasso_up_some_derivatives(type:, **) + # TODO: Can we use the filename instead of the antics of the original_file on the file_set? + # We have the filename in create_derivatives. + named_derivatives_and_generators_by_type.fetch(type).flat_map do |named_derivative, generator_name| + # This is the location that Hyrax expects us to put files that will be added to Fedora. + output_location_template = "file://#{Hyrax::DerivativePath.derivative_path_for_reference(file_set, named_derivative)}" + generator = generator_name.constantize + generator.new(input_uris: [derivative_rodeo_input_uri], output_location_template: output_location_template).generate_uris + end + end + + def supported_mime_types + # If we've configured the rodeo + named_derivatives_and_generators_by_type.keys.flat_map { |type| file_set.class.public_send("#{type}_mime_types") } + end + + def derivative_rodeo_input_uri + @derivative_rodeo_input_uri ||= self.class.derivative_rodeo_input_uri(file_set: file_set) + end + + def in_the_rodeo? + # We can assume that we are not going to process a supported mime type; and there is a cost + # for looking in the rodeo. + return false unless supported_mime_types.include?(mime_type) + + location = DerivativeRodeo::StorageLocations::BaseLocation.from_uri(derivative_rodeo_input_uri) + location.exist? + end + end +end diff --git a/lib/iiif_print/engine.rb b/lib/iiif_print/engine.rb index b8273cf2..1330f20a 100644 --- a/lib/iiif_print/engine.rb +++ b/lib/iiif_print/engine.rb @@ -1,6 +1,7 @@ require 'active_fedora' require 'hyrax' require 'blacklight_iiif_search' +require 'derivative_rodeo' module IiifPrint # module constants: diff --git a/lib/iiif_print/jobs/child_works_from_pdf_job.rb b/lib/iiif_print/jobs/child_works_from_pdf_job.rb index d3e9825a..fef883ce 100644 --- a/lib/iiif_print/jobs/child_works_from_pdf_job.rb +++ b/lib/iiif_print/jobs/child_works_from_pdf_job.rb @@ -48,6 +48,8 @@ def perform(candidate_for_parency, pdf_paths, user, admin_set_id, *) # rubocop:disable Metrics/ParameterLists def split_pdf(original_pdf_path, user, child_model) + # TODO: This is the place to change out the existing service and instead use the derivative + # rodeo; we will likely need to look at method signatures to tighten this interface. image_files = @parent_work.iiif_print_config.pdf_splitter_service.call(original_pdf_path) return if image_files.blank? diff --git a/spec/services/iiif_print/derivative_rodeo_service_spec.rb b/spec/services/iiif_print/derivative_rodeo_service_spec.rb new file mode 100644 index 00000000..303127e3 --- /dev/null +++ b/spec/services/iiif_print/derivative_rodeo_service_spec.rb @@ -0,0 +1,85 @@ +# frozen_string_literal: true + +require 'spec_helper' + +RSpec.describe IiifPrint::DerivativeRodeoService do + let(:work) { double({ described_class.parent_work_identifier_property_name => 'hello-1234-id' }) } + let(:file_set) { FileSet.new.tap { |fs| fs.save!(validate: false) } } + let(:generator) { DerivativeRodeo::Generators::CopyGenerator } + let(:output_extension) { "rb" } + + let(:instance) { described_class.new(file_set) } + + subject(:klass) { described_class } + + describe '.input_location_adapter_name' do + subject { described_class.input_location_adapter_name } + it { is_expected.to eq 's3' } + end + + describe '.parent_work_identifier_property_name' do + subject { described_class.parent_work_identifier_property_name } + it { is_expected.to be_a String } + end + + describe '.named_derivatives_and_generators_by_type' do + subject { described_class.named_derivatives_and_generators_by_type } + it { is_expected.to be_a Hash } + end + + describe '.derivative_rodeo_input_uri' do + subject { described_class.derivative_rodeo_input_uri(file_set: file_set) } + + context 'when the file_set does not have a parent' do + xit 'is expected to raise an error' do + expect { subject }.to raise_error(IiifPrint::DataError) + end + end + + context 'when the file_set has a parent' do + before do + allow(file_set).to receive(:parent).and_return(work) + # TODO: This is a hack that leverages the internals of Hydra::Works; not excited about it but + # this part is only one piece of the over all integration. + allow(file_set).to receive(:original_file).and_return(double(original_filename: __FILE__)) + end + + it { is_expected.to start_with("#{described_class.input_location_adapter_name}://") } + it { is_expected.to end_with(File.basename(__FILE__)) } + end + end + + # TODO: Need Faux Bucket for Derivative Rodeo + xdescribe '#valid?' do + subject { instance.valid? } + + before do + allow(file_set).to receive(:mime_type).and_return(mime_type) + allow(file_set).to receive(:parent).and_return(work) + end + + context 'when the mime_type of the file is not supported' do + let(:mime_type) { "text/plain" } + it { is_expected.to be_falsey } + end + + context 'when derivative rodeo has not pre-processed the file set' do + before { instance.input_location_adapter_name = "file" } + + let(:mime_type) { "application/pdf" } + it { is_expected.to be_falsey } + end + + context 'when the mime type is supported and the derivative rodeo has pre-processed the file set' do + before do + # TODO: write to the rodeo; consider using AWS's spec support; I want to be able to "fake" S3 + # with a "fake" bucket. + # + # Dependent on https://github.com/scientist-softserv/derivative_rodeo/pull/37 + end + + let(:mime_type) { "application/pdf" } + it { is_expected.to be_truthy } + end + end +end