From 07c79d5ca4cdef26f225c3882d5a0e68ee3b3de3 Mon Sep 17 00:00:00 2001 From: Johnathan Martin Date: Wed, 23 Oct 2024 21:50:24 -0700 Subject: [PATCH 1/2] FilesByMd5Service: in addition to confirming that a file exists before including it in the array #call returns, confirm that the file is of the expected size fixes https://github.com/sul-dlss/dor-services-app/issues/5191 --- app/services/files_by_md5_service.rb | 42 +++++++++++++++---- .../versioned_files_service/cocina.rb | 15 ++++--- .../versioned_files_service/object.rb | 8 ++-- spec/services/files_by_md5_service_spec.rb | 37 ++++++++++++---- .../versioned_files_service/object_spec.rb | 12 ++++-- 5 files changed, 86 insertions(+), 28 deletions(-) diff --git a/app/services/files_by_md5_service.rb b/app/services/files_by_md5_service.rb index 142d08d2..24b7c5d0 100644 --- a/app/services/files_by_md5_service.rb +++ b/app/services/files_by_md5_service.rb @@ -30,7 +30,12 @@ def call delegate :druid, to: :purl def versioned_files_by_md5 - versioned_files_object.files_by_md5.select { |md5_file| check_exists(versioned_files_object.content_path_for(md5: md5_file.keys.first)) } + correct_and_present_file_details = versioned_files_object.file_details_by_md5.select do |md5_file_details| + md5_filename = versioned_files_object.content_path_for(md5: md5_file_details.keys.first) + md5_filesize = md5_file_details.values.first['size'] + check_exists_and_complete(md5_filename, md5_filesize) + end + filenames_by_md5(correct_and_present_file_details) end def versioned_files_object @@ -43,15 +48,36 @@ def unversioned_files_by_md5 return [] unless purl.public_json cocina = VersionedFilesService::Cocina.new(hash: purl.public_json.cocina_hash) - cocina.files_by_md5.select { |md5_file| check_exists(versioned_files_object.stacks_object_path.join(md5_file.values.first)) } + correct_and_present_file_details = cocina.file_details_by_md5.select do |md5_file_details| + md5_filename = versioned_files_object.stacks_object_path.join(md5_file_details.values.first['filename']) + md5_filesize = md5_file_details.values.first['size'] + check_exists_and_complete(md5_filename, md5_filesize) + end + filenames_by_md5(correct_and_present_file_details) end - def check_exists(path) - if path.exist? - true - else - Honeybadger.notify("File missing from shelves", context: { path: path.to_s, druid: }) - false + def filenames_by_md5(files) + files.map do |md5_file_details| + md5 = md5_file_details.keys.first + { md5 => md5_file_details[md5]['filename'] } + end + end + + def check_exists_and_complete(path, expected_size) + context = { path: path.to_s, druid:, expected_size: } + + unless path.exist? + Honeybadger.notify("File missing from shelves", context:) + return false end + + actual_size = File.size(path) + if actual_size != expected_size + context[:actual_size] = actual_size + Honeybadger.notify("File path present on shelves but file isn't the expected size. It's likely a bug shelved the wrong content.", context:) + return false + end + + true end end diff --git a/app/services/versioned_files_service/cocina.rb b/app/services/versioned_files_service/cocina.rb index 39ae0012..940036ea 100644 --- a/app/services/versioned_files_service/cocina.rb +++ b/app/services/versioned_files_service/cocina.rb @@ -24,12 +24,17 @@ def shelve_file_map # @return [Array>] array of hashes with md5 as key and filename as value for shelved files # For example: [ - # { "5b79c8570b7ef582735f912aa24ce5f2" => "2542A.tiff" }, - # { "cd5ca5c4666cfd5ce0e9dc8c83461d7a" => "2542A.jp2" } + # { "5b79c8570b7ef582735f912aa24ce5f2" => { "filename" => "2542A.tiff", "size" => 456 } }, + # { "cd5ca5c4666cfd5ce0e9dc8c83461d7a" => { "filename" => "2542A.jp2", "size" => 123 } } # ] - def files_by_md5 - @files_by_md5 ||= shelved_files.map do |file| - { md5_for(file) => file.fetch('filename') } + def file_details_by_md5 + @file_details_by_md5 ||= shelved_files.map do |file| + { + md5_for(file) => { + 'filename' => file.fetch('filename'), + 'size' => file.fetch('size') + } + } end end diff --git a/app/services/versioned_files_service/object.rb b/app/services/versioned_files_service/object.rb index 1b797c36..e3fdca51 100644 --- a/app/services/versioned_files_service/object.rb +++ b/app/services/versioned_files_service/object.rb @@ -32,11 +32,11 @@ def version_manifest # @return [Array>] array of hashes with md5 as key and filename as value for shelved files for all versions. # For example: [ - # { "5b79c8570b7ef582735f912aa24ce5f2" => "2542A.tiff" }, - # { "cd5ca5c4666cfd5ce0e9dc8c83461d7a" => "2542A.jp2" } + # { "5b79c8570b7ef582735f912aa24ce5f2" => { "filename" => "2542A.tiff", "size" => 456 } }, + # { "cd5ca5c4666cfd5ce0e9dc8c83461d7a" => { "filename" => "2542A.jp2", "size" => 123 } } # ] - def files_by_md5 - versions.flat_map { |version| Cocina.for(druid:, version:).files_by_md5 }.uniq + def file_details_by_md5 + versions.flat_map { |version| Cocina.for(druid:, version:).file_details_by_md5 }.uniq end private diff --git a/spec/services/files_by_md5_service_spec.rb b/spec/services/files_by_md5_service_spec.rb index ebfb8caf..04f60222 100644 --- a/spec/services/files_by_md5_service_spec.rb +++ b/spec/services/files_by_md5_service_spec.rb @@ -17,15 +17,36 @@ end context 'when object is unversioned' do - before do - FileUtils.mkdir_p(stacks_object_path) - File.write("#{stacks_object_path}/2542A.jp2", '2542A.jp2') + context 'when all files are either missing or truncated' do + before do + FileUtils.mkdir_p(stacks_object_path) + File.write("#{stacks_object_path}/2542A.jp2", '2542A.jp2') # this file is smaller than the expected size + end + + it 'returns the files by md5' do + expect(files_by_md5).to eq([]) + + context = { path: "#{stacks_object_path}/2542A.tiff", druid:, expected_size: 3_182_927 } + expect(Honeybadger).to have_received(:notify).with("File missing from shelves", context:) + context = { path: "#{stacks_object_path}/2542A.jp2", druid:, expected_size: 11_043, actual_size: 9 } + expect(Honeybadger).to have_received(:notify).with("File path present on shelves but file isn't the expected size. It's likely a bug shelved the wrong content.", context:) + end end - it 'returns the files by md5' do - expect(files_by_md5).to eq([{ "cd5ca5c4666cfd5ce0e9dc8c83461d7a" => "2542A.jp2" }]) + context 'when one of the two files is present at the expected size' do + before do + FileUtils.mkdir_p(stacks_object_path) + File.write("#{stacks_object_path}/2542A.jp2", '1' * 11_043) + end + + it 'returns the files by md5' do + expect(files_by_md5).to eq([{ "cd5ca5c4666cfd5ce0e9dc8c83461d7a" => "2542A.jp2" }]) - expect(Honeybadger).to have_received(:notify).with("File missing from shelves", context: { path: "#{stacks_object_path}/2542A.tiff", druid: }) + context = { path: "#{stacks_object_path}/2542A.tiff", druid:, expected_size: 3_182_927 } + expect(Honeybadger).to have_received(:notify).with("File missing from shelves", context:) + context = an_instance_of(Hash) + expect(Honeybadger).not_to have_received(:notify).with("File path present on shelves but file isn't the expected size. It's likely a bug shelved the wrong content.", context:) + end end end @@ -49,6 +70,7 @@ type: Cocina::Models::ObjectType.file, label: 'the regular file', filename: 'file2.txt', + size: 9, # write_version uses the file name for the file content version: 1, hasMessageDigests: [ { type: 'md5', digest: '3e25960a79dbc69b674cd4ec67a72c62' } @@ -63,6 +85,7 @@ type: Cocina::Models::ObjectType.file, label: 'the hierarchical file', filename: 'files/file2.txt', + size: 15, # write_version uses the file name for the file content version: 1, hasMessageDigests: [ { type: 'md5', digest: '5997de4d5abb55f21f652aa61b8f3aaf' } @@ -92,7 +115,7 @@ expect(files_by_md5).to eq([ { "3e25960a79dbc69b674cd4ec67a72c62" => "file2.txt" } ]) - expect(Honeybadger).to have_received(:notify).with("File missing from shelves", context: { path: "#{content_path}/5997de4d5abb55f21f652aa61b8f3aaf", druid: }) + expect(Honeybadger).to have_received(:notify).with("File missing from shelves", context: { path: "#{content_path}/5997de4d5abb55f21f652aa61b8f3aaf", druid:, expected_size: 15 }) end end end diff --git a/spec/services/versioned_files_service/object_spec.rb b/spec/services/versioned_files_service/object_spec.rb index ca2114fd..8a926ba5 100644 --- a/spec/services/versioned_files_service/object_spec.rb +++ b/spec/services/versioned_files_service/object_spec.rb @@ -193,6 +193,7 @@ type: Cocina::Models::ObjectType.file, label: 'the regular file', filename: 'file2.txt', + size: 9, # write_version uses the file name for the file content version: 1, hasMessageDigests: [ { type: 'md5', digest: '3e25960a79dbc69b674cd4ec67a72c62' } @@ -207,6 +208,7 @@ type: Cocina::Models::ObjectType.file, label: 'a file only in version 1', filename: 'files/file0.txt', + size: 15, # write_version uses the file name for the file content version: 1, hasMessageDigests: [ { type: 'md5', digest: '3497de4d5abb55f21f652aa61b8f3abd' } @@ -239,6 +241,7 @@ type: Cocina::Models::ObjectType.file, label: 'the regular file', filename: 'file2.txt', + size: 9, # write_version uses the file name for the file content version: 1, hasMessageDigests: [ { type: 'md5', digest: '3e25960a79dbc69b674cd4ec67a72c62' } @@ -253,6 +256,7 @@ type: Cocina::Models::ObjectType.file, label: 'a file only in version 2', filename: 'files/file2.txt', + size: 15, # write_version uses the file name for the file content version: 1, hasMessageDigests: [ { type: 'md5', digest: '5997de4d5abb55f21f652aa61b8f3aaf' } @@ -282,10 +286,10 @@ end it 'returns array of files by md5' do - expect(service.files_by_md5).to eq [ - { "3e25960a79dbc69b674cd4ec67a72c62" => "file2.txt" }, - { "3497de4d5abb55f21f652aa61b8f3abd" => "files/file0.txt" }, - { "5997de4d5abb55f21f652aa61b8f3aaf" => "files/file2.txt" } + expect(service.file_details_by_md5).to eq [ + { "3e25960a79dbc69b674cd4ec67a72c62" => { "filename" => "file2.txt", "size" => 9 } }, + { "3497de4d5abb55f21f652aa61b8f3abd" => { "filename" => "files/file0.txt", "size" => 15 } }, + { "5997de4d5abb55f21f652aa61b8f3aaf" => { "filename" => "files/file2.txt", "size" => 15 } } ] end end From b63fd3fd094ee00dc173c3e91b1e68b9effcab6b Mon Sep 17 00:00:00 2001 From: Johnathan Martin Date: Fri, 25 Oct 2024 01:05:20 -0700 Subject: [PATCH 2/2] refactore VersionedFilesService::Cocina#file_details_by_md5 to return a FileDetails struct instead of a hash keyed on md5 (has was always one element and no lookup was ever done) --- app/services/files_by_md5_service.rb | 23 +++++++++---------- .../versioned_files_service/cocina.rb | 15 ++++-------- .../versioned_files_service/object.rb | 8 +++---- .../versioned_files_service/object_spec.rb | 6 ++--- 4 files changed, 22 insertions(+), 30 deletions(-) diff --git a/app/services/files_by_md5_service.rb b/app/services/files_by_md5_service.rb index 24b7c5d0..fc4e0042 100644 --- a/app/services/files_by_md5_service.rb +++ b/app/services/files_by_md5_service.rb @@ -30,12 +30,12 @@ def call delegate :druid, to: :purl def versioned_files_by_md5 - correct_and_present_file_details = versioned_files_object.file_details_by_md5.select do |md5_file_details| - md5_filename = versioned_files_object.content_path_for(md5: md5_file_details.keys.first) - md5_filesize = md5_file_details.values.first['size'] + correct_and_present_files = versioned_files_object.file_details_by_md5.select do |file_details| + md5_filename = versioned_files_object.content_path_for(md5: file_details.md5) + md5_filesize = file_details.filesize check_exists_and_complete(md5_filename, md5_filesize) end - filenames_by_md5(correct_and_present_file_details) + filenames_by_md5(correct_and_present_files) end def versioned_files_object @@ -48,18 +48,17 @@ def unversioned_files_by_md5 return [] unless purl.public_json cocina = VersionedFilesService::Cocina.new(hash: purl.public_json.cocina_hash) - correct_and_present_file_details = cocina.file_details_by_md5.select do |md5_file_details| - md5_filename = versioned_files_object.stacks_object_path.join(md5_file_details.values.first['filename']) - md5_filesize = md5_file_details.values.first['size'] + correct_and_present_files = cocina.file_details_by_md5.select do |file_details| + md5_filename = versioned_files_object.stacks_object_path.join(file_details.filename) + md5_filesize = file_details.filesize check_exists_and_complete(md5_filename, md5_filesize) end - filenames_by_md5(correct_and_present_file_details) + filenames_by_md5(correct_and_present_files) end - def filenames_by_md5(files) - files.map do |md5_file_details| - md5 = md5_file_details.keys.first - { md5 => md5_file_details[md5]['filename'] } + def filenames_by_md5(file_details_list) + file_details_list.map do |file_details| + { file_details.md5 => file_details.filename } end end diff --git a/app/services/versioned_files_service/cocina.rb b/app/services/versioned_files_service/cocina.rb index 940036ea..9ad2f9f9 100644 --- a/app/services/versioned_files_service/cocina.rb +++ b/app/services/versioned_files_service/cocina.rb @@ -1,6 +1,8 @@ class VersionedFilesService # Class for Cocina that doesn't rely on Dro::Models, since no guarantee that JSON will conform with latest. class Cocina + FileDetails = Struct.new('FileDetails', :md5, :filename, :filesize) + # @param [String] druid the druid # @param [String] version the version # @return [Cocina] the Cocina object @@ -23,18 +25,11 @@ def shelve_file_map end # @return [Array>] array of hashes with md5 as key and filename as value for shelved files - # For example: [ - # { "5b79c8570b7ef582735f912aa24ce5f2" => { "filename" => "2542A.tiff", "size" => 456 } }, - # { "cd5ca5c4666cfd5ce0e9dc8c83461d7a" => { "filename" => "2542A.jp2", "size" => 123 } } - # ] + # For example: [#, + # #] def file_details_by_md5 @file_details_by_md5 ||= shelved_files.map do |file| - { - md5_for(file) => { - 'filename' => file.fetch('filename'), - 'size' => file.fetch('size') - } - } + FileDetails.new(md5: md5_for(file), filename: file.fetch('filename'), filesize: file.fetch('size')) end end diff --git a/app/services/versioned_files_service/object.rb b/app/services/versioned_files_service/object.rb index e3fdca51..4a4d8da3 100644 --- a/app/services/versioned_files_service/object.rb +++ b/app/services/versioned_files_service/object.rb @@ -30,11 +30,9 @@ def version_manifest @version_manifest ||= VersionsManifest.new(path: paths.versions_manifest_path) end - # @return [Array>] array of hashes with md5 as key and filename as value for shelved files for all versions. - # For example: [ - # { "5b79c8570b7ef582735f912aa24ce5f2" => { "filename" => "2542A.tiff", "size" => 456 } }, - # { "cd5ca5c4666cfd5ce0e9dc8c83461d7a" => { "filename" => "2542A.jp2", "size" => 123 } } - # ] + # @return [Array] array of hashes with md5 as key and filename as value for shelved files for all versions. + # For example: [#, + # #] def file_details_by_md5 versions.flat_map { |version| Cocina.for(druid:, version:).file_details_by_md5 }.uniq end diff --git a/spec/services/versioned_files_service/object_spec.rb b/spec/services/versioned_files_service/object_spec.rb index 8a926ba5..5280f9b8 100644 --- a/spec/services/versioned_files_service/object_spec.rb +++ b/spec/services/versioned_files_service/object_spec.rb @@ -287,9 +287,9 @@ it 'returns array of files by md5' do expect(service.file_details_by_md5).to eq [ - { "3e25960a79dbc69b674cd4ec67a72c62" => { "filename" => "file2.txt", "size" => 9 } }, - { "3497de4d5abb55f21f652aa61b8f3abd" => { "filename" => "files/file0.txt", "size" => 15 } }, - { "5997de4d5abb55f21f652aa61b8f3aaf" => { "filename" => "files/file2.txt", "size" => 15 } } + VersionedFilesService::Cocina::FileDetails.new(md5: '3e25960a79dbc69b674cd4ec67a72c62', filename: 'file2.txt', filesize: 9), + VersionedFilesService::Cocina::FileDetails.new(md5: '3497de4d5abb55f21f652aa61b8f3abd', filename: 'files/file0.txt', filesize: 15), + VersionedFilesService::Cocina::FileDetails.new(md5: '5997de4d5abb55f21f652aa61b8f3aaf', filename: 'files/file2.txt', filesize: 15) ] end end