diff --git a/app/services/files_by_md5_service.rb b/app/services/files_by_md5_service.rb index 142d08d2..fc4e0042 100644 --- a/app/services/files_by_md5_service.rb +++ b/app/services/files_by_md5_service.rb @@ -30,7 +30,12 @@ def call delegate :druid, to: :purl def versioned_files_by_md5 - versioned_files_object.files_by_md5.select { |md5_file| check_exists(versioned_files_object.content_path_for(md5: md5_file.keys.first)) } + correct_and_present_files = versioned_files_object.file_details_by_md5.select do |file_details| + md5_filename = versioned_files_object.content_path_for(md5: file_details.md5) + md5_filesize = file_details.filesize + check_exists_and_complete(md5_filename, md5_filesize) + end + filenames_by_md5(correct_and_present_files) end def versioned_files_object @@ -43,15 +48,35 @@ def unversioned_files_by_md5 return [] unless purl.public_json cocina = VersionedFilesService::Cocina.new(hash: purl.public_json.cocina_hash) - cocina.files_by_md5.select { |md5_file| check_exists(versioned_files_object.stacks_object_path.join(md5_file.values.first)) } + correct_and_present_files = cocina.file_details_by_md5.select do |file_details| + md5_filename = versioned_files_object.stacks_object_path.join(file_details.filename) + md5_filesize = file_details.filesize + check_exists_and_complete(md5_filename, md5_filesize) + end + filenames_by_md5(correct_and_present_files) end - def check_exists(path) - if path.exist? - true - else - Honeybadger.notify("File missing from shelves", context: { path: path.to_s, druid: }) - false + def filenames_by_md5(file_details_list) + file_details_list.map do |file_details| + { file_details.md5 => file_details.filename } + end + end + + def check_exists_and_complete(path, expected_size) + context = { path: path.to_s, druid:, expected_size: } + + unless path.exist? + Honeybadger.notify("File missing from shelves", context:) + return false end + + actual_size = File.size(path) + if actual_size != expected_size + context[:actual_size] = actual_size + Honeybadger.notify("File path present on shelves but file isn't the expected size. It's likely a bug shelved the wrong content.", context:) + return false + end + + true end end diff --git a/app/services/versioned_files_service/cocina.rb b/app/services/versioned_files_service/cocina.rb index 39ae0012..9ad2f9f9 100644 --- a/app/services/versioned_files_service/cocina.rb +++ b/app/services/versioned_files_service/cocina.rb @@ -1,6 +1,8 @@ class VersionedFilesService # Class for Cocina that doesn't rely on Dro::Models, since no guarantee that JSON will conform with latest. class Cocina + FileDetails = Struct.new('FileDetails', :md5, :filename, :filesize) + # @param [String] druid the druid # @param [String] version the version # @return [Cocina] the Cocina object @@ -23,13 +25,11 @@ def shelve_file_map end # @return [Array>] array of hashes with md5 as key and filename as value for shelved files - # For example: [ - # { "5b79c8570b7ef582735f912aa24ce5f2" => "2542A.tiff" }, - # { "cd5ca5c4666cfd5ce0e9dc8c83461d7a" => "2542A.jp2" } - # ] - def files_by_md5 - @files_by_md5 ||= shelved_files.map do |file| - { md5_for(file) => file.fetch('filename') } + # For example: [#, + # #] + def file_details_by_md5 + @file_details_by_md5 ||= shelved_files.map do |file| + FileDetails.new(md5: md5_for(file), filename: file.fetch('filename'), filesize: file.fetch('size')) end end diff --git a/app/services/versioned_files_service/object.rb b/app/services/versioned_files_service/object.rb index 1b797c36..4a4d8da3 100644 --- a/app/services/versioned_files_service/object.rb +++ b/app/services/versioned_files_service/object.rb @@ -30,13 +30,11 @@ def version_manifest @version_manifest ||= VersionsManifest.new(path: paths.versions_manifest_path) end - # @return [Array>] array of hashes with md5 as key and filename as value for shelved files for all versions. - # For example: [ - # { "5b79c8570b7ef582735f912aa24ce5f2" => "2542A.tiff" }, - # { "cd5ca5c4666cfd5ce0e9dc8c83461d7a" => "2542A.jp2" } - # ] - def files_by_md5 - versions.flat_map { |version| Cocina.for(druid:, version:).files_by_md5 }.uniq + # @return [Array] array of hashes with md5 as key and filename as value for shelved files for all versions. + # For example: [#, + # #] + def file_details_by_md5 + versions.flat_map { |version| Cocina.for(druid:, version:).file_details_by_md5 }.uniq end private diff --git a/spec/services/files_by_md5_service_spec.rb b/spec/services/files_by_md5_service_spec.rb index ebfb8caf..04f60222 100644 --- a/spec/services/files_by_md5_service_spec.rb +++ b/spec/services/files_by_md5_service_spec.rb @@ -17,15 +17,36 @@ end context 'when object is unversioned' do - before do - FileUtils.mkdir_p(stacks_object_path) - File.write("#{stacks_object_path}/2542A.jp2", '2542A.jp2') + context 'when all files are either missing or truncated' do + before do + FileUtils.mkdir_p(stacks_object_path) + File.write("#{stacks_object_path}/2542A.jp2", '2542A.jp2') # this file is smaller than the expected size + end + + it 'returns the files by md5' do + expect(files_by_md5).to eq([]) + + context = { path: "#{stacks_object_path}/2542A.tiff", druid:, expected_size: 3_182_927 } + expect(Honeybadger).to have_received(:notify).with("File missing from shelves", context:) + context = { path: "#{stacks_object_path}/2542A.jp2", druid:, expected_size: 11_043, actual_size: 9 } + expect(Honeybadger).to have_received(:notify).with("File path present on shelves but file isn't the expected size. It's likely a bug shelved the wrong content.", context:) + end end - it 'returns the files by md5' do - expect(files_by_md5).to eq([{ "cd5ca5c4666cfd5ce0e9dc8c83461d7a" => "2542A.jp2" }]) + context 'when one of the two files is present at the expected size' do + before do + FileUtils.mkdir_p(stacks_object_path) + File.write("#{stacks_object_path}/2542A.jp2", '1' * 11_043) + end + + it 'returns the files by md5' do + expect(files_by_md5).to eq([{ "cd5ca5c4666cfd5ce0e9dc8c83461d7a" => "2542A.jp2" }]) - expect(Honeybadger).to have_received(:notify).with("File missing from shelves", context: { path: "#{stacks_object_path}/2542A.tiff", druid: }) + context = { path: "#{stacks_object_path}/2542A.tiff", druid:, expected_size: 3_182_927 } + expect(Honeybadger).to have_received(:notify).with("File missing from shelves", context:) + context = an_instance_of(Hash) + expect(Honeybadger).not_to have_received(:notify).with("File path present on shelves but file isn't the expected size. It's likely a bug shelved the wrong content.", context:) + end end end @@ -49,6 +70,7 @@ type: Cocina::Models::ObjectType.file, label: 'the regular file', filename: 'file2.txt', + size: 9, # write_version uses the file name for the file content version: 1, hasMessageDigests: [ { type: 'md5', digest: '3e25960a79dbc69b674cd4ec67a72c62' } @@ -63,6 +85,7 @@ type: Cocina::Models::ObjectType.file, label: 'the hierarchical file', filename: 'files/file2.txt', + size: 15, # write_version uses the file name for the file content version: 1, hasMessageDigests: [ { type: 'md5', digest: '5997de4d5abb55f21f652aa61b8f3aaf' } @@ -92,7 +115,7 @@ expect(files_by_md5).to eq([ { "3e25960a79dbc69b674cd4ec67a72c62" => "file2.txt" } ]) - expect(Honeybadger).to have_received(:notify).with("File missing from shelves", context: { path: "#{content_path}/5997de4d5abb55f21f652aa61b8f3aaf", druid: }) + expect(Honeybadger).to have_received(:notify).with("File missing from shelves", context: { path: "#{content_path}/5997de4d5abb55f21f652aa61b8f3aaf", druid:, expected_size: 15 }) end end end diff --git a/spec/services/versioned_files_service/object_spec.rb b/spec/services/versioned_files_service/object_spec.rb index ca2114fd..5280f9b8 100644 --- a/spec/services/versioned_files_service/object_spec.rb +++ b/spec/services/versioned_files_service/object_spec.rb @@ -193,6 +193,7 @@ type: Cocina::Models::ObjectType.file, label: 'the regular file', filename: 'file2.txt', + size: 9, # write_version uses the file name for the file content version: 1, hasMessageDigests: [ { type: 'md5', digest: '3e25960a79dbc69b674cd4ec67a72c62' } @@ -207,6 +208,7 @@ type: Cocina::Models::ObjectType.file, label: 'a file only in version 1', filename: 'files/file0.txt', + size: 15, # write_version uses the file name for the file content version: 1, hasMessageDigests: [ { type: 'md5', digest: '3497de4d5abb55f21f652aa61b8f3abd' } @@ -239,6 +241,7 @@ type: Cocina::Models::ObjectType.file, label: 'the regular file', filename: 'file2.txt', + size: 9, # write_version uses the file name for the file content version: 1, hasMessageDigests: [ { type: 'md5', digest: '3e25960a79dbc69b674cd4ec67a72c62' } @@ -253,6 +256,7 @@ type: Cocina::Models::ObjectType.file, label: 'a file only in version 2', filename: 'files/file2.txt', + size: 15, # write_version uses the file name for the file content version: 1, hasMessageDigests: [ { type: 'md5', digest: '5997de4d5abb55f21f652aa61b8f3aaf' } @@ -282,10 +286,10 @@ end it 'returns array of files by md5' do - expect(service.files_by_md5).to eq [ - { "3e25960a79dbc69b674cd4ec67a72c62" => "file2.txt" }, - { "3497de4d5abb55f21f652aa61b8f3abd" => "files/file0.txt" }, - { "5997de4d5abb55f21f652aa61b8f3aaf" => "files/file2.txt" } + expect(service.file_details_by_md5).to eq [ + VersionedFilesService::Cocina::FileDetails.new(md5: '3e25960a79dbc69b674cd4ec67a72c62', filename: 'file2.txt', filesize: 9), + VersionedFilesService::Cocina::FileDetails.new(md5: '3497de4d5abb55f21f652aa61b8f3abd', filename: 'files/file0.txt', filesize: 15), + VersionedFilesService::Cocina::FileDetails.new(md5: '5997de4d5abb55f21f652aa61b8f3aaf', filename: 'files/file2.txt', filesize: 15) ] end end