diff --git a/.gitignore b/.gitignore index 5e1422c..b36043b 100644 --- a/.gitignore +++ b/.gitignore @@ -9,6 +9,8 @@ /test/tmp/ /test/version_tmp/ /tmp/ +/log/ + # Used by dotenv library to load environment variables. # .env @@ -48,3 +50,7 @@ build-iPhoneSimulator/ # unless supporting rvm < 1.11.0 or doing something fancy, ignore this: .rvmrc + +# +*.pyc +.pytest_cache \ No newline at end of file diff --git a/.rubocop.yml b/.rubocop.yml index d85821c..786ec1b 100644 --- a/.rubocop.yml +++ b/.rubocop.yml @@ -11,7 +11,8 @@ AllCops: - 'tmp/**/*' - 'vendor/**/*' ExtraDetails: true - TargetRubyVersion: 2.3 + NewCops: enable + TargetRubyVersion: 2.5 # readability is Actually Good Layout/EmptyLinesAroundClassBody: @@ -21,6 +22,10 @@ Layout/IndentationConsistency: Enabled: true EnforcedStyle: normal +Layout/LineLength: + Enabled: true + Max: 120 # default is 80 + # A calculated magnitude based on number of assignments, # branches, and conditions. Metrics/AbcSize: @@ -34,10 +39,6 @@ Metrics/ClassLength: Metrics/CyclomaticComplexity: Enabled: false -Metrics/LineLength: - Enabled: true - Max: 120 # default is 80 - # Avoid methods longer than 10 lines of code. 
Metrics/MethodLength: Enabled: false @@ -52,6 +53,12 @@ Metrics/ModuleLength: Metrics/PerceivedComplexity: Enabled: false +Naming/FileName: + Exclude: + - Dangerfile + - Rakefile + - Gemfile + # indentation is an endangered resource Style/ClassAndModuleChildren: EnforcedStyle: compact @@ -59,13 +66,7 @@ Style/ClassAndModuleChildren: Style/Documentation: Enabled: false -Naming/FileName: - Exclude: - - Dangerfile - - Rakefile - - Gemfile - -# Checks if there is a magic comment to enforce string literals + # Checks if there is a magic comment to enforce string literals Style/FrozenStringLiteralComment: Enabled: false diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index 632de4b..0000000 --- a/.travis.yml +++ /dev/null @@ -1,12 +0,0 @@ -language: ruby -sudo: false -cache: bundler - -before_install: - - gem install bundler - - gem update bundler - -rvm: - - 2.4.5 - - 2.5.3 - - 2.6.1 diff --git a/Gemfile b/Gemfile index c171655..6510ccc 100644 --- a/Gemfile +++ b/Gemfile @@ -1,10 +1,10 @@ source 'https://rubygems.org' -gem 'bundler', '~> 1.17' +gem 'bundler', '~> 2.0' gem 'http-cookie' -gem 'json' +gem 'json', '>= 2.3.0' gem 'logger' -gem 'rake' +gem 'rake', '>= 12.3.3' gem 'swift_ingest', '~> 0.4.0' group :development, :test do diff --git a/Gemfile.lock b/Gemfile.lock index a1631db..84b5707 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -1,79 +1,81 @@ GEM remote: https://rubygems.org/ specs: - activesupport (5.2.3) + activesupport (5.2.4.5) concurrent-ruby (~> 1.0, >= 1.0.2) i18n (>= 0.7, < 2) minitest (~> 5.1) tzinfo (~> 1.1) - addressable (2.6.0) - public_suffix (>= 2.0.2, < 4.0) - ast (2.4.0) - concurrent-ruby (1.1.5) - crack (0.4.3) - safe_yaml (~> 1.0.0) - domain_name (0.5.20180417) + addressable (2.7.0) + public_suffix (>= 2.0.2, < 5.0) + ast (2.4.2) + concurrent-ruby (1.1.8) + crack (0.4.5) + rexml + domain_name (0.5.20190701) unf (>= 0.0.5, < 1.0.0) - hashdiff (0.3.8) + hashdiff (1.0.1) http-cookie (1.0.3) domain_name (~> 0.5) - i18n (1.6.0) + 
i18n (1.8.9) concurrent-ruby (~> 1.0) - jaro_winkler (1.5.2) - json (2.2.0) - logger (1.3.0) - minitest (5.11.3) + json (2.5.1) + logger (1.4.3) + minitest (5.14.3) mysql2 (0.4.10) - openstack (3.3.20) + openstack (3.3.21) json - parallel (1.17.0) - parser (2.6.2.1) - ast (~> 2.4.0) - power_assert (1.1.4) - psych (3.1.0) - public_suffix (3.0.3) + parallel (1.20.1) + parser (3.0.0.0) + ast (~> 2.4.1) + power_assert (2.0.0) + public_suffix (4.0.6) rainbow (3.0.0) - rake (12.3.2) - rubocop (0.67.2) - jaro_winkler (~> 1.5.1) + rake (13.0.3) + regexp_parser (2.0.3) + rexml (3.2.4) + rubocop (0.93.1) parallel (~> 1.10) - parser (>= 2.5, != 2.5.1.1) - psych (>= 3.1.0) + parser (>= 2.7.1.5) rainbow (>= 2.2.2, < 4.0) + regexp_parser (>= 1.8) + rexml + rubocop-ast (>= 0.6.0) ruby-progressbar (~> 1.7) - unicode-display_width (>= 1.4.0, < 1.6) + unicode-display_width (>= 1.4.0, < 2.0) + rubocop-ast (1.4.1) + parser (>= 2.7.1.5) rubocop-rspec (1.15.1) rubocop (>= 0.42.0) - ruby-progressbar (1.10.0) - safe_yaml (1.0.5) + ruby-progressbar (1.11.0) swift_ingest (0.4.1) activesupport (~> 5.0) mysql2 (~> 0.4.6) openstack (~> 3.3, >= 3.3.10) - test-unit (3.3.1) + test-unit (3.4.0) power_assert thread_safe (0.3.6) - tzinfo (1.2.5) + tzinfo (1.2.9) thread_safe (~> 0.1) unf (0.1.4) unf_ext - unf_ext (0.0.7.5) - unicode-display_width (1.5.0) + unf_ext (0.0.7.7) + unicode-display_width (1.7.0) vcr (3.0.3) - webmock (3.5.1) + webmock (3.11.2) addressable (>= 2.3.6) crack (>= 0.3.2) - hashdiff + hashdiff (>= 0.4.0, < 2.0.0) PLATFORMS ruby DEPENDENCIES - bundler (~> 1.17) + bundler (~> 2.0) http-cookie - json + json (>= 2.3.0) logger - rake + rake (>= 12.3.3) rubocop (~> 0.51) rubocop-rspec (~> 1.15.1) swift_ingest (~> 0.4.0) @@ -82,4 +84,4 @@ DEPENDENCIES webmock BUNDLED WITH - 1.17.3 + 2.2.11 diff --git a/README.md b/README.md index 04eed0b..2f94edb 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,8 @@ # CWRC Preservation -The CWRC Preservation toolkit contains Ruby applications for 
preserve content from the CWRC (cwrc.ca) repository. The primary objective is to manage the flow of content from the CWRC repository into an OpenStack Swift repository for preservation. Also, the repository provides an application to audit the contents of the source and preserved objects. The preservation tool is meant to run behind a firewall thus pulling content from CWRC. +> :warning: **These command-line scripts are only compatible with CWRC v1.0**. The next release of CWRC (Islandora v2.0 / Drupal 9+) renders these scripts obsolete; therefore, this repo is minimally supported: the URI obsolete warning will not be fixed (prefixing the associated command with `RUBYOPT='-W0'` suppresses the warning) and code clean-up will be limited. + +The CWRC Preservation toolkit contains Ruby applications for preserving content from the CWRC (cwrc.ca) repository. The primary objective is to manage the flow of content from the CWRC repository into an OpenStack Swift repository for preservation. Also, the repository provides an application to audit the contents of the source and preserved objects. The preservation tool is meant to run behind a firewall thus pulling content from CWRC. 
The two main applications are: @@ -10,30 +12,30 @@ The two main applications are: ## Workflow - cwrc_preserver.rb executes at a regular interval - - sends request to the CWRC repository with authentication parameters that produces a manifest list of objects residing with the CWRC repository as a response + - sends a request to the CWRC repository with authentication parameters that produces a manifest list of objects residing with the CWRC repository as a response - for each CWRC repository object, inspect the preserved object - - if the preserved copy does not exist or is outdated (comparing CWRC manifest timestamp to the timestamp on the preserved copy), request a new AIP (Bag) from the CWRC repository and deposit within the preservation environment + - if the preserved copy does not exist or is outdated (comparing CWRC manifest timestamp to the timestamp on the preserved copy, the swift object custom metadata field 'last-mod-timestamp'), request a new AIP (Bag) from the CWRC repository and deposit within the preservation environment - generate an audit report via cwrc_audit_report.rb - - sends request to CWRC repository with authentication parameters to produce a manifest list of objects residing with the CWRC repository - - sends request to preservation environment with authentication parameters to produce a manifest list of objects residing with the preservation environment + - sends a request to CWRC repository with authentication parameters to produce a manifest list of objects residing with the CWRC repository + - sends a request to preservation environment with authentication parameters to produce a manifest list of objects residing with the preservation environment - merge lists and output as a CSV file for interpretation / review (e.g., within a spreadsheet tool) ## Requirements - Ruby 2.3+ - +- Associated Gems via `bundle install` - CWRC API endpoint: https://github.com/cwrc/islandora_bagit_extension - - Configuration file - use 
[secrets_example.yml](secrets_example.yml) as a starting point and the `-C --config PATH` to specify the config file to utilize. -``` +``` txt # Openstack Swift parameters -SWIFT_TENANT: +SWIFT_AUTH_URL: SWIFT_USERNAME: SWIFT_PASSWORD: -SWIFT_AUTH_URL: +SWIFT_USER_DOMAIN_NAME: SWIFT_PROJECT_DOMAIN_NAME: -SWIFT_PROJECT: +SWIFT_PROJECT_DOMAIN_ID: +SWIFT_PROJECT_NAME: CWRC_SWIFT_CONTAINER: CWRC_PROJECT_NAME: @@ -66,7 +68,7 @@ SWIFT_ARCHIVED_OK: ![Preservation System Diagram (PNG 50px/cm)](docs/images/cwrc_preservation.png) -This application connects to the CWRC repository and the preservation environment, determines which CWRC objects need preservation (e.g., missing from the preservation environment or the preservation environment contains a stale copy) and deposits a copy within the preservation environment. Optionally, the command-line allows defining a list of object ids to trigger a forced preservation event for each specified object. The application uses a config file specified on the command-line to contain properties (e.g. authentication). Two files are created: +This application connects to the CWRC repository and the preservation environment, determines which CWRC objects need preservation (e.g., missing from the preservation environment or the preservation environment contains a stale copy) and deposits a copy within the preservation environment (with CWRC modification time metadata in ['last-mod-timestamp']). Optionally, the command-line allows defining a list of object ids to trigger a forced preservation event for each specified object. The application uses a config file specified on the command-line to contain properties (e.g. authentication). 
Two files are created: - swift_archived_objs.txt: lists the IDs, size and archive rate of all CWRC successfully preserved objects, - swift_failed_objs.txt: lists all CWRC objects that failed preservation - this will need review and are candidates for reprocessing (hence -r parameter) @@ -77,7 +79,7 @@ Common usage: - query CWRC repository items modified since a given date/time and preserve if needed (example #2) - pass defined list of items and force preservation (example #3) -``` +``` txt Usage: cwrc_preserver [options] options: @@ -108,11 +110,36 @@ Example #3 - process objects via a list and with a forced update (i.e., deposit ./cwrc_preserver.rb -d --config="/opt/conf/cwrc_preserver_conf.yml" --reprocess=/tmp/cwrc_pid_list_one_per_line | tee /tmp/stdout_debug.txt ``` +#### Results in Swift + +Note the custom Swift object metadata item `Last-Mod-Timestamp`: this is the cwrc.ca Islandora7 last modification timestamp on the object (used by the audit as one factor in determining whether or not the Swift instance needs to be updated). 
+ +``` bash +$ swift stat cwrc-test islandora:root + Account: AUTH_0d17ddb0b6834fc5be902e1a2df6f17b + Container: cwrc-test + Object: islandora:root + Content Type: application/zip + Content Length: 10083 + Last Modified: Tue, 15 Aug 2023 15:08:50 GMT + ETag: 04009a3f93fd2c9b38706bddda4f86ea + Meta Project-Id: islandora:root + Meta Promise: bronze + Meta Aip-Version: 1.0 + Meta Last-Mod-Timestamp: 2018-05-16T14:36:26.691Z + X-Timestamp: 1692112129.62648 + Accept-Ranges: bytes + X-Trans-Id: txa72dc123c3784c959fa89-0064db97f9 + X-Openstack-Request-Id: txa72dc123c3784c959fa89-0064db97f9 +Strict-Transport-Security: max-age=15768000 +``` + + ### Reporting / Auditing: cwrc_audit_report.rb -# ![Audit System Diagram (PNG 50px/cm)](docs/images/cwrc_preservation_audit.png) +## ![Audit System Diagram (PNG 50px/cm)](docs/images/cwrc_preservation_audit.png) ```shell Usage: cwrc_audit_report [options] @@ -123,7 +150,7 @@ Usage: cwrc_audit_report [options] Builds a CSV formatted audit report comparing content within the CWRC repository relative to UAL's OpenStack Swift preserved content. -The report pulls input from two disparate sources: CWRC repository and UAL OpenStack Swift preservation service. The report links the content based on object id and outputs the linked information in csv rows that included the fields: the CWRC object PIDs and modification date/times, UAL Swift ID, modification time, and size along with a column indicating the preservation status (i.e., indicating if modification time comparison between Swift and CWRC indicates a need for preservation, or if the size of the Swift object is zero, etc). +The report pulls input from two disparate sources: CWRC repository and UAL OpenStack Swift preservation service. 
The report links the content based on object id and outputs the linked information in csv rows that included the fields: the CWRC object PIDs and modification date/times, UAL Swift ID, modification time (metadata['last-mod-timestamp']), and size along with a column indicating the preservation status (i.e., indicating if modification time comparison between Swift and CWRC indicates a need for preservation, or if the size of the Swift object is zero, etc). The output format is CSV with the following header columns: @@ -131,7 +158,7 @@ The output format is CSV with the following header columns: CWRC PID, CWRC modification, Swift ID, - Swift modification time, + Swift modification time (metadata['last-mod-timestamp']), Swift size, Status diff --git a/cwrc_audit_report.rb b/cwrc_audit_report.rb index 0b7ed50..c451d0e 100755 --- a/cwrc_audit_report.rb +++ b/cwrc_audit_report.rb @@ -10,7 +10,9 @@ # modification time, and size along with a column indicating the preservation # status (i.e., indicating if modification time comparison between Swift and # CWRC indicates a need for preservation, or if the size of the Swift object -# is zero, etc) +# is zero, etc). As content within CWRC is created, updated, or deleted +# Swift may contain items not in CWRC (i.e., deleted by CWRC), outdated +# (updated within CWRC) or missing (i.e., newly added to CWRC) # # The output format is CSV with the following header columns: @@ -35,14 +37,31 @@ # -s --summary summary output where status in not 'ok' # +# TODO: enhance performance while limiting memory usage - doesn't work as need the custom metadata 'last-mod-timestamp' +# 1. output swift_container.objects_detail (i.e., no mapping to CWRC object) +# 2. 
iterate through swift_container.objects_detail via markers (i.e., pages) +# - add CWRC object details +# - when finished Swift object list, add remaining CWRC objects not in Swift +# https://github.com/ruby-openstack/ruby-openstack/wiki/Object-Storage +# response: "cwrc_0c168793-b1ff-453f-a1f6-e1d75f7350be"=>{ +# :bytes=>"5939", +# :content_type=>"application/x-tar", +# :last_modified=>"2018-02-05T06:45:23.422720", +# :hash=>"dd2b11f239f7f25fb504519b612cf896" +# }, + require 'logger' require 'optparse' require 'time' require 'swift_ingest' +require 'csv' require_relative 'cwrc_common' module CWRCPreserver + # swift + SWIFT_LIMIT = 10_000 + # status IDs STATUS_OK = ''.freeze STATUS_E_SIZE = 's'.freeze # error: size zero or too small @@ -81,29 +100,31 @@ module CWRCPreserver raise CWRCArchivingError if swift_con.nil? # query Swift storage for a list of objects - # https://github.com/ruby-openstack/ruby-openstack/wiki/Object-Storage - # https://github.com/ruby-openstack/ruby-openstack/wiki/Object-Storage - # response: "cwrc_0c168793-b1ff-453f-a1f6-e1d75f7350be"=>{ - # :bytes=>"5939", - # :content_type=>"application/x-tar", - # :last_modified=>"2018-02-05T06:45:23.422720", - # :hash=>"dd2b11f239f7f25fb504519b612cf896" - # }, - swift_container = swift_con.swift_connection.container(swift_con.project) - # Iterate via markers # https://github.com/ruby-openstack/ruby-openstack/blob/d9c8aa19488062e483771a9168d24f2626fe688b/lib/openstack/swift/container.rb#L100 - swift_objs = swift_container.objects - while swift_objs.count < swift_container.container_metadata[:count].to_i - swift_objs = swift_objs.merge(swift_container.objects(marker: swift_objs.keys.last)) + # Gotcha: 2021-02-16 - iterating via markers while objects are also added to Swift + # may lead to a race condition as the container metadata count changes but the + # the items added while iterating by marker don't get returned by the marker iteration + # see previous commits for the problematic version + 
swift_container = swift_con.swift_connection.container(ENV['CWRC_SWIFT_CONTAINER']) + swift_count = swift_container.container_metadata[:count].to_i + swift_objs = swift_container.objects(limit: SWIFT_LIMIT) + while swift_objs.count < swift_count + swift_objs += swift_container.objects(limit: SWIFT_LIMIT, marker: swift_objs.last) end + # TODO: does hash use too much memory? + swift_id_hash = Hash[swift_objs.map.with_index.to_a] # TODO: use CSV gem # CSV header - puts "cwrc_pid (#{cwrc_objs.count}),"\ - "cwrc_mtime (#{Time.now.iso8601}),"\ - "swift_id (#{swift_container.container_metadata[:count]}),"\ - 'swift_timestamp,swift_bytes,status' + puts CSV.generate_line([ + "cwrc_pid (#{cwrc_objs.count})", + "cwrc_mtime (#{Time.now.iso8601})", + "swift_id (#{swift_container.container_metadata[:count]})", + 'swift_timestamp metadata[last-mod-timestamp]', + 'swift_bytes', + 'status (x=outdated/missing; d=missing in CWRC; s=size suspect)' + ]) # TODO: find a better way to merge CWRC and Swift hashes into an output format # for each cwrc object @@ -112,10 +133,12 @@ module CWRCPreserver cwrc_mtime = cwrc_obj['timestamp'] swift_id = cwrc_pid - if swift_objs.key?(swift_id) - swift_obj = swift_container.objects(swift_id) - swift_timestamp = swift_obj.metadata['last-mod-timestamp'] - swift_bytes = swift_obj.bytes + # TODO: .include? 
slow with 400K items; try bsearch and if not use hash + if swift_id_hash.key?(swift_id) + swift_obj = swift_container.object(swift_id) + swift_obj_metadata = swift_obj.object_metadata + swift_timestamp = swift_obj_metadata[:metadata]['last-mod-timestamp'] + swift_bytes = swift_obj_metadata[:bytes] # note: CWRC uses zulu while Swift is local timezone (assumption) # If timestamps don't match then report Swift object older than CWRC status = if Time.parse(cwrc_mtime) > Time.parse(swift_timestamp) @@ -136,13 +159,29 @@ module CWRCPreserver # CSV content if !opt_summary_output || (opt_summary_output && status != STATUS_OK) - puts "#{cwrc_pid},#{cwrc_mtime},#{swift_id},#{swift_timestamp},#{swift_bytes},#{status}" + puts CSV.generate_line([ + cwrc_pid, + cwrc_mtime, + swift_id, + swift_timestamp, + swift_bytes, + status + ]) end end # find the remaining Swift objects that don't have corresponding items in CWRC - swift_objs&.each do |key, swift_obj| + swift_objs&.each do |swift_id| + swift_obj = swift_container.object(swift_id) + swift_obj_metadata = swift_obj.object_metadata # CSV content - puts ",,#{key},#{swift_obj[:last_modified]},#{swift_obj[:bytes]},#{STATUS_I_DEL}" + puts CSV.generate_line([ + '', + '', + swift_obj.name, + swift_obj_metadata[:metadata]['last-mod-timestamp'], + swift_obj_metadata[:bytes], + STATUS_I_DEL + ]) end end diff --git a/cwrc_common.rb b/cwrc_common.rb index 8120e3d..07e1295 100755 --- a/cwrc_common.rb +++ b/cwrc_common.rb @@ -4,6 +4,8 @@ require 'json' require 'http-cookie' +require_relative 'ingestor' + module CWRCPreserver class CWRCArchivingError < StandardError; end @@ -53,7 +55,9 @@ def self.get_cwrc_objs(cookie, timestamp) raise CWRCArchivingError unless all_obj_response.is_a? Net::HTTPSuccess - all_obj_response.body.slice! timestamp + # 2023-08-16: I don't think this is needed - if input is '2023-07-15' then first item in the json body + # is changed from "2023-07-15T23:06:50.145Z" to "T23:06:50.145Z" + # all_obj_response.body.slice! 
timestamp JSON.parse(all_obj_response.body)['objects'] end @@ -61,6 +65,7 @@ def self.get_cwrc_objs(cookie, timestamp) # retry in event the server or network connection # only save a file if successful # http://ruby-doc.org/stdlib-2.5.1/libdoc/net/http/rdoc/Net/HTTP.html + # ToDo: refactor to improve readability def self.download_cwrc_obj(cookie, cwrc_obj, cwrc_file) # download object from cwrc obj_path = "https://#{ENV['CWRC_HOSTNAME']}/islandora/object/#{cwrc_obj['pid']}/manage/bagit_extension" @@ -71,23 +76,27 @@ def self.download_cwrc_obj(cookie, cwrc_obj, cwrc_file) begin Net::HTTP.start(ENV['CWRC_HOSTNAME'], ENV['CWRC_PORT'], use_ssl: true, read_timeout: http_read_timeout) do |http| - response = http.request(obj_req) - if response.is_a? Net::HTTPSuccess + http.request obj_req do |response| + unless response.is_a? Net::HTTPSuccess + raise Net::HTTPError.new("Failed request #{obj_path} with http status #{response.code}", response.code) + end + # CWRC response need to have the object's modified timestamp in the header raise CWRCArchivingError if response['CWRC-MODIFIED-DATE'].nil? cwrc_obj['timestamp'] = response['CWRC-MODIFIED-DATE'].tr('"', '') - # save HTTP response to working directory - File.open(cwrc_file, 'wb') do |file| - file.write(response.body) + cwrc_obj['content-type'] = response['Content-Type'].tr('"', '') + + File.open(cwrc_file, 'wb') do |io| + # save HTTP response to working directory: chunk large file + response.read_body do |chunk| + io.write chunk + end end + # compare md5sum of downloaded with with the HTTP header CWRC-CHECHSUM # to detect transport corruption raise CWRCArchivingError unless response['CWRC-CHECKSUM'].tr('"', '') == Digest::MD5.file(cwrc_file).to_s - elsif response.is_a? 
Net::HTTPServerError - raise Net::HTTPError.new("Failed request #{obj_path} with http status #{response.code}", response.code) - else - raise Net::HTTPError.new("Failed request #{obj_path} with http status #{response.code}", response.code) end end rescue CWRCArchivingError, @@ -111,12 +120,19 @@ def self.download_cwrc_obj(cookie, cwrc_obj, cwrc_file) end def self.connect_to_swift - SwiftIngest::Ingestor.new(username: ENV['SWIFT_USERNAME'], + # https://www.rubydoc.info/gems/openstack/3.3.21/OpenStack/Connection + # bundle exec ruby ./cwrc_preserver.rb -d --config ../secrets_olrc.yml --reprocess log/olrc_test_list + SwiftIngest::Ingestor.new(auth_url: ENV['SWIFT_AUTH_URL'], + username: ENV['SWIFT_USERNAME'], password: ENV['SWIFT_PASSWORD'], - tenant: ENV['SWIFT_TENANT'], - auth_url: ENV['SWIFT_AUTH_URL'], + user_domain: ENV['SWIFT_USER_DOMAIN_NAME'], project_name: ENV['SWIFT_PROJECT_NAME'], + project_domain_id: ENV['SWIFT_PROJECT_DOMAIN_ID'], + # For UAL Swift compatability (leave blank) project_domain_name: ENV['SWIFT_PROJECT_DOMAIN_NAME'], + # is_debug: TRUE, + region: ENV['SWIFT_REGION'], + identity_api_version: '3', project: ENV['CWRC_PROJECT_NAME']) end end diff --git a/cwrc_preserver.rb b/cwrc_preserver.rb index c52b69d..7b43776 100755 --- a/cwrc_preserver.rb +++ b/cwrc_preserver.rb @@ -10,10 +10,10 @@ # -s, --start=val process subset of material: objects modified after specified ISO-8601 YYY-MM-DD # -h, --help -require 'swift_ingest' require 'optparse' require 'logger' require 'time' +require_relative 'ingestor' require_relative 'cwrc_common' module CWRCPreserver @@ -59,8 +59,8 @@ module CWRCPreserver # load exception files log_dir = ENV['CWRC_PRESERVER_LOG_DIR'] time_str = Time.now.strftime('%Y-%m-%d_%H-%M-%S') - except_file = File.join(log_dir, time_str + '_' + ENV['SWIFT_ARCHIVE_FAILED']) - success_file = File.join(log_dir, time_str + '_' + ENV['SWIFT_ARCHIVED_OK']) + except_file = File.join(log_dir, "#{time_str}_#{ENV['SWIFT_ARCHIVE_FAILED']}") + 
success_file = File.join(log_dir, "#{time_str}_#{ENV['SWIFT_ARCHIVED_OK']}") Dir.mkdir(log_dir) unless File.exist?(log_dir) # working directory @@ -68,17 +68,17 @@ module CWRCPreserver Dir.mkdir(work_dir) unless File.exist?(work_dir) # setup logger and log level - log = Logger.new(STDOUT) + log = Logger.new($stdout) log.level = if debug_level || ENV['DEBUG'] == 'true' Logger::DEBUG else Logger::INFO end - log.debug("Retrieving all objects modified since: #{start_dt}") unless start_dt.nil? + log.debug("Retrieving all objects modified since: #{start_dt}") unless start_dt.nil? || start_dt.empty? # get connection cookie cookie = retrieve_cookie - log.debug("Using connecion cookie: #{cookie}") + log.debug("CWRC auth: using connection cookie: #{cookie}") # connect to swift storage swift_depositer = connect_to_swift @@ -106,9 +106,12 @@ module CWRCPreserver # check if file has been deposited, handle open stack bug causing exception in openstack/connection force_deposit = false || !reprocess.empty? begin - # TODO: switch to swift_depositer.lookup once Gem updated - swift_file = swift_depositer.get_file_from_swit(cwrc_obj['pid'], ENV['CWRC_SWIFT_CONTAINER']) unless force_deposit - log.debug("SWIFT LOOKUP: #{swift_file.nil? ? 'not found' : swift_file.metadata['last-mod-timestamp']}") + # if force_deposit then skip lookup of existing + unless force_deposit + # TODO: switch to swift_depositer.lookup once Gem updated + swift_file = swift_depositer.get_file_from_swit(cwrc_obj['pid'], ENV['CWRC_SWIFT_CONTAINER']) + log.debug("SWIFT LOOKUP: #{swift_file.nil? ? 
'not found' : swift_file.metadata['last-mod-timestamp']}") + end rescue StandardError => e force_deposit = true log.debug("Force deposit in swift: #{cwrc_obj['pid']} #{e.message}") @@ -142,7 +145,7 @@ module CWRCPreserver end file_size = File.size(cwrc_file_tmp_path).to_f / 2**20 - fs_str = format('%.3f', file_size) + fs_str = format('%<file_size>.3f', file_size: file_size) # %<name> token is required with keyword args log.debug("SIZE: #{fs_str} MB") cwrc_time = Time.now @@ -150,6 +153,7 @@ module CWRCPreserver begin # TODO: switch to swift_depositer.deposit once Gem updated swift_depositer.deposit_file(cwrc_file_tmp_path, + cwrc_obj['content-type'], ENV['CWRC_SWIFT_CONTAINER'], last_mod_timestamp: cwrc_obj['timestamp']) rescue StandardError => e @@ -164,9 +168,9 @@ module CWRCPreserver FileUtils.rm_rf(cwrc_file_tmp_path) if File.exist?(cwrc_file_tmp_path) # print statistics - dp_rate = format('%.3f', (file_size / (swift_time - start_time))) - cwrc_rate = format('%.3f', (file_size / (cwrc_time - start_time))) - swift_rate = format('%.3f', (file_size / (swift_time - cwrc_time))) + dp_rate = format('%<dp_rate>.3f', dp_rate: (file_size / (swift_time - start_time))) + cwrc_rate = format('%<cwrc_rate>.3f', cwrc_rate: (file_size / (cwrc_time - start_time))) + swift_rate = format('%<swift_rate>.3f', swift_rate: (file_size / (swift_time - cwrc_time))) log.debug("FILE DEPOSITED: #{cwrc_obj['pid']}, deposit rate #{dp_rate} (#{cwrc_rate} #{swift_rate}) MB/sec") File.open(success_file, 'a') do |ok_file| ok_file.write("#{cwrc_obj['pid']} #{fs_str} MB #{dp_rate} (#{cwrc_rate} #{swift_rate}) MB/sec\n") diff --git a/ingestor.rb b/ingestor.rb new file mode 100644 index 0000000..172a8eb --- /dev/null +++ b/ingestor.rb @@ -0,0 +1,93 @@ +require 'swift_ingest/version' +require 'openstack' +require 'mysql2' +require 'active_support' +require 'active_support/core_ext' + +# This is a copy of the Ruby Gem swift_ingest-0.4.1/lib/swift_ingest.rb +# Changed to +# * remove the hard coded content type +# * change the deposit_file method such that the basename doesn't remove '.*' 
+# when creating an Swift ID - this causes problems with CWRC + +class SwiftIngest::Ingestor + + attr_reader :swift_connection, :project + + def initialize(connection = {}) + extra_opt = { auth_method: 'password', + service_type: 'object-store' } + options = connection.merge(extra_opt) + options[:api_key] = options.delete :password + + @swift_connection = OpenStack::Connection.create(options) + @project = connection[:project] + + # connect to the database + @dbcon = if ENV['DB_HOST'] && ENV['DB_USER'] && ENV['DB_PASSWORD'] && ENV['DB_DATABASE'] + Mysql2::Client.new(host: ENV['DB_HOST'], + username: ENV['DB_USER'], + password: ENV['DB_PASSWORD'], + database: ENV['DB_DATABASE']) + end + end + + def get_file_from_swit(file_name, swift_container) + deposited_file = nil + file_base_name = File.basename(file_name) + container = swift_connection.container(swift_container) + deposited_file = container.object(file_base_name) if container.object_exists?(file_base_name) + deposited_file + end + + def deposit_file(file_name, content_type, swift_container, custom_metadata = {}) + file_base_name = File.basename(file_name) + checksum = Digest::MD5.file(file_name).hexdigest + container = swift_connection.container(swift_container) + + # Add swift metadata with in accordance to AIP spec: + # https://docs.google.com/document/d/154BqhDPAdGW-I9enrqLpBYbhkF9exX9lV3kMaijuwPg/edit# + metadata = { + project: @project, + project_id: file_base_name, + promise: 'bronze', + aip_version: '1.0' + }.merge(custom_metadata) + + # ruby-openstack wants all keys of the metadata to be named like + # "X-Object-Meta-{{Key}}" so update them + metadata.transform_keys! 
{ |key| "X-Object-Meta-#{key}" } + + if container.object_exists?(file_base_name) + # temporary solution until fixed in upstream: + # for update: construct hash for key/value pairs as strings, + # and metadata as additional key/value string pairs in the hash + headers = { 'etag' => checksum, + 'content-type' => content_type }.merge(metadata) + deposited_file = container.object(file_base_name) + File.open(file_name) { |io| deposited_file.write(io, headers) } # block form closes the handle + else + # for creating new: construct hash with symbols as keys, add metadata as a hash within the header hash + headers = { etag: checksum, + content_type: content_type, + metadata: metadata } + # base file name becomes the Swift identifier + deposited_file = File.open(file_name) { |io| container.create_object(file_base_name, headers, io) } + end + + return deposited_file unless @dbcon + + # update db with deposited file info; interpolated values are escaped so quotes cannot break the SQL + esc = ->(value) { @dbcon.escape(value.to_s) } + @dbcon.query("INSERT INTO archiveEvent(project, container, ingestTime, objectIdentifier, objectChecksum, \ + objectSize) VALUES('#{esc.call(@project)}', '#{esc.call(swift_container)}', now(), \ + '#{esc.call(file_base_name)}', '#{checksum}', '#{File.size(file_name)}')") + custom_metadata.each do |key, value| + @dbcon.query("INSERT INTO customMetadata(eventId, propertyName, propertyValue) \ + VALUES(LAST_INSERT_ID(), '#{esc.call(key)}', '#{esc.call(value)}' )") + end + + deposited_file + end + +end diff --git a/migration/README.md b/migration/README.md new file mode 100644 index 0000000..427afc5 --- /dev/null +++ b/migration/README.md @@ -0,0 +1,55 @@ +# Migration from UAL Swift to OLRC Oct. 2023 + +## Process + +1. 
Use the audit report as a source of IDs in the UAL Swift + +``` bash +cut -d ',' -f 3 2023-09-15_cwrc_audit_report.cleaned.csv | sort > all_pids_2023-09-15.csv +``` + +2023-10-13: the following 3 objects occur twice in the SPARQL query results building the list of PIDs (persistent IDs) within CWRC and their associated ‘last modified date’ – this is likely a broken delete operation on the triplestore that didn’t completely remove existing triples before adding a new triple. This does not impact the date on the preserved item but does impact the audit report in that there are two rows for the same PID: + +* cwrc:1f59e5c9-63bc-44e2-8146-e612d9aa9a7a +* cwrc:765b9c98-50e1-41a1-83a8-c29e64ce1412 +* islandora:db03c412-9e0a-4b79-949b-9bd61bb75bfd + +1. Segment the list + +Segment the list of ~410K items into parts of 25K items to ease recovery if the migration process stops (e.g., server reboot, etc.) + +``` bash +split -l 25000 -d all_pids_2023-09-15.csv pids_2023-09-15_part_ +``` + +1. Migrate + +``` bash +source "RC_file_from_OLRC_Horizon_UI" +SEGMENT=11 +python3 migration/migrate.py --swift_src_config_path ../secret_ual.yml --id_list ../pid_lists/pids_2023-09-15_part_${SEGMENT} --tmp_dir ../tmp/ --container_src CWRC --container_dst cwrc --uploaded_by "Jeffery Antoniuk" --database_csv ../logs/pids_2023-09-15_part_${SEGMENT}.log +``` + +1. 
Audit + +* test counts +* run the `cwrc_audit_report.rb` + * audit migrated Swift content comparing one Swift instance to a second + * see the script for details +* run the 4-week preservation side-by-side old and new and compare the output + +## Tests + +### How to run tests + +Setup + +```bash +pip3 install pytest pytest-mock --user +``` + +Run + +```bash +pytest tests/migration_unit_test.py +``` \ No newline at end of file diff --git a/migration/migrate.py b/migration/migrate.py new file mode 100644 index 0000000..c001a3f --- /dev/null +++ b/migration/migrate.py @@ -0,0 +1,296 @@ +############################################################################################## +# desc: migrate swift content from one instance to a second +# usage: +# source openrc file from the Swift destination then +# python3 migrate.py \ +# --swift_src_config_path ${SRC_CONFIG_PATH} \ +# --swift_dest_container ${SWIFT_CONTAINER} \ +# --id_list ${id_LIST} \ +# --tmp_dir ${TMP_DIR} \ +# --container_src ${} \ +# --container_dst ${} \ +# --uploaded_by ${} \ +# --database_csv ${} +# https://docs.openstack.org/python-swiftclient/latest/service-api.html +# https://docs.openstack.org/swift/pike/overview_large_objects.html#additional-notes +# https://docs.openstack.org/python-swiftclient/latest/service-api.html +# https://github.com/openstack/python-swiftclient/blob/master/swiftclient/client.py#L516 +# +# date: Sept 12, 2023 +############################################################################################## + +import argparse +import csv +import hashlib +import logging +import os +import sys +import yaml +from swiftclient.service import ClientException, SwiftError, SwiftService, SwiftUploadObject + +# Swift: length of a segment; created when the file is too large +SWIFT_SEGMENT_LENGTH = 5261334937 + + +# +def parse_args(args): + parser = argparse.ArgumentParser() + parser.add_argument('--swift_src_config_path', required=True, help='Source preservation container name.') + 
parser.add_argument('--id_list', required=True, help='Migrate only the items in the file (IDs; one per line).') + parser.add_argument('--tmp_dir', required=True, help='Temporary directory (must exist; used for tar).') + parser.add_argument('--container_src', required=True, help='Source container name.') + parser.add_argument('--container_dst', required=True, help='Destination container name.') + parser.add_argument('--uploaded_by', required=True, help='Name of person running the script.') + parser.add_argument('--database_csv', required=True, help='Name of the log file to store the uploaded item information.') + return parser.parse_args(args) + + +# Source Swift +def swift_init_src(config_file, tmp_dir='/tmp'): + with open(config_file, "r") as stream: + try: + cfg = yaml.safe_load(stream) + _options = { + 'os_auth_url': cfg['OS_AUTH_URL'], + 'os_username': cfg['OS_USERNAME'], + 'os_password': cfg['OS_PASSWORD'], + 'os_user_domain_name': cfg['OS_USER_DOMAIN_NAME'], + 'os_project_domain_name': cfg['OS_PROJECT_DOMAIN_NAME'], + 'os_project_name': cfg['OS_PROJECT_NAME'], + 'os_project_id': '', # set blank when using UAL swift plus OLRC source env variables otherwise auth fails + 'os_project_domain_id': '', # set blank when using UAL swift plus OLRC source env variables otherwise auth fails + 'os_region_name': '', # set blank when using UAL swift plus OLRC source env variables otherwise auth fails + 'retries': 2, + 'out_directory': tmp_dir + } + conn = SwiftService(options=_options) + + except yaml.YAMLError as e: + print(e) + except ClientException as e: + print(e) + except Exception as e: + print(e) + return conn + + +def container_info(swift_conn_src, container_src, swift_conn_dst, container_dst): + tmp = swift_conn_src.stat(container_src) + print(f"{tmp}") + tmp = swift_conn_dst.stat(container_dst) + print(f"{tmp}") + + +# validate that the contents of the destination match the source; allow exceptions in the header +# for example: timestamp and source Swift (CWRC) 
contains the wrong mimetype (fixed during the upload) +def validate(swift_conn_src, container_src, swift_conn_dst, container_dst, id, exceptions=[]): + if type(id) is not list: + id = [id] + + # header properties that are expected to be different, e.g., request related ids or Swift managed timestamps + exceptions = [ + *exceptions, + 'last-modified', + 'x-timestamp', + 'x-trans-id', + 'x-openstack-request-id', + 'date' + ] + # stat returns an iterator: https://docs.openstack.org/python-swiftclient/latest/service-api.html#stat + for src in swift_conn_src.stat(container_src, id): + logging.info(f"{src}") + for dst in swift_conn_dst.stat(container_dst, id): + logging.info(f"{dst}") + for key in src['headers']: + if key not in exceptions: + if key not in dst['headers']: + logging.error(f"{key} not present in destination: {src['headers'][key]}") + raise SwiftError(f"{key} not present in destination: {src['headers'][key]}", container_dst, id) + elif container_src == 'CWRC' and key == 'content-type' and dst['headers'][key] == 'application/zip': + logging.info(f"{key} differs; this is expected in CWRC due to bulk change - destination {dst['headers'][key]}") + elif src['headers'][key] != dst['headers'][key]: + logging.error(f"{key} differs {src['headers'][key]} <> {dst['headers'][key]}") + raise SwiftError(f"{key} differs: {src['headers'][key]} <> {dst['headers'][key]}", container_dst, id) + else: + logging.info(f"{key} matches {src['headers'][key]} == {dst['headers'][key]}") + + +# +def file_checksum(path): + hash_md5 = hashlib.md5() + hash_sha256 = hashlib.sha256() + with open(path, 'rb') as f: + # read and buffer to prevent high memory usage + for chunk in iter(lambda: f.read(4096), b""): + hash_md5.update(chunk) + hash_sha256.update(chunk) + return { + 'md5sum': hash_md5.hexdigest(), + 'sha256sum': hash_sha256.hexdigest() + } + + +# +def validate_checksum(path, etag, id): + checksums = file_checksum(path) + if checksums['md5sum'] != etag: + raise 
ClientException(f"ERROR: id:[{id}] error: checksum failure [{path}] - {checksums['md5sum']} <> {etag}") + return checksums + + +# +def build_swift_upload_object(src_item, container_src): + # Custom headers: https://github.com/ualbertalib/swift_ingest/blob/master/lib/swift_ingest/ingestor.rb#L18 + src_headers = src_item['response_dict']['headers'] + options = { + 'header': { + 'x-object-meta-project-id': src_headers['x-object-meta-project-id'] if 'x-object-meta-project-id' in src_headers else '', + 'x-object-meta-aip-version': src_headers['x-object-meta-aip-version'] if 'x-object-meta-aip-version' in src_headers else '', + 'x-object-meta-project': src_headers['x-object-meta-project'] if 'x-object-meta-project' in src_headers else '', + 'x-object-meta-promise': src_headers['x-object-meta-promise'] if 'x-object-meta-promise' in src_headers else '', + 'content-type': src_headers['content-type'] if 'content-type' in src_headers else '' + } + } + # Custom CWRC metadata used by auditing processes; CWRC platform object last update timestamp + # CWRC content-type fix (source Swift used x-tar when content is zip) + if container_src == 'CWRC': + # could use |= to combine dict structures but aiming for Python 3.5 + options['header']['x-object-meta-last-mod-timestamp'] = src_headers['x-object-meta-last-mod-timestamp'] + options['header']['content-type'] = 'application/zip' + # cleanup June 2023 missing meta-project - always CWRC + options['header']['x-object-meta-project'] = container_src + + upload_obj = SwiftUploadObject( + src_item['path'], + object_name=src_item['object'], + options=options + ) + + return upload_obj + + +# +def download_from_source(swift_conn_src, container_src, id): + if type(id) is not list: + id = [id] + + # download the Swift object from the source Swift instance + src_objs = swift_conn_src.download(container_src, id) + + # build SwiftUploadObject from download response + dst_objs = [] + for src_item in src_objs: + + logging.info(f"{src_item}") + if not 
src_item['success']: + raise ClientException(f"ERROR: id:[{id}] error: {src_item['error']}") + + dst_objs.append(build_swift_upload_object(src_item, container_src)) + + # test download file against Swift header etag to verify + validate_checksum(src_item['path'], src_item['response_dict']['headers']['etag'], id) + + return dst_objs + + +# +def log_upload(db_writer, dst_item, container_dst, checksums, uploaded_by): + db_dict = { + 'id': dst_item['object'], + 'md5sum': checksums['md5sum'], + 'sha256sum': checksums['sha256sum'], + 'uploaded_by': uploaded_by, + 'last_updated_at': dst_item['response_dict']['headers']['last-modified'], + 'container_name': container_dst, + 'notes': "" + } + db_writer.writerow(db_dict) + + +# upload to Swift and remove temporary file +def upload_to_destination(swift_conn_dst, container_dst, dst_objs, db_writer, uploaded_by): + for dst_item in swift_conn_dst.upload(container_dst, dst_objs): + if dst_item['action'] == 'upload_object': + logging.info(f"{dst_item}") + if not dst_item['success']: + if 'object' in dst_item: + logging.error(f"{dst_item}") + raise SwiftError(dst_item['error'], container_dst, dst_item['object']) + # Swift segmented object + elif 'for_object' in dst_item: + logging.error(f"{dst_item}") + raise SwiftError(dst_item['error'], container_dst, dst_item['object'], dst_item['segment_index']) + + if dst_item['action'] == 'upload_object' and os.path.isfile(dst_item['path']): + # test upload file against Swift header etag to verify + checksums = validate_checksum(dst_item['path'], dst_item['response_dict']['headers']['etag'], dst_item['object']) + # log upload + log_upload(db_writer, dst_item, container_dst, checksums, uploaded_by) + # remove temporary file + os.remove(dst_item['path']) + + +# +def process(args, swift_conn_src, swift_conn_dst, db_writer): + + # get list of items + try: + with open(args.id_list) as f: + for line in f: + id = line.strip() + print(id) + dst_objs = download_from_source(swift_conn_src, 
args.container_src, id) + upload_to_destination(swift_conn_dst, args.container_dst, dst_objs, db_writer, args.uploaded_by) + validate(swift_conn_src, args.container_src, swift_conn_dst, args.container_dst, id) + + except ClientException as e: + logging.error(e) + except SwiftError as e: + logging.error(e) + except Exception as e: + logging.error(e) + + +# +def csv_init(fd): + db_writer = csv.DictWriter(fd, fieldnames=[ + 'id', # (avalon noid) + 'md5sum', + 'sha256sum', + 'uploaded_by', + 'last_updated_at', + 'container_name', + 'notes' + ]) + db_writer.writeheader() + return db_writer + + +# +def main(): + options = {} + logging.basicConfig(level=logging.ERROR) + + if (os.environ.get('OS_AUTH_URL') is None): + print("ERROR: missing Swift auth; source the Swift env file for the container before running this script") + exit() + + args = parse_args(sys.argv[1:]) + + options['segment_size'] = SWIFT_SEGMENT_LENGTH + options['use_slo'] = True + options['object_uu_threads'] = 1 + options['retries'] = 2 + options['out_directory'] = args.tmp_dir + + with SwiftService(options=options) as swift_conn_dest: + swift_conn_src = swift_init_src(args.swift_src_config_path, args.tmp_dir) + with open(args.database_csv, 'w', newline='') as db_file: + db_writer = csv_init(db_file) + + process(args, swift_conn_src, swift_conn_dest, db_writer) + os.fsync(db_file) + + +if __name__ == "__main__": + main() diff --git a/migration/migrate_audit.py b/migration/migrate_audit.py new file mode 100644 index 0000000..496d22a --- /dev/null +++ b/migration/migrate_audit.py @@ -0,0 +1,154 @@ +############################################################################################## +# desc: audit migrated swift content comparing one instance to a second +# usage: +# source openrc file from the Swift destination then +# python3 migrate_audit.py \ +# --swift_src_config_path ${SRC_CONFIG_PATH} \ +# --id_list ${id_LIST} \ +# --container_src ${} \ +# --container_dst ${} +# 
https://docs.openstack.org/python-swiftclient/latest/service-api.html +# https://docs.openstack.org/swift/pike/overview_large_objects.html#additional-notes +# https://docs.openstack.org/python-swiftclient/latest/service-api.html +# https://github.com/openstack/python-swiftclient/blob/master/swiftclient/client.py#L516 +# +# date: Sept 12, 2023 +############################################################################################## + +import argparse +import logging +import os +import sys +import time +import yaml +from swiftclient.service import ClientException, SwiftError, SwiftService, SwiftUploadObject + +# Swift: length of a segment; created when the file is too large +SWIFT_SEGMENT_LENGTH = 5261334937 + + +# +def parse_args(args): + parser = argparse.ArgumentParser() + parser.add_argument('--swift_src_config_path', required=True, help='Source preservation container name.') + parser.add_argument('--id_list', required=True, help='Migrate only the items in the file (IDs; one per line).') + parser.add_argument('--container_src', required=True, help='Source container name.') + parser.add_argument('--container_dst', required=True, help='Destination container name.') + parser.add_argument('--tmp_dir', required=False, default='/tmp', help='Temporary directory (must exist; used for tar).') + + return parser.parse_args(args) + + +# Source Swift +def swift_init_src(config_file, tmp_dir='/tmp'): + with open(config_file, "r") as stream: + try: + cfg = yaml.safe_load(stream) + _options = { + 'os_auth_url': cfg['OS_AUTH_URL'], + 'os_username': cfg['OS_USERNAME'], + 'os_password': cfg['OS_PASSWORD'], + 'os_user_domain_name': cfg['OS_USER_DOMAIN_NAME'], + 'os_project_domain_name': cfg['OS_PROJECT_DOMAIN_NAME'], + 'os_project_name': cfg['OS_PROJECT_NAME'], + 'os_project_id': '', # set blank when using UAL swift plus OLRC source env variables otherwise auth fails + 'os_project_domain_id': '', # set blank when using UAL swift plus OLRC source env variables otherwise auth 
fails + 'os_region_name': '', # set blank when using UAL swift plus OLRC source env variables otherwise auth fails + 'retries': 2 + } + conn = SwiftService(options=_options) + + except yaml.YAMLError as e: + print(e) + except ClientException as e: + print(e) + except Exception as e: + print(e) + return conn + + +def container_info(swift_conn_src, container_src, swift_conn_dst, container_dst): + tmp = swift_conn_src.stat(container_src) + print(f"{tmp}") + tmp = swift_conn_dst.stat(container_dst) + print(f"{tmp}") + + +# validate that the contents of the destination match the source; allow exceptions in the header +# for example: timestamp and source Swift (CWRC) contains the wrong mimetype (fixed during the upload) +def validate(swift_conn_src, container_src, swift_conn_dst, container_dst, id, exceptions=[]): + if type(id) is not list: + id = [id] + + # header properties that are expected to be different, e.g., request related ids or Swift managed timestamps + exceptions = [ + *exceptions, + 'last-modified', + 'x-timestamp', + 'x-trans-id', + 'x-openstack-request-id', + 'date' + ] + # stat returns an iterator: https://docs.openstack.org/python-swiftclient/latest/service-api.html#stat + for src in swift_conn_src.stat(container_src, id): + logging.debug(f"{src}") + for dst in swift_conn_dst.stat(container_dst, id): + logging.debug(f"{dst}") + for key in src['headers']: + if key not in exceptions: + if key not in dst['headers']: + logging.error(f"id:[{id}] key:[{key}] not present in destination: {src['headers'][key]}") + # raise SwiftError(f"{key} not present in destination: {src['headers'][key]}", container_dst, id) + elif container_src == 'CWRC' and key == 'content-type' and dst['headers'][key] == 'application/zip': + logging.info(f"id:[{id}] key:[{key}] differs; this is expected in CWRC due to bulk change - destination {dst['headers'][key]}") + elif src['headers'][key] != dst['headers'][key]: + logging.error(f"id:[{id}] key:[{key}] differs - {src['headers'][key]} <> 
{dst['headers'][key]}") + # raise SwiftError(f"{key} differs: {src['headers'][key]} <> {dst['headers'][key]}", container_dst, id) + else: + logging.info(f"id:[{id}] key:[{key}] matches - {src['headers'][key]} == {dst['headers'][key]}") + + +# +def process(args, swift_conn_src, swift_conn_dst): + + # get list of items + try: + with open(args.id_list) as f: + for line in f: + id = line.strip() + print(id) + validate(swift_conn_src, args.container_src, swift_conn_dst, args.container_dst, id) + time.sleep(1) + + except ClientException as e: + logging.error(e) + except SwiftError as e: + logging.error(e) + except Exception as e: + logging.error(e) + + +# +def main(): + options = {} + logging.basicConfig(level=logging.INFO) + + if (os.environ.get('OS_AUTH_URL') is None): + print("ERROR: missing Swift auth; source the Swift env file for the container before running this script") + exit() + + args = parse_args(sys.argv[1:]) + + options['segment_size'] = SWIFT_SEGMENT_LENGTH + options['use_slo'] = True + options['object_uu_threads'] = 1 + options['retries'] = 2 + options['out_directory'] = args.tmp_dir + + with SwiftService(options=options) as swift_conn_dest: + swift_conn_src = swift_init_src(args.swift_src_config_path, args.tmp_dir) + process(args, swift_conn_src, swift_conn_dest) + + +if __name__ == "__main__": + main() diff --git a/migration/tests/fixtures/assets/a:1 b/migration/tests/fixtures/assets/a:1 new file mode 100644 index 0000000..3351791 Binary files /dev/null and b/migration/tests/fixtures/assets/a:1 differ diff --git a/migration/tests/migrate_unit_tests.py b/migration/tests/migrate_unit_tests.py new file mode 100644 index 0000000..9533484 --- /dev/null +++ b/migration/tests/migrate_unit_tests.py @@ -0,0 +1,142 @@ +""" Very quickly write unit tests for a one-time script +""" + + +import csv +import os +import pytest +import pytest_mock +import shutil +import sys + +from swiftclient.service import ClientException, SwiftError, SwiftService, SwiftUploadObject + 
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) +import migrate as migrate + +_object_id = 'a:1' +_swift_download_response = { + 'success': True, + 'path': 'tests/fixtures/assets/a:1', + 'object': _object_id, + 'response_dict': { + 'headers': { + 'etag': '94813657ffbc76defd96ac21ff4061ca', + 'x-object-meta-project-id': 'a', + 'x-object-meta-aip-version': 'b', + 'x-object-meta-project': 'c', + 'x-object-meta-promise': 'd', + 'content-type': 'e', + 'x-object-meta-last-mod-timestamp': 'f' + } + } +} + + +# CWRC object - success on mismatched content-type +def test_validate(mocker): + mocker.patch('migrate.SwiftService.stat', side_effect=[ + [{'headers': {'a': 'b', 'date': '2000', 'content-type': 'application/x-tar'}}], + [{'headers': {'a': 'b', 'date': '2001', 'content-type': 'application/zip'}}] + ]) + try: + migrate.validate(SwiftService, 'CWRC', SwiftService, 'cwrc', 'a:1') + assert True + except Exception: + assert False + + +def test_validate_missing_header(mocker): + mocker.patch('migrate.SwiftService.stat', side_effect=[ + [{'headers': {'a': 'b', 'date': '2000', 'content-type': 'application/x-tar'}}], + [{'headers': {'date': '2001', 'content-type': 'application/zip'}}] + ]) + with pytest.raises(SwiftError) as excinfo: + migrate.validate(SwiftService, 'CWRC', SwiftService, 'cwrc', 'a:1') + assert str(excinfo.value) == "'a not present in destination: b' container:cwrc object:['a:1']" + + +def test_validate_header_value_difference(mocker): + mocker.patch('migrate.SwiftService.stat', side_effect=[ + [{'headers': {'a': 'b', 'date': '2000', 'content-type': 'application/x-tar'}}], + [{'headers': {'a': 'invalid', 'date': '2001', 'content-type': 'application/zip'}}] + ]) + with pytest.raises(SwiftError) as excinfo: + migrate.validate(SwiftService, 'CWRC', SwiftService, 'cwrc', 'a:1') + assert str(excinfo.value) == "'a differs: b <> invalid' container:cwrc object:['a:1']" + + +# non CWRC object - error on mismatched content-type +def 
test_validate_header_content_type(mocker): + mocker.patch('migrate.SwiftService.stat', side_effect=[ + [{'headers': {'date': '2000', 'content-type': 'application/x-tar'}}], + [{'headers': {'date': '2001', 'content-type': 'application/zip'}}] + ]) + with pytest.raises(SwiftError) as excinfo: + migrate.validate(SwiftService, 'x', SwiftService, 'x', 'a:1') + assert str(excinfo.value) == "'content-type differs: application/x-tar <> application/zip' container:x object:['a:1']" + + +def test_download_from_source_cwrc(mocker): + mocker.patch('migrate.SwiftService.download', return_value=[ + { + 'success': True, + 'path': 'tests/fixtures/assets/a:1', + 'object': _object_id, + 'response_dict': { + 'headers': { + 'etag': '94813657ffbc76defd96ac21ff4061ca', + 'x-object-meta-project-id': 'a', + 'x-object-meta-aip-version': 'b', + 'x-object-meta-project': 'c', + 'x-object-meta-promise': 'd', + 'content-type': 'e', + 'x-object-meta-last-mod-timestamp': 'f' + } + } + } + ]) + upload_obj = migrate.download_from_source(SwiftService, 'CWRC', object) + for item in upload_obj: + assert item.object_name == _object_id + assert item.options['header']['x-object-meta-project-id'] == 'a' + assert item.options['header']['x-object-meta-aip-version'] == 'b' + assert item.options['header']['x-object-meta-project'] == 'c' + assert item.options['header']['x-object-meta-promise'] == 'd' + assert item.options['header']['content-type'] == 'application/zip' + assert item.options['header']['x-object-meta-last-mod-timestamp'] == 'f' + # non-cwrc test + upload_obj = migrate.download_from_source(SwiftService, '', object) + for item in upload_obj: + assert item.options['header']['content-type'] == 'e' + assert 'x-object-meta-last-mod-timestamp' not in item.options['header'] + + +def test_upload_to_destination(tmpdir, mocker): + t = _swift_download_response + t['path'] = tmpdir / _object_id + upload_obj = [migrate.build_swift_upload_object(t, 'cwrc')] + shutil.copy('tests/fixtures/assets/a:1', t['path']) + 
csv_path = tmpdir / "csv" + with open(csv_path, 'w', newline='') as csv_fd: + csv_dict = migrate.csv_init(csv_fd) + mocker.patch('migrate.SwiftService.upload', return_value=[ + { + 'action': 'upload_object', + 'success': True, + 'path': t['path'], + 'object': _object_id, + 'response_dict': { + 'headers': { + 'etag': '94813657ffbc76defd96ac21ff4061ca', + 'last-modified': 'a' + } + } + } + ]) + migrate.upload_to_destination(SwiftService, 'CWRC', upload_obj, csv_dict, "J") + assert not os.path.exists(t['path']) + with open(csv_path, 'r', newline='') as tmp_fd: + dr = csv.DictReader(tmp_fd) + for row in dr: + assert row['id'] == _object_id