Skip to content

Commit

Permalink
LoadRaster and LoadVector: refactor to remove Dir.chdir calls, and to…
Browse files Browse the repository at this point in the history
… improve maintainability (#851)

* VectorNormalizer and LoadVector: better variable and parameter names

* load_vector_spec.rb: more precise expectation for normalizer.run_shp2pgsql call

* Robots::DorRepo::GisDelivery::LoadVector: log an error on (presumed) UTF-8 decoding error fallthrough, add test case for fallthrough logic

the Dir.chdir hack in the tests should go away in the commit that gets rid of Dir.chdir

* add test for successful codepath of VectorNormalizer#run_shp2pgsql

also, remove unnecessary check of return value from system(cmd, exception: true) call, since per docs, it will raise on failed execution or non-zero exit code (instead of returning nil or false, respectively).

* LoadVector#perform_work: refactor to avoid Dir.chdir

adjust tests:
* no longer need to undo effect of Dir.chdir(tmpdir)
* file paths for sql generation and loading now include tmpdir

* LoadRaster#perform_work: var name improvements, comment fix

* LoadRaster: refactor to get rid of Dir.chdir call

* move #run_shp2pgsql from VectorNormalizer to LoadVector, make it private

LoadVector is the only consumer of the method, and all the info the method relies on is passed to it via params.

* comment cleanup

* LoadVector#perform_work: turn a TODO/question comment into an issue
* remove obsoleted sidekiq config warning
  • Loading branch information
jmartin-sul authored Mar 6, 2024
1 parent a726ce2 commit d458c16
Show file tree
Hide file tree
Showing 6 changed files with 87 additions and 66 deletions.
2 changes: 0 additions & 2 deletions config/sidekiq.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,4 @@
---
# The concurrency is set to 1 since Dir.chdir is used, which is not thread safe.
# See https://bugs.ruby-lang.org/issues/15661
:concurrency: 1
:queues:
- gisAssemblyWF_default
Expand Down
12 changes: 0 additions & 12 deletions lib/gis_robot_suite/vector_normalizer.rb
Original file line number Diff line number Diff line change
Expand Up @@ -29,18 +29,6 @@ def cleanup
FileUtils.rm_rf tmpdir
end

def run_shp2pgsql(projection, encoding, shpfn, schema, sqlfn, errfn)
# XXX: Perhaps put the .sql data into the content directory as .zip for derivative
# XXX: -G for the geography column causes some issues with GeoServer
cmd = "shp2pgsql -s #{projection} -d -D -I -W #{encoding} " \
"'#{shpfn}' #{schema}.#{bare_druid} " \
"> '#{sqlfn}' 2> '#{errfn}'"
logger.debug "Running: #{cmd}"
success = system(cmd, exception: true)
raise "normalize-vector: #{bare_druid} cannot convert Shapefile to PostGIS: #{File.open(errfn).readlines}" unless success
raise "normalize-vector: #{bare_druid} shp2pgsql generated no SQL?" unless File.size?(sqlfn)
end

private

attr_reader :logger, :bare_druid, :rootdir
Expand Down
36 changes: 17 additions & 19 deletions lib/robots/dor_repo/gis_delivery/load_raster.rb
Original file line number Diff line number Diff line change
Expand Up @@ -18,25 +18,23 @@ def perform_work
end

normalizer.with_normalized do |tmpdir|
Dir.chdir(tmpdir) do
tiffn = Dir.glob('*.tif').first
raise "load-raster: #{bare_druid} cannot locate GeoTIFF: #{tmpdir}" if tiffn.nil?

# copy to geoserver storage
path = if Settings.geohydra.geotiff.host == 'localhost'
Settings.geohydra.geotiff.dir
else
[Settings.geohydra.geotiff.host, Settings.geohydra.geotiff.dir].join(':')
end
cmd = "rsync -v '#{tiffn}' #{path}/#{bare_druid}.tif"
logger.debug "Running: #{cmd}"
system(cmd, exception: true)

# copy statistics files (produced by CopyData#compute_statistics, as of Feb 2024)
cmd = "rsync -v '#{tiffn}'.aux.xml #{path}/#{bare_druid}.tif.aux.xml"
logger.debug "Running: #{cmd}"
system(cmd, exception: true)
end
tif_filename = Dir.glob("#{tmpdir}/*.tif").first
raise "load-raster: #{bare_druid} cannot locate GeoTIFF: #{tmpdir}" if tif_filename.nil?

# copy to geoserver storage
destination_path = if Settings.geohydra.geotiff.host == 'localhost'
Settings.geohydra.geotiff.dir
else
[Settings.geohydra.geotiff.host, Settings.geohydra.geotiff.dir].join(':')
end
cmd = "rsync -v '#{tif_filename}' #{destination_path}/#{bare_druid}.tif"
logger.debug "Running: #{cmd}"
system(cmd, exception: true)

# copy statistics files (produced by RasterNormalizer#compute_statistics, as of March 2024)
cmd = "rsync -v '#{tif_filename}'.aux.xml #{destination_path}/#{bare_druid}.tif.aux.xml"
logger.debug "Running: #{cmd}"
system(cmd, exception: true)
end
end

Expand Down
39 changes: 23 additions & 16 deletions lib/robots/dor_repo/gis_delivery/load_vector.rb
Original file line number Diff line number Diff line change
Expand Up @@ -21,31 +21,27 @@ def perform_work

normalizer.with_normalized do |tmpdir|
schema = Settings.geohydra.postgis.schema || 'druid'
# encoding = # XXX: these are hardcoded encodings for certain druids -- these should be read from the metadata somewhere
# case druid
# when 'bt348dh6363', 'cc936tf6277'
# 'LATIN1'
# else
# 'UTF-8'
# end

# sniff out shapefile from extraction
Dir.chdir(tmpdir)
shpfn = Dir.glob('*.shp').first
sqlfn = shpfn.gsub(/\.shp$/, '.sql')
errfn = 'shp2pgsql.err'
logger.debug "load-vector: #{bare_druid} is working on Shapefile: #{shpfn}"
shp_filename = Dir.glob("#{tmpdir}/*.shp").first
sql_filename = shp_filename.gsub(/\.shp$/, '.sql')
stderr_filename = "#{tmpdir}/shp2pgsql.err"

logger.debug "load-vector: #{bare_druid} is working on Shapefile: #{shp_filename}"

# first try decoding with UTF-8 and if that fails use LATIN1
# see also https://github.com/sul-dlss/gis-robot-suite/issues/850
begin
normalizer.run_shp2pgsql('4326', 'UTF-8', shpfn, schema, sqlfn, errfn)
rescue RuntimeError
normalizer.run_shp2pgsql('4326', 'LATIN1', shpfn, schema, sqlfn, errfn)
run_shp2pgsql('4326', 'UTF-8', shp_filename, schema, sql_filename, stderr_filename)
rescue RuntimeError => e
logger.warn("#{druid} -- fell through to LATIN1 encoding after calling run_shp2pgsql with " \
"UTF-8 encoding and encountering error: #{e.message}; #{e.backtrace.join('; ')}")
run_shp2pgsql('4326', 'LATIN1', shp_filename, schema, sql_filename, stderr_filename)
end

# Load the data into PostGIS
cmd = 'psql --no-psqlrc --no-password --quiet ' \
"--file='#{sqlfn}' "
"--file='#{sql_filename}' "
logger.debug "Running: #{cmd}"
system(cmd, exception: true)
end
Expand All @@ -60,6 +56,17 @@ def normalizer
def rootdir
@rootdir ||= GisRobotSuite.locate_druid_path bare_druid, type: :stage
end

def run_shp2pgsql(projection, encoding, shp_filename, schema, sql_filename, stderr_filename)
# TODO: Perhaps put the .sql data into the content directory as .zip for derivative
# NOTE: -G for the geography column causes some issues with GeoServer
cmd = "shp2pgsql -s #{projection} -d -D -I -W #{encoding} " \
"'#{shp_filename}' #{schema}.#{bare_druid} " \
"> '#{sql_filename}' 2> '#{stderr_filename}'"
logger.debug "Running: #{cmd}"
system(cmd, exception: true)
raise "normalize-vector: #{bare_druid} shp2pgsql generated no SQL?" unless File.size?(sql_filename)
end
end
end
end
Expand Down
20 changes: 10 additions & 10 deletions spec/robots/dor_repo/gis_delivery/load_raster_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,10 @@
RSpec.describe Robots::DorRepo::GisDelivery::LoadRaster do
let(:druid) { "druid:#{bare_druid}" }
let(:bare_druid) { 'bb021mm7809' }
let(:tiffn) { 'MCE_FI2G_2014.tif' }
let(:path) { '/geotiff' }
let(:cmd) { "rsync -v '#{tiffn}' #{path}/#{bare_druid}.tif" }
let(:cmd_aux) { "rsync -v '#{tiffn}'.aux.xml #{path}/#{bare_druid}.tif.aux.xml" }
let(:tif_filename) { "/tmp/normalizeraster_#{bare_druid}/MCE_FI2G_2014.tif" }
let(:destination_path) { '/geotiff' }
let(:cmd_tif_sync) { "rsync -v '#{tif_filename}' #{destination_path}/#{bare_druid}.tif" }
let(:cmd_aux_sync) { "rsync -v '#{tif_filename}'.aux.xml #{destination_path}/#{bare_druid}.tif.aux.xml" }
let(:robot) { described_class.new }
let(:workflow_client) { instance_double(Dor::Workflow::Client) }
let(:object_client) { instance_double(Dor::Services::Client::Object, find: cocina_object) }
Expand Down Expand Up @@ -82,8 +82,8 @@
end

before do
allow(robot).to receive(:system).with(cmd, exception: true)
allow(robot).to receive(:system).with(cmd_aux, exception: true)
allow(robot).to receive(:system).with(cmd_tif_sync, exception: true)
allow(robot).to receive(:system).with(cmd_aux_sync, exception: true)
allow(LyberCore::WorkflowClientFactory).to receive(:build).and_return(workflow_client)
allow(Dor::Services::Client).to receive(:object).and_return(object_client)
allow(Kernel).to receive(:system).and_call_original
Expand All @@ -98,8 +98,8 @@
it 'does nothing' do
test_perform(robot, druid)
expect(normalizer).not_to have_received(:with_normalized)
expect(robot).not_to have_received(:system).with(cmd, exception: true)
expect(robot).not_to have_received(:system).with(cmd_aux, exception: true)
expect(robot).not_to have_received(:system).with(cmd_tif_sync, exception: true)
expect(robot).not_to have_received(:system).with(cmd_aux_sync, exception: true)
end
end

Expand All @@ -108,8 +108,8 @@

it 'executes system commands to load raster' do
test_perform(robot, druid)
expect(robot).to have_received(:system).with(cmd, exception: true)
expect(robot).to have_received(:system).with(cmd_aux, exception: true)
expect(robot).to have_received(:system).with(cmd_tif_sync, exception: true)
expect(robot).to have_received(:system).with(cmd_aux_sync, exception: true)
end
end
end
44 changes: 37 additions & 7 deletions spec/robots/dor_repo/gis_delivery/load_vector_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,10 @@
RSpec.describe Robots::DorRepo::GisDelivery::LoadVector do
let(:druid) { "druid:#{bare_druid}" }
let(:bare_druid) { 'cc044gt0726' }
let(:cmd) { "psql --no-psqlrc --no-password --quiet --file='sanluisobispo1996.sql' " }
let(:normalizer_tmpdir) { "/tmp/normalizevector_#{bare_druid}" }
let(:shp_filename) { "#{normalizer_tmpdir}/sanluisobispo1996.shp" }
let(:sql_filename) { "#{normalizer_tmpdir}/sanluisobispo1996.sql" }
let(:stderr_filename) { "#{normalizer_tmpdir}/shp2pgsql.err" }
let(:robot) { described_class.new }
let(:workflow_client) { instance_double(Dor::Workflow::Client) }
let(:object_client) { instance_double(Dor::Services::Client::Object, find: cocina_object) }
Expand Down Expand Up @@ -80,7 +83,7 @@
end

before do
allow(robot).to receive(:system).with(cmd, exception: true)
allow(robot).to receive(:system)
allow(LyberCore::WorkflowClientFactory).to receive(:build).and_return(workflow_client)
allow(Dor::Services::Client).to receive(:object).and_return(object_client)
allow(GisRobotSuite::VectorNormalizer).to receive(:new).and_return(normalizer)
Expand All @@ -93,15 +96,17 @@
it 'does nothing' do
test_perform(robot, druid)
expect(normalizer).not_to have_received(:with_normalized)
expect(robot).not_to have_received(:system).with(cmd, exception: true)
expect(robot).not_to have_received(:system)
end
end

context 'when the object is a vector' do
let(:media_type) { 'application/x-esri-shapefile' }
let(:logger) { instance_double(Logger, debug: nil, info: nil) }
let(:logger) { instance_double(Logger, debug: nil, info: nil, warn: nil) }
let(:rootdir) { GisRobotSuite.locate_druid_path bare_druid, type: :workspace }
let(:normalizer) { GisRobotSuite::VectorNormalizer.new(logger:, bare_druid:, rootdir:) }
let(:cmd_psql) { "psql --no-psqlrc --no-password --quiet --file='#{sql_filename}' " }
let(:cmd_shp2pgsql) { "shp2pgsql -s 4326 -d -D -I -W UTF-8 '#{shp_filename}' druid.#{bare_druid} > '#{sql_filename}' 2> '#{stderr_filename}'" }
let(:wkt) do
'GEOGCS["WGS 84", DATUM["WGS_1984", SPHEROID["WGS 84",6378137,298.257223563, AUTHORITY["EPSG","7030"]], AUTHORITY["EPSG","6326"]], PRIMEM["Greenwich",0, AUTHORITY["EPSG","8901"]], UNIT["degree",0.0174532925199433, AUTHORITY["EPSG","9122"]], AUTHORITY["EPSG","4326"]]' # rubocop:disable Layout/LineLength
end
Expand All @@ -110,14 +115,39 @@
stub_request(:get, 'https://spatialreference.org/ref/epsg/4326/prettywkt/')
.to_return(status: 200, body: wkt, headers: {})
allow(robot).to receive(:normalizer).and_return(normalizer)
allow(normalizer).to receive_messages(cleanup: true, run_shp2pgsql: true)
allow(normalizer).to receive_messages(cleanup: true)
allow(File).to receive(:size?).and_call_original
allow(File).to receive(:size?).with(sql_filename).and_return('123')
end

it 'executes system commands to load vector' do
test_perform(robot, druid)
expect(normalizer).to have_received(:run_shp2pgsql)
expect(normalizer).to have_received(:cleanup)
expect(robot).to have_received(:system).with(cmd, exception: true)
expect(robot).to have_received(:system).with(cmd_shp2pgsql, exception: true)
expect(robot).to have_received(:system).with(cmd_psql, exception: true)
end

context 'when decoding UTF-8 fails' do
let(:decoding_err_msg) { 'Could not decode UTF-8 for some reason 🤷' }
let(:cmd_shp2pgsql_retry) { "shp2pgsql -s 4326 -d -D -I -W LATIN1 '#{shp_filename}' druid.#{bare_druid} > '#{sql_filename}' 2> '#{stderr_filename}'" }

before do
allow(robot.logger).to receive(:warn)
allow(robot).to receive(:system) do |cmd|
raise decoding_err_msg if cmd.include?('-W UTF-8')
end
end

it 'logs a warning and re-tries shp2pgsql with LATIN1 encoding' do
expect { test_perform(robot, druid) }.not_to raise_error

expect(robot).to have_received(:system).with(cmd_shp2pgsql, exception: true)

base_err_msg = 'fell through to LATIN1 encoding after calling run_shp2pgsql with UTF-8 encoding and encountering error'
backtrace_regex = "#{__FILE__}:\\d+.*in .block.*; "
expect(robot.logger).to have_received(:warn).with(/#{druid} -- #{base_err_msg}: #{decoding_err_msg}; .*#{backtrace_regex}/)
expect(robot).to have_received(:system).with(cmd_shp2pgsql_retry, exception: true)
end
end
end
end

0 comments on commit d458c16

Please sign in to comment.