diff --git a/Gemfile b/Gemfile index 566cd0f..7948702 100644 --- a/Gemfile +++ b/Gemfile @@ -7,7 +7,7 @@ gem 'faraday-net_http_persistent', '~> 2.0' gem 'geo_combine' gem 'geoserver-publish', '~> 0.7.0' gem 'rsolr' -gem 'rubyzip' +gem 'rubyzip', '3.0.0.alpha' gem "berkeley_library-docker", "~> 0.2.0" gem "listen", "~> 3.8" gem 'uri' diff --git a/Gemfile.lock b/Gemfile.lock index 70b2b2f..41755a5 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -162,7 +162,7 @@ GEM rspec-support (~> 3.12.0) rspec-support (3.12.1) ruby2_keywords (0.0.5) - rubyzip (2.3.2) + rubyzip (3.0.0.alpha) sanitize (6.1.0) crass (~> 1.0.2) nokogiri (>= 1.12.0) @@ -189,7 +189,7 @@ DEPENDENCIES pry (~> 0.14.2) rsolr rspec (~> 3.12) - rubyzip + rubyzip (= 3.0.0.alpha) uri RUBY VERSION diff --git a/docker-compose.yml b/docker-compose.yml index 1ec9ab3..33cf4bd 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -39,14 +39,14 @@ services: - ./data/spatial:/usr/local/apache2/htdocs/:ro geoserver: - image: containers.lib.berkeley.edu/gis/geoserver/v2.23.2 + image: containers.lib.berkeley.edu/gis/geoserver:latest ports: - 8080:8080 volumes: - ./data/geoserver/public:/srv/geofiles:delegated geoserver-secure: - image: containers.lib.berkeley.edu/gis/geoserver/v2.23.2 + image: containers.lib.berkeley.edu/gis/geoserver:latest ports: - 8081:8080 volumes: diff --git a/lib/gingr/cli.rb b/lib/gingr/cli.rb index 27cb250..608b47d 100644 --- a/lib/gingr/cli.rb +++ b/lib/gingr/cli.rb @@ -80,13 +80,7 @@ def geoserver(filename) option :geoserver_root def unpack(zipfile) zipfile_path = zipfile == File.basename(zipfile) ? 
File.join(ImportUtil.root_path, 'import', zipfile) : zipfile - DataHandler.spatial_root = options[:spatial_root] || ENV.fetch('SPATIAL_ROOT', - Config.default_options[:spatial_root]) - DataHandler.geoserver_root = options[:geoserver_root] || ENV.fetch('GEOSERVER_ROOT', - Config.default_options[:geoserver_root]) - - gingr_watch_root_dir ||= ENV['GINGR_WATCH_DIRECTORY'] || '/opt/app/data/gingr' - DataHandler.processing_root = File.join(gingr_watch_root_dir, 'processing') + set_data_handler(options[:spatial_root], options[:geoserver_root]) DataHandler.extract_and_move(zipfile_path) end @@ -107,12 +101,13 @@ def unpack(zipfile) option :geoserver_secure_url def all(zipfile) unpacked = unpack(zipfile) - solr(unpacked[:extract_to_path]) + total_indexed = solr(unpacked[:extract_to_path]) geofile_names = unpacked[:geofile_name_hash] geoserver_urls = options.slice(:geoserver_url, :geoserver_secure_url).transform_keys(&:to_sym) - Gingr::GeoserverPublisher.publish_inventory(geofile_names, **geoserver_urls) - logger.info("#{zipfile} - all imported") + failed_files = Gingr::GeoserverPublisher.publish_inventory(geofile_names, **geoserver_urls) + + report(total_indexed, failed_files, zipfile) end desc 'geoserver_workspace', 'create a workspace in a geoserver' @@ -126,5 +121,29 @@ def geoserver_workspace(workspace_name = nil) publisher = GeoserverPublisher.new(options[:geoserver_url], default:, workspace_name:) publisher.create_workspace end + + private + + def set_data_handler(spatial_root, goserver_root) + DataHandler.spatial_root = spatial_root || ENV.fetch('SPATIAL_ROOT', + Config.default_options[:spatial_root]) + DataHandler.geoserver_root = goserver_root || ENV.fetch('GEOSERVER_ROOT', + Config.default_options[:geoserver_root]) + gingr_watch_root_dir ||= ENV['GINGR_WATCH_DIRECTORY'] || '/opt/app/data/gingr' + DataHandler.processing_root = File.join(gingr_watch_root_dir, 'processing') + end + + def report(total_indexed, failed_files, zipfile) + if total_indexed.nil? 
+ logger.error('Solr indexing failed') + logger.info("#{zipfile} - not imported") + return + end + logger.info("#{zipfile} - all imported, total records: #{total_indexed}") + return if failed_files.empty? + + logger.warn("#{zipfile} - some shapefile or GeoTIFF files not published to Geoservers") + logger.error("Failed to published geo files: #{failed_files.join('; ')}") + end end end diff --git a/lib/gingr/data_handler.rb b/lib/gingr/data_handler.rb index 222e624..bb83df0 100644 --- a/lib/gingr/data_handler.rb +++ b/lib/gingr/data_handler.rb @@ -20,51 +20,97 @@ class << self attr_accessor :spatial_root, :geoserver_root, :processing_root def extract_and_move(zip_file) - extract_to_path = extract_zipfile(zip_file) + extract_to_path = perform_extraction(zip_file) + summary = prepare_publishing_files(extract_to_path) + geofile_name_hash = analyze_summary(summary) + { extract_to_path:, geofile_name_hash: } + end + + private + + def analyze_summary(summary) + public_map_files = [] + ucb_map_files = [] + summary.each do |summ| + filename = summ[:map_filename] + summ[:public_access] ? public_map_files << filename : ucb_map_files << filename + end + { public_files: public_map_files.compact.reject(&:empty?), ucb_files: ucb_map_files.compact.reject(&:empty?) 
} + end + + # Extracting ingestion zip file to processing directory + def perform_extraction(zip_file) + extract_to_path = prepare_extract_to_path(zip_file) + extract_zipfile(zip_file) + extract_to_path + end - geofile_ingestion_dir_path = move_files(extract_to_path) - { extract_to_path:, geofile_name_hash: get_geofile_name_hash(geofile_ingestion_dir_path) } + def prepare_extract_to_path(zip_file) + dir_name = File.basename(zip_file, '.*') + extract_to_path = File.join(@processing_root, dir_name) + clr_directory(extract_to_path) + extract_to_path + end + + # Moving files to Geoserver and spatial server + def prepare_publishing_files(extract_to_path) + from_geofile_ingestion_path = File.join(extract_to_path, Config.geofile_ingestion_dirname) + subdirectory_list(from_geofile_ingestion_path).map { |dir| move_a_record(dir) } + rescue StandardError => e + logger.error "An error occurred while extracting and moving files from #{from_geofile_ingestion_path}: #{e.message}" end def extract_zipfile(zip_file, to_dir = @processing_root) - extracted_to_path = clr_subdirectory(zip_file) Zip::File.open(zip_file) do |zip| zip.each do |entry| - entry_path = File.join(to_dir, entry.name) - entry.extract(entry_path) { true } + entry.extract(destination_directory: to_dir) { true } end end - extracted_to_path + rescue StandardError => e + logger.error "An unexpected error occurred during unzip #{zip_file}: #{e.message}" + raise end + + # some records may not have a map.zip file + def move_a_record(dir_path) + attributes = record_attributes(dir_path) + arkid = File.basename(dir_path).strip + map_filename = nil + souredata_moved = false - def move_files(from_dir_path) - geofile_ingestion_dir_path = File.join(from_dir_path, Config.geofile_ingestion_dirname) - subdirectory_list(geofile_ingestion_dir_path).each do |subdirectory_path| - move_a_record(subdirectory_path) + subfile_list(dir_path).each do |file| + filename = File.basename(file) + map_filename = move_map_file(file, arkid, 
attributes) if filename == 'map.zip' + souredata_moved = move_source_file(file, arkid, attributes[:public_access]) if filename == 'data.zip' end - geofile_ingestion_dir_path + logger.warning " '#{arkid} has no map.zip file, please check" if map_filename.nil? + logger.warning " '#{arkid} has no data.zip file, please check" unless souredata_moved + { public_access: attributes[:public_access], map_filename: } end - def move_a_record(dir_path) - subfile_list(dir_path).each do |file| - if File.basename(file) == 'map.zip' - dest_dir_path = file_path(dir_path, @geoserver_root) - unzip_map_files(dest_dir_path, file) - else - dest_dir_path = file_path(dir_path, @spatial_root) - mv_spatial_file(dest_dir_path, file) - end - end + def move_map_file(file, arkid, attributes) + dest_dir_path = file_path(@geoserver_root, arkid, attributes[:public_access]) + unzip_map_files(dest_dir_path, file) + format = attributes[:format].downcase + ext = format == 'shapefile' ? '.shp' : '.tif' + "#{arkid}#{ext}" + rescue StandardError => e + logger.error "Failed to move map file '#{file}' for arkid '#{arkid}': #{e.message}" + '' end - # remove the subdirectory if it exists - def clr_subdirectory(zip_file) - subdir_name = File.basename(zip_file, '.*') - subdir_path = File.join(@processing_root, subdir_name) - FileUtils.rm_r(subdir_path) if File.directory? subdir_path - subdir_path + def move_source_file(file, arkid, public_access) + dest_dir_path = file_path(@spatial_root, arkid, public_access) + mv_spatial_file(dest_dir_path, file) + true + rescue StandardError => e + logger.error "Failed to move soucedata '#{file}' for '#{arkid}': #{e.message}" + end + + def clr_directory(directory_name) + FileUtils.rm_r(directory_name) if File.directory? 
directory_name rescue Errno::EACCES - logger.error("Permission denied: #{subdir_path}") + logger.error("Permission denied to clear #{directory_name}") raise end @@ -76,37 +122,13 @@ def subfile_list(directory_path) Pathname(directory_path).children.select(&:file?) end - def get_geofile_name_hash(directory_path) - public_names = [] - ucb_names = [] - subdirectory_list(directory_path).each do |sub_dir| - hash = name_access_hash(sub_dir) - hash[:public_access] ? public_names << hash[:name] : ucb_names << hash[:name] - end - { public: public_names, ucb: ucb_names } - end - - def access_type(dir) - json_hash = geoblacklight_hash(dir) - value = json_hash['dct_accessRights_s'].downcase - value == 'public' ? 'public' : 'UCB' - end - - private - - def geoblacklight_hash(dir) + def record_attributes(dir) json_filepath = File.join(dir, 'geoblacklight.json') json_data = File.read(json_filepath) - JSON.parse(json_data) - end - - def name_access_hash(dir) - basename = File.basename(dir).split('_').last - json_hash = geoblacklight_hash(dir) + json_hash = JSON.parse(json_data) + public_access = json_hash['dct_accessRights_s'].downcase == 'public' format = json_hash['dct_format_s'].downcase - ext = format == 'shapefile' ? '.shp' : '.tif' - access_right = json_hash['dct_accessRights_s'].downcase - { name: "#{basename}#{ext}", public_access: access_right == 'public' } + { public_access:, format: } end def unzip_map_files(dest_dir, map_zipfile) @@ -120,12 +142,12 @@ def mv_spatial_file(dest_dir, file) FileUtils.cp(file, to_file) end - def file_path(dir_path, root) + def file_path(root, arkid, public_access ) # geofiles/spatial/{UCB,public}/berkeley-{arkID} - arkid = File.basename(dir_path).strip - type = access_type(dir_path) + type = public_access ? 
'public' : 'UCB' File.join(root, type, "berkeley-#{arkid}") end + end end end diff --git a/lib/gingr/geoserver_publisher.rb b/lib/gingr/geoserver_publisher.rb index e314dbb..290e5f7 100644 --- a/lib/gingr/geoserver_publisher.rb +++ b/lib/gingr/geoserver_publisher.rb @@ -11,21 +11,24 @@ class GeoserverPublisher DEFAULT_REMOTE_ROOT = '/srv/geofiles' DEFAULT_WORKSPACE = 'UCB' - attr_reader :connection - attr_reader :remote_root - attr_reader :workspace_name + attr_reader :connection, :remote_root, :workspace_name class << self def publish_inventory(inventory, geoserver_url: nil, geoserver_secure_url: nil) - if !inventory[:public].empty? + public_files = inventory[:public_files] + ucb_files = inventory[:ucb_files] + un_published_shapefiles = [] + un_published_geotiffs = [] + unless public_files.empty? public_publisher = new(geoserver_url) - public_publisher.batch_publish(inventory[:public]) + un_published_shapefiles = public_publisher.batch_publish(public_files) end - if !inventory[:ucb].empty? + unless ucb_files.empty? secure_publisher = new(geoserver_secure_url, default: :geoserver_secure_url) - secure_publisher.batch_publish(inventory[:ucb]) + un_published_geotiffs = secure_publisher.batch_publish(ucb_files) end + (un_published_shapefiles + un_published_geotiffs).compact end def parse_connection_string(geoserver_baseurl) @@ -39,7 +42,7 @@ def parse_connection_string(geoserver_baseurl) port: uri.port == uri.default_port ? 
nil : uri.port, path: uri.path, fragment: uri.fragment, - query: uri.query, + query: uri.query ).to_s, uri.user, uri.password end end @@ -63,17 +66,15 @@ def initialize(conn = nil, default: nil, remote_root: nil, workspace_name: nil) end def batch_publish(filenames) - filenames.each(&method(:publish)) + filenames.map(&method(:publish)) end - + def publish(filename) id = File.basename(filename, '.*') file_path = remote_filepath(id, filename) - if File.extname(filename).casecmp?('.shp') - publish_shapefile(file_path, id) - else - publish_geotiff(file_path, id) - end + return publish_shapefile(file_path, id) if File.extname(filename).casecmp?('.shp') + + publish_geotiff(file_path, id) end def create_workspace @@ -92,11 +93,19 @@ def create_workspace def publish_shapefile(file_path, id) logger.debug("Publishing shapefile #{id} to #{geoserver_url}") Geoserver::Publish.shapefile(connection:, workspace_name:, file_path:, id:, title: id) + nil + rescue StandardError => e + logger.error("Error publishing shapefile #{file_path} to #{geoserver_url}: #{e.message}") + file_path end def publish_geotiff(file_path, id) logger.debug("Publishing geotiff #{id} to #{geoserver_url}") Geoserver::Publish.geotiff(connection:, workspace_name:, file_path:, id:, title: id) + nil + rescue StandardError => e + logger.error("Error publishing GeoTIFF #{file_path} to #{geoserver_url}: #{e.message}") + file_path end def remote_filepath(id, filename) diff --git a/lib/gingr/solr_indexer.rb b/lib/gingr/solr_indexer.rb index 9c2f6d2..02deede 100644 --- a/lib/gingr/solr_indexer.rb +++ b/lib/gingr/solr_indexer.rb @@ -8,8 +8,7 @@ module Gingr class SolrIndexer include Logging - attr_accessor :reference_urls - attr_accessor :solr + attr_accessor :reference_urls, :solr def initialize(connection = nil, refurls = nil) connection ||= Gingr::Config.getopt(:solr_url) @@ -28,13 +27,21 @@ def add(doc) logger.debug("Indexing document: #{doc['id']}") update_reference_urls!(doc) @solr.add doc + rescue StandardError 
=> e + logger.error "Indexing document '#{doc['id']}' failed: #{e.message}" + raise end def index_directory(directory) - Find.find(directory) - .select(&method(:json_file?)) - .each(&method(:add)) + total_indexed = Find.find(directory) + .select(&method(:json_file?)) + .each(&method(:add)) + .size @solr.commit + total_indexed + rescue StandardError => e + logger.error "Indexing directory '#{directory}' failed: #{e.message}" + nil end def update_reference_urls!(doc) diff --git a/spec/fixture/zipfile/raster_public.zip b/spec/fixture/zipfile/raster_public.zip new file mode 100644 index 0000000..95a1ebc Binary files /dev/null and b/spec/fixture/zipfile/raster_public.zip differ diff --git a/spec/geoserver_publisher_spec.rb b/spec/geoserver_publisher_spec.rb index 518feec..41418b6 100644 --- a/spec/geoserver_publisher_spec.rb +++ b/spec/geoserver_publisher_spec.rb @@ -106,8 +106,12 @@ around do |test| workspace_client.create(workspace_name:) + Gingr::DataHandler.processing_root = '/opt/app/tmp' + Gingr::DataHandler.spatial_root = '/opt/app/data/spatial' + Gingr::DataHandler.geoserver_root = '/opt/app/data/geoserver' Gingr::DataHandler.extract_and_move('spec/fixture/zipfile/vector_restricted_with_attachment.zip') Gingr::DataHandler.extract_and_move('spec/fixture/zipfile/vector.zip') + Gingr::DataHandler.extract_and_move('spec/fixture/zipfile/raster_public.zip') test.run ensure workspace_client.delete(workspace_name:) @@ -115,16 +119,15 @@ context 'with a public geoserver' do it 'publishes a shapefile' do - subject.publish 'fk4hm6vj5q.shp' + expect(subject.publish('fk4hm6vj5q.shp')).to be_nil end it 'publishes a batch of shapefiles' do - subject.batch_publish %w(fk4hm6vj5q.shp fk4cv64r2x.shp) + expect(subject.batch_publish(%w[fk4hm6vj5q.shp fk4cv64r2x.shp])).to all(be_nil) end it 'publishes a raster file' do - pending 'Missing datafile' - subject.publish '{TO BE CREATED}.rst' + expect(subject.publish('fk4mk7zb4q.tif')).to be_nil end end @@ -132,7 +135,7 @@ let(:default) { 
:geoserver_secure_url } it 'publishes a shapefile' do - subject.publish 's76412.shp' + expect(subject.publish('s76412.shp')).to be_nil end end end