diff --git a/app/models/gazetteer/geonames/name.rb b/app/models/gazetteer/geonames/name.rb
new file mode 100644
index 000000000..c29a63f59
--- /dev/null
+++ b/app/models/gazetteer/geonames/name.rb
@@ -0,0 +1,3 @@
+class Gazetteer::Geonames::Name < ApplicationRecord
+  self.table_name = 'gazetteer_geonames_names'
+end
diff --git a/app/models/gazetteer/wof/ancestor.rb b/app/models/gazetteer/wof/ancestor.rb
new file mode 100644
index 000000000..7198b7cc3
--- /dev/null
+++ b/app/models/gazetteer/wof/ancestor.rb
@@ -0,0 +1,3 @@
+class Gazetteer::Wof::Ancestor < ApplicationRecord
+  self.table_name = 'gazetteer_wof_ancestors'
+end
diff --git a/app/models/gazetteer/wof/concordance.rb b/app/models/gazetteer/wof/concordance.rb
new file mode 100644
index 000000000..0200d2f66
--- /dev/null
+++ b/app/models/gazetteer/wof/concordance.rb
@@ -0,0 +1,3 @@
+class Gazetteer::Wof::Concordance < ApplicationRecord
+  self.table_name = 'gazetteer_wof_concordances'
+end
diff --git a/app/models/gazetteer/wof/geojson.rb b/app/models/gazetteer/wof/geojson.rb
new file mode 100644
index 000000000..7b068c6fe
--- /dev/null
+++ b/app/models/gazetteer/wof/geojson.rb
@@ -0,0 +1,3 @@
+class Gazetteer::Wof::Geojson < ApplicationRecord
+  self.table_name = 'gazetteer_wof_geojson'
+end
diff --git a/app/models/gazetteer/wof/name.rb b/app/models/gazetteer/wof/name.rb
new file mode 100644
index 000000000..938bc3391
--- /dev/null
+++ b/app/models/gazetteer/wof/name.rb
@@ -0,0 +1,3 @@
+class Gazetteer::Wof::Name < ApplicationRecord
+  self.table_name = 'gazetteer_wof_names'
+end
diff --git a/app/models/gazetteer/wof/spr.rb b/app/models/gazetteer/wof/spr.rb
new file mode 100644
index 000000000..1a3364858
--- /dev/null
+++ b/app/models/gazetteer/wof/spr.rb
@@ -0,0 +1,3 @@
+class Gazetteer::Wof::Spr < ApplicationRecord
+  self.table_name = 'gazetteer_wof_spr'
+end
diff --git a/app/models/geoname.rb b/app/models/geoname.rb
deleted file mode 100644
index e1e5a4f1e..000000000
--- a/app/models/geoname.rb
+++ /dev/null
@@ -1,13 +0,0 @@
-class Geoname < ApplicationRecord
-  # include Kithe::Indexable
-
-  # Indexer
-  # def self.kithe_indexable_mapper
-  #   GeonameIndexer.new
-  # end
-
-  # Required by Kithe::Indexable
-  # def friendlier_id
-  #   id
-  # end
-end
diff --git a/db/migrate/20241110202927_create_geonames.rb b/db/migrate/20241110202927_create_geonames.rb
index 2cac0d406..ed878bac4 100644
--- a/db/migrate/20241110202927_create_geonames.rb
+++ b/db/migrate/20241110202927_create_geonames.rb
@@ -24,6 +24,7 @@ def change
       t.timestamps
     end
 
+    # @TODO: Add indexes after importing the data.
     # Indexes
     # add_index :geonames, :geonameid, unique: true
     # add_index :geonames, :name
diff --git a/db/migrate/20241124223351_create_gazetteer_wok_tables.rb b/db/migrate/20241124223351_create_gazetteer_wok_tables.rb
new file mode 100644
index 000000000..1c9d6042a
--- /dev/null
+++ b/db/migrate/20241124223351_create_gazetteer_wok_tables.rb
@@ -0,0 +1,72 @@
+class CreateGazetteerWokTables < ActiveRecord::Migration[7.0]
+  def change
+    rename_table :geonames, :gazetteer_geonames_names
+    rename_column :gazetteer_geonames_names, :geonameid, :geoname_id
+
+    create_table :gazetteer_wof_ancestors do |t|
+      t.bigint :wok_id
+      t.integer :ancestor_id
+      t.string :ancestor_placetype
+      t.integer :lastmodified
+      t.timestamps
+    end
+
+    create_table :gazetteer_wof_concordances do |t|
+      t.bigint :wok_id
+      t.string :other_id
+      t.string :other_source
+      t.integer :lastmodified
+      t.timestamps
+    end
+
+    create_table :gazetteer_wof_geojson do |t|
+      t.bigint :wok_id
+      t.text :body
+      t.string :source
+      t.string :alt_label
+      t.boolean :is_alt
+      t.integer :lastmodified
+      t.timestamps
+    end
+
+    create_table :gazetteer_wof_names do |t|
+      t.bigint :wok_id
+      t.string :placetype
+      t.string :country
+      t.string :language
+      t.string :extlang
+      t.string :script
+      t.string :region
+      t.string :variant
+      t.string :extension
+      t.string :privateuse
+      t.string :name
+      t.integer :lastmodified
+      t.timestamps
+    end
+
+    create_table :gazetteer_wof_spr do |t|
+      t.bigint :wok_id
+      t.integer :parent_id
+      t.string :name
+      t.string :placetype
+      t.string :country
+      t.string :repo
+      t.decimal :latitude
+      t.decimal :longitude
+      t.decimal :min_latitude
+      t.decimal :min_longitude
+      t.decimal :max_latitude
+      t.decimal :max_longitude
+      t.integer :is_current
+      t.integer :is_deprecated
+      t.integer :is_ceased
+      t.integer :is_superseded
+      t.integer :is_superseding
+      t.integer :superseded_by
+      t.integer :supersedes
+      t.integer :lastmodified
+      t.timestamps
+    end
+  end
+end
diff --git a/db/schema.rb b/db/schema.rb
index 74e70eb85..5c2e07e28 100644
--- a/db/schema.rb
+++ b/db/schema.rb
@@ -10,7 +10,7 @@
 #
 # It's strongly recommended that you check this file into your version control system.
 
-ActiveRecord::Schema[7.0].define(version: 2024_11_10_202927) do
+ActiveRecord::Schema[7.0].define(version: 2024_11_24_223351) do
   # These are extensions that must be enabled in order to support this database
   enable_extension "pgcrypto"
   enable_extension "plpgsql"
@@ -84,6 +84,16 @@
     t.index ["blob_id", "variation_digest"], name: "index_active_storage_variant_records_uniqueness", unique: true
   end
 
+  create_table "ancestors", id: false, force: :cascade do |t|
+    t.bigint "id"
+    t.bigint "ancestor_id"
+    t.text "ancestor_placetype"
+    t.bigint "lastmodified"
+    t.index ["ancestor_id", "ancestor_placetype", "lastmodified"], name: "idx_17618719_ancestors_by_ancestor"
+    t.index ["id", "ancestor_placetype", "lastmodified"], name: "idx_17618719_ancestors_by_id"
+    t.index ["lastmodified"], name: "idx_17618719_ancestors_by_lastmod"
+  end
+
   create_table "blacklight_allmaps_sidecars", force: :cascade do |t|
     t.string "solr_document_id"
     t.string "document_type", default: "SolrDocument"
@@ -221,6 +231,17 @@
     t.datetime "updated_at", null: false
   end
 
+  create_table "concordances", id: false, force: :cascade do |t|
+    t.bigint "id"
+    t.text "other_id"
+    t.text "other_source"
+    t.bigint "lastmodified"
+    t.index ["id", "lastmodified"], name: "idx_17618724_concordances_by_id"
+    t.index ["lastmodified"], name: "idx_17618724_concordances_by_lastmod"
+    t.index ["other_source", "other_id", "lastmodified"], name: "idx_17618724_concordances_by_other_lastmod"
+    t.index ["other_source", "other_id"], name: "idx_17618724_concordances_by_other_id"
+  end
+
   create_table "document_accesses", force: :cascade do |t|
     t.string "friendlier_id", null: false
     t.string "institution_code", null: false
@@ -311,8 +332,8 @@
     t.datetime "updated_at", null: false
   end
 
-  create_table "geonames", force: :cascade do |t|
-    t.bigint "geonameid"
+  create_table "gazetteer_geonames_names", force: :cascade do |t|
+    t.bigint "geoname_id"
     t.string "name"
     t.string "asciiname"
     t.text "alternatenames"
@@ -335,6 +356,86 @@
     t.datetime "updated_at", null: false
   end
 
+  create_table "gazetteer_wof_ancestors", force: :cascade do |t|
+    t.bigint "wok_id"
+    t.integer "ancestor_id"
+    t.string "ancestor_placetype"
+    t.integer "lastmodified"
+    t.datetime "created_at", null: false
+    t.datetime "updated_at", null: false
+  end
+
+  create_table "gazetteer_wof_concordances", force: :cascade do |t|
+    t.bigint "wok_id"
+    t.string "other_id"
+    t.string "other_source"
+    t.integer "lastmodified"
+    t.datetime "created_at", null: false
+    t.datetime "updated_at", null: false
+  end
+
+  create_table "gazetteer_wof_geojson", force: :cascade do |t|
+    t.bigint "wok_id"
+    t.text "body"
+    t.string "source"
+    t.string "alt_label"
+    t.boolean "is_alt"
+    t.integer "lastmodified"
+    t.datetime "created_at", null: false
+    t.datetime "updated_at", null: false
+  end
+
+  create_table "gazetteer_wof_names", force: :cascade do |t|
+    t.bigint "wok_id"
+    t.string "placetype"
+    t.string "country"
+    t.string "language"
+    t.string "extlang"
+    t.string "script"
+    t.string "region"
+    t.string "variant"
+    t.string "extension"
+    t.string "privateuse"
+    t.string "name"
+    t.integer "lastmodified"
+    t.datetime "created_at", null: false
+    t.datetime "updated_at", null: false
+  end
+
+  create_table "gazetteer_wof_spr", force: :cascade do |t|
+    t.bigint "wok_id"
+    t.integer "parent_id"
+    t.string "name"
+    t.string "placetype"
+    t.string "country"
+    t.string "repo"
+    t.decimal "latitude"
+    t.decimal "longitude"
+    t.decimal "min_latitude"
+    t.decimal "min_longitude"
+    t.decimal "max_latitude"
+    t.decimal "max_longitude"
t.integer "is_current" + t.integer "is_deprecated" + t.integer "is_ceased" + t.integer "is_superseded" + t.integer "is_superseding" + t.integer "superseded_by" + t.integer "supersedes" + t.integer "lastmodified" + t.datetime "created_at", null: false + t.datetime "updated_at", null: false + end + + create_table "geojson", id: false, force: :cascade do |t| + t.bigint "id" + t.text "body" + t.text "source" + t.text "alt_label" + t.boolean "is_alt" + t.bigint "lastmodified" + end + create_table "image_upload_transitions", force: :cascade do |t| t.string "to_state", null: false t.text "metadata" @@ -448,6 +549,21 @@ t.index ["import_id"], name: "index_mappings_on_import_id" end + create_table "names", id: false, force: :cascade do |t| + t.bigint "id" + t.text "placetype" + t.text "country" + t.text "language" + t.text "extlang" + t.text "script" + t.text "region" + t.text "variant" + t.text "extension" + t.text "privateuse" + t.text "name" + t.bigint "lastmodified" + end + create_table "notifications", force: :cascade do |t| t.string "recipient_type", null: false t.bigint "recipient_id", null: false @@ -522,6 +638,29 @@ t.index ["document_type", "document_id"], name: "solr_document_uris_solr_document" end + create_table "spr", id: false, force: :cascade do |t| + t.bigint "id" + t.bigint "parent_id" + t.text "name" + t.text "placetype" + t.text "country" + t.text "repo" + t.float "latitude" + t.float "longitude" + t.float "min_latitude" + t.float "min_longitude" + t.float "max_latitude" + t.float "max_longitude" + t.bigint "is_current" + t.bigint "is_deprecated" + t.bigint "is_ceased" + t.bigint "is_superseded" + t.bigint "is_superseding" + t.text "superseded_by" + t.text "supersedes" + t.bigint "lastmodified" + end + create_table "uri_transitions", force: :cascade do |t| t.string "to_state", null: false t.text "metadata" diff --git a/lib/tasks/geoportal/gazetteer/geonames.rake b/lib/tasks/geoportal/gazetteer/geonames.rake index 278e748ca..5f36d7ab0 100644 --- a/lib/tasks/geoportal/gazetteer/geonames.rake +++ b/lib/tasks/geoportal/gazetteer/geonames.rake @@ -74,7 +74,7 @@ namespace :geoportal do text = f.readline row = CSV.parse_line(text, col_sep: "\t", headers: false) geonames << { - geonameid: row[0], + geoname_id: row[0], name: row[1], asciiname: row[2], alternatenames: row[3], @@ -97,7 +97,7 @@ namespace :geoportal do # Import every 100000 records if geonames.size >= 100000 - Geoname.import(geonames, validate: false) + Gazetteer::Geonames::Name.import(geonames, validate: false) geonames.clear end @@ -110,7 +110,7 @@ namespace :geoportal do end # Import any remaining records - Geoname.import(geonames, validate: false) unless geonames.empty? + Gazetteer::Geonames::Name.import(geonames, validate: false) unless geonames.empty? puts "Geonames import completed successfully." end @@ -125,7 +125,7 @@ namespace :geoportal do connection.execute <<-SQL COPY ( SELECT - geonameid AS geonameid_i, + geoname_id AS geonameid_i, name, asciiname AS asciiname_s, alternatenames AS alternatenames_s, diff --git a/lib/tasks/geoportal/gazetteer/wof.rake b/lib/tasks/geoportal/gazetteer/wof.rake index 6fb56f322..975e7aac3 100644 --- a/lib/tasks/geoportal/gazetteer/wof.rake +++ b/lib/tasks/geoportal/gazetteer/wof.rake @@ -3,6 +3,7 @@ require 'csv' require 'fileutils' require 'open-uri' require 'rsolr' +require 'sqlite3' # Geoportal Gazetteer Who's on First Tasks # Order of execution: @@ -46,9 +47,384 @@ namespace :geoportal do puts "Download and extraction completed successfully." 
end - desc "Import Who's on First data into Rails" + # Imports the data, but some is duplicated from the import above. + desc "Export SQLite tables to CSV and import into PostgreSQL" task import: :environment do - # todo + file_path = Rails.root.join('db', 'gazetteer', 'wof', 'whosonfirst-data-admin-us-latest.db') + csv_dir = Rails.root.join('db', 'gazetteer', 'wof', 'csv') + FileUtils.mkdir_p(csv_dir) + + puts "Exporting SQLite tables to CSV from #{file_path}..." + + # Open the SQLite database + db = SQLite3::Database.new(file_path.to_s) + + # Get the list of tables + tables = db.execute("SELECT name FROM sqlite_master WHERE type='table';").flatten + + # Export each table to a CSV file + tables.each do |table| + csv_file = File.join(csv_dir, "#{table}.csv") + CSV.open(csv_file, 'wb') do |csv| + db.execute2("SELECT * FROM #{table};") do |row| + csv << row + end + end + puts "Exported #{table} to #{csv_file}" + end + + # Call the individual import tasks + Rake::Task['geoportal:gazetteer:wof:import_ancestors'].invoke + Rake::Task['geoportal:gazetteer:wof:import_concordances'].invoke + Rake::Task['geoportal:gazetteer:wof:import_geojson'].invoke + Rake::Task['geoportal:gazetteer:wof:import_names'].invoke + Rake::Task['geoportal:gazetteer:wof:import_spr'].invoke + puts "CSV export and import completed successfully." + end + + desc "Import Ancestors CSV file into PostgreSQL" + task import_ancestors: :environment do + file_path = Rails.root.join('db', 'gazetteer', 'wof', 'csv', 'gazetteer_wof_ancestors.csv') + + # Check if the file exists + unless File.exist?(file_path) + puts "File not found. Downloading..." + Rake::Task['geoportal:gazetteer:wof:download'].invoke + end + + # Geonames Array + ancestors = [] + + # Count the total number of lines in the file + total_lines = `wc -l "#{file_path}"`.strip.split(' ')[0].to_i + + # Initialize the progress bar + progress_bar = ProgressBar.create(total: total_lines, format: "%a %b\u{15E7}%i %p%% %t") + + # Open the file + File.open(file_path, 'r') do |f| + until f.eof? + begin + text = f.readline + row = CSV.parse_line(text, headers: false) + + # Ancestor Table Schema + # t.bigint :wok_id + # t.integer :ancestor_id + # t.string :ancestor_placetype + # t.integer :lastmodified + ancestors << { + wok_id: row[0].to_i, + ancestor_id: row[1].to_i, + ancestor_placetype: row[2].to_s, + lastmodified: row[3].to_i, + } + + # Import every 100000 records + if ancestors.size >= 100000 + Gazetteer::Wof::Ancestor.import(ancestors, validate: false) + ancestors.clear + end + + # Increment the progress bar + progress_bar.increment + rescue StandardError => e + puts "Error processing line: #{e.message}" + end + end + end + + # Import any remaining records + Gazetteer::Wof::Ancestor.import(ancestors, validate: false) unless ancestors.empty? + + puts "Imported ancestors from #{file_path} into PostgreSQL" + end + + desc "Import Concordances CSV file into PostgreSQL" + task import_concordances: :environment do + file_path = Rails.root.join('db', 'gazetteer', 'wof', 'csv', 'gazetteer_wof_concordances.csv') + + # Check if the file exists + unless File.exist?(file_path) + puts "File not found. Downloading..." 
+          Rake::Task['geoportal:gazetteer:wof:download'].invoke
+        end
+
+        # Concordances Array
+        concordances = []
+
+        # Count the total number of lines in the file
+        total_lines = `wc -l "#{file_path}"`.strip.split(' ')[0].to_i
+
+        # Initialize the progress bar
+        progress_bar = ProgressBar.create(total: total_lines, format: "%a %b\u{15E7}%i %p%% %t")
+
+        # Open the file
+        File.open(file_path, 'r') do |f|
+          until f.eof?
+            begin
+              text = f.readline
+              row = CSV.parse_line(text, headers: false)
+
+              # Concordance Table Schema
+              # t.bigint :wok_id
+              # t.string :other_id
+              # t.string :other_source
+              # t.integer :lastmodified
+              concordances << {
+                wok_id: row[0].to_i,
+                other_id: begin
+                  id = row[1].to_s
+                  id.end_with?('.0') ? id.chomp('.0') : id
+                end,
+                other_source: row[2].to_s,
+                lastmodified: row[3].to_i,
+              }
+
+              # Import every 100000 records
+              if concordances.size >= 100000
+                Gazetteer::Wof::Concordance.import(concordances, validate: false)
+                concordances.clear
+              end
+
+              # Increment the progress bar
+              progress_bar.increment
+            rescue StandardError => e
+              puts "Error processing line: #{e.message}"
+            end
+          end
+        end
+
+        # Import any remaining records
+        Gazetteer::Wof::Concordance.import(concordances, validate: false) unless concordances.empty?
+
+        puts "Imported concordances from #{file_path} into PostgreSQL"
+      end
+
+      desc "Import GeoJSON CSV file into PostgreSQL"
+      task import_geojson: :environment do
+        file_path = Rails.root.join('db', 'gazetteer', 'wof', 'csv', 'gazetteer_wof_geojson.csv')
+
+        # Check if the file exists
+        unless File.exist?(file_path)
+          puts "File not found. Downloading..."
+          Rake::Task['geoportal:gazetteer:wof:download'].invoke
+        end
+
+        # GeoJSON Array
+        geojson_records = []
+
+        # Count the total number of lines in the file
+        total_lines = `wc -l "#{file_path}"`.strip.split(' ')[0].to_i
+
+        # Initialize the progress bar
+        progress_bar = ProgressBar.create(total: total_lines, format: "%a %b\u{15E7}%i %p%% %t")
+
+        # Open the file
+        File.open(file_path, 'r') do |f|
+          until f.eof?
+            begin
+              text = f.readline
+              row = CSV.parse_line(text, headers: false)
+
+              # GeoJSON Table Schema
+              # t.bigint :wok_id
+              # t.text :body
+              # t.string :source
+              # t.string :alt_label
+              # t.boolean :is_alt
+              # t.integer :lastmodified
+              geojson_records << {
+                wok_id: row[0].to_i,
+                body: row[1].to_s,
+                source: row[2].to_s,
+                alt_label: row[3].to_s,
+                is_alt: %w[1 t true].include?(row[4].to_s.downcase), # SQLite exports booleans as 0/1
+                lastmodified: row[5].to_i,
+              }
+
+              # Import every 100000 records
+              if geojson_records.size >= 100000
+                Gazetteer::Wof::Geojson.import(geojson_records, validate: false)
+                geojson_records.clear
+              end
+
+              # Increment the progress bar
+              progress_bar.increment
+            rescue StandardError => e
+              puts "Error processing line: #{e.message}"
+            end
+          end
+        end
+
+        # Import any remaining records
+        Gazetteer::Wof::Geojson.import(geojson_records, validate: false) unless geojson_records.empty?
+
+        puts "Imported geojson records from #{file_path} into PostgreSQL"
+      end
+
+      desc "Import Names CSV file into PostgreSQL"
+      task import_names: :environment do
+        file_path = Rails.root.join('db', 'gazetteer', 'wof', 'csv', 'gazetteer_wof_names.csv')
+
+        # Check if the file exists
+        unless File.exist?(file_path)
+          puts "File not found. Downloading..."
+          Rake::Task['geoportal:gazetteer:wof:download'].invoke
+        end
+
+        # Names Array
+        names_records = []
+
+        # Count the total number of lines in the file
+        total_lines = `wc -l "#{file_path}"`.strip.split(' ')[0].to_i
+
+        # Initialize the progress bar
+        progress_bar = ProgressBar.create(total: total_lines, format: "%a %b\u{15E7}%i %p%% %t")
+
+        # Open the file
+        File.open(file_path, 'r') do |f|
+          until f.eof?
+            begin
+              text = f.readline
+              row = CSV.parse_line(text, headers: false)
+
+              # Names Table Schema
+              # t.bigint :wok_id
+              # t.string :placetype
+              # t.string :country
+              # t.string :language
+              # t.string :extlang
+              # t.string :script
+              # t.string :region
+              # t.string :variant
+              # t.string :extension
+              # t.string :privateuse
+              # t.string :name
+              # t.integer :lastmodified
+              names_records << {
+                wok_id: row[0].to_i,
+                placetype: row[1].to_s,
+                country: row[2].to_s,
+                language: row[3].to_s,
+                extlang: row[4].to_s,
+                script: row[5].to_s,
+                region: row[6].to_s,
+                variant: row[7].to_s,
+                extension: row[8].to_s,
+                privateuse: row[9].to_s,
+                name: row[10].to_s,
+                lastmodified: row[11].to_i,
+              }
+
+              # Import every 100000 records
+              if names_records.size >= 100000
+                Gazetteer::Wof::Name.import(names_records, validate: false)
+                names_records.clear
+              end
+
+              # Increment the progress bar
+              progress_bar.increment
+            rescue StandardError => e
+              puts "Error processing line: #{e.message}"
+            end
+          end
+        end
+
+        # Import any remaining records
+        Gazetteer::Wof::Name.import(names_records, validate: false) unless names_records.empty?
+
+        puts "Imported names records from #{file_path} into PostgreSQL"
+      end
+
+      desc "Import SPR CSV file into PostgreSQL"
+      task import_spr: :environment do
+        file_path = Rails.root.join('db', 'gazetteer', 'wof', 'csv', 'gazetteer_wof_spr.csv')
+
+        # Check if the file exists
+        unless File.exist?(file_path)
+          puts "File not found. Downloading..."
+          Rake::Task['geoportal:gazetteer:wof:download'].invoke
+        end
+
+        # SPR Records Array
+        spr_records = []
+
+        # Count the total number of lines in the file
+        total_lines = `wc -l "#{file_path}"`.strip.split(' ')[0].to_i
+
+        # Initialize the progress bar
+        progress_bar = ProgressBar.create(total: total_lines, format: "%a %b\u{15E7}%i %p%% %t")
+
+        # Open the file
+        File.open(file_path, 'r') do |f|
+          until f.eof?
+            begin
+              text = f.readline
+              row = CSV.parse_line(text, headers: false)
+
+              # SPR Table Schema
+              # t.bigint :wok_id
+              # t.integer :parent_id
+              # t.string :name
+              # t.string :placetype
+              # t.string :country
+              # t.string :repo
+              # t.decimal :latitude
+              # t.decimal :longitude
+              # t.decimal :min_latitude
+              # t.decimal :min_longitude
+              # t.decimal :max_latitude
+              # t.decimal :max_longitude
+              # t.integer :is_current
+              # t.integer :is_deprecated
+              # t.integer :is_ceased
+              # t.integer :is_superseded
+              # t.integer :is_superseding
+              # t.integer :superseded_by
+              # t.integer :supersedes
+              # t.integer :lastmodified
+              spr_records << {
+                wok_id: row[0].to_i,
+                parent_id: row[1].to_i,
+                name: row[2].to_s,
+                placetype: row[3].to_s,
+                country: row[4].to_s,
+                repo: row[5].to_s,
+                latitude: row[6].to_d,
+                longitude: row[7].to_d,
+                min_latitude: row[8].to_d,
+                min_longitude: row[9].to_d,
+                max_latitude: row[10].to_d,
+                max_longitude: row[11].to_d,
+                is_current: row[12].to_i,
+                is_deprecated: row[13].to_i,
+                is_ceased: row[14].to_i,
+                is_superseded: row[15].to_i,
+                is_superseding: row[16].to_i,
+                superseded_by: row[17].to_i,
+                supersedes: row[18].to_i,
+                lastmodified: row[19].to_i,
+              }
+
+              # Import every 100000 records
+              if spr_records.size >= 100000
+                Gazetteer::Wof::Spr.import(spr_records, validate: false)
+                spr_records.clear
+              end
+
+              # Increment the progress bar
+              progress_bar.increment
+            rescue StandardError => e
+              puts "Error processing line: #{e.message}"
+            end
+          end
+        end
+
+        # Import any remaining records
+        Gazetteer::Wof::Spr.import(spr_records, validate: false) unless spr_records.empty?
+
+        puts "Imported SPR records from #{file_path} into PostgreSQL"
       end
     end
   end
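
Not part of the diff: a minimal usage sketch of the pipeline this change sets up, assuming the download, export, and import tasks above have run to completion. The model and column names come from the diff itself; the WOF id used here (85633793, the United States) and the 'eng' language code are only illustrative values.

    # Run the pipeline, then query the new WOF tables, which share wok_id:
    #   bin/rails geoportal:gazetteer:wof:download
    #   bin/rails geoportal:gazetteer:wof:import
    spr = Gazetteer::Wof::Spr.find_by(wok_id: 85633793)
    if spr
      # Names and ancestors are joined to the SPR record through wok_id.
      english_names = Gazetteer::Wof::Name.where(wok_id: spr.wok_id, language: 'eng').pluck(:name).uniq
      ancestor_count = Gazetteer::Wof::Ancestor.where(wok_id: spr.wok_id).count
      puts "#{spr.name} (#{spr.placetype}): #{english_names.join(', ')}; #{ancestor_count} ancestors"
    end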