Skip to content

Commit

Permalink
Only run DR5HN import when needed
Browse files Browse the repository at this point in the history
  • Loading branch information
nigelhorne committed Feb 29, 2024
1 parent 64ee24c commit 54a68bf
Show file tree
Hide file tree
Showing 5 changed files with 137 additions and 59 deletions.
1 change: 1 addition & 0 deletions MANIFEST
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ t/cities.t
t/comment-spelling.t
t/coverage.t
t/critic.t
t/dr5hn.t
t/eof.t
t/eol.t
t/fixme.t
Expand Down
126 changes: 68 additions & 58 deletions createdatabase.PL
Original file line number Diff line number Diff line change
Expand Up @@ -1988,13 +1988,13 @@ if(my $whosonfirst = $ENV{'WHOSONFIRST_HOME'}) {
$l2_cache->clear();
undef %l1_cache;
undef $l2_cache;
}

if(BUILDMODE eq 'ONE') {
undef $ENV{'DR5HN_HOME'};
undef $ENV{'OSM_HOME'};
undef $ENV{'WHOSONFIRST_HOME'};
undef $oa;
if(BUILDMODE eq 'ONE') {
undef $ENV{'DR5HN_HOME'};
undef $ENV{'OSM_HOME'};
undef $ENV{'WHOSONFIRST_HOME'};
undef $oa;
}
}

if($dbh) {
Expand Down Expand Up @@ -2025,58 +2025,6 @@ foreach my $country(@whosonfirst_only_countries) {
flush_queue($dbh, $redis, $mongodb, $berkeley_db);
# %whosonfirst = ();

if(my $dr5hn_home = $ENV{'DR5HN_HOME'}) {
	# Import city records from the dr5hn JSON dump found under $DR5HN_HOME.
	# Only entries whose iso3 code is USA, CAN or AUS are loaded.
	my $filename = File::Spec->catfile($dr5hn_home, 'countries+states+cities.json');

	# Progress line: unbuffer STDOUT just long enough to show the file name
	$| = 1;
	printf "%-70s\r", $filename;
	if(DEBUG) {
		print "\n";
	}
	$| = 0;

	my $decoder = JSON::MaybeXS->new()->utf8();

	# Slurp the whole file, then decode it into an array ref of countries
	my $json_text = File::Slurp::read_file($filename);
	my $countries = $decoder->decode($json_text);

	COUNTRY: foreach my $entry(@{$countries}) {
		# undef %digests_added;

		# Ignore everything except the US, Canada and Australia
		# TODO: handle GBR
		next COUNTRY if(($entry->{'iso3'} ne 'USA') && ($entry->{'iso3'} ne 'CAN') && ($entry->{'iso3'} ne 'AUS'));

		# Rows for the United States are stored with the country 'US'
		$entry->{'name'} = 'US' if($entry->{'name'} eq 'United States');

		foreach my $state(@{$entry->{'states'}}) {
			foreach my $city(@{$state->{'cities'}}) {
				my %record = (
					'COUNTRY' => $entry->{'name'},
					'STATE' => $state->{'state_code'},
					'CITY' => $city->{'name'},
					'LAT' => $city->{'latitude'},
					'LON' => $city->{'longitude'},
				);
				$inserts += import(row => \%record, file => $filename, ua => $ua, dbh => $dbh, redis => $redis, mongodb => $mongodb, berkeley_db => $berkeley_db);
				# print Data::Dumper->new([\%record])->Dump();

				# Flush the queue once enough inserts have accumulated
				next if($inserts < MAX_INSERT_COUNT);
				flush_queue($dbh, $redis, $mongodb, $berkeley_db);
				$inserts = 0;
			}
		}
	}

	# Write out whatever is still queued
	flush_queue($dbh, $redis, $mongodb, $berkeley_db);
	$inserts = 0;

	if(BUILDMODE eq 'ONE') {
		# In 'ONE' mode, clear the data source settings and $oa once this source has been imported
		foreach my $source_var('DR5HN_HOME', 'OSM_HOME', 'WHOSONFIRST_HOME') {
			undef $ENV{$source_var};
		}
		undef $oa;
	}
}

if(my $osm = $ENV{'OSM_HOME'}) {
# Openstreetmap
# There are a range of differing formats -
Expand Down Expand Up @@ -2611,8 +2559,70 @@ print __LINE__, ": add_record = $add_record (city/state/country = $city/$state/$

flush_queue($dbh, $redis, $mongodb, $berkeley_db); # Check for hanging dups in current state
$inserts = 0;
} elsif(my $dr5hn = $ENV{'DR5HN_HOME'}) {
	# Import city records from the dr5hn JSON dump found under $DR5HN_HOME.
	# As an elsif, this runs only when the preceding branch of the chain
	# did not (NOTE(review): presumably the OSM_HOME import - confirm).
	# Only entries whose iso3 code is USA, CAN or AUS are loaded.

	# my @files = (
	#	'cities',
	#	'countries+states+cities',
	#	'states+cities',
	#	'countries+cities',
	#	'countries+states',
	#	'states',
	#	'countries',
	#	'regions',
	#	'subregions'
	# );

	my $filename = File::Spec->catfile($dr5hn, 'countries+states+cities.json');

	# Progress line: unbuffer STDOUT just long enough to show the file name
	$| = 1;
	printf "%-70s\r", $filename;
	print "\n" if(DEBUG);
	$| = 0;

	my $j = JSON::MaybeXS->new()->utf8();

	# Slurp the whole file, then decode it into an array ref of countries
	my $data = File::Slurp::read_file($filename);
	$data = $j->decode($data);

	foreach my $country(@{$data}) {
		# undef %digests_added;
		if(($country->{'iso3'} eq 'USA') || ($country->{'iso3'} eq 'CAN') || ($country->{'iso3'} eq 'AUS')) {
			# Rows for the United States are stored with the country 'US'
			if($country->{'name'} eq 'United States') {
				$country->{'name'} = 'US';
			}
			foreach my $state(@{$country->{'states'}}) {
				foreach my $city(@{$state->{'cities'}}) {
					my $row = {
						'COUNTRY' => $country->{'name'},
						'STATE' => $state->{'state_code'},
						'CITY' => $city->{'name'},
						'LAT' => $city->{'latitude'},
						'LON' => $city->{'longitude'},
					};
					$inserts += import(row => $row, file => $filename, ua => $ua, dbh => $dbh, redis => $redis, mongodb => $mongodb, berkeley_db => $berkeley_db);
					# Dump each row only when debugging, so that a normal
					# build does not print every imported city
					print Data::Dumper->new([$row])->Dump() if(DEBUG);
					# Flush the queue once enough inserts have accumulated
					if($inserts >= MAX_INSERT_COUNT) {
						flush_queue($dbh, $redis, $mongodb, $berkeley_db);
						$inserts = 0;
					}
				}
			}
		# } elsif($country->{'iso3'} eq 'GBR') {
			# TODO
		}
	}

	# Write out whatever is still queued
	flush_queue($dbh, $redis, $mongodb, $berkeley_db);
	$inserts = 0;

	if(BUILDMODE eq 'ONE') {
		# In 'ONE' mode, clear the data source settings and $oa once this source has been imported
		undef $ENV{'DR5HN_HOME'};
		undef $ENV{'OSM_HOME'};
		undef $ENV{'WHOSONFIRST_HOME'};
		undef $oa;
	}
}


# undef %digests_added;
# $| = 1;
# printf "%-70s\r", 'creating cities';
Expand Down
4 changes: 4 additions & 0 deletions lib/Geo/Coder/Free.pm
Original file line number Diff line number Diff line change
Expand Up @@ -423,6 +423,10 @@ scripts are trivial.
This can take a long time because it contains lots of directories which filesystem drivers
seem to take a long time to navigate (at least my EXT4 and ZFS systems do).
2. Install L<https://github.com/dr5hn/countries-states-cities-database.git> into $DR5HN_HOME.
This data contains cities only,
so it's not used if OSM_HOME is set,
since the latter is much more comprehensive.
Also, only Australia, Canada and the US are imported, as the UK data is difficult to parse.
3. Run bin/download_databases - this will download the WhosOnFirst, Openaddr,
Open Street Map and dr5hn databases.
Check the values of OSM_HOME, OPENADDR_HOME,
Expand Down
7 changes: 6 additions & 1 deletion lib/Geo/Coder/Free/OpenAddresses.pm
Original file line number Diff line number Diff line change
Expand Up @@ -883,7 +883,12 @@ sub _get {
$location =~ s/,\s*//g;

# ::diag(__PACKAGE__, ': ', __LINE__, ": _get: $location");
my $digest = substr Digest::MD5::md5_base64(uc($location)), 0, 16;
my $digest;
if(length($location) <= 16) {
$digest = uc($location);
} else {
$digest = substr Digest::MD5::md5_base64(uc($location)), 0, 16;
}
# print __PACKAGE__, ': ', __LINE__, ': ', uc($location), " = $digest\n";

if(defined($unknown_locations{$digest})) {
Expand Down
58 changes: 58 additions & 0 deletions t/dr5hn.t
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
#!perl -w

# Author test for geocoding with the dr5hn data.
#
# The geocoding tests run only when DR5HN_HOME and OPENADDR_HOME are both set
# and AUTHOR_TESTING is true; otherwise the four tests that follow use_ok()
# are skipped.
#
# TODO: Try using Test::Without::Module to try without Geo::libpostal if that
#	is installed

use warnings;
use strict;
use Data::Dumper;
use Test::Most tests => 5;
use Test::Number::Delta;
use Test::Carp;
use Test::Deep;
use lib 't/lib';
use MyLogger;

BEGIN {
	use_ok('Geo::Coder::Free');
}

DR5HN: {
	SKIP: {
		# skip() leaves the SKIP block, so each guard below ends the test early
		if(!($ENV{'DR5HN_HOME'} && $ENV{'OPENADDR_HOME'})) {
			diag('Set DR5HN_HOME and OPENADDR_HOME to enable dr5hn testing');
			skip('DR5HN_HOME and/or OPENADDR_HOME not defined', 4);
		}
		if(!$ENV{AUTHOR_TESTING}) {
			diag('Author tests not required for installation');
			skip('Author tests not required for installation', 4);
		}

		diag('This will take some time and memory');

		# Probe for the optional Geo::libpostal module; the result is only
		# reported, it does not change which tests are run
		my $libpostal_is_installed = eval { require Geo::libpostal; 1 } ? 1 : 0;

		if($ENV{'TEST_VERBOSE'}) {
			diag('Geo::libpostal is ', ($libpostal_is_installed ? '' : 'not '), 'installed');
			Database::Abstraction::init(logger => MyLogger->new());
		}

		my $geo_coder = new_ok('Geo::Coder::Free');
		my $location = $geo_coder->geocode(location => 'Silver Spring, MD, USA');
		ok(defined($location), 'geocode() returns a location for Silver Spring, MD, USA');
		cmp_deeply($location,
			methods('lat' => num(38.99, 1e-2), 'long' => num(-77.02, 1e-1)),
			'Silver Spring, MD, USA is at the expected latitude and longitude');

		diag(Data::Dumper->new([$location])->Dump()) if($ENV{'TEST_VERBOSE'});

		# Test::Memory::Cycle is optional: load it at run time and skip the
		# final test when it is not available
		if(eval { require Test::Memory::Cycle; Test::Memory::Cycle->import(); 1 }) {
			memory_cycle_ok($geo_coder);
		} else {
			skip('Test::Memory::Cycle required to check for circular memory references', 1);
		}
	}
}

0 comments on commit 54a68bf

Please sign in to comment.