diff --git a/Cargo.lock b/Cargo.lock index a4342af14..80509109f 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1948,8 +1948,7 @@ dependencies = [ [[package]] name = "geosuggest-core" version = "0.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6d339ac9809b2cdb8abdb87df2971877c4eb48deaec9d1ee9e30d2721d72ce43" +source = "git+https://github.com/estin/geosuggest?rev=5c6b08b#5c6b08bbc9211972b489d5cfa13ce13cde42cb43" dependencies = [ "bincode", "csv", @@ -1959,22 +1958,18 @@ dependencies = [ "serde", "serde_json", "strsim", - "tracing", ] [[package]] name = "geosuggest-utils" version = "0.3.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a3609965d29046d62fc2ed4589df7b5f3da1142073eb49d14cdd518572674637" +source = "git+https://github.com/estin/geosuggest?rev=5c6b08b#5c6b08bbc9211972b489d5cfa13ce13cde42cb43" dependencies = [ "anyhow", "futures", "geosuggest-core", "reqwest", "tokio", - "tracing", - "tracing-subscriber", "zip", ] @@ -2813,15 +2808,6 @@ version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3e2e65a1a2e43cfcb47a895c4c8b10d1f4a61097f9f254f183aee60cad9c651d" -[[package]] -name = "matchers" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8263075bb86c5a1b1427b5ae862e8889656f126e9f77c484496e8b47cf5c5558" -dependencies = [ - "regex-automata 0.1.10", -] - [[package]] name = "matches" version = "0.1.10" @@ -3073,16 +3059,6 @@ dependencies = [ "winapi", ] -[[package]] -name = "nu-ansi-term" -version = "0.46.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "77a8165726e8236064dbb45459242600304b42a5ea24ee2948e18e023bf7ba84" -dependencies = [ - "overload", - "winapi", -] - [[package]] name = "num" version = "0.4.1" @@ -3265,12 +3241,6 @@ dependencies = [ "num-traits", ] -[[package]] -name = "overload" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" 
-checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39" - [[package]] name = "parking_lot" version = "0.12.1" @@ -4285,19 +4255,10 @@ checksum = "12de2eff854e5fa4b1295edd650e227e9d8fb0c9e90b12e7f36d6a6811791a29" dependencies = [ "aho-corasick", "memchr", - "regex-automata 0.3.7", + "regex-automata", "regex-syntax 0.7.5", ] -[[package]] -name = "regex-automata" -version = "0.1.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6c230d73fb8d8c1b9c0b3135c5142a8acee3a0558fb8db5cf1cb65f8d7862132" -dependencies = [ - "regex-syntax 0.6.29", -] - [[package]] name = "regex-automata" version = "0.3.7" @@ -4495,9 +4456,9 @@ dependencies = [ [[package]] name = "rustix" -version = "0.38.10" +version = "0.38.11" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ed6248e1caa625eb708e266e06159f135e8c26f2bb7ceb72dc4b2766d0340964" +checksum = "c0c3dde1fc030af041adc40e79c0e7fbcf431dd24870053d187d7c66e4b87453" dependencies = [ "bitflags 2.4.0", "errno", @@ -4788,15 +4749,6 @@ dependencies = [ "digest", ] -[[package]] -name = "sharded-slab" -version = "0.1.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "900fba806f70c630b0a382d0d825e17a0f19fcd059a2ade1ff237bcddf446b31" -dependencies = [ - "lazy_static", -] - [[package]] name = "shlex" version = "1.1.0" @@ -5210,16 +5162,6 @@ version = "0.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "3bf63baf9f5039dadc247375c29eb13706706cfde997d0330d05aa63a77d8820" -[[package]] -name = "thread_local" -version = "1.1.7" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3fdd6f064ccff2d6567adcb3873ca630700f00b5ad3f060c25b5dcfd9a4ce152" -dependencies = [ - "cfg-if", - "once_cell", -] - [[package]] name = "threadpool" version = "1.8.1" @@ -5416,21 +5358,9 @@ dependencies = [ "cfg-if", "log", "pin-project-lite", - "tracing-attributes", "tracing-core", ] -[[package]] -name = 
"tracing-attributes" -version = "0.1.26" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f4f31f56159e98206da9efd823404b79b6ef3143b4a7ab76e67b1751b25a4ab" -dependencies = [ - "proc-macro2", - "quote", - "syn 2.0.29", -] - [[package]] name = "tracing-core" version = "0.1.31" @@ -5438,36 +5368,6 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0955b8137a1df6f1a2e9a37d8a6656291ff0297c1a97c24e0d8425fe2312f79a" dependencies = [ "once_cell", - "valuable", -] - -[[package]] -name = "tracing-log" -version = "0.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "78ddad33d2d10b1ed7eb9d1f518a5674713876e97e5bb9b7345a7984fbb4f922" -dependencies = [ - "lazy_static", - "log", - "tracing-core", -] - -[[package]] -name = "tracing-subscriber" -version = "0.3.17" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "30a651bc37f915e81f087d86e62a18eec5f79550c7faff886f7090b4ea757c77" -dependencies = [ - "matchers", - "nu-ansi-term", - "once_cell", - "regex", - "sharded-slab", - "smallvec", - "thread_local", - "tracing", - "tracing-core", - "tracing-log", ] [[package]] @@ -5627,12 +5527,6 @@ dependencies = [ "unicase", ] -[[package]] -name = "valuable" -version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "830b7e5d4d90034032940e4ace0d9a9a057e7a45cd94e6c007832e39edb82f6d" - [[package]] name = "value-trait" version = "0.6.1" diff --git a/Cargo.toml b/Cargo.toml index 39657d318..bdddced48 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -103,6 +103,8 @@ futures = "0.3" futures-util = "0.3" geosuggest-core = { version = "0.3", optional = true } geosuggest-utils = { version = "0.3", optional = true } +# geosuggest-core = { path = "../geosuggest/geosuggest-core", optional = true} +# geosuggest-utils = { path = "../geosuggest/geosuggest-utils", optional = true} governor = { version = "0.6", optional = true } grex = { version = "1.4", 
default-features = false } gzp = { version = "0.11", default-features = false, features = [ @@ -221,6 +223,10 @@ quickcheck = { version = "1", default-features = false } rusqlite = { version = "0.29", features = ["bundled"] } serial_test = { version = "2.0", features = ["file_locks"] } +[patch.crates-io] +geosuggest-core = { git = "https://github.com/estin/geosuggest", rev = "5c6b08b" } +geosuggest-utils = { git = "https://github.com/estin/geosuggest", rev = "5c6b08b" } + [features] default = ["mimalloc"] all_features = [ diff --git a/src/cmd/geocode.rs b/src/cmd/geocode.rs index 64f2c9410..0f4fc3707 100644 --- a/src/cmd/geocode.rs +++ b/src/cmd/geocode.rs @@ -88,7 +88,7 @@ qsv geocode suggest [--formatstr=] [options] [] qsv geocode reverse [--formatstr=] [options] [] qsv geocode index-load qsv geocode index-check -qsv geocode index-update +qsv geocode index-update [--languages=] [--force] qsv geocode index-reset qsv geocode --help @@ -97,7 +97,7 @@ geocode arguments: The input file to read from. If not specified, reads from stdin. The column to geocode. The alternate geonames index file to use. It must be a .bincode file. - Only used by the 'load' operations. + Only used by the index-load subcommand. geocode options: -c, --new-column Put the transformed values in a new column instead. @@ -115,6 +115,7 @@ geocode options: - '%city-state-country' | '%city-admin1-country' - Brooklyn, New York US - '%city' - Brooklyn - '%state' | '%admin1' - New York + - '%county' | '%admin2' - Kings County - '%country' - US - '%cityrecord' - returns the full city record as a string - '%lat-long' - , @@ -138,7 +139,7 @@ geocode options: If an invalid dynfmt template is specified, it will return "Invalid dynfmt template." [default: %+] - --invalid-result The string to use when the geocode result is empty/invalid. + --invalid-result The string to return when the geocode result is empty/invalid. If not set, the original value is used. -j, --jobs The number of jobs to run in parallel. 
When not set, the number of jobs is set to the number of CPUs detected. @@ -146,14 +147,17 @@ geocode options: [default: 50000] --timeout Timeout for downloading Geonames cities index. [default: 60] - --languages The languages to use when building the Geonames cities index. - Only used by the 'index-update' subcommand. - The languages are specified as a comma-separated list of ISO 639-1 codes. - [default: en] --cache-dir The directory to use for caching the Geonames cities index. If the directory does not exist, qsv will attempt to create it. If the QSV_CACHE_DIR envvar is set, it will be used instead. - [default: ~/.qsv-cache] + [default: ~/.qsv-cache] + + INDEX-UPDATE only options: + --languages The languages to use when building the Geonames cities index. + The languages are specified as a comma-separated list of ISO 639-1 codes. + [default: en] + --force Force update the Geonames cities index. If not set, qsv will check if there + are updates available at Geonames.org before updating the index. 
Common options: -h, --help Display this message @@ -169,7 +173,7 @@ use std::{ path::{Path, PathBuf}, }; -use cached::proc_macro::cached; +use cached::{proc_macro::cached, SizedCache}; use dynfmt::Format; use geosuggest_core::{CitiesRecord, Engine, EngineDumpFormat}; use geosuggest_utils::{IndexUpdater, IndexUpdaterSettings, SourceItem}; @@ -209,8 +213,9 @@ struct Args { flag_invalid_result: Option, flag_batch: u32, flag_timeout: u16, - flag_languages: String, flag_cache_dir: String, + flag_languages: String, + flag_force: bool, flag_jobs: Option, flag_new_column: Option, flag_output: Option, @@ -230,6 +235,12 @@ static DEFAULT_CITIES_NAMES_FILENAME: &str = "alternateNamesV2.txt"; static DEFAULT_COUNTRY_INFO_URL: &str = "https://download.geonames.org/export/dump/countryInfo.txt"; static DEFAULT_ADMIN1_CODES_URL: &str = "https://download.geonames.org/export/dump/admin1CodesASCII.txt"; +static DEFAULT_ADMIN2_CODES_URL: &str = "https://download.geonames.org/export/dump/admin2Codes.txt"; + +// max number of entries in LRU cache +static CACHE_SIZE: usize = 2_000_000; +// max number of entries in fallback LRU cache if we can't allocate CACHE_SIZE +static FALLBACK_CACHE_SIZE: usize = CACHE_SIZE / 4; static EMPTY_STRING: String = String::new(); static INVALID_DYNFMT: &str = "Invalid dynfmt template."; @@ -354,6 +365,7 @@ async fn geocode_main(args: Args) -> CliResult<()> { }), countries_url: Some(DEFAULT_COUNTRY_INFO_URL), admin1_codes_url: Some(DEFAULT_ADMIN1_CODES_URL), + admin2_codes_url: Some(DEFAULT_ADMIN2_CODES_URL), filter_languages: languages_vec.clone(), })?; @@ -394,17 +406,27 @@ async fn geocode_main(args: Args) -> CliResult<()> { // update/rebuild Geonames index from Geonames website // will only update if there are changes check_index_file(&geocode_index_file)?; - let engine = load_engine(geocode_index_file.clone().into(), &progress).await?; - if updater.has_updates(&engine).await? { - winfo!( - "Updating/Rebuilding Geonames index. 
This will take a while as we need to \ - download ~200mb of data from Geonames and rebuild the index..." - ); + if args.flag_force { + winfo!("Forcing fresh build of Geonames index: {geocode_index_file}..."); let engine = updater.build().await?; engine.dump_to(geocode_index_file.clone(), EngineDumpFormat::Bincode)?; - winfo!("Updates applied: {geocode_index_file}"); + winfo!("Geonames index built: {geocode_index_file}"); } else { - winfo!("Skipping update. Geonames index is up-to-date."); + winfo!("Checking main Geonames website for updates..."); + + let engine = load_engine(geocode_index_file.clone().into(), &progress).await?; + if updater.has_updates(&engine).await? { + winfo!( + "Updating/Rebuilding Geonames index. This will take a while as we \ + need to download ~200mb of data from Geonames and rebuild the \ + index..." + ); + let engine = updater.build().await?; + engine.dump_to(geocode_index_file.clone(), EngineDumpFormat::Bincode)?; + winfo!("Updates applied: {geocode_index_file}"); + } else { + winfo!("Skipping update. Geonames index is up-to-date."); + } } }, GeocodeSubCmd::IndexLoad => { @@ -419,7 +441,7 @@ async fn geocode_main(args: Args) -> CliResult<()> { engine.dump_to(geocode_index_file.clone(), EngineDumpFormat::Bincode)?; winfo!( "Valid Geonames index file {index_file} copied to {geocode_index_file}. 
\ - It will be used from now on or until you reset it.", + It will be used from now on or until you reset/rebuild it.", ); } else { return fail_incorrectusage_clierror!( @@ -622,6 +644,9 @@ async fn load_engine(geocode_index_file: PathBuf, progressbar: &ProgressBar) -> } #[cached( + type = "SizedCache", + create = "{ SizedCache::try_with_size(CACHE_SIZE).unwrap_or_else(|_| \ + SizedCache::with_size(FALLBACK_CACHE_SIZE)) }", key = "String", convert = r#"{ format!("{cell}") }"#, option = true, @@ -641,13 +666,6 @@ fn search_cached( return None; }; - let Some((_admin1_key, admin1_name)) = (match &cityrecord.admin1_names { - Some(admin1) => admin1.iter().next().map(|s| s.to_owned()), - None => Some((&EMPTY_STRING, &EMPTY_STRING)), - }) else { - return None; - }; - if formatstr == "%+" { // default for suggest is location - e.g. "(lat, long)" return Some(format!( @@ -657,7 +675,7 @@ fn search_cached( )); } - return Some(format_result(cityrecord, formatstr, true, admin1_name)); + return Some(format_result(cityrecord, formatstr, true)); } else if mode == GeocodeSubCmd::Reverse { // regex for Location field. Accepts (lat, long) & lat, long let locregex: &'static Regex = regex_oncelock!( @@ -677,15 +695,16 @@ fn search_cached( return None; }; - let Some((_admin1_key, admin1_name)) = (match &cityrecord.admin1_names { - Some(admin1) => admin1.iter().next().map(|s| s.to_owned()), - None => Some((&EMPTY_STRING, &EMPTY_STRING)), - }) else { - return None; - }; - if formatstr == "%+" { // default for reverse is city-admin1 - e.g. 
"Brooklyn, New York" + let (_admin1_key, admin1_name) = match &cityrecord.admin1_names { + Some(admin1) => admin1 + .iter() + .next() + .unwrap_or((&EMPTY_STRING, &EMPTY_STRING)), + None => (&EMPTY_STRING, &EMPTY_STRING), + }; + return Some(format!( "{city}, {admin1}", city = cityrecord.name.clone(), @@ -693,7 +712,7 @@ fn search_cached( )); } - return Some(format_result(cityrecord, formatstr, false, admin1_name)); + return Some(format_result(cityrecord, formatstr, false)); } } else { // not a valid lat, long @@ -706,12 +725,23 @@ fn search_cached( /// format the geocoded result based on formatstr if its not %+ #[inline] -fn format_result( - cityrecord: &CitiesRecord, - formatstr: &str, - suggest_mode: bool, - admin1_name: &str, -) -> String { +fn format_result(cityrecord: &CitiesRecord, formatstr: &str, suggest_mode: bool) -> String { + let (_admin1_key, admin1_name) = match &cityrecord.admin1_names { + Some(admin1) => admin1 + .iter() + .next() + .unwrap_or((&EMPTY_STRING, &EMPTY_STRING)), + None => (&EMPTY_STRING, &EMPTY_STRING), + }; + + let (_admin2_key, admin2_name) = match &cityrecord.admin2_names { + Some(admin2) => admin2 + .iter() + .next() + .unwrap_or((&EMPTY_STRING, &EMPTY_STRING)), + None => (&EMPTY_STRING, &EMPTY_STRING), + }; + if formatstr.starts_with('%') { // if formatstr starts with %, then we're using a predefined format match formatstr { @@ -731,6 +761,7 @@ fn format_result( cityrecord.country.clone().unwrap().name ), "%state" | "%admin1" => admin1_name.to_owned(), + "%county" | "%admin2" => admin2_name.to_owned(), "%country" => cityrecord.country.clone().unwrap().name, "%id" => format!("{}", cityrecord.id), "%population" => format!("{}", cityrecord.population), @@ -760,13 +791,14 @@ fn format_result( // i.e. eight predefined fields below in curly braces are replaced with values // e.g. 
"City: {name}, State: {admin1}, Country: {country} - {timezone}" - let mut cityrecord_map: HashMap<&str, String> = HashMap::with_capacity(8); + let mut cityrecord_map: HashMap<&str, String> = HashMap::with_capacity(9); cityrecord_map.insert("id", cityrecord.id.to_string()); cityrecord_map.insert("name", cityrecord.name.clone()); cityrecord_map.insert("latitude", cityrecord.latitude.to_string()); cityrecord_map.insert("longitude", cityrecord.longitude.to_string()); cityrecord_map.insert("country", cityrecord.country.clone().unwrap().name); cityrecord_map.insert("admin1", admin1_name.to_owned()); + cityrecord_map.insert("admin2", admin2_name.to_owned()); cityrecord_map.insert("timezone", cityrecord.timezone.clone()); cityrecord_map.insert("population", cityrecord.population.to_string()); diff --git a/tests/test_geocode.rs b/tests/test_geocode.rs index d376a1a8c..511c4d2e1 100644 --- a/tests/test_geocode.rs +++ b/tests/test_geocode.rs @@ -214,32 +214,38 @@ fn geocode_suggest_fmt_cityrecord() { "CitiesRecord { id: 5116495, name: \"Elmhurst\", latitude: 40.73649, longitude: \ -73.87791, country: Some(Country { id: 6252001, code: \"US\", name: \"United \ States\" }), admin_division: Some(AdminDivision { id: 5128638, code: \"US.NY\", \ - name: \"New York\" }), timezone: \"America/New_York\", names: Some({\"en\": \ - \"Elmhurst\"}), country_names: Some({\"en\": \"United States\"}), admin1_names: \ - Some({\"en\": \"New York\"}), population: 113364 }" + name: \"New York\" }), admin2_division: Some(AdminDivision { id: 5133268, code: \ + \"US.NY.081\", name: \"Queens County\" }), timezone: \"America/New_York\", names: \ + Some({\"en\": \"Elmhurst\"}), country_names: Some({\"en\": \"United States\"}), \ + admin1_names: Some({\"en\": \"New York\"}), admin2_names: Some({\"en\": \"Queens \ + County\"}), population: 113364 }" ], svec![ "CitiesRecord { id: 5115843, name: \"East Flatbush\", latitude: 40.65371, longitude: \ -73.93042, country: Some(Country { id: 6252001, code: 
\"US\", name: \"United \ States\" }), admin_division: Some(AdminDivision { id: 5128638, code: \"US.NY\", \ - name: \"New York\" }), timezone: \"America/New_York\", names: Some({\"en\": \"East \ - Flatbush\"}), country_names: Some({\"en\": \"United States\"}), admin1_names: \ - Some({\"en\": \"New York\"}), population: 178464 }" + name: \"New York\" }), admin2_division: Some(AdminDivision { id: 6941775, code: \ + \"US.NY.047\", name: \"Kings County\" }), timezone: \"America/New_York\", names: \ + Some({\"en\": \"East Flatbush\"}), country_names: Some({\"en\": \"United States\"}), \ + admin1_names: Some({\"en\": \"New York\"}), admin2_names: Some({\"en\": \"Kings\"}), \ + population: 178464 }" ], svec![ "CitiesRecord { id: 5128581, name: \"New York City\", latitude: 40.71427, longitude: \ -74.00597, country: Some(Country { id: 6252001, code: \"US\", name: \"United \ States\" }), admin_division: Some(AdminDivision { id: 5128638, code: \"US.NY\", \ - name: \"New York\" }), timezone: \"America/New_York\", names: Some({\"en\": \"New \ - York\"}), country_names: Some({\"en\": \"United States\"}), admin1_names: \ - Some({\"en\": \"New York\"}), population: 8804190 }" + name: \"New York\" }), admin2_division: None, timezone: \"America/New_York\", names: \ + Some({\"en\": \"New York\"}), country_names: Some({\"en\": \"United States\"}), \ + admin1_names: Some({\"en\": \"New York\"}), admin2_names: None, population: 8804190 }" ], svec![ "CitiesRecord { id: 6332428, name: \"East Harlem\", latitude: 40.79472, longitude: \ -73.9425, country: Some(Country { id: 6252001, code: \"US\", name: \"United States\" \ }), admin_division: Some(AdminDivision { id: 5128638, code: \"US.NY\", name: \"New \ - York\" }), timezone: \"America/New_York\", names: None, country_names: Some({\"en\": \ - \"United States\"}), admin1_names: Some({\"en\": \"New York\"}), population: 115921 }" + York\" }), admin2_division: Some(AdminDivision { id: 5128594, code: \"US.NY.061\", \ + name: \"New York County\" 
}), timezone: \"America/New_York\", names: None, \ + country_names: Some({\"en\": \"United States\"}), admin1_names: Some({\"en\": \"New \ + York\"}), admin2_names: Some({\"en\": \"New York County\"}), population: 115921 }" ], svec!["This is not a Location and it will not be geocoded"], svec!["40.71427, -74.00597"], @@ -247,9 +253,11 @@ fn geocode_suggest_fmt_cityrecord() { "CitiesRecord { id: 1703417, name: \"Makati City\", latitude: 14.55027, longitude: \ 121.03269, country: Some(Country { id: 1694008, code: \"PH\", name: \"Philippines\" \ }), admin_division: Some(AdminDivision { id: 7521311, code: \"PH.NCR\", name: \ - \"Metro Manila\" }), timezone: \"Asia/Manila\", names: Some({\"en\": \"Makati \ - City\"}), country_names: Some({\"en\": \"Philippines\"}), admin1_names: \ - Some({\"en\": \"National Capital Region\"}), population: 510383 }" + \"Metro Manila\" }), admin2_division: Some(AdminDivision { id: 11395838, code: \ + \"PH.NCR.137600000\", name: \"Southern Manila District\" }), timezone: \ + \"Asia/Manila\", names: Some({\"en\": \"Makati City\"}), country_names: \ + Some({\"en\": \"Philippines\"}), admin1_names: Some({\"en\": \"National Capital \ + Region\"}), admin2_names: None, population: 510383 }" ], ]; assert_eq!(got, expected);