From 31c40db011b1ed4807009ca31cf98b5892b4b814 Mon Sep 17 00:00:00 2001 From: Hugh Saalmans Date: Mon, 21 Nov 2022 14:57:57 +1100 Subject: [PATCH 1/3] updates for 202211 release --- README.md | 26 +++++++-------- create_concordance_file.py | 4 +-- data/boundary_concordance_score.csv | 32 +++++++++---------- .../00_import_concordance_file.sql | 16 +++++----- .../02_create_residential_address_table.sql | 6 ++-- .../example-usage/02_join_pc_and_lga_data.sql | 10 +++--- postgres-scripts/xx_test_results.sql | 8 ++--- postgres-scripts/xx_testing.sql | 20 ++++++------ testing/01_create_concordance_table.sql | 6 ++-- ..._create_postcode_lga_concordance_table.sql | 18 +++++------ testing/01a_abs_bdy_concordances.sql | 2 +- testing/xx_geoscape_testing.sql | 2 +- testing/xx_lga_testing.sql | 6 ++-- testing/xx_testing.sql | 18 +++++------ xx_create_concordance_file_and_copy_to_s3.sh | 10 +++--- 15 files changed, 92 insertions(+), 92 deletions(-) diff --git a/README.md b/README.md index fc74dde..355486b 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # Concord -A [CSV file](https://minus34.com/opendata/geoscape-202208/boundary_concordance.csv) and supporting scripts for converting data between Australian boundaries. +A [CSV file](https://minus34.com/opendata/geoscape-202211/boundary_concordance.csv) and supporting scripts for converting data between Australian boundaries. It solves the problem of trying to merge 2 or more datasets based on different census or administrative boundaries such as statistical areas or postcodes. @@ -8,7 +8,7 @@ It does this by providing a list of **_concordances_** between pairs of boundari In this context, **_concordance_** describes what % of residential addresses in a "from" boundary fit within a "to" boundary. -Download & import the ~50Mb [concordance file](https://minus34.com/opendata/geoscape-202208/boundary_concordance.csv) into your database or reporting tool to [get started](#get-started). A [script](/postgres-scripts/00_import_concordance_file.sql) for importing into Postgres is also provided. +Download & import the ~50Mb [concordance file](https://minus34.com/opendata/geoscape-202211/boundary_concordance.csv) into your database or reporting tool to [get started](#get-started). A [script](/postgres-scripts/00_import_concordance_file.sql) for importing into Postgres is also provided. ### Example Use Cases @@ -42,7 +42,7 @@ In the [score](/data/boundary_concordance_score.csv) file, the **_error_** measu The concordance file is generated by the following process: -1. Tag all GNAF addresses with 2016 & 2021 ABS Census boundaries and geoscape 202208 Administrative boundaries +1. Tag all GNAF addresses with 2016 & 2021 ABS Census boundaries and geoscape 202211 Administrative boundaries 2. Remove all addresses in non-residential ABS Census 2021 meshblocks 3. Aggregate all residential addresses by a set of _**from**_ boundary and _**to**_ boundary pairs (e.g. postcode to LGA) 4. Determine the % overlap of residential addresses between both boundary types for all boundary pairs @@ -63,7 +63,7 @@ There are 2 options to get the data: #### 1. Download and Import -1. Download the [concordance file](https://minus34.com/opendata/geoscape-202208/boundary_concordance.csv) +1. Download the [concordance file](https://minus34.com/opendata/geoscape-202211/boundary_concordance.csv) 2. Import it into your database/reporting tool of choice. If using Postgres: 1. 
Edit the file path, schema name & table owner in `00_import_concordance_file.sql` in the [postgres-scripts](/postgres-scripts) folder 2. Run the SQL script to import the file @@ -75,9 +75,9 @@ This requires a knowledge of Python, Postgres & pg_restore. BTW - if the boundary combination you want isn't in the default concordance file - you need to edit the `settings.py` file before running `create_concordance_file.py`. If this is too hard - raise an [issue](https://github.com/iag-geo/concord/issues) and we may be able to generate it for you; noting you shouldn't convert data to a smaller boundary due to the increase in data errors. **Running the script only needs to be done for 3 reasons:** -1. The boundary from/to combination you need isn't in the standard [concordances file](https://minus34.com/opendata/geoscape-202208/boundary_concordance.csv) +1. The boundary from/to combination you need isn't in the standard [concordances file](https://minus34.com/opendata/geoscape-202211/boundary_concordance.csv) 2. It's now the future and we've been too lazy to update the concordances file with the latest boundary data from the ABS and/or Geoscape -3. You have a license of [Geoscape Buildings](https://geoscape.com.auhttps://minus34.com/opendata/geoscape-202208/boundary_concordance.csv/buildings/) or [Geoscape Land Parcels](https://geoscape.com.auhttps://minus34.com/opendata/geoscape-202208/boundary_concordance.csv/land-parcels/) and want to use the _planning zone_ data in those products to: +3. You have a license of [Geoscape Buildings](https://geoscape.com.auhttps://minus34.com/opendata/geoscape-202211/boundary_concordance.csv/buildings/) or [Geoscape Land Parcels](https://geoscape.com.auhttps://minus34.com/opendata/geoscape-202211/boundary_concordance.csv/land-parcels/) and want to use the _planning zone_ data in those products to: 1. Use a more accurate list of residential addresses to determine the data apportionment percentages (see **note** below); or 2. Use a different set of addresses to apportion your data; e.g. industrial or commercial addresses @@ -88,8 +88,8 @@ BTW - if the boundary combination you want isn't in the default concordance file Running the script requires the following open data, available as Postgres dump files, as well as the optional licensed Geoscape data mentioned above: 1. ABS Census 2016 boundaries ([download](https://minus34.com/opendata/census-2016/census_2016_bdys.dmp)) 2. ABS Census 2021 boundaries ([download](https://minus34.com/opendata/census-2021/census_2021_bdys_gda94.dmp)) -3. GNAF from gnaf-loader ([download](https://minus34.com/opendata/geoscape-202208/gnaf-202208.dmp)) -4. Geoscape Administrative Boundaries from gnaf-loader ([download](https://minus34.com/opendata/geoscape-202208/admin-bdys-202208.dmp)) +3. GNAF from gnaf-loader ([download](https://minus34.com/opendata/geoscape-202211/gnaf-202211.dmp)) +4. Geoscape Administrative Boundaries from gnaf-loader ([download](https://minus34.com/opendata/geoscape-202211/admin-bdys-202211.dmp)) 5. ABS Census 2016 data - used to generate error rates only ([download](https://minus34.com/opendata/census-2016/census_2016_data.dmp)) #### Process @@ -117,7 +117,7 @@ The behaviour of the Python script can be controlled by specifying various comma * `--pgpassword` password for accessing the Postgres server. This defaults to the `PGPASSWORD` environment variable if set, otherwise `password`. ##### Optional Arguments -* `--geoscape-version` Geoscape version number in YYYYMM format. 
Defaults to current year and last release month. e.g. `202208`. +* `--geoscape-version` Geoscape version number in YYYYMM format. Defaults to current year and last release month. e.g. `202211`. * `--gnaf-schema` input schema name to store final GNAF tables in. Also the **output schema** for the concordance table. Defaults to `gnaf_`. * `--admin-schema` input schema name to store final admin boundary tables in. Defaults to `admin_bdys_`. * `--output-table` name of both output concordance table and file. Defaults to `boundary_concordance`. @@ -147,8 +147,8 @@ WITH pc_data AS ( con.to_name AS lga_name, sum(pc.cases::float * con.address_percent / 100.0)::integer AS cases FROM testing.nsw_covid_cases_20220503_postcode AS pc - INNER JOIN gnaf_202208.boundary_concordance AS con ON pc.postcode = con.from_id - WHERE con.from_source = 'geoscape 202208' + INNER JOIN gnaf_202211.boundary_concordance AS con ON pc.postcode = con.from_id + WHERE con.from_source = 'geoscape 202211' AND con.from_bdy = 'postcode' AND con.to_source = 'abs 2016' AND con.to_bdy = 'lga' @@ -167,9 +167,9 @@ FROM testing.nsw_covid_tests_20220503_lga AS lga ## Data Licenses -Incorporates or developed using G-NAF © [Geoscape Australia](https://geoscape.com.au/legalhttps://minus34.com/opendata/geoscape-202208/boundary_concordance.csv-copyright-and-disclaimer/) licensed by the Commonwealth of Australia under the [Open Geo-coded National Address File (G-NAF) End User Licence Agreement](https:/https://minus34.com/opendata/geoscape-202208/boundary_concordance.csv.gov.auhttps://minus34.com/opendata/geoscape-202208/boundary_concordance.csvset/ds-dga-19432f89-dc3a-4ef3-b943-5326ef1dbecc/distribution/dist-dga-09f74802-08b1-4214-a6ea-3591b2753d30/details?q=). +Incorporates or developed using G-NAF © [Geoscape Australia](https://geoscape.com.au/legalhttps://minus34.com/opendata/geoscape-202211/boundary_concordance.csv-copyright-and-disclaimer/) licensed by the Commonwealth of Australia under the [Open Geo-coded National Address File (G-NAF) End User Licence Agreement](https:/https://minus34.com/opendata/geoscape-202211/boundary_concordance.csv.gov.auhttps://minus34.com/opendata/geoscape-202211/boundary_concordance.csvset/ds-dga-19432f89-dc3a-4ef3-b943-5326ef1dbecc/distribution/dist-dga-09f74802-08b1-4214-a6ea-3591b2753d30/details?q=). -Incorporates or developed using Administrative Boundaries © [Geoscape Australia](https://geoscape.com.au/legalhttps://minus34.com/opendata/geoscape-202208/boundary_concordance.csv-copyright-and-disclaimer/) licensed by the Commonwealth of Australia under [Creative Commons Attribution 4.0 International licence (CC BY 4.0)](https://creativecommons.org/licenses/by/4.0/). +Incorporates or developed using Administrative Boundaries © [Geoscape Australia](https://geoscape.com.au/legalhttps://minus34.com/opendata/geoscape-202211/boundary_concordance.csv-copyright-and-disclaimer/) licensed by the Commonwealth of Australia under [Creative Commons Attribution 4.0 International licence (CC BY 4.0)](https://creativecommons.org/licenses/by/4.0/). Based on [Australian Bureau of Statistics](https://www.abs.gov.au/websitedbs/d3310114.nsf/Home/Attributing+ABS+Material) data, licensed by the Commonwealth of Australia under [Creative Commons Attribution 4.0 International licence (CC BY 4.0)](https://creativecommons.org/licenses/by/4.0/). 
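The README example above can also be driven from Python with psycopg 3 (the driver this repo switches to in the next commit of this series). Below is a minimal sketch, not part of the patch itself: it assumes the `gnaf_202211.boundary_concordance` table imported by `00_import_concordance_file.sql` and the `testing.nsw_covid_cases_20220503_postcode` table used in the README example, while the connection settings are placeholders.

```python
# Sketch only: apportion postcode counts to ABS 2016 LGAs using the concordance table,
# mirroring the SQL example in the README above.
import psycopg

# placeholder connection settings - adjust to your environment
with psycopg.connect(host="localhost", dbname="geo", user="postgres", password="password") as conn:
    with conn.cursor() as cur:
        cur.execute("""
            SELECT con.to_id   AS lga_id,
                   con.to_name AS lga_name,
                   sum(pc.cases::float * con.address_percent / 100.0)::integer AS cases
            FROM testing.nsw_covid_cases_20220503_postcode AS pc
            INNER JOIN gnaf_202211.boundary_concordance AS con ON pc.postcode = con.from_id
            WHERE con.from_source = 'geoscape 202211'
              AND con.from_bdy = 'postcode'
              AND con.to_source = 'abs 2016'
              AND con.to_bdy = 'lga'
            GROUP BY lga_id, lga_name
            ORDER BY lga_name
        """)
        for lga_id, lga_name, cases in cur.fetchall():
            print(f"{lga_id}\t{lga_name}\t{cases}")
```

The weighted sum is the whole trick: each postcode's cases are split across LGAs in proportion to `address_percent`, then re-aggregated by LGA.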
diff --git a/create_concordance_file.py b/create_concordance_file.py index 9cb06e4..8d6028e 100644 --- a/create_concordance_file.py +++ b/create_concordance_file.py @@ -192,7 +192,7 @@ def add_asgs_concordances(pg_cur): count(*) as address_count, 100.0 as address_percent from census_2016_bdys.mb_2016_aust as mb - inner join gnaf_202208.address_principals as gnaf on gnaf.mb_2016_code::text = mb.mb_code16 + inner join gnaf_202211.address_principals as gnaf on gnaf.mb_2016_code::text = mb.mb_code16 group by from_id, from_name, to_id, @@ -240,7 +240,7 @@ def add_asgs_concordances(pg_cur): count(*) as address_count, 100.0 as address_percent from census_2021_bdys_gda94.mb_2021_aust_gda94 as mb - inner join gnaf_202208.address_principals as gnaf + inner join gnaf_202211.address_principals as gnaf on gnaf.mb_2021_code::text = mb.mb_code_2021 group by from_id, from_name, diff --git a/data/boundary_concordance_score.csv b/data/boundary_concordance_score.csv index 0fe3047..5ddecbb 100644 --- a/data/boundary_concordance_score.csv +++ b/data/boundary_concordance_score.csv @@ -15,7 +15,7 @@ abs 2016,sa2,abs 2016,poa,79,4.3 abs 2016,sa2,abs 2016,sa3,100,0.0 abs 2016,sa2,abs 2016,sa4,100,0.0 abs 2016,sa2,abs 2021,sa2,92, -abs 2016,sa2,geoscape 202208,postcode,79, +abs 2016,sa2,geoscape 202211,postcode,79, abs 2016,sa3,abs 2016,gcc,100,0.0 abs 2016,sa3,abs 2016,lga,83,3.0 abs 2016,sa3,abs 2016,sa4,100,0.0 @@ -35,22 +35,22 @@ abs 2021,sa2,abs 2021,lga,98, abs 2021,sa2,abs 2021,poa,84, abs 2021,sa2,abs 2021,sa3,100, abs 2021,sa2,abs 2021,sa4,100, -abs 2021,sa2,geoscape 202208,postcode,84, +abs 2021,sa2,geoscape 202211,postcode,84, abs 2021,sa3,abs 2021,gccsa,100, abs 2021,sa3,abs 2021,lga,85, abs 2021,sa3,abs 2021,sa4,100, abs 2021,sa4,abs 2021,gccsa,100, -geoscape 202208,lga,abs 2016,lga,100, -geoscape 202208,lga,abs 2021,lga,100, -geoscape 202208,locality,abs 2016,lga,98, -geoscape 202208,locality,abs 2016,sa2,94, -geoscape 202208,locality,abs 2016,sa3,99, -geoscape 202208,locality,abs 2021,lga,98, -geoscape 202208,locality,abs 2021,sa2,90, -geoscape 202208,locality,abs 2021,sa3,99, -geoscape 202208,locality,geoscape 202208,lga,98, -geoscape 202208,postcode,abs 2016,lga,93, -geoscape 202208,postcode,abs 2016,sa3,92, -geoscape 202208,postcode,abs 2021,lga,94, -geoscape 202208,postcode,abs 2021,sa3,93, -geoscape 202208,postcode,geoscape 202208,lga,93, +geoscape 202211,lga,abs 2016,lga,100, +geoscape 202211,lga,abs 2021,lga,100, +geoscape 202211,locality,abs 2016,lga,98, +geoscape 202211,locality,abs 2016,sa2,94, +geoscape 202211,locality,abs 2016,sa3,99, +geoscape 202211,locality,abs 2021,lga,98, +geoscape 202211,locality,abs 2021,sa2,90, +geoscape 202211,locality,abs 2021,sa3,99, +geoscape 202211,locality,geoscape 202211,lga,98, +geoscape 202211,postcode,abs 2016,lga,93, +geoscape 202211,postcode,abs 2016,sa3,92, +geoscape 202211,postcode,abs 2021,lga,94, +geoscape 202211,postcode,abs 2021,sa3,93, +geoscape 202211,postcode,geoscape 202211,lga,93, diff --git a/postgres-scripts/00_import_concordance_file.sql b/postgres-scripts/00_import_concordance_file.sql index 450376b..998874e 100644 --- a/postgres-scripts/00_import_concordance_file.sql +++ b/postgres-scripts/00_import_concordance_file.sql @@ -1,7 +1,7 @@ -- create table -drop table if exists gnaf_202208.boundary_concordance; -create table gnaf_202208.boundary_concordance +drop table if exists gnaf_202211.boundary_concordance; +create table gnaf_202211.boundary_concordance ( from_source text not null, from_bdy text not null, @@ -14,21 +14,21 @@ create 
table gnaf_202208.boundary_concordance address_count integer, address_percent numeric(4, 1) ); -alter table gnaf_202208.boundary_concordance owner to postgres; +alter table gnaf_202211.boundary_concordance owner to postgres; -- import CSV file -- 586,977 rows affected in 1 s 365 ms -COPY gnaf_202208.boundary_concordance +COPY gnaf_202211.boundary_concordance FROM '/Users/minus34/Downloads/boundary_concordance.csv' WITH (HEADER, DELIMITER ',', FORMAT CSV); -analyse gnaf_202208.boundary_concordance; +analyse gnaf_202211.boundary_concordance; -- add primary key (faster if done after import) -- completed in 8 s 496 ms -alter table gnaf_202208.boundary_concordance add constraint boundary_concordance_pkey +alter table gnaf_202211.boundary_concordance add constraint boundary_concordance_pkey primary key (from_source, from_bdy, from_id, to_source, to_bdy, to_id); -- add index on required fields for converting data -create index boundary_concordance_combo_idx on gnaf_202208.boundary_concordance +create index boundary_concordance_combo_idx on gnaf_202211.boundary_concordance using btree (from_source, from_bdy, to_source, to_bdy); -alter table gnaf_202208.boundary_concordance cluster on boundary_concordance_combo_idx; \ No newline at end of file +alter table gnaf_202211.boundary_concordance cluster on boundary_concordance_combo_idx; \ No newline at end of file diff --git a/postgres-scripts/data-prep/02_create_residential_address_table.sql b/postgres-scripts/data-prep/02_create_residential_address_table.sql index d096ac3..0c1c357 100644 --- a/postgres-scripts/data-prep/02_create_residential_address_table.sql +++ b/postgres-scripts/data-prep/02_create_residential_address_table.sql @@ -27,9 +27,9 @@ select gnaf.gnaf_pid, gnaf.mb_2021_code, lower(mb21.mb_cat) as mb_category_2021, gnaf.geom -from gnaf_202208.address_principals as gnaf - inner join admin_bdys_202208.abs_2016_mb as mb16 on mb16.mb_16code = gnaf.mb_2016_code - inner join admin_bdys_202208.abs_2021_mb as mb21 on mb21.mb21_code = gnaf.mb_2021_code +from gnaf_202211.address_principals as gnaf + inner join admin_bdys_202211.abs_2016_mb as mb16 on mb16.mb_16code = gnaf.mb_2016_code + inner join admin_bdys_202211.abs_2021_mb as mb21 on mb21.mb21_code = gnaf.mb_2021_code left outer join blg on blg.gnaf_pid = gnaf.gnaf_pid ; analyse geoscape_202203.address_principals_buildings; diff --git a/postgres-scripts/example-usage/02_join_pc_and_lga_data.sql b/postgres-scripts/example-usage/02_join_pc_and_lga_data.sql index cb3a9bc..d0c4f83 100644 --- a/postgres-scripts/example-usage/02_join_pc_and_lga_data.sql +++ b/postgres-scripts/example-usage/02_join_pc_and_lga_data.sql @@ -19,7 +19,7 @@ -- select * from testing.nsw_covid_tests_20220503_lga; -- -- -- concordance table --- select * from gnaf_202208.boundary_concordance; +-- select * from gnaf_202211.boundary_concordance; WITH pc_data AS ( SELECT con.to_id AS lga_id, @@ -28,8 +28,8 @@ WITH pc_data AS ( sum(pc.cases) as qa_count, count(*) as postcode_count FROM testing.nsw_covid_cases_20220503_postcode AS pc - INNER JOIN gnaf_202208.boundary_concordance AS con ON pc.postcode = con.from_id - WHERE con.from_source = 'geoscape 202208' + INNER JOIN gnaf_202211.boundary_concordance AS con ON pc.postcode = con.from_id + WHERE con.from_source = 'geoscape 202211' AND con.from_bdy = 'postcode' AND con.to_source = 'abs 2016' AND con.to_bdy = 'lga' @@ -49,8 +49,8 @@ INNER JOIN pc_data on pc_data.lga_id = lga.lga_code19; -- example of poor concordance due to number of partial postcodes that intersect with one LGA 
select * -from gnaf_202208.boundary_concordance -WHERE from_source = 'geoscape 202208' +from gnaf_202211.boundary_concordance +WHERE from_source = 'geoscape 202211' AND from_bdy = 'postcode' AND to_source = 'abs 2016' AND to_bdy = 'lga' diff --git a/postgres-scripts/xx_test_results.sql b/postgres-scripts/xx_test_results.sql index 06a533d..470b373 100644 --- a/postgres-scripts/xx_test_results.sql +++ b/postgres-scripts/xx_test_results.sql @@ -2,7 +2,7 @@ -- 592552 rows select count(*) -from gnaf_202208.boundary_concordance; +from gnaf_202211.boundary_concordance; @@ -14,7 +14,7 @@ select count(*) as bdy_pair_count, max(cor.ratio_from_to * 100.0 - bdy.address_percent)::smallint as max_delta, (sum(abs(cor.ratio_from_to * 100.0 - bdy.address_percent) * address_count) / 100.0)::integer as address_count from census_2021_bdys_gda94.correspondences_sa2 as cor - inner join gnaf_202208.boundary_concordance as bdy on bdy.from_id = cor.sa2_maincode_2016 + inner join gnaf_202211.boundary_concordance as bdy on bdy.from_id = cor.sa2_maincode_2016 and bdy.to_id = cor.sa2_code_2021 where abs(cor.ratio_from_to * 100.0 - bdy.address_percent) > 5.0 ; @@ -99,8 +99,8 @@ with agg as ( sa2_code_2021::text as to_id, sa2_name_2021 as to_name, count(*) as address_count - from gnaf_202208.address_principal_census_2016_boundaries as f - inner join gnaf_202208.address_principal_census_2021_boundaries as t on t.gnaf_pid = f.gnaf_pid + from gnaf_202211.address_principal_census_2016_boundaries as f + inner join gnaf_202211.address_principal_census_2021_boundaries as t on t.gnaf_pid = f.gnaf_pid where sa2_16main = '101021011' and mb_category = 'RESIDENTIAL' and mb_category_2021 = 'Residential' diff --git a/postgres-scripts/xx_testing.sql b/postgres-scripts/xx_testing.sql index d4f6412..cd59524 100644 --- a/postgres-scripts/xx_testing.sql +++ b/postgres-scripts/xx_testing.sql @@ -1,20 +1,20 @@ select count(*) -from admin_bdys_202208.abs_2021_sa2; +from admin_bdys_202211.abs_2021_sa2; select * -from gnaf_202208.boundary_concordance +from gnaf_202211.boundary_concordance where from_bdy = 'postcode' and to_bdy = 'poa'; select * -from admin_bdys_202208.abs_2016_sa1; +from admin_bdys_202211.abs_2016_sa1; select * -from gnaf_202208.address_principal_census_2016_boundaries; +from gnaf_202211.address_principal_census_2016_boundaries; select * @@ -30,11 +30,11 @@ from census_2016_bdys.sa2_2016_aust as from_bdy select * -from gnaf_202208; +from gnaf_202211; select * -from admin_bdys_202208.abs_2016_mb as adm +from admin_bdys_202211.abs_2016_mb as adm full outer join census_2016_bdys.mb_2016_aust as abs on adm.mb_16code::text = abs.mb_code16 where not st_equals(adm.geom, abs.geom) ; @@ -51,7 +51,7 @@ select from_id as postcode, to_name as lga_name, address_count, address_percent -from gnaf_202208.boundary_concordance +from gnaf_202211.boundary_concordance where from_bdy = 'postcode' and to_source = 'abs 2016' and to_bdy = 'lga' @@ -63,8 +63,8 @@ where from_bdy = 'postcode' drop table if exists testing.temp_mb; create table testing.temp_mb as select mb.* -from admin_bdys_202208.abs_2021_mb as mb -inner join admin_bdys_202208.postcode_bdys as pc on st_intersects(mb.geom, pc.geom) --- inner join admin_bdys_202208.postcode_bdys as pc on st_intersects(st_centroid(mb.geom), pc.geom) +from admin_bdys_202211.abs_2021_mb as mb +inner join admin_bdys_202211.postcode_bdys as pc on st_intersects(mb.geom, pc.geom) +-- inner join admin_bdys_202211.postcode_bdys as pc on st_intersects(st_centroid(mb.geom), pc.geom) and postcode in ('2050', '2042'); 
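The testing queries above inspect individual postcode-to-LGA splits. A complementary sanity check, sketched below on the assumption that every overlapping boundary pair is retained in the concordance table, is that `address_percent` totals roughly 100 for each `from_id`; the 1% tolerance is arbitrary (it only allows for rounding to one decimal place) and the connection settings are placeholders.

```python
# Rough QA sketch (assumption: all overlapping pairs are kept, so address_percent
# should total ~100 per "from" boundary once rounding is allowed for).
import psycopg

# placeholder connection settings - adjust to your environment
with psycopg.connect(host="localhost", dbname="geo", user="postgres", password="password") as conn:
    with conn.cursor() as cur:
        cur.execute("""
            SELECT from_id,
                   sum(address_percent) AS total_percent
            FROM gnaf_202211.boundary_concordance
            WHERE from_source = 'geoscape 202211'
              AND from_bdy = 'postcode'
              AND to_source = 'abs 2016'
              AND to_bdy = 'lga'
            GROUP BY from_id
            HAVING abs(sum(address_percent) - 100.0) > 1.0
            ORDER BY total_percent
        """)
        rows = cur.fetchall()
        print(f"{len(rows)} postcodes with percentages not totalling ~100%")
        for from_id, total_percent in rows[:20]:
            print(f"{from_id}\t{total_percent}")
```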
diff --git a/testing/01_create_concordance_table.sql b/testing/01_create_concordance_table.sql index 1a39878..dd9e34e 100644 --- a/testing/01_create_concordance_table.sql +++ b/testing/01_create_concordance_table.sql @@ -16,7 +16,7 @@ with mb as ( state as source_state from mb inner join census_2016_bdys.lga_2016_aust as abs_lga on st_intersects(mb.geom, abs_lga.geom) - inner join gnaf_202208.address_principals as gnaf on gnaf.mb_2016_code = mb.mb_2016_code + inner join gnaf_202211.address_principals as gnaf on gnaf.mb_2016_code = mb.mb_2016_code ) select source.gnaf_pid, source_id, @@ -25,7 +25,7 @@ select source.gnaf_pid, lga_pid as target_id, lga_name as target_name, state as target_state -from gnaf_202208.address_principal_admin_boundaries as psma +from gnaf_202211.address_principal_admin_boundaries as psma inner join source on source.gnaf_pid = psma.gnaf_pid ; analyse temp_bdy_concordance; @@ -81,7 +81,7 @@ with source_counts as ( ), target as ( select lga_pid, st_collect(geom) as geom - from admin_bdys_202208.local_government_areas + from admin_bdys_202211.local_government_areas group by lga_pid ) select final.*, diff --git a/testing/01_create_postcode_lga_concordance_table.sql b/testing/01_create_postcode_lga_concordance_table.sql index 79574f8..b0a401e 100644 --- a/testing/01_create_postcode_lga_concordance_table.sql +++ b/testing/01_create_postcode_lga_concordance_table.sql @@ -13,8 +13,8 @@ select source.gnaf_pid, target.lga_pid as target_id, target.lga_name as target_name, target.state as target_state -from gnaf_202208.address_principal_admin_boundaries as target -inner join gnaf_202208.address_principals as source on source.gnaf_pid = target.gnaf_pid +from gnaf_202211.address_principal_admin_boundaries as target +inner join gnaf_202211.address_principals as source on source.gnaf_pid = target.gnaf_pid ; analyse temp_bdy_concordance; @@ -31,7 +31,7 @@ where target_state = 'ACT' update temp_bdy_concordance as tmp set target_id = 'lgaot9999991', target_name = 'Unincorporated - Norfolk Island' -from gnaf_202208.address_principal_admin_boundaries as psma +from gnaf_202211.address_principal_admin_boundaries as psma where psma.gnaf_pid = tmp.gnaf_pid and locality_pid = 'locc15e0d2d6f2a' and target_id is null; @@ -39,7 +39,7 @@ where psma.gnaf_pid = tmp.gnaf_pid update temp_bdy_concordance as tmp set target_id = 'lgaot9999992', target_name = 'Unincorporated - Jervis Bay' -from gnaf_202208.address_principal_admin_boundaries as psma +from gnaf_202211.address_principal_admin_boundaries as psma where psma.gnaf_pid = tmp.gnaf_pid and locality_pid = 'loced195c315de9' and target_id is null; @@ -47,7 +47,7 @@ where psma.gnaf_pid = tmp.gnaf_pid update temp_bdy_concordance as tmp set target_id = 'lgasa9999991', target_name = 'Unincorporated - Thistle Island' -from gnaf_202208.address_principal_admin_boundaries as psma +from gnaf_202211.address_principal_admin_boundaries as psma where psma.gnaf_pid = tmp.gnaf_pid and locality_pid = '250190776' and target_id is null; @@ -56,7 +56,7 @@ where psma.gnaf_pid = tmp.gnaf_pid update temp_bdy_concordance as tmp set target_id = 'lgacbffb11990f2', target_name = 'Hobart City' -from gnaf_202208.address_principal_admin_boundaries as psma +from gnaf_202211.address_principal_admin_boundaries as psma where psma.gnaf_pid = tmp.gnaf_pid and locality_pid = 'loc0f7a581b85b7' and target_id is null; @@ -65,7 +65,7 @@ where psma.gnaf_pid = tmp.gnaf_pid update temp_bdy_concordance as tmp set target_id = 'lgaa8d127fa14e7', target_name = 'Ceduna' -from 
gnaf_202208.address_principal_admin_boundaries as psma +from gnaf_202211.address_principal_admin_boundaries as psma where psma.gnaf_pid = tmp.gnaf_pid and locality_pid = 'loccf8be9dcdacd' and target_id is null; @@ -74,7 +74,7 @@ where psma.gnaf_pid = tmp.gnaf_pid update temp_bdy_concordance as tmp set target_id = 'lga7872e04f6637', target_name = 'Tenterfield' -from gnaf_202208.address_principal_admin_boundaries as psma +from gnaf_202211.address_principal_admin_boundaries as psma where psma.gnaf_pid = tmp.gnaf_pid and locality_pid = 'loc552bd3aef1b8' and target_id is null; @@ -90,7 +90,7 @@ analyse temp_bdy_concordance; -- locality_name, -- postcode, -- state --- from temp_bdy_concordance as tmp, gnaf_202208.address_principal_admin_boundaries as psma +-- from temp_bdy_concordance as tmp, gnaf_202211.address_principal_admin_boundaries as psma -- where psma.gnaf_pid = tmp.gnaf_pid -- and target_id is null -- group by locality_name, diff --git a/testing/01a_abs_bdy_concordances.sql b/testing/01a_abs_bdy_concordances.sql index 96d415b..439f651 100644 --- a/testing/01a_abs_bdy_concordances.sql +++ b/testing/01a_abs_bdy_concordances.sql @@ -28,7 +28,7 @@ with agg as ( lga_16name as to_name, -- state as to_state, count(*) as address_count - from gnaf_202208.address_principal_census_2016_boundaries + from gnaf_202211.address_principal_census_2016_boundaries group by from_id, from_name, -- from_state, diff --git a/testing/xx_geoscape_testing.sql b/testing/xx_geoscape_testing.sql index da0e1fa..8ca080a 100644 --- a/testing/xx_geoscape_testing.sql +++ b/testing/xx_geoscape_testing.sql @@ -12,7 +12,7 @@ with blg as ( blg.gnaf_pid as blg_gnaf_pid, is_residential, building_count - from gnaf_202208.address_principals as gnaf + from gnaf_202211.address_principals as gnaf full outer join blg on blg.gnaf_pid = gnaf.gnaf_pid ) select is_residential, diff --git a/testing/xx_lga_testing.sql b/testing/xx_lga_testing.sql index 88a0088..41b8d88 100644 --- a/testing/xx_lga_testing.sql +++ b/testing/xx_lga_testing.sql @@ -29,7 +29,7 @@ -- ), gnaf_2022 as ( -- select mb_2016_code, -- count(*) as address_count --- from gnaf_202208.address_principals +-- from gnaf_202211.address_principals -- group by mb_2016_code -- ) -- select mb.mb_2016_code, @@ -103,7 +103,7 @@ with psma_lga as ( select lga_pid, name as psma_name, st_collect(geom) as geom - from admin_bdys_202208.local_government_areas + from admin_bdys_202211.local_government_areas group by lga_pid, name ), psma_lga_pnt as ( @@ -181,7 +181,7 @@ with psma_lga as ( select lga_pid, name as psma_name, sum(st_area(st_transform(geom, 3577))) as area_m2 - from admin_bdys_202208.local_government_areas + from admin_bdys_202211.local_government_areas group by lga_pid, name ), psma_lga_pnt as ( diff --git a/testing/xx_testing.sql b/testing/xx_testing.sql index f93ff82..a297e07 100644 --- a/testing/xx_testing.sql +++ b/testing/xx_testing.sql @@ -55,8 +55,8 @@ select bdy.state, bdy.postcode, bdy.locality_name, count(bdy.gnaf_pid) as addr_count -from gnaf_202208.address_principal_admin_boundaries as bdy - inner join gnaf_202208.address_principals as gnaf on gnaf.gnaf_pid = bdy.gnaf_pid +from gnaf_202211.address_principal_admin_boundaries as bdy + inner join gnaf_202211.address_principals as gnaf on gnaf.gnaf_pid = bdy.gnaf_pid group by bdy.state, bdy.lga_name, bdy.postcode, @@ -82,9 +82,9 @@ select bdy.state, bdy.postcode, bdy.locality_name, count(bdy.gnaf_pid) as addr_count -from gnaf_202208.address_principal_admin_boundaries as bdy -inner join gnaf_202208.address_principals 
as gnaf on gnaf.gnaf_pid = bdy.gnaf_pid -inner join admin_bdys_202208.abs_2016_mb as mb on mb.mb_16code = gnaf.mb_2016_code +from gnaf_202211.address_principal_admin_boundaries as bdy +inner join gnaf_202211.address_principals as gnaf on gnaf.gnaf_pid = bdy.gnaf_pid +inner join admin_bdys_202211.abs_2016_mb as mb on mb.mb_16code = gnaf.mb_2016_code where mb.mb_category = 'RESIDENTIAL' group by bdy.state, bdy.lga_name, @@ -101,18 +101,18 @@ LIMIT 50 select * -from gnaf_202208.address_principals +from gnaf_202211.address_principals ; select * -from gnaf_202208.address_alias_admin_boundaries +from gnaf_202211.address_alias_admin_boundaries ; select * -from admin_bdys_202208.abs_2016_mb +from admin_bdys_202211.abs_2016_mb ; -select * from admin_bdys_202208.local_government_areas +select * from admin_bdys_202211.local_government_areas order by name, state; diff --git a/xx_create_concordance_file_and_copy_to_s3.sh b/xx_create_concordance_file_and_copy_to_s3.sh index 7987390..2d964e5 100644 --- a/xx_create_concordance_file_and_copy_to_s3.sh +++ b/xx_create_concordance_file_and_copy_to_s3.sh @@ -3,8 +3,8 @@ conda activate geo AWS_PROFILE="minus34" -OUTPUT_FOLDER="/Users/$(whoami)/tmp/geoscape_202208" -OUTPUT_FOLDER_2020="/Users/$(whoami)/tmp/geoscape_202208_gda2020" +OUTPUT_FOLDER="/Users/$(whoami)/tmp/geoscape_202211" +OUTPUT_FOLDER_2020="/Users/$(whoami)/tmp/geoscape_202211_gda2020" mkdir -p "${OUTPUT_FOLDER}" mkdir -p "${OUTPUT_FOLDER_2020}" @@ -12,11 +12,11 @@ mkdir -p "${OUTPUT_FOLDER_2020}" # Process using GDA94 boundaries python3 /Users/$(whoami)/git/iag_geo/concord/create_concordance_file.py --pgdb=geo --output-path=${OUTPUT_FOLDER} # Process using GDA2020 boundaries -python3 /Users/$(whoami)/git/iag_geo/concord/create_concordance_file.py --pgdb=geo --admin-schema="admin_bdys_202208_gda2020" --gnaf-schema="gnaf_202208_gda2020" --output-path=${OUTPUT_FOLDER_2020} +python3 /Users/$(whoami)/git/iag_geo/concord/create_concordance_file.py --pgdb=geo --admin-schema="admin_bdys_202211_gda2020" --gnaf-schema="gnaf_202211_gda2020" --output-path=${OUTPUT_FOLDER_2020} # copy concordance files to GDA94 & GDA2020 folders as GDA2020 would be the same as the GDA94 files -aws --profile=${AWS_PROFILE} s3 sync ${OUTPUT_FOLDER} s3://minus34.com/opendata/geoscape-202208 --exclude "*" --include "*.csv" --acl public-read -aws --profile=${AWS_PROFILE} s3 sync ${OUTPUT_FOLDER_2020} s3://minus34.com/opendata/geoscape-202208-gda2020 --exclude "*" --include "*.csv" --acl public-read +aws --profile=${AWS_PROFILE} s3 sync ${OUTPUT_FOLDER} s3://minus34.com/opendata/geoscape-202211 --exclude "*" --include "*.csv" --acl public-read +aws --profile=${AWS_PROFILE} s3 sync ${OUTPUT_FOLDER_2020} s3://minus34.com/opendata/geoscape-202211-gda2020 --exclude "*" --include "*.csv" --acl public-read # copy GDA94 score file to GitHub repo local files cp ${OUTPUT_FOLDER}/boundary_concordance_score.csv /Users/$(whoami)/git/iag_geo/concord/data/ From 44d914d5053e4ec8d8f5d19cd73e555c33ccbcc8 Mon Sep 17 00:00:00 2001 From: Hugh Saalmans Date: Mon, 21 Nov 2022 15:16:50 +1100 Subject: [PATCH 2/3] updated to psycopg 3 --- README.md | 2 +- create_concordance_file.py | 8 +- geoscape.py | 247 ------------------------------------- settings.py | 6 +- 4 files changed, 9 insertions(+), 254 deletions(-) diff --git a/README.md b/README.md index 355486b..27c5939 100644 --- a/README.md +++ b/README.md @@ -100,7 +100,7 @@ Running the script requires the following open data, available as Postgres dump 2. 
Edit the `02_create_residential_address_table.sql` in the [postgres-scripts](/postgres-scripts/data-prep) folder to suit your dataset and schema name 3. Run the above SQL script 3. Review & edit the boundaries to process in `settings.py` as required - make any required changes in the sections near the bottom marked for editing. If optionally using Geoscape Buildings data for residential address - change the `residential_address_source` to use it. -4. Add `psycopg2` to your Python 3.x environment +4. Add `psycopg` to your Python 3.x environment 5. Run the script (takes ~30-45 mins on a 2017 MacBook Pro with 16Gb of RAM and 4 cores) #### Command Line Options diff --git a/create_concordance_file.py b/create_concordance_file.py index 8d6028e..647458b 100644 --- a/create_concordance_file.py +++ b/create_concordance_file.py @@ -2,7 +2,7 @@ import geoscape import logging import os -import psycopg2 # need to install package +import psycopg # need to install package import settings # gets global vars and runtime arguments from datetime import datetime @@ -10,7 +10,7 @@ def main(): # connect to Postgres database - pg_conn = psycopg2.connect(settings.pg_connect_string) + pg_conn = psycopg.connect(settings.pg_connect_string) pg_conn.autocommit = True pg_cur = pg_conn.cursor() @@ -450,7 +450,9 @@ def export_to_csv(pg_cur, table, file_name): to_bdy ) TO STDOUT WITH CSV HEADER""" with open(os.path.join(settings.output_path, file_name), "w") as f: - pg_cur.copy_expert(query, f) + with pg_cur.copy(query) as copy: + while data := f.read(): + copy.write(data) if __name__ == "__main__": diff --git a/geoscape.py b/geoscape.py index dbc1874..0863d3a 100644 --- a/geoscape.py +++ b/geoscape.py @@ -1,71 +1,6 @@ -# import io -import multiprocessing -import math import os -# import platform -import psycopg2 import settings -import subprocess -# import sys - - -# takes a list of sql queries or command lines and runs them using multiprocessing -def multiprocess_list(mp_type, work_list, logger): - pool = multiprocessing.Pool(processes=settings.max_processes) - - num_jobs = len(work_list) - - if mp_type == "sql": - results = pool.imap_unordered(run_sql_multiprocessing, work_list) - else: - results = pool.imap_unordered(run_command_line, work_list) - - pool.close() - pool.join() - - result_list = list(results) - num_results = len(result_list) - - if num_jobs > num_results: - logger.warning("\t- A MULTIPROCESSING PROCESS FAILED WITHOUT AN ERROR\nACTION: Check the record counts") - - for result in result_list: - if result != "SUCCESS": - logger.info(result) - - -def run_sql_multiprocessing(the_sql): - pg_conn = psycopg2.connect(settings.pg_connect_string) - pg_conn.autocommit = True - pg_cur = pg_conn.cursor() - - # set raw gnaf database schema (it's needed for the primary and foreign key creation) - if settings.raw_gnaf_schema != "public": - pg_cur.execute(f"SET search_path = {settings.raw_gnaf_schema}, public, pg_catalog") - - try: - pg_cur.execute(the_sql) - result = "SUCCESS" - except Exception as ex: - result = f"SQL FAILED! : {the_sql} : {ex}" - - pg_cur.close() - pg_conn.close() - - return result - - -def run_command_line(cmd): - # run the command line without any output (it'll still tell you if it fails miserably) - try: - f_null = open(os.devnull, "w") - subprocess.call(cmd, shell=True, stdout=f_null, stderr=subprocess.STDOUT) - result = "SUCCESS" - except Exception as ex: - result = f"COMMAND FAILED! 
: {cmd} : {ex}" - - return result def open_sql_file(file_name): @@ -73,20 +8,7 @@ def open_sql_file(file_name): return prep_sql(sql) -# change schema names in an array of SQL script if schemas not the default -def prep_sql_list(sql_list): - output_list = [] - for sql in sql_list: - output_list.append(prep_sql(sql)) - return output_list - - -# set schema names in the SQL script def prep_sql(sql): - # if settings.raw_gnaf_schema is not None: - # sql = sql.replace(" raw_gnaf.", f" {settings.raw_gnaf_schema}.") - # if settings.raw_admin_bdys_schema is not None: - # sql = sql.replace(" raw_admin_bdys.", f" {settings.raw_admin_bdys_schema}.") if settings.gnaf_schema is not None: sql = sql.replace(" gnaf.", f" {settings.gnaf_schema}.") if settings.admin_bdys_schema is not None: @@ -97,172 +19,3 @@ def prep_sql(sql): sql = sql.replace(" postgres;", f" {settings.pg_user};") return sql - - -def split_sql_into_list(pg_cur, the_sql, table_schema, table_name, table_alias, table_gid, logger): - # get min max gid values from the table to split - min_max_sql = f"SELECT MIN({table_gid}) AS min, MAX({table_gid}) AS max FROM {table_schema}.{table_name}" - - pg_cur.execute(min_max_sql) - - try: - result = pg_cur.fetchone() - - min_pkey = int(result[0]) - max_pkey = int(result[1]) - diff = max_pkey - min_pkey - - # Number of records in each query - rows_per_request = int(math.floor(float(diff) / float(settings.max_processes))) + 1 - - # If less records than processes or rows per request, - # reduce both to allow for a minimum of 15 records each process - if float(diff) / float(settings.max_processes) < 10.0: - rows_per_request = 10 - processes = int(math.floor(float(diff) / 10.0)) + 1 - logger.info(f"\t\t- running {processes} processes (adjusted due to low row count in table to split)") - else: - processes = settings.max_processes - - # create list of sql statements to run with multiprocessing - sql_list = [] - start_pkey = min_pkey - 1 - - for i in range(0, processes): - end_pkey = start_pkey + rows_per_request - - where_clause = \ - f" WHERE {table_alias}.{table_gid} > {start_pkey} AND {table_alias}.{table_gid} <= {end_pkey}" - - if "WHERE " in the_sql: - mp_sql = the_sql.replace(" WHERE ", where_clause + " AND ") - elif "GROUP BY " in the_sql: - mp_sql = the_sql.replace("GROUP BY ", where_clause + " GROUP BY ") - elif "ORDER BY " in the_sql: - mp_sql = the_sql.replace("ORDER BY ", where_clause + " ORDER BY ") - else: - if ";" in the_sql: - mp_sql = the_sql.replace(";", where_clause + ";") - else: - mp_sql = the_sql + where_clause - logger.warning("\t\t- NOTICE: no ; found at the end of the SQL statement") - - sql_list.append(mp_sql) - start_pkey = end_pkey - - # logger.info("\n".join(sql_list)) - - return sql_list - except Exception as ex: - logger.fatal(f"Looks like the table in this query is empty: {min_max_sql}\n{ex}") - return None - - -def multiprocess_shapefile_load(work_list, logger): - pool = multiprocessing.Pool(processes=settings.max_processes) - - num_jobs = len(work_list) - - results = pool.imap_unordered(intermediate_shapefile_load_step, work_list) - - pool.close() - pool.join() - - result_list = list(results) - num_results = len(result_list) - - if num_jobs > num_results: - logger.warning("\t- A MULTIPROCESSING PROCESS FAILED WITHOUT AN ERROR\nACTION: Check the record counts") - - for result in result_list: - if result != "SUCCESS": - logger.info(result) - - -def intermediate_shapefile_load_step(work_dict): - file_path = work_dict["file_path"] - pg_table = work_dict["pg_table"] - pg_schema = 
work_dict["pg_schema"] - delete_table = work_dict["delete_table"] - spatial = work_dict["spatial"] - - result = import_shapefile_to_postgres(file_path, pg_table, pg_schema, delete_table, spatial) - - return result - - -# imports a Shapefile into Postgres in 2 steps: SHP > SQL; SQL > Postgres -# overcomes issues trying to use psql with PGPASSWORD set at runtime -def import_shapefile_to_postgres(file_path, pg_table, pg_schema, delete_table, spatial): - # delete target table or append to it? - if delete_table: - # add delete and spatial index flag - delete_append_flag = "-d -I" - else: - delete_append_flag = "-a" - - # assign coordinate system if spatial, otherwise flag as non-spatial - if spatial: - spatial_or_dbf_flags = f"-s {settings.srid}" - else: - spatial_or_dbf_flags = "-G -n" - - # build shp2pgsql command line - shp2pgsql_cmd = f"shp2pgsql {delete_append_flag} {spatial_or_dbf_flags} -i \"{file_path}\" {pg_schema}.{pg_table}" - # print(shp2pgsql_cmd) - - # convert the Shapefile to SQL statements - try: - process = subprocess.Popen(shp2pgsql_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True) - sqlobj, err = process.communicate() - except Exception as ex: - return f"Importing {file_path} - Couldn't convert Shapefile to SQL : {ex}" - - # prep Shapefile SQL - sql = sqlobj.decode("utf-8") # this is required for Python 3 - sql = sql.replace("Shapefile type: ", "-- Shapefile type: ") - sql = sql.replace("Postgis type: ", "-- Postgis type: ") - sql = sql.replace("SELECT DropGeometryColumn", "-- SELECT DropGeometryColumn") - - # # bug in shp2pgsql? - an append command will still create a spatial index if requested - disable it - # if not delete_table or not spatial: - # sql = sql.replace("CREATE INDEX ", "-- CREATE INDEX ") - - # this is required due to differing approaches by different versions of PostGIS - sql = sql.replace("DROP TABLE ", "DROP TABLE IF EXISTS ") - sql = sql.replace("DROP TABLE IF EXISTS IF EXISTS ", "DROP TABLE IF EXISTS ") - - # import data to Postgres - pg_conn = psycopg2.connect(settings.pg_connect_string) - pg_conn.autocommit = True - pg_cur = pg_conn.cursor() - - try: - pg_cur.execute(sql) - except Exception as ex: - # if import fails for some reason - output sql to file for debugging - file_name = os.path.basename(file_path) - - target = open(os.path.join(os.path.dirname(os.path.realpath(__file__)), f"error_debug_{file_name}.sql"), "w") - target.write(sql) - - pg_cur.close() - pg_conn.close() - - return f"\tImporting {file_name} - Couldn't run Shapefile SQL\nshp2pgsql result was: {ex} " - - # Cluster table on spatial index for performance - if delete_table and spatial: - sql = f"ALTER TABLE {pg_schema}.{pg_table} CLUSTER ON {pg_table}_geom_idx" - - try: - pg_cur.execute(sql) - except Exception as ex: - pg_cur.close() - pg_conn.close() - return f"\tImporting {pg_table} - Couldn't cluster on spatial index : {ex}" - - pg_cur.close() - pg_conn.close() - - return "SUCCESS" diff --git a/settings.py b/settings.py index 47c3afa..d34c910 100644 --- a/settings.py +++ b/settings.py @@ -3,7 +3,7 @@ import os import argparse import platform -import psycopg2 +import psycopg import sys from datetime import datetime @@ -33,9 +33,9 @@ def get_geoscape_version(date): return gs_version, previous_gs_version -# get python, psycopg2 and OS versions +# get python, psycopg and OS versions python_version = sys.version.split("(")[0].strip() -psycopg2_version = psycopg2.__version__.split("(")[0].strip() +psycopg_version = psycopg.__version__.split("(")[0].strip() os_version = 
platform.system() + " " + platform.version().strip() # get the command line arguments for the script From 2616daefed754e11e26266e571354784df0f4ad8 Mon Sep 17 00:00:00 2001 From: Hugh Saalmans Date: Wed, 23 Nov 2022 08:53:08 +1100 Subject: [PATCH 3/3] fixed export to CSV --- create_concordance_file.py | 7 ++++--- data/boundary_concordance_score.csv | 4 ++-- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/create_concordance_file.py b/create_concordance_file.py index 647458b..722fb43 100644 --- a/create_concordance_file.py +++ b/create_concordance_file.py @@ -449,10 +449,11 @@ def export_to_csv(pg_cur, table, file_name): to_source, to_bdy ) TO STDOUT WITH CSV HEADER""" - with open(os.path.join(settings.output_path, file_name), "w") as f: + + with open(os.path.join(settings.output_path, file_name), "wb") as f: with pg_cur.copy(query) as copy: - while data := f.read(): - copy.write(data) + while data := copy.read(): + f.write(data) if __name__ == "__main__": diff --git a/data/boundary_concordance_score.csv b/data/boundary_concordance_score.csv index 5ddecbb..2716825 100644 --- a/data/boundary_concordance_score.csv +++ b/data/boundary_concordance_score.csv @@ -1,9 +1,9 @@ from_source,from_bdy,to_source,to_bdy,concordance_percent,error_percent abs 2016,gcc,abs 2021,gccsa,100, -abs 2016,lga,abs 2016,sa3,74,4.7 +abs 2016,lga,abs 2016,sa3,73,4.7 abs 2016,lga,abs 2016,ste,100,0.2 abs 2016,poa,abs 2016,lga,93,1.4 -abs 2016,poa,abs 2016,sa2,62,8.6 +abs 2016,poa,abs 2016,sa2,62,8.7 abs 2016,sa1,abs 2016,gcc,100,0.0 abs 2016,sa1,abs 2016,sa2,100,0.0 abs 2016,sa1,abs 2016,sa3,100,0.0
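The fix in PATCH 3/3 reads the CSV data from the psycopg 3 `Copy` object and writes the bytes to the output file, rather than reading from the file handle as PATCH 2/3 did. For anyone porting other psycopg2 `copy_expert` calls, here is the same COPY TO STDOUT pattern in isolation; the table name, output path and connection settings are placeholders.

```python
# Minimal sketch of the psycopg 3 export pattern used in the patch above:
# COPY ... TO STDOUT is read in chunks from the Copy object and written to a binary file.
import psycopg

# placeholder connection settings, table and output path - adjust to your environment
with psycopg.connect(host="localhost", dbname="geo", user="postgres", password="password") as conn:
    with conn.cursor() as cur:
        query = "COPY (SELECT * FROM gnaf_202211.boundary_concordance) TO STDOUT WITH CSV HEADER"
        with open("/tmp/boundary_concordance.csv", "wb") as f:
            with cur.copy(query) as copy:
                while data := copy.read():  # returns an empty chunk once the server is done
                    f.write(data)
```

Opening the file in binary mode matters here: `Copy.read()` returns bytes-like chunks, and an empty chunk signals the end of the server's output.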