worldbank · zacdezgeo · Nov 21, 2024 · Nov 5, 2024 · Nov 5, 2024 · Nov 5, 2024
diff --git a/docker-compose.yaml b/docker-compose.yaml
@@ -2,9 +2,6 @@ version: '3'
 
 services:
   database:
-    # at time of writing this, ARM64 is not supported so we make sure to use
-    # a supported platform: https://github.com/postgis/docker-postgis/issues/216
-    # Could possibly switch to https://github.com/vincentsarago/containers
     platform: linux/amd64
     image: postgis/postgis:15-3.4
     environment:
@@ -13,6 +10,23 @@ services:
       - POSTGRES_DB=postgis
     ports:
       - 5439:5432
-    command: postgres -N 500
+    command: >-  # -N 500 removed: -c max_connections=8 overrode it
+      postgres
+      -c checkpoint_timeout=30min
+      -c synchronous_commit=off
+      -c max_wal_senders=0
+      -c max_connections=8
+      -c shared_buffers=2GB
+      -c effective_cache_size=6GB
+      -c maintenance_work_mem=512MB
+      -c checkpoint_completion_target=0.9
+      -c wal_buffers=16MB
+      -c default_statistics_target=100
+      -c random_page_cost=1.1
+      -c effective_io_concurrency=200
+      -c work_mem=256MB
+      -c huge_pages=off
+      -c min_wal_size=1GB
+      -c max_wal_size=4GB
     volumes:
-      - ./.pgdata:/var/lib/postgresql/data
+      - ./.pgdata:/var/lib/postgresql/data
diff --git a/docs/acceptance/db.md b/docs/acceptance/db.md
@@ -54,32 +54,15 @@ You can use the CLI tool for data ingestion. First, ensure you have the required
 poetry install
 ```
 
-To download the Parquet file from S3 and load it into the database, run the following command:
+To load a Parquet file into the database, run the following command:
 
 ```bash
-poetry run space2stats-ingest download-and-load \
-    "s3://<bucket>/space2stats.parquet" \
+poetry run space2stats-ingest load \
     "postgresql://username:password@localhost:5439/postgres" \
-    "<path>/space2stats.json" \
-    --parquet-file "local.parquet"
+    "<item_path>" \
+    "local.parquet"
 ```
 
-Alternatively, you can run the `download` and `load` commands separately:
-
-1. **Download the Parquet file**:
-   ```bash
-   poetry run space2stats-ingest download "s3://<bucket>/space2stats.parquet" --local-path "local.parquet"
-   ```
-
-2. **Load the Parquet file into the database**:
-   ```bash
-   poetry run space2stats-ingest download-and-load \
-    "s3://<bucket>/space2stats.parquet" \
-    "postgresql://username:password@localhost:5439/postgres" \
-    "<path>/space2stats.json" \
-    --parquet-file "local.parquet"  
-   ```
-
 ### Database Configuration
 
 Once connected to the database via `psql` or a PostgreSQL client (e.g., `pgAdmin`), execute the following SQL command to create an index on the `space2stats` table:
@@ -110,3 +93,28 @@ SELECT sum_pop_2020 FROM space2stats WHERE hex_id IN ('86beabd8fffffff', '86beab
 ### Conclusion
 
 Ensure all steps are followed to verify the ETL process, database setup, and data ingestion pipeline. Reach out to the development team for any further assistance or troubleshooting.
+
+
+#### Updating test
+
+- Spin up the database with Docker:
+```
+docker-compose up
+```
+- Download initial dataset:
+```
+aws s3 cp s3://wbg-geography01/Space2Stats/parquet/GLOBAL/space2stats.parquet .
+download: s3://wbg-geography01/Space2Stats/parquet/GLOBAL/space2stats.parquet to ./space2stats.parquet
+```
+- Upload initial dataset:
+```
+space2stats-ingest <connection_string> ./space2stats_ingest/METADATA/stac/space2stats/space2stats_population_2020/space2stats_population_2020.json space2stats.parquet
+```
+- Generate second dataset:
+```
+python space2stats_ingest/METADATA/generate_test_data.py 
+```
+- Upload second dataset:
+```
+space2stats-ingest <connection_string> ./space2stats_ingest/METADATA/stac/space2stats/space2stats_population_2020/space2stats_reupload_test.json space2stats_test.parquet 
+```
diff --git a/space2stats_api/src/poetry.lock b/space2stats_api/src/poetry.lock
diff --git a/space2stats_api/src/pyproject.toml b/space2stats_api/src/pyproject.toml
@@ -34,6 +34,8 @@ boto3 = "^1.35.25"
 pyarrow = "^17.0.0"
 adbc-driver-postgresql = "^1.2.0"
 tqdm = "^4.66.5"
+pystac = "^1.11.0"
+jsonschema = "^4.23.0"
 
 [tool.poetry.group.notebook.dependencies]
 pandas = "*"

diff --git a/space2stats_api/src/space2stats_ingest/METADATA/generate_test_data.py b/space2stats_api/src/space2stats_ingest/METADATA/generate_test_data.py
@@ -0,0 +1,23 @@
+import numpy as np
+import pyarrow as pa
+import pyarrow.parquet as pq
+
+# Load the original Parquet file
+input_file = "space2stats.parquet"
+table = pq.read_table(input_file)
+
+# Select only the 'hex_id' column
+table = table.select(["hex_id"])
+
+# Create the new 'test_column' with random values
+num_rows = table.num_rows
+test_column = pa.array(np.random.random(size=num_rows), type=pa.float64())
+
+# Add 'test_column' to the table
+table = table.append_column("test_column", test_column)
+
+# Save the modified table to a new Parquet file
+output_file = "space2stats_test.parquet"
+pq.write_table(table, output_file)
+
+print(f"Modified Parquet file saved as {output_file}")