From 64e3bb870ab534b128883dff8fd24e3f544e22f4 Mon Sep 17 00:00:00 2001 From: Dani Arribas-Bel Date: Thu, 10 Aug 2023 15:24:10 +0100 Subject: [PATCH 01/16] Start commit --- 04_area_interpolate_dask.ipynb | 427 ++++++++++++++++++ tobler/area_weighted/__init__.py | 3 +- tobler/area_weighted/area_interpolate_dask.py | 152 +++++++ 3 files changed, 581 insertions(+), 1 deletion(-) create mode 100644 04_area_interpolate_dask.ipynb create mode 100755 tobler/area_weighted/area_interpolate_dask.py diff --git a/04_area_interpolate_dask.ipynb b/04_area_interpolate_dask.ipynb new file mode 100644 index 0000000..c4ae195 --- /dev/null +++ b/04_area_interpolate_dask.ipynb @@ -0,0 +1,427 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "e3f2586a-5b6a-4d46-b6e8-1991ae3bec6f", + "metadata": {}, + "source": [ + "# (Distributed) areal interpolation" + ] + }, + { + "cell_type": "markdown", + "id": "00f875bd-2714-4551-b10c-1ef3f514478d", + "metadata": {}, + "source": [ + "In this notebook, we compare the single-core version in `tobler.area_weighted.area_interpolate` with the distributed version in `tobler.area_weighted.area_interpolate_dask`. 
" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "b70ac531-082c-4e77-9194-c5d4096b72ae", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import os\n", + "os.environ['USE_PYGEOS'] = '1'\n", + "\n", + "import geopandas\n", + "import dask_geopandas\n", + "import tobler\n", + "\n", + "from dask.distributed import Client, LocalCluster" + ] + }, + { + "cell_type": "markdown", + "id": "d16a2e15-866b-407d-b65d-54a675aefbd7", + "metadata": {}, + "source": [ + "## Setup" + ] + }, + { + "cell_type": "markdown", + "id": "e8858b38-0a72-4f72-98be-c490f8201d86", + "metadata": {}, + "source": [ + "We use the San Diego H3 dataset from the [GDS Book](https://geographicdata.science/book/data/h3_grid/build_sd_h3_grid.html):" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "dc60a91c-0d58-4b19-b180-69692286c9a0", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "h3 = geopandas.read_file((\n", + " 'https://geographicdata.science/book/'\n", + " '_downloads/d740a1069144baa1302b9561c3d31afe/sd_h3_grid.gpkg'\n", + ")).to_crs(epsg=3310)" + ] + }, + { + "cell_type": "markdown", + "id": "41b31033-9711-4102-98af-66b3b6945bcb", + "metadata": {}, + "source": [ + "And the Census tracts dataset, also from the same [source](https://geographicdata.science/book/data/sandiego/sandiego_tracts_cleaning.html):" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "271b9208-1ae7-41f0-8234-da1d1ed2030a", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/dani/mambaforge/envs/gds/lib/python3.10/site-packages/pygeos/set_operations.py:129: RuntimeWarning: invalid value encountered in intersection\n", + " return lib.intersection(a, b, **kwargs)\n" + ] + } + ], + "source": [ + "tracts = (\n", + " geopandas.read_file((\n", + " 'https://geographicdata.science/book/'\n", + " '_downloads/f2341ee89163afe06b42fc5d5ed38060/sandiego_tracts.gpkg'\n", + " 
))\n", + " .to_crs(epsg=3310)\n", + " .clip(h3)\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "2ebddef3-ed10-4164-99bb-3ca07f3aa1de", + "metadata": {}, + "source": [ + "Note in both cases we require a projected CRS and thus use the [NAD83/California Albers](https://epsg.io/3310)." + ] + }, + { + "cell_type": "markdown", + "id": "d2546bb7-abcb-4cad-8db8-c569ea9289ae", + "metadata": {}, + "source": [ + "We will set up a local Dask cluster:" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "d65ac8ec-51e2-4d2d-abb2-96a7519ed749", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2023-08-10 15:19:49,969 - distributed.diskutils - INFO - Found stale lock file and directory '/var/folders/dz/5zvlmz1s0739pm0wx2ryxjf00000gn/T/dask-worker-space/worker-taj3n78d', purging\n", + "2023-08-10 15:19:49,970 - distributed.diskutils - INFO - Found stale lock file and directory '/var/folders/dz/5zvlmz1s0739pm0wx2ryxjf00000gn/T/dask-worker-space/worker-283edkzp', purging\n", + "2023-08-10 15:19:49,970 - distributed.diskutils - INFO - Found stale lock file and directory '/var/folders/dz/5zvlmz1s0739pm0wx2ryxjf00000gn/T/dask-worker-space/worker-5sq_o8d_', purging\n", + "2023-08-10 15:19:49,971 - distributed.diskutils - INFO - Found stale lock file and directory '/var/folders/dz/5zvlmz1s0739pm0wx2ryxjf00000gn/T/dask-worker-space/worker-8pnf3b0w', purging\n", + "2023-08-10 15:19:49,972 - distributed.diskutils - INFO - Found stale lock file and directory '/var/folders/dz/5zvlmz1s0739pm0wx2ryxjf00000gn/T/dask-worker-space/worker-0kn_mkzc', purging\n", + "2023-08-10 15:19:49,973 - distributed.diskutils - INFO - Found stale lock file and directory '/var/folders/dz/5zvlmz1s0739pm0wx2ryxjf00000gn/T/dask-worker-space/worker-kkw_crni', purging\n", + "2023-08-10 15:19:49,974 - distributed.diskutils - INFO - Found stale lock file and directory 
'/var/folders/dz/5zvlmz1s0739pm0wx2ryxjf00000gn/T/dask-worker-space/worker-8ycp859d', purging\n", + "2023-08-10 15:19:49,974 - distributed.diskutils - INFO - Found stale lock file and directory '/var/folders/dz/5zvlmz1s0739pm0wx2ryxjf00000gn/T/dask-worker-space/worker-fb2nvcnm', purging\n", + "2023-08-10 15:19:49,974 - distributed.diskutils - INFO - Found stale lock file and directory '/var/folders/dz/5zvlmz1s0739pm0wx2ryxjf00000gn/T/dask-worker-space/worker-mpy0p1sd', purging\n", + "2023-08-10 15:19:49,975 - distributed.diskutils - INFO - Found stale lock file and directory '/var/folders/dz/5zvlmz1s0739pm0wx2ryxjf00000gn/T/dask-worker-space/worker-y5wx5vyl', purging\n" + ] + } + ], + "source": [ + "client = Client(LocalCluster(n_workers=10))" + ] + }, + { + "cell_type": "markdown", + "id": "88c32c7d-0ca8-4945-a1f8-edfbc8917880", + "metadata": {}, + "source": [ + "Finally, for Dask, we need to provide `dask_geopandas.GeoDataFrame` objects with spatial partitions and categorical variables properly set up:" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "7fef3124-a5d9-4712-bf9e-53fdf344c37f", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "tracts['sub_30'] = tracts['sub_30'].astype('category')\n", + "tracts['tract'] = tracts['tract'].astype('category')\n", + "\n", + "dtracts = (\n", + " dask_geopandas.from_geopandas(tracts[\n", + " ['geometry', 'sub_30', 'tract', 'total_pop', 'total_pop_white']\n", + " ], npartitions=10)\n", + " .spatial_shuffle(by='hilbert', shuffle=\"tasks\")\n", + ")\n", + "\n", + "dh3 = (\n", + " dask_geopandas.from_geopandas(h3, npartitions=10)\n", + " .spatial_shuffle(by='hilbert', shuffle=\"tasks\")\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "54f986ec-ea46-479e-aed8-5edeeaf16fda", + "metadata": {}, + "source": [ + "---\n", + "\n", + "**IMPORTANT** - At this point, only *extensive* and *categorical* variables are implemented, so those are what we will test.\n", + "\n", + "---" + ] + }, + { 
+ "cell_type": "markdown", + "id": "b783aabc-8221-40f6-a0d5-bf21dd75e2a6", + "metadata": {}, + "source": [ + "## Correctness" + ] + }, + { + "cell_type": "markdown", + "id": "958a9509-2666-4cf5-88f0-3e22ab8d8eac", + "metadata": {}, + "source": [ + "### Extensive\n", + "\n", + "Here we transfer the total population from `tracts` to `h3`." + ] + }, + { + "cell_type": "markdown", + "id": "c0e1d903-1e6f-446b-9d46-93cbdce3bc76", + "metadata": {}, + "source": [ + "First, we transfer with the single-core approach:" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "2d5dd9e8-4c55-43d0-9730-4b1f0826305f", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/dani/mambaforge/envs/gds/lib/python3.10/site-packages/shapely/predicates.py:798: RuntimeWarning: invalid value encountered in intersects\n", + " return lib.intersects(a, b, **kwargs)\n", + "/Users/dani/mambaforge/envs/gds/lib/python3.10/site-packages/shapely/set_operations.py:133: RuntimeWarning: invalid value encountered in intersection\n", + " return lib.intersection(a, b, **kwargs)\n" + ] + } + ], + "source": [ + "ext_sc = tobler.area_weighted.area_interpolate(\n", + " tracts, h3, extensive_variables=['total_pop', 'total_pop_white']\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "b7604a13-770d-45d3-bd36-23464ff39138", + "metadata": {}, + "source": [ + "Then we perform the same operation using Dask:" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "0a84dc41-f128-4e4f-98da-44c7729b73a9", + "metadata": { + "collapsed": true, + "jupyter": { + "outputs_hidden": true + }, + "tags": [] + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/dani/mambaforge/envs/gds/lib/python3.10/site-packages/shapely/set_operations.py:133: RuntimeWarning: invalid value encountered in intersection\n", + " return lib.intersection(a, b, **kwargs)\n" + ] + }, + { + "ename": "TypeError", + 
"evalue": "_area_interpolate_binning() got an unexpected keyword argument 'spatial_index'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[7], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m ext_dk \u001b[38;5;241m=\u001b[39m \u001b[43mtobler\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43marea_weighted\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43marea_interpolate_dask\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 2\u001b[0m \u001b[43m \u001b[49m\u001b[43mdtracts\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdh3\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mhex_id\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mextensive_variables\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mtotal_pop\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mtotal_pop_white\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\n\u001b[1;32m 3\u001b[0m \u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/code/tobler_darribas/tobler/area_weighted/area_interpolate_dask.py:74\u001b[0m, in \u001b[0;36marea_interpolate_dask\u001b[0;34m(left_dgdf, right_dgdf, id_col, extensive_variables, intensive_variables, categorical_variables)\u001b[0m\n\u001b[1;32m 70\u001b[0m graph \u001b[38;5;241m=\u001b[39m HighLevelGraph\u001b[38;5;241m.\u001b[39mfrom_collections(\n\u001b[1;32m 71\u001b[0m name, dsk, dependencies\u001b[38;5;241m=\u001b[39m[left_dgdf, right_dgdf]\n\u001b[1;32m 72\u001b[0m )\n\u001b[1;32m 73\u001b[0m \u001b[38;5;66;03m# Get metadata for the outcome table\u001b[39;00m\n\u001b[0;32m---> 74\u001b[0m meta 
\u001b[38;5;241m=\u001b[39m \u001b[43mid_area_interpolate\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 75\u001b[0m \u001b[43m \u001b[49m\u001b[43mleft_dgdf\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_meta\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 76\u001b[0m \u001b[43m \u001b[49m\u001b[43mright_dgdf\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_meta\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 77\u001b[0m \u001b[43m \u001b[49m\u001b[43mid_col\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 78\u001b[0m \u001b[43m \u001b[49m\u001b[43mextensive_variables\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mextensive_variables\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 79\u001b[0m \u001b[43m \u001b[49m\u001b[43mintensive_variables\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mintensive_variables\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 80\u001b[0m \u001b[43m \u001b[49m\u001b[43mtable\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 81\u001b[0m \u001b[43m \u001b[49m\u001b[43mallocate_total\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 82\u001b[0m \u001b[43m \u001b[49m\u001b[43mspatial_index\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mauto\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 83\u001b[0m \u001b[43m \u001b[49m\u001b[43mn_jobs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 84\u001b[0m \u001b[43m \u001b[49m\u001b[43mcategorical_variables\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcategorical_variables\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 85\u001b[0m \u001b[43m \u001b[49m\u001b[43mcategory_vars\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcategory_vars\u001b[49m\n\u001b[1;32m 86\u001b[0m 
\u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 87\u001b[0m \u001b[38;5;66;03m# Build output table\u001b[39;00m\n\u001b[1;32m 88\u001b[0m transferred \u001b[38;5;241m=\u001b[39m dask_geopandas\u001b[38;5;241m.\u001b[39mGeoDataFrame(\n\u001b[1;32m 89\u001b[0m graph, \n\u001b[1;32m 90\u001b[0m name,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 93\u001b[0m new_spatial_partitions\n\u001b[1;32m 94\u001b[0m )\n", + "File \u001b[0;32m~/code/tobler_darribas/tobler/area_weighted/area_interpolate_dask.py:131\u001b[0m, in \u001b[0;36mid_area_interpolate\u001b[0;34m(source_df, target_df, id_col, extensive_variables, intensive_variables, table, allocate_total, spatial_index, n_jobs, categorical_variables, category_vars)\u001b[0m\n\u001b[1;32m 118\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mid_area_interpolate\u001b[39m(\n\u001b[1;32m 119\u001b[0m source_df,\n\u001b[1;32m 120\u001b[0m target_df,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 129\u001b[0m category_vars\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 130\u001b[0m ):\n\u001b[0;32m--> 131\u001b[0m estimates \u001b[38;5;241m=\u001b[39m \u001b[43marea_interpolate\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 132\u001b[0m \u001b[43m \u001b[49m\u001b[43msource_df\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 133\u001b[0m \u001b[43m \u001b[49m\u001b[43mtarget_df\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 134\u001b[0m \u001b[43m \u001b[49m\u001b[43mextensive_variables\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mextensive_variables\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 135\u001b[0m \u001b[43m \u001b[49m\u001b[43mintensive_variables\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mintensive_variables\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 136\u001b[0m \u001b[43m \u001b[49m\u001b[43mtable\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtable\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 137\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mallocate_total\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mallocate_total\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 138\u001b[0m \u001b[43m \u001b[49m\u001b[43mspatial_index\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mspatial_index\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 139\u001b[0m \u001b[43m \u001b[49m\u001b[43mn_jobs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mn_jobs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 140\u001b[0m \u001b[43m \u001b[49m\u001b[43mcategorical_variables\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcategorical_variables\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 141\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 142\u001b[0m estimates[id_col] \u001b[38;5;241m=\u001b[39m target_df[id_col]\u001b[38;5;241m.\u001b[39mvalues\n\u001b[1;32m 144\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m categorical_variables \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n", + "\u001b[0;31mTypeError\u001b[0m: _area_interpolate_binning() got an unexpected keyword argument 'spatial_index'" + ] + } + ], + "source": [ + "ext_dk = tobler.area_weighted.area_interpolate_dask(\n", + " dtracts, dh3, 'hex_id', extensive_variables=['total_pop', 'total_pop_white']\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "db296055-3865-43f8-bfd0-0ea40f246ba7", + "metadata": {}, + "source": [ + "### Categorical" + ] + }, + { + "cell_type": "markdown", + "id": "8cf00b26-765c-40ce-a78a-dcfce4838c88", + "metadata": {}, + "source": [ + "Single-core:" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "cbb3dbbd-70e4-4c3d-935e-6a6b60341f7c", + "metadata": { + "collapsed": true, + "jupyter": { + "outputs_hidden": true + }, + "tags": [] + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/dani/mambaforge/envs/gds/lib/python3.10/site-packages/shapely/predicates.py:798: RuntimeWarning: 
invalid value encountered in intersects\n", + " return lib.intersects(a, b, **kwargs)\n", + "/Users/dani/mambaforge/envs/gds/lib/python3.10/site-packages/shapely/set_operations.py:133: RuntimeWarning: invalid value encountered in intersection\n", + " return lib.intersection(a, b, **kwargs)\n" + ] + }, + { + "ename": "TypeError", + "evalue": "Object with dtype category cannot perform the numpy op isnan", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[8], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m cat_sc \u001b[38;5;241m=\u001b[39m \u001b[43mtobler\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43marea_weighted\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43marea_interpolate\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 2\u001b[0m \u001b[43m \u001b[49m\u001b[43mtracts\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mh3\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mextensive_variables\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43msub_30\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mtract\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\n\u001b[1;32m 3\u001b[0m \u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/code/tobler_darribas/tobler/area_weighted/area_interpolate.py:267\u001b[0m, in \u001b[0;36m_area_interpolate_binning\u001b[0;34m(source_df, target_df, extensive_variables, intensive_variables, table, allocate_total)\u001b[0m\n\u001b[1;32m 265\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m extensive_variables:\n\u001b[1;32m 266\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m variable \u001b[38;5;129;01min\u001b[39;00m extensive_variables:\n\u001b[0;32m--> 267\u001b[0m vals 
\u001b[38;5;241m=\u001b[39m \u001b[43m_nan_check\u001b[49m\u001b[43m(\u001b[49m\u001b[43msource_df\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mvariable\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 268\u001b[0m vals \u001b[38;5;241m=\u001b[39m _inf_check(source_df, variable)\n\u001b[1;32m 269\u001b[0m estimates \u001b[38;5;241m=\u001b[39m diags([vals], [\u001b[38;5;241m0\u001b[39m])\u001b[38;5;241m.\u001b[39mdot(weights)\n", + "File \u001b[0;32m~/code/tobler_darribas/tobler/util/util.py:25\u001b[0m, in \u001b[0;36m_nan_check\u001b[0;34m(df, column)\u001b[0m\n\u001b[1;32m 20\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"Check if variable has nan values.\u001b[39;00m\n\u001b[1;32m 21\u001b[0m \n\u001b[1;32m 22\u001b[0m \u001b[38;5;124;03mWarn and replace nan with 0.0.\u001b[39;00m\n\u001b[1;32m 23\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 24\u001b[0m values \u001b[38;5;241m=\u001b[39m df[column]\u001b[38;5;241m.\u001b[39mvalues\n\u001b[0;32m---> 25\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m np\u001b[38;5;241m.\u001b[39many(\u001b[43mnp\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43misnan\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvalues\u001b[49m\u001b[43m)\u001b[49m) \u001b[38;5;129;01mor\u001b[39;00m np\u001b[38;5;241m.\u001b[39many(np\u001b[38;5;241m.\u001b[39misinf(values)):\n\u001b[1;32m 26\u001b[0m wherenan \u001b[38;5;241m=\u001b[39m np\u001b[38;5;241m.\u001b[39misnan(values)\n\u001b[1;32m 27\u001b[0m values[wherenan] \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0.0\u001b[39m\n", + "File \u001b[0;32m~/mambaforge/envs/gds/lib/python3.10/site-packages/pandas/core/arrays/categorical.py:1639\u001b[0m, in \u001b[0;36mCategorical.__array_ufunc__\u001b[0;34m(self, ufunc, method, *inputs, **kwargs)\u001b[0m\n\u001b[1;32m 1635\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m result\n\u001b[1;32m 1637\u001b[0m \u001b[38;5;66;03m# for all other cases, raise for now (similarly as what happens in\u001b[39;00m\n\u001b[1;32m 
1638\u001b[0m \u001b[38;5;66;03m# Series.__array_prepare__)\u001b[39;00m\n\u001b[0;32m-> 1639\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mTypeError\u001b[39;00m(\n\u001b[1;32m 1640\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mObject with dtype \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdtype\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m cannot perform \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 1641\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mthe numpy op \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mufunc\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 1642\u001b[0m )\n", + "\u001b[0;31mTypeError\u001b[0m: Object with dtype category cannot perform the numpy op isnan" + ] + } + ], + "source": [ + "cat_sc = tobler.area_weighted.area_interpolate(\n", + " tracts, h3, extensive_variables=['sub_30', 'tract']\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "76507c2b-769a-4b80-8b62-40206e2cab42", + "metadata": {}, + "source": [ + "And through Dask:" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "7107527b-88cc-4e9c-97d2-72a1d153c657", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/dani/mambaforge/envs/gds/lib/python3.10/site-packages/shapely/set_operations.py:133: RuntimeWarning: invalid value encountered in intersection\n", + " return lib.intersection(a, b, **kwargs)\n" + ] + }, + { + "ename": "TypeError", + "evalue": "_area_interpolate_binning() got an unexpected keyword argument 'spatial_index'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", + "Cell 
\u001b[0;32mIn[9], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m ext_dk \u001b[38;5;241m=\u001b[39m \u001b[43mtobler\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43marea_weighted\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43marea_interpolate_dask\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 2\u001b[0m \u001b[43m \u001b[49m\u001b[43mdtracts\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdh3\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mhex_id\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mextensive_variables\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43msub_30\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mtract\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\n\u001b[1;32m 3\u001b[0m \u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/code/tobler_darribas/tobler/area_weighted/area_interpolate_dask.py:74\u001b[0m, in \u001b[0;36marea_interpolate_dask\u001b[0;34m(left_dgdf, right_dgdf, id_col, extensive_variables, intensive_variables, categorical_variables)\u001b[0m\n\u001b[1;32m 70\u001b[0m graph \u001b[38;5;241m=\u001b[39m HighLevelGraph\u001b[38;5;241m.\u001b[39mfrom_collections(\n\u001b[1;32m 71\u001b[0m name, dsk, dependencies\u001b[38;5;241m=\u001b[39m[left_dgdf, right_dgdf]\n\u001b[1;32m 72\u001b[0m )\n\u001b[1;32m 73\u001b[0m \u001b[38;5;66;03m# Get metadata for the outcome table\u001b[39;00m\n\u001b[0;32m---> 74\u001b[0m meta \u001b[38;5;241m=\u001b[39m \u001b[43mid_area_interpolate\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 75\u001b[0m \u001b[43m \u001b[49m\u001b[43mleft_dgdf\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_meta\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 76\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mright_dgdf\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_meta\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 77\u001b[0m \u001b[43m \u001b[49m\u001b[43mid_col\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 78\u001b[0m \u001b[43m \u001b[49m\u001b[43mextensive_variables\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mextensive_variables\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 79\u001b[0m \u001b[43m \u001b[49m\u001b[43mintensive_variables\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mintensive_variables\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 80\u001b[0m \u001b[43m \u001b[49m\u001b[43mtable\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 81\u001b[0m \u001b[43m \u001b[49m\u001b[43mallocate_total\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 82\u001b[0m \u001b[43m \u001b[49m\u001b[43mspatial_index\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mauto\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 83\u001b[0m \u001b[43m \u001b[49m\u001b[43mn_jobs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 84\u001b[0m \u001b[43m \u001b[49m\u001b[43mcategorical_variables\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcategorical_variables\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 85\u001b[0m \u001b[43m \u001b[49m\u001b[43mcategory_vars\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcategory_vars\u001b[49m\n\u001b[1;32m 86\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 87\u001b[0m \u001b[38;5;66;03m# Build output table\u001b[39;00m\n\u001b[1;32m 88\u001b[0m transferred \u001b[38;5;241m=\u001b[39m dask_geopandas\u001b[38;5;241m.\u001b[39mGeoDataFrame(\n\u001b[1;32m 89\u001b[0m graph, \n\u001b[1;32m 90\u001b[0m name,\n\u001b[0;32m 
(...)\u001b[0m\n\u001b[1;32m 93\u001b[0m new_spatial_partitions\n\u001b[1;32m 94\u001b[0m )\n", + "File \u001b[0;32m~/code/tobler_darribas/tobler/area_weighted/area_interpolate_dask.py:131\u001b[0m, in \u001b[0;36mid_area_interpolate\u001b[0;34m(source_df, target_df, id_col, extensive_variables, intensive_variables, table, allocate_total, spatial_index, n_jobs, categorical_variables, category_vars)\u001b[0m\n\u001b[1;32m 118\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mid_area_interpolate\u001b[39m(\n\u001b[1;32m 119\u001b[0m source_df,\n\u001b[1;32m 120\u001b[0m target_df,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 129\u001b[0m category_vars\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 130\u001b[0m ):\n\u001b[0;32m--> 131\u001b[0m estimates \u001b[38;5;241m=\u001b[39m \u001b[43marea_interpolate\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 132\u001b[0m \u001b[43m \u001b[49m\u001b[43msource_df\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 133\u001b[0m \u001b[43m \u001b[49m\u001b[43mtarget_df\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 134\u001b[0m \u001b[43m \u001b[49m\u001b[43mextensive_variables\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mextensive_variables\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 135\u001b[0m \u001b[43m \u001b[49m\u001b[43mintensive_variables\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mintensive_variables\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 136\u001b[0m \u001b[43m \u001b[49m\u001b[43mtable\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtable\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 137\u001b[0m \u001b[43m \u001b[49m\u001b[43mallocate_total\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mallocate_total\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 138\u001b[0m \u001b[43m \u001b[49m\u001b[43mspatial_index\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mspatial_index\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 139\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mn_jobs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mn_jobs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 140\u001b[0m \u001b[43m \u001b[49m\u001b[43mcategorical_variables\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcategorical_variables\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 141\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 142\u001b[0m estimates[id_col] \u001b[38;5;241m=\u001b[39m target_df[id_col]\u001b[38;5;241m.\u001b[39mvalues\n\u001b[1;32m 144\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m categorical_variables \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n", + "\u001b[0;31mTypeError\u001b[0m: _area_interpolate_binning() got an unexpected keyword argument 'spatial_index'" + ] + } + ], + "source": [ + "ext_dk = tobler.area_weighted.area_interpolate_dask(\n", + " dtracts, dh3, 'hex_id', extensive_variables=['sub_30', 'tract']\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "571e7878-25ad-49bc-a7a5-a632988f6a4b", + "metadata": {}, + "source": [ + "## Performance" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/tobler/area_weighted/__init__.py b/tobler/area_weighted/__init__.py index dea94ee..05d056d 100644 --- a/tobler/area_weighted/__init__.py +++ b/tobler/area_weighted/__init__.py @@ -1,4 +1,5 @@ from .area_interpolate import _area_interpolate_binning as area_interpolate from .area_interpolate import _area_interpolate as _slow_area_interpolate from .area_interpolate import _area_tables, _area_tables_binning, _area_tables_raster -from 
.area_interpolate import _check_presence_of_crs \ No newline at end of file +from .area_interpolate import _check_presence_of_crs +from .area_interpolate_dask import area_interpolate_dask \ No newline at end of file diff --git a/tobler/area_weighted/area_interpolate_dask.py b/tobler/area_weighted/area_interpolate_dask.py new file mode 100755 index 0000000..c0ae853 --- /dev/null +++ b/tobler/area_weighted/area_interpolate_dask.py @@ -0,0 +1,152 @@ +import pandas +import geopandas +import dask_geopandas +import warnings +import numpy as np +from dask.base import tokenize +from dask.highlevelgraph import HighLevelGraph +from tobler.area_weighted import area_interpolate + +def area_interpolate_dask( + left_dgdf, + right_dgdf, + id_col, + extensive_variables=None, + intensive_variables=None, + categorical_variables=None, +): + if intensive_variables is not None: + raise NotImplementedError(( + "Dask-based interpolation of intensive variables is " + "not implemented yet. Please remove intensive variables to " + "be able to run the rest." 
+ )) + # Categoricals must be Dask's known categorical + if categorical_variables is not None: + category_vars = [] + for cat_var in categorical_variables: + var_names = [f'{cat_var}_{c}' for c in left_dgdf[cat_var].cat.categories] + category_vars.extend(var_names) + else: + category_vars = None + # Build tasks by joining pairs of chunks from left/right + dsk = {} + new_spatial_partitions = [] + parts = geopandas.sjoin( + left_dgdf.spatial_partitions.to_frame('geometry'), + right_dgdf.spatial_partitions.to_frame('geometry'), + how='inner', + predicate='intersects' + ) + parts_left = np.asarray(parts.index) + parts_right = np.asarray(parts['index_right'].values) + name = 'area_interpolate-' + tokenize( + right_dgdf, left_dgdf + ) + for i, (l, r) in enumerate(zip(parts_left, parts_right)): + dsk[(name, i)] = ( + id_area_interpolate, + (left_dgdf._name, l), + (right_dgdf._name, r), + id_col, + extensive_variables, + intensive_variables, + None, + True, + 'auto', + 1, + categorical_variables, + category_vars + ) + lr = left_dgdf.spatial_partitions.iloc[l] + rr = right_dgdf.spatial_partitions.iloc[r] + extent = lr.intersection(rr) + new_spatial_partitions.append(extent) + # Create geometries for new spatial partitions + new_spatial_partitions = geopandas.GeoSeries( + data=new_spatial_partitions, crs=left_dgdf.crs + ) + # Build Dask graph + graph = HighLevelGraph.from_collections( + name, dsk, dependencies=[left_dgdf, right_dgdf] + ) + # Get metadata for the outcome table + meta = id_area_interpolate( + left_dgdf._meta, + right_dgdf._meta, + id_col, + extensive_variables=extensive_variables, + intensive_variables=intensive_variables, + table=None, + allocate_total=True, + spatial_index='auto', + n_jobs=1, + categorical_variables=categorical_variables, + category_vars=category_vars + ) + # Build output table + transferred = dask_geopandas.GeoDataFrame( + graph, + name, + meta, + [None] * (len(dsk) + 1), + new_spatial_partitions + ) + # Merge chunks + out = 
right_dgdf[[id_col, 'geometry']] + ## Extensive --> Add up estimates by ID + if extensive_variables is not None: + out_extensive = ( + transferred + .groupby(id_col) + [extensive_variables] + .agg({v: 'sum' for v in extensive_variables}) + ) + out = out.join(out_extensive, on=id_col) + ## Intensive --> Weight by area of the chunk (Not implemented) + ## Categorical --> Add up proportions + if categorical_variables is not None: + out_categorical = ( + transferred + [category_vars + [id_col]] + .groupby(id_col) + .agg({v: 'sum' for v in category_vars}) + ) + out = out.join(out_categorical, on=id_col) + return out + +def id_area_interpolate( + source_df, + target_df, + id_col, + extensive_variables=None, + intensive_variables=None, + table=None, + allocate_total=True, + spatial_index='auto', + n_jobs=1, + categorical_variables=None, + category_vars=None +): + estimates = area_interpolate( + source_df, + target_df, + extensive_variables=extensive_variables, + intensive_variables=intensive_variables, + table=table, + allocate_total=allocate_total, + spatial_index=spatial_index, + n_jobs=n_jobs, + categorical_variables=categorical_variables, + ) + estimates[id_col] = target_df[id_col].values + + if categorical_variables is not None: + category_vars_to_add = [] + for category_var in category_vars: + if category_var not in estimates.columns: + category_vars_to_add.append(category_var) + estimates = estimates.join( + pandas.DataFrame(index=estimates.index, columns=category_vars_to_add) + ) + return estimates From 8b4c28f5b50f249548dfa5a84d802fe3d5cd1322 Mon Sep 17 00:00:00 2001 From: Dani Arribas-Bel Date: Thu, 10 Aug 2023 19:46:01 +0000 Subject: [PATCH 02/16] Merge init in AI --- tobler/area_weighted/__init__.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/tobler/area_weighted/__init__.py b/tobler/area_weighted/__init__.py index 7b8ce0a..5c9e7e4 100644 --- a/tobler/area_weighted/__init__.py +++ b/tobler/area_weighted/__init__.py @@ -1,5 +1,4 @@ 
from .area_interpolate import _area_interpolate_binning as area_interpolate -from .area_interpolate import _area_interpolate as _slow_area_interpolate -from .area_interpolate import _area_tables, _area_tables_binning, _area_tables_raster -from .area_interpolate import _check_presence_of_crs +from .area_interpolate import _area_tables_binning +from .area_join import area_join from .area_interpolate_dask import area_interpolate_dask From cc0dbf7dd865d998f7b4b6ab74ad97170098b742 Mon Sep 17 00:00:00 2001 From: Dani Arribas-Bel Date: Fri, 11 Aug 2023 10:39:24 +0000 Subject: [PATCH 03/16] Add demo notebook and docs, remove extensive var support. Ready for review --- 04_area_interpolate_dask.ipynb | 640 +++++++++++++----- environment.yml | 3 + tobler/area_weighted/area_interpolate_dask.py | 131 +++- 3 files changed, 593 insertions(+), 181 deletions(-) diff --git a/04_area_interpolate_dask.ipynb b/04_area_interpolate_dask.ipynb index c4ae195..9e88c68 100644 --- a/04_area_interpolate_dask.ipynb +++ b/04_area_interpolate_dask.ipynb @@ -19,7 +19,7 @@ { "cell_type": "code", "execution_count": 1, - "id": "b70ac531-082c-4e77-9194-c5d4096b72ae", + "id": "4084f715-3989-4424-943a-2a4066a8bcf2", "metadata": { "tags": [] }, @@ -28,9 +28,12 @@ "import os\n", "os.environ['USE_PYGEOS'] = '1'\n", "\n", + "import pandas\n", "import geopandas\n", "import dask_geopandas\n", "import tobler\n", + "from libpysal.examples import load_example\n", + "import numpy as np\n", "\n", "from dask.distributed import Client, LocalCluster" ] @@ -45,106 +48,82 @@ }, { "cell_type": "markdown", - "id": "e8858b38-0a72-4f72-98be-c490f8201d86", + "id": "080369e7-f3d4-41c6-a629-12ed458eb743", "metadata": {}, "source": [ - "We use the San Diego H3 dataset from the [GDS Book](https://geographicdata.science/book/data/h3_grid/build_sd_h3_grid.html):" + "Load example data from `pysal`:" ] }, { "cell_type": "code", "execution_count": 2, - "id": "dc60a91c-0d58-4b19-b180-69692286c9a0", - "metadata": { - "tags": [] - }, + 
"id": "cb395dc5-67f2-462e-a1cf-919c8e6d0ae8", + "metadata": {}, "outputs": [], "source": [ - "h3 = geopandas.read_file((\n", - " 'https://geographicdata.science/book/'\n", - " '_downloads/d740a1069144baa1302b9561c3d31afe/sd_h3_grid.gpkg'\n", - ")).to_crs(epsg=3310)" + "c1 = load_example('Charleston1')\n", + "c2 = load_example('Charleston2')\n", + "\n", + "crs = 6569 # https://epsg.io/6569\n", + "\n", + "tracts = geopandas.read_file(c1.get_path('sc_final_census2.shp')).to_crs(crs)\n", + "zip_codes = geopandas.read_file(c2.get_path('CharlestonMSA2.shp')).to_crs(crs)" ] }, { "cell_type": "markdown", - "id": "41b31033-9711-4102-98af-66b3b6945bcb", + "id": "1d11c1d7-6435-40cb-a4d4-851f63eccf01", "metadata": {}, "source": [ - "And the Census tracts dataset, also from the same [source](https://geographicdata.science/book/data/sandiego/sandiego_tracts_cleaning.html):" + "We make up a categorical variable with four classes distributed randomly across the dataset:" ] }, { "cell_type": "code", "execution_count": 3, - "id": "271b9208-1ae7-41f0-8234-da1d1ed2030a", + "id": "3543702f-5e8a-4336-a14d-19a4eeb77b1b", "metadata": { "tags": [] }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/dani/mambaforge/envs/gds/lib/python3.10/site-packages/pygeos/set_operations.py:129: RuntimeWarning: invalid value encountered in intersection\n", - " return lib.intersection(a, b, **kwargs)\n" - ] - } - ], + "outputs": [], "source": [ - "tracts = (\n", - " geopandas.read_file((\n", - " 'https://geographicdata.science/book/'\n", - " '_downloads/f2341ee89163afe06b42fc5d5ed38060/sandiego_tracts.gpkg'\n", - " ))\n", - " .to_crs(epsg=3310)\n", - " .clip(h3)\n", + "rng = np.random.default_rng(seed=42)\n", + "\n", + "tracts['rando'] = pandas.Series(\n", + " rng.integers(0, 4, len(tracts)), dtype='category'\n", ")" ] }, { "cell_type": "markdown", - "id": "2ebddef3-ed10-4164-99bb-3ca07f3aa1de", + "id": "d2546bb7-abcb-4cad-8db8-c569ea9289ae", "metadata": {}, 
"source": [ - "Note in both cases we require a projected CRS and thus use the [NAD83/California Albers](https://epsg.io/3310)." + "We will set up a local Dask cluster so you can follow the computations on the dashboard (`http://localhost:8787` by default):" ] }, { - "cell_type": "markdown", - "id": "d2546bb7-abcb-4cad-8db8-c569ea9289ae", - "metadata": {}, + "cell_type": "code", + "execution_count": 13, + "id": "d65ac8ec-51e2-4d2d-abb2-96a7519ed749", + "metadata": { + "tags": [] + }, + "outputs": [], "source": [ - "We will set up a local Dask cluster:" + "client = Client(LocalCluster(n_workers=8))" ] }, { "cell_type": "code", - "execution_count": 4, - "id": "d65ac8ec-51e2-4d2d-abb2-96a7519ed749", + "execution_count": 12, + "id": "69f06d42-f47f-4120-811b-275431b1cf3a", "metadata": { "tags": [] }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2023-08-10 15:19:49,969 - distributed.diskutils - INFO - Found stale lock file and directory '/var/folders/dz/5zvlmz1s0739pm0wx2ryxjf00000gn/T/dask-worker-space/worker-taj3n78d', purging\n", - "2023-08-10 15:19:49,970 - distributed.diskutils - INFO - Found stale lock file and directory '/var/folders/dz/5zvlmz1s0739pm0wx2ryxjf00000gn/T/dask-worker-space/worker-283edkzp', purging\n", - "2023-08-10 15:19:49,970 - distributed.diskutils - INFO - Found stale lock file and directory '/var/folders/dz/5zvlmz1s0739pm0wx2ryxjf00000gn/T/dask-worker-space/worker-5sq_o8d_', purging\n", - "2023-08-10 15:19:49,971 - distributed.diskutils - INFO - Found stale lock file and directory '/var/folders/dz/5zvlmz1s0739pm0wx2ryxjf00000gn/T/dask-worker-space/worker-8pnf3b0w', purging\n", - "2023-08-10 15:19:49,972 - distributed.diskutils - INFO - Found stale lock file and directory '/var/folders/dz/5zvlmz1s0739pm0wx2ryxjf00000gn/T/dask-worker-space/worker-0kn_mkzc', purging\n", - "2023-08-10 15:19:49,973 - distributed.diskutils - INFO - Found stale lock file and directory 
'/var/folders/dz/5zvlmz1s0739pm0wx2ryxjf00000gn/T/dask-worker-space/worker-kkw_crni', purging\n", - "2023-08-10 15:19:49,974 - distributed.diskutils - INFO - Found stale lock file and directory '/var/folders/dz/5zvlmz1s0739pm0wx2ryxjf00000gn/T/dask-worker-space/worker-8ycp859d', purging\n", - "2023-08-10 15:19:49,974 - distributed.diskutils - INFO - Found stale lock file and directory '/var/folders/dz/5zvlmz1s0739pm0wx2ryxjf00000gn/T/dask-worker-space/worker-fb2nvcnm', purging\n", - "2023-08-10 15:19:49,974 - distributed.diskutils - INFO - Found stale lock file and directory '/var/folders/dz/5zvlmz1s0739pm0wx2ryxjf00000gn/T/dask-worker-space/worker-mpy0p1sd', purging\n", - "2023-08-10 15:19:49,975 - distributed.diskutils - INFO - Found stale lock file and directory '/var/folders/dz/5zvlmz1s0739pm0wx2ryxjf00000gn/T/dask-worker-space/worker-y5wx5vyl', purging\n" - ] - } - ], + "outputs": [], "source": [ - "client = Client(LocalCluster(n_workers=10))" + "client.shutdown()" ] }, { @@ -158,24 +137,17 @@ { "cell_type": "code", "execution_count": 5, - "id": "7fef3124-a5d9-4712-bf9e-53fdf344c37f", - "metadata": { - "tags": [] - }, + "id": "a31a1a91-4071-40e2-a21f-7e035d734976", + "metadata": {}, "outputs": [], "source": [ - "tracts['sub_30'] = tracts['sub_30'].astype('category')\n", - "tracts['tract'] = tracts['tract'].astype('category')\n", - "\n", "dtracts = (\n", - " dask_geopandas.from_geopandas(tracts[\n", - " ['geometry', 'sub_30', 'tract', 'total_pop', 'total_pop_white']\n", - " ], npartitions=10)\n", + " dask_geopandas.from_geopandas(tracts[['geometry', 'rando']], npartitions=4)\n", " .spatial_shuffle(by='hilbert', shuffle=\"tasks\")\n", ")\n", "\n", - "dh3 = (\n", - " dask_geopandas.from_geopandas(h3, npartitions=10)\n", + "dzips = (\n", + " dask_geopandas.from_geopandas(zip_codes[['ZIP', 'geometry']], npartitions=4)\n", " .spatial_shuffle(by='hilbert', shuffle=\"tasks\")\n", ")" ] @@ -187,7 +159,7 @@ "source": [ "---\n", "\n", - "**IMPORTANT** - At this point, 
only *extensive* and *categorical* variables are implemented, so those are what we will test.\n", + "**IMPORTANT** - At this point, only *categorical* variables are implemented, so those are what we will test.\n", "\n", "---" ] @@ -202,212 +174,554 @@ }, { "cell_type": "markdown", - "id": "958a9509-2666-4cf5-88f0-3e22ab8d8eac", + "id": "92dafb11-ec94-43c2-baec-2a5e2a0b380d", + "metadata": {}, + "source": [ + "- Single core" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "4d4cde6d-73c1-4197-86ed-131724e21296", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "cat_sc = tobler.area_weighted.area_interpolate(\n", + " tracts, zip_codes, categorical_variables=['rando']\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "2982d8dc-c1e9-4927-8643-9900b1b09890", + "metadata": {}, + "source": [ + "- Dask" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "d8c7896f-9004-4a07-b3ba-75301f8120e5", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "cat_dk = tobler.area_weighted.area_interpolate_dask(\n", + " dtracts, dzips, 'ZIP', categorical_variables=['rando']\n", + ").compute()" + ] + }, + { + "cell_type": "markdown", + "id": "5e19b8dd-505f-4dc1-ba85-9fd825e59b43", "metadata": {}, "source": [ - "### Extensive\n", + "And we can compare both results are the same:" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "8bc830b2-99a7-4c11-a8d9-0fad3aefcf06", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "rando_0 4.188295e-08\n", + "rando_1 5.328575e-08\n", + "rando_2 5.396667e-08\n", + "rando_3 2.935173e-08\n", + "dtype: float64" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "a = (\n", + " cat_dk\n", + " .set_index('ZIP')\n", + " .reindex(zip_codes['ZIP'].values)\n", + " .drop(columns='geometry')\n", + ")\n", "\n", - "Here we transfer the total population from `tracts` to `h3`." 
+ "b = (\n", + " cat_sc\n", + " .drop(columns='geometry')\n", + " [['rando_0', 'rando_1', 'rando_2', 'rando_3']]\n", + ")\n", + "b.index = a.index\n", + "\n", + "(a - b).max()" ] }, { "cell_type": "markdown", - "id": "c0e1d903-1e6f-446b-9d46-93cbdce3bc76", + "id": "e2e04df1-3331-449c-b74c-e910239c3067", "metadata": {}, "source": [ - "First, we transfer with the single-core approach:" + "The differences in the estimates for the proportions of each area start at the 8th decimal, and thus likely rounding errors derived from the different approaches used to compute the interpolation (the single core does it in one-shot, while Dask computes parts and brings them together later with a sum)." + ] + }, + { + "cell_type": "markdown", + "id": "1debbdf4-892f-4fda-834a-0403595794ef", + "metadata": {}, + "source": [ + "## Performance\n", + "\n", + "---\n", + "\n", + "**NOTE** - Timings below do _not_ include computation time required for spatial shuffling and partitioning (which can be substantial with large datasets), or converting from `geopandas`. These are \"sunk costs\" that'll only make this approach preferable with large datasets, although they can be computed once and the result stored in disk efficiently (e.g., as Parquet files). 
Having said that, when \"larger\" is large enough is not very large in modern terms: from a handful of thousand observations the gains will be substantial if several cores/workers are available.\n", + "\n", + "---" + ] + }, + { + "cell_type": "markdown", + "id": "e5242c13-c4cd-46e2-9131-ec1734bcc142", + "metadata": {}, + "source": [ + "We can now time the example above:\n" ] }, { "cell_type": "code", - "execution_count": 6, - "id": "2d5dd9e8-4c55-43d0-9730-4b1f0826305f", + "execution_count": 9, + "id": "902e494b-65ba-4fa2-99e6-eb3a513769f8", "metadata": { "tags": [] }, "outputs": [ { - "name": "stderr", + "name": "stdout", "output_type": "stream", "text": [ - "/Users/dani/mambaforge/envs/gds/lib/python3.10/site-packages/shapely/predicates.py:798: RuntimeWarning: invalid value encountered in intersects\n", - " return lib.intersects(a, b, **kwargs)\n", - "/Users/dani/mambaforge/envs/gds/lib/python3.10/site-packages/shapely/set_operations.py:133: RuntimeWarning: invalid value encountered in intersection\n", - " return lib.intersection(a, b, **kwargs)\n" + "85.5 ms ± 4.17 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" ] } ], "source": [ - "ext_sc = tobler.area_weighted.area_interpolate(\n", - " tracts, h3, extensive_variables=['total_pop', 'total_pop_white']\n", + "%%timeit\n", + "cat_sc = tobler.area_weighted.area_interpolate(\n", + " tracts, zip_codes, categorical_variables=['rando']\n", ")" ] }, + { + "cell_type": "code", + "execution_count": 10, + "id": "5cfc44d9-f79a-4b8e-9caa-975ea64d5f0e", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "444 ms ± 2.69 ms per loop (mean ± std. dev. 
of 7 runs, 1 loop each)\n" + ] + } + ], + "source": [ + "%%timeit\n", + "cat_dk = tobler.area_weighted.area_interpolate_dask(\n", + " dtracts, dzips, 'ZIP', categorical_variables=['rando']\n", + ").compute()" + ] + }, { "cell_type": "markdown", - "id": "b7604a13-770d-45d3-bd36-23464ff39138", + "id": "a124ee86-c527-4386-be8d-2dc833270fd9", "metadata": {}, "source": [ - "Then we perform the same operation using Dask:" + "This is notably slower (about 5x!). For such a small dataset, the overhead in distributing computations and collecting them overcomes any gains in parallelism.\n", + "\n", + "Now we can artificially increase the size of the datasets by concatenating them several times and re-computing (this time we only time one execution):" ] }, { "cell_type": "code", - "execution_count": 7, - "id": "0a84dc41-f128-4e4f-98da-44c7729b73a9", + "execution_count": 24, + "id": "5f56d579-0022-45c2-845c-f351bf96ed01", "metadata": { - "collapsed": true, - "jupyter": { - "outputs_hidden": true - }, "tags": [] }, "outputs": [ { - "name": "stderr", + "name": "stdout", "output_type": "stream", "text": [ - "/Users/dani/mambaforge/envs/gds/lib/python3.10/site-packages/shapely/set_operations.py:133: RuntimeWarning: invalid value encountered in intersection\n", - " return lib.intersection(a, b, **kwargs)\n" + "40x increase | N. tracts: 4680 | N. 
ZIPs: 1680\n" ] }, { - "ename": "TypeError", - "evalue": "_area_interpolate_binning() got an unexpected keyword argument 'spatial_index'", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[7], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m ext_dk \u001b[38;5;241m=\u001b[39m \u001b[43mtobler\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43marea_weighted\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43marea_interpolate_dask\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 2\u001b[0m \u001b[43m \u001b[49m\u001b[43mdtracts\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdh3\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mhex_id\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mextensive_variables\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mtotal_pop\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mtotal_pop_white\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\n\u001b[1;32m 3\u001b[0m \u001b[43m)\u001b[49m\n", - "File \u001b[0;32m~/code/tobler_darribas/tobler/area_weighted/area_interpolate_dask.py:74\u001b[0m, in \u001b[0;36marea_interpolate_dask\u001b[0;34m(left_dgdf, right_dgdf, id_col, extensive_variables, intensive_variables, categorical_variables)\u001b[0m\n\u001b[1;32m 70\u001b[0m graph \u001b[38;5;241m=\u001b[39m HighLevelGraph\u001b[38;5;241m.\u001b[39mfrom_collections(\n\u001b[1;32m 71\u001b[0m name, dsk, dependencies\u001b[38;5;241m=\u001b[39m[left_dgdf, right_dgdf]\n\u001b[1;32m 72\u001b[0m )\n\u001b[1;32m 73\u001b[0m \u001b[38;5;66;03m# Get metadata for the outcome 
table\u001b[39;00m\n\u001b[0;32m---> 74\u001b[0m meta \u001b[38;5;241m=\u001b[39m \u001b[43mid_area_interpolate\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 75\u001b[0m \u001b[43m \u001b[49m\u001b[43mleft_dgdf\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_meta\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 76\u001b[0m \u001b[43m \u001b[49m\u001b[43mright_dgdf\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_meta\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 77\u001b[0m \u001b[43m \u001b[49m\u001b[43mid_col\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 78\u001b[0m \u001b[43m \u001b[49m\u001b[43mextensive_variables\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mextensive_variables\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 79\u001b[0m \u001b[43m \u001b[49m\u001b[43mintensive_variables\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mintensive_variables\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 80\u001b[0m \u001b[43m \u001b[49m\u001b[43mtable\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 81\u001b[0m \u001b[43m \u001b[49m\u001b[43mallocate_total\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 82\u001b[0m \u001b[43m \u001b[49m\u001b[43mspatial_index\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mauto\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 83\u001b[0m \u001b[43m \u001b[49m\u001b[43mn_jobs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 84\u001b[0m \u001b[43m \u001b[49m\u001b[43mcategorical_variables\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcategorical_variables\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 85\u001b[0m \u001b[43m \u001b[49m\u001b[43mcategory_vars\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcategory_vars\u001b[49m\n\u001b[1;32m 
86\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 87\u001b[0m \u001b[38;5;66;03m# Build output table\u001b[39;00m\n\u001b[1;32m 88\u001b[0m transferred \u001b[38;5;241m=\u001b[39m dask_geopandas\u001b[38;5;241m.\u001b[39mGeoDataFrame(\n\u001b[1;32m 89\u001b[0m graph, \n\u001b[1;32m 90\u001b[0m name,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 93\u001b[0m new_spatial_partitions\n\u001b[1;32m 94\u001b[0m )\n", - "File \u001b[0;32m~/code/tobler_darribas/tobler/area_weighted/area_interpolate_dask.py:131\u001b[0m, in \u001b[0;36mid_area_interpolate\u001b[0;34m(source_df, target_df, id_col, extensive_variables, intensive_variables, table, allocate_total, spatial_index, n_jobs, categorical_variables, category_vars)\u001b[0m\n\u001b[1;32m 118\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mid_area_interpolate\u001b[39m(\n\u001b[1;32m 119\u001b[0m source_df,\n\u001b[1;32m 120\u001b[0m target_df,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 129\u001b[0m category_vars\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 130\u001b[0m ):\n\u001b[0;32m--> 131\u001b[0m estimates \u001b[38;5;241m=\u001b[39m \u001b[43marea_interpolate\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 132\u001b[0m \u001b[43m \u001b[49m\u001b[43msource_df\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 133\u001b[0m \u001b[43m \u001b[49m\u001b[43mtarget_df\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 134\u001b[0m \u001b[43m \u001b[49m\u001b[43mextensive_variables\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mextensive_variables\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 135\u001b[0m \u001b[43m \u001b[49m\u001b[43mintensive_variables\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mintensive_variables\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 136\u001b[0m \u001b[43m \u001b[49m\u001b[43mtable\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtable\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 137\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mallocate_total\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mallocate_total\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 138\u001b[0m \u001b[43m \u001b[49m\u001b[43mspatial_index\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mspatial_index\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 139\u001b[0m \u001b[43m \u001b[49m\u001b[43mn_jobs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mn_jobs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 140\u001b[0m \u001b[43m \u001b[49m\u001b[43mcategorical_variables\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcategorical_variables\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 141\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 142\u001b[0m estimates[id_col] \u001b[38;5;241m=\u001b[39m target_df[id_col]\u001b[38;5;241m.\u001b[39mvalues\n\u001b[1;32m 144\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m categorical_variables \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n", - "\u001b[0;31mTypeError\u001b[0m: _area_interpolate_binning() got an unexpected keyword argument 'spatial_index'" + "name": "stderr", + "output_type": "stream", + "text": [ + "/opt/conda/envs/tobler/lib/python3.11/site-packages/distributed/client.py:3161: UserWarning: Sending large graph of size 30.18 MiB.\n", + "This may cause some slowdown.\n", + "Consider scattering data ahead of time and using futures.\n", + " warnings.warn(\n", + "/opt/conda/envs/tobler/lib/python3.11/site-packages/distributed/client.py:3161: UserWarning: Sending large graph of size 30.17 MiB.\n", + "This may cause some slowdown.\n", + "Consider scattering data ahead of time and using futures.\n", + " warnings.warn(\n" ] } ], "source": [ - "ext_dk = tobler.area_weighted.area_interpolate_dask(\n", - " dtracts, dh3, 'hex_id', extensive_variables=['total_pop', 'total_pop_white']\n", + "sizeup = 40\n", + "tracts_lrg = pandas.concat([tracts] * sizeup)\n", + "zips_lrg = 
pandas.concat([zip_codes] * sizeup)\n", + "print(\n", + " f'{sizeup}x increase | N. tracts: {len(tracts_lrg)} | N. ZIPs: {len(zips_lrg)}'\n", + ")\n", + "\n", + "dtracts_lrg = (\n", + " dask_geopandas.from_geopandas(tracts_lrg[['geometry', 'rando']], chunksize=800)\n", + " .spatial_shuffle(by='hilbert', shuffle=\"tasks\")\n", + ")\n", + "\n", + "dzips_lrg = (\n", + " dask_geopandas.from_geopandas(zips_lrg[['ZIP', 'geometry']], chunksize=800)\n", + " .spatial_shuffle(by='hilbert', shuffle=\"tasks\")\n", ")" ] }, { "cell_type": "markdown", - "id": "db296055-3865-43f8-bfd0-0ea40f246ba7", + "id": "e5187109-ba95-4b5f-b373-2ec4745d0289", "metadata": {}, "source": [ - "### Categorical" + "And re-compute the timings:" ] }, { "cell_type": "markdown", - "id": "8cf00b26-765c-40ce-a78a-dcfce4838c88", - "metadata": {}, + "id": "c0da372a-f791-47fb-ade0-317a1cf6ff9c", + "metadata": { + "jp-MarkdownHeadingCollapsed": true, + "tags": [] + }, "source": [ - "Single-core:" + "---\n", + "\n", + "### 10x" ] }, { "cell_type": "code", - "execution_count": 8, - "id": "cbb3dbbd-70e4-4c3d-935e-6a6b60341f7c", + "execution_count": 14, + "id": "620cf9ab-7b9e-4458-809c-c7a73d13f26c", "metadata": { - "collapsed": true, - "jupyter": { - "outputs_hidden": true - }, "tags": [] }, "outputs": [ { - "name": "stderr", + "name": "stdout", "output_type": "stream", "text": [ - "/Users/dani/mambaforge/envs/gds/lib/python3.10/site-packages/shapely/predicates.py:798: RuntimeWarning: invalid value encountered in intersects\n", - " return lib.intersects(a, b, **kwargs)\n", - "/Users/dani/mambaforge/envs/gds/lib/python3.10/site-packages/shapely/set_operations.py:133: RuntimeWarning: invalid value encountered in intersection\n", - " return lib.intersection(a, b, **kwargs)\n" + "Computing for a sizeup of 10x\n", + "CPU times: user 7.21 s, sys: 11.3 ms, total: 7.23 s\n", + "Wall time: 6.95 s\n" ] - }, + } + ], + "source": [ + "%%time\n", + "print(f'Computing for a sizeup of {sizeup}x')\n", + "cat_sc = 
tobler.area_weighted.area_interpolate(\n", + " tracts_lrg, zips_lrg, categorical_variables=['rando']\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "c615b27a-e004-429b-a0c5-e4b237516f9f", + "metadata": { + "tags": [] + }, + "outputs": [ { - "ename": "TypeError", - "evalue": "Object with dtype category cannot perform the numpy op isnan", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[8], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m cat_sc \u001b[38;5;241m=\u001b[39m \u001b[43mtobler\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43marea_weighted\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43marea_interpolate\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 2\u001b[0m \u001b[43m \u001b[49m\u001b[43mtracts\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mh3\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mextensive_variables\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43msub_30\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mtract\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\n\u001b[1;32m 3\u001b[0m \u001b[43m)\u001b[49m\n", - "File \u001b[0;32m~/code/tobler_darribas/tobler/area_weighted/area_interpolate.py:267\u001b[0m, in \u001b[0;36m_area_interpolate_binning\u001b[0;34m(source_df, target_df, extensive_variables, intensive_variables, table, allocate_total)\u001b[0m\n\u001b[1;32m 265\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m extensive_variables:\n\u001b[1;32m 266\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m variable \u001b[38;5;129;01min\u001b[39;00m extensive_variables:\n\u001b[0;32m--> 267\u001b[0m vals \u001b[38;5;241m=\u001b[39m 
\u001b[43m_nan_check\u001b[49m\u001b[43m(\u001b[49m\u001b[43msource_df\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mvariable\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 268\u001b[0m vals \u001b[38;5;241m=\u001b[39m _inf_check(source_df, variable)\n\u001b[1;32m 269\u001b[0m estimates \u001b[38;5;241m=\u001b[39m diags([vals], [\u001b[38;5;241m0\u001b[39m])\u001b[38;5;241m.\u001b[39mdot(weights)\n", - "File \u001b[0;32m~/code/tobler_darribas/tobler/util/util.py:25\u001b[0m, in \u001b[0;36m_nan_check\u001b[0;34m(df, column)\u001b[0m\n\u001b[1;32m 20\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"Check if variable has nan values.\u001b[39;00m\n\u001b[1;32m 21\u001b[0m \n\u001b[1;32m 22\u001b[0m \u001b[38;5;124;03mWarn and replace nan with 0.0.\u001b[39;00m\n\u001b[1;32m 23\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 24\u001b[0m values \u001b[38;5;241m=\u001b[39m df[column]\u001b[38;5;241m.\u001b[39mvalues\n\u001b[0;32m---> 25\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m np\u001b[38;5;241m.\u001b[39many(\u001b[43mnp\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43misnan\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvalues\u001b[49m\u001b[43m)\u001b[49m) \u001b[38;5;129;01mor\u001b[39;00m np\u001b[38;5;241m.\u001b[39many(np\u001b[38;5;241m.\u001b[39misinf(values)):\n\u001b[1;32m 26\u001b[0m wherenan \u001b[38;5;241m=\u001b[39m np\u001b[38;5;241m.\u001b[39misnan(values)\n\u001b[1;32m 27\u001b[0m values[wherenan] \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0.0\u001b[39m\n", - "File \u001b[0;32m~/mambaforge/envs/gds/lib/python3.10/site-packages/pandas/core/arrays/categorical.py:1639\u001b[0m, in \u001b[0;36mCategorical.__array_ufunc__\u001b[0;34m(self, ufunc, method, *inputs, **kwargs)\u001b[0m\n\u001b[1;32m 1635\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m result\n\u001b[1;32m 1637\u001b[0m \u001b[38;5;66;03m# for all other cases, raise for now (similarly as what happens in\u001b[39;00m\n\u001b[1;32m 1638\u001b[0m 
\u001b[38;5;66;03m# Series.__array_prepare__)\u001b[39;00m\n\u001b[0;32m-> 1639\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mTypeError\u001b[39;00m(\n\u001b[1;32m 1640\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mObject with dtype \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdtype\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m cannot perform \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 1641\u001b[0m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mthe numpy op \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mufunc\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 1642\u001b[0m )\n", - "\u001b[0;31mTypeError\u001b[0m: Object with dtype category cannot perform the numpy op isnan" + "name": "stdout", + "output_type": "stream", + "text": [ + "Computing for a sizeup of 10x\n", + "CPU times: user 548 ms, sys: 18 ms, total: 566 ms\n", + "Wall time: 3.56 s\n" ] } ], "source": [ + "%%time\n", + "print(f'Computing for a sizeup of {sizeup}x')\n", + "cat_dk = tobler.area_weighted.area_interpolate_dask(\n", + " dtracts_lrg, dzips_lrg, 'ZIP', categorical_variables=['rando']\n", + ").compute()" + ] + }, + { + "cell_type": "markdown", + "id": "cc13af25-e97e-4b34-bb1f-bb946c15748e", + "metadata": { + "jp-MarkdownHeadingCollapsed": true, + "tags": [] + }, + "source": [ + "---\n", + "\n", + "### 20x" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "8dbb40d4-4b3b-446d-9d1b-99462a122d6e", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Computing for a sizeup of 20x\n", + "CPU times: user 28.6 s, sys: 26.1 ms, total: 28.7 s\n", + "Wall time: 27.6 s\n" + ] + } + ], + "source": [ + "%%time\n", + "print(f'Computing for a sizeup of {sizeup}x')\n", "cat_sc = 
tobler.area_weighted.area_interpolate(\n", - " tracts, h3, extensive_variables=['sub_30', 'tract']\n", + " tracts_lrg, zips_lrg, categorical_variables=['rando']\n", ")" ] }, + { + "cell_type": "code", + "execution_count": 24, + "id": "f2ca1394-5f8d-428f-a61c-87beb8778322", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Computing for a sizeup of 20x\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/opt/conda/envs/tobler/lib/python3.11/site-packages/distributed/client.py:3161: UserWarning: Sending large graph of size 16.77 MiB.\n", + "This may cause some slowdown.\n", + "Consider scattering data ahead of time and using futures.\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 1.32 s, sys: 65.3 ms, total: 1.38 s\n", + "Wall time: 9.86 s\n" + ] + } + ], + "source": [ + "%%time\n", + "print(f'Computing for a sizeup of {sizeup}x')\n", + "cat_dk = tobler.area_weighted.area_interpolate_dask(\n", + " dtracts_lrg, dzips_lrg, 'ZIP', categorical_variables=['rando']\n", + ").compute()" + ] + }, { "cell_type": "markdown", - "id": "76507c2b-769a-4b80-8b62-40206e2cab42", - "metadata": {}, + "id": "335b34b4-9fea-48a6-b38b-8b1a5d755ca1", + "metadata": { + "jp-MarkdownHeadingCollapsed": true, + "tags": [] + }, "source": [ - "And through Dask:" + "---\n", + "\n", + "### 30x" ] }, { "cell_type": "code", - "execution_count": 9, - "id": "7107527b-88cc-4e9c-97d2-72a1d153c657", + "execution_count": 26, + "id": "1598ce3f-d21e-4a60-9619-ee5b1eb4932f", "metadata": { "tags": [] }, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Computing for a sizeup of 30x\n", + "CPU times: user 1min 4s, sys: 176 ms, total: 1min 4s\n", + "Wall time: 1min 1s\n" + ] + } + ], + "source": [ + "%%time\n", + "print(f'Computing for a sizeup of {sizeup}x')\n", + "cat_sc = tobler.area_weighted.area_interpolate(\n", + " 
tracts_lrg, zips_lrg, categorical_variables=['rando']\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "224ffbca-7690-4b20-bad2-efbf042623a9", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Computing for a sizeup of 30x\n" + ] + }, { "name": "stderr", "output_type": "stream", "text": [ - "/Users/dani/mambaforge/envs/gds/lib/python3.10/site-packages/shapely/set_operations.py:133: RuntimeWarning: invalid value encountered in intersection\n", - " return lib.intersection(a, b, **kwargs)\n" + "/opt/conda/envs/tobler/lib/python3.11/site-packages/distributed/client.py:3161: UserWarning: Sending large graph of size 25.14 MiB.\n", + "This may cause some slowdown.\n", + "Consider scattering data ahead of time and using futures.\n", + " warnings.warn(\n" ] }, { - "ename": "TypeError", - "evalue": "_area_interpolate_binning() got an unexpected keyword argument 'spatial_index'", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[9], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m ext_dk \u001b[38;5;241m=\u001b[39m \u001b[43mtobler\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43marea_weighted\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43marea_interpolate_dask\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 2\u001b[0m \u001b[43m \u001b[49m\u001b[43mdtracts\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdh3\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mhex_id\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[43mextensive_variables\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43msub_30\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mtract\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\n\u001b[1;32m 3\u001b[0m \u001b[43m)\u001b[49m\n", - "File \u001b[0;32m~/code/tobler_darribas/tobler/area_weighted/area_interpolate_dask.py:74\u001b[0m, in \u001b[0;36marea_interpolate_dask\u001b[0;34m(left_dgdf, right_dgdf, id_col, extensive_variables, intensive_variables, categorical_variables)\u001b[0m\n\u001b[1;32m 70\u001b[0m graph \u001b[38;5;241m=\u001b[39m HighLevelGraph\u001b[38;5;241m.\u001b[39mfrom_collections(\n\u001b[1;32m 71\u001b[0m name, dsk, dependencies\u001b[38;5;241m=\u001b[39m[left_dgdf, right_dgdf]\n\u001b[1;32m 72\u001b[0m )\n\u001b[1;32m 73\u001b[0m \u001b[38;5;66;03m# Get metadata for the outcome table\u001b[39;00m\n\u001b[0;32m---> 74\u001b[0m meta \u001b[38;5;241m=\u001b[39m \u001b[43mid_area_interpolate\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 75\u001b[0m \u001b[43m \u001b[49m\u001b[43mleft_dgdf\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_meta\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 76\u001b[0m \u001b[43m \u001b[49m\u001b[43mright_dgdf\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_meta\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 77\u001b[0m \u001b[43m \u001b[49m\u001b[43mid_col\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 78\u001b[0m \u001b[43m \u001b[49m\u001b[43mextensive_variables\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mextensive_variables\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 79\u001b[0m \u001b[43m \u001b[49m\u001b[43mintensive_variables\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mintensive_variables\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 80\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mtable\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 81\u001b[0m \u001b[43m \u001b[49m\u001b[43mallocate_total\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m 82\u001b[0m \u001b[43m \u001b[49m\u001b[43mspatial_index\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mauto\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 83\u001b[0m \u001b[43m \u001b[49m\u001b[43mn_jobs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m 84\u001b[0m \u001b[43m \u001b[49m\u001b[43mcategorical_variables\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcategorical_variables\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 85\u001b[0m \u001b[43m \u001b[49m\u001b[43mcategory_vars\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcategory_vars\u001b[49m\n\u001b[1;32m 86\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 87\u001b[0m \u001b[38;5;66;03m# Build output table\u001b[39;00m\n\u001b[1;32m 88\u001b[0m transferred \u001b[38;5;241m=\u001b[39m dask_geopandas\u001b[38;5;241m.\u001b[39mGeoDataFrame(\n\u001b[1;32m 89\u001b[0m graph, \n\u001b[1;32m 90\u001b[0m name,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 93\u001b[0m new_spatial_partitions\n\u001b[1;32m 94\u001b[0m )\n", - "File \u001b[0;32m~/code/tobler_darribas/tobler/area_weighted/area_interpolate_dask.py:131\u001b[0m, in \u001b[0;36mid_area_interpolate\u001b[0;34m(source_df, target_df, id_col, extensive_variables, intensive_variables, table, allocate_total, spatial_index, n_jobs, categorical_variables, category_vars)\u001b[0m\n\u001b[1;32m 118\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mid_area_interpolate\u001b[39m(\n\u001b[1;32m 119\u001b[0m source_df,\n\u001b[1;32m 120\u001b[0m target_df,\n\u001b[0;32m 
(...)\u001b[0m\n\u001b[1;32m 129\u001b[0m category_vars\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 130\u001b[0m ):\n\u001b[0;32m--> 131\u001b[0m estimates \u001b[38;5;241m=\u001b[39m \u001b[43marea_interpolate\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 132\u001b[0m \u001b[43m \u001b[49m\u001b[43msource_df\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 133\u001b[0m \u001b[43m \u001b[49m\u001b[43mtarget_df\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 134\u001b[0m \u001b[43m \u001b[49m\u001b[43mextensive_variables\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mextensive_variables\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 135\u001b[0m \u001b[43m \u001b[49m\u001b[43mintensive_variables\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mintensive_variables\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 136\u001b[0m \u001b[43m \u001b[49m\u001b[43mtable\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtable\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 137\u001b[0m \u001b[43m \u001b[49m\u001b[43mallocate_total\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mallocate_total\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 138\u001b[0m \u001b[43m \u001b[49m\u001b[43mspatial_index\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mspatial_index\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 139\u001b[0m \u001b[43m \u001b[49m\u001b[43mn_jobs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mn_jobs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 140\u001b[0m \u001b[43m \u001b[49m\u001b[43mcategorical_variables\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcategorical_variables\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 141\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 142\u001b[0m estimates[id_col] \u001b[38;5;241m=\u001b[39m target_df[id_col]\u001b[38;5;241m.\u001b[39mvalues\n\u001b[1;32m 144\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m categorical_variables \u001b[38;5;129;01mis\u001b[39;00m 
\u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n", - "\u001b[0;31mTypeError\u001b[0m: _area_interpolate_binning() got an unexpected keyword argument 'spatial_index'" + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 1.91 s, sys: 58.8 ms, total: 1.97 s\n", + "Wall time: 14.6 s\n" ] } ], "source": [ - "ext_dk = tobler.area_weighted.area_interpolate_dask(\n", - " dtracts, dh3, 'hex_id', extensive_variables=['sub_30', 'tract']\n", - ")" + "%%time\n", + "print(f'Computing for a sizeup of {sizeup}x')\n", + "cat_dk = tobler.area_weighted.area_interpolate_dask(\n", + " dtracts_lrg, dzips_lrg, 'ZIP', categorical_variables=['rando']\n", + ").compute()" ] }, { "cell_type": "markdown", - "id": "571e7878-25ad-49bc-a7a5-a632988f6a4b", - "metadata": {}, + "id": "b004834f-c5ce-4f92-be9a-364a07c7996b", + "metadata": { + "jp-MarkdownHeadingCollapsed": true, + "tags": [] + }, + "source": [ + "---\n", + "\n", + "### 40x" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "b6b9d06a-9034-4c39-b3a9-92fc6408d5c6", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Computing for a sizeup of 40x\n", + "CPU times: user 2min 2s, sys: 1.71 s, total: 2min 3s\n", + "Wall time: 1min 53s\n" + ] + } + ], + "source": [ + "%%time\n", + "print(f'Computing for a sizeup of {sizeup}x')\n", + "cat_sc = tobler.area_weighted.area_interpolate(\n", + " tracts_lrg, zips_lrg, categorical_variables=['rando']\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "8a68e5fe-ee41-48cc-9222-6554a7651c28", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Computing for a sizeup of 40x\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/opt/conda/envs/tobler/lib/python3.11/site-packages/distributed/client.py:3161: UserWarning: Sending large graph of size 33.52 MiB.\n", + 
"This may cause some slowdown.\n", + "Consider scattering data ahead of time and using futures.\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CPU times: user 5.58 s, sys: 417 ms, total: 5.99 s\n", + "Wall time: 29.3 s\n" + ] + } + ], "source": [ - "## Performance" + "%%time\n", + "print(f'Computing for a sizeup of {sizeup}x')\n", + "cat_dk = tobler.area_weighted.area_interpolate_dask(\n", + " dtracts_lrg, dzips_lrg, 'ZIP', categorical_variables=['rando']\n", + ").compute()" ] } ], "metadata": { "kernelspec": { - "display_name": "Python 3 (ipykernel)", + "display_name": "tobler", "language": "python", - "name": "python3" + "name": "tobler" }, "language_info": { "codemirror_mode": { @@ -419,7 +733,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.10" + "version": "3.11.4" } }, "nbformat": 4, diff --git a/environment.yml b/environment.yml index 3037858..7d4c6e2 100644 --- a/environment.yml +++ b/environment.yml @@ -2,6 +2,9 @@ name: tobler channels: - conda-forge dependencies: + - dask-geopandas + - dask + - distributed - jupyterlab - numpy - geopandas >=0.13 diff --git a/tobler/area_weighted/area_interpolate_dask.py b/tobler/area_weighted/area_interpolate_dask.py index c0ae853..24b204d 100755 --- a/tobler/area_weighted/area_interpolate_dask.py +++ b/tobler/area_weighted/area_interpolate_dask.py @@ -1,31 +1,75 @@ +''' +Area Weighted Interpolation, out-of-core and parallel through Dask +''' + import pandas import geopandas import dask_geopandas -import warnings import numpy as np from dask.base import tokenize from dask.highlevelgraph import HighLevelGraph -from tobler.area_weighted import area_interpolate +from .area_interpolate import _area_interpolate_binning as area_interpolate def area_interpolate_dask( - left_dgdf, - right_dgdf, + source_dgdf, + target_dgdf, id_col, extensive_variables=None, intensive_variables=None, categorical_variables=None, ): + ''' + 
Out-of-core and parallel area interpolation for categorical variables. + + Parameters + ---------- + source_dgdf : dask_geopandas.GeoDataFrame + Dask-geopandas GeoDataFrame + IMPORTANT: the table needs to be spatially shuffled and with spatial partitions. + This is required so only overlapping partitions are checked for interpolation. See + more on spatial shuffling at: https://dask-geopandas.readthedocs.io/en/stable/guide/spatial-partitioning.html + target_dgdf : dask_geopandas.GeoDataFrame + Dask-geopandas GeoDataFrame + IMPORTANT: the table needs to be spatially shuffled and with spatial partitions. + This is required so only overlapping partitions are checked for interpolation. See + more on spatial shuffling at: https://dask-geopandas.readthedocs.io/en/stable/guide/spatial-partitioning.html + id_col : str + Name of the column in `target_dgdf` with unique IDs to be used in output table + extensive_variables : list + [Optional. Default=None] Columns in `source_dgdf` for extensive variables. + IMPORTANT: currently NOT implemented. + intensive_variables : list + [Optional. Default=None] Columns in `source_dgdf` for intensive variables + IMPORTANT: currently NOT implemented. + categorical_variables : list + [Optional. Default=None] Columns in `source_dgdf` for categorical variables + IMPORTANT: categorical variables must be of type `'category[known]'`. This is so + all categories are known ahead of time and Dask can run lazily. + + Returns + ------- + estimates : dask_geopandas.GeoDataFrame + new dask-geopandas geodataframe with interpolated variables and `id_col` as + columns and target_df geometry as output geometry + + ''' if intensive_variables is not None: raise NotImplementedError(( "Dask-based interpolation of intensive variables is " "not implemented yet. Please remove intensive variables to " "be able to run the rest."
)) + if extensive_variables is not None: + raise NotImplementedError(( + "Dask-based interpolation of extensive variables is " + "not implemented yet. Please remove intensive variables to " + "be able to run the rest." + )) # Categoricals must be Dask's known categorical if categorical_variables is not None: category_vars = [] for cat_var in categorical_variables: - var_names = [f'{cat_var}_{c}' for c in left_dgdf[cat_var].cat.categories] + var_names = [f'{cat_var}_{c}' for c in source_dgdf[cat_var].cat.categories] category_vars.extend(var_names) else: category_vars = None @@ -33,21 +77,21 @@ def area_interpolate_dask( dsk = {} new_spatial_partitions = [] parts = geopandas.sjoin( - left_dgdf.spatial_partitions.to_frame('geometry'), - right_dgdf.spatial_partitions.to_frame('geometry'), + source_dgdf.spatial_partitions.to_frame('geometry'), + target_dgdf.spatial_partitions.to_frame('geometry'), how='inner', predicate='intersects' ) parts_left = np.asarray(parts.index) parts_right = np.asarray(parts['index_right'].values) name = 'area_interpolate-' + tokenize( - right_dgdf, left_dgdf + target_dgdf, source_dgdf ) for i, (l, r) in enumerate(zip(parts_left, parts_right)): dsk[(name, i)] = ( id_area_interpolate, - (left_dgdf._name, l), - (right_dgdf._name, r), + (source_dgdf._name, l), + (target_dgdf._name, r), id_col, extensive_variables, intensive_variables, @@ -58,22 +102,22 @@ def area_interpolate_dask( categorical_variables, category_vars ) - lr = left_dgdf.spatial_partitions.iloc[l] - rr = right_dgdf.spatial_partitions.iloc[r] + lr = source_dgdf.spatial_partitions.iloc[l] + rr = target_dgdf.spatial_partitions.iloc[r] extent = lr.intersection(rr) new_spatial_partitions.append(extent) # Create geometries for new spatial partitions new_spatial_partitions = geopandas.GeoSeries( - data=new_spatial_partitions, crs=left_dgdf.crs + data=new_spatial_partitions, crs=source_dgdf.crs ) # Build Dask graph graph = HighLevelGraph.from_collections( - name, dsk, 
dependencies=[left_dgdf, right_dgdf] + name, dsk, dependencies=[source_dgdf, target_dgdf] ) # Get metadata for the outcome table meta = id_area_interpolate( - left_dgdf._meta, - right_dgdf._meta, + source_dgdf._meta, + target_dgdf._meta, id_col, extensive_variables=extensive_variables, intensive_variables=intensive_variables, @@ -93,8 +137,9 @@ def area_interpolate_dask( new_spatial_partitions ) # Merge chunks - out = right_dgdf[[id_col, 'geometry']] - ## Extensive --> Add up estimates by ID + out = target_dgdf[[id_col, 'geometry']] + ## Extensive --> Not implemented (DAB: the below does not match single-core) + ''' if extensive_variables is not None: out_extensive = ( transferred @@ -103,6 +148,7 @@ def area_interpolate_dask( .agg({v: 'sum' for v in extensive_variables}) ) out = out.join(out_extensive, on=id_col) + ''' ## Intensive --> Weight by area of the chunk (Not implemented) ## Categorical --> Add up proportions if categorical_variables is not None: @@ -128,6 +174,55 @@ def id_area_interpolate( categorical_variables=None, category_vars=None ): + ''' + Light wrapper around single-core area interpolation to be run on distributed workers + + Parameters + ---------- + source_df : geopandas.GeoDataFrame + target_df : geopandas.GeoDataFrame + id_col : str + Name of the column in `target_dgdf` with unique IDs to be used in output table + extensive_variables : list + [Optional. Default=None] Columns in dataframes for extensive variables + intensive_variables : list + [Optional. Default=None] Columns in dataframes for intensive variables + table : scipy.sparse.csr_matrix + [Optional. Default=None] Area allocation source-target correspondence + table. If not provided, it will be built from `source_df` and + `target_df` using `tobler.area_interpolate._area_tables_binning` + allocate_total : boolean + [Optional. Default=True] True if total value of source area should be + allocated. False if denominator is area of i. 
Note that the two cases + would be identical when the area of the source polygon is exhausted by + intersections. See Notes for more details. + spatial_index : str + [Optional. Default="auto"] Spatial index to use to build the + allocation of area from source to target tables. It currently supports + the following values: + - "source": build the spatial index on `source_df` + - "target": build the spatial index on `target_df` + - "auto": attempts to guess the most efficient alternative. + Currently, this option uses the largest table to build the + index, and performs a `bulk_query` on the shorter table. + This argument is ignored if n_jobs>1 (or n_jobs=-1). + n_jobs : int + [Optional. Default=1] Number of processes to run in parallel to + generate the area allocation. If -1, this is set to the number of CPUs + available. If `table` is passed, this is ignored. + categorical_variables : list + [Optional. Default=None] Columns in dataframes for categorical variables + category_vars : list + [Optional.
Default=None] Full list of category names in the format + `f'{var_name}_{cat_name}'` + + Returns + ------- + estimates : geopandas.GeoDataFrame + new geodaraframe with interpolated variables as columns and target_df geometry + as output geometry + + ''' estimates = area_interpolate( source_df, target_df, From 3e78f4e435205ea997f6155699bf1886daa40cf6 Mon Sep 17 00:00:00 2001 From: Dani Arribas-Bel Date: Fri, 11 Aug 2023 11:21:22 +0000 Subject: [PATCH 04/16] Add example to notebook reproducing known bug --- 04_area_interpolate_dask.ipynb | 194 ++++++++++++++++++++++++++++++--- 1 file changed, 179 insertions(+), 15 deletions(-) diff --git a/04_area_interpolate_dask.ipynb b/04_area_interpolate_dask.ipynb index 9e88c68..7a43f7b 100644 --- a/04_area_interpolate_dask.ipynb +++ b/04_area_interpolate_dask.ipynb @@ -104,7 +104,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 4, "id": "d65ac8ec-51e2-4d2d-abb2-96a7519ed749", "metadata": { "tags": [] @@ -114,18 +114,6 @@ "client = Client(LocalCluster(n_workers=8))" ] }, - { - "cell_type": "code", - "execution_count": 12, - "id": "69f06d42-f47f-4120-811b-275431b1cf3a", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "client.shutdown()" - ] - }, { "cell_type": "markdown", "id": "88c32c7d-0ca8-4945-a1f8-edfbc8917880", @@ -167,7 +155,10 @@ { "cell_type": "markdown", "id": "b783aabc-8221-40f6-a0d5-bf21dd75e2a6", - "metadata": {}, + "metadata": { + "jp-MarkdownHeadingCollapsed": true, + "tags": [] + }, "source": [ "## Correctness" ] @@ -276,7 +267,10 @@ { "cell_type": "markdown", "id": "1debbdf4-892f-4fda-834a-0403595794ef", - "metadata": {}, + "metadata": { + "jp-MarkdownHeadingCollapsed": true, + "tags": [] + }, "source": [ "## Performance\n", "\n", @@ -715,6 +709,176 @@ " dtracts_lrg, dzips_lrg, 'ZIP', categorical_variables=['rando']\n", ").compute()" ] + }, + { + "cell_type": "markdown", + "id": "93576191-ddb0-4316-af7e-d12393e520b6", + "metadata": {}, + "source": [ + "## Bug" + ] + 
}, + { + "cell_type": "markdown", + "id": "079ff509-cd42-4982-8144-f4915e996f83", + "metadata": {}, + "source": [ + "There is a recurrent bug that appears in some cases that errors the computation and should be fixed ideally before merging. The code below reproduces it:" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "d2470144-7c4d-4638-90a8-6bc4254128ef", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2023-08-11 11:19:01,747 - distributed.worker - WARNING - Compute Failed\n", + "Key: ('aggregate-combine-827a5db528ee8de42db84a4c3d7fb9a7', 2, 1)\n", + "Function: pipe\n", + "args: ([ sum-rando_0-17eccfbe7bc44d26fa589319100d6357 ... sum-rando_3-52ec0a920e05152510b5ff7aa35c0200\n", + "ZIP ... \n", + "29437 0 ... 0.000000\n", + "29472 0 ... 0.000000\n", + "29483 0 ... 0.008659\n", + "\n", + "[3 rows x 4 columns], sum-rando_0-17eccfbe7bc44d26fa589319100d6357 ... sum-rando_3-52ec0a920e05152510b5ff7aa35c0200\n", + "ZIP ... \n", + "29437 0.0 ... 0.0\n", + "29472 0.0 ... 0.0\n", + "29483 0.0 \n", + "kwargs: {}\n", + "Exception: 'TypeError(\\'can only concatenate str (not \"float\") to str\\')'\n", + "\n", + "2023-08-11 11:19:01,750 - distributed.worker - WARNING - Compute Failed\n", + "Key: ('aggregate-combine-827a5db528ee8de42db84a4c3d7fb9a7', 0, 1)\n", + "Function: pipe\n", + "args: ([ sum-rando_0-17eccfbe7bc44d26fa589319100d6357 ... sum-rando_3-52ec0a920e05152510b5ff7aa35c0200\n", + "ZIP ... \n", + "29438 0.905017 ... 0\n", + "\n", + "[1 rows x 4 columns], sum-rando_0-17eccfbe7bc44d26fa589319100d6357 ... sum-rando_3-52ec0a920e05152510b5ff7aa35c0200\n", + "ZIP ... \n", + "29449 0.001648 ... 0\n", + "29426 0.000000 ... 0\n", + "29414 0.000000 ... 0\n", + "\n", + "[3 rows x 4 columns], sum-rando_0-17eccfbe7bc44d26fa589319100d6357 ... 
sum-rando_3-52ec0a920e05152510b5ff7aa35c0200\n", + "ZIP \n", + "kwargs: {}\n", + "Exception: 'TypeError(\\'can only concatenate str (not \"float\") to str\\')'\n", + "\n", + "2023-08-11 11:19:01,761 - distributed.worker - ERROR - Exception during execution of task ('aggregate-combine-827a5db528ee8de42db84a4c3d7fb9a7', 4, 1).\n", + "Traceback (most recent call last):\n", + " File \"/opt/conda/envs/tobler/lib/python3.11/site-packages/zict/buffer.py\", line 184, in __getitem__\n", + " return self.fast[key]\n", + " ~~~~~~~~~^^^^^\n", + " File \"/opt/conda/envs/tobler/lib/python3.11/site-packages/zict/common.py\", line 127, in wrapper\n", + " return func(*args, **kwargs)\n", + " ^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/opt/conda/envs/tobler/lib/python3.11/site-packages/zict/lru.py\", line 117, in __getitem__\n", + " result = self.d[key]\n", + " ~~~~~~^^^^^\n", + "KeyError: \"('aggregate-chunk-827a5db528ee8de42db84a4c3d7fb9a7-1584b66fccc2ab7917ded8b6f5bb1127', 34)\"\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/opt/conda/envs/tobler/lib/python3.11/site-packages/distributed/worker.py\", line 2412, in _prepare_args_for_execution\n", + " data[k] = self.data[k]\n", + " ~~~~~~~~~^^^\n", + " File \"/opt/conda/envs/tobler/lib/python3.11/site-packages/distributed/spill.py\", line 216, in __getitem__\n", + " return super().__getitem__(key)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/opt/conda/envs/tobler/lib/python3.11/site-packages/zict/common.py\", line 127, in wrapper\n", + " return func(*args, **kwargs)\n", + " ^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/opt/conda/envs/tobler/lib/python3.11/site-packages/zict/buffer.py\", line 186, in __getitem__\n", + " return self.slow_to_fast(key)\n", + " ^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/opt/conda/envs/tobler/lib/python3.11/site-packages/zict/buffer.py\", line 153, in slow_to_fast\n", + " value = self.slow[key]\n", + " 
~~~~~~~~~^^^^^\n", + " File \"/opt/conda/envs/tobler/lib/python3.11/site-packages/zict/common.py\", line 127, in wrapper\n", + " return func(*args, **kwargs)\n", + " ^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/opt/conda/envs/tobler/lib/python3.11/site-packages/zict/cache.py\", line 67, in __getitem__\n", + " gen = self._last_updated[key]\n", + " ~~~~~~~~~~~~~~~~~~^^^^^\n", + "KeyError: \"('aggregate-chunk-827a5db528ee8de42db84a4c3d7fb9a7-1584b66fccc2ab7917ded8b6f5bb1127', 34)\"\n", + "\n", + "During handling of the above exception, another exception occurred:\n", + "\n", + "Traceback (most recent call last):\n", + " File \"/opt/conda/envs/tobler/lib/python3.11/site-packages/distributed/worker.py\", line 2264, in execute\n", + " args2, kwargs2 = self._prepare_args_for_execution(ts, args, kwargs)\n", + " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", + " File \"/opt/conda/envs/tobler/lib/python3.11/site-packages/distributed/worker.py\", line 2416, in _prepare_args_for_execution\n", + " data[k] = Actor(type(self.state.actors[k]), self.address, k, self)\n", + " ~~~~~~~~~~~~~~~~~^^^\n", + "KeyError: \"('aggregate-chunk-827a5db528ee8de42db84a4c3d7fb9a7-1584b66fccc2ab7917ded8b6f5bb1127', 34)\"\n", + "2023-08-11 11:19:01,791 - distributed.worker - WARNING - Compute Failed\n", + "Key: ('aggregate-combine-827a5db528ee8de42db84a4c3d7fb9a7', 3, 1)\n", + "Function: pipe\n", + "args: ([ sum-rando_0-17eccfbe7bc44d26fa589319100d6357 ... sum-rando_3-52ec0a920e05152510b5ff7aa35c0200\n", + "ZIP ... \n", + "29487 0.002037 ... 0\n", + "29455 0.275390 ... 0\n", + "\n", + "[2 rows x 4 columns], sum-rando_0-17eccfbe7bc44d26fa589319100d6357 ... sum-rando_3-52ec0a920e05152510b5ff7aa35c0200\n", + "ZIP ... \n", + "29487 0.000000 ... 0.000000\n", + "29455 0.002053 ... 0.018981\n", + "\n", + "[2 rows x 4 columns], sum-rando_0-17eccfbe7bc44d26fa589319100d6357 ... 
sum-rando_3-52ec0a920e05152510b5ff7aa35c0200\n", + "ZIP \n", + "kwargs: {}\n", + "Exception: 'TypeError(\\'can only concatenate str (not \"float\") to str\\')'\n", + "\n" + ] + }, + { + "ename": "TypeError", + "evalue": "can only concatenate str (not \"float\") to str", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[9], line 13\u001b[0m\n\u001b[1;32m 1\u001b[0m dtracts \u001b[38;5;241m=\u001b[39m (\n\u001b[1;32m 2\u001b[0m dask_geopandas\u001b[38;5;241m.\u001b[39mfrom_geopandas(tracts[[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mgeometry\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mrando\u001b[39m\u001b[38;5;124m'\u001b[39m]], npartitions\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m16\u001b[39m)\n\u001b[1;32m 3\u001b[0m \u001b[38;5;241m.\u001b[39mspatial_shuffle(by\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mhilbert\u001b[39m\u001b[38;5;124m'\u001b[39m, shuffle\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtasks\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 4\u001b[0m )\n\u001b[1;32m 6\u001b[0m dzips \u001b[38;5;241m=\u001b[39m (\n\u001b[1;32m 7\u001b[0m dask_geopandas\u001b[38;5;241m.\u001b[39mfrom_geopandas(zip_codes[[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mZIP\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mgeometry\u001b[39m\u001b[38;5;124m'\u001b[39m]], npartitions\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m16\u001b[39m)\n\u001b[1;32m 8\u001b[0m \u001b[38;5;241m.\u001b[39mspatial_shuffle(by\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mhilbert\u001b[39m\u001b[38;5;124m'\u001b[39m, shuffle\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtasks\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 9\u001b[0m 
)\n\u001b[1;32m 11\u001b[0m cat_dk \u001b[38;5;241m=\u001b[39m \u001b[43mtobler\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43marea_weighted\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43marea_interpolate_dask\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 12\u001b[0m \u001b[43m \u001b[49m\u001b[43mdtracts\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdzips\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mZIP\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcategorical_variables\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mrando\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\n\u001b[0;32m---> 13\u001b[0m \u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcompute\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m/opt/conda/envs/tobler/lib/python3.11/site-packages/toolz/functoolz.py:628\u001b[0m, in \u001b[0;36mpipe\u001b[0;34m()\u001b[0m\n\u001b[1;32m 608\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\" Pipe a value through a sequence of functions\u001b[39;00m\n\u001b[1;32m 609\u001b[0m \n\u001b[1;32m 610\u001b[0m \u001b[38;5;124;03mI.e. 
``pipe(data, f, g, h)`` is equivalent to ``h(g(f(data)))``\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 625\u001b[0m \u001b[38;5;124;03m thread_last\u001b[39;00m\n\u001b[1;32m 626\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 627\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m func \u001b[38;5;129;01min\u001b[39;00m funcs:\n\u001b[0;32m--> 628\u001b[0m data \u001b[38;5;241m=\u001b[39m func(data)\n\u001b[1;32m 629\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m data\n", + "File \u001b[0;32m/opt/conda/envs/tobler/lib/python3.11/site-packages/dask/dataframe/groupby.py:1209\u001b[0m, in \u001b[0;36m_groupby_apply_funcs\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1207\u001b[0m result \u001b[38;5;241m=\u001b[39m collections\u001b[38;5;241m.\u001b[39mOrderedDict()\n\u001b[1;32m 1208\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m result_column, func, func_kwargs \u001b[38;5;129;01min\u001b[39;00m funcs:\n\u001b[0;32m-> 1209\u001b[0m r \u001b[38;5;241m=\u001b[39m func(grouped, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mfunc_kwargs)\n\u001b[1;32m 1211\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(r, \u001b[38;5;28mtuple\u001b[39m):\n\u001b[1;32m 1212\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m idx, s \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28menumerate\u001b[39m(r):\n", + "File \u001b[0;32m/opt/conda/envs/tobler/lib/python3.11/site-packages/dask/dataframe/groupby.py:1255\u001b[0m, in \u001b[0;36m_apply_func_to_column\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1252\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m column \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 1253\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m func(df_like)\n\u001b[0;32m-> 1255\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m func(df_like[column])\n", + "File \u001b[0;32m/opt/conda/envs/tobler/lib/python3.11/site-packages/pandas/_libs/groupby.pyx:717\u001b[0m, in 
\u001b[0;36mpandas._libs.groupby.group_sum\u001b[0;34m()\u001b[0m\n\u001b[1;32m 715\u001b[0m t = val\n\u001b[1;32m 716\u001b[0m else:\n\u001b[0;32m--> 717\u001b[0m t = sumx[lab, j] + val\n\u001b[1;32m 718\u001b[0m sumx[lab, j] = t\n\u001b[1;32m 719\u001b[0m \n", + "\u001b[0;31mTypeError\u001b[0m: can only concatenate str (not \"float\") to str" + ] + } + ], + "source": [ + "dtracts = (\n", + " dask_geopandas.from_geopandas(tracts[['geometry', 'rando']], npartitions=16)\n", + " .spatial_shuffle(by='hilbert', shuffle=\"tasks\")\n", + ")\n", + "\n", + "dzips = (\n", + " dask_geopandas.from_geopandas(zip_codes[['ZIP', 'geometry']], npartitions=16)\n", + " .spatial_shuffle(by='hilbert', shuffle=\"tasks\")\n", + ")\n", + "\n", + "cat_dk = tobler.area_weighted.area_interpolate_dask(\n", + " dtracts, dzips, 'ZIP', categorical_variables=['rando']\n", + ").compute()" + ] + }, + { + "cell_type": "markdown", + "id": "08596315-236b-45df-955e-44a98b0a2eba", + "metadata": {}, + "source": [ + "[DAB]: my hunch is that the error, though cryptic and hard to debug, comes from a worker returning an empty result (perhaps `None`?) which, when it's passed through the aggregation post collection from the workers, raises the error. Further investigation is warranted." 
+ ] } ], "metadata": { From 9ef8dc04f710309bc7fa6fc18cb07a6bb8c6df3e Mon Sep 17 00:00:00 2001 From: Dani Arribas-Bel Date: Mon, 14 Aug 2023 10:00:09 +0000 Subject: [PATCH 05/16] Fix bug that appears sometimes w/ many small partitions --- 04_area_interpolate_dask.ipynb | 207 ++---------------- tobler/area_weighted/area_interpolate_dask.py | 9 +- 2 files changed, 24 insertions(+), 192 deletions(-) diff --git a/04_area_interpolate_dask.ipynb b/04_area_interpolate_dask.ipynb index 7a43f7b..d368967 100644 --- a/04_area_interpolate_dask.ipynb +++ b/04_area_interpolate_dask.ipynb @@ -124,18 +124,18 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 7, "id": "a31a1a91-4071-40e2-a21f-7e035d734976", "metadata": {}, "outputs": [], "source": [ "dtracts = (\n", - " dask_geopandas.from_geopandas(tracts[['geometry', 'rando']], npartitions=4)\n", + " dask_geopandas.from_geopandas(tracts[['geometry', 'rando']], npartitions=16)\n", " .spatial_shuffle(by='hilbert', shuffle=\"tasks\")\n", ")\n", "\n", "dzips = (\n", - " dask_geopandas.from_geopandas(zip_codes[['ZIP', 'geometry']], npartitions=4)\n", + " dask_geopandas.from_geopandas(zip_codes[['ZIP', 'geometry']], npartitions=16)\n", " .spatial_shuffle(by='hilbert', shuffle=\"tasks\")\n", ")" ] @@ -156,7 +156,6 @@ "cell_type": "markdown", "id": "b783aabc-8221-40f6-a0d5-bf21dd75e2a6", "metadata": { - "jp-MarkdownHeadingCollapsed": true, "tags": [] }, "source": [ @@ -173,7 +172,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 8, "id": "4d4cde6d-73c1-4197-86ed-131724e21296", "metadata": { "tags": [] @@ -195,7 +194,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 9, "id": "d8c7896f-9004-4a07-b3ba-75301f8120e5", "metadata": { "tags": [] @@ -218,7 +217,7 @@ { "cell_type": "code", "execution_count": 8, - "id": "8bc830b2-99a7-4c11-a8d9-0fad3aefcf06", + "id": "81de5e35-f3b6-4567-86b1-36d98583dca0", "metadata": { "tags": [] }, @@ -268,7 +267,6 @@ "cell_type": 
"markdown", "id": "1debbdf4-892f-4fda-834a-0403595794ef", "metadata": { - "jp-MarkdownHeadingCollapsed": true, "tags": [] }, "source": [ @@ -291,7 +289,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 12, "id": "902e494b-65ba-4fa2-99e6-eb3a513769f8", "metadata": { "tags": [] @@ -301,7 +299,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "85.5 ms ± 4.17 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" + "85 ms ± 1.51 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" ] } ], @@ -314,7 +312,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 13, "id": "5cfc44d9-f79a-4b8e-9caa-975ea64d5f0e", "metadata": { "tags": [] @@ -324,7 +322,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "444 ms ± 2.69 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" + "1.41 s ± 51.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" ] } ], @@ -347,7 +345,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 17, "id": "5f56d579-0022-45c2-845c-f351bf96ed01", "metadata": { "tags": [] @@ -368,7 +366,7 @@ "This may cause some slowdown.\n", "Consider scattering data ahead of time and using futures.\n", " warnings.warn(\n", - "/opt/conda/envs/tobler/lib/python3.11/site-packages/distributed/client.py:3161: UserWarning: Sending large graph of size 30.17 MiB.\n", + "/opt/conda/envs/tobler/lib/python3.11/site-packages/distributed/client.py:3161: UserWarning: Sending large graph of size 30.18 MiB.\n", "This may cause some slowdown.\n", "Consider scattering data ahead of time and using futures.\n", " warnings.warn(\n" @@ -384,12 +382,12 @@ ")\n", "\n", "dtracts_lrg = (\n", - " dask_geopandas.from_geopandas(tracts_lrg[['geometry', 'rando']], chunksize=800)\n", + " dask_geopandas.from_geopandas(tracts_lrg[['geometry', 'rando']], chunksize=500)\n", " .spatial_shuffle(by='hilbert', shuffle=\"tasks\")\n", ")\n", "\n", "dzips_lrg = (\n", - " 
dask_geopandas.from_geopandas(zips_lrg[['ZIP', 'geometry']], chunksize=800)\n", + " dask_geopandas.from_geopandas(zips_lrg[['ZIP', 'geometry']], chunksize=500)\n", " .spatial_shuffle(by='hilbert', shuffle=\"tasks\")\n", ")" ] @@ -633,7 +631,6 @@ "cell_type": "markdown", "id": "b004834f-c5ce-4f92-be9a-364a07c7996b", "metadata": { - "jp-MarkdownHeadingCollapsed": true, "tags": [] }, "source": [ @@ -670,7 +667,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 18, "id": "8a68e5fe-ee41-48cc-9222-6554a7651c28", "metadata": { "tags": [] @@ -697,8 +694,8 @@ "name": "stdout", "output_type": "stream", "text": [ - "CPU times: user 5.58 s, sys: 417 ms, total: 5.99 s\n", - "Wall time: 29.3 s\n" + "CPU times: user 6.99 s, sys: 512 ms, total: 7.5 s\n", + "Wall time: 30.5 s\n" ] } ], @@ -709,176 +706,6 @@ " dtracts_lrg, dzips_lrg, 'ZIP', categorical_variables=['rando']\n", ").compute()" ] - }, - { - "cell_type": "markdown", - "id": "93576191-ddb0-4316-af7e-d12393e520b6", - "metadata": {}, - "source": [ - "## Bug" - ] - }, - { - "cell_type": "markdown", - "id": "079ff509-cd42-4982-8144-f4915e996f83", - "metadata": {}, - "source": [ - "There is a recurrent bug that appears in some cases that errors the computation and should be fixed ideally before merging. The code below reproduces it:" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "d2470144-7c4d-4638-90a8-6bc4254128ef", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2023-08-11 11:19:01,747 - distributed.worker - WARNING - Compute Failed\n", - "Key: ('aggregate-combine-827a5db528ee8de42db84a4c3d7fb9a7', 2, 1)\n", - "Function: pipe\n", - "args: ([ sum-rando_0-17eccfbe7bc44d26fa589319100d6357 ... sum-rando_3-52ec0a920e05152510b5ff7aa35c0200\n", - "ZIP ... \n", - "29437 0 ... 0.000000\n", - "29472 0 ... 0.000000\n", - "29483 0 ... 0.008659\n", - "\n", - "[3 rows x 4 columns], sum-rando_0-17eccfbe7bc44d26fa589319100d6357 ... 
sum-rando_3-52ec0a920e05152510b5ff7aa35c0200\n", - "ZIP ... \n", - "29437 0.0 ... 0.0\n", - "29472 0.0 ... 0.0\n", - "29483 0.0 \n", - "kwargs: {}\n", - "Exception: 'TypeError(\\'can only concatenate str (not \"float\") to str\\')'\n", - "\n", - "2023-08-11 11:19:01,750 - distributed.worker - WARNING - Compute Failed\n", - "Key: ('aggregate-combine-827a5db528ee8de42db84a4c3d7fb9a7', 0, 1)\n", - "Function: pipe\n", - "args: ([ sum-rando_0-17eccfbe7bc44d26fa589319100d6357 ... sum-rando_3-52ec0a920e05152510b5ff7aa35c0200\n", - "ZIP ... \n", - "29438 0.905017 ... 0\n", - "\n", - "[1 rows x 4 columns], sum-rando_0-17eccfbe7bc44d26fa589319100d6357 ... sum-rando_3-52ec0a920e05152510b5ff7aa35c0200\n", - "ZIP ... \n", - "29449 0.001648 ... 0\n", - "29426 0.000000 ... 0\n", - "29414 0.000000 ... 0\n", - "\n", - "[3 rows x 4 columns], sum-rando_0-17eccfbe7bc44d26fa589319100d6357 ... sum-rando_3-52ec0a920e05152510b5ff7aa35c0200\n", - "ZIP \n", - "kwargs: {}\n", - "Exception: 'TypeError(\\'can only concatenate str (not \"float\") to str\\')'\n", - "\n", - "2023-08-11 11:19:01,761 - distributed.worker - ERROR - Exception during execution of task ('aggregate-combine-827a5db528ee8de42db84a4c3d7fb9a7', 4, 1).\n", - "Traceback (most recent call last):\n", - " File \"/opt/conda/envs/tobler/lib/python3.11/site-packages/zict/buffer.py\", line 184, in __getitem__\n", - " return self.fast[key]\n", - " ~~~~~~~~~^^^^^\n", - " File \"/opt/conda/envs/tobler/lib/python3.11/site-packages/zict/common.py\", line 127, in wrapper\n", - " return func(*args, **kwargs)\n", - " ^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/opt/conda/envs/tobler/lib/python3.11/site-packages/zict/lru.py\", line 117, in __getitem__\n", - " result = self.d[key]\n", - " ~~~~~~^^^^^\n", - "KeyError: \"('aggregate-chunk-827a5db528ee8de42db84a4c3d7fb9a7-1584b66fccc2ab7917ded8b6f5bb1127', 34)\"\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " 
File \"/opt/conda/envs/tobler/lib/python3.11/site-packages/distributed/worker.py\", line 2412, in _prepare_args_for_execution\n", - " data[k] = self.data[k]\n", - " ~~~~~~~~~^^^\n", - " File \"/opt/conda/envs/tobler/lib/python3.11/site-packages/distributed/spill.py\", line 216, in __getitem__\n", - " return super().__getitem__(key)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/opt/conda/envs/tobler/lib/python3.11/site-packages/zict/common.py\", line 127, in wrapper\n", - " return func(*args, **kwargs)\n", - " ^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/opt/conda/envs/tobler/lib/python3.11/site-packages/zict/buffer.py\", line 186, in __getitem__\n", - " return self.slow_to_fast(key)\n", - " ^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/opt/conda/envs/tobler/lib/python3.11/site-packages/zict/buffer.py\", line 153, in slow_to_fast\n", - " value = self.slow[key]\n", - " ~~~~~~~~~^^^^^\n", - " File \"/opt/conda/envs/tobler/lib/python3.11/site-packages/zict/common.py\", line 127, in wrapper\n", - " return func(*args, **kwargs)\n", - " ^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/opt/conda/envs/tobler/lib/python3.11/site-packages/zict/cache.py\", line 67, in __getitem__\n", - " gen = self._last_updated[key]\n", - " ~~~~~~~~~~~~~~~~~~^^^^^\n", - "KeyError: \"('aggregate-chunk-827a5db528ee8de42db84a4c3d7fb9a7-1584b66fccc2ab7917ded8b6f5bb1127', 34)\"\n", - "\n", - "During handling of the above exception, another exception occurred:\n", - "\n", - "Traceback (most recent call last):\n", - " File \"/opt/conda/envs/tobler/lib/python3.11/site-packages/distributed/worker.py\", line 2264, in execute\n", - " args2, kwargs2 = self._prepare_args_for_execution(ts, args, kwargs)\n", - " ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n", - " File \"/opt/conda/envs/tobler/lib/python3.11/site-packages/distributed/worker.py\", line 2416, in _prepare_args_for_execution\n", - " data[k] = Actor(type(self.state.actors[k]), self.address, k, self)\n", - " ~~~~~~~~~~~~~~~~~^^^\n", - "KeyError: 
\"('aggregate-chunk-827a5db528ee8de42db84a4c3d7fb9a7-1584b66fccc2ab7917ded8b6f5bb1127', 34)\"\n", - "2023-08-11 11:19:01,791 - distributed.worker - WARNING - Compute Failed\n", - "Key: ('aggregate-combine-827a5db528ee8de42db84a4c3d7fb9a7', 3, 1)\n", - "Function: pipe\n", - "args: ([ sum-rando_0-17eccfbe7bc44d26fa589319100d6357 ... sum-rando_3-52ec0a920e05152510b5ff7aa35c0200\n", - "ZIP ... \n", - "29487 0.002037 ... 0\n", - "29455 0.275390 ... 0\n", - "\n", - "[2 rows x 4 columns], sum-rando_0-17eccfbe7bc44d26fa589319100d6357 ... sum-rando_3-52ec0a920e05152510b5ff7aa35c0200\n", - "ZIP ... \n", - "29487 0.000000 ... 0.000000\n", - "29455 0.002053 ... 0.018981\n", - "\n", - "[2 rows x 4 columns], sum-rando_0-17eccfbe7bc44d26fa589319100d6357 ... sum-rando_3-52ec0a920e05152510b5ff7aa35c0200\n", - "ZIP \n", - "kwargs: {}\n", - "Exception: 'TypeError(\\'can only concatenate str (not \"float\") to str\\')'\n", - "\n" - ] - }, - { - "ename": "TypeError", - "evalue": "can only concatenate str (not \"float\") to str", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mTypeError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[9], line 13\u001b[0m\n\u001b[1;32m 1\u001b[0m dtracts \u001b[38;5;241m=\u001b[39m (\n\u001b[1;32m 2\u001b[0m dask_geopandas\u001b[38;5;241m.\u001b[39mfrom_geopandas(tracts[[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mgeometry\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mrando\u001b[39m\u001b[38;5;124m'\u001b[39m]], npartitions\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m16\u001b[39m)\n\u001b[1;32m 3\u001b[0m \u001b[38;5;241m.\u001b[39mspatial_shuffle(by\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mhilbert\u001b[39m\u001b[38;5;124m'\u001b[39m, 
shuffle\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtasks\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 4\u001b[0m )\n\u001b[1;32m 6\u001b[0m dzips \u001b[38;5;241m=\u001b[39m (\n\u001b[1;32m 7\u001b[0m dask_geopandas\u001b[38;5;241m.\u001b[39mfrom_geopandas(zip_codes[[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mZIP\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mgeometry\u001b[39m\u001b[38;5;124m'\u001b[39m]], npartitions\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m16\u001b[39m)\n\u001b[1;32m 8\u001b[0m \u001b[38;5;241m.\u001b[39mspatial_shuffle(by\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mhilbert\u001b[39m\u001b[38;5;124m'\u001b[39m, shuffle\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtasks\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 9\u001b[0m )\n\u001b[1;32m 11\u001b[0m cat_dk \u001b[38;5;241m=\u001b[39m \u001b[43mtobler\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43marea_weighted\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43marea_interpolate_dask\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 12\u001b[0m \u001b[43m \u001b[49m\u001b[43mdtracts\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdzips\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mZIP\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcategorical_variables\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mrando\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\n\u001b[0;32m---> 13\u001b[0m \u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcompute\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m/opt/conda/envs/tobler/lib/python3.11/site-packages/toolz/functoolz.py:628\u001b[0m, in \u001b[0;36mpipe\u001b[0;34m()\u001b[0m\n\u001b[1;32m 
608\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\" Pipe a value through a sequence of functions\u001b[39;00m\n\u001b[1;32m 609\u001b[0m \n\u001b[1;32m 610\u001b[0m \u001b[38;5;124;03mI.e. ``pipe(data, f, g, h)`` is equivalent to ``h(g(f(data)))``\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 625\u001b[0m \u001b[38;5;124;03m thread_last\u001b[39;00m\n\u001b[1;32m 626\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 627\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m func \u001b[38;5;129;01min\u001b[39;00m funcs:\n\u001b[0;32m--> 628\u001b[0m data \u001b[38;5;241m=\u001b[39m func(data)\n\u001b[1;32m 629\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m data\n", - "File \u001b[0;32m/opt/conda/envs/tobler/lib/python3.11/site-packages/dask/dataframe/groupby.py:1209\u001b[0m, in \u001b[0;36m_groupby_apply_funcs\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1207\u001b[0m result \u001b[38;5;241m=\u001b[39m collections\u001b[38;5;241m.\u001b[39mOrderedDict()\n\u001b[1;32m 1208\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m result_column, func, func_kwargs \u001b[38;5;129;01min\u001b[39;00m funcs:\n\u001b[0;32m-> 1209\u001b[0m r \u001b[38;5;241m=\u001b[39m func(grouped, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mfunc_kwargs)\n\u001b[1;32m 1211\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(r, \u001b[38;5;28mtuple\u001b[39m):\n\u001b[1;32m 1212\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m idx, s \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28menumerate\u001b[39m(r):\n", - "File \u001b[0;32m/opt/conda/envs/tobler/lib/python3.11/site-packages/dask/dataframe/groupby.py:1255\u001b[0m, in \u001b[0;36m_apply_func_to_column\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1252\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m column \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 1253\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m func(df_like)\n\u001b[0;32m-> 1255\u001b[0m 
\u001b[38;5;28;01mreturn\u001b[39;00m func(df_like[column])\n", - "File \u001b[0;32m/opt/conda/envs/tobler/lib/python3.11/site-packages/pandas/_libs/groupby.pyx:717\u001b[0m, in \u001b[0;36mpandas._libs.groupby.group_sum\u001b[0;34m()\u001b[0m\n\u001b[1;32m 715\u001b[0m t = val\n\u001b[1;32m 716\u001b[0m else:\n\u001b[0;32m--> 717\u001b[0m t = sumx[lab, j] + val\n\u001b[1;32m 718\u001b[0m sumx[lab, j] = t\n\u001b[1;32m 719\u001b[0m \n", - "\u001b[0;31mTypeError\u001b[0m: can only concatenate str (not \"float\") to str" - ] - } - ], - "source": [ - "dtracts = (\n", - " dask_geopandas.from_geopandas(tracts[['geometry', 'rando']], npartitions=16)\n", - " .spatial_shuffle(by='hilbert', shuffle=\"tasks\")\n", - ")\n", - "\n", - "dzips = (\n", - " dask_geopandas.from_geopandas(zip_codes[['ZIP', 'geometry']], npartitions=16)\n", - " .spatial_shuffle(by='hilbert', shuffle=\"tasks\")\n", - ")\n", - "\n", - "cat_dk = tobler.area_weighted.area_interpolate_dask(\n", - " dtracts, dzips, 'ZIP', categorical_variables=['rando']\n", - ").compute()" - ] - }, - { - "cell_type": "markdown", - "id": "08596315-236b-45df-955e-44a98b0a2eba", - "metadata": {}, - "source": [ - "[DAB]: my hunch is that the error, though cryptic and hard to debug, comes from a worker returning an empty result (perhaps `None`?) which, when it's passed through the aggregation post collection from the workers, raises the error. Further investigation is warranted." 
- ] } ], "metadata": { diff --git a/tobler/area_weighted/area_interpolate_dask.py b/tobler/area_weighted/area_interpolate_dask.py index 24b204d..2228172 100755 --- a/tobler/area_weighted/area_interpolate_dask.py +++ b/tobler/area_weighted/area_interpolate_dask.py @@ -10,6 +10,8 @@ from dask.highlevelgraph import HighLevelGraph from .area_interpolate import _area_interpolate_binning as area_interpolate +from dask.distributed import print as dprint + def area_interpolate_dask( source_dgdf, target_dgdf, @@ -154,11 +156,13 @@ def area_interpolate_dask( if categorical_variables is not None: out_categorical = ( transferred - [category_vars + [id_col]] - .groupby(id_col) + [category_vars] + .astype(float) + .groupby(transferred[id_col]) .agg({v: 'sum' for v in category_vars}) ) out = out.join(out_categorical, on=id_col) + #return transferred return out def id_area_interpolate( @@ -244,4 +248,5 @@ def id_area_interpolate( estimates = estimates.join( pandas.DataFrame(index=estimates.index, columns=category_vars_to_add) ) + #dprint(f"######################\n{estimates}\n######################") return estimates From 4cf10635c8ec408e2f83ab5317484f6006f4cda0 Mon Sep 17 00:00:00 2001 From: Dani Arribas-Bel Date: Mon, 14 Aug 2023 11:41:40 +0100 Subject: [PATCH 06/16] Remove debugging code --- tobler/area_weighted/area_interpolate_dask.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/tobler/area_weighted/area_interpolate_dask.py b/tobler/area_weighted/area_interpolate_dask.py index 2228172..82021ff 100755 --- a/tobler/area_weighted/area_interpolate_dask.py +++ b/tobler/area_weighted/area_interpolate_dask.py @@ -10,8 +10,6 @@ from dask.highlevelgraph import HighLevelGraph from .area_interpolate import _area_interpolate_binning as area_interpolate -from dask.distributed import print as dprint - def area_interpolate_dask( source_dgdf, target_dgdf, @@ -162,7 +160,6 @@ def area_interpolate_dask( .agg({v: 'sum' for v in category_vars}) ) out = out.join(out_categorical, 
on=id_col) - #return transferred return out def id_area_interpolate( @@ -248,5 +245,4 @@ def id_area_interpolate( estimates = estimates.join( pandas.DataFrame(index=estimates.index, columns=category_vars_to_add) ) - #dprint(f"######################\n{estimates}\n######################") return estimates From f71d984a6b88d484adde4ad4970804691f0e846f Mon Sep 17 00:00:00 2001 From: Dani Arribas-Bel Date: Mon, 14 Aug 2023 20:22:18 +0000 Subject: [PATCH 07/16] Add categorical_frequency --- 04_area_interpolate_dask.ipynb | 2 +- tobler/area_weighted/area_interpolate.py | 9 ++++++++- tobler/area_weighted/area_interpolate_dask.py | 5 ++++- 3 files changed, 13 insertions(+), 3 deletions(-) diff --git a/04_area_interpolate_dask.ipynb b/04_area_interpolate_dask.ipynb index d368967..128b57b 100644 --- a/04_area_interpolate_dask.ipynb +++ b/04_area_interpolate_dask.ipynb @@ -104,7 +104,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 2, "id": "d65ac8ec-51e2-4d2d-abb2-96a7519ed749", "metadata": { "tags": [] diff --git a/tobler/area_weighted/area_interpolate.py b/tobler/area_weighted/area_interpolate.py index 40bafe9..155d533 100644 --- a/tobler/area_weighted/area_interpolate.py +++ b/tobler/area_weighted/area_interpolate.py @@ -212,6 +212,7 @@ def _area_interpolate_binning( spatial_index="auto", n_jobs=1, categorical_variables=None, + categorical_frequency=True ): """ Area interpolation for extensive, intensive and categorical variables. @@ -249,6 +250,11 @@ def _area_interpolate_binning( available. If `table` is passed, this is ignored. categorical_variables : list [Optional. Default=None] Columns in dataframes for categorical variables + categorical_frequency : Boolean + [Optional. Default=True] If True, `estimates` returns the frequency of each + value in a categorical variable in every polygon of `target_df` (proportion of + area). 
If False, `estimates` contains the area in every polygon of `target_df` + that is occupied by each value of the categorical Returns ------- @@ -357,7 +363,8 @@ def _area_interpolate_binning( )[0] categorical = pd.DataFrame(categorical) - categorical = categorical.div(target_df.area.values, axis="rows") + if categorical_frequency is True: + categorical = categorical.div(target_df.area.values, axis="rows") if extensive_variables: dfs.append(extensive) diff --git a/tobler/area_weighted/area_interpolate_dask.py b/tobler/area_weighted/area_interpolate_dask.py index 2228172..6303cc2 100755 --- a/tobler/area_weighted/area_interpolate_dask.py +++ b/tobler/area_weighted/area_interpolate_dask.py @@ -102,7 +102,8 @@ def area_interpolate_dask( 'auto', 1, categorical_variables, - category_vars + category_vars, + False ) lr = source_dgdf.spatial_partitions.iloc[l] rr = target_dgdf.spatial_partitions.iloc[r] @@ -129,6 +130,7 @@ def area_interpolate_dask( n_jobs=1, categorical_variables=categorical_variables, category_vars=category_vars + categorical_frequency=False ) # Build output table transferred = dask_geopandas.GeoDataFrame( @@ -237,6 +239,7 @@ def id_area_interpolate( spatial_index=spatial_index, n_jobs=n_jobs, categorical_variables=categorical_variables, + categorical_frequency=False ) estimates[id_col] = target_df[id_col].values From bf157ef42767c2742ad223d237faf98000e1f2b6 Mon Sep 17 00:00:00 2001 From: Dani Arribas-Bel Date: Mon, 14 Aug 2023 20:51:44 +0000 Subject: [PATCH 08/16] Integrate categorical_frequency into Dask --- tobler/area_weighted/area_interpolate_dask.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/tobler/area_weighted/area_interpolate_dask.py b/tobler/area_weighted/area_interpolate_dask.py index ed90071..8a795ac 100755 --- a/tobler/area_weighted/area_interpolate_dask.py +++ b/tobler/area_weighted/area_interpolate_dask.py @@ -17,6 +17,7 @@ def area_interpolate_dask( extensive_variables=None, intensive_variables=None, 
categorical_variables=None, + categorical_frequency=True ): ''' Out-of-core and parallel area interpolation for categorical variables. @@ -45,6 +46,12 @@ def area_interpolate_dask( [Optional. Default=None] Columns in `source_dgdf` for categorical variables IMPORTANT: categorical variables must be of type `'category[known]'`. This is so all categories are known ahead of time and Dask can run lazily. + categorical_frequency : Boolean + [Optional. Default=True] If True, `estimates` returns the frequency of each + value in a categorical variable in every polygon of `target_df` (proportion of + area). If False, `estimates` contains the area in every polygon of `target_df` + that is occupied by each value of the categorical variable. + + Returns ------- @@ -101,7 +108,6 @@ def area_interpolate_dask( 1, categorical_variables, category_vars, - False ) lr = source_dgdf.spatial_partitions.iloc[l] rr = target_dgdf.spatial_partitions.iloc[r] @@ -127,8 +133,7 @@ def area_interpolate_dask( spatial_index='auto', n_jobs=1, categorical_variables=categorical_variables, - category_vars=category_vars - categorical_frequency=False + category_vars=category_vars, ) # Build output table transferred = dask_geopandas.GeoDataFrame( @@ -162,6 +167,11 @@ def area_interpolate_dask( .agg({v: 'sum' for v in category_vars}) ) out = out.join(out_categorical, on=id_col) + if categorical_frequency is True: + cols = out_categorical.columns.tolist() + out[cols] = out[cols].div( + out.area, axis='index' + ) return out def id_area_interpolate( From c2537165f46a3a1d6b01101564168252c42bbfce Mon Sep 17 00:00:00 2001 From: Dani Arribas-Bel Date: Wed, 16 Aug 2023 14:40:08 +0000 Subject: [PATCH 09/16] Move notebook to notebooks folder --- .ci/310.yml | 3 +++ .ci/311.yml | 3 +++ .ci/39.yml | 5 ++++- .../04_area_interpolate_dask.ipynb | 0 4 files changed, 10 insertions(+), 1 deletion(-) rename 04_area_interpolate_dask.ipynb => notebooks/04_area_interpolate_dask.ipynb (100%) diff --git a/.ci/310.yml b/.ci/310.yml index
f894ed7..e666b7a 100644 --- a/.ci/310.yml +++ b/.ci/310.yml @@ -3,6 +3,9 @@ channels: - conda-forge dependencies: - python=3.10 + - dask + - dask-geopandas + - distributed - jupyterlab - numpy - geopandas diff --git a/.ci/311.yml b/.ci/311.yml index a09fcd2..a533e59 100644 --- a/.ci/311.yml +++ b/.ci/311.yml @@ -4,6 +4,9 @@ channels: dependencies: - python=3.11 - jupyterlab + - dask + - dask-geopandas + - distributed - numpy - geopandas - pandas diff --git a/.ci/39.yml b/.ci/39.yml index 3eceed8..d029839 100644 --- a/.ci/39.yml +++ b/.ci/39.yml @@ -3,6 +3,9 @@ channels: - conda-forge dependencies: - python=3.9 + - dask + - dask-geopandas + - distributed - numpy - geopandas - pandas @@ -29,4 +32,4 @@ dependencies: - numpydoc - nbsphinx - joblib - - astropy \ No newline at end of file + - astropy diff --git a/04_area_interpolate_dask.ipynb b/notebooks/04_area_interpolate_dask.ipynb similarity index 100% rename from 04_area_interpolate_dask.ipynb rename to notebooks/04_area_interpolate_dask.ipynb From 07469c6ec2b1d6f85ef42ab6baec0e0140d33f24 Mon Sep 17 00:00:00 2001 From: Dani Arribas-Bel Date: Wed, 16 Aug 2023 14:58:04 +0000 Subject: [PATCH 10/16] Add test for dask_ai --- tobler/tests/test_area_interpolators.py | 30 ++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/tobler/tests/test_area_interpolators.py b/tobler/tests/test_area_interpolators.py index 27cd829..db49f56 100644 --- a/tobler/tests/test_area_interpolators.py +++ b/tobler/tests/test_area_interpolators.py @@ -1,9 +1,11 @@ """test interpolation functions.""" import geopandas +import dask_geopandas from libpysal.examples import load_example from numpy.testing import assert_almost_equal from tobler.area_weighted import area_interpolate +from tobler.area_weighted import area_interpolate_dask from tobler.area_weighted.area_interpolate import _area_tables_binning from geopandas.testing import assert_geodataframe_equal import pytest @@ -79,6 +81,32 @@ def 
test_area_interpolate_categorical(): assert_almost_equal(area.animal_capybara.sum(), 20, decimal=0) +def test_area_interpolate_categorical_dask(): + sac1, sac2 = datasets() + sac1['animal'] = sac1['animal'].astype('category') + dsac1 = ( + dask_geopandas.from_geopandas(sac1, npartitions=2) + .spatial_shuffle(by='hilbert', shuffle='tasks') + ) + dsac2 = ( + dask_geopandas.from_geopandas(sac2, npartitions=2) + .spatial_shuffle(by='hilbert', shuffle='tasks') + ) + area = area_interpolate_dask.area_interpolate_dask( + source_df=sac1, + target_df=sac2, + extensive_variables=["TOT_POP"], + intensive_variables=["pct_poverty"], + categorical_variables=["animal"], + n_jobs=1, + ) + assert_almost_equal(area.animal_cat.sum(), 32, decimal=0) + assert_almost_equal(area.animal_dog.sum(), 19, decimal=0) + assert_almost_equal(area.animal_donkey.sum(), 22, decimal=0) + assert_almost_equal(area.animal_wombat.sum(), 23, decimal=0) + assert_almost_equal(area.animal_capybara.sum(), 20, decimal=0) + + def test_area_interpolate_custom_index(): sac1, sac2 = datasets() sac1.index = sac1.index * 2 @@ -193,4 +221,4 @@ def test_passed_table(): table=dok, ) assert_almost_equal(area.TOT_POP.sum(), 1796856, decimal=0) - assert_almost_equal(area.pct_poverty.sum(), 2140, decimal=0) \ No newline at end of file + assert_almost_equal(area.pct_poverty.sum(), 2140, decimal=0) From 96d5831e4b3754f950e1324f27303072c17c4a05 Mon Sep 17 00:00:00 2001 From: Dani Arribas-Bel Date: Fri, 18 Aug 2023 13:53:00 +0100 Subject: [PATCH 11/16] Change test as suggested by @knaaptime --- tobler/tests/test_area_interpolators.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tobler/tests/test_area_interpolators.py b/tobler/tests/test_area_interpolators.py index db49f56..d5adf61 100644 --- a/tobler/tests/test_area_interpolators.py +++ b/tobler/tests/test_area_interpolators.py @@ -92,7 +92,7 @@ def test_area_interpolate_categorical_dask(): dask_geopandas.from_geopandas(sac2, npartitions=2) 
.spatial_shuffle(by='hilbert', shuffle='tasks') ) - area = area_interpolate_dask.area_interpolate_dask( + area = area_interpolate_dask( source_df=sac1, target_df=sac2, extensive_variables=["TOT_POP"], From 9f20b02a556d4fe0442d677a7d7a8c5657778d02 Mon Sep 17 00:00:00 2001 From: Dani Arribas-Bel Date: Fri, 18 Aug 2023 14:01:37 +0100 Subject: [PATCH 12/16] Add optional imports for dask(-geopandas) --- tobler/area_weighted/area_interpolate_dask.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/tobler/area_weighted/area_interpolate_dask.py b/tobler/area_weighted/area_interpolate_dask.py index 8a795ac..7f3b26e 100755 --- a/tobler/area_weighted/area_interpolate_dask.py +++ b/tobler/area_weighted/area_interpolate_dask.py @@ -4,11 +4,18 @@ import pandas import geopandas -import dask_geopandas import numpy as np -from dask.base import tokenize -from dask.highlevelgraph import HighLevelGraph from .area_interpolate import _area_interpolate_binning as area_interpolate +try: + import dask_geopandas + from dask.base import tokenize + from dask.highlevelgraph import HighLevelGraph +except ImportError: + raise ImportError( + "Area interpolation with Dask requires `dask` and " + "`dask_geopandas` installed to run. Please install them " + "before importing this functionality." 
+ ) def area_interpolate_dask( source_dgdf, From 6a9cc3f30d1d12f6cf43c56f6d60f285c4158e9a Mon Sep 17 00:00:00 2001 From: Dani Arribas-Bel Date: Fri, 18 Aug 2023 14:04:46 +0100 Subject: [PATCH 13/16] Minor fix for dask test --- tobler/tests/test_area_interpolators.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/tobler/tests/test_area_interpolators.py b/tobler/tests/test_area_interpolators.py index d5adf61..9070aa9 100644 --- a/tobler/tests/test_area_interpolators.py +++ b/tobler/tests/test_area_interpolators.py @@ -93,12 +93,9 @@ def test_area_interpolate_categorical_dask(): .spatial_shuffle(by='hilbert', shuffle='tasks') ) area = area_interpolate_dask( - source_df=sac1, - target_df=sac2, - extensive_variables=["TOT_POP"], - intensive_variables=["pct_poverty"], + source_dgdf=sac1, + target_dgdf=sac2, categorical_variables=["animal"], - n_jobs=1, ) assert_almost_equal(area.animal_cat.sum(), 32, decimal=0) assert_almost_equal(area.animal_dog.sum(), 19, decimal=0) From d6ed8f602f37aa9b66206822b7a3b962adb7c02c Mon Sep 17 00:00:00 2001 From: Dani Arribas-Bel Date: Fri, 18 Aug 2023 14:21:55 +0100 Subject: [PATCH 14/16] more typo fixing of dask tests --- tobler/tests/test_area_interpolators.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tobler/tests/test_area_interpolators.py b/tobler/tests/test_area_interpolators.py index 9070aa9..94a397a 100644 --- a/tobler/tests/test_area_interpolators.py +++ b/tobler/tests/test_area_interpolators.py @@ -95,6 +95,7 @@ def test_area_interpolate_categorical_dask(): area = area_interpolate_dask( source_dgdf=sac1, target_dgdf=sac2, + id_col='ZIP', categorical_variables=["animal"], ) assert_almost_equal(area.animal_cat.sum(), 32, decimal=0) From d316125a139d571742d17a497fde7f09bdb980b4 Mon Sep 17 00:00:00 2001 From: Dani Arribas-Bel Date: Fri, 18 Aug 2023 14:35:26 +0100 Subject: [PATCH 15/16] More typos... 
--- tobler/tests/test_area_interpolators.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tobler/tests/test_area_interpolators.py b/tobler/tests/test_area_interpolators.py index 94a397a..4531ea3 100644 --- a/tobler/tests/test_area_interpolators.py +++ b/tobler/tests/test_area_interpolators.py @@ -93,8 +93,8 @@ def test_area_interpolate_categorical_dask(): .spatial_shuffle(by='hilbert', shuffle='tasks') ) area = area_interpolate_dask( - source_dgdf=sac1, - target_dgdf=sac2, + source_dgdf=dsac1, + target_dgdf=dsac2, id_col='ZIP', categorical_variables=["animal"], ) From 17841d328ff99ae7a49b8228c354262584b98c78 Mon Sep 17 00:00:00 2001 From: Dani Arribas-Bel Date: Fri, 18 Aug 2023 14:45:01 +0100 Subject: [PATCH 16/16] loading dask gdf to memory for tests --- tobler/tests/test_area_interpolators.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tobler/tests/test_area_interpolators.py b/tobler/tests/test_area_interpolators.py index 4531ea3..bc8791b 100644 --- a/tobler/tests/test_area_interpolators.py +++ b/tobler/tests/test_area_interpolators.py @@ -97,7 +97,7 @@ def test_area_interpolate_categorical_dask(): target_dgdf=dsac2, id_col='ZIP', categorical_variables=["animal"], - ) + ).compute() assert_almost_equal(area.animal_cat.sum(), 32, decimal=0) assert_almost_equal(area.animal_dog.sum(), 19, decimal=0) assert_almost_equal(area.animal_donkey.sum(), 22, decimal=0)