From 64e3bb870ab534b128883dff8fd24e3f544e22f4 Mon Sep 17 00:00:00 2001
From: Dani Arribas-Bel <daniel.arribas.bel@gmail.com>
Date: Thu, 10 Aug 2023 15:24:10 +0100
Subject: [PATCH 01/16] Start commit

---
 04_area_interpolate_dask.ipynb                | 427 ++++++++++++++++++
 tobler/area_weighted/__init__.py              |   3 +-
 tobler/area_weighted/area_interpolate_dask.py | 152 +++++++
 3 files changed, 581 insertions(+), 1 deletion(-)
 create mode 100644 04_area_interpolate_dask.ipynb
 create mode 100755 tobler/area_weighted/area_interpolate_dask.py

diff --git a/04_area_interpolate_dask.ipynb b/04_area_interpolate_dask.ipynb
new file mode 100644
index 0000000..c4ae195
--- /dev/null
+++ b/04_area_interpolate_dask.ipynb
@@ -0,0 +1,427 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "e3f2586a-5b6a-4d46-b6e8-1991ae3bec6f",
+   "metadata": {},
+   "source": [
+    "# (Distributed) areal interpolation"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "00f875bd-2714-4551-b10c-1ef3f514478d",
+   "metadata": {},
+   "source": [
+    "In this notebook, we compare the single-core version in `tobler.area_weighted.area_interpolate` with the distributed version in `tobler.area_weighted.area_interpolate_dask`. "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "b70ac531-082c-4e77-9194-c5d4096b72ae",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "import os\n",
+    "os.environ['USE_PYGEOS'] = '1'\n",
+    "\n",
+    "import geopandas\n",
+    "import dask_geopandas\n",
+    "import tobler\n",
+    "\n",
+    "from dask.distributed import Client, LocalCluster"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d16a2e15-866b-407d-b65d-54a675aefbd7",
+   "metadata": {},
+   "source": [
+    "## Setup"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "e8858b38-0a72-4f72-98be-c490f8201d86",
+   "metadata": {},
+   "source": [
+    "We use the San Diego H3 dataset from the [GDS Book](https://geographicdata.science/book/data/h3_grid/build_sd_h3_grid.html):"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "dc60a91c-0d58-4b19-b180-69692286c9a0",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "h3 = geopandas.read_file((\n",
+    "    'https://geographicdata.science/book/'\n",
+    "    '_downloads/d740a1069144baa1302b9561c3d31afe/sd_h3_grid.gpkg'\n",
+    ")).to_crs(epsg=3310)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "41b31033-9711-4102-98af-66b3b6945bcb",
+   "metadata": {},
+   "source": [
+    "And the Census tracts dataset, from the same [source](https://geographicdata.science/book/data/sandiego/sandiego_tracts_cleaning.html), clipped to the extent of the H3 grid:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "271b9208-1ae7-41f0-8234-da1d1ed2030a",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/Users/dani/mambaforge/envs/gds/lib/python3.10/site-packages/pygeos/set_operations.py:129: RuntimeWarning: invalid value encountered in intersection\n",
+      "  return lib.intersection(a, b, **kwargs)\n"
+     ]
+    }
+   ],
+   "source": [
+    "tracts = (\n",
+    "    geopandas.read_file((\n",
+    "        'https://geographicdata.science/book/'\n",
+    "        '_downloads/f2341ee89163afe06b42fc5d5ed38060/sandiego_tracts.gpkg'\n",
+    "    ))\n",
+    "    .to_crs(epsg=3310)\n",
+    "    .clip(h3)\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "2ebddef3-ed10-4164-99bb-3ca07f3aa1de",
+   "metadata": {},
+   "source": [
+    "Note that in both cases we require a projected CRS, so we reproject to [NAD83 / California Albers](https://epsg.io/3310) (EPSG:3310)."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "d2546bb7-abcb-4cad-8db8-c569ea9289ae",
+   "metadata": {},
+   "source": [
+    "We will set up a local Dask cluster:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "d65ac8ec-51e2-4d2d-abb2-96a7519ed749",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2023-08-10 15:19:49,969 - distributed.diskutils - INFO - Found stale lock file and directory '/var/folders/dz/5zvlmz1s0739pm0wx2ryxjf00000gn/T/dask-worker-space/worker-taj3n78d', purging\n",
+      "2023-08-10 15:19:49,970 - distributed.diskutils - INFO - Found stale lock file and directory '/var/folders/dz/5zvlmz1s0739pm0wx2ryxjf00000gn/T/dask-worker-space/worker-283edkzp', purging\n",
+      "2023-08-10 15:19:49,970 - distributed.diskutils - INFO - Found stale lock file and directory '/var/folders/dz/5zvlmz1s0739pm0wx2ryxjf00000gn/T/dask-worker-space/worker-5sq_o8d_', purging\n",
+      "2023-08-10 15:19:49,971 - distributed.diskutils - INFO - Found stale lock file and directory '/var/folders/dz/5zvlmz1s0739pm0wx2ryxjf00000gn/T/dask-worker-space/worker-8pnf3b0w', purging\n",
+      "2023-08-10 15:19:49,972 - distributed.diskutils - INFO - Found stale lock file and directory '/var/folders/dz/5zvlmz1s0739pm0wx2ryxjf00000gn/T/dask-worker-space/worker-0kn_mkzc', purging\n",
+      "2023-08-10 15:19:49,973 - distributed.diskutils - INFO - Found stale lock file and directory '/var/folders/dz/5zvlmz1s0739pm0wx2ryxjf00000gn/T/dask-worker-space/worker-kkw_crni', purging\n",
+      "2023-08-10 15:19:49,974 - distributed.diskutils - INFO - Found stale lock file and directory '/var/folders/dz/5zvlmz1s0739pm0wx2ryxjf00000gn/T/dask-worker-space/worker-8ycp859d', purging\n",
+      "2023-08-10 15:19:49,974 - distributed.diskutils - INFO - Found stale lock file and directory '/var/folders/dz/5zvlmz1s0739pm0wx2ryxjf00000gn/T/dask-worker-space/worker-fb2nvcnm', purging\n",
+      "2023-08-10 15:19:49,974 - distributed.diskutils - INFO - Found stale lock file and directory '/var/folders/dz/5zvlmz1s0739pm0wx2ryxjf00000gn/T/dask-worker-space/worker-mpy0p1sd', purging\n",
+      "2023-08-10 15:19:49,975 - distributed.diskutils - INFO - Found stale lock file and directory '/var/folders/dz/5zvlmz1s0739pm0wx2ryxjf00000gn/T/dask-worker-space/worker-y5wx5vyl', purging\n"
+     ]
+    }
+   ],
+   "source": [
+    "client = Client(LocalCluster(n_workers=10))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "88c32c7d-0ca8-4945-a1f8-edfbc8917880",
+   "metadata": {},
+   "source": [
+    "Finally, for Dask, we need to provide `dask_geopandas.GeoDataFrame` objects with spatial partitions and categorical variables properly set up:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "7fef3124-a5d9-4712-bf9e-53fdf344c37f",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "tracts['sub_30'] = tracts['sub_30'].astype('category')\n",
+    "tracts['tract'] = tracts['tract'].astype('category')\n",
+    "\n",
+    "dtracts = (\n",
+    "    dask_geopandas.from_geopandas(tracts[\n",
+    "        ['geometry', 'sub_30', 'tract', 'total_pop', 'total_pop_white']\n",
+    "    ], npartitions=10)\n",
+    "    .spatial_shuffle(by='hilbert', shuffle=\"tasks\")\n",
+    ")\n",
+    "\n",
+    "dh3 = (\n",
+    "    dask_geopandas.from_geopandas(h3, npartitions=10)\n",
+    "    .spatial_shuffle(by='hilbert', shuffle=\"tasks\")\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "54f986ec-ea46-479e-aed8-5edeeaf16fda",
+   "metadata": {},
+   "source": [
+    "---\n",
+    "\n",
+    "**IMPORTANT** - At this point, only *extensive* and *categorical* variables are implemented (passing intensive variables raises `NotImplementedError`), so those are what we will test.\n",
+    "\n",
+    "---"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b783aabc-8221-40f6-a0d5-bf21dd75e2a6",
+   "metadata": {},
+   "source": [
+    "## Correctness"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "958a9509-2666-4cf5-88f0-3e22ab8d8eac",
+   "metadata": {},
+   "source": [
+    "### Extensive\n",
+    "\n",
+    "Here we transfer the total population from `tracts` to `h3`."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c0e1d903-1e6f-446b-9d46-93cbdce3bc76",
+   "metadata": {},
+   "source": [
+    "First, we transfer with the single-core approach:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "2d5dd9e8-4c55-43d0-9730-4b1f0826305f",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/Users/dani/mambaforge/envs/gds/lib/python3.10/site-packages/shapely/predicates.py:798: RuntimeWarning: invalid value encountered in intersects\n",
+      "  return lib.intersects(a, b, **kwargs)\n",
+      "/Users/dani/mambaforge/envs/gds/lib/python3.10/site-packages/shapely/set_operations.py:133: RuntimeWarning: invalid value encountered in intersection\n",
+      "  return lib.intersection(a, b, **kwargs)\n"
+     ]
+    }
+   ],
+   "source": [
+    "ext_sc = tobler.area_weighted.area_interpolate(\n",
+    "    tracts, h3, extensive_variables=['total_pop', 'total_pop_white']\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b7604a13-770d-45d3-bd36-23464ff39138",
+   "metadata": {},
+   "source": [
+    "Then we perform the same operation using Dask:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "0a84dc41-f128-4e4f-98da-44c7729b73a9",
+   "metadata": {
+    "collapsed": true,
+    "jupyter": {
+     "outputs_hidden": true
+    },
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/Users/dani/mambaforge/envs/gds/lib/python3.10/site-packages/shapely/set_operations.py:133: RuntimeWarning: invalid value encountered in intersection\n",
+      "  return lib.intersection(a, b, **kwargs)\n"
+     ]
+    },
+    {
+     "ename": "TypeError",
+     "evalue": "_area_interpolate_binning() got an unexpected keyword argument 'spatial_index'",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mTypeError\u001b[0m                                 Traceback (most recent call last)",
+      "Cell \u001b[0;32mIn[7], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m ext_dk \u001b[38;5;241m=\u001b[39m \u001b[43mtobler\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43marea_weighted\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43marea_interpolate_dask\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m      2\u001b[0m \u001b[43m    \u001b[49m\u001b[43mdtracts\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdh3\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mhex_id\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mextensive_variables\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mtotal_pop\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mtotal_pop_white\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\n\u001b[1;32m      3\u001b[0m \u001b[43m)\u001b[49m\n",
+      "File \u001b[0;32m~/code/tobler_darribas/tobler/area_weighted/area_interpolate_dask.py:74\u001b[0m, in \u001b[0;36marea_interpolate_dask\u001b[0;34m(left_dgdf, right_dgdf, id_col, extensive_variables, intensive_variables, categorical_variables)\u001b[0m\n\u001b[1;32m     70\u001b[0m graph \u001b[38;5;241m=\u001b[39m HighLevelGraph\u001b[38;5;241m.\u001b[39mfrom_collections(\n\u001b[1;32m     71\u001b[0m     name, dsk, dependencies\u001b[38;5;241m=\u001b[39m[left_dgdf, right_dgdf]\n\u001b[1;32m     72\u001b[0m )\n\u001b[1;32m     73\u001b[0m \u001b[38;5;66;03m# Get metadata for the outcome table\u001b[39;00m\n\u001b[0;32m---> 74\u001b[0m meta \u001b[38;5;241m=\u001b[39m \u001b[43mid_area_interpolate\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m     75\u001b[0m \u001b[43m    \u001b[49m\u001b[43mleft_dgdf\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_meta\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m     76\u001b[0m \u001b[43m    \u001b[49m\u001b[43mright_dgdf\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_meta\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m     77\u001b[0m \u001b[43m    \u001b[49m\u001b[43mid_col\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m     78\u001b[0m \u001b[43m    \u001b[49m\u001b[43mextensive_variables\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mextensive_variables\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m     79\u001b[0m \u001b[43m    \u001b[49m\u001b[43mintensive_variables\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mintensive_variables\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m     80\u001b[0m \u001b[43m    \u001b[49m\u001b[43mtable\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m     81\u001b[0m \u001b[43m    \u001b[49m\u001b[43mallocate_total\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m     82\u001b[0m \u001b[43m    
\u001b[49m\u001b[43mspatial_index\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mauto\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m     83\u001b[0m \u001b[43m    \u001b[49m\u001b[43mn_jobs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m     84\u001b[0m \u001b[43m    \u001b[49m\u001b[43mcategorical_variables\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcategorical_variables\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m     85\u001b[0m \u001b[43m    \u001b[49m\u001b[43mcategory_vars\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcategory_vars\u001b[49m\n\u001b[1;32m     86\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m     87\u001b[0m \u001b[38;5;66;03m# Build output table\u001b[39;00m\n\u001b[1;32m     88\u001b[0m transferred \u001b[38;5;241m=\u001b[39m dask_geopandas\u001b[38;5;241m.\u001b[39mGeoDataFrame(\n\u001b[1;32m     89\u001b[0m     graph, \n\u001b[1;32m     90\u001b[0m     name,\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m     93\u001b[0m     new_spatial_partitions\n\u001b[1;32m     94\u001b[0m )\n",
+      "File \u001b[0;32m~/code/tobler_darribas/tobler/area_weighted/area_interpolate_dask.py:131\u001b[0m, in \u001b[0;36mid_area_interpolate\u001b[0;34m(source_df, target_df, id_col, extensive_variables, intensive_variables, table, allocate_total, spatial_index, n_jobs, categorical_variables, category_vars)\u001b[0m\n\u001b[1;32m    118\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mid_area_interpolate\u001b[39m(\n\u001b[1;32m    119\u001b[0m     source_df,\n\u001b[1;32m    120\u001b[0m     target_df,\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m    129\u001b[0m     category_vars\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m    130\u001b[0m ):\n\u001b[0;32m--> 131\u001b[0m     estimates \u001b[38;5;241m=\u001b[39m \u001b[43marea_interpolate\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m    132\u001b[0m \u001b[43m        \u001b[49m\u001b[43msource_df\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    133\u001b[0m \u001b[43m        \u001b[49m\u001b[43mtarget_df\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    134\u001b[0m \u001b[43m        \u001b[49m\u001b[43mextensive_variables\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mextensive_variables\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    135\u001b[0m \u001b[43m        \u001b[49m\u001b[43mintensive_variables\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mintensive_variables\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    136\u001b[0m \u001b[43m        \u001b[49m\u001b[43mtable\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtable\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    137\u001b[0m \u001b[43m        \u001b[49m\u001b[43mallocate_total\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mallocate_total\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    138\u001b[0m \u001b[43m        \u001b[49m\u001b[43mspatial_index\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mspatial_index\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    139\u001b[0m \u001b[43m        
\u001b[49m\u001b[43mn_jobs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mn_jobs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    140\u001b[0m \u001b[43m        \u001b[49m\u001b[43mcategorical_variables\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcategorical_variables\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    141\u001b[0m \u001b[43m    \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    142\u001b[0m     estimates[id_col] \u001b[38;5;241m=\u001b[39m target_df[id_col]\u001b[38;5;241m.\u001b[39mvalues\n\u001b[1;32m    144\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m categorical_variables \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n",
+      "\u001b[0;31mTypeError\u001b[0m: _area_interpolate_binning() got an unexpected keyword argument 'spatial_index'"
+     ]
+    }
+   ],
+   "source": [
+    "ext_dk = tobler.area_weighted.area_interpolate_dask(\n",
+    "    dtracts, dh3, 'hex_id', extensive_variables=['total_pop', 'total_pop_white']\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "db296055-3865-43f8-bfd0-0ea40f246ba7",
+   "metadata": {},
+   "source": [
+    "### Categorical"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "8cf00b26-765c-40ce-a78a-dcfce4838c88",
+   "metadata": {},
+   "source": [
+    "Single-core:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "cbb3dbbd-70e4-4c3d-935e-6a6b60341f7c",
+   "metadata": {
+    "collapsed": true,
+    "jupyter": {
+     "outputs_hidden": true
+    },
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/Users/dani/mambaforge/envs/gds/lib/python3.10/site-packages/shapely/predicates.py:798: RuntimeWarning: invalid value encountered in intersects\n",
+      "  return lib.intersects(a, b, **kwargs)\n",
+      "/Users/dani/mambaforge/envs/gds/lib/python3.10/site-packages/shapely/set_operations.py:133: RuntimeWarning: invalid value encountered in intersection\n",
+      "  return lib.intersection(a, b, **kwargs)\n"
+     ]
+    },
+    {
+     "ename": "TypeError",
+     "evalue": "Object with dtype category cannot perform the numpy op isnan",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mTypeError\u001b[0m                                 Traceback (most recent call last)",
+      "Cell \u001b[0;32mIn[8], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m cat_sc \u001b[38;5;241m=\u001b[39m \u001b[43mtobler\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43marea_weighted\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43marea_interpolate\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m      2\u001b[0m \u001b[43m    \u001b[49m\u001b[43mtracts\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mh3\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mextensive_variables\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43msub_30\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mtract\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\n\u001b[1;32m      3\u001b[0m \u001b[43m)\u001b[49m\n",
+      "File \u001b[0;32m~/code/tobler_darribas/tobler/area_weighted/area_interpolate.py:267\u001b[0m, in \u001b[0;36m_area_interpolate_binning\u001b[0;34m(source_df, target_df, extensive_variables, intensive_variables, table, allocate_total)\u001b[0m\n\u001b[1;32m    265\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m extensive_variables:\n\u001b[1;32m    266\u001b[0m     \u001b[38;5;28;01mfor\u001b[39;00m variable \u001b[38;5;129;01min\u001b[39;00m extensive_variables:\n\u001b[0;32m--> 267\u001b[0m         vals \u001b[38;5;241m=\u001b[39m \u001b[43m_nan_check\u001b[49m\u001b[43m(\u001b[49m\u001b[43msource_df\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mvariable\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    268\u001b[0m         vals \u001b[38;5;241m=\u001b[39m _inf_check(source_df, variable)\n\u001b[1;32m    269\u001b[0m         estimates \u001b[38;5;241m=\u001b[39m diags([vals], [\u001b[38;5;241m0\u001b[39m])\u001b[38;5;241m.\u001b[39mdot(weights)\n",
+      "File \u001b[0;32m~/code/tobler_darribas/tobler/util/util.py:25\u001b[0m, in \u001b[0;36m_nan_check\u001b[0;34m(df, column)\u001b[0m\n\u001b[1;32m     20\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"Check if variable has nan values.\u001b[39;00m\n\u001b[1;32m     21\u001b[0m \n\u001b[1;32m     22\u001b[0m \u001b[38;5;124;03mWarn and replace nan with 0.0.\u001b[39;00m\n\u001b[1;32m     23\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m     24\u001b[0m values \u001b[38;5;241m=\u001b[39m df[column]\u001b[38;5;241m.\u001b[39mvalues\n\u001b[0;32m---> 25\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m np\u001b[38;5;241m.\u001b[39many(\u001b[43mnp\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43misnan\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvalues\u001b[49m\u001b[43m)\u001b[49m) \u001b[38;5;129;01mor\u001b[39;00m np\u001b[38;5;241m.\u001b[39many(np\u001b[38;5;241m.\u001b[39misinf(values)):\n\u001b[1;32m     26\u001b[0m     wherenan \u001b[38;5;241m=\u001b[39m np\u001b[38;5;241m.\u001b[39misnan(values)\n\u001b[1;32m     27\u001b[0m     values[wherenan] \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0.0\u001b[39m\n",
+      "File \u001b[0;32m~/mambaforge/envs/gds/lib/python3.10/site-packages/pandas/core/arrays/categorical.py:1639\u001b[0m, in \u001b[0;36mCategorical.__array_ufunc__\u001b[0;34m(self, ufunc, method, *inputs, **kwargs)\u001b[0m\n\u001b[1;32m   1635\u001b[0m         \u001b[38;5;28;01mreturn\u001b[39;00m result\n\u001b[1;32m   1637\u001b[0m \u001b[38;5;66;03m# for all other cases, raise for now (similarly as what happens in\u001b[39;00m\n\u001b[1;32m   1638\u001b[0m \u001b[38;5;66;03m# Series.__array_prepare__)\u001b[39;00m\n\u001b[0;32m-> 1639\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mTypeError\u001b[39;00m(\n\u001b[1;32m   1640\u001b[0m     \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mObject with dtype \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdtype\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m cannot perform \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m   1641\u001b[0m     \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mthe numpy op \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mufunc\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m   1642\u001b[0m )\n",
+      "\u001b[0;31mTypeError\u001b[0m: Object with dtype category cannot perform the numpy op isnan"
+     ]
+    }
+   ],
+   "source": [
+    "cat_sc = tobler.area_weighted.area_interpolate(\n",
+    "    tracts, h3, extensive_variables=['sub_30', 'tract']\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "76507c2b-769a-4b80-8b62-40206e2cab42",
+   "metadata": {},
+   "source": [
+    "And through Dask:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "7107527b-88cc-4e9c-97d2-72a1d153c657",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/Users/dani/mambaforge/envs/gds/lib/python3.10/site-packages/shapely/set_operations.py:133: RuntimeWarning: invalid value encountered in intersection\n",
+      "  return lib.intersection(a, b, **kwargs)\n"
+     ]
+    },
+    {
+     "ename": "TypeError",
+     "evalue": "_area_interpolate_binning() got an unexpected keyword argument 'spatial_index'",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mTypeError\u001b[0m                                 Traceback (most recent call last)",
+      "Cell \u001b[0;32mIn[9], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m ext_dk \u001b[38;5;241m=\u001b[39m \u001b[43mtobler\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43marea_weighted\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43marea_interpolate_dask\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m      2\u001b[0m \u001b[43m    \u001b[49m\u001b[43mdtracts\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdh3\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mhex_id\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mextensive_variables\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43msub_30\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mtract\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\n\u001b[1;32m      3\u001b[0m \u001b[43m)\u001b[49m\n",
+      "File \u001b[0;32m~/code/tobler_darribas/tobler/area_weighted/area_interpolate_dask.py:74\u001b[0m, in \u001b[0;36marea_interpolate_dask\u001b[0;34m(left_dgdf, right_dgdf, id_col, extensive_variables, intensive_variables, categorical_variables)\u001b[0m\n\u001b[1;32m     70\u001b[0m graph \u001b[38;5;241m=\u001b[39m HighLevelGraph\u001b[38;5;241m.\u001b[39mfrom_collections(\n\u001b[1;32m     71\u001b[0m     name, dsk, dependencies\u001b[38;5;241m=\u001b[39m[left_dgdf, right_dgdf]\n\u001b[1;32m     72\u001b[0m )\n\u001b[1;32m     73\u001b[0m \u001b[38;5;66;03m# Get metadata for the outcome table\u001b[39;00m\n\u001b[0;32m---> 74\u001b[0m meta \u001b[38;5;241m=\u001b[39m \u001b[43mid_area_interpolate\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m     75\u001b[0m \u001b[43m    \u001b[49m\u001b[43mleft_dgdf\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_meta\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m     76\u001b[0m \u001b[43m    \u001b[49m\u001b[43mright_dgdf\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_meta\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m     77\u001b[0m \u001b[43m    \u001b[49m\u001b[43mid_col\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m     78\u001b[0m \u001b[43m    \u001b[49m\u001b[43mextensive_variables\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mextensive_variables\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m     79\u001b[0m \u001b[43m    \u001b[49m\u001b[43mintensive_variables\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mintensive_variables\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m     80\u001b[0m \u001b[43m    \u001b[49m\u001b[43mtable\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m     81\u001b[0m \u001b[43m    \u001b[49m\u001b[43mallocate_total\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m     82\u001b[0m \u001b[43m    
\u001b[49m\u001b[43mspatial_index\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mauto\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m     83\u001b[0m \u001b[43m    \u001b[49m\u001b[43mn_jobs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m     84\u001b[0m \u001b[43m    \u001b[49m\u001b[43mcategorical_variables\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcategorical_variables\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m     85\u001b[0m \u001b[43m    \u001b[49m\u001b[43mcategory_vars\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcategory_vars\u001b[49m\n\u001b[1;32m     86\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m     87\u001b[0m \u001b[38;5;66;03m# Build output table\u001b[39;00m\n\u001b[1;32m     88\u001b[0m transferred \u001b[38;5;241m=\u001b[39m dask_geopandas\u001b[38;5;241m.\u001b[39mGeoDataFrame(\n\u001b[1;32m     89\u001b[0m     graph, \n\u001b[1;32m     90\u001b[0m     name,\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m     93\u001b[0m     new_spatial_partitions\n\u001b[1;32m     94\u001b[0m )\n",
+      "File \u001b[0;32m~/code/tobler_darribas/tobler/area_weighted/area_interpolate_dask.py:131\u001b[0m, in \u001b[0;36mid_area_interpolate\u001b[0;34m(source_df, target_df, id_col, extensive_variables, intensive_variables, table, allocate_total, spatial_index, n_jobs, categorical_variables, category_vars)\u001b[0m\n\u001b[1;32m    118\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mid_area_interpolate\u001b[39m(\n\u001b[1;32m    119\u001b[0m     source_df,\n\u001b[1;32m    120\u001b[0m     target_df,\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m    129\u001b[0m     category_vars\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m    130\u001b[0m ):\n\u001b[0;32m--> 131\u001b[0m     estimates \u001b[38;5;241m=\u001b[39m \u001b[43marea_interpolate\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m    132\u001b[0m \u001b[43m        \u001b[49m\u001b[43msource_df\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    133\u001b[0m \u001b[43m        \u001b[49m\u001b[43mtarget_df\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    134\u001b[0m \u001b[43m        \u001b[49m\u001b[43mextensive_variables\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mextensive_variables\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    135\u001b[0m \u001b[43m        \u001b[49m\u001b[43mintensive_variables\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mintensive_variables\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    136\u001b[0m \u001b[43m        \u001b[49m\u001b[43mtable\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtable\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    137\u001b[0m \u001b[43m        \u001b[49m\u001b[43mallocate_total\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mallocate_total\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    138\u001b[0m \u001b[43m        \u001b[49m\u001b[43mspatial_index\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mspatial_index\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    139\u001b[0m \u001b[43m        
\u001b[49m\u001b[43mn_jobs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mn_jobs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    140\u001b[0m \u001b[43m        \u001b[49m\u001b[43mcategorical_variables\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcategorical_variables\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    141\u001b[0m \u001b[43m    \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    142\u001b[0m     estimates[id_col] \u001b[38;5;241m=\u001b[39m target_df[id_col]\u001b[38;5;241m.\u001b[39mvalues\n\u001b[1;32m    144\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m categorical_variables \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n",
+      "\u001b[0;31mTypeError\u001b[0m: _area_interpolate_binning() got an unexpected keyword argument 'spatial_index'"
+     ]
+    }
+   ],
+   "source": [
+    "ext_dk = tobler.area_weighted.area_interpolate_dask(\n",
+    "    dtracts, dh3, 'hex_id', extensive_variables=['sub_30', 'tract']\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "571e7878-25ad-49bc-a7a5-a632988f6a4b",
+   "metadata": {},
+   "source": [
+    "## Performance"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.10"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/tobler/area_weighted/__init__.py b/tobler/area_weighted/__init__.py
index dea94ee..05d056d 100644
--- a/tobler/area_weighted/__init__.py
+++ b/tobler/area_weighted/__init__.py
@@ -1,4 +1,5 @@
 from .area_interpolate import _area_interpolate_binning as area_interpolate
 from .area_interpolate import _area_interpolate as _slow_area_interpolate
 from .area_interpolate import _area_tables, _area_tables_binning, _area_tables_raster
-from .area_interpolate import _check_presence_of_crs
\ No newline at end of file
+from .area_interpolate import _check_presence_of_crs
+from .area_interpolate_dask import area_interpolate_dask
\ No newline at end of file
diff --git a/tobler/area_weighted/area_interpolate_dask.py b/tobler/area_weighted/area_interpolate_dask.py
new file mode 100755
index 0000000..c0ae853
--- /dev/null
+++ b/tobler/area_weighted/area_interpolate_dask.py
@@ -0,0 +1,152 @@
+import pandas
+import geopandas
+import dask_geopandas
+import warnings
+import numpy as np
+from dask.base import tokenize
+from dask.highlevelgraph import HighLevelGraph
+from tobler.area_weighted import area_interpolate
+
+def area_interpolate_dask(  # chunk-wise areal interpolation from `left_dgdf` onto `right_dgdf`, merged by `id_col`
+    left_dgdf,  # source table; its partitions are passed as `source_df` to each per-chunk task
+    right_dgdf,  # target table; its partitions are passed as `target_df` to each per-chunk task
+    id_col,  # column of `right_dgdf` identifying target rows; used to group and join chunk results
+    extensive_variables=None,  # columns whose per-chunk estimates are summed by `id_col`
+    intensive_variables=None,  # not supported: any non-None value raises NotImplementedError
+    categorical_variables=None,  # columns expanded to `<variable>_<category>` outputs, summed by `id_col`
+):
+    if intensive_variables is not None:
+        raise NotImplementedError((
+            "Dask-based interpolation of intensive variables is "
+            "not implemented yet. Please remove intensive variables to "
+            "be able to run the rest."
+        ))
+    # Categoricals must be Dask's known categoricals: `.cat.categories` is read here to name the output columns
+    if categorical_variables is not None:
+        category_vars = []
+        for cat_var in categorical_variables:
+            var_names = [f'{cat_var}_{c}' for c in left_dgdf[cat_var].cat.categories]
+            category_vars.extend(var_names)
+    else:
+        category_vars = None
+    # Build one task per pair of left/right partitions whose spatial partitions intersect
+    dsk = {}
+    new_spatial_partitions = []
+    parts = geopandas.sjoin(
+        left_dgdf.spatial_partitions.to_frame('geometry'),
+        right_dgdf.spatial_partitions.to_frame('geometry'),
+        how='inner',
+        predicate='intersects'
+    )
+    parts_left = np.asarray(parts.index)  # left partition of each intersecting pair
+    parts_right = np.asarray(parts['index_right'].values)  # matching right partition of each pair
+    name = 'area_interpolate-' + tokenize(
+        right_dgdf, left_dgdf
+    )   # key prefix for the new tasks, derived from a token of both inputs
+    for i, (l, r) in enumerate(zip(parts_left, parts_right)):
+        dsk[(name, i)] = (
+            id_area_interpolate,
+            (left_dgdf._name, l),  # key of the left (source) partition
+            (right_dgdf._name, r),  # key of the right (target) partition
+            id_col,
+            extensive_variables,
+            intensive_variables,
+            None,  # table
+            True,  # allocate_total
+            'auto',  # spatial_index
+            1,  # n_jobs
+            categorical_variables,
+            category_vars
+        )
+        lr = left_dgdf.spatial_partitions.iloc[l]
+        rr = right_dgdf.spatial_partitions.iloc[r]
+        extent = lr.intersection(rr)  # extent of the output chunk: overlap of the two input partition extents
+        new_spatial_partitions.append(extent)
+    # Create geometries for new spatial partitions (CRS taken from the left table)
+    new_spatial_partitions = geopandas.GeoSeries(
+        data=new_spatial_partitions, crs=left_dgdf.crs
+    )
+    # Build Dask graph: the new tasks plus the graphs of both inputs as dependencies
+    graph = HighLevelGraph.from_collections(
+        name, dsk, dependencies=[left_dgdf, right_dgdf]
+    )
+    # Get metadata for the outcome table by running the per-chunk function on the inputs' `_meta` frames
+    meta = id_area_interpolate(
+        left_dgdf._meta,
+        right_dgdf._meta,
+        id_col,
+        extensive_variables=extensive_variables,
+        intensive_variables=intensive_variables,
+        table=None,
+        allocate_total=True,
+        spatial_index='auto',
+        n_jobs=1,
+        categorical_variables=categorical_variables,
+        category_vars=category_vars
+    )
+    # Build output table: one partition per task in `dsk`
+    transferred = dask_geopandas.GeoDataFrame(
+        graph, 
+        name,
+        meta,
+        [None] * (len(dsk) + 1),  # presumably the divisions argument (all unknown): one more entry than partitions - TODO confirm
+        new_spatial_partitions
+    )
+    # Merge chunks: start from the target IDs/geometries and join the aggregated estimates onto them
+    out = right_dgdf[[id_col, 'geometry']]
+    ## Extensive --> Add up estimates by ID
+    if extensive_variables is not None:
+        out_extensive = (
+            transferred
+            .groupby(id_col)
+            [extensive_variables]
+            .agg({v: 'sum' for v in extensive_variables})
+        )
+        out = out.join(out_extensive, on=id_col)
+    ## Intensive --> Weight by area of the chunk (Not implemented)
+    ## Categorical --> Add up proportions
+    if categorical_variables is not None:
+        out_categorical = (
+            transferred
+            [category_vars + [id_col]]
+            .groupby(id_col)
+            .agg({v: 'sum' for v in category_vars})
+        )    
+        out = out.join(out_categorical, on=id_col)
+    return out
+
+def id_area_interpolate(  # run `area_interpolate` on one source/target pair and tag the rows with the target ID
+    source_df,
+    target_df,
+    id_col,  # column of `target_df` copied onto the estimates
+    extensive_variables=None,
+    intensive_variables=None,
+    table=None,
+    allocate_total=True,
+    spatial_index='auto',
+    n_jobs=1,
+    categorical_variables=None,
+    category_vars=None  # full list of expected `<variable>_<category>` column names; read only when categoricals are given
+):
+    estimates = area_interpolate(
+        source_df,
+        target_df,
+        extensive_variables=extensive_variables,
+        intensive_variables=intensive_variables,
+        table=table,
+        allocate_total=allocate_total,
+        spatial_index=spatial_index,  # NOTE(review): needs an `area_interpolate` that accepts `spatial_index`; the notebook output in this patch records a TypeError for it - confirm the tobler version
+        n_jobs=n_jobs,
+        categorical_variables=categorical_variables,
+    )
+    estimates[id_col] = target_df[id_col].values  # carry the target IDs so chunk results can be grouped later
+    
+    if categorical_variables is not None:  # make sure every expected category column exists in this chunk's result
+        category_vars_to_add = []
+        for category_var in category_vars:
+            if category_var not in estimates.columns:
+                category_vars_to_add.append(category_var)
+        estimates = estimates.join(
+            pandas.DataFrame(index=estimates.index, columns=category_vars_to_add)  # no data passed, so the added columns hold missing values
+        )    
+    return estimates

From 8b4c28f5b50f249548dfa5a84d802fe3d5cd1322 Mon Sep 17 00:00:00 2001
From: Dani Arribas-Bel <daniel.arribas.bel@gmail.com>
Date: Thu, 10 Aug 2023 19:46:01 +0000
Subject: [PATCH 02/16] Merge init in AI

---
 tobler/area_weighted/__init__.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/tobler/area_weighted/__init__.py b/tobler/area_weighted/__init__.py
index 7b8ce0a..5c9e7e4 100644
--- a/tobler/area_weighted/__init__.py
+++ b/tobler/area_weighted/__init__.py
@@ -1,5 +1,4 @@
 from .area_interpolate import _area_interpolate_binning as area_interpolate
-from .area_interpolate import _area_interpolate as _slow_area_interpolate
-from .area_interpolate import _area_tables, _area_tables_binning, _area_tables_raster
-from .area_interpolate import _check_presence_of_crs
+from .area_interpolate import _area_tables_binning
+from .area_join import area_join
 from .area_interpolate_dask import area_interpolate_dask

From cc0dbf7dd865d998f7b4b6ab74ad97170098b742 Mon Sep 17 00:00:00 2001
From: Dani Arribas-Bel <daniel.arribas.bel@gmail.com>
Date: Fri, 11 Aug 2023 10:39:24 +0000
Subject: [PATCH 03/16] Add demo notebook and docs, remove extensive var
 support. Ready for review

---
 04_area_interpolate_dask.ipynb                | 640 +++++++++++++-----
 environment.yml                               |   3 +
 tobler/area_weighted/area_interpolate_dask.py | 131 +++-
 3 files changed, 593 insertions(+), 181 deletions(-)

diff --git a/04_area_interpolate_dask.ipynb b/04_area_interpolate_dask.ipynb
index c4ae195..9e88c68 100644
--- a/04_area_interpolate_dask.ipynb
+++ b/04_area_interpolate_dask.ipynb
@@ -19,7 +19,7 @@
   {
    "cell_type": "code",
    "execution_count": 1,
-   "id": "b70ac531-082c-4e77-9194-c5d4096b72ae",
+   "id": "4084f715-3989-4424-943a-2a4066a8bcf2",
    "metadata": {
     "tags": []
    },
@@ -28,9 +28,12 @@
     "import os\n",
     "os.environ['USE_PYGEOS'] = '1'\n",
     "\n",
+    "import pandas\n",
     "import geopandas\n",
     "import dask_geopandas\n",
     "import tobler\n",
+    "from libpysal.examples import load_example\n",
+    "import numpy as np\n",
     "\n",
     "from dask.distributed import Client, LocalCluster"
    ]
@@ -45,106 +48,82 @@
   },
   {
    "cell_type": "markdown",
-   "id": "e8858b38-0a72-4f72-98be-c490f8201d86",
+   "id": "080369e7-f3d4-41c6-a629-12ed458eb743",
    "metadata": {},
    "source": [
-    "We use the San Diego H3 dataset from the [GDS Book](https://geographicdata.science/book/data/h3_grid/build_sd_h3_grid.html):"
+    "Load example data from `pysal`:"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 2,
-   "id": "dc60a91c-0d58-4b19-b180-69692286c9a0",
-   "metadata": {
-    "tags": []
-   },
+   "id": "cb395dc5-67f2-462e-a1cf-919c8e6d0ae8",
+   "metadata": {},
    "outputs": [],
    "source": [
-    "h3 = geopandas.read_file((\n",
-    "    'https://geographicdata.science/book/'\n",
-    "    '_downloads/d740a1069144baa1302b9561c3d31afe/sd_h3_grid.gpkg'\n",
-    ")).to_crs(epsg=3310)"
+    "c1 = load_example('Charleston1')\n",
+    "c2 = load_example('Charleston2')\n",
+    "\n",
+    "crs = 6569  # https://epsg.io/6569\n",
+    "\n",
+    "tracts = geopandas.read_file(c1.get_path('sc_final_census2.shp')).to_crs(crs)\n",
+    "zip_codes = geopandas.read_file(c2.get_path('CharlestonMSA2.shp')).to_crs(crs)"
    ]
   },
   {
    "cell_type": "markdown",
-   "id": "41b31033-9711-4102-98af-66b3b6945bcb",
+   "id": "1d11c1d7-6435-40cb-a4d4-851f63eccf01",
    "metadata": {},
    "source": [
-    "And the Census tracts dataset, also from the same [source](https://geographicdata.science/book/data/sandiego/sandiego_tracts_cleaning.html):"
+    "We make up a categorical variable with four classes distributed randomly across the dataset:"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": 3,
-   "id": "271b9208-1ae7-41f0-8234-da1d1ed2030a",
+   "id": "3543702f-5e8a-4336-a14d-19a4eeb77b1b",
    "metadata": {
     "tags": []
    },
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "/Users/dani/mambaforge/envs/gds/lib/python3.10/site-packages/pygeos/set_operations.py:129: RuntimeWarning: invalid value encountered in intersection\n",
-      "  return lib.intersection(a, b, **kwargs)\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
-    "tracts = (\n",
-    "    geopandas.read_file((\n",
-    "        'https://geographicdata.science/book/'\n",
-    "        '_downloads/f2341ee89163afe06b42fc5d5ed38060/sandiego_tracts.gpkg'\n",
-    "    ))\n",
-    "    .to_crs(epsg=3310)\n",
-    "    .clip(h3)\n",
+    "rng = np.random.default_rng(seed=42)\n",
+    "\n",
+    "tracts['rando'] = pandas.Series(\n",
+    "    rng.integers(0, 4, len(tracts)), dtype='category'\n",
     ")"
    ]
   },
   {
    "cell_type": "markdown",
-   "id": "2ebddef3-ed10-4164-99bb-3ca07f3aa1de",
+   "id": "d2546bb7-abcb-4cad-8db8-c569ea9289ae",
    "metadata": {},
    "source": [
-    "Note in both cases we require a projected CRS and thus use the [NAD83/California Albers](https://epsg.io/3310)."
+    "We will set up a local Dask cluster so you can follow the computations on the dashboard (`http://localhost:8787` by default):"
    ]
   },
   {
-   "cell_type": "markdown",
-   "id": "d2546bb7-abcb-4cad-8db8-c569ea9289ae",
-   "metadata": {},
+   "cell_type": "code",
+   "execution_count": 13,
+   "id": "d65ac8ec-51e2-4d2d-abb2-96a7519ed749",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
    "source": [
-    "We will set up a local Dask cluster:"
+    "client = Client(LocalCluster(n_workers=8))"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
-   "id": "d65ac8ec-51e2-4d2d-abb2-96a7519ed749",
+   "execution_count": 12,
+   "id": "69f06d42-f47f-4120-811b-275431b1cf3a",
    "metadata": {
     "tags": []
    },
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "2023-08-10 15:19:49,969 - distributed.diskutils - INFO - Found stale lock file and directory '/var/folders/dz/5zvlmz1s0739pm0wx2ryxjf00000gn/T/dask-worker-space/worker-taj3n78d', purging\n",
-      "2023-08-10 15:19:49,970 - distributed.diskutils - INFO - Found stale lock file and directory '/var/folders/dz/5zvlmz1s0739pm0wx2ryxjf00000gn/T/dask-worker-space/worker-283edkzp', purging\n",
-      "2023-08-10 15:19:49,970 - distributed.diskutils - INFO - Found stale lock file and directory '/var/folders/dz/5zvlmz1s0739pm0wx2ryxjf00000gn/T/dask-worker-space/worker-5sq_o8d_', purging\n",
-      "2023-08-10 15:19:49,971 - distributed.diskutils - INFO - Found stale lock file and directory '/var/folders/dz/5zvlmz1s0739pm0wx2ryxjf00000gn/T/dask-worker-space/worker-8pnf3b0w', purging\n",
-      "2023-08-10 15:19:49,972 - distributed.diskutils - INFO - Found stale lock file and directory '/var/folders/dz/5zvlmz1s0739pm0wx2ryxjf00000gn/T/dask-worker-space/worker-0kn_mkzc', purging\n",
-      "2023-08-10 15:19:49,973 - distributed.diskutils - INFO - Found stale lock file and directory '/var/folders/dz/5zvlmz1s0739pm0wx2ryxjf00000gn/T/dask-worker-space/worker-kkw_crni', purging\n",
-      "2023-08-10 15:19:49,974 - distributed.diskutils - INFO - Found stale lock file and directory '/var/folders/dz/5zvlmz1s0739pm0wx2ryxjf00000gn/T/dask-worker-space/worker-8ycp859d', purging\n",
-      "2023-08-10 15:19:49,974 - distributed.diskutils - INFO - Found stale lock file and directory '/var/folders/dz/5zvlmz1s0739pm0wx2ryxjf00000gn/T/dask-worker-space/worker-fb2nvcnm', purging\n",
-      "2023-08-10 15:19:49,974 - distributed.diskutils - INFO - Found stale lock file and directory '/var/folders/dz/5zvlmz1s0739pm0wx2ryxjf00000gn/T/dask-worker-space/worker-mpy0p1sd', purging\n",
-      "2023-08-10 15:19:49,975 - distributed.diskutils - INFO - Found stale lock file and directory '/var/folders/dz/5zvlmz1s0739pm0wx2ryxjf00000gn/T/dask-worker-space/worker-y5wx5vyl', purging\n"
-     ]
-    }
-   ],
+   "outputs": [],
    "source": [
-    "client = Client(LocalCluster(n_workers=10))"
+    "client.shutdown()"
    ]
   },
   {
@@ -158,24 +137,17 @@
   {
    "cell_type": "code",
    "execution_count": 5,
-   "id": "7fef3124-a5d9-4712-bf9e-53fdf344c37f",
-   "metadata": {
-    "tags": []
-   },
+   "id": "a31a1a91-4071-40e2-a21f-7e035d734976",
+   "metadata": {},
    "outputs": [],
    "source": [
-    "tracts['sub_30'] = tracts['sub_30'].astype('category')\n",
-    "tracts['tract'] = tracts['tract'].astype('category')\n",
-    "\n",
     "dtracts = (\n",
-    "    dask_geopandas.from_geopandas(tracts[\n",
-    "        ['geometry', 'sub_30', 'tract', 'total_pop', 'total_pop_white']\n",
-    "    ], npartitions=10)\n",
+    "    dask_geopandas.from_geopandas(tracts[['geometry', 'rando']], npartitions=4)\n",
     "    .spatial_shuffle(by='hilbert', shuffle=\"tasks\")\n",
     ")\n",
     "\n",
-    "dh3 = (\n",
-    "    dask_geopandas.from_geopandas(h3, npartitions=10)\n",
+    "dzips = (\n",
+    "    dask_geopandas.from_geopandas(zip_codes[['ZIP', 'geometry']], npartitions=4)\n",
     "    .spatial_shuffle(by='hilbert', shuffle=\"tasks\")\n",
     ")"
    ]
@@ -187,7 +159,7 @@
    "source": [
     "---\n",
     "\n",
-    "**IMPORTANT** - At this point, only *extensive* and *categorical* variables are implemented, so those are what we will test.\n",
+    "**IMPORTANT** - At this point, only *categorical* variables are implemented, so those are what we will test.\n",
     "\n",
     "---"
    ]
@@ -202,212 +174,554 @@
   },
   {
    "cell_type": "markdown",
-   "id": "958a9509-2666-4cf5-88f0-3e22ab8d8eac",
+   "id": "92dafb11-ec94-43c2-baec-2a5e2a0b380d",
+   "metadata": {},
+   "source": [
+    "- Single core"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "4d4cde6d-73c1-4197-86ed-131724e21296",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "cat_sc = tobler.area_weighted.area_interpolate(\n",
+    "    tracts, zip_codes, categorical_variables=['rando']\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "2982d8dc-c1e9-4927-8643-9900b1b09890",
+   "metadata": {},
+   "source": [
+    "- Dask"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "d8c7896f-9004-4a07-b3ba-75301f8120e5",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [],
+   "source": [
+    "cat_dk = tobler.area_weighted.area_interpolate_dask(\n",
+    "    dtracts, dzips, 'ZIP', categorical_variables=['rando']\n",
+    ").compute()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "5e19b8dd-505f-4dc1-ba85-9fd825e59b43",
    "metadata": {},
    "source": [
-    "### Extensive\n",
+    "And we can check that both results are the same:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "8bc830b2-99a7-4c11-a8d9-0fad3aefcf06",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "rando_0    4.188295e-08\n",
+       "rando_1    5.328575e-08\n",
+       "rando_2    5.396667e-08\n",
+       "rando_3    2.935173e-08\n",
+       "dtype: float64"
+      ]
+     },
+     "execution_count": 8,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "a = (\n",
+    "    cat_dk\n",
+    "    .set_index('ZIP')\n",
+    "    .reindex(zip_codes['ZIP'].values)\n",
+    "    .drop(columns='geometry')\n",
+    ")\n",
     "\n",
-    "Here we transfer the total population from `tracts` to `h3`."
+    "b = (\n",
+    "    cat_sc\n",
+    "    .drop(columns='geometry')\n",
+    "    [['rando_0', 'rando_1', 'rando_2', 'rando_3']]\n",
+    ")\n",
+    "b.index = a.index\n",
+    "\n",
+    "(a - b).max()"
    ]
   },
   {
    "cell_type": "markdown",
-   "id": "c0e1d903-1e6f-446b-9d46-93cbdce3bc76",
+   "id": "e2e04df1-3331-449c-b74c-e910239c3067",
    "metadata": {},
    "source": [
-    "First, we transfer with the single-core approach:"
+    "The differences in the estimates for the proportions of each area start at the 8th decimal, and are thus likely rounding errors derived from the different approaches used to compute the interpolation (the single-core version does it in one shot, while Dask computes parts and brings them together later with a sum)."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "1debbdf4-892f-4fda-834a-0403595794ef",
+   "metadata": {},
+   "source": [
+    "## Performance\n",
+    "\n",
+    "---\n",
+    "\n",
+    "**NOTE** - Timings below do _not_ include the computation time required for spatial shuffling and partitioning (which can be substantial with large datasets), or for converting from `geopandas`. These are \"sunk costs\" that will only make this approach preferable with large datasets, although they can be computed once and the result stored on disk efficiently (e.g., as Parquet files). Having said that, the size at which a dataset becomes \"large enough\" is not very large in modern terms: from a few thousand observations, the gains will be substantial if several cores/workers are available.\n",
+    "\n",
+    "---"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "e5242c13-c4cd-46e2-9131-ec1734bcc142",
+   "metadata": {},
+   "source": [
+    "We can now time the example above:\n"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
-   "id": "2d5dd9e8-4c55-43d0-9730-4b1f0826305f",
+   "execution_count": 9,
+   "id": "902e494b-65ba-4fa2-99e6-eb3a513769f8",
    "metadata": {
     "tags": []
    },
    "outputs": [
     {
-     "name": "stderr",
+     "name": "stdout",
      "output_type": "stream",
      "text": [
-      "/Users/dani/mambaforge/envs/gds/lib/python3.10/site-packages/shapely/predicates.py:798: RuntimeWarning: invalid value encountered in intersects\n",
-      "  return lib.intersects(a, b, **kwargs)\n",
-      "/Users/dani/mambaforge/envs/gds/lib/python3.10/site-packages/shapely/set_operations.py:133: RuntimeWarning: invalid value encountered in intersection\n",
-      "  return lib.intersection(a, b, **kwargs)\n"
+      "85.5 ms ± 4.17 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n"
      ]
     }
    ],
    "source": [
-    "ext_sc = tobler.area_weighted.area_interpolate(\n",
-    "    tracts, h3, extensive_variables=['total_pop', 'total_pop_white']\n",
+    "%%timeit\n",
+    "cat_sc = tobler.area_weighted.area_interpolate(\n",
+    "    tracts, zip_codes, categorical_variables=['rando']\n",
     ")"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "5cfc44d9-f79a-4b8e-9caa-975ea64d5f0e",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "444 ms ± 2.69 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
+     ]
+    }
+   ],
+   "source": [
+    "%%timeit\n",
+    "cat_dk = tobler.area_weighted.area_interpolate_dask(\n",
+    "    dtracts, dzips, 'ZIP', categorical_variables=['rando']\n",
+    ").compute()"
+   ]
+  },
   {
    "cell_type": "markdown",
-   "id": "b7604a13-770d-45d3-bd36-23464ff39138",
+   "id": "a124ee86-c527-4386-be8d-2dc833270fd9",
    "metadata": {},
    "source": [
-    "Then we perform the same operation using Dask:"
+    "This is notably slower (about 5x!). For such a small dataset, the overhead of distributing computations and collecting them outweighs any gains from parallelism.\n",
+    "\n",
+    "Now we can artificially increase the size of the datasets by concatenating them several times and re-computing (this time we only time one execution):"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
-   "id": "0a84dc41-f128-4e4f-98da-44c7729b73a9",
+   "execution_count": 24,
+   "id": "5f56d579-0022-45c2-845c-f351bf96ed01",
    "metadata": {
-    "collapsed": true,
-    "jupyter": {
-     "outputs_hidden": true
-    },
     "tags": []
    },
    "outputs": [
     {
-     "name": "stderr",
+     "name": "stdout",
      "output_type": "stream",
      "text": [
-      "/Users/dani/mambaforge/envs/gds/lib/python3.10/site-packages/shapely/set_operations.py:133: RuntimeWarning: invalid value encountered in intersection\n",
-      "  return lib.intersection(a, b, **kwargs)\n"
+      "40x increase | N. tracts: 4680 | N. ZIPs: 1680\n"
      ]
     },
     {
-     "ename": "TypeError",
-     "evalue": "_area_interpolate_binning() got an unexpected keyword argument 'spatial_index'",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[0;31mTypeError\u001b[0m                                 Traceback (most recent call last)",
-      "Cell \u001b[0;32mIn[7], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m ext_dk \u001b[38;5;241m=\u001b[39m \u001b[43mtobler\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43marea_weighted\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43marea_interpolate_dask\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m      2\u001b[0m \u001b[43m    \u001b[49m\u001b[43mdtracts\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdh3\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mhex_id\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mextensive_variables\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mtotal_pop\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mtotal_pop_white\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\n\u001b[1;32m      3\u001b[0m \u001b[43m)\u001b[49m\n",
-      "File \u001b[0;32m~/code/tobler_darribas/tobler/area_weighted/area_interpolate_dask.py:74\u001b[0m, in \u001b[0;36marea_interpolate_dask\u001b[0;34m(left_dgdf, right_dgdf, id_col, extensive_variables, intensive_variables, categorical_variables)\u001b[0m\n\u001b[1;32m     70\u001b[0m graph \u001b[38;5;241m=\u001b[39m HighLevelGraph\u001b[38;5;241m.\u001b[39mfrom_collections(\n\u001b[1;32m     71\u001b[0m     name, dsk, dependencies\u001b[38;5;241m=\u001b[39m[left_dgdf, right_dgdf]\n\u001b[1;32m     72\u001b[0m )\n\u001b[1;32m     73\u001b[0m \u001b[38;5;66;03m# Get metadata for the outcome table\u001b[39;00m\n\u001b[0;32m---> 74\u001b[0m meta \u001b[38;5;241m=\u001b[39m \u001b[43mid_area_interpolate\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m     75\u001b[0m \u001b[43m    \u001b[49m\u001b[43mleft_dgdf\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_meta\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m     76\u001b[0m \u001b[43m    \u001b[49m\u001b[43mright_dgdf\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_meta\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m     77\u001b[0m \u001b[43m    \u001b[49m\u001b[43mid_col\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m     78\u001b[0m \u001b[43m    \u001b[49m\u001b[43mextensive_variables\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mextensive_variables\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m     79\u001b[0m \u001b[43m    \u001b[49m\u001b[43mintensive_variables\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mintensive_variables\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m     80\u001b[0m \u001b[43m    \u001b[49m\u001b[43mtable\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m     81\u001b[0m \u001b[43m    \u001b[49m\u001b[43mallocate_total\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m     82\u001b[0m \u001b[43m    
\u001b[49m\u001b[43mspatial_index\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mauto\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m     83\u001b[0m \u001b[43m    \u001b[49m\u001b[43mn_jobs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m     84\u001b[0m \u001b[43m    \u001b[49m\u001b[43mcategorical_variables\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcategorical_variables\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m     85\u001b[0m \u001b[43m    \u001b[49m\u001b[43mcategory_vars\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcategory_vars\u001b[49m\n\u001b[1;32m     86\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m     87\u001b[0m \u001b[38;5;66;03m# Build output table\u001b[39;00m\n\u001b[1;32m     88\u001b[0m transferred \u001b[38;5;241m=\u001b[39m dask_geopandas\u001b[38;5;241m.\u001b[39mGeoDataFrame(\n\u001b[1;32m     89\u001b[0m     graph, \n\u001b[1;32m     90\u001b[0m     name,\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m     93\u001b[0m     new_spatial_partitions\n\u001b[1;32m     94\u001b[0m )\n",
-      "File \u001b[0;32m~/code/tobler_darribas/tobler/area_weighted/area_interpolate_dask.py:131\u001b[0m, in \u001b[0;36mid_area_interpolate\u001b[0;34m(source_df, target_df, id_col, extensive_variables, intensive_variables, table, allocate_total, spatial_index, n_jobs, categorical_variables, category_vars)\u001b[0m\n\u001b[1;32m    118\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mid_area_interpolate\u001b[39m(\n\u001b[1;32m    119\u001b[0m     source_df,\n\u001b[1;32m    120\u001b[0m     target_df,\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m    129\u001b[0m     category_vars\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m    130\u001b[0m ):\n\u001b[0;32m--> 131\u001b[0m     estimates \u001b[38;5;241m=\u001b[39m \u001b[43marea_interpolate\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m    132\u001b[0m \u001b[43m        \u001b[49m\u001b[43msource_df\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    133\u001b[0m \u001b[43m        \u001b[49m\u001b[43mtarget_df\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    134\u001b[0m \u001b[43m        \u001b[49m\u001b[43mextensive_variables\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mextensive_variables\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    135\u001b[0m \u001b[43m        \u001b[49m\u001b[43mintensive_variables\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mintensive_variables\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    136\u001b[0m \u001b[43m        \u001b[49m\u001b[43mtable\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtable\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    137\u001b[0m \u001b[43m        \u001b[49m\u001b[43mallocate_total\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mallocate_total\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    138\u001b[0m \u001b[43m        \u001b[49m\u001b[43mspatial_index\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mspatial_index\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    139\u001b[0m \u001b[43m        
\u001b[49m\u001b[43mn_jobs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mn_jobs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    140\u001b[0m \u001b[43m        \u001b[49m\u001b[43mcategorical_variables\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcategorical_variables\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    141\u001b[0m \u001b[43m    \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    142\u001b[0m     estimates[id_col] \u001b[38;5;241m=\u001b[39m target_df[id_col]\u001b[38;5;241m.\u001b[39mvalues\n\u001b[1;32m    144\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m categorical_variables \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n",
-      "\u001b[0;31mTypeError\u001b[0m: _area_interpolate_binning() got an unexpected keyword argument 'spatial_index'"
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/opt/conda/envs/tobler/lib/python3.11/site-packages/distributed/client.py:3161: UserWarning: Sending large graph of size 30.18 MiB.\n",
+      "This may cause some slowdown.\n",
+      "Consider scattering data ahead of time and using futures.\n",
+      "  warnings.warn(\n",
+      "/opt/conda/envs/tobler/lib/python3.11/site-packages/distributed/client.py:3161: UserWarning: Sending large graph of size 30.17 MiB.\n",
+      "This may cause some slowdown.\n",
+      "Consider scattering data ahead of time and using futures.\n",
+      "  warnings.warn(\n"
      ]
     }
    ],
    "source": [
-    "ext_dk = tobler.area_weighted.area_interpolate_dask(\n",
-    "    dtracts, dh3, 'hex_id', extensive_variables=['total_pop', 'total_pop_white']\n",
+    "sizeup = 40\n",
+    "tracts_lrg = pandas.concat([tracts] * sizeup)\n",
+    "zips_lrg = pandas.concat([zip_codes] * sizeup)\n",
+    "print(\n",
+    "    f'{sizeup}x increase | N. tracts: {len(tracts_lrg)} | N. ZIPs: {len(zips_lrg)}'\n",
+    ")\n",
+    "\n",
+    "dtracts_lrg = (\n",
+    "    dask_geopandas.from_geopandas(tracts_lrg[['geometry', 'rando']], chunksize=800)\n",
+    "    .spatial_shuffle(by='hilbert', shuffle=\"tasks\")\n",
+    ")\n",
+    "\n",
+    "dzips_lrg = (\n",
+    "    dask_geopandas.from_geopandas(zips_lrg[['ZIP', 'geometry']], chunksize=800)\n",
+    "    .spatial_shuffle(by='hilbert', shuffle=\"tasks\")\n",
     ")"
    ]
   },
   {
    "cell_type": "markdown",
-   "id": "db296055-3865-43f8-bfd0-0ea40f246ba7",
+   "id": "e5187109-ba95-4b5f-b373-2ec4745d0289",
    "metadata": {},
    "source": [
-    "### Categorical"
+    "And re-compute the timings:"
    ]
   },
   {
    "cell_type": "markdown",
-   "id": "8cf00b26-765c-40ce-a78a-dcfce4838c88",
-   "metadata": {},
+   "id": "c0da372a-f791-47fb-ade0-317a1cf6ff9c",
+   "metadata": {
+    "jp-MarkdownHeadingCollapsed": true,
+    "tags": []
+   },
    "source": [
-    "Single-core:"
+    "---\n",
+    "\n",
+    "### 10x"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
-   "id": "cbb3dbbd-70e4-4c3d-935e-6a6b60341f7c",
+   "execution_count": 14,
+   "id": "620cf9ab-7b9e-4458-809c-c7a73d13f26c",
    "metadata": {
-    "collapsed": true,
-    "jupyter": {
-     "outputs_hidden": true
-    },
     "tags": []
    },
    "outputs": [
     {
-     "name": "stderr",
+     "name": "stdout",
      "output_type": "stream",
      "text": [
-      "/Users/dani/mambaforge/envs/gds/lib/python3.10/site-packages/shapely/predicates.py:798: RuntimeWarning: invalid value encountered in intersects\n",
-      "  return lib.intersects(a, b, **kwargs)\n",
-      "/Users/dani/mambaforge/envs/gds/lib/python3.10/site-packages/shapely/set_operations.py:133: RuntimeWarning: invalid value encountered in intersection\n",
-      "  return lib.intersection(a, b, **kwargs)\n"
+      "Computing for a sizeup of 10x\n",
+      "CPU times: user 7.21 s, sys: 11.3 ms, total: 7.23 s\n",
+      "Wall time: 6.95 s\n"
      ]
-    },
+    }
+   ],
+   "source": [
+    "%%time\n",
+    "print(f'Computing for a sizeup of {sizeup}x')\n",
+    "cat_sc = tobler.area_weighted.area_interpolate(\n",
+    "    tracts_lrg, zips_lrg, categorical_variables=['rando']\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "id": "c615b27a-e004-429b-a0c5-e4b237516f9f",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
     {
-     "ename": "TypeError",
-     "evalue": "Object with dtype category cannot perform the numpy op isnan",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[0;31mTypeError\u001b[0m                                 Traceback (most recent call last)",
-      "Cell \u001b[0;32mIn[8], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m cat_sc \u001b[38;5;241m=\u001b[39m \u001b[43mtobler\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43marea_weighted\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43marea_interpolate\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m      2\u001b[0m \u001b[43m    \u001b[49m\u001b[43mtracts\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mh3\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mextensive_variables\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43msub_30\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mtract\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\n\u001b[1;32m      3\u001b[0m \u001b[43m)\u001b[49m\n",
-      "File \u001b[0;32m~/code/tobler_darribas/tobler/area_weighted/area_interpolate.py:267\u001b[0m, in \u001b[0;36m_area_interpolate_binning\u001b[0;34m(source_df, target_df, extensive_variables, intensive_variables, table, allocate_total)\u001b[0m\n\u001b[1;32m    265\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m extensive_variables:\n\u001b[1;32m    266\u001b[0m     \u001b[38;5;28;01mfor\u001b[39;00m variable \u001b[38;5;129;01min\u001b[39;00m extensive_variables:\n\u001b[0;32m--> 267\u001b[0m         vals \u001b[38;5;241m=\u001b[39m \u001b[43m_nan_check\u001b[49m\u001b[43m(\u001b[49m\u001b[43msource_df\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mvariable\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    268\u001b[0m         vals \u001b[38;5;241m=\u001b[39m _inf_check(source_df, variable)\n\u001b[1;32m    269\u001b[0m         estimates \u001b[38;5;241m=\u001b[39m diags([vals], [\u001b[38;5;241m0\u001b[39m])\u001b[38;5;241m.\u001b[39mdot(weights)\n",
-      "File \u001b[0;32m~/code/tobler_darribas/tobler/util/util.py:25\u001b[0m, in \u001b[0;36m_nan_check\u001b[0;34m(df, column)\u001b[0m\n\u001b[1;32m     20\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"Check if variable has nan values.\u001b[39;00m\n\u001b[1;32m     21\u001b[0m \n\u001b[1;32m     22\u001b[0m \u001b[38;5;124;03mWarn and replace nan with 0.0.\u001b[39;00m\n\u001b[1;32m     23\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m     24\u001b[0m values \u001b[38;5;241m=\u001b[39m df[column]\u001b[38;5;241m.\u001b[39mvalues\n\u001b[0;32m---> 25\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m np\u001b[38;5;241m.\u001b[39many(\u001b[43mnp\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43misnan\u001b[49m\u001b[43m(\u001b[49m\u001b[43mvalues\u001b[49m\u001b[43m)\u001b[49m) \u001b[38;5;129;01mor\u001b[39;00m np\u001b[38;5;241m.\u001b[39many(np\u001b[38;5;241m.\u001b[39misinf(values)):\n\u001b[1;32m     26\u001b[0m     wherenan \u001b[38;5;241m=\u001b[39m np\u001b[38;5;241m.\u001b[39misnan(values)\n\u001b[1;32m     27\u001b[0m     values[wherenan] \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0.0\u001b[39m\n",
-      "File \u001b[0;32m~/mambaforge/envs/gds/lib/python3.10/site-packages/pandas/core/arrays/categorical.py:1639\u001b[0m, in \u001b[0;36mCategorical.__array_ufunc__\u001b[0;34m(self, ufunc, method, *inputs, **kwargs)\u001b[0m\n\u001b[1;32m   1635\u001b[0m         \u001b[38;5;28;01mreturn\u001b[39;00m result\n\u001b[1;32m   1637\u001b[0m \u001b[38;5;66;03m# for all other cases, raise for now (similarly as what happens in\u001b[39;00m\n\u001b[1;32m   1638\u001b[0m \u001b[38;5;66;03m# Series.__array_prepare__)\u001b[39;00m\n\u001b[0;32m-> 1639\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mTypeError\u001b[39;00m(\n\u001b[1;32m   1640\u001b[0m     \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mObject with dtype \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdtype\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m cannot perform \u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m   1641\u001b[0m     \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mthe numpy op \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mufunc\u001b[38;5;241m.\u001b[39m\u001b[38;5;18m__name__\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m   1642\u001b[0m )\n",
-      "\u001b[0;31mTypeError\u001b[0m: Object with dtype category cannot perform the numpy op isnan"
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Computing for a sizeup of 10x\n",
+      "CPU times: user 548 ms, sys: 18 ms, total: 566 ms\n",
+      "Wall time: 3.56 s\n"
      ]
     }
    ],
    "source": [
+    "%%time\n",
+    "print(f'Computing for a sizeup of {sizeup}x')\n",
+    "cat_dk = tobler.area_weighted.area_interpolate_dask(\n",
+    "    dtracts_lrg, dzips_lrg, 'ZIP', categorical_variables=['rando']\n",
+    ").compute()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "cc13af25-e97e-4b34-bb1f-bb946c15748e",
+   "metadata": {
+    "jp-MarkdownHeadingCollapsed": true,
+    "tags": []
+   },
+   "source": [
+    "---\n",
+    "\n",
+    "### 20x"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "id": "8dbb40d4-4b3b-446d-9d1b-99462a122d6e",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Computing for a sizeup of 20x\n",
+      "CPU times: user 28.6 s, sys: 26.1 ms, total: 28.7 s\n",
+      "Wall time: 27.6 s\n"
+     ]
+    }
+   ],
+   "source": [
+    "%%time\n",
+    "print(f'Computing for a sizeup of {sizeup}x')\n",
     "cat_sc = tobler.area_weighted.area_interpolate(\n",
-    "    tracts, h3, extensive_variables=['sub_30', 'tract']\n",
+    "    tracts_lrg, zips_lrg, categorical_variables=['rando']\n",
     ")"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "id": "f2ca1394-5f8d-428f-a61c-87beb8778322",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Computing for a sizeup of 20x\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/opt/conda/envs/tobler/lib/python3.11/site-packages/distributed/client.py:3161: UserWarning: Sending large graph of size 16.77 MiB.\n",
+      "This may cause some slowdown.\n",
+      "Consider scattering data ahead of time and using futures.\n",
+      "  warnings.warn(\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "CPU times: user 1.32 s, sys: 65.3 ms, total: 1.38 s\n",
+      "Wall time: 9.86 s\n"
+     ]
+    }
+   ],
+   "source": [
+    "%%time\n",
+    "print(f'Computing for a sizeup of {sizeup}x')\n",
+    "cat_dk = tobler.area_weighted.area_interpolate_dask(\n",
+    "    dtracts_lrg, dzips_lrg, 'ZIP', categorical_variables=['rando']\n",
+    ").compute()"
+   ]
+  },
   {
    "cell_type": "markdown",
-   "id": "76507c2b-769a-4b80-8b62-40206e2cab42",
-   "metadata": {},
+   "id": "335b34b4-9fea-48a6-b38b-8b1a5d755ca1",
+   "metadata": {
+    "jp-MarkdownHeadingCollapsed": true,
+    "tags": []
+   },
    "source": [
-    "And through Dask:"
+    "---\n",
+    "\n",
+    "### 30x"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
-   "id": "7107527b-88cc-4e9c-97d2-72a1d153c657",
+   "execution_count": 26,
+   "id": "1598ce3f-d21e-4a60-9619-ee5b1eb4932f",
    "metadata": {
     "tags": []
    },
    "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Computing for a sizeup of 30x\n",
+      "CPU times: user 1min 4s, sys: 176 ms, total: 1min 4s\n",
+      "Wall time: 1min 1s\n"
+     ]
+    }
+   ],
+   "source": [
+    "%%time\n",
+    "print(f'Computing for a sizeup of {sizeup}x')\n",
+    "cat_sc = tobler.area_weighted.area_interpolate(\n",
+    "    tracts_lrg, zips_lrg, categorical_variables=['rando']\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "224ffbca-7690-4b20-bad2-efbf042623a9",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Computing for a sizeup of 30x\n"
+     ]
+    },
     {
      "name": "stderr",
      "output_type": "stream",
      "text": [
-      "/Users/dani/mambaforge/envs/gds/lib/python3.10/site-packages/shapely/set_operations.py:133: RuntimeWarning: invalid value encountered in intersection\n",
-      "  return lib.intersection(a, b, **kwargs)\n"
+      "/opt/conda/envs/tobler/lib/python3.11/site-packages/distributed/client.py:3161: UserWarning: Sending large graph of size 25.14 MiB.\n",
+      "This may cause some slowdown.\n",
+      "Consider scattering data ahead of time and using futures.\n",
+      "  warnings.warn(\n"
      ]
     },
     {
-     "ename": "TypeError",
-     "evalue": "_area_interpolate_binning() got an unexpected keyword argument 'spatial_index'",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[0;31mTypeError\u001b[0m                                 Traceback (most recent call last)",
-      "Cell \u001b[0;32mIn[9], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m ext_dk \u001b[38;5;241m=\u001b[39m \u001b[43mtobler\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43marea_weighted\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43marea_interpolate_dask\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m      2\u001b[0m \u001b[43m    \u001b[49m\u001b[43mdtracts\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mdh3\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mhex_id\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mextensive_variables\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43msub_30\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mtract\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\n\u001b[1;32m      3\u001b[0m \u001b[43m)\u001b[49m\n",
-      "File \u001b[0;32m~/code/tobler_darribas/tobler/area_weighted/area_interpolate_dask.py:74\u001b[0m, in \u001b[0;36marea_interpolate_dask\u001b[0;34m(left_dgdf, right_dgdf, id_col, extensive_variables, intensive_variables, categorical_variables)\u001b[0m\n\u001b[1;32m     70\u001b[0m graph \u001b[38;5;241m=\u001b[39m HighLevelGraph\u001b[38;5;241m.\u001b[39mfrom_collections(\n\u001b[1;32m     71\u001b[0m     name, dsk, dependencies\u001b[38;5;241m=\u001b[39m[left_dgdf, right_dgdf]\n\u001b[1;32m     72\u001b[0m )\n\u001b[1;32m     73\u001b[0m \u001b[38;5;66;03m# Get metadata for the outcome table\u001b[39;00m\n\u001b[0;32m---> 74\u001b[0m meta \u001b[38;5;241m=\u001b[39m \u001b[43mid_area_interpolate\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m     75\u001b[0m \u001b[43m    \u001b[49m\u001b[43mleft_dgdf\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_meta\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m     76\u001b[0m \u001b[43m    \u001b[49m\u001b[43mright_dgdf\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_meta\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m     77\u001b[0m \u001b[43m    \u001b[49m\u001b[43mid_col\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m     78\u001b[0m \u001b[43m    \u001b[49m\u001b[43mextensive_variables\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mextensive_variables\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m     79\u001b[0m \u001b[43m    \u001b[49m\u001b[43mintensive_variables\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mintensive_variables\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m     80\u001b[0m \u001b[43m    \u001b[49m\u001b[43mtable\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m     81\u001b[0m \u001b[43m    \u001b[49m\u001b[43mallocate_total\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43;01mTrue\u001b[39;49;00m\u001b[43m,\u001b[49m\n\u001b[1;32m     82\u001b[0m \u001b[43m    
\u001b[49m\u001b[43mspatial_index\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mauto\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m     83\u001b[0m \u001b[43m    \u001b[49m\u001b[43mn_jobs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;241;43m1\u001b[39;49m\u001b[43m,\u001b[49m\n\u001b[1;32m     84\u001b[0m \u001b[43m    \u001b[49m\u001b[43mcategorical_variables\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcategorical_variables\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m     85\u001b[0m \u001b[43m    \u001b[49m\u001b[43mcategory_vars\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcategory_vars\u001b[49m\n\u001b[1;32m     86\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m     87\u001b[0m \u001b[38;5;66;03m# Build output table\u001b[39;00m\n\u001b[1;32m     88\u001b[0m transferred \u001b[38;5;241m=\u001b[39m dask_geopandas\u001b[38;5;241m.\u001b[39mGeoDataFrame(\n\u001b[1;32m     89\u001b[0m     graph, \n\u001b[1;32m     90\u001b[0m     name,\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m     93\u001b[0m     new_spatial_partitions\n\u001b[1;32m     94\u001b[0m )\n",
-      "File \u001b[0;32m~/code/tobler_darribas/tobler/area_weighted/area_interpolate_dask.py:131\u001b[0m, in \u001b[0;36mid_area_interpolate\u001b[0;34m(source_df, target_df, id_col, extensive_variables, intensive_variables, table, allocate_total, spatial_index, n_jobs, categorical_variables, category_vars)\u001b[0m\n\u001b[1;32m    118\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mid_area_interpolate\u001b[39m(\n\u001b[1;32m    119\u001b[0m     source_df,\n\u001b[1;32m    120\u001b[0m     target_df,\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m    129\u001b[0m     category_vars\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m    130\u001b[0m ):\n\u001b[0;32m--> 131\u001b[0m     estimates \u001b[38;5;241m=\u001b[39m \u001b[43marea_interpolate\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m    132\u001b[0m \u001b[43m        \u001b[49m\u001b[43msource_df\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    133\u001b[0m \u001b[43m        \u001b[49m\u001b[43mtarget_df\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    134\u001b[0m \u001b[43m        \u001b[49m\u001b[43mextensive_variables\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mextensive_variables\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    135\u001b[0m \u001b[43m        \u001b[49m\u001b[43mintensive_variables\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mintensive_variables\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    136\u001b[0m \u001b[43m        \u001b[49m\u001b[43mtable\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtable\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    137\u001b[0m \u001b[43m        \u001b[49m\u001b[43mallocate_total\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mallocate_total\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    138\u001b[0m \u001b[43m        \u001b[49m\u001b[43mspatial_index\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mspatial_index\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    139\u001b[0m \u001b[43m        
\u001b[49m\u001b[43mn_jobs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mn_jobs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    140\u001b[0m \u001b[43m        \u001b[49m\u001b[43mcategorical_variables\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcategorical_variables\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m    141\u001b[0m \u001b[43m    \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    142\u001b[0m     estimates[id_col] \u001b[38;5;241m=\u001b[39m target_df[id_col]\u001b[38;5;241m.\u001b[39mvalues\n\u001b[1;32m    144\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m categorical_variables \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n",
-      "\u001b[0;31mTypeError\u001b[0m: _area_interpolate_binning() got an unexpected keyword argument 'spatial_index'"
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "CPU times: user 1.91 s, sys: 58.8 ms, total: 1.97 s\n",
+      "Wall time: 14.6 s\n"
      ]
     }
    ],
    "source": [
-    "ext_dk = tobler.area_weighted.area_interpolate_dask(\n",
-    "    dtracts, dh3, 'hex_id', extensive_variables=['sub_30', 'tract']\n",
-    ")"
+    "%%time\n",
+    "print(f'Computing for a sizeup of {sizeup}x')\n",
+    "cat_dk = tobler.area_weighted.area_interpolate_dask(\n",
+    "    dtracts_lrg, dzips_lrg, 'ZIP', categorical_variables=['rando']\n",
+    ").compute()"
    ]
   },
   {
    "cell_type": "markdown",
-   "id": "571e7878-25ad-49bc-a7a5-a632988f6a4b",
-   "metadata": {},
+   "id": "b004834f-c5ce-4f92-be9a-364a07c7996b",
+   "metadata": {
+    "jp-MarkdownHeadingCollapsed": true,
+    "tags": []
+   },
+   "source": [
+    "---\n",
+    "\n",
+    "### 40x"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "id": "b6b9d06a-9034-4c39-b3a9-92fc6408d5c6",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Computing for a sizeup of 40x\n",
+      "CPU times: user 2min 2s, sys: 1.71 s, total: 2min 3s\n",
+      "Wall time: 1min 53s\n"
+     ]
+    }
+   ],
+   "source": [
+    "%%time\n",
+    "print(f'Computing for a sizeup of {sizeup}x')\n",
+    "cat_sc = tobler.area_weighted.area_interpolate(\n",
+    "    tracts_lrg, zips_lrg, categorical_variables=['rando']\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "id": "8a68e5fe-ee41-48cc-9222-6554a7651c28",
+   "metadata": {
+    "tags": []
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Computing for a sizeup of 40x\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/opt/conda/envs/tobler/lib/python3.11/site-packages/distributed/client.py:3161: UserWarning: Sending large graph of size 33.52 MiB.\n",
+      "This may cause some slowdown.\n",
+      "Consider scattering data ahead of time and using futures.\n",
+      "  warnings.warn(\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "CPU times: user 5.58 s, sys: 417 ms, total: 5.99 s\n",
+      "Wall time: 29.3 s\n"
+     ]
+    }
+   ],
    "source": [
-    "## Performance"
+    "%%time\n",
+    "print(f'Computing for a sizeup of {sizeup}x')\n",
+    "cat_dk = tobler.area_weighted.area_interpolate_dask(\n",
+    "    dtracts_lrg, dzips_lrg, 'ZIP', categorical_variables=['rando']\n",
+    ").compute()"
    ]
   }
  ],
  "metadata": {
   "kernelspec": {
-   "display_name": "Python 3 (ipykernel)",
+   "display_name": "tobler",
    "language": "python",
-   "name": "python3"
+   "name": "tobler"
   },
   "language_info": {
    "codemirror_mode": {
@@ -419,7 +733,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.10"
+   "version": "3.11.4"
   }
  },
  "nbformat": 4,
diff --git a/environment.yml b/environment.yml
index 3037858..7d4c6e2 100644
--- a/environment.yml
+++ b/environment.yml
@@ -2,6 +2,9 @@ name: tobler
 channels:
   - conda-forge
 dependencies:
+  - dask-geopandas
+  - dask
+  - distributed
   - jupyterlab
   - numpy
   - geopandas >=0.13
diff --git a/tobler/area_weighted/area_interpolate_dask.py b/tobler/area_weighted/area_interpolate_dask.py
index c0ae853..24b204d 100755
--- a/tobler/area_weighted/area_interpolate_dask.py
+++ b/tobler/area_weighted/area_interpolate_dask.py
@@ -1,31 +1,75 @@
+'''
+Area Weighted Interpolation, out-of-core and parallel through Dask
+'''
+
 import pandas
 import geopandas
 import dask_geopandas
-import warnings
 import numpy as np
 from dask.base import tokenize
 from dask.highlevelgraph import HighLevelGraph
-from tobler.area_weighted import area_interpolate
+from .area_interpolate import _area_interpolate_binning as area_interpolate
 
 def area_interpolate_dask(
-    left_dgdf,
-    right_dgdf,
+    source_dgdf,
+    target_dgdf,
     id_col,
     extensive_variables=None,
     intensive_variables=None,
     categorical_variables=None,
 ):
+    '''
+    Out-of-core and parallel area interpolation for categorical variables.
+    
+    Parameters
+    ----------
+    source_dgdf : dask_geopandas.GeoDataFrame
+        Dask-geopandas GeoDataFrame
+        IMPORTANT: the table needs to be spatially shuffled and with spatial partitions.
+        This is required so only overlapping partitions are checked for interpolation. See
+        more on spatial shuffling at: https://dask-geopandas.readthedocs.io/en/stable/guide/spatial-partitioning.html
+    target_dgdf : dask_geopandas.GeoDataFrame
+        Dask-geopandas GeoDataFrame
+        IMPORTANT: the table needs to be spatially shuffled and with spatial partitions.
+        This is required so only overlapping partitions are checked for interpolation. See
+        more on spatial shuffling at: https://dask-geopandas.readthedocs.io/en/stable/guide/spatial-partitioning.html   
+    id_col : str
+        Name of the column in `target_dgdf` with unique IDs to be used in output table
+    extensive_variables : list
+        [Optional. Default=None] Columns in `source_dgdf` for extensive variables.
+        IMPORTANT: currently NOT implemented.
+    intensive_variables : list
+        [Optional. Default=None] Columns in `source_dgdf` for intensive variables
+        IMPORTANT: currently NOT implemented.
+    categorical_variables : list
+        [Optional. Default=None] Columns in `source_dgdf` for categorical variables       
+        IMPORTANT: categorical variables must be of type `'category[known]'`. This is so
+        all categories are known ahead of time and Dask can run lazily.
+
+    Returns
+    -------
+    estimates : dask_geopandas.GeoDataFrame
+         new dask-geopandas geodataframe with interpolated variables and `id_col` as
+         columns and target_df geometry as output geometry
+    
+    '''
     if intensive_variables is not None:
         raise NotImplementedError((
             "Dask-based interpolation of intensive variables is "
             "not implemented yet. Please remove intensive variables to "
             "be able to run the rest."
         ))
+    if extensive_variables is not None:
+        raise NotImplementedError((
+            "Dask-based interpolation of extensive variables is "
+            "not implemented yet. Please remove intensive variables to "
+            "be able to run the rest."
+        ))
     # Categoricals must be Dask's known categorical
     if categorical_variables is not None:
         category_vars = []
         for cat_var in categorical_variables:
-            var_names = [f'{cat_var}_{c}' for c in left_dgdf[cat_var].cat.categories]
+            var_names = [f'{cat_var}_{c}' for c in source_dgdf[cat_var].cat.categories]
             category_vars.extend(var_names)
     else:
         category_vars = None
@@ -33,21 +77,21 @@ def area_interpolate_dask(
     dsk = {}
     new_spatial_partitions = []
     parts = geopandas.sjoin(
-        left_dgdf.spatial_partitions.to_frame('geometry'),
-        right_dgdf.spatial_partitions.to_frame('geometry'),
+        source_dgdf.spatial_partitions.to_frame('geometry'),
+        target_dgdf.spatial_partitions.to_frame('geometry'),
         how='inner',
         predicate='intersects'
     )
     parts_left = np.asarray(parts.index)
     parts_right = np.asarray(parts['index_right'].values)
     name = 'area_interpolate-' + tokenize(
-        right_dgdf, left_dgdf
+        target_dgdf, source_dgdf
     )   
     for i, (l, r) in enumerate(zip(parts_left, parts_right)):
         dsk[(name, i)] = (
             id_area_interpolate,
-            (left_dgdf._name, l),
-            (right_dgdf._name, r),
+            (source_dgdf._name, l),
+            (target_dgdf._name, r),
             id_col,
             extensive_variables,
             intensive_variables,
@@ -58,22 +102,22 @@ def area_interpolate_dask(
             categorical_variables,
             category_vars
         )
-        lr = left_dgdf.spatial_partitions.iloc[l]
-        rr = right_dgdf.spatial_partitions.iloc[r]
+        lr = source_dgdf.spatial_partitions.iloc[l]
+        rr = target_dgdf.spatial_partitions.iloc[r]
         extent = lr.intersection(rr)
         new_spatial_partitions.append(extent)
     # Create geometries for new spatial partitions
     new_spatial_partitions = geopandas.GeoSeries(
-        data=new_spatial_partitions, crs=left_dgdf.crs
+        data=new_spatial_partitions, crs=source_dgdf.crs
     )
     # Build Dask graph
     graph = HighLevelGraph.from_collections(
-        name, dsk, dependencies=[left_dgdf, right_dgdf]
+        name, dsk, dependencies=[source_dgdf, target_dgdf]
     )
     # Get metadata for the outcome table
     meta = id_area_interpolate(
-        left_dgdf._meta,
-        right_dgdf._meta,
+        source_dgdf._meta,
+        target_dgdf._meta,
         id_col,
         extensive_variables=extensive_variables,
         intensive_variables=intensive_variables,
@@ -93,8 +137,9 @@ def area_interpolate_dask(
         new_spatial_partitions
     )
     # Merge chunks
-    out = right_dgdf[[id_col, 'geometry']]
-    ## Extensive --> Add up estimates by ID
+    out = target_dgdf[[id_col, 'geometry']]
+    ## Extensive --> Not implemented (DAB: the below does not match single-core)
+    '''
     if extensive_variables is not None:
         out_extensive = (
             transferred
@@ -103,6 +148,7 @@ def area_interpolate_dask(
             .agg({v: 'sum' for v in extensive_variables})
         )
         out = out.join(out_extensive, on=id_col)
+    '''
     ## Intensive --> Weight by area of the chunk (Not implemented)
     ## Categorical --> Add up proportions
     if categorical_variables is not None:
@@ -128,6 +174,55 @@ def id_area_interpolate(
     categorical_variables=None,
     category_vars=None
 ):
+    '''
+    Light wrapper around single-core area interpolation to be run on distributed workers
+    
+    Parameters
+    ----------
+    source_df : geopandas.GeoDataFrame
+    target_df : geopandas.GeoDataFrame
+    id_col : str
+        Name of the column in `target_df` with unique IDs to be used in output table
+    extensive_variables : list
+        [Optional. Default=None] Columns in dataframes for extensive variables
+    intensive_variables : list
+        [Optional. Default=None] Columns in dataframes for intensive variables
+    table : scipy.sparse.csr_matrix
+        [Optional. Default=None] Area allocation source-target correspondence
+        table. If not provided, it will be built from `source_df` and
+        `target_df` using `tobler.area_interpolate._area_tables_binning`
+    allocate_total : boolean
+        [Optional. Default=True] True if total value of source area should be
+        allocated. False if denominator is area of i. Note that the two cases
+        would be identical when the area of the source polygon is exhausted by
+        intersections. See Notes for more details.
+    spatial_index : str
+        [Optional. Default="auto"] Spatial index to use to build the
+        allocation of area from source to target tables. It currently supports
+        the following values:
+            - "source": build the spatial index on `source_df`
+            - "target": build the spatial index on `target_df`
+            - "auto": attempts to guess the most efficient alternative.
+              Currently, this option uses the largest table to build the
+              index, and performs a `bulk_query` on the shorter table.
+        This argument is ignored if n_jobs>1 (or n_jobs=-1).
+    n_jobs : int
+        [Optional. Default=1] Number of processes to run in parallel to
+        generate the area allocation. If -1, this is set to the number of CPUs
+        available. If `table` is passed, this is ignored.
+    categorical_variables : list
+        [Optional. Default=None] Columns in dataframes for categorical variables
+    category_vars : list
+        [Optional. Default=None] Full list of category names in the format
+        `f'{var_name}_{cat_name}'`
+
+    Returns
+    -------
+    estimates : geopandas.GeoDataFrame
+         new geodataframe with interpolated variables as columns and target_df geometry
+         as output geometry
+   
+    '''
     estimates = area_interpolate(
         source_df,
         target_df,

From 3e78f4e435205ea997f6155699bf1886daa40cf6 Mon Sep 17 00:00:00 2001
From: Dani Arribas-Bel <daniel.arribas.bel@gmail.com>
Date: Fri, 11 Aug 2023 11:21:22 +0000
Subject: [PATCH 04/16] Add example to notebook reproducing known bug

---
 04_area_interpolate_dask.ipynb | 194 ++++++++++++++++++++++++++++++---
 1 file changed, 179 insertions(+), 15 deletions(-)

diff --git a/04_area_interpolate_dask.ipynb b/04_area_interpolate_dask.ipynb
index 9e88c68..7a43f7b 100644
--- a/04_area_interpolate_dask.ipynb
+++ b/04_area_interpolate_dask.ipynb
@@ -104,7 +104,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 4,
    "id": "d65ac8ec-51e2-4d2d-abb2-96a7519ed749",
    "metadata": {
     "tags": []
@@ -114,18 +114,6 @@
     "client = Client(LocalCluster(n_workers=8))"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": 12,
-   "id": "69f06d42-f47f-4120-811b-275431b1cf3a",
-   "metadata": {
-    "tags": []
-   },
-   "outputs": [],
-   "source": [
-    "client.shutdown()"
-   ]
-  },
   {
    "cell_type": "markdown",
    "id": "88c32c7d-0ca8-4945-a1f8-edfbc8917880",
@@ -167,7 +155,10 @@
   {
    "cell_type": "markdown",
    "id": "b783aabc-8221-40f6-a0d5-bf21dd75e2a6",
-   "metadata": {},
+   "metadata": {
+    "jp-MarkdownHeadingCollapsed": true,
+    "tags": []
+   },
    "source": [
     "## Correctness"
    ]
@@ -276,7 +267,10 @@
   {
    "cell_type": "markdown",
    "id": "1debbdf4-892f-4fda-834a-0403595794ef",
-   "metadata": {},
+   "metadata": {
+    "jp-MarkdownHeadingCollapsed": true,
+    "tags": []
+   },
    "source": [
     "## Performance\n",
     "\n",
@@ -715,6 +709,176 @@
     "    dtracts_lrg, dzips_lrg, 'ZIP', categorical_variables=['rando']\n",
     ").compute()"
    ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "93576191-ddb0-4316-af7e-d12393e520b6",
+   "metadata": {},
+   "source": [
+    "## Bug"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "079ff509-cd42-4982-8144-f4915e996f83",
+   "metadata": {},
+   "source": [
+    "There is a recurrent bug that appears in some cases that errors the computation and should be fixed ideally before merging. The code below reproduces it:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "d2470144-7c4d-4638-90a8-6bc4254128ef",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2023-08-11 11:19:01,747 - distributed.worker - WARNING - Compute Failed\n",
+      "Key:       ('aggregate-combine-827a5db528ee8de42db84a4c3d7fb9a7', 2, 1)\n",
+      "Function:  pipe\n",
+      "args:      ([      sum-rando_0-17eccfbe7bc44d26fa589319100d6357  ...  sum-rando_3-52ec0a920e05152510b5ff7aa35c0200\n",
+      "ZIP                                                 ...                                              \n",
+      "29437                                            0  ...                                      0.000000\n",
+      "29472                                            0  ...                                      0.000000\n",
+      "29483                                            0  ...                                      0.008659\n",
+      "\n",
+      "[3 rows x 4 columns],        sum-rando_0-17eccfbe7bc44d26fa589319100d6357  ...  sum-rando_3-52ec0a920e05152510b5ff7aa35c0200\n",
+      "ZIP                                                  ...                                              \n",
+      "29437                                           0.0  ...                                           0.0\n",
+      "29472                                           0.0  ...                                           0.0\n",
+      "29483                                           0.0  \n",
+      "kwargs:    {}\n",
+      "Exception: 'TypeError(\\'can only concatenate str (not \"float\") to str\\')'\n",
+      "\n",
+      "2023-08-11 11:19:01,750 - distributed.worker - WARNING - Compute Failed\n",
+      "Key:       ('aggregate-combine-827a5db528ee8de42db84a4c3d7fb9a7', 0, 1)\n",
+      "Function:  pipe\n",
+      "args:      ([       sum-rando_0-17eccfbe7bc44d26fa589319100d6357  ...  sum-rando_3-52ec0a920e05152510b5ff7aa35c0200\n",
+      "ZIP                                                  ...                                              \n",
+      "29438                                      0.905017  ...                                             0\n",
+      "\n",
+      "[1 rows x 4 columns],        sum-rando_0-17eccfbe7bc44d26fa589319100d6357  ...  sum-rando_3-52ec0a920e05152510b5ff7aa35c0200\n",
+      "ZIP                                                  ...                                              \n",
+      "29449                                      0.001648  ...                                             0\n",
+      "29426                                      0.000000  ...                                             0\n",
+      "29414                                      0.000000  ...                                             0\n",
+      "\n",
+      "[3 rows x 4 columns],        sum-rando_0-17eccfbe7bc44d26fa589319100d6357  ...  sum-rando_3-52ec0a920e05152510b5ff7aa35c0200\n",
+      "ZIP                      \n",
+      "kwargs:    {}\n",
+      "Exception: 'TypeError(\\'can only concatenate str (not \"float\") to str\\')'\n",
+      "\n",
+      "2023-08-11 11:19:01,761 - distributed.worker - ERROR - Exception during execution of task ('aggregate-combine-827a5db528ee8de42db84a4c3d7fb9a7', 4, 1).\n",
+      "Traceback (most recent call last):\n",
+      "  File \"/opt/conda/envs/tobler/lib/python3.11/site-packages/zict/buffer.py\", line 184, in __getitem__\n",
+      "    return self.fast[key]\n",
+      "           ~~~~~~~~~^^^^^\n",
+      "  File \"/opt/conda/envs/tobler/lib/python3.11/site-packages/zict/common.py\", line 127, in wrapper\n",
+      "    return func(*args, **kwargs)\n",
+      "           ^^^^^^^^^^^^^^^^^^^^^\n",
+      "  File \"/opt/conda/envs/tobler/lib/python3.11/site-packages/zict/lru.py\", line 117, in __getitem__\n",
+      "    result = self.d[key]\n",
+      "             ~~~~~~^^^^^\n",
+      "KeyError: \"('aggregate-chunk-827a5db528ee8de42db84a4c3d7fb9a7-1584b66fccc2ab7917ded8b6f5bb1127', 34)\"\n",
+      "\n",
+      "During handling of the above exception, another exception occurred:\n",
+      "\n",
+      "Traceback (most recent call last):\n",
+      "  File \"/opt/conda/envs/tobler/lib/python3.11/site-packages/distributed/worker.py\", line 2412, in _prepare_args_for_execution\n",
+      "    data[k] = self.data[k]\n",
+      "              ~~~~~~~~~^^^\n",
+      "  File \"/opt/conda/envs/tobler/lib/python3.11/site-packages/distributed/spill.py\", line 216, in __getitem__\n",
+      "    return super().__getitem__(key)\n",
+      "           ^^^^^^^^^^^^^^^^^^^^^^^^\n",
+      "  File \"/opt/conda/envs/tobler/lib/python3.11/site-packages/zict/common.py\", line 127, in wrapper\n",
+      "    return func(*args, **kwargs)\n",
+      "           ^^^^^^^^^^^^^^^^^^^^^\n",
+      "  File \"/opt/conda/envs/tobler/lib/python3.11/site-packages/zict/buffer.py\", line 186, in __getitem__\n",
+      "    return self.slow_to_fast(key)\n",
+      "           ^^^^^^^^^^^^^^^^^^^^^^\n",
+      "  File \"/opt/conda/envs/tobler/lib/python3.11/site-packages/zict/buffer.py\", line 153, in slow_to_fast\n",
+      "    value = self.slow[key]\n",
+      "            ~~~~~~~~~^^^^^\n",
+      "  File \"/opt/conda/envs/tobler/lib/python3.11/site-packages/zict/common.py\", line 127, in wrapper\n",
+      "    return func(*args, **kwargs)\n",
+      "           ^^^^^^^^^^^^^^^^^^^^^\n",
+      "  File \"/opt/conda/envs/tobler/lib/python3.11/site-packages/zict/cache.py\", line 67, in __getitem__\n",
+      "    gen = self._last_updated[key]\n",
+      "          ~~~~~~~~~~~~~~~~~~^^^^^\n",
+      "KeyError: \"('aggregate-chunk-827a5db528ee8de42db84a4c3d7fb9a7-1584b66fccc2ab7917ded8b6f5bb1127', 34)\"\n",
+      "\n",
+      "During handling of the above exception, another exception occurred:\n",
+      "\n",
+      "Traceback (most recent call last):\n",
+      "  File \"/opt/conda/envs/tobler/lib/python3.11/site-packages/distributed/worker.py\", line 2264, in execute\n",
+      "    args2, kwargs2 = self._prepare_args_for_execution(ts, args, kwargs)\n",
+      "                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
+      "  File \"/opt/conda/envs/tobler/lib/python3.11/site-packages/distributed/worker.py\", line 2416, in _prepare_args_for_execution\n",
+      "    data[k] = Actor(type(self.state.actors[k]), self.address, k, self)\n",
+      "                         ~~~~~~~~~~~~~~~~~^^^\n",
+      "KeyError: \"('aggregate-chunk-827a5db528ee8de42db84a4c3d7fb9a7-1584b66fccc2ab7917ded8b6f5bb1127', 34)\"\n",
+      "2023-08-11 11:19:01,791 - distributed.worker - WARNING - Compute Failed\n",
+      "Key:       ('aggregate-combine-827a5db528ee8de42db84a4c3d7fb9a7', 3, 1)\n",
+      "Function:  pipe\n",
+      "args:      ([       sum-rando_0-17eccfbe7bc44d26fa589319100d6357  ...  sum-rando_3-52ec0a920e05152510b5ff7aa35c0200\n",
+      "ZIP                                                  ...                                              \n",
+      "29487                                      0.002037  ...                                             0\n",
+      "29455                                      0.275390  ...                                             0\n",
+      "\n",
+      "[2 rows x 4 columns],        sum-rando_0-17eccfbe7bc44d26fa589319100d6357  ...  sum-rando_3-52ec0a920e05152510b5ff7aa35c0200\n",
+      "ZIP                                                  ...                                              \n",
+      "29487                                      0.000000  ...                                      0.000000\n",
+      "29455                                      0.002053  ...                                      0.018981\n",
+      "\n",
+      "[2 rows x 4 columns],        sum-rando_0-17eccfbe7bc44d26fa589319100d6357  ...  sum-rando_3-52ec0a920e05152510b5ff7aa35c0200\n",
+      "ZIP                      \n",
+      "kwargs:    {}\n",
+      "Exception: 'TypeError(\\'can only concatenate str (not \"float\") to str\\')'\n",
+      "\n"
+     ]
+    },
+    {
+     "ename": "TypeError",
+     "evalue": "can only concatenate str (not \"float\") to str",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mTypeError\u001b[0m                                 Traceback (most recent call last)",
+      "Cell \u001b[0;32mIn[9], line 13\u001b[0m\n\u001b[1;32m      1\u001b[0m dtracts \u001b[38;5;241m=\u001b[39m (\n\u001b[1;32m      2\u001b[0m     dask_geopandas\u001b[38;5;241m.\u001b[39mfrom_geopandas(tracts[[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mgeometry\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mrando\u001b[39m\u001b[38;5;124m'\u001b[39m]], npartitions\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m16\u001b[39m)\n\u001b[1;32m      3\u001b[0m     \u001b[38;5;241m.\u001b[39mspatial_shuffle(by\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mhilbert\u001b[39m\u001b[38;5;124m'\u001b[39m, shuffle\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtasks\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m      4\u001b[0m )\n\u001b[1;32m      6\u001b[0m dzips \u001b[38;5;241m=\u001b[39m (\n\u001b[1;32m      7\u001b[0m     dask_geopandas\u001b[38;5;241m.\u001b[39mfrom_geopandas(zip_codes[[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mZIP\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mgeometry\u001b[39m\u001b[38;5;124m'\u001b[39m]], npartitions\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m16\u001b[39m)\n\u001b[1;32m      8\u001b[0m     \u001b[38;5;241m.\u001b[39mspatial_shuffle(by\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mhilbert\u001b[39m\u001b[38;5;124m'\u001b[39m, shuffle\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtasks\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m      9\u001b[0m )\n\u001b[1;32m     11\u001b[0m cat_dk \u001b[38;5;241m=\u001b[39m \u001b[43mtobler\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43marea_weighted\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43marea_interpolate_dask\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m     12\u001b[0m \u001b[43m    \u001b[49m\u001b[43mdtracts\u001b[49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[43mdzips\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mZIP\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcategorical_variables\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mrando\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\n\u001b[0;32m---> 13\u001b[0m \u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcompute\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n",
+      "File \u001b[0;32m/opt/conda/envs/tobler/lib/python3.11/site-packages/toolz/functoolz.py:628\u001b[0m, in \u001b[0;36mpipe\u001b[0;34m()\u001b[0m\n\u001b[1;32m    608\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\" Pipe a value through a sequence of functions\u001b[39;00m\n\u001b[1;32m    609\u001b[0m \n\u001b[1;32m    610\u001b[0m \u001b[38;5;124;03mI.e. ``pipe(data, f, g, h)`` is equivalent to ``h(g(f(data)))``\u001b[39;00m\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m    625\u001b[0m \u001b[38;5;124;03m    thread_last\u001b[39;00m\n\u001b[1;32m    626\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m    627\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m func \u001b[38;5;129;01min\u001b[39;00m funcs:\n\u001b[0;32m--> 628\u001b[0m     data \u001b[38;5;241m=\u001b[39m func(data)\n\u001b[1;32m    629\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m data\n",
+      "File \u001b[0;32m/opt/conda/envs/tobler/lib/python3.11/site-packages/dask/dataframe/groupby.py:1209\u001b[0m, in \u001b[0;36m_groupby_apply_funcs\u001b[0;34m()\u001b[0m\n\u001b[1;32m   1207\u001b[0m result \u001b[38;5;241m=\u001b[39m collections\u001b[38;5;241m.\u001b[39mOrderedDict()\n\u001b[1;32m   1208\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m result_column, func, func_kwargs \u001b[38;5;129;01min\u001b[39;00m funcs:\n\u001b[0;32m-> 1209\u001b[0m     r \u001b[38;5;241m=\u001b[39m func(grouped, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mfunc_kwargs)\n\u001b[1;32m   1211\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(r, \u001b[38;5;28mtuple\u001b[39m):\n\u001b[1;32m   1212\u001b[0m         \u001b[38;5;28;01mfor\u001b[39;00m idx, s \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28menumerate\u001b[39m(r):\n",
+      "File \u001b[0;32m/opt/conda/envs/tobler/lib/python3.11/site-packages/dask/dataframe/groupby.py:1255\u001b[0m, in \u001b[0;36m_apply_func_to_column\u001b[0;34m()\u001b[0m\n\u001b[1;32m   1252\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m column \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m   1253\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m func(df_like)\n\u001b[0;32m-> 1255\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m func(df_like[column])\n",
+      "File \u001b[0;32m/opt/conda/envs/tobler/lib/python3.11/site-packages/pandas/_libs/groupby.pyx:717\u001b[0m, in \u001b[0;36mpandas._libs.groupby.group_sum\u001b[0;34m()\u001b[0m\n\u001b[1;32m    715\u001b[0m     t = val\n\u001b[1;32m    716\u001b[0m else:\n\u001b[0;32m--> 717\u001b[0m     t = sumx[lab, j] + val\n\u001b[1;32m    718\u001b[0m sumx[lab, j] = t\n\u001b[1;32m    719\u001b[0m \n",
+      "\u001b[0;31mTypeError\u001b[0m: can only concatenate str (not \"float\") to str"
+     ]
+    }
+   ],
+   "source": [
+    "dtracts = (\n",
+    "    dask_geopandas.from_geopandas(tracts[['geometry', 'rando']], npartitions=16)\n",
+    "    .spatial_shuffle(by='hilbert', shuffle=\"tasks\")\n",
+    ")\n",
+    "\n",
+    "dzips = (\n",
+    "    dask_geopandas.from_geopandas(zip_codes[['ZIP', 'geometry']], npartitions=16)\n",
+    "    .spatial_shuffle(by='hilbert', shuffle=\"tasks\")\n",
+    ")\n",
+    "\n",
+    "cat_dk = tobler.area_weighted.area_interpolate_dask(\n",
+    "    dtracts, dzips, 'ZIP', categorical_variables=['rando']\n",
+    ").compute()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "08596315-236b-45df-955e-44a98b0a2eba",
+   "metadata": {},
+   "source": [
+    "[DAB]: my hunch is that the error, though cryptic and hard to debug, comes from a worker returning an empty result (perhaps `None`?) which, when it's passed through the aggregation post collection from the workers, raises the error. Further investigation is warranted."
+   ]
   }
  ],
  "metadata": {

From 9ef8dc04f710309bc7fa6fc18cb07a6bb8c6df3e Mon Sep 17 00:00:00 2001
From: Dani Arribas-Bel <daniel.arribas.bel@gmail.com>
Date: Mon, 14 Aug 2023 10:00:09 +0000
Subject: [PATCH 05/16] Fix bug that appears sometimes w/ many small partitions

---
 04_area_interpolate_dask.ipynb                | 207 ++----------------
 tobler/area_weighted/area_interpolate_dask.py |   9 +-
 2 files changed, 24 insertions(+), 192 deletions(-)

diff --git a/04_area_interpolate_dask.ipynb b/04_area_interpolate_dask.ipynb
index 7a43f7b..d368967 100644
--- a/04_area_interpolate_dask.ipynb
+++ b/04_area_interpolate_dask.ipynb
@@ -124,18 +124,18 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 7,
    "id": "a31a1a91-4071-40e2-a21f-7e035d734976",
    "metadata": {},
    "outputs": [],
    "source": [
     "dtracts = (\n",
-    "    dask_geopandas.from_geopandas(tracts[['geometry', 'rando']], npartitions=4)\n",
+    "    dask_geopandas.from_geopandas(tracts[['geometry', 'rando']], npartitions=16)\n",
     "    .spatial_shuffle(by='hilbert', shuffle=\"tasks\")\n",
     ")\n",
     "\n",
     "dzips = (\n",
-    "    dask_geopandas.from_geopandas(zip_codes[['ZIP', 'geometry']], npartitions=4)\n",
+    "    dask_geopandas.from_geopandas(zip_codes[['ZIP', 'geometry']], npartitions=16)\n",
     "    .spatial_shuffle(by='hilbert', shuffle=\"tasks\")\n",
     ")"
    ]
@@ -156,7 +156,6 @@
    "cell_type": "markdown",
    "id": "b783aabc-8221-40f6-a0d5-bf21dd75e2a6",
    "metadata": {
-    "jp-MarkdownHeadingCollapsed": true,
     "tags": []
    },
    "source": [
@@ -173,7 +172,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 8,
    "id": "4d4cde6d-73c1-4197-86ed-131724e21296",
    "metadata": {
     "tags": []
@@ -195,7 +194,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 9,
    "id": "d8c7896f-9004-4a07-b3ba-75301f8120e5",
    "metadata": {
     "tags": []
@@ -218,7 +217,7 @@
   {
    "cell_type": "code",
    "execution_count": 8,
-   "id": "8bc830b2-99a7-4c11-a8d9-0fad3aefcf06",
+   "id": "81de5e35-f3b6-4567-86b1-36d98583dca0",
    "metadata": {
     "tags": []
    },
@@ -268,7 +267,6 @@
    "cell_type": "markdown",
    "id": "1debbdf4-892f-4fda-834a-0403595794ef",
    "metadata": {
-    "jp-MarkdownHeadingCollapsed": true,
     "tags": []
    },
    "source": [
@@ -291,7 +289,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
+   "execution_count": 12,
    "id": "902e494b-65ba-4fa2-99e6-eb3a513769f8",
    "metadata": {
     "tags": []
@@ -301,7 +299,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "85.5 ms ± 4.17 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n"
+      "85 ms ± 1.51 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n"
      ]
     }
    ],
@@ -314,7 +312,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 13,
    "id": "5cfc44d9-f79a-4b8e-9caa-975ea64d5f0e",
    "metadata": {
     "tags": []
@@ -324,7 +322,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "444 ms ± 2.69 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
+      "1.41 s ± 51.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
      ]
     }
    ],
@@ -347,7 +345,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 24,
+   "execution_count": 17,
    "id": "5f56d579-0022-45c2-845c-f351bf96ed01",
    "metadata": {
     "tags": []
@@ -368,7 +366,7 @@
       "This may cause some slowdown.\n",
       "Consider scattering data ahead of time and using futures.\n",
       "  warnings.warn(\n",
-      "/opt/conda/envs/tobler/lib/python3.11/site-packages/distributed/client.py:3161: UserWarning: Sending large graph of size 30.17 MiB.\n",
+      "/opt/conda/envs/tobler/lib/python3.11/site-packages/distributed/client.py:3161: UserWarning: Sending large graph of size 30.18 MiB.\n",
       "This may cause some slowdown.\n",
       "Consider scattering data ahead of time and using futures.\n",
       "  warnings.warn(\n"
@@ -384,12 +382,12 @@
     ")\n",
     "\n",
     "dtracts_lrg = (\n",
-    "    dask_geopandas.from_geopandas(tracts_lrg[['geometry', 'rando']], chunksize=800)\n",
+    "    dask_geopandas.from_geopandas(tracts_lrg[['geometry', 'rando']], chunksize=500)\n",
     "    .spatial_shuffle(by='hilbert', shuffle=\"tasks\")\n",
     ")\n",
     "\n",
     "dzips_lrg = (\n",
-    "    dask_geopandas.from_geopandas(zips_lrg[['ZIP', 'geometry']], chunksize=800)\n",
+    "    dask_geopandas.from_geopandas(zips_lrg[['ZIP', 'geometry']], chunksize=500)\n",
     "    .spatial_shuffle(by='hilbert', shuffle=\"tasks\")\n",
     ")"
    ]
@@ -633,7 +631,6 @@
    "cell_type": "markdown",
    "id": "b004834f-c5ce-4f92-be9a-364a07c7996b",
    "metadata": {
-    "jp-MarkdownHeadingCollapsed": true,
     "tags": []
    },
    "source": [
@@ -670,7 +667,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 25,
+   "execution_count": 18,
    "id": "8a68e5fe-ee41-48cc-9222-6554a7651c28",
    "metadata": {
     "tags": []
@@ -697,8 +694,8 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "CPU times: user 5.58 s, sys: 417 ms, total: 5.99 s\n",
-      "Wall time: 29.3 s\n"
+      "CPU times: user 6.99 s, sys: 512 ms, total: 7.5 s\n",
+      "Wall time: 30.5 s\n"
      ]
     }
    ],
@@ -709,176 +706,6 @@
     "    dtracts_lrg, dzips_lrg, 'ZIP', categorical_variables=['rando']\n",
     ").compute()"
    ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "93576191-ddb0-4316-af7e-d12393e520b6",
-   "metadata": {},
-   "source": [
-    "## Bug"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "079ff509-cd42-4982-8144-f4915e996f83",
-   "metadata": {},
-   "source": [
-    "There is a recurrent bug that appears in some cases that errors the computation and should be fixed ideally before merging. The code below reproduces it:"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 9,
-   "id": "d2470144-7c4d-4638-90a8-6bc4254128ef",
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stderr",
-     "output_type": "stream",
-     "text": [
-      "2023-08-11 11:19:01,747 - distributed.worker - WARNING - Compute Failed\n",
-      "Key:       ('aggregate-combine-827a5db528ee8de42db84a4c3d7fb9a7', 2, 1)\n",
-      "Function:  pipe\n",
-      "args:      ([      sum-rando_0-17eccfbe7bc44d26fa589319100d6357  ...  sum-rando_3-52ec0a920e05152510b5ff7aa35c0200\n",
-      "ZIP                                                 ...                                              \n",
-      "29437                                            0  ...                                      0.000000\n",
-      "29472                                            0  ...                                      0.000000\n",
-      "29483                                            0  ...                                      0.008659\n",
-      "\n",
-      "[3 rows x 4 columns],        sum-rando_0-17eccfbe7bc44d26fa589319100d6357  ...  sum-rando_3-52ec0a920e05152510b5ff7aa35c0200\n",
-      "ZIP                                                  ...                                              \n",
-      "29437                                           0.0  ...                                           0.0\n",
-      "29472                                           0.0  ...                                           0.0\n",
-      "29483                                           0.0  \n",
-      "kwargs:    {}\n",
-      "Exception: 'TypeError(\\'can only concatenate str (not \"float\") to str\\')'\n",
-      "\n",
-      "2023-08-11 11:19:01,750 - distributed.worker - WARNING - Compute Failed\n",
-      "Key:       ('aggregate-combine-827a5db528ee8de42db84a4c3d7fb9a7', 0, 1)\n",
-      "Function:  pipe\n",
-      "args:      ([       sum-rando_0-17eccfbe7bc44d26fa589319100d6357  ...  sum-rando_3-52ec0a920e05152510b5ff7aa35c0200\n",
-      "ZIP                                                  ...                                              \n",
-      "29438                                      0.905017  ...                                             0\n",
-      "\n",
-      "[1 rows x 4 columns],        sum-rando_0-17eccfbe7bc44d26fa589319100d6357  ...  sum-rando_3-52ec0a920e05152510b5ff7aa35c0200\n",
-      "ZIP                                                  ...                                              \n",
-      "29449                                      0.001648  ...                                             0\n",
-      "29426                                      0.000000  ...                                             0\n",
-      "29414                                      0.000000  ...                                             0\n",
-      "\n",
-      "[3 rows x 4 columns],        sum-rando_0-17eccfbe7bc44d26fa589319100d6357  ...  sum-rando_3-52ec0a920e05152510b5ff7aa35c0200\n",
-      "ZIP                      \n",
-      "kwargs:    {}\n",
-      "Exception: 'TypeError(\\'can only concatenate str (not \"float\") to str\\')'\n",
-      "\n",
-      "2023-08-11 11:19:01,761 - distributed.worker - ERROR - Exception during execution of task ('aggregate-combine-827a5db528ee8de42db84a4c3d7fb9a7', 4, 1).\n",
-      "Traceback (most recent call last):\n",
-      "  File \"/opt/conda/envs/tobler/lib/python3.11/site-packages/zict/buffer.py\", line 184, in __getitem__\n",
-      "    return self.fast[key]\n",
-      "           ~~~~~~~~~^^^^^\n",
-      "  File \"/opt/conda/envs/tobler/lib/python3.11/site-packages/zict/common.py\", line 127, in wrapper\n",
-      "    return func(*args, **kwargs)\n",
-      "           ^^^^^^^^^^^^^^^^^^^^^\n",
-      "  File \"/opt/conda/envs/tobler/lib/python3.11/site-packages/zict/lru.py\", line 117, in __getitem__\n",
-      "    result = self.d[key]\n",
-      "             ~~~~~~^^^^^\n",
-      "KeyError: \"('aggregate-chunk-827a5db528ee8de42db84a4c3d7fb9a7-1584b66fccc2ab7917ded8b6f5bb1127', 34)\"\n",
-      "\n",
-      "During handling of the above exception, another exception occurred:\n",
-      "\n",
-      "Traceback (most recent call last):\n",
-      "  File \"/opt/conda/envs/tobler/lib/python3.11/site-packages/distributed/worker.py\", line 2412, in _prepare_args_for_execution\n",
-      "    data[k] = self.data[k]\n",
-      "              ~~~~~~~~~^^^\n",
-      "  File \"/opt/conda/envs/tobler/lib/python3.11/site-packages/distributed/spill.py\", line 216, in __getitem__\n",
-      "    return super().__getitem__(key)\n",
-      "           ^^^^^^^^^^^^^^^^^^^^^^^^\n",
-      "  File \"/opt/conda/envs/tobler/lib/python3.11/site-packages/zict/common.py\", line 127, in wrapper\n",
-      "    return func(*args, **kwargs)\n",
-      "           ^^^^^^^^^^^^^^^^^^^^^\n",
-      "  File \"/opt/conda/envs/tobler/lib/python3.11/site-packages/zict/buffer.py\", line 186, in __getitem__\n",
-      "    return self.slow_to_fast(key)\n",
-      "           ^^^^^^^^^^^^^^^^^^^^^^\n",
-      "  File \"/opt/conda/envs/tobler/lib/python3.11/site-packages/zict/buffer.py\", line 153, in slow_to_fast\n",
-      "    value = self.slow[key]\n",
-      "            ~~~~~~~~~^^^^^\n",
-      "  File \"/opt/conda/envs/tobler/lib/python3.11/site-packages/zict/common.py\", line 127, in wrapper\n",
-      "    return func(*args, **kwargs)\n",
-      "           ^^^^^^^^^^^^^^^^^^^^^\n",
-      "  File \"/opt/conda/envs/tobler/lib/python3.11/site-packages/zict/cache.py\", line 67, in __getitem__\n",
-      "    gen = self._last_updated[key]\n",
-      "          ~~~~~~~~~~~~~~~~~~^^^^^\n",
-      "KeyError: \"('aggregate-chunk-827a5db528ee8de42db84a4c3d7fb9a7-1584b66fccc2ab7917ded8b6f5bb1127', 34)\"\n",
-      "\n",
-      "During handling of the above exception, another exception occurred:\n",
-      "\n",
-      "Traceback (most recent call last):\n",
-      "  File \"/opt/conda/envs/tobler/lib/python3.11/site-packages/distributed/worker.py\", line 2264, in execute\n",
-      "    args2, kwargs2 = self._prepare_args_for_execution(ts, args, kwargs)\n",
-      "                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
-      "  File \"/opt/conda/envs/tobler/lib/python3.11/site-packages/distributed/worker.py\", line 2416, in _prepare_args_for_execution\n",
-      "    data[k] = Actor(type(self.state.actors[k]), self.address, k, self)\n",
-      "                         ~~~~~~~~~~~~~~~~~^^^\n",
-      "KeyError: \"('aggregate-chunk-827a5db528ee8de42db84a4c3d7fb9a7-1584b66fccc2ab7917ded8b6f5bb1127', 34)\"\n",
-      "2023-08-11 11:19:01,791 - distributed.worker - WARNING - Compute Failed\n",
-      "Key:       ('aggregate-combine-827a5db528ee8de42db84a4c3d7fb9a7', 3, 1)\n",
-      "Function:  pipe\n",
-      "args:      ([       sum-rando_0-17eccfbe7bc44d26fa589319100d6357  ...  sum-rando_3-52ec0a920e05152510b5ff7aa35c0200\n",
-      "ZIP                                                  ...                                              \n",
-      "29487                                      0.002037  ...                                             0\n",
-      "29455                                      0.275390  ...                                             0\n",
-      "\n",
-      "[2 rows x 4 columns],        sum-rando_0-17eccfbe7bc44d26fa589319100d6357  ...  sum-rando_3-52ec0a920e05152510b5ff7aa35c0200\n",
-      "ZIP                                                  ...                                              \n",
-      "29487                                      0.000000  ...                                      0.000000\n",
-      "29455                                      0.002053  ...                                      0.018981\n",
-      "\n",
-      "[2 rows x 4 columns],        sum-rando_0-17eccfbe7bc44d26fa589319100d6357  ...  sum-rando_3-52ec0a920e05152510b5ff7aa35c0200\n",
-      "ZIP                      \n",
-      "kwargs:    {}\n",
-      "Exception: 'TypeError(\\'can only concatenate str (not \"float\") to str\\')'\n",
-      "\n"
-     ]
-    },
-    {
-     "ename": "TypeError",
-     "evalue": "can only concatenate str (not \"float\") to str",
-     "output_type": "error",
-     "traceback": [
-      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
-      "\u001b[0;31mTypeError\u001b[0m                                 Traceback (most recent call last)",
-      "Cell \u001b[0;32mIn[9], line 13\u001b[0m\n\u001b[1;32m      1\u001b[0m dtracts \u001b[38;5;241m=\u001b[39m (\n\u001b[1;32m      2\u001b[0m     dask_geopandas\u001b[38;5;241m.\u001b[39mfrom_geopandas(tracts[[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mgeometry\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mrando\u001b[39m\u001b[38;5;124m'\u001b[39m]], npartitions\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m16\u001b[39m)\n\u001b[1;32m      3\u001b[0m     \u001b[38;5;241m.\u001b[39mspatial_shuffle(by\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mhilbert\u001b[39m\u001b[38;5;124m'\u001b[39m, shuffle\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtasks\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m      4\u001b[0m )\n\u001b[1;32m      6\u001b[0m dzips \u001b[38;5;241m=\u001b[39m (\n\u001b[1;32m      7\u001b[0m     dask_geopandas\u001b[38;5;241m.\u001b[39mfrom_geopandas(zip_codes[[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mZIP\u001b[39m\u001b[38;5;124m'\u001b[39m, \u001b[38;5;124m'\u001b[39m\u001b[38;5;124mgeometry\u001b[39m\u001b[38;5;124m'\u001b[39m]], npartitions\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m16\u001b[39m)\n\u001b[1;32m      8\u001b[0m     \u001b[38;5;241m.\u001b[39mspatial_shuffle(by\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mhilbert\u001b[39m\u001b[38;5;124m'\u001b[39m, shuffle\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtasks\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m      9\u001b[0m )\n\u001b[1;32m     11\u001b[0m cat_dk \u001b[38;5;241m=\u001b[39m \u001b[43mtobler\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43marea_weighted\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43marea_interpolate_dask\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m     12\u001b[0m \u001b[43m    \u001b[49m\u001b[43mdtracts\u001b[49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[43mdzips\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mZIP\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mcategorical_variables\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[38;5;124;43mrando\u001b[39;49m\u001b[38;5;124;43m'\u001b[39;49m\u001b[43m]\u001b[49m\n\u001b[0;32m---> 13\u001b[0m \u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcompute\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n",
-      "File \u001b[0;32m/opt/conda/envs/tobler/lib/python3.11/site-packages/toolz/functoolz.py:628\u001b[0m, in \u001b[0;36mpipe\u001b[0;34m()\u001b[0m\n\u001b[1;32m    608\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\" Pipe a value through a sequence of functions\u001b[39;00m\n\u001b[1;32m    609\u001b[0m \n\u001b[1;32m    610\u001b[0m \u001b[38;5;124;03mI.e. ``pipe(data, f, g, h)`` is equivalent to ``h(g(f(data)))``\u001b[39;00m\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m    625\u001b[0m \u001b[38;5;124;03m    thread_last\u001b[39;00m\n\u001b[1;32m    626\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m    627\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m func \u001b[38;5;129;01min\u001b[39;00m funcs:\n\u001b[0;32m--> 628\u001b[0m     data \u001b[38;5;241m=\u001b[39m func(data)\n\u001b[1;32m    629\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m data\n",
-      "File \u001b[0;32m/opt/conda/envs/tobler/lib/python3.11/site-packages/dask/dataframe/groupby.py:1209\u001b[0m, in \u001b[0;36m_groupby_apply_funcs\u001b[0;34m()\u001b[0m\n\u001b[1;32m   1207\u001b[0m result \u001b[38;5;241m=\u001b[39m collections\u001b[38;5;241m.\u001b[39mOrderedDict()\n\u001b[1;32m   1208\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m result_column, func, func_kwargs \u001b[38;5;129;01min\u001b[39;00m funcs:\n\u001b[0;32m-> 1209\u001b[0m     r \u001b[38;5;241m=\u001b[39m func(grouped, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mfunc_kwargs)\n\u001b[1;32m   1211\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(r, \u001b[38;5;28mtuple\u001b[39m):\n\u001b[1;32m   1212\u001b[0m         \u001b[38;5;28;01mfor\u001b[39;00m idx, s \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28menumerate\u001b[39m(r):\n",
-      "File \u001b[0;32m/opt/conda/envs/tobler/lib/python3.11/site-packages/dask/dataframe/groupby.py:1255\u001b[0m, in \u001b[0;36m_apply_func_to_column\u001b[0;34m()\u001b[0m\n\u001b[1;32m   1252\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m column \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m   1253\u001b[0m     \u001b[38;5;28;01mreturn\u001b[39;00m func(df_like)\n\u001b[0;32m-> 1255\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m func(df_like[column])\n",
-      "File \u001b[0;32m/opt/conda/envs/tobler/lib/python3.11/site-packages/pandas/_libs/groupby.pyx:717\u001b[0m, in \u001b[0;36mpandas._libs.groupby.group_sum\u001b[0;34m()\u001b[0m\n\u001b[1;32m    715\u001b[0m     t = val\n\u001b[1;32m    716\u001b[0m else:\n\u001b[0;32m--> 717\u001b[0m     t = sumx[lab, j] + val\n\u001b[1;32m    718\u001b[0m sumx[lab, j] = t\n\u001b[1;32m    719\u001b[0m \n",
-      "\u001b[0;31mTypeError\u001b[0m: can only concatenate str (not \"float\") to str"
-     ]
-    }
-   ],
-   "source": [
-    "dtracts = (\n",
-    "    dask_geopandas.from_geopandas(tracts[['geometry', 'rando']], npartitions=16)\n",
-    "    .spatial_shuffle(by='hilbert', shuffle=\"tasks\")\n",
-    ")\n",
-    "\n",
-    "dzips = (\n",
-    "    dask_geopandas.from_geopandas(zip_codes[['ZIP', 'geometry']], npartitions=16)\n",
-    "    .spatial_shuffle(by='hilbert', shuffle=\"tasks\")\n",
-    ")\n",
-    "\n",
-    "cat_dk = tobler.area_weighted.area_interpolate_dask(\n",
-    "    dtracts, dzips, 'ZIP', categorical_variables=['rando']\n",
-    ").compute()"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "id": "08596315-236b-45df-955e-44a98b0a2eba",
-   "metadata": {},
-   "source": [
-    "[DAB]: my hunch is that the error, though cryptic and hard to debug, comes from a worker returning an empty result (perhaps `None`?) which, when it's passed through the aggregation post collection from the workers, raises the error. Further investigation is warranted."
-   ]
   }
  ],
  "metadata": {
diff --git a/tobler/area_weighted/area_interpolate_dask.py b/tobler/area_weighted/area_interpolate_dask.py
index 24b204d..2228172 100755
--- a/tobler/area_weighted/area_interpolate_dask.py
+++ b/tobler/area_weighted/area_interpolate_dask.py
@@ -10,6 +10,8 @@
 from dask.highlevelgraph import HighLevelGraph
 from .area_interpolate import _area_interpolate_binning as area_interpolate
 
+from dask.distributed import print as dprint
+
 def area_interpolate_dask(
     source_dgdf,
     target_dgdf,
@@ -154,11 +156,13 @@ def area_interpolate_dask(
     if categorical_variables is not None:
         out_categorical = (
             transferred
-            [category_vars + [id_col]]
-            .groupby(id_col)
+            [category_vars]
+            .astype(float)
+            .groupby(transferred[id_col])
             .agg({v: 'sum' for v in category_vars})
         )    
         out = out.join(out_categorical, on=id_col)
+    #return transferred
     return out
 
 def id_area_interpolate(
@@ -244,4 +248,5 @@ def id_area_interpolate(
         estimates = estimates.join(
             pandas.DataFrame(index=estimates.index, columns=category_vars_to_add)
         )    
+    #dprint(f"######################\n{estimates}\n######################")
     return estimates

From 4cf10635c8ec408e2f83ab5317484f6006f4cda0 Mon Sep 17 00:00:00 2001
From: Dani Arribas-Bel <daniel.arribas.bel@gmail.com>
Date: Mon, 14 Aug 2023 11:41:40 +0100
Subject: [PATCH 06/16] Remove debugging code

---
 tobler/area_weighted/area_interpolate_dask.py | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/tobler/area_weighted/area_interpolate_dask.py b/tobler/area_weighted/area_interpolate_dask.py
index 2228172..82021ff 100755
--- a/tobler/area_weighted/area_interpolate_dask.py
+++ b/tobler/area_weighted/area_interpolate_dask.py
@@ -10,8 +10,6 @@
 from dask.highlevelgraph import HighLevelGraph
 from .area_interpolate import _area_interpolate_binning as area_interpolate
 
-from dask.distributed import print as dprint
-
 def area_interpolate_dask(
     source_dgdf,
     target_dgdf,
@@ -162,7 +160,6 @@ def area_interpolate_dask(
             .agg({v: 'sum' for v in category_vars})
         )    
         out = out.join(out_categorical, on=id_col)
-    #return transferred
     return out
 
 def id_area_interpolate(
@@ -248,5 +245,4 @@ def id_area_interpolate(
         estimates = estimates.join(
             pandas.DataFrame(index=estimates.index, columns=category_vars_to_add)
         )    
-    #dprint(f"######################\n{estimates}\n######################")
     return estimates

From f71d984a6b88d484adde4ad4970804691f0e846f Mon Sep 17 00:00:00 2001
From: Dani Arribas-Bel <daniel.arribas.bel@gmail.com>
Date: Mon, 14 Aug 2023 20:22:18 +0000
Subject: [PATCH 07/16] Add categorical_frequency

---
 04_area_interpolate_dask.ipynb                | 2 +-
 tobler/area_weighted/area_interpolate.py      | 9 ++++++++-
 tobler/area_weighted/area_interpolate_dask.py | 5 ++++-
 3 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/04_area_interpolate_dask.ipynb b/04_area_interpolate_dask.ipynb
index d368967..128b57b 100644
--- a/04_area_interpolate_dask.ipynb
+++ b/04_area_interpolate_dask.ipynb
@@ -104,7 +104,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
+   "execution_count": 2,
    "id": "d65ac8ec-51e2-4d2d-abb2-96a7519ed749",
    "metadata": {
     "tags": []
diff --git a/tobler/area_weighted/area_interpolate.py b/tobler/area_weighted/area_interpolate.py
index 40bafe9..155d533 100644
--- a/tobler/area_weighted/area_interpolate.py
+++ b/tobler/area_weighted/area_interpolate.py
@@ -212,6 +212,7 @@ def _area_interpolate_binning(
     spatial_index="auto",
     n_jobs=1,
     categorical_variables=None,
+    categorical_frequency=True
 ):
     """
     Area interpolation for extensive, intensive and categorical variables.
@@ -249,6 +250,11 @@ def _area_interpolate_binning(
         available. If `table` is passed, this is ignored.
     categorical_variables : list
         [Optional. Default=None] Columns in dataframes for categorical variables
+    categorical_frequency : Boolean
+        [Optional. Default=True] If True, `estimates` returns the frequency of each
+        value in a categorical variable in every polygon of `target_df` (proportion of
+        area). If False, `estimates` contains the area in every polygon of `target_df`
+        that is occupied by each value of the categorical variable.
 
     Returns
     -------
@@ -357,7 +363,8 @@ def _area_interpolate_binning(
                 )[0]
 
         categorical = pd.DataFrame(categorical)
-        categorical = categorical.div(target_df.area.values, axis="rows")
+        if categorical_frequency is True:
+            categorical = categorical.div(target_df.area.values, axis="rows")
 
     if extensive_variables:
         dfs.append(extensive)
diff --git a/tobler/area_weighted/area_interpolate_dask.py b/tobler/area_weighted/area_interpolate_dask.py
index 2228172..6303cc2 100755
--- a/tobler/area_weighted/area_interpolate_dask.py
+++ b/tobler/area_weighted/area_interpolate_dask.py
@@ -102,7 +102,8 @@ def area_interpolate_dask(
             'auto',
             1,
             categorical_variables,
-            category_vars
+            category_vars,
+            False
         )
         lr = source_dgdf.spatial_partitions.iloc[l]
         rr = target_dgdf.spatial_partitions.iloc[r]
@@ -129,6 +130,7 @@ def area_interpolate_dask(
         n_jobs=1,
         categorical_variables=categorical_variables,
         category_vars=category_vars
+        categorical_frequency=False
     )
     # Build output table
     transferred = dask_geopandas.GeoDataFrame(
@@ -237,6 +239,7 @@ def id_area_interpolate(
         spatial_index=spatial_index,
         n_jobs=n_jobs,
         categorical_variables=categorical_variables,
+        categorical_frequency=False
     )
     estimates[id_col] = target_df[id_col].values
     

From bf157ef42767c2742ad223d237faf98000e1f2b6 Mon Sep 17 00:00:00 2001
From: Dani Arribas-Bel <daniel.arribas.bel@gmail.com>
Date: Mon, 14 Aug 2023 20:51:44 +0000
Subject: [PATCH 08/16] Integrate categorical_frequency into Dask

---
 tobler/area_weighted/area_interpolate_dask.py | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/tobler/area_weighted/area_interpolate_dask.py b/tobler/area_weighted/area_interpolate_dask.py
index ed90071..8a795ac 100755
--- a/tobler/area_weighted/area_interpolate_dask.py
+++ b/tobler/area_weighted/area_interpolate_dask.py
@@ -17,6 +17,7 @@ def area_interpolate_dask(
     extensive_variables=None,
     intensive_variables=None,
     categorical_variables=None,
+    categorical_frequency=True
 ):
     '''
     Out-of-core and parallel area interpolation for categorical variables.
@@ -45,6 +46,12 @@ def area_interpolate_dask(
         [Optional. Default=None] Columns in `source_dgdf` for categorical variables       
         IMPORTANT: categorical variables must be of type `'category[known]'`. This is so
         all categories are known ahead of time and Dask can run lazily.
+    categorical_frequency : Boolean
+        [Optional. Default=True] If True, the output contains the frequency of each
+        value in a categorical variable in every polygon of `target_dgdf` (proportion
+        of area). If False, the output contains the area in every polygon of
+        `target_dgdf` that is occupied by each value of the categorical variable.
+
 
     Returns
     -------
@@ -101,7 +108,6 @@ def area_interpolate_dask(
             1,
             categorical_variables,
             category_vars,
-            False
         )
         lr = source_dgdf.spatial_partitions.iloc[l]
         rr = target_dgdf.spatial_partitions.iloc[r]
@@ -127,8 +133,7 @@ def area_interpolate_dask(
         spatial_index='auto',
         n_jobs=1,
         categorical_variables=categorical_variables,
-        category_vars=category_vars
-        categorical_frequency=False
+        category_vars=category_vars,
     )
     # Build output table
     transferred = dask_geopandas.GeoDataFrame(
@@ -162,6 +167,11 @@ def area_interpolate_dask(
             .agg({v: 'sum' for v in category_vars})
         )    
         out = out.join(out_categorical, on=id_col)
+        if categorical_frequency is True:
+            cols = out_categorical.columns.tolist()
+            out[cols] = out[cols].div(
+                out.area, axis='index'
+            )
     return out
 
 def id_area_interpolate(

From c2537165f46a3a1d6b01101564168252c42bbfce Mon Sep 17 00:00:00 2001
From: Dani Arribas-Bel <daniel.arribas.bel@gmail.com>
Date: Wed, 16 Aug 2023 14:40:08 +0000
Subject: [PATCH 09/16] Move notebook to notebooks folder

---
 .ci/310.yml                                                  | 3 +++
 .ci/311.yml                                                  | 3 +++
 .ci/39.yml                                                   | 5 ++++-
 .../04_area_interpolate_dask.ipynb                           | 0
 4 files changed, 10 insertions(+), 1 deletion(-)
 rename 04_area_interpolate_dask.ipynb => notebooks/04_area_interpolate_dask.ipynb (100%)

diff --git a/.ci/310.yml b/.ci/310.yml
index f894ed7..e666b7a 100644
--- a/.ci/310.yml
+++ b/.ci/310.yml
@@ -3,6 +3,9 @@ channels:
   - conda-forge
 dependencies:
   - python=3.10
+  - dask
+  - dask-geopandas
+  - distributed
   - jupyterlab
   - numpy
   - geopandas
diff --git a/.ci/311.yml b/.ci/311.yml
index a09fcd2..a533e59 100644
--- a/.ci/311.yml
+++ b/.ci/311.yml
@@ -4,6 +4,9 @@ channels:
 dependencies:
   - python=3.11
   - jupyterlab
+  - dask
+  - dask-geopandas
+  - distributed
   - numpy
   - geopandas
   - pandas
diff --git a/.ci/39.yml b/.ci/39.yml
index 3eceed8..d029839 100644
--- a/.ci/39.yml
+++ b/.ci/39.yml
@@ -3,6 +3,9 @@ channels:
   - conda-forge
 dependencies:
   - python=3.9
+  - dask
+  - dask-geopandas
+  - distributed
   - numpy
   - geopandas
   - pandas
@@ -29,4 +32,4 @@ dependencies:
   - numpydoc
   - nbsphinx
   - joblib
-  - astropy
\ No newline at end of file
+  - astropy
diff --git a/04_area_interpolate_dask.ipynb b/notebooks/04_area_interpolate_dask.ipynb
similarity index 100%
rename from 04_area_interpolate_dask.ipynb
rename to notebooks/04_area_interpolate_dask.ipynb

From 07469c6ec2b1d6f85ef42ab6baec0e0140d33f24 Mon Sep 17 00:00:00 2001
From: Dani Arribas-Bel <daniel.arribas.bel@gmail.com>
Date: Wed, 16 Aug 2023 14:58:04 +0000
Subject: [PATCH 10/16] Add test for dask_ai

---
 tobler/tests/test_area_interpolators.py | 30 ++++++++++++++++++++++++-
 1 file changed, 29 insertions(+), 1 deletion(-)

diff --git a/tobler/tests/test_area_interpolators.py b/tobler/tests/test_area_interpolators.py
index 27cd829..db49f56 100644
--- a/tobler/tests/test_area_interpolators.py
+++ b/tobler/tests/test_area_interpolators.py
@@ -1,9 +1,11 @@
 """test interpolation functions."""
 import geopandas
+import dask_geopandas
 
 from libpysal.examples import load_example
 from numpy.testing import assert_almost_equal
 from tobler.area_weighted import area_interpolate
+from tobler.area_weighted import area_interpolate_dask
 from tobler.area_weighted.area_interpolate import _area_tables_binning
 from geopandas.testing import assert_geodataframe_equal
 import pytest
@@ -79,6 +81,32 @@ def test_area_interpolate_categorical():
     assert_almost_equal(area.animal_capybara.sum(), 20, decimal=0)
 
 
+def test_area_interpolate_categorical_dask():
+    sac1, sac2 = datasets()
+    sac1['animal'] = sac1['animal'].astype('category')
+    dsac1 = (
+            dask_geopandas.from_geopandas(sac1, npartitions=2)
+            .spatial_shuffle(by='hilbert', shuffle='tasks')
+    )
+    dsac2 = (
+            dask_geopandas.from_geopandas(sac2, npartitions=2)
+            .spatial_shuffle(by='hilbert', shuffle='tasks')
+    )
+    area = area_interpolate_dask.area_interpolate_dask(
+        source_df=sac1,
+        target_df=sac2,
+        extensive_variables=["TOT_POP"],
+        intensive_variables=["pct_poverty"],
+        categorical_variables=["animal"],
+        n_jobs=1,
+    )
+    assert_almost_equal(area.animal_cat.sum(), 32, decimal=0)
+    assert_almost_equal(area.animal_dog.sum(), 19, decimal=0)
+    assert_almost_equal(area.animal_donkey.sum(), 22, decimal=0)
+    assert_almost_equal(area.animal_wombat.sum(), 23, decimal=0)
+    assert_almost_equal(area.animal_capybara.sum(), 20, decimal=0)
+
+
 def test_area_interpolate_custom_index():
     sac1, sac2 = datasets()
     sac1.index = sac1.index * 2
@@ -193,4 +221,4 @@ def test_passed_table():
         table=dok,
     )
     assert_almost_equal(area.TOT_POP.sum(), 1796856, decimal=0)
-    assert_almost_equal(area.pct_poverty.sum(), 2140, decimal=0)
\ No newline at end of file
+    assert_almost_equal(area.pct_poverty.sum(), 2140, decimal=0)

From 96d5831e4b3754f950e1324f27303072c17c4a05 Mon Sep 17 00:00:00 2001
From: Dani Arribas-Bel <daniel.arribas.bel@gmail.com>
Date: Fri, 18 Aug 2023 13:53:00 +0100
Subject: [PATCH 11/16] Change test as suggested by @knaaptime

---
 tobler/tests/test_area_interpolators.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tobler/tests/test_area_interpolators.py b/tobler/tests/test_area_interpolators.py
index db49f56..d5adf61 100644
--- a/tobler/tests/test_area_interpolators.py
+++ b/tobler/tests/test_area_interpolators.py
@@ -92,7 +92,7 @@ def test_area_interpolate_categorical_dask():
             dask_geopandas.from_geopandas(sac2, npartitions=2)
             .spatial_shuffle(by='hilbert', shuffle='tasks')
     )
-    area = area_interpolate_dask.area_interpolate_dask(
+    area = area_interpolate_dask(
         source_df=sac1,
         target_df=sac2,
         extensive_variables=["TOT_POP"],

From 9f20b02a556d4fe0442d677a7d7a8c5657778d02 Mon Sep 17 00:00:00 2001
From: Dani Arribas-Bel <daniel.arribas.bel@gmail.com>
Date: Fri, 18 Aug 2023 14:01:37 +0100
Subject: [PATCH 12/16] Add optional imports for dask(-geopandas)

---
 tobler/area_weighted/area_interpolate_dask.py | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/tobler/area_weighted/area_interpolate_dask.py b/tobler/area_weighted/area_interpolate_dask.py
index 8a795ac..7f3b26e 100755
--- a/tobler/area_weighted/area_interpolate_dask.py
+++ b/tobler/area_weighted/area_interpolate_dask.py
@@ -4,11 +4,18 @@
 
 import pandas
 import geopandas
-import dask_geopandas
 import numpy as np
-from dask.base import tokenize
-from dask.highlevelgraph import HighLevelGraph
 from .area_interpolate import _area_interpolate_binning as area_interpolate
+try:
+    import dask_geopandas
+    from dask.base import tokenize
+    from dask.highlevelgraph import HighLevelGraph
+except ImportError:
+    raise ImportError(
+        "Area interpolation with Dask requires `dask` and "
+        "`dask_geopandas` installed to run. Please install them "
+        "before importing this functionality."
+    )
 
 def area_interpolate_dask(
     source_dgdf,

From 6a9cc3f30d1d12f6cf43c56f6d60f285c4158e9a Mon Sep 17 00:00:00 2001
From: Dani Arribas-Bel <daniel.arribas.bel@gmail.com>
Date: Fri, 18 Aug 2023 14:04:46 +0100
Subject: [PATCH 13/16] Minor fix for dask test

---
 tobler/tests/test_area_interpolators.py | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/tobler/tests/test_area_interpolators.py b/tobler/tests/test_area_interpolators.py
index d5adf61..9070aa9 100644
--- a/tobler/tests/test_area_interpolators.py
+++ b/tobler/tests/test_area_interpolators.py
@@ -93,12 +93,9 @@ def test_area_interpolate_categorical_dask():
             .spatial_shuffle(by='hilbert', shuffle='tasks')
     )
     area = area_interpolate_dask(
-        source_df=sac1,
-        target_df=sac2,
-        extensive_variables=["TOT_POP"],
-        intensive_variables=["pct_poverty"],
+        source_dgdf=sac1,
+        target_dgdf=sac2,
         categorical_variables=["animal"],
-        n_jobs=1,
     )
     assert_almost_equal(area.animal_cat.sum(), 32, decimal=0)
     assert_almost_equal(area.animal_dog.sum(), 19, decimal=0)

From d6ed8f602f37aa9b66206822b7a3b962adb7c02c Mon Sep 17 00:00:00 2001
From: Dani Arribas-Bel <daniel.arribas.bel@gmail.com>
Date: Fri, 18 Aug 2023 14:21:55 +0100
Subject: [PATCH 14/16] more typo fixing of dask tests

---
 tobler/tests/test_area_interpolators.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tobler/tests/test_area_interpolators.py b/tobler/tests/test_area_interpolators.py
index 9070aa9..94a397a 100644
--- a/tobler/tests/test_area_interpolators.py
+++ b/tobler/tests/test_area_interpolators.py
@@ -95,6 +95,7 @@ def test_area_interpolate_categorical_dask():
     area = area_interpolate_dask(
         source_dgdf=sac1,
         target_dgdf=sac2,
+        id_col='ZIP',
         categorical_variables=["animal"],
     )
     assert_almost_equal(area.animal_cat.sum(), 32, decimal=0)

From d316125a139d571742d17a497fde7f09bdb980b4 Mon Sep 17 00:00:00 2001
From: Dani Arribas-Bel <daniel.arribas.bel@gmail.com>
Date: Fri, 18 Aug 2023 14:35:26 +0100
Subject: [PATCH 15/16] More typos...

---
 tobler/tests/test_area_interpolators.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tobler/tests/test_area_interpolators.py b/tobler/tests/test_area_interpolators.py
index 94a397a..4531ea3 100644
--- a/tobler/tests/test_area_interpolators.py
+++ b/tobler/tests/test_area_interpolators.py
@@ -93,8 +93,8 @@ def test_area_interpolate_categorical_dask():
             .spatial_shuffle(by='hilbert', shuffle='tasks')
     )
     area = area_interpolate_dask(
-        source_dgdf=sac1,
-        target_dgdf=sac2,
+        source_dgdf=dsac1,
+        target_dgdf=dsac2,
         id_col='ZIP',
         categorical_variables=["animal"],
     )

From 17841d328ff99ae7a49b8228c354262584b98c78 Mon Sep 17 00:00:00 2001
From: Dani Arribas-Bel <daniel.arribas.bel@gmail.com>
Date: Fri, 18 Aug 2023 14:45:01 +0100
Subject: [PATCH 16/16] loading dask gdf to memory for tests

---
 tobler/tests/test_area_interpolators.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tobler/tests/test_area_interpolators.py b/tobler/tests/test_area_interpolators.py
index 4531ea3..bc8791b 100644
--- a/tobler/tests/test_area_interpolators.py
+++ b/tobler/tests/test_area_interpolators.py
@@ -97,7 +97,7 @@ def test_area_interpolate_categorical_dask():
         target_dgdf=dsac2,
         id_col='ZIP',
         categorical_variables=["animal"],
-    )
+    ).compute()
     assert_almost_equal(area.animal_cat.sum(), 32, decimal=0)
     assert_almost_equal(area.animal_dog.sum(), 19, decimal=0)
     assert_almost_equal(area.animal_donkey.sum(), 22, decimal=0)