From 7e70e54fa291df08b3d1989338953155c92f4cdb Mon Sep 17 00:00:00 2001
From: dschuck <chuck@developmentseed.org>
Date: Tue, 17 May 2022 00:05:47 +0000
Subject: [PATCH] Add columns and query expression for subsetting

Allow user to control specific columns to include in output, as well as
query expression for selecting/filtering rows.  This might also allow
the final output file to be small enough to load completely into memory.

Also, refactor subsetting logic to improve performance.
---
 .markdownlint.yaml                |   3 +
 gedi-subset/CHANGELOG.md          |  15 +-
 gedi-subset/README.md             |  27 ++-
 gedi-subset/algorithm_config.yaml |   6 +-
 gedi-subset/gedi_utils.py         |  40 ++++-
 gedi-subset/hdf5.ipynb            | 272 ++++++++++++++++++++++++++++++
 gedi-subset/osx.py                |   4 +-
 gedi-subset/subset.py             |  49 ++++--
 gedi-subset/subset.sh             |  16 +-
 9 files changed, 403 insertions(+), 29 deletions(-)
 create mode 100644 .markdownlint.yaml
 create mode 100644 gedi-subset/hdf5.ipynb

diff --git a/.markdownlint.yaml b/.markdownlint.yaml
new file mode 100644
index 0000000..c673096
--- /dev/null
+++ b/.markdownlint.yaml
@@ -0,0 +1,3 @@
+default: true
+MD024: # no-duplicate-heading/no-duplicate-header
+  allow_different_nesting: true
diff --git a/gedi-subset/CHANGELOG.md b/gedi-subset/CHANGELOG.md
index 9bc4c45..cd1ad2c 100644
--- a/gedi-subset/CHANGELOG.md
+++ b/gedi-subset/CHANGELOG.md
@@ -7,7 +7,18 @@ variation of [Semantic Versioning], with the following difference: each version
 is prefixed with `gedi-subset-` (e.g., `gedi-subset-0.1.0`) to allow for
 distinct lines of versioning of independent work in sibling directories.
 
-## [0.1.0] - 2022-05-26
+## [gedi-subset-0.2.0] - 2022-06-01
+
+### Added
+
+- Added inputs `columns` and `query` to refine filtering/subsetting.  See
+  `gedi-subset/README.md` for details.
+
+### Changed
+
+- Improved performance of subsetting/filtering logic, resulting in ~5x speedup.
+
+## [gedi-subset-0.1.0] - 2022-06-01
 
 ### Added
 
@@ -17,4 +28,4 @@ distinct lines of versioning of independent work in sibling directories.
 [Keep a Changelog]:
     https://keepachangelog.com/en/1.0.0/
 [Semantic Versioning]:
-    https://semver.org/spec/v2.0.0.html
\ No newline at end of file
+    https://semver.org/spec/v2.0.0.html
diff --git a/gedi-subset/README.md b/gedi-subset/README.md
index 4c10c31..e0f4590 100644
--- a/gedi-subset/README.md
+++ b/gedi-subset/README.md
@@ -25,9 +25,25 @@ At a high level, the GEDI subsetting algorithm does the following:
 
 To run a GEDI subsetting DPS job, you must supply the following inputs:
 
-- `aoi`: URL to a GeoJSON file representing your area of interest
+- `aoi` (**required**): URL to a GeoJSON file representing your area of interest
+- `columns`: Comma-separated list of column names to include in output file.
+  (**Default:** `agbd, agbd_se, l2_quality_flag, l4_quality_flag,
+  lat_lowestmode, lon_lowestmode, sensitivity, sensitivity_a2`)
+- `query`: Query expression for subsetting the rows in the output file.
+  **IMPORTANT:** The `columns` input must contain at least all of the columns
+  that appear in this query expression, otherwise an error will occur.
+  (**Default:** `l2_quality_flag == 1 and l4_quality_flag == 1 and sensitivity >
+  0.95 and sensitivity_a2 > 0.95`)
 - `limit`: Maximum number of GEDI granule data files to download (among those
-  that intersect the specified AOI)
+  that intersect the specified AOI).  (**Default:** 10,000)
+
+**IMPORTANT:** When supplying input values via the ADE UI, for convenience, to
+accept _all_ default values, you may leave _all_ optional inputs blank.
+However, if you supply a value for _any_ optional input, you must enter a dash
+(`-`) as the input value for _all other_ optional inputs.  This ensures that
+the input values remain correctly ordered for the underlying script to which the
+inputs are supplied.  Otherwise, your job may fail due to invalid script
+arguments, or might produce unpredictable results.
 
 If your AOI is a publicly available geoBoundary, see
 [Getting the GeoJSON URL for a geoBoundary](#getting-the-geojson-url-for-a-geoboundary)
@@ -233,7 +249,7 @@ able to register the new version of the algorithm, as follows, within the ADE:
 1. Pull the latest code from GitHub (to obtain merged PR, if necessary):
 
    ```bash
-   git pull origin
+   git pull origin main
    git checkout main
    ```
 
@@ -242,6 +258,7 @@ able to register the new version of the algorithm, as follows, within the ADE:
 
    ```bash
    git push --all ade
+   git push --tags ade
    ```
 
 1. In the ADE's File Browser, navigate to
@@ -263,7 +280,9 @@ able to register the new version of the algorithm, as follows, within the ADE:
 
 Country Boundaries from:
 
-Runfola, D. et al. (2020) geoBoundaries: A global database of political administrative boundaries. PLoS ONE 15(4): e0231866. <https://doi.org/10.1371/journal.pone.0231866>
+Runfola, D. et al. (2020) geoBoundaries: A global database of political
+administrative boundaries.  PLoS ONE 15(4): e0231866.
+<https://doi.org/10.1371/journal.pone.0231866>
 
 [geoBoundaries]:
   https://www.geoboundaries.org
diff --git a/gedi-subset/algorithm_config.yaml b/gedi-subset/algorithm_config.yaml
index c75f4ce..4342c84 100644
--- a/gedi-subset/algorithm_config.yaml
+++ b/gedi-subset/algorithm_config.yaml
@@ -1,6 +1,6 @@
 description: Subset GEDI L4A granules within an area of interest (AOI)
 algo_name: gedi-subset
-version: gedi-subset-0.1.0
+version: gedi-subset-0.2.0
 environment: ubuntu
 repository_url: https://repo.ops.maap-project.org/data-team/maap-documentation-examples.git
 docker_url: mas.maap-project.org:5000/root/ade-base-images/r:latest
@@ -11,5 +11,9 @@ disk_space: 20GB
 inputs:
   - name: aoi
     download: True
+  - name: columns
+    download: False
+  - name: query
+    download: False
   - name: limit
     download: False
diff --git a/gedi-subset/gedi_utils.py b/gedi-subset/gedi_utils.py
index f85b237..2382bfd 100644
--- a/gedi-subset/gedi_utils.py
+++ b/gedi-subset/gedi_utils.py
@@ -3,7 +3,7 @@
 import os
 import os.path
 import warnings
-from typing import Any, Callable, Mapping, Sequence, TypeVar, Union
+from typing import Any, Callable, List, Mapping, Sequence, TypeVar, Union
 
 import h5py
 import numpy as np
@@ -67,7 +67,7 @@ def df_assign(col_name: str, val: Any, df: _DF) -> _DF:
 
 @curry
 def append_message(extra_message: str, e: Exception) -> Exception:
-    message, *other_args = e.args if e.args else ("",)
+    message, *other_args = e.args if e.args else ("",)  # pytype: disable=bad-unpacking
     new_message = f"{message}: {extra_message}" if message else extra_message
     e.args = (new_message, *other_args)
 
@@ -177,7 +177,7 @@ def spatial_filter(beam, aoi):
 
 @curry
 def subset_h5(
-    path: Union[str, os.PathLike], aoi: gpd.GeoDataFrame, filter_cols: Sequence[str]
+    path: Union[str, os.PathLike], aoi: gpd.GeoDataFrame, filter_cols: Sequence[str], expr: str
 ) -> gpd.GeoDataFrame:
     """
     Extract the beam data only for the aoi and only columns of interest
@@ -226,10 +226,10 @@ def subset_h5(
                             col_val.append(value[:][indices].tolist())
 
                 # create a pandas dataframe
-                beam_df = pd.DataFrame(map(list, zip(*col_val)), columns=col_names)
+                beam_df = pd.DataFrame(map(list, zip(*col_val)), columns=col_names).query(expr)
                 # Inserting BEAM names
                 beam_df.insert(
-                    0, "BEAM", np.repeat(str(v), len(beam_df.index)).tolist()
+                    0, "BEAM", np.repeat(v[5:], len(beam_df.index)).tolist()
                 )
                 # Appending to the subset_df dataframe
                 subset_df = pd.concat([subset_df, beam_df])
@@ -250,6 +250,36 @@ def subset_h5(
     return subset_gdf
 
 
+def subset_hdf5(
+    path: str,  # location of the HDF5 file to open
+    aoi: gpd.GeoDataFrame,  # NOTE(review): only `aoi.geometry[0]` is consulted
+    columns: Sequence[str],  # must include lon_lowestmode and lat_lowestmode
+    expr: str,  # `DataFrame.query` expression applied to each beam's rows
+) -> gpd.GeoDataFrame:
+    def subset_beam(beam: h5py.Group) -> gpd.GeoDataFrame:
+        def append_series(path: str, value: Union[h5py.Group, h5py.Dataset]) -> None:
+            if (name := path.split("/")[-1]) in columns:
+                series.append(pd.Series(value, name=name))
+
+        series: List[pd.Series] = []
+        beam.visititems(append_series)
+        df = pd.concat(series, axis=1).query(expr)
+        df.insert(0, "BEAM", beam.name[5:])
+        # Replace the lon/lat columns with EPSG:4326 point geometry.
+        x, y = df.lon_lowestmode, df.lat_lowestmode
+        df.drop(["lon_lowestmode", "lat_lowestmode"], axis=1, inplace=True)
+        gdf = gpd.GeoDataFrame(df, geometry=gpd.points_from_xy(x, y), crs="EPSG:4326")
+        # NOTE(review): only aoi.geometry[0] is tested; any other rows are ignored.
+        return gdf[gdf.geometry.within(aoi.geometry[0])]
+
+    with h5py.File(path) as hdf5:
+        beams = (value for key, value in hdf5.items() if key.startswith("BEAM"))
+        beam_dfs = (subset_beam(beam) for beam in beams)
+        beams_df = pd.concat(beam_dfs, ignore_index=True, copy=False)
+
+    return beams_df
+
+
 def write_subset(infile, gdf):
     """
     Write GeoDataFrame to Flatgeobuf
diff --git a/gedi-subset/hdf5.ipynb b/gedi-subset/hdf5.ipynb
new file mode 100644
index 0000000..fe688b5
--- /dev/null
+++ b/gedi-subset/hdf5.ipynb
@@ -0,0 +1,272 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 79,
+   "id": "collect-charlotte",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from functools import reduce\n",
+    "\n",
+    "import geopandas as gpd\n",
+    "import h5py\n",
+    "import numpy as np\n",
+    "import pandas as pd"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 80,
+   "id": "broken-scale",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "path = '/projects/maap-documentation-examples/output/GEDI04_A_2019111040155_O02008_02_T04616_02_002_02_V002.h5'\n",
+    "columns = [\n",
+    "    \"agbd\",\n",
+    "    \"agbd_se\",\n",
+    "    \"l2_quality_flag\",\n",
+    "    \"l4_quality_flag\",\n",
+    "    \"lat_lowestmode\",\n",
+    "    \"lon_lowestmode\",\n",
+    "    \"sensitivity\",\n",
+    "    \"sensitivity_a2\",\n",
+    "]\n",
+    "query = \"l2_quality_flag == 1 and l4_quality_flag == 1 and sensitivity > 0.95 and sensitivity_a2 > 0.95\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 103,
+   "id": "compatible-thompson",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def subset_h5(path, filter_cols, query):\n",
+    "    \"\"\"\n",
+    "    Extract the beam data only for the aoi and only columns of interest\n",
+    "    \"\"\"\n",
+    "    \n",
+    "    subset_df = pd.DataFrame()\n",
+    "    \n",
+    "    with h5py.File(path, \"r\") as hf_in:\n",
+    "        # loop through BEAMXXXX groups\n",
+    "        for k, beam in ((k, v) for k, v in hf_in.items() if k.startswith(\"BEAM\")):\n",
+    "            col_names = []\n",
+    "            col_val = []\n",
+    "#             indices = spatial_filter(beam, aoi)\n",
+    "            items = (\n",
+    "                (k, v)\n",
+    "                for k, v in beam.items()\n",
+    "                if k in filter_cols or isinstance(v, h5py.Group)\n",
+    "            )\n",
+    "\n",
+    "            for key, value in items:\n",
+    "                # looping through subgroups\n",
+    "                if isinstance(value, h5py.Group):\n",
+    "                    items2 = ((k2, _) for k2, _ in value.items() if k2 in filter_cols)\n",
+    "                    for key2, value2 in items2:\n",
+    "                        # xvar variables have 2D\n",
+    "                        if key2.startswith(\"xvar\"):\n",
+    "                            for r in range(4):\n",
+    "                                col_names.append(key2 + \"_\" + str(r + 1))\n",
+    "                                col_val.append(value2[(), r].tolist())\n",
+    "                        else:\n",
+    "                            col_names.append(key2)\n",
+    "                            col_val.append(value2[()])\n",
+    "                # looping through base group\n",
+    "                elif key.startswith(\"xvar\"):\n",
+    "                    # xvar variables have 2D\n",
+    "                    for r in range(4):\n",
+    "                        col_names.append(key + \"_\" + str(r + 1))\n",
+    "                        col_val.append(value[(), r])\n",
+    "                else:\n",
+    "                    col_names.append(key)\n",
+    "                    col_val.append(value[()])\n",
+    "\n",
+    "            # create a pandas dataframe\n",
+    "            beam_df = pd.DataFrame(map(list, zip(*col_val)), columns=col_names)\n",
+    "            beam_df.query(query, inplace=True)\n",
+    "            # Inserting BEAM names\n",
+    "            beam_df.insert(0, \"BEAM\", np.repeat(str(k), len(beam_df.index)))\n",
+    "            # Appending to the subset_df dataframe\n",
+    "            subset_df = pd.concat([subset_df, beam_df])\n",
+    "        \n",
+    "    return subset_df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 104,
+   "id": "covered-figure",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "       BEAM        agbd    agbd_se  sensitivity_a2  l2_quality_flag  \\\n",
+      "0  BEAM0000  141.049683  17.123022        0.978424                1   \n",
+      "1  BEAM0000   95.756226  17.124018        0.979147                1   \n",
+      "2  BEAM0000   90.346252  17.123966        0.966114                1   \n",
+      "3  BEAM0000  113.583145  17.124689        0.977122                1   \n",
+      "4  BEAM0000   93.160324  17.123558        0.983254                1   \n",
+      "\n",
+      "   l4_quality_flag  lat_lowestmode  lon_lowestmode  sensitivity  \n",
+      "0                1        0.097697        9.372999     0.988014  \n",
+      "1                1        0.098120        9.373297     0.986098  \n",
+      "2                1        0.098542        9.373594     0.966114  \n",
+      "3                1        0.098964        9.373892     0.983985  \n",
+      "4                1        0.099386        9.374189     0.990431  \n",
+      "                agbd        agbd_se  sensitivity_a2  l2_quality_flag  \\\n",
+      "count  290661.000000  290661.000000   290661.000000         290661.0   \n",
+      "mean        8.946183       3.448750        0.986348              1.0   \n",
+      "std        48.692471       2.441094        0.005411              0.0   \n",
+      "min         0.516440       2.981795        0.950001              1.0   \n",
+      "25%         0.870713       3.005720        0.982702              1.0   \n",
+      "50%         1.061925       3.007290        0.986877              1.0   \n",
+      "75%         1.391625       3.008365        0.990650              1.0   \n",
+      "max      2762.396240      17.585686        0.998231              1.0   \n",
+      "\n",
+      "       l4_quality_flag  lat_lowestmode  lon_lowestmode    sensitivity  \n",
+      "count         290661.0   290661.000000   290661.000000  290661.000000  \n",
+      "mean               1.0       27.329840       35.047115       0.967601  \n",
+      "std                0.0       11.926019       18.554284       0.010686  \n",
+      "min                1.0        0.062150        9.372999       0.950000  \n",
+      "25%                1.0       19.848514       24.166614       0.957756  \n",
+      "50%                1.0       24.433955       28.213979       0.967372  \n",
+      "75%                1.0       31.656327       35.637879       0.977670  \n",
+      "max                1.0       51.818693       92.572213       0.997179  \n",
+      "CPU times: user 4.52 s, sys: 249 ms, total: 4.77 s\n",
+      "Wall time: 5.75 s\n"
+     ]
+    }
+   ],
+   "source": [
+    "%%time\n",
+    "\n",
+    "result = subset_h5(path, columns, query)\n",
+    "\n",
+    "print(result.head())\n",
+    "print(result.describe())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 110,
+   "id": "enhanced-bargain",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def subset_beam(beam, columns, query):\n",
+    "    def append_series(path, value):\n",
+    "        if (name := path.split(\"/\")[-1]) in columns:\n",
+    "            series.append(pd.Series(value, name=name))\n",
+    "\n",
+    "    series = []\n",
+    "    beam.visititems(append_series)\n",
+    "\n",
+    "    df = pd.concat(series, axis=1)\n",
+    "    df.query(query, inplace=True)\n",
+    "    df.insert(0, \"BEAM\", np.repeat(beam.name[5:], len(df.index)))\n",
+    "    \n",
+    "    return df\n",
+    "\n",
+    "\n",
+    "def subset_hdf5(path, columns, query):\n",
+    "    with h5py.File(path) as hdf5:\n",
+    "        beams = (value for key, value in hdf5.items() if key.startswith(\"BEAM\"))\n",
+    "        beam_dfs = (subset_beam(beam, columns, query) for beam in beams)\n",
+    "        beams_df = reduce(lambda df0, df1: pd.concat([df0, df1], copy=False), beam_dfs)\n",
+    "    \n",
+    "    return beams_df"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 111,
+   "id": "another-pixel",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "   BEAM        agbd    agbd_se  sensitivity_a2  l2_quality_flag  \\\n",
+      "0  0000  141.049683  17.123022        0.978424                1   \n",
+      "1  0000   95.756226  17.124018        0.979147                1   \n",
+      "2  0000   90.346252  17.123966        0.966114                1   \n",
+      "3  0000  113.583145  17.124689        0.977122                1   \n",
+      "4  0000   93.160324  17.123558        0.983254                1   \n",
+      "\n",
+      "   l4_quality_flag  lat_lowestmode  lon_lowestmode  sensitivity  \n",
+      "0                1        0.097697        9.372999     0.988014  \n",
+      "1                1        0.098120        9.373297     0.986098  \n",
+      "2                1        0.098542        9.373594     0.966114  \n",
+      "3                1        0.098964        9.373892     0.983985  \n",
+      "4                1        0.099386        9.374189     0.990431  \n",
+      "                agbd        agbd_se  sensitivity_a2  l2_quality_flag  \\\n",
+      "count  290661.000000  290661.000000   290661.000000         290661.0   \n",
+      "mean        8.946183       3.448750        0.986348              1.0   \n",
+      "std        48.692471       2.441094        0.005411              0.0   \n",
+      "min         0.516440       2.981795        0.950001              1.0   \n",
+      "25%         0.870713       3.005720        0.982702              1.0   \n",
+      "50%         1.061925       3.007290        0.986877              1.0   \n",
+      "75%         1.391625       3.008365        0.990650              1.0   \n",
+      "max      2762.396240      17.585686        0.998231              1.0   \n",
+      "\n",
+      "       l4_quality_flag  lat_lowestmode  lon_lowestmode    sensitivity  \n",
+      "count         290661.0   290661.000000   290661.000000  290661.000000  \n",
+      "mean               1.0       27.329840       35.047115       0.967601  \n",
+      "std                0.0       11.926019       18.554284       0.010686  \n",
+      "min                1.0        0.062150        9.372999       0.950000  \n",
+      "25%                1.0       19.848514       24.166614       0.957756  \n",
+      "50%                1.0       24.433955       28.213979       0.967372  \n",
+      "75%                1.0       31.656327       35.637879       0.977670  \n",
+      "max                1.0       51.818693       92.572213       0.997179  \n",
+      "CPU times: user 661 ms, sys: 81 ms, total: 742 ms\n",
+      "Wall time: 2.01 s\n"
+     ]
+    }
+   ],
+   "source": [
+    "%%time\n",
+    "\n",
+    "result = subset_hdf5(path, columns, query)\n",
+    "       \n",
+    "print(result.head())\n",
+    "print(result.describe())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "plastic-russell",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.4"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/gedi-subset/osx.py b/gedi-subset/osx.py
index 0a06d34..3716db0 100644
--- a/gedi-subset/osx.py
+++ b/gedi-subset/osx.py
@@ -12,11 +12,11 @@
 
 import os
 import os.path
-from typing import TypeAlias
+from typing import TypeAlias, Union
 
 from returns.io import IOResultE, impure_safe
 
-StrPath: TypeAlias = str | os.PathLike[str]
+StrPath: TypeAlias = Union[str, os.PathLike[str]]
 
 exists = impure_safe(os.path.exists)
 
diff --git a/gedi-subset/subset.py b/gedi-subset/subset.py
index 9c46941..baba1ff 100755
--- a/gedi-subset/subset.py
+++ b/gedi-subset/subset.py
@@ -7,7 +7,7 @@
 from dataclasses import dataclass
 from enum import Enum
 from pathlib import Path
-from typing import Any, Iterable, Tuple
+from typing import Any, Iterable, List, Sequence, Tuple
 
 import geopandas as gpd
 import osx
@@ -20,7 +20,7 @@
     gdf_to_file,
     gdf_to_parquet,
     granule_intersects,
-    subset_h5,
+    subset_hdf5,
 )
 from maap.maap import MAAP
 from maap.Result import Granule
@@ -60,6 +60,8 @@ class SubsetGranuleProps:
     granule: Granule
     maap: MAAP
     aoi_gdf: gpd.GeoDataFrame
+    columns: Sequence[str]
+    query: str
     output_dir: Path
 
 
@@ -77,19 +79,14 @@ def subset_granule(props: SubsetGranuleProps) -> Maybe[str]:
     GeoParquet file.
     """
 
-    filter_cols = [
-        "agbd",
-        "agbd_se",
-        "l4_quality_flag",
-        "sensitivity",
-        "lat_lowestmode",
-        "lon_lowestmode",
-    ]
     io_result = download_granule(props.maap, str(props.output_dir), props.granule)
     inpath = unsafe_perform_io(io_result.alt(raise_exception).unwrap())
 
     logger.debug(f"Subsetting {inpath}")
-    gdf = df_assign("filename", inpath, subset_h5(inpath, props.aoi_gdf, filter_cols))
+    gdf: gpd.GeoDataFrame = flow(
+        subset_hdf5(inpath, props.aoi_gdf, props.columns, props.query),
+        df_assign("filename", inpath),
+    )
     osx.remove(inpath)
 
     if gdf.empty:
@@ -115,6 +112,8 @@ def set_logging_level(logging_level: int) -> None:
 def subset_granules(
     maap: MAAP,
     aoi_gdf: gpd.GeoDataFrame,
+    columns: Sequence[str],
+    query: str,
     output_dir: Path,
     dest: Path,
     init_args: Tuple[Any, ...],
@@ -142,7 +141,8 @@ def append_subset(src: str) -> IOResultE[str]:
     chunksize = 10
     processes = os.cpu_count()
     payloads = (
-        SubsetGranuleProps(granule, maap, aoi_gdf, output_dir) for granule in granules
+        SubsetGranuleProps(granule, maap, aoi_gdf, columns, query, output_dir)
+        for granule in granules
     )
 
     logger.info(f"Subsetting on {processes} processes (chunksize={chunksize})")
@@ -169,7 +169,6 @@ def main(
         resolve_path=True,
     ),
     doi=typer.Option(
-        # "10.3334/ORNLDAAC/1986",  # GEDI L4A DOI, v2
         "10.3334/ORNLDAAC/2056",  # GEDI L4A DOI, v2.1
         help="Digital Object Identifier of collection to subset (https://www.doi.org/)",
     ),
@@ -177,6 +176,28 @@ def main(
         CMRHost.maap,
         help="CMR hostname",
     ),
+    columns: str = typer.Option(
+        ",".join(
+            [
+                "agbd",
+                "agbd_se",
+                "l2_quality_flag",
+                "l4_quality_flag",
+                "lat_lowestmode",
+                "lon_lowestmode",
+                "sensitivity",
+                "sensitivity_a2",
+            ]
+        ),
+        help="Comma-separated list of columns to select",
+    ),
+    query: str = typer.Option(
+        "l2_quality_flag == 1"
+        " and l4_quality_flag == 1"
+        " and sensitivity > 0.95"
+        " and sensitivity_a2 > 0.95",
+        help="Boolean query expression to select rows",
+    ),
     limit: int = typer.Option(
         10_000,
         help="Maximum number of granules to subset",
@@ -224,6 +245,8 @@ def main(
         for subsets in subset_granules(
             maap,
             aoi_gdf,
+            [c.strip() for c in columns.split(",")],
+            query,
             output_dir,
             dest,
             (logging_level,),
diff --git a/gedi-subset/subset.sh b/gedi-subset/subset.sh
index 547e58a..431ed4a 100755
--- a/gedi-subset/subset.sh
+++ b/gedi-subset/subset.sh
@@ -1,6 +1,6 @@
 #!/usr/bin/env bash
 
-set -xeuo pipefail
+set -xuo pipefail
 
 basedir=$(dirname "$(readlink -f "$0")")
 
@@ -16,7 +16,19 @@ if test -d input; then
     # We are executing within a DPS job, so the AOI file was automatically
     # downloaded to the `input` directory.
     aoi=$(ls input/*)
-    ${subset_py} --verbose --aoi "${aoi}" --limit "${1:-10000}"
+    args=(--verbose --aoi "${aoi}") # array keeps values containing spaces intact
+    n_actual=${#}
+    n_expected=3
+
+    if [[ ${n_actual} -gt 0 && ${n_actual} -ne ${n_expected} ]]; then
+        echo "Expected ${n_expected} inputs, but got ${n_actual}:$(printf " '%b'" "$@")" >&2
+        exit 1
+    fi
+
+    [[ "${1:--}" != "-" ]] && args+=(--columns "${1}")
+    [[ "${2:--}" != "-" ]] && args+=(--query "${2}")
+    [[ "${3:--}" != "-" ]] && args+=(--limit "${3}")
+    ${subset_py} "${args[@]}"
 else
     # This was invoked directly, so simply pass all arguments through to the
     # Python script.