From 59b6ee516bbb22b8ef0d9a82d6412c8f542d0904 Mon Sep 17 00:00:00 2001 From: Jannis R Date: Wed, 22 Mar 2023 19:25:14 +0100 Subject: [PATCH] =?UTF-8?q?add=20date=5Ffilter=5F{min,max}()=20helper=20fu?= =?UTF-8?q?nctions=20=F0=9F=93=9D=E2=9C=85?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- lib/stop_times.js | 49 ++++++++++++++++++++++++++++++++++ readme.md | 16 +++++++++-- test/amtrak-gtfs-2021-10-06.sh | 6 +++++ 3 files changed, 69 insertions(+), 2 deletions(-) diff --git a/lib/stop_times.js b/lib/stop_times.js index 2f13564..e28a1f9 100644 --- a/lib/stop_times.js +++ b/lib/stop_times.js @@ -109,6 +109,55 @@ CREATE INDEX ON "${opt.schema}".stop_times (trip_id, stop_sequence_consec); CREATE INDEX ON "${opt.schema}".stop_times (arrival_time); CREATE INDEX ON "${opt.schema}".stop_times (departure_time); +CREATE OR REPLACE FUNCTION "${opt.schema}".largest_departure_time () +RETURNS interval AS $$ + SELECT departure_time + FROM "${opt.schema}".stop_times + WHERE EXISTS ( + SELECT * + FROM "${opt.schema}".trips + JOIN "${opt.schema}".service_days ON service_days.service_id = trips.service_id + WHERE trips.trip_id = stop_times.trip_id + ) + ORDER BY departure_time DESC + LIMIT 1; +$$ LANGUAGE SQL IMMUTABLE; +CREATE OR REPLACE FUNCTION "${opt.schema}".largest_arrival_time () +RETURNS interval AS $$ + SELECT arrival_time + FROM "${opt.schema}".stop_times + WHERE EXISTS ( + SELECT * + FROM "${opt.schema}".trips + JOIN "${opt.schema}".service_days ON service_days.service_id = trips.service_id + WHERE trips.trip_id = stop_times.trip_id + ) + ORDER BY arrival_time DESC + LIMIT 1; +$$ LANGUAGE SQL IMMUTABLE; +CREATE OR REPLACE FUNCTION "${opt.schema}".dates_filter_min ( + _timestamp TIMESTAMP WITH TIME ZONE +) +RETURNS date AS $$ + SELECT date_trunc( + 'day', + _timestamp + - GREATEST( + "${opt.schema}".largest_arrival_time(), + "${opt.schema}".largest_departure_time() + ) + -- we assume the DST <-> standard time shift is always <= 1h + - '1 hour 1 second'::interval + ); +$$ LANGUAGE SQL IMMUTABLE; +-- This function doesn't do much, we just provide it to match date_filter_min(). +CREATE OR REPLACE FUNCTION "${opt.schema}".dates_filter_max ( + _timestamp TIMESTAMP WITH TIME ZONE +) +RETURNS date AS $$ + SELECT date_trunc('day', _timestamp); +$$ LANGUAGE SQL IMMUTABLE; + CREATE OR REPLACE VIEW "${opt.schema}".arrivals_departures AS WITH stop_times_based AS NOT MATERIALIZED ( SELECT diff --git a/readme.md b/readme.md index 8e236c9..7548110 100644 --- a/readme.md +++ b/readme.md @@ -269,9 +269,9 @@ Let's consider two examples: - A `departure_time` of `26:59:00` with a trip running on `2021-03-01`: The time, applied to this specific date, "extends" into the following day, so it actually departs at `2021-03-02T02:59+01`. - A departure time of `03:01:00` with a trip running on `2021-03-28`: This is when the standard -> DST switch happens in the `Europe/Berlin` timezone. Because the dep. time refers to noon - 12h (*not* to midnight), it actually happens at `2021-03-28T03:01+02` which is *not* `3h1m` after `2021-03-28T00:00+01`. -`gtfs-via-postgres` always prioritizes correctness over speed. Because it follows the GTFS semantics, when filtering `arrivals_departures` by *absolute* departure date+time, it cannot filter `service_days` (which is `calendar` and `calendar_dates` combined), because **even a date *before* the date of the desired departure date+time range might still end up within the range, when combined with a `departure_time` of e.g. `27:30:00`**; Instead, it has to consider all `service_days` and apply the `departure_time` to all of them to check if they're within the range. +`gtfs-via-postgres` always prioritizes correctness over speed. Because it follows the GTFS semantics, when filtering `arrivals_departures` by *absolute* departure date+time, it cannot automatically filter `service_days` (which is `calendar` and `calendar_dates` combined), because **even a date *before* the date of the desired departure time frame might still end up *within*, when combined with a `departure_time` of e.g. `27:30:00`**; Instead, it has to consider all `service_days` and apply the `departure_time` to all of them to check if they're within the range. -However, values >48h are really rare. If you know (or want to assume) that your feed *does not* have `arrival_time`/`departure_time` values larger than a certain amount, you can filter on `date` when querying `arrivals_departures`; This allows PostgreSQL to reduce the number of joins and calendar calculations by *a lot*. +However, if you determine your feed's largest `arrival_time`/`departure_time`, you can filter on `date` when querying `arrivals_departures`; This allows PostgreSQL to reduce the number of joins and calendar calculations by orders of magnitude, speeding up your queries significantly. For example, when querying all *absolute* departures at `de:11000:900120003` (*S Ostkreuz Bhf (Berlin)*) between `2022-03-23T12:30+01` and `2022-03-23T12:35+01` within the [2022-02-25 *VBB* feed](https://vbb-gtfs.jannisr.de/2022-02-25/), filtering by `date` speeds it up nicely (Apple M1, PostgreSQL 14.2): @@ -285,6 +285,18 @@ For example, when querying all *absolute* departures at `de:11000:900120003` (*S *none* | `2022-03-13` >= `date` < `2022-04-08` | 34s | ~35m *none* | `2022-03-22` > `date` < `2022-03-24` | 2.4s | ~1523k +`gtfs-via-postgres` provides 2 helper functions `dates_filter_min(t_min)` & `dates_filter_max(t_max)` for this. When filtering by `t_departure` (absolute departure date+time), `t_min` is the lower `t_departure` bound, whereas `t_max` is the upper bound. The VBB example above can be queried like this: + +```sql +SELECT * +FROM arrivals_departures +-- filter by absolute departure date+time +WHERE t_departure >= '2022-03-23T12:30+01' AND t_departure <= '2022-03-23T12:35+01' +-- allow "cutoffs" by filtering by date +AND "date" >= dates_filter_min('2022-03-23T12:30+01') -- evaluates to 2023-03-22 +AND "date" <= dates_filter_max('2022-03-23T12:35+01') -- evaluates to 2023-03-23 +``` + ## Performance diff --git a/test/amtrak-gtfs-2021-10-06.sh b/test/amtrak-gtfs-2021-10-06.sh index 29a797c..51603ba 100755 --- a/test/amtrak-gtfs-2021-10-06.sh +++ b/test/amtrak-gtfs-2021-10-06.sh @@ -46,3 +46,9 @@ if [[ "$version" != "4" ]]; then echo "invalid gtfs_via_postgres_version(): $version" 1>&2 exit 1 fi + +fMin=$(psql --csv -t -c "SELECT amtrak.dates_filter_min('2021-11-27T13:45:00-06')" | tail -n 1) +if [[ "$fMin" != "2021-11-24" ]]; then + echo "invalid dates_filter_min(…): $fMin" 1>&2 + exit 1 +fi