From b846e5172373ca3356b1577109cb76a1945af6c1 Mon Sep 17 00:00:00 2001 From: Jannis R Date: Sat, 4 Feb 2023 18:25:21 +0100 Subject: [PATCH] =?UTF-8?q?add=20--stats-by-route-id-and-date=20option=20?= =?UTF-8?q?=F0=9F=93=9D=E2=9C=85=E2=9A=A1=EF=B8=8F?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Supports analyzing a feed by route and/or date and/or day of the week. part of #33 --- benchmark/index.sql | 1 + benchmark/stats_by_route_id_and_date.sql | 8 +++ cli.js | 10 ++++ docs/analysis/feed-by-route-and-date.md | 51 ++++++++++++++++ index.js | 8 +++ lib/index.js | 1 + lib/stats_by_route_id_and_date.js | 74 ++++++++++++++++++++++++ package.json | 1 + readme.md | 5 ++ test/amtrak-gtfs-2021-10-06.sh | 14 +++++ 10 files changed, 173 insertions(+) create mode 100644 benchmark/stats_by_route_id_and_date.sql create mode 100644 docs/analysis/feed-by-route-and-date.md create mode 100644 lib/stats_by_route_id_and_date.js diff --git a/benchmark/index.sql b/benchmark/index.sql index 589011c..08869d0 100644 --- a/benchmark/index.sql +++ b/benchmark/index.sql @@ -75,6 +75,7 @@ LANGUAGE plpgsql; \i connections_by_stop.sql \i connections_by_non_existent_stop.sql \i connections_by_time.sql +\i stats_by_route_id_and_date.sql SELECT * FROM _benchmark; diff --git a/benchmark/stats_by_route_id_and_date.sql b/benchmark/stats_by_route_id_and_date.sql new file mode 100644 index 0000000..7c36baa --- /dev/null +++ b/benchmark/stats_by_route_id_and_date.sql @@ -0,0 +1,8 @@ +SELECT * from bench( +'SELECT * +FROM stats_by_route_id_and_date +WHERE route_id = ''17452_900'' -- M4 +AND date >= ''2022-08-08'' AND date <= ''2022-08-14'' +AND is_effective = true', +10 +); diff --git a/cli.js b/cli.js index 5ea877c..2da6ede 100755 --- a/cli.js +++ b/cli.js @@ -41,6 +41,9 @@ const { 'stops-location-index': { type: 'boolean', }, + 'stats-by-route-id-and-date': { + type: 'string', + }, 'schema': { type: 'string', }, @@ -75,6 +78,12 @@ Options: Default if levels.txt has not been provided. --stops-location-index Create a spatial index on stops.stop_loc for efficient queries by geolocation. + --stats-by-route-id-and-date Wether to generate a stats_by_route_id_and_date view + letting you analyze all data per routes and/or date: + - none: Don't generate a view. + - view: Fast generation, slow access. + - materialized-view: Slow generation, fast access. + Default: none --schema The schema to use for the database. Default: public --postgraphile Tweak generated SQL for PostGraphile usage. https://www.graphile.org/postgraphile/ @@ -113,6 +122,7 @@ const opt = { tripsWithoutShapeId: !!flags['trips-without-shape-id'], routesWithoutAgencyId: !!flags['routes-without-agency-id'], stopsLocationIndex: !!flags['stops-location-index'], + statsByRouteIdAndDate: flags['stats-by-route-id-and-date'] || 'none', schema: flags['schema'] || 'public', postgraphile: !!flags.postgraphile, importMetadata: !!flags['import-metadata'], diff --git a/docs/analysis/feed-by-route-and-date.md b/docs/analysis/feed-by-route-and-date.md new file mode 100644 index 0000000..76ce3ce --- /dev/null +++ b/docs/analysis/feed-by-route-and-date.md @@ -0,0 +1,51 @@ +# analysing a GTFS dataset by route ID and/or date + +Are you trying to answer a question like those below? + +- Are there certain dates or days of the week that have sigificantly less arrivals/departures (hereinafter "stop time events")? – This *may* indicate errors in the data, e.g. a faulty `calendar.csv` or `calendar_dates.csv` file. +- Has the number of stop time events decreased, compared to the last dataset version? +- Do specific routes stop running during certain time periods? + +`gtfs-via-postgres` optionally provides a **(materialized) view `stats_by_route_id_and_date` to help with such SQL queries. Use the `--stats-by-route-id-and-date` flag to enable it** in the generated SQL: + +- If you run `gtfs-to-sql` with `--stats-by-route-id-and-date=view`, `stats_by_route_id_and_date` will be a "regular" non-materialized view. Use this option if you want to import the GTFS data quickly, and if you only query `stats_by_route_id_and_date` rarely or in time-uncritical scenarios. +- If you pass `--stats-by-route-id-and-date=materialized-view`, the `stats_by_route_id_and_date` view will [be materialized](https://www.postgresql.org/docs/14/rules-materializedviews.html). Use this option if you need fast queries, and if you can tolerate significantly longer import times (3m for the 64mb 2023-03-05 SNCB/NMBS GTFS feed, 1h15m for the 540mb 2023-02-27 VBB GTFS feed). + +`stats_by_route_id_and_date` has the following columns: + +- `date` +- `dow` – day of the week, following the [PostgreSQL notation `0` (Sunday) to `6` (Saturday)](https://www.postgresql.org/docs/14/functions-datetime.html#FUNCTIONS-DATETIME-EXTRACT) +- `nr_of_trips` – nr of trips starting on that date +- `nr_of_arrs_deps` – nr of trips taking place on that date +- `is_effective` – wether `nr_of_trips` & `nr_of_arrs_deps` are calculated based on the *effective* date (i.e. the date that the stop time event actually happens on) or *schedule* date (i.e. the date which their `stop_time` rows refer to) + +So + +- if you want to take a customer-facing perspective on the data (as in "I don't care which if trips are scheduled before midnight, I want to know if they run today"), filter for `is_effective = True` rows; +- If you're interested in the operational/planning perspective (e.g. if you're looking for data errors), filter for `is_effective = False` rows. + +## example: nr of effective stop time events of a single route over a week + +```sql +-- using VBB's 2023-02-27 GTFS data +SELECT + *, + EXTRACT(dow FROM "date") as dow +FROM stats_by_route_id_and_date stats +WHERE is_effective = True +AND route_id = '17438_900', -- M1 tram line +AND "date" >= '2023-03-19' -- Sunday, dow = 0 +AND "date" <= '2023-03-25' -- Saturday, dow = 6 +ORDER BY route_id, "date", is_effective DESC +``` + +```csv +route_id,date,dow,nr_of_trips,nr_of_arrs_deps,is_effective,dow +17438_900,2023-03-19,0,258,5870,t,0 +17438_900,2023-03-20,1,345,7831,t,1 +17438_900,2023-03-21,2,345,7831,t,2 +17438_900,2023-03-22,3,345,7831,t,3 +17438_900,2023-03-23,4,345,7831,t,4 +17438_900,2023-03-24,5,345,7831,t,5 +17438_900,2023-03-25,6,326,9001,t,6 +``` diff --git a/index.js b/index.js index f46fcab..04ce6f7 100644 --- a/index.js +++ b/index.js @@ -19,6 +19,7 @@ const convertGtfsToSql = async function* (files, opt = {}) { routesWithoutAgencyId: false, stopsWithoutLevelId: !files.some(f => f.name === 'levels'), stopsLocationIndex: false, + statsByRouteIdAndDate: 'none', schema: 'public', postgraphile: false, importMetadata: false, @@ -31,6 +32,7 @@ const convertGtfsToSql = async function* (files, opt = {}) { requireDependencies, ignoreUnsupportedFiles, importMetadata, + statsByRouteIdAndDate, } = opt if (ignoreUnsupportedFiles) { @@ -81,6 +83,12 @@ const convertGtfsToSql = async function* (files, opt = {}) { dep: [], }, } : {}), + + ...(statsByRouteIdAndDate !== 'none' ? { + 'stats_by_route_id_and_date': { + dep: ['stop_times'], + }, + } : {}), } for (const file of files) { diff --git a/lib/index.js b/lib/index.js index 524ea70..e744855 100644 --- a/lib/index.js +++ b/lib/index.js @@ -20,4 +20,5 @@ module.exports = { levels: require('./levels'), translations: require('./translations'), import_metadata: require('./import_metadata'), + stats_by_route_id_and_date: require('./stats_by_route_id_and_date'), } diff --git a/lib/stats_by_route_id_and_date.js b/lib/stats_by_route_id_and_date.js new file mode 100644 index 0000000..68b5a77 --- /dev/null +++ b/lib/stats_by_route_id_and_date.js @@ -0,0 +1,74 @@ +'use strict' + +const afterAll = (opt) => { + let materialized = false + if (opt.statsByRouteIdAndDate === 'materialized-view') { + materialized = true + } else if (opt.statsByRouteIdAndDate !== 'view') { + throw new Error('invalid opt.statsByRouteIdAndDate, must be one of these: none, view, materialized-view.') + } + const createViewCmd = materialized + ? `CREATE MATERIALIZED VIEW` + : `CREATE OR REPLACE VIEW` + + return `\ +${createViewCmd} "${opt.schema}".stats_by_route_id_and_date AS +WITH + arrs_deps_with_svc_date AS NOT MATERIALIZED ( + SELECT + route_id, stop_sequence_consec, + "date"::date AS svc_date, + EXTRACT(DOW FROM "date") AS svc_dow + FROM "${opt.schema}".arrivals_departures + ), + by_svc_date AS NOT MATERIALIZED ( + SELECT DISTINCT ON (route_id, svc_date) + route_id, + svc_date AS "date", + svc_dow AS dow, + count(*) FILTER (WHERE stop_sequence_consec = 0) OVER (PARTITION BY route_id, svc_date) AS nr_of_trips, + count(*) OVER (PARTITION BY route_id, svc_date) AS nr_of_arrs_deps + FROM arrs_deps_with_svc_date + ), + arrs_deps_with_effective_date AS NOT MATERIALIZED ( + SELECT + route_id, stop_sequence_consec, + coalesce(t_departure, t_arrival)::date AS effective_date, + EXTRACT(DOW FROM coalesce(t_departure, t_arrival)) AS effective_dow + FROM "${opt.schema}".arrivals_departures + ), + by_effective_date AS NOT MATERIALIZED ( + SELECT DISTINCT ON (route_id, effective_date) + route_id, + effective_date AS "date", + effective_dow AS dow, + count(*) FILTER (WHERE stop_sequence_consec = 0) OVER (PARTITION BY route_id, effective_date) AS nr_of_trips, + count(*) OVER (PARTITION BY route_id, effective_date) AS nr_of_arrs_deps + FROM arrs_deps_with_effective_date + ) +SELECT + *, + True AS is_effective +FROM by_effective_date +UNION +SELECT + *, + False AS is_effective +FROM by_svc_date; + +${materialized ? `\ +CREATE INDEX ON "${opt.schema}".stats_by_route_id_and_date (route_id); +CREATE INDEX ON "${opt.schema}".stats_by_route_id_and_date ("date"); +CREATE INDEX ON "${opt.schema}".stats_by_route_id_and_date (route_id, "date", is_effective); +CREATE INDEX ON "${opt.schema}".stats_by_route_id_and_date (route_id, dow, is_effective); +` : ''} + +${opt.postgraphile ? `\ +COMMENT ON${materialized ? ' MATERIALIZED' : ''} VIEW "${opt.schema}".stats_by_route_id_and_date IS E'@name routeStats\\n@primaryKey route_id,date,is_effective\\n@foreignKey (route_id) references routes|@fieldName route|@foreignFieldName statsByDate'; +` : ''} +` +} + +module.exports = { + afterAll, +} diff --git a/package.json b/package.json index 9bb16bf..17a3304 100644 --- a/package.json +++ b/package.json @@ -12,6 +12,7 @@ "index.js", "scripts", "lib", + "docs", "example.sh", "LICENSE", "LICENSE-PROSPERITY.md", diff --git a/readme.md b/readme.md index 7548110..4163d42 100644 --- a/readme.md +++ b/readme.md @@ -257,6 +257,10 @@ env NODE_ENV=development npm exec -- serve-gtfs-via-graphql **As an example for the GraphQL API, check out the [test query](test/sample-gtfs-feed-postgraphile-test.graphql)** or open the [GraphiQL UI](https://github.com/graphql/graphiql) served at [`localhost:3000/graphiql`](http://localhost:3000/graphiql). +### more guides + +The [`docs` directory](docs) contains more instructions on how to use `gtfs-via-postgres`. + ## Correctness vs. Speed regarding GTFS Time Values @@ -323,6 +327,7 @@ The following benchmarks were run with the [2022-07-01 VBB GTFS dataset](https:/ |
SELECT count(*)
FROM connections
WHERE from_stop_id = 'de:11000:900100001::4' -- S+U Friedrichstr. (Berlin)
| 84.17 | 83.71 | 83.98 | 84.05 | 84.18 | 84.82 | 85.44 | 88.441 | 100 | |
SELECT count(*)
FROM connections
WHERE from_stop_id = 'definitely-non-existent'
| 15.53 | 15.404 | 15.5 | 15.52 | 15.54 | 15.6 | 15.9 | 15.915 | 100 | |
SELECT *
FROM connections
WHERE t_departure >= '2022-08-09T07:10+02' AND t_departure <= '2022-08-09T07:30+02'
AND date > '2022-08-08' AND date <= '2022-08-09'
ORDER BY t_departure
LIMIT 100
| 8414.27 | 7885.369 | 7994.99 | 8364.33 | 8735.64 | 9147.52 | 9180.64 | 9188.92 | 7 | +|
SELECT *
FROM stats_by_route_id_and_date
WHERE route_id = '17452_900' -- M4
AND date >= '2022-08-08' AND date <= '2022-08-14'
AND is_effective = true
| 2900.64 | 2888.196 | 2891.65 | 2893.07 | 2905.69 | 2927.44 | 2936.58 | 2938.86 | 10 | ## Related Projects diff --git a/test/amtrak-gtfs-2021-10-06.sh b/test/amtrak-gtfs-2021-10-06.sh index 51603ba..88595c9 100755 --- a/test/amtrak-gtfs-2021-10-06.sh +++ b/test/amtrak-gtfs-2021-10-06.sh @@ -15,6 +15,7 @@ export PGDATABASE='amtrak_2021_10_06' ../cli.js -d --trips-without-shape-id --schema amtrak \ --import-metadata \ + --stats-by-route-id-and-date=view \ -- amtrak-gtfs-2021-10-06/*.txt | psql -b query=$(cat << EOF @@ -52,3 +53,16 @@ if [[ "$fMin" != "2021-11-24" ]]; then echo "invalid dates_filter_min(…): $fMin" 1>&2 exit 1 fi + +acelaStatQuery=$(cat << EOF +SELECT nr_of_trips, nr_of_arrs_deps +FROM amtrak.stats_by_route_id_and_date +WHERE route_id = '40751' -- Acela +AND date = '2021-11-26' +AND is_effective = True +EOF) +acelaStat=$(psql --csv -t -c "$acelaStatQuery" | tail -n 1) +if [[ "$acelaStat" != "16,190" ]]; then + echo "invalid stats for route 40751 (Acela) on 2021-11-26: $acelaStat" 1>&2 + exit 1 +fi