add --stats-by-route-id-and-date option 📝✅⚡️

Supports analyzing a feed by route and/or date and/or day of the week. part of #33
public-transport · Mar 29, 2023 · b846e51 · b846e51
1 parent 59b6ee5
commit b846e51
Show file tree

Hide file tree

Showing 10 changed files with 173 additions and 0 deletions.
diff --git a/benchmark/index.sql b/benchmark/index.sql
@@ -75,6 +75,7 @@ LANGUAGE plpgsql;
 \i connections_by_stop.sql
 \i connections_by_non_existent_stop.sql
 \i connections_by_time.sql
+\i stats_by_route_id_and_date.sql
 
 SELECT * FROM _benchmark;
 

diff --git a/benchmark/stats_by_route_id_and_date.sql b/benchmark/stats_by_route_id_and_date.sql
@@ -0,0 +1,8 @@
+SELECT * from bench(
+'SELECT *
+FROM stats_by_route_id_and_date
+WHERE route_id = ''17452_900'' -- M4
+AND date >= ''2022-08-08'' AND date <= ''2022-08-14''
+AND is_effective = true',
+10
+);
diff --git a/cli.js b/cli.js
@@ -41,6 +41,9 @@ const {
 		'stops-location-index': {
 			type: 'boolean',
 		},
+		'stats-by-route-id-and-date': {
+			type: 'string',
+		},
 		'schema': {
 			type: 'string',
 		},
@@ -75,6 +78,12 @@ Options:
                                   Default if levels.txt has not been provided.
     --stops-location-index        Create a spatial index on stops.stop_loc for efficient
                                     queries by geolocation.
+    --stats-by-route-id-and-date  Wether to generate a stats_by_route_id_and_date view
+                                    letting you analyze all data per routes and/or date:
+                                    - none: Don't generate a view.
+                                    - view: Fast generation, slow access.
+                                    - materialized-view: Slow generation, fast access.
+                                    Default: none
     --schema                      The schema to use for the database. Default: public
     --postgraphile                Tweak generated SQL for PostGraphile usage.
                                     https://www.graphile.org/postgraphile/
@@ -113,6 +122,7 @@ const opt = {
 	tripsWithoutShapeId: !!flags['trips-without-shape-id'],
 	routesWithoutAgencyId: !!flags['routes-without-agency-id'],
 	stopsLocationIndex: !!flags['stops-location-index'],
+	statsByRouteIdAndDate: flags['stats-by-route-id-and-date'] || 'none',
 	schema: flags['schema'] || 'public',
 	postgraphile: !!flags.postgraphile,
 	importMetadata: !!flags['import-metadata'],

diff --git a/docs/analysis/feed-by-route-and-date.md b/docs/analysis/feed-by-route-and-date.md
@@ -0,0 +1,51 @@
+# analysing a GTFS dataset by route ID and/or date
+
+Are you trying to answer a question like those below?
+
+- Are there certain dates or days of the week that have sigificantly less arrivals/departures (hereinafter "stop time events")? – This *may* indicate errors in the data, e.g. a faulty `calendar.csv` or `calendar_dates.csv` file.
+- Has the number of stop time events decreased, compared to the last dataset version?
+- Do specific routes stop running during certain time periods?
+
+`gtfs-via-postgres` optionally provides a **(materialized) view `stats_by_route_id_and_date` to help with such SQL queries. Use the `--stats-by-route-id-and-date` flag to enable it** in the generated SQL:
+
+- If you run `gtfs-to-sql` with `--stats-by-route-id-and-date=view`, `stats_by_route_id_and_date` will be a "regular" non-materialized view. Use this option if you want to import the GTFS data quickly, and if you only query `stats_by_route_id_and_date` rarely or in time-uncritical scenarios.
+- If you pass `--stats-by-route-id-and-date=materialized-view`, the `stats_by_route_id_and_date` view will [be materialized](https://www.postgresql.org/docs/14/rules-materializedviews.html). Use this option if you need fast queries, and if you can tolerate significantly longer import times (3m for the 64mb 2023-03-05 SNCB/NMBS GTFS feed, 1h15m for the 540mb 2023-02-27 VBB GTFS feed).
+
+`stats_by_route_id_and_date` has the following columns:
+
+- `date`
+- `dow` – day of the week, following the [PostgreSQL notation `0` (Sunday) to `6` (Saturday)](https://www.postgresql.org/docs/14/functions-datetime.html#FUNCTIONS-DATETIME-EXTRACT)
+- `nr_of_trips` – nr of trips starting on that date
+- `nr_of_arrs_deps` – nr of trips taking place on that date
+- `is_effective` – wether `nr_of_trips` & `nr_of_arrs_deps` are calculated based on the *effective* date (i.e. the date that the stop time event actually happens on) or *schedule* date (i.e. the date which their `stop_time` rows refer to)
+
+So
+
+- if you want to take a customer-facing perspective on the data (as in "I don't care which if trips are scheduled before midnight, I want to know if they run today"), filter for `is_effective = True` rows;
+- If you're interested in the operational/planning perspective (e.g. if you're looking for data errors), filter for `is_effective = False` rows.
+
+## example: nr of effective stop time events of a single route over a week
+
+```sql
+-- using VBB's 2023-02-27 GTFS data
+SELECT
+	*,
+	EXTRACT(dow FROM "date") as dow
+FROM stats_by_route_id_and_date stats
+WHERE is_effective = True
+AND route_id = '17438_900', -- M1 tram line
+AND "date" >= '2023-03-19' -- Sunday, dow = 0
+AND "date" <= '2023-03-25' -- Saturday, dow = 6
+ORDER BY route_id, "date", is_effective DESC
+```
+
+```csv
+route_id,date,dow,nr_of_trips,nr_of_arrs_deps,is_effective,dow
+17438_900,2023-03-19,0,258,5870,t,0
+17438_900,2023-03-20,1,345,7831,t,1
+17438_900,2023-03-21,2,345,7831,t,2
+17438_900,2023-03-22,3,345,7831,t,3
+17438_900,2023-03-23,4,345,7831,t,4
+17438_900,2023-03-24,5,345,7831,t,5
+17438_900,2023-03-25,6,326,9001,t,6
+```
diff --git a/index.js b/index.js
@@ -19,6 +19,7 @@ const convertGtfsToSql = async function* (files, opt = {}) {
 		routesWithoutAgencyId: false,
 		stopsWithoutLevelId: !files.some(f => f.name === 'levels'),
 		stopsLocationIndex: false,
+		statsByRouteIdAndDate: 'none',
 		schema: 'public',
 		postgraphile: false,
 		importMetadata: false,
@@ -31,6 +32,7 @@ const convertGtfsToSql = async function* (files, opt = {}) {
 		requireDependencies,
 		ignoreUnsupportedFiles,
 		importMetadata,
+		statsByRouteIdAndDate,
 	} = opt
 
 	if (ignoreUnsupportedFiles) {
@@ -81,6 +83,12 @@ const convertGtfsToSql = async function* (files, opt = {}) {
 				dep: [],
 			},
 		} : {}),
+
+		...(statsByRouteIdAndDate !== 'none' ? {
+			'stats_by_route_id_and_date': {
+				dep: ['stop_times'],
+			},
+		} : {}),
 	}
 
 	for (const file of files) {

diff --git a/lib/index.js b/lib/index.js
@@ -20,4 +20,5 @@ module.exports = {
 	levels: require('./levels'),
 	translations: require('./translations'),
 	import_metadata: require('./import_metadata'),
+	stats_by_route_id_and_date: require('./stats_by_route_id_and_date'),
 }
diff --git a/lib/stats_by_route_id_and_date.js b/lib/stats_by_route_id_and_date.js
@@ -0,0 +1,74 @@
+'use strict'
+
+const afterAll = (opt) => {
+	let materialized = false
+	if (opt.statsByRouteIdAndDate === 'materialized-view') {
+		materialized = true
+	} else if (opt.statsByRouteIdAndDate !== 'view') {
+		throw new Error('invalid opt.statsByRouteIdAndDate, must be one of these: none, view, materialized-view.')
+	}
+	const createViewCmd = materialized
+		? `CREATE MATERIALIZED VIEW`
+		: `CREATE OR REPLACE VIEW`
+
+	return `\
+${createViewCmd} "${opt.schema}".stats_by_route_id_and_date AS
+WITH
+	arrs_deps_with_svc_date AS NOT MATERIALIZED (
+		SELECT
+			route_id, stop_sequence_consec,
+			"date"::date AS svc_date,
+			EXTRACT(DOW FROM "date") AS svc_dow
+		FROM "${opt.schema}".arrivals_departures
+	),
+	by_svc_date AS NOT MATERIALIZED (
+		SELECT DISTINCT ON (route_id, svc_date)
+			route_id,
+			svc_date AS "date",
+			svc_dow AS dow,
+			count(*) FILTER (WHERE stop_sequence_consec = 0) OVER (PARTITION BY route_id, svc_date) AS nr_of_trips,
+			count(*) OVER (PARTITION BY route_id, svc_date) AS nr_of_arrs_deps
+		FROM arrs_deps_with_svc_date
+	),
+	arrs_deps_with_effective_date AS NOT MATERIALIZED (
+		SELECT
+			route_id, stop_sequence_consec,
+			coalesce(t_departure, t_arrival)::date AS effective_date,
+			EXTRACT(DOW FROM coalesce(t_departure, t_arrival)) AS effective_dow
+		FROM "${opt.schema}".arrivals_departures
+	),
+	by_effective_date AS NOT MATERIALIZED (
+		SELECT DISTINCT ON (route_id, effective_date)
+			route_id,
+			effective_date AS "date",
+			effective_dow AS dow,
+			count(*) FILTER (WHERE stop_sequence_consec = 0) OVER (PARTITION BY route_id, effective_date) AS nr_of_trips,
+			count(*) OVER (PARTITION BY route_id, effective_date) AS nr_of_arrs_deps
+		FROM arrs_deps_with_effective_date
+	)
+SELECT
+	*,
+	True AS is_effective
+FROM by_effective_date
+UNION
+SELECT
+	*,
+	False AS is_effective
+FROM by_svc_date;
+
+${materialized ? `\
+CREATE INDEX ON "${opt.schema}".stats_by_route_id_and_date (route_id);
+CREATE INDEX ON "${opt.schema}".stats_by_route_id_and_date ("date");
+CREATE INDEX ON "${opt.schema}".stats_by_route_id_and_date (route_id, "date", is_effective);
+CREATE INDEX ON "${opt.schema}".stats_by_route_id_and_date (route_id, dow, is_effective);
+` : ''}
+
+${opt.postgraphile ? `\
+COMMENT ON${materialized ? ' MATERIALIZED' : ''} VIEW "${opt.schema}".stats_by_route_id_and_date IS E'@name routeStats\\n@primaryKey route_id,date,is_effective\\n@foreignKey (route_id) references routes|@fieldName route|@foreignFieldName statsByDate';
+` : ''}
+`
+}
+
+module.exports = {
+	afterAll,
+}
diff --git a/package.json b/package.json
@@ -12,6 +12,7 @@
 		"index.js",
 		"scripts",
 		"lib",
+		"docs",
 		"example.sh",
 		"LICENSE",
 		"LICENSE-PROSPERITY.md",

diff --git a/readme.md b/readme.md
@@ -257,6 +257,10 @@ env NODE_ENV=development npm exec -- serve-gtfs-via-graphql
 
 **As an example for the GraphQL API, check out the [test query](test/sample-gtfs-feed-postgraphile-test.graphql)** or open the [GraphiQL UI](https://github.com/graphql/graphiql) served at [`localhost:3000/graphiql`](http://localhost:3000/graphiql).
 
+### more guides
+
+The [`docs` directory](docs) contains more instructions on how to use `gtfs-via-postgres`.
+
 
 ## Correctness vs. Speed regarding GTFS Time Values
 
@@ -323,6 +327,7 @@ The following benchmarks were run with the [2022-07-01 VBB GTFS dataset](https:/
 | <pre>SELECT count(*)<br>FROM connections<br>WHERE from_stop_id = 'de:11000:900100001::4' -- S+U Friedrichstr. (Berlin)</pre> | 84.17 | 83.71 | 83.98 | 84.05 | 84.18 | 84.82 | 85.44 | 88.441 | 100 |
 | <pre>SELECT count(*)<br>FROM connections<br>WHERE from_stop_id = 'definitely-non-existent'</pre> | 15.53 | 15.404 | 15.5 | 15.52 | 15.54 | 15.6 | 15.9 | 15.915 | 100 |
 | <pre>SELECT *<br>FROM connections<br>WHERE t_departure >= '2022-08-09T07:10+02' AND t_departure <= '2022-08-09T07:30+02'<br>AND date > '2022-08-08' AND date <= '2022-08-09'<br>ORDER BY t_departure<br>LIMIT 100</pre> | 8414.27 | 7885.369 | 7994.99 | 8364.33 | 8735.64 | 9147.52 | 9180.64 | 9188.92 | 7 |
+| <pre>SELECT *<br>FROM stats_by_route_id_and_date<br>WHERE route_id = '17452_900' -- M4<br>AND date >= '2022-08-08' AND date <= '2022-08-14'<br>AND is_effective = true</pre> | 2900.64 | 2888.196 | 2891.65 | 2893.07 | 2905.69 | 2927.44 | 2936.58 | 2938.86 | 10 |
 
 
 ## Related Projects

diff --git a/test/amtrak-gtfs-2021-10-06.sh b/test/amtrak-gtfs-2021-10-06.sh
@@ -15,6 +15,7 @@ export PGDATABASE='amtrak_2021_10_06'
 
 ../cli.js -d --trips-without-shape-id --schema amtrak \
 	--import-metadata \
+	--stats-by-route-id-and-date=view \
 	-- amtrak-gtfs-2021-10-06/*.txt | psql -b
 
 query=$(cat << EOF
@@ -52,3 +53,16 @@ if [[ "$fMin" != "2021-11-24" ]]; then
 	echo "invalid dates_filter_min(…): $fMin" 1>&2
 	exit 1
 fi
+
+acelaStatQuery=$(cat << EOF
+SELECT nr_of_trips, nr_of_arrs_deps
+FROM amtrak.stats_by_route_id_and_date
+WHERE route_id = '40751' -- Acela
+AND date = '2021-11-26'
+AND is_effective = True
+EOF)
+acelaStat=$(psql --csv -t -c "$acelaStatQuery" | tail -n 1)
+if [[ "$acelaStat" != "16,190" ]]; then
+	echo "invalid stats for route 40751 (Acela) on 2021-11-26: $acelaStat" 1>&2
+	exit 1
+fi