From e0c48236be81d8617938361fb72029ac8c29a776 Mon Sep 17 00:00:00 2001 From: Jannis R Date: Thu, 29 Aug 2024 17:18:56 +0200 Subject: [PATCH] =?UTF-8?q?matching:=20handle=20GTFS=20Schedule=20stops=20?= =?UTF-8?q?without=20parent=20station=20=E2=9C=85=F0=9F=93=9D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/caveats.md | 30 ++++++++++++++++++- lib/gtfs-stop-by-aus-haltid.js | 53 ++++++++++++++++++++++++++------- test/gtfs-stop-by-aus-haltid.js | 22 ++++++++++++++ 3 files changed, 94 insertions(+), 11 deletions(-) diff --git a/docs/caveats.md b/docs/caveats.md index 036d40e..ee4adea 100644 --- a/docs/caveats.md +++ b/docs/caveats.md @@ -1,3 +1,31 @@ ## stop IDs -While matching `AUS` `IstFahrt`s to the GTFS Schedule data, the services assumes that the `HaltID`s (without the data-provider-specific prefixes) uniquely match a GTFS stop/station. As soon as there are >1 matching GTFS stops/stations, matching of the `AusFahrt` gets skipped. +While matching `AUS` `IstFahrt`s to the GTFS Schedule data, the service assumes that the `IstHalt.HaltID`s (without the data-provider-specific prefixes) match a GTFS stop/station unambiguously, with some notable exceptions relaxing this requirement: + +- The service assumes that every entry with a 3-section `$country:$region:$id`-formatted `stop_id` is a station. +- As long as there's only one station matching the `AUS` `HaltID`, regardless of the number of other stops/platforms matching too, it is considered an unambiguous match. +- Alternatively, as long as there are only stops/platforms (no station) "roughly" matching the `AUS` `HaltID`, the lexicographically first of them is considered an unambiguous match. +- The `parent_station`-based topology delineated by GTFS Schedule is currently ignored.
+ +Consider the following fictional GTFS Schedule `stops.txt`: + +```csv +stop_id,stop_name,parent_station +de:12063:900210771,"Rathenow, Bahnhof", +de:12063:900210771::2,"Rathenow, Bahnhof", +de:12063:900210771::1,"Rathenow, Bahnhof", +de:12063:900210771:1:50,"Rathenow, Bahnhof",de:12063:900210771 +de:12063:900210771:1:51,"Rathenow, Bahnhof",de:12063:900210771 +de:12063:900210771:2:52,"Rathenow, Bahnhof",de:12063:900210771 +de:12063:900210778::1,"Rathenow, Clara-Zetkin-Str.", +de:12063:900210779::1,"Rathenow, Curlandstr.", +de:12063:900210779::2,"Rathenow, Curlandstr.", +``` + +- The `AUS` `HaltID` of `ODEG_900210771` matches `de:12063:900210771`. +- The `AUS` `HaltID` of `ODEG_900210778` matches `de:12063:900210778::1`. +- The `AUS` `HaltID` of `ODEG_900210779` matches `de:12063:900210779::1`, because `de:12063:900210779::2` is also just a stop/platform. +- The `AUS` `HaltID` of `ODEG_90021077` doesn't match. + +> [!IMPORTANT] +> The generated GTFS Realtime `StopTimeUpdate`s will be wrong in some cases, because their `stop_id` (e.g. `de:12063:900210779::1`) won't match the stop that the GTFS Schedule `trips.txt` specifies (e.g. `de:12063:900210779::2`)! diff --git a/lib/gtfs-stop-by-aus-haltid.js b/lib/gtfs-stop-by-aus-haltid.js index b344aee..8edea8d 100644 --- a/lib/gtfs-stop-by-aus-haltid.js +++ b/lib/gtfs-stop-by-aus-haltid.js @@ -1,6 +1,7 @@ import {strictEqual} from 'node:assert/strict' import QuickLRU from 'quick-lru' import {connectToPostgres} from './db.js' +import pick from 'lodash/pick.js' const stripDataProviderPrefixFromAusHaltID = (ausHaltId) => { // remove data provider prefix, e.g. 
@@ -50,10 +51,25 @@ SELECT stop_id, stop_name, st_x(stop_loc::geometry) AS stop_lon, - st_y(stop_loc::geometry) AS stop_lat -FROM stops -WHERE stop_id LIKE $1 -ORDER BY stop_id + st_y(stop_loc::geometry) AS stop_lat, + specificity, + exact_match +FROM ( + SELECT + *, + 1 AS specificity, + True AS exact_match + FROM stops + WHERE stop_id LIKE $1 + UNION ALL + SELECT + *, + 2 AS specificity, + False AS exact_match + FROM stops + WHERE stop_id LIKE $2 +) t +ORDER BY specificity ASC, stop_id ASC LIMIT 2 `, values: [ @@ -61,7 +77,10 @@ LIMIT 2 // By requiring stop IDs to end with the (stripped) HaltID (e.g. `de:12063:900210771` "Rathenow, Bahnhof"), effectively we only obtain stations (stops witout parent). // [0] https://en.wikipedia.org/wiki/Identification_of_Fixed_Objects_in_Public_Transport // [1] https://www.delfi.de/de/strategie-technik/architektur/ - `%:${escapeForLikeOp(strippedHaltId)}`, + // stations with the DHID format `$country:$region:$station_id`: + `de:%:${strippedHaltId}`, + // stops/platforms with the DHID format `$country:$region:$station_id:$stop_platform_id`: + `de:%:${strippedHaltId}:%`, ], }) @@ -69,19 +88,32 @@ LIMIT 2 logger.warn({ ausHaltId, strippedHaltId, - }, 'no GTFS stop found for an AUS HaltID') + }, 'no GTFS stop/station found for an AUS HaltID') return null } - if (stops.length > 1) { + // Effectively, we allow either + // - 1 exact match; or + // - >=1 non-exact matches, as long as there's no exact match. + if (stops.length > 1 && stops[0].exact_match && stops[1].exact_match) { logger.warn({ ausHaltId, strippedHaltId, gtfsStops: stops, - }, '>1 GTFS stops found for an AUS HaltID, ignoring ambiguous match') + }, '>1 GTFS stops/stations found for an AUS HaltID, ignoring ambiguous match') return null } - // todo: trace-log? 
- return stops[0] + + const stop = pick(stops[0], [ + 'stop_id', + 'stop_name', + 'stop_lat', 'stop_lon', + ]) + logger.trace({ + ausHaltId, + strippedHaltId, + gtfsStops: stops, + }, 'using most likely GTFS stop/station') + return stop } const cache = new QuickLRU({ @@ -102,6 +134,7 @@ LIMIT 2 return { queryGtfsStopByAusHaltID: cachedQueryGtfsStopByAusHaltID, + uncachedQueryGtfsStopByAusHaltID: queryGtfsStopByAusHaltID, stop, } } diff --git a/test/gtfs-stop-by-aus-haltid.js b/test/gtfs-stop-by-aus-haltid.js index d1b9011..55e58f2 100644 --- a/test/gtfs-stop-by-aus-haltid.js +++ b/test/gtfs-stop-by-aus-haltid.js @@ -15,6 +15,28 @@ after(async () => { await stop() }) +test('works with a corresponding Schedule station', async (t) => { + // from ./fixtures/aus-istfahrt-13865-00024-1#HVG.json + const stop = await queryGtfsStopByAusHaltID('ODEG_900210771') + deepStrictEqual(stop, { + stop_id: 'de:12063:900210771', + stop_name: 'Rathenow, Bahnhof', + stop_lat: 52.600105, + stop_lon: 12.354617, + }) +}) + +test('works with corresponding Schedule stops only, without parent station', async (t) => { + // from ./fixtures/aus-istfahrt-13865-00024-1#HVG.json + const stop = await queryGtfsStopByAusHaltID('ODEG_900360079') + deepStrictEqual(stop, { + stop_id: 'de:12053:900360079::1', + stop_name: 'Frankfurt (Oder), Kopernikusstr.', + stop_lat: 52.327039, + stop_lon: 14.511701, + }) +}) + test('does not allow `%` SQL injections', async (t) => { // There is only one stop with a stop_id matching `%:90044994%`, so if we insert `90044994%` unescaped into a `LIKE '%' || $1` query, we obtain a non-ambiguous result through injection. const stop = await queryGtfsStopByAusHaltID('90044994%')