diff --git a/CHANGELOG.md b/CHANGELOG.md index 9c74d33..e1cee9c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,7 +3,10 @@ ### New features -- Added support for multiple watchdogs, to monitor occurrences of specific messages. +- Support for multiple watchdogs, to not just monitor heartbeats but to also monitor occurrences of specific messages. + +- Support to count specific messages during a configurable interval, to report basic statistics (and to implicitly + ensure the monitor itself is still operational). - The log file's directory name can either be specified in `filename`, or in `dirname` (default: current folder). diff --git a/README.md b/README.md index 6d7243c..498dbdb 100644 --- a/README.md +++ b/README.md @@ -6,8 +6,8 @@ serial): - Combines the stream of received bytes into single-line messages. - Uses [configurable](./config-example-ttn.js) regular expressions to determine a log level. - Uses the log level to save the single-line messages to rotating log files, each line prefixed with a timestamp. -- Optionally sends notifications to Slack and/or Telegram for lines with some minimum log level, or when the UART has - been inactive or has not received specific messages for some configurable time. +- Optionally sends notifications to Slack and/or Telegram for lines with some minimum log level, when the UART has been + inactive or has not received specific messages for some configurable time, and for basic reports. This is not intended to be a production-ready monitoring mechanism, if only as it has no fallback for losing its internet connection. @@ -141,8 +141,8 @@ sudo apt-get update sudo apt-get install vim ``` -...after which `vi -R uart-monitor-debug-20180301.log.gz` just works. (Note that `grep` already works fine for -compressed logs.) +...after which `vi -R uart-monitor-debug-20180301.log.gz` just works. To search the files, `zgrep` is propably already +installed. ### Cleaning the resulting log files diff --git a/config-example-ttn.js b/config-example-ttn.js index e140f77..3d4a19b 100644 --- a/config-example-ttn.js +++ b/config-example-ttn.js @@ -87,6 +87,7 @@ module.exports = { // } // }, + // Optional watchdogs, only reported to Slack and/or Telegram. watchdogs: [ { // No include/exclude patterns, hence the pure heartbeat watchdog that is satisfied for any UART byte @@ -101,6 +102,7 @@ module.exports = { // matched for a FULL line from the UART. To independently monitor multiple types of messages one should // configure multiple watchdogs, instead of listing multiple patterns within a single watchdog. name: 'LoRaWAN uplink', + // MQTT: Sending UPLINK OK include: [/sending uplink ok/i], exclude: [], timeout: 15 * MINUTES, @@ -109,6 +111,7 @@ module.exports = { { // The TTN Gateway sends a status message every 30 seconds, but booting might take a bit longer name: 'gateway status', + // MQTT: Sending status succeeded include: [/sending status succeeded/i], exclude: [], timeout: 5 * MINUTES, @@ -117,11 +120,122 @@ module.exports = { { // The TTN Gateway reboots for a firmware upgrade once per day name: 'daily firmware upgrade', - include: [/rebooting gateway for firmware update check/i], + // MAIN: Rebooting gateway for firmware update check + include: [/firmware update/i], exclude: [], timeout: 25 * HOURS, repeat: 12 * HOURS } + ], + + // Optional reporters, only sending notifications to Slack and/or Telegram. + reporters: [ + { + // One can define multiple schedules, where each will keep track of its own counts for its interval. + schedules: [ + // Enable for quick testing: + // { + // name: '5 minutes report', + // interval: 5 * MINUTES, + // level: 'warn' + // }, + { + name: 'Hourly gateway report', + // The interval for which to collect and report the statistics. The very first report might use a + // smaller interval, as the monitor runs the reporting at recognizable times. Like if the interval + // is an exact number of days, the first report is sent to Slack and/or Telegram around midnight + // (LOCAL TIME), no matter when the monitor was started. Likewise this detects multiples of hours, + // 30 minutes, 15 minutes, 10 minutes, 5 minutes, 1 minute, 30 seconds, 15 seconds, 10 seconds, + // 5 seconds and a single second. + interval: 60 * MINUTES, + // The log level at which to report the statistics, matched against the minimum level for Slack and + // Telegram. Here: Slack only. + level: 'warn' + }, + { + name: 'Daily gateway report', + interval: 24 * HOURS, + // Both Slack and Telegram + level: 'error' + } + ], + reports: [ + { + name: 'dropped LoRa packets', + // LORA: Packet dropped! Bad CRC + include: [/LORA: Packet dropped/i], + exclude: [] + }, + { + name: 'accepted LoRa packets', + include: [/LORA: Accepted packet/i], + exclude: [] + }, + { + name: 'forwarded LoRa uplinks', + include: [/MQTT: Sending UPLINK OK/i], + exclude: [] + }, + { + name: 'received LoRa downlinks', + include: [/MQTT: Received DOWNLINK/i], + exclude: [] + }, + { + name: 'status packets', + include: [/MQTT: Sending status packet/i], + // MQTT: Sending status succeeded: 11 + exclude: [/MQTT: Sending status succeeded/] + }, + { + name: 'MQTT errors', + // MQTT: Connection failed + // MQTT: Sending status failed + // MQTT: Sending UPLINK failed: -1 + // ...all followed by: + // MAIN: MQTT error + include: [/MQTT.* error/i], + // MQTT: Report reboot error: 0110 + exclude: [/report .* error/i] + }, + { + name: 'network errors', + // INET: Connected to a network, waiting for DHCP lease, checking validity with ping + // INET: Trying to connect to WiFi router again (after 120 seconds) + // INET: No Ethernet and WiFi link (after 10 seconds) + include: [/INET: No Ethernet and WiFi link/i], + // INET: Error sending probe on Eth + // INET: Error sending probe on WiFi + exclude: [/INET: Error sending probe/] + }, + { + name: 'reboots', + // *** Application reboot + // Reboot reason: 0x10 + include: [/Reboot reason/i], + // MAIN: Rebooting gateway for firmware update check + exclude: [/upgrade check/i] + } + ] + }, + { + schedules: [ + { + name: 'Hourly UART report', + interval: 60 * MINUTES, + // Slack only + level: 'warn' + } + ], + reports: [ + { + name: 'UART messages', + include: [/.*/i], + exclude: [] + } + ] + } + ] }; diff --git a/index.js b/index.js index 2035339..e30f56b 100644 --- a/index.js +++ b/index.js @@ -1,8 +1,8 @@ /** * Monitors a Raspberry Pi UART, combining the stream of received bytes into single-line messages, determining a log * level based on regular expressions, saving the messages to rotating log files while prefixing each message with a - * timestamp, and optionally sending notifications to Slack and/or Telegram for some minimum log level or when the UART - * has been inactive or has not seen specific messages for too long. + * timestamp, and optionally sending notifications to Slack and/or Telegram for some minimum log level, when the UART + * has been inactive or has not seen specific messages for too long, or for some basic reports. */ 'use strict'; @@ -12,6 +12,7 @@ const Serial = require('raspi-serial').Serial; const LogService = require('./log-service'); const NotificationService = require('./notification-service'); const WatchdogService = require('./watchdog-service'); +const ReportingService = require('./reporting-service'); const config = require('./config'); @@ -21,8 +22,9 @@ logService.warn('[monitor] Starting UART monitor'); const notificationService = new NotificationService(config.notifications); notificationService.warn('Starting UART monitor'); -// The watchdog only sends notifications to Telegram and/or Slack (if configured), not to the log files +// The watchdogs and reporters only send notifications to Telegram and/or Slack (if configured), not to the log files const watchdogService = new WatchdogService(config.watchdogs, notificationService); +const reportingService = new ReportingService(config.reporters, notificationService); /** * Gets (guesses) a log level based on the message's text, using the regular expressions from the configuration. @@ -54,6 +56,7 @@ function log(data) { logService.log(level, msg); notificationService.log(level, msg); watchdogService.message(msg); + reportingService.message(msg); } // The message following the last newline might not be complete yet; handle at a later time buffer = lines[lines.length - 1]; diff --git a/reporter.js b/reporter.js new file mode 100644 index 0000000..e6c62e8 --- /dev/null +++ b/reporter.js @@ -0,0 +1,93 @@ +/** + * Configures one or more counters for regular expressions, and reports the collected statistics at the configured + * interval. + */ + +'use strict'; + +class Reporter { + + constructor(reports, schedule, logger) { + this.reports = reports; + this.schedule = schedule; + this.logger = logger; + this.lastRun = Date.now(); + this.counts = {}; + + if (schedule.interval) { + // The first run might use an interval that is a lot smaller than the configured interval; see below + this._schedule(); + } + else { + this.logger.warn('No reporting interval in configuration; not enabling reporter ' + schedule.name); + } + } + + /** + * Compares the given message to the known regular expressions, and increases the report's counter if matched. + * + * @param msg The message to compare against the configured patterns + */ + message(msg) { + this.reports.forEach(report => { + if (report.include.some(p => p.test(msg)) && !(report.exclude || []).some(p => p.test(msg))) { + this.counts[report.name] = (this.counts[report.name] || 0) + 1; + } + }); + } + + /** + * Schedules the next report. + * + * @private + */ + _schedule() { + setTimeout(() => this._report(), this._minimizeInterval(this.schedule.interval)); + } + + /** + * Decreases the given `interval` to match the last possible time that coincides with the unit of the given + * interval, if any. + * + * Like if the interval is an exact number of days, this yields the milliseconds until some next day (local time), + * and when called just before midnight the returned interval might be quite small. Likewise this detects multiples + * of hours, 30 minutes, 15 minutes, 10 minutes, 5 minutes, 1 minute, 30 seconds, 15 seconds, 10 seconds, 5 seconds + * and a single second. If no unit is detected, this just returns the given interval. But as, e.g., 11 minutes will + * be matched as a multiple of 1 minute, and 62 seconds as a multiple of 1 second, such will only happen when + * explicitly passing an interval that cannot be rounded to a second. + * + * @param interval the maximum number of milliseconds until the expected time + * @private + */ + _minimizeInterval(interval) { + const unit = [24 * 3600, 3600, 30 * 60, 15 * 60, 10 * 60, 5 * 60, 60, 30, 15, 10, 5, 1].find(s => interval % (s * 1000) === 0) * 1000; + if (!unit) { + return interval; + } + const utc = new Date(); + // Ensure daily reports start at midnight in the local timezone + const now = utc.getTime() - (utc.getTimezoneOffset() * 60 * 1000); + const next = Math.floor((now + interval) / unit) * unit; + return next - now; + } + + /** + * Reports the collected statistics at the configured log level, and schedules a new report. + * + * @private + */ + _report() { + const msg = this.schedule.name + ' since ' + (new Date(this.lastRun)).toISOString() + + ':\n' + + this.reports.map(report => '\u2022 ' + report.name + ": " + (this.counts[report.name] || 0)).join('\n'); + + this.logger.log(this.schedule.level || 'info', msg); + + this.lastRun = Date.now(); + this.counts = {}; + this._schedule(); + } + +} + +module.exports = Reporter; diff --git a/reporting-service.js b/reporting-service.js new file mode 100644 index 0000000..e36c94b --- /dev/null +++ b/reporting-service.js @@ -0,0 +1,25 @@ +/** + * Configures one or more reporters, each with one or more schedules, and delegates messages to each one of them. + */ + +'use strict'; + +const Reporter = require('./reporter'); + +class ReportingService { + + constructor(config, logger) { + this.reporters = []; + config.forEach(reporter => { + // In case of multiple schedules: duplicate each reporter for each schedule + reporter.schedules.forEach(schedule => this.reporters.push(new Reporter(reporter.reports, schedule, logger))); + }); + } + + message(msg) { + this.reporters.forEach(reporter => reporter.message(msg)); + } + +} + +module.exports = ReportingService;