From d0776520bdd29c3ecadf60578d67d4232a37263d Mon Sep 17 00:00:00 2001 From: Alvar Penning Date: Tue, 19 Mar 2024 15:22:15 +0100 Subject: [PATCH] PostgreSQL: get_sla_ok_percent to return decimal The final division within the get_sla_ok_percent SQL function in its PostgreSQL implementation silently truncated decimal places. An explicit decimal cast resulted for this equation resulted in a decimal value. To both verify and catch this in the future, a test with odd numbers was added. This already succeeded for MySQL, but needed the modified schema for PostgreSQL. Closes #648. --- schema/pgsql/schema.sql | 2 +- schema/pgsql/upgrades/1.1.1.sql | 140 ++++++++++++++++++++++++++++++++ tests/sql/sla_test.go | 16 ++++ 3 files changed, 157 insertions(+), 1 deletion(-) diff --git a/schema/pgsql/schema.sql b/schema/pgsql/schema.sql index 9027fac52..c2584d600 100644 --- a/schema/pgsql/schema.sql +++ b/schema/pgsql/schema.sql @@ -166,7 +166,7 @@ BEGIN END IF; END LOOP; - RETURN 100 * (total_time - problem_time) / total_time; + RETURN 100 * CAST(total_time - problem_time AS decimal) / total_time; END; $$; diff --git a/schema/pgsql/upgrades/1.1.1.sql b/schema/pgsql/upgrades/1.1.1.sql index ed12ed71b..f3e6366fe 100644 --- a/schema/pgsql/upgrades/1.1.1.sql +++ b/schema/pgsql/upgrades/1.1.1.sql @@ -1,3 +1,143 @@ +CREATE OR REPLACE FUNCTION get_sla_ok_percent( + in_host_id bytea20, + in_service_id bytea20, + in_start_time biguint, + in_end_time biguint +) + RETURNS decimal(7, 4) + LANGUAGE plpgsql + STABLE + PARALLEL RESTRICTED +AS $$ +DECLARE + last_event_time biguint := in_start_time; + last_hard_state tinyuint; + active_downtimes uint := 0; + problem_time biguint := 0; + total_time biguint; + row record; +BEGIN + IF in_end_time <= in_start_time THEN + RAISE 'end time must be greater than start time'; + END IF; + + total_time := in_end_time - in_start_time; + + -- Use the latest event at or before the beginning of the SLA interval as the initial state. + SELECT hard_state INTO last_hard_state + FROM sla_history_state s + WHERE s.host_id = in_host_id + AND ((in_service_id IS NULL AND s.service_id IS NULL) OR s.service_id = in_service_id) + AND s.event_time <= in_start_time + ORDER BY s.event_time DESC + LIMIT 1; + + -- If this does not exist, use the previous state from the first event after the beginning of the SLA interval. + IF last_hard_state IS NULL THEN + SELECT previous_hard_state INTO last_hard_state + FROM sla_history_state s + WHERE s.host_id = in_host_id + AND ((in_service_id IS NULL AND s.service_id IS NULL) OR s.service_id = in_service_id) + AND s.event_time > in_start_time + ORDER BY s.event_time ASC + LIMIT 1; + END IF; + + -- If this also does not exist, use the current host/service state. + IF last_hard_state IS NULL THEN + IF in_service_id IS NULL THEN + SELECT hard_state INTO last_hard_state + FROM host_state s + WHERE s.host_id = in_host_id; + ELSE + SELECT hard_state INTO last_hard_state + FROM service_state s + WHERE s.host_id = in_host_id + AND s.service_id = in_service_id; + END IF; + END IF; + + IF last_hard_state IS NULL THEN + last_hard_state := 0; + END IF; + + FOR row IN + ( + -- all downtime_start events before the end of the SLA interval + -- for downtimes that overlap the SLA interval in any way + SELECT + GREATEST(downtime_start, in_start_time) AS event_time, + 'downtime_start' AS event_type, + 1 AS event_prio, + NULL::tinyuint AS hard_state, + NULL::tinyuint AS previous_hard_state + FROM sla_history_downtime d + WHERE d.host_id = in_host_id + AND ((in_service_id IS NULL AND d.service_id IS NULL) OR d.service_id = in_service_id) + AND d.downtime_start < in_end_time + AND d.downtime_end >= in_start_time + ) UNION ALL ( + -- all downtime_end events before the end of the SLA interval + -- for downtimes that overlap the SLA interval in any way + SELECT + downtime_end AS event_time, + 'downtime_end' AS event_type, + 2 AS event_prio, + NULL::tinyuint AS hard_state, + NULL::tinyuint AS previous_hard_state + FROM sla_history_downtime d + WHERE d.host_id = in_host_id + AND ((in_service_id IS NULL AND d.service_id IS NULL) OR d.service_id = in_service_id) + AND d.downtime_start < in_end_time + AND d.downtime_end >= in_start_time + AND d.downtime_end < in_end_time + ) UNION ALL ( + -- all state events strictly in interval + SELECT + event_time, + 'state_change' AS event_type, + 0 AS event_prio, + hard_state, + previous_hard_state + FROM sla_history_state s + WHERE s.host_id = in_host_id + AND ((in_service_id IS NULL AND s.service_id IS NULL) OR s.service_id = in_service_id) + AND s.event_time > in_start_time + AND s.event_time < in_end_time + ) UNION ALL ( + -- end event to keep loop simple, values are not used + SELECT + in_end_time AS event_time, + 'end' AS event_type, + 3 AS event_prio, + NULL::tinyuint AS hard_state, + NULL::tinyuint AS previous_hard_state + ) + ORDER BY event_time, event_prio + LOOP + IF row.previous_hard_state = 99 THEN + total_time := total_time - (row.event_time - last_event_time); + ELSEIF ((in_service_id IS NULL AND last_hard_state > 0) OR (in_service_id IS NOT NULL AND last_hard_state > 1)) + AND last_hard_state != 99 + AND active_downtimes = 0 + THEN + problem_time := problem_time + row.event_time - last_event_time; + END IF; + + last_event_time := row.event_time; + IF row.event_type = 'state_change' THEN + last_hard_state := row.hard_state; + ELSEIF row.event_type = 'downtime_start' THEN + active_downtimes := active_downtimes + 1; + ELSEIF row.event_type = 'downtime_end' THEN + active_downtimes := active_downtimes - 1; + END IF; + END LOOP; + + RETURN 100 * CAST(total_time - problem_time AS decimal) / total_time; +END; +$$; + ALTER TABLE notification ALTER COLUMN name TYPE varchar(767); COMMENT ON COLUMN notification.name IS '255+1+255+1+255, i.e. "host.name!service.name!notification.name"'; diff --git a/tests/sql/sla_test.go b/tests/sql/sla_test.go index 8a898504a..006ec8d02 100644 --- a/tests/sql/sla_test.go +++ b/tests/sql/sla_test.go @@ -47,6 +47,22 @@ func TestSla(t *testing.T) { Start: 1000, End: 2000, Expected: 60.0, + }, { + Name: "MultipleStateChangesOddNumbers", + // Some flapping again, also test that calculations are rounded correctly including decimal places. + Events: []SlaHistoryEvent{ + &State{Time: 1000, State: 2, PreviousState: 99}, // -2.3% + &State{Time: 1023, State: 0, PreviousState: 2}, + &State{Time: 1100, State: 2, PreviousState: 0}, // -14.2% + &State{Time: 1242, State: 0, PreviousState: 2}, + &State{Time: 1300, State: 2, PreviousState: 0}, // -0.7% + &State{Time: 1307, State: 0, PreviousState: 2}, + &State{Time: 1400, State: 2, PreviousState: 0}, // -26.6% + &State{Time: 1666, State: 0, PreviousState: 2}, + }, + Start: 1000, + End: 2000, + Expected: 56.2, }, { Name: "OverlappingDowntimesAndProblems", // SLA should be 90%: