Skip to content

Commit

Permalink
🚀 Threshold monitor (#778)
Browse files Browse the repository at this point in the history
* 🚧 wip

* 🚧 wip

* 🚀 degraded

* 🚀 degraded

* 🚀 degraded

* 🚀 degraded

* 🤔 strange

* 🤔 trying

* 🤔 empty value

* 🚀 degraded

* 🔥 degraded

* 🚀 degraded

* 🔥 update pr

* 🛂 pr

* 📝 changelog

* 🚀 update image
  • Loading branch information
thibaultleouay authored Jul 1, 2024
1 parent b79b052 commit db163f6
Show file tree
Hide file tree
Showing 30 changed files with 2,576 additions and 227 deletions.
31 changes: 28 additions & 3 deletions apps/checker/cmd/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ func main() {
}
// We need a new client for each request to avoid connection reuse.
requestClient := &http.Client{
Timeout: 45 * time.Second,
Timeout: time.Duration(req.Timeout) * time.Millisecond,
}
defer requestClient.CloseIdleConnections()

Expand Down Expand Up @@ -171,6 +171,7 @@ func main() {
}

res.Assertions = assertionAsString
// That part could be refactored
if !isSuccessfull && req.Status == "active" {
// Q: Why don't we check here whether the status was previously active?
checker.UpdateStatus(ctx, checker.UpdateData{
Expand All @@ -182,8 +183,20 @@ func main() {
CronTimestamp: req.CronTimestamp,
})
}

if req.Status == "error" && isSuccessfull {
// Check if the status is degraded
if isSuccessfull && req.Status == "active" {
if req.DegradedAfter > 0 && res.Latency > req.DegradedAfter {
checker.UpdateStatus(ctx, checker.UpdateData{
MonitorId: req.MonitorID,
Status: "degraded",
Region: flyRegion,
StatusCode: res.StatusCode,
CronTimestamp: req.CronTimestamp,
})
}
}
// We were in error and are now successful; don't check for degraded.
if isSuccessfull && req.Status == "error" {
// Q: Why do we check the data before updating the status in this scenario?
checker.UpdateStatus(ctx, checker.UpdateData{
MonitorId: req.MonitorID,
Expand All @@ -193,6 +206,18 @@ func main() {
CronTimestamp: req.CronTimestamp,
})
}
// if we were in degraded and now we are successful, we should update the status to active
if isSuccessfull && req.Status == "degraded" {
if req.DegradedAfter > 0 && res.Latency <= req.DegradedAfter {
checker.UpdateStatus(ctx, checker.UpdateData{
MonitorId: req.MonitorID,
Status: "active",
Region: flyRegion,
StatusCode: res.StatusCode,
CronTimestamp: req.CronTimestamp,
})
}
}

if err := tinybirdClient.SendEvent(ctx, res, dataSourceName); err != nil {
log.Ctx(ctx).Error().Err(err).Msg("failed to send event to tinybird")
Expand Down
2 changes: 2 additions & 0 deletions apps/checker/request/request.go
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,8 @@ type CheckerRequest struct {
Method string `json:"method"`
CronTimestamp int64 `json:"cronTimestamp"`
Body string `json:"body"`
Timeout int64 `json:"timeout"`
DegradedAfter int64 `json:"degradedAfter,omitempty"`
Headers []struct {
Key string `json:"key"`
Value string `json:"value"`
Expand Down
10 changes: 9 additions & 1 deletion apps/server/src/checker/alerting.ts
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ export const triggerNotifications = async ({
monitorId: string;
statusCode?: number;
message?: string;
notifType: "alert" | "recovery";
notifType: "alert" | "recovery" | "degraded";
incidentId?: string;
}) => {
console.log(`💌 triggerAlerting for ${monitorId}`);
Expand Down Expand Up @@ -62,6 +62,14 @@ export const triggerNotifications = async ({
incidentId,
});
break;
case "degraded":
await providerToFunction[notif.notification.provider].sendDegraded({
monitor,
notification: selectNotificationSchema.parse(notif.notification),
statusCode,
message,
});
break;
}
// ALPHA
await checkerAudit.publishAuditLog({
Expand Down
57 changes: 49 additions & 8 deletions apps/server/src/checker/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -59,11 +59,52 @@ checkerRoute.post("/updateStatus", async (c) => {
and(
eq(incidentTable.monitorId, Number(monitorId)),
isNull(incidentTable.resolvedAt),
isNull(incidentTable.acknowledgedAt),
),
isNull(incidentTable.acknowledgedAt)
)
)
.get();

if (status === "degraded") {
// We upsert the status of the monitor
await upsertMonitorStatus({
monitorId: monitorId,
status: "degraded",
region: region,
});
await checkerAudit.publishAuditLog({
id: `monitor:${monitorId}`,
action: "monitor.degraded",
targets: [{ id: monitorId, type: "monitor" }],
metadata: { region, statusCode: Number(statusCode) },
});
const currentMonitor = await db
.select()
.from(schema.monitor)
.where(eq(schema.monitor.id, Number(monitorId)))
.get();
if (currentMonitor?.status === "active") {
const redisKey = `${monitorId}-${cronTimestamp}-degraded`;
// We add the new region to the set
await redis.sadd(redisKey, region);
// Let's add an expiry to the set
await redis.expire(redisKey, 60 * 60 * 24);
// We get the number of regions affected
const nbAffectedRegion = await redis.scard(redisKey);

const monitor = selectMonitorSchema.parse(currentMonitor);

const numberOfRegions = monitor.regions.length;

if (nbAffectedRegion > numberOfRegions / 2) {
await triggerNotifications({
monitorId,
statusCode,
message,
notifType: "degraded",
});
}
}
}
// if we are in error
if (status === "error") {
// trigger alerting
Expand Down Expand Up @@ -100,7 +141,7 @@ checkerRoute.post("/updateStatus", async (c) => {
const numberOfRegions = monitor.regions.length;

console.log(
`🤓 MonitorID ${monitorId} incident current affected ${nbAffectedRegion} total region ${numberOfRegions}`,
`🤓 MonitorID ${monitorId} incident current affected ${nbAffectedRegion} total region ${numberOfRegions}`
);
// If the number of affected regions is greater than half of the total region, we trigger the alerting
// 4 of 6 monitor need to fail to trigger an alerting
Expand All @@ -114,8 +155,8 @@ checkerRoute.post("/updateStatus", async (c) => {
eq(incidentTable.monitorId, Number(monitorId)),
isNull(incidentTable.resolvedAt),
isNull(incidentTable.acknowledgedAt),
eq(incidentTable.startedAt, new Date(cronTimestamp)),
),
eq(incidentTable.startedAt, new Date(cronTimestamp))
)
)
.get();

Expand Down Expand Up @@ -192,7 +233,7 @@ checkerRoute.post("/updateStatus", async (c) => {
const numberOfRegions = monitor.regions.length;

console.log(
`🤓 MonitorId ${monitorId} recovering incident current ${nbAffectedRegion} total region ${numberOfRegions}`,
`🤓 MonitorId ${monitorId} recovering incident current ${nbAffectedRegion} total region ${numberOfRegions}`
);
// // If the number of affected regions is greater than half of the total region, we trigger the alerting
// // 4 of 6 monitor need to fail to trigger an alerting
Expand All @@ -204,8 +245,8 @@ checkerRoute.post("/updateStatus", async (c) => {
and(
eq(incidentTable.monitorId, Number(monitorId)),
isNull(incidentTable.resolvedAt),
isNull(incidentTable.acknowledgedAt),
),
isNull(incidentTable.acknowledgedAt)
)
)
.get();
if (incident) {
Expand Down
22 changes: 20 additions & 2 deletions apps/server/src/checker/utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,23 +6,28 @@ import type {
import {
sendAlert as sendDiscordAlert,
sendRecovery as sendDiscordRecovery,
sendDegraded as sendDiscordDegraded,
} from "@openstatus/notification-discord";
import {
sendAlert as sendEmailAlert,
sendRecovery as sendEmailRecovery,
sendDegraded as sendEmailDegraded,
} from "@openstatus/notification-emails";
import {
sendAlert as sendSlackAlert,
sendRecovery as sendSlackRecovery,
sendDegraded as sendSlackDegraded,
} from "@openstatus/notification-slack";
import {
sendAlert as sendSmsAlert,
sendRecovery as sendSmsRecovery,
sendDegraded as sendSmsDegraded,
} from "@openstatus/notification-twillio-sms";

import {
sendAlert as sendPagerdutyAlert,
sendRecovery as sendPagerDutyRecovery,
sendDegraded as sendPagerDutyDegraded,
} from "@openstatus/notification-pagerduty";

type SendNotification = ({
Expand All @@ -42,20 +47,33 @@ type SendNotification = ({
/**
 * The set of sender functions a notification provider must expose.
 * Every member shares the `SendNotification` signature.
 */
interface Notif {
  /** Sender stored under the `sendAlert` key. */
  sendAlert: SendNotification;
  /** Sender stored under the `sendRecovery` key. */
  sendRecovery: SendNotification;
  /** Sender stored under the `sendDegraded` key. */
  sendDegraded: SendNotification;
}
/**
 * Lookup table from notification provider to that provider's sender functions.
 *
 * Each provider is listed exactly once and supplies all three senders
 * (`sendAlert`, `sendRecovery`, `sendDegraded`). The `satisfies` clause makes
 * the compiler reject a provider that is missing from the table or that omits
 * one of the senders required by `Notif`, while keeping the literal key types.
 */
export const providerToFunction = {
  email: {
    sendAlert: sendEmailAlert,
    sendRecovery: sendEmailRecovery,
    sendDegraded: sendEmailDegraded,
  },
  slack: {
    sendAlert: sendSlackAlert,
    sendRecovery: sendSlackRecovery,
    sendDegraded: sendSlackDegraded,
  },
  discord: {
    sendAlert: sendDiscordAlert,
    sendRecovery: sendDiscordRecovery,
    sendDegraded: sendDiscordDegraded,
  },
  sms: {
    sendAlert: sendSmsAlert,
    sendRecovery: sendSmsRecovery,
    sendDegraded: sendSmsDegraded,
  },
  pagerduty: {
    sendAlert: sendPagerdutyAlert,
    sendRecovery: sendPagerDutyRecovery,
    sendDegraded: sendPagerDutyDegraded,
  },
} satisfies Record<NotificationProvider, Notif>;
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
2 changes: 2 additions & 0 deletions apps/web/src/app/api/checker/cron/_cron.ts
Original file line number Diff line number Diff line change
Expand Up @@ -156,6 +156,8 @@ const createCronTask = async ({
headers: row.headers,
status: status,
assertions: row.assertions ? JSON.parse(row.assertions) : null,
degradedAfter: row.degradedAfter,
timeout: row.timeout,
};

const newTask: google.cloud.tasks.v2beta3.ITask = {
Expand Down
2 changes: 2 additions & 0 deletions apps/web/src/app/api/checker/schema.ts
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@ export const payloadSchema = z.object({
cronTimestamp: z.number(),
status: z.enum(monitorStatus),
assertions: z.array(base).nullable(),
timeout: z.number().default(45000),
degradedAfter: z.number().nullable(),
});

export type Payload = z.infer<typeof payloadSchema>;
Original file line number Diff line number Diff line change
Expand Up @@ -36,15 +36,17 @@ export default async function EditPage({
defaultSection={search.success ? search.data.section : undefined}
defaultValues={{
...monitor,
// FIXME - Why is this not working?
degradedAfter: monitor.degradedAfter ?? undefined,
pages: pages
.filter((page) =>
page.monitorsToPages.map(({ monitorId }) => monitorId).includes(id),
page.monitorsToPages.map(({ monitorId }) => monitorId).includes(id)
)
.map(({ id }) => id),
notifications: monitorNotifications?.map(({ id }) => id),
tags: tags
.filter((tag) =>
tag.monitor.map(({ monitorId }) => monitorId).includes(id),
tag.monitor.map(({ monitorId }) => monitorId).includes(id)
)
.map(({ id }) => id),
}}
Expand Down
4 changes: 2 additions & 2 deletions apps/web/src/components/content/changelog.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,12 @@ export function ChangelogCard({ post }: { post: Changelog }) {
{formatDate(new Date(post.publishedAt))}
</p>
<h1 className="mb-5 font-cal text-3xl">{post.title}</h1>
<div className="relative h-64 w-full overflow-hidden rounded-lg border border-border">
<div className="relative aspect-video w-full overflow-hidden rounded-lg border border-border">
<Image
src={post.image}
fill={true}
alt={post.title}
className="object-cover"
className="object-contain"
/>
</div>
</div>
Expand Down
8 changes: 5 additions & 3 deletions apps/web/src/components/forms/monitor/form.tsx
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,6 @@ import type { RegionChecker } from "@/components/ping-response-analysis/utils";
import { toast, toastAction } from "@/lib/toast";
import { formatDuration } from "@/lib/utils";
import { api } from "@/trpc/client";
import type { Writeable } from "@/types/utils";
import { SaveButton } from "../shared/save-button";
import { General } from "./general";
import { RequestTestButton } from "./request-test-button";
Expand Down Expand Up @@ -87,6 +86,9 @@ export function MonitorForm({
statusAssertions: _assertions.filter((a) => a.type === "status") as any, // TS considers a.type === "header"
// biome-ignore lint/suspicious/noExplicitAny: <explanation>
headerAssertions: _assertions.filter((a) => a.type === "header") as any, // TS considers a.type === "status"

degradedAfter: defaultValues?.degradedAfter,
timeout: defaultValues?.timeout || 45000,
},
});
const router = useRouter();
Expand Down Expand Up @@ -257,9 +259,9 @@ export function MonitorForm({
>
<TabsList>
<TabsTrigger value="request">Request</TabsTrigger>
<TabsTrigger value="scheduling">Scheduling</TabsTrigger>
<TabsTrigger value="scheduling">Scheduling & Regions</TabsTrigger>
<TabsTrigger value="assertions">
Assertions{" "}
Timing & Assertions{" "}
{_assertions.length ? (
<Badge variant="secondary" className="ml-1">
{_assertions.length}
Expand Down
Loading

0 comments on commit db163f6

Please sign in to comment.