From 6fb9961c74da55d1d270621770b2483b3dfddc1c Mon Sep 17 00:00:00 2001 From: Christy Jacob Date: Fri, 20 Dec 2024 22:20:24 +0530 Subject: [PATCH 1/2] fix: incident creation and resolution logic --- docker-compose.yml | 9 +- logger.go | 69 ++++++++++ main.go | 325 +++++++++++++++++++++++---------------------- 3 files changed, 236 insertions(+), 167 deletions(-) create mode 100644 logger.go diff --git a/docker-compose.yml b/docker-compose.yml index fd91496..de1d2cf 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,16 +1,15 @@ -version: '3.8' - services: monitoring: build: context: . dockerfile: Dockerfile + hostname: monitoring-local command: - monitoring - "--url=${BETTER_STACK_URL}" - - "--interval=10" - - "--cpu-limit=90" - - "--memory-limit=80" + - "--interval=5" + - "--cpu-limit=5" + - "--memory-limit=10" - "--disk-limit=85" volumes: - /:/host:ro diff --git a/logger.go b/logger.go new file mode 100644 index 0000000..45c20e4 --- /dev/null +++ b/logger.go @@ -0,0 +1,69 @@ +package main + +import ( + "fmt" + "log" + "os" + "time" +) + +const ( + colorReset = "\033[0m" + colorRed = "\033[31m" + colorGreen = "\033[32m" + colorYellow = "\033[33m" + colorBlue = "\033[34m" + colorPurple = "\033[35m" + colorCyan = "\033[36m" +) + +type Logger struct { + logger *log.Logger +} + +func New() *Logger { + return &Logger{ + logger: log.New(os.Stdout, "", 0), + } +} + +func (l *Logger) formatMessage(level, format string, args ...interface{}) string { + timestamp := time.Now().Format("2006-01-02 15:04:05") + message := fmt.Sprintf(format, args...) + return fmt.Sprintf("%s [%s] %s", timestamp, level, message) +} + +func (l *Logger) Log(format string, args ...interface{}) { + msg := l.formatMessage("LOG", format, args...) + l.logger.Printf("%s", msg) +} + +func (l *Logger) Success(format string, args ...interface{}) { + msg := l.formatMessage("SUCCESS", format, args...) + l.logger.Printf("%s%s%s", colorGreen, msg, colorReset) +} + +func (l *Logger) Warn(format string, args ...interface{}) { + msg := l.formatMessage("WARNING", format, args...) + l.logger.Printf("%s%s%s", colorYellow, msg, colorReset) +} + +func (l *Logger) Error(format string, args ...interface{}) { + msg := l.formatMessage("ERROR", format, args...) + l.logger.Printf("%s%s%s", colorRed, msg, colorReset) +} + +func (l *Logger) Info(format string, args ...interface{}) { + msg := l.formatMessage("INFO", format, args...) + l.logger.Printf("%s%s%s", colorBlue, msg, colorReset) +} + +func (l *Logger) Debug(format string, args ...interface{}) { + msg := l.formatMessage("DEBUG", format, args...) + l.logger.Printf("%s%s%s", colorCyan, msg, colorReset) +} + +func (l *Logger) Fatal(format string, args ...interface{}) { + msg := l.formatMessage("FATAL", format, args...) + l.logger.Fatalf("%s%s%s", colorPurple, msg, colorReset) +} \ No newline at end of file diff --git a/main.go b/main.go index 4d63b86..94afcd2 100644 --- a/main.go +++ b/main.go @@ -4,7 +4,7 @@ import ( "encoding/json" "flag" "fmt" - "log" + "io" "net/http" "os" "path/filepath" @@ -16,23 +16,25 @@ import ( "github.com/shirou/gopsutil/v3/mem" ) -type Incident struct { - Title string `json:"title"` - Cause string `json:"cause"` - AlertID string `json:"alert_id"` - Timestamp int64 `json:"timestamp"` - Resolved bool `json:"resolved,omitempty"` +type Metric struct { + Title string `json:"title"` + Cause string `json:"cause"` + AlertID string `json:"alert_id"` + Timestamp int64 `json:"timestamp"` + Status string `json:"status"` + Value float64 `json:"value"` + Limit float64 `json:"limit"` } type SystemMonitor struct { - httpClient *http.Client - incidents map[string][]Incident + httpClient *http.Client betterStackURL string - hostname string - cpuLimit float64 - memoryLimit float64 - diskLimit float64 - interval int + hostname string + cpuLimit float64 + memoryLimit float64 + diskLimit float64 + interval int + log *Logger } func NewSystemMonitor(betterStackURL string, interval int, cpuLimit, memoryLimit, diskLimit float64) (*SystemMonitor, error) { @@ -45,21 +47,17 @@ func NewSystemMonitor(betterStackURL string, interval int, cpuLimit, memoryLimit httpClient: &http.Client{ Timeout: 5 * time.Second, }, - incidents: map[string][]Incident{ - "cpu": {}, - "memory": {}, - "disk": {}, - }, betterStackURL: betterStackURL, - hostname: hostname, - cpuLimit: cpuLimit, - memoryLimit: memoryLimit, - diskLimit: diskLimit, - interval: interval, + hostname: hostname, + cpuLimit: cpuLimit, + memoryLimit: memoryLimit, + diskLimit: diskLimit, + interval: interval, + log: New(), }, nil } -func (s *SystemMonitor) evaluateCPUIncident() (*Incident, error) { +func (s *SystemMonitor) checkCPU() error { duration := float64(s.interval) / 10 if duration < 5 { duration = 5 @@ -70,119 +68,149 @@ func (s *SystemMonitor) evaluateCPUIncident() (*Incident, error) { cpuPercent, err := cpu.Percent(time.Duration(duration)*time.Second, false) if err != nil { - return nil, fmt.Errorf("failed to get CPU usage: %v", err) + return fmt.Errorf("failed to get CPU usage: %v", err) } if len(cpuPercent) == 0 { - return nil, nil + return nil } - log.Printf("CPU usage: %.2f%%\n", cpuPercent[0]) - if cpuPercent[0] > s.cpuLimit { - return &Incident{ - Title: fmt.Sprintf("CPU usage higher than %.0f%%! - %s", s.cpuLimit, s.hostname), - Cause: "High CPU usage", - AlertID: fmt.Sprintf("high-cpu-%s", s.hostname), - Timestamp: time.Now().Unix(), - }, nil + value := cpuPercent[0] + status := s.getStatus(value, s.cpuLimit) + if status == "fail" { + s.log.Warn("CPU usage %.2f%% exceeds limit of %.2f%%", value, s.cpuLimit) + } else { + s.log.Log("CPU usage: %.2f%% (limit: %.2f%%)", value, s.cpuLimit) + } + + metric := Metric{ + Title: fmt.Sprintf("CPU Usage - %s", s.hostname), + Cause: "CPU monitoring check", + AlertID: fmt.Sprintf("cpu-%s", s.hostname), + Timestamp: time.Now().Unix(), + Status: status, + Value: value, + Limit: s.cpuLimit, } - return nil, nil + return s.sendMetric(metric) } -func (s *SystemMonitor) evaluateMemoryIncident() (*Incident, error) { +func (s *SystemMonitor) checkMemory() error { vmStat, err := mem.VirtualMemory() if err != nil { - return nil, fmt.Errorf("failed to get memory stats: %v", err) + return fmt.Errorf("failed to get memory stats: %v", err) } - log.Printf("Memory usage: %.2f%% (Available: %d MB, Total: %d MB)\n", - vmStat.UsedPercent, - vmStat.Available/(1024*1024), - vmStat.Total/(1024*1024)) + value := vmStat.UsedPercent + status := s.getStatus(value, s.memoryLimit) + if status == "fail" { + s.log.Warn("Memory usage %.2f%% exceeds limit of %.2f%%", value, s.memoryLimit) + } else { + s.log.Log("Memory usage: %.2f%% (limit: %.2f%%), Available: %d MB, Total: %d MB", + value, + s.memoryLimit, + vmStat.Available/(1024*1024), + vmStat.Total/(1024*1024)) + } - if vmStat.UsedPercent > s.memoryLimit { - return &Incident{ - Title: fmt.Sprintf("Memory usage higher than %.0f%%! - %s", s.memoryLimit, s.hostname), - Cause: "High memory usage", - AlertID: fmt.Sprintf("high-memory-%s", s.hostname), - Timestamp: time.Now().Unix(), - }, nil + metric := Metric{ + Title: fmt.Sprintf("Memory Usage - %s", s.hostname), + Cause: "Memory monitoring check", + AlertID: fmt.Sprintf("memory-%s", s.hostname), + Timestamp: time.Now().Unix(), + Status: status, + Value: value, + Limit: s.memoryLimit, } - return nil, nil + return s.sendMetric(metric) } -func (s *SystemMonitor) evaluateDiskIncident() ([]Incident, error) { - var incidents []Incident - +func (s *SystemMonitor) checkDisk() error { // Check root partition usage, err := disk.Usage("/") if err != nil { - return nil, fmt.Errorf("failed to get disk usage: %v", err) + return fmt.Errorf("failed to get disk usage: %v", err) } - log.Printf("Diskspace used /: %.2f%% (Free: %d MB, Total: %d MB)\n", - usage.UsedPercent, - usage.Free/(1024*1024), - usage.Total/(1024*1024)) + value := usage.UsedPercent + status := s.getStatus(value, s.diskLimit) + if status == "fail" { + s.log.Warn("Root disk usage %.2f%% exceeds limit of %.2f%%", value, s.diskLimit) + } else { + s.log.Log("Root disk usage: %.2f%% (limit: %.2f%%), Free: %d MB, Total: %d MB", + value, + s.diskLimit, + usage.Free/(1024*1024), + usage.Total/(1024*1024)) + } - if usage.UsedPercent > s.diskLimit { - incidents = append(incidents, Incident{ - Title: fmt.Sprintf("Root disk usage higher than %.0f%%! - %s", s.diskLimit, s.hostname), - Cause: "High disk usage", - AlertID: fmt.Sprintf("high-disk-%s", s.hostname), - Timestamp: time.Now().Unix(), - }) + if err := s.sendMetric(Metric{ + Title: fmt.Sprintf("Root Disk Usage - %s", s.hostname), + Cause: "Disk monitoring check", + AlertID: fmt.Sprintf("disk-root-%s", s.hostname), + Timestamp: time.Now().Unix(), + Status: status, + Value: value, + Limit: s.diskLimit, + }); err != nil { + return err } // Check mounted directories mounts, err := filepath.Glob("/mnt/*") if err != nil { - return nil, fmt.Errorf("failed to list mounted directories: %v", err) + return fmt.Errorf("failed to list mounted directories: %v", err) } for _, mount := range mounts { usage, err := disk.Usage(mount) if err != nil { - log.Printf("Failed to get disk usage for %s: %v\n", mount, err) + s.log.Error("Failed to get disk usage for %s: %v", mount, err) continue } - log.Printf("Diskspace used %s: %.2f%% (Free: %d MB, Total: %d MB)\n", - mount, - usage.UsedPercent, - usage.Free/(1024*1024), - usage.Total/(1024*1024)) + value := usage.UsedPercent + status := s.getStatus(value, s.diskLimit) + if status == "fail" { + s.log.Warn("Disk usage for %s %.2f%% exceeds limit of %.2f%%", mount, value, s.diskLimit) + } else { + s.log.Log("Disk usage for %s: %.2f%% (limit: %.2f%%), Free: %d MB, Total: %d MB", + mount, + value, + s.diskLimit, + usage.Free/(1024*1024), + usage.Total/(1024*1024)) + } - if usage.UsedPercent > s.diskLimit { - incidents = append(incidents, Incident{ - Title: fmt.Sprintf("%s disk usage higher than %.0f%%! - %s", mount, s.diskLimit, s.hostname), - Cause: "High disk usage", - AlertID: fmt.Sprintf("high-disk-%s", s.hostname), - Timestamp: time.Now().Unix(), - }) + if err := s.sendMetric(Metric{ + Title: fmt.Sprintf("Disk Usage %s - %s", mount, s.hostname), + Cause: "Disk monitoring check", + AlertID: fmt.Sprintf("disk-%s-%s", filepath.Base(mount), s.hostname), + Timestamp: time.Now().Unix(), + Status: status, + Value: value, + Limit: s.diskLimit, + }); err != nil { + return err } } - return incidents, nil -} - -func (s *SystemMonitor) createIncident(incident Incident) error { - log.Printf("Triggering incident: %s\n", incident.Title) - return s.sendIncident(incident) + return nil } -func (s *SystemMonitor) resolveIncident(incident Incident) error { - log.Printf("Resolving incident: %s\n", incident.Title) - incident.Resolved = true - return s.sendIncident(incident) +func (s *SystemMonitor) getStatus(value, limit float64) string { + if value > limit { + return "fail" + } + return "pass" } -func (s *SystemMonitor) sendIncident(incident Incident) error { - body, err := json.Marshal(incident) +func (s *SystemMonitor) sendMetric(metric Metric) error { + body, err := json.Marshal(metric) if err != nil { - return fmt.Errorf("failed to marshal incident: %v", err) + return fmt.Errorf("failed to marshal metric: %v", err) } req, err := http.NewRequest(http.MethodPost, s.betterStackURL, strings.NewReader(string(body))) @@ -190,8 +218,9 @@ func (s *SystemMonitor) sendIncident(incident Incident) error { return fmt.Errorf("failed to create request: %v", err) } - req.Header.Set("Content-Type", "application/json") - req.Header.Set("User-Agent", "Appwrite system-monitoring") + req.Header.Set("Content-Type", "application/json; charset=utf-8") + req.Header.Set("Accept", "application/json") + req.Header.Set("User-Agent", "Appwrite Resource Monitoring") resp, err := s.httpClient.Do(req) if err != nil { @@ -199,52 +228,20 @@ func (s *SystemMonitor) sendIncident(incident Incident) error { } defer resp.Body.Close() - if resp.StatusCode >= 400 { - return fmt.Errorf("request failed with status: %d", resp.StatusCode) - } - - return nil -} - -func (s *SystemMonitor) processType(monitorType string, evaluate func() (interface{}, error)) error { - incidents, err := evaluate() + // Read response body + respBody, err := io.ReadAll(resp.Body) if err != nil { - return fmt.Errorf("failed to evaluate %s: %v", monitorType, err) - } - - if incidents == nil { - if len(s.incidents[monitorType]) > 0 { - log.Printf("Resolving active incident of type %s\n", monitorType) - for _, incident := range s.incidents[monitorType] { - if err := s.resolveIncident(incident); err != nil { - log.Printf("Failed to resolve incident: %v\n", err) - } - } - s.incidents[monitorType] = nil - } - return nil + return fmt.Errorf("failed to read response body: %v", err) } - if len(s.incidents[monitorType]) > 0 { - log.Printf("Already have active incident of type '%s', skipping.\n", monitorType) - return nil + // Log response details without colors + s.log.Log("Response Status: %s", resp.Status) + if len(respBody) > 0 { + s.log.Log("Response Body: %s", string(respBody)) } - switch i := incidents.(type) { - case *Incident: - if i != nil { - if err := s.createIncident(*i); err != nil { - return fmt.Errorf("failed to create incident: %v", err) - } - s.incidents[monitorType] = []Incident{*i} - } - case []Incident: - for _, incident := range i { - if err := s.createIncident(incident); err != nil { - return fmt.Errorf("failed to create incident: %v", err) - } - } - s.incidents[monitorType] = i + if resp.StatusCode >= 400 { + return fmt.Errorf("request failed with status: %d, body: %s", resp.StatusCode, string(respBody)) } return nil @@ -254,29 +251,33 @@ func (s *SystemMonitor) Start() { ticker := time.NewTicker(time.Duration(s.interval) * time.Second) defer ticker.Stop() + // Initial check + s.runChecks() + + // Periodic checks for range ticker.C { - if err := s.processType("cpu", func() (interface{}, error) { - return s.evaluateCPUIncident() - }); err != nil { - log.Printf("Error processing CPU metrics: %v\n", err) - } + s.runChecks() + } +} - if err := s.processType("memory", func() (interface{}, error) { - return s.evaluateMemoryIncident() - }); err != nil { - log.Printf("Error processing memory metrics: %v\n", err) - } +func (s *SystemMonitor) runChecks() { + if err := s.checkCPU(); err != nil { + s.log.Error("Error checking CPU: %v", err) + } - if err := s.processType("disk", func() (interface{}, error) { - return s.evaluateDiskIncident() - }); err != nil { - log.Printf("Error processing disk metrics: %v\n", err) - } + if err := s.checkMemory(); err != nil { + s.log.Error("Error checking memory: %v", err) + } + + if err := s.checkDisk(); err != nil { + s.log.Error("Error checking disk: %v", err) } } func main() { - // Define command line flags + log := New() + + // Command line flags betterStackURL := flag.String("url", "", "BetterStack webhook URL (required)") interval := flag.Int("interval", 300, "Check interval in seconds (default: 300)") cpuLimit := flag.Float64("cpu-limit", 90.0, "CPU usage threshold percentage (default: 90)") @@ -294,33 +295,33 @@ func main() { // Validate required flags if *betterStackURL == "" { flag.Usage() - log.Fatal("Error: BetterStack webhook URL is required") + log.Fatal("BetterStack webhook URL is required") } // Validate ranges if *interval <= 0 { - log.Fatal("Error: interval must be greater than 0") + log.Fatal("Interval must be greater than 0") } if *cpuLimit < 0 || *cpuLimit > 100 { - log.Fatal("Error: cpu-limit must be between 0 and 100") + log.Fatal("CPU limit must be between 0 and 100") } if *memoryLimit < 0 || *memoryLimit > 100 { - log.Fatal("Error: memory-limit must be between 0 and 100") + log.Fatal("Memory limit must be between 0 and 100") } if *diskLimit < 0 || *diskLimit > 100 { - log.Fatal("Error: disk-limit must be between 0 and 100") + log.Fatal("Disk limit must be between 0 and 100") } monitor, err := NewSystemMonitor(*betterStackURL, *interval, *cpuLimit, *memoryLimit, *diskLimit) if err != nil { - log.Fatalf("Failed to create system monitor: %v", err) + log.Fatal("Failed to create system monitor: %v", err) } - log.Printf("Starting monitoring with settings:") - log.Printf("- Check interval: %d seconds", *interval) - log.Printf("- CPU limit: %.1f%%", *cpuLimit) - log.Printf("- Memory limit: %.1f%%", *memoryLimit) - log.Printf("- Disk limit: %.1f%%", *diskLimit) + log.Info("Starting monitoring with settings:") + log.Info("- Check interval: %d seconds", *interval) + log.Info("- CPU limit: %.1f%%", *cpuLimit) + log.Info("- Memory limit: %.1f%%", *memoryLimit) + log.Info("- Disk limit: %.1f%%", *diskLimit) monitor.Start() } \ No newline at end of file From 7788758373825b4c4a591315e9d861c72e68ac85 Mon Sep 17 00:00:00 2001 From: Christy Jacob Date: Fri, 20 Dec 2024 22:25:21 +0530 Subject: [PATCH 2/2] fix: incident creation and resolution --- main.go | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/main.go b/main.go index 94afcd2..8365faa 100644 --- a/main.go +++ b/main.go @@ -4,7 +4,6 @@ import ( "encoding/json" "flag" "fmt" - "io" "net/http" "os" "path/filepath" @@ -228,20 +227,9 @@ func (s *SystemMonitor) sendMetric(metric Metric) error { } defer resp.Body.Close() - // Read response body - respBody, err := io.ReadAll(resp.Body) - if err != nil { - return fmt.Errorf("failed to read response body: %v", err) - } - - // Log response details without colors s.log.Log("Response Status: %s", resp.Status) - if len(respBody) > 0 { - s.log.Log("Response Body: %s", string(respBody)) - } - if resp.StatusCode >= 400 { - return fmt.Errorf("request failed with status: %d, body: %s", resp.StatusCode, string(respBody)) + return fmt.Errorf("request failed with status: %d", resp.StatusCode) } return nil