diff --git a/cmd/alertmanager.go b/cmd/alertmanager.go
new file mode 100644
index 0000000..bb197be
--- /dev/null
+++ b/cmd/alertmanager.go
@@ -0,0 +1,194 @@
+package cmd
+
+import (
+    "fmt"
+    "slices"
+
+    "github.com/NETWAYS/go-check"
+    "github.com/NETWAYS/go-check/perfdata"
+    "github.com/NETWAYS/go-check/result"
+    "github.com/spf13/cobra"
+)
+
+type AlertmanagerConfig struct {
+    AlertName     []string
+    ExcludeAlerts []string
+    ExcludeLabels []string
+    IncludeLabels []string
+    ProblemsOnly  bool
+    NoAlertsState string
+}
+
+const stateUnprocessed = "unprocessed"
+const stateActive = "active"
+const stateSuppressed = "suppressed"
+
+var cliAlertmanagerConfig AlertmanagerConfig
+
+var alertmanagerCmd = &cobra.Command{
+    Use:   "alertmanager",
+    Short: "Checks the status of an Alertmanager alert",
+    Long:  `Checks the status of an Alertmanager alert and evaluates the state of the alert`,
+    Run: func(_ *cobra.Command, _ []string) {
+        // Convert --no-alerts-state to an integer and validate the input
+        noAlertsState, err := convertStateToInt(cliAlertmanagerConfig.NoAlertsState)
+        if err != nil {
+            check.ExitError(fmt.Errorf("invalid value for --no-alerts-state: %s", cliAlertmanagerConfig.NoAlertsState))
+        }
+
+        var (
+            counterUnprocessed int
+            counterActive      int
+            counterSuppressed  int
+        )
+
+        c := cliConfig.NewClient()
+        errCon := c.Connect()
+
+        if errCon != nil {
+            check.ExitError(errCon)
+        }
+
+        ctx, cancel := cliConfig.timeoutContext()
+        defer cancel()
+
+        alerts, err := c.GetAlertmanagerAlerts(ctx)
+        if err != nil {
+            check.ExitError(err)
+        }
+
+        // If there are no alerts we can exit early
+        if len(alerts) == 0 {
+            // Just an empty PerfdataList to have consistent perfdata output
+            pdlist := perfdata.PerfdataList{
+                {Label: "total", Value: 0},
+                {Label: "unprocessed", Value: 0},
+                {Label: "active", Value: 0},
+                {Label: "suppressed", Value: 0},
+            }
+
+            // Since the user is expecting the state of a certain alert,
+            // it might be noteworthy if that alert is not present.
+            if cliAlertmanagerConfig.AlertName != nil {
+                check.ExitRaw(check.Unknown, "No such alert defined", "|", pdlist.String())
+            }
+
+            check.ExitRaw(noAlertsState, "No alerts found", "|", pdlist.String())
+        }
+
+        var overall result.Overall
+
+        for _, al := range alerts {
+            // If it's not one of the alerts we're looking for, skip it
+            if cliAlertmanagerConfig.AlertName != nil {
+                if !slices.Contains(cliAlertmanagerConfig.AlertName, al.GetName()) {
+                    continue
+                }
+            }
+
+            labelsMatchedInclude := matchesLabel(al.Labels, cliAlertmanagerConfig.IncludeLabels)
+
+            if len(cliAlertmanagerConfig.IncludeLabels) > 0 && !labelsMatchedInclude {
+                // If the alert labels don't match here we can skip it.
+                continue
+            }
+
+            alertMatchedExclude, regexErr := matches(al.GetName(), cliAlertmanagerConfig.ExcludeAlerts)
+
+            if regexErr != nil {
+                check.ExitRaw(check.Unknown, "Invalid regular expression provided:", regexErr.Error())
+            }
+
+            if alertMatchedExclude {
+                // If the alert matches a regex from the list we can skip it.
+                continue
+            }
+
+            labelsMatchedExclude := matchesLabel(al.Labels, cliAlertmanagerConfig.ExcludeLabels)
+
+            if len(cliAlertmanagerConfig.ExcludeLabels) > 0 && labelsMatchedExclude {
+                // If the alert labels match here we can skip it.
+                continue
+            }
+
+            sc := result.NewPartialResult()
+
+            switch al.GetState() {
+            case stateUnprocessed:
+                //nolint: errcheck
+                sc.SetState(check.Warning)
+                sc.Output = al.GetName() + " is unprocessed"
+                counterUnprocessed++
+            case stateActive:
+                //nolint: errcheck
+                sc.SetState(check.Critical)
+                sc.Output = al.GetName() + " is active"
+                counterActive++
+            case stateSuppressed:
+                //nolint: errcheck
+                sc.SetState(check.OK)
+                sc.Output = al.GetName() + " is suppressed"
+                counterSuppressed++
+            default:
+                //nolint: errcheck
+                sc.SetState(check.Unknown)
+                sc.Output = al.GetName() + " has an invalid alert state"
+            }
+
+            overall.AddSubcheck(sc)
+        }
+
+        counterAlert := counterUnprocessed + counterActive + counterSuppressed
+
+        perfList := perfdata.PerfdataList{
+            {Label: "total", Value: counterAlert},
+            {Label: "unprocessed", Value: counterUnprocessed},
+            {Label: "active", Value: counterActive},
+            {Label: "suppressed", Value: counterSuppressed},
+        }
+
+        // When there are no alerts we add an empty PartialResult just to have consistent output
+        if len(overall.PartialResults) == 0 {
+            sc := result.NewPartialResult()
+            // We already made sure the state is valid
+            //nolint: errcheck
+            sc.SetDefaultState(noAlertsState)
+            sc.Output = "No alerts retrieved"
+            overall.AddSubcheck(sc)
+        }
+
+        overall.PartialResults[0].Perfdata = append(overall.PartialResults[0].Perfdata, perfList...)
+
+        overall.Summary = fmt.Sprintf("%d total alerts: %d unprocessed - %d active - %d suppressed",
+            counterAlert,
+            counterUnprocessed,
+            counterActive,
+            counterSuppressed)
+
+        check.ExitRaw(overall.GetStatus(), overall.GetOutput())
+    },
+}
+
+func init() {
+    rootCmd.AddCommand(alertmanagerCmd)
+
+    fs := alertmanagerCmd.Flags()
+
+    fs.StringVarP(&cliAlertmanagerConfig.NoAlertsState, "no-alerts-state", "T", "OK", "State to assign when no alerts are found (0, 1, 2, 3, OK, WARNING, CRITICAL, UNKNOWN). If not set, this defaults to OK")
+
+    fs.StringArrayVar(&cliAlertmanagerConfig.ExcludeAlerts, "exclude-alert", []string{}, "Alerts to ignore. Can be used multiple times and supports regex.")
+
+    fs.StringSliceVarP(&cliAlertmanagerConfig.AlertName, "name", "n", nil,
+        "The name of one or more specific alerts to check."+
+            "\nThis parameter can be repeated e.g.: '--name alert1 --name alert2'"+
+            "\nIf no name is given, all alerts will be evaluated")
+
+    fs.StringArrayVar(&cliAlertmanagerConfig.IncludeLabels, "include-label", []string{},
+        "The label of one or more specific alerts to include."+
+            "\nThis parameter can be repeated e.g.: '--include-label prio=high --include-label another=example'"+
+            "\nNote that repeated --include-label flags are combined using a union.")
+
+    fs.StringArrayVar(&cliAlertmanagerConfig.ExcludeLabels, "exclude-label", []string{},
+        "The label of one or more specific alerts to exclude."+
+            "\nThis parameter can be repeated e.g.: '--exclude-label prio=high --exclude-label another=example'")
+}
diff --git a/internal/alert/alertmanager.go b/internal/alert/alertmanager.go
new file mode 100644
index 0000000..1cee41a
--- /dev/null
+++ b/internal/alert/alertmanager.go
@@ -0,0 +1,32 @@
+package alert
+
+import (
+    "github.com/prometheus/common/model"
+)
+
+// AlertmanagerAlert represents an Alertmanager alert
+type AlertmanagerAlert struct {
+    Annotations struct {
+        Description string `json:"description"`
+        Summary     string `json:"summary"`
+    } `json:"annotations"`
+    Status struct {
+        // unprocessed, active, suppressed
+        State string `json:"state"`
+    } `json:"status"`
+    Labels model.LabelSet `json:"labels"`
+}
+
+func (al *AlertmanagerAlert) GetState() string {
+    return al.Status.State
+}
+
+func (al *AlertmanagerAlert) GetName() string {
+    n, ok := al.Labels["alertname"]
+
+    if ok {
+        return string(n)
+    }
+
+    return ""
+}
diff --git a/internal/alert/alertmanager_test.go b/internal/alert/alertmanager_test.go
new file mode 100644
index 0000000..c201659
--- /dev/null
+++ b/internal/alert/alertmanager_test.go
@@ -0,0 +1,21 @@
+package alert
+
+import (
+    "encoding/json"
+    "testing"
+)
+
+func TestUnmarshalAlert(t *testing.T) {
+    a := `{"annotations":{"description":"job has disappeared","summary":"job missing"},"endsAt":"2026-02-03T15:48:53.926Z","fingerprint":"17358d92dd0f3b58","receivers":[{"name":"team-X-mails"}],"startsAt":"2026-02-03T15:12:53.926Z","status":{"inhibitedBy":[],"mutedBy":[],"silencedBy":["d2353af6"],"state":"suppressed"},"updatedAt":"2026-02-03T15:44:53.929Z","generatorURL":"http://f2526c40017b:9090","labels":{"alertname":"missing","job":"alertmanager","monitor":"my-monitor","severity":"low"}}`
+
+    var alert AlertmanagerAlert
+    err := json.Unmarshal([]byte(a), &alert)
+
+    if err != nil {
+        t.Error(err)
+    }
+
+    if alert.Annotations.Summary != "job missing" {
+        t.Error("\nActual: ", alert.Annotations.Summary, "\nExpected: ", "job missing")
+    }
+}
diff --git a/internal/client/client.go b/internal/client/client.go
index 4668885..aff7b90 100644
--- a/internal/client/client.go
+++ b/internal/client/client.go
@@ -2,11 +2,14 @@ package client
 
 import (
     "context"
+    "encoding/json"
     "fmt"
     "net/http"
     "net/url"
     "strings"
 
+    "github.com/NETWAYS/check_prometheus/internal/alert"
+
     "github.com/NETWAYS/go-check"
     "github.com/prometheus/client_golang/api"
     v1 "github.com/prometheus/client_golang/api/prometheus/v1"
@@ -85,6 +88,34 @@ func (c *Client) GetStatus(ctx context.Context, endpoint string) (returncode int
     return check.Unknown, resp.StatusCode, respBody, err
 }
 
+func (c *Client) GetAlertmanagerAlerts(ctx context.Context) ([]alert.AlertmanagerAlert, error) {
+    u, _ := url.JoinPath(c.URL, "/api/v2/alerts")
+    req, errReq := http.NewRequestWithContext(ctx, http.MethodGet, u, nil)
+
+    if errReq != nil {
+        return []alert.AlertmanagerAlert{}, fmt.Errorf("could not create request: %w", errReq)
+    }
+
+    // Make the request with the preconfigured Client so that its RoundTripper is
+    // reused; Do returns the response body as a byte slice that we decode below.
+    resp, b, errDo := c.Client.Do(ctx, req)
+
+    if errDo != nil {
+        return []alert.AlertmanagerAlert{}, fmt.Errorf("could not get alerts: %w", errDo)
+    }
+
+    defer resp.Body.Close()
+
+    var alerts []alert.AlertmanagerAlert
+    errJSON := json.Unmarshal(b, &alerts)
+
+    if errJSON != nil {
+        return []alert.AlertmanagerAlert{}, fmt.Errorf("could not parse alerts: %w", errJSON)
+    }
+
+    return alerts, nil
+}
+
 type headersRoundTripper struct {
     headers map[string]string
     rt      http.RoundTripper
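Note: convertStateToInt, matches and matchesLabel are called by the new command but are not part of this diff, so they are presumably defined elsewhere in the cmd package. The following is only a rough sketch of the semantics the command appears to rely on; the helper names and signatures are inferred from the call sites and flag descriptions above, and the real implementations may differ.

package cmd

import (
    "fmt"
    "regexp"
    "strings"

    "github.com/NETWAYS/go-check"
    "github.com/prometheus/common/model"
)

// convertStateToInt maps a user-supplied state ("0"-"3" or OK, WARNING,
// CRITICAL, UNKNOWN) to the corresponding go-check exit code.
func convertStateToInt(state string) (int, error) {
    switch strings.ToUpper(state) {
    case "0", "OK":
        return check.OK, nil
    case "1", "WARNING":
        return check.Warning, nil
    case "2", "CRITICAL":
        return check.Critical, nil
    case "3", "UNKNOWN":
        return check.Unknown, nil
    default:
        return check.Unknown, fmt.Errorf("unknown state %q", state)
    }
}

// matches reports whether name matches any of the given regular expressions,
// mirroring the --exclude-alert handling.
func matches(name string, exprs []string) (bool, error) {
    for _, e := range exprs {
        re, err := regexp.Compile(e)
        if err != nil {
            return false, err
        }

        if re.MatchString(name) {
            return true, nil
        }
    }

    return false, nil
}

// matchesLabel reports whether any "key=value" selector matches the alert's
// labels (union semantics, as described for --include-label).
func matchesLabel(labels model.LabelSet, selectors []string) bool {
    for _, s := range selectors {
        key, value, found := strings.Cut(s, "=")
        if !found {
            continue
        }

        if v, ok := labels[model.LabelName(key)]; ok && string(v) == value {
            return true
        }
    }

    return false
}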