diff --git a/infrastructure/grafana/provisioning/alerting/contact_points.yaml b/infrastructure/grafana/provisioning/alerting/contact_points.yaml
new file mode 100644
index 0000000..262547a
--- /dev/null
+++ b/infrastructure/grafana/provisioning/alerting/contact_points.yaml
@@ -0,0 +1,20 @@
+apiVersion: 1
+
+# Contact points that Grafana Alerting provisions from this file.
+contactPoints:
+  - orgId: 1
+    name: Telegram
+    receivers:
+      - uid: aet1srtyc40lca
+        type: telegram
+        settings:
+          # The bot token is a secret: it is read from the environment via
+          # provisioning variable interpolation instead of being committed.
+          bottoken: "${TELEGRAM_BOT_TOKEN}"
+          chatid: "-1002555823797"
+          disable_notification: false
+          disable_web_page_preview: false
+          message: '{{ template "telegram.default.message" . }}'
+          parse_mode: Markdown
+          protect_content: false
+        disableResolveMessage: false
diff --git a/infrastructure/grafana/provisioning/alerting/rules.yaml b/infrastructure/grafana/provisioning/alerting/rules.yaml
new file mode 100644
index 0000000..30615c1
--- /dev/null
+++ b/infrastructure/grafana/provisioning/alerting/rules.yaml
@@ -0,0 +1,145 @@
+apiVersion: 1
+
+groups:
+  - orgId: 1
+    name: Default
+    folder: Backend
+    interval: 10s
+    rules:
+      # Fires when the p99 reverse-proxy request duration reported by Caddy
+      # is >= 0.5 s (threshold expression C evaluated on query A).
+      - uid: aet1xbx1yaupsb
+        title: Backend p99 > 500 ms
+        condition: C
+        data:
+          - refId: A
+            relativeTimeRange:
+              from: 600
+              to: 0
+            datasourceUid: prometheus
+            model:
+              editorMode: code
+              expr: |
+                histogram_quantile(
+                  0.99,
+                  sum(
+                    rate(
+                      caddy_http_request_duration_seconds_bucket{instance="proxy:2019",handler="reverse_proxy",host="proxy:8080",job="caddy"}[$__rate_interval]
+                    )
+                  ) by (le)
+                )
+              instant: true
+              intervalMs: 1000
+              legendFormat: __auto
+              maxDataPoints: 43200
+              range: false
+              refId: A
+          - refId: C
+            datasourceUid: __expr__
+            model:
+              conditions:
+                - evaluator:
+                    params:
+                      - 0.5
+                    type: gte
+                  operator:
+                    type: and
+                  query:
+                    params:
+                      - C
+                  reducer:
+                    params: []
+                    type: last
+                  type: query
+              datasource:
+                type: __expr__
+                uid: __expr__
+              expression: A
+              intervalMs: 1000
+              maxDataPoints: 43200
+              refId: C
+              type: threshold
+        dashboardUid: e3a78c36-2f34-4ad6-81d5-284002896829
+        panelId: 32
+        noDataState: NoData
+        execErrState: Error
+        for: 10s
+        keepFiringFor: 10s
+        annotations:
+          __dashboardUid__: e3a78c36-2f34-4ad6-81d5-284002896829
+          __panelId__: "32"
+          runbook_url: https://admin.adnova.itqdev.xyz
+          summary: p99 > 500 ms
+        isPaused: false
+        notification_settings:
+          receiver: Telegram
+  - orgId: 1
+    name: Default
+    folder: Postgres
+    interval: 10s
+    rules:
+      # Fires when commits + rollbacks per second on the "postgres" database
+      # reach 100 or more (threshold expression C evaluated on query A).
+      - uid: fet1txr4slywwe
+        title: "> 100 QPS on Postgresql"
+        condition: C
+        data:
+          - refId: A
+            relativeTimeRange:
+              from: 600
+              to: 0
+            datasourceUid: prometheus
+            model:
+              editorMode: code
+              expr: |
+                sum(
+                  irate(pg_stat_database_xact_commit{datname="postgres",instance="postgres-exporter:9187",job="postgres"}[5m])
+                ) +
+                sum(
+                  irate(pg_stat_database_xact_rollback{datname="postgres",instance="postgres-exporter:9187",job="postgres"}[5m])
+                )
+              instant: true
+              intervalMs: 1000
+              legendFormat: __auto
+              maxDataPoints: 43200
+              range: false
+              refId: A
+          - refId: C
+            datasourceUid: __expr__
+            model:
+              conditions:
+                - evaluator:
+                    params:
+                      - 100
+                    type: gte
+                  operator:
+                    type: and
+                  query:
+                    params:
+                      - C
+                  reducer:
+                    params: []
+                    type: last
+                  type: query
+              datasource:
+                type: __expr__
+                uid: __expr__
+              expression: A
+              intervalMs: 1000
+              maxDataPoints: 43200
+              refId: C
+              type: threshold
+        dashboardUid: postgres-overview
+        panelId: 14
+        noDataState: NoData
+        execErrState: Error
+        for: 10s
+        keepFiringFor: 1m
+        annotations:
+          __dashboardUid__: postgres-overview
+          __panelId__: "14"
+          runbook_url: https://admin.adnova.itqdev.xyz
+          summary: Postgresql QPS exceeded 100
+        isPaused: false
+        notification_settings:
+          receiver: Telegram
diff --git a/infrastructure/grafana/provisioning/alerting/templates.yaml b/infrastructure/grafana/provisioning/alerting/templates.yaml
new file mode 100644
index 0000000..b1ade37
--- /dev/null
+++ b/infrastructure/grafana/provisioning/alerting/templates.yaml
@@ -0,0 +1,55 @@
+apiVersion: 1
+
+# Defines "telegram.default.message", the Markdown body for Telegram alerts.
+templates:
+  - orgId: 1
+    name: Telegram
+    template: |
+      {{ define "telegram.default.message" }} {{ if gt (len .Alerts.Firing) 0 }}
+      🔥🚨 *FIRE IN THE HOLE!* 🚨🔥
+      We've got *{{ len .Alerts.Firing }} firing alert(s)* that need your immediate attention!
+      {{ range .Alerts.Firing }}
+      ---
+      *Alert:* `{{ .Labels.alertname }}`
+      {{ if .Labels.instance }}*Instance:* `{{ .Labels.instance }}`{{ end }}
+      *Status:* 🔴 *FIRING* since {{ .StartsAt.Format "2006-01-02 15:04:05 MST" }}
+
+      {{ if .Annotations.summary }}*Summary:* {{ .Annotations.summary }}{{ end }}
+      {{ if .Annotations.description }}*Description:* {{ .Annotations.description }}{{ end }}
+      *Labels:*
+      {{ range .Labels.SortedPairs }} • `{{ .Name }}` = `{{ .Value }}`
+      {{ end }}
+      {{ if gt (len .Annotations) 0 }}*Annotations:*
+      {{ range .Annotations.SortedPairs }} • `{{ .Name }}` = `{{ .Value }}`
+      {{ end }}{{ end }}
+
+      {{ if .DashboardURL }}📊 [View Dashboard]({{ .DashboardURL }})
+      {{ end }}{{ if .PanelURL }}📈 [View Panel]({{ .PanelURL }})
+      {{ end }}{{ if .GeneratorURL }}🔗 [Alert Source]({{ .GeneratorURL }})
+      {{ end }}{{ if .SilenceURL }}🤫 [Silence Alert]({{ .SilenceURL }})
+      {{ end }}--- {{ end }} {{ end }}
+      {{ if gt (len .Alerts.Resolved) 0 }}
+      ✅🟢 *ALL CLEAR!* 🟢✅
+      Great news! *{{ len .Alerts.Resolved }} alert(s)* have been resolved.
+      {{ range .Alerts.Resolved }}
+      ---
+      *Alert:* `{{ .Labels.alertname }}`
+      {{ if .Labels.instance }}*Instance:* `{{ .Labels.instance }}`{{ end }}
+      *Status:* ✅ *RESOLVED* at {{ .EndsAt.Format "2006-01-02 15:04:05 MST" }} (was active since {{ .StartsAt.Format "2006-01-02 15:04:05 MST" }})
+
+      {{ if .Annotations.summary }}*Summary:* {{ .Annotations.summary }}{{ end }}
+      {{ if .Annotations.description }}*Description:* {{ .Annotations.description }}{{ end }}
+      *Labels:*
+      {{ range .Labels.SortedPairs }} • `{{ .Name }}` = `{{ .Value }}`
+      {{ end }}
+      {{ if gt (len .Annotations) 0 }}*Annotations:*
+      {{ range .Annotations.SortedPairs }} • `{{ .Name }}` = `{{ .Value }}`
+      {{ end }}{{ end }}
+
+      {{ if .DashboardURL }}📊 [View Dashboard]({{ .DashboardURL }})
+      {{ end }}{{ if .PanelURL }}📈 [View Panel]({{ .PanelURL }})
+      {{ end }}{{ if .GeneratorURL }}🔗 [Alert Source]({{ .GeneratorURL }}) {{ end }} {{ end }} {{ end }}
+      {{ if or (gt (len .Alerts.Firing) 0) (gt (len .Alerts.Resolved) 0) }}
+      🔔 *Grafana Alertmanager:* [View All Alerts]({{ template "__alertmanagerURL" . }}) 🔔
+      {{ end }}
+      {{ end }}