---
# Prometheus rule groups for the "slo-http" SLO.
#
# Three groups are defined:
#   * slo-http-increase          request-count increases over several windows
#   * slo-http                   5xx error ratios plus the ErrorBudgetBurn alerts
#   * slo-http-latency-increase  latency histogram derivations plus a p95 alert
groups:
  # Request counts per (cluster, http_status_code) over 1h .. 4w windows.
  - name: slo-http-increase
    interval: 2m30s
    rules:
      - record: http_server_total_requests:increase1h
        expr: >-
          sum by (cluster, http_status_code)
          (increase(http_server_total_requests_total[1h]))
        labels:
          slo: slo-http
      - record: http_server_total_requests:increase12h
        expr: >-
          sum by (cluster, http_status_code)
          (increase(http_server_total_requests_total[12h]))
        labels:
          slo: slo-http
      - record: http_server_total_requests:increase1d
        expr: >-
          sum by (cluster, http_status_code)
          (increase(http_server_total_requests_total[1d]))
        labels:
          slo: slo-http
      - record: http_server_total_requests:increase1w
        expr: >-
          sum by (cluster, http_status_code)
          (increase(http_server_total_requests_total[1w]))
        labels:
          slo: slo-http
      - record: http_server_total_requests:increase4w
        expr: >-
          sum by (cluster, http_status_code)
          (increase(http_server_total_requests_total[4w]))
        labels:
          slo: slo-http

  # Per-cluster ratio of 5xx request rate to total request rate, recorded for
  # four windows, followed by the alerts that consume those ratios.
  - name: slo-http
    interval: 1m
    rules:
      - record: http_server_total_requests:burnrate30m
        expr: >-
          sum by (cluster)
          (rate(http_server_total_requests_total{http_status_code=~"5.."}[30m]))
          /
          sum by (cluster) (rate(http_server_total_requests_total[30m]))
        labels:
          slo: slo-http
      - record: http_server_total_requests:burnrate1h
        expr: >-
          sum by (cluster)
          (rate(http_server_total_requests_total{http_status_code=~"5.."}[1h]))
          /
          sum by (cluster) (rate(http_server_total_requests_total[1h]))
        labels:
          slo: slo-http
      - record: http_server_total_requests:burnrate2h
        expr: >-
          sum by (cluster)
          (rate(http_server_total_requests_total{http_status_code=~"5.."}[2h]))
          /
          sum by (cluster) (rate(http_server_total_requests_total[2h]))
        labels:
          slo: slo-http
      - record: http_server_total_requests:burnrate1d
        expr: >-
          sum by (cluster)
          (rate(http_server_total_requests_total{http_status_code=~"5.."}[1d]))
          /
          sum by (cluster) (rate(http_server_total_requests_total[1d]))
        labels:
          slo: slo-http

      # Short window 30m, long window 1h, threshold 14 * (1-0.995).
      # NOTE(review): (1-0.995) is presumably the error budget of a 99.5%
      # objective and 14 the burn-rate factor - confirm against the SLO spec.
      #
      # The burn-rate series carry only (cluster, slo), while the increase
      # series also carry http_status_code. `and` without a matching modifier
      # requires identical label sets, so the request-volume condition is
      # aggregated to (cluster, slo) first; comparing the raw increase series
      # would never match and the alert could never fire.
      - alert: ErrorBudgetBurn
        expr: >-
          http_server_total_requests:burnrate30m{slo="slo-http"} > (14 * (1-0.995))
          and
          http_server_total_requests:burnrate1h{slo="slo-http"} > (14 * (1-0.995))
          and
          sum by (cluster, slo) (http_server_total_requests:increase1h{slo="slo-http"}) > 10
        for: 2m
        labels:
          long: 1h
          severity: warning
          short: 30m
          slo: slo-http
        annotations:
          dashboardUId: slohttp
          description: "High error count for slo-http with labels: {{$labels}}"
          runbook_url: ""
          summary: High error count for slo-http

      # Short window 2h, long window 1d, threshold 3 * (1-0.995). The
      # request-volume condition is aggregated to (cluster, slo) for the same
      # label-matching reason as the alert above.
      - alert: ErrorBudgetBurn
        expr: >-
          http_server_total_requests:burnrate2h{slo="slo-http"} > (3 * (1-0.995))
          and
          http_server_total_requests:burnrate1d{slo="slo-http"} > (3 * (1-0.995))
          and
          sum by (cluster, slo) (http_server_total_requests:increase1d{slo="slo-http"}) > 10
        for: 15m
        labels:
          long: 1d
          severity: warning
          short: 2h
          slo: slo-http
        annotations:
          dashboardUId: slohttp
          description: "High error count for slo-http with labels: {{$labels}}"
          runbook_url: ""
          summary: High error count for slo-http

  # Latency derivations from the http_server_duration_milliseconds histogram.
  - name: slo-http-latency-increase
    interval: 1m0s
    rules:
      # Each rule subtracts the le="75000" bucket from the le="+Inf" bucket,
      # i.e. it records the observations that fell above the 75000 bucket
      # boundary during the window (not all observations, despite the `all`
      # suffix, which is kept so existing consumers keep working).
      - record: http_server_duration_milliseconds_bucket:all1h
        expr: >-
          sum by (cluster)
          (increase(http_server_duration_milliseconds_bucket{le="+Inf"}[1h]))
          -
          sum by (cluster)
          (increase(http_server_duration_milliseconds_bucket{le="75000"}[1h]))
        labels:
          latency: "75000"
      - record: http_server_duration_milliseconds_bucket:all12h
        expr: >-
          sum by (cluster)
          (increase(http_server_duration_milliseconds_bucket{le="+Inf"}[12h]))
          -
          sum by (cluster)
          (increase(http_server_duration_milliseconds_bucket{le="75000"}[12h]))
        labels:
          latency: "75000"
      - record: http_server_duration_milliseconds_bucket:all1d
        expr: >-
          sum by (cluster)
          (increase(http_server_duration_milliseconds_bucket{le="+Inf"}[1d]))
          -
          sum by (cluster)
          (increase(http_server_duration_milliseconds_bucket{le="75000"}[1d]))
        labels:
          latency: "75000"
      - record: http_server_duration_milliseconds_bucket:all1w
        expr: >-
          sum by (cluster)
          (increase(http_server_duration_milliseconds_bucket{le="+Inf"}[1w]))
          -
          sum by (cluster)
          (increase(http_server_duration_milliseconds_bucket{le="75000"}[1w]))
        labels:
          latency: "75000"
      - record: http_server_duration_milliseconds_bucket:all4w
        expr: >-
          sum by (cluster)
          (increase(http_server_duration_milliseconds_bucket{le="+Inf"}[4w]))
          -
          sum by (cluster)
          (increase(http_server_duration_milliseconds_bucket{le="75000"}[4w]))
        labels:
          latency: "75000"

      # Per-cluster latency quantiles over a 4m rate window; the three rules
      # share one record name and are told apart by the `percentile` label.
      - record: http_server_duration_milliseconds_bucket:histogram_quantile
        expr: >-
          (histogram_quantile(0.99, sum by (cluster, le)
          (rate(http_server_duration_milliseconds_bucket[4m]))))
        labels:
          percentile: p99
      - record: http_server_duration_milliseconds_bucket:histogram_quantile
        expr: >-
          (histogram_quantile(0.95, sum by (cluster, le)
          (rate(http_server_duration_milliseconds_bucket[4m]))))
        labels:
          percentile: p95
      - record: http_server_duration_milliseconds_bucket:histogram_quantile
        expr: >-
          (histogram_quantile(0.5, sum by (cluster, le)
          (rate(http_server_duration_milliseconds_bucket[4m]))))
        labels:
          percentile: p50

      # Listed after the quantile rules on purpose: rules in a group are
      # evaluated in order, so the alert reads the p95 sample recorded in the
      # same evaluation cycle rather than the previous one.
      # NOTE(review): the `*1` factor is a numeric no-op; presumably a unit
      # multiplier left in by the rule generator - confirm before removing.
      - alert: LatencyP95AboveThreshold_75000
        expr: >-
          http_server_duration_milliseconds_bucket:histogram_quantile{percentile="p95", }*1 > 75000
        for: 10m
        labels:
          severity: page
          slo: slo-http