---
# Recording and alerting rules for the "slo-onprem" SLO, laid out in the
# Prometheus rule-file format (groups -> rules).
#
# Groups in this file:
#   slo-onprem-increase          request-count increases over 1h .. 4w windows
#   slo-onprem                   error-ratio ("burnrate") recordings and the
#                                ErrorBudgetBurn alerts built on them
#   slo-onprem-latency-increase  latency quantiles, over-threshold request
#                                counts and the p95 latency alerts
#
# Long PromQL expressions use folded block scalars (>-): every line break
# inside them is read back as a single space.
groups:
  # Increase of rpc_server_request_count per window. The error code is kept
  # as a grouping label, so there is one series per
  # (cluster, rpc_connect_rpc_error_code, rpc_method, rpc_service).
  - name: slo-onprem-increase
    interval: 2m30s
    rules:
      - record: rpc_server_request:increase1h
        expr: >-
          sum by (cluster, rpc_connect_rpc_error_code, rpc_method, rpc_service)
          (increase(rpc_server_request_count[1h]))
        labels:
          slo: slo-onprem

      - record: rpc_server_request:increase12h
        expr: >-
          sum by (cluster, rpc_connect_rpc_error_code, rpc_method, rpc_service)
          (increase(rpc_server_request_count[12h]))
        labels:
          slo: slo-onprem

      - record: rpc_server_request:increase1d
        expr: >-
          sum by (cluster, rpc_connect_rpc_error_code, rpc_method, rpc_service)
          (increase(rpc_server_request_count[1d]))
        labels:
          slo: slo-onprem

      - record: rpc_server_request:increase1w
        expr: >-
          sum by (cluster, rpc_connect_rpc_error_code, rpc_method, rpc_service)
          (increase(rpc_server_request_count[1w]))
        labels:
          slo: slo-onprem

      - record: rpc_server_request:increase4w
        expr: >-
          sum by (cluster, rpc_connect_rpc_error_code, rpc_method, rpc_service)
          (increase(rpc_server_request_count[4w]))
        labels:
          slo: slo-onprem

  # Error ratio per (cluster, rpc_method, rpc_service): the rate of requests
  # whose rpc_connect_rpc_error_code is internal, unavailable, data_loss or
  # unknown, divided by the rate of all requests over the same window.
  - name: slo-onprem
    interval: 1m
    rules:
      - record: rpc_server_request:burnrate30m
        expr: >-
          sum by (cluster, rpc_method, rpc_service)
          (rate(rpc_server_request_count{rpc_connect_rpc_error_code=~"internal|unavailable|data_loss|unknown"}[30m]))
          /
          sum by (cluster, rpc_method, rpc_service)
          (rate(rpc_server_request_count[30m]))
        labels:
          slo: slo-onprem

      - record: rpc_server_request:burnrate1h
        expr: >-
          sum by (cluster, rpc_method, rpc_service)
          (rate(rpc_server_request_count{rpc_connect_rpc_error_code=~"internal|unavailable|data_loss|unknown"}[1h]))
          /
          sum by (cluster, rpc_method, rpc_service)
          (rate(rpc_server_request_count[1h]))
        labels:
          slo: slo-onprem

      - record: rpc_server_request:burnrate2h
        expr: >-
          sum by (cluster, rpc_method, rpc_service)
          (rate(rpc_server_request_count{rpc_connect_rpc_error_code=~"internal|unavailable|data_loss|unknown"}[2h]))
          /
          sum by (cluster, rpc_method, rpc_service)
          (rate(rpc_server_request_count[2h]))
        labels:
          slo: slo-onprem

      - record: rpc_server_request:burnrate1d
        expr: >-
          sum by (cluster, rpc_method, rpc_service)
          (rate(rpc_server_request_count{rpc_connect_rpc_error_code=~"internal|unavailable|data_loss|unknown"}[1d]))
          /
          sum by (cluster, rpc_method, rpc_service)
          (rate(rpc_server_request_count[1d]))
        labels:
          slo: slo-onprem

      # Short/long window pair 30m + 1h: both error ratios must exceed
      # 14 * (1-0.995), and more than 10 requests must have been counted in
      # the last hour.
      #
      # The increase recording carries rpc_connect_rpc_error_code while the
      # burnrate recordings do not. `and` matches on the complete label set,
      # so the increase is summed over error codes here; without that the
      # request-count guard could only match increase series that have no
      # error code label.
      - alert: ErrorBudgetBurn
        expr: >-
          rpc_server_request:burnrate30m{slo="slo-onprem"} > (14 * (1-0.995))
          and
          rpc_server_request:burnrate1h{slo="slo-onprem"} > (14 * (1-0.995))
          and
          sum by (cluster, rpc_method, rpc_service, slo)
          (rpc_server_request:increase1h{slo="slo-onprem"}) > 10
        for: 2m
        labels:
          long: 1h
          severity: warning
          short: 30m
          slo: slo-onprem
        annotations:
          dashboardUId: sloonprem
          description: "High error count for slo-onprem with labels: {{$labels}}"
          runbook_url: ""
          summary: High error count for slo-onprem

      # Short/long window pair 2h + 1d: same shape as the alert above with a
      # 3 * (1-0.995) threshold and the 1d request count as the traffic guard
      # (summed over error codes for the same label-matching reason).
      - alert: ErrorBudgetBurn
        expr: >-
          rpc_server_request:burnrate2h{slo="slo-onprem"} > (3 * (1-0.995))
          and
          rpc_server_request:burnrate1d{slo="slo-onprem"} > (3 * (1-0.995))
          and
          sum by (cluster, rpc_method, rpc_service, slo)
          (rpc_server_request:increase1d{slo="slo-onprem"}) > 10
        for: 15m
        labels:
          long: 1d
          severity: warning
          short: 2h
          slo: slo-onprem
        annotations:
          dashboardUId: sloonprem
          description: "High error count for slo-onprem with labels: {{$labels}}"
          runbook_url: ""
          summary: High error count for slo-onprem

  # Latency rules. Services matched by the rpc_service regex
  # (CodeGenerationService, CompileService, GenerateService, SandboxService)
  # are checked against 250000; every other service is checked against 15000.
  # The metric name suggests these values are milliseconds.
  #
  # NOTE(review): the two p95 alerts read the
  # rpc_server_duration_milliseconds_bucket:histogram_quantile recording,
  # which is listed at the end of this same group - confirm that the
  # resulting evaluation order is acceptable.
  - name: slo-onprem-latency-increase
    interval: 1m0s
    rules:
      - alert: LatencyP95AboveThreshold_250000
        expr: >-
          rpc_server_duration_milliseconds_bucket:histogram_quantile{percentile="p95",
          rpc_service=~"buf.alpha.registry.v1alpha1.CodeGenerationService|buf.alpha.javacompile.v1alpha1.CompileService|buf.alpha.registry.v1alpha1.GenerateService|buf.alpha.sandbox.v1alpha1.SandboxService"}*1
          > 250000
        for: 10m
        labels:
          severity: page
          slo: slo-onprem

      # slow<window>: for the regex-matched services, the count in the
      # le="+Inf" bucket minus the count in the le="250000" bucket, i.e. the
      # number of requests that took longer than 250000 within the window.
      - record: rpc_server_duration_milliseconds_bucket:slow1h
        expr: >-
          sum by (cluster, rpc_service, rpc_method)
          (increase(rpc_server_duration_milliseconds_bucket{le="+Inf",rpc_service=~"buf.alpha.registry.v1alpha1.CodeGenerationService|buf.alpha.javacompile.v1alpha1.CompileService|buf.alpha.registry.v1alpha1.GenerateService|buf.alpha.sandbox.v1alpha1.SandboxService"}[1h]))
          -
          sum by (cluster, rpc_service, rpc_method)
          (increase(rpc_server_duration_milliseconds_bucket{le="250000",rpc_service=~"buf.alpha.registry.v1alpha1.CodeGenerationService|buf.alpha.javacompile.v1alpha1.CompileService|buf.alpha.registry.v1alpha1.GenerateService|buf.alpha.sandbox.v1alpha1.SandboxService"}[1h]))
        labels:
          latency: "250000"

      - record: rpc_server_duration_milliseconds_bucket:slow12h
        expr: >-
          sum by (cluster, rpc_service, rpc_method)
          (increase(rpc_server_duration_milliseconds_bucket{le="+Inf",rpc_service=~"buf.alpha.registry.v1alpha1.CodeGenerationService|buf.alpha.javacompile.v1alpha1.CompileService|buf.alpha.registry.v1alpha1.GenerateService|buf.alpha.sandbox.v1alpha1.SandboxService"}[12h]))
          -
          sum by (cluster, rpc_service, rpc_method)
          (increase(rpc_server_duration_milliseconds_bucket{le="250000",rpc_service=~"buf.alpha.registry.v1alpha1.CodeGenerationService|buf.alpha.javacompile.v1alpha1.CompileService|buf.alpha.registry.v1alpha1.GenerateService|buf.alpha.sandbox.v1alpha1.SandboxService"}[12h]))
        labels:
          latency: "250000"

      - record: rpc_server_duration_milliseconds_bucket:slow1d
        expr: >-
          sum by (cluster, rpc_service, rpc_method)
          (increase(rpc_server_duration_milliseconds_bucket{le="+Inf",rpc_service=~"buf.alpha.registry.v1alpha1.CodeGenerationService|buf.alpha.javacompile.v1alpha1.CompileService|buf.alpha.registry.v1alpha1.GenerateService|buf.alpha.sandbox.v1alpha1.SandboxService"}[1d]))
          -
          sum by (cluster, rpc_service, rpc_method)
          (increase(rpc_server_duration_milliseconds_bucket{le="250000",rpc_service=~"buf.alpha.registry.v1alpha1.CodeGenerationService|buf.alpha.javacompile.v1alpha1.CompileService|buf.alpha.registry.v1alpha1.GenerateService|buf.alpha.sandbox.v1alpha1.SandboxService"}[1d]))
        labels:
          latency: "250000"

      - record: rpc_server_duration_milliseconds_bucket:slow1w
        expr: >-
          sum by (cluster, rpc_service, rpc_method)
          (increase(rpc_server_duration_milliseconds_bucket{le="+Inf",rpc_service=~"buf.alpha.registry.v1alpha1.CodeGenerationService|buf.alpha.javacompile.v1alpha1.CompileService|buf.alpha.registry.v1alpha1.GenerateService|buf.alpha.sandbox.v1alpha1.SandboxService"}[1w]))
          -
          sum by (cluster, rpc_service, rpc_method)
          (increase(rpc_server_duration_milliseconds_bucket{le="250000",rpc_service=~"buf.alpha.registry.v1alpha1.CodeGenerationService|buf.alpha.javacompile.v1alpha1.CompileService|buf.alpha.registry.v1alpha1.GenerateService|buf.alpha.sandbox.v1alpha1.SandboxService"}[1w]))
        labels:
          latency: "250000"

      - record: rpc_server_duration_milliseconds_bucket:slow4w
        expr: >-
          sum by (cluster, rpc_service, rpc_method)
          (increase(rpc_server_duration_milliseconds_bucket{le="+Inf",rpc_service=~"buf.alpha.registry.v1alpha1.CodeGenerationService|buf.alpha.javacompile.v1alpha1.CompileService|buf.alpha.registry.v1alpha1.GenerateService|buf.alpha.sandbox.v1alpha1.SandboxService"}[4w]))
          -
          sum by (cluster, rpc_service, rpc_method)
          (increase(rpc_server_duration_milliseconds_bucket{le="250000",rpc_service=~"buf.alpha.registry.v1alpha1.CodeGenerationService|buf.alpha.javacompile.v1alpha1.CompileService|buf.alpha.registry.v1alpha1.GenerateService|buf.alpha.sandbox.v1alpha1.SandboxService"}[4w]))
        labels:
          latency: "250000"

      - alert: LatencyP95AboveThreshold_15000
        expr: >-
          rpc_server_duration_milliseconds_bucket:histogram_quantile{percentile="p95",
          rpc_service!~"buf.alpha.registry.v1alpha1.CodeGenerationService|buf.alpha.javacompile.v1alpha1.CompileService|buf.alpha.registry.v1alpha1.GenerateService|buf.alpha.sandbox.v1alpha1.SandboxService"}*1
          > 15000
        for: 10m
        labels:
          severity: page
          slo: slo-onprem

      # fast<window>: the same over-threshold count for every service NOT
      # matched by the regex, using the le="15000" bucket. Despite the name,
      # the value is the number of requests that exceeded 15000 ("fast"
      # presumably refers to the service class - confirm).
      - record: rpc_server_duration_milliseconds_bucket:fast1h
        expr: >-
          sum by (cluster, rpc_service, rpc_method)
          (increase(rpc_server_duration_milliseconds_bucket{le="+Inf",rpc_service!~"buf.alpha.registry.v1alpha1.CodeGenerationService|buf.alpha.javacompile.v1alpha1.CompileService|buf.alpha.registry.v1alpha1.GenerateService|buf.alpha.sandbox.v1alpha1.SandboxService"}[1h]))
          -
          sum by (cluster, rpc_service, rpc_method)
          (increase(rpc_server_duration_milliseconds_bucket{le="15000",rpc_service!~"buf.alpha.registry.v1alpha1.CodeGenerationService|buf.alpha.javacompile.v1alpha1.CompileService|buf.alpha.registry.v1alpha1.GenerateService|buf.alpha.sandbox.v1alpha1.SandboxService"}[1h]))
        labels:
          latency: "15000"

      - record: rpc_server_duration_milliseconds_bucket:fast12h
        expr: >-
          sum by (cluster, rpc_service, rpc_method)
          (increase(rpc_server_duration_milliseconds_bucket{le="+Inf",rpc_service!~"buf.alpha.registry.v1alpha1.CodeGenerationService|buf.alpha.javacompile.v1alpha1.CompileService|buf.alpha.registry.v1alpha1.GenerateService|buf.alpha.sandbox.v1alpha1.SandboxService"}[12h]))
          -
          sum by (cluster, rpc_service, rpc_method)
          (increase(rpc_server_duration_milliseconds_bucket{le="15000",rpc_service!~"buf.alpha.registry.v1alpha1.CodeGenerationService|buf.alpha.javacompile.v1alpha1.CompileService|buf.alpha.registry.v1alpha1.GenerateService|buf.alpha.sandbox.v1alpha1.SandboxService"}[12h]))
        labels:
          latency: "15000"

      - record: rpc_server_duration_milliseconds_bucket:fast1d
        expr: >-
          sum by (cluster, rpc_service, rpc_method)
          (increase(rpc_server_duration_milliseconds_bucket{le="+Inf",rpc_service!~"buf.alpha.registry.v1alpha1.CodeGenerationService|buf.alpha.javacompile.v1alpha1.CompileService|buf.alpha.registry.v1alpha1.GenerateService|buf.alpha.sandbox.v1alpha1.SandboxService"}[1d]))
          -
          sum by (cluster, rpc_service, rpc_method)
          (increase(rpc_server_duration_milliseconds_bucket{le="15000",rpc_service!~"buf.alpha.registry.v1alpha1.CodeGenerationService|buf.alpha.javacompile.v1alpha1.CompileService|buf.alpha.registry.v1alpha1.GenerateService|buf.alpha.sandbox.v1alpha1.SandboxService"}[1d]))
        labels:
          latency: "15000"

      - record: rpc_server_duration_milliseconds_bucket:fast1w
        expr: >-
          sum by (cluster, rpc_service, rpc_method)
          (increase(rpc_server_duration_milliseconds_bucket{le="+Inf",rpc_service!~"buf.alpha.registry.v1alpha1.CodeGenerationService|buf.alpha.javacompile.v1alpha1.CompileService|buf.alpha.registry.v1alpha1.GenerateService|buf.alpha.sandbox.v1alpha1.SandboxService"}[1w]))
          -
          sum by (cluster, rpc_service, rpc_method)
          (increase(rpc_server_duration_milliseconds_bucket{le="15000",rpc_service!~"buf.alpha.registry.v1alpha1.CodeGenerationService|buf.alpha.javacompile.v1alpha1.CompileService|buf.alpha.registry.v1alpha1.GenerateService|buf.alpha.sandbox.v1alpha1.SandboxService"}[1w]))
        labels:
          latency: "15000"

      - record: rpc_server_duration_milliseconds_bucket:fast4w
        expr: >-
          sum by (cluster, rpc_service, rpc_method)
          (increase(rpc_server_duration_milliseconds_bucket{le="+Inf",rpc_service!~"buf.alpha.registry.v1alpha1.CodeGenerationService|buf.alpha.javacompile.v1alpha1.CompileService|buf.alpha.registry.v1alpha1.GenerateService|buf.alpha.sandbox.v1alpha1.SandboxService"}[4w]))
          -
          sum by (cluster, rpc_service, rpc_method)
          (increase(rpc_server_duration_milliseconds_bucket{le="15000",rpc_service!~"buf.alpha.registry.v1alpha1.CodeGenerationService|buf.alpha.javacompile.v1alpha1.CompileService|buf.alpha.registry.v1alpha1.GenerateService|buf.alpha.sandbox.v1alpha1.SandboxService"}[4w]))
        labels:
          latency: "15000"

      # Latency quantiles over a 4m rate window, per
      # (cluster, rpc_service, rpc_method). All three rules share one record
      # name and are told apart by the `percentile` label.
      - record: rpc_server_duration_milliseconds_bucket:histogram_quantile
        expr: >-
          (histogram_quantile(0.99,
          sum by (cluster, rpc_service, rpc_method, le)
          (rate(rpc_server_duration_milliseconds_bucket[4m]))))
        labels:
          percentile: p99

      - record: rpc_server_duration_milliseconds_bucket:histogram_quantile
        expr: >-
          (histogram_quantile(0.95,
          sum by (cluster, rpc_service, rpc_method, le)
          (rate(rpc_server_duration_milliseconds_bucket[4m]))))
        labels:
          percentile: p95

      - record: rpc_server_duration_milliseconds_bucket:histogram_quantile
        expr: >-
          (histogram_quantile(0.5,
          sum by (cluster, rpc_service, rpc_method, le)
          (rate(rpc_server_duration_milliseconds_bucket[4m]))))
        labels:
          percentile: p50