Rules

Rule group: AllInstances (last evaluation 1m38.229s ago, total evaluation time 17.38ms)

Each entry below shows the rule definition, followed by its state, the time of its last evaluation, and its evaluation duration. No rule reported an evaluation error.
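For reference, a rule group like AllInstances is defined in a separate rules file that prometheus.yml points to via rule_files. Below is a minimal sketch, assuming a hypothetical file name alerts.yml and showing only the first rule from the list that follows; the group's evaluation interval is not visible on this page, so it is omitted here and falls back to the global evaluation_interval.

# prometheus.yml (excerpt); "alerts.yml" is a hypothetical file name
rule_files:
  - alerts.yml

# alerts.yml (excerpt)
groups:
  - name: AllInstances
    rules:
      - alert: InstanceDown
        expr: up{job="host"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          description: '{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 1 minute.'
          title: Instance {{ $labels.instance }} down
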
alert: InstanceDown
expr: up{job="host"} == 0
for: 1m
labels:
  severity: critical
annotations:
  description: '{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 1 minute.'
  title: Instance {{ $labels.instance }} down
State: ok | Last evaluation: 1m38.23s ago | Evaluation time: 589.7us

alert: PrometheusRuleEvaluationFailures
expr: increase(prometheus_rule_evaluation_failures_total[3m]) > 0
labels:
  severity: critical
annotations:
  description: |-
    Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Prometheus rule evaluation failures (instance {{ $labels.instance }})
State: ok | Last evaluation: 1m38.229s ago | Evaluation time: 208.2us

alert: PrometheusRuleEvaluationSlow
expr: prometheus_rule_group_last_duration_seconds > prometheus_rule_group_interval_seconds
for: 5m
labels:
  severity: warning
annotations:
  description: |-
    Prometheus rule evaluation took more time than the scheduled interval. This indicates slower storage backend access or an overly complex query.
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Prometheus rule evaluation slow (instance {{ $labels.instance }})
State: ok | Last evaluation: 1m38.229s ago | Evaluation time: 106.5us

alert: PrometheusNotificationsBacklog
expr: min_over_time(prometheus_notifications_queue_length[10m]) > 0
labels:
  severity: warning
annotations:
  description: |-
    The Prometheus notification queue has not been empty for 10 minutes
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Prometheus notifications backlog (instance {{ $labels.instance }})
State: ok | Last evaluation: 1m38.229s ago | Evaluation time: 102.2us

alert: PrometheusAlertmanagerNotificationFailing
expr: rate(alertmanager_notifications_failed_total[1m]) > 0
labels:
  severity: critical
annotations:
  description: |-
    Alertmanager is failing to send notifications
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Prometheus AlertManager notification failing (instance {{ $labels.instance }})
State: ok | Last evaluation: 1m38.229s ago | Evaluation time: 75.41us

alert: PrometheusTargetEmpty
expr: prometheus_sd_discovered_targets == 0
labels:
  severity: critical
annotations:
  description: |-
    Prometheus has no target in service discovery
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Prometheus target empty (instance {{ $labels.instance }})
State: ok | Last evaluation: 1m38.229s ago | Evaluation time: 177.3us

alert: PrometheusLargeScrape
expr: increase(prometheus_target_scrapes_exceeded_sample_limit_total[10m]) > 10
for: 5m
labels:
  severity: warning
annotations:
  description: |-
    Prometheus has many scrapes that exceed the sample limit
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Prometheus large scrape (instance {{ $labels.instance }})
State: ok | Last evaluation: 1m38.229s ago | Evaluation time: 119us

alert: PrometheusTargetScrapeDuplicate
expr: increase(prometheus_target_scrapes_sample_duplicate_timestamp_total[5m]) > 0
labels:
  severity: warning
annotations:
  description: |-
    Prometheus has many samples rejected due to duplicate timestamps but different values
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Prometheus target scrape duplicate (instance {{ $labels.instance }})
State: ok | Last evaluation: 1m38.23s ago | Evaluation time: 96.27us

alert: HostOutOfMemory
expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
for: 2m
labels:
  severity: warning
annotations:
  description: |-
    Node memory is filling up (< 10% left)
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Host out of memory (instance {{ $labels.instance }})
State: ok | Last evaluation: 1m38.23s ago | Evaluation time: 475.1us

alert: HostMemoryUnderMemoryPressure
expr: rate(node_vmstat_pgmajfault[1m]) > 1000
for: 2m
labels:
  severity: warning
annotations:
  description: |-
    The node is under heavy memory pressure. High rate of major page faults
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Host memory under memory pressure (instance {{ $labels.instance }})
State: ok | Last evaluation: 1m38.229s ago | Evaluation time: 192us

alert: HostUnusualNetworkThroughputIn
expr: sum by(instance) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100
for: 5m
labels:
  severity: warning
annotations:
  description: |-
    Host network interfaces are probably receiving too much data (> 100 MB/s)
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Host unusual network throughput in (instance {{ $labels.instance }})
State: ok | Last evaluation: 1m38.23s ago | Evaluation time: 502.2us

alert: HostUnusualNetworkThroughputOut
expr: sum by(instance) (rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100
for: 5m
labels:
  severity: warning
annotations:
  description: |-
    Host network interfaces are probably sending too much data (> 100 MB/s)
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Host unusual network throughput out (instance {{ $labels.instance }})
State: ok | Last evaluation: 1m38.23s ago | Evaluation time: 651us

alert: HostUnusualDiskReadRate
expr: sum by(instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50
for: 5m
labels:
  severity: warning
annotations:
  description: |-
    Disk is probably reading too much data (> 50 MB/s)
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Host unusual disk read rate (instance {{ $labels.instance }})
State: ok | Last evaluation: 1m38.229s ago | Evaluation time: 313.6us

alert: HostUnusualDiskWriteRate
expr: sum by(instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50
for: 2m
labels:
  severity: warning
annotations:
  description: |-
    Disk is probably writing too much data (> 50 MB/s)
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Host unusual disk write rate (instance {{ $labels.instance }})
State: ok | Last evaluation: 1m38.229s ago | Evaluation time: 330.6us

alert: HostOutOfDiskSpace
expr: (node_filesystem_avail_bytes{device!="rootfs",mountpoint="/"} * 100) / node_filesystem_size_bytes{device!="rootfs",mountpoint="/"} < 10 and node_filesystem_readonly == 0
for: 2m
labels:
  severity: warning
annotations:
  description: |-
    Disk is almost full (< 10% left)
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Host out of disk space (instance {{ $labels.instance }})
State: ok | Last evaluation: 1m38.229s ago | Evaluation time: 1.048ms

alert: HostUnusualDiskReadLatency
expr: rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0
for: 2m
labels:
  severity: warning
annotations:
  description: |-
    Disk latency is growing (read operations > 100ms)
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Host unusual disk read latency (instance {{ $labels.instance }})
State: ok | Last evaluation: 1m38.228s ago | Evaluation time: 359.1us

alert: HostUnusualDiskWriteLatency
expr: rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0
for: 2m
labels:
  severity: warning
annotations:
  description: |-
    Disk latency is growing (write operations > 100ms)
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Host unusual disk write latency (instance {{ $labels.instance }})
State: ok | Last evaluation: 1m38.228s ago | Evaluation time: 348.6us

alert: HostHighCpuLoad
expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[2m])) * 100) > 80
labels:
  severity: warning
annotations:
  description: |-
    CPU load is > 80%
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Host high CPU load (instance {{ $labels.instance }})
State: ok | Last evaluation: 1m38.227s ago | Evaluation time: 501.4us

alert: HostCpuStealNoisyNeighbor
expr: avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10
labels:
  severity: warning
annotations:
  description: |-
    CPU steal is > 10%. A noisy neighbor is killing VM performance, or a spot instance may be out of credit.
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Host CPU steal noisy neighbor (instance {{ $labels.instance }})
State: ok | Last evaluation: 1m38.227s ago | Evaluation time: 268.6us

alert: HostSwapIsFillingUp
expr: (1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80
for: 2m
labels:
  severity: warning
annotations:
  description: |-
    Swap is filling up (>80%)
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Host swap is filling up (instance {{ $labels.instance }})
State: ok | Last evaluation: 1m38.227s ago | Evaluation time: 253.8us

alert: HostNetworkReceiveErrors
expr: rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01
for: 2m
labels:
  severity: warning
annotations:
  description: |-
    Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} receive errors in the last two minutes.
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Host Network Receive Errors (instance {{ $labels.instance }})
State: ok | Last evaluation: 1m38.227s ago | Evaluation time: 401.2us

alert: HostNetworkTransmitErrors
expr: rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01
for: 2m
labels:
  severity: warning
annotations:
  description: |-
    Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} transmit errors in the last two minutes.
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Host Network Transmit Errors (instance {{ $labels.instance }})
State: ok | Last evaluation: 1m38.227s ago | Evaluation time: 403.3us

alert: HostNetworkInterfaceSaturated
expr: (rate(node_network_receive_bytes_total{device!~"^tap.*"}[1m]) + rate(node_network_transmit_bytes_total{device!~"^tap.*"}[1m])) / node_network_speed_bytes{device!~"^tap.*"} > 0.8 < 10000
for: 1m
labels:
  severity: warning
annotations:
  description: |-
    The network interface "{{ $labels.device }}" on "{{ $labels.instance }}" is getting overloaded.
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Host Network Interface Saturated (instance {{ $labels.instance }})
State: ok | Last evaluation: 1m38.226s ago | Evaluation time: 1.121ms

alert: ContainerKilled
expr: time() - container_last_seen > 60
labels:
  severity: warning
annotations:
  description: |-
    A container has disappeared
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Container killed (instance {{ $labels.instance }})
State: ok | Last evaluation: 1m38.225s ago | Evaluation time: 5.741ms

alert: ContainerVolumeUsage
expr: (1 - (sum by(instance) (container_fs_inodes_free) / sum by(instance) (container_fs_inodes_total))) * 100 > 80
for: 2m
labels:
  severity: warning
annotations:
  description: |-
    Container Volume usage is above 80%
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Container Volume usage (instance {{ $labels.instance }})
State: ok | Last evaluation: 1m38.22s ago | Evaluation time: 1.848ms

alert: ContainerVolumeIoUsage
expr: (sum by(instance, name) (container_fs_io_current) * 100) > 80
for: 2m
labels:
  severity: warning
annotations:
  description: |-
    Container Volume IO usage is above 80%
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Container Volume IO usage (instance {{ $labels.instance }})
State: ok | Last evaluation: 1m38.218s ago | Evaluation time: 1.019ms

alert: ContainerHighThrottleRate
expr: rate(container_cpu_cfs_throttled_seconds_total[3m]) > 1
for: 2m
labels:
  severity: warning
annotations:
  description: |-
    Container is being throttled
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Container high throttle rate (instance {{ $labels.instance }})
State: ok | Last evaluation: 1m38.217s ago | Evaluation time: 68.54us

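Before loading or editing a group like AllInstances, the rule file can be checked offline with promtool, the command-line tool that ships with Prometheus. A minimal sketch, assuming the rules above are saved in the hypothetical file alerts.yml used earlier:

# validate rule file syntax and report how many rules were found
promtool check rules alerts.yml

A running Prometheus picks up rule changes after a configuration reload, for example via SIGHUP or the /-/reload endpoint when Prometheus is started with --web.enable-lifecycle.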