Alerts


/alertmanager/alert.rules > AllInstances
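The rules below are defined in /alertmanager/alert.rules and evaluated as the rule group AllInstances. A minimal sketch of how Prometheus might load such a file (only the rule file path is taken from the header above; the scrape/evaluation intervals and the Alertmanager target are assumed examples):

  # prometheus.yml (sketch) -- rule_files path taken from the page header,
  # everything else is an assumed example
  global:
    scrape_interval: 15s
    evaluation_interval: 15s
  rule_files:
    - /alertmanager/alert.rules
  alerting:
    alertmanagers:
      - static_configs:
          - targets: ["alertmanager:9093"]

The rule file itself can be syntax-checked with promtool check rules /alertmanager/alert.rules before reloading Prometheus.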
ContainerHighThrottleRate (0 active)
alert: ContainerHighThrottleRate
expr: rate(container_cpu_cfs_throttled_seconds_total[3m]) > 1
for: 2m
labels:
  severity: warning
annotations:
  description: |-
    Container is being throttled
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: Container high throttle rate (instance {{ $labels.instance }})
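The web UI flattens each rule into the key/value form shown on this page; in the rule file itself every alert sits under a group. A sketch of how the first rule above would look on disk (the group name is taken from the page header, the rule body is copied from ContainerHighThrottleRate above):

  # alert.rules (sketch of the on-disk layout)
  groups:
    - name: AllInstances
      rules:
        - alert: ContainerHighThrottleRate
          expr: rate(container_cpu_cfs_throttled_seconds_total[3m]) > 1
          for: 2m
          labels:
            severity: warning
          annotations:
            summary: Container high throttle rate (instance {{ $labels.instance }})
            description: |-
              Container is being throttled
                VALUE = {{ $value }}
                LABELS = {{ $labels }}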
ContainerKilled (0 active)
alert: ContainerKilled
expr: time() - container_last_seen > 60
labels:
  severity: warning
annotations:
  description: |-
    A container has disappeared
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: Container killed (instance {{ $labels.instance }})
ContainerVolumeIoUsage (0 active)
alert: ContainerVolumeIoUsage
expr: (sum by(instance, name) (container_fs_io_current) * 100) > 80
for: 2m
labels:
  severity: warning
annotations:
  description: |-
    Container Volume IO usage is above 80%
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: Container Volume IO usage (instance {{ $labels.instance }})
ContainerVolumeUsage (0 active)
alert: ContainerVolumeUsage
expr: (1 - (sum by(instance) (container_fs_inodes_free) / sum by(instance) (container_fs_inodes_total))) * 100 > 80
for: 2m
labels:
  severity: warning
annotations:
  description: |-
    Container Volume usage is above 80%
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: Container Volume usage (instance {{ $labels.instance }})
HostCpuStealNoisyNeighbor (0 active)
alert: HostCpuStealNoisyNeighbor
expr: avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10
labels:
  severity: warning
annotations:
  description: |-
    CPU steal is > 10%. A noisy neighbor is hurting VM performance, or a spot instance may be out of CPU credit.
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: Host CPU steal noisy neighbor (instance {{ $labels.instance }})
HostHighCpuLoad (0 active)
alert: HostHighCpuLoad
expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[2m])) * 100) > 80
labels:
  severity: warning
annotations:
  description: |-
    CPU load is > 80%
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: Host high CPU load (instance {{ $labels.instance }})
HostMemoryUnderMemoryPressure (0 active)
alert: HostMemoryUnderMemoryPressure
expr: rate(node_vmstat_pgmajfault[1m]) > 1000
for: 2m
labels:
  severity: warning
annotations:
  description: |-
    The node is under heavy memory pressure (high rate of major page faults).
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: Host memory under memory pressure (instance {{ $labels.instance }})
HostNetworkInterfaceSaturated (0 active)
alert: HostNetworkInterfaceSaturated
expr: (rate(node_network_receive_bytes_total{device!~"^tap.*"}[1m]) + rate(node_network_transmit_bytes_total{device!~"^tap.*"}[1m]))
  / node_network_speed_bytes{device!~"^tap.*"} > 0.8 < 10000
for: 1m
labels:
  severity: warning
annotations:
  description: |-
    The network interface "{{ $labels.device }}" on "{{ $labels.instance }}" is getting overloaded.
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: Host Network Interface Saturated (instance {{ $labels.instance }})
HostNetworkReceiveErrors (0 active)
alert: HostNetworkReceiveErrors
expr: rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01
for: 2m
labels:
  severity: warning
annotations:
  description: |-
    Host {{ $labels.instance }} interface {{ $labels.device }} has encountered receive errors on more than 1% of packets over the last two minutes.
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: Host Network Receive Errors (instance {{ $labels.instance }})
HostNetworkTransmitErrors (0 active)
alert: HostNetworkTransmitErrors
expr: rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01
for: 2m
labels:
  severity: warning
annotations:
  description: |-
    Host {{ $labels.instance }} interface {{ $labels.device }} has encountered transmit errors on more than 1% of packets over the last two minutes.
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: Host Network Transmit Errors (instance {{ $labels.instance }})
HostOutOfDiskSpace (0 active)
alert: HostOutOfDiskSpace
expr: (node_filesystem_avail_bytes{device!="rootfs",mountpoint="/"} * 100) / node_filesystem_size_bytes{device!="rootfs",mountpoint="/"} < 10
  and node_filesystem_readonly == 0
for: 2m
labels:
  severity: warning
annotations:
  description: |-
    Disk is almost full (< 10% left)
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: Host out of disk space (instance {{ $labels.instance }})
HostOutOfMemory (0 active)
alert: HostOutOfMemory
expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
for: 2m
labels:
  severity: warning
annotations:
  description: |-
    Node memory is filling up (< 10% left)
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: Host out of memory (instance {{ $labels.instance }})
HostSwapIsFillingUp (0 active)
alert: HostSwapIsFillingUp
expr: (1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80
for: 2m
labels:
  severity: warning
annotations:
  description: |-
    Swap is filling up (>80%)
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: Host swap is filling up (instance {{ $labels.instance }})
HostUnusualDiskReadLatency (0 active)
alert: HostUnusualDiskReadLatency
expr: rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1
  and rate(node_disk_reads_completed_total[1m]) > 0
for: 2m
labels:
  severity: warning
annotations:
  description: |-
    Disk latency is growing (read operations > 100ms)
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: Host unusual disk read latency (instance {{ $labels.instance }})
HostUnusualDiskReadRate (0 active)
alert: HostUnusualDiskReadRate
expr: sum by(instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50
for: 5m
labels:
  severity: warning
annotations:
  description: |-
    Disk is probably reading too much data (> 50 MB/s)
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: Host unusual disk read rate (instance {{ $labels.instance }})
HostUnusualDiskWriteLatency (0 active)
alert: HostUnusualDiskWriteLatency
expr: rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1
  and rate(node_disk_writes_completed_total[1m]) > 0
for: 2m
labels:
  severity: warning
annotations:
  description: |-
    Disk latency is growing (write operations > 100ms)
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: Host unusual disk write latency (instance {{ $labels.instance }})
HostUnusualDiskWriteRate (0 active)
alert: HostUnusualDiskWriteRate
expr: sum by(instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50
for: 2m
labels:
  severity: warning
annotations:
  description: |-
    Disk is probably writing too much data (> 50 MB/s)
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: Host unusual disk write rate (instance {{ $labels.instance }})
HostUnusualNetworkThroughputIn (0 active)
alert: HostUnusualNetworkThroughputIn
expr: sum by(instance) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100
for: 5m
labels:
  severity: warning
annotations:
  description: |-
    Host network interfaces are probably receiving too much data (> 100 MB/s)
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: Host unusual network throughput in (instance {{ $labels.instance }})
HostUnusualNetworkThroughputOut (0 active)
alert: HostUnusualNetworkThroughputOut
expr: sum by(instance) (rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100
for: 5m
labels:
  severity: warning
annotations:
  description: |-
    Host network interfaces are probably sending too much data (> 100 MB/s)
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: Host unusual network throughput out (instance {{ $labels.instance }})
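The disk and network throughput alerts above divide a byte rate by 1024 twice so the threshold reads in MB/s. If the same value is also wanted on dashboards, a recording rule can precompute it. This is an optional sketch, not part of the file shown here; the group and record names are assumed examples:

  # Optional recording rule (sketch) -- not part of /alertmanager/alert.rules as shown
  groups:
    - name: AllInstances-recordings
      rules:
        - record: instance:node_network_receive_mbytes:rate2m
          expr: sum by(instance) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024

With such a rule in place, HostUnusualNetworkThroughputIn could be expressed as instance:node_network_receive_mbytes:rate2m > 100.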
InstanceDown (0 active)
alert: InstanceDown
expr: up{job="host"} == 0
for: 1m
labels:
  severity: critical
annotations:
  description: '{{ $labels.instance }} of job {{ $labels.job }} has been down for
    more than 1 minute.'
  title: Instance {{ $labels.instance }} down
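Every rule in this group carries a severity label of either warning or critical, which Alertmanager can use for routing. A sketch of an alertmanager.yml route that sends critical alerts such as InstanceDown to a separate receiver; the receiver names and webhook URLs are assumed examples, only the severity values come from the rules on this page:

  # alertmanager.yml (routing sketch) -- receivers and URLs are assumed examples
  route:
    receiver: default
    group_by: ['alertname', 'instance']
    routes:
      - matchers:
          - severity="critical"
        receiver: oncall
  receivers:
    - name: default
      webhook_configs:
        - url: http://example.invalid/alerts
    - name: oncall
      webhook_configs:
        - url: http://example.invalid/oncall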
PrometheusAlertmanagerNotificationFailing (0 active)
alert: PrometheusAlertmanagerNotificationFailing
expr: rate(alertmanager_notifications_failed_total[1m]) > 0
labels:
  severity: critical
annotations:
  description: |-
    Alertmanager is failing to send notifications
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: Prometheus AlertManager notification failing (instance {{ $labels.instance }})
PrometheusLargeScrape (0 active)
alert: PrometheusLargeScrape
expr: increase(prometheus_target_scrapes_exceeded_sample_limit_total[10m]) > 10
for: 5m
labels:
  severity: warning
annotations:
  description: |-
    Prometheus has many scrapes that exceed the sample limit
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: Prometheus large scrape (instance {{ $labels.instance }})
PrometheusNotificationsBacklog (0 active)
alert: PrometheusNotificationsBacklog
expr: min_over_time(prometheus_notifications_queue_length[10m]) > 0
labels:
  severity: warning
annotations:
  description: |-
    The Prometheus notification queue has not been empty for 10 minutes
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: Prometheus notifications backlog (instance {{ $labels.instance }})
PrometheusRuleEvaluationFailures (0 active)
alert: PrometheusRuleEvaluationFailures
expr: increase(prometheus_rule_evaluation_failures_total[3m]) > 0
labels:
  severity: critical
annotations:
  description: |-
    Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: Prometheus rule evaluation failures (instance {{ $labels.instance }})
PrometheusRuleEvaluationSlow (0 active)
alert: PrometheusRuleEvaluationSlow
expr: prometheus_rule_group_last_duration_seconds > prometheus_rule_group_interval_seconds
for: 5m
labels:
  severity: warning
annotations:
  description: |-
    Prometheus rule evaluation took longer than the scheduled interval. This indicates slow storage backend access or an overly complex query.
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: Prometheus rule evaluation slow (instance {{ $labels.instance }})
PrometheusTargetEmpty (0 active)
alert: PrometheusTargetEmpty
expr: prometheus_sd_discovered_targets == 0
labels:
  severity: critical
annotations:
  description: |-
    Prometheus has no targets in service discovery
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: Prometheus target empty (instance {{ $labels.instance }})
PrometheusTargetScrapeDuplicate (0 active)
alert: PrometheusTargetScrapeDuplicate
expr: increase(prometheus_target_scrapes_sample_duplicate_timestamp_total[5m]) > 0
labels:
  severity: warning
annotations:
  description: |-
    Prometheus has many samples rejected due to duplicate timestamps but different values
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: Prometheus target scrape duplicate (instance {{ $labels.instance }})
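These rules can also be unit-tested offline with promtool test rules. A sketch of a test for the InstanceDown rule above; the test file name, instance name, and sample values are assumed examples, while the expected labels and annotations follow from the rule definition on this page:

  # instancedown_test.yml (sketch) -- run with: promtool test rules instancedown_test.yml
  rule_files:
    - /alertmanager/alert.rules
  evaluation_interval: 1m
  tests:
    - interval: 1m
      input_series:
        # Target reports down for five consecutive minutes
        - series: 'up{job="host", instance="node1:9100"}'
          values: '0 0 0 0 0'
      alert_rule_test:
        - eval_time: 2m
          alertname: InstanceDown
          exp_alerts:
            - exp_labels:
                severity: critical
                job: host
                instance: node1:9100
              exp_annotations:
                title: Instance node1:9100 down
                description: node1:9100 of job host has been down for more than 1 minute.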