| Rule | State | Error | Last Evaluation | Evaluation Time |
| alert: InstanceDown
expr: up{job="host"} == 0
for: 1m
labels:
severity: critical
annotations:
description: '{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 1 minute.'
title: Instance {{ $labels.instance }} down
|
ok
|
|
59.53s ago
|
380.1us |
| alert: PrometheusRuleEvaluationFailures
expr: increase(prometheus_rule_evaluation_failures_total[3m]) > 0
labels:
severity: critical
annotations:
description: |-
Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.
VALUE = {{ $value }}
LABELS = {{ $labels }}
summary: Prometheus rule evaluation failures (instance {{ $labels.instance }})
|
ok
|
|
59.53s ago
|
183.8us |
| alert: PrometheusRuleEvaluationSlow
expr: prometheus_rule_group_last_duration_seconds > prometheus_rule_group_interval_seconds
for: 5m
labels:
severity: warning
annotations:
description: |-
Prometheus rule evaluation took more time than the scheduled interval. It indicates a slower storage backend access or too complex query.
VALUE = {{ $value }}
LABELS = {{ $labels }}
summary: Prometheus rule evaluation slow (instance {{ $labels.instance }})
|
ok
|
|
59.53s ago
|
115.8us |
| alert: PrometheusNotificationsBacklog
expr: min_over_time(prometheus_notifications_queue_length[10m]) > 0
labels:
severity: warning
annotations:
description: |-
The Prometheus notification queue has not been empty for 10 minutes
VALUE = {{ $value }}
LABELS = {{ $labels }}
summary: Prometheus notifications backlog (instance {{ $labels.instance }})
|
ok
|
|
59.53s ago
|
112.3us |
| alert: PrometheusAlertmanagerNotificationFailing
expr: rate(alertmanager_notifications_failed_total[1m]) > 0
labels:
severity: critical
annotations:
description: |-
Alertmanager is failing sending notifications
VALUE = {{ $value }}
LABELS = {{ $labels }}
summary: Prometheus AlertManager notification failing (instance {{ $labels.instance }})
|
ok
|
|
59.53s ago
|
82.09us |
| alert: PrometheusTargetEmpty
expr: prometheus_sd_discovered_targets == 0
labels:
severity: critical
annotations:
description: |-
Prometheus has no target in service discovery
VALUE = {{ $value }}
LABELS = {{ $labels }}
summary: Prometheus target empty (instance {{ $labels.instance }})
|
ok
|
|
59.53s ago
|
134.6us |
| alert: PrometheusLargeScrape
expr: increase(prometheus_target_scrapes_exceeded_sample_limit_total[10m]) > 10
for: 5m
labels:
severity: warning
annotations:
description: |-
Prometheus has many scrapes that exceed the sample limit
VALUE = {{ $value }}
LABELS = {{ $labels }}
summary: Prometheus large scrape (instance {{ $labels.instance }})
|
ok
|
|
59.53s ago
|
96.06us |
| alert: PrometheusTargetScrapeDuplicate
expr: increase(prometheus_target_scrapes_sample_duplicate_timestamp_total[5m]) > 0
labels:
severity: warning
annotations:
description: |-
Prometheus has many samples rejected due to duplicate timestamps but different values
VALUE = {{ $value }}
LABELS = {{ $labels }}
summary: Prometheus target scrape duplicate (instance {{ $labels.instance }})
|
ok
|
|
59.53s ago
|
88.84us |
| alert: HostOutOfMemory
expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
for: 2m
labels:
severity: warning
annotations:
description: |-
Node memory is filling up (< 10% left)
VALUE = {{ $value }}
LABELS = {{ $labels }}
summary: Host out of memory (instance {{ $labels.instance }})
|
ok
|
|
59.53s ago
|
663us |
| alert: HostMemoryUnderMemoryPressure
expr: rate(node_vmstat_pgmajfault[1m]) > 1000
for: 2m
labels:
severity: warning
annotations:
description: |-
The node is under heavy memory pressure. High rate of major page faults
VALUE = {{ $value }}
LABELS = {{ $labels }}
summary: Host memory under memory pressure (instance {{ $labels.instance }})
|
ok
|
|
59.53s ago
|
229.1us |
| alert: HostUnusualNetworkThroughputIn
expr: sum by(instance) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100
for: 5m
labels:
severity: warning
annotations:
description: |-
Host network interfaces are probably receiving too much data (> 100 MB/s)
VALUE = {{ $value }}
LABELS = {{ $labels }}
summary: Host unusual network throughput in (instance {{ $labels.instance }})
|
ok
|
|
59.53s ago
|
464.4us |
| alert: HostUnusualNetworkThroughputOut
expr: sum by(instance) (rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100
for: 5m
labels:
severity: warning
annotations:
description: |-
Host network interfaces are probably sending too much data (> 100 MB/s)
VALUE = {{ $value }}
LABELS = {{ $labels }}
summary: Host unusual network throughput out (instance {{ $labels.instance }})
|
ok
|
|
59.53s ago
|
535.3us |
| alert: HostUnusualDiskReadRate
expr: sum by(instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50
for: 5m
labels:
severity: warning
annotations:
description: |-
Disk is probably reading too much data (> 50 MB/s)
VALUE = {{ $value }}
LABELS = {{ $labels }}
summary: Host unusual disk read rate (instance {{ $labels.instance }})
|
ok
|
|
59.53s ago
|
338.7us |
| alert: HostUnusualDiskWriteRate
expr: sum by(instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50
for: 2m
labels:
severity: warning
annotations:
description: |-
Disk is probably writing too much data (> 50 MB/s)
VALUE = {{ $value }}
LABELS = {{ $labels }}
summary: Host unusual disk write rate (instance {{ $labels.instance }})
|
ok
|
|
59.529s ago
|
299.4us |
| alert: HostOutOfDiskSpace
expr: (node_filesystem_avail_bytes{device!="rootfs",mountpoint="/"} * 100) / node_filesystem_size_bytes{device!="rootfs",mountpoint="/"} < 10 and node_filesystem_readonly == 0
for: 2m
labels:
severity: warning
annotations:
description: |-
Disk is almost full (< 10% left)
VALUE = {{ $value }}
LABELS = {{ $labels }}
summary: Host out of disk space (instance {{ $labels.instance }})
|
ok
|
|
59.529s ago
|
1.001ms |
| alert: HostUnusualDiskReadLatency
expr: rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0
for: 2m
labels:
severity: warning
annotations:
description: |-
Disk latency is growing (read operations > 100ms)
VALUE = {{ $value }}
LABELS = {{ $labels }}
summary: Host unusual disk read latency (instance {{ $labels.instance }})
|
ok
|
|
59.528s ago
|
425.8us |
| alert: HostUnusualDiskWriteLatency
expr: rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0
for: 2m
labels:
severity: warning
annotations:
description: |-
Disk latency is growing (write operations > 100ms)
VALUE = {{ $value }}
LABELS = {{ $labels }}
summary: Host unusual disk write latency (instance {{ $labels.instance }})
|
ok
|
|
59.528s ago
|
497.2us |
| alert: HostHighCpuLoad
expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[2m])) * 100) > 80
labels:
severity: warning
annotations:
description: |-
CPU load is > 80%
VALUE = {{ $value }}
LABELS = {{ $labels }}
summary: Host high CPU load (instance {{ $labels.instance }})
|
ok
|
|
59.528s ago
|
431.1us |
| alert: HostCpuStealNoisyNeighbor
expr: avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10
labels:
severity: warning
annotations:
description: |-
CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.
VALUE = {{ $value }}
LABELS = {{ $labels }}
summary: Host CPU steal noisy neighbor (instance {{ $labels.instance }})
|
ok
|
|
59.527s ago
|
282.3us |
| alert: HostSwapIsFillingUp
expr: (1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80
for: 2m
labels:
severity: warning
annotations:
description: |-
Swap is filling up (>80%)
VALUE = {{ $value }}
LABELS = {{ $labels }}
summary: Host swap is filling up (instance {{ $labels.instance }})
|
ok
|
|
59.527s ago
|
262.5us |
| alert: HostNetworkReceiveErrors
expr: rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01
for: 2m
labels:
severity: warning
annotations:
description: |-
Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} receive errors in the last two minutes.
VALUE = {{ $value }}
LABELS = {{ $labels }}
summary: Host Network Receive Errors (instance {{ $labels.instance }})
|
ok
|
|
59.527s ago
|
369.1us |
| alert: HostNetworkTransmitErrors
expr: rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01
for: 2m
labels:
severity: warning
annotations:
description: |-
Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} transmit errors in the last two minutes.
VALUE = {{ $value }}
LABELS = {{ $labels }}
summary: Host Network Transmit Errors (instance {{ $labels.instance }})
|
ok
|
|
59.527s ago
|
368.3us |
| alert: HostNetworkInterfaceSaturated
expr: (rate(node_network_receive_bytes_total{device!~"^tap.*"}[1m]) + rate(node_network_transmit_bytes_total{device!~"^tap.*"}[1m])) / node_network_speed_bytes{device!~"^tap.*"} > 0.8 < 10000
for: 1m
labels:
severity: warning
annotations:
description: |-
The network interface "{{ $labels.device }}" on "{{ $labels.instance }}" is getting overloaded.
VALUE = {{ $value }}
LABELS = {{ $labels }}
summary: Host Network Interface Saturated (instance {{ $labels.instance }})
|
ok
|
|
59.527s ago
|
1.196ms |
| alert: ContainerKilled
expr: time() - container_last_seen > 60
labels:
severity: warning
annotations:
description: |-
A container has disappeared
VALUE = {{ $value }}
LABELS = {{ $labels }}
summary: Container killed (instance {{ $labels.instance }})
|
ok
|
|
59.526s ago
|
4.543ms |
| alert: ContainerVolumeUsage
expr: (1 - (sum by(instance) (container_fs_inodes_free) / sum by(instance) (container_fs_inodes_total))) * 100 > 80
for: 2m
labels:
severity: warning
annotations:
description: |-
Container Volume usage is above 80%
VALUE = {{ $value }}
LABELS = {{ $labels }}
summary: Container Volume usage (instance {{ $labels.instance }})
|
ok
|
|
59.521s ago
|
1.659ms |
| alert: ContainerVolumeIoUsage
expr: (sum by(instance, name) (container_fs_io_current) * 100) > 80
for: 2m
labels:
severity: warning
annotations:
description: |-
Container Volume IO usage is above 80%
VALUE = {{ $value }}
LABELS = {{ $labels }}
summary: Container Volume IO usage (instance {{ $labels.instance }})
|
ok
|
|
59.52s ago
|
910.9us |
| alert: ContainerHighThrottleRate
expr: rate(container_cpu_cfs_throttled_seconds_total[3m]) > 1
for: 2m
labels:
severity: warning
annotations:
description: |-
Container is being throttled
VALUE = {{ $value }}
LABELS = {{ $labels }}
summary: Container high throttle rate (instance {{ $labels.instance }})
|
ok
|
|
59.519s ago
|
91.5us |