| Rule | State | Error | Last Evaluation | Evaluation Time |
| alert: InstanceDown
  expr: up{job="host"} == 0
  for: 1m
  labels:
    severity: critical
  annotations:
    description: '{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 1 minute.'
    title: Instance {{ $labels.instance }} down
| ok | | 1m38.23s ago | 589.7us |
| alert: PrometheusRuleEvaluationFailures
  expr: increase(prometheus_rule_evaluation_failures_total[3m]) > 0
  labels:
    severity: critical
  annotations:
    description: |-
      Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
    summary: Prometheus rule evaluation failures (instance {{ $labels.instance }})
| ok | | 1m38.229s ago | 208.2us |
| alert: PrometheusRuleEvaluationSlow
  expr: prometheus_rule_group_last_duration_seconds > prometheus_rule_group_interval_seconds
  for: 5m
  labels:
    severity: warning
  annotations:
    description: |-
      Prometheus rule evaluation took more time than the scheduled interval. It indicates a slower storage backend access or too complex query.
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
    summary: Prometheus rule evaluation slow (instance {{ $labels.instance }})
| ok | | 1m38.229s ago | 106.5us |
| alert: PrometheusNotificationsBacklog
  expr: min_over_time(prometheus_notifications_queue_length[10m]) > 0
  labels:
    severity: warning
  annotations:
    description: |-
      The Prometheus notification queue has not been empty for 10 minutes
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
    summary: Prometheus notifications backlog (instance {{ $labels.instance }})
| ok | | 1m38.229s ago | 102.2us |
| alert: PrometheusAlertmanagerNotificationFailing
  expr: rate(alertmanager_notifications_failed_total[1m]) > 0
  labels:
    severity: critical
  annotations:
    description: |-
      Alertmanager is failing sending notifications
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
    summary: Prometheus AlertManager notification failing (instance {{ $labels.instance }})
| ok | | 1m38.229s ago | 75.41us |
| alert: PrometheusTargetEmpty
  expr: prometheus_sd_discovered_targets == 0
  labels:
    severity: critical
  annotations:
    description: |-
      Prometheus has no target in service discovery
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
    summary: Prometheus target empty (instance {{ $labels.instance }})
| ok | | 1m38.229s ago | 177.3us |
| alert: PrometheusLargeScrape
  expr: increase(prometheus_target_scrapes_exceeded_sample_limit_total[10m]) > 10
  for: 5m
  labels:
    severity: warning
  annotations:
    description: |-
      Prometheus has many scrapes that exceed the sample limit
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
    summary: Prometheus large scrape (instance {{ $labels.instance }})
| ok | | 1m38.229s ago | 119us |
| alert: PrometheusTargetScrapeDuplicate
  expr: increase(prometheus_target_scrapes_sample_duplicate_timestamp_total[5m]) > 0
  labels:
    severity: warning
  annotations:
    description: |-
      Prometheus has many samples rejected due to duplicate timestamps but different values
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
    summary: Prometheus target scrape duplicate (instance {{ $labels.instance }})
| ok | | 1m38.23s ago | 96.27us |
| alert: HostOutOfMemory
  expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
  for: 2m
  labels:
    severity: warning
  annotations:
    description: |-
      Node memory is filling up (< 10% left)
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
    summary: Host out of memory (instance {{ $labels.instance }})
| ok | | 1m38.23s ago | 475.1us |
| alert: HostMemoryUnderMemoryPressure
  expr: rate(node_vmstat_pgmajfault[1m]) > 1000
  for: 2m
  labels:
    severity: warning
  annotations:
    description: |-
      The node is under heavy memory pressure. High rate of major page faults
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
    summary: Host memory under memory pressure (instance {{ $labels.instance }})
| ok | | 1m38.229s ago | 192us |
| alert: HostUnusualNetworkThroughputIn
  expr: sum by(instance) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100
  for: 5m
  labels:
    severity: warning
  annotations:
    description: |-
      Host network interfaces are probably receiving too much data (> 100 MB/s)
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
    summary: Host unusual network throughput in (instance {{ $labels.instance }})
| ok | | 1m38.23s ago | 502.2us |
| alert: HostUnusualNetworkThroughputOut
  expr: sum by(instance) (rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100
  for: 5m
  labels:
    severity: warning
  annotations:
    description: |-
      Host network interfaces are probably sending too much data (> 100 MB/s)
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
    summary: Host unusual network throughput out (instance {{ $labels.instance }})
| ok | | 1m38.23s ago | 651us |
| alert: HostUnusualDiskReadRate
  expr: sum by(instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50
  for: 5m
  labels:
    severity: warning
  annotations:
    description: |-
      Disk is probably reading too much data (> 50 MB/s)
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
    summary: Host unusual disk read rate (instance {{ $labels.instance }})
| ok | | 1m38.229s ago | 313.6us |
| alert: HostUnusualDiskWriteRate
  expr: sum by(instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50
  for: 2m
  labels:
    severity: warning
  annotations:
    description: |-
      Disk is probably writing too much data (> 50 MB/s)
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
    summary: Host unusual disk write rate (instance {{ $labels.instance }})
| ok | | 1m38.229s ago | 330.6us |
| alert: HostOutOfDiskSpace
  expr: (node_filesystem_avail_bytes{device!="rootfs",mountpoint="/"} * 100) / node_filesystem_size_bytes{device!="rootfs",mountpoint="/"} < 10 and node_filesystem_readonly == 0
  for: 2m
  labels:
    severity: warning
  annotations:
    description: |-
      Disk is almost full (< 10% left)
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
    summary: Host out of disk space (instance {{ $labels.instance }})
| ok | | 1m38.229s ago | 1.048ms |
| alert: HostUnusualDiskReadLatency
  expr: rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0
  for: 2m
  labels:
    severity: warning
  annotations:
    description: |-
      Disk latency is growing (read operations > 100ms)
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
    summary: Host unusual disk read latency (instance {{ $labels.instance }})
| ok | | 1m38.228s ago | 359.1us |
| alert: HostUnusualDiskWriteLatency
  expr: rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0
  for: 2m
  labels:
    severity: warning
  annotations:
    description: |-
      Disk latency is growing (write operations > 100ms)
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
    summary: Host unusual disk write latency (instance {{ $labels.instance }})
| ok | | 1m38.228s ago | 348.6us |
| alert: HostHighCpuLoad
  expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[2m])) * 100) > 80
  labels:
    severity: warning
  annotations:
    description: |-
      CPU load is > 80%
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
    summary: Host high CPU load (instance {{ $labels.instance }})
| ok | | 1m38.227s ago | 501.4us |
| alert: HostCpuStealNoisyNeighbor
  expr: avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10
  labels:
    severity: warning
  annotations:
    description: |-
      CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
    summary: Host CPU steal noisy neighbor (instance {{ $labels.instance }})
| ok | | 1m38.227s ago | 268.6us |
| alert: HostSwapIsFillingUp
  expr: (1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80
  for: 2m
  labels:
    severity: warning
  annotations:
    description: |-
      Swap is filling up (>80%)
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
    summary: Host swap is filling up (instance {{ $labels.instance }})
| ok | | 1m38.227s ago | 253.8us |
| alert: HostNetworkReceiveErrors
  expr: rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01
  for: 2m
  labels:
    severity: warning
  annotations:
    description: |-
      Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} receive errors in the last two minutes.
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
    summary: Host Network Receive Errors (instance {{ $labels.instance }})
| ok | | 1m38.227s ago | 401.2us |
| alert: HostNetworkTransmitErrors
  expr: rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01
  for: 2m
  labels:
    severity: warning
  annotations:
    description: |-
      Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} transmit errors in the last two minutes.
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
    summary: Host Network Transmit Errors (instance {{ $labels.instance }})
| ok | | 1m38.227s ago | 403.3us |
| alert: HostNetworkInterfaceSaturated
  expr: (rate(node_network_receive_bytes_total{device!~"^tap.*"}[1m]) + rate(node_network_transmit_bytes_total{device!~"^tap.*"}[1m])) / node_network_speed_bytes{device!~"^tap.*"} > 0.8 < 10000
  for: 1m
  labels:
    severity: warning
  annotations:
    description: |-
      The network interface "{{ $labels.device }}" on "{{ $labels.instance }}" is getting overloaded.
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
    summary: Host Network Interface Saturated (instance {{ $labels.instance }})
| ok | | 1m38.226s ago | 1.121ms |
| alert: ContainerKilled
  expr: time() - container_last_seen > 60
  labels:
    severity: warning
  annotations:
    description: |-
      A container has disappeared
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
    summary: Container killed (instance {{ $labels.instance }})
| ok | | 1m38.225s ago | 5.741ms |
| alert: ContainerVolumeUsage
  expr: (1 - (sum by(instance) (container_fs_inodes_free) / sum by(instance) (container_fs_inodes_total))) * 100 > 80
  for: 2m
  labels:
    severity: warning
  annotations:
    description: |-
      Container Volume usage is above 80%
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
    summary: Container Volume usage (instance {{ $labels.instance }})
| ok | | 1m38.22s ago | 1.848ms |
| alert: ContainerVolumeIoUsage
  expr: (sum by(instance, name) (container_fs_io_current) * 100) > 80
  for: 2m
  labels:
    severity: warning
  annotations:
    description: |-
      Container Volume IO usage is above 80%
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
    summary: Container Volume IO usage (instance {{ $labels.instance }})
| ok | | 1m38.218s ago | 1.019ms |
| alert: ContainerHighThrottleRate
  expr: rate(container_cpu_cfs_throttled_seconds_total[3m]) > 1
  for: 2m
  labels:
    severity: warning
  annotations:
    description: |-
      Container is being throttled
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
    summary: Container high throttle rate (instance {{ $labels.instance }})
| ok | | 1m38.217s ago | 68.54us |
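Every rule in the table follows the same Prometheus alerting-rule schema: a PromQL `expr` that defines the firing condition, an optional `for` duration the condition must hold before the alert fires, and `labels`/`annotations` attached to the resulting alert. As a minimal sketch of how such a rule is wired up, the first rule from the table could live in a rules file like the one below; the file name `alerts.yml` and the group name `example` are placeholders, not taken from this setup.

```yaml
# alerts.yml -- loaded via the rule_files section of prometheus.yml
groups:
  - name: example                      # placeholder group name; all rules in a group share one evaluation interval
    rules:
      - alert: InstanceDown
        expr: up{job="host"} == 0      # fires when the scrape target stops reporting
        for: 1m                        # condition must hold for 1 minute before the alert fires
        labels:
          severity: critical
        annotations:
          title: Instance {{ $labels.instance }} down
          description: '{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 1 minute.'
```

The file is referenced from `prometheus.yml` under `rule_files:`, and `promtool check rules alerts.yml` validates the syntax before Prometheus is reloaded.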