| Rule | State | Error | Last Evaluation | Evaluation Time |
| alert: InstanceDown
expr: up{job="host"} == 0
for: 1m
labels:
severity: critical
annotations:
description: '{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 1 minute.'
title: Instance {{ $labels.instance }} down
|
ok
|
|
59.53s ago
|
380.1us |
| alert: PrometheusRuleEvaluationFailures
expr: increase(prometheus_rule_evaluation_failures_total[3m]) > 0
labels:
severity: critical
annotations:
description: |-
Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.
VALUE = {{ $value }}
LABELS = {{ $labels }}
summary: Prometheus rule evaluation failures (instance {{ $labels.instance }})
|
ok
|
|
59.53s ago
|
183.8us |
| alert: PrometheusRuleEvaluationSlow
expr: prometheus_rule_group_last_duration_seconds > prometheus_rule_group_interval_seconds
for: 5m
labels:
severity: warning
annotations:
description: |-
Prometheus rule evaluation took more time than the scheduled interval. It indicates a slower storage backend access or too complex query.
VALUE = {{ $value }}
LABELS = {{ $labels }}
summary: Prometheus rule evaluation slow (instance {{ $labels.instance }})
|
ok
|
|
59.53s ago
|
115.8us |
| alert: PrometheusNotificationsBacklog
expr: min_over_time(prometheus_notifications_queue_length[10m]) > 0
labels:
severity: warning
annotations:
description: |-
The Prometheus notification queue has not been empty for 10 minutes
VALUE = {{ $value }}
LABELS = {{ $labels }}
summary: Prometheus notifications backlog (instance {{ $labels.instance }})
|
ok
|
|
59.53s ago
|
112.3us |
| alert: PrometheusAlertmanagerNotificationFailing
expr: rate(alertmanager_notifications_failed_total[1m]) > 0
labels:
severity: critical
annotations:
description: |-
Alertmanager is failing sending notifications
VALUE = {{ $value }}
LABELS = {{ $labels }}
summary: Prometheus AlertManager notification failing (instance {{ $labels.instance }})
|
ok
|
|
59.53s ago
|
82.09us |
| alert: PrometheusTargetEmpty
expr: prometheus_sd_discovered_targets == 0
labels:
severity: critical
annotations:
description: |-
Prometheus has no target in service discovery
VALUE = {{ $value }}
LABELS = {{ $labels }}
summary: Prometheus target empty (instance {{ $labels.instance }})
|
ok
|
|
59.53s ago
|
134.6us |
| alert: PrometheusLargeScrape
expr: increase(prometheus_target_scrapes_exceeded_sample_limit_total[10m]) > 10
for: 5m
labels:
severity: warning
annotations:
description: |-
Prometheus has many scrapes that exceed the sample limit
VALUE = {{ $value }}
LABELS = {{ $labels }}
summary: Prometheus large scrape (instance {{ $labels.instance }})
|
ok
|
|
59.53s ago
|
96.06us |
| alert: PrometheusTargetScrapeDuplicate
expr: increase(prometheus_target_scrapes_sample_duplicate_timestamp_total[5m]) > 0
labels:
severity: warning
annotations:
description: |-
Prometheus has many samples rejected due to duplicate timestamps but different values
VALUE = {{ $value }}
LABELS = {{ $labels }}
summary: Prometheus target scrape duplicate (instance {{ $labels.instance }})
|
ok
|
|
59.53s ago
|
88.84us |
| alert: HostOutOfMemory
expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
for: 2m
labels:
severity: warning
annotations:
description: |-
Node memory is filling up (< 10% left)
VALUE = {{ $value }}
LABELS = {{ $labels }}
summary: Host out of memory (instance {{ $labels.instance }})
|
ok
|
|
59.53s ago
|
663us |
| alert: HostMemoryUnderMemoryPressure
expr: rate(node_vmstat_pgmajfault[1m]) > 1000
for: 2m
labels:
severity: warning
annotations:
description: |-
The node is under heavy memory pressure. High rate of major page faults
VALUE = {{ $value }}
LABELS = {{ $labels }}
summary: Host memory under memory pressure (instance {{ $labels.instance }})
|
ok
|
|
59.53s ago
|
229.1us |
| alert: HostUnusualNetworkThroughputIn
expr: sum by(instance) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100
for: 5m
labels:
severity: warning
annotations:
description: |-
Host network interfaces are probably receiving too much data (> 100 MB/s)
VALUE = {{ $value }}
LABELS = {{ $labels }}
summary: Host unusual network throughput in (instance {{ $labels.instance }})
|
ok
|
|
59.53s ago
|
464.4us |
| alert: HostUnusualNetworkThroughputOut
expr: sum by(instance) (rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100
for: 5m
labels:
severity: warning
annotations:
description: |-
Host network interfaces are probably sending too much data (> 100 MB/s)
VALUE = {{ $value }}
LABELS = {{ $labels }}
summary: Host unusual network throughput out (instance {{ $labels.instance }})
|
ok
|
|
59.53s ago
|
535.3us |
| alert: HostUnusualDiskReadRate
expr: sum by(instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50
for: 5m
labels:
severity: warning
annotations:
description: |-
Disk is probably reading too much data (> 50 MB/s)
VALUE = {{ $value }}
LABELS = {{ $labels }}
summary: Host unusual disk read rate (instance {{ $labels.instance }})
|
ok
|
|
59.53s ago
|
338.7us |
| alert: HostUnusualDiskWriteRate
expr: sum by(instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50
for: 2m
labels:
severity: warning
annotations:
description: |-
Disk is probably writing too much data (> 50 MB/s)
VALUE = {{ $value }}
LABELS = {{ $labels }}
summary: Host unusual disk write rate (instance {{ $labels.instance }})
|
ok
|
|
59.529s ago
|
299.4us |
| alert: HostOutOfDiskSpace
expr: (node_filesystem_avail_bytes{device!="rootfs",mountpoint="/"} * 100) / node_filesystem_size_bytes{device!="rootfs",mountpoint="/"} < 10 and node_filesystem_readonly == 0
for: 2m
labels:
severity: warning
annotations:
description: |-
Disk is almost full (< 10% left)
VALUE = {{ $value }}
LABELS = {{ $labels }}
summary: Host out of disk space (instance {{ $labels.instance }})
|
ok
|
|
59.529s ago
|
1.001ms |
| alert: HostUnusualDiskReadLatency
expr: rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0
for: 2m
labels:
severity: warning
annotations:
description: |-
Disk latency is growing (read operations > 100ms)
VALUE = {{ $value }}
LABELS = {{ $labels }}
summary: Host unusual disk read latency (instance {{ $labels.instance }})
|
ok
|
|
59.528s ago
|
425.8us |
| alert: HostUnusualDiskWriteLatency
expr: rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0
for: 2m
labels:
severity: warning
annotations:
description: |-
Disk latency is growing (write operations > 100ms)
VALUE = {{ $value }}
LABELS = {{ $labels }}
summary: Host unusual disk write latency (instance {{ $labels.instance }})
|
ok
|
|
59.528s ago
|
497.2us |
| alert: HostHighCpuLoad
expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[2m])) * 100) > 80
labels:
severity: warning
annotations:
description: |-
CPU load is > 80%
VALUE = {{ $value }}
LABELS = {{ $labels }}
summary: Host high CPU load (instance {{ $labels.instance }})
|
ok
|
|
59.528s ago
|
431.1us |
| alert: HostCpuStealNoisyNeighbor
expr: avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10
labels:
severity: warning
annotations:
description: |-
CPU steal is > 10%. A noisy neighbor is killing VM performances or a spot instance may be out of credit.
VALUE = {{ $value }}
LABELS = {{ $labels }}
summary: Host CPU steal noisy neighbor (instance {{ $labels.instance }})
|
ok
|
|
59.527s ago
|
282.3us |
| alert: HostSwapIsFillingUp
expr: (1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80
for: 2m
labels:
severity: warning
annotations:
description: |-
Swap is filling up (>80%)
VALUE = {{ $value }}
LABELS = {{ $labels }}
summary: Host swap is filling up (instance {{ $labels.instance }})
|
ok
|
|
59.527s ago
|
262.5us |
| alert: HostNetworkReceiveErrors
expr: rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01
for: 2m
labels:
severity: warning
annotations:
description: |-
Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} receive errors in the last two minutes.
VALUE = {{ $value }}
LABELS = {{ $labels }}
summary: Host Network Receive Errors (instance {{ $labels.instance }})
|
ok
|
|
59.527s ago
|
369.1us |
| alert: HostNetworkTransmitErrors
expr: rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01
for: 2m
labels:
severity: warning
annotations:
description: |-
Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} transmit errors in the last two minutes.
VALUE = {{ $value }}
LABELS = {{ $labels }}
summary: Host Network Transmit Errors (instance {{ $labels.instance }})
|
ok
|
|
59.527s ago
|
368.3us |
| alert: HostNetworkInterfaceSaturated
expr: (rate(node_network_receive_bytes_total{device!~"^tap.*"}[1m]) + rate(node_network_transmit_bytes_total{device!~"^tap.*"}[1m])) / node_network_speed_bytes{device!~"^tap.*"} > 0.8 < 10000
for: 1m
labels:
severity: warning
annotations:
description: |-
The network interface "{{ $labels.device }}" on "{{ $labels.instance }}" is getting overloaded.
VALUE = {{ $value }}
LABELS = {{ $labels }}
summary: Host Network Interface Saturated (instance {{ $labels.instance }})
|
ok
|
|
59.527s ago
|
1.196ms |
| alert: ContainerKilled
expr: time() - container_last_seen > 60
labels:
severity: warning
annotations:
description: |-
A container has disappeared
VALUE = {{ $value }}
LABELS = {{ $labels }}
summary: Container killed (instance {{ $labels.instance }})
|
ok
|
|
59.526s ago
|
4.543ms |
| alert: ContainerVolumeUsage
expr: (1 - (sum by(instance) (container_fs_inodes_free) / sum by(instance) (container_fs_inodes_total))) * 100 > 80
for: 2m
labels:
severity: warning
annotations:
description: |-
Container Volume usage is above 80%
VALUE = {{ $value }}
LABELS = {{ $labels }}
summary: Container Volume usage (instance {{ $labels.instance }})
|
ok
|
|
59.521s ago
|
1.659ms |
| alert: ContainerVolumeIoUsage
expr: (sum by(instance, name) (container_fs_io_current) * 100) > 80
for: 2m
labels:
severity: warning
annotations:
description: |-
Container Volume IO usage is above 80%
VALUE = {{ $value }}
LABELS = {{ $labels }}
summary: Container Volume IO usage (instance {{ $labels.instance }})
|
ok
|
|
59.52s ago
|
910.9us |
| alert: ContainerHighThrottleRate
expr: rate(container_cpu_cfs_throttled_seconds_total[3m]) > 1
for: 2m
labels:
severity: warning
annotations:
description: |-
Container is being throttled
VALUE = {{ $value }}
LABELS = {{ $labels }}
summary: Container high throttle rate (instance {{ $labels.instance }})
|
ok
|
|
59.519s ago
|
91.5us |