Alerts


/etc/prometheus/rules/blackbox.rules > blackbox-exporter
HttpSlowRequests (0 active)
# Warns when the 1-minute average of probe_http_duration_seconds stays above 3
# (seconds, per the description) for 5 minutes.
alert: HttpSlowRequests
expr: avg_over_time(probe_http_duration_seconds[1m]) > 3
for: 5m
labels:
  severity: warning
annotations:
  description: |-
    HTTP request took more than 3s
      VALUE = {{ $value }}
      LABELS: {{ $labels }}
  # Quoted so the templated value is always read as a string.
  summary: 'HTTP slow requests (instance {{ $labels.instance }})'
HttpStatusCode (0 active)
# Fires when probe_http_status_code is 199 or lower, or 400 or higher,
# for 5 minutes.
alert: HttpStatusCode
expr: probe_http_status_code <= 199 or probe_http_status_code >= 400
for: 5m
labels:
  severity: error
annotations:
  description: |-
    HTTP status code is not 200-399
      VALUE = {{ $value }}
      LABELS: {{ $labels }}
  # Quoted so the templated value is always read as a string.
  summary: 'HTTP Status Code (instance {{ $labels.instance }})'
ProbeFailed (0 active)
# Fires when probe_success has been 0 for 5 minutes.
alert: ProbeFailed
expr: probe_success == 0
for: 5m
labels:
  severity: error
annotations:
  description: |-
    Probe failed
      VALUE = {{ $value }}
      LABELS: {{ $labels }}
  # Quoted so the templated value is always read as a string.
  summary: 'Probe failed (instance {{ $labels.instance }})'
SlowPing (0 active)
# Warns when the 1-minute average of probe_icmp_duration_seconds stays above 1
# (second, per the description) for 5 minutes.
alert: SlowPing
expr: avg_over_time(probe_icmp_duration_seconds[1m]) > 1
for: 5m
labels:
  severity: warning
annotations:
  description: |-
    Blackbox ping took more than 1s
      VALUE = {{ $value }}
      LABELS: {{ $labels }}
  # Quoted so the templated value is always read as a string.
  summary: 'Slow ping (instance {{ $labels.instance }})'
SlowProbe (0 active)
# Warns when the 1-minute average of probe_duration_seconds stays above 3
# (seconds, per the description) for 5 minutes.
alert: SlowProbe
expr: avg_over_time(probe_duration_seconds[1m]) > 3
for: 5m
labels:
  severity: warning
annotations:
  description: |-
    Blackbox probe took more than 3s to complete
      VALUE = {{ $value }}
      LABELS: {{ $labels }}
  # Quoted so the templated value is always read as a string.
  summary: 'Slow probe (instance {{ $labels.instance }})'
SslCertificateHasExpired (0 active)
# Fires when probe_ssl_earliest_cert_expiry minus the current time is zero or
# negative for 5 minutes, i.e. the expiry timestamp is no longer in the future.
alert: SslCertificateHasExpired
expr: probe_ssl_earliest_cert_expiry - time() <= 0
for: 5m
labels:
  severity: error
annotations:
  description: |-
    SSL certificate has expired already
      VALUE = {{ $value }}
      LABELS: {{ $labels }}
  # Quoted so the templated value is always read as a string.
  summary: 'SSL certificate has expired (instance {{ $labels.instance }})'
SslCertificateWillExpireSoon (0 active)
# Warns when probe_ssl_earliest_cert_expiry is less than 20 days
# (86400 seconds * 20) ahead of the current time for 5 minutes.
# NOTE(review): the comparison has no lower bound, so a certificate that has
# already expired also matches this rule; use an Alertmanager inhibit rule or a
# lower bound if the overlap with SslCertificateHasExpired is unwanted.
alert: SslCertificateWillExpireSoon
expr: probe_ssl_earliest_cert_expiry - time() < 86400 * 20
for: 5m
labels:
  severity: warning
annotations:
  # The rule fires for any remaining lifetime below the threshold, not only at
  # exactly 20 days, so the text says "less than".
  description: |-
    SSL certificate expires in less than 20 days
      VALUE = {{ $value }}
      LABELS: {{ $labels }}
  # Quoted so the templated value is always read as a string.
  summary: 'SSL certificate will expire soon (instance {{ $labels.instance }})'
/etc/prometheus/rules/monitoring.rules > monitoring
HostHighCpuLoad (0 active)
# Warns when an instance's CPU utilisation exceeds 80% for 5 minutes.
# Utilisation is 100 minus the idle share: the per-second increase of
# node_cpu_seconds_total{mode="idle"} over 5 minutes, averaged per instance
# and scaled to a percentage.
# rate() is used instead of irate(): irate() only considers the last two
# samples in the window, so a brief dip can reset the `for` timer and keep the
# alert from ever firing under sustained load.
alert: HostHighCpuLoad
expr: >-
  100
  - (avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)
  > 80
for: 5m
labels:
  severity: warning
annotations:
  description: |-
    CPU load is > 80%
      VALUE = {{ $value }}
      LABELS: {{ $labels }}
  # Quoted so the templated value is always read as a string.
  summary: 'Host high CPU load (instance {{ $labels.instance }})'
HostOutOfDiskSpace (0 active)
# Warns when available bytes are below 15% of filesystem size for 2 minutes.
# The `and on (instance, device, mountpoint)` clause keeps only series whose
# matching node_filesystem_readonly sample equals 0.
alert: HostOutOfDiskSpace
expr: >-
  (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 15
  and on (instance, device, mountpoint) node_filesystem_readonly == 0
for: 2m
labels:
  severity: warning
annotations:
  description: |-
    Disk is almost full (< 15% left)
      VALUE = {{ $value }}
      LABELS: {{ $labels }}
  # Quoted so the templated value is always read as a string.
  summary: 'Host out of disk space (instance {{ $labels.instance }})'
HostOutOfMemory (0 active)
# Warns when node_memory_MemAvailable_bytes is below 10% of
# node_memory_MemTotal_bytes for 1 minute.
alert: HostOutOfMemory
expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
for: 1m
labels:
  severity: warning
annotations:
  description: |-
    Node memory is filling up (< 10% left)
      VALUE = {{ $value }}
      LABELS: {{ $labels }}
  # Quoted so the templated value is always read as a string.
  summary: 'Host out of memory (instance {{ $labels.instance }})'
InstanceDown (0 active)
# Critical alert when `up` has been 0 for a target for 1 minute.
# Annotations follow the same description/summary layout as the other rules.
# They reference `instance` and `job` instead of a `service` label;
# NOTE(review): if targets do carry a custom `service` label it still appears
# in the LABELS line below.
alert: InstanceDown
expr: up == 0
for: 1m
labels:
  severity: critical
annotations:
  description: |-
    Target {{ $labels.instance }} of job {{ $labels.job }} has been down for more than 1 minute
      VALUE = {{ $value }}
      LABELS: {{ $labels }}
  # Quoted so the templated value is always read as a string.
  summary: 'Instance down (instance {{ $labels.instance }})'
/etc/prometheus/rules/mysql.rules > mysqld_exporter
MysqlDown (0 active)
# Critical alert when mysql_up has been 0 for 5 minutes.
alert: MysqlDown
expr: mysql_up == 0
for: 5m
labels:
  severity: critical
annotations:
  description: |-
    MySQL instance is down on {{ $labels.instance }}
      VALUE = {{ $value }}
      LABELS: {{ $labels }}
  # Quoted so the templated value is always read as a string.
  summary: 'MySQL down (instance {{ $labels.instance }})'
MysqlHighThreadsRunning (0 active)
# Warns when, per instance, the 5-minute peak of
# mysql_global_status_threads_running exceeds 60% of
# mysql_global_variables_max_connections for 5 minutes.
alert: MysqlHighThreadsRunning
expr: >-
  avg by (instance) (max_over_time(mysql_global_status_threads_running[5m]))
  / avg by (instance) (mysql_global_variables_max_connections)
  * 100 > 60
for: 5m
labels:
  severity: warning
annotations:
  description: |-
    More than 60% of MySQL connections are in running state on {{ $labels.instance }}
      VALUE = {{ $value }}
      LABELS: {{ $labels }}
  # Quoted so the templated value is always read as a string.
  summary: 'MySQL high threads running (instance {{ $labels.instance }})'
MysqlTooManyConnections (0 active)
# Warns when, per instance, the 5-minute peak of
# mysql_global_status_threads_connected exceeds 80% of
# mysql_global_variables_max_connections for 5 minutes.
alert: MysqlTooManyConnections
expr: >-
  avg by (instance) (max_over_time(mysql_global_status_threads_connected[5m]))
  / avg by (instance) (mysql_global_variables_max_connections)
  * 100 > 80
for: 5m
labels:
  severity: warning
annotations:
  description: |-
    More than 80% of MySQL connections are in use on {{ $labels.instance }}
      VALUE = {{ $value }}
      LABELS: {{ $labels }}
  # Quoted so the templated value is always read as a string.
  summary: 'MySQL too many connections (instance {{ $labels.instance }})'