---

# OpenBMP — Grafana unified-alerting rule provisioning
# Grafana 9.1.7 (apiVersion: 1)
#
# Provisioned alert rules for the OpenBMP BGP-monitoring stack. They query the
# PostgreSQL datasource (uid: obmp_postgres) and fire on BGP peer/router
# session loss, peer flap storms, and RPKI-invalid routes.
#
# ----------------------------------------------------------------------
# DEPLOYMENT
# ----------------------------------------------------------------------
# This file is read by Grafana from /etc/grafana/provisioning/alerting/.
# The compose stack bind-mounts ${OBMP_DATA_ROOT}/grafana/provisioning into
# the container, so copy this directory there and restart Grafana:
#
# cp -r obmp-grafana/provisioning/alerting ${OBMP_DATA_ROOT}/grafana/provisioning/
# docker compose -p obmp restart grafana
#
# Pair it with contact-points.yaml (in this directory) for notifications.
#
# ----------------------------------------------------------------------
# OPERATOR REVIEW — fields you should check before relying on these
# ----------------------------------------------------------------------
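# * folder: OBMP-Base — the rule group is filed by folder NAME (the `folder:`
# key below), not by UID. 'OBMP-Base' is the existing dashboard folder
# (UID '1001'), reused so the rules have a home in the UI. Change the name
# to a dedicated alerting folder if you prefer, and confirm after first
# load that the rules appear in the folder you expect.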
# * datasourceUid: obmp_postgres — confirmed correct for this stack.
# * Thresholds and `for:` durations below are reasonable starting points.
# Tune them against your production baseline (40 full-table routers will
# have a different normal flap/churn profile than the lab).
# * The query/reduce/threshold refIds (A, B, C) are internal to each rule;
# do not rename them without updating the matching `expression:` and
# `condition:` references in the same rule.
# * Alert-rule provisioning YAML is intricate. These definitions are
# intentionally minimal and well-commented. After first load, open each
# rule in the Grafana UI (Alerting > Alert rules) and confirm it
# evaluates without error before depending on it for paging.
# ----------------------------------------------------------------------
apiVersion: 1
groups:
  - orgId: 1
    name: OpenBMP BGP Health
    # Folder (by name) that holds every rule in this group.
    folder: OBMP-Base
    # How often every rule in this group is evaluated.
    interval: 1m
    rules:
      # ------------------------------------------------------------------
      # (a) BGP peer down within the last 15 minutes
      # ------------------------------------------------------------------
      # bgp_peers.state is an enum ('up'/'down'); .timestamp is the last
      # state-change time. A peer whose state is 'down' AND changed within
      # the last 15 min indicates a recent session loss.
      #
      # Because the query only counts *recent* transitions, the alert
      # clears by itself once the state change is older than 15 minutes,
      # even if the peer is still down. With `for: 5m` that leaves roughly
      # a 10-minute firing window per outage.
      - uid: obmp-peer-down
        title: BGP Peer Down (recent)
        condition: C
        for: 5m
        data:
          - refId: A
            # The SQL below uses no time macros, so this range does not
            # constrain the query; it is kept equal to the 15-minute SQL
            # window so the rule reads consistently in the UI.
            relativeTimeRange: { from: 900, to: 0 }
            datasourceUid: obmp_postgres
            model:
              refId: A
              datasource: { type: postgres, uid: obmp_postgres }
              format: table
              rawSql: |
                SELECT count(*)::float8 AS value
                FROM bgp_peers
                WHERE state = 'down'
                  AND timestamp > (now() AT TIME ZONE 'utc') - interval '15 minutes';
          # B collapses query A to a single number.
          - refId: B
            datasourceUid: __expr__
            model:
              refId: B
              type: reduce
              datasource: { type: __expr__, uid: __expr__ }
              expression: A
              reducer: last
          # C is the alert condition evaluated on B.
          - refId: C
            datasourceUid: __expr__
            model:
              refId: C
              type: threshold
              datasource: { type: __expr__, uid: __expr__ }
              expression: B
              # Fire when one or more peers went down in the last 15 min.
              conditions:
                - evaluator: { type: gt, params: [0] }
        labels:
          severity: critical
          service: bmp
        annotations:
          summary: One or more BGP peers went down in the last 15 minutes
          description: >-
            {{ $values.B }} BGP peer(s) are in state 'down' with a state
            change within the last 15 minutes. Check the OBMP peer
            inventory and the affected routers.

      # ------------------------------------------------------------------
      # (b) Peer flap storm — >5 down-events for one peer in 1 hour
      # ------------------------------------------------------------------
      # peer_event_log records every peer state transition. Counting 'down'
      # events per peer over the last hour detects a flapping session even
      # if the peer is currently 'up'. The inner query groups per peer; the
      # outer takes the worst offender's count (0 when there are no rows).
      - uid: obmp-peer-flap-storm
        title: BGP Peer Flap Storm
        condition: C
        # No pending period: the 1-hour count is already a sustained signal.
        for: 0m
        data:
          - refId: A
            relativeTimeRange: { from: 3600, to: 0 }
            datasourceUid: obmp_postgres
            model:
              refId: A
              datasource: { type: postgres, uid: obmp_postgres }
              format: table
              rawSql: |
                SELECT coalesce(max(c), 0)::float8 AS value
                FROM (
                  SELECT count(*) AS c
                  FROM peer_event_log
                  WHERE state = 'down'
                    AND timestamp > (now() AT TIME ZONE 'utc') - interval '1 hour'
                  GROUP BY peer_hash_id
                ) s;
          # B collapses query A to a single number.
          - refId: B
            datasourceUid: __expr__
            model:
              refId: B
              type: reduce
              datasource: { type: __expr__, uid: __expr__ }
              expression: A
              reducer: last
          # C is the alert condition evaluated on B.
          - refId: C
            datasourceUid: __expr__
            model:
              refId: C
              type: threshold
              datasource: { type: __expr__, uid: __expr__ }
              expression: B
              # >5 down-events for a single peer within 1h = flap storm.
              conditions:
                - evaluator: { type: gt, params: [5] }
        labels:
          severity: warning
          service: bmp
        annotations:
          summary: A BGP peer is flapping (more than 5 resets in the last hour)
          description: >-
            At least one peer has logged {{ $values.B }} 'down' events in
            peer_event_log within the last hour. Investigate link/session
            instability on the affected peer.

      # ------------------------------------------------------------------
      # (c) RPKI-invalid routes present
      # ------------------------------------------------------------------
      # ip_rib has no RPKI column on this schema, so validity is derived by
      # joining against rpki_validator (ROA cache, refreshed by the psql-app
      # RPKI cron). Following RFC 6811 origin validation:
      #   * covered  = some ROA prefix contains the route prefix
      #                (max length is NOT part of the covering test);
      #   * matched  = a covering ROA also has the same origin AS and a
      #                max length >= the route's prefix length;
      #   * invalid  = covered AND NOT matched.
      # This counts both wrong-origin routes and routes that are more
      # specific than any covering ROA permits.
      #
      # NOTE: rpki_validator is empty until ENABLE_RPKI=1 has run at least
      # once (every ~2h). Until then this rule correctly reports 0.
      #
      # NOTE(review): the query runs correlated subqueries over ip_rib on
      # every group evaluation (interval above). On a large RIB, confirm it
      # completes comfortably within the evaluation interval.
      - uid: obmp-rpki-invalid
        title: RPKI-Invalid Routes Present
        condition: C
        for: 10m
        data:
          - refId: A
            relativeTimeRange: { from: 600, to: 0 }
            datasourceUid: obmp_postgres
            model:
              refId: A
              datasource: { type: postgres, uid: obmp_postgres }
              format: table
              rawSql: |
                SELECT count(*)::float8 AS value
                FROM ip_rib r
                WHERE r.iswithdrawn = false
                  AND r.origin_as IS NOT NULL
                  AND EXISTS (
                    SELECT 1 FROM rpki_validator v
                    WHERE r.prefix <<= v.prefix
                      AND r.prefix_len >= masklen(v.prefix)
                  )
                  AND NOT EXISTS (
                    SELECT 1 FROM rpki_validator v2
                    WHERE r.prefix <<= v2.prefix
                      AND r.prefix_len BETWEEN masklen(v2.prefix) AND v2.prefix_len_max
                      AND v2.origin_as = r.origin_as
                  );
          # B collapses query A to a single number.
          - refId: B
            datasourceUid: __expr__
            model:
              refId: B
              type: reduce
              datasource: { type: __expr__, uid: __expr__ }
              expression: A
              reducer: last
          # C is the alert condition evaluated on B.
          - refId: C
            datasourceUid: __expr__
            model:
              refId: C
              type: threshold
              datasource: { type: __expr__, uid: __expr__ }
              expression: B
              # Any RPKI-invalid route is worth surfacing. Raise the param
              # (e.g. to 10) if you expect a steady-state baseline of
              # invalids and only want to alert on spikes.
              conditions:
                - evaluator: { type: gt, params: [0] }
        labels:
          severity: warning
          service: routing-security
        annotations:
          summary: RPKI-invalid routes are present in the RIB
          description: >-
            {{ $values.B }} route(s) in ip_rib are RPKI-invalid (a covering
            ROA exists but none matches the route's origin AS and prefix
            length). Possible mis-origination or hijack — review the RPKI
            Validation dashboard.

      # ------------------------------------------------------------------
      # (d) Router BMP session down
      # ------------------------------------------------------------------
      # routers.state is the BMP session state for each monitored router.
      # 'down' means the router's BMP feed to the collector has dropped.
      # Unlike rule (a) there is no time window: the alert stays active
      # for as long as any router row is in state 'down'.
      - uid: obmp-router-bmp-down
        title: Router BMP Session Down
        condition: C
        for: 5m
        data:
          - refId: A
            relativeTimeRange: { from: 600, to: 0 }
            datasourceUid: obmp_postgres
            model:
              refId: A
              datasource: { type: postgres, uid: obmp_postgres }
              format: table
              rawSql: |
                SELECT count(*)::float8 AS value
                FROM routers
                WHERE state = 'down';
          # B collapses query A to a single number.
          - refId: B
            datasourceUid: __expr__
            model:
              refId: B
              type: reduce
              datasource: { type: __expr__, uid: __expr__ }
              expression: A
              reducer: last
          # C is the alert condition evaluated on B.
          - refId: C
            datasourceUid: __expr__
            model:
              refId: C
              type: threshold
              datasource: { type: __expr__, uid: __expr__ }
              expression: B
              # Any router with a down BMP session.
              conditions:
                - evaluator: { type: gt, params: [0] }
        labels:
          severity: critical
          service: bmp
        annotations:
          summary: One or more routers have a down BMP session
          description: >-
            {{ $values.B }} router(s) are in BMP state 'down' — the
            collector is no longer receiving BMP from them. Check the
            router BMP config and reachability to the collector on port 5000.