271 lines
11 KiB
YAML
271 lines
11 KiB
YAML
|
|
# OpenBMP — Grafana unified-alerting rule provisioning
|
||
|
|
# Grafana 9.1.7 (apiVersion: 1)
|
||
|
|
#
|
||
|
|
# Provisioned alert rules for the OpenBMP BGP-monitoring stack. They query the
|
||
|
|
# PostgreSQL datasource (uid: obmp_postgres) and fire on BGP peer/router
|
||
|
|
# session loss, peer flap storms, and RPKI-invalid routes.
|
||
|
|
#
|
||
|
|
# ----------------------------------------------------------------------
|
||
|
|
# DEPLOYMENT
|
||
|
|
# ----------------------------------------------------------------------
|
||
|
|
# This file is read by Grafana from /etc/grafana/provisioning/alerting/.
|
||
|
|
# The compose stack bind-mounts ${OBMP_DATA_ROOT}/grafana/provisioning into
|
||
|
|
# the container, so copy this directory there and restart Grafana:
|
||
|
|
#
|
||
|
|
# cp -r obmp-grafana/provisioning/alerting ${OBMP_DATA_ROOT}/grafana/provisioning/
|
||
|
|
# docker compose -p obmp restart grafana
|
||
|
|
#
|
||
|
|
# Pair it with contact-points.yaml (in this directory) for notifications.
|
||
|
|
#
|
||
|
|
# ----------------------------------------------------------------------
|
||
|
|
# OPERATOR REVIEW — fields you should check before relying on these
|
||
|
|
# ----------------------------------------------------------------------
|
||
|
|
#  * folder: OBMP-Base — reuses the existing 'OBMP-Base' dashboard folder so
#    the rules have a home in the UI. This key is the folder *name*, not a
#    folder UID. Change it to a dedicated alerting folder name if you
#    prefer; the folder must already exist in Grafana.
|
||
|
|
# * datasourceUid: obmp_postgres — confirmed correct for this stack.
|
||
|
|
# * Thresholds and `for:` durations below are reasonable starting points.
|
||
|
|
# Tune them against your production baseline (40 full-table routers will
|
||
|
|
# have a different normal flap/churn profile than the lab).
|
||
|
|
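#  * The refIds (A, B, C) are internal to each rule: the reduce step (B)
#    reads A via `expression: A`, the threshold step (C) reads B via
#    `expression: B`, and the rule's `condition:` points at C. Do not rename
#    them without updating the matching references.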
|
||
|
|
# * Alert-rule provisioning YAML is intricate. These definitions are
|
||
|
|
# intentionally minimal and well-commented. After first load, open each
|
||
|
|
# rule in the Grafana UI (Alerting > Alert rules) and confirm it
|
||
|
|
# evaluates without error before depending on it for paging.
|
||
|
|
# ----------------------------------------------------------------------
|
||
|
|
|
||
|
|
apiVersion: 1
|
||
|
|
|
||
|
|
groups:
|
||
|
|
- orgId: 1
|
||
|
|
name: OpenBMP BGP Health
|
||
|
|
folder: OBMP-Base
|
||
|
|
# How often every rule in this group is evaluated.
|
||
|
|
interval: 1m
|
||
|
|
rules:
|
||
|
|
|
||
|
|
# ------------------------------------------------------------------
# (a) BGP peer down within the last 15 minutes
# ------------------------------------------------------------------
# bgp_peers.state is an enum ('up'/'down') and bgp_peers.timestamp holds
# the time of the last state change. Counting rows that are 'down' AND
# whose timestamp is inside the trailing 15-minute window therefore
# surfaces recent session losses only, not peers that have been down
# for a long time.
#
# Pipeline: A (SQL count) -> B (reduce: last) -> C (threshold: > 0).
- uid: obmp-peer-down
  title: BGP Peer Down (recent)
  # The threshold step below decides whether the rule is firing.
  condition: C
  # The condition must hold for this long before the alert fires.
  for: 5m
  labels:
    severity: critical
    service: bmp
  annotations:
    summary: One or more BGP peers went down in the last 15 minutes
    description: >
      {{ $values.B }} BGP peer(s) are in state 'down' with a state
      change within the last 15 minutes. Check the OBMP peer
      inventory and the affected routers.
  data:
    # A: number of peers currently 'down' that changed state recently.
    # The 15-minute window is applied inside the SQL itself.
    - refId: A
      datasourceUid: obmp_postgres
      relativeTimeRange:
        from: 600
        to: 0
      model:
        refId: A
        datasource:
          type: postgres
          uid: obmp_postgres
        format: table
        rawSql: >
          SELECT count(*)::float8 AS value
          FROM bgp_peers
          WHERE state = 'down'
          AND timestamp > (now() AT TIME ZONE 'utc') - interval '15 minutes';
    # B: collapse the result of A to a single number.
    - refId: B
      datasourceUid: __expr__
      model:
        refId: B
        type: reduce
        datasource:
          type: __expr__
          uid: __expr__
        expression: A
        reducer: last
    # C: fire when one or more peers went down in the last 15 min.
    - refId: C
      datasourceUid: __expr__
      model:
        refId: C
        type: threshold
        datasource:
          type: __expr__
          uid: __expr__
        expression: B
        conditions:
          - evaluator:
              type: gt
              params: [0]
|
||
|
|
|
||
|
|
# ------------------------------------------------------------------
# (b) Peer flap storm — >5 down-events for one peer in 1 hour
# ------------------------------------------------------------------
# peer_event_log records every peer state transition, so a session that
# keeps bouncing is visible here even when the peer happens to be 'up'
# at evaluation time. The inner query counts 'down' events per
# peer_hash_id over the trailing hour; the outer query keeps only the
# largest per-peer count (0 when there are no rows at all).
#
# Pipeline: A (SQL max-per-peer count) -> B (reduce: last) -> C (> 5).
- uid: obmp-peer-flap-storm
  title: BGP Peer Flap Storm
  # The threshold step below decides whether the rule is firing.
  condition: C
  # No pending period: fire on the first evaluation that breaches.
  for: 0m
  labels:
    severity: warning
    service: bmp
  annotations:
    summary: A BGP peer is flapping (more than 5 resets in the last hour)
    description: >
      At least one peer has logged {{ $values.B }} 'down' events in
      peer_event_log within the last hour. Investigate link/session
      instability on the affected peer.
  data:
    # A: the worst offender's number of 'down' events in the last hour.
    # The 1-hour window is applied inside the SQL itself.
    - refId: A
      datasourceUid: obmp_postgres
      relativeTimeRange:
        from: 3600
        to: 0
      model:
        refId: A
        datasource:
          type: postgres
          uid: obmp_postgres
        format: table
        rawSql: >
          SELECT coalesce(max(c), 0)::float8 AS value
          FROM (
            SELECT count(*) AS c
            FROM peer_event_log
            WHERE state = 'down'
              AND timestamp > (now() AT TIME ZONE 'utc') - interval '1 hour'
            GROUP BY peer_hash_id
          ) s;
    # B: collapse the result of A to a single number.
    - refId: B
      datasourceUid: __expr__
      model:
        refId: B
        type: reduce
        datasource:
          type: __expr__
          uid: __expr__
        expression: A
        reducer: last
    # C: >5 down-events for a single peer within 1h = flap storm.
    - refId: C
      datasourceUid: __expr__
      model:
        refId: C
        type: threshold
        datasource:
          type: __expr__
          uid: __expr__
        expression: B
        conditions:
          - evaluator:
              type: gt
              params: [5]
|
||
|
|
|
||
|
|
# ------------------------------------------------------------------
# (c) RPKI-invalid routes present
# ------------------------------------------------------------------
# ip_rib has no RPKI column on this schema, so validity is derived by
# joining against rpki_validator (ROA cache, refreshed by the psql-app
# RPKI cron). The query follows the RFC 6811 definitions:
#
#   * Covered: some ROA prefix contains the route prefix. The ROA's
#     max length plays NO part in this test.
#   * Matched: the route is covered by a ROA, its prefix length is
#     <= that ROA's max length, AND the origin AS is equal.
#   * Invalid: covered by at least one ROA but matched by none.
#
# The max-length check therefore appears only in the NOT EXISTS
# ("matched") subquery. Putting it in the EXISTS ("covered") subquery
# as well would hide routes that are more specific than every covering
# ROA allows — exactly the sub-prefix hijack case this rule is meant to
# catch.
#
# NOTE: rpki_validator is empty until ENABLE_RPKI=1 has run at least
# once (every ~2h). Until then the EXISTS test is false for every
# route, so this rule correctly reports 0.
#
# Pipeline: A (SQL count) -> B (reduce: last) -> C (threshold: > 0).
- uid: obmp-rpki-invalid
  title: RPKI-Invalid Routes Present
  # The threshold step below decides whether the rule is firing.
  condition: C
  # The condition must hold for this long before the alert fires.
  for: 10m
  data:
    # A: number of active (not withdrawn) routes with a known origin AS
    # that are covered by a ROA but matched by none.
    - refId: A
      relativeTimeRange: { from: 600, to: 0 }
      datasourceUid: obmp_postgres
      model:
        refId: A
        datasource: { type: postgres, uid: obmp_postgres }
        format: table
        rawSql: >
          SELECT count(*)::float8 AS value
          FROM ip_rib r
          WHERE r.iswithdrawn = false
            AND r.origin_as IS NOT NULL
            AND EXISTS (
              SELECT 1 FROM rpki_validator v
              WHERE r.prefix <<= v.prefix
            )
            AND NOT EXISTS (
              SELECT 1 FROM rpki_validator v2
              WHERE r.prefix <<= v2.prefix
                AND r.prefix_len BETWEEN masklen(v2.prefix) AND v2.prefix_len_max
                AND v2.origin_as = r.origin_as
            );
    # B: collapse the result of A to a single number.
    - refId: B
      datasourceUid: __expr__
      model:
        refId: B
        type: reduce
        datasource: { type: __expr__, uid: __expr__ }
        expression: A
        reducer: last
    # C: any RPKI-invalid route is worth surfacing. Raise the param
    # (e.g. to 10) if you expect a steady-state baseline of invalids
    # and only want to alert on spikes.
    - refId: C
      datasourceUid: __expr__
      model:
        refId: C
        type: threshold
        datasource: { type: __expr__, uid: __expr__ }
        expression: B
        conditions:
          - evaluator: { type: gt, params: [0] }
  labels:
    severity: warning
    service: routing-security
  annotations:
    summary: RPKI-invalid routes are present in the RIB
    description: >
      {{ $values.B }} route(s) in ip_rib are RPKI-invalid (a covering
      ROA exists but none matches the route's origin AS and prefix
      length). Possible mis-origination or hijack — review the RPKI
      Validation dashboard.
|
||
|
|
|
||
|
|
# ------------------------------------------------------------------
# (d) Router BMP session down
# ------------------------------------------------------------------
# routers.state is the BMP session state for each monitored router;
# 'down' means the router's BMP feed to the collector has dropped.
# The query is a plain count of such routers with no time window, so
# the alert stays active for as long as any router remains 'down'.
#
# Pipeline: A (SQL count) -> B (reduce: last) -> C (threshold: > 0).
- uid: obmp-router-bmp-down
  title: Router BMP Session Down
  # The threshold step below decides whether the rule is firing.
  condition: C
  # The condition must hold for this long before the alert fires.
  for: 5m
  labels:
    severity: critical
    service: bmp
  annotations:
    summary: One or more routers have a down BMP session
    description: >
      {{ $values.B }} router(s) are in BMP state 'down' — the
      collector is no longer receiving BMP from them. Check the
      router BMP config and reachability to the collector on port 5000.
  data:
    # A: number of routers whose BMP session is currently 'down'.
    - refId: A
      datasourceUid: obmp_postgres
      relativeTimeRange:
        from: 600
        to: 0
      model:
        refId: A
        datasource:
          type: postgres
          uid: obmp_postgres
        format: table
        rawSql: >
          SELECT count(*)::float8 AS value
          FROM routers
          WHERE state = 'down';
    # B: collapse the result of A to a single number.
    - refId: B
      datasourceUid: __expr__
      model:
        refId: B
        type: reduce
        datasource:
          type: __expr__
          uid: __expr__
        expression: A
        reducer: last
    # C: fire for any router with a down BMP session.
    - refId: C
      datasourceUid: __expr__
      model:
        refId: C
        type: threshold
        datasource:
          type: __expr__
          uid: __expr__
        expression: B
        conditions:
          - evaluator:
              type: gt
              params: [0]
|