sam b681c473c0 Add Policy Diff, fleet-wide full-table feed, and Kafka lag monitoring
Policy Diff (roadmap E2 follow-up): obmp-rib-poller pulls per-router
post-policy accepted/advertised prefix counts and route-policy bindings
over CLI+NETCONF (BMP on XRv9000 24.3.1 carries only pre-policy
Adj-RIB-In). New tables in 008_obmp_policy_diff.sql; Policy Diff
dashboard joins them against BMP ip_rib for received-vs-kept-vs-rejected.

GoBGP fleet-wide feed: GoBGP re-advertises the full Bromirski table to
both labs' core routers (CML AS65020, PROX AS65021) over eBGP; as route
reflectors the cores propagate it to every R9K client, so all 18 lab
routers carry and BMP-export a full table -- an intentional stress test
of the ingestion/storage path. cml/gobgp_peering_config.py applies and
rolls back the core-side config; gobgp/README.md documents the rollback.

Kafka lag monitoring: kafka-lag-monitor samples consumer-group lag every
30s into TimescaleDB (009_kafka_lag.sql); Kafka Ingestion Lag dashboard
gives visibility into the pipeline under churn load.

Peer Detail dashboard: the Peer selector is now router-qualified
(router -> peer) so it is unambiguous in an iBGP route-reflector mesh.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-19 12:42:25 -07:00

113 lines
9.7 KiB
JSON

{
"annotations": {"list": [{"builtIn": 1,"datasource": {"type": "datasource","uid": "grafana"},"enable": true,"hide": true,"iconColor": "rgba(0, 211, 255, 1)","name": "Annotations & Alerts","type": "dashboard"}]},
"description": "Kafka consumer-group lag for the OpenBMP ingestion path, sampled every 30s by the kafka-lag-monitor service. Use it to sanity-check ingestion under load: lag spikes during a BGP convergence storm and should drain back to ~0; the consumer member count rises when psql-app is scaled out.",
"editable": true,
"fiscalYearStartMonth": 0,
"graphTooltip": 1,
"id": null,
"links": [{"asDropdown": true,"icon": "external link","includeVars": true,"keepTime": true,"tags": ["obmp-nav"],"title": "OBMP Dashboards","type": "dashboards"}],
"liveNow": false,
"panels": [
{
"datasource": {"type": "postgres","uid": "obmp_postgres"},
"description": "Total lag summed across all topic-partitions of the selected consumer group at the latest sample.",
"fieldConfig": {"defaults": {"color": {"mode": "thresholds"},"unit": "short","thresholds": {"mode": "absolute","steps": [{"color": "green","value": null},{"color": "yellow","value": 50000},{"color": "red","value": 1000000}]}},"overrides": []},
"gridPos": {"h": 4,"w": 6,"x": 0,"y": 0},
"id": 1,
"options": {"colorMode": "background","graphMode": "area","justifyMode": "auto","orientation": "auto","reduceOptions": {"calcs": ["lastNotNull"],"fields": "","values": false},"textMode": "auto"},
"pluginVersion": "9.1.7",
"targets": [{"datasource": {"type": "postgres","uid": "obmp_postgres"},"format": "table","rawSql": "SELECT sum(lag) AS \"Total Lag\" FROM kafka_consumer_lag WHERE group_id = '$group' AND ts = (SELECT max(ts) FROM kafka_consumer_lag WHERE group_id = '$group')","refId": "A"}],
"title": "Current Total Lag","type": "stat"
},
{
"datasource": {"type": "postgres","uid": "obmp_postgres"},
"description": "Active consumer members in the group at the latest sample. Rises when psql-app is scaled out.",
"fieldConfig": {"defaults": {"color": {"mode": "thresholds"},"unit": "short","thresholds": {"mode": "absolute","steps": [{"color": "blue","value": null}]}},"overrides": []},
"gridPos": {"h": 4,"w": 6,"x": 6,"y": 0},
"id": 2,
"options": {"colorMode": "value","graphMode": "none","justifyMode": "auto","orientation": "auto","reduceOptions": {"calcs": ["lastNotNull"],"fields": "","values": false},"textMode": "auto"},
"pluginVersion": "9.1.7",
"targets": [{"datasource": {"type": "postgres","uid": "obmp_postgres"},"format": "table","rawSql": "SELECT members AS \"Consumers\" FROM kafka_consumer_members WHERE group_id = '$group' ORDER BY ts DESC LIMIT 1","refId": "A"}],
"title": "Active Consumers","type": "stat"
},
{
"datasource": {"type": "postgres","uid": "obmp_postgres"},
"description": "Number of topic-partitions tracked for the selected consumer group at the latest sample.",
"fieldConfig": {"defaults": {"color": {"mode": "thresholds"},"unit": "short","thresholds": {"mode": "absolute","steps": [{"color": "purple","value": null}]}},"overrides": []},
"gridPos": {"h": 4,"w": 6,"x": 12,"y": 0},
"id": 3,
"options": {"colorMode": "value","graphMode": "none","justifyMode": "auto","orientation": "auto","reduceOptions": {"calcs": ["lastNotNull"],"fields": "","values": false},"textMode": "auto"},
"pluginVersion": "9.1.7",
"targets": [{"datasource": {"type": "postgres","uid": "obmp_postgres"},"format": "table","rawSql": "SELECT count(*) AS \"Partitions\" FROM kafka_consumer_lag WHERE group_id = '$group' AND ts = (SELECT max(ts) FROM kafka_consumer_lag WHERE group_id = '$group')","refId": "A"}],
"title": "Partitions Monitored","type": "stat"
},
{
"datasource": {"type": "postgres","uid": "obmp_postgres"},
"description": "Highest per-sample total lag (summed across all partitions) observed in the selected time range.",
"fieldConfig": {"defaults": {"color": {"mode": "thresholds"},"unit": "short","thresholds": {"mode": "absolute","steps": [{"color": "green","value": null},{"color": "yellow","value": 50000},{"color": "red","value": 1000000}]}},"overrides": []},
"gridPos": {"h": 4,"w": 6,"x": 18,"y": 0},
"id": 4,
"options": {"colorMode": "value","graphMode": "none","justifyMode": "auto","orientation": "auto","reduceOptions": {"calcs": ["lastNotNull"],"fields": "","values": false},"textMode": "auto"},
"pluginVersion": "9.1.7",
"targets": [{"datasource": {"type": "postgres","uid": "obmp_postgres"},"format": "table","rawSql": "SELECT max(t.total) AS \"Peak Lag\" FROM (SELECT ts, sum(lag) AS total FROM kafka_consumer_lag WHERE group_id = '$group' AND $__timeFilter(ts) GROUP BY ts) t","refId": "A"}],
"title": "Peak Lag (range)","type": "stat"
},
{
"datasource": {"type": "postgres","uid": "obmp_postgres"},
"description": "Total consumer lag over time. A healthy ingestion path returns to near-zero after a burst; sustained growth means consumers cannot keep up.",
"fieldConfig": {"defaults": {"color": {"mode": "palette-classic"},"custom": {"axisPlacement": "auto","drawStyle": "line","fillOpacity": 25,"lineInterpolation": "smooth","lineWidth": 1,"showPoints": "never","spanNulls": true},"unit": "short"},"overrides": []},
"gridPos": {"h": 9,"w": 12,"x": 0,"y": 4},
"id": 5,
"options": {"legend": {"calcs": ["max","last"],"displayMode": "table","placement": "bottom","showLegend": true},"tooltip": {"mode": "multi","sort": "desc"}},
"targets": [{"datasource": {"type": "postgres","uid": "obmp_postgres"},"format": "time_series","rawSql": "SELECT ts AS time, sum(lag) AS \"Total lag\" FROM kafka_consumer_lag WHERE group_id = '$group' AND $__timeFilter(ts) GROUP BY ts ORDER BY ts","refId": "A"}],
"title": "Total Consumer Lag","type": "timeseries"
},
{
"datasource": {"type": "postgres","uid": "obmp_postgres"},
"description": "Active consumer members over time. Step changes correspond to psql-app scale events or rebalances.",
"fieldConfig": {"defaults": {"color": {"mode": "palette-classic"},"custom": {"axisPlacement": "auto","drawStyle": "line","fillOpacity": 15,"lineInterpolation": "stepAfter","lineWidth": 2,"showPoints": "never","spanNulls": true},"unit": "short"},"overrides": []},
"gridPos": {"h": 9,"w": 12,"x": 12,"y": 4},
"id": 6,
"options": {"legend": {"calcs": ["min","max"],"displayMode": "table","placement": "bottom","showLegend": true},"tooltip": {"mode": "single","sort": "none"}},
"targets": [{"datasource": {"type": "postgres","uid": "obmp_postgres"},"format": "time_series","rawSql": "SELECT ts AS time, members AS \"Consumers\" FROM kafka_consumer_members WHERE group_id = '$group' AND $__timeFilter(ts) ORDER BY ts","refId": "A"}],
"title": "Consumer Members","type": "timeseries"
},
{
"datasource": {"type": "postgres","uid": "obmp_postgres"},
"description": "Lag broken down by topic. unicast_prefix and base_attribute carry the BGP route churn and dominate during a convergence storm.",
"fieldConfig": {"defaults": {"color": {"mode": "palette-classic"},"custom": {"axisPlacement": "auto","drawStyle": "line","fillOpacity": 20,"lineInterpolation": "smooth","lineWidth": 1,"showPoints": "never","spanNulls": true,"stacking": {"group": "A","mode": "normal"}},"unit": "short"},"overrides": []},
"gridPos": {"h": 9,"w": 12,"x": 0,"y": 13},
"id": 7,
"options": {"legend": {"calcs": ["last"],"displayMode": "table","placement": "bottom","showLegend": true},"tooltip": {"mode": "multi","sort": "desc"}},
"targets": [{"datasource": {"type": "postgres","uid": "obmp_postgres"},"format": "time_series","rawSql": "SELECT ts AS time, topic AS metric, sum(lag) AS lag FROM kafka_consumer_lag WHERE group_id = '$group' AND $__timeFilter(ts) GROUP BY ts, topic ORDER BY ts","refId": "A"}],
"title": "Lag by Topic","type": "timeseries"
},
{
"datasource": {"type": "postgres","uid": "obmp_postgres"},
"description": "Per-partition lag for openbmp.parsed.unicast_prefix. A single partition whose lag runs deep while the others stay flat indicates a hot partition (skewed message keying) -- adding consumers gives that partition a dedicated thread but cannot split it.",
"fieldConfig": {"defaults": {"color": {"mode": "palette-classic"},"custom": {"axisPlacement": "auto","drawStyle": "line","fillOpacity": 10,"lineInterpolation": "smooth","lineWidth": 1,"showPoints": "never","spanNulls": true},"unit": "short"},"overrides": []},
"gridPos": {"h": 9,"w": 12,"x": 12,"y": 13},
"id": 8,
"options": {"legend": {"calcs": ["max","last"],"displayMode": "table","placement": "bottom","showLegend": true},"tooltip": {"mode": "multi","sort": "desc"}},
"targets": [{"datasource": {"type": "postgres","uid": "obmp_postgres"},"format": "time_series","rawSql": "SELECT ts AS time, 'p' || partition AS metric, lag FROM kafka_consumer_lag WHERE group_id = '$group' AND topic = 'openbmp.parsed.unicast_prefix' AND $__timeFilter(ts) ORDER BY ts","refId": "A"}],
"title": "Lag by Partition (unicast_prefix)","type": "timeseries"
}
],
"refresh": "30s",
"schemaVersion": 36,
"style": "dark",
"tags": ["obmp", "obmp-nav", "telemetry", "kafka"],
"templating": {
"list": [
{"name": "group","type": "query","label": "Consumer Group","datasource": {"type": "postgres","uid": "obmp_postgres"},"query": "SELECT DISTINCT group_id FROM kafka_consumer_members ORDER BY 1","definition": "SELECT DISTINCT group_id FROM kafka_consumer_members ORDER BY 1","refresh": 1,"includeAll": false,"multi": false,"current": {"selected": true,"text": "obmp-psql-consumer","value": "obmp-psql-consumer"},"options": [],"sort": 1,"hide": 0}
]
},
"time": {"from": "now-3h","to": "now"},
"timepicker": {},
"timezone": "",
"title": "Kafka Ingestion Lag",
"uid": "kafka-lag",
"version": 1,
"weekStart": ""
}