Policy Diff (roadmap E2 follow-up): obmp-rib-poller pulls per-router post-policy accepted/advertised prefix counts and route-policy bindings over CLI+NETCONF (BMP on XRv9000 24.3.1 carries only pre-policy Adj-RIB-In). New tables in 008_obmp_policy_diff.sql; Policy Diff dashboard joins them against BMP ip_rib for received-vs-kept-vs-rejected. GoBGP fleet-wide feed: GoBGP re-advertises the full Bromirski table to both labs' core routers (CML AS65020, PROX AS65021) over eBGP; as route reflectors the cores propagate it to every R9K client, so all 18 lab routers carry and BMP-export a full table -- an intentional stress test of the ingestion/storage path. cml/gobgp_peering_config.py applies and rolls back the core-side config; gobgp/README.md documents the rollback. Kafka lag monitoring: kafka-lag-monitor samples consumer-group lag every 30s into TimescaleDB (009_kafka_lag.sql); Kafka Ingestion Lag dashboard gives visibility into the pipeline under churn load. Peer Detail dashboard: the Peer selector is now router-qualified (router -> peer) so it is unambiguous in an iBGP route-reflector mesh. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
113 lines
9.7 KiB
JSON
113 lines
9.7 KiB
JSON
{
|
|
"annotations": {"list": [{"builtIn": 1,"datasource": {"type": "datasource","uid": "grafana"},"enable": true,"hide": true,"iconColor": "rgba(0, 211, 255, 1)","name": "Annotations & Alerts","type": "dashboard"}]},
|
|
"description": "Kafka consumer-group lag for the OpenBMP ingestion path, sampled every 30s by the kafka-lag-monitor service. Use it to sanity-check ingestion under load: lag spikes during a BGP convergence storm and should drain back to ~0; the consumer member count rises when psql-app is scaled out.",
|
|
"editable": true,
|
|
"fiscalYearStartMonth": 0,
|
|
"graphTooltip": 1,
|
|
"id": null,
|
|
"links": [{"asDropdown": true,"icon": "external link","includeVars": true,"keepTime": true,"tags": ["obmp-nav"],"title": "OBMP Dashboards","type": "dashboards"}],
|
|
"liveNow": false,
|
|
"panels": [
|
|
{
  "datasource": {"type": "postgres", "uid": "obmp_postgres"},
  "description": "Total lag across all partitions at the latest sample.",
  "fieldConfig": {
    "defaults": {
      "color": {"mode": "thresholds"},
      "unit": "short",
      "thresholds": {
        "mode": "absolute",
        "steps": [
          {"color": "green", "value": null},
          {"color": "yellow", "value": 50000},
          {"color": "red", "value": 1000000}
        ]
      }
    },
    "overrides": []
  },
  "gridPos": {"h": 4, "w": 6, "x": 0, "y": 0},
  "id": 1,
  "options": {
    "colorMode": "background",
    "graphMode": "area",
    "justifyMode": "auto",
    "orientation": "auto",
    "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false},
    "textMode": "auto"
  },
  "pluginVersion": "9.1.7",
  "targets": [
    {
      "datasource": {"type": "postgres", "uid": "obmp_postgres"},
      "format": "table",
      "rawSql": "SELECT sum(lag) AS \"Total Lag\" FROM kafka_consumer_lag WHERE group_id = ${group:sqlstring} AND ts = (SELECT max(ts) FROM kafka_consumer_lag WHERE group_id = ${group:sqlstring})",
      "refId": "A"
    }
  ],
  "title": "Current Total Lag",
  "type": "stat"
},
|
|
{
  "datasource": {"type": "postgres", "uid": "obmp_postgres"},
  "description": "Active consumer members in the group at the latest sample. Rises when psql-app is scaled out.",
  "fieldConfig": {
    "defaults": {
      "color": {"mode": "thresholds"},
      "unit": "short",
      "thresholds": {
        "mode": "absolute",
        "steps": [
          {"color": "blue", "value": null}
        ]
      }
    },
    "overrides": []
  },
  "gridPos": {"h": 4, "w": 6, "x": 6, "y": 0},
  "id": 2,
  "options": {
    "colorMode": "value",
    "graphMode": "none",
    "justifyMode": "auto",
    "orientation": "auto",
    "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false},
    "textMode": "auto"
  },
  "pluginVersion": "9.1.7",
  "targets": [
    {
      "datasource": {"type": "postgres", "uid": "obmp_postgres"},
      "format": "table",
      "rawSql": "SELECT members AS \"Consumers\" FROM kafka_consumer_members WHERE group_id = ${group:sqlstring} ORDER BY ts DESC LIMIT 1",
      "refId": "A"
    }
  ],
  "title": "Active Consumers",
  "type": "stat"
},
|
|
{
  "datasource": {"type": "postgres", "uid": "obmp_postgres"},
  "description": "Topic-partitions tracked for the group at the latest sample.",
  "fieldConfig": {
    "defaults": {
      "color": {"mode": "thresholds"},
      "unit": "short",
      "thresholds": {
        "mode": "absolute",
        "steps": [
          {"color": "purple", "value": null}
        ]
      }
    },
    "overrides": []
  },
  "gridPos": {"h": 4, "w": 6, "x": 12, "y": 0},
  "id": 3,
  "options": {
    "colorMode": "value",
    "graphMode": "none",
    "justifyMode": "auto",
    "orientation": "auto",
    "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false},
    "textMode": "auto"
  },
  "pluginVersion": "9.1.7",
  "targets": [
    {
      "datasource": {"type": "postgres", "uid": "obmp_postgres"},
      "format": "table",
      "rawSql": "SELECT count(*) AS \"Partitions\" FROM kafka_consumer_lag WHERE group_id = ${group:sqlstring} AND ts = (SELECT max(ts) FROM kafka_consumer_lag WHERE group_id = ${group:sqlstring})",
      "refId": "A"
    }
  ],
  "title": "Partitions Monitored",
  "type": "stat"
},
|
|
{
  "datasource": {"type": "postgres", "uid": "obmp_postgres"},
  "description": "Highest total lag observed in the selected time range.",
  "fieldConfig": {
    "defaults": {
      "color": {"mode": "thresholds"},
      "unit": "short",
      "thresholds": {
        "mode": "absolute",
        "steps": [
          {"color": "green", "value": null},
          {"color": "yellow", "value": 50000},
          {"color": "red", "value": 1000000}
        ]
      }
    },
    "overrides": []
  },
  "gridPos": {"h": 4, "w": 6, "x": 18, "y": 0},
  "id": 4,
  "options": {
    "colorMode": "value",
    "graphMode": "none",
    "justifyMode": "auto",
    "orientation": "auto",
    "reduceOptions": {"calcs": ["lastNotNull"], "fields": "", "values": false},
    "textMode": "auto"
  },
  "pluginVersion": "9.1.7",
  "targets": [
    {
      "datasource": {"type": "postgres", "uid": "obmp_postgres"},
      "format": "table",
      "rawSql": "SELECT max(t.total) AS \"Peak Lag\" FROM (SELECT ts, sum(lag) AS total FROM kafka_consumer_lag WHERE group_id = ${group:sqlstring} AND $__timeFilter(ts) GROUP BY ts) t",
      "refId": "A"
    }
  ],
  "title": "Peak Lag (range)",
  "type": "stat"
},
|
|
{
  "datasource": {"type": "postgres", "uid": "obmp_postgres"},
  "description": "Total consumer lag over time. A healthy ingestion path returns to near-zero after a burst; sustained growth means consumers cannot keep up.",
  "fieldConfig": {
    "defaults": {
      "color": {"mode": "palette-classic"},
      "custom": {
        "axisPlacement": "auto",
        "drawStyle": "line",
        "fillOpacity": 25,
        "lineInterpolation": "smooth",
        "lineWidth": 1,
        "showPoints": "never",
        "spanNulls": true
      },
      "unit": "short"
    },
    "overrides": []
  },
  "gridPos": {"h": 9, "w": 12, "x": 0, "y": 4},
  "id": 5,
  "options": {
    "legend": {
      "calcs": ["max", "last"],
      "displayMode": "table",
      "placement": "bottom",
      "showLegend": true
    },
    "tooltip": {"mode": "multi", "sort": "desc"}
  },
  "targets": [
    {
      "datasource": {"type": "postgres", "uid": "obmp_postgres"},
      "format": "time_series",
      "rawSql": "SELECT ts AS time, sum(lag) AS \"Total lag\" FROM kafka_consumer_lag WHERE group_id = ${group:sqlstring} AND $__timeFilter(ts) GROUP BY ts ORDER BY ts",
      "refId": "A"
    }
  ],
  "title": "Total Consumer Lag",
  "type": "timeseries"
},
|
|
{
  "datasource": {"type": "postgres", "uid": "obmp_postgres"},
  "description": "Active consumer members over time. Step changes correspond to psql-app scale events or rebalances.",
  "fieldConfig": {
    "defaults": {
      "color": {"mode": "palette-classic"},
      "custom": {
        "axisPlacement": "auto",
        "drawStyle": "line",
        "fillOpacity": 15,
        "lineInterpolation": "stepAfter",
        "lineWidth": 2,
        "showPoints": "never",
        "spanNulls": true
      },
      "unit": "short"
    },
    "overrides": []
  },
  "gridPos": {"h": 9, "w": 12, "x": 12, "y": 4},
  "id": 6,
  "options": {
    "legend": {
      "calcs": ["min", "max"],
      "displayMode": "table",
      "placement": "bottom",
      "showLegend": true
    },
    "tooltip": {"mode": "single", "sort": "none"}
  },
  "targets": [
    {
      "datasource": {"type": "postgres", "uid": "obmp_postgres"},
      "format": "time_series",
      "rawSql": "SELECT ts AS time, members AS \"Consumers\" FROM kafka_consumer_members WHERE group_id = ${group:sqlstring} AND $__timeFilter(ts) ORDER BY ts",
      "refId": "A"
    }
  ],
  "title": "Consumer Members",
  "type": "timeseries"
},
|
|
{
  "datasource": {"type": "postgres", "uid": "obmp_postgres"},
  "description": "Lag broken down by topic. unicast_prefix and base_attribute carry the BGP route churn and dominate during a convergence storm.",
  "fieldConfig": {
    "defaults": {
      "color": {"mode": "palette-classic"},
      "custom": {
        "axisPlacement": "auto",
        "drawStyle": "line",
        "fillOpacity": 20,
        "lineInterpolation": "smooth",
        "lineWidth": 1,
        "showPoints": "never",
        "spanNulls": true,
        "stacking": {"group": "A", "mode": "normal"}
      },
      "unit": "short"
    },
    "overrides": []
  },
  "gridPos": {"h": 9, "w": 12, "x": 0, "y": 13},
  "id": 7,
  "options": {
    "legend": {
      "calcs": ["last"],
      "displayMode": "table",
      "placement": "bottom",
      "showLegend": true
    },
    "tooltip": {"mode": "multi", "sort": "desc"}
  },
  "targets": [
    {
      "datasource": {"type": "postgres", "uid": "obmp_postgres"},
      "format": "time_series",
      "rawSql": "SELECT ts AS time, topic AS metric, sum(lag) AS lag FROM kafka_consumer_lag WHERE group_id = ${group:sqlstring} AND $__timeFilter(ts) GROUP BY ts, topic ORDER BY ts",
      "refId": "A"
    }
  ],
  "title": "Lag by Topic",
  "type": "timeseries"
},
|
|
{
  "datasource": {"type": "postgres", "uid": "obmp_postgres"},
  "description": "Per-partition lag for openbmp.parsed.unicast_prefix. A single deep partition that lags while others stay flat indicates a hot partition (skewed message keying) -- adding consumers gives it a dedicated thread but cannot split it.",
  "fieldConfig": {
    "defaults": {
      "color": {"mode": "palette-classic"},
      "custom": {
        "axisPlacement": "auto",
        "drawStyle": "line",
        "fillOpacity": 10,
        "lineInterpolation": "smooth",
        "lineWidth": 1,
        "showPoints": "never",
        "spanNulls": true
      },
      "unit": "short"
    },
    "overrides": []
  },
  "gridPos": {"h": 9, "w": 12, "x": 12, "y": 13},
  "id": 8,
  "options": {
    "legend": {
      "calcs": ["max", "last"],
      "displayMode": "table",
      "placement": "bottom",
      "showLegend": true
    },
    "tooltip": {"mode": "multi", "sort": "desc"}
  },
  "targets": [
    {
      "datasource": {"type": "postgres", "uid": "obmp_postgres"},
      "format": "time_series",
      "rawSql": "SELECT ts AS time, 'p' || partition AS metric, lag FROM kafka_consumer_lag WHERE group_id = ${group:sqlstring} AND topic = 'openbmp.parsed.unicast_prefix' AND $__timeFilter(ts) ORDER BY ts",
      "refId": "A"
    }
  ],
  "title": "Lag by Partition (unicast_prefix)",
  "type": "timeseries"
}
|
|
],
|
|
"refresh": "30s",
|
|
"schemaVersion": 36,
|
|
"style": "dark",
|
|
"tags": ["obmp", "obmp-nav", "telemetry", "kafka"],
|
|
"templating": {
|
|
"list": [
|
|
{
  "name": "group",
  "type": "query",
  "label": "Consumer Group",
  "datasource": {
    "type": "postgres",
    "uid": "obmp_postgres"
  },
  "query": "SELECT DISTINCT group_id FROM kafka_consumer_members ORDER BY 1",
  "definition": "SELECT DISTINCT group_id FROM kafka_consumer_members ORDER BY 1",
  "refresh": 1,
  "includeAll": false,
  "multi": false,
  "current": {
    "selected": true,
    "text": "obmp-psql-consumer",
    "value": "obmp-psql-consumer"
  },
  "options": [],
  "sort": 1,
  "hide": 0
}
|
|
]
|
|
},
|
|
"time": {"from": "now-3h","to": "now"},
|
|
"timepicker": {},
|
|
"timezone": "",
|
|
"title": "Kafka Ingestion Lag",
|
|
"uid": "kafka-lag",
|
|
"version": 1,
|
|
"weekStart": ""
|
|
}
|