obmp-churn-monitor: a decoupled fast-path BGP churn consumer. It reads openbmp.parsed.unicast_prefix with its own Kafka consumer group and only counts announcements/withdrawals per (router, peer) into churn_metrics (010_churn_metrics.sql) -- it does no relational RIB write. Storm-tested: it stayed real-time (tracked 1k->85k msg/s) while the psql-app bulk pipeline lag grew 3.8M->5.6M. The Live BGP Churn dashboard reads its churn_metrics table. tools/churn_storm.py: a programmatic churn-storm generator (it flaps GoBGP's eBGP sessions to the lab cores) for load testing. Stress-test finding: a fleet-wide full-table load from 18 routers exceeds the capacity of this 31 GiB host. The bottleneck is RAM, not CPU -- at 16 cores the host still hit load 33 because it was swap-thrashing (swap 2/2 full, <1.5 GiB free). Lag ran away 3.8M->20M+. Recourse: add host RAM for bulk throughput; rely on the fast-path consumer for visibility regardless of lag. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
99 lines
8.5 KiB
JSON
99 lines
8.5 KiB
JSON
{
  "annotations": {"list": [{"builtIn": 1,"datasource": {"type": "datasource","uid": "grafana"},"enable": true,"hide": true,"iconColor": "rgba(0, 211, 255, 1)","name": "Annotations & Alerts","type": "dashboard"}]},
  "description": "Real-time BGP churn rate from the obmp-churn-monitor fast-path consumer. This consumer reads Kafka with its own group and only counts announcements/withdrawals, so it stays current even when the main psql-app ingestion pipeline lags minutes behind during a churn storm. Use the Kafka Ingestion Lag dashboard alongside this: when lag is high, THIS dashboard is still telling you what is churning.",
  "editable": true,
  "fiscalYearStartMonth": 0,
  "graphTooltip": 1,
  "id": null,
  "links": [{"asDropdown": true,"icon": "external link","includeVars": true,"keepTime": true,"tags": ["obmp-nav"],"title": "OBMP Dashboards","type": "dashboards"}],
  "liveNow": true,
  "panels": [
    {
      "datasource": {"type": "postgres","uid": "obmp_postgres"},
      "description": "Total churn events (announcements + withdrawals) in the last minute.",
      "fieldConfig": {"defaults": {"color": {"mode": "thresholds"},"unit": "short","thresholds": {"mode": "absolute","steps": [{"color": "green","value": null},{"color": "yellow","value": 100000},{"color": "red","value": 1000000}]}},"overrides": []},
      "gridPos": {"h": 4,"w": 6,"x": 0,"y": 0},
      "id": 1,
      "options": {"colorMode": "background","graphMode": "area","justifyMode": "auto","orientation": "auto","reduceOptions": {"calcs": ["lastNotNull"],"fields": "","values": false},"textMode": "auto"},
      "pluginVersion": "9.1.7",
      "targets": [{"datasource": {"type": "postgres","uid": "obmp_postgres"},"format": "table","rawSql": "SELECT COALESCE(sum(adds + dels),0) AS \"Churn (1m)\" FROM churn_metrics WHERE ts > now() - interval '1 minute'","refId": "A"}],
      "title": "Churn Events (last min)","type": "stat"
    },
    {
      "datasource": {"type": "postgres","uid": "obmp_postgres"},
      "description": "Route announcements in the last minute.",
      "fieldConfig": {"defaults": {"color": {"mode": "thresholds"},"unit": "short","thresholds": {"mode": "absolute","steps": [{"color": "green","value": null}]}},"overrides": []},
      "gridPos": {"h": 4,"w": 6,"x": 6,"y": 0},
      "id": 2,
      "options": {"colorMode": "value","graphMode": "none","justifyMode": "auto","orientation": "auto","reduceOptions": {"calcs": ["lastNotNull"],"fields": "","values": false},"textMode": "auto"},
      "pluginVersion": "9.1.7",
      "targets": [{"datasource": {"type": "postgres","uid": "obmp_postgres"},"format": "table","rawSql": "SELECT COALESCE(sum(adds),0) AS \"Announcements\" FROM churn_metrics WHERE ts > now() - interval '1 minute'","refId": "A"}],
      "title": "Announcements (last min)","type": "stat"
    },
    {
      "datasource": {"type": "postgres","uid": "obmp_postgres"},
      "description": "Route withdrawals in the last minute.",
      "fieldConfig": {"defaults": {"color": {"mode": "thresholds"},"unit": "short","thresholds": {"mode": "absolute","steps": [{"color": "green","value": null},{"color": "orange","value": 1}]}},"overrides": []},
      "gridPos": {"h": 4,"w": 6,"x": 12,"y": 0},
      "id": 3,
      "options": {"colorMode": "value","graphMode": "none","justifyMode": "auto","orientation": "auto","reduceOptions": {"calcs": ["lastNotNull"],"fields": "","values": false},"textMode": "auto"},
      "pluginVersion": "9.1.7",
      "targets": [{"datasource": {"type": "postgres","uid": "obmp_postgres"},"format": "table","rawSql": "SELECT COALESCE(sum(dels),0) AS \"Withdrawals\" FROM churn_metrics WHERE ts > now() - interval '1 minute'","refId": "A"}],
      "title": "Withdrawals (last min)","type": "stat"
    },
    {
      "datasource": {"type": "postgres","uid": "obmp_postgres"},
      "description": "Distinct BGP sessions with churn in the last minute.",
      "fieldConfig": {"defaults": {"color": {"mode": "thresholds"},"unit": "short","thresholds": {"mode": "absolute","steps": [{"color": "purple","value": null}]}},"overrides": []},
      "gridPos": {"h": 4,"w": 6,"x": 18,"y": 0},
      "id": 4,
      "options": {"colorMode": "value","graphMode": "none","justifyMode": "auto","orientation": "auto","reduceOptions": {"calcs": ["lastNotNull"],"fields": "","values": false},"textMode": "auto"},
      "pluginVersion": "9.1.7",
      "targets": [{"datasource": {"type": "postgres","uid": "obmp_postgres"},"format": "table","rawSql": "SELECT count(*) AS \"Sessions\" FROM (SELECT router_ip, peer_ip FROM churn_metrics WHERE ts > now() - interval '1 minute' AND (adds > 0 OR dels > 0) GROUP BY router_ip, peer_ip) s","refId": "A"}],
      "title": "Churning Sessions","type": "stat"
    },
    {
      "datasource": {"type": "postgres","uid": "obmp_postgres"},
      "description": "BGP churn rate over time -- announcements vs withdrawals per minute. This stays live during a storm even while the Kafka Ingestion Lag dashboard shows the bulk pipeline backed up.",
      "fieldConfig": {"defaults": {"color": {"mode": "palette-classic"},"custom": {"axisPlacement": "auto","drawStyle": "line","fillOpacity": 25,"lineInterpolation": "smooth","lineWidth": 1,"showPoints": "never","spanNulls": true},"unit": "short"},"overrides": [{"matcher": {"id": "byName","options": "Withdrawals"},"properties": [{"id": "color","value": {"fixedColor": "red","mode": "fixed"}}]},{"matcher": {"id": "byName","options": "Announcements"},"properties": [{"id": "color","value": {"fixedColor": "green","mode": "fixed"}}]}]},
      "gridPos": {"h": 9,"w": 12,"x": 0,"y": 4},
      "id": 5,
      "options": {"legend": {"calcs": ["max","sum"],"displayMode": "table","placement": "bottom","showLegend": true},"tooltip": {"mode": "multi","sort": "desc"}},
      "targets": [{"datasource": {"type": "postgres","uid": "obmp_postgres"},"format": "time_series","rawSql": "SELECT $__timeGroupAlias(ts,'1m'), sum(adds) AS \"Announcements\", sum(dels) AS \"Withdrawals\" FROM churn_metrics WHERE $__timeFilter(ts) GROUP BY 1 ORDER BY 1","refId": "A"}],
      "title": "Churn Rate (per minute)","type": "timeseries"
    },
    {
      "datasource": {"type": "postgres","uid": "obmp_postgres"},
      "description": "Churn per minute broken down by the BMP router reporting it.",
      "fieldConfig": {"defaults": {"color": {"mode": "palette-classic"},"custom": {"axisPlacement": "auto","drawStyle": "line","fillOpacity": 20,"lineInterpolation": "smooth","lineWidth": 1,"showPoints": "never","spanNulls": true,"stacking": {"group": "A","mode": "normal"}},"unit": "short"},"overrides": []},
      "gridPos": {"h": 9,"w": 12,"x": 12,"y": 4},
      "id": 6,
      "options": {"legend": {"calcs": ["sum"],"displayMode": "table","placement": "bottom","showLegend": true},"tooltip": {"mode": "multi","sort": "desc"}},
      "targets": [{"datasource": {"type": "postgres","uid": "obmp_postgres"},"format": "time_series","rawSql": "SELECT $__timeGroupAlias(ts,'1m'), COALESCE(host(router_ip),'(unknown)') AS metric, sum(adds + dels) AS churn FROM churn_metrics WHERE $__timeFilter(ts) GROUP BY 1, router_ip ORDER BY 1","refId": "A"}],
      "title": "Churn by Router","type": "timeseries"
    },
    {
      "datasource": {"type": "postgres","uid": "obmp_postgres"},
      "description": "Busiest BGP sessions by churn over the dashboard time range.",
      "fieldConfig": {"defaults": {"custom": {"align": "auto","displayMode": "auto"}},"overrides": [{"matcher": {"id": "byName","options": "Withdraws"},"properties": [{"id": "custom.displayMode","value": "color-text"},{"id": "thresholds","value": {"mode": "absolute","steps": [{"color": "text","value": null},{"color": "orange","value": 1}]}}]}]},
      "gridPos": {"h": 9,"w": 24,"x": 0,"y": 13},
      "id": 7,
      "options": {"showHeader": true,"sortBy": [{"desc": true,"displayName": "Total Churn"}]},
      "targets": [{"datasource": {"type": "postgres","uid": "obmp_postgres"},"format": "table","rawSql": "SELECT host(router_ip) AS \"Router\", host(peer_ip) AS \"Peer\", peer_asn AS \"Peer AS\", sum(adds) AS \"Announces\", sum(dels) AS \"Withdraws\", sum(adds + dels) AS \"Total Churn\" FROM churn_metrics WHERE $__timeFilter(ts) GROUP BY router_ip, peer_ip, peer_asn ORDER BY \"Total Churn\" DESC LIMIT 20","refId": "A"}],
      "title": "Top Churning Sessions","type": "table"
    }
  ],
  "refresh": "10s",
  "schemaVersion": 36,
  "style": "dark",
  "tags": ["obmp", "obmp-nav", "telemetry", "bgp"],
  "templating": {"list": []},
  "time": {"from": "now-1h","to": "now"},
  "timepicker": {},
  "timezone": "",
  "title": "Live BGP Churn",
  "uid": "live-churn",
  "version": 1,
  "weekStart": ""
}