{ "annotations": {"list": [{"builtIn": 1,"datasource": {"type": "datasource","uid": "grafana"},"enable": true,"hide": true,"iconColor": "rgba(0, 211, 255, 1)","name": "Annotations & Alerts","type": "dashboard"}]}, "description": "Kafka consumer-group lag for the OpenBMP ingestion path, sampled every 30s by the kafka-lag-monitor service. Use it to sanity-check ingestion under load: lag spikes during a BGP convergence storm and should drain back to ~0; the consumer member count rises when psql-app is scaled out.", "editable": true, "fiscalYearStartMonth": 0, "graphTooltip": 1, "id": null, "links": [{"asDropdown": true,"icon": "external link","includeVars": true,"keepTime": true,"tags": ["obmp-nav"],"title": "OBMP Dashboards","type": "dashboards"}], "liveNow": false, "panels": [ { "datasource": {"type": "postgres","uid": "obmp_postgres"}, "description": "Total lag across all partitions at the latest sample.", "fieldConfig": {"defaults": {"color": {"mode": "thresholds"},"unit": "short","thresholds": {"mode": "absolute","steps": [{"color": "green","value": null},{"color": "yellow","value": 50000},{"color": "red","value": 1000000}]}},"overrides": []}, "gridPos": {"h": 4,"w": 6,"x": 0,"y": 0}, "id": 1, "options": {"colorMode": "background","graphMode": "area","justifyMode": "auto","orientation": "auto","reduceOptions": {"calcs": ["lastNotNull"],"fields": "","values": false},"textMode": "auto"}, "pluginVersion": "9.1.7", "targets": [{"datasource": {"type": "postgres","uid": "obmp_postgres"},"format": "table","rawSql": "SELECT sum(lag) AS \"Total Lag\" FROM kafka_consumer_lag WHERE group_id = '$group' AND ts = (SELECT max(ts) FROM kafka_consumer_lag WHERE group_id = '$group')","refId": "A"}], "title": "Current Total Lag","type": "stat" }, { "datasource": {"type": "postgres","uid": "obmp_postgres"}, "description": "Active consumer members in the group at the latest sample. Rises when psql-app is scaled out.", "fieldConfig": {"defaults": {"color": {"mode": "thresholds"},"unit": "short","thresholds": {"mode": "absolute","steps": [{"color": "blue","value": null}]}},"overrides": []}, "gridPos": {"h": 4,"w": 6,"x": 6,"y": 0}, "id": 2, "options": {"colorMode": "value","graphMode": "none","justifyMode": "auto","orientation": "auto","reduceOptions": {"calcs": ["lastNotNull"],"fields": "","values": false},"textMode": "auto"}, "pluginVersion": "9.1.7", "targets": [{"datasource": {"type": "postgres","uid": "obmp_postgres"},"format": "table","rawSql": "SELECT members AS \"Consumers\" FROM kafka_consumer_members WHERE group_id = '$group' ORDER BY ts DESC LIMIT 1","refId": "A"}], "title": "Active Consumers","type": "stat" }, { "datasource": {"type": "postgres","uid": "obmp_postgres"}, "description": "Topic-partitions tracked for the group at the latest sample.", "fieldConfig": {"defaults": {"color": {"mode": "thresholds"},"unit": "short","thresholds": {"mode": "absolute","steps": [{"color": "purple","value": null}]}},"overrides": []}, "gridPos": {"h": 4,"w": 6,"x": 12,"y": 0}, "id": 3, "options": {"colorMode": "value","graphMode": "none","justifyMode": "auto","orientation": "auto","reduceOptions": {"calcs": ["lastNotNull"],"fields": "","values": false},"textMode": "auto"}, "pluginVersion": "9.1.7", "targets": [{"datasource": {"type": "postgres","uid": "obmp_postgres"},"format": "table","rawSql": "SELECT count(*) AS \"Partitions\" FROM kafka_consumer_lag WHERE group_id = '$group' AND ts = (SELECT max(ts) FROM kafka_consumer_lag WHERE group_id = '$group')","refId": "A"}], "title": "Partitions Monitored","type": "stat" }, { "datasource": {"type": "postgres","uid": "obmp_postgres"}, "description": "Highest total lag observed in the selected time range.", "fieldConfig": {"defaults": {"color": {"mode": "thresholds"},"unit": "short","thresholds": {"mode": "absolute","steps": [{"color": "green","value": null},{"color": "yellow","value": 50000},{"color": "red","value": 1000000}]}},"overrides": []}, "gridPos": {"h": 4,"w": 6,"x": 18,"y": 0}, "id": 4, "options": {"colorMode": "value","graphMode": "none","justifyMode": "auto","orientation": "auto","reduceOptions": {"calcs": ["lastNotNull"],"fields": "","values": false},"textMode": "auto"}, "pluginVersion": "9.1.7", "targets": [{"datasource": {"type": "postgres","uid": "obmp_postgres"},"format": "table","rawSql": "SELECT max(t.total) AS \"Peak Lag\" FROM (SELECT ts, sum(lag) AS total FROM kafka_consumer_lag WHERE group_id = '$group' AND $__timeFilter(ts) GROUP BY ts) t","refId": "A"}], "title": "Peak Lag (range)","type": "stat" }, { "datasource": {"type": "postgres","uid": "obmp_postgres"}, "description": "Total consumer lag over time. A healthy ingestion path returns to near-zero after a burst; sustained growth means consumers cannot keep up.", "fieldConfig": {"defaults": {"color": {"mode": "palette-classic"},"custom": {"axisPlacement": "auto","drawStyle": "line","fillOpacity": 25,"lineInterpolation": "smooth","lineWidth": 1,"showPoints": "never","spanNulls": true},"unit": "short"},"overrides": []}, "gridPos": {"h": 12,"w": 12,"x": 0,"y": 4}, "id": 5, "options": {"legend": {"calcs": ["max","last"],"displayMode": "table","placement": "bottom","showLegend": true},"tooltip": {"mode": "multi","sort": "desc"}}, "targets": [{"datasource": {"type": "postgres","uid": "obmp_postgres"},"format": "time_series","rawSql": "SELECT ts AS time, sum(lag) AS \"Total lag\" FROM kafka_consumer_lag WHERE group_id = '$group' AND $__timeFilter(ts) GROUP BY ts ORDER BY ts","refId": "A"}], "title": "Total Consumer Lag","type": "timeseries" }, { "datasource": {"type": "postgres","uid": "obmp_postgres"}, "description": "Active consumer members over time. Step changes correspond to psql-app scale events or rebalances.", "fieldConfig": {"defaults": {"color": {"mode": "palette-classic"},"custom": {"axisPlacement": "auto","drawStyle": "line","fillOpacity": 15,"lineInterpolation": "stepAfter","lineWidth": 2,"showPoints": "never","spanNulls": true},"unit": "short"},"overrides": []}, "gridPos": {"h": 12,"w": 12,"x": 12,"y": 4}, "id": 6, "options": {"legend": {"calcs": ["min","max"],"displayMode": "table","placement": "bottom","showLegend": true},"tooltip": {"mode": "single","sort": "none"}}, "targets": [{"datasource": {"type": "postgres","uid": "obmp_postgres"},"format": "time_series","rawSql": "SELECT ts AS time, members AS \"Consumers\" FROM kafka_consumer_members WHERE group_id = '$group' AND $__timeFilter(ts) ORDER BY ts","refId": "A"}], "title": "Consumer Members","type": "timeseries" }, { "datasource": {"type": "postgres","uid": "obmp_postgres"}, "description": "Lag broken down by topic. unicast_prefix and base_attribute carry the BGP route churn and dominate during a convergence storm.", "fieldConfig": {"defaults": {"color": {"mode": "palette-classic"},"custom": {"axisPlacement": "auto","drawStyle": "line","fillOpacity": 20,"lineInterpolation": "smooth","lineWidth": 1,"showPoints": "never","spanNulls": true,"stacking": {"group": "A","mode": "normal"}},"unit": "short"},"overrides": []}, "gridPos": {"h": 12,"w": 12,"x": 0,"y": 16}, "id": 7, "options": {"legend": {"calcs": ["last"],"displayMode": "table","placement": "bottom","showLegend": true},"tooltip": {"mode": "multi","sort": "desc"}}, "targets": [{"datasource": {"type": "postgres","uid": "obmp_postgres"},"format": "time_series","rawSql": "SELECT ts AS time, topic AS metric, sum(lag) AS lag FROM kafka_consumer_lag WHERE group_id = '$group' AND $__timeFilter(ts) GROUP BY ts, topic ORDER BY ts","refId": "A"}], "title": "Lag by Topic","type": "timeseries" }, { "datasource": {"type": "postgres","uid": "obmp_postgres"}, "description": "Per-partition lag for openbmp.parsed.unicast_prefix. A single deep partition that lags while others stay flat indicates a hot partition (skewed message keying) -- adding consumers gives it a dedicated thread but cannot split it.", "fieldConfig": {"defaults": {"color": {"mode": "palette-classic"},"custom": {"axisPlacement": "auto","drawStyle": "line","fillOpacity": 10,"lineInterpolation": "smooth","lineWidth": 1,"showPoints": "never","spanNulls": true},"unit": "short"},"overrides": []}, "gridPos": {"h": 12,"w": 12,"x": 12,"y": 16}, "id": 8, "options": {"legend": {"calcs": ["max","last"],"displayMode": "table","placement": "bottom","showLegend": true},"tooltip": {"mode": "multi","sort": "desc"}}, "targets": [{"datasource": {"type": "postgres","uid": "obmp_postgres"},"format": "time_series","rawSql": "SELECT ts AS time, 'p' || partition AS metric, lag FROM kafka_consumer_lag WHERE group_id = '$group' AND topic = 'openbmp.parsed.unicast_prefix' AND $__timeFilter(ts) ORDER BY ts","refId": "A"}], "title": "Lag by Partition (unicast_prefix)","type": "timeseries" } ], "refresh": "30s", "schemaVersion": 36, "style": "dark", "tags": ["obmp", "obmp-nav", "telemetry", "kafka"], "templating": { "list": [ {"name": "group","type": "query","label": "Consumer Group","datasource": {"type": "postgres","uid": "obmp_postgres"},"query": "SELECT DISTINCT group_id FROM kafka_consumer_members ORDER BY 1","definition": "SELECT DISTINCT group_id FROM kafka_consumer_members ORDER BY 1","refresh": 1,"includeAll": false,"multi": false,"current": {"selected": true,"text": "obmp-psql-consumer","value": "obmp-psql-consumer"},"options": [],"sort": 1,"hide": 0} ] }, "time": {"from": "now-3h","to": "now"}, "timepicker": {}, "timezone": "", "title": "Kafka Ingestion Lag", "uid": "kafka-lag", "version": 1, "weekStart": "" }