Two recurring layout issues across dashboards I built this session:
1) Right-placed legend tables ate 30% of each panel width.
2) Default h:9 panels left ~40% of the viewport empty on a 1080p
display (total dashboard height ~18 grid rows vs ~30 available).
Stack Resources (Telemetry-3001/stack_resources.json):
* 3 timeseries: legend placement right -> bottom, calcs [max] -> [last,max],
added sortBy: Max desc so top consumers float to the top of the legend.
* Bumped all 4 panels h: 9 -> 14 (dashboard total 18 -> 28 rows).
Kafka Ingestion Lag and Live BGP Churn (Telemetry-3001/*):
* Bumped timeseries panels h: 9 -> 12; second-row y: 13 -> 16.
Dashboard total 22 -> 28 rows.
Policy Diff (obmp/History-1002/policy_diff.json):
* Bumped bottom-row panels h: 8 -> 11. Total 24 -> 27 rows.
Untouched (already adequate, scrollable by design, or built earlier):
evpn_rib (30 rows), global_table (38), router_diff (52), and the
Maps-1006 dashboards (already h:22-28 single panels).
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
113 lines
9.7 KiB
JSON
113 lines
9.7 KiB
JSON
{
|
|
"annotations": {"list": [{"builtIn": 1,"datasource": {"type": "datasource","uid": "grafana"},"enable": true,"hide": true,"iconColor": "rgba(0, 211, 255, 1)","name": "Annotations & Alerts","type": "dashboard"}]},
|
|
"description": "Kafka consumer-group lag for the OpenBMP ingestion path, sampled every 30s by the kafka-lag-monitor service. Use it to sanity-check ingestion under load: lag spikes during a BGP convergence storm and should drain back to ~0; the consumer member count rises when psql-app is scaled out.",
|
|
"editable": true,
|
|
"fiscalYearStartMonth": 0,
|
|
"graphTooltip": 1,
|
|
"id": null,
|
|
"links": [{"asDropdown": true,"icon": "external link","includeVars": true,"keepTime": true,"tags": ["obmp-nav"],"title": "OBMP Dashboards","type": "dashboards"}],
|
|
"liveNow": false,
|
|
"panels": [
|
|
{
|
|
"datasource": {"type": "postgres","uid": "obmp_postgres"},
|
|
"description": "Total lag across all partitions at the latest sample.",
|
|
"fieldConfig": {"defaults": {"color": {"mode": "thresholds"},"unit": "short","thresholds": {"mode": "absolute","steps": [{"color": "green","value": null},{"color": "yellow","value": 50000},{"color": "red","value": 1000000}]}},"overrides": []},
|
|
"gridPos": {"h": 4,"w": 6,"x": 0,"y": 0},
|
|
"id": 1,
|
|
"options": {"colorMode": "background","graphMode": "area","justifyMode": "auto","orientation": "auto","reduceOptions": {"calcs": ["lastNotNull"],"fields": "","values": false},"textMode": "auto"},
|
|
"pluginVersion": "9.1.7",
|
|
"targets": [{"datasource": {"type": "postgres","uid": "obmp_postgres"},"format": "table","rawSql": "SELECT sum(lag) AS \"Total Lag\" FROM kafka_consumer_lag WHERE group_id = '$group' AND ts = (SELECT max(ts) FROM kafka_consumer_lag WHERE group_id = '$group')","refId": "A"}],
|
|
"title": "Current Total Lag","type": "stat"
|
|
},
|
|
{
|
|
"datasource": {"type": "postgres","uid": "obmp_postgres"},
|
|
"description": "Active consumer members in the group at the latest sample. Rises when psql-app is scaled out.",
|
|
"fieldConfig": {"defaults": {"color": {"mode": "thresholds"},"unit": "short","thresholds": {"mode": "absolute","steps": [{"color": "blue","value": null}]}},"overrides": []},
|
|
"gridPos": {"h": 4,"w": 6,"x": 6,"y": 0},
|
|
"id": 2,
|
|
"options": {"colorMode": "value","graphMode": "none","justifyMode": "auto","orientation": "auto","reduceOptions": {"calcs": ["lastNotNull"],"fields": "","values": false},"textMode": "auto"},
|
|
"pluginVersion": "9.1.7",
|
|
"targets": [{"datasource": {"type": "postgres","uid": "obmp_postgres"},"format": "table","rawSql": "SELECT members AS \"Consumers\" FROM kafka_consumer_members WHERE group_id = '$group' ORDER BY ts DESC LIMIT 1","refId": "A"}],
|
|
"title": "Active Consumers","type": "stat"
|
|
},
|
|
{
|
|
"datasource": {"type": "postgres","uid": "obmp_postgres"},
|
|
"description": "Topic-partitions tracked for the group at the latest sample.",
|
|
"fieldConfig": {"defaults": {"color": {"mode": "thresholds"},"unit": "short","thresholds": {"mode": "absolute","steps": [{"color": "purple","value": null}]}},"overrides": []},
|
|
"gridPos": {"h": 4,"w": 6,"x": 12,"y": 0},
|
|
"id": 3,
|
|
"options": {"colorMode": "value","graphMode": "none","justifyMode": "auto","orientation": "auto","reduceOptions": {"calcs": ["lastNotNull"],"fields": "","values": false},"textMode": "auto"},
|
|
"pluginVersion": "9.1.7",
|
|
"targets": [{"datasource": {"type": "postgres","uid": "obmp_postgres"},"format": "table","rawSql": "SELECT count(*) AS \"Partitions\" FROM kafka_consumer_lag WHERE group_id = '$group' AND ts = (SELECT max(ts) FROM kafka_consumer_lag WHERE group_id = '$group')","refId": "A"}],
|
|
"title": "Partitions Monitored","type": "stat"
|
|
},
|
|
{
|
|
"datasource": {"type": "postgres","uid": "obmp_postgres"},
|
|
"description": "Highest total lag observed in the selected time range.",
|
|
"fieldConfig": {"defaults": {"color": {"mode": "thresholds"},"unit": "short","thresholds": {"mode": "absolute","steps": [{"color": "green","value": null},{"color": "yellow","value": 50000},{"color": "red","value": 1000000}]}},"overrides": []},
|
|
"gridPos": {"h": 4,"w": 6,"x": 18,"y": 0},
|
|
"id": 4,
|
|
"options": {"colorMode": "value","graphMode": "none","justifyMode": "auto","orientation": "auto","reduceOptions": {"calcs": ["lastNotNull"],"fields": "","values": false},"textMode": "auto"},
|
|
"pluginVersion": "9.1.7",
|
|
"targets": [{"datasource": {"type": "postgres","uid": "obmp_postgres"},"format": "table","rawSql": "SELECT max(t.total) AS \"Peak Lag\" FROM (SELECT ts, sum(lag) AS total FROM kafka_consumer_lag WHERE group_id = '$group' AND $__timeFilter(ts) GROUP BY ts) t","refId": "A"}],
|
|
"title": "Peak Lag (range)","type": "stat"
|
|
},
|
|
{
|
|
"datasource": {"type": "postgres","uid": "obmp_postgres"},
|
|
"description": "Total consumer lag over time. A healthy ingestion path returns to near-zero after a burst; sustained growth means consumers cannot keep up.",
|
|
"fieldConfig": {"defaults": {"color": {"mode": "palette-classic"},"custom": {"axisPlacement": "auto","drawStyle": "line","fillOpacity": 25,"lineInterpolation": "smooth","lineWidth": 1,"showPoints": "never","spanNulls": true},"unit": "short"},"overrides": []},
|
|
"gridPos": {"h": 12,"w": 12,"x": 0,"y": 4},
|
|
"id": 5,
|
|
"options": {"legend": {"calcs": ["max","last"],"displayMode": "table","placement": "bottom","showLegend": true},"tooltip": {"mode": "multi","sort": "desc"}},
|
|
"targets": [{"datasource": {"type": "postgres","uid": "obmp_postgres"},"format": "time_series","rawSql": "SELECT ts AS time, sum(lag) AS \"Total lag\" FROM kafka_consumer_lag WHERE group_id = '$group' AND $__timeFilter(ts) GROUP BY ts ORDER BY ts","refId": "A"}],
|
|
"title": "Total Consumer Lag","type": "timeseries"
|
|
},
|
|
{
|
|
"datasource": {"type": "postgres","uid": "obmp_postgres"},
|
|
"description": "Active consumer members over time. Step changes correspond to psql-app scale events or rebalances.",
|
|
"fieldConfig": {"defaults": {"color": {"mode": "palette-classic"},"custom": {"axisPlacement": "auto","drawStyle": "line","fillOpacity": 15,"lineInterpolation": "stepAfter","lineWidth": 2,"showPoints": "never","spanNulls": true},"unit": "short"},"overrides": []},
|
|
"gridPos": {"h": 12,"w": 12,"x": 12,"y": 4},
|
|
"id": 6,
|
|
"options": {"legend": {"calcs": ["min","max"],"displayMode": "table","placement": "bottom","showLegend": true},"tooltip": {"mode": "single","sort": "none"}},
|
|
"targets": [{"datasource": {"type": "postgres","uid": "obmp_postgres"},"format": "time_series","rawSql": "SELECT ts AS time, members AS \"Consumers\" FROM kafka_consumer_members WHERE group_id = '$group' AND $__timeFilter(ts) ORDER BY ts","refId": "A"}],
|
|
"title": "Consumer Members","type": "timeseries"
|
|
},
|
|
{
|
|
"datasource": {"type": "postgres","uid": "obmp_postgres"},
|
|
"description": "Lag broken down by topic. unicast_prefix and base_attribute carry the BGP route churn and dominate during a convergence storm.",
|
|
"fieldConfig": {"defaults": {"color": {"mode": "palette-classic"},"custom": {"axisPlacement": "auto","drawStyle": "line","fillOpacity": 20,"lineInterpolation": "smooth","lineWidth": 1,"showPoints": "never","spanNulls": true,"stacking": {"group": "A","mode": "normal"}},"unit": "short"},"overrides": []},
|
|
"gridPos": {"h": 12,"w": 12,"x": 0,"y": 16},
|
|
"id": 7,
|
|
"options": {"legend": {"calcs": ["last"],"displayMode": "table","placement": "bottom","showLegend": true},"tooltip": {"mode": "multi","sort": "desc"}},
|
|
"targets": [{"datasource": {"type": "postgres","uid": "obmp_postgres"},"format": "time_series","rawSql": "SELECT ts AS time, topic AS metric, sum(lag) AS lag FROM kafka_consumer_lag WHERE group_id = '$group' AND $__timeFilter(ts) GROUP BY ts, topic ORDER BY ts","refId": "A"}],
|
|
"title": "Lag by Topic","type": "timeseries"
|
|
},
|
|
{
|
|
"datasource": {"type": "postgres","uid": "obmp_postgres"},
|
|
"description": "Per-partition lag for openbmp.parsed.unicast_prefix. A single partition with deep lag while the others stay flat indicates a hot partition (skewed message keying) -- adding consumers gives it a dedicated thread but cannot split it.",
|
|
"fieldConfig": {"defaults": {"color": {"mode": "palette-classic"},"custom": {"axisPlacement": "auto","drawStyle": "line","fillOpacity": 10,"lineInterpolation": "smooth","lineWidth": 1,"showPoints": "never","spanNulls": true},"unit": "short"},"overrides": []},
|
|
"gridPos": {"h": 12,"w": 12,"x": 12,"y": 16},
|
|
"id": 8,
|
|
"options": {"legend": {"calcs": ["max","last"],"displayMode": "table","placement": "bottom","showLegend": true},"tooltip": {"mode": "multi","sort": "desc"}},
|
|
"targets": [{"datasource": {"type": "postgres","uid": "obmp_postgres"},"format": "time_series","rawSql": "SELECT ts AS time, 'p' || partition AS metric, lag FROM kafka_consumer_lag WHERE group_id = '$group' AND topic = 'openbmp.parsed.unicast_prefix' AND $__timeFilter(ts) ORDER BY ts","refId": "A"}],
|
|
"title": "Lag by Partition (unicast_prefix)","type": "timeseries"
|
|
}
|
|
],
|
|
"refresh": "30s",
|
|
"schemaVersion": 36,
|
|
"style": "dark",
|
|
"tags": ["obmp", "obmp-nav", "telemetry", "kafka"],
|
|
"templating": {
|
|
"list": [
|
|
{"name": "group","type": "query","label": "Consumer Group","datasource": {"type": "postgres","uid": "obmp_postgres"},"query": "SELECT DISTINCT group_id FROM kafka_consumer_members ORDER BY 1","definition": "SELECT DISTINCT group_id FROM kafka_consumer_members ORDER BY 1","refresh": 1,"includeAll": false,"multi": false,"current": {"selected": true,"text": "obmp-psql-consumer","value": "obmp-psql-consumer"},"options": [],"sort": 1,"hide": 0}
|
|
]
|
|
},
|
|
"time": {"from": "now-3h","to": "now"},
|
|
"timepicker": {},
|
|
"timezone": "",
|
|
"title": "Kafka Ingestion Lag",
|
|
"uid": "kafka-lag",
|
|
"version": 1,
|
|
"weekStart": ""
|
|
}
|