diff --git a/obmp-grafana/dashboards/Telemetry-3001/kafka_lag.json b/obmp-grafana/dashboards/Telemetry-3001/kafka_lag.json index be4cac1..a9ea81a 100644 --- a/obmp-grafana/dashboards/Telemetry-3001/kafka_lag.json +++ b/obmp-grafana/dashboards/Telemetry-3001/kafka_lag.json @@ -56,7 +56,7 @@ "datasource": {"type": "postgres","uid": "obmp_postgres"}, "description": "Total consumer lag over time. A healthy ingestion path returns to near-zero after a burst; sustained growth means consumers cannot keep up.", "fieldConfig": {"defaults": {"color": {"mode": "palette-classic"},"custom": {"axisPlacement": "auto","drawStyle": "line","fillOpacity": 25,"lineInterpolation": "smooth","lineWidth": 1,"showPoints": "never","spanNulls": true},"unit": "short"},"overrides": []}, - "gridPos": {"h": 9,"w": 12,"x": 0,"y": 4}, + "gridPos": {"h": 12,"w": 12,"x": 0,"y": 4}, "id": 5, "options": {"legend": {"calcs": ["max","last"],"displayMode": "table","placement": "bottom","showLegend": true},"tooltip": {"mode": "multi","sort": "desc"}}, "targets": [{"datasource": {"type": "postgres","uid": "obmp_postgres"},"format": "time_series","rawSql": "SELECT ts AS time, sum(lag) AS \"Total lag\" FROM kafka_consumer_lag WHERE group_id = '$group' AND $__timeFilter(ts) GROUP BY ts ORDER BY ts","refId": "A"}], @@ -66,7 +66,7 @@ "datasource": {"type": "postgres","uid": "obmp_postgres"}, "description": "Active consumer members over time. Step changes correspond to psql-app scale events or rebalances.", "fieldConfig": {"defaults": {"color": {"mode": "palette-classic"},"custom": {"axisPlacement": "auto","drawStyle": "line","fillOpacity": 15,"lineInterpolation": "stepAfter","lineWidth": 2,"showPoints": "never","spanNulls": true},"unit": "short"},"overrides": []}, - "gridPos": {"h": 9,"w": 12,"x": 12,"y": 4}, + "gridPos": {"h": 12,"w": 12,"x": 12,"y": 4}, "id": 6, "options": {"legend": {"calcs": ["min","max"],"displayMode": "table","placement": "bottom","showLegend": true},"tooltip": {"mode": "single","sort": "none"}}, "targets": [{"datasource": {"type": "postgres","uid": "obmp_postgres"},"format": "time_series","rawSql": "SELECT ts AS time, members AS \"Consumers\" FROM kafka_consumer_members WHERE group_id = '$group' AND $__timeFilter(ts) ORDER BY ts","refId": "A"}], @@ -76,7 +76,7 @@ "datasource": {"type": "postgres","uid": "obmp_postgres"}, "description": "Lag broken down by topic. unicast_prefix and base_attribute carry the BGP route churn and dominate during a convergence storm.", "fieldConfig": {"defaults": {"color": {"mode": "palette-classic"},"custom": {"axisPlacement": "auto","drawStyle": "line","fillOpacity": 20,"lineInterpolation": "smooth","lineWidth": 1,"showPoints": "never","spanNulls": true,"stacking": {"group": "A","mode": "normal"}},"unit": "short"},"overrides": []}, - "gridPos": {"h": 9,"w": 12,"x": 0,"y": 13}, + "gridPos": {"h": 12,"w": 12,"x": 0,"y": 16}, "id": 7, "options": {"legend": {"calcs": ["last"],"displayMode": "table","placement": "bottom","showLegend": true},"tooltip": {"mode": "multi","sort": "desc"}}, "targets": [{"datasource": {"type": "postgres","uid": "obmp_postgres"},"format": "time_series","rawSql": "SELECT ts AS time, topic AS metric, sum(lag) AS lag FROM kafka_consumer_lag WHERE group_id = '$group' AND $__timeFilter(ts) GROUP BY ts, topic ORDER BY ts","refId": "A"}], @@ -86,7 +86,7 @@ "datasource": {"type": "postgres","uid": "obmp_postgres"}, "description": "Per-partition lag for openbmp.parsed.unicast_prefix. A single deep partition that lags while others stay flat indicates a hot partition (skewed message keying) -- adding consumers gives it a dedicated thread but cannot split it.", "fieldConfig": {"defaults": {"color": {"mode": "palette-classic"},"custom": {"axisPlacement": "auto","drawStyle": "line","fillOpacity": 10,"lineInterpolation": "smooth","lineWidth": 1,"showPoints": "never","spanNulls": true},"unit": "short"},"overrides": []}, - "gridPos": {"h": 9,"w": 12,"x": 12,"y": 13}, + "gridPos": {"h": 12,"w": 12,"x": 12,"y": 16}, "id": 8, "options": {"legend": {"calcs": ["max","last"],"displayMode": "table","placement": "bottom","showLegend": true},"tooltip": {"mode": "multi","sort": "desc"}}, "targets": [{"datasource": {"type": "postgres","uid": "obmp_postgres"},"format": "time_series","rawSql": "SELECT ts AS time, 'p' || partition AS metric, lag FROM kafka_consumer_lag WHERE group_id = '$group' AND topic = 'openbmp.parsed.unicast_prefix' AND $__timeFilter(ts) ORDER BY ts","refId": "A"}], diff --git a/obmp-grafana/dashboards/Telemetry-3001/live_churn.json b/obmp-grafana/dashboards/Telemetry-3001/live_churn.json index 5fe8828..c8419e7 100644 --- a/obmp-grafana/dashboards/Telemetry-3001/live_churn.json +++ b/obmp-grafana/dashboards/Telemetry-3001/live_churn.json @@ -56,7 +56,7 @@ "datasource": {"type": "postgres","uid": "obmp_postgres"}, "description": "BGP churn rate over time -- announcements vs withdrawals per minute. This stays live during a storm even while the Kafka Ingestion Lag dashboard shows the bulk pipeline backed up.", "fieldConfig": {"defaults": {"color": {"mode": "palette-classic"},"custom": {"axisPlacement": "auto","drawStyle": "line","fillOpacity": 25,"lineInterpolation": "smooth","lineWidth": 1,"showPoints": "never","spanNulls": true},"unit": "short"},"overrides": [{"matcher": {"id": "byName","options": "Withdrawals"},"properties": [{"id": "color","value": {"fixedColor": "red","mode": "fixed"}}]},{"matcher": {"id": "byName","options": "Announcements"},"properties": [{"id": "color","value": {"fixedColor": "green","mode": "fixed"}}]}]}, - "gridPos": {"h": 9,"w": 12,"x": 0,"y": 4}, + "gridPos": {"h": 12,"w": 12,"x": 0,"y": 4}, "id": 5, "options": {"legend": {"calcs": ["max","sum"],"displayMode": "table","placement": "bottom","showLegend": true},"tooltip": {"mode": "multi","sort": "desc"}}, "targets": [{"datasource": {"type": "postgres","uid": "obmp_postgres"},"format": "time_series","rawSql": "SELECT $__timeGroupAlias(ts,'1m'), sum(adds) AS \"Announcements\", sum(dels) AS \"Withdrawals\" FROM churn_metrics WHERE $__timeFilter(ts) GROUP BY 1 ORDER BY 1","refId": "A"}], @@ -66,7 +66,7 @@ "datasource": {"type": "postgres","uid": "obmp_postgres"}, "description": "Churn per minute broken down by the BMP router reporting it.", "fieldConfig": {"defaults": {"color": {"mode": "palette-classic"},"custom": {"axisPlacement": "auto","drawStyle": "line","fillOpacity": 20,"lineInterpolation": "smooth","lineWidth": 1,"showPoints": "never","spanNulls": true,"stacking": {"group": "A","mode": "normal"}},"unit": "short"},"overrides": []}, - "gridPos": {"h": 9,"w": 12,"x": 12,"y": 4}, + "gridPos": {"h": 12,"w": 12,"x": 12,"y": 4}, "id": 6, "options": {"legend": {"calcs": ["sum"],"displayMode": "table","placement": "bottom","showLegend": true},"tooltip": {"mode": "multi","sort": "desc"}}, "targets": [{"datasource": {"type": "postgres","uid": "obmp_postgres"},"format": "time_series","rawSql": "SELECT $__timeGroupAlias(ts,'1m'), COALESCE(host(router_ip),'(unknown)') AS metric, sum(adds + dels) AS churn FROM churn_metrics WHERE $__timeFilter(ts) GROUP BY 1, router_ip ORDER BY 1","refId": "A"}], @@ -76,7 +76,7 @@ "datasource": {"type": "postgres","uid": "obmp_postgres"}, "description": "Busiest BGP sessions by churn over the dashboard time range.", "fieldConfig": {"defaults": {"custom": {"align": "auto","displayMode": "auto"}},"overrides": [{"matcher": {"id": "byName","options": "Withdraws"},"properties": [{"id": "custom.displayMode","value": "color-text"},{"id": "thresholds","value": {"mode": "absolute","steps": [{"color": "text","value": null},{"color": "orange","value": 1}]}}]}]}, - "gridPos": {"h": 9,"w": 24,"x": 0,"y": 13}, + "gridPos": {"h": 12,"w": 24,"x": 0,"y": 16}, "id": 7, "options": {"showHeader": true,"sortBy": [{"desc": true,"displayName": "Total Churn"}]}, "targets": [{"datasource": {"type": "postgres","uid": "obmp_postgres"},"format": "table","rawSql": "SELECT host(router_ip) AS \"Router\", host(peer_ip) AS \"Peer\", peer_asn AS \"Peer AS\", sum(adds) AS \"Announces\", sum(dels) AS \"Withdraws\", sum(adds + dels) AS \"Total Churn\" FROM churn_metrics WHERE $__timeFilter(ts) GROUP BY router_ip, peer_ip, peer_asn ORDER BY \"Total Churn\" DESC LIMIT 20","refId": "A"}], diff --git a/obmp-grafana/dashboards/Telemetry-3001/stack_resources.json b/obmp-grafana/dashboards/Telemetry-3001/stack_resources.json index f194e13..7c3b782 100644 --- a/obmp-grafana/dashboards/Telemetry-3001/stack_resources.json +++ b/obmp-grafana/dashboards/Telemetry-3001/stack_resources.json @@ -15,9 +15,9 @@ "defaults": {"color": {"mode": "palette-classic"},"custom": {"axisPlacement": "auto","drawStyle": "line","fillOpacity": 10,"lineInterpolation": "smooth","lineWidth": 1,"pointSize": 5,"showPoints": "never","spanNulls": false,"stacking": {"group": "A","mode": "none"}},"unit": "percent","min": 0,"thresholds": {"mode": "absolute","steps": [{"color": "green","value": null},{"color": "orange","value": 80},{"color": "red","value": 95}]}}, "overrides": [] }, - "gridPos": {"h": 9,"w": 12,"x": 0,"y": 0}, + "gridPos": {"h": 14,"w": 12,"x": 0,"y": 0}, "id": 1, - "options": {"legend": {"calcs": ["max"],"displayMode": "table","placement": "right","showLegend": true},"tooltip": {"mode": "multi","sort": "desc"}}, + "options": {"legend": {"calcs": ["last","max"],"displayMode": "table","placement": "bottom","showLegend": true,"sortBy": "Max","sortDesc": true},"tooltip": {"mode": "multi","sort": "desc"}}, "targets": [{"datasource": {"type": "influxdb","uid": "obmp_influxdb"},"query": "from(bucket: \"telemetry\")\n |> range(start: v.timeRangeStart, stop: v.timeRangeStop)\n |> filter(fn: (r) => r._measurement == \"docker_container_mem\" and r._field == \"usage_percent\")\n |> aggregateWindow(every: v.windowPeriod, fn: mean, createEmpty: false)\n |> keep(columns: [\"_time\", \"_value\", \"container_name\"])\n |> group(columns: [\"container_name\"])","refId": "A"}], "title": "Container Memory %", "type": "timeseries" @@ -29,9 +29,9 @@ "defaults": {"color": {"mode": "palette-classic"},"custom": {"axisPlacement": "auto","drawStyle": "line","fillOpacity": 10,"lineInterpolation": "smooth","lineWidth": 1,"pointSize": 5,"showPoints": "never","spanNulls": false,"stacking": {"group": "A","mode": "none"}},"unit": "percent","min": 0}, "overrides": [] }, - "gridPos": {"h": 9,"w": 12,"x": 12,"y": 0}, + "gridPos": {"h": 14,"w": 12,"x": 12,"y": 0}, "id": 2, - "options": {"legend": {"calcs": ["max"],"displayMode": "table","placement": "right","showLegend": true},"tooltip": {"mode": "multi","sort": "desc"}}, + "options": {"legend": {"calcs": ["last","max"],"displayMode": "table","placement": "bottom","showLegend": true,"sortBy": "Max","sortDesc": true},"tooltip": {"mode": "multi","sort": "desc"}}, "targets": [{"datasource": {"type": "influxdb","uid": "obmp_influxdb"},"query": "from(bucket: \"telemetry\")\n |> range(start: v.timeRangeStart, stop: v.timeRangeStop)\n |> filter(fn: (r) => r._measurement == \"docker_container_cpu\" and r._field == \"usage_percent\" and r.cpu == \"cpu-total\")\n |> aggregateWindow(every: v.windowPeriod, fn: mean, createEmpty: false)\n |> keep(columns: [\"_time\", \"_value\", \"container_name\"])\n |> group(columns: [\"container_name\"])","refId": "A"}], "title": "Container CPU %", "type": "timeseries" @@ -43,9 +43,9 @@ "defaults": {"color": {"mode": "palette-classic"},"custom": {"axisPlacement": "auto","drawStyle": "line","fillOpacity": 10,"lineInterpolation": "smooth","lineWidth": 1,"pointSize": 5,"showPoints": "never","spanNulls": false,"stacking": {"group": "A","mode": "none"}},"unit": "bytes","min": 0}, "overrides": [] }, - "gridPos": {"h": 9,"w": 12,"x": 0,"y": 9}, + "gridPos": {"h": 14,"w": 12,"x": 0,"y": 14}, "id": 3, - "options": {"legend": {"calcs": ["max"],"displayMode": "table","placement": "right","showLegend": true},"tooltip": {"mode": "multi","sort": "desc"}}, + "options": {"legend": {"calcs": ["last","max"],"displayMode": "table","placement": "bottom","showLegend": true,"sortBy": "Max","sortDesc": true},"tooltip": {"mode": "multi","sort": "desc"}}, "targets": [{"datasource": {"type": "influxdb","uid": "obmp_influxdb"},"query": "from(bucket: \"telemetry\")\n |> range(start: v.timeRangeStart, stop: v.timeRangeStop)\n |> filter(fn: (r) => r._measurement == \"docker_container_mem\" and r._field == \"usage\")\n |> aggregateWindow(every: v.windowPeriod, fn: mean, createEmpty: false)\n |> keep(columns: [\"_time\", \"_value\", \"container_name\"])\n |> group(columns: [\"container_name\"])","refId": "A"}], "title": "Container Memory Usage", "type": "timeseries" @@ -57,7 +57,7 @@ "defaults": {"custom": {"align": "auto","displayMode": "auto"},"unit": "percent"}, "overrides": [{"matcher": {"id": "byName","options": "Memory %"},"properties": [{"id": "custom.displayMode","value": "gradient-gauge"},{"id": "max","value": 100},{"id": "thresholds","value": {"mode": "absolute","steps": [{"color": "green","value": null},{"color": "orange","value": 80},{"color": "red","value": 95}]}}]}] }, - "gridPos": {"h": 9,"w": 12,"x": 12,"y": 9}, + "gridPos": {"h": 14,"w": 12,"x": 12,"y": 14}, "id": 4, "options": {"showHeader": true,"sortBy": [{"desc": true,"displayName": "Memory %"}]}, "targets": [{"datasource": {"type": "influxdb","uid": "obmp_influxdb"},"query": "from(bucket: \"telemetry\")\n |> range(start: -5m)\n |> filter(fn: (r) => r._measurement == \"docker_container_mem\" and r._field == \"usage_percent\")\n |> last()\n |> keep(columns: [\"container_name\", \"_value\"])\n |> group()\n |> rename(columns: {_value: \"Memory %\", container_name: \"Container\"})\n |> sort(columns: [\"Memory %\"], desc: true)","refId": "A"}], diff --git a/obmp-grafana/dashboards/obmp/History-1002/policy_diff.json b/obmp-grafana/dashboards/obmp/History-1002/policy_diff.json index 0342c2f..f512a34 100644 --- a/obmp-grafana/dashboards/obmp/History-1002/policy_diff.json +++ b/obmp-grafana/dashboards/obmp/History-1002/policy_diff.json @@ -66,7 +66,7 @@ "datasource": {"type": "postgres","uid": "obmp_postgres"}, "description": "Prefixes received (BMP) but not accepted into the BGP table, by router.", "fieldConfig": {"defaults": {"color": {"mode": "palette-classic"},"custom": {"lineWidth": 1,"fillOpacity": 80,"axisPlacement": "auto"}},"overrides": []}, - "gridPos": {"h": 8,"w": 12,"x": 0,"y": 16}, + "gridPos": {"h": 11,"w": 12,"x": 0,"y": 16}, "id": 6, "options": {"orientation": "horizontal","showValue": "auto","xField": "Router","legend": {"showLegend": false},"tooltip": {"mode": "single"}}, "pluginVersion": "9.1.7", @@ -77,7 +77,7 @@ "datasource": {"type": "postgres","uid": "obmp_postgres"}, "description": "Full route-policy (RPL) bodies retrieved from the routers via NETCONF. The body is what the heuristic attribution engine would parse in a later phase.", "fieldConfig": {"defaults": {"custom": {"align": "auto","displayMode": "auto"}},"overrides": []}, - "gridPos": {"h": 8,"w": 12,"x": 12,"y": 16}, + "gridPos": {"h": 11,"w": 12,"x": 12,"y": 16}, "id": 7, "options": {"showHeader": true,"sortBy": [{"desc": false,"displayName": "Router"}]}, "targets": [{"datasource": {"type": "postgres","uid": "obmp_postgres"},"format": "table","rawSql": "SELECT r.name AS \"Router\", rp.policy_name AS \"Policy\", rp.body AS \"RPL Body\", rp.retrieved_at AS \"Retrieved\" FROM route_policies rp JOIN routers r ON r.hash_id = rp.router_hash_id WHERE r.name IN ($router) ORDER BY r.name, rp.policy_name","refId": "A"}],