From 9d74940614c0c21a22697475a44dfc4823e900ad Mon Sep 17 00:00:00 2001
From: sam <info@apodacalab.com>
Date: Mon, 18 May 2026 22:03:52 -0700
Subject: [PATCH] Fix ExaBGP OOM, add container health checks and resource
 monitoring
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

RCA: the exabgp container was OOM-killed — its 512m mem_limit was far too
small for the full-table feature (900K route objects in memory). Raises the
limit to a parameterized 6g default (EXABGP_MEM_LIMIT).

Adds Docker healthchecks to 13 services (port/HTTP probes) so unhealthy
containers are visible. Adds a Telegraf docker input that collects per-
container CPU/memory/IO into InfluxDB, plus a "Stack Resources" dashboard —
so resource pressure is caught before it causes an OOM crash. telegraf runs
with an overridden entrypoint so it keeps root and can read the docker socket.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .env.example                                  |  2 +
 docker-compose.yml                            | 91 ++++++++++++++++++-
 .../Telemetry-3001/stack_resources.json       | 78 ++++++++++++++++
 telegraf/telegraf.conf                        | 11 +++
 4 files changed, 181 insertions(+), 1 deletion(-)
 create mode 100644 obmp-grafana/dashboards/Telemetry-3001/stack_resources.json

diff --git a/.env.example b/.env.example
index b9175fc..b0a5343 100644
--- a/.env.example
+++ b/.env.example
@@ -24,6 +24,8 @@ OBMP_COOKIE_DOMAIN=example.com
 PSQL_MEM_LIMIT=6g
 PSQL_APP_MEM_LIMIT=4g
 KAFKA_MEM_LIMIT=4g
+# ExaBGP — the full-table feature holds up to 900K route objects in memory.
+EXABGP_MEM_LIMIT=6g
 
 # gNMI streaming telemetry (telegraf, test profile). GNMI_ADDRESSES is a
 # quoted, comma-separated host:port list — add a router here once gNMI/grpc
diff --git a/docker-compose.yml b/docker-compose.yml
index dfeea97..8129489 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -17,6 +17,12 @@ services:
   zookeeper:
     restart: unless-stopped
     container_name: obmp-zookeeper
+    healthcheck:
+      test: ["CMD-SHELL", "bash -c 'echo > /dev/tcp/localhost/2181'"]
+      interval: 30s
+      timeout: 10s
+      retries: 3
+      start_period: 60s
     image: confluentinc/cp-zookeeper:7.1.1
     mem_limit: 1g
     volumes:
@@ -29,6 +35,12 @@ services:
   kafka:
     restart: unless-stopped
     container_name: obmp-kafka
+    healthcheck:
+      test: ["CMD-SHELL", "bash -c 'echo > /dev/tcp/localhost/9092'"]
+      interval: 30s
+      timeout: 10s
+      retries: 3
+      start_period: 90s
     image: confluentinc/cp-kafka:7.1.1
     # Raise KAFKA_MEM_LIMIT for production (full-table initial dumps are bursty).
     mem_limit: ${KAFKA_MEM_LIMIT:-4g}
@@ -87,6 +99,12 @@ services:
   grafana:
     restart: unless-stopped
     container_name: obmp-grafana
+    healthcheck:  # 127.0.0.1, not localhost: BusyBox wget can pick ::1 and not fall back to IPv4
+      test: ["CMD-SHELL", "wget -q --spider http://127.0.0.1:3000/api/health || exit 1"]
+      interval: 30s
+      timeout: 10s
+      retries: 3
+      start_period: 40s
     image: grafana/grafana:9.1.7
     mem_limit: 1g
     ports:
@@ -128,6 +146,12 @@ services:
   psql:
     restart: unless-stopped
     container_name: obmp-psql
+    healthcheck:
+      test: ["CMD-SHELL", "pg_isready -U openbmp -d openbmp"]
+      interval: 30s
+      timeout: 10s
+      retries: 3
+      start_period: 60s
     image: openbmp/postgres:2.2.1
     # Raise PSQL_MEM_LIMIT for production (see docs/production-sizing.md).
     mem_limit: ${PSQL_MEM_LIMIT:-6g}
@@ -153,6 +177,12 @@ services:
   collector:
     restart: unless-stopped
     container_name: obmp-collector
+    healthcheck:
+      test: ["CMD-SHELL", "bash -c 'echo > /dev/tcp/localhost/5000'"]
+      interval: 30s
+      timeout: 10s
+      retries: 3
+      start_period: 40s
     image: openbmp/collector:2.2.3
     mem_limit: 2g
     sysctls:
@@ -169,6 +199,8 @@ services:
   psql-app:
     restart: unless-stopped
     container_name: obmp-psql-app
+    # No healthcheck — the consumer exposes no health port; Docker's
+    # restart-on-exit covers process death.
     image: openbmp/psql-app:2.2.2
     # mem_limit must exceed the MEM (JVM heap) env below. Raise both for
     # production — see docs/production-sizing.md.
@@ -216,8 +248,16 @@ services:
   exabgp:
     restart: unless-stopped
     container_name: obmp-exabgp
+    healthcheck:
+      test: ["CMD-SHELL", "bash -c 'echo > /dev/tcp/localhost/5050'"]
+      interval: 30s
+      timeout: 10s
+      retries: 3
+      start_period: 40s
     profiles: ["test"]
-    mem_limit: 512m
+    # The full-table feature generates up to 900K route objects in memory;
+    # 512m OOM-killed it. Raise EXABGP_MEM_LIMIT in .env for larger tables.
+    mem_limit: ${EXABGP_MEM_LIMIT:-6g}
     build:
       context: ./exabgp
       dockerfile: Dockerfile
@@ -241,6 +281,12 @@ services:
   exabgp-ui:
     restart: unless-stopped
     container_name: obmp-exabgp-ui
+    healthcheck:  # 127.0.0.1 avoids localhost resolving to ::1 when the app binds IPv4 only
+      test: ["CMD-SHELL", "wget -q --spider http://127.0.0.1:5001/ || exit 1"]
+      interval: 30s
+      timeout: 10s
+      retries: 3
+      start_period: 30s
     profiles: ["test"]
     mem_limit: 256m
     build:
@@ -255,6 +301,12 @@ services:
   influxdb:
     restart: unless-stopped
     container_name: obmp-influxdb
+    healthcheck:
+      test: ["CMD-SHELL", "curl -fsS http://localhost:8086/health || exit 1"]
+      interval: 30s
+      timeout: 10s
+      retries: 3
+      start_period: 40s
     profiles: ["test"]
     image: influxdb:2.7
     mem_limit: 2g
@@ -280,6 +332,13 @@ services:
       context: ./telegraf
       dockerfile: Dockerfile
     network_mode: host
+    # Run telegraf as root and override the image entrypoint (which otherwise
+    # drops back to the telegraf user) so [[inputs.docker]] can read the
+    # Docker daemon socket (mounted read-only) for container resource metrics.
+    user: root
+    entrypoint: ["telegraf"]
+    volumes:
+      - /var/run/docker.sock:/var/run/docker.sock:ro
     depends_on:
       - influxdb
     environment:
@@ -295,6 +354,12 @@ services:
   traffic-gen:
     restart: unless-stopped
     container_name: obmp-traffic-gen
+    healthcheck:
+      test: ["CMD-SHELL", "bash -c 'echo > /dev/tcp/localhost/5051'"]
+      interval: 30s
+      timeout: 10s
+      retries: 3
+      start_period: 30s
     profiles: ["test"]
     mem_limit: 1g
     build:
@@ -312,6 +377,12 @@ services:
   traffic-gen-ui:
     restart: unless-stopped
     container_name: obmp-traffic-gen-ui
+    healthcheck:  # 127.0.0.1 avoids localhost resolving to ::1 when the app binds IPv4 only
+      test: ["CMD-SHELL", "wget -q --spider http://127.0.0.1:5002/ || exit 1"]
+      interval: 30s
+      timeout: 10s
+      retries: 3
+      start_period: 30s
     profiles: ["test"]
     mem_limit: 256m
     build:
@@ -323,6 +394,12 @@ services:
   traffic-gen-responder:
     restart: unless-stopped
     container_name: obmp-traffic-gen-responder
+    healthcheck:
+      test: ["CMD-SHELL", "bash -c 'echo > /dev/tcp/localhost/5053'"]
+      interval: 30s
+      timeout: 10s
+      retries: 3
+      start_period: 30s
     profiles: ["test"]
     mem_limit: 1g
     build:
@@ -345,6 +422,12 @@ services:
   whois:
     restart: unless-stopped
     container_name: obmp-whois
+    healthcheck:
+      test: ["CMD-SHELL", "bash -c 'echo > /dev/tcp/localhost/43'"]
+      interval: 30s
+      timeout: 10s
+      retries: 3
+      start_period: 30s
     image: openbmp/whois:2.2.0
     mem_limit: 1g
     sysctls:
@@ -378,6 +461,12 @@ services:
   portal:
     restart: unless-stopped
     container_name: obmp-portal
+    healthcheck:  # 127.0.0.1, not localhost: BusyBox wget can pick ::1 and not fall back to IPv4
+      test: ["CMD-SHELL", "wget -q --spider http://127.0.0.1:80/ || exit 1"]
+      interval: 30s
+      timeout: 10s
+      retries: 3
+      start_period: 20s
     profiles: ["auth"]
     mem_limit: 128m
     image: nginx:alpine
diff --git a/obmp-grafana/dashboards/Telemetry-3001/stack_resources.json b/obmp-grafana/dashboards/Telemetry-3001/stack_resources.json
new file mode 100644
index 0000000..f194e13
--- /dev/null
+++ b/obmp-grafana/dashboards/Telemetry-3001/stack_resources.json
@@ -0,0 +1,78 @@
+{
+  "annotations": {"list": [{"builtIn": 1,"datasource": {"type": "datasource","uid": "grafana"},"enable": true,"hide": true,"iconColor": "rgba(0, 211, 255, 1)","name": "Annotations & Alerts","type": "dashboard"}]},
+  "description": "Per-container CPU, memory, and I/O for the OpenBMP stack — collected by the Telegraf docker input. Watch memory % to catch a container approaching its mem_limit before it OOM-crashes.",
+  "editable": true,
+  "fiscalYearStartMonth": 0,
+  "graphTooltip": 1,
+  "id": null,
+  "links": [{"asDropdown": true,"icon": "external link","includeVars": true,"keepTime": true,"tags": ["obmp-nav"],"title": "OBMP Dashboards","type": "dashboards"}],
+  "liveNow": false,
+  "panels": [
+    {
+      "datasource": {"type": "influxdb","uid": "obmp_influxdb"},
+      "description": "Memory usage as a percentage of each container's mem_limit. Sustained values near 100% precede an OOM kill.",
+      "fieldConfig": {
+        "defaults": {"color": {"mode": "palette-classic"},"custom": {"axisPlacement": "auto","drawStyle": "line","fillOpacity": 10,"lineInterpolation": "smooth","lineWidth": 1,"pointSize": 5,"showPoints": "never","spanNulls": false,"stacking": {"group": "A","mode": "none"},"thresholdsStyle": {"mode": "dashed"}},"unit": "percent","min": 0,"thresholds": {"mode": "absolute","steps": [{"color": "green","value": null},{"color": "orange","value": 80},{"color": "red","value": 95}]}},
+        "overrides": []
+      },
+      "gridPos": {"h": 9,"w": 12,"x": 0,"y": 0},
+      "id": 1,
+      "options": {"legend": {"calcs": ["max"],"displayMode": "table","placement": "right","showLegend": true},"tooltip": {"mode": "multi","sort": "desc"}},
+      "targets": [{"datasource": {"type": "influxdb","uid": "obmp_influxdb"},"query": "from(bucket: \"telemetry\")\n  |> range(start: v.timeRangeStart, stop: v.timeRangeStop)\n  |> filter(fn: (r) => r._measurement == \"docker_container_mem\" and r._field == \"usage_percent\")\n  |> aggregateWindow(every: v.windowPeriod, fn: mean, createEmpty: false)\n  |> keep(columns: [\"_time\", \"_value\", \"container_name\"])\n  |> group(columns: [\"container_name\"])","refId": "A"}],
+      "title": "Container Memory %",
+      "type": "timeseries"
+    },
+    {
+      "datasource": {"type": "influxdb","uid": "obmp_influxdb"},
+      "description": "CPU usage per container (cpu-total). Can exceed 100% — that is multiple cores.",
+      "fieldConfig": {
+        "defaults": {"color": {"mode": "palette-classic"},"custom": {"axisPlacement": "auto","drawStyle": "line","fillOpacity": 10,"lineInterpolation": "smooth","lineWidth": 1,"pointSize": 5,"showPoints": "never","spanNulls": false,"stacking": {"group": "A","mode": "none"}},"unit": "percent","min": 0},
+        "overrides": []
+      },
+      "gridPos": {"h": 9,"w": 12,"x": 12,"y": 0},
+      "id": 2,
+      "options": {"legend": {"calcs": ["max"],"displayMode": "table","placement": "right","showLegend": true},"tooltip": {"mode": "multi","sort": "desc"}},
+      "targets": [{"datasource": {"type": "influxdb","uid": "obmp_influxdb"},"query": "from(bucket: \"telemetry\")\n  |> range(start: v.timeRangeStart, stop: v.timeRangeStop)\n  |> filter(fn: (r) => r._measurement == \"docker_container_cpu\" and r._field == \"usage_percent\" and r.cpu == \"cpu-total\")\n  |> aggregateWindow(every: v.windowPeriod, fn: mean, createEmpty: false)\n  |> keep(columns: [\"_time\", \"_value\", \"container_name\"])\n  |> group(columns: [\"container_name\"])","refId": "A"}],
+      "title": "Container CPU %",
+      "type": "timeseries"
+    },
+    {
+      "datasource": {"type": "influxdb","uid": "obmp_influxdb"},
+      "description": "Absolute memory usage per container.",
+      "fieldConfig": {
+        "defaults": {"color": {"mode": "palette-classic"},"custom": {"axisPlacement": "auto","drawStyle": "line","fillOpacity": 10,"lineInterpolation": "smooth","lineWidth": 1,"pointSize": 5,"showPoints": "never","spanNulls": false,"stacking": {"group": "A","mode": "none"}},"unit": "bytes","min": 0},
+        "overrides": []
+      },
+      "gridPos": {"h": 9,"w": 12,"x": 0,"y": 9},
+      "id": 3,
+      "options": {"legend": {"calcs": ["max"],"displayMode": "table","placement": "right","showLegend": true},"tooltip": {"mode": "multi","sort": "desc"}},
+      "targets": [{"datasource": {"type": "influxdb","uid": "obmp_influxdb"},"query": "from(bucket: \"telemetry\")\n  |> range(start: v.timeRangeStart, stop: v.timeRangeStop)\n  |> filter(fn: (r) => r._measurement == \"docker_container_mem\" and r._field == \"usage\")\n  |> aggregateWindow(every: v.windowPeriod, fn: mean, createEmpty: false)\n  |> keep(columns: [\"_time\", \"_value\", \"container_name\"])\n  |> group(columns: [\"container_name\"])","refId": "A"}],
+      "title": "Container Memory Usage",
+      "type": "timeseries"
+    },
+    {
+      "datasource": {"type": "influxdb","uid": "obmp_influxdb"},
+      "description": "Current memory pressure per container. Anything in orange/red is close to its mem_limit.",
+      "fieldConfig": {
+        "defaults": {"custom": {"align": "auto","displayMode": "auto"},"unit": "percent"},
+        "overrides": [{"matcher": {"id": "byName","options": "Memory %"},"properties": [{"id": "custom.displayMode","value": "gradient-gauge"},{"id": "max","value": 100},{"id": "thresholds","value": {"mode": "absolute","steps": [{"color": "green","value": null},{"color": "orange","value": 80},{"color": "red","value": 95}]}}]}]
+      },
+      "gridPos": {"h": 9,"w": 12,"x": 12,"y": 9},
+      "id": 4,
+      "options": {"showHeader": true,"sortBy": [{"desc": true,"displayName": "Memory %"}]},
+      "targets": [{"datasource": {"type": "influxdb","uid": "obmp_influxdb"},"query": "from(bucket: \"telemetry\")\n  |> range(start: -5m)\n  |> filter(fn: (r) => r._measurement == \"docker_container_mem\" and r._field == \"usage_percent\")\n  |> last()\n  |> keep(columns: [\"container_name\", \"_value\"])\n  |> group()\n  |> rename(columns: {_value: \"Memory %\", container_name: \"Container\"})\n  |> sort(columns: [\"Memory %\"], desc: true)","refId": "A"}],
+      "title": "Current Memory % by Container",
+      "type": "table"
+    }
+  ],
+  "refresh": "30s",
+  "schemaVersion": 36,
+  "style": "dark",
+  "tags": ["obmp", "obmp-nav", "telemetry", "resources"],
+  "time": {"from": "now-1h","to": "now"},
+  "timepicker": {},
+  "timezone": "browser",
+  "title": "Stack Resources",
+  "uid": "obmp-stack-resources",
+  "version": 1
+}
diff --git a/telegraf/telegraf.conf b/telegraf/telegraf.conf
index 4df7338..7c6aac3 100644
--- a/telegraf/telegraf.conf
+++ b/telegraf/telegraf.conf
@@ -53,6 +53,17 @@
     subscription_mode = "sample"
     sample_interval = "30s"
 
+## Docker container resource metrics — CPU, memory (incl. limit + %), network,
+## and block IO for every obmp-* container. Surfaces resource pressure (e.g. a
+## container approaching its mem_limit) before it OOM-crashes.
+[[inputs.docker]]
+  endpoint = "unix:///var/run/docker.sock"
+  gather_services = false
+  container_name_include = ["obmp-*"]
+  perdevice = false
+  total = true
+  timeout = "10s"
+
 ###############################################################################
 #                            OUTPUT PLUGINS                                   #
 ###############################################################################