From 9d74940614c0c21a22697475a44dfc4823e900ad Mon Sep 17 00:00:00 2001 From: sam Date: Mon, 18 May 2026 22:03:52 -0700 Subject: [PATCH] Fix ExaBGP OOM, add container health checks and resource monitoring MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit RCA: the exabgp container was OOM-killed — its 512m mem_limit was far too small for the full-table feature (900K route objects in memory). Raises the limit to a parameterized 6g default (EXABGP_MEM_LIMIT). Adds Docker healthchecks to 13 services (port/HTTP probes) so unhealthy containers are visible. Adds a Telegraf docker input that collects per-container CPU/memory/IO into InfluxDB, plus a "Stack Resources" dashboard — so resource pressure is caught before it causes an OOM crash. telegraf runs with an overridden entrypoint so it keeps root and can read the docker socket. Co-Authored-By: Claude Opus 4.7 (1M context) --- .env.example | 2 + docker-compose.yml | 91 ++++++++++++++++++- .../Telemetry-3001/stack_resources.json | 78 ++++++++++++++++ telegraf/telegraf.conf | 11 +++ 4 files changed, 181 insertions(+), 1 deletion(-) create mode 100644 obmp-grafana/dashboards/Telemetry-3001/stack_resources.json diff --git a/.env.example b/.env.example index b9175fc..b0a5343 100644 --- a/.env.example +++ b/.env.example @@ -24,6 +24,8 @@ OBMP_COOKIE_DOMAIN=example.com PSQL_MEM_LIMIT=6g PSQL_APP_MEM_LIMIT=4g KAFKA_MEM_LIMIT=4g +# ExaBGP — the full-table feature holds up to 900K route objects in memory. +EXABGP_MEM_LIMIT=6g # gNMI streaming telemetry (telegraf, test profile). 
GNMI_ADDRESSES is a # quoted, comma-separated host:port list — add a router here once gNMI/grpc diff --git a/docker-compose.yml b/docker-compose.yml index dfeea97..8129489 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -17,6 +17,12 @@ services: zookeeper: restart: unless-stopped container_name: obmp-zookeeper + healthcheck: + test: ["CMD-SHELL", "bash -c 'echo > /dev/tcp/localhost/2181'"] + interval: 30s + timeout: 10s + retries: 3 + start_period: 60s image: confluentinc/cp-zookeeper:7.1.1 mem_limit: 1g volumes: @@ -29,6 +35,12 @@ services: kafka: restart: unless-stopped container_name: obmp-kafka + healthcheck: + test: ["CMD-SHELL", "bash -c 'echo > /dev/tcp/localhost/9092'"] + interval: 30s + timeout: 10s + retries: 3 + start_period: 90s image: confluentinc/cp-kafka:7.1.1 # Raise KAFKA_MEM_LIMIT for production (full-table initial dumps are bursty). mem_limit: ${KAFKA_MEM_LIMIT:-4g} @@ -87,6 +99,12 @@ services: grafana: restart: unless-stopped container_name: obmp-grafana + healthcheck: + test: ["CMD-SHELL", "wget -q --spider http://localhost:3000/api/health || exit 1"] + interval: 30s + timeout: 10s + retries: 3 + start_period: 40s image: grafana/grafana:9.1.7 mem_limit: 1g ports: @@ -128,6 +146,12 @@ services: psql: restart: unless-stopped container_name: obmp-psql + healthcheck: + test: ["CMD-SHELL", "pg_isready -U openbmp -d openbmp"] + interval: 30s + timeout: 10s + retries: 3 + start_period: 60s image: openbmp/postgres:2.2.1 # Raise PSQL_MEM_LIMIT for production (see docs/production-sizing.md). 
mem_limit: ${PSQL_MEM_LIMIT:-6g} @@ -153,6 +177,12 @@ services: collector: restart: unless-stopped container_name: obmp-collector + healthcheck: + test: ["CMD-SHELL", "bash -c 'echo > /dev/tcp/localhost/5000'"] + interval: 30s + timeout: 10s + retries: 3 + start_period: 40s image: openbmp/collector:2.2.3 mem_limit: 2g sysctls: @@ -169,6 +199,8 @@ services: psql-app: restart: unless-stopped container_name: obmp-psql-app + # No healthcheck — the consumer exposes no health port; Docker's + # restart-on-exit covers process death. image: openbmp/psql-app:2.2.2 # mem_limit must exceed the MEM (JVM heap) env below. Raise both for # production — see docs/production-sizing.md. @@ -216,8 +248,16 @@ services: exabgp: restart: unless-stopped container_name: obmp-exabgp + healthcheck: + test: ["CMD-SHELL", "bash -c 'echo > /dev/tcp/localhost/5050'"] + interval: 30s + timeout: 10s + retries: 3 + start_period: 40s profiles: ["test"] - mem_limit: 512m + # The full-table feature generates up to 900K route objects in memory; + # 512m OOM-killed it. Raise EXABGP_MEM_LIMIT in .env for larger tables. 
+ mem_limit: ${EXABGP_MEM_LIMIT:-6g} build: context: ./exabgp dockerfile: Dockerfile @@ -241,6 +281,12 @@ services: exabgp-ui: restart: unless-stopped container_name: obmp-exabgp-ui + healthcheck: + test: ["CMD-SHELL", "wget -q --spider http://localhost:5001/ || exit 1"] + interval: 30s + timeout: 10s + retries: 3 + start_period: 30s profiles: ["test"] mem_limit: 256m build: @@ -255,6 +301,12 @@ services: influxdb: restart: unless-stopped container_name: obmp-influxdb + healthcheck: + test: ["CMD-SHELL", "curl -fsS http://localhost:8086/health || exit 1"] + interval: 30s + timeout: 10s + retries: 3 + start_period: 40s profiles: ["test"] image: influxdb:2.7 mem_limit: 2g @@ -280,6 +332,13 @@ services: context: ./telegraf dockerfile: Dockerfile network_mode: host + # Run telegraf as root and override the image entrypoint (which otherwise + # drops back to the telegraf user) so [[inputs.docker]] can read the + # Docker daemon socket for container resource metrics. + user: root + entrypoint: ["telegraf"] + volumes: + - /var/run/docker.sock:/var/run/docker.sock depends_on: - influxdb environment: @@ -295,6 +354,12 @@ services: traffic-gen: restart: unless-stopped container_name: obmp-traffic-gen + healthcheck: + test: ["CMD-SHELL", "bash -c 'echo > /dev/tcp/localhost/5051'"] + interval: 30s + timeout: 10s + retries: 3 + start_period: 30s profiles: ["test"] mem_limit: 1g build: @@ -312,6 +377,12 @@ services: traffic-gen-ui: restart: unless-stopped container_name: obmp-traffic-gen-ui + healthcheck: + test: ["CMD-SHELL", "wget -q --spider http://localhost:5002/ || exit 1"] + interval: 30s + timeout: 10s + retries: 3 + start_period: 30s profiles: ["test"] mem_limit: 256m build: @@ -323,6 +394,12 @@ services: traffic-gen-responder: restart: unless-stopped container_name: obmp-traffic-gen-responder + healthcheck: + test: ["CMD-SHELL", "bash -c 'echo > /dev/tcp/localhost/5053'"] + interval: 30s + timeout: 10s + retries: 3 + start_period: 30s profiles: ["test"] mem_limit: 1g 
build: @@ -345,6 +422,12 @@ services: whois: restart: unless-stopped container_name: obmp-whois + healthcheck: + test: ["CMD-SHELL", "bash -c 'echo > /dev/tcp/localhost/43'"] + interval: 30s + timeout: 10s + retries: 3 + start_period: 30s image: openbmp/whois:2.2.0 mem_limit: 1g sysctls: @@ -378,6 +461,12 @@ services: portal: restart: unless-stopped container_name: obmp-portal + healthcheck: + test: ["CMD-SHELL", "wget -q --spider http://localhost:80/ || exit 1"] + interval: 30s + timeout: 10s + retries: 3 + start_period: 20s profiles: ["auth"] mem_limit: 128m image: nginx:alpine diff --git a/obmp-grafana/dashboards/Telemetry-3001/stack_resources.json b/obmp-grafana/dashboards/Telemetry-3001/stack_resources.json new file mode 100644 index 0000000..f194e13 --- /dev/null +++ b/obmp-grafana/dashboards/Telemetry-3001/stack_resources.json @@ -0,0 +1,78 @@ +{ + "annotations": {"list": [{"builtIn": 1,"datasource": {"type": "datasource","uid": "grafana"},"enable": true,"hide": true,"iconColor": "rgba(0, 211, 255, 1)","name": "Annotations & Alerts","type": "dashboard"}]}, + "description": "Per-container CPU, memory, and I/O for the OpenBMP stack — collected by the Telegraf docker input. Watch memory % to catch a container approaching its mem_limit before it OOM-crashes.", + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "id": null, + "links": [{"asDropdown": true,"icon": "external link","includeVars": true,"keepTime": true,"tags": ["obmp-nav"],"title": "OBMP Dashboards","type": "dashboards"}], + "liveNow": false, + "panels": [ + { + "datasource": {"type": "influxdb","uid": "obmp_influxdb"}, + "description": "Memory usage as a percentage of each container's mem_limit. 
Sustained values near 100% precede an OOM kill.", + "fieldConfig": { + "defaults": {"color": {"mode": "palette-classic"},"custom": {"axisPlacement": "auto","drawStyle": "line","fillOpacity": 10,"lineInterpolation": "smooth","lineWidth": 1,"pointSize": 5,"showPoints": "never","spanNulls": false,"stacking": {"group": "A","mode": "none"}},"unit": "percent","min": 0,"thresholds": {"mode": "absolute","steps": [{"color": "green","value": null},{"color": "orange","value": 80},{"color": "red","value": 95}]}}, + "overrides": [] + }, + "gridPos": {"h": 9,"w": 12,"x": 0,"y": 0}, + "id": 1, + "options": {"legend": {"calcs": ["max"],"displayMode": "table","placement": "right","showLegend": true},"tooltip": {"mode": "multi","sort": "desc"}}, + "targets": [{"datasource": {"type": "influxdb","uid": "obmp_influxdb"},"query": "from(bucket: \"telemetry\")\n |> range(start: v.timeRangeStart, stop: v.timeRangeStop)\n |> filter(fn: (r) => r._measurement == \"docker_container_mem\" and r._field == \"usage_percent\")\n |> aggregateWindow(every: v.windowPeriod, fn: mean, createEmpty: false)\n |> keep(columns: [\"_time\", \"_value\", \"container_name\"])\n |> group(columns: [\"container_name\"])","refId": "A"}], + "title": "Container Memory %", + "type": "timeseries" + }, + { + "datasource": {"type": "influxdb","uid": "obmp_influxdb"}, + "description": "CPU usage per container (cpu-total). 
Can exceed 100% — that is multiple cores.", + "fieldConfig": { + "defaults": {"color": {"mode": "palette-classic"},"custom": {"axisPlacement": "auto","drawStyle": "line","fillOpacity": 10,"lineInterpolation": "smooth","lineWidth": 1,"pointSize": 5,"showPoints": "never","spanNulls": false,"stacking": {"group": "A","mode": "none"}},"unit": "percent","min": 0}, + "overrides": [] + }, + "gridPos": {"h": 9,"w": 12,"x": 12,"y": 0}, + "id": 2, + "options": {"legend": {"calcs": ["max"],"displayMode": "table","placement": "right","showLegend": true},"tooltip": {"mode": "multi","sort": "desc"}}, + "targets": [{"datasource": {"type": "influxdb","uid": "obmp_influxdb"},"query": "from(bucket: \"telemetry\")\n |> range(start: v.timeRangeStart, stop: v.timeRangeStop)\n |> filter(fn: (r) => r._measurement == \"docker_container_cpu\" and r._field == \"usage_percent\" and r.cpu == \"cpu-total\")\n |> aggregateWindow(every: v.windowPeriod, fn: mean, createEmpty: false)\n |> keep(columns: [\"_time\", \"_value\", \"container_name\"])\n |> group(columns: [\"container_name\"])","refId": "A"}], + "title": "Container CPU %", + "type": "timeseries" + }, + { + "datasource": {"type": "influxdb","uid": "obmp_influxdb"}, + "description": "Absolute memory usage per container.", + "fieldConfig": { + "defaults": {"color": {"mode": "palette-classic"},"custom": {"axisPlacement": "auto","drawStyle": "line","fillOpacity": 10,"lineInterpolation": "smooth","lineWidth": 1,"pointSize": 5,"showPoints": "never","spanNulls": false,"stacking": {"group": "A","mode": "none"}},"unit": "bytes","min": 0}, + "overrides": [] + }, + "gridPos": {"h": 9,"w": 12,"x": 0,"y": 9}, + "id": 3, + "options": {"legend": {"calcs": ["max"],"displayMode": "table","placement": "right","showLegend": true},"tooltip": {"mode": "multi","sort": "desc"}}, + "targets": [{"datasource": {"type": "influxdb","uid": "obmp_influxdb"},"query": "from(bucket: \"telemetry\")\n |> range(start: v.timeRangeStart, stop: v.timeRangeStop)\n |> filter(fn: 
(r) => r._measurement == \"docker_container_mem\" and r._field == \"usage\")\n |> aggregateWindow(every: v.windowPeriod, fn: mean, createEmpty: false)\n |> keep(columns: [\"_time\", \"_value\", \"container_name\"])\n |> group(columns: [\"container_name\"])","refId": "A"}], + "title": "Container Memory Usage", + "type": "timeseries" + }, + { + "datasource": {"type": "influxdb","uid": "obmp_influxdb"}, + "description": "Current memory pressure per container. Anything in orange/red is close to its mem_limit.", + "fieldConfig": { + "defaults": {"custom": {"align": "auto","displayMode": "auto"},"unit": "percent"}, + "overrides": [{"matcher": {"id": "byName","options": "Memory %"},"properties": [{"id": "custom.displayMode","value": "gradient-gauge"},{"id": "max","value": 100},{"id": "thresholds","value": {"mode": "absolute","steps": [{"color": "green","value": null},{"color": "orange","value": 80},{"color": "red","value": 95}]}}]}] + }, + "gridPos": {"h": 9,"w": 12,"x": 12,"y": 9}, + "id": 4, + "options": {"showHeader": true,"sortBy": [{"desc": true,"displayName": "Memory %"}]}, + "targets": [{"datasource": {"type": "influxdb","uid": "obmp_influxdb"},"query": "from(bucket: \"telemetry\")\n |> range(start: -5m)\n |> filter(fn: (r) => r._measurement == \"docker_container_mem\" and r._field == \"usage_percent\")\n |> last()\n |> keep(columns: [\"container_name\", \"_value\"])\n |> group()\n |> rename(columns: {_value: \"Memory %\", container_name: \"Container\"})\n |> sort(columns: [\"Memory %\"], desc: true)","refId": "A"}], + "title": "Current Memory % by Container", + "type": "table" + } + ], + "refresh": "30s", + "schemaVersion": 36, + "style": "dark", + "tags": ["obmp", "obmp-nav", "telemetry", "resources"], + "time": {"from": "now-1h","to": "now"}, + "timepicker": {}, + "timezone": "browser", + "title": "Stack Resources", + "uid": "obmp-stack-resources", + "version": 1 +} diff --git a/telegraf/telegraf.conf b/telegraf/telegraf.conf index 4df7338..7c6aac3 100644 --- 
a/telegraf/telegraf.conf +++ b/telegraf/telegraf.conf @@ -53,6 +53,17 @@ subscription_mode = "sample" sample_interval = "30s" +## Docker container resource metrics — CPU, memory (incl. limit + %), network, +## and block IO for every obmp-* container. Surfaces resource pressure (e.g. a +## container approaching its mem_limit) before it OOM-crashes. +[[inputs.docker]] + endpoint = "unix:///var/run/docker.sock" + gather_services = false + container_name_include = ["obmp-*"] + perdevice = false + total = true + timeout = "10s" + ############################################################################### # OUTPUT PLUGINS # ###############################################################################