Fix ExaBGP OOM, add container health checks and resource monitoring
RCA: the exabgp container was OOM-killed — its 512m mem_limit was far too small for the full-table feature (900K route objects in memory). Raises the limit to a parameterized 6g default (EXABGP_MEM_LIMIT). Adds Docker healthchecks to 14 services (port/HTTP probes) so unhealthy containers are visible. Adds a Telegraf docker input that collects per-container CPU/memory/IO into InfluxDB, plus a "Stack Resources" dashboard — so resource pressure is caught before it causes an OOM crash. telegraf runs with an overridden entrypoint so it keeps root and can read the docker socket. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
482c0cdc01
commit
9d74940614
@ -24,6 +24,8 @@ OBMP_COOKIE_DOMAIN=example.com
|
||||
PSQL_MEM_LIMIT=6g
|
||||
PSQL_APP_MEM_LIMIT=4g
|
||||
KAFKA_MEM_LIMIT=4g
|
||||
# ExaBGP — the full-table feature holds up to 900K route objects in memory.
|
||||
EXABGP_MEM_LIMIT=6g
|
||||
|
||||
# gNMI streaming telemetry (telegraf, test profile). GNMI_ADDRESSES is a
|
||||
# quoted, comma-separated host:port list — add a router here once gNMI/grpc
|
||||
|
||||
@ -17,6 +17,12 @@ services:
|
||||
zookeeper:
|
||||
restart: unless-stopped
|
||||
container_name: obmp-zookeeper
|
||||
healthcheck:
|
||||
test: ["CMD-SHELL", "bash -c 'echo > /dev/tcp/localhost/2181'"]
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
start_period: 60s
|
||||
image: confluentinc/cp-zookeeper:7.1.1
|
||||
mem_limit: 1g
|
||||
volumes:
|
||||
@ -29,6 +35,12 @@ services:
|
||||
kafka:
|
||||
restart: unless-stopped
|
||||
container_name: obmp-kafka
|
||||
healthcheck:
|
||||
test: ["CMD-SHELL", "bash -c 'echo > /dev/tcp/localhost/9092'"]
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
start_period: 90s
|
||||
image: confluentinc/cp-kafka:7.1.1
|
||||
# Raise KAFKA_MEM_LIMIT for production (full-table initial dumps are bursty).
|
||||
mem_limit: ${KAFKA_MEM_LIMIT:-4g}
|
||||
@ -87,6 +99,12 @@ services:
|
||||
grafana:
|
||||
restart: unless-stopped
|
||||
container_name: obmp-grafana
|
||||
healthcheck:
|
||||
test: ["CMD-SHELL", "wget -q --spider http://localhost:3000/api/health || exit 1"]
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
start_period: 40s
|
||||
image: grafana/grafana:9.1.7
|
||||
mem_limit: 1g
|
||||
ports:
|
||||
@ -128,6 +146,12 @@ services:
|
||||
psql:
|
||||
restart: unless-stopped
|
||||
container_name: obmp-psql
|
||||
healthcheck:
|
||||
test: ["CMD-SHELL", "pg_isready -U openbmp -d openbmp"]
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
start_period: 60s
|
||||
image: openbmp/postgres:2.2.1
|
||||
# Raise PSQL_MEM_LIMIT for production (see docs/production-sizing.md).
|
||||
mem_limit: ${PSQL_MEM_LIMIT:-6g}
|
||||
@ -153,6 +177,12 @@ services:
|
||||
collector:
|
||||
restart: unless-stopped
|
||||
container_name: obmp-collector
|
||||
healthcheck:
|
||||
test: ["CMD-SHELL", "bash -c 'echo > /dev/tcp/localhost/5000'"]
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
start_period: 40s
|
||||
image: openbmp/collector:2.2.3
|
||||
mem_limit: 2g
|
||||
sysctls:
|
||||
@ -169,6 +199,8 @@ services:
|
||||
psql-app:
|
||||
restart: unless-stopped
|
||||
container_name: obmp-psql-app
|
||||
# No healthcheck — the consumer exposes no health port; Docker's
|
||||
# restart-on-exit covers process death.
|
||||
image: openbmp/psql-app:2.2.2
|
||||
# mem_limit must exceed the MEM (JVM heap) env below. Raise both for
|
||||
# production — see docs/production-sizing.md.
|
||||
@ -216,8 +248,16 @@ services:
|
||||
exabgp:
|
||||
restart: unless-stopped
|
||||
container_name: obmp-exabgp
|
||||
healthcheck:
|
||||
test: ["CMD-SHELL", "bash -c 'echo > /dev/tcp/localhost/5050'"]
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
start_period: 40s
|
||||
profiles: ["test"]
|
||||
mem_limit: 512m
|
||||
# The full-table feature generates up to 900K route objects in memory;
|
||||
# the previous 512m limit got it OOM-killed. Raise EXABGP_MEM_LIMIT in .env for larger tables.
|
||||
mem_limit: ${EXABGP_MEM_LIMIT:-6g}
|
||||
build:
|
||||
context: ./exabgp
|
||||
dockerfile: Dockerfile
|
||||
@ -241,6 +281,12 @@ services:
|
||||
exabgp-ui:
|
||||
restart: unless-stopped
|
||||
container_name: obmp-exabgp-ui
|
||||
healthcheck:
|
||||
test: ["CMD-SHELL", "wget -q --spider http://localhost:5001/ || exit 1"]
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
start_period: 30s
|
||||
profiles: ["test"]
|
||||
mem_limit: 256m
|
||||
build:
|
||||
@ -255,6 +301,12 @@ services:
|
||||
influxdb:
|
||||
restart: unless-stopped
|
||||
container_name: obmp-influxdb
|
||||
healthcheck:
|
||||
test: ["CMD-SHELL", "curl -fsS http://localhost:8086/health || exit 1"]
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
start_period: 40s
|
||||
profiles: ["test"]
|
||||
image: influxdb:2.7
|
||||
mem_limit: 2g
|
||||
@ -280,6 +332,13 @@ services:
|
||||
context: ./telegraf
|
||||
dockerfile: Dockerfile
|
||||
network_mode: host
|
||||
# Run telegraf as root and override the image entrypoint (which otherwise
|
||||
# drops back to the telegraf user) so [[inputs.docker]] can read the
|
||||
# Docker daemon socket for container resource metrics.
|
||||
user: root
|
||||
entrypoint: ["telegraf"]
|
||||
volumes:
|
||||
- /var/run/docker.sock:/var/run/docker.sock
|
||||
depends_on:
|
||||
- influxdb
|
||||
environment:
|
||||
@ -295,6 +354,12 @@ services:
|
||||
traffic-gen:
|
||||
restart: unless-stopped
|
||||
container_name: obmp-traffic-gen
|
||||
healthcheck:
|
||||
test: ["CMD-SHELL", "bash -c 'echo > /dev/tcp/localhost/5051'"]
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
start_period: 30s
|
||||
profiles: ["test"]
|
||||
mem_limit: 1g
|
||||
build:
|
||||
@ -312,6 +377,12 @@ services:
|
||||
traffic-gen-ui:
|
||||
restart: unless-stopped
|
||||
container_name: obmp-traffic-gen-ui
|
||||
healthcheck:
|
||||
test: ["CMD-SHELL", "wget -q --spider http://localhost:5002/ || exit 1"]
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
start_period: 30s
|
||||
profiles: ["test"]
|
||||
mem_limit: 256m
|
||||
build:
|
||||
@ -323,6 +394,12 @@ services:
|
||||
traffic-gen-responder:
|
||||
restart: unless-stopped
|
||||
container_name: obmp-traffic-gen-responder
|
||||
healthcheck:
|
||||
test: ["CMD-SHELL", "bash -c 'echo > /dev/tcp/localhost/5053'"]
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
start_period: 30s
|
||||
profiles: ["test"]
|
||||
mem_limit: 1g
|
||||
build:
|
||||
@ -345,6 +422,12 @@ services:
|
||||
whois:
|
||||
restart: unless-stopped
|
||||
container_name: obmp-whois
|
||||
healthcheck:
|
||||
test: ["CMD-SHELL", "bash -c 'echo > /dev/tcp/localhost/43'"]
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
start_period: 30s
|
||||
image: openbmp/whois:2.2.0
|
||||
mem_limit: 1g
|
||||
sysctls:
|
||||
@ -378,6 +461,12 @@ services:
|
||||
portal:
|
||||
restart: unless-stopped
|
||||
container_name: obmp-portal
|
||||
healthcheck:
|
||||
test: ["CMD-SHELL", "wget -q --spider http://localhost:80/ || exit 1"]
|
||||
interval: 30s
|
||||
timeout: 10s
|
||||
retries: 3
|
||||
start_period: 20s
|
||||
profiles: ["auth"]
|
||||
mem_limit: 128m
|
||||
image: nginx:alpine
|
||||
|
||||
78
obmp-grafana/dashboards/Telemetry-3001/stack_resources.json
Normal file
78
obmp-grafana/dashboards/Telemetry-3001/stack_resources.json
Normal file
@ -0,0 +1,78 @@
|
||||
{
|
||||
"annotations": {"list": [{"builtIn": 1,"datasource": {"type": "datasource","uid": "grafana"},"enable": true,"hide": true,"iconColor": "rgba(0, 211, 255, 1)","name": "Annotations & Alerts","type": "dashboard"}]},
|
||||
"description": "Per-container CPU, memory, and I/O for the OpenBMP stack — collected by the Telegraf docker input. Watch memory % to catch a container approaching its mem_limit before it OOM-crashes.",
|
||||
"editable": true,
|
||||
"fiscalYearStartMonth": 0,
|
||||
"graphTooltip": 1,
|
||||
"id": null,
|
||||
"links": [{"asDropdown": true,"icon": "external link","includeVars": true,"keepTime": true,"tags": ["obmp-nav"],"title": "OBMP Dashboards","type": "dashboards"}],
|
||||
"liveNow": false,
|
||||
"panels": [
|
||||
{
|
||||
"datasource": {"type": "influxdb","uid": "obmp_influxdb"},
|
||||
"description": "Memory usage as a percentage of each container's mem_limit. Sustained values near 100% precede an OOM kill.",
|
||||
"fieldConfig": {
|
||||
"defaults": {"color": {"mode": "palette-classic"},"custom": {"axisPlacement": "auto","drawStyle": "line","fillOpacity": 10,"lineInterpolation": "smooth","lineWidth": 1,"pointSize": 5,"showPoints": "never","spanNulls": false,"stacking": {"group": "A","mode": "none"}},"unit": "percent","min": 0,"thresholds": {"mode": "absolute","steps": [{"color": "green","value": null},{"color": "orange","value": 80},{"color": "red","value": 95}]}},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {"h": 9,"w": 12,"x": 0,"y": 0},
|
||||
"id": 1,
|
||||
"options": {"legend": {"calcs": ["max"],"displayMode": "table","placement": "right","showLegend": true},"tooltip": {"mode": "multi","sort": "desc"}},
|
||||
"targets": [{"datasource": {"type": "influxdb","uid": "obmp_influxdb"},"query": "from(bucket: \"telemetry\")\n |> range(start: v.timeRangeStart, stop: v.timeRangeStop)\n |> filter(fn: (r) => r._measurement == \"docker_container_mem\" and r._field == \"usage_percent\")\n |> aggregateWindow(every: v.windowPeriod, fn: mean, createEmpty: false)\n |> keep(columns: [\"_time\", \"_value\", \"container_name\"])\n |> group(columns: [\"container_name\"])","refId": "A"}],
|
||||
"title": "Container Memory %",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": {"type": "influxdb","uid": "obmp_influxdb"},
|
||||
"description": "CPU usage per container (cpu-total). Can exceed 100% — that is multiple cores.",
|
||||
"fieldConfig": {
|
||||
"defaults": {"color": {"mode": "palette-classic"},"custom": {"axisPlacement": "auto","drawStyle": "line","fillOpacity": 10,"lineInterpolation": "smooth","lineWidth": 1,"pointSize": 5,"showPoints": "never","spanNulls": false,"stacking": {"group": "A","mode": "none"}},"unit": "percent","min": 0},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {"h": 9,"w": 12,"x": 12,"y": 0},
|
||||
"id": 2,
|
||||
"options": {"legend": {"calcs": ["max"],"displayMode": "table","placement": "right","showLegend": true},"tooltip": {"mode": "multi","sort": "desc"}},
|
||||
"targets": [{"datasource": {"type": "influxdb","uid": "obmp_influxdb"},"query": "from(bucket: \"telemetry\")\n |> range(start: v.timeRangeStart, stop: v.timeRangeStop)\n |> filter(fn: (r) => r._measurement == \"docker_container_cpu\" and r._field == \"usage_percent\" and r.cpu == \"cpu-total\")\n |> aggregateWindow(every: v.windowPeriod, fn: mean, createEmpty: false)\n |> keep(columns: [\"_time\", \"_value\", \"container_name\"])\n |> group(columns: [\"container_name\"])","refId": "A"}],
|
||||
"title": "Container CPU %",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": {"type": "influxdb","uid": "obmp_influxdb"},
|
||||
"description": "Absolute memory usage per container.",
|
||||
"fieldConfig": {
|
||||
"defaults": {"color": {"mode": "palette-classic"},"custom": {"axisPlacement": "auto","drawStyle": "line","fillOpacity": 10,"lineInterpolation": "smooth","lineWidth": 1,"pointSize": 5,"showPoints": "never","spanNulls": false,"stacking": {"group": "A","mode": "none"}},"unit": "bytes","min": 0},
|
||||
"overrides": []
|
||||
},
|
||||
"gridPos": {"h": 9,"w": 12,"x": 0,"y": 9},
|
||||
"id": 3,
|
||||
"options": {"legend": {"calcs": ["max"],"displayMode": "table","placement": "right","showLegend": true},"tooltip": {"mode": "multi","sort": "desc"}},
|
||||
"targets": [{"datasource": {"type": "influxdb","uid": "obmp_influxdb"},"query": "from(bucket: \"telemetry\")\n |> range(start: v.timeRangeStart, stop: v.timeRangeStop)\n |> filter(fn: (r) => r._measurement == \"docker_container_mem\" and r._field == \"usage\")\n |> aggregateWindow(every: v.windowPeriod, fn: mean, createEmpty: false)\n |> keep(columns: [\"_time\", \"_value\", \"container_name\"])\n |> group(columns: [\"container_name\"])","refId": "A"}],
|
||||
"title": "Container Memory Usage",
|
||||
"type": "timeseries"
|
||||
},
|
||||
{
|
||||
"datasource": {"type": "influxdb","uid": "obmp_influxdb"},
|
||||
"description": "Current memory pressure per container. Anything in orange/red is close to its mem_limit.",
|
||||
"fieldConfig": {
|
||||
"defaults": {"custom": {"align": "auto","displayMode": "auto"},"unit": "percent"},
|
||||
"overrides": [{"matcher": {"id": "byName","options": "Memory %"},"properties": [{"id": "custom.displayMode","value": "gradient-gauge"},{"id": "max","value": 100},{"id": "thresholds","value": {"mode": "absolute","steps": [{"color": "green","value": null},{"color": "orange","value": 80},{"color": "red","value": 95}]}}]}]
|
||||
},
|
||||
"gridPos": {"h": 9,"w": 12,"x": 12,"y": 9},
|
||||
"id": 4,
|
||||
"options": {"showHeader": true,"sortBy": [{"desc": true,"displayName": "Memory %"}]},
|
||||
"targets": [{"datasource": {"type": "influxdb","uid": "obmp_influxdb"},"query": "from(bucket: \"telemetry\")\n |> range(start: -5m)\n |> filter(fn: (r) => r._measurement == \"docker_container_mem\" and r._field == \"usage_percent\")\n |> last()\n |> keep(columns: [\"container_name\", \"_value\"])\n |> group()\n |> rename(columns: {_value: \"Memory %\", container_name: \"Container\"})\n |> sort(columns: [\"Memory %\"], desc: true)","refId": "A"}],
|
||||
"title": "Current Memory % by Container",
|
||||
"type": "table"
|
||||
}
|
||||
],
|
||||
"refresh": "30s",
|
||||
"schemaVersion": 36,
|
||||
"style": "dark",
|
||||
"tags": ["obmp", "obmp-nav", "telemetry", "resources"],
|
||||
"time": {"from": "now-1h","to": "now"},
|
||||
"timepicker": {},
|
||||
"timezone": "browser",
|
||||
"title": "Stack Resources",
|
||||
"uid": "obmp-stack-resources",
|
||||
"version": 1
|
||||
}
|
||||
@ -53,6 +53,17 @@
|
||||
subscription_mode = "sample"
|
||||
sample_interval = "30s"
|
||||
|
||||
## Docker container resource metrics — CPU, memory (incl. limit + %), network,
|
||||
## and block IO for every obmp-* container. Surfaces resource pressure (e.g. a
|
||||
## container approaching its mem_limit) before it OOM-crashes.
|
||||
[[inputs.docker]]
|
||||
endpoint = "unix:///var/run/docker.sock"
|
||||
gather_services = false
|
||||
container_name_include = ["obmp-*"]
|
||||
perdevice = false
|
||||
total = true
|
||||
timeout = "10s"
|
||||
|
||||
###############################################################################
|
||||
# OUTPUT PLUGINS #
|
||||
###############################################################################
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user