RCA: the exabgp container was OOM-killed — its 512m mem_limit was far too small for the full-table feature (900K route objects in memory). Raises the limit to a parameterized 6g default (EXABGP_MEM_LIMIT). Adds Docker healthchecks to 14 services (port/HTTP probes) so unhealthy containers are visible. Adds a Telegraf docker input that collects per-container CPU/memory/IO into InfluxDB, plus a "Stack Resources" dashboard — so resource pressure is caught before it causes an OOM crash. telegraf runs with an overridden entrypoint so it keeps root and can read the docker socket. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
76 lines
2.6 KiB
Plaintext
76 lines
2.6 KiB
Plaintext
# Telegraf Configuration for gNMI Streaming Telemetry
|
|
# Collects interface counters and data rates from IOS-XR routers, plus
# resource metrics for the obmp-* Docker containers
|
|
|
|
## Tags applied to every metric; none are defined here.
[global_tags]
|
|
|
|
## Agent-wide collection and flush behaviour shared by all plugins.
[agent]
## Default collection interval for inputs that do not set their own.
interval = "10s"
## Align collections to multiples of the interval (:00, :10, :20, ...).
round_interval = true

## Largest batch written to an output in a single request.
metric_batch_size = 1000
## Per-output cap on metrics held in memory while waiting to be written.
metric_buffer_limit = 10000

## No random delay is added before collecting or flushing.
collection_jitter = "0s"
flush_interval = "10s"
flush_jitter = "0s"

## "0s" leaves timestamp precision at the agent default.
precision = "0s"
|
|
|
|
###############################################################################
|
|
# INPUT PLUGINS #
|
|
###############################################################################
|
|
|
|
## gNMI targets — driven by environment variables so the telemetry fleet can
## scale without editing this file. Set in .env:
##   GNMI_ADDRESSES — quoted, comma-separated host:port list, e.g.
##     GNMI_ADDRESSES="10.0.0.1:57400", "10.0.0.2:57400"
##   GNMI_USERNAME / GNMI_PASSWORD — gNMI credentials (uniform across the fleet)
## Every target must have gNMI/grpc enabled and be reachable on the gRPC port.
[[inputs.gnmi]]
## GNMI_ADDRESSES is substituted without surrounding quotes, so every address
## in it must already be wrapped in double quotes to form a valid string array.
## NOTE(review): confirm the .env loader passes the inner quotes through intact.
addresses = [ ${GNMI_ADDRESSES} ]
username = "${GNMI_USERNAME}"
password = "${GNMI_PASSWORD}"

## No TLS (lab environment)
## NOTE(review): newer Telegraf releases appear to prefer `tls_enable` over
## `enable_tls` — check the deployed version before renaming.
enable_tls = false

## Use json_ietf encoding (supported by IOS-XR 24.3.1)
encoding = "json_ietf"

## Delay before redialing a target after a failure
redial = "10s"
|
|
|
|
## OpenConfig interface counters (bytes, packets, errors, discards).
## Sampled every 10s and emitted under the name "interface_counters".
[[inputs.gnmi.subscription]]
name = "interface_counters"
origin = "openconfig-interfaces"
path = "/interfaces/interface/state/counters"
subscription_mode = "sample"
sample_interval = "10s"
|
|
|
|
## OpenConfig interface state (admin/oper status, description, type).
## Sampled every 30s and emitted under the name "interface_state".
## NOTE(review): in the OpenConfig interfaces model the counters container
## sits under state, so this path likely also returns the counters already
## sampled by the interface_counters subscription — confirm the overlap is
## intended.
[[inputs.gnmi.subscription]]
name = "interface_state"
origin = "openconfig-interfaces"
path = "/interfaces/interface/state"
subscription_mode = "sample"
sample_interval = "30s"
|
|
|
|
## Docker container resource metrics — CPU, memory (incl. limit + %), network,
## and block IO for every obmp-* container. Surfaces resource pressure (e.g. a
## container approaching its mem_limit) before it OOM-crashes.
[[inputs.docker]]
## Local Docker daemon socket; Telegraf needs read access to it.
endpoint = "unix:///var/run/docker.sock"
## Swarm service metrics are not collected.
gather_services = false
## Only containers whose name matches this glob are reported.
container_name_include = ["obmp-*"]

## Report container-wide totals rather than per-device series.
## NOTE(review): recent Telegraf releases appear to deprecate `perdevice` and
## `total` in favour of `perdevice_include` / `total_include` — check the
## deployed version before migrating.
perdevice = false
total = true

## Upper bound on each Docker API call.
timeout = "10s"
|
|
|
|
###############################################################################
|
|
# OUTPUT PLUGINS #
|
|
###############################################################################
|
|
|
|
## Write all collected metrics to InfluxDB 2.x.
[[outputs.influxdb_v2]]
## NOTE(review): "localhost" is resolved from wherever Telegraf runs; inside a
## container it only reaches InfluxDB with host networking or a shared network
## namespace — confirm against the compose file.
urls = ["http://localhost:8086"]
## API token comes from the INFLUXDB_TOKEN environment variable.
token = "${INFLUXDB_TOKEN}"
organization = "openbmp"
bucket = "telemetry"
|