obmp-docker/telegraf/telegraf.conf
sam 9d74940614 Fix ExaBGP OOM, add container health checks and resource monitoring
RCA: the exabgp container was OOM-killed — its 512m mem_limit was far too
small for the full-table feature (900K route objects in memory). Raises the
limit to a parameterized 6g default (EXABGP_MEM_LIMIT).

Adds Docker healthchecks to 14 services (port/HTTP probes) so unhealthy
containers are visible. Adds a Telegraf docker input that collects per-
container CPU/memory/IO into InfluxDB, plus a "Stack Resources" dashboard —
so resource pressure is caught before it causes an OOM crash. telegraf runs
with an overridden entrypoint so it keeps root and can read the docker socket.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-18 22:03:52 -07:00

76 lines
2.6 KiB
Plaintext

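# Telegraf Configuration for gNMI Streaming Telemetry and Container Monitoring
# Collects interface counters and data rates from IOS-XR routers, plus Docker
# resource metrics (CPU, memory, network, block IO) for the obmp-* containers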
## No global tags are attached to collected metrics.
[global_tags]

## Agent-wide collection, batching, and flush settings.
[agent]
## Collection cadence for every input that does not set its own interval.
interval = "10s"
round_interval = true
collection_jitter = "0s"

## Flush cadence towards the outputs.
flush_interval = "10s"
flush_jitter = "0s"

## Batch size per write and the cap on buffered metrics (both counted in metrics).
metric_batch_size = 1000
metric_buffer_limit = 10000

precision = "0s"
###############################################################################
# INPUT PLUGINS #
###############################################################################
## gNMI targets are supplied through environment variables, so the telemetry
## fleet can grow or shrink without editing this file. Set in .env:
##   GNMI_ADDRESSES — quoted, comma-separated host:port list, e.g.
##     GNMI_ADDRESSES="10.0.0.1:57400", "10.0.0.2:57400"
##   GNMI_USERNAME / GNMI_PASSWORD — gNMI credentials (uniform across the fleet)
## Every target must have gNMI/grpc enabled and be reachable on the gRPC port.
[[inputs.gnmi]]
## The value of GNMI_ADDRESSES is placed as-is between the brackets, so it must
## itself be a comma-separated list of quoted strings.
addresses = [ ${GNMI_ADDRESSES} ]
username = "${GNMI_USERNAME}"
password = "${GNMI_PASSWORD}"

## Use json_ietf encoding (supported by IOS-XR 24.3.1)
encoding = "json_ietf"

## Delay before redialing a target after a failure
redial = "10s"

## No TLS (lab environment)
## NOTE(review): newer Telegraf releases reportedly deprecate `enable_tls` in
## favour of `tls_enable` — confirm against the Telegraf version this stack pins.
enable_tls = false

  ## OpenConfig interface counters (bytes, packets, errors, discards),
  ## sampled every 10s
  [[inputs.gnmi.subscription]]
  name = "interface_counters"
  origin = "openconfig-interfaces"
  path = "/interfaces/interface/state/counters"
  subscription_mode = "sample"
  sample_interval = "10s"

  ## OpenConfig interface state (admin/oper status, description, type),
  ## sampled every 30s
  ## NOTE(review): this path is the parent of the counters path above, so the
  ## counters may also be delivered under this subscription — confirm whether
  ## that overlap is intended.
  [[inputs.gnmi.subscription]]
  name = "interface_state"
  origin = "openconfig-interfaces"
  path = "/interfaces/interface/state"
  subscription_mode = "sample"
  sample_interval = "30s"
## Docker container resource metrics — CPU, memory (incl. limit + %), network,
## and block IO — for every container whose name matches obmp-*. The aim is to
## surface resource pressure (e.g. a container approaching its mem_limit)
## before it ends in an OOM crash.
[[inputs.docker]]
## Local Docker daemon socket; telegraf must be able to read it.
endpoint = "unix:///var/run/docker.sock"
timeout = "10s"

## Restrict collection to the stack's own containers; service-level
## gathering is switched off.
container_name_include = ["obmp-*"]
gather_services = false

## Request aggregated totals rather than per-device series.
## NOTE(review): Telegraf also has `perdevice_include` / `total_include` lists
## whose defaults may still apply — check the plugin docs if per-device series
## appear unexpectedly.
perdevice = false
total = true
###############################################################################
# OUTPUT PLUGINS #
###############################################################################
## Send all collected metrics to the "telemetry" bucket of the "openbmp"
## organization in InfluxDB v2. The API token comes from INFLUXDB_TOKEN.
## NOTE(review): "localhost" only reaches InfluxDB if telegraf shares its
## network namespace (e.g. host networking) — confirm against the compose file.
[[outputs.influxdb_v2]]
urls = ["http://localhost:8086"]
organization = "openbmp"
bucket = "telemetry"
token = "${INFLUXDB_TOKEN}"