"description":"Kafka consumer-group lag for the OpenBMP ingestion path, sampled every 30s by the kafka-lag-monitor service. Use it to sanity-check ingestion under load: lag spikes during a BGP convergence storm and should drain back to ~0; the consumer member count rises when psql-app is scaled out.",
"targets":[{"datasource":{"type":"postgres","uid":"obmp_postgres"},"format":"table","rawSql":"SELECT sum(lag) AS \"Total Lag\" FROM kafka_consumer_lag WHERE group_id = '$group' AND ts = (SELECT max(ts) FROM kafka_consumer_lag WHERE group_id = '$group')","refId":"A"}],
"targets":[{"datasource":{"type":"postgres","uid":"obmp_postgres"},"format":"table","rawSql":"SELECT members AS \"Consumers\" FROM kafka_consumer_members WHERE group_id = '$group' ORDER BY ts DESC LIMIT 1","refId":"A"}],
"targets":[{"datasource":{"type":"postgres","uid":"obmp_postgres"},"format":"table","rawSql":"SELECT count(*) AS \"Partitions\" FROM kafka_consumer_lag WHERE group_id = '$group' AND ts = (SELECT max(ts) FROM kafka_consumer_lag WHERE group_id = '$group')","refId":"A"}],
"targets":[{"datasource":{"type":"postgres","uid":"obmp_postgres"},"format":"table","rawSql":"SELECT max(t.total) AS \"Peak Lag\" FROM (SELECT ts, sum(lag) AS total FROM kafka_consumer_lag WHERE group_id = '$group' AND $__timeFilter(ts) GROUP BY ts) t","refId":"A"}],
"description":"Total consumer lag over time. A healthy ingestion path returns to near-zero after a burst; sustained growth means consumers cannot keep up.",
"targets":[{"datasource":{"type":"postgres","uid":"obmp_postgres"},"format":"time_series","rawSql":"SELECT ts AS time, sum(lag) AS \"Total lag\" FROM kafka_consumer_lag WHERE group_id = '$group' AND $__timeFilter(ts) GROUP BY ts ORDER BY ts","refId":"A"}],
"targets":[{"datasource":{"type":"postgres","uid":"obmp_postgres"},"format":"time_series","rawSql":"SELECT ts AS time, members AS \"Consumers\" FROM kafka_consumer_members WHERE group_id = '$group' AND $__timeFilter(ts) ORDER BY ts","refId":"A"}],
"targets":[{"datasource":{"type":"postgres","uid":"obmp_postgres"},"format":"time_series","rawSql":"SELECT ts AS time, topic AS metric, sum(lag) AS lag FROM kafka_consumer_lag WHERE group_id = '$group' AND $__timeFilter(ts) GROUP BY ts, topic ORDER BY ts","refId":"A"}],
"description":"Per-partition lag for openbmp.parsed.unicast_prefix. A single deep partition that lags while others stay flat indicates a hot partition (skewed message keying) -- adding consumers gives it a dedicated thread but cannot split it.",
"targets":[{"datasource":{"type":"postgres","uid":"obmp_postgres"},"format":"time_series","rawSql":"SELECT ts AS time, 'p' || partition AS metric, lag FROM kafka_consumer_lag WHERE group_id = '$group' AND topic = 'openbmp.parsed.unicast_prefix' AND $__timeFilter(ts) ORDER BY ts","refId":"A"}],
"title":"Lag by Partition (unicast_prefix)","type":"timeseries"
}
],
"refresh":"30s",
"schemaVersion":36,
"style":"dark",
"tags":["obmp","obmp-nav","telemetry","kafka"],
"templating":{
"list":[
{"name":"group","type":"query","label":"Consumer Group","datasource":{"type":"postgres","uid":"obmp_postgres"},"query":"SELECT DISTINCT group_id FROM kafka_consumer_members ORDER BY 1","definition":"SELECT DISTINCT group_id FROM kafka_consumer_members ORDER BY 1","refresh":1,"includeAll":false,"multi":false,"current":{"selected":true,"text":"obmp-psql-consumer","value":"obmp-psql-consumer"},"options":[],"sort":1,"hide":0}