obmp-docker/tools/churn_storm.py

#!/usr/bin/env python3
"""churn_storm.py -- programmatic BGP churn-storm generator for load testing.

Drives churn through the live OpenBMP pipeline by flapping GoBGP's eBGP
sessions to the lab core routers. Each session reset withdraws and re-learns
that core's full table (~1M routes), which propagates fleet-wide through the
route reflectors -- a realistic, large churn event.

Pair it with the Kafka Ingestion Lag dashboard (uid kafka-lag) to measure how
the pipeline copes: peak lag, drain time, and -- by also watching `docker
stats` -- whether the bottleneck is the consumer (psql-app) or the database.

Usage:
    python3 tools/churn_storm.py                          # 1 cycle, all 4 cores
    python3 tools/churn_storm.py --cycles 5 --interval 120
    python3 tools/churn_storm.py --neighbors 10.100.0.100,10.100.0.200
"""
import argparse
import datetime
import subprocess
import time

# Default neighbor IPs to flap when --neighbors is not given (the "all 4
# cores" case shown in the usage examples).
ALL_CORES = ["10.100.0.100", "10.100.0.200", "10.100.1.100", "10.100.1.200"]


def ts():
    """Return the current local wall-clock time formatted as ``HH:MM:SS``."""
    now = datetime.datetime.now()
    return f"{now:%H:%M:%S}"


def reset(neighbor):
    """Reset one GoBGP neighbor session and report the outcome.

    Runs ``gobgp neighbor <neighbor> reset`` inside the ``obmp-gobgp``
    container through ``docker exec`` and prints a single timestamped
    status line for the attempt.

    Args:
        neighbor: Neighbor address, passed verbatim to the gobgp CLI.

    Returns:
        True if the command exited with status 0. False otherwise,
        including when the ``docker`` executable could not be started
        at all.
    """
    cmd = ["docker", "exec", "obmp-gobgp", "gobgp", "neighbor", neighbor,
           "reset"]
    try:
        r = subprocess.run(cmd, capture_output=True, text=True)
    except OSError as exc:
        # docker missing or not executable: report it like any other failed
        # reset instead of aborting the whole run with a traceback.
        ok = False
        detail = f"FAIL {exc}"
    else:
        ok = r.returncode == 0
        # Prefer stderr for the failure reason; fall back to stdout when the
        # command wrote its error there.
        detail = "ok" if ok else "FAIL " + (r.stderr or r.stdout).strip()
    print(f"  {ts()}  reset {neighbor}: {detail}", flush=True)
    return ok


def main():
    """Parse the command line and run the requested flap cycles.

    Each cycle resets every selected neighbor once, in the order given;
    consecutive cycles are separated by ``--interval`` seconds (no sleep
    follows the last cycle). Progress is printed with timestamps.

    Raises:
        SystemExit: with status 2 (via argparse) when the arguments are
            unusable -- no neighbors, fewer than one cycle, or a negative
            interval -- and with status 1 when at least one reset failed,
            so wrappers can tell that the storm was not fully generated.
    """
    ap = argparse.ArgumentParser(description="BGP churn-storm generator")
    ap.add_argument("--neighbors", default=",".join(ALL_CORES),
                    help="comma-separated GoBGP neighbor IPs to flap")
    ap.add_argument("--cycles", type=int, default=1,
                    help="number of flap cycles")
    ap.add_argument("--interval", type=int, default=120,
                    help="seconds between cycles")
    a = ap.parse_args()
    neighbors = [n.strip() for n in a.neighbors.split(",") if n.strip()]

    # Reject inputs that would turn the run into a silent no-op, or that
    # would crash in time.sleep() after sessions have already been flapped.
    if not neighbors:
        ap.error("--neighbors must name at least one neighbor")
    if a.cycles < 1:
        ap.error("--cycles must be at least 1")
    if a.interval < 0:
        ap.error("--interval must not be negative")

    print(f"{ts()}  churn storm: {a.cycles} cycle(s), {len(neighbors)} "
          f"neighbor(s), {a.interval}s interval", flush=True)
    failures = 0
    for c in range(1, a.cycles + 1):
        print(f"{ts()}  --- cycle {c}/{a.cycles} ---", flush=True)
        for n in neighbors:
            # Keep going after a failed reset so the remaining neighbors are
            # still flapped; the failure is tallied for the exit status.
            if not reset(n):
                failures += 1
        if c < a.cycles:
            time.sleep(a.interval)
    print(f"{ts()}  storm complete -- watch the Kafka Ingestion Lag dashboard",
          flush=True)
    if failures:
        print(f"{ts()}  {failures} reset(s) FAILED", flush=True)
        raise SystemExit(1)


# Run the generator only when executed as a script, not when imported.
if __name__ == "__main__":
    main()