#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
check_arp_ping.py — Nagios/Icinga plugin.

Measures RTT and packet loss with ``arping``. Output is compatible with
check_ping.
"""
# Provenance: payload of patch da0b51701faf4a31c7cc8c1dd5cccb9d5f6da929,
# "[PATCH] Add check_arping", From: gru, Date: Tue, 7 Oct 2025 10:29:07 +0200,
# which creates the file `check_arping`.

import argparse
import math
import re
import shutil
import subprocess
import sys
from typing import List, NoReturn, Optional, Sequence, Tuple

OK, WARNING, CRITICAL, UNKNOWN = 0, 1, 2, 3
STATE_NAMES = {OK: "OK", WARNING: "WARNING", CRITICAL: "CRITICAL", UNKNOWN: "UNKNOWN"}
DEFAULT_LABEL = "ARPING"

# Return codes synthesized by run_arping() when arping itself could not be
# run to completion (they do not come from the arping binary).
RC_TIMEOUT = 124
RC_CANNOT_EXECUTE = 126
RC_NOT_FOUND = 127

# Group 1 is the numeric value, group 2 the unit. Numbered groups are used on
# purpose: the named-group markers were lost from an earlier copy of this
# pattern, which left it uncompilable.
TIME_RE = re.compile(
    r"""(?ix)
    (?:time[=\s])              # 'time=' or 'time '
    (\d+(?:\.\d+)?)            # group 1: number
    \s*(ms|usec|us|µs)         # group 2: unit
    """
)

# Loose filter used when harvesting timings; a line must additionally match
# TIME_RE before it counts as a sample.
HOST_LINE_RE = re.compile(
    r"""(?ix)
    (?:reply|response|bytes\s+from|Unicast\s+reply|Unicast\s+from)
    """
)

# Strict filter for "a reply was really received". Deliberately excludes the
# bare words 'response'/'reply' so that summary lines such as
# "Received 0 response(s)" are not mistaken for replies.
REPLY_LINE_RE = re.compile(r"(?i)(?:bytes\s+from|reply\s+from|unicast\s+from)")


class _PluginArgumentParser(argparse.ArgumentParser):
    """ArgumentParser that reports usage errors the way a plugin should."""

    def error(self, message: str) -> NoReturn:
        # argparse would exit with status 2, which monitoring systems read as
        # CRITICAL; a bad command line is an UNKNOWN condition, and the
        # status line must be on stdout.
        print(f"{DEFAULT_LABEL} UNKNOWN - {message}")
        sys.exit(UNKNOWN)


def run_arping(host: str, count: int, timeout: int, iface: Optional[str]) -> Tuple[int, str, str]:
    """Run ``arping`` and return ``(returncode, stdout, stderr)``.

    Args:
        host: Target passed to arping as the last argument.
        count: Value for arping's ``-c`` option.
        timeout: Value for arping's ``-w`` option; the subprocess itself is
            given ``timeout + 2`` seconds before it is abandoned.
        iface: Optional value for arping's ``-I`` option.

    Returns:
        The process result, or a synthesized triple: ``(127, "", "arping not
        found")`` when the binary is not on PATH, ``(126, "", reason)`` when
        it cannot be executed, ``(124, "", "arping timed out")`` on a hard
        subprocess timeout.
    """
    arping_path = shutil.which("arping")
    if not arping_path:
        return (RC_NOT_FOUND, "", "arping not found")

    cmd = [arping_path, "-c", str(count), "-w", str(timeout)]
    if iface:
        cmd += ["-I", iface]
    cmd.append(host)

    try:
        proc = subprocess.run(
            cmd, capture_output=True, text=True,
            timeout=timeout + 2, check=False,
        )
    except subprocess.TimeoutExpired:
        return (RC_TIMEOUT, "", "arping timed out")
    except OSError as exc:
        return (RC_CANNOT_EXECUTE, "", str(exc))
    return (proc.returncode, proc.stdout or "", proc.stderr or "")


def extract_timings(output: str, host: str) -> List[float]:
    """Return the per-reply round-trip times found in *output*, in ms.

    A line is considered only if it contains *host* or matches HOST_LINE_RE,
    and contributes a sample only if it also matches TIME_RE.
    """
    timings: List[float] = []
    for line in output.splitlines():
        if host not in line and not HOST_LINE_RE.search(line):
            continue
        m = TIME_RE.search(line)
        if m is None:
            continue
        value = float(m.group(1))
        # TIME_RE only admits 'ms' or a microsecond spelling. Comparing
        # against 'ms' (instead of listing the microsecond spellings) also
        # covers the Greek-mu variant that the case-insensitive match accepts.
        if m.group(2).lower() != "ms":
            value /= 1000.0  # µs -> ms
        timings.append(value)
    return timings


def has_reply_line(output: str) -> bool:
    """Return True if *output* contains at least one line reporting a reply."""
    return any(REPLY_LINE_RE.search(line) for line in output.splitlines())


def compute_stats(samples: List[float]) -> Tuple[float, float, float]:
    """Return ``(average, minimum, maximum)`` of *samples*.

    Raises:
        ValueError: if *samples* is empty.
    """
    if not samples:
        raise ValueError("no samples")
    return (sum(samples) / len(samples), min(samples), max(samples))


def fmt(x: float, d: int = 1) -> str:
    """Format *x* with *d* decimal places and a '.' decimal separator."""
    # Format-spec 'f' is locale-independent, so the separator is always '.'.
    return f"{x:.{d}f}"


def status_from(avg_ms: Optional[float], pl_pct: int, warn_rta: float, crit_rta: float,
                warn_pl: int, crit_pl: int) -> int:
    """Return the worst state implied by the RTA and packet-loss thresholds.

    Args:
        avg_ms: Average RTT in ms, or None when no RTT is available (the RTA
            thresholds are then skipped).
        pl_pct: Packet loss in percent.
        warn_rta, crit_rta: RTA thresholds in ms (inclusive).
        warn_pl, crit_pl: Packet-loss thresholds in percent (inclusive).
    """
    rta_state = OK
    if avg_ms is not None:
        if avg_ms >= crit_rta:
            rta_state = CRITICAL
        elif avg_ms >= warn_rta:
            rta_state = WARNING

    pl_state = OK
    if pl_pct >= crit_pl:
        pl_state = CRITICAL
    elif pl_pct >= warn_pl:
        pl_state = WARNING

    # OK, WARNING and CRITICAL are ordered numerically (0, 1, 2), so the worst
    # state is the larger one; an RTA WARNING must not hide a PL CRITICAL.
    return max(rta_state, pl_state)


def build_perfdata(pl_pct: int, warn_rta: float, crit_rta: float, warn_pl: int, crit_pl: int,
                   stats: Optional[Tuple[float, float, float]] = None) -> str:
    """Build the perfdata string (``'rta'=... 'pl'=...``).

    Args:
        stats: ``(avg, min, max)`` in ms as returned by compute_stats(), or
            None when no RTT could be measured, in which case the rta value
            is the literal ``U`` (undetermined).
    """
    if stats is None:
        rta = f"'rta'=U;{warn_rta};{crit_rta};;"
    else:
        avg, mn, mx = stats
        # NOTE: the min/max slots carry the observed minimum/maximum RTT.
        rta = f"'rta'={fmt(avg)}ms;{warn_rta};{crit_rta};{fmt(mn)};{fmt(mx)}"
    return f"{rta} 'pl'={pl_pct}%;{warn_pl};{crit_pl};0;100"


def build_parser() -> argparse.ArgumentParser:
    """Create the command-line parser for the plugin."""
    ap = _PluginArgumentParser(
        description="Nagios/Icinga plugin: ARP ping (arping) z perfdata (rta, pl)."
    )
    ap.add_argument("host")
    ap.add_argument("warn_rta", type=float, help="WARN RTT (ms)")
    ap.add_argument("crit_rta", type=float, help="CRIT RTT (ms)")
    # argparse %-formats help strings, so a literal percent sign must be '%%'.
    ap.add_argument("--warn-pl", type=int, default=20, dest="warn_pl", help="WARN PL (%%) [20]")
    ap.add_argument("--crit-pl", type=int, default=80, dest="crit_pl", help="CRIT PL (%%) [80]")
    ap.add_argument("-c", "--count", type=int, default=5, help="Liczba prób [5]")
    ap.add_argument("-t", "--timeout", type=int, default=5, help="Timeout arping w s [5]")
    ap.add_argument("-I", "--interface", help="Interfejs (np. eth0)")
    ap.add_argument("--label", default=DEFAULT_LABEL, help="Etykieta [ARPING]")
    ap.add_argument("--version", action="version", version="check_arp_ping.py 2.1.0")
    return ap


def _one_line(text: str) -> str:
    """Return the first non-blank line of *text*, safe for a status line."""
    for line in text.splitlines():
        line = line.strip()
        if line:
            # '|' separates the status text from perfdata and must not leak in.
            return line.replace("|", "/")
    return ""


def _finish(label: str, state: int, message: str, perf: Optional[str] = None) -> NoReturn:
    """Print the single plugin status line and exit with *state*."""
    line = f"{label} {STATE_NAMES[state]} - {message}"
    if perf:
        line += f" | {perf}"
    print(line)
    sys.exit(state)


def main(argv: Optional[Sequence[str]] = None) -> None:
    """Plugin entry point; always terminates via ``sys.exit``.

    Args:
        argv: Argument list to parse; defaults to ``sys.argv[1:]``.
    """
    args = build_parser().parse_args(argv)
    label = args.label

    if args.warn_rta >= args.crit_rta:
        _finish(label, UNKNOWN, "Złe progi RTA: warn >= crit")
    if args.warn_pl >= args.crit_pl:
        _finish(label, UNKNOWN, "Złe progi PL: warn >= crit")
    if args.count <= 0:
        _finish(label, UNKNOWN, "count musi być > 0")

    rc, out, err = run_arping(args.host, args.count, args.timeout, args.interface)

    # Anything that did not yield a parsed timing counts as lost, so empty
    # output means 100% loss.
    timings = extract_timings(out, args.host)
    lost = max(0, args.count - len(timings))
    pl_pct = int(round(100.0 * lost / args.count))
    thresholds = (args.warn_rta, args.crit_rta, args.warn_pl, args.crit_pl)
    perf_no_rta = build_perfdata(pl_pct, *thresholds)

    # The check could not be performed at all: that is a plugin problem
    # (UNKNOWN), not evidence that the host is down.
    if rc in (RC_CANNOT_EXECUTE, RC_NOT_FOUND):
        _finish(label, UNKNOWN, _one_line(err) or "arping not found")

    # Hard timeout or an unexpected exit status (0 = ok, 1/2 occur on loss).
    if rc not in (0, 1, 2):
        message = f"No response from host {args.host}"
        detail = _one_line(err)
        if detail:
            message += f" ({detail})"
        _finish(label, CRITICAL, message, perf_no_rta)

    if not timings:
        if has_reply_line(out):
            # Replies were reported but none carried a recognizable 'time'
            # field: a parsing/format problem rather than packet loss.
            _finish(label, UNKNOWN, "Nie rozpoznano formatów czasu w wyjściu arping", perf_no_rta)
        # Genuine 100% packet loss.
        _finish(label, CRITICAL, f"Packet loss = {pl_pct}%, RTA = - ms", perf_no_rta)

    stats = compute_stats(timings)
    avg = stats[0]
    state = status_from(avg, pl_pct, *thresholds)
    _finish(
        label,
        state,
        f"Packet loss = {pl_pct}%, RTA = {fmt(avg)} ms",
        build_perfdata(pl_pct, *thresholds, stats=stats),
    )


if __name__ == "__main__":
    try:
        main()
    except Exception as e:
        # Last-resort boundary: a plugin must always print one status line and
        # exit with a defined state. sys.exit() raises SystemExit, which is
        # not an Exception subclass and therefore passes through untouched.
        print(f"{DEFAULT_LABEL} UNKNOWN - {e}")
        sys.exit(UNKNOWN)