nagios-plugins/check_xfs.py

#!/usr/bin/env python3
import argparse
import re
import shlex
import subprocess
import sys

# ANSI escape sequences used to colourise terminal output, keyed by a short
# name; "reset" restores the terminal's default attributes.
ESC = {
    "reset": "\033[0m",
    "red": "\033[31m",
    "yellow": "\033[33m",
    "green": "\033[32m",
    "cyan": "\033[36m",
    "bold": "\033[1m",
}

def color_text(text, color):
    """Return *text* wrapped in the ANSI sequence for *color* plus a reset.

    ``color`` must be one of the keys of the module-level ``ESC`` table;
    any other name raises ``KeyError``.
    """
    opening = ESC[color]
    return "{}{}{}".format(opening, text, ESC["reset"])

def run_cmd(cmd):
    """Run *cmd* through the shell and return what it printed.

    stdout and stderr are captured as a single text stream.  When the command
    exits with status 0 the text is returned with surrounding whitespace
    stripped; on any other exit status the captured text is returned
    unmodified (an empty string if nothing was printed).
    """
    finished = subprocess.run(
        cmd,
        shell=True,
        text=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,  # fold stderr into the captured stream
    )
    if finished.returncode == 0:
        return finished.stdout.strip()
    return finished.stdout or ""

def find_xfs_mounts():
    """Return the mount points of all mounted XFS filesystems.

    The list is built from the lines of ``mount`` output that contain
    ``type xfs``; the third whitespace-separated field of each such line is
    taken as the mount point.  Lines with fewer than three fields are ignored.
    """
    listing = run_cmd("mount | grep 'type xfs'")
    split_lines = (entry.split() for entry in listing.splitlines())
    return [fields[2] for fields in split_lines if len(fields) >= 3]

def check_disk_usage(mount):
    """Report how full the filesystem containing *mount* is.

    Runs ``df -hP`` on the path and parses the first output row that carries
    a ``NN%`` usage column.

    Args:
        mount: Path of the mount point (any path on the filesystem works).

    Returns:
        A ``(usage_pct, size, avail)`` tuple where ``usage_pct`` is an ``int``
        and ``size`` / ``avail`` are the human-readable strings printed by
        ``df``.  ``(None, None, None)`` is returned when ``df`` cannot be
        executed or prints no parsable row (e.g. the path does not exist).
    """
    try:
        # Argument-list form (no shell) so whitespace or metacharacters in the
        # path are passed through literally.  -P keeps every filesystem on a
        # single line.  stderr is discarded: diagnostics such as
        # "df: /x: No such file or directory" mention the path and must not be
        # mistaken for a data row.
        proc = subprocess.run(
            ["df", "-hP", "--", mount],
            stdout=subprocess.PIPE,
            stderr=subprocess.DEVNULL,
            text=True,
            check=False,
        )
    except (OSError, ValueError):
        # df missing / not executable, or the path is not a valid argument.
        return None, None, None

    for line in proc.stdout.splitlines():
        fields = line.split()
        # Locate the "NN%" column instead of trusting a fixed index, so a
        # device name containing spaces does not shift the result.  It needs
        # at least device, size, used and avail in front of it; the header
        # row has no such field and is skipped naturally.
        for idx, field in enumerate(fields[4:], start=4):
            if field.endswith('%') and field[:-1].isdecimal():
                return int(field[:-1]), fields[idx - 3], fields[idx - 1]
    return None, None, None

def check_xfs_repair(mount):
    """Run ``xfs_repair -n`` on *mount* and scan its output for problems.

    Args:
        mount: Path handed to ``xfs_repair`` as its target.

    Returns:
        ``(ok, output)`` where ``output`` is the combined stdout/stderr text
        of the command and ``ok`` is ``False`` when that text contains one of
        the markers ``UNRECOVERABLE``, ``could not`` or ``errors detected``.
        Any other output -- including the shell's own error message when the
        command cannot be run -- is reported as ``ok=True``, because only the
        text is inspected, not the exit status.

    NOTE(review): xfs_repair is normally pointed at a device or image rather
    than a mount point, and may refuse mounted filesystems -- confirm that the
    argument passed here produces a meaningful dry run.
    """
    failure_markers = ("UNRECOVERABLE", "could not", "errors detected")
    # Quote the path so whitespace or shell metacharacters in it cannot split
    # or alter the command line.  run_cmd already captures stderr together
    # with stdout, so no explicit redirection is needed.
    output = run_cmd(f"xfs_repair -n {shlex.quote(mount)}")
    problem_found = any(marker in output for marker in failure_markers)
    return not problem_found, output

def print_status(status, msg):
    """Print *msg* prefixed with the coloured state name for *status*.

    ``status`` is a Nagios-style code: 0 OK, 1 WARNING, 2 CRITICAL,
    3 UNKNOWN.  Any other value raises ``KeyError``.
    """
    presentation = {
        0: ("OK", "green"),
        1: ("WARNING", "yellow"),
        2: ("CRITICAL", "red"),
        3: ("UNKNOWN", "red"),
    }
    label, colour = presentation[status]
    print("{}: {}".format(color_text(label, colour), msg))

def _build_parser():
    """Create the command-line parser for the plugin."""
    parser = argparse.ArgumentParser(
        add_help=False,  # -h/--help is declared manually below
        description="XFS Filesystem Health Check",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  %(prog)s                           # Check all XFS mounts (shows help if no mounts found)
  %(prog)s --list                    # List available XFS mounts
  %(prog)s /data                     # Check specific mount
  %(prog)s -m /data,/var             # Check multiple mounts
  %(prog)s -w 85 -c 95               # Custom thresholds
  %(prog)s -m /data -w 80 -c 90      # Mount + thresholds
        """
    )

    parser.add_argument('-h', '--help', action='store_true', help='Show this help message')
    parser.add_argument('-l', '--list', action='store_true', help='List all mounted XFS filesystems')
    # The option gets its own dest: if it shared "mount" with the positional
    # below, the absent nargs='?' positional would store its None default
    # over the value given with -m.
    parser.add_argument('-m', '--mount', dest='mount_list', metavar='MOUNT',
                        help='Comma-separated XFS mount points to check')
    parser.add_argument('-w', '--warn-threshold', type=int, default=80, help='Warning threshold %% (default: 80)')
    parser.add_argument('-c', '--crit-threshold', type=int, default=90, help='Critical threshold %% (default: 90)')

    # Positional argument for single mount
    parser.add_argument('mount', nargs='?', help='Single mount point to check')
    return parser


def _resolve_target_mounts(args):
    """Return the mount points to check for the parsed arguments *args*.

    Mounts named with -m/--mount (comma-separated) come first, followed by the
    positional mount (taken verbatim); duplicates are dropped while keeping
    order.  Only when neither was supplied are all mounted XFS filesystems
    discovered automatically.
    """
    if args.mount_list is None and args.mount is None:
        return find_xfs_mounts()

    requested = []
    if args.mount_list:
        requested.extend(m.strip() for m in args.mount_list.split(',') if m.strip())
    if args.mount:
        requested.append(args.mount)
    return list(dict.fromkeys(requested))


def main(argv=None):
    """Command-line entry point of the XFS health check.

    Parses the arguments, checks disk usage and runs the ``xfs_repair``
    dry run for every selected mount, prints a report followed by a perfdata
    line and terminates the process via ``sys.exit``.

    Args:
        argv: Optional list of argument strings; ``None`` (the default) makes
            argparse read ``sys.argv[1:]``.

    Exit status:
        0 OK, 1 WARNING, 2 CRITICAL, 3 UNKNOWN -- the highest code seen over
        all checked mounts.  ``--help`` and ``--list`` exit with 0; having no
        mount to check exits with 3.
    """
    parser = _build_parser()
    args = parser.parse_args(argv)

    if args.help:
        parser.print_help()
        sys.exit(0)

    if args.list:
        mounts = find_xfs_mounts()
        if mounts:
            print(color_text("Available XFS mount points:", "cyan"))
            for m in mounts:
                print(f"  -> {m}")
        else:
            print(color_text("No mounted XFS filesystems found.", "red"))
        sys.exit(0)

    target_mounts = _resolve_target_mounts(args)

    if not target_mounts:
        print(color_text("No XFS mount points found or specified.", "yellow"))
        print("\nAvailable commands:")
        print("  python3 check_xfs.py --list          # Show available XFS mounts")
        print("  python3 check_xfs.py /data           # Check specific mount")
        print("  python3 check_xfs.py -m /data,/var   # Check multiple mounts")
        print("  python3 check_xfs.py -h              # Full help")
        sys.exit(3)

    warn_threshold = args.warn_threshold
    crit_threshold = args.crit_threshold
    global_status = 0
    all_perfdata = []

    print(color_text("Checking XFS filesystems:", "bold"))
    # A single "%" is correct here: "%%" is only an escape in %-formatting and
    # would be printed literally by an f-string.
    print(f"Thresholds: WARN>{warn_threshold}% CRIT>{crit_threshold}%")
    print("=" * 60)

    for mount in target_mounts:
        print(f"\n{color_text(mount, 'cyan')}")

        usage_pct, size, avail = check_disk_usage(mount)
        if usage_pct is None:
            print(color_text("  UNKNOWN: Unable to read disk usage", "red"))
            # max() keeps the numerically highest code, so UNKNOWN (3) takes
            # precedence over CRITICAL (2) in the final status.
            global_status = max(global_status, 3)
            continue

        # Thresholds are exclusive: usage must exceed the limit to trigger.
        usage_status = 0
        if usage_pct > crit_threshold:
            usage_status = 2
        elif usage_pct > warn_threshold:
            usage_status = 1

        usage_color = "green" if usage_status == 0 else "yellow" if usage_status == 1 else "red"
        print(f"  Usage: {color_text(f'{usage_pct}%', usage_color)} (Size: {size}, Available: {avail})")

        repair_ok, repair_out = check_xfs_repair(mount)
        if not repair_ok:
            print(color_text("  CRITICAL: XFS filesystem issues detected:", "red"))
            # Only the first 400 characters of the tool output are shown.
            print(f"    {repair_out[:400]}...")
            global_status = max(global_status, 2)
        else:
            print(color_text("  XFS filesystem OK (xfs_repair dry-run)", "green"))

        global_status = max(global_status, usage_status)
        all_perfdata.append(f"{mount}_used_pct={usage_pct};{warn_threshold};{crit_threshold} size={size} avail={avail}")

    perfdata = " ".join(all_perfdata)
    print("\n" + "=" * 60)
    status_text = {0: "OK", 1: "WARNING", 2: "CRITICAL", 3: "UNKNOWN"}
    status_color = "green" if global_status == 0 else "yellow" if global_status == 1 else "red"
    print(color_text(f"FINAL STATUS: {status_text[global_status]}", status_color))
    print(f"| {perfdata}")

    sys.exit(global_status)

# Run the check only when the file is executed as a script, not on import.
if __name__ == "__main__":
    main()