Files
nagios-plugins/check_xfs.py
2025-12-02 10:54:53 +01:00

190 lines
6.8 KiB
Python

#!/usr/bin/env python3
import argparse
import os
import re
import shlex
import subprocess
import sys
# ANSI escape sequences for terminal styling, keyed by the style name that
# color_text() accepts. "reset" switches styling back off.
ESC = {
    "reset": "\033[0m",
    "red": "\033[31m",
    "yellow": "\033[33m",
    "green": "\033[32m",
    "cyan": "\033[36m",
    "bold": "\033[1m",
}


def color_text(text, color):
    """Wrap *text* in the ANSI sequence for *color*, followed by a reset.

    Args:
        text: Value to render; it is formatted with its default string form.
        color: A key of ``ESC``. An unknown name raises ``KeyError``.

    Returns:
        The styled string.
    """
    start = ESC[color]
    end = ESC["reset"]
    return start + f"{text}" + end
def is_interactive():
    """Report whether standard output is attached to a terminal.

    A terminal is taken as the sign that a person, rather than Nagios, is
    running the check.
    """
    stdout = sys.stdout
    return stdout.isatty()
def run_cmd(cmd):
    """Run *cmd* through the shell and return its combined output.

    stderr is merged into stdout. A non-zero exit status is not treated as
    an error: whatever the command printed is returned (an empty string if
    it printed nothing), so callers must inspect the text rather than rely
    on an exception.

    Args:
        cmd: Shell command line. It is handed to the shell verbatim, so any
            externally supplied value interpolated into it must be quoted
            by the caller.

    Returns:
        The command's output with leading/trailing whitespace removed, on
        success and on failure alike.
    """
    try:
        output = subprocess.check_output(
            cmd, shell=True, text=True, stderr=subprocess.STDOUT
        )
    except subprocess.CalledProcessError as exc:
        # Non-zero exit (e.g. grep finding nothing): keep what was printed.
        output = exc.output or ""
    # Normalise on both paths so callers see the same shape either way.
    return output.strip()
def find_xfs_mounts():
    """Return the mount points of all currently mounted XFS filesystems.

    Parses ``mount`` output lines of the form
    ``<device> on <mount point> type <fstype> (<options>)`` and keeps the
    mount point of every line whose filesystem type is exactly ``xfs``.
    Mount points containing spaces are returned whole.

    Returns:
        list[str]: Mount points in the order ``mount`` prints them; empty
        when no XFS filesystem is mounted or no line can be parsed.
    """
    xfs_mounts = []
    for line in run_cmd("mount").splitlines():
        # Split on the LAST " type ": the fstype/options tail normally has
        # no such text, whereas a mount point path could.
        head, type_sep, tail = line.rpartition(" type ")
        if not type_sep:
            continue
        tail_fields = tail.split()
        if not tail_fields or tail_fields[0] != "xfs":
            continue
        # Everything after the first " on " is the mount point.
        _device, on_sep, mount_point = head.partition(" on ")
        if on_sep and mount_point:
            xfs_mounts.append(mount_point)
    return xfs_mounts
def is_xfs_mount(mount):
    """Return True if *mount* is the mount point of a mounted XFS filesystem.

    The mount point and filesystem-type fields of each ``mount`` output
    line (``<device> on <mount point> type <fstype> (<options>)``) are
    compared exactly, so a path that is merely a substring of another
    mount point, or a non-XFS filesystem on a host that also has XFS
    mounts, is not accepted.

    Args:
        mount: Mount point path. Trailing slashes are ignored; an empty
            string is never a match.

    Returns:
        bool: True only for an exact mount point whose type is ``xfs``.
    """
    if not mount:
        return False
    # "/data/" and "/data" name the same mount point; "/" must stay "/".
    wanted = mount.rstrip("/") or "/"
    for line in run_cmd("mount").splitlines():
        # Split on the LAST " type ": the fstype/options tail normally has
        # no such text, whereas a mount point path could.
        head, type_sep, tail = line.rpartition(" type ")
        if not type_sep:
            continue
        tail_fields = tail.split()
        if not tail_fields or tail_fields[0] != "xfs":
            continue
        _device, on_sep, mount_point = head.partition(" on ")
        if on_sep and mount_point == wanted:
            return True
    return False
def check_disk_usage(mount):
    """Read usage figures for *mount* from ``df -P -h``.

    ``-P`` selects the POSIX layout, in which the mount point is the last
    of six columns.

    Args:
        mount: Mount point path. It is shell-quoted before being passed to
            ``df``, so spaces and shell metacharacters are safe. Trailing
            slashes are ignored when matching the reported mount point.

    Returns:
        tuple: ``(usage_pct, size, avail)`` where ``usage_pct`` is an int
        and ``size``/``avail`` are the human-readable strings printed by
        ``df -h``; ``(None, None, None)`` when no row for exactly this
        mount point with a parseable percentage is found.
    """
    output = run_cmd(f"df -P -h -- {shlex.quote(mount)}")
    # "/data/" and "/data" name the same mount point; "/" must stay "/".
    wanted = mount.rstrip("/") or "/"
    for line in output.splitlines():
        # maxsplit=5 keeps a mount point with embedded spaces in one field.
        parts = line.strip().split(None, 5)
        if len(parts) < 6:
            continue
        size, avail, usage_str, target = parts[1], parts[3], parts[4], parts[5]
        # Exact match: the header row and df error text are skipped here too.
        if target != wanted or not usage_str.endswith("%"):
            continue
        try:
            usage_pct = int(usage_str[:-1])
        except ValueError:
            continue
        return usage_pct, size, avail
    return None, None, None
def check_xfs_repair(mount):
    """Run ``xfs_repair -n`` on *mount* and scan its output for error markers.

    Args:
        mount: Path handed to ``xfs_repair``; it is shell-quoted first so
            spaces and shell metacharacters cannot alter the command.

    Returns:
        tuple: ``(ok, output)``. ``ok`` is False when the output contains
        any of "UNRECOVERABLE", "could not" or "errors detected", True
        otherwise; ``output`` is the raw text returned by ``run_cmd``.
    """
    # NOTE(review): xfs_repair(8) is documented as taking a device or image
    # file and as requiring the filesystem to be unmounted. Given the mount
    # point of a mounted filesystem it may only print a fatal/usage message
    # that matches none of the markers below, in which case this reports
    # "ok" without having checked anything. Confirm, and consider acting on
    # the exit status instead.
    output = run_cmd(f"xfs_repair -n {shlex.quote(mount)} 2>&1")
    failure_markers = ("UNRECOVERABLE", "could not", "errors detected")
    healthy = not any(marker in output for marker in failure_markers)
    return healthy, output
def main():
    """Command-line entry point of the XFS health check.

    Parses the arguments, determines which mount points to check (those
    given with ``-m`` and/or the positional argument, otherwise every
    mounted XFS filesystem), and for each one verifies that it is XFS,
    reads its usage and runs the ``xfs_repair`` dry-run. One status line
    with perfdata is always printed; verbose coloured output is added only
    when stdout is a terminal.

    Never returns: always leaves through ``sys.exit`` with 0 (OK),
    1 (WARNING), 2 (CRITICAL) or 3 (UNKNOWN). ``--help`` and ``--list``
    exit 0.
    """
    parser = argparse.ArgumentParser(
        add_help=False,
        description="XFS Filesystem Health Check",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
%(prog)s # Check all XFS mounts
%(prog)s --list # List available XFS mounts
%(prog)s /data # Check specific mount
%(prog)s -m /data,/var # Check multiple mounts
%(prog)s -w 85 -c 95 # Custom thresholds
"""
    )
    parser.add_argument('-h', '--help', action='store_true', help='Show this help message')
    parser.add_argument('-l', '--list', action='store_true', help='List all mounted XFS filesystems')
    parser.add_argument('-m', '--mount', help='Comma-separated XFS mount points to check')
    parser.add_argument('-w', '--warn-threshold', type=int, default=80, help='Warning threshold %% (default: 80)')
    parser.add_argument('-c', '--crit-threshold', type=int, default=90, help='Critical threshold %% (default: 90)')
    # The positional gets its own dest: sharing "mount" with -m makes the
    # two arguments overwrite each other inside argparse.
    parser.add_argument('mount_path', nargs='?', metavar='mount', help='Single mount point to check')
    args = parser.parse_args()

    # Nagios: --help and --list exit 0 (OK)
    if args.help:
        parser.print_help()
        sys.exit(0)
    if args.list:
        mounts = find_xfs_mounts()
        if mounts:
            print(color_text("Available XFS mount points:", "cyan"))
            for m in mounts:
                print(f" -> {m}")
        else:
            print(color_text("No mounted XFS filesystems found.", "red"))
        sys.exit(0)

    # Determine target mounts. Both spellings accept a comma-separated
    # list and may be combined; duplicates are dropped, order is kept.
    if args.mount is None and args.mount_path is None:
        target_mounts = find_xfs_mounts()
    else:
        requested = []
        for raw in (args.mount, args.mount_path):
            if raw is not None:
                requested.extend(raw.split(','))
        target_mounts = list(dict.fromkeys(m.strip() for m in requested if m.strip()))

    # Nagios: No mounts = UNKNOWN (3) - SINGLE LINE
    if not target_mounts:
        print("UNKNOWN No XFS mount points found or specified|")
        sys.exit(3)

    warn_threshold = args.warn_threshold
    crit_threshold = args.crit_threshold
    global_status = 0
    all_perfdata = []
    # Evaluate once; verbose colourful output is for INTERACTIVE mode only.
    interactive = is_interactive()

    if interactive:
        print(color_text("Checking XFS filesystems:", "bold"))
        print(f"Thresholds: WARN>{warn_threshold}% CRIT>{crit_threshold}%")
        print("=" * 60)

    for mount in target_mounts:
        # One sanitised label prefix per mount, used on every code path so
        # perfdata labels stay unique and consistent across mounts.
        mount_name = mount.replace('/', '_').replace(' ', '_')
        if interactive:
            print(f"\n{color_text(mount, 'cyan')}")

        # Validate it's actually XFS
        if not is_xfs_mount(mount):
            if interactive:
                print(color_text(" UNKNOWN: Mount point not found or not XFS", "red"))
            global_status = max(global_status, 3)
            all_perfdata.append(f"{mount_name}_status=3")
            continue

        usage_pct, size, avail = check_disk_usage(mount)
        if usage_pct is None:
            if interactive:
                print(color_text(" UNKNOWN: Unable to read disk usage", "red"))
            global_status = max(global_status, 3)
            all_perfdata.append(f"{mount_name}_status=3")
            continue

        # Usage status
        if usage_pct > crit_threshold:
            usage_status = 2
        elif usage_pct > warn_threshold:
            usage_status = 1
        else:
            usage_status = 0
        if interactive:
            usage_color = "green" if usage_status == 0 else "yellow" if usage_status == 1 else "red"
            print(f" Usage: {color_text(f'{usage_pct}%', usage_color)} (Size: {size}, Available: {avail})")

        # XFS repair check (only the verdict is used here)
        repair_ok, _ = check_xfs_repair(mount)
        xfs_status = 0 if repair_ok else 2
        if interactive:
            if not repair_ok:
                print(color_text(" CRITICAL: XFS filesystem issues detected", "red"))
            else:
                print(color_text(" XFS filesystem OK (xfs_repair dry-run)", "green"))

        # Final status for this mount
        mount_status = max(usage_status, xfs_status)
        global_status = max(global_status, mount_status)

        # Nagios perfdata: every label carries the mount prefix so several
        # mounts do not emit colliding "size"/"avail"/"xfs_status" labels.
        # NOTE(review): size/avail keep df -h suffixes such as "G", which
        # may not be a unit every perfdata consumer accepts - confirm.
        perf = (
            f"{mount_name}_used_pct={usage_pct};{warn_threshold};{crit_threshold};0;100"
            f" {mount_name}_size={size}"
            f" {mount_name}_avail={avail}"
            f" {mount_name}_xfs_status={xfs_status}"
        )
        all_perfdata.append(perf)

    perfdata = "| " + " ".join(all_perfdata)
    status_text = {0: "OK", 1: "WARNING", 2: "CRITICAL", 3: "UNKNOWN"}

    # ALWAYS: the parseable Nagios status line
    print(f"{status_text[global_status]} XFS check: {len(target_mounts)} mount(s){perfdata}")

    # ONLY interactive: Additional colorful summary
    if interactive:
        status_color = "green" if global_status == 0 else "yellow" if global_status == 1 else "red"
        print("\n" + "=" * 60)
        print(color_text(f"FINAL STATUS: {status_text[global_status]}", status_color))
        print(perfdata)

    # Nagios standard exit codes: 0=OK, 1=WARNING, 2=CRITICAL, 3=UNKNOWN
    sys.exit(global_status)
# Run the check only when executed as a script, not when imported.
if __name__ == "__main__":
    main()