#!/usr/bin/env python3
"""
Nagios plugin for comprehensive DRBD/LINSTOR monitoring

Author: @linuxiarz.pl Mateusz Gruszczyński
License: GPL v3
"""

import argparse
import sys
import subprocess
import json
import re
import shutil
from typing import Dict, List, Optional, Tuple

# Nagios exit codes
STATE_OK = 0
STATE_WARNING = 1
STATE_CRITICAL = 2
STATE_UNKNOWN = 3


class DRBDMonitor:
    """Collect DRBD (and optionally LINSTOR) state and turn it into a Nagios result.

    The check methods append human-readable messages to ``criticals``,
    ``warnings`` and ``ok_messages``; ``get_final_status`` folds those lists
    into a single status line and exit code.
    """

    # Seconds to wait for any external command before giving up.
    COMMAND_TIMEOUT = 30

    def __init__(self):
        self.resources = {}
        self.perfdata = []
        self.warnings = []
        self.criticals = []
        self.ok_messages = []

    def execute_command(self, cmd: List[str]) -> Tuple[int, str, str]:
        """Execute a system command and return (returncode, stdout, stderr).

        Failures never raise: a timeout is reported as return code 124 and a
        failure to start the command as return code 1, with the reason in the
        stderr slot.
        """
        try:
            # subprocess.run kills and reaps the child when the timeout
            # expires, so a hung command does not linger after the check.
            completed = subprocess.run(
                cmd,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                universal_newlines=True,
                timeout=self.COMMAND_TIMEOUT,
            )
        except subprocess.TimeoutExpired:
            return 124, "", "Command timeout"
        except (OSError, ValueError, subprocess.SubprocessError) as e:
            return 1, "", str(e)
        return completed.returncode, completed.stdout, completed.stderr

    @staticmethod
    def _peer_label(props: Dict) -> Optional[str]:
        """Return the peer identifier of a connection/peer-device line, if any."""
        return props.get('conn-name') or props.get('peer-node-id')

    def parse_events2_output(self, output: str) -> Dict:
        """Parse ``drbdsetup events2 --now --statistics`` output.

        Returns a dict with the keys ``resources``, ``connections``,
        ``devices`` and ``peer_devices``. Each maps an object key to the
        ``key:value`` properties found on that line:

        * resources:    ``<name>``
        * connections:  ``<name>:<peer>`` (``<name>`` if no peer id is present)
        * devices:      ``<name>:<volume>``
        * peer_devices: ``<name>:<peer>:<volume>`` (``<name>:<volume>`` if no
          peer id is present)

        ``<peer>`` is the ``conn-name`` property, falling back to
        ``peer-node-id``. Including it keeps every peer of a multi-node
        resource instead of letting later lines overwrite earlier ones.
        """
        data = {
            'resources': {},
            'connections': {},
            'devices': {},
            'peer_devices': {}
        }

        for line in output.strip().split('\n'):
            if not line or line.startswith('#'):
                continue

            parts = line.split()
            if len(parts) < 2:
                continue

            # parts[0] is the event type (exists, create, change, destroy);
            # parts[1] is the object type.
            object_type = parts[1]

            # Parse key:value pairs
            props = {}
            for part in parts[2:]:
                if ':' in part:
                    key, value = part.split(':', 1)
                    props[key] = value

            name = props.get('name', 'unknown')

            # Store data by object type
            if object_type == 'resource':
                data['resources'][name] = props
            elif object_type == 'connection':
                peer = self._peer_label(props)
                key = f"{name}:{peer}" if peer else name
                data['connections'][key] = props
            elif object_type == 'device':
                volume = props.get('volume', '0')
                data['devices'][f"{name}:{volume}"] = props
            elif object_type == 'peer-device':
                volume = props.get('volume', '0')
                peer = self._peer_label(props)
                key = f"{name}:{peer}:{volume}" if peer else f"{name}:{volume}"
                data['peer_devices'][key] = props

        return data

    def check_resource_status(self, resource_data: Dict, args):
        """Check resource role, suspended state and promotion capability."""
        for res_name, props in resource_data.items():
            role = props.get('role', 'Unknown')
            suspended = props.get('suspended', 'no')
            may_promote = props.get('may_promote', 'no')

            # Check role
            if args.check_role:
                if role not in ('Primary', 'Secondary'):
                    self.criticals.append(f"Resource {res_name}: Invalid role {role}")
                elif role == 'Secondary' and args.require_primary:
                    self.warnings.append(
                        f"Resource {res_name}: Role is Secondary, expected Primary"
                    )
                else:
                    self.ok_messages.append(f"Resource {res_name}: Role={role}")

            # Check suspended state
            if args.check_suspended and suspended == 'yes':
                self.criticals.append(f"Resource {res_name}: SUSPENDED")

            # Check promotion capability - ONLY if the resource is Secondary
            # and it was explicitly required; for diskless clients being
            # unable to promote is normal behavior.
            if (args.check_promotion and args.require_promotion_capability
                    and may_promote == 'no' and role == 'Secondary'):
                self.warnings.append(f"Resource {res_name}: Cannot be promoted")

    def check_connection_status(self, connection_data: Dict, args):
        """Check connection state and congestion between peers."""
        critical_states = ('Connecting', 'Timeout', 'BrokenPipe', 'NetworkFailure',
                           'ProtocolError', 'TearDown', 'Unconnected',
                           'Disconnecting')
        warning_states = ('WFConnection', 'WFReportParams')

        for conn_name, props in connection_data.items():
            connection = props.get('connection', 'Unknown')
            congested = props.get('congested', 'no')
            peer_node_id = props.get('peer-node-id', 'unknown')

            # Check connection state
            if args.check_connection:
                if connection in ('Connected', 'StandAlone'):
                    self.ok_messages.append(
                        f"Connection {conn_name} to peer-{peer_node_id}: {connection}"
                    )
                elif connection in critical_states:
                    self.criticals.append(f"Connection {conn_name}: State={connection}")
                elif connection in warning_states:
                    self.warnings.append(f"Connection {conn_name}: State={connection}")
                else:
                    self.criticals.append(
                        f"Connection {conn_name}: Unknown state {connection}"
                    )

            # Congestion can be temporary, so it is only ever a warning and
            # can be silenced completely.
            if (args.check_congestion and congested == 'yes'
                    and not args.ignore_transient_congestion):
                self.warnings.append(f"Connection {conn_name}: CONGESTED")

    def check_device_status(self, device_data: Dict, args):
        """Check device/volume disk state, quorum and client mode."""
        critical_disk_states = ('Failed', 'Detaching', 'Inconsistent',
                                'Outdated', 'DUnknown')

        for props in device_data.values():
            dev_name = props.get('name', 'unknown')
            volume = props.get('volume', '0')
            disk = props.get('disk', 'Unknown')
            minor = props.get('minor', 'unknown')
            client = props.get('client', 'no')
            quorum = props.get('quorum', 'yes')

            # Check disk state
            if args.check_disk:
                if disk in ('UpToDate', 'Diskless'):
                    self.ok_messages.append(
                        f"Device {dev_name} vol:{volume} (minor:{minor}): Disk={disk}"
                    )
                elif disk in critical_disk_states:
                    self.criticals.append(f"Device {dev_name} vol:{volume}: Disk={disk}")
                else:
                    # Transitional (Attaching, Negotiating) or unrecognised
                    # states are reported as warnings.
                    self.warnings.append(f"Device {dev_name} vol:{volume}: Disk={disk}")

            # Check quorum - but not for diskless clients
            if args.check_quorum and quorum == 'no' and disk != 'Diskless':
                self.criticals.append(f"Device {dev_name} vol:{volume}: NO QUORUM")

            # Check client mode
            if args.check_client and client == 'yes':
                self.ok_messages.append(
                    f"Device {dev_name} vol:{volume}: Running in client mode"
                )

    def check_peer_device_status(self, peer_device_data: Dict, args):
        """Check peer device replication state, peer disk and resync suspension."""
        for props in peer_device_data.values():
            peer_name = props.get('name', 'unknown')
            volume = props.get('volume', '0')
            replication = props.get('replication', 'Unknown')
            peer_disk = props.get('peer-disk', 'Unknown')
            resync_suspended = props.get('resync-suspended', 'no')
            peer_node_id = props.get('peer-node-id', 'unknown')
            prefix = f"Peer {peer_name} vol:{volume}"

            # Check replication state
            if args.check_replication:
                if replication in ('Established', 'Off'):
                    self.ok_messages.append(
                        f"Peer {peer_name} (node-{peer_node_id}) vol:{volume}: {replication}"
                    )
                elif replication in ('SyncSource', 'SyncTarget'):
                    # Synchronization in progress - warning or OK depending on config
                    target = self.warnings if args.warn_on_sync else self.ok_messages
                    target.append(f"{prefix}: Syncing ({replication})")
                elif replication in ('PausedSyncS', 'PausedSyncT'):
                    self.warnings.append(f"{prefix}: Sync paused ({replication})")
                elif replication in ('WFBitMapS', 'WFBitMapT', 'WFSyncUUID'):
                    self.warnings.append(f"{prefix}: Waiting for sync ({replication})")
                elif replication in ('StartingSyncS', 'StartingSyncT'):
                    self.ok_messages.append(f"{prefix}: Starting sync ({replication})")
                elif replication in ('VerifyS', 'VerifyT'):
                    self.ok_messages.append(f"{prefix}: Verifying ({replication})")
                else:
                    self.criticals.append(f"{prefix}: Replication={replication}")

            # Check peer disk state
            if args.check_peer_disk:
                if peer_disk not in ('UpToDate', 'Diskless', 'DUnknown'):
                    if peer_disk in ('Failed', 'Outdated', 'Inconsistent'):
                        self.criticals.append(f"{prefix}: Peer-Disk={peer_disk}")
                    else:
                        self.warnings.append(f"{prefix}: Peer-Disk={peer_disk}")

            # Check resync suspended
            if args.check_resync_suspended and resync_suspended == 'yes':
                self.warnings.append(f"{prefix}: Resync SUSPENDED")

    def get_statistics(self, resource: str = 'all') -> Dict:
        """Get DRBD statistics from ``drbdsetup events2 --statistics``.

        Returns a flat dict of every ``key:value`` pair seen in the output
        (integers where the value parses as one); later lines overwrite
        earlier ones. Returns an empty dict if the command fails.
        """
        cmd = ['drbdsetup', 'events2', '--now', '--statistics', resource]
        rc, stdout, _ = self.execute_command(cmd)

        if rc != 0:
            return {}

        stats = {}
        for line in stdout.strip().split('\n'):
            parts = line.split()
            if len(parts) < 2:
                continue

            # Look for statistics in the output
            for part in parts[2:]:
                if ':' in part:
                    key, value = part.split(':', 1)
                    try:
                        stats[key] = int(value)
                    except ValueError:
                        stats[key] = value

        return stats

    def add_performance_data(self, device_data: Dict, peer_device_data: Dict):
        """Add performance data for Nagios."""
        # NOTE: the "resources" metric is the number of device entries; the
        # label is kept unchanged so existing graphs keep working.
        self.perfdata.append(f"resources={len(device_data)}")

        # Count by disk state
        uptodate = sum(1 for d in device_data.values() if d.get('disk') == 'UpToDate')
        self.perfdata.append(f"uptodate_devices={uptodate}")

        # Count by replication state
        established = sum(1 for p in peer_device_data.values()
                          if p.get('replication') == 'Established')
        syncing = sum(1 for p in peer_device_data.values()
                      if p.get('replication') in ('SyncSource', 'SyncTarget'))
        self.perfdata.append(f"established_replications={established}")
        self.perfdata.append(f"syncing_replications={syncing}")

    @staticmethod
    def _count_linstor_resources(linstor_output: List) -> Tuple[int, int]:
        """Return (resource_count, volume_count) from parsed LINSTOR JSON."""

        def volume_count(entry: Dict) -> int:
            volumes = entry.get('vlms', []) or entry.get('volumes', [])
            return len(volumes) if isinstance(volumes, list) else 0

        resource_total = 0
        volume_total = 0

        # Format 1: Array of response objects with 'resources' key
        for item in linstor_output:
            if not isinstance(item, dict):
                continue
            resources = item.get('resources', [])
            if resources and isinstance(resources, list):
                for res in resources:
                    if isinstance(res, dict):
                        resource_total += 1
                        volume_total += volume_count(res)

        # Format 2: Direct array of resources (older format)
        if resource_total == 0:
            for item in linstor_output:
                if isinstance(item, dict) and 'name' in item:
                    resource_total += 1
                    volume_total += volume_count(item)

        return resource_total, volume_total

    def check_linstor_status(self, args):
        """Check LINSTOR specific status if available.

        Does nothing unless ``args.check_linstor`` is set. Problems talking to
        LINSTOR are only reported when ``args.verbose`` is set.
        """
        if not args.check_linstor:
            return

        # Check if linstor command is available
        if shutil.which('linstor') is None:
            if args.verbose:
                self.ok_messages.append("LINSTOR: Command not available (optional)")
            return

        # Get resource list
        cmd = ['linstor', '--machine-readable', 'resource', 'list']
        rc, stdout, stderr = self.execute_command(cmd)

        if rc != 0:
            if args.verbose:
                self.warnings.append(f"LINSTOR: Failed to get resource list: {stderr}")
            return

        try:
            # Parse JSON output
            linstor_output = json.loads(stdout)

            # LINSTOR returns an array, first check if it's valid
            if not isinstance(linstor_output, list) or not linstor_output:
                if args.verbose:
                    self.ok_messages.append("LINSTOR: No data returned")
                return

            linstor_resource_count, linstor_volume_count = (
                self._count_linstor_resources(linstor_output)
            )

            if linstor_resource_count > 0:
                self.ok_messages.append(
                    f"LINSTOR: {linstor_resource_count} resources, "
                    f"{linstor_volume_count} volumes"
                )

                # Add performance data
                if args.performance_data:
                    self.perfdata.append(f"linstor_resources={linstor_resource_count}")
                    self.perfdata.append(f"linstor_volumes={linstor_volume_count}")
            elif args.verbose:
                self.ok_messages.append("LINSTOR: No resources found")

        except json.JSONDecodeError as e:
            if args.verbose:
                self.warnings.append(f"LINSTOR: JSON parse error: {str(e)[:50]}")
        except Exception as e:
            # LINSTOR is optional: never let it break the DRBD result.
            if args.verbose:
                self.warnings.append(f"LINSTOR: Processing error: {str(e)[:50]}")

    def run_checks(self, args):
        """Run every enabled check and return the Nagios exit code.

        Exits the process with STATE_CRITICAL if ``drbdsetup`` fails or
        returns no output.
        """
        # Get DRBD events2 output
        resource_filter = args.resource if args.resource else 'all'
        cmd = ['drbdsetup', 'events2', '--now', '--statistics', resource_filter]
        rc, stdout, stderr = self.execute_command(cmd)

        if rc != 0:
            print(f"CRITICAL - Failed to execute drbdsetup: {stderr}")
            sys.exit(STATE_CRITICAL)

        if not stdout.strip():
            print("CRITICAL - No DRBD resources found")
            sys.exit(STATE_CRITICAL)

        # Parse output
        data = self.parse_events2_output(stdout)

        # Run all checks
        if data['resources']:
            self.check_resource_status(data['resources'], args)

        if data['connections']:
            self.check_connection_status(data['connections'], args)

        if data['devices']:
            self.check_device_status(data['devices'], args)

        if data['peer_devices']:
            self.check_peer_device_status(data['peer_devices'], args)

        # Add performance data
        if args.performance_data:
            self.add_performance_data(data['devices'], data['peer_devices'])

        # Check LINSTOR if requested
        self.check_linstor_status(args)

        # Determine final status
        return self.get_final_status(args)

    def get_final_status(self, args) -> int:
        """Print the Nagios status line and return the matching exit code.

        The collected message lists are copied, not modified, so the method
        can be called without altering the monitor's state.
        """
        if self.criticals:
            status, status_text = STATE_CRITICAL, "CRITICAL"
            messages = list(self.criticals)
            if args.verbose:
                messages += self.warnings
                messages += self.ok_messages
        elif self.warnings:
            status, status_text = STATE_WARNING, "WARNING"
            messages = list(self.warnings)
            if args.verbose:
                messages += self.ok_messages
        else:
            status, status_text = STATE_OK, "OK"
            messages = list(self.ok_messages) if args.verbose else []
            if not messages:
                # Also covers verbose mode with nothing to report, which
                # would otherwise print an empty "OK - " line.
                messages = ["All DRBD checks passed"]

        # Build output
        output = f"{status_text} - {'; '.join(messages)}"

        if self.perfdata:
            output += " | " + " ".join(self.perfdata)

        print(output)
        return status


def main():
    """Parse command-line options, run the checks and exit with the Nagios code."""
    parser = argparse.ArgumentParser(
        description='Comprehensive DRBD/LINSTOR Nagios monitoring plugin',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Check all parameters for all resources
  %(prog)s --all

  # Check specific resource
  %(prog)s --resource r0 --all

  # Check only connection and replication status
  %(prog)s --check-connection --check-replication

  # Check with performance data and verbose output
  %(prog)s --all --performance-data --verbose

  # Check LINSTOR status as well
  %(prog)s --all --check-linstor

  # Ignore transient congestion warnings
  %(prog)s --all --ignore-transient-congestion
        """
    )

    # Resource selection
    parser.add_argument('-r', '--resource',
                        help='DRBD resource name to check (default: all)')

    # Check options
    parser.add_argument('--all', action='store_true',
                        help='Enable all checks (recommended)')
    parser.add_argument('--check-role', action='store_true',
                        help='Check resource role (Primary/Secondary)')
    parser.add_argument('--check-disk', action='store_true',
                        help='Check disk state (UpToDate/Inconsistent/etc)')
    parser.add_argument('--check-connection', action='store_true',
                        help='Check connection state between nodes')
    parser.add_argument('--check-replication', action='store_true',
                        help='Check replication state (Established/SyncSource/etc)')
    parser.add_argument('--check-peer-disk', action='store_true',
                        help='Check peer disk state')
    parser.add_argument('--check-suspended', action='store_true',
                        help='Check if resource is suspended')
    parser.add_argument('--check-promotion', action='store_true',
                        help='Check if resource may be promoted '
                             '(disabled by default for diskless clients)')
    parser.add_argument('--require-promotion-capability', action='store_true',
                        help='Warn if Secondary resources cannot be promoted '
                             '(usually OK for diskless clients)')
    parser.add_argument('--check-quorum', action='store_true',
                        help='Check quorum status')
    parser.add_argument('--check-congestion', action='store_true',
                        help='Check network congestion')
    parser.add_argument('--ignore-transient-congestion', action='store_true',
                        help='Ignore transient congestion warnings '
                             '(recommended for busy networks)')
    parser.add_argument('--check-client', action='store_true',
                        help='Check if running in client mode')
    parser.add_argument('--check-resync-suspended', action='store_true',
                        help='Check if resync is suspended')
    parser.add_argument('--check-linstor', action='store_true',
                        help='Check LINSTOR status (requires linstor command)')

    # Behavior options
    parser.add_argument('--require-primary', action='store_true',
                        help='Warn if resource is not Primary')
    parser.add_argument('--warn-on-sync', action='store_true',
                        help='Warn when synchronization is in progress (default: OK)')
    parser.add_argument('--performance-data', action='store_true',
                        help='Include performance data for graphing')
    parser.add_argument('-v', '--verbose', action='store_true',
                        help='Verbose output (show all status messages)')
    parser.add_argument('--version', action='version', version='%(prog)s 1.2')

    args = parser.parse_args()

    # If --all is specified, enable all checks EXCEPT the promotion check:
    # diskless clients can't promote, so it stays opt-in.
    if args.all:
        args.check_role = True
        args.check_disk = True
        args.check_connection = True
        args.check_replication = True
        args.check_peer_disk = True
        args.check_suspended = True
        args.check_quorum = True
        args.check_congestion = True
        args.check_client = True
        args.check_resync_suspended = True
        args.performance_data = True

    # If no checks specified, enable basic checks
    if not any([args.check_role, args.check_disk, args.check_connection,
                args.check_replication, args.check_peer_disk, args.check_suspended,
                args.check_promotion, args.check_quorum, args.check_congestion,
                args.check_client, args.check_resync_suspended]):
        args.check_role = True
        args.check_disk = True
        args.check_connection = True
        args.check_replication = True

    # Run checks
    monitor = DRBDMonitor()
    try:
        status = monitor.run_checks(args)
        sys.exit(status)
    except KeyboardInterrupt:
        print("UNKNOWN - Check interrupted")
        sys.exit(STATE_UNKNOWN)
    except Exception as e:
        # Top-level boundary: a plugin must always report a Nagios state.
        print(f"UNKNOWN - Unexpected error: {e}")
        sys.exit(STATE_UNKNOWN)


if __name__ == '__main__':
    main()