#!/usr/bin/env python3
"""
Nagios plugin for comprehensive DRBD/LINSTOR monitoring
Author: @linuxiarz.pl Mateusz Gruszczyński
License: GPL v3
"""

import argparse
import json
import re
import shutil
import subprocess
import sys
from typing import Dict, List, Tuple

# Nagios exit codes
STATE_OK = 0
STATE_WARNING = 1
STATE_CRITICAL = 2
STATE_UNKNOWN = 3

# Connection states and how they are graded by check_connection_status().
# NOTE(review): 'StandAlone' is graded OK here, as in the original code;
# confirm that this is the intended policy for your cluster.
_CONNECTION_OK = frozenset({'Connected', 'StandAlone'})
_CONNECTION_CRITICAL = frozenset({
    'Connecting', 'Timeout', 'BrokenPipe', 'NetworkFailure',
    'ProtocolError', 'TearDown', 'Unconnected', 'Disconnecting',
})
_CONNECTION_WARNING = frozenset({'WFConnection', 'WFReportParams'})

# Local disk states; any state that is neither OK nor critical is a warning.
_DISK_OK = frozenset({'UpToDate', 'Diskless'})
_DISK_CRITICAL = frozenset({
    'Failed', 'Detaching', 'Inconsistent', 'Outdated', 'DUnknown',
})

# Peer disk states; any state that is neither OK nor critical is a warning.
_PEER_DISK_OK = frozenset({'UpToDate', 'Diskless', 'DUnknown'})
_PEER_DISK_CRITICAL = frozenset({'Failed', 'Outdated', 'Inconsistent'})

# Replication states with a fixed grade: state -> (severity, description).
_REPLICATION_OK = frozenset({'Established', 'Off'})
_REPLICATION_SYNCING = frozenset({'SyncSource', 'SyncTarget'})
_REPLICATION_NOTES = {
    'PausedSyncS': ('warning', 'Sync paused'),
    'PausedSyncT': ('warning', 'Sync paused'),
    'WFBitMapS': ('warning', 'Waiting for sync'),
    'WFBitMapT': ('warning', 'Waiting for sync'),
    'WFSyncUUID': ('warning', 'Waiting for sync'),
    'StartingSyncS': ('ok', 'Starting sync'),
    'StartingSyncT': ('ok', 'Starting sync'),
    'VerifyS': ('ok', 'Verifying'),
    'VerifyT': ('ok', 'Verifying'),
}

# Flags switched on by --all.  check_promotion is deliberately left out:
# diskless clients can't promote, so it must be requested explicitly.
_ALL_FLAGS = (
    'check_role', 'check_disk', 'check_connection', 'check_replication',
    'check_peer_disk', 'check_suspended', 'check_quorum', 'check_congestion',
    'check_client', 'check_resync_suspended', 'performance_data',
)

# If none of these is set, the basic checks are enabled.
_SELECTABLE_FLAGS = (
    'check_role', 'check_disk', 'check_connection', 'check_replication',
    'check_peer_disk', 'check_suspended', 'check_promotion', 'check_quorum',
    'check_congestion', 'check_client', 'check_resync_suspended',
)
_BASIC_FLAGS = (
    'check_role', 'check_disk', 'check_connection', 'check_replication',
)


def _suspended_suffix(value: str) -> str:
    """Return a ' (reason)' suffix for a suspended value other than 'yes'."""
    return "" if value == 'yes' else f" ({value})"


class DRBDMonitor:
    """Grade the output of ``drbdsetup events2`` as a Nagios check result.

    The check methods append human-readable messages to ``criticals``,
    ``warnings`` and ``ok_messages``; ``get_final_status`` turns those lists
    into the plugin output line and exit code.
    """

    def __init__(self):
        self.resources = {}
        self.perfdata = []
        self.warnings = []
        self.criticals = []
        self.ok_messages = []

    def execute_command(self, cmd: List[str],
                        timeout: int = 30) -> Tuple[int, str, str]:
        """Execute system command and return returncode, stdout, stderr.

        Args:
            cmd: Argument vector; it is run without a shell.
            timeout: Seconds to wait before giving up.

        Returns:
            ``(returncode, stdout, stderr)``.  A timeout is reported as
            ``(124, "", "Command timeout")`` and any failure to run the
            command as ``(1, "", <error text>)``; this method does not raise.
        """
        try:
            # subprocess.run() kills and reaps the child when the timeout
            # expires, so a hung command does not leave a process behind.
            proc = subprocess.run(
                cmd,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                universal_newlines=True,
                timeout=timeout,
            )
        except subprocess.TimeoutExpired:
            return 124, "", "Command timeout"
        except Exception as e:
            # Deliberately broad: callers rely on a return code, not an
            # exception, for a missing or unrunnable command.
            return 1, "", str(e)
        return proc.returncode, proc.stdout, proc.stderr

    def parse_events2_output(self, output: str) -> Dict:
        """Parse drbdsetup events2 --now --statistics output.

        Each line is split on whitespace; the second token selects the object
        type and every later ``key:value`` token becomes a property.

        Returns:
            A dict with the keys ``resources``, ``connections``, ``devices``
            and ``peer_devices``, each mapping an identifier to the property
            dict of one object:

            * resources: ``name``
            * connections: ``name:peer-node-id`` (``name`` if no peer id)
            * devices: ``name:volume``
            * peer_devices: ``name:peer-node-id:volume``
              (``name:volume`` if no peer id)

            The peer node id is part of the key so that several peers of the
            same resource do not overwrite each other.
        """
        data = {
            'resources': {},
            'connections': {},
            'devices': {},
            'peer_devices': {},
        }

        for line in output.strip().split('\n'):
            if not line or line.startswith('#'):
                continue

            parts = line.split()
            if len(parts) < 2:
                continue

            # parts[0] is the event type (exists, create, change, destroy);
            # it is not needed for a one-shot --now snapshot.
            object_type = parts[1]
            props = dict(
                part.split(':', 1) for part in parts[2:] if ':' in part
            )

            name = props.get('name', 'unknown')
            peer = props.get('peer-node-id')
            owner = name if peer is None else f"{name}:{peer}"

            if object_type == 'resource':
                data['resources'][name] = props
            elif object_type == 'connection':
                data['connections'][owner] = props
            elif object_type == 'device':
                volume = props.get('volume', '0')
                data['devices'][f"{name}:{volume}"] = props
            elif object_type == 'peer-device':
                volume = props.get('volume', '0')
                data['peer_devices'][f"{owner}:{volume}"] = props

        return data

    def check_resource_status(self, resource_data: Dict, args):
        """Check resource role, suspended state and promotion capability.

        Args:
            resource_data: Mapping of resource name to its properties.
            args: Parsed options; reads ``check_role``, ``require_primary``,
                ``check_suspended``, ``check_promotion`` and
                ``require_promotion_capability``.
        """
        for res_name, props in resource_data.items():
            role = props.get('role', 'Unknown')
            suspended = props.get('suspended', 'no')
            may_promote = props.get('may_promote', 'no')

            if args.check_role:
                if role not in ('Primary', 'Secondary'):
                    self.criticals.append(
                        f"Resource {res_name}: Invalid role {role}")
                elif role == 'Secondary' and args.require_primary:
                    self.warnings.append(
                        f"Resource {res_name}: Role is Secondary, "
                        f"expected Primary")
                else:
                    self.ok_messages.append(
                        f"Resource {res_name}: Role={role}")

            # Anything other than 'no' counts as suspended.
            # NOTE(review): newer drbd-utils appear to print a reason list
            # (e.g. 'user', 'quorum') instead of 'yes' -- confirm against the
            # drbdsetup documentation for your version.
            if args.check_suspended and suspended != 'no':
                self.criticals.append(
                    f"Resource {res_name}: SUSPENDED"
                    f"{_suspended_suffix(suspended)}")

            # Only warn for Secondary resources, and only when explicitly
            # required: for diskless clients this is normal behavior.
            if (args.check_promotion and args.require_promotion_capability
                    and may_promote == 'no' and role == 'Secondary'):
                self.warnings.append(
                    f"Resource {res_name}: Cannot be promoted")

    def check_connection_status(self, connection_data: Dict, args):
        """Check connection state between peers.

        Args:
            connection_data: Mapping of connection identifier to properties;
                the identifier is used as the label in messages.
            args: Parsed options; reads ``check_connection``,
                ``check_congestion`` and ``ignore_transient_congestion``.
        """
        for conn_name, props in connection_data.items():
            connection = props.get('connection', 'Unknown')
            congested = props.get('congested', 'no')
            peer_node_id = props.get('peer-node-id', 'unknown')

            if args.check_connection:
                if connection in _CONNECTION_OK:
                    self.ok_messages.append(
                        f"Connection {conn_name} to peer-{peer_node_id}: "
                        f"{connection}")
                elif connection in _CONNECTION_CRITICAL:
                    self.criticals.append(
                        f"Connection {conn_name}: State={connection}")
                elif connection in _CONNECTION_WARNING:
                    self.warnings.append(
                        f"Connection {conn_name}: State={connection}")
                else:
                    self.criticals.append(
                        f"Connection {conn_name}: "
                        f"Unknown state {connection}")

            # Congestion can be temporary, so it is a warning at most.
            if (args.check_congestion and congested == 'yes'
                    and not args.ignore_transient_congestion):
                self.warnings.append(f"Connection {conn_name}: CONGESTED")

    def check_device_status(self, device_data: Dict, args):
        """Check device/volume disk state, quorum and client mode.

        Args:
            device_data: Mapping of device identifier to its properties.
            args: Parsed options; reads ``check_disk``, ``check_quorum`` and
                ``check_client``.
        """
        for props in device_data.values():
            dev_name = props.get('name', 'unknown')
            volume = props.get('volume', '0')
            disk = props.get('disk', 'Unknown')
            minor = props.get('minor', 'unknown')
            client = props.get('client', 'no')
            quorum = props.get('quorum', 'yes')
            label = f"Device {dev_name} vol:{volume}"

            if args.check_disk:
                if disk in _DISK_OK:
                    self.ok_messages.append(
                        f"{label} (minor:{minor}): Disk={disk}")
                elif disk in _DISK_CRITICAL:
                    self.criticals.append(f"{label}: Disk={disk}")
                else:
                    # Attaching, Negotiating and any unrecognized state.
                    self.warnings.append(f"{label}: Disk={disk}")

            # Missing quorum is not reported for diskless clients.
            if args.check_quorum and quorum == 'no' and disk != 'Diskless':
                self.criticals.append(f"{label}: NO QUORUM")

            if args.check_client and client == 'yes':
                self.ok_messages.append(f"{label}: Running in client mode")

    def check_peer_device_status(self, peer_device_data: Dict, args):
        """Check peer device replication, peer disk and resync state.

        Args:
            peer_device_data: Mapping of peer-device identifier to properties.
            args: Parsed options; reads ``check_replication``,
                ``warn_on_sync``, ``check_peer_disk`` and
                ``check_resync_suspended``.
        """
        for props in peer_device_data.values():
            peer_name = props.get('name', 'unknown')
            volume = props.get('volume', '0')
            replication = props.get('replication', 'Unknown')
            peer_disk = props.get('peer-disk', 'Unknown')
            resync_suspended = props.get('resync-suspended', 'no')
            peer_node_id = props.get('peer-node-id', 'unknown')
            label = f"Peer {peer_name} vol:{volume}"

            if args.check_replication:
                if replication in _REPLICATION_OK:
                    self.ok_messages.append(
                        f"Peer {peer_name} (node-{peer_node_id}) "
                        f"vol:{volume}: {replication}")
                elif replication in _REPLICATION_SYNCING:
                    # A running sync is OK unless --warn-on-sync was given.
                    bucket = (self.warnings if args.warn_on_sync
                              else self.ok_messages)
                    bucket.append(f"{label}: Syncing ({replication})")
                elif replication in _REPLICATION_NOTES:
                    severity, text = _REPLICATION_NOTES[replication]
                    bucket = (self.warnings if severity == 'warning'
                              else self.ok_messages)
                    bucket.append(f"{label}: {text} ({replication})")
                else:
                    self.criticals.append(
                        f"{label}: Replication={replication}")

            if args.check_peer_disk and peer_disk not in _PEER_DISK_OK:
                if peer_disk in _PEER_DISK_CRITICAL:
                    self.criticals.append(f"{label}: Peer-Disk={peer_disk}")
                else:
                    self.warnings.append(f"{label}: Peer-Disk={peer_disk}")

            # Anything other than 'no' counts as suspended.
            # NOTE(review): the value may be a reason list rather than 'yes'
            # on newer drbd-utils -- confirm for your version.
            if args.check_resync_suspended and resync_suspended != 'no':
                self.warnings.append(
                    f"{label}: Resync SUSPENDED"
                    f"{_suspended_suffix(resync_suspended)}")

    def get_statistics(self, resource: str = 'all') -> Dict:
        """Get DRBD statistics from drbdsetup events2 --statistics.

        Returns:
            A flat dict of every ``key:value`` token found after the first
            two tokens of each line; values that parse as ``int`` are
            converted.  Keys repeated on later lines overwrite earlier ones.
            An empty dict is returned when the command fails.
        """
        cmd = ['drbdsetup', 'events2', '--now', '--statistics', resource]
        rc, stdout, _ = self.execute_command(cmd)
        if rc != 0:
            return {}

        stats = {}
        for line in stdout.strip().split('\n'):
            parts = line.split()
            if len(parts) < 2:
                continue
            for part in parts[2:]:
                if ':' not in part:
                    continue
                key, value = part.split(':', 1)
                try:
                    stats[key] = int(value)
                except ValueError:
                    stats[key] = value
        return stats

    def add_performance_data(self, device_data: Dict, peer_device_data: Dict):
        """Add performance data for Nagios.

        Appends ``resources`` (distinct resource names among the devices),
        ``devices``, ``uptodate_devices``, ``established_replications`` and
        ``syncing_replications`` to ``self.perfdata``.
        """
        # One resource may have several volumes, so count distinct names
        # instead of device entries.
        resource_names = {
            props.get('name', key) for key, props in device_data.items()
        }
        self.perfdata.append(f"resources={len(resource_names)}")
        self.perfdata.append(f"devices={len(device_data)}")

        uptodate = sum(
            1 for d in device_data.values() if d.get('disk') == 'UpToDate')
        self.perfdata.append(f"uptodate_devices={uptodate}")

        established = sum(
            1 for p in peer_device_data.values()
            if p.get('replication') == 'Established')
        syncing = sum(
            1 for p in peer_device_data.values()
            if p.get('replication') in _REPLICATION_SYNCING)
        self.perfdata.append(f"established_replications={established}")
        self.perfdata.append(f"syncing_replications={syncing}")

    @staticmethod
    def _count_linstor_resources(linstor_output: List) -> Tuple[int, int]:
        """Count resources and volumes in parsed LINSTOR JSON output.

        Two layouts are tried: a list of response objects that carry a
        ``resources`` list, and, if that yields nothing, a list of resource
        objects that carry a ``name`` key.
        """
        def volume_count(res: Dict) -> int:
            volumes = res.get('vlms', []) or res.get('volumes', [])
            return len(volumes) if isinstance(volumes, list) else 0

        resource_count = 0
        total_volumes = 0

        # Format 1: array of response objects with a 'resources' key.
        for item in linstor_output:
            if not isinstance(item, dict):
                continue
            resources = item.get('resources', [])
            if not isinstance(resources, list):
                continue
            for res in resources:
                if isinstance(res, dict):
                    resource_count += 1
                    total_volumes += volume_count(res)

        # Format 2: direct array of resources (older format).
        if resource_count == 0:
            for item in linstor_output:
                if isinstance(item, dict) and 'name' in item:
                    resource_count += 1
                    total_volumes += volume_count(item)

        return resource_count, total_volumes

    def check_linstor_status(self, args):
        """Check LINSTOR specific status if available.

        Does nothing unless ``args.check_linstor`` is set.  Problems are only
        reported when ``args.verbose`` is set.

        NOTE(review): because the warnings are gated on ``verbose``, that
        flag can change the exit status -- confirm this is intended.
        """
        if not args.check_linstor:
            return

        if shutil.which('linstor') is None:
            if args.verbose:
                self.ok_messages.append(
                    "LINSTOR: Command not available (optional)")
            return

        cmd = ['linstor', '--machine-readable', 'resource', 'list']
        rc, stdout, stderr = self.execute_command(cmd)
        if rc != 0:
            if args.verbose:
                self.warnings.append(
                    f"LINSTOR: Failed to get resource list: {stderr}")
            return

        try:
            linstor_output = json.loads(stdout)

            if not isinstance(linstor_output, list) or not linstor_output:
                if args.verbose:
                    self.ok_messages.append("LINSTOR: No data returned")
                return

            resource_count, volume_count = self._count_linstor_resources(
                linstor_output)

            if resource_count > 0:
                self.ok_messages.append(
                    f"LINSTOR: {resource_count} resources, "
                    f"{volume_count} volumes"
                )
                if args.performance_data:
                    self.perfdata.append(
                        f"linstor_resources={resource_count}")
                    self.perfdata.append(
                        f"linstor_volumes={volume_count}")
            elif args.verbose:
                self.ok_messages.append("LINSTOR: No resources found")

        except json.JSONDecodeError as e:
            if args.verbose:
                self.warnings.append(
                    f"LINSTOR: JSON parse error: {str(e)[:50]}")
        except Exception as e:
            # Best effort: the LINSTOR check must not abort the DRBD checks.
            if args.verbose:
                self.warnings.append(
                    f"LINSTOR: Processing error: {str(e)[:50]}")

    def run_checks(self, args):
        """Main check execution.

        Runs ``drbdsetup events2 --now --statistics``, grades the parsed
        objects, and returns the exit code from ``get_final_status``.  If the
        command fails or prints nothing, a CRITICAL line is printed and the
        process exits with ``STATE_CRITICAL``.
        """
        resource_filter = args.resource if args.resource else 'all'
        cmd = ['drbdsetup', 'events2', '--now', '--statistics',
               resource_filter]
        rc, stdout, stderr = self.execute_command(cmd)

        if rc != 0:
            print(f"CRITICAL - Failed to execute drbdsetup: {stderr}")
            sys.exit(STATE_CRITICAL)

        if not stdout.strip():
            print("CRITICAL - No DRBD resources found")
            sys.exit(STATE_CRITICAL)

        data = self.parse_events2_output(stdout)

        if data['resources']:
            self.check_resource_status(data['resources'], args)
        if data['connections']:
            self.check_connection_status(data['connections'], args)
        if data['devices']:
            self.check_device_status(data['devices'], args)
        if data['peer_devices']:
            self.check_peer_device_status(data['peer_devices'], args)

        if args.performance_data:
            self.add_performance_data(data['devices'], data['peer_devices'])

        self.check_linstor_status(args)

        return self.get_final_status(args)

    def get_final_status(self, args) -> int:
        """Determine final Nagios status and output.

        Prints ``<STATUS> - <messages>[ | <perfdata>]`` and returns the
        matching exit code.  With ``args.verbose`` the lower-severity
        messages are appended after the ones that set the status.  The
        instance message lists are not modified.
        """
        if self.criticals:
            status, status_text = STATE_CRITICAL, "CRITICAL"
            # Copy, so that extending the output list does not change
            # self.criticals.
            messages = list(self.criticals)
            if args.verbose:
                messages += self.warnings
                messages += self.ok_messages
        elif self.warnings:
            status, status_text = STATE_WARNING, "WARNING"
            messages = list(self.warnings)
            if args.verbose:
                messages += self.ok_messages
        else:
            status, status_text = STATE_OK, "OK"
            messages = list(self.ok_messages) if args.verbose else []
            if not messages:
                messages = ["All DRBD checks passed"]

        output = f"{status_text} - {'; '.join(messages)}"
        if self.perfdata:
            output += " | " + " ".join(self.perfdata)

        print(output)
        return status


def main():
    """Parse the command line, run the checks and exit with a Nagios code."""
    parser = argparse.ArgumentParser(
        description='Comprehensive DRBD/LINSTOR Nagios monitoring plugin',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Check all parameters for all resources
  %(prog)s --all

  # Check specific resource
  %(prog)s --resource r0 --all

  # Check only connection and replication status
  %(prog)s --check-connection --check-replication

  # Check with performance data and verbose output
  %(prog)s --all --performance-data --verbose

  # Check LINSTOR status as well
  %(prog)s --all --check-linstor

  # Ignore transient congestion warnings
  %(prog)s --all --ignore-transient-congestion
        """
    )

    # Resource selection
    parser.add_argument('-r', '--resource',
                        help='DRBD resource name to check (default: all)')

    # Check options
    parser.add_argument('--all', action='store_true',
                        help='Enable all checks (recommended)')
    parser.add_argument('--check-role', action='store_true',
                        help='Check resource role (Primary/Secondary)')
    parser.add_argument('--check-disk', action='store_true',
                        help='Check disk state (UpToDate/Inconsistent/etc)')
    parser.add_argument('--check-connection', action='store_true',
                        help='Check connection state between nodes')
    parser.add_argument('--check-replication', action='store_true',
                        help='Check replication state '
                             '(Established/SyncSource/etc)')
    parser.add_argument('--check-peer-disk', action='store_true',
                        help='Check peer disk state')
    parser.add_argument('--check-suspended', action='store_true',
                        help='Check if resource is suspended')
    parser.add_argument('--check-promotion', action='store_true',
                        help='Check if resource may be promoted '
                             '(disabled by default for diskless clients)')
    parser.add_argument('--require-promotion-capability', action='store_true',
                        help='Warn if Secondary resources cannot be promoted '
                             '(usually OK for diskless clients)')
    parser.add_argument('--check-quorum', action='store_true',
                        help='Check quorum status')
    parser.add_argument('--check-congestion', action='store_true',
                        help='Check network congestion')
    parser.add_argument('--ignore-transient-congestion', action='store_true',
                        help='Ignore transient congestion warnings '
                             '(recommended for busy networks)')
    parser.add_argument('--check-client', action='store_true',
                        help='Check if running in client mode')
    parser.add_argument('--check-resync-suspended', action='store_true',
                        help='Check if resync is suspended')
    parser.add_argument('--check-linstor', action='store_true',
                        help='Check LINSTOR status (requires linstor command)')

    # Behavior options
    parser.add_argument('--require-primary', action='store_true',
                        help='Warn if resource is not Primary')
    parser.add_argument('--warn-on-sync', action='store_true',
                        help='Warn when synchronization is in progress '
                             '(default: OK)')
    parser.add_argument('--performance-data', action='store_true',
                        help='Include performance data for graphing')
    parser.add_argument('-v', '--verbose', action='store_true',
                        help='Verbose output (show all status messages)')
    parser.add_argument('--version', action='version',
                        version='%(prog)s 1.2')

    args = parser.parse_args()

    # --all enables every check except the promotion check, plus perfdata.
    if args.all:
        for flag in _ALL_FLAGS:
            setattr(args, flag, True)

    # If no checks specified, enable basic checks
    if not any(getattr(args, flag) for flag in _SELECTABLE_FLAGS):
        for flag in _BASIC_FLAGS:
            setattr(args, flag, True)

    monitor = DRBDMonitor()
    try:
        status = monitor.run_checks(args)
        sys.exit(status)
    except KeyboardInterrupt:
        print("UNKNOWN - Check interrupted")
        sys.exit(STATE_UNKNOWN)
    except Exception as e:
        print(f"UNKNOWN - Unexpected error: {e}")
        sys.exit(STATE_UNKNOWN)


if __name__ == '__main__':
    main()