Files
nagios-plugins/check_drdb_linstor.py
2025-10-31 15:35:04 +01:00

553 lines
23 KiB
Python

#!/usr/bin/env python3
"""
Nagios plugin for comprehensive DRBD/LINSTOR monitoring
Author: @linuxiarz.pl Mateusz Gruszczyński
License: GPL v3
"""
import argparse
import sys
import subprocess
import json
import re
from typing import Dict, List, Tuple
# Nagios exit codes.
# The plugin finishes by passing one of these values to sys.exit().
STATE_OK = 0
STATE_WARNING = 1
STATE_CRITICAL = 2
STATE_UNKNOWN = 3
class DRBDMonitor:
    """Run DRBD/LINSTOR health checks and build a Nagios plugin result.

    Findings are accumulated in three message lists (``criticals``,
    ``warnings``, ``ok_messages``) plus ``perfdata``.  ``run_checks`` drives
    the individual ``check_*`` methods and ``get_final_status`` turns the
    accumulated lists into the single output line and the exit code.
    """

    def __init__(self):
        self.resources = {}
        self.perfdata = []      # "label=value" strings appended after " | "
        self.warnings = []      # messages that yield STATE_WARNING
        self.criticals = []     # messages that yield STATE_CRITICAL
        self.ok_messages = []   # informational messages (shown with --verbose)

    def execute_command(self, cmd: List[str], timeout: float = 30) -> Tuple[int, str, str]:
        """Run *cmd* without a shell and capture its output as text.

        Args:
            cmd: Program and arguments as a list.
            timeout: Seconds to wait before giving up (default 30).

        Returns:
            ``(returncode, stdout, stderr)``.  On timeout the tuple is
            ``(124, "", "Command timeout")``; if the command cannot be
            started (or any other error occurs) it is ``(1, "", <error text>)``.
        """
        try:
            # subprocess.run() kills and reaps the child when the timeout
            # expires, so a hung drbdsetup/linstor call is not left running.
            completed = subprocess.run(
                cmd,
                stdout=subprocess.PIPE,
                stderr=subprocess.PIPE,
                universal_newlines=True,
                timeout=timeout,
            )
        except subprocess.TimeoutExpired:
            return 124, "", "Command timeout"
        except Exception as e:
            return 1, "", str(e)
        return completed.returncode, completed.stdout, completed.stderr

    @staticmethod
    def _peer_key(name: str, props: Dict) -> str:
        """Return a key that is unique per (resource, peer) pair.

        Falls back to the bare resource name when the line carries no
        ``peer-node-id`` property.
        """
        peer_node_id = props.get('peer-node-id')
        if peer_node_id is None:
            return name
        return f"{name}:{peer_node_id}"

    def parse_events2_output(self, output: str) -> Dict:
        """Parse ``drbdsetup events2 --now --statistics`` output.

        Every line is expected to look like
        ``<event> <object-type> key:value key:value ...``.  Lines that are
        empty, start with ``#`` or have fewer than two fields are skipped, as
        are object types other than resource/connection/device/peer-device.

        Returns:
            A dict with the keys ``resources``, ``connections``, ``devices``
            and ``peer_devices``; each maps an object key to that line's
            ``key:value`` properties:

            * resources:    ``<name>``
            * connections:  ``<name>:<peer-node-id>``
            * devices:      ``<name>:<volume>``
            * peer_devices: ``<name>:<peer-node-id>:<volume>``

            The peer-node-id part is omitted when the line does not carry it.
        """
        data = {
            'resources': {},
            'connections': {},
            'devices': {},
            'peer_devices': {}
        }
        for line in output.strip().split('\n'):
            if not line or line.startswith('#'):
                continue
            parts = line.split()
            if len(parts) < 2:
                continue
            # parts[0] is the event type (exists, create, change, destroy);
            # only the object type decides where the properties are stored.
            object_type = parts[1]

            # Collect the key:value pairs that follow the two leading fields.
            props = {}
            for part in parts[2:]:
                if ':' in part:
                    key, value = part.split(':', 1)
                    props[key] = value

            name = props.get('name', 'unknown')
            if object_type == 'resource':
                data['resources'][name] = props
            elif object_type == 'connection':
                # A resource has one connection per peer; keying by the
                # resource name alone would keep only the last peer.
                data['connections'][self._peer_key(name, props)] = props
            elif object_type == 'device':
                volume = props.get('volume', '0')
                data['devices'][f"{name}:{volume}"] = props
            elif object_type == 'peer-device':
                # Same reasoning as for connections: one entry per peer.
                volume = props.get('volume', '0')
                data['peer_devices'][f"{self._peer_key(name, props)}:{volume}"] = props
        return data

    def check_resource_status(self, resource_data: Dict, args):
        """Check role, suspended state and promotability of each resource.

        Args:
            resource_data: The ``resources`` mapping from
                ``parse_events2_output``.
            args: Parsed options; reads ``check_role``, ``require_primary``,
                ``check_suspended``, ``check_promotion`` and
                ``require_promotion_capability``.
        """
        for res_name, props in resource_data.items():
            role = props.get('role', 'Unknown')
            suspended = props.get('suspended', 'no')
            may_promote = props.get('may_promote', 'no')

            if args.check_role:
                if role not in ('Primary', 'Secondary'):
                    self.criticals.append(f"Resource {res_name}: Invalid role {role}")
                elif role == 'Secondary' and args.require_primary:
                    self.warnings.append(f"Resource {res_name}: Role is Secondary, expected Primary")
                else:
                    self.ok_messages.append(f"Resource {res_name}: Role={role}")

            if args.check_suspended and suspended == 'yes':
                self.criticals.append(f"Resource {res_name}: SUSPENDED")

            # Not being promotable is normal for diskless clients, so this
            # only warns for Secondary resources and only when the caller
            # explicitly asked for promotion capability.
            if args.check_promotion and may_promote == 'no' and role == 'Secondary':
                if args.require_promotion_capability:
                    self.warnings.append(f"Resource {res_name}: Cannot be promoted")

    def check_connection_status(self, connection_data: Dict, args):
        """Check the connection state and congestion of every peer connection.

        Args:
            connection_data: The ``connections`` mapping from
                ``parse_events2_output``.
            args: Parsed options; reads ``check_connection``,
                ``check_congestion`` and ``ignore_transient_congestion``.
        """
        critical_states = ('Connecting', 'Timeout', 'BrokenPipe', 'NetworkFailure',
                           'ProtocolError', 'TearDown', 'Unconnected', 'Disconnecting')
        warning_states = ('WFConnection', 'WFReportParams')

        for conn_key, props in connection_data.items():
            # Messages name the resource; fall back to the dict key for
            # callers that pass entries without a 'name' property.
            conn_name = props.get('name', conn_key)
            connection = props.get('connection', 'Unknown')
            congested = props.get('congested', 'no')
            peer_node_id = props.get('peer-node-id', 'unknown')

            if args.check_connection:
                # NOTE(review): StandAlone is reported as OK here; confirm
                # that this is intended for the monitored setup.
                if connection in ('Connected', 'StandAlone'):
                    self.ok_messages.append(
                        f"Connection {conn_name} to peer-{peer_node_id}: {connection}")
                elif connection in critical_states:
                    self.criticals.append(f"Connection {conn_name}: State={connection}")
                elif connection in warning_states:
                    self.warnings.append(f"Connection {conn_name}: State={connection}")
                else:
                    self.criticals.append(f"Connection {conn_name}: Unknown state {connection}")

            # Congestion can be temporary, so it is a warning at most and can
            # be silenced entirely with --ignore-transient-congestion.
            if args.check_congestion and congested == 'yes':
                if not args.ignore_transient_congestion:
                    self.warnings.append(f"Connection {conn_name}: CONGESTED")

    def check_device_status(self, device_data: Dict, args):
        """Check disk state, quorum and client mode of every local device.

        Args:
            device_data: The ``devices`` mapping from
                ``parse_events2_output``.
            args: Parsed options; reads ``check_disk``, ``check_quorum`` and
                ``check_client``.
        """
        critical_disk_states = ('Failed', 'Detaching', 'Inconsistent', 'Outdated', 'DUnknown')

        for props in device_data.values():
            dev_name = props.get('name', 'unknown')
            volume = props.get('volume', '0')
            disk = props.get('disk', 'Unknown')
            minor = props.get('minor', 'unknown')
            client = props.get('client', 'no')
            quorum = props.get('quorum', 'yes')

            if args.check_disk:
                if disk in ('UpToDate', 'Diskless'):
                    self.ok_messages.append(
                        f"Device {dev_name} vol:{volume} (minor:{minor}): Disk={disk}")
                elif disk in critical_disk_states:
                    self.criticals.append(f"Device {dev_name} vol:{volume}: Disk={disk}")
                else:
                    # Attaching, Negotiating and any unrecognised state.
                    self.warnings.append(f"Device {dev_name} vol:{volume}: Disk={disk}")

            # Lost quorum is not reported for diskless devices.
            if args.check_quorum and quorum == 'no' and disk != 'Diskless':
                self.criticals.append(f"Device {dev_name} vol:{volume}: NO QUORUM")

            if args.check_client and client == 'yes':
                self.ok_messages.append(f"Device {dev_name} vol:{volume}: Running in client mode")

    def check_peer_device_status(self, peer_device_data: Dict, args):
        """Check replication state, peer disk state and resync suspension.

        Args:
            peer_device_data: The ``peer_devices`` mapping from
                ``parse_events2_output``.
            args: Parsed options; reads ``check_replication``,
                ``warn_on_sync``, ``check_peer_disk`` and
                ``check_resync_suspended``.
        """
        for props in peer_device_data.values():
            peer_name = props.get('name', 'unknown')
            volume = props.get('volume', '0')
            replication = props.get('replication', 'Unknown')
            peer_disk = props.get('peer-disk', 'Unknown')
            resync_suspended = props.get('resync-suspended', 'no')
            peer_node_id = props.get('peer-node-id', 'unknown')
            prefix = f"Peer {peer_name} vol:{volume}"

            if args.check_replication:
                if replication in ('Established', 'Off'):
                    self.ok_messages.append(
                        f"Peer {peer_name} (node-{peer_node_id}) vol:{volume}: {replication}")
                elif replication in ('SyncSource', 'SyncTarget'):
                    # A running sync is OK unless --warn-on-sync was given.
                    target = self.warnings if args.warn_on_sync else self.ok_messages
                    target.append(f"{prefix}: Syncing ({replication})")
                elif replication in ('PausedSyncS', 'PausedSyncT'):
                    self.warnings.append(f"{prefix}: Sync paused ({replication})")
                elif replication in ('WFBitMapS', 'WFBitMapT', 'WFSyncUUID'):
                    self.warnings.append(f"{prefix}: Waiting for sync ({replication})")
                elif replication in ('StartingSyncS', 'StartingSyncT'):
                    self.ok_messages.append(f"{prefix}: Starting sync ({replication})")
                elif replication in ('VerifyS', 'VerifyT'):
                    self.ok_messages.append(f"{prefix}: Verifying ({replication})")
                else:
                    self.criticals.append(f"{prefix}: Replication={replication}")

            if args.check_peer_disk:
                if peer_disk not in ('UpToDate', 'Diskless', 'DUnknown'):
                    if peer_disk in ('Failed', 'Outdated', 'Inconsistent'):
                        self.criticals.append(f"{prefix}: Peer-Disk={peer_disk}")
                    else:
                        self.warnings.append(f"{prefix}: Peer-Disk={peer_disk}")

            if args.check_resync_suspended and resync_suspended == 'yes':
                self.warnings.append(f"{prefix}: Resync SUSPENDED")

    def get_statistics(self, resource: str = 'all') -> Dict:
        """Return the ``key:value`` fields of ``drbdsetup events2 --statistics``.

        All lines are merged into one flat dict, so a key that occurs on
        several lines keeps the value of the last line.  Values that parse as
        integers are stored as ``int``, everything else as ``str``.

        Returns:
            The merged dict, or ``{}`` when the command fails.
        """
        cmd = ['drbdsetup', 'events2', '--now', '--statistics', resource]
        rc, stdout, stderr = self.execute_command(cmd)
        if rc != 0:
            return {}
        stats = {}
        for line in stdout.strip().split('\n'):
            parts = line.split()
            if len(parts) < 2:
                continue
            for part in parts[2:]:
                if ':' in part:
                    key, value = part.split(':', 1)
                    try:
                        stats[key] = int(value)
                    except ValueError:
                        stats[key] = value
        return stats

    def add_performance_data(self, device_data: Dict, peer_device_data: Dict):
        """Append resource/device/replication counters to ``self.perfdata``.

        Args:
            device_data: The ``devices`` mapping from ``parse_events2_output``.
            peer_device_data: The ``peer_devices`` mapping from
                ``parse_events2_output``.
        """
        # A resource may have several volumes (devices); count each resource
        # name once so that "resources" is not really the device count.
        resource_count = len({d.get('name', 'unknown') for d in device_data.values()})
        self.perfdata.append(f"resources={resource_count}")

        uptodate = sum(1 for d in device_data.values() if d.get('disk') == 'UpToDate')
        self.perfdata.append(f"uptodate_devices={uptodate}")

        established = sum(1 for p in peer_device_data.values()
                          if p.get('replication') == 'Established')
        syncing = sum(1 for p in peer_device_data.values()
                      if p.get('replication') in ('SyncSource', 'SyncTarget'))
        self.perfdata.append(f"established_replications={established}")
        self.perfdata.append(f"syncing_replications={syncing}")

    def check_linstor_status(self, args):
        """Count LINSTOR resources and volumes when ``args.check_linstor`` is set.

        The check is best-effort: a missing ``linstor`` command, a failing
        call or unparsable output is only reported when ``args.verbose`` is
        set.  Also reads ``args.performance_data``.
        """
        if not args.check_linstor:
            return

        # Check if the linstor command is available.
        rc, _, _ = self.execute_command(['which', 'linstor'])
        if rc != 0:
            if args.verbose:
                self.ok_messages.append("LINSTOR: Command not available (optional)")
            return

        cmd = ['linstor', '--machine-readable', 'resource', 'list']
        rc, stdout, stderr = self.execute_command(cmd)
        if rc != 0:
            if args.verbose:
                self.warnings.append(f"LINSTOR: Failed to get resource list: {stderr}")
            return

        try:
            linstor_output = json.loads(stdout)
            # The code expects a non-empty JSON array at the top level.
            if not isinstance(linstor_output, list) or len(linstor_output) == 0:
                if args.verbose:
                    self.ok_messages.append("LINSTOR: No data returned")
                return

            linstor_resource_count = 0
            linstor_volume_count = 0

            # Format 1: array of response objects with a 'resources' key.
            for item in linstor_output:
                if isinstance(item, dict):
                    resources = item.get('resources', [])
                    if resources and isinstance(resources, list):
                        for res in resources:
                            if isinstance(res, dict):
                                linstor_resource_count += 1
                                volumes = res.get('vlms', []) or res.get('volumes', [])
                                if isinstance(volumes, list):
                                    linstor_volume_count += len(volumes)

            # Format 2: direct array of resources (older format).
            if linstor_resource_count == 0:
                for item in linstor_output:
                    if isinstance(item, dict) and 'name' in item:
                        linstor_resource_count += 1
                        volumes = item.get('vlms', []) or item.get('volumes', [])
                        if isinstance(volumes, list):
                            linstor_volume_count += len(volumes)

            if linstor_resource_count > 0:
                self.ok_messages.append(
                    f"LINSTOR: {linstor_resource_count} resources, "
                    f"{linstor_volume_count} volumes"
                )
                if args.performance_data:
                    self.perfdata.append(f"linstor_resources={linstor_resource_count}")
                    self.perfdata.append(f"linstor_volumes={linstor_volume_count}")
            else:
                if args.verbose:
                    self.ok_messages.append("LINSTOR: No resources found")
        except json.JSONDecodeError as e:
            if args.verbose:
                self.warnings.append(f"LINSTOR: JSON parse error: {str(e)[:50]}")
        except Exception as e:
            if args.verbose:
                self.warnings.append(f"LINSTOR: Processing error: {str(e)[:50]}")

    def run_checks(self, args):
        """Query DRBD, run every enabled check and print the result.

        Exits the process with STATE_CRITICAL when ``drbdsetup`` fails or
        reports no DRBD objects.

        Returns:
            The Nagios state computed by ``get_final_status``.
        """
        resource_filter = args.resource if args.resource else 'all'
        cmd = ['drbdsetup', 'events2', '--now', '--statistics', resource_filter]
        rc, stdout, stderr = self.execute_command(cmd)
        if rc != 0:
            print(f"CRITICAL - Failed to execute drbdsetup: {stderr}")
            sys.exit(STATE_CRITICAL)

        data = self.parse_events2_output(stdout)
        # Output may be non-empty yet contain no objects (for example only a
        # terminating "exists -" line), so the parsed data is checked too.
        if not stdout.strip() or not any(data.values()):
            print("CRITICAL - No DRBD resources found")
            sys.exit(STATE_CRITICAL)

        if data['resources']:
            self.check_resource_status(data['resources'], args)
        if data['connections']:
            self.check_connection_status(data['connections'], args)
        if data['devices']:
            self.check_device_status(data['devices'], args)
        if data['peer_devices']:
            self.check_peer_device_status(data['peer_devices'], args)

        if args.performance_data:
            self.add_performance_data(data['devices'], data['peer_devices'])

        # No-op unless --check-linstor was given.
        self.check_linstor_status(args)

        return self.get_final_status(args)

    def get_final_status(self, args) -> int:
        """Print the Nagios status line and return the matching state code.

        The most severe non-empty message list decides the state.  With
        ``args.verbose`` the less severe messages are appended as well.  The
        accumulated lists themselves are left unmodified.
        """
        if self.criticals:
            status = STATE_CRITICAL
            status_text = "CRITICAL"
            # Copy: extending self.criticals in place would mix warnings and
            # OK messages into the stored critical list.
            messages = list(self.criticals)
            if args.verbose:
                messages.extend(self.warnings)
                messages.extend(self.ok_messages)
        elif self.warnings:
            status = STATE_WARNING
            status_text = "WARNING"
            messages = list(self.warnings)
            if args.verbose:
                messages.extend(self.ok_messages)
        else:
            status = STATE_OK
            status_text = "OK"
            if args.verbose and self.ok_messages:
                messages = list(self.ok_messages)
            else:
                # Also used in verbose mode when no check produced a message,
                # so the status line never ends in a bare "OK - ".
                messages = ["All DRBD checks passed"]

        output = f"{status_text} - {'; '.join(messages)}"
        if self.perfdata:
            output += " | " + " ".join(self.perfdata)
        print(output)
        return status
def main():
    """Parse the command line, run the selected checks and exit.

    The process exits with the state returned by ``DRBDMonitor.run_checks``;
    an interrupt or an unexpected exception is reported as UNKNOWN.
    """
    usage_examples = """
Examples:
# Check all parameters for all resources
%(prog)s --all
# Check specific resource
%(prog)s --resource r0 --all
# Check only connection and replication status
%(prog)s --check-connection --check-replication
# Check with performance data and verbose output
%(prog)s --all --performance-data --verbose
# Check LINSTOR status as well
%(prog)s --all --check-linstor
# Ignore transient congestion warnings
%(prog)s --all --ignore-transient-congestion
"""
    parser = argparse.ArgumentParser(
        description='Comprehensive DRBD/LINSTOR Nagios monitoring plugin',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=usage_examples,
    )

    # Resource selection
    parser.add_argument('-r', '--resource',
                        help='DRBD resource name to check (default: all)')

    # All plain on/off switches, in the order they appear in --help:
    # first the check options, then the behavior options.
    switches = (
        ('--all', 'Enable all checks (recommended)'),
        ('--check-role', 'Check resource role (Primary/Secondary)'),
        ('--check-disk', 'Check disk state (UpToDate/Inconsistent/etc)'),
        ('--check-connection', 'Check connection state between nodes'),
        ('--check-replication', 'Check replication state (Established/SyncSource/etc)'),
        ('--check-peer-disk', 'Check peer disk state'),
        ('--check-suspended', 'Check if resource is suspended'),
        ('--check-promotion',
         'Check if resource may be promoted (disabled by default for diskless clients)'),
        ('--require-promotion-capability',
         'Warn if Secondary resources cannot be promoted (usually OK for diskless clients)'),
        ('--check-quorum', 'Check quorum status'),
        ('--check-congestion', 'Check network congestion'),
        ('--ignore-transient-congestion',
         'Ignore transient congestion warnings (recommended for busy networks)'),
        ('--check-client', 'Check if running in client mode'),
        ('--check-resync-suspended', 'Check if resync is suspended'),
        ('--check-linstor', 'Check LINSTOR status (requires linstor command)'),
        ('--require-primary', 'Warn if resource is not Primary'),
        ('--warn-on-sync', 'Warn when synchronization is in progress (default: OK)'),
        ('--performance-data', 'Include performance data for graphing'),
    )
    for flag, description in switches:
        parser.add_argument(flag, action='store_true', help=description)
    parser.add_argument('-v', '--verbose', action='store_true',
                        help='Verbose output (show all status messages)')
    parser.add_argument('--version', action='version', version='%(prog)s 1.2')
    args = parser.parse_args()

    # --all switches on every check except the promotion check (diskless
    # clients cannot be promoted, so it stays opt-in) and also enables
    # performance data.
    if args.all:
        for option in ('check_role', 'check_disk', 'check_connection',
                       'check_replication', 'check_peer_disk', 'check_suspended',
                       'check_quorum', 'check_congestion', 'check_client',
                       'check_resync_suspended', 'performance_data'):
            setattr(args, option, True)

    # With no check selected at all, fall back to the basic set.
    selectable_checks = ('check_role', 'check_disk', 'check_connection',
                         'check_replication', 'check_peer_disk', 'check_suspended',
                         'check_promotion', 'check_quorum', 'check_congestion',
                         'check_client', 'check_resync_suspended')
    nothing_selected = True
    for option in selectable_checks:
        if getattr(args, option):
            nothing_selected = False
            break
    if nothing_selected:
        for option in ('check_role', 'check_disk', 'check_connection', 'check_replication'):
            setattr(args, option, True)

    # Run checks
    monitor = DRBDMonitor()
    try:
        sys.exit(monitor.run_checks(args))
    except KeyboardInterrupt:
        print("UNKNOWN - Check interrupted")
        sys.exit(STATE_UNKNOWN)
    except Exception as e:
        print(f"UNKNOWN - Unexpected error: {e}")
        sys.exit(STATE_UNKNOWN)
# Run the plugin only when executed as a script, not when imported.
if __name__ == "__main__":
    main()