## nimpak/diagnostics/health_monitor.nim
## System health monitoring and diagnostics framework
##
## This module implements Task 15.2:
## - System health check framework
## - Filesystem integrity monitoring
## - Package consistency verification
## - Automated repair and recovery systems
## - Performance monitoring and optimization

import std/[os, times, json, tables, sequtils, strutils, strformat, asyncdispatch]
import ../security/[integrity_monitor, event_logger]

type
  HealthCheckCategory* = enum
    ## Area of the system a health check belongs to.
    CategoryPackages = "packages"
    CategoryFilesystem = "filesystem"
    CategoryCache = "cache"
    CategoryRepositories = "repositories"
    CategorySecurity = "security"
    CategoryPerformance = "performance"

  HealthStatus* = enum
    ## Outcome of a single check, or of a whole report.
    StatusHealthy = "healthy"
    StatusWarning = "warning"
    StatusCritical = "critical"
    StatusUnknown = "unknown"

  HealthCheck* = object
    ## Result record of one health check run.
    id*: string
    category*: HealthCheckCategory
    name*: string
    description*: string
    status*: HealthStatus
    message*: string
    details*: JsonNode
    lastRun*: times.DateTime
    duration*: float             ## difference of two `cpuTime()` readings
    repairActions*: seq[string]  ## suggested commands, most relevant first

  HealthReport* = object
    ## Aggregated result of `runAllHealthChecks`.
    timestamp*: times.DateTime
    overallStatus*: HealthStatus
    checks*: seq[HealthCheck]
    systemInfo*: JsonNode
    recommendations*: seq[string]

  HealthMonitor* = object
    checks*: Table[string, HealthCheck]
    config*: HealthMonitorConfig
    lastFullScan*: times.DateTime

  HealthMonitorConfig* = object
    enabledCategories*: set[HealthCheckCategory]
    scanIntervalSeconds*: int
    autoRepair*: bool
    alertThresholds*: Table[HealthCheckCategory, HealthStatus]
    performanceBaselines*: JsonNode

const
  bytesPerMiB = 1024'i64 * 1024
  bytesPerGiB = bytesPerMiB * 1024
  # Combined size of the monitored directories above which a warning is raised.
  diskUsageWarnBytes = 10'i64 * bytesPerGiB

# =============================================================================
# Health Monitor Initialization
# =============================================================================

proc newHealthMonitor*(config: HealthMonitorConfig): HealthMonitor =
  ## Create a new health monitor with an empty check table and the given
  ## configuration. `lastFullScan` starts at the default `DateTime` value.
  HealthMonitor(
    checks: initTable[string, HealthCheck](),
    config: config,
    lastFullScan: default(times.DateTime)
  )

proc getDefaultHealthMonitorConfig*(): HealthMonitorConfig =
  ## Get the default health monitor configuration.
  ##
  ## Every category except `CategoryPerformance` is enabled, auto-repair is
  ## off, and the performance baselines carry the thresholds read by the
  ## individual checks.
  HealthMonitorConfig(
    enabledCategories: {CategoryPackages, CategoryFilesystem, CategoryCache,
                        CategoryRepositories, CategorySecurity},
    scanIntervalSeconds: 3600, # 1 hour
    autoRepair: false, # Conservative default
    alertThresholds: {
      CategoryPackages: StatusWarning,
      CategoryFilesystem: StatusCritical,
      CategoryCache: StatusWarning,
      CategoryRepositories: StatusWarning,
      CategorySecurity: StatusCritical
    }.toTable,
    performanceBaselines: %*{
      "package_install_time_ms": 5000,
      "cache_hit_rate_min": 0.8,
      "repository_latency_max_ms": 2000,
      "disk_usage_max_percent": 85
    }
  )

# Forward declarations
proc getDirSize*(path: string): int64
proc formatHealthReport*(report: HealthReport, format: string = "plain"): string

# =============================================================================
# Internal helpers
# =============================================================================

proc initHealthCheck(id: string, category: HealthCheckCategory,
                     name, description: string): HealthCheck =
  ## Build a check record in its initial state: unknown status, empty
  ## message and details, `lastRun` set to the current time.
  HealthCheck(
    id: id,
    category: category,
    name: name,
    description: description,
    status: StatusUnknown,
    message: "",
    details: newJObject(),
    lastRun: now(),
    duration: 0.0,
    repairActions: @[]
  )

proc baselineValue(monitor: HealthMonitor, key: string,
                   default: float): float =
  ## Read a numeric performance baseline from the configuration.
  ##
  ## Uses the nil-safe `{}` accessor so that a missing key (or an unset
  ## `performanceBaselines` node) yields `default` instead of raising.
  monitor.config.performanceBaselines{key}.getFloat(default)

func statusIcon(status: HealthStatus): string =
  ## Icon used for `status` in the plain-text report.
  case status
  of StatusHealthy: "✅"
  of StatusWarning: "⚠️"
  of StatusCritical: "🚨"
  of StatusUnknown: "❓"

# =============================================================================
# Package Health Checks
# =============================================================================

proc checkPackageIntegrity*(monitor: HealthMonitor): Future[HealthCheck] {.async.} =
  ## Check integrity of all installed packages.
  ##
  ## Delegates to the integrity monitor's `verifyAllPackages` and grades the
  ## result: no failures is healthy, up to three failures is a warning, more
  ## is critical. A catchable error during verification is reported as a
  ## critical check result rather than propagated.
  let startTime = cpuTime()
  var check = initHealthCheck(
    "package_integrity", CategoryPackages, "Package Integrity",
    "Verify checksums and signatures of installed packages")

  try:
    # Use existing integrity monitor
    let integrityConfig = getDefaultIntegrityConfig()
    let integrityMonitor = newIntegrityMonitor(integrityConfig)
    let results = verifyAllPackages(integrityMonitor)

    var passedCount = 0
    var failedPackages: seq[string] = @[]
    for verification in results:
      if verification.success:
        inc passedCount
      else:
        failedPackages.add(verification.packageName)
    let failedCount = failedPackages.len

    check.details = %*{
      "total_packages": results.len,
      "passed": passedCount,
      "failed": failedCount,
      "failed_packages": failedPackages
    }

    if failedCount == 0:
      check.status = StatusHealthy
      check.message = fmt"All {passedCount} packages verified successfully"
    elif failedCount <= 3:
      check.status = StatusWarning
      check.message = fmt"{failedCount} packages failed verification"
      check.repairActions = @["nip repair --integrity", "nip verify --all --fix"]
    else:
      check.status = StatusCritical
      check.message = fmt"{failedCount} packages failed verification - system may be compromised"
      check.repairActions = @["nip repair --integrity --force", "nip doctor --full-scan"]
  except CatchableError as e:
    check.status = StatusCritical
    check.message = fmt"Package integrity check failed: {e.msg}"
    check.details = %*{"error": e.msg}
    check.repairActions = @["nip repair --integrity --force"]

  check.duration = cpuTime() - startTime
  return check

proc checkPackageConsistency*(monitor: HealthMonitor): Future[HealthCheck] {.async.} =
  ## Check consistency of package installations and dependencies.
  ##
  ## Looks for broken symlinks below `/System/Index` and for directories in
  ## `/Programs` that have no entry of the same name in `/System/Index/bin`.
  ## Up to five issues is a warning, more is critical.
  let startTime = cpuTime()
  var check = initHealthCheck(
    "package_consistency", CategoryPackages, "Package Consistency",
    "Verify package dependencies and installation consistency")

  try:
    # Check for broken symlinks in /System/Index
    var brokenLinks: seq[string] = @[]
    var totalLinks = 0

    if dirExists("/System/Index"):
      # The default yield filter only returns regular files, so symlinks must
      # be requested explicitly or they are never inspected.
      for entry in walkDirRec("/System/Index",
                              yieldFilter = {pcFile, pcLinkToFile, pcLinkToDir}):
        if symlinkExists(entry):
          inc totalLinks
          # `fileExists`/`dirExists` follow the link; a link whose target is
          # neither a file nor a directory is dangling.
          if not fileExists(entry) and not dirExists(entry):
            brokenLinks.add(entry)

    # Check for orphaned packages (packages without index entries)
    var orphanedPackages: seq[string] = @[]
    if dirExists("/Programs"):
      for packageDir in walkDirs("/Programs/*"):
        let packageName = extractFilename(packageDir)
        let indexPath = "/System/Index/bin" / packageName
        if not fileExists(indexPath) and not symlinkExists(indexPath):
          orphanedPackages.add(packageName)

    check.details = %*{
      "total_symlinks": totalLinks,
      "broken_symlinks": brokenLinks.len,
      "broken_symlink_paths": brokenLinks,
      "orphaned_packages": orphanedPackages.len,
      "orphaned_package_names": orphanedPackages
    }

    let totalIssues = brokenLinks.len + orphanedPackages.len
    if totalIssues == 0:
      check.status = StatusHealthy
      check.message = fmt"Package consistency verified: {totalLinks} symlinks, no issues"
    elif totalIssues <= 5:
      check.status = StatusWarning
      check.message = fmt"{totalIssues} consistency issues found"
      check.repairActions = @["nip repair --consistency", "nip index rebuild"]
    else:
      check.status = StatusCritical
      check.message = fmt"{totalIssues} consistency issues - index may be corrupted"
      check.repairActions = @["nip repair --consistency --force", "nip index rebuild --full"]
  except CatchableError as e:
    check.status = StatusCritical
    check.message = fmt"Package consistency check failed: {e.msg}"
    check.details = %*{"error": e.msg}
    check.repairActions = @["nip repair --consistency --force"]

  check.duration = cpuTime() - startTime
  return check

# =============================================================================
# Filesystem Health Checks
# =============================================================================

proc checkFilesystemHealth*(monitor: HealthMonitor): Future[HealthCheck] {.async.} =
  ## Check filesystem health and disk usage.
  ##
  ## Sums the sizes of `/Programs`, the CAS cache in the user's home and
  ## `/System`, and verifies that the critical directories exist. Missing
  ## directories are critical; a combined size above 10 GB is a warning.
  let startTime = cpuTime()
  var check = initHealthCheck(
    "filesystem_health", CategoryFilesystem, "Filesystem Health",
    "Monitor disk usage and filesystem integrity")

  try:
    # Check disk usage for key directories. `getDirSize` returns 0 for a
    # missing directory. The tilde has to be expanded before the path is
    # used, otherwise the cache directory is never found.
    let programsSize = getDirSize("/Programs")
    let cacheSize = getDirSize(expandTilde("~/.nip/cas"))
    let systemSize = getDirSize("/System")

    # Get filesystem stats (simplified)
    let totalSize = programsSize + cacheSize + systemSize
    # NOTE(review): this baseline is reported in the details but does not
    # take part in the status decision below.
    let maxUsagePercent = monitor.baselineValue("disk_usage_max_percent", 85.0)

    check.details = %*{
      "programs_size_mb": programsSize div bytesPerMiB,
      "cache_size_mb": cacheSize div bytesPerMiB,
      "system_size_mb": systemSize div bytesPerMiB,
      "total_size_mb": totalSize div bytesPerMiB,
      "max_usage_percent": maxUsagePercent
    }

    # Check for critical directories
    let criticalDirs = ["/Programs", "/System/Index", "/System/Generations"]
    var missingDirs: seq[string] = @[]
    for dir in criticalDirs:
      if not dirExists(dir):
        missingDirs.add(dir)

    if missingDirs.len > 0:
      let missingDirsStr = missingDirs.join(", ")
      check.status = StatusCritical
      check.message = fmt"Critical directories missing: {missingDirsStr}"
      check.repairActions = @["nip repair --filesystem", "nip init --restore-structure"]
    elif totalSize > diskUsageWarnBytes: # > 10GB
      check.status = StatusWarning
      check.message = fmt"High disk usage: {totalSize div bytesPerGiB} GB"
      check.repairActions = @["nip cache clean", "nip gc --aggressive"]
    else:
      check.status = StatusHealthy
      check.message = fmt"Filesystem healthy: {totalSize div bytesPerMiB} MB used"
  except CatchableError as e:
    check.status = StatusCritical
    check.message = fmt"Filesystem check failed: {e.msg}"
    check.details = %*{"error": e.msg}
    check.repairActions = @["nip repair --filesystem --force"]

  check.duration = cpuTime() - startTime
  return check

# =============================================================================
# Cache Health Checks
# =============================================================================

proc checkCacheHealth*(monitor: HealthMonitor): Future[HealthCheck] {.async.} =
  ## Check cache performance and integrity.
  ##
  ## The statistics are currently hard-coded placeholder values; they are
  ## compared against the `cache_hit_rate_min` baseline and a fixed
  ## fragmentation limit of 0.3.
  let startTime = cpuTime()
  var check = initHealthCheck(
    "cache_health", CategoryCache, "Cache Health",
    "Monitor cache performance and integrity")

  try:
    # Initialize CAS manager for cache stats (stubbed for now if unused)
    # let casManager = newCasManager("~/.nip/cas", "/var/lib/nip/cas")

    # Simulate cache statistics (would be real in production)
    let cacheStats = %*{
      "object_count": 15420,
      "total_size_mb": 2400,
      "hit_rate": 0.87,
      "compression_ratio": 0.65,
      "fragmentation": 0.12,
      "last_cleanup": "2025-01-08T14:30:00Z"
    }
    check.details = cacheStats

    let hitRate = cacheStats["hit_rate"].getFloat()
    let minHitRate = monitor.baselineValue("cache_hit_rate_min", 0.8)
    let fragmentation = cacheStats["fragmentation"].getFloat()

    if hitRate < minHitRate:
      check.status = StatusWarning
      check.message = fmt"Low cache hit rate: {hitRate:.2f} (target: {minHitRate:.2f})"
      check.repairActions = @["nip cache optimize", "nip cache warm --popular"]
    elif fragmentation > 0.3:
      check.status = StatusWarning
      check.message = fmt"High cache fragmentation: {fragmentation:.2f}"
      check.repairActions = @["nip cache defrag", "nip cache rebuild"]
    else:
      let objectCount = cacheStats["object_count"].getInt()
      check.status = StatusHealthy
      check.message = fmt"Cache healthy: {hitRate:.2f} hit rate, {objectCount} objects"
  except CatchableError as e:
    check.status = StatusCritical
    check.message = fmt"Cache health check failed: {e.msg}"
    check.details = %*{"error": e.msg}
    check.repairActions = @["nip cache repair", "nip cache rebuild --force"]

  check.duration = cpuTime() - startTime
  return check

# =============================================================================
# Repository Health Checks
# =============================================================================

proc checkRepositoryHealth*(monitor: HealthMonitor): Future[HealthCheck] {.async.} =
  ## Check repository connectivity and trust status.
  ##
  ## The repository list is currently hard-coded placeholder data. Any
  ## unreachable repository is critical; more than one slow repository, an
  ## average latency above the `repository_latency_max_ms` baseline, or a
  ## trust score below 0.8 is a warning.
  let startTime = cpuTime()
  var check = initHealthCheck(
    "repository_health", CategoryRepositories, "Repository Health",
    "Monitor repository connectivity and trust status")

  try:
    # Simulate repository health check (would be real in production)
    let repositories = @[
      %*{"name": "official", "url": "https://packages.nexusos.org",
         "status": "healthy", "latency_ms": 45.2, "trust_score": 0.95},
      %*{"name": "community", "url": "https://community.nexusos.org",
         "status": "healthy", "latency_ms": 78.5, "trust_score": 0.82},
      %*{"name": "edge", "url": "https://edge.nexusos.org",
         "status": "slow", "latency_ms": 2100, "trust_score": 0.75}
    ]

    var healthyCount = 0
    var slowCount = 0
    var unreachableCount = 0
    var totalLatency = 0.0
    var lowTrustCount = 0
    let maxLatency = monitor.baselineValue("repository_latency_max_ms", 2000.0)

    for repo in repositories:
      let status = repo["status"].getStr()
      let latency = repo["latency_ms"].getFloat()
      let trustScore = repo["trust_score"].getFloat()

      totalLatency += latency
      case status
      of "healthy": inc healthyCount
      of "slow": inc slowCount
      of "unreachable": inc unreachableCount
      else: discard # unrecognised status strings are not counted

      if trustScore < 0.8:
        inc lowTrustCount

    let avgLatency = totalLatency / repositories.len.float

    check.details = %*{
      "repositories": repositories,
      "healthy_count": healthyCount,
      "slow_count": slowCount,
      "unreachable_count": unreachableCount,
      "average_latency_ms": avgLatency,
      "low_trust_count": lowTrustCount
    }

    if unreachableCount > 0:
      check.status = StatusCritical
      check.message = fmt"{unreachableCount} repositories unreachable"
      check.repairActions = @["nip repo sync --force", "nip mirror failover"]
    elif slowCount > 1 or avgLatency > maxLatency:
      check.status = StatusWarning
      check.message = fmt"{slowCount} slow repositories, avg latency: {avgLatency:.1f}ms"
      check.repairActions = @["nip mirror optimize", "nip repo benchmark"]
    elif lowTrustCount > 0:
      check.status = StatusWarning
      check.message = fmt"{lowTrustCount} repositories with low trust scores"
      check.repairActions = @["nip trust update", "nip repo verify --all"]
    else:
      check.status = StatusHealthy
      check.message = fmt"All {repositories.len} repositories healthy, avg latency: {avgLatency:.1f}ms"
  except CatchableError as e:
    check.status = StatusCritical
    check.message = fmt"Repository health check failed: {e.msg}"
    check.details = %*{"error": e.msg}
    check.repairActions = @["nip repo sync --force"]

  check.duration = cpuTime() - startTime
  return check

# =============================================================================
# Security Health Checks
# =============================================================================

proc checkSecurityHealth*(monitor: HealthMonitor): Future[HealthCheck] {.async.} =
  ## Check security status including keys, signatures, and trust policies.
  ##
  ## The status data is currently hard-coded placeholder values. Any revoked
  ## key or more than five signature failures is critical; more than two
  ## expired keys is a warning.
  let startTime = cpuTime()
  var check = initHealthCheck(
    "security_health", CategorySecurity, "Security Health",
    "Monitor cryptographic keys, signatures, and trust policies")

  try:
    # Simulate security health check (would integrate with actual security systems)
    let securityStatus = %*{
      "active_keys": 12,
      "expired_keys": 1,
      "revoked_keys": 0,
      "trust_policies": 3,
      "signature_failures_24h": 0,
      "last_key_rotation": "2025-01-01T00:00:00Z",
      "crl_last_update": "2025-01-08T12:00:00Z"
    }
    check.details = securityStatus

    let expiredKeys = securityStatus["expired_keys"].getInt()
    let revokedKeys = securityStatus["revoked_keys"].getInt()
    let signatureFailures = securityStatus["signature_failures_24h"].getInt()

    if revokedKeys > 0 or signatureFailures > 5:
      check.status = StatusCritical
      check.message = fmt"Security issues: {revokedKeys} revoked keys, {signatureFailures} signature failures"
      check.repairActions = @["nip security audit", "nip keys rotate --emergency"]
    elif expiredKeys > 2:
      check.status = StatusWarning
      check.message = fmt"{expiredKeys} expired keys need rotation"
      check.repairActions = @["nip keys rotate", "nip trust update"]
    else:
      let activeKeys = securityStatus["active_keys"].getInt()
      check.status = StatusHealthy
      check.message = fmt"Security healthy: {activeKeys} active keys, no critical issues"
  except CatchableError as e:
    check.status = StatusCritical
    check.message = fmt"Security health check failed: {e.msg}"
    check.details = %*{"error": e.msg}
    check.repairActions = @["nip security audit --force"]

  check.duration = cpuTime() - startTime
  return check

# =============================================================================
# Performance Monitoring
# =============================================================================

proc checkPerformanceMetrics*(monitor: HealthMonitor): Future[HealthCheck] {.async.} =
  ## Monitor system performance metrics.
  ##
  ## The metrics are currently hard-coded placeholder values. An average
  ## install time above 1.5x the `package_install_time_ms` baseline, memory
  ## above 500 MB or CPU above 80% is a warning. A failure of the check itself
  ## is also only a warning.
  let startTime = cpuTime()
  var check = initHealthCheck(
    "performance_metrics", CategoryPerformance, "Performance Metrics",
    "Monitor system performance and resource usage")

  try:
    # Simulate performance metrics (would be real system metrics)
    let performanceMetrics = %*{
      "avg_install_time_ms": 3200,
      "avg_sync_time_ms": 1800,
      "memory_usage_mb": 245,
      "cpu_usage_percent": 12.5,
      "io_wait_percent": 3.2,
      "network_latency_ms": 45.2
    }
    check.details = performanceMetrics

    let installTime = performanceMetrics["avg_install_time_ms"].getFloat()
    let maxInstallTime = monitor.baselineValue("package_install_time_ms", 5000.0)
    let memoryUsage = performanceMetrics["memory_usage_mb"].getFloat()
    let cpuUsage = performanceMetrics["cpu_usage_percent"].getFloat()

    if installTime > maxInstallTime * 1.5:
      check.status = StatusWarning
      check.message = fmt"Slow package installs: {installTime:.0f}ms avg (target: {maxInstallTime:.0f}ms)"
      check.repairActions = @["nip cache optimize", "nip performance tune"]
    elif memoryUsage > 500 or cpuUsage > 80:
      check.status = StatusWarning
      check.message = fmt"High resource usage: {memoryUsage:.0f}MB RAM, {cpuUsage:.1f}% CPU"
      check.repairActions = @["nip gc --aggressive", "nip cache clean"]
    else:
      check.status = StatusHealthy
      check.message = fmt"Performance healthy: {installTime:.0f}ms installs, {memoryUsage:.0f}MB RAM"
  except CatchableError as e:
    check.status = StatusWarning
    check.message = fmt"Performance monitoring failed: {e.msg}"
    check.details = %*{"error": e.msg}
    check.repairActions = @["nip performance reset"]

  check.duration = cpuTime() - startTime
  return check

# =============================================================================
# Health Report Generation
# =============================================================================

proc runAllHealthChecks*(monitor: HealthMonitor): Future[HealthReport] {.async.} =
  ## Run all enabled health checks and generate a comprehensive report.
  ##
  ## Checks run sequentially in a fixed category order. The overall status is
  ## critical if any check is critical, otherwise warning if any check is a
  ## warning, otherwise healthy. Each warning/critical check contributes its
  ## first repair action as a recommendation.
  let startTime = now()
  var checks: seq[HealthCheck] = @[]
  let enabled = monitor.config.enabledCategories

  # Run health checks for enabled categories
  if CategoryPackages in enabled:
    checks.add(await monitor.checkPackageIntegrity())
    checks.add(await monitor.checkPackageConsistency())

  if CategoryFilesystem in enabled:
    checks.add(await monitor.checkFilesystemHealth())

  if CategoryCache in enabled:
    checks.add(await monitor.checkCacheHealth())

  if CategoryRepositories in enabled:
    checks.add(await monitor.checkRepositoryHealth())

  if CategorySecurity in enabled:
    checks.add(await monitor.checkSecurityHealth())

  if CategoryPerformance in enabled:
    checks.add(await monitor.checkPerformanceMetrics())

  # Determine overall status
  var overallStatus = StatusHealthy
  for check in checks:
    if check.status == StatusCritical:
      overallStatus = StatusCritical
      break
    elif check.status == StatusWarning:
      overallStatus = StatusWarning

  # Generate recommendations
  var recommendations: seq[string] = @[]
  for check in checks:
    if check.status in [StatusWarning, StatusCritical] and
        check.repairActions.len > 0:
      recommendations.add(fmt"{check.name}: {check.repairActions[0]}")

  # System information
  # NOTE(review): "uptime_hours" holds the time elapsed since this scan
  # started, not the system uptime. The key is kept for compatibility.
  let systemInfo = %*{
    "nimpak_version": "1.0.0-dev",
    "platform": hostOS,
    "architecture": hostCPU,
    "nim_version": NimVersion,
    "uptime_hours": (now() - startTime).inHours,
    "checks_run": checks.len
  }

  return HealthReport(
    timestamp: startTime,
    overallStatus: overallStatus,
    checks: checks,
    systemInfo: systemInfo,
    recommendations: recommendations
  )

# =============================================================================
# Automated Repair System
# =============================================================================

proc performAutomatedRepair*(monitor: HealthMonitor,
                             report: HealthReport): Future[seq[string]] {.async.} =
  ## Perform automated repairs based on a health report.
  ##
  ## Returns one human-readable result line per warning/critical check that
  ## has a repair action. When auto-repair is disabled in the configuration,
  ## a single explanatory line is returned and nothing else happens. Repair
  ## execution is currently simulated; only the first repair action of each
  ## check is considered.
  var repairResults: seq[string] = @[]

  if not monitor.config.autoRepair:
    repairResults.add("Auto-repair disabled - manual intervention required")
    return repairResults

  for check in report.checks:
    if check.status notin [StatusWarning, StatusCritical] or
        check.repairActions.len == 0:
      continue

    let action = check.repairActions[0]
    try:
      # Simulate repair action execution
      case action
      of "nip repair --integrity":
        repairResults.add(fmt"✅ Repaired package integrity issues for {check.name}")
      of "nip cache clean":
        repairResults.add(fmt"✅ Cleaned cache for {check.name}")
      of "nip repo sync --force":
        repairResults.add(fmt"✅ Forced repository sync for {check.name}")
      else:
        repairResults.add(fmt"⚠️ Repair action '{action}' requires manual intervention")

      # Log repair action
      logGlobalSecurityEvent(EventSystemHealthCheck, SeverityInfo, "health-monitor",
                             fmt"Automated repair: {action} for {check.name}")
    except CatchableError as e:
      repairResults.add(fmt"❌ Repair failed for {check.name}: {e.msg}")
      logGlobalSecurityEvent(EventSecurityIncident, SeverityError, "health-monitor",
                             fmt"Repair failed: {action} - {e.msg}")

  return repairResults

# =============================================================================
# Utility Functions
# =============================================================================

proc getDirSize*(path: string): int64 =
  ## Get directory size in bytes (simplified implementation).
  ##
  ## Sums the sizes of the regular files found by a recursive walk of `path`.
  ## Returns 0 when `path` is not a directory. Files whose size cannot be
  ## read are skipped; any other catchable error during the walk makes the
  ## whole result 0.
  if not dirExists(path):
    return 0
  try:
    for file in walkDirRec(path):
      try:
        result += getFileSize(file)
      except OSError:
        # Best effort: skip files that vanished or cannot be stat'ed.
        discard
  except CatchableError:
    result = 0

proc formatHealthReport*(report: HealthReport, format: string = "plain"): string =
  ## Format a health report for display.
  ##
  ## `format` selects the output: `"json"` produces pretty-printed JSON, any
  ## other value produces the plain-text report.
  case format
  of "json":
    let reportJson = %*{
      "timestamp": $report.timestamp,
      "overall_status": $report.overallStatus,
      "system_info": report.systemInfo,
      "checks": report.checks.mapIt(%*{
        "id": it.id,
        "category": $it.category,
        "name": it.name,
        "status": $it.status,
        "message": it.message,
        "details": it.details,
        "duration": it.duration,
        "repair_actions": it.repairActions
      }),
      "recommendations": report.recommendations
    }
    result = reportJson.pretty()
  else: # plain format
    # `&"..."` is used instead of `fmt"..."` wherever the text contains `\n`:
    # `fmt"..."` is a raw literal and would emit a literal backslash-n.
    result = "NimPak System Health Report\n"
    result.add(repeat("=", 35) & "\n\n")

    # Overall status
    let overallIcon = statusIcon(report.overallStatus)
    result.add(&"{overallIcon} Overall Status: {report.overallStatus}\n")
    let timestampStr = report.timestamp.format("yyyy-MM-dd HH:mm:ss")
    result.add(&"📅 Generated: {timestampStr}\n\n")

    # Health checks by category
    let categories = [CategoryPackages, CategoryFilesystem, CategoryCache,
                      CategoryRepositories, CategorySecurity, CategoryPerformance]
    for category in categories:
      let categoryChecks = report.checks.filterIt(it.category == category)
      if categoryChecks.len > 0:
        result.add(&"{category}:\n")
        for check in categoryChecks:
          let icon = statusIcon(check.status)
          result.add(&" {icon} {check.name}: {check.message}\n")
          if check.status in [StatusWarning, StatusCritical] and
              check.repairActions.len > 0:
            result.add(&" 💡 Repair: {check.repairActions[0]}\n")
        result.add("\n")

    # Recommendations
    if report.recommendations.len > 0:
      result.add("Recommendations:\n")
      for rec in report.recommendations:
        result.add(&" • {rec}\n")

# =============================================================================
# Export main functions
# =============================================================================

export HealthCheckCategory, HealthStatus, HealthCheck, HealthReport
export HealthMonitor, HealthMonitorConfig
export newHealthMonitor, getDefaultHealthMonitorConfig
export checkPackageIntegrity, checkPackageConsistency, checkFilesystemHealth
export checkCacheHealth, checkRepositoryHealth, checkSecurityHealth, checkPerformanceMetrics
export runAllHealthChecks, performAutomatedRepair
export getDirSize, formatHealthReport