## nimpak/diagnostics/health_monitor.nim
## System health monitoring and diagnostics framework
##
## This module implements Task 15.2:
## - System health check framework
## - Filesystem integrity monitoring
## - Package consistency verification
## - Automated repair and recovery systems
## - Performance monitoring and optimization
|
|
|
|
import std/[os, times, json, tables, sequtils, strutils, strformat, asyncdispatch]
|
|
import ../security/[integrity_monitor, event_logger]
|
|
|
|
type
  HealthCheckCategory* = enum
    ## Area of the system a health check inspects. The string value is what
    ## `$` produces, and therefore what the formatted reports print.
    CategoryPackages = "packages"
    CategoryFilesystem = "filesystem"
    CategoryCache = "cache"
    CategoryRepositories = "repositories"
    CategorySecurity = "security"
    CategoryPerformance = "performance"

  HealthStatus* = enum
    ## Grade assigned to a single check or to a whole report.
    StatusHealthy = "healthy"
    StatusWarning = "warning"
    StatusCritical = "critical"
    StatusUnknown = "unknown"

  HealthCheck* = object
    ## Outcome record of one health check run.
    id*: string                     ## Machine-readable check identifier
    category*: HealthCheckCategory  ## Category the check is grouped under
    name*: string                   ## Human-readable title
    description*: string            ## What the check verifies
    status*: HealthStatus           ## Grade of the latest run
    message*: string                ## One-line summary of the outcome
    details*: JsonNode              ## Structured data behind the summary
    lastRun*: times.DateTime        ## When the check was started
    duration*: float                ## Time the run took, as set by the check
    repairActions*: seq[string]     ## Suggested commands, most relevant first

  HealthReport* = object
    ## Aggregate of all checks executed in one scan.
    timestamp*: times.DateTime      ## When the scan started
    overallStatus*: HealthStatus    ## Worst grade across `checks`
    checks*: seq[HealthCheck]       ## Individual results, in execution order
    systemInfo*: JsonNode           ## Environment/version metadata
    recommendations*: seq[string]   ## One suggested action per failing check

  HealthMonitorConfig* = object
    ## Tunables controlling which checks run and how results are judged.
    enabledCategories*: set[HealthCheckCategory]
      ## Only checks in these categories are executed.
    scanIntervalSeconds*: int
      ## Intended pause between full scans, in seconds.
    autoRepair*: bool
      ## When false, automated repair refuses to act.
    alertThresholds*: Table[HealthCheckCategory, HealthStatus]
      ## Status level associated with each category for alerting.
    performanceBaselines*: JsonNode
      ## JSON object of numeric baselines the checks compare against.

  HealthMonitor* = object
    ## Monitor state: known checks, configuration and last scan time.
    checks*: Table[string, HealthCheck]
    config*: HealthMonitorConfig
    lastFullScan*: times.DateTime
|
|
|
|
# =============================================================================
|
|
# Health Monitor Initialization
|
|
# =============================================================================
|
|
|
|
proc newHealthMonitor*(config: HealthMonitorConfig): HealthMonitor =
  ## Build a monitor around `config`. The monitor starts with an empty
  ## check table and a default-initialized `lastFullScan` timestamp.
  result.checks = initTable[string, HealthCheck]()
  result.config = config
  result.lastFullScan = default(times.DateTime)
|
|
|
|
proc getDefaultHealthMonitorConfig*(): HealthMonitorConfig =
  ## Return the stock configuration: every category except performance is
  ## enabled, scans are hourly, and auto-repair is off.
  result.enabledCategories = {CategoryPackages, CategoryFilesystem,
                              CategoryCache, CategoryRepositories,
                              CategorySecurity}
  result.scanIntervalSeconds = 3600 # 1 hour
  result.autoRepair = false         # Conservative default

  var thresholds = initTable[HealthCheckCategory, HealthStatus]()
  thresholds[CategoryPackages] = StatusWarning
  thresholds[CategoryFilesystem] = StatusCritical
  thresholds[CategoryCache] = StatusWarning
  thresholds[CategoryRepositories] = StatusWarning
  thresholds[CategorySecurity] = StatusCritical
  result.alertThresholds = thresholds

  # Baselines the individual checks read by key.
  result.performanceBaselines = %*{
    "package_install_time_ms": 5000,
    "cache_hit_rate_min": 0.8,
    "repository_latency_max_ms": 2000,
    "disk_usage_max_percent": 85
  }
|
|
|
|
# Forward declarations: both procs are implemented near the end of this
# module. getDirSize is called by checkFilesystemHealth, which appears
# before that implementation, so its signature must be known up front.
proc getDirSize*(path: string): int64
proc formatHealthReport*(report: HealthReport, format: string = "plain"): string
|
|
|
|
# =============================================================================
|
|
# Package Health Checks
|
|
# =============================================================================
|
|
|
|
proc checkPackageIntegrity*(monitor: HealthMonitor): Future[HealthCheck] {.async.} =
  ## Run the integrity monitor over all packages and grade the outcome:
  ## no failures is healthy, one to three is a warning, more is critical.
  ## Any exception raised while verifying is reported as a critical result
  ## instead of propagating. `duration` is the `cpuTime` delta of the run.
  let startedAt = cpuTime()
  # Fields not listed here (message, duration, repairActions) keep their
  # zero defaults: "", 0.0 and an empty seq.
  var check = HealthCheck(
    id: "package_integrity",
    category: CategoryPackages,
    name: "Package Integrity",
    description: "Verify checksums and signatures of installed packages",
    status: StatusUnknown,
    details: newJObject(),
    lastRun: now()
  )

  try:
    # Verification itself is delegated to the existing integrity monitor.
    let integrityConfig = getDefaultIntegrityConfig()
    let verifier = newIntegrityMonitor(integrityConfig)
    let outcomes = verifyAllPackages(verifier)

    var
      passedCount = 0
      failedPackages: seq[string] = @[]
    for outcome in outcomes:
      if outcome.success:
        inc passedCount
      else:
        failedPackages.add(outcome.packageName)
    let failedCount = failedPackages.len

    check.details = %*{
      "total_packages": outcomes.len,
      "passed": passedCount,
      "failed": failedCount,
      "failed_packages": failedPackages
    }

    case failedCount
    of 0:
      check.status = StatusHealthy
      check.message = fmt"All {passedCount} packages verified successfully"
    of 1..3:
      check.status = StatusWarning
      check.message = fmt"{failedCount} packages failed verification"
      check.repairActions = @["nip repair --integrity", "nip verify --all --fix"]
    else:
      check.status = StatusCritical
      check.message = fmt"{failedCount} packages failed verification - system may be compromised"
      check.repairActions = @["nip repair --integrity --force", "nip doctor --full-scan"]

  except Exception as e:
    check.status = StatusCritical
    check.message = fmt"Package integrity check failed: {e.msg}"
    check.details = %*{"error": e.msg}
    check.repairActions = @["nip repair --integrity --force"]

  check.duration = cpuTime() - startedAt
  return check
|
|
|
|
proc checkPackageConsistency*(monitor: HealthMonitor): Future[HealthCheck] {.async.} =
  ## Check that the package index and the installed programs agree.
  ##
  ## Two kinds of issue are counted:
  ## - broken symlinks: entries under /System/Index that are symlinks whose
  ##   target is not an existing file;
  ## - orphaned packages: directories under /Programs with no file or
  ##   symlink of the same name in /System/Index/bin.
  ##
  ## No issues is healthy, up to five is a warning, more is critical. Any
  ## exception raised during the scan is reported as a critical result.
  ## `duration` is the `cpuTime` delta of the run.
  const
    indexRoot = "/System/Index"
    programsRoot = "/Programs"

  let startTime = cpuTime()
  var check = HealthCheck(
    id: "package_consistency",
    category: CategoryPackages,
    name: "Package Consistency",
    description: "Verify package dependencies and installation consistency",
    status: StatusUnknown,
    message: "",
    details: newJObject(),
    lastRun: now(),
    duration: 0.0,
    repairActions: @[]
  )

  try:
    var brokenLinks: seq[string] = @[]
    var totalLinks = 0

    if dirExists(indexRoot):
      # walkDirRec yields only regular files (pcFile) unless told otherwise,
      # so symlinks have to be requested explicitly. Dangling links are
      # classified as pcLinkToFile; without it none could ever be detected.
      for entry in walkDirRec(indexRoot,
                              yieldFilter = {pcFile, pcLinkToFile}):
        inc totalLinks
        if symlinkExists(entry) and not fileExists(entry):
          brokenLinks.add(entry)

    # Orphaned packages: installed programs without an index entry.
    var orphanedPackages: seq[string] = @[]
    if dirExists(programsRoot):
      for packageDir in walkDirs(programsRoot / "*"):
        let packageName = extractFilename(packageDir)
        let indexPath = indexRoot / "bin" / packageName
        if not fileExists(indexPath) and not symlinkExists(indexPath):
          orphanedPackages.add(packageName)

    check.details = %*{
      "total_symlinks": totalLinks,
      "broken_symlinks": brokenLinks.len,
      "broken_symlink_paths": brokenLinks,
      "orphaned_packages": orphanedPackages.len,
      "orphaned_package_names": orphanedPackages
    }

    let totalIssues = brokenLinks.len + orphanedPackages.len
    if totalIssues == 0:
      check.status = StatusHealthy
      check.message = fmt"Package consistency verified: {totalLinks} symlinks, no issues"
    elif totalIssues <= 5:
      check.status = StatusWarning
      check.message = fmt"{totalIssues} consistency issues found"
      check.repairActions = @["nip repair --consistency", "nip index rebuild"]
    else:
      check.status = StatusCritical
      check.message = fmt"{totalIssues} consistency issues - index may be corrupted"
      check.repairActions = @["nip repair --consistency --force", "nip index rebuild --full"]

  except Exception as e:
    check.status = StatusCritical
    check.message = fmt"Package consistency check failed: {e.msg}"
    check.details = %*{"error": e.msg}
    check.repairActions = @["nip repair --consistency --force"]

  check.duration = cpuTime() - startTime
  return check
|
|
|
|
# =============================================================================
|
|
# Filesystem Health Checks
|
|
# =============================================================================
|
|
|
|
proc checkFilesystemHealth*(monitor: HealthMonitor): Future[HealthCheck] {.async.} =
  ## Measure the disk space used by /Programs, /System and the user's CAS
  ## cache (~/.nip/cas), and verify that the critical directories exist.
  ##
  ## A missing critical directory is critical; a combined size above 10 GiB
  ## is a warning; otherwise the result is healthy. The configured
  ## "disk_usage_max_percent" baseline is recorded in `details` but is not
  ## compared against anything here. Any exception raised during the check
  ## is reported as a critical result. `duration` is the `cpuTime` delta.
  const
    bytesPerMiB = 1024'i64 * 1024
    bytesPerGiB = bytesPerMiB * 1024
    highUsageBytes = 10'i64 * bytesPerGiB

  let startTime = cpuTime()
  var check = HealthCheck(
    id: "filesystem_health",
    category: CategoryFilesystem,
    name: "Filesystem Health",
    description: "Monitor disk usage and filesystem integrity",
    status: StatusUnknown,
    message: "",
    details: newJObject(),
    lastRun: now(),
    duration: 0.0,
    repairActions: @[]
  )

  try:
    # dirExists does not expand "~", so the CAS path must be resolved before
    # it is tested; testing the literal "~/.nip/cas" is always false.
    let casDir = expandTilde("~/.nip/cas")

    let programsSize: int64 =
      if dirExists("/Programs"): getDirSize("/Programs") else: 0
    let cacheSize: int64 =
      if dirExists(casDir): getDirSize(casDir) else: 0
    let systemSize: int64 =
      if dirExists("/System"): getDirSize("/System") else: 0

    # Get filesystem stats (simplified)
    let totalSize = programsSize + cacheSize + systemSize
    # `{}` yields nil for a nil config node or a missing key, which lets
    # getFloat fall back to its default instead of raising.
    let maxUsagePercent = monitor.config
      .performanceBaselines{"disk_usage_max_percent"}.getFloat(85.0)

    check.details = %*{
      "programs_size_mb": programsSize div bytesPerMiB,
      "cache_size_mb": cacheSize div bytesPerMiB,
      "system_size_mb": systemSize div bytesPerMiB,
      "total_size_mb": totalSize div bytesPerMiB,
      "max_usage_percent": maxUsagePercent
    }

    # Check for critical directories
    let criticalDirs = ["/Programs", "/System/Index", "/System/Generations"]
    var missingDirs: seq[string] = @[]
    for dir in criticalDirs:
      if not dirExists(dir):
        missingDirs.add(dir)

    if missingDirs.len > 0:
      let missingDirsStr = missingDirs.join(", ")
      check.status = StatusCritical
      check.message = fmt"Critical directories missing: {missingDirsStr}"
      check.repairActions = @["nip repair --filesystem", "nip init --restore-structure"]
    elif totalSize > highUsageBytes: # > 10GB
      check.status = StatusWarning
      check.message = fmt"High disk usage: {totalSize div bytesPerGiB} GB"
      check.repairActions = @["nip cache clean", "nip gc --aggressive"]
    else:
      check.status = StatusHealthy
      check.message = fmt"Filesystem healthy: {totalSize div bytesPerMiB} MB used"

  except Exception as e:
    check.status = StatusCritical
    check.message = fmt"Filesystem check failed: {e.msg}"
    check.details = %*{"error": e.msg}
    check.repairActions = @["nip repair --filesystem --force"]

  check.duration = cpuTime() - startTime
  return check
|
|
|
|
# =============================================================================
|
|
# Cache Health Checks
|
|
# =============================================================================
|
|
|
|
proc checkCacheHealth*(monitor: HealthMonitor): Future[HealthCheck] {.async.} =
  ## Grade the cache from its statistics. A hit rate below the configured
  ## "cache_hit_rate_min" baseline is a warning, as is fragmentation above
  ## 0.3; otherwise the cache is healthy. The statistics are currently
  ## hard-coded sample values rather than live measurements. Any exception
  ## is reported as a critical result. `duration` is the `cpuTime` delta.
  let startedAt = cpuTime()
  # message, duration and repairActions keep their zero defaults.
  var check = HealthCheck(
    id: "cache_health",
    category: CategoryCache,
    name: "Cache Health",
    description: "Monitor cache performance and integrity",
    status: StatusUnknown,
    details: newJObject(),
    lastRun: now()
  )

  try:
    # Initialize CAS manager for cache stats (stubbed for now if unused)
    # let casManager = newCasManager("~/.nip/cas", "/var/lib/nip/cas")

    # Simulate cache statistics (would be real in production)
    let stats = %*{
      "object_count": 15420,
      "total_size_mb": 2400,
      "hit_rate": 0.87,
      "compression_ratio": 0.65,
      "fragmentation": 0.12,
      "last_cleanup": "2025-01-08T14:30:00Z"
    }
    check.details = stats

    let
      hitRate = stats["hit_rate"].getFloat()
      minHitRate = monitor.config.performanceBaselines["cache_hit_rate_min"].getFloat(0.8)
      fragmentation = stats["fragmentation"].getFloat()

    if hitRate < minHitRate:
      check.status = StatusWarning
      check.message = fmt"Low cache hit rate: {hitRate:.2f} (target: {minHitRate:.2f})"
      check.repairActions = @["nip cache optimize", "nip cache warm --popular"]
    elif fragmentation > 0.3:
      check.status = StatusWarning
      check.message = fmt"High cache fragmentation: {fragmentation:.2f}"
      check.repairActions = @["nip cache defrag", "nip cache rebuild"]
    else:
      let objectCount = stats["object_count"].getInt()
      check.status = StatusHealthy
      check.message = fmt"Cache healthy: {hitRate:.2f} hit rate, {objectCount} objects"

  except Exception as e:
    check.status = StatusCritical
    check.message = fmt"Cache health check failed: {e.msg}"
    check.details = %*{"error": e.msg}
    check.repairActions = @["nip cache repair", "nip cache rebuild --force"]

  check.duration = cpuTime() - startedAt
  return check
|
|
|
|
# =============================================================================
|
|
# Repository Health Checks
|
|
# =============================================================================
|
|
|
|
proc checkRepositoryHealth*(monitor: HealthMonitor): Future[HealthCheck] {.async.} =
  ## Grade the configured repositories by reachability, latency and trust.
  ## Grading, in priority order:
  ## - any unreachable repository is critical;
  ## - more than one slow repository, or an average latency above the
  ##   "repository_latency_max_ms" baseline, is a warning;
  ## - any trust score below 0.8 is a warning;
  ## - otherwise the result is healthy.
  ## The repository list is currently hard-coded sample data. Any exception
  ## is reported as a critical result. `duration` is the `cpuTime` delta.
  let startedAt = cpuTime()
  # message, duration and repairActions keep their zero defaults.
  var check = HealthCheck(
    id: "repository_health",
    category: CategoryRepositories,
    name: "Repository Health",
    description: "Monitor repository connectivity and trust status",
    status: StatusUnknown,
    details: newJObject(),
    lastRun: now()
  )

  try:
    # Simulate repository health check (would be real in production)
    let repositories = @[
      %*{"name": "official", "url": "https://packages.nexusos.org",
         "status": "healthy", "latency_ms": 45.2, "trust_score": 0.95},
      %*{"name": "community", "url": "https://community.nexusos.org",
         "status": "healthy", "latency_ms": 78.5, "trust_score": 0.82},
      %*{"name": "edge", "url": "https://edge.nexusos.org",
         "status": "slow", "latency_ms": 2100, "trust_score": 0.75}
    ]

    var
      healthyCount = 0
      slowCount = 0
      unreachableCount = 0
      lowTrustCount = 0
      totalLatency = 0.0

    let maxLatency = monitor.config.performanceBaselines["repository_latency_max_ms"].getFloat(2000.0)

    for repo in repositories:
      let state = repo["status"].getStr()
      totalLatency += repo["latency_ms"].getFloat()

      # Unrecognised status strings are simply not counted.
      if state == "healthy":
        inc healthyCount
      elif state == "slow":
        inc slowCount
      elif state == "unreachable":
        inc unreachableCount

      if repo["trust_score"].getFloat() < 0.8:
        inc lowTrustCount

    let avgLatency = totalLatency / repositories.len.float

    check.details = %*{
      "repositories": repositories,
      "healthy_count": healthyCount,
      "slow_count": slowCount,
      "unreachable_count": unreachableCount,
      "average_latency_ms": avgLatency,
      "low_trust_count": lowTrustCount
    }

    if unreachableCount > 0:
      check.status = StatusCritical
      check.message = fmt"{unreachableCount} repositories unreachable"
      check.repairActions = @["nip repo sync --force", "nip mirror failover"]
    elif slowCount > 1 or avgLatency > maxLatency:
      check.status = StatusWarning
      check.message = fmt"{slowCount} slow repositories, avg latency: {avgLatency:.1f}ms"
      check.repairActions = @["nip mirror optimize", "nip repo benchmark"]
    elif lowTrustCount > 0:
      check.status = StatusWarning
      check.message = fmt"{lowTrustCount} repositories with low trust scores"
      check.repairActions = @["nip trust update", "nip repo verify --all"]
    else:
      check.status = StatusHealthy
      check.message = fmt"All {repositories.len} repositories healthy, avg latency: {avgLatency:.1f}ms"

  except Exception as e:
    check.status = StatusCritical
    check.message = fmt"Repository health check failed: {e.msg}"
    check.details = %*{"error": e.msg}
    check.repairActions = @["nip repo sync --force"]

  check.duration = cpuTime() - startedAt
  return check
|
|
|
|
# =============================================================================
|
|
# Security Health Checks
|
|
# =============================================================================
|
|
|
|
proc checkSecurityHealth*(monitor: HealthMonitor): Future[HealthCheck] {.async.} =
  ## Grade the key and signature state. Any revoked key, or more than five
  ## signature failures in 24h, is critical; more than two expired keys is a
  ## warning; otherwise the result is healthy. The security figures are
  ## currently hard-coded sample values. Any exception is reported as a
  ## critical result. `duration` is the `cpuTime` delta of the run.
  let startedAt = cpuTime()
  # message, duration and repairActions keep their zero defaults.
  var check = HealthCheck(
    id: "security_health",
    category: CategorySecurity,
    name: "Security Health",
    description: "Monitor cryptographic keys, signatures, and trust policies",
    status: StatusUnknown,
    details: newJObject(),
    lastRun: now()
  )

  try:
    # Simulate security health check (would integrate with actual security systems)
    let snapshot = %*{
      "active_keys": 12,
      "expired_keys": 1,
      "revoked_keys": 0,
      "trust_policies": 3,
      "signature_failures_24h": 0,
      "last_key_rotation": "2025-01-01T00:00:00Z",
      "crl_last_update": "2025-01-08T12:00:00Z"
    }
    check.details = snapshot

    let
      expiredKeys = snapshot["expired_keys"].getInt()
      revokedKeys = snapshot["revoked_keys"].getInt()
      signatureFailures = snapshot["signature_failures_24h"].getInt()
      hasCriticalIssue = revokedKeys > 0 or signatureFailures > 5

    if hasCriticalIssue:
      check.status = StatusCritical
      check.message = fmt"Security issues: {revokedKeys} revoked keys, {signatureFailures} signature failures"
      check.repairActions = @["nip security audit", "nip keys rotate --emergency"]
    elif expiredKeys > 2:
      check.status = StatusWarning
      check.message = fmt"{expiredKeys} expired keys need rotation"
      check.repairActions = @["nip keys rotate", "nip trust update"]
    else:
      let activeKeys = snapshot["active_keys"].getInt()
      check.status = StatusHealthy
      check.message = fmt"Security healthy: {activeKeys} active keys, no critical issues"

  except Exception as e:
    check.status = StatusCritical
    check.message = fmt"Security health check failed: {e.msg}"
    check.details = %*{"error": e.msg}
    check.repairActions = @["nip security audit --force"]

  check.duration = cpuTime() - startedAt
  return check
|
|
|
|
# =============================================================================
|
|
# Performance Monitoring
|
|
# =============================================================================
|
|
|
|
proc checkPerformanceMetrics*(monitor: HealthMonitor): Future[HealthCheck] {.async.} =
  ## Grade install speed and resource usage. An average install time above
  ## 1.5x the "package_install_time_ms" baseline is a warning, as is memory
  ## above 500 MB or CPU above 80%; otherwise the result is healthy. The
  ## metrics are currently hard-coded sample values. Unlike the other
  ## checks, a failure here is reported as a warning rather than critical.
  ## `duration` is the `cpuTime` delta of the run.
  let startedAt = cpuTime()
  # message, duration and repairActions keep their zero defaults.
  var check = HealthCheck(
    id: "performance_metrics",
    category: CategoryPerformance,
    name: "Performance Metrics",
    description: "Monitor system performance and resource usage",
    status: StatusUnknown,
    details: newJObject(),
    lastRun: now()
  )

  try:
    # Simulate performance metrics (would be real system metrics)
    let metrics = %*{
      "avg_install_time_ms": 3200,
      "avg_sync_time_ms": 1800,
      "memory_usage_mb": 245,
      "cpu_usage_percent": 12.5,
      "io_wait_percent": 3.2,
      "network_latency_ms": 45.2
    }
    check.details = metrics

    let
      installTime = metrics["avg_install_time_ms"].getFloat()
      maxInstallTime = monitor.config.performanceBaselines["package_install_time_ms"].getFloat(5000.0)
      memoryUsage = metrics["memory_usage_mb"].getFloat()
      cpuUsage = metrics["cpu_usage_percent"].getFloat()
      installsAreSlow = installTime > maxInstallTime * 1.5
      resourcesAreHigh = memoryUsage > 500.0 or cpuUsage > 80.0

    if installsAreSlow:
      check.status = StatusWarning
      check.message = fmt"Slow package installs: {installTime:.0f}ms avg (target: {maxInstallTime:.0f}ms)"
      check.repairActions = @["nip cache optimize", "nip performance tune"]
    elif resourcesAreHigh:
      check.status = StatusWarning
      check.message = fmt"High resource usage: {memoryUsage:.0f}MB RAM, {cpuUsage:.1f}% CPU"
      check.repairActions = @["nip gc --aggressive", "nip cache clean"]
    else:
      check.status = StatusHealthy
      check.message = fmt"Performance healthy: {installTime:.0f}ms installs, {memoryUsage:.0f}MB RAM"

  except Exception as e:
    check.status = StatusWarning
    check.message = fmt"Performance monitoring failed: {e.msg}"
    check.details = %*{"error": e.msg}
    check.repairActions = @["nip performance reset"]

  check.duration = cpuTime() - startedAt
  return check
|
|
|
|
# =============================================================================
|
|
# Health Report Generation
|
|
# =============================================================================
|
|
|
|
proc runAllHealthChecks*(monitor: HealthMonitor): Future[HealthReport] {.async.} =
  ## Execute every check whose category is enabled in the monitor's config
  ## and fold the results into one report. Checks are awaited one after
  ## another, in a fixed category order.
  ##
  ## The overall status is critical if any check is critical, else warning
  ## if any check is a warning, else healthy. Each warning or critical
  ## check with repair actions contributes its first action as a
  ## recommendation.
  let scanStart = now()
  let enabled = monitor.config.enabledCategories
  var checks: seq[HealthCheck] = @[]

  # Run health checks for enabled categories
  if CategoryPackages in enabled:
    let integrity = await checkPackageIntegrity(monitor)
    checks.add(integrity)
    let consistency = await checkPackageConsistency(monitor)
    checks.add(consistency)

  if CategoryFilesystem in enabled:
    let filesystem = await checkFilesystemHealth(monitor)
    checks.add(filesystem)

  if CategoryCache in enabled:
    let cache = await checkCacheHealth(monitor)
    checks.add(cache)

  if CategoryRepositories in enabled:
    let repositories = await checkRepositoryHealth(monitor)
    checks.add(repositories)

  if CategorySecurity in enabled:
    let security = await checkSecurityHealth(monitor)
    checks.add(security)

  if CategoryPerformance in enabled:
    let performance = await checkPerformanceMetrics(monitor)
    checks.add(performance)

  # Worst status wins; unknown results do not affect the overall grade.
  var sawWarning = false
  var sawCritical = false
  for item in checks:
    if item.status == StatusCritical:
      sawCritical = true
    elif item.status == StatusWarning:
      sawWarning = true
  let overallStatus =
    if sawCritical: StatusCritical
    elif sawWarning: StatusWarning
    else: StatusHealthy

  # One recommendation per failing check: its first repair action.
  var recommendations: seq[string] = @[]
  for item in checks:
    if item.repairActions.len == 0:
      continue
    if item.status == StatusWarning or item.status == StatusCritical:
      recommendations.add(item.name & ": " & item.repairActions[0])

  # System information
  # NOTE(review): "uptime_hours" is the time elapsed since this scan
  # started, not the uptime of the system - confirm what consumers expect.
  let systemInfo = %*{
    "nimpak_version": "1.0.0-dev",
    "platform": hostOS,
    "architecture": hostCPU,
    "nim_version": NimVersion,
    "uptime_hours": (now() - scanStart).inHours,
    "checks_run": checks.len
  }

  return HealthReport(
    timestamp: scanStart,
    overallStatus: overallStatus,
    checks: checks,
    systemInfo: systemInfo,
    recommendations: recommendations
  )
|
|
|
|
# =============================================================================
|
|
# Automated Repair System
|
|
# =============================================================================
|
|
|
|
proc performAutomatedRepair*(monitor: HealthMonitor, report: HealthReport): Future[seq[string]] {.async.} =
  ## Act on the first repair action of every warning or critical check in
  ## `report` and return one human-readable line per attempt.
  ##
  ## When auto-repair is disabled in the monitor's config, nothing is
  ## attempted and a single explanatory line is returned. Repairs are
  ## currently simulated: three known actions are reported as done, and any
  ## other action is reported as needing manual intervention. Every attempt
  ## is written to the security event log.
  var outcomes: seq[string] = @[]

  if not monitor.config.autoRepair:
    outcomes.add("Auto-repair disabled - manual intervention required")
    return outcomes

  for check in report.checks:
    # Only failing checks that actually propose an action are handled.
    if check.status notin {StatusWarning, StatusCritical}:
      continue
    if check.repairActions.len == 0:
      continue

    let action = check.repairActions[0]

    try:
      # Simulate repair action execution
      let line =
        case action
        of "nip repair --integrity":
          fmt"✅ Repaired package integrity issues for {check.name}"
        of "nip cache clean":
          fmt"✅ Cleaned cache for {check.name}"
        of "nip repo sync --force":
          fmt"✅ Forced repository sync for {check.name}"
        else:
          fmt"⚠️ Repair action '{action}' requires manual intervention"
      outcomes.add(line)

      # Log repair action
      logGlobalSecurityEvent(EventSystemHealthCheck, SeverityInfo, "health-monitor",
                             fmt"Automated repair: {action} for {check.name}")

    except Exception as e:
      outcomes.add(fmt"❌ Repair failed for {check.name}: {e.msg}")
      logGlobalSecurityEvent(EventSecurityIncident, SeverityError, "health-monitor",
                             fmt"Repair failed: {action} - {e.msg}")

  return outcomes
|
|
|
|
# =============================================================================
|
|
# Utility Functions
|
|
# =============================================================================
|
|
|
|
proc getDirSize*(path: string): int64 =
  ## Return the combined size in bytes of all regular files found
  ## recursively under `path`.
  ##
  ## Returns 0 when `path` is not an existing directory. The walk is
  ## best-effort: a file whose size cannot be read (for example because it
  ## disappeared mid-scan or is not accessible) is skipped, not fatal.
  if not dirExists(path):
    return 0

  for file in walkDirRec(path):
    try:
      result += getFileSize(file)
    except CatchableError:
      # Skip unreadable entries. Only recoverable errors are ignored, so
      # programming errors (Defects) are no longer silently swallowed.
      discard
|
|
|
|
func iconFor(status: HealthStatus): string =
  ## Emoji shown next to a status in the plain-text report.
  case status
  of StatusHealthy: "✅"
  of StatusWarning: "⚠️"
  of StatusCritical: "🚨"
  of StatusUnknown: "❓"

proc formatHealthReport*(report: HealthReport, format: string = "plain"): string =
  ## Render `report` for display.
  ##
  ## `format` selects the output: "json" returns a pretty-printed JSON
  ## document; any other value returns the plain-text layout, which lists
  ## the overall status, then the checks grouped by category, then the
  ## recommendations.
  case format
  of "json":
    var checksJson = newJArray()
    for item in report.checks:
      checksJson.add(%*{
        "id": item.id,
        "category": $item.category,
        "name": item.name,
        "status": $item.status,
        "message": item.message,
        "details": item.details,
        "duration": item.duration,
        "repair_actions": item.repairActions
      })

    let reportJson = %*{
      "timestamp": $report.timestamp,
      "overall_status": $report.overallStatus,
      "system_info": report.systemInfo,
      "checks": checksJson,
      "recommendations": report.recommendations
    }
    return reportJson.pretty()

  else: # plain format
    # Interpolated lines use `&"..."`, not `fmt"..."`: fmt is a raw string
    # literal, so a "\n" inside it is emitted as a backslash and an "n"
    # instead of a line break.
    result = "NimPak System Health Report\n"
    result.add(repeat("=", 35) & "\n\n")

    # Overall status
    let statusIcon = iconFor(report.overallStatus)
    result.add(&"{statusIcon} Overall Status: {report.overallStatus}\n")
    let timestampStr = report.timestamp.format("yyyy-MM-dd HH:mm:ss")
    result.add(&"📅 Generated: {timestampStr}\n\n")

    # Health checks by category, in enum declaration order
    for category in HealthCheckCategory:
      let categoryChecks = report.checks.filterIt(it.category == category)
      if categoryChecks.len == 0:
        continue

      result.add(&"{category}:\n")
      for check in categoryChecks:
        let icon = iconFor(check.status)
        result.add(&" {icon} {check.name}: {check.message}\n")
        if check.status in {StatusWarning, StatusCritical} and
            check.repairActions.len > 0:
          result.add(&" 💡 Repair: {check.repairActions[0]}\n")
      result.add("\n")

    # Recommendations
    if report.recommendations.len > 0:
      result.add("Recommendations:\n")
      for rec in report.recommendations:
        result.add(&" • {rec}\n")
|
|
|
|
# =============================================================================
|
|
# Export main functions
|
|
# =============================================================================
|
|
|
|
# Public surface of the module, grouped by role.
# Types
export HealthCheckCategory, HealthStatus, HealthCheck, HealthReport,
       HealthMonitor, HealthMonitorConfig
# Construction and configuration
export newHealthMonitor, getDefaultHealthMonitorConfig
# Individual health checks
export checkPackageIntegrity, checkPackageConsistency, checkFilesystemHealth,
       checkCacheHealth, checkRepositoryHealth, checkSecurityHealth,
       checkPerformanceMetrics
# Reporting, repair and utilities
export runAllHealthChecks, performAutomatedRepair
export getDirSize, formatHealthReport