Coverage for src / moai_adk / core / comprehensive_monitoring_system.py: 0.00%
509 statements
« prev ^ index » next coverage.py v7.12.0, created at 2025-11-20 20:52 +0900
1"""
2Comprehensive Monitoring System
4Real-time monitoring, analytics, and predictive analysis for MoAI-ADK
5with automated alerting and optimization capabilities.
7Key Features:
8- Real-time metrics collection and analysis
9- User behavior analytics and pattern recognition
10- Predictive analytics and trend analysis
11- Automated alerting system
12- System health monitoring
13- Performance optimization recommendations
14- Real-time dashboard interface
15"""
17import json
18import logging
19import statistics
20import threading
21import time
22from collections import defaultdict, deque
23from dataclasses import dataclass, field
24from datetime import datetime, timedelta
25from enum import Enum
26from pathlib import Path
27from typing import Any, Callable, Dict, List, Optional, Union
29import psutil
31# Set up logging
32logging.basicConfig(level=logging.INFO)
33logger = logging.getLogger(__name__)
36class MetricType(Enum):
37 """Types of metrics collected by the monitoring system"""
38 SYSTEM_PERFORMANCE = "system_performance"
39 USER_BEHAVIOR = "user_behavior"
40 TOKEN_USAGE = "token_usage"
41 ERROR_RATE = "error_rate"
42 RESPONSE_TIME = "response_time"
43 MEMORY_USAGE = "memory_usage"
44 CPU_USAGE = "cpu_usage"
45 THROUGHPUT = "throughput"
46 AVAILABILITY = "availability"
49class AlertSeverity(Enum):
50 """Alert severity levels"""
51 LOW = 1
52 MEDIUM = 2
53 HIGH = 3
54 CRITICAL = 4
55 EMERGENCY = 5
58class HealthStatus(Enum):
59 """System health status"""
60 HEALTHY = "healthy"
61 WARNING = "warning"
62 DEGRADED = "degraded"
63 CRITICAL = "critical"
64 DOWN = "down"
67@dataclass
68class MetricData:
69 """Single metric data point"""
70 timestamp: datetime
71 metric_type: MetricType
72 value: Union[int, float, str, bool]
73 tags: Dict[str, str] = field(default_factory=dict)
74 source: str = ""
75 metadata: Dict[str, Any] = field(default_factory=dict)
77 def to_dict(self) -> Dict[str, Any]:
78 """Convert to dictionary for serialization"""
79 return {
80 "timestamp": self.timestamp.isoformat(),
81 "metric_type": self.metric_type.value,
82 "value": self.value,
83 "tags": self.tags,
84 "source": self.source,
85 "metadata": self.metadata
86 }
89@dataclass
90class Alert:
91 """Alert definition and data"""
92 alert_id: str
93 severity: AlertSeverity
94 title: str
95 description: str
96 timestamp: datetime
97 metric_type: MetricType
98 threshold: float
99 current_value: float
100 source: str
101 tags: Dict[str, str] = field(default_factory=dict)
102 resolved: bool = False
103 resolved_at: Optional[datetime] = None
104 acknowledged: bool = False
105 acknowledged_at: Optional[datetime] = None
107 def to_dict(self) -> Dict[str, Any]:
108 """Convert to dictionary for serialization"""
109 return {
110 "alert_id": self.alert_id,
111 "severity": self.severity.value,
112 "title": self.title,
113 "description": self.description,
114 "timestamp": self.timestamp.isoformat(),
115 "metric_type": self.metric_type.value,
116 "threshold": self.threshold,
117 "current_value": self.current_value,
118 "source": self.source,
119 "tags": self.tags,
120 "resolved": self.resolved,
121 "resolved_at": self.resolved_at.isoformat() if self.resolved_at else None,
122 "acknowledged": self.acknowledged,
123 "acknowledged_at": self.acknowledged_at.isoformat() if self.acknowledged_at else None
124 }
127@dataclass
128class SystemHealth:
129 """System health status information"""
130 status: HealthStatus
131 timestamp: datetime
132 overall_score: float # 0-100
133 component_scores: Dict[str, float] = field(default_factory=dict)
134 active_alerts: List[str] = field(default_factory=list)
135 recent_metrics: Dict[str, float] = field(default_factory=dict)
136 uptime_percentage: float = 100.0
137 last_check: Optional[datetime] = None
139 def to_dict(self) -> Dict[str, Any]:
140 """Convert to dictionary for serialization"""
141 return {
142 "status": self.status.value,
143 "timestamp": self.timestamp.isoformat(),
144 "overall_score": self.overall_score,
145 "component_scores": self.component_scores,
146 "active_alerts": self.active_alerts,
147 "recent_metrics": self.recent_metrics,
148 "uptime_percentage": self.uptime_percentage,
149 "last_check": self.last_check.isoformat() if self.last_check else None
150 }
153class MetricsCollector:
154 """Collects and manages system metrics"""
156 def __init__(self, buffer_size: int = 10000, retention_hours: int = 24):
157 self.buffer_size = buffer_size
158 self.retention_hours = retention_hours
159 self.metrics_buffer: Dict[MetricType, deque] = defaultdict(lambda: deque(maxlen=buffer_size))
160 self.aggregated_metrics: Dict[MetricType, Dict[str, Any]] = defaultdict(dict)
161 self._lock = threading.Lock()
162 self._last_cleanup = datetime.now()
164 def add_metric(self, metric: MetricData) -> None:
165 """Add a metric to the collection"""
166 with self._lock:
167 self.metrics_buffer[metric.metric_type].append(metric)
168 self._update_aggregated_metrics(metric)
169 self._cleanup_old_metrics()
171 def _update_aggregated_metrics(self, metric: MetricData) -> None:
172 """Update aggregated statistics for a metric type"""
173 if metric.metric_type not in self.aggregated_metrics:
174 self.aggregated_metrics[metric.metric_type] = {
175 "count": 0,
176 "sum": 0,
177 "min": float('inf'),
178 "max": float('-inf'),
179 "values": []
180 }
182 agg = self.aggregated_metrics[metric.metric_type]
184 if isinstance(metric.value, (int, float)):
185 agg["count"] += 1
186 agg["sum"] += metric.value
187 agg["min"] = min(agg["min"], metric.value)
188 agg["max"] = max(agg["max"], metric.value)
189 agg["values"].append(metric.value)
191 # Keep only recent values for statistics
192 if len(agg["values"]) > 1000:
193 agg["values"] = agg["values"][-1000:]
195 def _cleanup_old_metrics(self) -> None:
196 """Remove metrics older than retention period"""
197 now = datetime.now()
198 if (now - self._last_cleanup).seconds < 300: # Cleanup every 5 minutes
199 return
201 cutoff_time = now - timedelta(hours=self.retention_hours)
203 for metric_type in self.metrics_buffer:
204 while (self.metrics_buffer[metric_type] and
205 self.metrics_buffer[metric_type][0].timestamp < cutoff_time):
206 self.metrics_buffer[metric_type].popleft()
208 self._last_cleanup = now
210 def get_metrics(
211 self,
212 metric_type: Optional[MetricType] = None,
213 start_time: Optional[datetime] = None,
214 end_time: Optional[datetime] = None,
215 limit: Optional[int] = None
216 ) -> List[MetricData]:
217 """Get metrics with optional filtering"""
218 with self._lock:
219 if metric_type:
220 metrics = list(self.metrics_buffer[metric_type])
221 else:
222 metrics = []
223 for mlist in self.metrics_buffer.values():
224 metrics.extend(mlist)
226 # Filter by time range
227 if start_time:
228 metrics = [m for m in metrics if m.timestamp >= start_time]
229 if end_time:
230 metrics = [m for m in metrics if m.timestamp <= end_time]
232 # Sort by timestamp (newest first)
233 metrics.sort(key=lambda m: m.timestamp, reverse=True)
235 # Apply limit
236 if limit:
237 metrics = metrics[:limit]
239 return metrics
241 def get_statistics(self, metric_type: MetricType, minutes: int = 60) -> Dict[str, Any]:
242 """Get statistical summary for a metric type"""
243 with self._lock:
244 agg = self.aggregated_metrics.get(metric_type, {})
246 if not agg or agg["count"] == 0:
247 return {
248 "count": 0,
249 "average": None,
250 "min": None,
251 "max": None,
252 "median": None,
253 "std_dev": None
254 }
256 values = agg["values"]
257 if not values:
258 return {
259 "count": agg["count"],
260 "average": agg["sum"] / agg["count"],
261 "min": agg["min"],
262 "max": agg["max"],
263 "median": None,
264 "std_dev": None
265 }
267 try:
268 return {
269 "count": len(values),
270 "average": statistics.mean(values),
271 "median": statistics.median(values),
272 "min": min(values),
273 "max": max(values),
274 "std_dev": statistics.stdev(values) if len(values) > 1 else 0,
275 "p95": statistics.quantiles(values, n=20)[18] if len(values) > 20 else max(values),
276 "p99": statistics.quantiles(values, n=100)[98] if len(values) > 100 else max(values)
277 }
278 except (statistics.StatisticsError, IndexError):
279 return {
280 "count": len(values),
281 "average": statistics.mean(values),
282 "median": statistics.median(values),
283 "min": min(values),
284 "max": max(values),
285 "std_dev": 0,
286 "p95": max(values),
287 "p99": max(values)
288 }
291class AlertManager:
292 """Manages alert rules, detection, and notification"""
294 def __init__(self, metrics_collector: MetricsCollector):
295 self.metrics_collector = metrics_collector
296 self.alert_rules: List[Dict[str, Any]] = []
297 self.active_alerts: Dict[str, Alert] = {}
298 self.alert_history: List[Alert] = []
299 self.alert_callbacks: List[Callable[[Alert], None]] = []
300 self._lock = threading.Lock()
302 def add_alert_rule(
303 self,
304 name: str,
305 metric_type: MetricType,
306 threshold: float,
307 operator: str = "gt", # gt, lt, eq, ne
308 severity: AlertSeverity = AlertSeverity.MEDIUM,
309 window_minutes: int = 5,
310 consecutive_violations: int = 1,
311 tags: Optional[Dict[str, str]] = None,
312 description: Optional[str] = None
313 ) -> None:
314 """Add an alert rule"""
315 rule = {
316 "name": name,
317 "metric_type": metric_type,
318 "threshold": threshold,
319 "operator": operator,
320 "severity": severity,
321 "window_minutes": window_minutes,
322 "consecutive_violations": consecutive_violations,
323 "tags": tags or {},
324 "description": description or f"Alert when {metric_type.value} {operator} {threshold}",
325 "violation_count": 0,
326 "last_check": None,
327 "enabled": True
328 }
330 with self._lock:
331 self.alert_rules.append(rule)
333 def check_alerts(self) -> List[Alert]:
334 """Check all alert rules and generate alerts for violations"""
335 triggered_alerts = []
337 with self._lock:
338 for rule in self.alert_rules:
339 if not rule["enabled"]:
340 continue
342 # Get recent metrics for this rule
343 recent_metrics = self.metrics_collector.get_metrics(
344 metric_type=rule["metric_type"],
345 start_time=datetime.now() - timedelta(minutes=rule["window_minutes"])
346 )
348 if not recent_metrics:
349 continue
351 # Check for violations
352 violations = 0
353 latest_value = None
355 for metric in recent_metrics:
356 if isinstance(metric.value, (int, float)):
357 if self._evaluate_condition(metric.value, rule["threshold"], rule["operator"]):
358 violations += 1
359 latest_value = metric.value
361 # Trigger alert if threshold exceeded
362 if violations >= rule["consecutive_violations"]:
363 alert_id = f"{rule['name']}_{int(time.time())}"
365 alert = Alert(
366 alert_id=alert_id,
367 severity=rule["severity"],
368 title=f"{rule['name']} Alert Triggered",
369 description=rule["description"],
370 timestamp=datetime.now(),
371 metric_type=rule["metric_type"],
372 threshold=rule["threshold"],
373 current_value=latest_value or 0,
374 source="monitoring_system",
375 tags=rule["tags"]
376 )
378 self.active_alerts[alert_id] = alert
379 self.alert_history.append(alert)
380 triggered_alerts.append(alert)
382 # Trigger callbacks
383 for callback in self.alert_callbacks:
384 try:
385 callback(alert)
386 except Exception as e:
387 logger.error(f"Error in alert callback: {e}")
389 rule["violation_count"] = violations
390 rule["last_check"] = datetime.now()
392 # Check for resolved alerts
393 resolved_alerts = []
394 for alert_id, alert in list(self.active_alerts.items()):
395 # Check if alert condition is no longer met
396 rule = next((r for r in self.alert_rules if r["name"] in alert_id), None)
397 if rule:
398 recent_metrics = self.metrics_collector.get_metrics(
399 metric_type=rule["metric_type"],
400 start_time=datetime.now() - timedelta(minutes=1) # Check last minute
401 )
403 if recent_metrics:
404 latest_value = None
405 for metric in recent_metrics:
406 if isinstance(metric.value, (int, float)):
407 if not self._evaluate_condition(metric.value, rule["threshold"], rule["operator"]):
408 latest_value = metric.value
409 break
411 if latest_value is not None:
412 # Alert is resolved
413 alert.resolved = True
414 alert.resolved_at = datetime.now()
415 resolved_alerts.append(alert)
416 del self.active_alerts[alert_id]
418 return triggered_alerts
420 def _evaluate_condition(self, value: float, threshold: float, operator: str) -> bool:
421 """Evaluate alert condition"""
422 if operator == "gt":
423 return value > threshold
424 elif operator == "lt":
425 return value < threshold
426 elif operator == "eq":
427 return value == threshold
428 elif operator == "ne":
429 return value != threshold
430 elif operator == "gte":
431 return value >= threshold
432 elif operator == "lte":
433 return value <= threshold
434 else:
435 return False
437 def add_alert_callback(self, callback: Callable[[Alert], None]) -> None:
438 """Add a callback function to be triggered when alerts fire"""
439 self.alert_callbacks.append(callback)
441 def acknowledge_alert(self, alert_id: str) -> bool:
442 """Acknowledge an alert"""
443 with self._lock:
444 if alert_id in self.active_alerts:
445 self.active_alerts[alert_id].acknowledged = True
446 self.active_alerts[alert_id].acknowledged_at = datetime.now()
447 return True
448 return False
450 def get_active_alerts(self, severity: Optional[AlertSeverity] = None) -> List[Alert]:
451 """Get currently active alerts"""
452 alerts = list(self.active_alerts.values())
453 if severity:
454 alerts = [a for a in alerts if a.severity == severity]
455 return sorted(alerts, key=lambda a: (a.severity.value, a.timestamp), reverse=True)
457 def get_alert_history(self, hours: int = 24) -> List[Alert]:
458 """Get alert history"""
459 cutoff_time = datetime.now() - timedelta(hours=hours)
460 return [a for a in self.alert_history if a.timestamp >= cutoff_time]
463class PredictiveAnalytics:
464 """Predictive analytics for system performance and user behavior"""
466 def __init__(self, metrics_collector: MetricsCollector):
467 self.metrics_collector = metrics_collector
468 self.models: Dict[str, Dict[str, Any]] = {}
469 self.predictions: Dict[str, Dict[str, Any]] = {}
471 def predict_metric_trend(
472 self,
473 metric_type: MetricType,
474 hours_ahead: int = 1,
475 confidence_threshold: float = 0.7
476 ) -> Dict[str, Any]:
477 """Predict metric values for specified hours ahead"""
478 try:
479 # Get historical data
480 historical_metrics = self.metrics_collector.get_metrics(
481 metric_type=metric_type,
482 start_time=datetime.now() - timedelta(hours=24)
483 )
485 if len(historical_metrics) < 10:
486 return {
487 "prediction": None,
488 "confidence": 0.0,
489 "reason": "Insufficient historical data"
490 }
492 # Extract numeric values
493 values = []
494 timestamps = []
495 for metric in historical_metrics:
496 if isinstance(metric.value, (int, float)):
497 values.append(metric.value)
498 timestamps.append(metric.timestamp)
500 if len(values) < 10:
501 return {
502 "prediction": None,
503 "confidence": 0.0,
504 "reason": "Insufficient numeric data points"
505 }
507 # Simple linear regression for prediction
508 import numpy as np
510 # Convert timestamps to numeric values (hours ago)
511 now = datetime.now()
512 x = np.array([(now - ts).total_seconds() / 3600 for ts in timestamps])
513 y = np.array(values)
515 # Fit linear model
516 coeffs = np.polyfit(x, y, 1)
518 # Predict future values
519 future_x = np.array([-h for h in range(1, hours_ahead + 1)])
520 future_y = np.polyval(coeffs, future_x)
522 # Calculate confidence based on R-squared
523 y_pred = np.polyval(coeffs, x)
524 ss_res = np.sum((y - y_pred) ** 2)
525 ss_tot = np.sum((y - np.mean(y)) ** 2)
526 r_squared = 1 - (ss_res / ss_tot) if ss_tot != 0 else 0
528 confidence = max(0, r_squared)
530 return {
531 "prediction": {
532 "future_values": future_y.tolist(),
533 "trend": "increasing" if coeffs[0] > 0 else "decreasing" if coeffs[0] < 0 else "stable",
534 "slope": coeffs[0]
535 },
536 "confidence": confidence,
537 "data_points": len(values),
538 "model_type": "linear_regression",
539 "reason": f"Linear regression on {len(values)} data points with R²={r_squared:.3f}"
540 }
542 except Exception as e:
543 logger.error(f"Error in predictive analytics: {e}")
544 return {
545 "prediction": None,
546 "confidence": 0.0,
547 "reason": f"Analysis error: {str(e)}"
548 }
550 def detect_anomalies(
551 self,
552 metric_type: MetricType,
553 z_score_threshold: float = 2.0,
554 window_minutes: int = 60
555 ) -> Dict[str, Any]:
556 """Detect anomalies in metric data using statistical methods"""
557 try:
558 recent_metrics = self.metrics_collector.get_metrics(
559 metric_type=metric_type,
560 start_time=datetime.now() - timedelta(minutes=window_minutes)
561 )
563 values = []
564 for metric in recent_metrics:
565 if isinstance(metric.value, (int, float)):
566 values.append(metric.value)
568 if len(values) < 5:
569 return {
570 "anomalies": [],
571 "statistics": {},
572 "reason": "Insufficient data for anomaly detection"
573 }
575 import numpy as np
577 values_array = np.array(values)
578 mean = np.mean(values_array)
579 std = np.std(values_array)
581 if std == 0:
582 return {
583 "anomalies": [],
584 "statistics": {"mean": mean, "std": std},
585 "reason": "No variance in data"
586 }
588 # Detect anomalies using Z-score
589 z_scores = np.abs((values_array - mean) / std)
590 anomaly_indices = np.where(z_scores > z_score_threshold)[0]
592 anomalies = []
593 for i, idx in enumerate(anomaly_indices):
594 metric = recent_metrics[idx]
595 anomalies.append({
596 "timestamp": metric.timestamp.isoformat(),
597 "value": metric.value,
598 "z_score": float(z_scores[idx]),
599 "deviation": float(values[idx] - mean)
600 })
602 return {
603 "anomalies": anomalies,
604 "statistics": {
605 "mean": float(mean),
606 "std": float(std),
607 "min": float(np.min(values_array)),
608 "max": float(np.max(values_array)),
609 "count": len(values)
610 },
611 "threshold": z_score_threshold,
612 "reason": f"Found {len(anomalies)} anomalies using Z-score > {z_score_threshold}"
613 }
615 except Exception as e:
616 logger.error(f"Error in anomaly detection: {e}")
617 return {
618 "anomalies": [],
619 "statistics": {},
620 "reason": f"Analysis error: {str(e)}"
621 }
624class PerformanceMonitor:
625 """System performance monitoring"""
627 def __init__(self):
628 self.start_time = datetime.now()
629 self.metrics_collector = MetricsCollector()
630 self.alert_manager = AlertManager(self.metrics_collector)
631 self.predictive_analytics = PredictiveAnalytics(self.metrics_collector)
632 self._running = False
633 self._monitor_thread: Optional[threading.Thread] = None
634 self._monitor_interval = 30 # seconds
636 def start(self) -> None:
637 """Start performance monitoring"""
638 if self._running:
639 return
641 self._running = True
642 self._monitor_thread = threading.Thread(target=self._monitor_loop, daemon=True)
643 self._monitor_thread.start()
644 logger.info("Performance monitoring started")
646 def stop(self) -> None:
647 """Stop performance monitoring"""
648 self._running = False
649 if self._monitor_thread:
650 self._monitor_thread.join(timeout=5)
651 logger.info("Performance monitoring stopped")
653 def _monitor_loop(self) -> None:
654 """Main monitoring loop"""
655 while self._running:
656 try:
657 self._collect_system_metrics()
658 self._check_alerts()
659 time.sleep(self._monitor_interval)
660 except Exception as e:
661 logger.error(f"Error in monitoring loop: {e}")
662 time.sleep(self._monitor_interval)
664 def _collect_system_metrics(self) -> None:
665 """Collect system performance metrics"""
666 try:
667 # CPU Usage
668 cpu_percent = psutil.cpu_percent(interval=1)
669 self.metrics_collector.add_metric(MetricData(
670 timestamp=datetime.now(),
671 metric_type=MetricType.CPU_USAGE,
672 value=cpu_percent,
673 tags={"component": "system"},
674 source="psutil"
675 ))
677 # Memory Usage
678 memory = psutil.virtual_memory()
679 self.metrics_collector.add_metric(MetricData(
680 timestamp=datetime.now(),
681 metric_type=MetricType.MEMORY_USAGE,
682 value=memory.percent,
683 tags={"component": "system", "total_gb": memory.total / (1024**3)},
684 source="psutil"
685 ))
687 # Python process memory
688 process = psutil.Process()
689 process_memory = process.memory_info()
690 self.metrics_collector.add_metric(MetricData(
691 timestamp=datetime.now(),
692 metric_type=MetricType.MEMORY_USAGE,
693 value=process_memory.rss / (1024**2), # MB
694 tags={"component": "python_process"},
695 source="psutil"
696 ))
698 # System load
699 load_avg = psutil.getloadavg()
700 self.metrics_collector.add_metric(MetricData(
701 timestamp=datetime.now(),
702 metric_type=MetricType.SYSTEM_PERFORMANCE,
703 value=load_avg[0], # 1-minute load average
704 tags={"component": "system", "metric": "load_1min"},
705 source="psutil"
706 ))
708 except Exception as e:
709 logger.error(f"Error collecting system metrics: {e}")
711 def _check_alerts(self) -> None:
712 """Check for alerts"""
713 try:
714 alerts = self.alert_manager.check_alerts()
715 if alerts:
716 for alert in alerts:
717 logger.warning(f"Alert triggered: {alert.title} - {alert.current_value}")
719 except Exception as e:
720 logger.error(f"Error checking alerts: {e}")
722 def add_custom_metric(
723 self,
724 metric_type: MetricType,
725 value: Union[int, float],
726 tags: Optional[Dict[str, str]] = None,
727 source: str = "custom"
728 ) -> None:
729 """Add a custom metric"""
730 self.metrics_collector.add_metric(MetricData(
731 timestamp=datetime.now(),
732 metric_type=metric_type,
733 value=value,
734 tags=tags or {},
735 source=source
736 ))
738 def get_system_health(self) -> SystemHealth:
739 """Get overall system health status"""
740 try:
741 # Calculate component scores
742 component_scores = {}
744 # CPU health
745 cpu_metrics = self.metrics_collector.get_metrics(
746 MetricType.CPU_USAGE,
747 start_time=datetime.now() - timedelta(minutes=5)
748 )
749 if cpu_metrics:
750 cpu_values = [m.value for m in cpu_metrics if isinstance(m.value, (int, float))]
751 if cpu_values:
752 avg_cpu = statistics.mean(cpu_values)
753 cpu_score = max(0, 100 - avg_cpu) # Lower CPU usage = higher score
754 component_scores["cpu"] = cpu_score
756 # Memory health
757 memory_metrics = self.metrics_collector.get_metrics(
758 MetricType.MEMORY_USAGE,
759 start_time=datetime.now() - timedelta(minutes=5)
760 )
761 if memory_metrics:
762 memory_values = [m.value for m in memory_metrics if isinstance(m.value, (int, float))]
763 if memory_values:
764 avg_memory = statistics.mean(memory_values)
765 memory_score = max(0, 100 - avg_memory) # Lower memory usage = higher score
766 component_scores["memory"] = memory_score
768 # Error rate health
769 error_metrics = self.metrics_collector.get_metrics(
770 MetricType.ERROR_RATE,
771 start_time=datetime.now() - timedelta(minutes=10)
772 )
773 if error_metrics:
774 error_values = [m.value for m in error_metrics if isinstance(m.value, (int, float))]
775 if error_values:
776 avg_error = statistics.mean(error_values)
777 error_score = max(0, 100 - avg_error * 10) # Lower error rate = higher score
778 component_scores["error_rate"] = error_score
780 # Calculate overall score
781 if component_scores:
782 overall_score = statistics.mean(component_scores.values())
783 else:
784 overall_score = 100.0
786 # Determine health status
787 if overall_score >= 90:
788 status = HealthStatus.HEALTHY
789 elif overall_score >= 70:
790 status = HealthStatus.WARNING
791 elif overall_score >= 50:
792 status = HealthStatus.DEGRADED
793 elif overall_score >= 30:
794 status = HealthStatus.CRITICAL
795 else:
796 status = HealthStatus.DOWN
798 # Get active alerts
799 active_alerts = list(self.alert_manager.active_alerts.keys())
801 # Get recent metrics summary
802 recent_metrics = {}
803 for metric_type in [MetricType.CPU_USAGE, MetricType.MEMORY_USAGE, MetricType.ERROR_RATE]:
804 recent_metric = self.metrics_collector.get_metrics(metric_type, limit=1)
805 if recent_metric and isinstance(recent_metric[0].value, (int, float)):
806 recent_metrics[metric_type.value] = recent_metric[0].value
808 return SystemHealth(
809 status=status,
810 timestamp=datetime.now(),
811 overall_score=overall_score,
812 component_scores=component_scores,
813 active_alerts=active_alerts,
814 recent_metrics=recent_metrics,
815 last_check=datetime.now()
816 )
818 except Exception as e:
819 logger.error(f"Error calculating system health: {e}")
820 return SystemHealth(
821 status=HealthStatus.DOWN,
822 timestamp=datetime.now(),
823 overall_score=0.0,
824 last_check=datetime.now()
825 )
827 def setup_default_alerts(self) -> None:
828 """Setup default alert rules"""
829 # CPU usage alert
830 self.alert_manager.add_alert_rule(
831 name="High CPU Usage",
832 metric_type=MetricType.CPU_USAGE,
833 threshold=80.0,
834 operator="gt",
835 severity=AlertSeverity.HIGH,
836 window_minutes=5,
837 consecutive_violations=2,
838 tags={"component": "cpu"}
839 )
841 # Memory usage alert
842 self.alert_manager.add_alert_rule(
843 name="High Memory Usage",
844 metric_type=MetricType.MEMORY_USAGE,
845 threshold=85.0,
846 operator="gt",
847 severity=AlertSeverity.HIGH,
848 window_minutes=5,
849 consecutive_violations=2,
850 tags={"component": "memory"}
851 )
853 # Error rate alert
854 self.alert_manager.add_alert_rule(
855 name="High Error Rate",
856 metric_type=MetricType.ERROR_RATE,
857 threshold=5.0,
858 operator="gt",
859 severity=AlertSeverity.CRITICAL,
860 window_minutes=2,
861 consecutive_violations=1,
862 tags={"component": "errors"}
863 )
865 logger.info("Default alert rules configured")
868class ComprehensiveMonitoringSystem:
869 """Main monitoring system orchestrator"""
871 def __init__(self, config_file: Optional[Path] = None):
872 self.config_file = config_file or Path.cwd() / ".moai" / "config" / "monitoring.json"
873 self.config = self._load_config()
875 # Initialize components
876 self.metrics_collector = MetricsCollector(
877 buffer_size=self.config.get("buffer_size", 10000),
878 retention_hours=self.config.get("retention_hours", 24)
879 )
881 self.alert_manager = AlertManager(self.metrics_collector)
882 self.predictive_analytics = PredictiveAnalytics(self.metrics_collector)
883 self.performance_monitor = PerformanceMonitor()
885 # Initialize monitoring status
886 self._running = False
887 self._startup_time = datetime.now()
889 def _load_config(self) -> Dict[str, Any]:
890 """Load monitoring configuration"""
891 default_config = {
892 "buffer_size": 10000,
893 "retention_hours": 24,
894 "monitor_interval": 30,
895 "alert_check_interval": 60,
896 "predictive_analysis_hours": 24,
897 "health_check_interval": 300,
898 "enable_predictions": True,
899 "enable_anomaly_detection": True,
900 "auto_optimization": False
901 }
903 if self.config_file.exists():
904 try:
905 with open(self.config_file, 'r') as f:
906 config = json.load(f)
907 default_config.update(config)
908 except Exception as e:
909 logger.error(f"Error loading monitoring config: {e}")
911 return default_config
913 def start(self) -> None:
914 """Start the monitoring system"""
915 if self._running:
916 return
918 logger.info("Starting Comprehensive Monitoring System")
920 # Start performance monitoring
921 self.performance_monitor.start()
923 # Setup default alerts
924 self.performance_monitor.setup_default_alerts()
926 # Setup alert callbacks
927 self.alert_manager.add_alert_callback(self._handle_alert)
929 self._running = True
930 logger.info("Comprehensive Monitoring System started successfully")
932 def stop(self) -> None:
933 """Stop the monitoring system"""
934 if not self._running:
935 return
937 logger.info("Stopping Comprehensive Monitoring System")
939 self.performance_monitor.stop()
940 self._running = False
942 logger.info("Comprehensive Monitoring System stopped")
944 def _handle_alert(self, alert: Alert) -> None:
945 """Handle triggered alerts"""
946 logger.warning(f"ALERT: {alert.title} - {alert.description}")
948 # Here you could add additional alert handling:
949 # - Send notifications
950 # - Trigger automated responses
951 # - Log to external systems
952 # - Send to monitoring dashboard
954 def add_metric(
955 self,
956 metric_type: MetricType,
957 value: Union[int, float],
958 tags: Optional[Dict[str, str]] = None,
959 source: str = "user"
960 ) -> None:
961 """Add a custom metric"""
962 self.performance_monitor.add_custom_metric(metric_type, value, tags, source)
964 def get_dashboard_data(self) -> Dict[str, Any]:
965 """Get data for monitoring dashboard"""
966 try:
967 # System health
968 health = self.performance_monitor.get_system_health()
970 # Active alerts
971 active_alerts = self.alert_manager.get_active_alerts()
973 # Recent metrics summary
974 recent_metrics = {}
975 for metric_type in [
976 MetricType.CPU_USAGE,
977 MetricType.MEMORY_USAGE,
978 MetricType.ERROR_RATE,
979 MetricType.RESPONSE_TIME
980 ]:
981 stats = self.metrics_collector.get_statistics(metric_type, minutes=60)
982 if stats["count"] > 0:
983 recent_metrics[metric_type.value] = stats
985 # Predictions
986 predictions = {}
987 if self.config.get("enable_predictions", True):
988 for metric_type in [MetricType.CPU_USAGE, MetricType.MEMORY_USAGE]:
989 pred = self.predictive_analytics.predict_metric_trend(
990 metric_type, hours_ahead=1
991 )
992 if pred["confidence"] > 0.5:
993 predictions[metric_type.value] = pred
995 return {
996 "health": health.to_dict(),
997 "active_alerts": [alert.to_dict() for alert in active_alerts],
998 "recent_metrics": recent_metrics,
999 "predictions": predictions,
1000 "uptime_seconds": (datetime.now() - self._startup_time).total_seconds(),
1001 "last_update": datetime.now().isoformat()
1002 }
1004 except Exception as e:
1005 logger.error(f"Error getting dashboard data: {e}")
1006 return {
1007 "error": str(e),
1008 "last_update": datetime.now().isoformat()
1009 }
1011 def get_analytics_report(self, hours: int = 24) -> Dict[str, Any]:
1012 """Generate comprehensive analytics report"""
1013 try:
1014 # Overall metrics summary
1015 summary = {}
1016 for metric_type in MetricType:
1017 stats = self.metrics_collector.get_statistics(metric_type, minutes=hours * 60)
1018 if stats["count"] > 0:
1019 summary[metric_type.value] = stats
1021 # Anomaly detection
1022 anomalies = {}
1023 if self.config.get("enable_anomaly_detection", True):
1024 for metric_type in [MetricType.CPU_USAGE, MetricType.MEMORY_USAGE, MetricType.ERROR_RATE]:
1025 anomaly_result = self.predictive_analytics.detect_anomalies(metric_type)
1026 if anomaly_result["anomalies"]:
1027 anomalies[metric_type.value] = anomaly_result
1029 # Alert summary
1030 alert_history = self.alert_manager.get_alert_history(hours=hours)
1031 alert_summary = {
1032 "total_alerts": len(alert_history),
1033 "by_severity": {},
1034 "by_metric_type": {},
1035 "resolved_count": sum(1 for a in alert_history if a.resolved),
1036 "acknowledged_count": sum(1 for a in alert_history if a.acknowledged)
1037 }
1039 for alert in alert_history:
1040 severity_key = alert.severity.name
1041 alert_summary["by_severity"][severity_key] = alert_summary["by_severity"].get(severity_key, 0) + 1
1043 metric_key = alert.metric_type.value
1044 alert_summary["by_metric_type"][metric_key] = alert_summary["by_metric_type"].get(metric_key, 0) + 1
1046 return {
1047 "report_period_hours": hours,
1048 "generated_at": datetime.now().isoformat(),
1049 "metrics_summary": summary,
1050 "anomalies": anomalies,
1051 "alert_summary": alert_summary,
1052 "system_health": self.performance_monitor.get_system_health().to_dict(),
1053 "recommendations": self._generate_recommendations(summary, anomalies)
1054 }
1056 except Exception as e:
1057 logger.error(f"Error generating analytics report: {e}")
1058 return {
1059 "error": str(e),
1060 "generated_at": datetime.now().isoformat()
1061 }
1063 def _generate_recommendations(
1064 self,
1065 metrics_summary: Dict[str, Any],
1066 anomalies: Dict[str, Any]
1067 ) -> List[str]:
1068 """Generate optimization recommendations based on metrics and anomalies"""
1069 recommendations = []
1071 # CPU recommendations
1072 if MetricType.CPU_USAGE.value in metrics_summary:
1073 cpu_stats = metrics_summary[MetricType.CPU_USAGE.value]
1074 if cpu_stats["average"] > 70:
1075 recommendations.append("High CPU usage detected. Consider optimizing code or scaling resources.")
1077 # Memory recommendations
1078 if MetricType.MEMORY_USAGE.value in metrics_summary:
1079 memory_stats = metrics_summary[MetricType.MEMORY_USAGE.value]
1080 if memory_stats["average"] > 80:
1081 recommendations.append("High memory usage detected. Consider memory optimization or increasing available memory.")
1083 # Error rate recommendations
1084 if MetricType.ERROR_RATE.value in metrics_summary:
1085 error_stats = metrics_summary[MetricType.ERROR_RATE.value]
1086 if error_stats["average"] > 5:
1087 recommendations.append("High error rate detected. Review error logs and implement better error handling.")
1089 # Anomaly recommendations
1090 if anomalies:
1091 recommendations.append("Anomalies detected in system metrics. Review the detailed anomaly report for specific issues.")
1093 return recommendations
1096# Global instance for easy access
1097_monitoring_system: Optional[ComprehensiveMonitoringSystem] = None
1100def get_monitoring_system() -> ComprehensiveMonitoringSystem:
1101 """Get or create global monitoring system instance"""
1102 global _monitoring_system
1103 if _monitoring_system is None:
1104 _monitoring_system = ComprehensiveMonitoringSystem()
1105 return _monitoring_system
1108# Convenience functions
1109def start_monitoring() -> None:
1110 """Start the monitoring system"""
1111 system = get_monitoring_system()
1112 system.start()
1115def stop_monitoring() -> None:
1116 """Stop the monitoring system"""
1117 system = get_monitoring_system()
1118 system.stop()
1121def add_metric(
1122 metric_type: MetricType,
1123 value: Union[int, float],
1124 tags: Optional[Dict[str, str]] = None,
1125 source: str = "user"
1126) -> None:
1127 """Add a custom metric"""
1128 system = get_monitoring_system()
1129 system.add_metric(metric_type, value, tags, source)
1132def get_dashboard_data() -> Dict[str, Any]:
1133 """Get monitoring dashboard data"""
1134 system = get_monitoring_system()
1135 return system.get_dashboard_data()
1138if __name__ == "__main__":
1139 # Example usage
1140 print("Starting Comprehensive Monitoring System...")
1142 monitoring = ComprehensiveMonitoringSystem()
1143 monitoring.start()
1145 try:
1146 # Simulate some metrics
1147 for i in range(10):
1148 monitoring.add_metric(MetricType.CPU_USAGE, 50 + i * 3)
1149 monitoring.add_metric(MetricType.MEMORY_USAGE, 60 + i * 2)
1150 time.sleep(1)
1152 # Get dashboard data
1153 dashboard_data = monitoring.get_dashboard_data()
1154 print(f"System Health: {dashboard_data['health']['status']}")
1155 print(f"Overall Score: {dashboard_data['health']['overall_score']}")
1156 print(f"Active Alerts: {len(dashboard_data['active_alerts'])}")
1158 # Generate analytics report
1159 report = monitoring.get_analytics_report(hours=1)
1160 print(f"Analytics Report: {len(report['metrics_summary'])} metric types tracked")
1162 finally:
1163 monitoring.stop()
1164 print("Monitoring stopped.")