Coverage for src / moai_adk / core / comprehensive_monitoring_system.py: 0.00%

509 statements  

« prev     ^ index     » next       coverage.py v7.12.0, created at 2025-11-20 20:52 +0900

1""" 

2Comprehensive Monitoring System 

3 

4Real-time monitoring, analytics, and predictive analysis for MoAI-ADK 

5with automated alerting and optimization capabilities. 

6 

7Key Features: 

8- Real-time metrics collection and analysis 

9- User behavior analytics and pattern recognition 

10- Predictive analytics and trend analysis 

11- Automated alerting system 

12- System health monitoring 

13- Performance optimization recommendations 

14- Real-time dashboard interface 

15""" 

16 

17import json 

18import logging 

19import statistics 

20import threading 

21import time 

22from collections import defaultdict, deque 

23from dataclasses import dataclass, field 

24from datetime import datetime, timedelta 

25from enum import Enum 

26from pathlib import Path 

27from typing import Any, Callable, Dict, List, Optional, Union 

28 

29import psutil 

30 

# Set up module-level logging.
# NOTE(review): calling basicConfig() at import time configures the root
# logger as a side effect; library modules normally leave configuration to
# the application — consider removing. Kept as-is to preserve behavior.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

34 

35 

class MetricType(Enum):
    """Types of metrics collected by the monitoring system.

    The string values are used as serialization/dashboard keys
    (see ``MetricData.to_dict`` and ``get_dashboard_data``).
    """

    SYSTEM_PERFORMANCE = "system_performance"  # e.g. 1-minute load average
    USER_BEHAVIOR = "user_behavior"
    TOKEN_USAGE = "token_usage"
    ERROR_RATE = "error_rate"
    RESPONSE_TIME = "response_time"
    MEMORY_USAGE = "memory_usage"  # system % and process MB both use this type
    CPU_USAGE = "cpu_usage"
    THROUGHPUT = "throughput"
    AVAILABILITY = "availability"

47 

48 

class AlertSeverity(Enum):
    """Alert severity levels, ordered from least (1) to most (5) severe.

    The integer values allow severity comparison/sorting
    (see ``AlertManager.get_active_alerts``).
    """

    LOW = 1
    MEDIUM = 2
    HIGH = 3
    CRITICAL = 4
    EMERGENCY = 5

56 

57 

class HealthStatus(Enum):
    """System health status bands.

    Mapped from the 0-100 overall score in
    ``PerformanceMonitor.get_system_health``.
    """

    HEALTHY = "healthy"    # score >= 90
    WARNING = "warning"    # score >= 70
    DEGRADED = "degraded"  # score >= 50
    CRITICAL = "critical"  # score >= 30
    DOWN = "down"          # score < 30 (or health computation failed)

65 

66 

@dataclass
class MetricData:
    """A single timestamped metric sample."""

    timestamp: datetime
    metric_type: MetricType
    value: Union[int, float, str, bool]
    tags: Dict[str, str] = field(default_factory=dict)
    source: str = ""
    metadata: Dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> Dict[str, Any]:
        """Serialize this sample into a plain dictionary.

        The timestamp becomes an ISO-8601 string and the metric type its
        string value; everything else is passed through unchanged.
        """
        serialized: Dict[str, Any] = {
            "timestamp": self.timestamp.isoformat(),
            "metric_type": self.metric_type.value,
        }
        serialized.update(
            value=self.value,
            tags=self.tags,
            source=self.source,
            metadata=self.metadata,
        )
        return serialized

87 

88 

@dataclass
class Alert:
    """Alert definition plus its lifecycle state (resolved/acknowledged)."""

    alert_id: str
    severity: AlertSeverity
    title: str
    description: str
    timestamp: datetime
    metric_type: MetricType
    threshold: float
    current_value: float
    source: str
    tags: Dict[str, str] = field(default_factory=dict)
    resolved: bool = False
    resolved_at: Optional[datetime] = None
    acknowledged: bool = False
    acknowledged_at: Optional[datetime] = None

    def to_dict(self) -> Dict[str, Any]:
        """Serialize the alert; datetimes become ISO-8601 strings or None."""

        def iso(moment: Optional[datetime]) -> Optional[str]:
            # Optional timestamps serialize to None when unset.
            return moment.isoformat() if moment is not None else None

        return {
            "alert_id": self.alert_id,
            "severity": self.severity.value,
            "title": self.title,
            "description": self.description,
            "timestamp": self.timestamp.isoformat(),
            "metric_type": self.metric_type.value,
            "threshold": self.threshold,
            "current_value": self.current_value,
            "source": self.source,
            "tags": self.tags,
            "resolved": self.resolved,
            "resolved_at": iso(self.resolved_at),
            "acknowledged": self.acknowledged,
            "acknowledged_at": iso(self.acknowledged_at),
        }

125 

126 

@dataclass
class SystemHealth:
    """Point-in-time snapshot of overall system health."""

    status: HealthStatus
    timestamp: datetime
    overall_score: float  # 0-100, higher is healthier
    component_scores: Dict[str, float] = field(default_factory=dict)
    active_alerts: List[str] = field(default_factory=list)
    recent_metrics: Dict[str, float] = field(default_factory=dict)
    uptime_percentage: float = 100.0
    last_check: Optional[datetime] = None

    def to_dict(self) -> Dict[str, Any]:
        """Serialize the snapshot; datetimes become ISO-8601 strings."""
        last_check_iso = (
            self.last_check.isoformat() if self.last_check is not None else None
        )
        return {
            "status": self.status.value,
            "timestamp": self.timestamp.isoformat(),
            "overall_score": self.overall_score,
            "component_scores": self.component_scores,
            "active_alerts": self.active_alerts,
            "recent_metrics": self.recent_metrics,
            "uptime_percentage": self.uptime_percentage,
            "last_check": last_check_iso,
        }

151 

152 

class MetricsCollector:
    """Collects and manages system metrics.

    Samples are stored per metric type in bounded ring buffers
    (``deque(maxlen=buffer_size)``) and trimmed to ``retention_hours``.
    All public methods are thread-safe via a single internal lock.
    """

    def __init__(self, buffer_size: int = 10000, retention_hours: int = 24):
        self.buffer_size = buffer_size
        self.retention_hours = retention_hours
        self.metrics_buffer: Dict[MetricType, deque] = defaultdict(lambda: deque(maxlen=buffer_size))
        # Running aggregates per metric type (count/sum/min/max + recent values).
        self.aggregated_metrics: Dict[MetricType, Dict[str, Any]] = defaultdict(dict)
        self._lock = threading.Lock()
        self._last_cleanup = datetime.now()

    def add_metric(self, metric: MetricData) -> None:
        """Add a metric sample to the collection."""
        with self._lock:
            self.metrics_buffer[metric.metric_type].append(metric)
            self._update_aggregated_metrics(metric)
            self._cleanup_old_metrics()

    def _update_aggregated_metrics(self, metric: MetricData) -> None:
        """Update running aggregates for a metric type (caller holds the lock).

        Non-numeric values are counted into the buffer but not aggregated.
        """
        if metric.metric_type not in self.aggregated_metrics:
            self.aggregated_metrics[metric.metric_type] = {
                "count": 0,
                "sum": 0,
                "min": float('inf'),
                "max": float('-inf'),
                "values": []
            }

        agg = self.aggregated_metrics[metric.metric_type]

        if isinstance(metric.value, (int, float)):
            agg["count"] += 1
            agg["sum"] += metric.value
            agg["min"] = min(agg["min"], metric.value)
            agg["max"] = max(agg["max"], metric.value)
            agg["values"].append(metric.value)

            # Keep only the most recent values for statistics.
            if len(agg["values"]) > 1000:
                agg["values"] = agg["values"][-1000:]

    def _cleanup_old_metrics(self) -> None:
        """Drop samples older than the retention window (caller holds the lock).

        Runs at most once every 5 minutes so ``add_metric`` stays cheap.
        """
        now = datetime.now()
        # BUGFIX: timedelta.seconds is only the seconds *component* (wraps at
        # one day); total_seconds() gives the true elapsed time.
        if (now - self._last_cleanup).total_seconds() < 300:
            return

        cutoff_time = now - timedelta(hours=self.retention_hours)

        for metric_type in self.metrics_buffer:
            while (self.metrics_buffer[metric_type] and
                   self.metrics_buffer[metric_type][0].timestamp < cutoff_time):
                self.metrics_buffer[metric_type].popleft()

        self._last_cleanup = now

    def get_metrics(
        self,
        metric_type: Optional[MetricType] = None,
        start_time: Optional[datetime] = None,
        end_time: Optional[datetime] = None,
        limit: Optional[int] = None
    ) -> List[MetricData]:
        """Get metrics with optional filtering.

        Args:
            metric_type: restrict to one type; None returns all types.
            start_time/end_time: inclusive timestamp bounds.
            limit: maximum number of samples returned.

        Returns:
            Matching samples sorted newest-first.
        """
        with self._lock:
            if metric_type:
                metrics = list(self.metrics_buffer[metric_type])
            else:
                metrics = []
                for mlist in self.metrics_buffer.values():
                    metrics.extend(mlist)

            # Filter by time range.
            if start_time:
                metrics = [m for m in metrics if m.timestamp >= start_time]
            if end_time:
                metrics = [m for m in metrics if m.timestamp <= end_time]

            # Sort by timestamp (newest first).
            metrics.sort(key=lambda m: m.timestamp, reverse=True)

            # Apply limit.
            if limit:
                metrics = metrics[:limit]

            return metrics

    def get_statistics(self, metric_type: MetricType, minutes: int = 60) -> Dict[str, Any]:
        """Statistical summary of numeric samples from the last ``minutes``.

        BUGFIX: ``minutes`` was previously ignored (statistics were computed
        over the untimed running aggregate); the summary now covers only
        samples whose timestamp falls inside the requested window.

        Returns a dict with count/average/median/min/max/std_dev (plus
        p95/p99 when data is available); ``count`` is 0 when the window
        holds no numeric samples.
        """
        cutoff = datetime.now() - timedelta(minutes=minutes)
        with self._lock:
            values = [
                m.value
                for m in self.metrics_buffer[metric_type]
                if isinstance(m.value, (int, float)) and m.timestamp >= cutoff
            ]

        if not values:
            return {
                "count": 0,
                "average": None,
                "min": None,
                "max": None,
                "median": None,
                "std_dev": None
            }

        try:
            return {
                "count": len(values),
                "average": statistics.mean(values),
                "median": statistics.median(values),
                "min": min(values),
                "max": max(values),
                "std_dev": statistics.stdev(values) if len(values) > 1 else 0,
                # Percentiles need enough points for a stable estimate;
                # otherwise fall back to the max.
                "p95": statistics.quantiles(values, n=20)[18] if len(values) > 20 else max(values),
                "p99": statistics.quantiles(values, n=100)[98] if len(values) > 100 else max(values)
            }
        except (statistics.StatisticsError, IndexError):
            return {
                "count": len(values),
                "average": statistics.mean(values),
                "median": statistics.median(values),
                "min": min(values),
                "max": max(values),
                "std_dev": 0,
                "p95": max(values),
                "p99": max(values)
            }

289 

290 

class AlertManager:
    """Manages alert rules, detection, and notification.

    Rules are plain dicts created by :meth:`add_alert_rule`; alert checking
    is driven externally by calling :meth:`check_alerts` (e.g. from
    ``PerformanceMonitor._check_alerts``).
    """

    def __init__(self, metrics_collector: MetricsCollector):
        self.metrics_collector = metrics_collector
        self.alert_rules: List[Dict[str, Any]] = []
        self.active_alerts: Dict[str, Alert] = {}
        self.alert_history: List[Alert] = []
        self.alert_callbacks: List[Callable[[Alert], None]] = []
        self._lock = threading.Lock()

    def add_alert_rule(
        self,
        name: str,
        metric_type: MetricType,
        threshold: float,
        operator: str = "gt",  # gt, lt, eq, ne (gte/lte also supported)
        severity: AlertSeverity = AlertSeverity.MEDIUM,
        window_minutes: int = 5,
        consecutive_violations: int = 1,
        tags: Optional[Dict[str, str]] = None,
        description: Optional[str] = None
    ) -> None:
        """Register an alert rule.

        Args:
            name: rule name; also embedded in generated alert ids.
            metric_type: which metric stream the rule watches.
            threshold: value compared against each sample.
            operator: comparison applied as ``value <op> threshold``.
            window_minutes: how far back to look when checking.
            consecutive_violations: minimum number of violating samples in
                the window before an alert fires. NOTE(review): despite the
                name, violations are *counted* over the window, not required
                to be consecutive — see ``check_alerts``.
        """
        rule = {
            "name": name,
            "metric_type": metric_type,
            "threshold": threshold,
            "operator": operator,
            "severity": severity,
            "window_minutes": window_minutes,
            "consecutive_violations": consecutive_violations,
            "tags": tags or {},
            "description": description or f"Alert when {metric_type.value} {operator} {threshold}",
            "violation_count": 0,
            "last_check": None,
            "enabled": True
        }

        with self._lock:
            self.alert_rules.append(rule)

    def check_alerts(self) -> List[Alert]:
        """Check all alert rules and generate alerts for violations.

        Returns the list of newly triggered alerts. As a side effect,
        active alerts whose condition no longer holds are marked resolved
        and removed from ``active_alerts`` (resolved alerts are collected
        locally but not returned).

        The whole pass runs while holding ``self._lock``; the metrics
        collector uses its own lock, so no lock nesting on this object.
        """
        triggered_alerts = []

        with self._lock:
            for rule in self.alert_rules:
                if not rule["enabled"]:
                    continue

                # Get recent metrics inside this rule's window.
                recent_metrics = self.metrics_collector.get_metrics(
                    metric_type=rule["metric_type"],
                    start_time=datetime.now() - timedelta(minutes=rule["window_minutes"])
                )

                if not recent_metrics:
                    continue

                # Count violating numeric samples; latest_value ends up as
                # the value of the last violating sample iterated.
                violations = 0
                latest_value = None

                for metric in recent_metrics:
                    if isinstance(metric.value, (int, float)):
                        if self._evaluate_condition(metric.value, rule["threshold"], rule["operator"]):
                            violations += 1
                            latest_value = metric.value

                # Trigger an alert once the violation count reaches the rule's bar.
                if violations >= rule["consecutive_violations"]:
                    # NOTE(review): the rule name is embedded in the alert id;
                    # resolution below relies on substring-matching it, which
                    # is fragile if one rule name contains another.
                    alert_id = f"{rule['name']}_{int(time.time())}"

                    alert = Alert(
                        alert_id=alert_id,
                        severity=rule["severity"],
                        title=f"{rule['name']} Alert Triggered",
                        description=rule["description"],
                        timestamp=datetime.now(),
                        metric_type=rule["metric_type"],
                        threshold=rule["threshold"],
                        current_value=latest_value or 0,
                        source="monitoring_system",
                        tags=rule["tags"]
                    )

                    self.active_alerts[alert_id] = alert
                    self.alert_history.append(alert)
                    triggered_alerts.append(alert)

                    # Notify registered callbacks; a failing callback must
                    # not abort the check pass.
                    for callback in self.alert_callbacks:
                        try:
                            callback(alert)
                        except Exception as e:
                            logger.error(f"Error in alert callback: {e}")

                rule["violation_count"] = violations
                rule["last_check"] = datetime.now()

            # Check for resolved alerts: an active alert is resolved as soon
            # as any numeric sample from the last minute fails its condition.
            resolved_alerts = []
            for alert_id, alert in list(self.active_alerts.items()):
                # Find the originating rule by substring match on the id.
                rule = next((r for r in self.alert_rules if r["name"] in alert_id), None)
                if rule:
                    recent_metrics = self.metrics_collector.get_metrics(
                        metric_type=rule["metric_type"],
                        start_time=datetime.now() - timedelta(minutes=1)  # Check last minute
                    )

                    if recent_metrics:
                        latest_value = None
                        for metric in recent_metrics:
                            if isinstance(metric.value, (int, float)):
                                if not self._evaluate_condition(metric.value, rule["threshold"], rule["operator"]):
                                    latest_value = metric.value
                                    break

                        if latest_value is not None:
                            # Alert is resolved.
                            alert.resolved = True
                            alert.resolved_at = datetime.now()
                            resolved_alerts.append(alert)
                            del self.active_alerts[alert_id]

        return triggered_alerts

    def _evaluate_condition(self, value: float, threshold: float, operator: str) -> bool:
        """Evaluate ``value <operator> threshold``; unknown operators are False."""
        if operator == "gt":
            return value > threshold
        elif operator == "lt":
            return value < threshold
        elif operator == "eq":
            return value == threshold
        elif operator == "ne":
            return value != threshold
        elif operator == "gte":
            return value >= threshold
        elif operator == "lte":
            return value <= threshold
        else:
            return False

    def add_alert_callback(self, callback: Callable[[Alert], None]) -> None:
        """Register a function invoked for every newly triggered alert.

        NOTE(review): appends without taking the lock; callers register
        callbacks during setup, before checking starts.
        """
        self.alert_callbacks.append(callback)

    def acknowledge_alert(self, alert_id: str) -> bool:
        """Mark an active alert acknowledged; returns False if not active."""
        with self._lock:
            if alert_id in self.active_alerts:
                self.active_alerts[alert_id].acknowledged = True
                self.active_alerts[alert_id].acknowledged_at = datetime.now()
                return True
            return False

    def get_active_alerts(self, severity: Optional[AlertSeverity] = None) -> List[Alert]:
        """Currently active alerts, most severe and newest first."""
        alerts = list(self.active_alerts.values())
        if severity:
            alerts = [a for a in alerts if a.severity == severity]
        return sorted(alerts, key=lambda a: (a.severity.value, a.timestamp), reverse=True)

    def get_alert_history(self, hours: int = 24) -> List[Alert]:
        """All alerts (active or not) triggered within the last ``hours``."""
        cutoff_time = datetime.now() - timedelta(hours=hours)
        return [a for a in self.alert_history if a.timestamp >= cutoff_time]

461 

462 

class PredictiveAnalytics:
    """Predictive analytics for system performance and user behavior.

    Provides simple trend extrapolation (linear regression) and Z-score
    anomaly detection over data held by a :class:`MetricsCollector`.
    NumPy is imported lazily inside the methods so the module loads even
    where NumPy is absent.
    """

    def __init__(self, metrics_collector: MetricsCollector):
        self.metrics_collector = metrics_collector
        self.models: Dict[str, Dict[str, Any]] = {}
        self.predictions: Dict[str, Dict[str, Any]] = {}

    def predict_metric_trend(
        self,
        metric_type: MetricType,
        hours_ahead: int = 1,
        confidence_threshold: float = 0.7
    ) -> Dict[str, Any]:
        """Predict metric values for the specified number of hours ahead.

        Fits a linear model to the last 24 hours of numeric samples and
        extrapolates. Returns a dict with ``prediction`` (or None),
        ``confidence`` (R-squared, clamped at 0) and a human-readable
        ``reason``.

        NOTE(review): ``confidence_threshold`` is accepted but not applied;
        callers filter on the returned ``confidence`` themselves. Kept for
        interface compatibility.
        """
        try:
            # Get historical data.
            historical_metrics = self.metrics_collector.get_metrics(
                metric_type=metric_type,
                start_time=datetime.now() - timedelta(hours=24)
            )

            if len(historical_metrics) < 10:
                return {
                    "prediction": None,
                    "confidence": 0.0,
                    "reason": "Insufficient historical data"
                }

            # Extract numeric values only (skip str/bool samples).
            values = []
            timestamps = []
            for metric in historical_metrics:
                if isinstance(metric.value, (int, float)):
                    values.append(metric.value)
                    timestamps.append(metric.timestamp)

            if len(values) < 10:
                return {
                    "prediction": None,
                    "confidence": 0.0,
                    "reason": "Insufficient numeric data points"
                }

            # Simple linear regression for prediction.
            import numpy as np

            # X axis is "hours ago" (positive = past); future times are
            # therefore negative on this axis.
            now = datetime.now()
            x = np.array([(now - ts).total_seconds() / 3600 for ts in timestamps])
            y = np.array(values)

            # Fit a degree-1 polynomial (slope, intercept).
            coeffs = np.polyfit(x, y, 1)

            # Predict the next hours_ahead hourly values.
            future_x = np.array([-h for h in range(1, hours_ahead + 1)])
            future_y = np.polyval(coeffs, future_x)

            # Confidence = R-squared of the fit, clamped to be non-negative.
            y_pred = np.polyval(coeffs, x)
            ss_res = np.sum((y - y_pred) ** 2)
            ss_tot = np.sum((y - np.mean(y)) ** 2)
            r_squared = 1 - (ss_res / ss_tot) if ss_tot != 0 else 0

            confidence = max(0, r_squared)

            return {
                "prediction": {
                    "future_values": future_y.tolist(),
                    # Slope is d(value)/d(hours-ago), so a negative slope on
                    # this axis means values grow toward the present.
                    "trend": "increasing" if coeffs[0] > 0 else "decreasing" if coeffs[0] < 0 else "stable",
                    "slope": coeffs[0]
                },
                "confidence": confidence,
                "data_points": len(values),
                "model_type": "linear_regression",
                "reason": f"Linear regression on {len(values)} data points with R²={r_squared:.3f}"
            }

        except Exception as e:
            logger.error(f"Error in predictive analytics: {e}")
            return {
                "prediction": None,
                "confidence": 0.0,
                "reason": f"Analysis error: {str(e)}"
            }

    def detect_anomalies(
        self,
        metric_type: MetricType,
        z_score_threshold: float = 2.0,
        window_minutes: int = 60
    ) -> Dict[str, Any]:
        """Detect anomalies in recent metric data using a Z-score test.

        Returns a dict with the list of ``anomalies`` (timestamp, value,
        z_score, deviation), summary ``statistics``, and a ``reason``.
        """
        try:
            recent_metrics = self.metrics_collector.get_metrics(
                metric_type=metric_type,
                start_time=datetime.now() - timedelta(minutes=window_minutes)
            )

            # BUGFIX: keep the numeric metrics in a list parallel to
            # ``values`` so anomaly indices map back to the correct sample.
            # Previously, indices computed over the numeric-only array were
            # used to index the unfiltered metric list, mis-attributing
            # anomalies whenever non-numeric samples were interleaved.
            numeric_metrics = [m for m in recent_metrics if isinstance(m.value, (int, float))]
            values = [m.value for m in numeric_metrics]

            if len(values) < 5:
                return {
                    "anomalies": [],
                    "statistics": {},
                    "reason": "Insufficient data for anomaly detection"
                }

            import numpy as np

            values_array = np.array(values)
            mean = np.mean(values_array)
            std = np.std(values_array)

            if std == 0:
                # All samples identical — a Z-score is undefined.
                return {
                    "anomalies": [],
                    "statistics": {"mean": mean, "std": std},
                    "reason": "No variance in data"
                }

            # Flag samples more than z_score_threshold standard deviations
            # from the window mean.
            z_scores = np.abs((values_array - mean) / std)
            anomaly_indices = np.where(z_scores > z_score_threshold)[0]

            anomalies = []
            for idx in anomaly_indices:
                metric = numeric_metrics[idx]
                anomalies.append({
                    "timestamp": metric.timestamp.isoformat(),
                    "value": metric.value,
                    "z_score": float(z_scores[idx]),
                    "deviation": float(values[idx] - mean)
                })

            return {
                "anomalies": anomalies,
                "statistics": {
                    "mean": float(mean),
                    "std": float(std),
                    "min": float(np.min(values_array)),
                    "max": float(np.max(values_array)),
                    "count": len(values)
                },
                "threshold": z_score_threshold,
                "reason": f"Found {len(anomalies)} anomalies using Z-score > {z_score_threshold}"
            }

        except Exception as e:
            logger.error(f"Error in anomaly detection: {e}")
            return {
                "anomalies": [],
                "statistics": {},
                "reason": f"Analysis error: {str(e)}"
            }

622 

623 

class PerformanceMonitor:
    """System performance monitoring.

    Runs a daemon thread that periodically samples CPU, memory and load
    metrics via psutil, stores them in a :class:`MetricsCollector`, and
    evaluates alert rules through an :class:`AlertManager`.
    """

    def __init__(self):
        self.start_time = datetime.now()
        self.metrics_collector = MetricsCollector()
        self.alert_manager = AlertManager(self.metrics_collector)
        self.predictive_analytics = PredictiveAnalytics(self.metrics_collector)
        self._running = False
        self._monitor_thread: Optional[threading.Thread] = None
        self._monitor_interval = 30  # seconds between collection cycles

    def start(self) -> None:
        """Start the background monitoring thread (no-op if already running)."""
        if self._running:
            return

        self._running = True
        self._monitor_thread = threading.Thread(target=self._monitor_loop, daemon=True)
        self._monitor_thread.start()
        logger.info("Performance monitoring started")

    def stop(self) -> None:
        """Signal the monitoring thread to stop and wait up to 5s for it."""
        self._running = False
        if self._monitor_thread:
            self._monitor_thread.join(timeout=5)
        logger.info("Performance monitoring stopped")

    def _monitor_loop(self) -> None:
        """Main monitoring loop: collect metrics, check alerts, sleep, repeat."""
        while self._running:
            try:
                self._collect_system_metrics()
                self._check_alerts()
                time.sleep(self._monitor_interval)
            except Exception as e:
                # Keep the loop alive; a single failed cycle is logged and skipped.
                logger.error(f"Error in monitoring loop: {e}")
                time.sleep(self._monitor_interval)

    def _collect_system_metrics(self) -> None:
        """Collect system performance metrics (CPU, memory, load) via psutil."""
        try:
            # CPU usage; interval=1 blocks for one second to take a real sample.
            cpu_percent = psutil.cpu_percent(interval=1)
            self.metrics_collector.add_metric(MetricData(
                timestamp=datetime.now(),
                metric_type=MetricType.CPU_USAGE,
                value=cpu_percent,
                tags={"component": "system"},
                source="psutil"
            ))

            # System memory usage (percent). BUGFIX: tags is Dict[str, str],
            # so the total size is now formatted as a string (was a raw float).
            memory = psutil.virtual_memory()
            self.metrics_collector.add_metric(MetricData(
                timestamp=datetime.now(),
                metric_type=MetricType.MEMORY_USAGE,
                value=memory.percent,
                tags={"component": "system", "total_gb": f"{memory.total / (1024 ** 3):.2f}"},
                source="psutil"
            ))

            # Python process resident memory in MB. NOTE: shares MEMORY_USAGE
            # with the system percentage above; consumers must filter on the
            # "component" tag to avoid mixing units.
            process = psutil.Process()
            process_memory = process.memory_info()
            self.metrics_collector.add_metric(MetricData(
                timestamp=datetime.now(),
                metric_type=MetricType.MEMORY_USAGE,
                value=process_memory.rss / (1024 ** 2),  # MB
                tags={"component": "python_process"},
                source="psutil"
            ))

            # 1-minute load average. BUGFIX: getloadavg() is not available on
            # every platform, so guard instead of failing the whole cycle.
            if hasattr(psutil, "getloadavg"):
                load_avg = psutil.getloadavg()
                self.metrics_collector.add_metric(MetricData(
                    timestamp=datetime.now(),
                    metric_type=MetricType.SYSTEM_PERFORMANCE,
                    value=load_avg[0],  # 1-minute load average
                    tags={"component": "system", "metric": "load_1min"},
                    source="psutil"
                ))

        except Exception as e:
            logger.error(f"Error collecting system metrics: {e}")

    def _check_alerts(self) -> None:
        """Run one alert-rule evaluation pass and log any triggered alerts."""
        try:
            alerts = self.alert_manager.check_alerts()
            if alerts:
                for alert in alerts:
                    logger.warning(f"Alert triggered: {alert.title} - {alert.current_value}")

        except Exception as e:
            logger.error(f"Error checking alerts: {e}")

    def add_custom_metric(
        self,
        metric_type: MetricType,
        value: Union[int, float],
        tags: Optional[Dict[str, str]] = None,
        source: str = "custom"
    ) -> None:
        """Record a caller-supplied metric sample, timestamped now."""
        self.metrics_collector.add_metric(MetricData(
            timestamp=datetime.now(),
            metric_type=metric_type,
            value=value,
            tags=tags or {},
            source=source
        ))

    def get_system_health(self) -> SystemHealth:
        """Compute an overall health snapshot from recent metrics.

        Component scores are 0-100 (higher is better); the overall score is
        their mean and maps onto a HealthStatus band. Any failure yields a
        DOWN snapshot rather than raising.
        """
        try:
            component_scores: Dict[str, float] = {}

            # CPU health: lower average usage over the last 5 minutes = higher score.
            cpu_metrics = self.metrics_collector.get_metrics(
                MetricType.CPU_USAGE,
                start_time=datetime.now() - timedelta(minutes=5)
            )
            cpu_values = [m.value for m in cpu_metrics if isinstance(m.value, (int, float))]
            if cpu_values:
                component_scores["cpu"] = max(0, 100 - statistics.mean(cpu_values))

            # Memory health. BUGFIX: only system-percentage samples are
            # averaged; process-RSS samples share MEMORY_USAGE but are in MB,
            # not %, and previously skewed this score badly.
            memory_metrics = self.metrics_collector.get_metrics(
                MetricType.MEMORY_USAGE,
                start_time=datetime.now() - timedelta(minutes=5)
            )
            memory_values = [
                m.value for m in memory_metrics
                if isinstance(m.value, (int, float)) and m.tags.get("component") != "python_process"
            ]
            if memory_values:
                component_scores["memory"] = max(0, 100 - statistics.mean(memory_values))

            # Error-rate health over the last 10 minutes; each unit of error
            # rate costs 10 score points.
            error_metrics = self.metrics_collector.get_metrics(
                MetricType.ERROR_RATE,
                start_time=datetime.now() - timedelta(minutes=10)
            )
            error_values = [m.value for m in error_metrics if isinstance(m.value, (int, float))]
            if error_values:
                component_scores["error_rate"] = max(0, 100 - statistics.mean(error_values) * 10)

            # Overall score: mean of available components; perfect when no data.
            if component_scores:
                overall_score = statistics.mean(component_scores.values())
            else:
                overall_score = 100.0

            # Map the score onto discrete status bands.
            if overall_score >= 90:
                status = HealthStatus.HEALTHY
            elif overall_score >= 70:
                status = HealthStatus.WARNING
            elif overall_score >= 50:
                status = HealthStatus.DEGRADED
            elif overall_score >= 30:
                status = HealthStatus.CRITICAL
            else:
                status = HealthStatus.DOWN

            active_alerts = list(self.alert_manager.active_alerts.keys())

            # Latest raw value per key metric type, for display.
            recent_metrics: Dict[str, float] = {}
            for metric_type in [MetricType.CPU_USAGE, MetricType.MEMORY_USAGE, MetricType.ERROR_RATE]:
                recent_metric = self.metrics_collector.get_metrics(metric_type, limit=1)
                if recent_metric and isinstance(recent_metric[0].value, (int, float)):
                    recent_metrics[metric_type.value] = recent_metric[0].value

            return SystemHealth(
                status=status,
                timestamp=datetime.now(),
                overall_score=overall_score,
                component_scores=component_scores,
                active_alerts=active_alerts,
                recent_metrics=recent_metrics,
                last_check=datetime.now()
            )

        except Exception as e:
            logger.error(f"Error calculating system health: {e}")
            return SystemHealth(
                status=HealthStatus.DOWN,
                timestamp=datetime.now(),
                overall_score=0.0,
                last_check=datetime.now()
            )

    def setup_default_alerts(self) -> None:
        """Register default CPU / memory / error-rate alert rules.

        NOTE(review): rules cannot filter by tags, so the MEMORY_USAGE rule
        also sees process-RSS samples (in MB), which can exceed 85 and
        false-trigger; tag-aware rules would fix this.
        """
        # CPU usage alert.
        self.alert_manager.add_alert_rule(
            name="High CPU Usage",
            metric_type=MetricType.CPU_USAGE,
            threshold=80.0,
            operator="gt",
            severity=AlertSeverity.HIGH,
            window_minutes=5,
            consecutive_violations=2,
            tags={"component": "cpu"}
        )

        # Memory usage alert.
        self.alert_manager.add_alert_rule(
            name="High Memory Usage",
            metric_type=MetricType.MEMORY_USAGE,
            threshold=85.0,
            operator="gt",
            severity=AlertSeverity.HIGH,
            window_minutes=5,
            consecutive_violations=2,
            tags={"component": "memory"}
        )

        # Error rate alert.
        self.alert_manager.add_alert_rule(
            name="High Error Rate",
            metric_type=MetricType.ERROR_RATE,
            threshold=5.0,
            operator="gt",
            severity=AlertSeverity.CRITICAL,
            window_minutes=2,
            consecutive_violations=1,
            tags={"component": "errors"}
        )

        logger.info("Default alert rules configured")

866 

867 

868class ComprehensiveMonitoringSystem: 

869 """Main monitoring system orchestrator""" 

870 

871 def __init__(self, config_file: Optional[Path] = None): 

872 self.config_file = config_file or Path.cwd() / ".moai" / "config" / "monitoring.json" 

873 self.config = self._load_config() 

874 

875 # Initialize components 

876 self.metrics_collector = MetricsCollector( 

877 buffer_size=self.config.get("buffer_size", 10000), 

878 retention_hours=self.config.get("retention_hours", 24) 

879 ) 

880 

881 self.alert_manager = AlertManager(self.metrics_collector) 

882 self.predictive_analytics = PredictiveAnalytics(self.metrics_collector) 

883 self.performance_monitor = PerformanceMonitor() 

884 

885 # Initialize monitoring status 

886 self._running = False 

887 self._startup_time = datetime.now() 

888 

889 def _load_config(self) -> Dict[str, Any]: 

890 """Load monitoring configuration""" 

891 default_config = { 

892 "buffer_size": 10000, 

893 "retention_hours": 24, 

894 "monitor_interval": 30, 

895 "alert_check_interval": 60, 

896 "predictive_analysis_hours": 24, 

897 "health_check_interval": 300, 

898 "enable_predictions": True, 

899 "enable_anomaly_detection": True, 

900 "auto_optimization": False 

901 } 

902 

903 if self.config_file.exists(): 

904 try: 

905 with open(self.config_file, 'r') as f: 

906 config = json.load(f) 

907 default_config.update(config) 

908 except Exception as e: 

909 logger.error(f"Error loading monitoring config: {e}") 

910 

911 return default_config 

912 

913 def start(self) -> None: 

914 """Start the monitoring system""" 

915 if self._running: 

916 return 

917 

918 logger.info("Starting Comprehensive Monitoring System") 

919 

920 # Start performance monitoring 

921 self.performance_monitor.start() 

922 

923 # Setup default alerts 

924 self.performance_monitor.setup_default_alerts() 

925 

926 # Setup alert callbacks 

927 self.alert_manager.add_alert_callback(self._handle_alert) 

928 

929 self._running = True 

930 logger.info("Comprehensive Monitoring System started successfully") 

931 

932 def stop(self) -> None: 

933 """Stop the monitoring system""" 

934 if not self._running: 

935 return 

936 

937 logger.info("Stopping Comprehensive Monitoring System") 

938 

939 self.performance_monitor.stop() 

940 self._running = False 

941 

942 logger.info("Comprehensive Monitoring System stopped") 

943 

    def _handle_alert(self, alert: Alert) -> None:
        """Default alert callback: log a warning for every triggered alert.

        Extension point — additional handling (notifications, automated
        responses, external logging, dashboard pushes) could be added here.
        """
        logger.warning(f"ALERT: {alert.title} - {alert.description}")

953 

954 def add_metric( 

955 self, 

956 metric_type: MetricType, 

957 value: Union[int, float], 

958 tags: Optional[Dict[str, str]] = None, 

959 source: str = "user" 

960 ) -> None: 

961 """Add a custom metric""" 

962 self.performance_monitor.add_custom_metric(metric_type, value, tags, source) 

963 

964 def get_dashboard_data(self) -> Dict[str, Any]: 

965 """Get data for monitoring dashboard""" 

966 try: 

967 # System health 

968 health = self.performance_monitor.get_system_health() 

969 

970 # Active alerts 

971 active_alerts = self.alert_manager.get_active_alerts() 

972 

973 # Recent metrics summary 

974 recent_metrics = {} 

975 for metric_type in [ 

976 MetricType.CPU_USAGE, 

977 MetricType.MEMORY_USAGE, 

978 MetricType.ERROR_RATE, 

979 MetricType.RESPONSE_TIME 

980 ]: 

981 stats = self.metrics_collector.get_statistics(metric_type, minutes=60) 

982 if stats["count"] > 0: 

983 recent_metrics[metric_type.value] = stats 

984 

985 # Predictions 

986 predictions = {} 

987 if self.config.get("enable_predictions", True): 

988 for metric_type in [MetricType.CPU_USAGE, MetricType.MEMORY_USAGE]: 

989 pred = self.predictive_analytics.predict_metric_trend( 

990 metric_type, hours_ahead=1 

991 ) 

992 if pred["confidence"] > 0.5: 

993 predictions[metric_type.value] = pred 

994 

995 return { 

996 "health": health.to_dict(), 

997 "active_alerts": [alert.to_dict() for alert in active_alerts], 

998 "recent_metrics": recent_metrics, 

999 "predictions": predictions, 

1000 "uptime_seconds": (datetime.now() - self._startup_time).total_seconds(), 

1001 "last_update": datetime.now().isoformat() 

1002 } 

1003 

1004 except Exception as e: 

1005 logger.error(f"Error getting dashboard data: {e}") 

1006 return { 

1007 "error": str(e), 

1008 "last_update": datetime.now().isoformat() 

1009 } 

1010 

1011 def get_analytics_report(self, hours: int = 24) -> Dict[str, Any]: 

1012 """Generate comprehensive analytics report""" 

1013 try: 

1014 # Overall metrics summary 

1015 summary = {} 

1016 for metric_type in MetricType: 

1017 stats = self.metrics_collector.get_statistics(metric_type, minutes=hours * 60) 

1018 if stats["count"] > 0: 

1019 summary[metric_type.value] = stats 

1020 

1021 # Anomaly detection 

1022 anomalies = {} 

1023 if self.config.get("enable_anomaly_detection", True): 

1024 for metric_type in [MetricType.CPU_USAGE, MetricType.MEMORY_USAGE, MetricType.ERROR_RATE]: 

1025 anomaly_result = self.predictive_analytics.detect_anomalies(metric_type) 

1026 if anomaly_result["anomalies"]: 

1027 anomalies[metric_type.value] = anomaly_result 

1028 

1029 # Alert summary 

1030 alert_history = self.alert_manager.get_alert_history(hours=hours) 

1031 alert_summary = { 

1032 "total_alerts": len(alert_history), 

1033 "by_severity": {}, 

1034 "by_metric_type": {}, 

1035 "resolved_count": sum(1 for a in alert_history if a.resolved), 

1036 "acknowledged_count": sum(1 for a in alert_history if a.acknowledged) 

1037 } 

1038 

1039 for alert in alert_history: 

1040 severity_key = alert.severity.name 

1041 alert_summary["by_severity"][severity_key] = alert_summary["by_severity"].get(severity_key, 0) + 1 

1042 

1043 metric_key = alert.metric_type.value 

1044 alert_summary["by_metric_type"][metric_key] = alert_summary["by_metric_type"].get(metric_key, 0) + 1 

1045 

1046 return { 

1047 "report_period_hours": hours, 

1048 "generated_at": datetime.now().isoformat(), 

1049 "metrics_summary": summary, 

1050 "anomalies": anomalies, 

1051 "alert_summary": alert_summary, 

1052 "system_health": self.performance_monitor.get_system_health().to_dict(), 

1053 "recommendations": self._generate_recommendations(summary, anomalies) 

1054 } 

1055 

1056 except Exception as e: 

1057 logger.error(f"Error generating analytics report: {e}") 

1058 return { 

1059 "error": str(e), 

1060 "generated_at": datetime.now().isoformat() 

1061 } 

1062 

1063 def _generate_recommendations( 

1064 self, 

1065 metrics_summary: Dict[str, Any], 

1066 anomalies: Dict[str, Any] 

1067 ) -> List[str]: 

1068 """Generate optimization recommendations based on metrics and anomalies""" 

1069 recommendations = [] 

1070 

1071 # CPU recommendations 

1072 if MetricType.CPU_USAGE.value in metrics_summary: 

1073 cpu_stats = metrics_summary[MetricType.CPU_USAGE.value] 

1074 if cpu_stats["average"] > 70: 

1075 recommendations.append("High CPU usage detected. Consider optimizing code or scaling resources.") 

1076 

1077 # Memory recommendations 

1078 if MetricType.MEMORY_USAGE.value in metrics_summary: 

1079 memory_stats = metrics_summary[MetricType.MEMORY_USAGE.value] 

1080 if memory_stats["average"] > 80: 

1081 recommendations.append("High memory usage detected. Consider memory optimization or increasing available memory.") 

1082 

1083 # Error rate recommendations 

1084 if MetricType.ERROR_RATE.value in metrics_summary: 

1085 error_stats = metrics_summary[MetricType.ERROR_RATE.value] 

1086 if error_stats["average"] > 5: 

1087 recommendations.append("High error rate detected. Review error logs and implement better error handling.") 

1088 

1089 # Anomaly recommendations 

1090 if anomalies: 

1091 recommendations.append("Anomalies detected in system metrics. Review the detailed anomaly report for specific issues.") 

1092 

1093 return recommendations 

1094 

1095 

# Process-wide singleton, created lazily by get_monitoring_system().
_monitoring_system: Optional[ComprehensiveMonitoringSystem] = None


def get_monitoring_system() -> ComprehensiveMonitoringSystem:
    """Return the global monitoring system, creating it on first use.

    NOTE(review): creation is not guarded by a lock, so two threads racing
    through the first call could each build an instance — confirm callers
    initialize from a single thread.
    """
    global _monitoring_system
    if _monitoring_system is None:
        _monitoring_system = ComprehensiveMonitoringSystem()
    return _monitoring_system

1107 

# Convenience functions
def start_monitoring() -> None:
    """Start the global monitoring system (creating it if necessary)."""
    get_monitoring_system().start()

1114 

def stop_monitoring() -> None:
    """Stop the global monitoring system (creating it first if absent)."""
    get_monitoring_system().stop()

1120 

def add_metric(
    metric_type: MetricType,
    value: Union[int, float],
    tags: Optional[Dict[str, str]] = None,
    source: str = "user"
) -> None:
    """Record a custom metric on the global monitoring system.

    Args:
        metric_type: Category of the metric being recorded.
        value: Numeric value of the observation.
        tags: Optional key/value labels attached to the data point.
        source: Origin label for the metric (defaults to "user").
    """
    get_monitoring_system().add_metric(metric_type, value, tags, source)

1130 

1131 

def get_dashboard_data() -> Dict[str, Any]:
    """Return the global monitoring system's dashboard snapshot."""
    return get_monitoring_system().get_dashboard_data()

1136 

1137 

if __name__ == "__main__":
    # Demo: run the monitoring system briefly, feed synthetic samples,
    # then print a status summary and an analytics report.
    print("Starting Comprehensive Monitoring System...")

    monitoring = ComprehensiveMonitoringSystem()
    monitoring.start()

    try:
        # Feed a short linear ramp of CPU/memory samples, one per second.
        for step in range(10):
            monitoring.add_metric(MetricType.CPU_USAGE, 50 + step * 3)
            monitoring.add_metric(MetricType.MEMORY_USAGE, 60 + step * 2)
            time.sleep(1)

        # Dashboard snapshot
        dashboard_data = monitoring.get_dashboard_data()
        print(f"System Health: {dashboard_data['health']['status']}")
        print(f"Overall Score: {dashboard_data['health']['overall_score']}")
        print(f"Active Alerts: {len(dashboard_data['active_alerts'])}")

        # One-hour analytics report
        report = monitoring.get_analytics_report(hours=1)
        print(f"Analytics Report: {len(report['metrics_summary'])} metric types tracked")

    finally:
        # Always shut down cleanly, even if the demo raises.
        monitoring.stop()
        print("Monitoring stopped.")