Coverage for src / moai_adk / core / error_recovery_system.py: 0.00%
494 statements
« prev ^ index » next coverage.py v7.12.0, created at 2025-11-20 20:52 +0900
« prev ^ index » next coverage.py v7.12.0, created at 2025-11-20 20:52 +0900
1"""
2Comprehensive Error Handling & Recovery System for Research Workflows
4Provides:
5- Error detection and classification
6- Recovery procedures and fallback mechanisms
7- Integration with research hooks, agents, and skills
8- Documentation of error handling procedures
9- Troubleshooting guides and automated recovery
11Features:
12- Multi-level error handling (critical, warning, info)
13- Automatic recovery mechanisms
14- Manual recovery procedures
15- Error logging and tracking
16- System health monitoring
17- Emergency recovery procedures
18"""
20import hashlib
21import json
22import logging
23import os
24import sys
25import tempfile
26import threading
27import time
28import traceback
29from dataclasses import asdict, dataclass
30from datetime import datetime, timedelta, timezone
31from enum import Enum
32from pathlib import Path
33from typing import Any, Callable, Dict, List, Optional
# Root logging setup: every record at INFO or above is written both to a log
# file in the system temp directory and to standard output.
logging.basicConfig(
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    level=logging.INFO,
    handlers=[
        logging.FileHandler(Path(tempfile.gettempdir()) / "moai_error_recovery.log"),
        logging.StreamHandler(sys.stdout),
    ],
)

# Module-level logger used by everything defined in this file.
logger = logging.getLogger(__name__)
class ErrorSeverity(Enum):
    """Severity levels that can be attached to an error, most urgent first."""

    # System failure; immediate attention required.
    CRITICAL = "critical"
    # Major functionality impacted.
    HIGH = "high"
    # Partial functionality impacted.
    MEDIUM = "medium"
    # Minor issue that can be deferred.
    LOW = "low"
    # Informational message only.
    INFO = "info"
class ErrorCategory(Enum):
    """Categories used to classify where an error originated."""

    # System-level errors.
    SYSTEM = "system"
    # Configuration errors.
    CONFIGURATION = "configuration"
    # Research workflow errors.
    RESEARCH = "research"
    # Integration errors.
    INTEGRATION = "integration"
    # Agent / communication errors.
    COMMUNICATION = "communication"
    # Validation errors.
    VALIDATION = "validation"
    # Performance issues.
    PERFORMANCE = "performance"
    # Resource exhaustion.
    RESOURCE = "resource"
    # Network-related errors.
    NETWORK = "network"
    # User input errors.
    USER_INPUT = "user_input"
@dataclass
class ErrorReport:
    """Record describing one handled error and the outcome of any recovery."""

    # Identifier assigned to this error.
    id: str
    # Moment the error was recorded.
    timestamp: datetime
    # Severity and category the error was reported with.
    severity: ErrorSeverity
    category: ErrorCategory
    # Human-readable error message.
    message: str
    # Extra structured information about the exception.
    details: Dict[str, Any]
    # Formatted stack trace text, when one is available.
    stack_trace: Optional[str]
    # Caller-supplied context for the error.
    context: Dict[str, Any]
    # Recovery bookkeeping; all start out "nothing attempted yet".
    recovery_attempted: bool = False
    recovery_successful: bool = False
    resolution_message: Optional[str] = None
@dataclass
class RecoveryAction:
    """Definition of a recovery action that can be registered with the system."""

    # Unique name the action is registered and looked up under.
    name: str
    # Human-readable explanation of what the action does.
    description: str
    # One of "automatic", "manual", "assisted".
    action_type: str
    # Severities and categories this action applies to.
    severity_filter: List[ErrorSeverity]
    category_filter: List[ErrorCategory]
    # Callable that performs the recovery.
    handler: Callable
    # Optional tuning knobs carried alongside the action.
    timeout: Optional[float] = None
    max_attempts: int = 3
    success_criteria: Optional[str] = None
@dataclass
class RecoveryResult:
    """Outcome of a single recovery attempt.

    ``duration`` defaults to ``0.0`` so that results describing a recovery
    that never actually ran (unknown error id, unknown action, no suitable
    automatic action) can be built without a timing value. Without the
    default, constructing such a result raises ``TypeError``.
    """

    # Whether the recovery succeeded.
    success: bool
    # Name of the recovery action the result refers to.
    action_name: str
    # Human-readable outcome description.
    message: str
    # Elapsed seconds spent in the recovery handler; 0.0 when nothing ran.
    duration: float = 0.0
    # Optional extra data produced by the attempt.
    details: Optional[Dict[str, Any]] = None
    # Optional follow-up action names.
    next_actions: Optional[List[str]] = None
117class ErrorRecoverySystem:
118 """Comprehensive error handling and recovery system"""
def __init__(self, project_root: Path = None):
    """Set up error tracking state, default recovery actions and monitoring.

    Args:
        project_root: Project directory; the current working directory is
            used when this is falsy.

    Creates ``<project_root>/.moai/error_logs`` if needed and starts a
    daemon thread running ``_background_monitoring``.
    """
    self.project_root = project_root or Path.cwd()
    self.error_log_dir = self.project_root / ".moai" / "error_logs"
    self.error_log_dir.mkdir(parents=True, exist_ok=True)

    # Containers for currently open errors, the full history, and the
    # registry of recovery actions keyed by name.
    self.active_errors: Dict[str, ErrorReport] = {}
    self.error_history: List[ErrorReport] = []
    self.recovery_actions: Dict[str, RecoveryAction] = {}

    # Running counters updated as errors are handled.
    self.error_stats: Dict[str, Any] = {
        "total_errors": 0,
        "by_severity": {},
        "by_category": {},
        "recovery_success_rate": 0.0,
    }

    # Snapshot of overall health, refreshed by _update_system_health().
    self.system_health = {
        "status": "healthy",
        "last_check": datetime.now(timezone.utc),
        "issues": [],
        "metrics": {},
    }

    # Populate the registry with the built-in recovery actions.
    self._initialize_recovery_actions()

    # The monitor loop runs while this flag stays true; the thread is a
    # daemon so it does not keep the interpreter alive.
    self.monitoring_active = True
    monitor = threading.Thread(target=self._background_monitoring, daemon=True)
    self.monitor_thread = monitor
    monitor.start()

    logger.info("Error Recovery System initialized")
def handle_error(
    self,
    error: Exception,
    context: Dict[str, Any] = None,
    severity: ErrorSeverity = ErrorSeverity.MEDIUM,
    category: ErrorCategory = ErrorCategory.SYSTEM,
) -> ErrorReport:
    """
    Handle an error with comprehensive logging and recovery

    The error is turned into an ErrorReport, written to the log, counted in
    the statistics and stored in both the active set and the history. For
    CRITICAL and HIGH severities an automatic recovery is attempted; on
    success the error is removed from the active set again.

    Args:
        error: Exception that occurred
        context: Additional context information (treated as empty when omitted)
        severity: Error severity level
        category: Error category

    Returns:
        ErrorReport with handling details
    """
    error_id = self._generate_error_id()
    timestamp = datetime.now(timezone.utc)

    # Format the traceback of the exception that was actually passed in.
    # traceback.format_exc() only reflects the exception currently being
    # handled by the interpreter, so it yields "NoneType: None" (or an
    # unrelated trace) when this method is called outside the matching
    # ``except`` block.
    stack_trace = "".join(
        traceback.format_exception(type(error), error, error.__traceback__)
    )

    # Create error report
    error_report = ErrorReport(
        id=error_id,
        timestamp=timestamp,
        severity=severity,
        category=category,
        message=str(error),
        details={
            "exception_type": type(error).__name__,
            "exception_module": type(error).__module__,
            "error_code": getattr(error, "code", None),
        },
        stack_trace=stack_trace,
        context=context or {},
        recovery_attempted=False,
        recovery_successful=False,
    )

    # Log error
    self._log_error(error_report)

    # Update statistics
    self._update_error_stats(error_report)

    # Store error
    self.active_errors[error_id] = error_report
    self.error_history.append(error_report)

    # Attempt automatic recovery.
    # NOTE(review): only CRITICAL/HIGH errors get here, so automatic actions
    # registered with a MEDIUM-only severity filter are never triggered from
    # this path — confirm whether that is intended.
    if severity in [ErrorSeverity.CRITICAL, ErrorSeverity.HIGH]:
        recovery_result = self._attempt_automatic_recovery(error_report)
        error_report.recovery_attempted = True
        error_report.recovery_successful = recovery_result.success
        error_report.resolution_message = recovery_result.message

        if recovery_result.success:
            logger.info(f"Automatic recovery successful for error {error_id}")
            self.active_errors.pop(error_id, None)
        else:
            logger.warning(
                f"Automatic recovery failed for error {error_id}: {recovery_result.message}"
            )

    # Update system health
    self._update_system_health()

    return error_report
def register_recovery_action(self, action: RecoveryAction):
    """
    Add a recovery action to the registry, keyed by its name.

    An action already stored under the same name is overwritten.

    Args:
        action: RecoveryAction definition
    """
    action_name = action.name
    self.recovery_actions[action_name] = action
    logger.info(f"Registered recovery action: {action_name}")
def attempt_manual_recovery(
    self, error_id: str, action_name: str, parameters: Dict[str, Any] = None
) -> RecoveryResult:
    """
    Attempt manual recovery for a specific error

    Args:
        error_id: ID of error to recover
        action_name: Name of recovery action to attempt
        parameters: Additional parameters for recovery (empty when omitted)

    Returns:
        RecoveryResult with operation details. A failed result is returned,
        rather than an exception raised, when the error or the action is
        unknown or when the handler itself raises.
    """
    # Nothing runs for unknown ids/actions, so report a zero duration.
    # ``duration`` is passed explicitly because leaving it out raises
    # TypeError when the result type requires it.
    if error_id not in self.active_errors:
        return RecoveryResult(
            success=False,
            action_name=action_name,
            message=f"Error {error_id} not found in active errors",
            duration=0.0,
        )

    if action_name not in self.recovery_actions:
        return RecoveryResult(
            success=False,
            action_name=action_name,
            message=f"Recovery action {action_name} not found",
            duration=0.0,
        )

    error_report = self.active_errors[error_id]
    recovery_action = self.recovery_actions[action_name]

    logger.info(f"Attempting manual recovery {action_name} for error {error_id}")

    # Record that a recovery was tried, whatever its outcome.
    error_report.recovery_attempted = True

    start_time = time.time()
    try:
        # Execute recovery action
        result = recovery_action.handler(error_report, parameters or {})
    except Exception as e:
        return RecoveryResult(
            success=False,
            action_name=action_name,
            message=f"Manual recovery failed: {str(e)}",
            duration=time.time() - start_time,
            details={"exception": str(e)},
        )

    duration = time.time() - start_time

    # A falsy handler result means the action ran but did not succeed.
    if not result:
        return RecoveryResult(
            success=False,
            action_name=action_name,
            message="Manual recovery returned unsuccessful result",
            duration=duration,
        )

    recovery_result = RecoveryResult(
        success=True,
        action_name=action_name,
        message="Manual recovery completed successfully",
        duration=duration,
        details={"result": result},
    )

    # Update error report
    error_report.recovery_successful = True
    error_report.resolution_message = recovery_result.message

    # Remove from active errors
    self.active_errors.pop(error_id, None)

    return recovery_result
def get_system_health(self) -> Dict[str, Any]:
    """
    Refresh the health snapshot and return it as a plain dictionary.

    Returns:
        System health information; ``last_check`` is an ISO-8601 string and
        the stats, issues and metrics containers are shallow copies.
    """
    self._update_system_health()

    health = self.system_health
    report: Dict[str, Any] = {}
    report["status"] = health["status"]
    report["last_check"] = health["last_check"].isoformat()
    report["active_errors"] = len(self.active_errors)
    report["total_errors"] = len(self.error_history)
    report["error_stats"] = self.error_stats.copy()
    report["issues"] = health["issues"].copy()
    report["metrics"] = health["metrics"].copy()
    report["recovery_actions_available"] = len(self.recovery_actions)
    return report
def get_error_summary(self, limit: int = 50) -> Dict[str, Any]:
    """
    Get summary of recent errors

    Args:
        limit: Maximum number of errors to include. Zero or a negative
            value yields an empty summary.

    Returns:
        Error summary information: counts by severity and category, the
        pattern frequencies, the recovery rate, and a brief listing of at
        most the last 10 of the selected errors.
    """
    # A bare ``history[-limit:]`` returns the *whole* history for limit == 0
    # (since -0 == 0) and drops a prefix for negative limits, so guard it.
    recent_errors = self.error_history[-limit:] if limit > 0 else []

    # Count errors per severity and per category.
    by_severity: Dict[str, int] = {}
    by_category: Dict[str, int] = {}
    for error in recent_errors:
        severity = error.severity.value
        by_severity[severity] = by_severity.get(severity, 0) + 1

        category = error.category.value
        by_category[category] = by_category.get(category, 0) + 1

    # Common error patterns
    error_patterns = self._identify_error_patterns(recent_errors)

    return {
        "total_recent_errors": len(recent_errors),
        "active_errors": len(self.active_errors),
        "by_severity": by_severity,
        "by_category": by_category,
        "common_patterns": error_patterns,
        "recovery_rate": self._calculate_recovery_rate(recent_errors),
        "recent_errors": [
            {
                "id": error.id,
                "timestamp": error.timestamp.isoformat(),
                "severity": error.severity.value,
                "category": error.category.value,
                "message": error.message,
                "recovered": error.recovery_successful,
            }
            for error in recent_errors[-10:]  # Last 10 errors
        ],
    }
def generate_troubleshooting_guide(self) -> Dict[str, Any]:
    """
    Build a troubleshooting guide from the error history and the registry.

    Returns:
        Troubleshooting guide with the generation time, error patterns seen
        more than twice, a description of every registered recovery action,
        prevention tips and emergency procedures.
    """
    generated_at = datetime.now(timezone.utc).isoformat()

    # Only patterns seen more than twice count as "common".
    pattern_counts = self._identify_error_patterns(self.error_history)
    common_issues = [
        {
            "pattern": name,
            "frequency": count,
            "severity": self._get_pattern_severity(name),
            "solutions": self._get_solutions_for_pattern(name),
        }
        for name, count in pattern_counts.items()
        if count > 2
    ]

    # One entry per registered recovery action.
    recovery_procedures = {
        registered_name: {
            "description": registered.description,
            "type": registered.action_type,
            "for_severities": [sev.value for sev in registered.severity_filter],
            "for_categories": [cat.value for cat in registered.category_filter],
        }
        for registered_name, registered in self.recovery_actions.items()
    }

    prevention_tips = self._generate_prevention_tips()
    emergency_procedures = self._generate_emergency_procedures()

    return {
        "generated_at": generated_at,
        "common_issues": common_issues,
        "recovery_procedures": recovery_procedures,
        "prevention_tips": prevention_tips,
        "emergency_procedures": emergency_procedures,
    }
def cleanup_old_errors(self, days_to_keep: int = 30) -> Dict[str, Any]:
    """
    Drop history entries older than the retention window.

    Args:
        days_to_keep: Number of days to keep error records

    Returns:
        Cleanup operation results: how many records were removed, how many
        remain, and the cutoff timestamp in ISO-8601 form.
    """
    cutoff_date = datetime.now(timezone.utc) - timedelta(days=days_to_keep)

    # Split the history around the cutoff; anything strictly older goes.
    expired = [record for record in self.error_history if record.timestamp < cutoff_date]
    survivors = [
        record for record in self.error_history if record.timestamp >= cutoff_date
    ]
    removed_count = len(expired)
    self.error_history = survivors

    # Persist the trimmed history.
    self._save_error_history()

    logger.info(f"Cleaned up {removed_count} old error records")

    summary = {
        "removed_count": removed_count,
        "remaining_count": len(self.error_history),
        "cutoff_date": cutoff_date.isoformat(),
    }
    return summary
def _initialize_recovery_actions(self):
    """Register the built-in recovery actions, in a fixed order."""
    default_actions = [
        # System recovery actions
        RecoveryAction(
            name="restart_research_engines",
            description="Restart research engines and clear caches",
            action_type="automatic",
            severity_filter=[ErrorSeverity.HIGH, ErrorSeverity.CRITICAL],
            category_filter=[ErrorCategory.RESEARCH, ErrorCategory.SYSTEM],
            handler=self._restart_research_engines,
            timeout=30.0,
        ),
        RecoveryAction(
            name="restore_config_backup",
            description="Restore configuration from last known good backup",
            action_type="automatic",
            severity_filter=[ErrorSeverity.CRITICAL],
            category_filter=[ErrorCategory.CONFIGURATION],
            handler=self._restore_config_backup,
            timeout=15.0,
        ),
        RecoveryAction(
            name="clear_agent_cache",
            description="Clear agent communication cache and reset connections",
            action_type="automatic",
            severity_filter=[ErrorSeverity.MEDIUM, ErrorSeverity.HIGH],
            category_filter=[ErrorCategory.COMMUNICATION],
            handler=self._clear_agent_cache,
            timeout=10.0,
        ),
        RecoveryAction(
            name="validate_research_integrity",
            description="Validate research component integrity and repair if needed",
            action_type="assisted",
            severity_filter=[ErrorSeverity.HIGH],
            category_filter=[ErrorCategory.RESEARCH, ErrorCategory.VALIDATION],
            handler=self._validate_research_integrity,
            timeout=60.0,
        ),
        RecoveryAction(
            name="rollback_last_changes",
            description="Rollback last research integration changes",
            action_type="manual",
            severity_filter=[ErrorSeverity.CRITICAL],
            category_filter=[ErrorCategory.INTEGRATION, ErrorCategory.RESEARCH],
            handler=self._rollback_last_changes,
            timeout=45.0,
        ),
        RecoveryAction(
            name="reset_system_state",
            description="Reset system to known good state",
            action_type="manual",
            severity_filter=[ErrorSeverity.CRITICAL],
            category_filter=[ErrorCategory.SYSTEM],
            handler=self._reset_system_state,
            timeout=120.0,
        ),
        # Performance recovery actions
        RecoveryAction(
            name="optimize_performance",
            description="Optimize system performance and clear bottlenecks",
            action_type="automatic",
            severity_filter=[ErrorSeverity.MEDIUM],
            category_filter=[ErrorCategory.PERFORMANCE],
            handler=self._optimize_performance,
            timeout=30.0,
        ),
        # Resource recovery actions
        RecoveryAction(
            name="free_resources",
            description="Free up system resources and memory",
            action_type="automatic",
            severity_filter=[ErrorSeverity.MEDIUM, ErrorSeverity.HIGH],
            category_filter=[ErrorCategory.RESOURCE],
            handler=self._free_resources,
            timeout=20.0,
        ),
    ]

    for default_action in default_actions:
        self.register_recovery_action(default_action)
def _attempt_automatic_recovery(self, error_report: ErrorReport) -> RecoveryResult:
    """Attempt automatic recovery for an error.

    Every registered action whose type is "automatic" and whose severity
    and category filters both include the error's values is tried in
    registration order. The first handler returning a truthy value wins.
    Handlers that raise are logged and skipped.

    Args:
        error_report: Report of the error to recover from.

    Returns:
        A successful RecoveryResult for the first action that worked, or a
        failed result (action name "none") when none did.
    """
    overall_start = time.time()

    # Find suitable recovery actions
    suitable_actions = [
        action
        for action in self.recovery_actions.values()
        if action.action_type == "automatic"
        and error_report.severity in action.severity_filter
        and error_report.category in action.category_filter
    ]

    # Try actions in registration order
    for action in suitable_actions:
        try:
            logger.info(f"Attempting automatic recovery: {action.name}")

            start_time = time.time()
            result = action.handler(error_report, {})
            duration = time.time() - start_time

            if result:
                return RecoveryResult(
                    success=True,
                    action_name=action.name,
                    message=f"Automatic recovery successful: {action.name}",
                    duration=duration,
                    details={"result": result},
                )

        except Exception as e:
            logger.warning(f"Recovery action {action.name} failed: {str(e)}")
            continue

    # ``duration`` is supplied explicitly: omitting it raises TypeError when
    # the result type requires the field, which would turn every failed
    # recovery into a crash inside the error handler itself.
    return RecoveryResult(
        success=False,
        action_name="none",
        message="No suitable automatic recovery action succeeded",
        duration=time.time() - overall_start,
    )
def _restart_research_engines(
    self, error_report: ErrorReport, parameters: Dict[str, Any]
) -> bool:
    """Wipe the research caches and state file, then reinitialize components.

    Returns True on success. Any exception is logged and reported as False.
    """
    try:
        logger.info("Restarting research engines...")

        # Empty each existing cache directory by deleting and recreating it.
        for cache_path in (
            self.project_root / ".moai" / "cache",
            self.project_root / ".claude" / "cache",
        ):
            if not cache_path.exists():
                continue
            import shutil

            shutil.rmtree(cache_path)
            cache_path.mkdir(parents=True, exist_ok=True)

        # Drop the persisted research engine state, if present.
        state_path = self.project_root / ".moai" / "research_state.json"
        if state_path.exists():
            state_path.unlink()

        # Bring the research components back up.
        self._reinitialize_research_components()

        logger.info("Research engines restarted successfully")
        return True

    except Exception as exc:
        logger.error(f"Failed to restart research engines: {str(exc)}")
        return False
def _restore_config_backup(
    self, error_report: ErrorReport, parameters: Dict[str, Any]
) -> bool:
    """Restore configuration from the most recently modified backup.

    Backups are ``config_*.json`` files under ``.moai/config_backups``. The
    newest one (by modification time) is copied over
    ``.moai/config/config.json``.

    Returns:
        True when a backup was copied. False when no backup directory or
        backup file exists, or when any step raises (the error is logged).
    """
    try:
        logger.info("Restoring configuration from backup...")

        backup_dir = self.project_root / ".moai" / "config_backups"
        if not backup_dir.exists():
            logger.warning("No configuration backup directory found")
            return False

        # Find most recent backup
        backup_files = list(backup_dir.glob("config_*.json"))
        if not backup_files:
            logger.warning("No configuration backups found")
            return False

        latest_backup = max(backup_files, key=lambda f: f.stat().st_mtime)

        # Restore configuration
        config_file = self.project_root / ".moai" / "config" / "config.json"
        # The config directory itself may be missing, and that is exactly
        # the situation a restore should repair rather than fail on.
        config_file.parent.mkdir(parents=True, exist_ok=True)
        import shutil

        shutil.copy2(latest_backup, config_file)

        logger.info(f"Configuration restored from {latest_backup}")
        return True

    except Exception as e:
        logger.error(f"Failed to restore configuration: {str(e)}")
        return False
def _clear_agent_cache(
    self, error_report: ErrorReport, parameters: Dict[str, Any]
) -> bool:
    """Empty the agent state and communication cache directories.

    Returns True on success. Any exception is logged and reported as False.
    """
    try:
        logger.info("Clearing agent cache...")

        moai_root = self.project_root / ".moai"
        # Agent state files first, then the communication channel cache.
        for stale_dir in (moai_root / "agent_state", moai_root / "comm_cache"):
            if not stale_dir.exists():
                continue
            import shutil

            shutil.rmtree(stale_dir)
            stale_dir.mkdir(parents=True, exist_ok=True)

        logger.info("Agent cache cleared successfully")
        return True

    except Exception as exc:
        logger.error(f"Failed to clear agent cache: {str(exc)}")
        return False
def _validate_research_integrity(
    self, error_report: ErrorReport, parameters: Dict[str, Any]
) -> Dict[str, Any]:
    """Check skill, agent and command markdown files and report problems.

    Invalid skill files additionally get a repair attempt. The returned
    dictionary carries one ``*_valid`` flag per component kind, the list of
    issues found, the list of repairs made and, if the scan itself raised,
    a ``validation_error`` message.
    """
    report = {
        "skills_valid": True,
        "agents_valid": True,
        "commands_valid": True,
        "hooks_valid": True,
        "issues_found": [],
        "repairs_made": [],
    }

    try:
        logger.info("Validating research integrity...")

        claude_root = self.project_root / ".claude"

        # (directory, validator, flag key, label used in the issue text,
        #  optional repair function) — processed in this order.
        checks = (
            (
                claude_root / "skills",
                self._validate_skill_file,
                "skills_valid",
                "skill",
                self._repair_skill_file,
            ),
            (
                claude_root / "agents" / "alfred",
                self._validate_agent_file,
                "agents_valid",
                "agent",
                None,
            ),
            (
                claude_root / "commands" / "alfred",
                self._validate_command_file,
                "commands_valid",
                "command",
                None,
            ),
        )

        for directory, is_valid, flag, label, repair in checks:
            if not directory.exists():
                continue
            for md_file in directory.glob("*.md"):
                if is_valid(md_file):
                    continue
                report[flag] = False
                report["issues_found"].append(f"Invalid {label} file: {md_file}")

                # Only skill files have a repair routine.
                if repair is not None and repair(md_file):
                    report["repairs_made"].append(f"Repaired: {md_file}")

        logger.info(
            f"Research integrity validation completed. Issues: "
            f"{len(report['issues_found'])}, "
            f"Repairs: {len(report['repairs_made'])}"
        )

    except Exception as exc:
        logger.error(f"Research integrity validation failed: {str(exc)}")
        report["validation_error"] = str(exc)

    return report
def _rollback_last_changes(
    self, error_report: ErrorReport, parameters: Dict[str, Any]
) -> bool:
    """Rollback last research integration changes.

    Uses ``RollbackManager`` from ``moai_adk.core.rollback_manager`` to roll
    back to the first entry returned by ``list_rollback_points(limit=5)``.

    Returns:
        True when the rollback reports success. False when no rollback
        point exists, the rollback reports failure, or any step raises
        (the error is logged).
    """
    try:
        logger.info("Rolling back last research changes...")

        # Import rollback manager. Make the project's ``src`` directory
        # importable, but only once: inserting unconditionally grows
        # sys.path with a duplicate entry on every call.
        src_path = str(self.project_root / "src")
        if src_path not in sys.path:
            sys.path.insert(0, src_path)
        from moai_adk.core.rollback_manager import RollbackManager

        rollback_manager = RollbackManager(self.project_root)

        # Find latest rollback point for research integration
        rollback_points = rollback_manager.list_rollback_points(limit=5)
        if not rollback_points:
            logger.warning("No rollback points available")
            return False

        # Use the most recent rollback point
        # NOTE(review): presumes list_rollback_points returns newest first — confirm.
        latest_rollback = rollback_points[0]
        result = rollback_manager.rollback_to_point(latest_rollback["id"])

        if result.success:
            logger.info(f"Successfully rolled back to {latest_rollback['id']}")
            return True

        logger.error(f"Rollback failed: {result.message}")
        return False

    except Exception as e:
        logger.error(f"Rollback operation failed: {str(e)}")
        return False
def _reset_system_state(
    self, error_report: ErrorReport, parameters: Dict[str, Any]
) -> bool:
    """Empty all cache directories, forget active errors, reinitialize core.

    Returns True on success. Any exception is logged and reported as False.
    """
    try:
        logger.info("Resetting system to known good state...")

        moai_root = self.project_root / ".moai"
        # Every existing cache-like directory is deleted and recreated empty.
        for target in (
            moai_root / "cache",
            self.project_root / ".claude" / "cache",
            moai_root / "agent_state",
            moai_root / "comm_cache",
        ):
            if not target.exists():
                continue
            import shutil

            shutil.rmtree(target)
            target.mkdir(parents=True, exist_ok=True)

        # Forget every currently active error.
        self.active_errors.clear()

        # Bring the core components back up.
        self._reinitialize_core_components()

        logger.info("System state reset completed")
        return True

    except Exception as exc:
        logger.error(f"System state reset failed: {str(exc)}")
        return False
def _optimize_performance(
    self, error_report: ErrorReport, parameters: Dict[str, Any]
) -> bool:
    """Delete temp directories, optimize connections and run a GC pass.

    Returns True on success. Any exception is logged and reported as False.
    """
    try:
        logger.info("Optimizing system performance...")

        # Remove temporary directories outright (they are not recreated).
        for scratch_dir in (
            self.project_root / ".moai" / "temp",
            self.project_root / ".claude" / "temp",
        ):
            if not scratch_dir.exists():
                continue
            import shutil

            shutil.rmtree(scratch_dir)

        # Optimize database connections if applicable
        self._optimize_connections()

        # Ask the garbage collector for a full collection.
        import gc

        gc.collect()

        logger.info("Performance optimization completed")
        return True

    except Exception as exc:
        logger.error(f"Performance optimization failed: {str(exc)}")
        return False
def _free_resources(
    self, error_report: ErrorReport, parameters: Dict[str, Any]
) -> bool:
    """Run a GC pass, then close file handles and stop hanging processes.

    Returns True on success. Any exception is logged and reported as False.
    """
    try:
        logger.info("Freeing up system resources...")

        # Ask the garbage collector for a full collection.
        import gc

        gc.collect()

        # Delegate the remaining cleanup steps, in order.
        for cleanup_step in (self._close_file_handles, self._terminate_hanging_processes):
            cleanup_step()

        logger.info("Resource cleanup completed")
        return True

    except Exception as exc:
        logger.error(f"Resource cleanup failed: {str(exc)}")
        return False
893 def _generate_error_id(self) -> str:
894 """Generate unique error ID"""
895 timestamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
896 random_suffix = hashlib.md5(os.urandom(4)).hexdigest()[:6]
897 return f"ERR_{timestamp}_{random_suffix}"
def _log_error(self, error_report: ErrorReport):
    """Write the report to its own JSON file and emit a log record for it.

    A failure to write the file is logged and otherwise ignored; the log
    record is emitted either way.
    """
    # One JSON file per error, named after the error id.
    target = self.error_log_dir / f"error_{error_report.id}.json"
    try:
        with open(target, "w", encoding="utf-8") as out:
            json.dump(asdict(error_report), out, indent=2, default=str, ensure_ascii=False)
    except Exception as exc:
        logger.error(f"Failed to log error to file: {str(exc)}")

    # Translate the report's severity into a logging level; anything
    # unrecognised is logged as a warning.
    level_for_severity = {
        ErrorSeverity.CRITICAL: logging.CRITICAL,
        ErrorSeverity.HIGH: logging.ERROR,
        ErrorSeverity.MEDIUM: logging.WARNING,
        ErrorSeverity.LOW: logging.INFO,
        ErrorSeverity.INFO: logging.INFO,
    }
    level = level_for_severity.get(error_report.severity, logging.WARNING)

    logger.log(level, f"Error {error_report.id}: {error_report.message}")
def _update_error_stats(self, error_report: ErrorReport):
    """Bump the total, per-severity and per-category error counters."""
    stats = self.error_stats
    stats["total_errors"] += 1

    # Each bucket starts at zero the first time its key is seen.
    for bucket_name, key in (
        ("by_severity", error_report.severity.value),
        ("by_category", error_report.category.value),
    ):
        bucket = stats[bucket_name]
        bucket[key] = bucket.get(key, 0) + 1
def _update_system_health(self):
    """Recompute the status, metrics and issue list from the active errors."""
    checked_at = datetime.now(timezone.utc)
    open_errors = self.active_errors.values()

    # Status is driven by the worst active severity, then by sheer volume.
    if any(err.severity == ErrorSeverity.CRITICAL for err in open_errors):
        status = "critical"
    elif any(err.severity == ErrorSeverity.HIGH for err in open_errors):
        status = "degraded"
    elif len(self.active_errors) > 5:
        status = "warning"
    else:
        status = "healthy"
    self.system_health["status"] = status

    # Headline numbers.
    self.system_health["last_check"] = checked_at
    self.system_health["metrics"] = {
        "active_errors": len(self.active_errors),
        "total_errors": len(self.error_history),
        "recovery_success_rate": self._calculate_recovery_rate(self.error_history),
    }

    # Number of active errors per severity value.
    distribution = {}
    for err in self.active_errors.values():
        level = err.severity.value
        distribution[level] = distribution.get(level, 0) + 1

    self.system_health["issues"] = [
        {
            "type": "active_errors",
            "count": len(self.active_errors),
            "severity_distribution": distribution,
        }
    ]
def _background_monitoring(self):
    """Loop run by the monitor thread while ``monitoring_active`` is true.

    Each cycle waits 30 seconds, refreshes the health snapshot and scans
    for concerning error patterns. Exceptions are logged and the loop
    carries on.
    """
    while True:
        if not self.monitoring_active:
            break
        try:
            # Wait out the 30-second interval before each check.
            time.sleep(30)
            self._update_system_health()

            # Look for bursts and repeats that need attention.
            self._check_error_patterns()

        except Exception as exc:
            logger.error(f"Background monitoring error: {str(exc)}")
def _check_error_patterns(self):
    """Check for concerning error patterns in the last five minutes.

    Logs a warning when more than 10 errors occurred in that window, and
    another when any single message occurred more than 3 times.
    """
    # Take the reference time once so every error is compared against the
    # same instant instead of re-reading the clock per element.
    now = datetime.now(timezone.utc)
    recent_errors = [
        e for e in self.error_history if (now - e.timestamp).total_seconds() < 300
    ]  # Last 5 minutes

    # Check for error bursts
    if len(recent_errors) > 10:
        logger.warning(
            f"High error rate detected: {len(recent_errors)} errors in last 5 minutes"
        )

    # Check for repeated errors
    message_counts = {}
    for e in recent_errors:
        message_counts[e.message] = message_counts.get(e.message, 0) + 1

    repeated_errors = [msg for msg, count in message_counts.items() if count > 3]
    if repeated_errors:
        logger.warning(f"Repeated errors detected: {repeated_errors}")
def _calculate_recovery_rate(self, errors: List[ErrorReport]) -> float:
    """Return the fraction of ``errors`` marked as successfully recovered.

    An empty input yields 0.0.
    """
    total = len(errors)
    if total == 0:
        return 0.0

    recovered = sum(1 for report in errors if report.recovery_successful)
    return recovered / total
def _identify_error_patterns(self, errors: List[ErrorReport]) -> Dict[str, int]:
    """Count errors per ``"<category>:<exception type>"`` pattern.

    Reports whose details lack an ``exception_type`` are counted under
    ``unknown``.
    """
    counts: Dict[str, int] = {}

    for report in errors:
        exc_type = report.details.get("exception_type", "unknown")
        key = f"{report.category.value}:{exc_type}"
        if key in counts:
            counts[key] += 1
        else:
            counts[key] = 1

    return counts
1044 def _get_pattern_severity(self, pattern: str) -> str:
1045 """Get typical severity for an error pattern"""
1046 severity_map = {
1047 "research:Exception": "high",
1048 "system:Exception": "critical",
1049 "configuration:Exception": "high",
1050 "communication:Exception": "medium",
1051 "validation:Exception": "medium",
1052 }
1054 for key, severity in severity_map.items():
1055 if key in pattern:
1056 return severity
1058 return "medium"
1060 def _get_solutions_for_pattern(self, pattern: str) -> List[str]:
1061 """Get common solutions for error pattern"""
1062 solutions = {
1063 "research:Exception": [
1064 "Restart research engines",
1065 "Clear research cache",
1066 "Validate research components",
1067 ],
1068 "system:Exception": [
1069 "Check system resources",
1070 "Restart system components",
1071 "Verify system configuration",
1072 ],
1073 "configuration:Exception": [
1074 "Restore configuration backup",
1075 "Validate configuration syntax",
1076 "Check configuration permissions",
1077 ],
1078 }
1080 for key, sols in solutions.items():
1081 if key in pattern:
1082 return sols
1084 return ["Contact system administrator", "Check system logs"]
1086 def _generate_prevention_tips(self) -> List[str]:
1087 """Generate prevention tips based on error history"""
1088 tips = []
1090 # Add tips based on common error categories
1091 category_counts = {}
1092 for error in self.error_history:
1093 category = error.category.value
1094 category_counts[category] = category_counts.get(category, 0) + 1
1096 if category_counts.get("configuration", 0) > 5:
1097 tips.append("Regularly validate configuration files before making changes")
1099 if category_counts.get("research", 0) > 5:
1100 tips.append(
1101 "Monitor research engine performance and clear caches regularly"
1102 )
1104 if category_counts.get("communication", 0) > 5:
1105 tips.append("Ensure stable network connections for agent communication")
1107 return tips
1109 def _generate_emergency_procedures(self) -> List[Dict[str, str]]:
1110 """Generate emergency recovery procedures"""
1111 return [
1112 {
1113 "condition": "System completely unresponsive",
1114 "procedure": "Use system_reset recovery action to restore to known good state",
1115 },
1116 {
1117 "condition": "Critical research engine failure",
1118 "procedure": "Rollback last research changes using rollback_last_changes action",
1119 },
1120 {
1121 "condition": "Configuration corruption",
1122 "procedure": "Restore configuration from backup using restore_config_backup action",
1123 },
1124 {
1125 "condition": "Multiple agent communication failures",
1126 "procedure": "Clear agent cache and restart communication channels",
1127 },
1128 ]
1130 # Helper methods for component validation and repair
1131 def _validate_skill_file(self, skill_file: Path) -> bool:
1132 """Validate skill file format"""
1133 try:
1134 with open(skill_file, "r", encoding="utf-8") as f:
1135 content = f.read()
1137 # Basic validation
1138 return "---" in content and len(content) > 100
1139 except (OSError, UnicodeDecodeError):
1140 return False
1142 def _validate_agent_file(self, agent_file: Path) -> bool:
1143 """Validate agent file format"""
1144 try:
1145 with open(agent_file, "r", encoding="utf-8") as f:
1146 content = f.read()
1148 return "role:" in content and len(content) > 200
1149 except (OSError, UnicodeDecodeError):
1150 return False
1152 def _validate_command_file(self, command_file: Path) -> bool:
1153 """Validate command file format"""
1154 try:
1155 with open(command_file, "r", encoding="utf-8") as f:
1156 content = f.read()
1158 return "name:" in content and "allowed-tools:" in content
1159 except (OSError, UnicodeDecodeError):
1160 return False
1162 def _repair_skill_file(self, skill_file: Path) -> bool:
1163 """Attempt to repair skill file"""
1164 try:
1165 # Basic repair - ensure file has minimum required content
1166 with open(skill_file, "r", encoding="utf-8") as f:
1167 content = f.read()
1169 if not content.startswith("---"):
1170 content = f"---\nname: {skill_file.stem}\ndescription: Repaired skill file\n---\n\n{content}"
1172 with open(skill_file, "w", encoding="utf-8") as f:
1173 f.write(content)
1175 return True
1176 except (OSError, UnicodeDecodeError):
1177 return False
1179 def _reinitialize_research_components(self):
1180 """Reinitialize research components"""
1181 # Implementation would depend on specific research components
1182 pass
1184 def _reinitialize_core_components(self):
1185 """Reinitialize core system components"""
1186 # Implementation would depend on specific core components
1187 pass
1189 def _optimize_connections(self):
1190 """Optimize database/network connections"""
1191 # Implementation would depend on specific connection types
1192 pass
1194 def _close_file_handles(self):
1195 """Close open file handles"""
1196 import gc
1198 gc.collect() # Force garbage collection to close file handles
1200 def _terminate_hanging_processes(self):
1201 """Terminate hanging processes"""
1202 # Implementation would identify and terminate hanging processes
1203 pass
1205 def _save_error_history(self):
1206 """Save error history to file"""
1207 history_file = self.error_log_dir / "error_history.json"
1208 try:
1209 with open(history_file, "w") as f:
1210 json.dump(
1211 [asdict(e) for e in self.error_history], f, indent=2, default=str
1212 )
1213 except Exception as e:
1214 logger.error(f"Failed to save error history: {str(e)}")
# Global error recovery system instance (created lazily on first access)
_error_recovery_system: Optional["ErrorRecoverySystem"] = None


def get_error_recovery_system(project_root: Optional[Path] = None) -> ErrorRecoverySystem:
    """Get or create the global error recovery system instance.

    Args:
        project_root: Passed to ``ErrorRecoverySystem`` when the instance is
            first created. It is ignored on later calls because the existing
            instance is returned unchanged.

    Returns:
        The shared module-level ``ErrorRecoverySystem`` instance.
    """
    global _error_recovery_system
    if _error_recovery_system is None:
        _error_recovery_system = ErrorRecoverySystem(project_root)
    return _error_recovery_system
def handle_error(
    error: Exception,
    context: Optional[Dict[str, Any]] = None,
    severity: ErrorSeverity = ErrorSeverity.MEDIUM,
    category: ErrorCategory = ErrorCategory.SYSTEM,
) -> ErrorReport:
    """Handle an error through the global error recovery system.

    Convenience wrapper that forwards all arguments to the ``handle_error``
    method of the instance returned by ``get_error_recovery_system()``.

    Args:
        error: The exception to handle.
        context: Optional extra information about where the error occurred.
        severity: Severity assigned to the error (defaults to MEDIUM).
        category: Category assigned to the error (defaults to SYSTEM).

    Returns:
        The report produced by the global system for this error.
    """
    return get_error_recovery_system().handle_error(error, context, severity, category)
1239# Decorator for automatic error handling
def error_handler(
    severity: ErrorSeverity = ErrorSeverity.MEDIUM,
    category: ErrorCategory = ErrorCategory.SYSTEM,
    context: Optional[Dict[str, Any]] = None,
):
    """Decorator factory for automatic error handling.

    The decorated function runs normally. If it raises, the exception is
    reported through ``handle_error`` and then re-raised, so callers still
    see the original exception.

    Args:
        severity: Severity to report for exceptions (defaults to MEDIUM).
        category: Category to report for exceptions (defaults to SYSTEM).
        context: Optional extra context entries merged into the report
            context; they override the automatically collected keys.

    Returns:
        A decorator that wraps a function with the reporting behaviour.
    """
    # Local import: functools is not among the module-level imports.
    from functools import wraps

    def decorator(func):
        # wraps() keeps __name__, __doc__ and __module__ of the decorated
        # function so introspection and logging still show the real function.
        @wraps(func)
        def wrapper(*args, **kwargs):
            try:
                return func(*args, **kwargs)
            except Exception as e:
                error_context = {
                    "function": func.__name__,
                    "module": func.__module__,
                    "args": str(args)[:100],  # Limit length
                    "kwargs": str(kwargs)[:100],
                    **(context or {}),
                }
                handle_error(e, error_context, severity, category)
                raise

        return wrapper

    return decorator
if __name__ == "__main__":
    # Small demonstration: handle one simulated error, then print health and summary.
    demo_system = ErrorRecoverySystem()

    print("Error Recovery System Demo")
    print("=" * 50)

    # Raise and immediately handle a sample error
    try:
        raise ValueError("This is a test error for demonstration")
    except Exception as exc:
        report = demo_system.handle_error(
            exc,
            context={"demo": True},
            severity=ErrorSeverity.MEDIUM,
            category=ErrorCategory.SYSTEM,
        )
        print(f"Handled error: {report.id}")

    # Report the overall system health status
    health_info = demo_system.get_system_health()
    print(f"System health: {health_info['status']}")

    # Report how many recent errors were recorded
    error_summary = demo_system.get_error_summary()
    print(f"Total errors: {error_summary['total_recent_errors']}")

    print("\nError Recovery System demo completed")