Coverage for src / moai_adk / core / error_recovery_system.py: 0.00%

494 statements  

« prev     ^ index     » next       coverage.py v7.12.0, created at 2025-11-20 20:52 +0900

1""" 

2Comprehensive Error Handling & Recovery System for Research Workflows 

3 

4Provides: 

5- Error detection and classification 

6- Recovery procedures and fallback mechanisms 

7- Integration with research hooks, agents, and skills 

8- Documentation of error handling procedures 

9- Troubleshooting guides and automated recovery 

10 

11Features: 

12- Multi-level error handling (critical, warning, info) 

13- Automatic recovery mechanisms 

14- Manual recovery procedures 

15- Error logging and tracking 

16- System health monitoring 

17- Emergency recovery procedures 

18""" 

19 

20import hashlib 

21import json 

22import logging 

23import os 

24import sys 

25import tempfile 

26import threading 

27import time 

28import traceback 

29from dataclasses import asdict, dataclass 

30from datetime import datetime, timedelta, timezone 

31from enum import Enum 

32from pathlib import Path 

33from typing import Any, Callable, Dict, List, Optional 

34 

# Module-wide logging setup: every record at INFO or above goes both to a log
# file in the system temp directory and to standard output.
_log_handlers = [
    logging.FileHandler(Path(tempfile.gettempdir()) / "moai_error_recovery.log"),
    logging.StreamHandler(sys.stdout),
]
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    handlers=_log_handlers,
)

# Logger used by everything in this module.
logger = logging.getLogger(__name__)

46 

47 

class ErrorSeverity(Enum):
    """Severity assigned to a reported error, listed from most to least urgent."""

    # System failure, immediate attention required.
    CRITICAL = "critical"
    # Major functionality impacted.
    HIGH = "high"
    # Partial functionality impacted.
    MEDIUM = "medium"
    # Minor issue, can be deferred.
    LOW = "low"
    # Informational message.
    INFO = "info"

56 

57 

class ErrorCategory(Enum):
    """Classification buckets used to group reported errors."""

    # System-level errors.
    SYSTEM = "system"
    # Configuration errors.
    CONFIGURATION = "configuration"
    # Research workflow errors.
    RESEARCH = "research"
    # Integration errors.
    INTEGRATION = "integration"
    # Agent/communication errors.
    COMMUNICATION = "communication"
    # Validation errors.
    VALIDATION = "validation"
    # Performance issues.
    PERFORMANCE = "performance"
    # Resource exhaustion.
    RESOURCE = "resource"
    # Network-related errors.
    NETWORK = "network"
    # User input errors.
    USER_INPUT = "user_input"

71 

72 

@dataclass
class ErrorReport:
    """Record describing one handled error and the outcome of any recovery."""

    # Identifier under which the error is tracked.
    id: str
    # When the error was reported.
    timestamp: datetime
    # Severity and category chosen by the reporter.
    severity: ErrorSeverity
    category: ErrorCategory
    # Human-readable error message.
    message: str
    # Structured facts about the exception itself.
    details: Dict[str, Any]
    # Formatted traceback text, if any.
    stack_trace: Optional[str]
    # Caller-supplied context.
    context: Dict[str, Any]
    # Recovery bookkeeping; all start out "not attempted / not resolved".
    recovery_attempted: bool = False
    recovery_successful: bool = False
    resolution_message: Optional[str] = None

88 

89 

@dataclass
class RecoveryAction:
    """Description of a recovery procedure and the errors it applies to."""

    # Unique name under which the action is registered.
    name: str
    # Human-readable summary of what the action does.
    description: str
    # One of "automatic", "manual", "assisted".
    action_type: str
    # Severities and categories this action is eligible for.
    severity_filter: List[ErrorSeverity]
    category_filter: List[ErrorCategory]
    # Callable that performs the recovery.
    handler: Callable
    # Optional time budget for the handler.
    timeout: Optional[float] = None
    # Upper bound on attempts.
    max_attempts: int = 3
    # Optional description of what counts as success.
    success_criteria: Optional[str] = None

103 

104 

@dataclass
class RecoveryResult:
    """Outcome of a single recovery attempt.

    Attributes:
        success: Whether the recovery succeeded.
        action_name: Name of the action that produced this result.
        message: Human-readable outcome description.
        duration: Seconds spent on the attempt. Defaults to 0.0 so a result
            can be built for failures detected before any handler runs.
        details: Optional extra data about the attempt.
        next_actions: Optional list of suggested follow-up action names.
    """

    success: bool
    action_name: str
    message: str
    # A default is required: several call sites construct a result without a
    # duration, which would otherwise raise TypeError.
    duration: float = 0.0
    details: Optional[Dict[str, Any]] = None
    next_actions: Optional[List[str]] = None

115 

116 

117class ErrorRecoverySystem: 

118 """Comprehensive error handling and recovery system""" 

119 

def __init__(self, project_root: Path = None):
    """Prepare tracking state, register default actions and start monitoring.

    Args:
        project_root: Project directory; the current working directory is
            used when this is omitted or falsy.
    """
    root = project_root if project_root else Path.cwd()
    self.project_root = root

    # Per-error JSON files are written below <root>/.moai/error_logs.
    self.error_log_dir = root / ".moai" / "error_logs"
    self.error_log_dir.mkdir(parents=True, exist_ok=True)

    # In-memory error bookkeeping.
    self.active_errors: Dict[str, ErrorReport] = {}
    self.error_history: List[ErrorReport] = []
    self.recovery_actions: Dict[str, RecoveryAction] = {}
    self.error_stats: Dict[str, Any] = dict(
        total_errors=0,
        by_severity={},
        by_category={},
        recovery_success_rate=0.0,
    )

    # Health snapshot, refreshed by _update_system_health().
    self.system_health = dict(
        status="healthy",
        last_check=datetime.now(timezone.utc),
        issues=[],
        metrics={},
    )

    # Default recovery actions must exist before monitoring begins.
    self._initialize_recovery_actions()

    # Daemon thread so that monitoring never blocks interpreter exit.
    self.monitoring_active = True
    monitor = threading.Thread(target=self._background_monitoring, daemon=True)
    self.monitor_thread = monitor
    monitor.start()

    logger.info("Error Recovery System initialized")

155 

def handle_error(
    self,
    error: Exception,
    context: Dict[str, Any] = None,
    severity: ErrorSeverity = ErrorSeverity.MEDIUM,
    category: ErrorCategory = ErrorCategory.SYSTEM,
) -> ErrorReport:
    """Record an error, update statistics and try automatic recovery.

    Args:
        error: Exception that occurred.
        context: Additional context information (stored as-is; ``{}`` if omitted).
        severity: Error severity level.
        category: Error category.

    Returns:
        The ErrorReport created for this error, including recovery outcome
        fields when automatic recovery was attempted.
    """
    error_id = self._generate_error_id()

    # Format the traceback of the exception that was passed in, rather than
    # whatever exception (if any) is currently being handled: outside of an
    # ``except`` block ``traceback.format_exc()`` only yields "NoneType: None".
    stack_trace = "".join(
        traceback.format_exception(type(error), error, error.__traceback__)
    )

    error_report = ErrorReport(
        id=error_id,
        timestamp=datetime.now(timezone.utc),
        severity=severity,
        category=category,
        message=str(error),
        details={
            "exception_type": type(error).__name__,
            "exception_module": type(error).__module__,
            "error_code": getattr(error, "code", None),
        },
        stack_trace=stack_trace,
        context=context or {},
        recovery_attempted=False,
        recovery_successful=False,
    )

    # Persist/log first, then account for it in the statistics.
    self._log_error(error_report)
    self._update_error_stats(error_report)

    # Track as active until a recovery succeeds; history keeps it regardless.
    self.active_errors[error_id] = error_report
    self.error_history.append(error_report)

    # Only CRITICAL and HIGH errors trigger automatic recovery from here.
    # NOTE(review): automatic actions whose severity_filter lists only lower
    # severities are therefore never reached through this method - confirm
    # that this is intended.
    if severity in (ErrorSeverity.CRITICAL, ErrorSeverity.HIGH):
        recovery_result = self._attempt_automatic_recovery(error_report)
        error_report.recovery_attempted = True
        error_report.recovery_successful = recovery_result.success
        error_report.resolution_message = recovery_result.message

        if recovery_result.success:
            logger.info(f"Automatic recovery successful for error {error_id}")
            self.active_errors.pop(error_id, None)
        else:
            logger.warning(
                f"Automatic recovery failed for error {error_id}: {recovery_result.message}"
            )

    self._update_system_health()

    return error_report

225 

def register_recovery_action(self, action: RecoveryAction):
    """Add a recovery action to the registry, keyed by its name.

    An action registered under an existing name replaces the earlier one.

    Args:
        action: RecoveryAction definition.
    """
    self.recovery_actions.update({action.name: action})
    logger.info(f"Registered recovery action: {action.name}")

235 

def attempt_manual_recovery(
    self, error_id: str, action_name: str, parameters: Dict[str, Any] = None
) -> RecoveryResult:
    """Run a named recovery action against a specific active error.

    Args:
        error_id: ID of error to recover.
        action_name: Name of recovery action to attempt.
        parameters: Additional parameters passed to the handler (``{}`` if omitted).

    Returns:
        RecoveryResult describing the outcome. Unknown error IDs or action
        names produce an unsuccessful result instead of raising.
    """
    # duration is passed explicitly on the early exits: no handler ran, and
    # omitting the argument would make constructing the result itself fail.
    if error_id not in self.active_errors:
        return RecoveryResult(
            success=False,
            action_name=action_name,
            message=f"Error {error_id} not found in active errors",
            duration=0.0,
        )

    if action_name not in self.recovery_actions:
        return RecoveryResult(
            success=False,
            action_name=action_name,
            message=f"Recovery action {action_name} not found",
            duration=0.0,
        )

    error_report = self.active_errors[error_id]
    recovery_action = self.recovery_actions[action_name]

    logger.info(f"Attempting manual recovery {action_name} for error {error_id}")

    # From here on a recovery has genuinely been attempted for this error.
    error_report.recovery_attempted = True
    start_time = time.time()

    try:
        result = recovery_action.handler(error_report, parameters or {})
    except Exception as e:
        return RecoveryResult(
            success=False,
            action_name=action_name,
            message=f"Manual recovery failed: {str(e)}",
            duration=time.time() - start_time,
            details={"exception": str(e)},
        )

    duration = time.time() - start_time

    # Handlers signal failure by returning a falsy value.
    if not result:
        return RecoveryResult(
            success=False,
            action_name=action_name,
            message="Manual recovery returned unsuccessful result",
            duration=duration,
        )

    recovery_result = RecoveryResult(
        success=True,
        action_name=action_name,
        message="Manual recovery completed successfully",
        duration=duration,
        details={"result": result},
    )

    # Mark the error resolved and stop tracking it as active.
    error_report.recovery_successful = True
    error_report.resolution_message = recovery_result.message
    self.active_errors.pop(error_id, None)

    return recovery_result

312 

def get_system_health(self) -> Dict[str, Any]:
    """Refresh the health snapshot and return it as a plain dictionary.

    Returns:
        System health information: status, ISO-formatted time of the last
        check, error counts, shallow copies of the statistics, issues and
        metrics, and the number of registered recovery actions.
    """
    self._update_system_health()

    health = self.system_health
    report: Dict[str, Any] = {}
    report["status"] = health["status"]
    report["last_check"] = health["last_check"].isoformat()
    report["active_errors"] = len(self.active_errors)
    report["total_errors"] = len(self.error_history)
    # Shallow copies so callers cannot rebind entries of the live containers.
    report["error_stats"] = self.error_stats.copy()
    report["issues"] = health["issues"].copy()
    report["metrics"] = health["metrics"].copy()
    report["recovery_actions_available"] = len(self.recovery_actions)
    return report

332 

def get_error_summary(self, limit: int = 50) -> Dict[str, Any]:
    """Summarize the most recent errors in the history.

    Args:
        limit: Maximum number of errors to include. Zero or a negative
            value selects no errors.

    Returns:
        Error summary information: counts by severity and category, common
        patterns, recovery rate and a listing of the last (up to) 10 errors.
    """
    # A plain ``history[-limit:]`` would return the *whole* history for
    # limit == 0 (because -0 == 0), so non-positive limits are handled here.
    recent_errors = self.error_history[-limit:] if limit > 0 else []

    by_severity: Dict[str, int] = {}
    by_category: Dict[str, int] = {}
    for error in recent_errors:
        severity = error.severity.value
        by_severity[severity] = by_severity.get(severity, 0) + 1
        category = error.category.value
        by_category[category] = by_category.get(category, 0) + 1

    error_patterns = self._identify_error_patterns(recent_errors)

    return {
        "total_recent_errors": len(recent_errors),
        "active_errors": len(self.active_errors),
        "by_severity": by_severity,
        "by_category": by_category,
        "common_patterns": error_patterns,
        "recovery_rate": self._calculate_recovery_rate(recent_errors),
        "recent_errors": [
            {
                "id": error.id,
                "timestamp": error.timestamp.isoformat(),
                "severity": error.severity.value,
                "category": error.category.value,
                "message": error.message,
                "recovered": error.recovery_successful,
            }
            for error in recent_errors[-10:]  # Last 10 errors
        ],
    }

384 

def generate_troubleshooting_guide(self) -> Dict[str, Any]:
    """Build a troubleshooting guide from the error history and registry.

    Returns:
        Troubleshooting guide with the generation time, patterns seen more
        than twice (with severity and solutions), one entry per registered
        recovery action, prevention tips and emergency procedures.
    """
    generated_at = datetime.now(timezone.utc).isoformat()

    # Only patterns that occurred more than twice count as "common".
    pattern_counts = self._identify_error_patterns(self.error_history)
    common_issues = []
    for pattern, frequency in pattern_counts.items():
        if frequency <= 2:
            continue
        issue = {
            "pattern": pattern,
            "frequency": frequency,
            "severity": self._get_pattern_severity(pattern),
            "solutions": self._get_solutions_for_pattern(pattern),
        }
        common_issues.append(issue)

    # One descriptive entry per registered recovery action.
    procedures = {}
    for action_name, action in self.recovery_actions.items():
        procedures[action_name] = {
            "description": action.description,
            "type": action.action_type,
            "for_severities": [s.value for s in action.severity_filter],
            "for_categories": [c.value for c in action.category_filter],
        }

    prevention_tips = self._generate_prevention_tips()
    emergency_procedures = self._generate_emergency_procedures()

    return {
        "generated_at": generated_at,
        "common_issues": common_issues,
        "recovery_procedures": procedures,
        "prevention_tips": prevention_tips,
        "emergency_procedures": emergency_procedures,
    }

429 

def cleanup_old_errors(self, days_to_keep: int = 30) -> Dict[str, Any]:
    """Drop history entries older than the retention window and persist.

    Args:
        days_to_keep: Number of days to keep error records.

    Returns:
        Cleanup operation results: how many records were removed, how many
        remain, and the ISO-formatted cutoff time.
    """
    cutoff_date = datetime.now(timezone.utc) - timedelta(days=days_to_keep)

    # Partition in one pass: entries at or after the cutoff are retained.
    retained = []
    removed_count = 0
    for entry in self.error_history:
        if entry.timestamp < cutoff_date:
            removed_count += 1
        else:
            retained.append(entry)
    self.error_history = retained

    # Persist the trimmed history.
    self._save_error_history()

    logger.info(f"Cleaned up {removed_count} old error records")

    return dict(
        removed_count=removed_count,
        remaining_count=len(self.error_history),
        cutoff_date=cutoff_date.isoformat(),
    )

460 

def _initialize_recovery_actions(self):
    """Register the built-in recovery actions.

    Each row below is (name, description, action type, severities,
    categories, handler, timeout); rows are registered in the order listed.
    """
    default_actions = [
        # System recovery actions
        (
            "restart_research_engines",
            "Restart research engines and clear caches",
            "automatic",
            [ErrorSeverity.HIGH, ErrorSeverity.CRITICAL],
            [ErrorCategory.RESEARCH, ErrorCategory.SYSTEM],
            self._restart_research_engines,
            30.0,
        ),
        (
            "restore_config_backup",
            "Restore configuration from last known good backup",
            "automatic",
            [ErrorSeverity.CRITICAL],
            [ErrorCategory.CONFIGURATION],
            self._restore_config_backup,
            15.0,
        ),
        (
            "clear_agent_cache",
            "Clear agent communication cache and reset connections",
            "automatic",
            [ErrorSeverity.MEDIUM, ErrorSeverity.HIGH],
            [ErrorCategory.COMMUNICATION],
            self._clear_agent_cache,
            10.0,
        ),
        (
            "validate_research_integrity",
            "Validate research component integrity and repair if needed",
            "assisted",
            [ErrorSeverity.HIGH],
            [ErrorCategory.RESEARCH, ErrorCategory.VALIDATION],
            self._validate_research_integrity,
            60.0,
        ),
        (
            "rollback_last_changes",
            "Rollback last research integration changes",
            "manual",
            [ErrorSeverity.CRITICAL],
            [ErrorCategory.INTEGRATION, ErrorCategory.RESEARCH],
            self._rollback_last_changes,
            45.0,
        ),
        (
            "reset_system_state",
            "Reset system to known good state",
            "manual",
            [ErrorSeverity.CRITICAL],
            [ErrorCategory.SYSTEM],
            self._reset_system_state,
            120.0,
        ),
        # Performance recovery actions
        (
            "optimize_performance",
            "Optimize system performance and clear bottlenecks",
            "automatic",
            [ErrorSeverity.MEDIUM],
            [ErrorCategory.PERFORMANCE],
            self._optimize_performance,
            30.0,
        ),
        # Resource recovery actions
        (
            "free_resources",
            "Free up system resources and memory",
            "automatic",
            [ErrorSeverity.MEDIUM, ErrorSeverity.HIGH],
            [ErrorCategory.RESOURCE],
            self._free_resources,
            20.0,
        ),
    ]

    for name, description, kind, severities, categories, handler, timeout in default_actions:
        self.register_recovery_action(
            RecoveryAction(
                name=name,
                description=description,
                action_type=kind,
                severity_filter=severities,
                category_filter=categories,
                handler=handler,
                timeout=timeout,
            )
        )

561 

def _attempt_automatic_recovery(self, error_report: ErrorReport) -> RecoveryResult:
    """Try each matching automatic recovery action until one succeeds.

    An action matches when its type is "automatic" and both the report's
    severity and category appear in the action's filters. Actions are tried
    in registration order; a handler that raises or returns a falsy value
    is skipped.

    Args:
        error_report: The error to recover from.

    Returns:
        RecoveryResult of the first successful action, or an unsuccessful
        result named "none" when no action matched or succeeded.
    """
    suitable_actions = [
        action
        for action in self.recovery_actions.values()
        if action.action_type == "automatic"
        and error_report.severity in action.severity_filter
        and error_report.category in action.category_filter
    ]

    overall_start = time.time()

    for action in suitable_actions:
        try:
            logger.info(f"Attempting automatic recovery: {action.name}")

            start_time = time.time()
            result = action.handler(error_report, {})
            duration = time.time() - start_time

            if result:
                return RecoveryResult(
                    success=True,
                    action_name=action.name,
                    message=f"Automatic recovery successful: {action.name}",
                    duration=duration,
                    details={"result": result},
                )

        except Exception as e:
            logger.warning(f"Recovery action {action.name} failed: {str(e)}")
            continue

    # duration must be supplied here: without it constructing the result
    # raises TypeError, turning every failed recovery into a crash.
    return RecoveryResult(
        success=False,
        action_name="none",
        message="No suitable automatic recovery action succeeded",
        duration=time.time() - overall_start,
    )

602 

def _restart_research_engines(
    self, error_report: ErrorReport, parameters: Dict[str, Any]
) -> bool:
    """Wipe the cache directories and research state, then reinitialize.

    Args:
        error_report: The error being recovered (not used by this handler).
        parameters: Extra recovery parameters (not used by this handler).

    Returns:
        True when every step completed, False if any step raised.
    """
    try:
        logger.info("Restarting research engines...")

        import shutil

        moai_dir = self.project_root / ".moai"
        claude_dir = self.project_root / ".claude"

        # Existing cache directories are removed and recreated empty;
        # missing ones are left alone.
        for cache_path in (moai_dir / "cache", claude_dir / "cache"):
            if not cache_path.exists():
                continue
            shutil.rmtree(cache_path)
            cache_path.mkdir(parents=True, exist_ok=True)

        # Discard the saved research engine state, if there is any.
        state_path = moai_dir / "research_state.json"
        if state_path.exists():
            state_path.unlink()

        self._reinitialize_research_components()

        logger.info("Research engines restarted successfully")
        return True

    except Exception as e:
        logger.error(f"Failed to restart research engines: {str(e)}")
        return False

637 

def _restore_config_backup(
    self, error_report: ErrorReport, parameters: Dict[str, Any]
) -> bool:
    """Copy the most recently modified config backup over the live config.

    Backups are files matching ``config_*.json`` in
    ``<project_root>/.moai/config_backups``; the target is
    ``<project_root>/.moai/config/config.json``.

    Args:
        error_report: The error being recovered (not used by this handler).
        parameters: Extra recovery parameters (not used by this handler).

    Returns:
        True if a backup was restored; False when no backup exists or the
        copy failed.
    """
    try:
        logger.info("Restoring configuration from backup...")

        backup_dir = self.project_root / ".moai" / "config_backups"
        if not backup_dir.exists():
            logger.warning("No configuration backup directory found")
            return False

        backup_files = list(backup_dir.glob("config_*.json"))
        if not backup_files:
            logger.warning("No configuration backups found")
            return False

        # "Latest" is decided by file modification time.
        latest_backup = max(backup_files, key=lambda f: f.stat().st_mtime)

        config_file = self.project_root / ".moai" / "config" / "config.json"
        # The config directory itself may be missing; without this the copy
        # would fail and the restore could never succeed.
        config_file.parent.mkdir(parents=True, exist_ok=True)

        import shutil

        shutil.copy2(latest_backup, config_file)

        logger.info(f"Configuration restored from {latest_backup}")
        return True

    except Exception as e:
        logger.error(f"Failed to restore configuration: {str(e)}")
        return False

670 

def _clear_agent_cache(
    self, error_report: ErrorReport, parameters: Dict[str, Any]
) -> bool:
    """Empty the agent state and communication cache directories.

    Args:
        error_report: The error being recovered (not used by this handler).
        parameters: Extra recovery parameters (not used by this handler).

    Returns:
        True when both directories were processed, False if anything raised.
    """
    try:
        logger.info("Clearing agent cache...")

        import shutil

        # Agent state files first, then the communication cache. Each
        # existing directory is deleted and recreated empty.
        for dir_name in ("agent_state", "comm_cache"):
            target = self.project_root / ".moai" / dir_name
            if target.exists():
                shutil.rmtree(target)
                target.mkdir(parents=True, exist_ok=True)

        logger.info("Agent cache cleared successfully")
        return True

    except Exception as e:
        logger.error(f"Failed to clear agent cache: {str(e)}")
        return False

700 

def _validate_research_integrity(
    self, error_report: ErrorReport, parameters: Dict[str, Any]
) -> Dict[str, Any]:
    """Check skill, agent and command files and try to repair bad skills.

    Only ``*.md`` files directly inside each directory are examined. Skill
    files that fail validation get a repair attempt; agent and command
    files are only reported. ``hooks_valid`` is never changed here.

    Args:
        error_report: The error being recovered (not used by this handler).
        parameters: Extra recovery parameters (not used by this handler).

    Returns:
        Dict with per-component validity flags, ``issues_found``,
        ``repairs_made`` and, if validation itself raised, a
        ``validation_error`` message. The dict is always non-empty, so it
        is truthy even when issues were found.
    """
    validation_results = dict(
        skills_valid=True,
        agents_valid=True,
        commands_valid=True,
        hooks_valid=True,
        issues_found=[],
        repairs_made=[],
    )
    issues = validation_results["issues_found"]
    repairs = validation_results["repairs_made"]

    try:
        logger.info("Validating research integrity...")

        # Skills: report invalid files, then attempt a repair for each.
        skills_dir = self.project_root / ".claude" / "skills"
        if skills_dir.exists():
            for skill_file in skills_dir.glob("*.md"):
                if self._validate_skill_file(skill_file):
                    continue
                validation_results["skills_valid"] = False
                issues.append(f"Invalid skill file: {skill_file}")
                if self._repair_skill_file(skill_file):
                    repairs.append(f"Repaired: {skill_file}")

        # Agents: report only.
        agents_dir = self.project_root / ".claude" / "agents" / "alfred"
        if agents_dir.exists():
            for agent_file in agents_dir.glob("*.md"):
                if self._validate_agent_file(agent_file):
                    continue
                validation_results["agents_valid"] = False
                issues.append(f"Invalid agent file: {agent_file}")

        # Commands: report only.
        commands_dir = self.project_root / ".claude" / "commands" / "alfred"
        if commands_dir.exists():
            for command_file in commands_dir.glob("*.md"):
                if self._validate_command_file(command_file):
                    continue
                validation_results["commands_valid"] = False
                issues.append(f"Invalid command file: {command_file}")

        logger.info(
            f"Research integrity validation completed. Issues: "
            f"{len(validation_results['issues_found'])}, "
            f"Repairs: {len(validation_results['repairs_made'])}"
        )

    except Exception as e:
        logger.error(f"Research integrity validation failed: {str(e)}")
        validation_results["validation_error"] = str(e)

    return validation_results

764 

def _rollback_last_changes(
    self, error_report: ErrorReport, parameters: Dict[str, Any]
) -> bool:
    """Roll back to the first rollback point reported by RollbackManager.

    Args:
        error_report: The error being recovered (not used by this handler).
        parameters: Extra recovery parameters (not used by this handler).

    Returns:
        True if the rollback succeeded; False when no rollback point is
        available, the rollback reported failure, or anything raised.
    """
    try:
        logger.info("Rolling back last research changes...")

        # Make <project_root>/src importable, but only add it once so that
        # repeated recovery attempts do not keep growing sys.path.
        src_dir = str(self.project_root / "src")
        if src_dir not in sys.path:
            sys.path.insert(0, src_dir)
        from moai_adk.core.rollback_manager import RollbackManager

        rollback_manager = RollbackManager(self.project_root)

        rollback_points = rollback_manager.list_rollback_points(limit=5)
        if not rollback_points:
            logger.warning("No rollback points available")
            return False

        # The first listed point is treated as the most recent one.
        latest_rollback = rollback_points[0]
        result = rollback_manager.rollback_to_point(latest_rollback["id"])

        if result.success:
            logger.info(f"Successfully rolled back to {latest_rollback['id']}")
            return True

        logger.error(f"Rollback failed: {result.message}")
        return False

    except Exception as e:
        logger.error(f"Rollback operation failed: {str(e)}")
        return False

798 

def _reset_system_state(
    self, error_report: ErrorReport, parameters: Dict[str, Any]
) -> bool:
    """Empty all cache/state directories, forget active errors, reinitialize.

    Args:
        error_report: The error being recovered (not used by this handler).
        parameters: Extra recovery parameters (not used by this handler).

    Returns:
        True when the reset completed, False if any step raised.
    """
    try:
        logger.info("Resetting system to known good state...")

        import shutil

        root = self.project_root
        directories = (
            root / ".moai" / "cache",
            root / ".claude" / "cache",
            root / ".moai" / "agent_state",
            root / ".moai" / "comm_cache",
        )

        # Every existing directory is deleted and recreated empty.
        for directory in directories:
            if not directory.exists():
                continue
            shutil.rmtree(directory)
            directory.mkdir(parents=True, exist_ok=True)

        # No error is considered active after a reset.
        self.active_errors.clear()

        self._reinitialize_core_components()

        logger.info("System state reset completed")
        return True

    except Exception as e:
        logger.error(f"System state reset failed: {str(e)}")
        return False

833 

def _optimize_performance(
    self, error_report: ErrorReport, parameters: Dict[str, Any]
) -> bool:
    """Delete temp directories, optimize connections and run a GC pass.

    Args:
        error_report: The error being recovered (not used by this handler).
        parameters: Extra recovery parameters (not used by this handler).

    Returns:
        True when all steps completed, False if any step raised.
    """
    try:
        logger.info("Optimizing system performance...")

        import gc
        import shutil

        # Temp directories are removed outright and not recreated.
        for owner in (".moai", ".claude"):
            temp_dir = self.project_root / owner / "temp"
            if temp_dir.exists():
                shutil.rmtree(temp_dir)

        self._optimize_connections()

        # Force a garbage collection pass.
        gc.collect()

        logger.info("Performance optimization completed")
        return True

    except Exception as e:
        logger.error(f"Performance optimization failed: {str(e)}")
        return False

867 

def _free_resources(
    self, error_report: ErrorReport, parameters: Dict[str, Any]
) -> bool:
    """Run a GC pass, then the file-handle and hanging-process cleanup hooks.

    Args:
        error_report: The error being recovered (not used by this handler).
        parameters: Extra recovery parameters (not used by this handler).

    Returns:
        True when all steps completed, False if any step raised.
    """
    try:
        logger.info("Freeing up system resources...")

        import gc

        gc.collect()

        # Cleanup hooks run in this order: file handles, then processes.
        for cleanup_step in (self._close_file_handles, self._terminate_hanging_processes):
            cleanup_step()

        logger.info("Resource cleanup completed")
    except Exception as e:
        logger.error(f"Resource cleanup failed: {str(e)}")
        return False
    else:
        return True

892 

893 def _generate_error_id(self) -> str: 

894 """Generate unique error ID""" 

895 timestamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S") 

896 random_suffix = hashlib.md5(os.urandom(4)).hexdigest()[:6] 

897 return f"ERR_{timestamp}_{random_suffix}" 

898 

def _log_error(self, error_report: ErrorReport):
    """Write the report to its own JSON file and emit a log record for it.

    A failure to write the file is logged and otherwise ignored; the log
    record is emitted either way.

    Args:
        error_report: The report to persist and log.
    """
    # One JSON file per error, named after the error ID.
    target = self.error_log_dir / f"error_{error_report.id}.json"
    try:
        with open(target, "w", encoding="utf-8") as f:
            # default=str stringifies values json cannot encode natively.
            json.dump(asdict(error_report), f, indent=2, default=str, ensure_ascii=False)
    except Exception as e:
        logger.error(f"Failed to log error to file: {str(e)}")

    # Translate the report's severity into a logging level; anything
    # unmapped is logged as a warning.
    level_by_severity = {
        ErrorSeverity.CRITICAL: logging.CRITICAL,
        ErrorSeverity.HIGH: logging.ERROR,
        ErrorSeverity.MEDIUM: logging.WARNING,
        ErrorSeverity.LOW: logging.INFO,
        ErrorSeverity.INFO: logging.INFO,
    }
    log_level = level_by_severity.get(error_report.severity, logging.WARNING)

    logger.log(log_level, f"Error {error_report.id}: {error_report.message}")

919 

920 def _update_error_stats(self, error_report: ErrorReport): 

921 """Update error statistics""" 

922 self.error_stats["total_errors"] += 1 

923 

924 # By severity 

925 severity = error_report.severity.value 

926 if severity not in self.error_stats["by_severity"]: 

927 self.error_stats["by_severity"][severity] = 0 

928 self.error_stats["by_severity"][severity] += 1 

929 

930 # By category 

931 category = error_report.category.value 

932 if category not in self.error_stats["by_category"]: 

933 self.error_stats["by_category"][category] = 0 

934 self.error_stats["by_category"][category] += 1 

935 

def _update_system_health(self):
    """Recompute status, metrics and issues from the currently active errors.

    Status precedence: any CRITICAL active error gives "critical", else any
    HIGH gives "degraded", else more than five active errors gives
    "warning", otherwise "healthy".
    """
    checked_at = datetime.now(timezone.utc)
    active_severities = [e.severity for e in self.active_errors.values()]

    if ErrorSeverity.CRITICAL in active_severities:
        status = "critical"
    elif ErrorSeverity.HIGH in active_severities:
        status = "degraded"
    elif len(self.active_errors) > 5:
        status = "warning"
    else:
        status = "healthy"
    self.system_health["status"] = status

    self.system_health["last_check"] = checked_at
    self.system_health["metrics"] = {
        "active_errors": len(self.active_errors),
        "total_errors": len(self.error_history),
        "recovery_success_rate": self._calculate_recovery_rate(self.error_history),
    }

    # Number of active errors per severity value.
    distribution = {}
    for severity in active_severities:
        distribution[severity.value] = distribution.get(severity.value, 0) + 1

    self.system_health["issues"] = [
        {
            "type": "active_errors",
            "count": len(self.active_errors),
            "severity_distribution": distribution,
        }
    ]

986 

def _background_monitoring(self):
    """Body of the monitor thread: periodic health refresh and pattern check.

    Loops while ``monitoring_active`` is true. Each iteration first sleeps
    30 seconds, then refreshes system health and checks error patterns.
    Any exception is logged and the loop carries on.
    """
    while True:
        if not self.monitoring_active:
            break
        try:
            # The pause comes first, so the initial check happens 30 seconds
            # after the loop starts.
            time.sleep(30)
            self._update_system_health()
            self._check_error_patterns()
        except Exception as e:
            logger.error(f"Background monitoring error: {str(e)}")

1000 

1001 def _check_error_patterns(self): 

1002 """Check for concerning error patterns""" 

1003 recent_errors = [ 

1004 e 

1005 for e in self.error_history 

1006 if (datetime.now(timezone.utc) - e.timestamp).total_seconds() < 300 

1007 ] # Last 5 minutes 

1008 

1009 # Check for error bursts 

1010 if len(recent_errors) > 10: 

1011 logger.warning( 

1012 f"High error rate detected: {len(recent_errors)} errors in last 5 minutes" 

1013 ) 

1014 

1015 # Check for repeated errors 

1016 error_messages = [e.message for e in recent_errors] 

1017 message_counts = {} 

1018 for msg in error_messages: 

1019 message_counts[msg] = message_counts.get(msg, 0) + 1 

1020 

1021 repeated_errors = [msg for msg, count in message_counts.items() if count > 3] 

1022 if repeated_errors: 

1023 logger.warning(f"Repeated errors detected: {repeated_errors}") 

1024 

1025 def _calculate_recovery_rate(self, errors: List[ErrorReport]) -> float: 

1026 """Calculate recovery success rate""" 

1027 if not errors: 

1028 return 0.0 

1029 

1030 recovered_errors = [e for e in errors if e.recovery_successful] 

1031 return len(recovered_errors) / len(errors) 

1032 

1033 def _identify_error_patterns(self, errors: List[ErrorReport]) -> Dict[str, int]: 

1034 """Identify common error patterns""" 

1035 patterns = {} 

1036 

1037 for error in errors: 

1038 # Pattern by exception type 

1039 pattern = f"{error.category.value}:{error.details.get('exception_type', 'unknown')}" 

1040 patterns[pattern] = patterns.get(pattern, 0) + 1 

1041 

1042 return patterns 

1043 

1044 def _get_pattern_severity(self, pattern: str) -> str: 

1045 """Get typical severity for an error pattern""" 

1046 severity_map = { 

1047 "research:Exception": "high", 

1048 "system:Exception": "critical", 

1049 "configuration:Exception": "high", 

1050 "communication:Exception": "medium", 

1051 "validation:Exception": "medium", 

1052 } 

1053 

1054 for key, severity in severity_map.items(): 

1055 if key in pattern: 

1056 return severity 

1057 

1058 return "medium" 

1059 

1060 def _get_solutions_for_pattern(self, pattern: str) -> List[str]: 

1061 """Get common solutions for error pattern""" 

1062 solutions = { 

1063 "research:Exception": [ 

1064 "Restart research engines", 

1065 "Clear research cache", 

1066 "Validate research components", 

1067 ], 

1068 "system:Exception": [ 

1069 "Check system resources", 

1070 "Restart system components", 

1071 "Verify system configuration", 

1072 ], 

1073 "configuration:Exception": [ 

1074 "Restore configuration backup", 

1075 "Validate configuration syntax", 

1076 "Check configuration permissions", 

1077 ], 

1078 } 

1079 

1080 for key, sols in solutions.items(): 

1081 if key in pattern: 

1082 return sols 

1083 

1084 return ["Contact system administrator", "Check system logs"] 

1085 

1086 def _generate_prevention_tips(self) -> List[str]: 

1087 """Generate prevention tips based on error history""" 

1088 tips = [] 

1089 

1090 # Add tips based on common error categories 

1091 category_counts = {} 

1092 for error in self.error_history: 

1093 category = error.category.value 

1094 category_counts[category] = category_counts.get(category, 0) + 1 

1095 

1096 if category_counts.get("configuration", 0) > 5: 

1097 tips.append("Regularly validate configuration files before making changes") 

1098 

1099 if category_counts.get("research", 0) > 5: 

1100 tips.append( 

1101 "Monitor research engine performance and clear caches regularly" 

1102 ) 

1103 

1104 if category_counts.get("communication", 0) > 5: 

1105 tips.append("Ensure stable network connections for agent communication") 

1106 

1107 return tips 

1108 

1109 def _generate_emergency_procedures(self) -> List[Dict[str, str]]: 

1110 """Generate emergency recovery procedures""" 

1111 return [ 

1112 { 

1113 "condition": "System completely unresponsive", 

1114 "procedure": "Use system_reset recovery action to restore to known good state", 

1115 }, 

1116 { 

1117 "condition": "Critical research engine failure", 

1118 "procedure": "Rollback last research changes using rollback_last_changes action", 

1119 }, 

1120 { 

1121 "condition": "Configuration corruption", 

1122 "procedure": "Restore configuration from backup using restore_config_backup action", 

1123 }, 

1124 { 

1125 "condition": "Multiple agent communication failures", 

1126 "procedure": "Clear agent cache and restart communication channels", 

1127 }, 

1128 ] 

1129 

1130 # Helper methods for component validation and repair 

1131 def _validate_skill_file(self, skill_file: Path) -> bool: 

1132 """Validate skill file format""" 

1133 try: 

1134 with open(skill_file, "r", encoding="utf-8") as f: 

1135 content = f.read() 

1136 

1137 # Basic validation 

1138 return "---" in content and len(content) > 100 

1139 except (OSError, UnicodeDecodeError): 

1140 return False 

1141 

1142 def _validate_agent_file(self, agent_file: Path) -> bool: 

1143 """Validate agent file format""" 

1144 try: 

1145 with open(agent_file, "r", encoding="utf-8") as f: 

1146 content = f.read() 

1147 

1148 return "role:" in content and len(content) > 200 

1149 except (OSError, UnicodeDecodeError): 

1150 return False 

1151 

1152 def _validate_command_file(self, command_file: Path) -> bool: 

1153 """Validate command file format""" 

1154 try: 

1155 with open(command_file, "r", encoding="utf-8") as f: 

1156 content = f.read() 

1157 

1158 return "name:" in content and "allowed-tools:" in content 

1159 except (OSError, UnicodeDecodeError): 

1160 return False 

1161 

1162 def _repair_skill_file(self, skill_file: Path) -> bool: 

1163 """Attempt to repair skill file""" 

1164 try: 

1165 # Basic repair - ensure file has minimum required content 

1166 with open(skill_file, "r", encoding="utf-8") as f: 

1167 content = f.read() 

1168 

1169 if not content.startswith("---"): 

1170 content = f"---\nname: {skill_file.stem}\ndescription: Repaired skill file\n---\n\n{content}" 

1171 

1172 with open(skill_file, "w", encoding="utf-8") as f: 

1173 f.write(content) 

1174 

1175 return True 

1176 except (OSError, UnicodeDecodeError): 

1177 return False 

1178 

1179 def _reinitialize_research_components(self): 

1180 """Reinitialize research components""" 

1181 # Implementation would depend on specific research components 

1182 pass 

1183 

1184 def _reinitialize_core_components(self): 

1185 """Reinitialize core system components""" 

1186 # Implementation would depend on specific core components 

1187 pass 

1188 

1189 def _optimize_connections(self): 

1190 """Optimize database/network connections""" 

1191 # Implementation would depend on specific connection types 

1192 pass 

1193 

1194 def _close_file_handles(self): 

1195 """Close open file handles""" 

1196 import gc 

1197 

1198 gc.collect() # Force garbage collection to close file handles 

1199 

1200 def _terminate_hanging_processes(self): 

1201 """Terminate hanging processes""" 

1202 # Implementation would identify and terminate hanging processes 

1203 pass 

1204 

1205 def _save_error_history(self): 

1206 """Save error history to file""" 

1207 history_file = self.error_log_dir / "error_history.json" 

1208 try: 

1209 with open(history_file, "w") as f: 

1210 json.dump( 

1211 [asdict(e) for e in self.error_history], f, indent=2, default=str 

1212 ) 

1213 except Exception as e: 

1214 logger.error(f"Failed to save error history: {str(e)}") 

1215 

1216 

# Module-wide ErrorRecoverySystem instance, created lazily on first access
_error_recovery_system = None


def get_error_recovery_system(project_root: Path = None) -> ErrorRecoverySystem:
    """Return the shared ``ErrorRecoverySystem``, creating it on first use.

    Args:
        project_root: Passed to ``ErrorRecoverySystem`` only when the shared
            instance does not exist yet; ignored on later calls.

    Returns:
        The module-wide ``ErrorRecoverySystem`` instance.
    """
    global _error_recovery_system

    instance = _error_recovery_system
    if instance is None:
        instance = ErrorRecoverySystem(project_root)
        _error_recovery_system = instance
    return instance

1227 

1228 

def handle_error(
    error: Exception,
    context: Dict[str, Any] = None,
    severity: ErrorSeverity = ErrorSeverity.MEDIUM,
    category: ErrorCategory = ErrorCategory.SYSTEM,
) -> ErrorReport:
    """Handle *error* through the shared error recovery system.

    Args:
        error: The exception to report.
        context: Optional extra information forwarded with the error.
        severity: Severity to record; defaults to ``ErrorSeverity.MEDIUM``.
        category: Category to record; defaults to ``ErrorCategory.SYSTEM``.

    Returns:
        Whatever the shared system's ``handle_error`` returns for this error.
    """
    recovery_system = get_error_recovery_system()
    return recovery_system.handle_error(error, context, severity, category)

1237 

1238 

1239# Decorator for automatic error handling 

def error_handler(
    severity: ErrorSeverity = ErrorSeverity.MEDIUM,
    category: ErrorCategory = ErrorCategory.SYSTEM,
    context: Dict[str, Any] = None,
):
    """Build a decorator that reports exceptions and then re-raises them.

    The decorated function is called unchanged. When it raises an
    ``Exception``, the error is passed to ``handle_error`` together with a
    context dict (function name, module, truncated ``args``/``kwargs`` and
    any entries from *context*), and the original exception is re-raised.

    Args:
        severity: Severity recorded for caught errors.
        category: Category recorded for caught errors.
        context: Optional extra entries merged into the reported context;
            these override the automatically collected keys on collision.

    Returns:
        A decorator that wraps a callable with the reporting behaviour.
    """
    import functools

    def decorator(func):
        # wraps() keeps the decorated function's __name__, __doc__ and
        # __wrapped__ intact for introspection, logging and debugging.
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            try:
                return func(*args, **kwargs)
            except Exception as e:
                error_context = {
                    # getattr: callables such as functools.partial objects
                    # have no __name__/__module__; reporting must not mask
                    # the original exception with an AttributeError.
                    "function": getattr(func, "__name__", repr(func)),
                    "module": getattr(func, "__module__", None),
                    "args": str(args)[:100],  # Limit length
                    "kwargs": str(kwargs)[:100],
                    **(context or {}),
                }
                handle_error(e, error_context, severity, category)
                raise

        return wrapper

    return decorator

1265 

1266 

if __name__ == "__main__":
    # Demo usage: report one synthetic error, then print health and summary
    demo_system = ErrorRecoverySystem()

    print("Error Recovery System Demo")
    print("=" * 50)

    # Simulate some errors
    try:
        raise ValueError("This is a test error for demonstration")
    except Exception as demo_error:
        demo_report = demo_system.handle_error(
            demo_error,
            context={"demo": True},
            severity=ErrorSeverity.MEDIUM,
            category=ErrorCategory.SYSTEM,
        )
        print(f"Handled error: {demo_report.id}")

    # Show system health
    health_snapshot = demo_system.get_system_health()
    health_status = health_snapshot["status"]
    print(f"System health: {health_status}")

    # Show error summary
    error_summary = demo_system.get_error_summary()
    recent_total = error_summary["total_recent_errors"]
    print(f"Total errors: {recent_total}")

    print("\nError Recovery System demo completed")