Coverage for src / moai_adk / core / robust_json_parser.py: 15.28%

288 statements  

« prev     ^ index     » next       coverage.py v7.12.0, created at 2025-11-20 20:52 +0900

1""" 

2Robust JSON Parser for MoAI-ADK 

3 

4Production-ready JSON parser with automatic error recovery, comprehensive logging, 

5and fallback strategies to handle malformed JSON input from various sources. 

6 

7Author: MoAI-ADK Core Team 

8Version: 1.0.0 

9""" 

10 

import json
import logging
import re
import time
from dataclasses import dataclass
from enum import Enum
from typing import Any, Callable, Dict, List, Optional, Tuple, Union

17 

# Module-level logger named after this module; RobustJSONParser writes its
# diagnostics here when constructed with enable_logging=True.
logger = logging.getLogger(__name__)

20 

21 

class ErrorSeverity(Enum):
    """Severity label attached to every ParseResult."""

    # Input was valid JSON and parsed on the first try.
    LOW = "low"
    # Input was malformed but a recovery attempt produced parseable JSON.
    MEDIUM = "medium"
    # Recovery was attempted and still failed.
    HIGH = "high"
    # Input was not a string, or parsing failed with no recovery attempt made.
    CRITICAL = "critical"

28 

29 

@dataclass
class ParseResult:
    """Outcome of a single parse call together with its diagnostics."""

    # True when a JSON value was decoded, either directly or after recovery.
    success: bool
    # The decoded value; None on failure (and also for a JSON ``null``).
    data: Optional[Any]
    # Text of the last decoding error, or None on success.
    error: Optional[str]
    # The input exactly as the caller supplied it.
    original_input: str
    # Number of recovery rounds that were run (0 for a clean parse).
    recovery_attempts: int
    # Classification of how the parse went.
    severity: ErrorSeverity
    # Wall-clock duration of the call, in milliseconds.
    parse_time_ms: float
    # Human-readable notes describing each repair applied to the input.
    warnings: List[str]

41 

42 

class RobustJSONParser:
    """
    JSON parser that tries to repair malformed input before giving up.

    ``parse`` first calls ``json.loads`` on the input unchanged. If that
    fails it runs up to ``max_recovery_attempts`` recovery rounds. In each
    round the repair strategies are applied in order and the text is
    re-parsed after every strategy that changed it, so the first repair
    that yields valid JSON wins. Between rounds a coarser clean-up
    (``_apply_aggressive_recovery``) strips surrounding non-JSON text.

    Features:
    - Multiple error recovery strategies
    - Detailed logging and error tracking
    - Performance monitoring (see ``get_stats``)
    - Fallback extraction of JSON embedded in other text

    All repairs are heuristic and may lose data; every repair that was
    applied is reported in ``ParseResult.warnings``.
    """

    # Bare identifier used as an object key, directly after '{' or ','.
    _UNQUOTED_KEY_RE = re.compile(r'(?<=[{,])(\s*)([A-Za-z_][A-Za-z0-9_]*)\s*:')
    # Comma that is followed only by whitespace and a closing bracket.
    _TRAILING_COMMA_RE = re.compile(r',(\s*[}\]])')
    # A backslash plus what follows: group 1 is set for escapes JSON allows,
    # group 2 (possibly empty) for everything else. Matching pairs keeps an
    # escaped backslash ("\\\\") from being misread as the start of an escape.
    _ESCAPE_RE = re.compile(r'\\(u[0-9a-fA-F]{4}|["\\/bfnrt])|\\(.?)', re.DOTALL)
    # Control characters other than tab, newline and carriage return.
    _CONTROL_CHARS_RE = re.compile(r'[\x00-\x08\x0B\x0C\x0E-\x1F\x7F-\x9F]')
    # A closing bracket immediately followed by an opening one.
    _MISSING_COMMA_RE = re.compile(r'([}\]])(\s*)([{\[])')
    # A quoted key in key position followed by a value with no colon between.
    _MISSING_COLON_RE = re.compile(
        r'([{,]\s*)("[^"]+")\s+("[^"]*"|\d+|true|false|null|\{|\[)'
    )
    # Candidate locations of JSON inside a larger response, most specific first.
    _EMBEDDED_JSON_PATTERNS = (
        re.compile(r'```json\s*(.*?)\s*```', re.DOTALL),  # Markdown code blocks
        re.compile(r'\{.*\}', re.DOTALL),  # Anything between braces
        re.compile(r'\[.*\]', re.DOTALL),  # Anything between brackets
    )
    _NON_JSON_LINE_PREFIXES = ('```', '#', '*', '-', '>', 'Error:', 'Success:')
    _NON_JSON_LINE_SUFFIXES = ('```', '.')
    # Raw whitespace characters that must be escaped inside a JSON string.
    _RAW_WHITESPACE_ESCAPES = {'\n': '\\n', '\r': '\\r', '\t': '\\t'}

    def __init__(self, max_recovery_attempts: int = 3, enable_logging: bool = True):
        """
        Args:
            max_recovery_attempts: Maximum number of recovery rounds ``parse``
                runs after the initial ``json.loads`` fails.
            enable_logging: When False, nothing is written to the module logger.
        """
        self.max_recovery_attempts = max_recovery_attempts
        self.enable_logging = enable_logging
        self.error_patterns = self._load_error_patterns()
        self.recovery_strategies = self._load_recovery_strategies()
        self.stats = self._new_stats()

    @staticmethod
    def _new_stats() -> Dict[str, Union[int, float]]:
        """Return a zeroed statistics dictionary."""
        return {
            'total_parses': 0,
            'successful_parses': 0,
            'recovered_parses': 0,
            'failed_parses': 0,
            'total_recovery_time': 0.0,
        }

    def _load_error_patterns(self) -> Dict[str, re.Pattern]:
        """Return named regexes describing common kinds of JSON damage.

        The patterns are exposed as ``self.error_patterns``; the recovery
        strategies in this class use their own patterns and do not read them.
        """
        return {
            'missing_quotes': re.compile(r'(?<!\\)"(?:[^"\\]|\\.)*$'),
            'trailing_comma': re.compile(r',\s*[}\]]'),
            'escape_sequence': re.compile(r'\\(?![nrtbf"\'\\/])'),
            'partial_object': re.compile(r'^\s*\{[^}]*\s*$'),
            'missing_brace': re.compile(r'^[^{]*\{[^}]*[^}]*$'),
            'invalid_quotes': re.compile(r'(?<!\\)"(?:[^"\\]|\\.)*?[^\\]"(?![\s,}\]:])'),
            'control_chars': re.compile(r'[\x00-\x1F\x7F-\x9F]'),
        }

    def _load_recovery_strategies(self) -> List[Callable[[str], Tuple[str, List[str]]]]:
        """Return the recovery strategies in the order they are applied.

        Each strategy takes the current text and returns ``(text, warnings)``;
        returning the text unchanged means the strategy did not apply.
        """
        return [
            self._fix_missing_quotes,
            self._fix_trailing_commas,
            self._fix_escape_sequences,
            self._handle_partial_objects,
            self._fix_invalid_quotes,
            self._remove_control_characters,
            self._handle_escaped_newlines,
            self._fix_common_syntax_errors,
            self._attempt_partial_parse,
        ]

    @staticmethod
    def _try_loads(text: str) -> Tuple[bool, Any, Optional[json.JSONDecodeError]]:
        """Decode ``text``; return ``(ok, value, error)``.

        A separate ``ok`` flag is needed because ``None`` is a legitimate
        decoded value (JSON ``null``).
        """
        try:
            return True, json.loads(text), None
        except json.JSONDecodeError as exc:
            return False, None, exc

    @staticmethod
    def _string_mask(text: str) -> List[bool]:
        """Return, for every index of ``text``, whether it lies inside a string.

        ``mask[i]`` is True when an odd number of unescaped double quotes
        precede index ``i`` - the same answer ``_get_string_context(text, i)``
        gives, but computed for all positions in a single pass.
        """
        mask = []
        inside = False
        escaped = False
        for ch in text:
            mask.append(inside)
            if escaped:
                escaped = False
            elif ch == '\\':
                escaped = True
            elif ch == '"':
                inside = not inside
        return mask

    def parse(self, json_string: str, context: Optional[Dict] = None) -> "ParseResult":
        """
        Parse a JSON string, repairing it if necessary.

        Args:
            json_string: JSON string to parse.
            context: Optional context information. Accepted for API
                compatibility; it is not used by the parser.

        Returns:
            ParseResult with the decoded data (if any) and metadata. This
            method does not raise for malformed or non-string input; failures
            are reported through ``success`` and ``error``.
        """
        start_time = time.perf_counter()
        self.stats['total_parses'] += 1
        warnings: List[str] = []

        def elapsed_ms() -> float:
            return (time.perf_counter() - start_time) * 1000

        # Initial validation
        if not isinstance(json_string, str):
            self.stats['failed_parses'] += 1
            return ParseResult(
                success=False,
                data=None,
                error=f"Input must be string, got {type(json_string)}",
                original_input=json_string,
                recovery_attempts=0,
                severity=ErrorSeverity.CRITICAL,
                parse_time_ms=elapsed_ms(),
                warnings=warnings,
            )

        # Try direct parsing first
        ok, data, error = self._try_loads(json_string)
        if ok:
            self.stats['successful_parses'] += 1
            if self.enable_logging:
                logger.debug("JSON parsed successfully on first attempt")
            return ParseResult(
                success=True,
                data=data,
                error=None,
                original_input=json_string,
                recovery_attempts=0,
                severity=ErrorSeverity.LOW,
                parse_time_ms=elapsed_ms(),
                warnings=warnings,
            )

        if self.enable_logging:
            logger.warning(
                "Initial JSON parse failed: %s at line %s, col %s",
                error.msg, error.lineno, error.colno,
            )
        last_error = str(error)

        current_input = json_string
        recovery_attempts = 0

        for attempt in range(self.max_recovery_attempts):
            recovery_attempts += 1
            recovered = False

            # Rounds after the first start from aggressively cleaned text,
            # which may already be valid without any further repair.
            if attempt > 0:
                recovered, data, error = self._try_loads(current_input)
                if not recovered:
                    last_error = str(error)

            if not recovered:
                for strategy in self.recovery_strategies:
                    try:
                        candidate, applied_warnings = strategy(current_input)
                    except Exception as strategy_error:
                        # Strategies are best-effort: a failing one is skipped
                        # so the remaining strategies still get a chance.
                        if self.enable_logging:
                            logger.debug(
                                "Recovery strategy %s failed: %s",
                                strategy.__name__, strategy_error,
                            )
                        continue

                    if candidate == current_input:
                        continue

                    current_input = candidate
                    warnings.extend(applied_warnings)
                    if self.enable_logging:
                        logger.debug("Applied recovery strategy: %s", strategy.__name__)

                    # Re-parse after every change so that later, more
                    # destructive strategies only run when still needed.
                    recovered, data, error = self._try_loads(current_input)
                    if recovered:
                        break
                    last_error = str(error)

            if recovered:
                duration_ms = elapsed_ms()
                self.stats['recovered_parses'] += 1
                # Accumulated in milliseconds, matching parse_time_ms.
                self.stats['total_recovery_time'] += duration_ms
                if self.enable_logging:
                    logger.info("JSON recovered after %s attempts", recovery_attempts)
                return ParseResult(
                    success=True,
                    data=data,
                    error=None,
                    original_input=json_string,
                    recovery_attempts=recovery_attempts,
                    severity=ErrorSeverity.MEDIUM,
                    parse_time_ms=duration_ms,
                    warnings=warnings,
                )

            if self.enable_logging:
                logger.debug("Parse attempt %s failed: %s", attempt + 1, last_error)

            # Make more aggressive modifications for the next attempt
            if attempt < self.max_recovery_attempts - 1:
                current_input = self._apply_aggressive_recovery(current_input)

        # All recovery attempts failed
        self.stats['failed_parses'] += 1
        if self.enable_logging:
            logger.error(
                "JSON parsing failed after %s recovery attempts: %s",
                recovery_attempts, last_error,
            )
        return ParseResult(
            success=False,
            data=None,
            error=last_error,
            original_input=json_string,
            recovery_attempts=recovery_attempts,
            severity=ErrorSeverity.HIGH if recovery_attempts > 0 else ErrorSeverity.CRITICAL,
            parse_time_ms=elapsed_ms(),
            warnings=warnings,
        )

    def _fix_missing_quotes(self, json_string: str) -> Tuple[str, List[str]]:
        """Quote bare property names, e.g. ``{name: 1}`` -> ``{"name": 1}``.

        Only identifiers that directly follow ``{`` or ``,`` and sit outside
        double-quoted strings are rewritten, so text such as ``"http://x"``
        or a leading ``Error:`` label is left alone.
        """
        warnings: List[str] = []
        mask = self._string_mask(json_string)

        def quote_key(match):
            if mask[match.start()]:
                return match.group(0)
            warnings.append(f'Added quotes to property name: {match.group(2)}')
            return f'{match.group(1)}"{match.group(2)}":'

        return self._UNQUOTED_KEY_RE.sub(quote_key, json_string), warnings

    def _fix_trailing_commas(self, json_string: str) -> Tuple[str, List[str]]:
        """Remove commas that directly precede ``}`` or ``]`` outside strings."""
        warnings: List[str] = []
        mask = self._string_mask(json_string)

        def drop_comma(match):
            if mask[match.start()]:
                return match.group(0)
            warnings.append('Removed trailing comma')
            return match.group(1)

        return self._TRAILING_COMMA_RE.sub(drop_comma, json_string), warnings

    def _fix_escape_sequences(self, json_string: str) -> Tuple[str, List[str]]:
        """Drop the backslash of escape sequences that JSON does not allow.

        Inside double-quoted strings, every backslash that does not start one
        of the JSON escapes (``\\"``, ``\\\\``, ``\\/``, ``\\b``, ``\\f``,
        ``\\n``, ``\\r``, ``\\t``, ``\\uXXXX``) is removed and the character
        after it is kept.
        """
        warnings: List[str] = []
        mask = self._string_mask(json_string)

        def drop_backslash(match):
            if match.group(1) is not None or not mask[match.start()]:
                return match.group(0)
            warnings.append('Removed invalid escape sequence')
            return match.group(2)

        return self._ESCAPE_RE.sub(drop_backslash, json_string), warnings

    def _handle_partial_objects(self, json_string: str) -> Tuple[str, List[str]]:
        """Close objects, arrays and strings left open by truncated input.

        The text is scanned once while tracking open ``{``/``[`` (ignoring
        anything inside double-quoted strings); the missing closers are then
        appended innermost-first. Input that does not start with ``{`` or
        ``[`` or that has nothing left open is returned unchanged.
        """
        warnings: List[str] = []
        stripped = json_string.strip()

        if not stripped.startswith(('{', '[')):
            return json_string, warnings

        pending: List[str] = []  # closers still owed, outermost first
        in_string = False
        escaped = False
        for ch in stripped:
            if in_string:
                if escaped:
                    escaped = False
                elif ch == '\\':
                    escaped = True
                elif ch == '"':
                    in_string = False
            elif ch == '"':
                in_string = True
            elif ch == '{':
                pending.append('}')
            elif ch == '[':
                pending.append(']')
            elif pending and ch == pending[-1]:
                pending.pop()

        if not pending:
            return json_string, warnings

        repaired = stripped
        if in_string:
            if escaped:
                # A dangling backslash would escape the quote added below.
                repaired = repaired[:-1]
            repaired += '"'
            warnings.append('Closed unterminated string')
        repaired += ''.join(reversed(pending))

        missing_braces = pending.count('}')
        missing_brackets = pending.count(']')
        if missing_braces:
            warnings.append(f'Added {missing_braces} closing brace(s)')
        if missing_brackets:
            warnings.append(f'Added {missing_brackets} closing bracket(s)')

        return repaired, warnings

    def _fix_invalid_quotes(self, json_string: str) -> Tuple[str, List[str]]:
        """Convert single-quoted strings to double-quoted JSON strings.

        Handles input such as ``{'name': 'test'}``. Apostrophes inside
        double-quoted strings are left untouched; inside a single-quoted
        string ``\\'`` becomes ``'`` and a bare ``"`` becomes ``\\"`` so the
        converted string stays well-formed.
        """
        warnings: List[str] = []
        pieces: List[str] = []
        delimiter: Optional[str] = None  # quote char of the string we are in
        i = 0
        length = len(json_string)

        while i < length:
            ch = json_string[i]

            if ch == '\\' and delimiter is not None and i + 1 < length:
                following = json_string[i + 1]
                if delimiter == "'" and following == "'":
                    pieces.append("'")
                else:
                    # Preserve every other escape sequence verbatim
                    pieces.append(ch + following)
                i += 2
                continue

            if delimiter is None:
                if ch == "'":
                    delimiter = "'"
                    pieces.append('"')
                else:
                    if ch == '"':
                        delimiter = '"'
                    pieces.append(ch)
            elif delimiter == '"':
                if ch == '"':
                    delimiter = None
                pieces.append(ch)
            elif ch == "'":
                delimiter = None
                pieces.append('"')
            elif ch == '"':
                pieces.append('\\"')
            else:
                pieces.append(ch)
            i += 1

        modified = ''.join(pieces)
        if modified != json_string:
            warnings.append('Replaced single quotes with double quotes')

        return modified, warnings

    def _remove_control_characters(self, json_string: str) -> Tuple[str, List[str]]:
        """Remove control characters that break JSON parsing.

        Tab, newline and carriage return are kept; see
        ``_handle_escaped_newlines`` for those.
        """
        warnings: List[str] = []
        modified, removed = self._CONTROL_CHARS_RE.subn('', json_string)

        if removed:
            warnings.append(f'Removed {removed} control character(s)')
            return modified, warnings

        return json_string, warnings

    def _handle_escaped_newlines(self, json_string: str) -> Tuple[str, List[str]]:
        """Escape raw newlines, carriage returns and tabs inside strings.

        ``json.loads`` rejects these characters when they appear literally
        inside a string, so they are replaced by ``\\n``, ``\\r`` and ``\\t``.
        Whitespace between tokens is left as it is.
        """
        warnings: List[str] = []
        pieces: List[str] = []
        in_string = False
        escaped = False
        fixed = 0

        for ch in json_string:
            if in_string:
                if escaped:
                    escaped = False
                elif ch == '\\':
                    escaped = True
                elif ch == '"':
                    in_string = False
                elif ch in self._RAW_WHITESPACE_ESCAPES:
                    pieces.append(self._RAW_WHITESPACE_ESCAPES[ch])
                    fixed += 1
                    continue
            elif ch == '"':
                in_string = True
            pieces.append(ch)

        if not fixed:
            return json_string, warnings

        warnings.append(f'Escaped {fixed} raw whitespace character(s) inside strings')
        return ''.join(pieces), warnings

    def _fix_common_syntax_errors(self, json_string: str) -> Tuple[str, List[str]]:
        """Insert commas between adjacent containers and colons after keys.

        Two repairs are made, both only outside double-quoted strings:
        ``] [`` / ``} {`` get a comma after the closing bracket, and a quoted
        key in key position that is followed by whitespace and a value gets
        the missing colon.
        """
        warnings: List[str] = []

        mask = self._string_mask(json_string)

        def add_comma(match):
            if mask[match.start()]:
                return match.group(0)
            warnings.append('Added missing comma between adjacent elements')
            return f'{match.group(1)},{match.group(2)}{match.group(3)}'

        modified = self._MISSING_COMMA_RE.sub(add_comma, json_string)

        # Positions shifted if commas were inserted, so rebuild the mask.
        mask = self._string_mask(modified)

        def add_colon(match):
            if mask[match.start()]:
                return match.group(0)
            warnings.append(f'Added missing colon for property: {match.group(2)}')
            return f'{match.group(1)}{match.group(2)}:{match.group(3)}'

        modified = self._MISSING_COLON_RE.sub(add_colon, modified)

        return modified, warnings

    def _attempt_partial_parse(self, json_string: str) -> Tuple[str, List[str]]:
        """Extract the largest valid JSON object or array from a larger string.

        Every ``{`` and ``[`` is tried as the start of a document; a
        candidate that decodes is skipped over as a whole, so a nested value
        is never preferred to the container around it. The longest candidate
        is returned. If nothing decodes, or the whole input is the only
        candidate, the input is returned unchanged.
        """
        warnings: List[str] = []
        decoder = json.JSONDecoder()
        best: Optional[Tuple[int, int]] = None
        index = 0
        length = len(json_string)

        while index < length:
            if json_string[index] not in '{[':
                index += 1
                continue
            try:
                _, end = decoder.raw_decode(json_string, index)
            except json.JSONDecodeError:
                index += 1
                continue
            if best is None or end - index > best[1] - best[0]:
                best = (index, end)
            index = end

        if best is None or best == (0, length):
            return json_string, warnings

        warnings.append('Extracted valid JSON from larger string')
        return json_string[best[0]:best[1]], warnings

    def _apply_aggressive_recovery(self, json_string: str) -> str:
        """Apply a coarser clean-up between recovery attempts.

        First looks for valid JSON inside a Markdown ``json`` code fence or
        between the outermost braces/brackets and returns the first candidate
        that decodes. Otherwise returns the input with lines that look like
        prose or Markdown removed (each remaining line is stripped).
        """
        for pattern in self._EMBEDDED_JSON_PATTERNS:
            for candidate in pattern.findall(json_string):
                try:
                    json.loads(candidate)
                except (json.JSONDecodeError, ValueError, TypeError):
                    continue
                return candidate  # Return the first valid JSON found

        # No embedded JSON found: drop lines that are clearly not JSON.
        # NOTE(review): the '-' prefix also drops lines that begin with a
        # negative number; kept as in the original heuristic.
        kept_lines = []
        for line in json_string.split('\n'):
            line = line.strip()
            if line.startswith(self._NON_JSON_LINE_PREFIXES):
                continue
            if line.endswith(self._NON_JSON_LINE_SUFFIXES):
                continue
            kept_lines.append(line)

        return '\n'.join(kept_lines)

    def _get_string_context(self, json_string: str, position: int) -> bool:
        """Return True if ``position`` is inside a double-quoted JSON string.

        Counts the unescaped double quotes before ``position``; an odd count
        means a string is open at that point.
        """
        quote_count = 0
        skip_next = False

        for char in json_string[:position]:
            if skip_next:
                skip_next = False
            elif char == '\\':
                skip_next = True
            elif char == '"':
                quote_count += 1

        return quote_count % 2 == 1

    def get_stats(self) -> Dict[str, Union[int, float]]:
        """Return a snapshot of the parsing statistics with derived rates.

        The result is a new dictionary holding the raw counters plus
        ``success_rate``, ``recovery_rate``, ``failure_rate`` (fractions of
        ``total_parses``) and ``avg_recovery_time`` (milliseconds per
        recovered parse). All derived values are 0.0 when there is nothing
        to divide by.
        """
        snapshot: Dict[str, Union[int, float]] = dict(self.stats)
        total = snapshot['total_parses']
        recovered = snapshot['recovered_parses']

        if total > 0:
            snapshot['success_rate'] = (snapshot['successful_parses'] + recovered) / total
            snapshot['recovery_rate'] = recovered / total
            snapshot['failure_rate'] = snapshot['failed_parses'] / total
        else:
            snapshot['success_rate'] = 0.0
            snapshot['recovery_rate'] = 0.0
            snapshot['failure_rate'] = 0.0

        snapshot['avg_recovery_time'] = (
            snapshot['total_recovery_time'] / recovered if recovered > 0 else 0.0
        )
        return snapshot

    def reset_stats(self) -> None:
        """Reset parsing statistics"""
        self.stats = self._new_stats()

544 

545 

# Global instance for easy import. Created with the default settings and
# shared by the module-level helper functions below, so their statistics
# accumulate in this single object.
parser = RobustJSONParser()

548 

549 

def parse_json(json_string: str, context: Optional[Dict] = None) -> ParseResult:
    """Parse ``json_string`` with the shared module-level parser.

    Args:
        json_string: JSON text to parse.
        context: Optional context information, forwarded to the parser.

    Returns:
        The ParseResult produced by the shared parser's ``parse`` method.
    """
    result = parser.parse(json_string, context)
    return result

553 

554 

def get_parser_stats() -> Dict[str, Union[int, float]]:
    """Return the statistics of the shared module-level parser."""
    shared_stats = parser.get_stats()
    return shared_stats

558 

559 

def reset_parser_stats() -> None:
    """Zero the statistics of the shared module-level parser."""
    parser.reset_stats()
    return None

563 

564 

# Test suite
if __name__ == "__main__":
    # Smoke test: one valid document followed by one sample for each kind of
    # damage the parser is meant to repair.
    test_cases = [
        '{"name": "test", "value": 123}',  # Valid JSON
        '{name: "test", value: 123}',  # Missing quotes
        '{"name": "test", "value": 123,}',  # Trailing comma
        '{"name": "test\\invalid", "value": 123}',  # Invalid escape sequences
        '{"name": "test"',  # Partial object
        "{'name': 'test', 'value': 123}",  # Mixed single quotes
        '{"name": "test\x00", "value": 123}',  # Control characters
    ]

    print("Testing Robust JSON Parser...")

    for number, sample in enumerate(test_cases, start=1):
        print(f"\nTest {number}: {sample[:50]}...")

        outcome = parser.parse(sample)
        attempts = outcome.recovery_attempts

        if not outcome.success:
            print(f"✗ Failed (attempts: {attempts})")
            print(f" Error: {outcome.error}")
        else:
            print(f"✓ Success (attempts: {attempts})")
            print(f" Data: {outcome.data}")

        if outcome.warnings:
            print(f" Warnings: {outcome.warnings}")

    # Summary of the counters accumulated over the samples above.
    print("\nParser Statistics:")
    for key, value in parser.get_stats().items():
        print(f" {key}: {value}")