# Coverage for src/moai_adk/core/robust_json_parser.py: 15.28% (288 statements)
# Report generated by coverage.py v7.12.0, created at 2025-11-20 20:52 +0900
"""
Robust JSON Parser for MoAI-ADK

Production-ready JSON parser with automatic error recovery, comprehensive logging,
and fallback strategies to handle malformed JSON input from various sources.

Author: MoAI-ADK Core Team
Version: 1.0.0
"""
import json
import logging
import re
from dataclasses import dataclass
from enum import Enum
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
# Module-level logger; handlers and levels are left to the host application.
logger = logging.getLogger(__name__)
class ErrorSeverity(Enum):
    """Error severity levels for classification."""

    LOW = "low"
    MEDIUM = "medium"
    HIGH = "high"
    CRITICAL = "critical"
@dataclass
class ParseResult:
    """Result of JSON parsing with metadata."""

    # True when a document was decoded, either directly or after recovery.
    success: bool
    # Decoded value on success, otherwise None.
    data: Optional[Any]
    # Text of the last decode error on failure, otherwise None.
    error: Optional[str]
    # The input exactly as it was handed to the parser.
    original_input: str
    # Number of recovery rounds that were run (0 for a direct parse).
    recovery_attempts: int
    # Classification of how the parse went.
    severity: ErrorSeverity
    # Elapsed time for the whole parse call, in milliseconds.
    parse_time_ms: float
    # Human-readable notes about every repair that was applied.
    warnings: List[str]
class RobustJSONParser:
    """
    JSON parser that retries malformed input through a chain of repair strategies.

    Features:
    - Multiple error recovery strategies, tried in a fixed order
    - Optional logging of each step through the module-level ``logger``
    - Per-instance counters and timing, exposed through ``get_stats``
    - Fallback extraction of a JSON document embedded in surrounding text
    """

    # Characters that may directly follow a backslash inside a JSON string
    # (RFC 8259, section 7). "\u" is checked separately: it needs four hex digits.
    _SIMPLE_ESCAPES = frozenset('"\\/bfnrt')
    _UNICODE_ESCAPE = re.compile(r'u[0-9a-fA-F]{4}')
    # Raw whitespace characters that are illegal inside a JSON string and the
    # escape each one is rewritten to.
    _RAW_WHITESPACE_ESCAPES = {'\n': '\\n', '\r': '\\r', '\t': '\\t'}

    def __init__(self, max_recovery_attempts: int = 3, enable_logging: bool = True):
        """
        Args:
            max_recovery_attempts: Maximum number of recovery rounds per parse.
            enable_logging: When False, nothing is written to the logger.
        """
        self.max_recovery_attempts = max_recovery_attempts
        self.enable_logging = enable_logging
        self.error_patterns = self._load_error_patterns()
        self.recovery_strategies = self._load_recovery_strategies()
        self.stats = self._new_stats()

    @staticmethod
    def _new_stats() -> Dict[str, Union[int, float]]:
        """Return a zeroed statistics dictionary."""
        return {
            'total_parses': 0,
            'successful_parses': 0,
            'recovered_parses': 0,
            'failed_parses': 0,
            'total_recovery_time': 0.0,
        }

    def _load_error_patterns(self) -> Dict[str, re.Pattern]:
        """Load common JSON error patterns"""
        return {
            'missing_quotes': re.compile(r'(?<!\\)"(?:[^"\\]|\\.)*$'),
            'trailing_comma': re.compile(r',\s*[}\]]'),
            'escape_sequence': re.compile(r'\\(?![nrtbf"\'\\/])'),
            'partial_object': re.compile(r'^\s*\{[^}]*\s*$'),
            'missing_brace': re.compile(r'^[^{]*\{[^}]*[^}]*$'),
            'invalid_quotes': re.compile(r'(?<!\\)"(?:[^"\\]|\\.)*?[^\\]"(?![\s,}\]:])'),
            'control_chars': re.compile(r'[\x00-\x1F\x7F-\x9F]'),
        }

    def _load_recovery_strategies(self) -> List[Callable[[str], Tuple[str, List[str]]]]:
        """Load error recovery strategies in order of application"""
        return [
            self._fix_missing_quotes,
            self._fix_trailing_commas,
            self._fix_escape_sequences,
            self._handle_partial_objects,
            self._fix_invalid_quotes,
            self._remove_control_characters,
            self._handle_escaped_newlines,
            self._fix_common_syntax_errors,
            self._attempt_partial_parse,
        ]

    def parse(self, json_string: str, context: Optional[Dict] = None) -> "ParseResult":
        """
        Parse JSON string with comprehensive error recovery.

        The input is first decoded as-is. If that fails, up to
        ``max_recovery_attempts`` rounds are run; each round applies the first
        strategy that changes the text and then retries the decode. Between
        failed rounds a more aggressive clean-up is applied.

        Args:
            json_string: JSON string to parse
            context: Optional context information; appended to the error log
                entry when every recovery attempt fails

        Returns:
            ParseResult with data and metadata. The counters in ``self.stats``
            are updated as a side effect.
        """
        import time

        started = time.perf_counter()
        self.stats['total_parses'] += 1
        warnings: List[str] = []

        def build_result(success, data, error, attempts, severity):
            # Single place where the elapsed time is sampled and the shared
            # metadata is attached.
            return ParseResult(
                success=success,
                data=data,
                error=error,
                original_input=json_string,
                recovery_attempts=attempts,
                severity=severity,
                parse_time_ms=(time.perf_counter() - started) * 1000,
                warnings=warnings,
            )

        # Initial validation
        if not isinstance(json_string, str):
            self.stats['failed_parses'] += 1
            return build_result(
                False,
                None,
                f"Input must be string, got {type(json_string)}",
                0,
                ErrorSeverity.CRITICAL,
            )

        # Try direct parsing first
        try:
            data = json.loads(json_string)
        except json.JSONDecodeError as e:
            if self.enable_logging:
                logger.warning(f"Initial JSON parse failed: {e.msg} at line {e.lineno}, col {e.colno}")
            last_error = str(e)
        else:
            self.stats['successful_parses'] += 1
            if self.enable_logging:
                logger.debug("JSON parsed successfully on first attempt")
            return build_result(True, data, None, 0, ErrorSeverity.LOW)

        # Apply recovery strategies
        recovery_started = time.perf_counter()
        current_input = json_string
        recovery_attempts = 0

        for attempt in range(self.max_recovery_attempts):
            recovery_attempts += 1
            current_input = self._apply_first_effective_strategy(current_input, warnings)

            try:
                data = json.loads(current_input)
            except json.JSONDecodeError as e:
                last_error = str(e)
                if self.enable_logging:
                    logger.debug(f"Parse attempt {attempt + 1} failed: {e.msg}")

                # Nothing left to prepare for once the last round has failed.
                if attempt == self.max_recovery_attempts - 1:
                    break

                # Make more aggressive modifications for next attempt
                current_input = self._apply_aggressive_recovery(current_input)
                continue

            self.stats['recovered_parses'] += 1
            # Seconds spent in recovery; feeds 'avg_recovery_time' in get_stats.
            self.stats['total_recovery_time'] += time.perf_counter() - recovery_started
            if self.enable_logging:
                logger.info(f"JSON recovered after {recovery_attempts} attempts")
            return build_result(True, data, None, recovery_attempts, ErrorSeverity.MEDIUM)

        # All recovery attempts failed
        self.stats['failed_parses'] += 1
        if self.enable_logging:
            context_note = f" (context: {context})" if context else ""
            logger.error(
                f"JSON parsing failed after {recovery_attempts} recovery attempts: {last_error}{context_note}"
            )
        return build_result(
            False,
            None,
            last_error,
            recovery_attempts,
            ErrorSeverity.HIGH if recovery_attempts > 0 else ErrorSeverity.CRITICAL,
        )

    def _apply_first_effective_strategy(self, current_input: str, warnings: List[str]) -> str:
        """Run strategies in order and return the output of the first one that changes the text.

        Warnings from the strategy that was applied are appended to
        ``warnings``. The input is returned unchanged when no strategy helps.
        """
        for strategy in self.recovery_strategies:
            try:
                candidate, applied_warnings = strategy(current_input)
            except Exception as strategy_error:
                # Best effort: a crashing strategy must not abort the recovery.
                if self.enable_logging:
                    logger.debug(f"Recovery strategy {strategy.__name__} failed: {strategy_error}")
                continue

            if candidate != current_input:
                warnings.extend(applied_warnings)
                if self.enable_logging:
                    logger.debug(f"Applied recovery strategy: {strategy.__name__}")
                return candidate

        return current_input

    def _fix_missing_quotes(self, json_string: str) -> Tuple[str, List[str]]:
        """Add double quotes around unquoted property names (``name:`` -> ``"name":``)."""
        warnings = []
        modified = json_string

        # Pattern: property_name: (instead of "property_name":)
        matches = list(re.finditer(r'(\s*)([a-zA-Z_][a-zA-Z0-9_]*)\s*:', json_string))

        # Edit from the end of the text backwards so the offsets of the
        # matches still to be processed stay valid.
        for match in reversed(matches):
            # Skip matches inside a string (to avoid false positives)
            if self._get_string_context(modified, match.start()):
                continue

            replacement = f'{match.group(1)}"{match.group(2)}":'
            modified = modified[:match.start()] + replacement + modified[match.end():]
            warnings.append(f'Added quotes to property name: {match.group(2)}')

        return modified, warnings

    def _fix_trailing_commas(self, json_string: str) -> Tuple[str, List[str]]:
        """Remove trailing commas in objects and arrays"""
        warnings = []
        modified = json_string

        # A comma followed only by whitespace before } or ]
        matches = list(re.finditer(r',(\s*[}\]])', json_string))

        # Back-to-front, so earlier match offsets are unaffected by the edits.
        for match in reversed(matches):
            if self._get_string_context(modified, match.start()):
                continue

            modified = modified[:match.start()] + match.group(1) + modified[match.end():]
            warnings.append('Removed trailing comma')

        return modified, warnings

    def _fix_escape_sequences(self, json_string: str) -> Tuple[str, List[str]]:
        """Drop backslashes that do not start a valid JSON escape.

        Only double-quoted strings are touched. Valid escapes (``\\"``,
        ``\\\\``, ``\\/``, ``\\b``, ``\\f``, ``\\n``, ``\\r``, ``\\t`` and
        ``\\uXXXX``) are copied through; for anything else the backslash is
        removed and the following character is kept.
        """
        warnings = []
        pieces = []
        in_string = False
        index = 0
        length = len(json_string)

        while index < length:
            char = json_string[index]

            if char == '\\':
                if not in_string:
                    # Outside strings the backslash and the character after it
                    # are copied as a pair, matching _get_string_context.
                    pieces.append(json_string[index:index + 2])
                    index += 2
                    continue

                following = json_string[index + 1:index + 2]
                if following in self._SIMPLE_ESCAPES:
                    pieces.append(char + following)
                    index += 2
                    continue

                if self._UNICODE_ESCAPE.match(json_string, index + 1):
                    pieces.append(json_string[index:index + 6])
                    index += 6
                    continue

                # Invalid escape: skip only the backslash.
                warnings.append('Removed invalid escape sequence')
                index += 1
                continue

            if char == '"':
                in_string = not in_string
            pieces.append(char)
            index += 1

        return ''.join(pieces), warnings

    def _handle_partial_objects(self, json_string: str) -> Tuple[str, List[str]]:
        """Close a truncated object or array.

        Walks the text while tracking double-quoted strings and open
        containers, then appends whatever is still owed: a closing quote for
        an unterminated string and the closers in nesting order. Text with
        mismatched closers is returned unchanged.
        """
        warnings = []
        stripped = json_string.strip()

        if not stripped.startswith(('{', '[')):
            return json_string, warnings

        closer_for = {'{': '}', '[': ']'}
        owed = []  # closers still required, innermost last
        in_string = False
        escaped = False

        for char in stripped:
            if in_string:
                if escaped:
                    escaped = False
                elif char == '\\':
                    escaped = True
                elif char == '"':
                    in_string = False
                continue

            if char == '"':
                in_string = True
            elif char in closer_for:
                owed.append(closer_for[char])
            elif char in '}]':
                if not owed or owed[-1] != char:
                    # Structure is inconsistent; appending closers cannot fix it.
                    return json_string, warnings
                owed.pop()

        if not in_string and not owed:
            return json_string, warnings

        modified = stripped
        if in_string:
            if escaped:
                # A dangling backslash would escape the quote being added.
                modified = modified[:-1]
            modified += '"'
            warnings.append('Closed unterminated string')

        missing_braces = owed.count('}')
        missing_brackets = owed.count(']')
        modified += ''.join(reversed(owed))
        if missing_braces:
            warnings.append(f'Added {missing_braces} closing brace(s)')
        if missing_brackets:
            warnings.append(f'Added {missing_brackets} closing bracket(s)')

        return modified, warnings

    def _fix_invalid_quotes(self, json_string: str) -> Tuple[str, List[str]]:
        """Replace single-quote string delimiters with double quotes.

        Handles cases like {'name': 'test'} -> {"name": "test"}. Apostrophes
        inside double-quoted strings are left alone, and ``\\'`` outside
        double-quoted strings becomes a plain apostrophe because JSON has no
        such escape.
        """
        warnings = []
        pieces = []
        in_double = False
        index = 0
        length = len(json_string)

        while index < length:
            char = json_string[index]

            if char == '\\' and index + 1 < length:
                following = json_string[index + 1]
                if following == "'" and not in_double:
                    pieces.append("'")
                else:
                    # Preserve every other escape sequence untouched.
                    pieces.append(char + following)
                index += 2
                continue

            if char == '"':
                in_double = not in_double
                pieces.append(char)
            elif char == "'" and not in_double:
                pieces.append('"')
            else:
                pieces.append(char)
            index += 1

        final_modified = ''.join(pieces)
        if final_modified != json_string:
            warnings.append('Replaced single quotes with double quotes')

        return final_modified, warnings

    def _remove_control_characters(self, json_string: str) -> Tuple[str, List[str]]:
        """Remove control characters that break JSON parsing"""
        warnings = []

        # Tab, newline and carriage return are excluded from the pattern.
        pattern = r'[\x00-\x08\x0B\x0C\x0E-\x1F\x7F-\x9F]'
        modified, removed = re.subn(pattern, '', json_string)

        if removed:
            warnings.append(f'Removed {removed} control character(s)')
            return modified, warnings

        return json_string, warnings

    def _handle_escaped_newlines(self, json_string: str) -> Tuple[str, List[str]]:
        """Escape raw newlines, carriage returns and tabs inside strings.

        JSON forbids these characters inside a string literal, so each one
        found between double quotes is rewritten as ``\\n``, ``\\r`` or
        ``\\t``. Whitespace outside strings is left as it is.
        """
        warnings = []
        pieces = []
        in_string = False
        escaped = False

        for char in json_string:
            if escaped:
                escaped = False
            elif char == '\\':
                escaped = True
            elif char == '"':
                in_string = not in_string
            elif in_string and char in self._RAW_WHITESPACE_ESCAPES:
                pieces.append(self._RAW_WHITESPACE_ESCAPES[char])
                continue
            pieces.append(char)

        modified = ''.join(pieces)
        if modified != json_string:
            warnings.append('Normalized escaped newlines')

        return modified, warnings

    def _fix_common_syntax_errors(self, json_string: str) -> Tuple[str, List[str]]:
        """Fix common JSON syntax errors: missing commas between arrays and missing colons."""
        warnings = []
        modified = json_string

        # Fix missing commas between array elements: "][" -> "],["
        for match in reversed(list(re.finditer(r'\](\s*)\[', modified))):
            if self._get_string_context(modified, match.start()):
                continue
            replacement = f'],{match.group(1)}['
            modified = modified[:match.start()] + replacement + modified[match.end():]
            warnings.append('Added missing comma between array elements')

        # Fix missing colons in object properties: "key" value -> "key":value
        # NOTE(review): this also fires for two adjacent strings in an array.
        pattern = r'("[^"]+")\s+("[^"]*"|\d+|true|false|null|\{|\[)'
        for match in reversed(list(re.finditer(pattern, modified))):
            # An odd quote count here means the match begins on a closing quote.
            if self._get_string_context(modified, match.start()):
                continue
            replacement = f'{match.group(1)}:{match.group(2)}'
            modified = modified[:match.start()] + replacement + modified[match.end():]
            warnings.append(f'Added missing colon for property: {match.group(1)}')

        return modified, warnings

    def _attempt_partial_parse(self, json_string: str) -> Tuple[str, List[str]]:
        """Attempt to extract and parse valid JSON from a larger string.

        Objects are searched before arrays. For each kind, candidates are
        tried from left to right, so the outermost valid document at the
        earliest position wins, regardless of nesting depth.
        """
        warnings = []
        decoder = json.JSONDecoder()

        for opener in ('{', '['):
            start = json_string.find(opener)
            while start != -1:
                try:
                    _, end = decoder.raw_decode(json_string, start)
                except json.JSONDecodeError:
                    start = json_string.find(opener, start + 1)
                    continue

                warnings.append('Extracted valid JSON from larger string')
                return json_string[start:end], warnings

        return json_string, warnings

    def _apply_aggressive_recovery(self, json_string: str) -> str:
        """Apply more aggressive recovery for final attempts.

        First looks for an embedded document that already decodes; if none is
        found, lines that look like surrounding prose or markup are dropped.
        """
        modified = json_string

        # Look for patterns that might contain JSON
        json_patterns = [
            r'```json\s*(.*?)\s*```',  # Markdown code blocks
            r'\{.*\}',  # Anything between braces
            r'\[.*\]',  # Anything between brackets
        ]

        for pattern in json_patterns:
            for candidate in re.findall(pattern, modified, re.DOTALL):
                try:
                    json.loads(candidate)
                except (json.JSONDecodeError, ValueError, TypeError):
                    continue
                return candidate  # Return the first valid JSON found

        # If no JSON found, drop lines that are clearly not JSON
        noise_prefixes = ('```', '#', '*', '-', '>', 'Error:', 'Success:')
        noise_suffixes = ('```', '.')
        kept_lines = [
            line
            for line in (raw_line.strip() for raw_line in modified.split('\n'))
            if not (line.startswith(noise_prefixes) or line.endswith(noise_suffixes))
        ]

        return '\n'.join(kept_lines)

    def _get_string_context(self, json_string: str, position: int) -> bool:
        """Check if position is inside a JSON string.

        Counts the unescaped double quotes before ``position``; an odd count
        means the position lies inside a string.
        """
        quote_count = 0
        escape_next = False

        for char in json_string[:position]:
            if escape_next:
                escape_next = False
            elif char == '\\':
                escape_next = True
            elif char == '"':
                quote_count += 1

        return quote_count % 2 == 1  # True if inside string

    def get_stats(self) -> Dict[str, Union[int, float]]:
        """Get parsing statistics.

        Returns:
            A new dictionary holding the raw counters plus ``success_rate``,
            ``recovery_rate``, ``failure_rate`` and ``avg_recovery_time``
            (seconds per recovered parse). The derived values are 0 when
            there is nothing to average over.
        """
        total = self.stats['total_parses']
        recovered = self.stats['recovered_parses']
        succeeded = self.stats['successful_parses'] + recovered

        return {
            **self.stats,
            'success_rate': succeeded / total if total > 0 else 0,
            'recovery_rate': recovered / total if total > 0 else 0,
            'failure_rate': self.stats['failed_parses'] / total if total > 0 else 0,
            'avg_recovery_time': self.stats['total_recovery_time'] / recovered if recovered > 0 else 0,
        }

    def reset_stats(self) -> None:
        """Reset parsing statistics"""
        self.stats = self._new_stats()
# Global instance for easy import; the module-level helpers below share it,
# so its statistics accumulate across all of their calls.
parser = RobustJSONParser()
def parse_json(json_string: str, context: Optional[Dict] = None) -> ParseResult:
    """Convenience function to parse JSON with error recovery.

    Delegates to the module-level ``parser`` instance.

    Args:
        json_string: JSON string to parse.
        context: Optional context information, passed through unchanged.

    Returns:
        The ParseResult produced by ``parser.parse``.
    """
    return parser.parse(json_string, context)
def get_parser_stats() -> Dict[str, Union[int, float]]:
    """Get global parser statistics, as reported by ``parser.get_stats()``."""
    return parser.get_stats()
def reset_parser_stats() -> None:
    """Reset global parser statistics by calling ``parser.reset_stats()``."""
    parser.reset_stats()
# Test suite
if __name__ == "__main__":
    # Smoke-test inputs: one valid document, then one sample for each kind of
    # malformation the parser tries to repair.
    test_cases = [
        # Valid JSON
        '{"name": "test", "value": 123}',

        # Missing quotes
        '{name: "test", value: 123}',

        # Trailing comma
        '{"name": "test", "value": 123,}',

        # Invalid escape sequences
        '{"name": "test\\invalid", "value": 123}',

        # Partial object
        '{"name": "test"',

        # Mixed single quotes
        "{'name': 'test', 'value': 123}",

        # Control characters
        '{"name": "test\x00", "value": 123}',
    ]

    print("Testing Robust JSON Parser...")

    for number, test_case in enumerate(test_cases, start=1):
        print(f"\nTest {number}: {test_case[:50]}...")

        result = parser.parse(test_case)

        if result.success:
            print(f"✓ Success (attempts: {result.recovery_attempts})")
            print(f"  Data: {result.data}")
        else:
            print(f"✗ Failed (attempts: {result.recovery_attempts})")
            print(f"  Error: {result.error}")

        if result.warnings:
            print(f"  Warnings: {result.warnings}")

    print("\nParser Statistics:")
    stats = parser.get_stats()
    for key, value in stats.items():
        print(f"  {key}: {value}")