Coverage for src/moai_adk/core/spec/confidence_scoring.py: 9.93% (272 statements)
coverage.py v7.12.0 report, created at 2025-11-20 20:52 +0900
1"""Confidence Scoring System for Auto-SPEC Generation."""
import ast
import logging
import re
import time
from typing import Any, Dict, List, Optional, Tuple
9# Configure logging
10logger = logging.getLogger(__name__)
13# SpecGenerator: Placeholder for spec generation functionality
14class SpecGenerator:
15 """Placeholder SpecGenerator class for confidence scoring."""
17 def __init__(self):
18 self.name = "SpecGenerator"
20 def generate_spec(self, file_path: str, content: str) -> str:
21 """Generate a basic SPEC document."""
22 return f"SPEC document for {file_path}\n\nConfidence analysis: {content[:100]}..."
25class ConfidenceScoringSystem:
26 """
27 Advanced confidence scoring system for auto-generated SPECs.
29 This system analyzes code structure, domain relevance, and documentation
30 quality to provide confidence scores for SPEC auto-generation.
31 """
33 def __init__(self):
34 self.spec_generator = SpecGenerator()
35 self.word_patterns = {
36 "security": [
37 "auth",
38 "login",
39 "password",
40 "encrypt",
41 "security",
42 "bcrypt",
43 "hash",
44 "token",
45 ],
46 "data": [
47 "model",
48 "entity",
49 "schema",
50 "database",
51 "persistence",
52 "storage",
53 "cache",
54 ],
55 "api": [
56 "api",
57 "endpoint",
58 "route",
59 "controller",
60 "service",
61 "handler",
62 "middleware",
63 ],
64 "ui": [
65 "ui",
66 "interface",
67 "component",
68 "widget",
69 "layout",
70 "theme",
71 "display",
72 ],
73 "business": ["business", "logic", "process", "workflow", "rule", "policy"],
74 "testing": [
75 "test",
76 "mock",
77 "fixture",
78 "assertion",
79 "verification",
80 "validation",
81 ],
82 }
84 def analyze_code_structure(self, file_path: str) -> Dict[str, float]:
85 """
86 Analyze code structure quality.
88 Args:
89 file_path: Path to the Python code file
91 Returns:
92 Dictionary with structure scores
93 """
94 try:
95 with open(file_path, "r", encoding="utf-8") as f:
96 code_content = f.read()
98 # Parse AST
99 tree = ast.parse(code_content)
101 structure_scores = {
102 "class_count": 0,
103 "function_count": 0,
104 "method_count": 0,
105 "import_count": 0,
106 "complexity_score": 0.0,
107 "nesting_depth": 0,
108 "docstring_coverage": 0.0,
109 "naming_consistency": 0.0,
110 }
112 # Analyze AST nodes
113 for node in ast.walk(tree):
114 if isinstance(node, ast.ClassDef):
115 structure_scores["class_count"] += 1
116 # Check if class has docstring
117 if ast.get_docstring(node):
118 structure_scores["docstring_coverage"] += 0.1
119 # Count methods
120 for child in ast.walk(node):
121 if isinstance(child, ast.FunctionDef):
122 structure_scores["method_count"] += 1
124 elif isinstance(node, ast.FunctionDef):
125 # Check if function is not a method (not inside a class)
126 is_method = False
127 for parent in ast.walk(tree):
128 if isinstance(parent, ast.ClassDef):
129 for child in parent.body:
130 if child == node:
131 is_method = True
132 break
134 if not is_method:
135 structure_scores["function_count"] += 1
137 # Check if function has docstring
138 if ast.get_docstring(node):
139 structure_scores["docstring_coverage"] += 0.1
141 elif isinstance(node, ast.Import) or isinstance(node, ast.ImportFrom):
142 structure_scores["import_count"] += 1
144 # Calculate complexity score (cyclomatic complexity approximation)
145 structure_scores["complexity_score"] = self._calculate_complexity(tree)
147 # Calculate nesting depth
148 structure_scores["nesting_depth"] = self._calculate_nesting_depth(tree)
150 # Calculate naming consistency
151 structure_scores["naming_consistency"] = self._calculate_naming_consistency(
152 tree
153 )
155 # Normalize scores (0-1 range)
156 max_classes = 5 # Reasonable upper bound
157 max_functions = 20
158 max_methods = 50
159 max_imports = 15
160 max_complexity = 10
161 max_nesting = 5
162 max_docstring = 1.0
164 normalized_scores = {
165 "class_ratio": min(structure_scores["class_count"] / max_classes, 1.0),
166 "function_ratio": min(
167 structure_scores["function_count"] / max_functions, 1.0
168 ),
169 "method_ratio": min(
170 structure_scores["method_count"] / max_methods, 1.0
171 ),
172 "import_ratio": min(
173 structure_scores["import_count"] / max_imports, 1.0
174 ),
175 "complexity_ratio": max(
176 0, 1.0 - structure_scores["complexity_score"] / max_complexity
177 ),
178 "nesting_ratio": max(
179 0, 1.0 - structure_scores["nesting_depth"] / max_nesting
180 ),
181 "docstring_score": min(
182 structure_scores["docstring_coverage"] / max_docstring, 1.0
183 ),
184 "naming_score": structure_scores["naming_consistency"],
185 }
187 return normalized_scores
189 except Exception as e:
190 logger.error(f"Error analyzing code structure: {e}")
191 return self._get_default_structure_scores()
193 def _calculate_complexity(self, tree: ast.AST) -> float:
194 """Calculate approximate cyclomatic complexity."""
195 complexity = 1 # Base complexity
197 for node in ast.walk(tree):
198 if isinstance(node, (ast.If, ast.For, ast.While, ast.Try, ast.With)):
199 complexity += 1
200 elif isinstance(node, ast.BoolOp):
201 complexity += len(node.values) - 1
203 return complexity
205 def _calculate_nesting_depth(self, tree: ast.AST) -> int:
206 """Calculate maximum nesting depth."""
207 max_depth = 0
209 def _visit(node, depth):
210 nonlocal max_depth
211 max_depth = max(max_depth, depth)
213 for child in ast.iter_child_nodes(node):
214 if isinstance(child, (ast.If, ast.For, ast.While, ast.Try, ast.With)):
215 _visit(child, depth + 1)
216 else:
217 _visit(child, depth)
219 _visit(tree, 0)
220 return max_depth
222 def _calculate_naming_consistency(self, tree: ast.AST) -> float:
223 """Calculate naming consistency score."""
224 names = []
226 for node in ast.walk(tree):
227 if isinstance(node, ast.Name):
228 names.append(node.id)
229 elif isinstance(node, ast.FunctionDef):
230 names.append(node.name)
231 elif isinstance(node, ast.ClassDef):
232 names.append(node.name)
234 if not names:
235 return 1.0
237 # Check naming patterns
238 snake_case_count = 0
239 camel_case_count = 0
241 for name in names:
242 if re.match(r"^[a-z]+(?:_[a-z]+)*$", name):
243 snake_case_count += 1
244 elif re.match(r"^[A-Z][a-zA-Z0-9]*$", name):
245 camel_case_count += 1
247 # Calculate consistency
248 total_names = len(names)
249 if total_names == 0:
250 return 1.0
252 snake_case_ratio = snake_case_count / total_names
253 camel_case_ratio = camel_case_count / total_names
255 # Favor consistency over specific style
256 if snake_case_ratio > 0.7 or camel_case_ratio > 0.7:
257 return 0.9
258 elif snake_case_ratio > 0.5 or camel_case_ratio > 0.5:
259 return 0.7
260 else:
261 return 0.5
263 def _get_default_structure_scores(self) -> Dict[str, float]:
264 """Get default structure scores for error cases."""
265 return {
266 "class_ratio": 0.5,
267 "function_ratio": 0.5,
268 "method_ratio": 0.5,
269 "import_ratio": 0.5,
270 "complexity_ratio": 0.7,
271 "nesting_ratio": 0.7,
272 "docstring_score": 0.3,
273 "naming_score": 0.5,
274 }
276 def analyze_domain_relevance(self, file_path: str) -> Dict[str, float]:
277 """
278 Analyze domain relevance and keyword patterns.
280 Args:
281 file_path: Path to the code file
283 Returns:
284 Dictionary with domain relevance scores
285 """
286 try:
287 with open(file_path, "r", encoding="utf-8") as f:
288 content = f.read()
290 # Normalize content
291 content = content.lower()
293 # Check for domain-specific patterns
294 domain_scores = {}
296 for domain, patterns in self.word_patterns.items():
297 matches = 0
298 for pattern in patterns:
299 if pattern in content:
300 matches += 1
301 # Normalize by number of patterns
302 domain_scores[f"{domain}_coverage"] = matches / len(patterns)
304 # Calculate overall domain relevance
305 total_relevance = sum(domain_scores.values())
306 domain_scores["overall_relevance"] = min(
307 total_relevance / len(self.word_patterns), 1.0
308 )
310 # Calculate domain specificity (how focused the code is)
311 max_domain = max(domain_scores.values()) if domain_scores.values() else 0
312 domain_scores["specificity"] = max_domain
314 # Calculate technical vocabulary density
315 technical_words = len(
316 re.findall(
317 r"\b(?:api|endpoint|service|controller|model|entity|schema|database|cache|auth|login|password|token|session|user|admin|customer|product|order|payment|billing|subscription|plan|feature|function|method|class|interface|abstract|extends|implements|override|virtual|static|dynamic|async|await|promise|callback|event|handler|middleware|filter|validator|transformer|processor|worker|thread|queue|job|task|cron|scheduler|config|setting|env|variable|constant|property|attribute|field|column|table|index|constraint|foreign|primary|unique|notnull|default|check|trigger|procedure|function|stored|view|materialized|temp|temporary|permanent|persistent|volatile|in-memory|file-based|disk-based|cloud|distributed|clustered|load-balanced|scalable|high-availability|fault-tolerant|redundant|backup|restore|migration|version|branch|merge|conflict|resolve|commit|push|pull|fork|clone|repository|github|gitlab|bitbucket|ci|cd|pipeline|workflow|deployment|staging|production|development|testing|unit|integration|e2e|performance|load|stress|security|vulnerability|attack|breach|authentication|authorization|encryption|decryption|hash|salt|pepper|session|cookie|jwt|oauth|ldap|saml|rbac|abac|detection|prevention|monitoring|logging|tracing|metrics|analytics|dashboard|report|chart|graph|visualization|ui|ux|frontend|backend|fullstack|mobile|web|desktop|cross-platform|native|hybrid|responsive|adaptive|progressive|spa|pwa|ssr|csr|mvc|mvvm|riot|angular|react|vue|ember|backbone|knockout|jquery|vanilla|plain|pure|framework|library|package|module|bundle|dependency|require|import|export|include|extend|inherit|compose|aggregate|delegate|proxy|facade|adapter|bridge|decorator|singleton|factory|builder|prototype|command|observer|strategy|state|chain|iterator|visitor|mediator|composite|flyweight|proxy|interpreter|template|method|abstract|factory|builder|prototype|singleton|adapter|bridge|composite|decorator|facade|flyweight|proxy|chain|command|iterator|mediator|memento|observer|state|strategy|template|visitor)\b",
318 content,
319 )
320 )
321 total_words = len(content.split())
323 if total_words > 0:
324 domain_scores["technical_density"] = min(
325 technical_words / total_words, 1.0
326 )
327 else:
328 domain_scores["technical_density"] = 0.0
330 return domain_scores
332 except Exception as e:
333 logger.error(f"Error analyzing domain relevance: {e}")
334 return {
335 "security_coverage": 0.0,
336 "data_coverage": 0.0,
337 "api_coverage": 0.0,
338 "ui_coverage": 0.0,
339 "business_coverage": 0.0,
340 "testing_coverage": 0.0,
341 "overall_relevance": 0.5,
342 "specificity": 0.5,
343 "technical_density": 0.3,
344 }
346 def analyze_documentation_quality(self, file_path: str) -> Dict[str, float]:
347 """
348 Analyze documentation quality.
350 Args:
351 file_path: Path to the code file
353 Returns:
354 Dictionary with documentation scores
355 """
356 try:
357 with open(file_path, "r", encoding="utf-8") as f:
358 content = f.read()
360 doc_scores = {
361 "docstring_coverage": 0.0,
362 "comment_density": 0.0,
363 "explanation_quality": 0.0,
364 "examples_present": 0.0,
365 "parameter_documentation": 0.0,
366 "return_documentation": 0.0,
367 "exception_documentation": 0.0,
368 }
370 # Parse AST for docstring analysis
371 tree = ast.parse(content)
373 total_functions = 0
374 documented_functions = 0
375 documented_classes = 0
376 total_classes = 0
378 for node in ast.walk(tree):
379 if isinstance(node, ast.FunctionDef):
380 total_functions += 1
381 if ast.get_docstring(node):
382 documented_functions += 1
383 # Check for parameter documentation
384 if ":" in ast.get_docstring(
385 node
386 ) or "param" in ast.get_docstring(node):
387 doc_scores["parameter_documentation"] = 0.8
388 if "return" in ast.get_docstring(
389 node
390 ) or "->" in ast.get_docstring(node):
391 doc_scores["return_documentation"] = 0.8
392 if "raise" in ast.get_docstring(
393 node
394 ) or "exception" in ast.get_docstring(node):
395 doc_scores["exception_documentation"] = 0.8
397 elif isinstance(node, ast.ClassDef):
398 total_classes += 1
399 if ast.get_docstring(node):
400 documented_classes += 1
402 # Calculate docstring coverage
403 if total_functions > 0:
404 doc_scores["docstring_coverage"] = (
405 documented_functions / total_functions
406 )
407 if total_classes > 0:
408 class_coverage = documented_classes / total_classes
409 doc_scores["docstring_coverage"] = max(
410 doc_scores["docstring_coverage"], class_coverage
411 )
413 # Calculate comment density
414 lines = content.split("\n")
415 comment_lines = 0
416 code_lines = 0
418 for line in lines:
419 stripped = line.strip()
420 if stripped.startswith("#"):
421 comment_lines += 1
422 elif (
423 stripped
424 and not stripped.startswith('"""')
425 and not stripped.startswith("'''")
426 ):
427 code_lines += 1
429 if code_lines > 0:
430 doc_scores["comment_density"] = min(comment_lines / code_lines, 1.0)
432 # Check for examples in docstrings
433 total_docstring_length = len(ast.get_docstring(tree) or "")
434 if total_docstring_length > 0:
435 example_count = len(
436 re.findall(r">>>|Example:|example:|\b\d+\.\s", content)
437 )
438 doc_scores["examples_present"] = min(example_count / 3, 1.0)
440 # Calculate explanation quality based on docstring content
441 docstring_content = ast.get_docstring(tree) or ""
442 if docstring_content:
443 # Check for good explanation indicators
444 explanation_indicators = [
445 "provides",
446 "allows",
447 "enables",
448 "implements",
449 "handles",
450 "processes",
451 "manages",
452 ]
453 explanation_count = sum(
454 1
455 for indicator in explanation_indicators
456 if indicator in docstring_content
457 )
458 doc_scores["explanation_quality"] = min(
459 explanation_count / len(explanation_indicators), 1.0
460 )
462 return doc_scores
464 except Exception as e:
465 logger.error(f"Error analyzing documentation quality: {e}")
466 return {
467 "docstring_coverage": 0.3,
468 "comment_density": 0.2,
469 "explanation_quality": 0.3,
470 "examples_present": 0.0,
471 "parameter_documentation": 0.2,
472 "return_documentation": 0.2,
473 "exception_documentation": 0.1,
474 }
476 def calculate_confidence_score(
477 self,
478 file_path: str,
479 structure_weights: Dict[str, float] = None,
480 domain_weights: Dict[str, float] = None,
481 doc_weights: Dict[str, float] = None,
482 ) -> Tuple[float, Dict[str, Any]]:
483 """
484 Calculate overall confidence score for auto-SPEC generation.
486 Args:
487 file_path: Path to the code file
488 structure_weights: Weights for structure analysis
489 domain_weights: Weights for domain analysis
490 doc_weights: Weights for documentation analysis
492 Returns:
493 Tuple of (confidence_score, detailed_analysis)
494 """
495 start_time = time.time()
497 # Default weights
498 default_structure_weights = {
499 "class_ratio": 0.1,
500 "function_ratio": 0.1,
501 "method_ratio": 0.1,
502 "import_ratio": 0.1,
503 "complexity_ratio": 0.15,
504 "nesting_ratio": 0.15,
505 "docstring_score": 0.15,
506 "naming_score": 0.15,
507 }
509 default_domain_weights = {
510 "overall_relevance": 0.3,
511 "specificity": 0.2,
512 "technical_density": 0.3,
513 "security_coverage": 0.1,
514 "data_coverage": 0.1,
515 }
517 default_doc_weights = {
518 "docstring_coverage": 0.3,
519 "comment_density": 0.2,
520 "explanation_quality": 0.2,
521 "examples_present": 0.1,
522 "parameter_documentation": 0.1,
523 "return_documentation": 0.1,
524 }
526 # Use provided weights or defaults
527 structure_weights = structure_weights or default_structure_weights
528 domain_weights = domain_weights or default_domain_weights
529 doc_weights = doc_weights or default_doc_weights
531 # Analyze code
532 structure_analysis = self.analyze_code_structure(file_path)
533 domain_analysis = self.analyze_domain_relevance(file_path)
534 doc_analysis = self.analyze_documentation_quality(file_path)
536 # Calculate weighted scores
537 structure_score = sum(
538 structure_analysis[key] * structure_weights.get(key, 0)
539 for key in structure_analysis
540 )
542 domain_score = sum(
543 domain_analysis[key] * domain_weights.get(key, 0) for key in domain_analysis
544 )
546 doc_score = sum(
547 doc_analysis[key] * doc_weights.get(key, 0) for key in doc_analysis
548 )
550 # Final confidence score (weighted average)
551 total_weights = (
552 sum(structure_weights.values())
553 + sum(domain_weights.values())
554 + sum(doc_weights.values())
555 )
557 final_confidence = (structure_score + domain_score + doc_score) / total_weights
559 # Round to 2 decimal places
560 final_confidence = round(final_confidence, 2)
562 # Create detailed analysis
563 detailed_analysis = {
564 "file_path": file_path,
565 "analysis_time": time.time() - start_time,
566 "confidence_score": final_confidence,
567 "structure_analysis": {
568 "score": round(structure_score, 2),
569 "details": structure_analysis,
570 "weights": structure_weights,
571 },
572 "domain_analysis": {
573 "score": round(domain_score, 2),
574 "details": domain_analysis,
575 "weights": domain_weights,
576 },
577 "documentation_analysis": {
578 "score": round(doc_score, 2),
579 "details": doc_analysis,
580 "weights": doc_weights,
581 },
582 "recommendations": self._generate_recommendations(
583 structure_analysis, domain_analysis, doc_analysis
584 ),
585 }
587 return final_confidence, detailed_analysis
589 def _generate_recommendations(
590 self, structure_analysis: Dict, domain_analysis: Dict, doc_analysis: Dict
591 ) -> List[str]:
592 """Generate improvement recommendations."""
593 recommendations = []
595 # Structure recommendations
596 if structure_analysis.get("docstring_score", 0) < 0.5:
597 recommendations.append(
598 "Add more docstrings to improve documentation coverage"
599 )
601 if structure_analysis.get("complexity_ratio", 0) < 0.7:
602 recommendations.append(
603 "Consider refactoring complex functions to improve maintainability"
604 )
606 if structure_analysis.get("naming_score", 0) < 0.7:
607 recommendations.append(
608 "Improve naming consistency (use consistent naming convention)"
609 )
611 # Domain recommendations
612 if domain_analysis.get("overall_relevance", 0) < 0.6:
613 recommendations.append(
614 "Add domain-specific terminology to improve relevance"
615 )
617 if domain_analysis.get("technical_density", 0) < 0.3:
618 recommendations.append(
619 "Increase technical vocabulary for better specification"
620 )
622 # Documentation recommendations
623 if doc_analysis.get("examples_present", 0) < 0.5:
624 recommendations.append(
625 "Add usage examples in docstrings for better understanding"
626 )
628 if doc_analysis.get("parameter_documentation", 0) < 0.5:
629 recommendations.append("Document function parameters and return values")
631 return recommendations[:5] # Return top 5 recommendations
633 def validate_confidence_threshold(
634 self, confidence_score: float, threshold: float = 0.7, strict_mode: bool = False
635 ) -> Dict[str, Any]:
636 """
637 Validate confidence score against threshold.
639 Args:
640 confidence_score: Calculated confidence score
641 threshold: Minimum confidence threshold
642 strict_mode: Whether to use strict validation
644 Returns:
645 Validation result
646 """
647 if strict_mode:
648 # Strict mode: all scores must meet threshold
649 meets_threshold = confidence_score >= threshold
650 else:
651 # Normal mode: average score meets threshold
652 meets_threshold = confidence_score >= threshold
654 validation_result = {
655 "meets_threshold": meets_threshold,
656 "confidence_score": confidence_score,
657 "threshold": threshold,
658 "difference": confidence_score - threshold,
659 "recommendation": self._get_threshold_recommendation(
660 confidence_score, threshold
661 ),
662 }
664 return validation_result
666 def _get_threshold_recommendation(
667 self, confidence_score: float, threshold: float
668 ) -> str:
669 """Get recommendation based on confidence score."""
670 if confidence_score >= threshold:
671 if confidence_score >= 0.9:
672 return "Excellent confidence level - auto-spec generation recommended"
673 elif confidence_score >= 0.8:
674 return "Good confidence level - auto-spec generation recommended"
675 else:
676 return "Acceptable confidence level - auto-spec generation recommended"
677 else:
678 if confidence_score >= 0.6:
679 return "Marginal confidence level - manual review recommended"
680 elif confidence_score >= 0.4:
681 return "Low confidence level - significant improvements needed"
682 else:
683 return "Very low confidence level - complete redesign recommended"
685 def get_confidence_breakdown(self, confidence_score: float) -> Dict[str, Any]:
686 """Get detailed breakdown of confidence score components."""
687 return {
688 "overall_score": confidence_score,
689 "interpretation": self._interpret_confidence_score(confidence_score),
690 "risk_level": self._get_risk_level(confidence_score),
691 "action_required": self._get_action_required(confidence_score),
692 }
694 def _interpret_confidence_score(self, score: float) -> str:
695 """Interpret confidence score meaning."""
696 if score >= 0.9:
697 return "Excellent - Very high likelihood of generating a quality SPEC"
698 elif score >= 0.8:
699 return "Good - High likelihood of generating a quality SPEC"
700 elif score >= 0.7:
701 return "Acceptable - Moderate likelihood of generating a quality SPEC"
702 elif score >= 0.6:
703 return "Marginal - Low likelihood of generating a quality SPEC"
704 elif score >= 0.4:
705 return "Poor - Very low likelihood of generating a quality SPEC"
706 else:
707 return "Very Poor - Extremely low likelihood of generating a quality SPEC"
709 def _get_risk_level(self, score: float) -> str:
710 """Get risk level based on confidence score."""
711 if score >= 0.8:
712 return "Low"
713 elif score >= 0.6:
714 return "Medium"
715 elif score >= 0.4:
716 return "High"
717 else:
718 return "Critical"
720 def _get_action_required(self, score: float) -> str:
721 """Get required action based on confidence score."""
722 if score >= 0.7:
723 return "Auto-generate SPEC"
724 elif score >= 0.5:
725 return "Generate SPEC with manual review"
726 else:
727 return "Do not auto-generate - require manual creation"
730# Utility function for backwards compatibility
731def calculate_completion_confidence(analysis: Dict[str, Any]) -> float:
732 """
733 Backwards compatibility function.
735 Args:
736 analysis: Code analysis result
738 Returns:
739 Confidence score
740 """
741 scorer = ConfidenceScoringSystem()
743 # Extract file path from analysis or use default
744 file_path = analysis.get("file_path", "dummy_file.py")
746 # Calculate confidence score
747 confidence, detailed_analysis = scorer.calculate_confidence_score(file_path)
749 return confidence