Coverage for src / moai_adk / core / language_validator.py: 0.00%
209 statements
« prev ^ index » next coverage.py v7.12.0, created at 2025-11-20 20:52 +0900
« prev ^ index » next coverage.py v7.12.0, created at 2025-11-20 20:52 +0900
1"""
2Language Validator
4Provides comprehensive language validation capabilities for programming languages.
6"""
8from pathlib import Path
9from typing import Any, Dict, List, Optional, Set, Tuple
12# Language detector functionality removed due to missing dependency
13# Using simplified language detection for now
def get_all_supported_languages():
    """Return the language codes known to the simplified detector.

    Returns:
        A new set of lowercase language codes on every call.
    """
    languages = (
        "python",
        "javascript",
        "typescript",
        "java",
        "go",
        "rust",
        "cpp",
        "c",
    )
    return set(languages)
def get_language_by_file_extension(extension: Any) -> Optional[str]:
    """Get programming language by file extension.

    Args:
        extension: A bare extension (".py"), a file name or path string
            ("src/main.py", "./src/main.py"), or any object exposing a
            ``suffix`` attribute such as ``pathlib.Path``.

    Returns:
        The language code mapped to the extension, or None when the
        extension is missing or not mapped.
    """
    extension_map = {
        ".py": "python",
        ".js": "javascript",
        ".ts": "typescript",
        ".java": "java",
        ".go": "go",
        ".rs": "rust",
        ".cpp": "cpp",
        ".c": "c",
        ".pyw": "python",
        ".pyx": "python",
    }

    if hasattr(extension, "suffix"):
        # Path-like object: it already knows its own suffix.
        ext = extension.suffix.lower()
    else:
        # Only the final path component can carry the extension. Inspecting
        # the whole string would mistake "./src/main.py" for a bare extension
        # simply because it starts with a dot.
        name = str(extension).lower().replace("\\", "/").rsplit("/", 1)[-1]
        _, dot, tail = name.rpartition(".")
        ext = f".{tail}" if dot else ""

    return extension_map.get(ext)
def is_code_directory(path: str) -> bool:
    """Check if a path lies inside a conventional code directory.

    A path qualifies when one of its components is exactly one of the known
    code directory names. Whole components are compared (not substrings) so
    that e.g. "docs/mapping.md" is not accepted merely because "app" occurs
    inside "mapping".

    Args:
        path: Directory or file path; both "/" and "\\" separators are
            understood.

    Returns:
        True if any path component names a code directory, False otherwise.
    """
    code_dirs = {"src", "lib", "app", "components", "modules", "packages"}
    components = str(path).replace("\\", "/").split("/")
    return not code_dirs.isdisjoint(components)
# Directory names expected for each language, keyed by language code.
LANGUAGE_DIRECTORY_MAP = dict(
    python=["src", "tests", "examples"],
    javascript=["src", "lib", "packages"],
    typescript=["src", "lib", "packages"],
)
def get_exclude_patterns():
    """Return the patterns that language detection should skip.

    Returns:
        A new list of glob patterns and directory names on every call.
    """
    python_artifacts = ["*.pyc", "*.pyo", "__pycache__"]
    excluded_directories = [".git", "node_modules", ".venv"]
    return python_artifacts + excluded_directories
class LanguageValidator:
    """
    A comprehensive language validator for programming languages.

    This class provides language detection, validation, and project structure
    analysis capabilities. Detection is driven by ``EXTENSION_MAP`` first and
    falls back to the module-level ``get_language_by_file_extension`` helper.
    """

    # Extended file extension mapping for better language detection.
    # Order matters: the first language listing an extension wins detection
    # (e.g. ".h" is detected as "cpp" although "c" lists it as well).
    EXTENSION_MAP = {
        "python": [".py", ".pyw", ".pyx", ".pxd"],
        "javascript": [".js", ".jsx", ".mjs"],
        "typescript": [".ts", ".tsx", ".cts", ".mts"],
        "go": [".go"],
        "rust": [".rs"],
        "kotlin": [".kt", ".kts"],
        "ruby": [".rb"],
        "php": [".php", ".php3", ".php4", ".php5", ".phtml"],
        "java": [".java"],
        "csharp": [".cs"],
        "cpp": [".cpp", ".cxx", ".cc", ".c++", ".h", ".hpp"],
        "c": [".c", ".h"],
        "swift": [".swift"],
        "dart": [".dart"],
        "scala": [".scala"],
        "clojure": [".clj", ".cljs", ".cljc"],
        "haskell": [".hs", ".lhs"],
        "lua": [".lua"],
        "ocaml": [".ml", ".mli", ".mll", ".mly"],
        "elixir": [".ex", ".exs"],
        "bash": [".sh", ".bash"],
        "powershell": [".ps1", ".psm1", ".psd1"],
        "sql": [".sql"],
        "html": [".html", ".htm"],
        "css": [".css", ".scss", ".sass"],
        "json": [".json", ".json5"],
        "yaml": [".yaml", ".yml"],
        "toml": [".toml"],
        "xml": [".xml", ".xsl", ".xslt"],
        "markdown": [".md", ".markdown"],
        "dockerfile": ["dockerfile", "dockerfile.", "dockerfile.*"],
    }

    def __init__(
        self,
        supported_languages: Optional[List[str]] = None,
        auto_validate: bool = True,
    ):
        """
        Initialize the language validator.

        Args:
            supported_languages: List of supported language codes.
                If None, uses all available languages.
            auto_validate: Whether to automatically validate inputs and perform cleanup
        """
        self.auto_validate = auto_validate

        if supported_languages is None:
            # Use all languages from the existing language detection system
            self.supported_languages = set(get_all_supported_languages())
        else:
            # Normalise exactly like lookups do (strip + lowercase) so that
            # " Python " in the constructor matches "python" later on.
            self.supported_languages = {
                lang.strip().lower() for lang in supported_languages
            }

        # Placeholders for pattern caches; nothing in this class fills them.
        self._directory_patterns: Dict[str, Any] = {}
        self._exclude_patterns_cache: Dict[str, Any] = {}

        # Initialize analysis cache for statistics tracking
        self._analysis_cache: Dict[str, Any] = self._empty_analysis_cache()

    @staticmethod
    def _empty_analysis_cache() -> Dict[str, Any]:
        """Return a fresh, zeroed analysis cache."""
        return {
            "last_analysis_files": 0,
            "detected_extensions": [],
            "supported_languages_found": 0,
        }

    def _validate_and_normalize_input(
        self, value: Any, input_type: str
    ) -> Optional[Any]:
        """
        Validate and normalize input values.

        Args:
            value: Input value to validate
            input_type: Type of input ('language', 'file_path', 'list', 'dict')

        Returns:
            Normalized value or None if validation fails. Languages come back
            stripped and lowercased, file paths as resolved ``Path`` objects,
            lists and dicts unchanged. Empty values always yield None.
        """
        if input_type == "language":
            if not isinstance(value, str):
                return None
            # Whitespace-only codes are treated like empty ones.
            return value.strip().lower() or None

        if not value:
            return None

        if input_type == "file_path":
            if isinstance(value, (str, Path)):
                return Path(value).resolve()
            return None

        if input_type == "list":
            return value if isinstance(value, list) else None

        if input_type == "dict":
            # Required by the project configuration/structure validators;
            # without this branch they rejected every input.
            return value if isinstance(value, dict) else None

        return None

    def _normalize_language(self, language: Any) -> Optional[str]:
        """Normalize a language code according to the ``auto_validate`` mode.

        Returns None only in auto-validate mode for unusable input; in manual
        mode unusable input normalizes to the empty string.
        """
        if self.auto_validate:
            return self._validate_and_normalize_input(language, "language")
        return self.normalize_language_code(language)

    def _coerce_path(self, file_path: Any) -> Optional[Path]:
        """Turn a str/Path into a Path according to the ``auto_validate`` mode."""
        if self.auto_validate:
            return self._validate_and_normalize_input(file_path, "file_path")
        if isinstance(file_path, str):
            return Path(file_path)
        if isinstance(file_path, Path):
            return file_path
        return None

    def validate_language(self, language: str) -> bool:
        """
        Validate if a language is supported.

        Args:
            language: Language code to validate.

        Returns:
            True if language is supported, False otherwise.
        """
        normalized_lang = self._normalize_language(language)
        if normalized_lang is None:
            return False
        return normalized_lang in self.supported_languages

    def detect_language_from_extension(self, file_path: Any) -> Optional[str]:
        """
        Detect language from file extension using enhanced mapping.

        Args:
            file_path: File path as string or Path object.

        Returns:
            Detected language code or None if not recognized.
        """
        path_obj = self._coerce_path(file_path)
        if path_obj is None:
            return None

        # First try the enhanced mapping
        extension = path_obj.suffix.lower()
        for lang, extensions in self.EXTENSION_MAP.items():
            if extension in extensions:
                return lang

        # Fall back to existing system for backwards compatibility
        return get_language_by_file_extension(path_obj)

    def get_expected_directories(self, language: str) -> List[str]:
        """
        Get expected directory patterns for a language.

        Args:
            language: Language code.

        Returns:
            List of expected directory patterns, each with a trailing slash.
            Languages without an entry fall back to the Python directories.
        """
        normalized_lang = self._normalize_language(language)
        if normalized_lang is None:
            return []

        if normalized_lang not in LANGUAGE_DIRECTORY_MAP:
            # Return default Python directories as fallback
            normalized_lang = "python"

        directories = LANGUAGE_DIRECTORY_MAP.get(normalized_lang, [])
        # Add trailing slash for consistency with test expectations
        return [
            directory if directory.endswith("/") else f"{directory}/"
            for directory in directories
        ]

    def get_file_extensions(self, language: str) -> List[str]:
        """
        Get file extensions for a language.

        Args:
            language: Language code.

        Returns:
            New list of file extensions (including dot); empty when the
            language is unknown.
        """
        normalized_lang = self._normalize_language(language)
        if normalized_lang is None:
            return []

        # Copy so callers cannot mutate the shared class-level mapping.
        return list(self.EXTENSION_MAP.get(normalized_lang, []))

    def get_all_supported_extensions(self) -> Set[str]:
        """
        Get all supported file extensions.

        Returns:
            Set of all supported file extensions.
        """
        return set().union(*self.EXTENSION_MAP.values())

    def detect_language_from_filename(self, file_name: str) -> Optional[str]:
        """
        Detect language from filename (including special cases like Dockerfile).

        Args:
            file_name: Filename or full path.

        Returns:
            Detected language code or None if not recognized.
        """
        if not file_name:
            return None
        path_obj = self._coerce_path(file_name)
        if path_obj is None:
            return None

        # Extract filename from path if needed
        filename = path_obj.name.lower()

        # Check for special filenames
        if filename in ("dockerfile", "dockerfile.dev", "dockerfile.prod"):
            return "dockerfile"

        # Check for common build/config files
        config_patterns = {
            "makefile": "bash",
            "cmakelists.txt": "cpp",
            "pom.xml": "java",
            "build.gradle": "kotlin",
            "package.json": "javascript",
            "pyproject.toml": "python",
            "cargo.toml": "rust",
            "go.mod": "go",
            "requirements.txt": "python",
            "gemfile": "ruby",
        }
        if filename in config_patterns:
            return config_patterns[filename]

        # Otherwise fall back to normal extension-based detection
        return self.detect_language_from_extension(filename)

    def validate_file_extension(self, file_path: Any, language: str) -> bool:
        """
        Validate if a file has the correct extension for a language.

        Args:
            file_path: File path to validate.
            language: Expected language code.

        Returns:
            True if file extension matches language, False otherwise.
        """
        if language is None:
            # Any file is valid when no specific language is required
            return True

        normalized_lang = self._normalize_language(language)
        if normalized_lang is None:
            return False

        # Check the language's own extension list first: extensions shared by
        # several languages (".h" for both C and C++) must validate for each
        # of them, not only for the one detection happens to pick.
        path_obj = self._coerce_path(file_path)
        if path_obj is not None:
            own_extensions = self.EXTENSION_MAP.get(normalized_lang, [])
            if path_obj.suffix.lower() in own_extensions:
                return True

        return self.detect_language_from_extension(file_path) == normalized_lang

    def get_supported_languages(self) -> List[str]:
        """
        Get list of supported languages.

        Returns:
            Sorted list of supported language codes.
        """
        return sorted(self.supported_languages)

    def normalize_language_code(self, language: str) -> str:
        """
        Normalize language code to lowercase with stripped whitespace.

        Args:
            language: Raw language code.

        Returns:
            Normalized language code, or "" for empty or non-string input.
        """
        if not language or not isinstance(language, str):
            return ""

        return language.strip().lower()

    def validate_project_configuration(
        self, config: Dict[str, Any]
    ) -> Tuple[bool, List[str]]:
        """
        Validate project configuration for language support.

        Args:
            config: Project configuration dictionary.

        Returns:
            Tuple of (is_valid, issues) where is_valid is boolean and issues is list of strings.
            Validation stops at the first problem, so issues holds at most one entry.
        """
        if self.auto_validate:
            if self._validate_and_normalize_input(config, "dict") is None:
                return False, ["Invalid configuration format"]
        elif not isinstance(config, dict):
            return False, ["Invalid configuration format"]

        # Check if project section exists
        if "project" not in config:
            return False, ["Missing 'project' section in configuration"]

        project_config = config["project"]
        if not isinstance(project_config, dict):
            # A string or None here would otherwise raise or substring-match.
            return False, ["'project' section must be a dictionary"]

        # Check if language is specified
        if "language" not in project_config:
            return False, ["Missing 'language' field in project configuration"]

        # Validate the language
        project_language = project_config["language"]
        if not self.validate_language(project_language):
            return False, [f"Unsupported language: {project_language}"]

        # Check if name is specified
        if "name" not in project_config:
            return False, ["Missing 'name' field in project configuration"]

        # Check if name is valid (not empty)
        project_name = project_config["name"]
        if not project_name or not isinstance(project_name, str):
            return False, ["Project name must be a non-empty string"]

        # Additional validation for whitespace-only names
        if not project_name.strip():
            return False, ["Project name cannot be empty or contain only whitespace"]

        return True, []

    def validate_project_structure(
        self, project_files: Dict[str, bool], language: str
    ) -> Tuple[bool, List[str]]:
        """
        Validate project structure for a specific language.

        Args:
            project_files: Dictionary mapping file paths to boolean (is_source_file).
            language: Project language to validate against.

        Returns:
            Tuple of (is_valid, issues) where is_valid is boolean and issues is list of strings.
        """
        if self.auto_validate:
            validated_project_files = self._validate_and_normalize_input(
                project_files, "dict"
            )
            validated_language = self._validate_and_normalize_input(
                language, "language"
            )
            if validated_project_files is None or validated_language is None:
                return False, ["Invalid input format for project structure validation"]

        issues: List[str] = []
        expected_dirs = self.get_expected_directories(language)

        # Directories that contain at least one source file. as_posix() keeps
        # "/" separators on every platform so they compare against the
        # slash-terminated expected directories.
        source_dirs = {
            Path(file_path).parent.as_posix() + "/"
            for file_path, is_source in project_files.items()
            if is_source  # Only validate source files
        }

        # Check if expected directories exist and have files
        for expected_dir in expected_dirs:
            if expected_dir == "{package_name}/":
                # Placeholder entry, never reported as missing.
                continue
            if not any(
                actual_dir.startswith(expected_dir) for actual_dir in source_dirs
            ):
                issues.append(
                    f"No source files found in expected directory: {expected_dir}"
                )

        # Check for files in unexpected directories
        # Note: Using simplified check since is_code_directory signature changed
        for file_path, is_source in project_files.items():
            if is_source and not is_code_directory(str(file_path)):
                issues.append(f"Source file in unexpected location: {file_path}")

        return len(issues) == 0, issues

    def get_language_statistics(self, files: List[Any]) -> Dict[str, int]:
        """
        Get language statistics from a list of files.

        Also refreshes the analysis cache returned by ``get_analysis_cache``.

        Args:
            files: List of file paths.

        Returns:
            Dictionary mapping language codes to file counts.
        """
        if self.auto_validate:
            validated_files = self._validate_and_normalize_input(files, "list")
            if validated_files is None:
                return {}
        else:
            validated_files = files

        stats: Dict[str, int] = {}
        total_files = 0
        detected_extensions = set()

        for file_path in validated_files:
            if not file_path:  # Skip None and empty entries
                continue

            detected_lang = self.detect_language_from_extension(file_path)
            if detected_lang:
                stats[detected_lang] = stats.get(detected_lang, 0) + 1
            # NOTE(review): the extracted source lost its indentation; this
            # counts every non-empty entry, detected or not -- confirm.
            total_files += 1

            # Track detected extensions for analysis
            if hasattr(file_path, "suffix"):
                detected_extensions.add(file_path.suffix.lower())
            elif isinstance(file_path, str):
                detected_extensions.add(Path(file_path).suffix.lower())

        # Add analysis information (guarded for instances built without __init__)
        if hasattr(self, "_analysis_cache"):
            self._analysis_cache["last_analysis_files"] = total_files
            self._analysis_cache["detected_extensions"] = list(detected_extensions)
            self._analysis_cache["supported_languages_found"] = len(stats)

        return stats

    def get_analysis_cache(self) -> Dict[str, Any]:
        """
        Get the analysis cache with language detection statistics.

        Returns:
            Dictionary containing analysis statistics. List values are copied
            so callers cannot alter the validator's internal state.
        """
        return {
            key: list(value) if isinstance(value, list) else value
            for key, value in self._analysis_cache.items()
        }

    def clear_analysis_cache(self) -> None:
        """
        Clear the analysis cache.
        """
        self._analysis_cache = self._empty_analysis_cache()