Coverage for src/moai_adk/core/language_validator.py: 0.00%

209 statements  

« prev     ^ index     » next       coverage.py v7.12.0, created at 2025-11-20 20:52 +0900

1""" 

2Language Validator 

3 

4Provides comprehensive language validation capabilities for programming languages. 

5 

6""" 

7 

from pathlib import Path
from typing import Any, Dict, List, Optional, Set, Tuple

10 

11 

# The external language-detector functionality was removed because of a missing dependency.
# A simplified, self-contained language detection is used for now.

def get_all_supported_languages() -> Set[str]:
    """Return the language codes known to the simplified detector.

    Returns:
        A new set of lowercase language codes on every call, so callers may
        mutate the result freely.
    """
    languages = (
        "python",
        "javascript",
        "typescript",
        "java",
        "go",
        "rust",
        "cpp",
        "c",
    )
    return set(languages)

17 

def get_language_by_file_extension(extension: Any) -> Optional[str]:
    """Map a file extension, file name or path to a language code.

    Args:
        extension: One of
            * an object exposing a ``suffix`` attribute (e.g. ``pathlib.Path``),
            * a bare extension such as ``".py"``,
            * a file name or path such as ``"main.py"`` or ``"./src/main.py"``.

    Returns:
        The lowercase language code, or ``None`` when the extension is not
        recognised (including names without any extension).
    """
    suffix_to_language = {
        ".py": "python",
        ".js": "javascript",
        ".ts": "typescript",
        ".java": "java",
        ".go": "go",
        ".rs": "rust",
        ".cpp": "cpp",
        ".c": "c",
        ".pyw": "python",
        ".pyx": "python",
    }

    if hasattr(extension, "suffix"):
        # Path-like object: trust its own notion of the suffix.
        ext = str(extension.suffix).lower()
    else:
        text = str(extension).strip().lower()
        # Only a single leading dot with no separators is a bare extension.
        # Anything else starting with "." ("./src/a.py", "../a.py",
        # ".hidden.py") is a path and must go through suffix extraction.
        is_bare_extension = (
            text.startswith(".")
            and text.count(".") == 1
            and "/" not in text
            and "\\" not in text
        )
        ext = text if is_bare_extension else Path(text).suffix

    return suffix_to_language.get(ext)

48 

def is_code_directory(path: str) -> bool:
    """Tell whether a path goes through a conventional source-code directory.

    The path is split into components and each component is compared as a
    whole, so ``"application.log"`` or ``"docs/library_notes.md"`` do not
    match merely because they contain ``"app"`` or ``"lib"`` as a substring.

    Args:
        path: File or directory path; both ``/`` and ``\\`` separators are
            accepted.

    Returns:
        True if any path component is exactly one of the known code
        directory names, False otherwise (including for an empty path).
    """
    code_dirs = {"src", "lib", "app", "components", "modules", "packages"}
    # Normalise Windows separators so the same split works everywhere.
    components = str(path).replace("\\", "/").split("/")
    return not code_dirs.isdisjoint(components)

53 

# Directories (relative to the project root, without trailing slash) in which
# source files are expected for each language. Each language owns its own list
# object so that mutating one entry never affects another.
LANGUAGE_DIRECTORY_MAP = {
    "python": ["src", "tests", "examples"],
    "javascript": ["src", "lib", "packages"],
    "typescript": ["src", "lib", "packages"],
}

59 

def get_exclude_patterns() -> List[str]:
    """Return the name patterns that language detection should skip.

    Returns:
        A new list on every call containing glob-style patterns for compiled
        Python files and the names of cache, VCS, dependency and virtualenv
        directories.
    """
    compiled_python = ["*.pyc", "*.pyo"]
    ignored_directories = ["__pycache__", ".git", "node_modules", ".venv"]
    return compiled_python + ignored_directories

63 

64 

class LanguageValidator:
    """Validate language codes, file extensions and project layouts.

    The validator combines an extended extension table (``EXTENSION_MAP``)
    with the module-level helpers ``get_all_supported_languages``,
    ``get_language_by_file_extension``, ``is_code_directory`` and
    ``LANGUAGE_DIRECTORY_MAP``.

    When ``auto_validate`` is enabled, every public method first runs its
    arguments through ``_validate_and_normalize_input`` and returns a neutral
    "invalid" result (``False``, ``None``, ``[]`` or ``{}``) instead of raising.
    """

    # Extended file extension mapping for better language detection.
    # Order matters: when an extension is listed for several languages
    # (".h" for both "cpp" and "c"), detection returns the first language.
    EXTENSION_MAP = {
        "python": [".py", ".pyw", ".pyx", ".pxd"],
        "javascript": [".js", ".jsx", ".mjs"],
        "typescript": [".ts", ".tsx", ".cts", ".mts"],
        "go": [".go"],
        "rust": [".rs"],
        "kotlin": [".kt", ".kts"],
        "ruby": [".rb"],
        "php": [".php", ".php3", ".php4", ".php5", ".phtml"],
        "java": [".java"],
        "csharp": [".cs"],
        "cpp": [".cpp", ".cxx", ".cc", ".c++", ".h", ".hpp"],
        "c": [".c", ".h"],
        "swift": [".swift"],
        "dart": [".dart"],
        "scala": [".scala"],
        "clojure": [".clj", ".cljs", ".cljc"],
        "haskell": [".hs", ".lhs"],
        "lua": [".lua"],
        "ocaml": [".ml", ".mli", ".mll", ".mly"],
        "elixir": [".ex", ".exs"],
        "bash": [".sh", ".bash"],
        "powershell": [".ps1", ".psm1", ".psd1"],
        "sql": [".sql"],
        "html": [".html", ".htm"],
        "css": [".css", ".scss", ".sass"],
        "json": [".json", ".json5"],
        "yaml": [".yaml", ".yml"],
        "toml": [".toml"],
        "xml": [".xml", ".xsl", ".xslt"],
        "markdown": [".md", ".markdown"],
        # These are file-name patterns, not suffixes; Dockerfiles are
        # recognised by detect_language_from_filename().
        "dockerfile": ["dockerfile", "dockerfile.", "dockerfile.*"],
    }

    def __init__(
        self,
        supported_languages: Optional[List[str]] = None,
        auto_validate: bool = True,
    ):
        """
        Initialize the language validator.

        Args:
            supported_languages: List of supported language codes.
                If None, uses all languages reported by
                ``get_all_supported_languages()``. Codes are stored stripped
                and lowercased, matching how queried codes are normalized.
            auto_validate: Whether to automatically validate inputs and
                return neutral results for invalid ones.
        """
        self.auto_validate = auto_validate

        if supported_languages is None:
            self.supported_languages = set(get_all_supported_languages())
        else:
            self.supported_languages = {
                lang.strip().lower() for lang in supported_languages
            }

        # Reserved caches; nothing in this class populates them yet.
        self._directory_patterns: Dict[str, Any] = {}
        self._exclude_patterns_cache: Dict[str, Any] = {}

        # Statistics of the most recent get_language_statistics() run.
        self._analysis_cache: Dict[str, Any] = self._new_analysis_cache()

    @staticmethod
    def _new_analysis_cache() -> Dict[str, Any]:
        """Return an empty analysis-cache dictionary."""
        return {
            "last_analysis_files": 0,
            "detected_extensions": [],
            "supported_languages_found": 0,
        }

    def _validate_and_normalize_input(
        self, value: Any, input_type: str
    ) -> Optional[Any]:
        """
        Validate and normalize input values.

        Args:
            value: Input value to validate.
            input_type: One of 'language', 'file_path', 'list' or 'dict'.

        Returns:
            Normalized value or None if validation fails. Languages are
            stripped and lowercased, file paths become resolved ``Path``
            objects, lists and dicts are returned unchanged. Empty values and
            unknown ``input_type`` names yield None.
        """
        if input_type == "language":
            if not isinstance(value, str):
                return None
            return value.strip().lower() if value else None

        if not value:
            return None

        if input_type == "file_path":
            if not isinstance(value, (str, Path)):
                return None
            try:
                return Path(value).resolve()
            except (OSError, ValueError, RuntimeError):
                # Unresolvable path (e.g. embedded NUL, symlink loop).
                return None

        if input_type == "list":
            return value if isinstance(value, list) else None

        if input_type == "dict":
            return value if isinstance(value, dict) else None

        return None

    def _resolve_language(self, language: Any) -> Optional[str]:
        """Normalize a language code according to the auto_validate mode.

        Returns None only in auto-validate mode for invalid input; otherwise
        the normalized code (possibly an empty string).
        """
        if self.auto_validate:
            return self._validate_and_normalize_input(language, "language")
        return self.normalize_language_code(language)

    def _coerce_path(self, file_path: Any) -> Optional[Path]:
        """Turn a str/Path into a Path, or None for unsupported input."""
        if self.auto_validate:
            return self._validate_and_normalize_input(file_path, "file_path")
        if isinstance(file_path, (str, Path)):
            return Path(file_path)
        return None

    def validate_language(self, language: str) -> bool:
        """
        Validate if a language is supported.

        Args:
            language: Language code to validate (case and surrounding
                whitespace are ignored).

        Returns:
            True if language is supported, False otherwise.
        """
        normalized_lang = self._resolve_language(language)
        if normalized_lang is None:
            return False
        return normalized_lang in self.supported_languages

    def detect_language_from_extension(self, file_path: Any) -> Optional[str]:
        """
        Detect language from file extension using enhanced mapping.

        Args:
            file_path: File path as string or Path object.

        Returns:
            Detected language code or None if not recognized.
        """
        path_obj = self._coerce_path(file_path)
        if path_obj is None:
            return None

        # First try the enhanced mapping; the first language listing the
        # extension wins.
        extension = path_obj.suffix.lower()
        for lang, extensions in self.EXTENSION_MAP.items():
            if extension in extensions:
                return lang

        # Fall back to the module-level detector for backwards compatibility.
        return get_language_by_file_extension(path_obj)

    def get_expected_directories(self, language: str) -> List[str]:
        """
        Get expected directory patterns for a language.

        Args:
            language: Language code.

        Returns:
            List of expected directories, each with a trailing slash.
            Languages without an entry in ``LANGUAGE_DIRECTORY_MAP`` fall back
            to the Python directories; invalid input yields an empty list in
            auto-validate mode.
        """
        normalized_lang = self._resolve_language(language)
        if normalized_lang is None:
            return []

        directories = LANGUAGE_DIRECTORY_MAP.get(normalized_lang)
        if directories is None:
            directories = LANGUAGE_DIRECTORY_MAP.get("python", [])

        return [
            name if name.endswith("/") else f"{name}/" for name in directories
        ]

    def get_file_extensions(self, language: str) -> List[str]:
        """
        Get file extensions for a language.

        Args:
            language: Language code.

        Returns:
            A new list of file extensions (including dot); empty when the
            language is unknown or invalid.
        """
        normalized_lang = self._resolve_language(language)
        if normalized_lang is None:
            return []

        # Copy so callers cannot mutate the shared class-level table.
        return list(self.EXTENSION_MAP.get(normalized_lang, []))

    def get_all_supported_extensions(self) -> Set[str]:
        """
        Get all supported file extensions.

        Returns:
            Set of every entry listed in ``EXTENSION_MAP``.
        """
        return set().union(*self.EXTENSION_MAP.values())

    def detect_language_from_filename(self, file_name: str) -> Optional[str]:
        """
        Detect language from filename (including special cases like Dockerfile).

        Args:
            file_name: Filename or full path.

        Returns:
            Detected language code or None if not recognized.
        """
        path_obj = self._coerce_path(file_name)
        if path_obj is None:
            return None

        filename = path_obj.name.lower()

        # "Dockerfile" and any "Dockerfile.<variant>" (dev, prod, ...).
        if filename == "dockerfile" or filename.startswith("dockerfile."):
            return "dockerfile"

        # Well-known build/config files identify the project language.
        config_patterns = {
            "makefile": "bash",
            "cmakelists.txt": "cpp",
            "pom.xml": "java",
            "build.gradle": "kotlin",
            "package.json": "javascript",
            "pyproject.toml": "python",
            "cargo.toml": "rust",
            "go.mod": "go",
            "requirements.txt": "python",
            "gemfile": "ruby",
        }
        if filename in config_patterns:
            return config_patterns[filename]

        # Otherwise rely on the extension.
        return self.detect_language_from_extension(path_obj)

    def validate_file_extension(
        self, file_path: Any, language: Optional[str]
    ) -> bool:
        """
        Validate if a file has the correct extension for a language.

        Args:
            file_path: File path to validate.
            language: Expected language code, or None to accept any file.

        Returns:
            True if the file extension belongs to the language, False
            otherwise.
        """
        if language is None:
            # Any file is valid when no specific language is required.
            return True

        normalized_lang = self._resolve_language(language)
        if normalized_lang is None:
            return False

        path_obj = self._coerce_path(file_path)
        if path_obj is None:
            return False

        # Check the language's own extension list first: an extension shared
        # between languages (".h") is valid for each of them, even though
        # detection can only report one.
        extension = path_obj.suffix.lower()
        if extension and extension in self.EXTENSION_MAP.get(normalized_lang, ()):
            return True

        return self.detect_language_from_extension(file_path) == normalized_lang

    def get_supported_languages(self) -> List[str]:
        """
        Get list of supported languages.

        Returns:
            Sorted list of supported language codes.
        """
        return sorted(self.supported_languages)

    def normalize_language_code(self, language: str) -> str:
        """
        Normalize language code to lowercase with stripped whitespace.

        Args:
            language: Raw language code.

        Returns:
            Normalized language code; an empty string for empty or
            non-string input.
        """
        if not language or not isinstance(language, str):
            return ""

        return language.strip().lower()

    def validate_project_configuration(
        self, config: Dict[str, Any]
    ) -> Tuple[bool, List[str]]:
        """
        Validate project configuration for language support.

        Args:
            config: Project configuration dictionary with a ``project``
                section holding ``language`` and ``name``.

        Returns:
            Tuple of (is_valid, issues) where is_valid is boolean and issues
            is list of strings. Validation stops at the first problem found.
        """
        if self.auto_validate:
            if self._validate_and_normalize_input(config, "dict") is None:
                return False, ["Invalid configuration format"]
        elif not isinstance(config, dict):
            return False, ["Invalid configuration format"]

        issues: List[str] = []

        if "project" not in config:
            issues.append("Missing 'project' section in configuration")
            return False, issues

        project_config = config["project"]
        if not isinstance(project_config, dict):
            issues.append("'project' section must be a dictionary")
            return False, issues

        if "language" not in project_config:
            issues.append("Missing 'language' field in project configuration")
            return False, issues

        project_language = project_config["language"]
        if not self.validate_language(project_language):
            issues.append(f"Unsupported language: {project_language}")
            return False, issues

        if "name" not in project_config:
            issues.append("Missing 'name' field in project configuration")
            return False, issues

        project_name = project_config["name"]
        if not project_name or not isinstance(project_name, str):
            issues.append("Project name must be a non-empty string")
            return False, issues

        # A whitespace-only name is truthy, so it needs its own check.
        if not project_name.strip():
            issues.append("Project name cannot be empty or contain only whitespace")
            return False, issues

        return True, issues

    def validate_project_structure(
        self, project_files: Dict[str, bool], language: str
    ) -> Tuple[bool, List[str]]:
        """
        Validate project structure for a specific language.

        Args:
            project_files: Dictionary mapping file paths (relative to the
                project root) to boolean (is_source_file).
            language: Project language to validate against.

        Returns:
            Tuple of (is_valid, issues) where is_valid is boolean and issues
            is list of strings. An issue is reported for every expected
            directory without source files and for every source file that is
            neither under an expected directory nor in a code directory.
        """
        invalid_input = (
            False,
            ["Invalid input format for project structure validation"],
        )
        if self.auto_validate:
            files_ok = (
                self._validate_and_normalize_input(project_files, "dict") is not None
            )
            language_ok = (
                self._validate_and_normalize_input(language, "language") is not None
            )
            if not (files_ok and language_ok):
                return invalid_input
        elif not isinstance(project_files, dict):
            return invalid_input

        issues: List[str] = []
        expected_dirs = self.get_expected_directories(language)

        # (original key, POSIX-style parent directory with trailing slash)
        # for every source file; as_posix() keeps prefix matching working
        # with Windows separators.
        sources = [
            (file_path, Path(file_path).parent.as_posix() + "/")
            for file_path, is_source in project_files.items()
            if is_source
        ]
        populated_dirs = {directory for _, directory in sources}

        # Every expected directory should contain at least one source file.
        for expected_dir in expected_dirs:
            if expected_dir == "{package_name}/":
                continue  # unresolved template placeholder, cannot be checked
            if not any(d.startswith(expected_dir) for d in populated_dirs):
                issues.append(
                    f"No source files found in expected directory: {expected_dir}"
                )

        # Source files must live under an expected directory or a generic
        # code directory.
        for file_path, directory in sources:
            in_expected_dir = any(directory.startswith(d) for d in expected_dirs)
            if not in_expected_dir and not is_code_directory(str(file_path)):
                issues.append(f"Source file in unexpected location: {file_path}")

        return len(issues) == 0, issues

    def get_language_statistics(self, files: List[Any]) -> Dict[str, int]:
        """
        Get language statistics from a list of files.

        Also refreshes the analysis cache with the number of recognised
        files, their (sorted) extensions and the number of languages found.

        Args:
            files: List of file paths (str or Path); falsy entries and
                entries with unrecognised extensions are ignored.

        Returns:
            Dictionary mapping language codes to file counts.
        """
        if self.auto_validate:
            validated_files = self._validate_and_normalize_input(files, "list")
            if validated_files is None:
                return {}
        else:
            validated_files = files if files is not None else []

        stats: Dict[str, int] = {}
        detected_extensions: Set[str] = set()

        for file_path in validated_files:
            if not file_path:
                continue
            detected_lang = self.detect_language_from_extension(file_path)
            if not detected_lang:
                continue
            stats[detected_lang] = stats.get(detected_lang, 0) + 1
            # Detection succeeded, so file_path is a str or Path here.
            detected_extensions.add(Path(file_path).suffix.lower())

        cache = getattr(self, "_analysis_cache", None)
        if cache is not None:
            cache["last_analysis_files"] = sum(stats.values())
            # Sorted for a deterministic, reproducible report.
            cache["detected_extensions"] = sorted(detected_extensions)
            cache["supported_languages_found"] = len(stats)

        return stats

    def get_analysis_cache(self) -> Dict[str, Any]:
        """
        Get the analysis cache with language detection statistics.

        Returns:
            A copy of the analysis statistics; modifying it (including its
            extension list) does not affect the validator.
        """
        snapshot = self._analysis_cache.copy()
        if "detected_extensions" in snapshot:
            snapshot["detected_extensions"] = list(snapshot["detected_extensions"])
        return snapshot

    def clear_analysis_cache(self) -> None:
        """
        Clear the analysis cache.
        """
        self._analysis_cache = self._new_analysis_cache()