Coverage for src/pullapprove/matches.py: 65%

191 statements  

« prev     ^ index     » next       coverage.py v7.8.2, created at 2025-06-11 12:55 -0500

1import hashlib 

2import json 

3from collections.abc import Iterator 

4from pathlib import Path 

5 

6import click 

7from pydantic import BaseModel, ConfigDict, Field, model_validator 

8 

9from .config import ( 

10 ConfigModel, 

11 ConfigModels, 

12 LargeScaleChangeModel, 

13 OwnershipChoices, 

14 ScopeModel, 

15) 

16from .diff import DiffCode, DiffFile, iterate_diff_parts 

17from .exceptions import LargeScaleChangeException 

18 

19 

def match_path(*, path: Path, config: ConfigModel):
    """
    Split the scopes of `config` that match `path` into path-level and code-level.

    Returns a `(path_match, code_scopes)` tuple: `path_match` is a
    `ScopePathMatch` carrying every matching scope whose `code` is falsy,
    and `code_scopes` is the list of matching scopes whose `code` is truthy
    (those still need to be checked against file contents by the caller).
    """
    path_match = ScopePathMatch(path=str(path), scopes=[])
    code_scopes = []

    for candidate in config.scopes:
        if not candidate.matches_path(path):
            continue

        if candidate.code:
            # Needs a content check later; not attached to the path itself
            code_scopes.append(candidate)
        else:
            # Applies to the whole path
            path_match.add_scope(candidate)

    return path_match, code_scopes

34 

35 

def match_code(*, path: str, code: str, scopes: list[ScopeModel], line_offset: int = 0):
    """
    Yield one `ScopeCodeMatch` per distinct location matched in `code`.

    Each scope's `matches_code(code)` results are turned into a
    `ScopeCodeMatch` whose start/end lines are shifted by `line_offset`.
    When several scopes match the same location (same computed
    `location_id`), they are merged onto a single match via `add_scope`.
    """
    matches_by_location = {}

    for scope in scopes:
        for found in scope.matches_code(code):
            candidate = ScopeCodeMatch(
                path=path,
                start_line=line_offset + found["start_line"],
                end_line=line_offset + found["end_line"],
                start_column=found["start_col"],
                end_column=found["end_col"],
                scopes=[scope.name],
                # Left empty so the model computes it from the location fields
                location_id="",
            )
            candidate._scopes = [scope]

            existing = matches_by_location.get(candidate.location_id)
            if existing is None:
                matches_by_location[candidate.location_id] = candidate
            else:
                # Same location already seen: just attach this scope to it
                existing.add_scope(scope)

    yield from matches_by_location.values()

59 

60 

def match_files(configs: ConfigModels, files: Iterator):
    """
    Match an iterable of file paths against the configs.

    For every file, the closest compiled config is used to find path-level
    scopes; if any code-level scopes match the path, the file is read and its
    contents are matched as well. The results are collected into a
    `ChangeMatches` via `ChangeMatches.from_config_matches`.
    """

    def _iterate():
        for entry in files:
            target = Path(entry)
            closest_config = configs.compile_closest_config(target)
            path_match, code_scopes = match_path(path=target, config=closest_config)

            # The path-level result always comes first
            yield path_match

            if not code_scopes:
                continue

            # Only read the file when some scope needs to look at its contents.
            # NOTE(review): read_text() uses the platform default encoding and
            # raises on undecodable content -- confirm that is acceptable here.
            contents = target.read_text()
            yield from match_code(
                path=str(target),
                code=contents,
                scopes=code_scopes,
            )

    return ChangeMatches.from_config_matches(configs, _iterate())

87 

88 

def iterate_diff(configs: ConfigModels, diff: Iterator | str):
    """
    Walk a diff and yield `(diff_obj, matches)` pairs.

    - Without configs, every diff object is yielded with an empty match list.
    - A `DiffFile` is yielded immediately with `[path_match]`.
    - A `DiffCode` is yielded immediately with `[]` when the current file has
      no code scopes; otherwise it is buffered and yielded once the next
      `DiffFile` arrives (or the diff ends), paired with the code matches
      whose line range covers that line.
    """
    # We can still iterate a diff without configs, just by yielding the diff objects
    if not configs:
        for part in iterate_diff_parts(diff):
            yield part, []
        return

    # State carried across iterations as we alternate between file headers
    # and raw code lines
    active_code_scopes = False
    active_path = None
    pending_lines = []

    # TODO get root config here, check diff size as we go and raise exception?
    # or we need to keep track per LSC? should be a compiled value...

    def _flush_pending():
        # The whole buffered chunk is matched at once; a match inside it will
        # usually cover only some of the chunk's lines.
        chunk_text = "\n".join(entry.raw() for entry in pending_lines)
        first_line = pending_lines[0].line_number
        found = list(
            match_code(
                path=active_path,
                code=chunk_text,
                scopes=active_code_scopes,
                line_offset=first_line - 1,
            )
        )

        # NOTE(review): this numbering assumes the buffered lines are
        # consecutive starting at the first one's line_number -- confirm for
        # diffs with several hunks in one file.
        for line_number, entry in enumerate(pending_lines, start=first_line):
            covering = [
                code_match
                for code_match in found
                if code_match.start_line <= line_number <= code_match.end_line
            ]
            yield entry, covering

    for part in iterate_diff_parts(diff):
        if isinstance(part, DiffFile):
            # A new file starts: emit whatever code chunk we had buffered
            if pending_lines:
                yield from _flush_pending()

            active_path = None
            pending_lines = []

            file_path = Path(part.new_path)
            closest_config = configs.compile_closest_config(file_path)
            path_match, code_scopes = match_path(path=file_path, config=closest_config)

            active_path = str(file_path)
            active_code_scopes = code_scopes

            yield part, [path_match]
            continue

        if not isinstance(part, DiffCode):
            continue

        if active_code_scopes:
            # Buffered; it is yielded by the next flush
            pending_lines.append(part)
        else:
            # No code scopes for this file, so code lines carry no matches
            yield part, []

    # Emit the final buffered chunk, if any
    if pending_lines:
        yield from _flush_pending()

164 

165 

def match_diff(
    configs: ConfigModels, diff: Iterator | str
) -> tuple["ChangeMatches", list[str]]:
    """
    Match a diff against the configs.

    Returns a `(change_matches, config_paths_modified)` tuple, where
    `config_paths_modified` lists the old/new paths of diffed files that are
    contained in `configs`. The list is sorted so the result is deterministic
    (it is collected in a set, whose iteration order is arbitrary).

    If matching raises `LargeScaleChangeException`, the result is built with
    `ChangeMatches.from_large_scale_change` instead, using the exception's
    `large_scale_change` or, when that is falsy, the default from `configs`.
    In that case `config_paths_modified` holds only what was seen before the
    exception was raised.
    """
    config_paths_modified = set()

    def iterate():
        for diff_obj, matches in iterate_diff(configs, diff):
            if isinstance(diff_obj, DiffFile):
                # Either side of a rename/edit may be a config file
                for candidate in (diff_obj.new_path, diff_obj.old_path):
                    if candidate in configs:
                        config_paths_modified.add(candidate)

            yield from matches

    try:
        # from_config_matches consumes the generator, so the exception (if
        # any) surfaces here
        change_matches = ChangeMatches.from_config_matches(configs, iterate())
    except LargeScaleChangeException as e:
        if e.large_scale_change:
            lsc = e.large_scale_change
        else:
            # Get the root large scale change config
            lsc = configs.get_default_large_scale_change()

        change_matches = ChangeMatches.from_large_scale_change(
            configs=configs,
            large_scale_change=lsc,
        )

    return change_matches, sorted(config_paths_modified)

195 

196 

class ChangeMatches(BaseModel):
    """
    The matches for a given diff or set of files.

    This knows nothing about a pull request (branches, commits, etc.)
    """

    model_config = ConfigDict(extra="forbid")

    # Instead we could do
    # - scopes
    # - config
    # - paths
    # - code
    # could add points, reviewers, etc to this
    # but then we're mixing concerns... looking at raw files will just have empty values?

    # Three modes are:
    # - raw files
    # - raw diff
    # - pull request (has reviews)

    configs: dict[str, ConfigModel] = {}

    # The matching LSC, if there is one.
    large_scale_change: LargeScaleChangeModel | None = None

    # All scopes found in the results
    scopes: dict[str, ScopeModel] = {}

    # All evaluated paths
    paths: dict[str, "ScopePathMatch"] = {}

    # All code matches
    code: dict[str, "ScopeCodeMatch"] = {}

    def as_dict(self):
        """Return the model as a plain dict (pydantic `model_dump`)."""
        return self.model_dump()

    def __bool__(self):
        """Truthy only when at least one scope was matched."""
        return bool(self.scopes)

    @classmethod
    def from_config_matches(cls, configs: ConfigModels, matches):
        """
        Build a result from an iterable of `ScopePathMatch` / `ScopeCodeMatch`.

        Scopes are collected from every match. Path matches without scopes
        are dropped; code matches are keyed by their `location_id`.

        Raises:
            ValueError: if an item is neither a path match nor a code match.
        """
        scopes = {}
        paths = {}
        code = {}

        for match in matches:
            # Store seen scopes as we go from all matches
            for scope in match._scopes:
                scopes[scope.name] = scope

            if isinstance(match, ScopePathMatch):
                if not match._scopes:
                    # Right now we don't care about storing anything that doesn't have scopes.
                    # This prevents an unnecessarily huge dump on big repos or PRs.
                    continue

                paths[match.path] = match

            elif isinstance(match, ScopeCodeMatch):
                code_location_id = match.location_id

                # Store it in the code results
                code[code_location_id] = match

                # Associate it with any path results
                # if code_location_id not in paths[match.path].code:
                #     paths[match.path].code.append(code_location_id)

            else:
                raise ValueError(f"Unknown match type: {match}")

        return cls(
            large_scale_change=None,
            scopes=scopes,
            paths=paths,
            code=code,
            # Should this be compiled configs? At this point they may be modified (branches, author, etc.)
            configs=configs.get_config_models(),
        )

    @classmethod
    def from_large_scale_change(
        cls,
        configs: ConfigModels,
        large_scale_change: LargeScaleChangeModel,
    ):
        """Build a result that carries only the large scale change (no scopes, paths or code)."""
        return cls(
            configs=configs.get_config_models(),
            large_scale_change=large_scale_change,
            scopes={},
            paths={},
            code={},
        )

    def get_scope_display(self, scope_name):
        """
        Return the styled (colored) printed name of the scope called `scope_name`.

        The color is picked deterministically from the scope name, and the
        text is dimmed for scopes with global ownership.

        Raises:
            KeyError: if `scope_name` is not one of the matched scopes.
        """
        scope = self.scopes[scope_name]

        def _color_for_name(name):
            colors = (
                "bright_blue",
                "bright_green",
                "bright_yellow",
                "bright_magenta",
                "bright_cyan",
            )
            # Stable per-name choice: sum of code points modulo palette size
            index = sum(ord(c) for c in name) % len(colors)
            return colors[index]

        dim = scope.ownership == OwnershipChoices.GLOBAL

        return click.style(
            scope.printed_name(), fg=_color_for_name(scope.name), dim=dim
        )

    def print(self, *, by="scope"):
        """
        Print the matched paths (and their code matches) with click.

        `by="scope"` prints one section per scope, listing the paths that
        have that scope; `by="path"` prints each path with scopes once.

        Raises:
            ValueError: if `by` is neither "scope" nor "path".
        """
        # Group code matches by path once, instead of rescanning every code
        # match for every printed path. Per-path order is unchanged.
        code_by_path = {}
        for code_match in self.code.values():
            code_by_path.setdefault(code_match.path, []).append(code_match)

        def _print_path_match(path, path_match):
            click.echo(path, nl=False)

            for scope in path_match.scopes:
                click.echo(" ", nl=False)
                click.echo(self.get_scope_display(scope), nl=False)

            click.echo()

            # Code matches for this path
            for code_match in code_by_path.get(path, ()):
                click.echo(" ", nl=False)
                click.echo(code_match.printed_location(), nl=False)

                for scope in code_match.scopes:
                    click.echo(" ", nl=False)
                    click.echo(self.get_scope_display(scope), nl=False)
                click.echo()

        if by == "scope":
            for scope in self.scopes:
                click.secho(scope, bold=True)
                for path, path_match in self.paths.items():
                    if scope in path_match.scopes:
                        _print_path_match(path, path_match)
                # Blank separator line; click.echo keeps all output on one API
                click.echo()
        elif by == "path":
            for path, path_match in self.paths.items():
                if path_match.scopes:  # Only print those with scopes...?
                    _print_path_match(path, path_match)
        else:
            raise ValueError(f"Unknown by: {by}")

348 

349 

class ScopePathMatch(BaseModel):
    """A path together with the names of the scopes that apply to it."""

    model_config = ConfigDict(extra="forbid")

    path: str = Field(min_length=1)
    scopes: list[str]  # Field(min_length=1)
    # code: list[str] = []

    # Full scope models, kept privately while processing; `scopes` above
    # mirrors their names
    _scopes: list[ScopeModel] = []

    def add_scope(self, scope: ScopeModel):
        """
        Attach `scope` to this match and refresh the `scopes` name list.

        A scope without `ownership` replaces any previously added scopes that
        also lack `ownership`, so at most one such primary scope remains.
        """
        if scope.ownership:
            self._scopes.append(scope)
        else:
            # Keep only scopes with special ownership rules, then add the
            # new primary scope
            self._scopes = [*(s for s in self._scopes if s.ownership), scope]

        self.scopes = [s.name for s in self._scopes]

369 

370 

class ScopeCodeMatch(BaseModel):
    """
    A matched code location (path, line and column range) and its scope names.

    `location_id` identifies the location; when it is passed empty, it is
    computed from the path and line/column range after validation.
    """

    model_config = ConfigDict(extra="forbid")

    # In a diff match, we could see both sides of the diff, i.e. repeated lines if the before and after both match...
    path: str = Field(min_length=1)
    start_line: int
    end_line: int
    start_column: int
    end_column: int
    scopes: list[str]  # Field(min_length=1)
    location_id: str

    # Store this internally during processing (full reference of scope models)
    _scopes: list[ScopeModel] = []

    def printed_location(self):
        """Return a short human-readable location ("Ln X, Col A-B" or "Ln X-Y")."""
        if self.start_line == self.end_line:
            return f"Ln {self.start_line}, Col {self.start_column}-{self.end_column}"
        else:
            return f"Ln {self.start_line}-{self.end_line}"

    def add_scope(self, scope: ScopeModel):
        """
        Attach `scope` to this match and refresh the `scopes` name list.

        A scope without `ownership` replaces any previously added scopes that
        also lack `ownership`, so at most one such primary scope remains.
        """
        if not scope.ownership:
            # Remove any other scopes that don't have special ownership rules
            # (i.e. we only want one primary scope in the end)
            self._scopes = [s for s in self._scopes if s.ownership]

        self._scopes.append(scope)

        self.scopes = [s.name for s in self._scopes]

    @model_validator(mode="after")
    def compute_location_id(self):
        """Fill in `location_id` from the location fields when it was not provided."""
        # only compute if the caller didn't provide one
        if not self.location_id:
            loc = {
                "path": self.path,
                "start_line": self.start_line,
                "end_line": self.end_line,
                "start_column": self.start_column,
                "end_column": self.end_column,
            }
            raw = json.dumps(loc, sort_keys=True, separators=(",", ":")).encode()
            # The digest is only an identifier, not a security measure;
            # saying so keeps md5 usable on FIPS-restricted builds. The
            # resulting hex digest is unchanged.
            self.location_id = hashlib.md5(raw, usedforsecurity=False).hexdigest()
        return self

416 

417 

# How should we store what was reviewed? Ideally this could become fine-grained at some point,
# so we need to know who reviewed, which scopes, which paths, and which code locations (location hash); then we can cross-reference everything.