Coverage for src/pullapprove/matches.py: 65%
191 statements
« prev ^ index » next coverage.py v7.8.2, created at 2025-06-11 12:55 -0500
« prev ^ index » next coverage.py v7.8.2, created at 2025-06-11 12:55 -0500
1import hashlib
2import json
3from collections.abc import Iterator
4from pathlib import Path
6import click
7from pydantic import BaseModel, ConfigDict, Field, model_validator
9from .config import (
10 ConfigModel,
11 ConfigModels,
12 LargeScaleChangeModel,
13 OwnershipChoices,
14 ScopeModel,
15)
16from .diff import DiffCode, DiffFile, iterate_diff_parts
17from .exceptions import LargeScaleChangeException
def match_path(*, path: Path, config: ConfigModel):
    """
    Match a single path against the scopes of a config.

    Returns a ``(path_match, code_scopes)`` tuple: the ScopePathMatch for the
    path (carrying every matching scope that has no ``code`` setting), and the
    list of matching scopes that do have ``code`` set, which the caller is
    expected to check against the file contents.
    """
    result = ScopePathMatch(path=str(path), scopes=[])
    needs_code_check = []

    for candidate in config.scopes:
        if not candidate.matches_path(path):
            continue

        if candidate.code:
            # Code scopes are not attached to the path itself
            needs_code_check.append(candidate)
        else:
            # Set the scope on the path itself
            result.add_scope(candidate)

    return result, needs_code_check
def match_code(*, path: str, code: str, scopes: list[ScopeModel], line_offset: int = 0):
    """
    Yield one ScopeCodeMatch per distinct location matched in ``code``.

    Every scope is asked for its matches in ``code``; matches that land on the
    same location (same ``location_id``) are merged into a single result that
    carries all of the scopes. ``line_offset`` is added to the start and end
    line of each match, the columns are used as reported by the scope.
    """
    by_location = {}

    for scope in scopes:
        for hit in scope.matches_code(code):
            candidate = ScopeCodeMatch(
                path=path,
                start_line=line_offset + hit["start_line"],
                end_line=line_offset + hit["end_line"],
                start_column=hit["start_col"],
                end_column=hit["end_col"],
                scopes=[scope.name],
                # Left empty so the model computes it from the location
                location_id="",
            )
            candidate._scopes = [scope]

            existing = by_location.get(candidate.location_id)
            if existing is None:
                by_location[candidate.location_id] = candidate
            else:
                # Already seen this location, so just add the scope to it
                existing.add_scope(scope)

    yield from by_location.values()
def match_files(configs: ConfigModels, files: Iterator):
    """
    Match a set of files on disk against the configs.

    For every entry in ``files`` the closest config is compiled and the path
    is matched against its scopes. If any matching scope also has a code
    pattern, the file is read and its contents are matched as well.

    Returns a ChangeMatches built from all of the path and code matches.
    """

    def _iterate():
        for f in files:
            file_path = Path(f)

            config = configs.compile_closest_config(file_path)

            path_match, code_scopes = match_path(
                path=file_path,
                config=config,
            )

            # Yield the paths first
            yield path_match

            # Then look at the contents to find scopes that match code
            if code_scopes:
                # Decode explicitly as UTF-8 so the result does not depend on
                # the locale of the machine running the check.
                code = file_path.read_text(encoding="utf-8")

                yield from match_code(
                    path=str(file_path),
                    code=code,
                    scopes=code_scopes,
                )

    return ChangeMatches.from_config_matches(configs, _iterate())
def iterate_diff(configs: ConfigModels, diff: Iterator | str):
    """
    Walk a diff and yield ``(diff_obj, matches)`` pairs.

    For a DiffFile the matches are a one-item list holding its ScopePathMatch.
    For a DiffCode the matches are the ScopeCodeMatch objects whose line range
    covers that line (an empty list if the file has no code scopes).

    Code lines of a file that has code scopes are buffered and only yielded
    once the next file header (or the end of the diff) is reached, because the
    buffered lines are matched together as one chunk.
    """
    # We can still iterate a diff without configs, just by yielding the diff objs
    if not configs:
        for part in iterate_diff_parts(diff):
            yield part, []

        return

    # TODO get root config here, check diff size as we go and raise exception?
    # or we need to keep track per LSC? should be a compiled value...

    def _flush(path, pending, scopes):
        # We're passing the entire diff chunk to see if there's a match inside,
        # but if there is, it probably won't match EVERY line in the chunk
        chunk = "\n".join(part.raw() for part in pending)
        # NOTE(review): line numbers are derived from the first buffered line
        # plus the position in the buffer, which presumably assumes the
        # buffered lines are contiguous - confirm against DiffCode.
        offset = pending[0].line_number - 1

        found = list(
            match_code(
                path=path,
                code=chunk,
                scopes=scopes,
                line_offset=offset,
            )
        )

        for line_number, diff_code in enumerate(pending, start=offset + 1):
            covering = [
                hit for hit in found if hit.start_line <= line_number <= hit.end_line
            ]
            yield diff_code, covering

    # Keep track of these as we go and jump between file header
    # and raw code during iteration
    active_scopes = []
    active_path = None
    pending = []

    for part in iterate_diff_parts(diff):
        if isinstance(part, DiffFile):
            # Yield a code chunk if we finished one
            if pending:
                yield from _flush(active_path, pending, active_scopes)

            pending = []

            new_file = Path(part.new_path)
            path_match, active_scopes = match_path(
                path=new_file,
                config=configs.compile_closest_config(new_file),
            )
            active_path = str(new_file)

            yield part, [path_match]
        elif isinstance(part, DiffCode):
            if active_scopes:
                # It will be yielded later
                pending.append(part)
            else:
                # Skip all code lines if we don't care about code
                yield part, []

    # Yield the last code chunk we saw
    if pending:
        yield from _flush(active_path, pending, active_scopes)
def match_diff(
    configs: ConfigModels, diff: Iterator | str
) -> tuple["ChangeMatches", list[str]]:
    """
    Match a diff against the configs.

    Returns the ChangeMatches for the diff together with the list of config
    paths that the diff touches (as either the old or the new path of a file).

    If a LargeScaleChangeException is raised while matching, the result is
    built from the large scale change carried by the exception, falling back
    to the default large scale change of the configs.
    """
    touched_configs = set()

    def _matches():
        for part, found in iterate_diff(configs, diff):
            if isinstance(part, DiffFile):
                for candidate in (part.new_path, part.old_path):
                    if candidate in configs:
                        touched_configs.add(candidate)

            yield from found

    try:
        # The generator is fully consumed here, which is what fills
        # touched_configs before it is turned into a list below.
        result = ChangeMatches.from_config_matches(configs, _matches())
    except LargeScaleChangeException as exc:
        # Use the root large scale change config if the exception has none
        lsc = exc.large_scale_change or configs.get_default_large_scale_change()

        result = ChangeMatches.from_large_scale_change(
            configs=configs,
            large_scale_change=lsc,
        )

    return result, list(touched_configs)
class ChangeMatches(BaseModel):
    """
    The matches for a given diff or set of files.

    This knows nothing about a pull request (branches, commits, etc.)
    """

    model_config = ConfigDict(extra="forbid")

    # Instead we could do
    # - scopes
    # - config
    # - paths
    # - code
    # could add points, reviewers, etc to this
    # but then we're mixing concerns... looking at raw files will just have empty values?

    # Three modes are:
    # - raw files
    # - raw diff
    # - pull request (has reviews)

    configs: dict[str, ConfigModel] = {}

    # The matching LSC, if there is one.
    large_scale_change: LargeScaleChangeModel | None = None

    # All scopes found in the results
    scopes: dict[str, ScopeModel] = {}

    # All evaluated paths
    paths: dict[str, "ScopePathMatch"] = {}

    # All code matches
    code: dict[str, "ScopeCodeMatch"] = {}

    def as_dict(self):
        """Return the model dumped to a plain dict."""
        return self.model_dump()

    def __bool__(self):
        """The result is truthy only if at least one scope matched."""
        return bool(self.scopes)

    @classmethod
    def from_config_matches(cls, configs: ConfigModels, matches):
        """
        Build the result from an iterable of path and code matches.

        Every scope seen on any match is collected by name. Path matches are
        stored by path, but only if they have scopes; code matches are stored
        by location id.

        Raises ValueError for a match that is neither a ScopePathMatch nor a
        ScopeCodeMatch.
        """
        scopes = {}
        paths = {}
        code = {}

        for match in matches:
            # Store seen scopes as we go from all matches
            for scope in match._scopes:
                scopes[scope.name] = scope

            if isinstance(match, ScopePathMatch):
                if not match._scopes:
                    # Right now we don't care about storing anything that doesn't have scopes.
                    # This prevents an unnecessarily huge dump on big repos or PRs.
                    continue

                paths[match.path] = match

            elif isinstance(match, ScopeCodeMatch):
                code_location_id = match.location_id

                # Store it in the code results
                code[code_location_id] = match

                # Associate it with any path results
                # if code_location_id not in paths[match.path].code:
                #     paths[match.path].code.append(code_location_id)

            else:
                raise ValueError(f"Unknown match type: {match}")

        return cls(
            large_scale_change=None,
            scopes=scopes,
            paths=paths,
            code=code,
            # Should this be compiled configs? At this point they may be modified (branches, author, etc.)
            configs=configs.get_config_models(),
        )

    @classmethod
    def from_large_scale_change(
        cls,
        configs: ConfigModels,
        large_scale_change: LargeScaleChangeModel,
    ):
        """Build a result that only carries the given large scale change."""
        return cls(
            configs=configs.get_config_models(),
            large_scale_change=large_scale_change,
            scopes={},
            paths={},
            code={},
        )

    def get_scope_display(self, scope_name):
        """
        Return the styled (colored) printed name of a scope.

        The color is picked from the characters of the scope name, so a scope
        always gets the same color. Scopes with global ownership are dimmed.

        Raises KeyError if ``scope_name`` is not one of the matched scopes.
        """
        scope = self.scopes[scope_name]

        def _color_for_name(name):
            colors = [
                "bright_blue",
                "bright_green",
                "bright_yellow",
                "bright_magenta",
                "bright_cyan",
            ]
            index = sum(ord(c) for c in name) % len(colors)
            return colors[index]

        dim = scope.ownership == OwnershipChoices.GLOBAL

        return click.style(
            scope.printed_name(), fg=_color_for_name(scope.name), dim=dim
        )

    def print(self, *, by="scope"):
        """
        Print the matches, grouped either by "scope" or by "path".

        Raises ValueError for any other value of ``by``.
        """
        # NOTE(review): only entries of self.paths are printed, so a code match
        # whose path is not in self.paths is never shown - confirm this is intended.

        def _print_path_match(path, path_match):
            click.echo(path, nl=False)

            for scope in path_match.scopes:
                click.echo(" ", nl=False)
                click.echo(self.get_scope_display(scope), nl=False)

            click.echo()

            # Find code matches for this path
            for code_match in self.code.values():
                if code_match.path == path:
                    click.echo(" ", nl=False)
                    click.echo(code_match.printed_location(), nl=False)

                    for scope in code_match.scopes:
                        click.echo(" ", nl=False)
                        click.echo(self.get_scope_display(scope), nl=False)
                    click.echo()

        if by == "scope":
            for scope in self.scopes:
                click.secho(scope, bold=True)
                for path, path_match in self.paths.items():
                    if scope in path_match.scopes:
                        _print_path_match(path, path_match)
                # Blank line between scopes; click.echo keeps the output
                # handling consistent with the rest of this method.
                click.echo()
        elif by == "path":
            for path, path_match in self.paths.items():
                if path_match.scopes:  # Only print those with scopes...?
                    _print_path_match(path, path_match)
        else:
            raise ValueError(f"Unknown by: {by}")
class ScopePathMatch(BaseModel):
    """The scopes that matched a single path."""

    model_config = ConfigDict(extra="forbid")

    path: str = Field(min_length=1)
    scopes: list[str]  # Field(min_length=1)
    # code: list[str] = []

    # Full scope models behind the names in `scopes`, kept internally during processing
    _scopes: list[ScopeModel] = []

    def add_scope(self, scope: ScopeModel):
        """
        Add a scope to this match and refresh the list of scope names.

        A scope without special ownership rules replaces any such scope added
        earlier, so only one primary scope is left in the end. Scopes with
        ownership rules are always kept.
        """
        is_primary = not scope.ownership
        if is_primary:
            # Drop the previous primary scope(s), keep the ones with ownership rules
            self._scopes = list(filter(lambda kept: kept.ownership, self._scopes))

        self._scopes.append(scope)

        self.scopes = [kept.name for kept in self._scopes]
class ScopeCodeMatch(BaseModel):
    """
    The scopes that matched a location (line/column range) inside a file.

    ``location_id`` identifies the location. If it is passed in empty, it is
    computed from the path and the line/column range.
    """

    model_config = ConfigDict(extra="forbid")

    # In a diff match, we could see both sides of the diff, i.e. repeated lines if the before and after both match...
    path: str = Field(min_length=1)
    start_line: int
    end_line: int
    start_column: int
    end_column: int
    scopes: list[str]  # Field(min_length=1)
    location_id: str

    # Store this internally during processing (full reference of scope models)
    _scopes: list[ScopeModel] = []

    def printed_location(self):
        """Return a short human-readable description of the location."""
        if self.start_line == self.end_line:
            return f"Ln {self.start_line}, Col {self.start_column}-{self.end_column}"
        else:
            return f"Ln {self.start_line}-{self.end_line}"

    def add_scope(self, scope: ScopeModel):
        """
        Add a scope to this match and refresh the list of scope names.

        A scope without special ownership rules replaces any such scope added
        earlier; scopes with ownership rules are always kept.
        """
        if not scope.ownership:
            # Remove any other scopes that don't have special ownership rules
            # (i.e. we only want one primary scope in the end)
            self._scopes = [s for s in self._scopes if s.ownership]

        self._scopes.append(scope)

        self.scopes = [s.name for s in self._scopes]

    @model_validator(mode="after")
    def compute_location_id(self):
        """Fill in ``location_id`` from the location if the caller left it empty."""
        # only compute if the caller didn't provide one
        if not self.location_id:
            loc = {
                "path": self.path,
                "start_line": self.start_line,
                "end_line": self.end_line,
                "start_column": self.start_column,
                "end_column": self.end_column,
            }
            # Sorted keys and fixed separators keep the serialized form (and
            # so the id) stable for the same location.
            raw = json.dumps(loc, sort_keys=True, separators=(",", ":")).encode()
            # The digest is only an identifier, not a security measure. Saying
            # so keeps this working on Python builds that restrict md5
            # (e.g. FIPS mode); the resulting digest is unchanged.
            self.location_id = hashlib.md5(raw, usedforsecurity=False).hexdigest()
        return self
# Open question: how should we store what was reviewed? Ideally this could be fine-grained at some point,
# so we would need to know who reviewed, which scopes, which paths, and which code matches (by location hash), so that everything can be cross-referenced.