# Coverage for src/dataknobs_bots/knowledge/retrieval/merger.py: 30% (84 statements)
# Generated by coverage.py v7.13.0 at 2025-12-16 10:13 -0700
"""Chunk merging utilities for RAG retrieval optimization.

This module provides functionality to merge adjacent chunks that share
the same heading path, improving context coherence for LLM consumption.
"""

from __future__ import annotations

from collections import defaultdict
from dataclasses import dataclass
from typing import Any
@dataclass
class MergerConfig:
    """Settings that control how retrieved chunks are merged.

    Attributes:
        max_merged_size: Upper bound, in characters, on the combined
            content of a single merged chunk.
        preserve_order: When True, chunks within a group are ordered by
            their position in the source document before merging.
    """

    max_merged_size: int = 2000
    preserve_order: bool = True
@dataclass
class MergedChunk:
    """A single coherent unit built by combining several related chunks.

    Attributes:
        text: Combined text content of all merged chunks.
        source: Source file path shared by the merged chunks.
        heading_path: Heading hierarchy common to the merged chunks.
        heading_display: Human-readable rendering of ``heading_path``.
        chunks: The original result dictionaries that were merged.
        avg_similarity: Mean similarity score across the merged chunks.
        content_length: Character count of ``text``.
    """

    text: str
    source: str
    heading_path: list[str]
    heading_display: str
    chunks: list[dict[str, Any]]
    avg_similarity: float
    content_length: int
class ChunkMerger:
    """Merges adjacent chunks sharing the same heading path.

    This merger groups search results by their heading path and source,
    then combines them into coherent context units while respecting
    size limits.

    Example:
        ```python
        merger = ChunkMerger(MergerConfig(max_merged_size=2000))
        results = await kb.query("How do I configure auth?", k=10)
        merged = merger.merge(results)

        for chunk in merged:
            print(f"[{chunk.avg_similarity:.2f}] {chunk.heading_display}")
            print(chunk.text)
        ```
    """

    def __init__(self, config: MergerConfig | None = None):
        """Initialize the chunk merger.

        Args:
            config: Merger configuration, uses defaults if not provided
        """
        self.config = config or MergerConfig()

    def merge(self, results: list[dict[str, Any]]) -> list[MergedChunk]:
        """Merge search results by shared heading path.

        Groups chunks by (source, heading_path) and merges those that
        share identical heading paths. Chunks are ordered by their
        position within the document.

        Args:
            results: Search results from RAGKnowledgeBase.query()
                Each result should have:
                - text: Chunk content
                - source: Source file
                - heading_path: Heading hierarchy string or list
                - similarity: Similarity score
                - metadata: Full chunk metadata

        Returns:
            List of MergedChunk objects, sorted by average similarity
            (descending). Empty input yields an empty list.
        """
        if not results:
            return []

        # Group chunks by (source, heading_path); the tuple form makes the
        # heading path hashable for use as a dict key.
        groups: dict[tuple[str, tuple[str, ...]], list[dict[str, Any]]] = defaultdict(list)

        for result in results:
            source = result.get("source", "")
            heading_path = self._normalize_heading_path(result)
            key = (source, tuple(heading_path))
            groups[key].append(result)

        # Merge each group independently, respecting the size limit.
        merged_chunks = []
        for (source, heading_path_tuple), chunks in groups.items():
            heading_path = list(heading_path_tuple)

            # Restore document order so merged text reads coherently.
            if self.config.preserve_order:
                chunks = self._sort_by_position(chunks)

            merged = self._merge_chunk_group(chunks, source, heading_path)
            merged_chunks.extend(merged)

        # Most relevant merged chunks first.
        merged_chunks.sort(key=lambda c: c.avg_similarity, reverse=True)

        return merged_chunks

    def _normalize_heading_path(self, result: dict[str, Any]) -> list[str]:
        """Extract and normalize the heading path from a search result.

        Accepts either a list of headings or a " > "-delimited string,
        whether it arrives in ``metadata["headings"]`` or in the
        top-level ``heading_path`` field.

        Args:
            result: Search result dictionary

        Returns:
            List of heading strings (empty when no heading info exists)
        """
        # Prefer metadata, which usually carries the structured list form.
        metadata = result.get("metadata", {})
        headings = metadata.get("headings", [])
        if headings:
            # Fix: metadata may also carry the string form. Previously a
            # string was returned unchanged and later exploded into single
            # characters by tuple() in merge(); split it like the
            # heading_path fallback below does.
            if isinstance(headings, str):
                return headings.split(" > ")
            # Copy so callers can't mutate the result's metadata list.
            return list(headings)

        # Fall back to the top-level heading_path field.
        heading_path = result.get("heading_path", "")
        if isinstance(heading_path, list):
            return heading_path
        if heading_path:
            return heading_path.split(" > ")

        return []

    def _sort_by_position(self, chunks: list[dict[str, Any]]) -> list[dict[str, Any]]:
        """Sort chunks by their position in the document.

        Args:
            chunks: List of chunk results

        Returns:
            New list sorted by chunk_index (falling back to line_number,
            then 0 when neither is present)
        """
        def get_position(chunk: dict[str, Any]) -> int:
            metadata = chunk.get("metadata", {})
            # Try chunk_index first, then line_number.
            return metadata.get("chunk_index", metadata.get("line_number", 0))

        return sorted(chunks, key=get_position)

    def _merge_chunk_group(
        self,
        chunks: list[dict[str, Any]],
        source: str,
        heading_path: list[str],
    ) -> list[MergedChunk]:
        """Merge a group of chunks with the same heading path.

        Combines chunks until max_merged_size is reached, then starts
        a new merged chunk. Overflow chunks are returned as separate
        merged chunks. A single chunk larger than the limit still
        produces one merged chunk (it is never dropped).

        Args:
            chunks: Chunks to merge
            source: Source file path
            heading_path: Shared heading path

        Returns:
            List of merged chunks
        """
        if not chunks:
            return []

        merged_results = []
        current_chunks: list[dict[str, Any]] = []
        current_size = 0

        for chunk in chunks:
            chunk_text = chunk.get("text", "")
            chunk_size = len(chunk_text)

            # Flush the accumulator when this chunk would overflow it.
            # The `current_chunks` guard ensures an oversized first chunk
            # is still accepted rather than looping forever on an empty
            # accumulator.
            if current_size + chunk_size > self.config.max_merged_size and current_chunks:
                merged_results.append(
                    self._create_merged_chunk(current_chunks, source, heading_path)
                )
                current_chunks = []
                current_size = 0

            current_chunks.append(chunk)
            current_size += chunk_size

        # Don't forget the last accumulated group.
        if current_chunks:
            merged_results.append(
                self._create_merged_chunk(current_chunks, source, heading_path)
            )

        return merged_results

    def _create_merged_chunk(
        self,
        chunks: list[dict[str, Any]],
        source: str,
        heading_path: list[str],
    ) -> MergedChunk:
        """Create a MergedChunk from a list of chunks.

        Args:
            chunks: Chunks to combine
            source: Source file path
            heading_path: Shared heading path

        Returns:
            MergedChunk object
        """
        # Combine text with double newline separator; skip empty chunks.
        texts = [chunk.get("text", "") for chunk in chunks]
        combined_text = "\n\n".join(text.strip() for text in texts if text.strip())

        # Average similarity across the merged chunks (0.0 for empty input).
        similarities = [chunk.get("similarity", 0.0) for chunk in chunks]
        avg_similarity = sum(similarities) / len(similarities) if similarities else 0.0

        # Human-readable heading path, e.g. "Guide > Auth > Tokens".
        heading_display = " > ".join(heading_path) if heading_path else ""

        return MergedChunk(
            text=combined_text,
            source=source,
            # Fix: copy the list — previously every MergedChunk produced
            # from the same group shared one mutable list, so mutating one
            # chunk's heading_path silently changed its siblings'.
            heading_path=list(heading_path),
            heading_display=heading_display,
            chunks=chunks,
            avg_similarity=avg_similarity,
            content_length=len(combined_text),
        )

    def to_result_list(self, merged_chunks: list[MergedChunk]) -> list[dict[str, Any]]:
        """Convert merged chunks back to result list format.

        Useful for compatibility with existing code that expects
        the standard result format.

        Args:
            merged_chunks: List of merged chunks

        Returns:
            List of result dictionaries
        """
        results = []
        for merged in merged_chunks:
            results.append({
                "text": merged.text,
                "source": merged.source,
                "heading_path": merged.heading_display,
                "similarity": merged.avg_similarity,
                "metadata": {
                    "headings": merged.heading_path,
                    "content_length": merged.content_length,
                    "merged_count": len(merged.chunks),
                },
            })
        return results