Coverage for src / dataknobs_bots / knowledge / retrieval / merger.py: 30%

84 statements  

« prev     ^ index     » next       coverage.py v7.13.0, created at 2025-12-16 10:13 -0700

1"""Chunk merging utilities for RAG retrieval optimization. 

2 

3This module provides functionality to merge adjacent chunks that share 

4the same heading path, improving context coherence for LLM consumption. 

5""" 

6 

7from __future__ import annotations 

8 

9from collections import defaultdict 

10from dataclasses import dataclass 

11from typing import Any 

12 

13 

@dataclass
class MergerConfig:
    """Settings that control how retrieved chunks are merged.

    Attributes:
        max_merged_size: Upper bound, in characters, on a merged chunk's
            combined text before a new merged chunk is started
        preserve_order: Sort grouped chunks by their document position
            before merging so combined text reads in document order
    """

    max_merged_size: int = 2000
    preserve_order: bool = True

25 

26 

@dataclass
class MergedChunk:
    """The result of combining several related chunks into one unit.

    Attributes:
        text: Combined text content of all merged chunks
        source: Source file path shared by the merged chunks
        heading_path: Heading hierarchy shared by the merged chunks
        heading_display: Human-readable rendering of ``heading_path``
        chunks: The original result dicts that were merged
        avg_similarity: Mean similarity score across the merged chunks
        content_length: Length of ``text`` in characters
    """

    text: str
    source: str
    heading_path: list[str]
    heading_display: str
    chunks: list[dict[str, Any]]
    avg_similarity: float
    content_length: int

48 

49 

class ChunkMerger:
    """Merges adjacent chunks sharing the same heading path.

    This merger groups search results by their heading path and source,
    then combines them into coherent context units while respecting
    size limits.

    Example:
        ```python
        merger = ChunkMerger(MergerConfig(max_merged_size=2000))
        results = await kb.query("How do I configure auth?", k=10)
        merged = merger.merge(results)

        for chunk in merged:
            print(f"[{chunk.avg_similarity:.2f}] {chunk.heading_display}")
            print(chunk.text)
        ```
    """

    def __init__(self, config: MergerConfig | None = None):
        """Initialize the chunk merger.

        Args:
            config: Merger configuration, uses defaults if not provided
        """
        self.config = config or MergerConfig()

    def merge(self, results: list[dict[str, Any]]) -> list[MergedChunk]:
        """Merge search results by shared heading path.

        Groups chunks by (source, heading_path) and merges those that
        share identical heading paths. Chunks are ordered by their
        position within the document.

        Args:
            results: Search results from RAGKnowledgeBase.query()
                Each result should have:
                - text: Chunk content
                - source: Source file
                - heading_path: Heading hierarchy string or list
                - similarity: Similarity score
                - metadata: Full chunk metadata

        Returns:
            List of MergedChunk objects, sorted by average similarity
            (descending)
        """
        if not results:
            return []

        # Group chunks by (source, heading_path). The heading path is
        # converted to a tuple so the pair is hashable as a dict key.
        groups: dict[tuple[str, tuple[str, ...]], list[dict[str, Any]]] = defaultdict(list)

        for result in results:
            source = result.get("source", "")
            heading_path = self._normalize_heading_path(result)
            key = (source, tuple(heading_path))
            groups[key].append(result)

        # Merge each group, respecting the configured size limit.
        merged_chunks: list[MergedChunk] = []
        for (source, heading_path_tuple), chunks in groups.items():
            heading_path = list(heading_path_tuple)

            # Sort by position if available so merged text reads in
            # document order.
            if self.config.preserve_order:
                chunks = self._sort_by_position(chunks)

            merged = self._merge_chunk_group(chunks, source, heading_path)
            merged_chunks.extend(merged)

        # Most relevant merged chunks first.
        merged_chunks.sort(key=lambda c: c.avg_similarity, reverse=True)

        return merged_chunks

    def _normalize_heading_path(self, result: dict[str, Any]) -> list[str]:
        """Extract and normalize heading path from result.

        Args:
            result: Search result dictionary

        Returns:
            List of heading strings (possibly empty)
        """
        # Prefer metadata, which may carry the headings in list form.
        metadata = result.get("metadata", {})
        headings = metadata.get("headings", [])
        if headings:
            # FIX: metadata may store headings as a "A > B" display
            # string rather than a list; returning it unsplit would make
            # the caller's tuple() produce a tuple of characters and
            # silently break grouping. Normalize both representations.
            if isinstance(headings, str):
                return headings.split(" > ")
            # Defensive copy so callers cannot mutate shared metadata.
            return list(headings)

        # Fall back to the top-level heading_path field, which may be a
        # list or a " > "-joined display string.
        heading_path = result.get("heading_path", "")
        if isinstance(heading_path, list):
            return list(heading_path)
        elif heading_path:
            return heading_path.split(" > ")

        return []

    def _sort_by_position(self, chunks: list[dict[str, Any]]) -> list[dict[str, Any]]:
        """Sort chunks by their position in the document.

        Args:
            chunks: List of chunk results

        Returns:
            Sorted list (stable sort; chunks without position info sort
            as position 0)
        """
        def get_position(chunk: dict[str, Any]) -> int:
            metadata = chunk.get("metadata", {})
            # Try chunk_index first, then line_number. FIX: an explicit
            # None value for chunk_index previously leaked through
            # dict.get() and made sorted() raise TypeError when compared
            # against ints; treat None the same as "missing".
            position = metadata.get("chunk_index")
            if position is None:
                position = metadata.get("line_number")
            return 0 if position is None else position

        return sorted(chunks, key=get_position)

    def _merge_chunk_group(
        self,
        chunks: list[dict[str, Any]],
        source: str,
        heading_path: list[str],
    ) -> list[MergedChunk]:
        """Merge a group of chunks with the same heading path.

        Combines chunks until max_merged_size is reached, then starts
        a new merged chunk. Overflow chunks are returned as separate
        merged chunks. A single chunk larger than the limit is still
        emitted on its own rather than dropped.

        Args:
            chunks: Chunks to merge
            source: Source file path
            heading_path: Shared heading path

        Returns:
            List of merged chunks
        """
        if not chunks:
            return []

        merged_results: list[MergedChunk] = []
        current_chunks: list[dict[str, Any]] = []
        current_size = 0

        for chunk in chunks:
            chunk_text = chunk.get("text", "")
            chunk_size = len(chunk_text)

            # Flush the current batch if adding this chunk would exceed
            # the limit (never flush an empty batch, so an oversized
            # single chunk still gets emitted).
            if current_size + chunk_size > self.config.max_merged_size and current_chunks:
                merged_results.append(
                    self._create_merged_chunk(current_chunks, source, heading_path)
                )
                current_chunks = []
                current_size = 0

            current_chunks.append(chunk)
            current_size += chunk_size

        # Don't forget the trailing batch.
        if current_chunks:
            merged_results.append(
                self._create_merged_chunk(current_chunks, source, heading_path)
            )

        return merged_results

    def _create_merged_chunk(
        self,
        chunks: list[dict[str, Any]],
        source: str,
        heading_path: list[str],
    ) -> MergedChunk:
        """Create a MergedChunk from a list of chunks.

        Args:
            chunks: Chunks to combine
            source: Source file path
            heading_path: Shared heading path

        Returns:
            MergedChunk object
        """
        # Combine text with a blank-line separator, skipping chunks that
        # are empty after stripping.
        texts = [chunk.get("text", "") for chunk in chunks]
        combined_text = "\n\n".join(text.strip() for text in texts if text.strip())

        # Average similarity across the merged chunks (0.0 when absent).
        similarities = [chunk.get("similarity", 0.0) for chunk in chunks]
        avg_similarity = sum(similarities) / len(similarities) if similarities else 0.0

        # Human-readable heading trail, e.g. "Guide > Auth".
        heading_display = " > ".join(heading_path) if heading_path else ""

        return MergedChunk(
            text=combined_text,
            source=source,
            heading_path=heading_path,
            heading_display=heading_display,
            chunks=chunks,
            avg_similarity=avg_similarity,
            content_length=len(combined_text),
        )

    def to_result_list(self, merged_chunks: list[MergedChunk]) -> list[dict[str, Any]]:
        """Convert merged chunks back to result list format.

        Useful for compatibility with existing code that expects
        the standard result format.

        Args:
            merged_chunks: List of merged chunks

        Returns:
            List of result dictionaries with text/source/heading_path/
            similarity/metadata keys
        """
        results = []
        for merged in merged_chunks:
            results.append({
                "text": merged.text,
                "source": merged.source,
                "heading_path": merged.heading_display,
                "similarity": merged.avg_similarity,
                "metadata": {
                    "headings": merged.heading_path,
                    "content_length": merged.content_length,
                    "merged_count": len(merged.chunks),
                },
            })
        return results