Coverage for src/dataknobs_bots/knowledge/rag.py: 13%

166 statements  

« prev     ^ index     » next       coverage.py v7.13.0, created at 2025-12-16 10:13 -0700

1"""RAG (Retrieval-Augmented Generation) knowledge base implementation.""" 

2 

3import types 

4from pathlib import Path 

5from typing import Any 

6 

7from dataknobs_xization import ( 

8 ChunkQualityConfig, 

9 ContentTransformer, 

10 HeadingInclusion, 

11 chunk_markdown_tree, 

12 parse_markdown, 

13) 

14from dataknobs_bots.knowledge.retrieval import ( 

15 ChunkMerger, 

16 ContextFormatter, 

17 FormatterConfig, 

18 MergerConfig, 

19) 

20 

21 

22class RAGKnowledgeBase: 

23 """RAG knowledge base using dataknobs-xization for chunking and vector search. 

24 

25 This implementation: 

26 - Parses markdown documents using dataknobs-xization 

27 - Chunks documents intelligently based on structure 

28 - Stores chunks with embeddings in vector store 

29 - Provides semantic search for relevant context 

30 

31 Attributes: 

32 vector_store: Vector store backend from dataknobs_data 

33 embedding_provider: LLM provider for generating embeddings 

34 chunking_config: Configuration for document chunking 

35 """ 

36 

def __init__(
    self,
    vector_store: Any,
    embedding_provider: Any,
    chunking_config: dict[str, Any] | None = None,
    merger_config: MergerConfig | None = None,
    formatter_config: FormatterConfig | None = None,
):
    """Set up the knowledge base around a vector store and an embedder.

    Args:
        vector_store: Vector store backend instance.
        embedding_provider: LLM provider exposing an ``embed()`` method.
        chunking_config: Chunking options. ``None`` or an empty dict
            selects the built-in defaults. Recognised keys:
            - max_chunk_size: Maximum chunk size in characters
            - chunk_overlap: Overlap between chunks
            - combine_under_heading: Combine text under same heading
            - quality_filter: ChunkQualityConfig for filtering
            - generate_embeddings: Whether to generate enriched embedding text
        merger_config: Optional configuration for chunk merging.
        formatter_config: Optional configuration for context formatting.
    """
    self.vector_store = vector_store
    self.embedding_provider = embedding_provider

    # A falsy config (None or {}) is replaced wholesale by these defaults.
    if chunking_config:
        self.chunking_config = chunking_config
    else:
        self.chunking_config = {
            "max_chunk_size": 500,
            "chunk_overlap": 50,
            "combine_under_heading": True,
        }

    # Forward a config only when one was supplied; otherwise the helper
    # classes are constructed with no arguments.
    if merger_config:
        self.merger = ChunkMerger(merger_config)
    else:
        self.merger = ChunkMerger()

    if formatter_config:
        self.formatter = ContextFormatter(formatter_config)
    else:
        self.formatter = ContextFormatter()

70 

@classmethod
async def from_config(cls, config: dict[str, Any]) -> "RAGKnowledgeBase":
    """Create RAG knowledge base from configuration.

    Args:
        config: Configuration dictionary with:
            - vector_store: Vector store configuration (required); passed
              as keyword arguments to ``VectorStoreFactory.create``
            - embedding_provider: LLM provider name (default "openai")
            - embedding_model: Model for embeddings
              (default "text-embedding-ada-002")
            - chunking: Optional chunking configuration
            - merger: Optional keyword arguments for ``MergerConfig``
            - formatter: Optional keyword arguments for ``FormatterConfig``
            - documents_path: Optional path to load documents from
            - document_pattern: Optional glob pattern for documents
              (default "**/*.md")

    Returns:
        Configured RAGKnowledgeBase instance. When ``documents_path`` is
        given, the matching documents have already been loaded; files
        that failed to load are reported through a logged warning.

    Raises:
        KeyError: If ``config`` has no "vector_store" entry.

    Example:
        ```python
        config = {
            "vector_store": {
                "backend": "faiss",
                "dimensions": 1536,
                "collection": "docs"
            },
            "embedding_provider": "openai",
            "embedding_model": "text-embedding-3-small",
            "chunking": {"max_chunk_size": 500, "chunk_overlap": 50},
            "documents_path": "./docs"
        }
        kb = await RAGKnowledgeBase.from_config(config)
        ```
    """
    import logging

    from dataknobs_data.vector.stores import VectorStoreFactory
    from dataknobs_llm.llm import LLMProviderFactory

    # Create and initialize the vector store
    vs_config = config["vector_store"]
    factory = VectorStoreFactory()
    vector_store = factory.create(**vs_config)
    await vector_store.initialize()

    # Create and initialize the embedding provider
    llm_factory = LLMProviderFactory(is_async=True)
    embedding_provider = llm_factory.create(
        {
            "provider": config.get("embedding_provider", "openai"),
            "model": config.get("embedding_model", "text-embedding-ada-002"),
        }
    )
    await embedding_provider.initialize()

    # Optional merger / formatter settings
    merger_config = None
    if "merger" in config:
        merger_config = MergerConfig(**config["merger"])

    formatter_config = None
    if "formatter" in config:
        formatter_config = FormatterConfig(**config["formatter"])

    kb = cls(
        vector_store=vector_store,
        embedding_provider=embedding_provider,
        chunking_config=config.get("chunking", {}),
        merger_config=merger_config,
        formatter_config=formatter_config,
    )

    # Load documents if a path was provided
    if "documents_path" in config:
        load_results = await kb.load_documents_from_directory(
            config["documents_path"], config.get("document_pattern", "**/*.md")
        )
        # The directory loader collects per-file failures instead of
        # raising; log them so a partially loaded knowledge base does not
        # go unnoticed. Construction itself stays best-effort.
        failures = (
            load_results.get("errors", []) if isinstance(load_results, dict) else []
        )
        if failures:
            logger = logging.getLogger(__name__)
            for failure in failures:
                logger.warning("Failed to load document: %s", failure)

    return kb

151 

152 async def load_markdown_document( 

153 self, filepath: str | Path, metadata: dict[str, Any] | None = None 

154 ) -> int: 

155 """Load and chunk a markdown document. 

156 

157 Args: 

158 filepath: Path to markdown file 

159 metadata: Optional metadata to attach to all chunks 

160 

161 Returns: 

162 Number of chunks created 

163 

164 Example: 

165 ```python 

166 num_chunks = await kb.load_markdown_document( 

167 "docs/api.md", 

168 metadata={"category": "api", "version": "1.0"} 

169 ) 

170 ``` 

171 """ 

172 import numpy as np 

173 

174 # Read document 

175 filepath = Path(filepath) 

176 with open(filepath, encoding="utf-8") as f: 

177 markdown_text = f.read() 

178 

179 # Parse markdown 

180 tree = parse_markdown(markdown_text) 

181 

182 # Build quality filter config if specified 

183 quality_filter = None 

184 if "quality_filter" in self.chunking_config: 

185 qf_config = self.chunking_config["quality_filter"] 

186 if isinstance(qf_config, ChunkQualityConfig): 

187 quality_filter = qf_config 

188 elif isinstance(qf_config, dict): 

189 quality_filter = ChunkQualityConfig(**qf_config) 

190 

191 # Chunk the document with enhanced options 

192 chunks = chunk_markdown_tree( 

193 tree, 

194 max_chunk_size=self.chunking_config.get("max_chunk_size", 500), 

195 chunk_overlap=self.chunking_config.get("chunk_overlap", 50), 

196 heading_inclusion=HeadingInclusion.IN_METADATA, # Keep headings in metadata only 

197 combine_under_heading=self.chunking_config.get("combine_under_heading", True), 

198 quality_filter=quality_filter, 

199 generate_embeddings=self.chunking_config.get("generate_embeddings", True), 

200 ) 

201 

202 # Process and store chunks 

203 vectors = [] 

204 ids = [] 

205 metadatas = [] 

206 

207 for i, chunk in enumerate(chunks): 

208 # Use embedding_text if available, otherwise use chunk text 

209 text_for_embedding = chunk.metadata.embedding_text or chunk.text 

210 

211 # Generate embedding 

212 embedding = await self.embedding_provider.embed(text_for_embedding) 

213 

214 # Convert to numpy if needed 

215 if not isinstance(embedding, np.ndarray): 

216 embedding = np.array(embedding, dtype=np.float32) 

217 

218 # Prepare metadata with new fields 

219 chunk_id = f"{filepath.stem}_{i}" 

220 chunk_metadata = { 

221 "text": chunk.text, 

222 "source": str(filepath), 

223 "chunk_index": i, 

224 "heading_path": chunk.metadata.heading_display or chunk.metadata.get_heading_path(), 

225 "headings": chunk.metadata.headings, 

226 "heading_levels": chunk.metadata.heading_levels, 

227 "line_number": chunk.metadata.line_number, 

228 "chunk_size": chunk.metadata.chunk_size, 

229 "content_length": chunk.metadata.content_length, 

230 } 

231 

232 # Merge with user metadata 

233 if metadata: 

234 chunk_metadata.update(metadata) 

235 

236 vectors.append(embedding) 

237 ids.append(chunk_id) 

238 metadatas.append(chunk_metadata) 

239 

240 # Batch insert into vector store 

241 if vectors: 

242 await self.vector_store.add_vectors( 

243 vectors=vectors, ids=ids, metadata=metadatas 

244 ) 

245 

246 return len(chunks) 

247 

248 async def load_documents_from_directory( 

249 self, directory: str | Path, pattern: str = "**/*.md" 

250 ) -> dict[str, Any]: 

251 """Load all markdown documents from a directory. 

252 

253 Args: 

254 directory: Directory path containing documents 

255 pattern: Glob pattern for files to load (default: **/*.md) 

256 

257 Returns: 

258 Dictionary with loading statistics: 

259 - total_files: Number of files processed 

260 - total_chunks: Total chunks created 

261 - errors: List of errors encountered 

262 

263 Example: 

264 ```python 

265 results = await kb.load_documents_from_directory( 

266 "docs/", 

267 pattern="**/*.md" 

268 ) 

269 print(f"Loaded {results['total_chunks']} chunks from {results['total_files']} files") 

270 ``` 

271 """ 

272 directory = Path(directory) 

273 results = {"total_files": 0, "total_chunks": 0, "errors": []} 

274 

275 for filepath in directory.glob(pattern): 

276 if not filepath.is_file(): 

277 continue 

278 

279 try: 

280 num_chunks = await self.load_markdown_document( 

281 filepath, metadata={"filename": filepath.name} 

282 ) 

283 results["total_files"] += 1 

284 results["total_chunks"] += num_chunks 

285 except Exception as e: 

286 results["errors"].append({"file": str(filepath), "error": str(e)}) 

287 

288 return results 

289 

async def load_json_document(
    self,
    filepath: str | Path,
    metadata: dict[str, Any] | None = None,
    schema: str | None = None,
    transformer: ContentTransformer | None = None,
    title: str | None = None,
) -> int:
    """Load a JSON file by rendering it to markdown and indexing the result.

    The parsed JSON is passed to ``ContentTransformer.transform_json`` and
    the resulting markdown goes through the same chunk/embed/store
    pipeline as the other non-markdown loaders.

    Args:
        filepath: Path to JSON file
        metadata: Optional metadata to attach to all chunks
        schema: Optional schema name (requires transformer with registered schema)
        transformer: Optional ContentTransformer instance with custom
            configuration; a fresh default one is created when omitted
        title: Optional document title for the markdown; when omitted it
            is derived from the file stem (underscores become spaces,
            then title-cased)

    Returns:
        Number of chunks created

    Example:
        ```python
        transformer = ContentTransformer()
        transformer.register_schema("pattern", {
            "title_field": "name",
            "sections": [
                {"field": "description", "heading": "Description"},
                {"field": "example", "heading": "Example", "format": "code"}
            ]
        })
        num_chunks = await kb.load_json_document(
            "data/patterns.json",
            transformer=transformer,
            schema="pattern"
        )
        ```
    """
    import json

    json_path = Path(filepath)
    with json_path.open(encoding="utf-8") as handle:
        payload = json.load(handle)

    converter = ContentTransformer() if transformer is None else transformer
    doc_title = title or json_path.stem.replace("_", " ").title()
    rendered = converter.transform_json(payload, schema=schema, title=doc_title)

    return await self._load_markdown_text(
        rendered,
        source=str(json_path),
        metadata=metadata,
    )

360 

async def load_yaml_document(
    self,
    filepath: str | Path,
    metadata: dict[str, Any] | None = None,
    schema: str | None = None,
    transformer: ContentTransformer | None = None,
    title: str | None = None,
) -> int:
    """Load a YAML file by rendering it to markdown and indexing the result.

    The path itself (not its contents) is handed to
    ``ContentTransformer.transform_yaml``; the returned markdown is then
    chunked, embedded and stored.

    Args:
        filepath: Path to YAML file
        metadata: Optional metadata to attach to all chunks
        schema: Optional schema name (requires transformer with registered schema)
        transformer: Optional ContentTransformer instance with custom
            configuration; a fresh default one is created when omitted
        title: Optional document title for the markdown; defaults to the
            file stem with underscores replaced by spaces, title-cased

    Returns:
        Number of chunks created

    Example:
        ```python
        num_chunks = await kb.load_yaml_document(
            "data/config.yaml",
            metadata={"content_type": "configuration"}
        )
        ```
    """
    yaml_path = Path(filepath)

    converter = ContentTransformer() if transformer is None else transformer
    doc_title = title or yaml_path.stem.replace("_", " ").title()
    rendered = converter.transform_yaml(yaml_path, schema=schema, title=doc_title)

    return await self._load_markdown_text(
        rendered,
        source=str(yaml_path),
        metadata=metadata,
    )

406 

async def load_csv_document(
    self,
    filepath: str | Path,
    metadata: dict[str, Any] | None = None,
    title: str | None = None,
    title_field: str | None = None,
    transformer: ContentTransformer | None = None,
) -> int:
    """Load a CSV file by rendering it to markdown and indexing the result.

    The path is handed to ``ContentTransformer.transform_csv``; the
    returned markdown is then chunked, embedded and stored. Per the
    original documentation, each row becomes a section headed by the
    first column (or ``title_field``).

    Args:
        filepath: Path to CSV file
        metadata: Optional metadata to attach to all chunks
        title: Optional document title for the markdown; defaults to the
            file stem with underscores replaced by spaces, title-cased
        title_field: Column to use as section title (default: first column)
        transformer: Optional ContentTransformer instance with custom
            configuration; a fresh default one is created when omitted

    Returns:
        Number of chunks created

    Example:
        ```python
        num_chunks = await kb.load_csv_document(
            "data/faq.csv",
            title="Frequently Asked Questions",
            title_field="question"
        )
        ```
    """
    csv_path = Path(filepath)

    converter = ContentTransformer() if transformer is None else transformer
    doc_title = title or csv_path.stem.replace("_", " ").title()
    rendered = converter.transform_csv(
        csv_path,
        title=doc_title,
        title_field=title_field,
    )

    return await self._load_markdown_text(
        rendered,
        source=str(csv_path),
        metadata=metadata,
    )

455 

async def _load_markdown_text(
    self,
    markdown_text: str,
    source: str,
    metadata: dict[str, Any] | None = None,
) -> int:
    """Parse, chunk, embed and store a markdown string.

    Shared back end for ``load_json_document``, ``load_yaml_document`` and
    ``load_csv_document``.

    Args:
        markdown_text: Markdown content to load
        source: Source identifier stored in each chunk's metadata; its
            path stem also prefixes the chunk ids ("<stem>_<index>"),
            falling back to "doc" when ``source`` is empty
        metadata: Optional metadata merged into every chunk's metadata;
            keys that clash with the built-in fields override them

    Returns:
        Number of chunks created
    """
    import numpy as np

    options = self.chunking_config
    tree = parse_markdown(markdown_text)

    # The quality filter may be given ready-made or as a dict of its
    # fields; any other value (or no value) means "no filter".
    quality_filter = None
    qf_setting = options.get("quality_filter")
    if isinstance(qf_setting, ChunkQualityConfig):
        quality_filter = qf_setting
    elif isinstance(qf_setting, dict):
        quality_filter = ChunkQualityConfig(**qf_setting)

    chunks = chunk_markdown_tree(
        tree,
        max_chunk_size=options.get("max_chunk_size", 500),
        chunk_overlap=options.get("chunk_overlap", 50),
        heading_inclusion=HeadingInclusion.IN_METADATA,
        combine_under_heading=options.get("combine_under_heading", True),
        quality_filter=quality_filter,
        generate_embeddings=options.get("generate_embeddings", True),
    )

    id_prefix = Path(source).stem if source else "doc"

    ids = []
    vectors = []
    metadatas = []
    for index, chunk in enumerate(chunks):
        info = chunk.metadata

        # Prefer the enriched embedding text when the chunker produced one.
        vector = await self.embedding_provider.embed(info.embedding_text or chunk.text)
        if not isinstance(vector, np.ndarray):
            vector = np.array(vector, dtype=np.float32)

        record = {
            "text": chunk.text,
            "source": source,
            "chunk_index": index,
            "heading_path": info.heading_display or info.get_heading_path(),
            "headings": info.headings,
            "heading_levels": info.heading_levels,
            "line_number": info.line_number,
            "chunk_size": info.chunk_size,
            "content_length": info.content_length,
        }
        if metadata:
            record.update(metadata)

        ids.append(f"{id_prefix}_{index}")
        vectors.append(vector)
        metadatas.append(record)

    # One batched insert; skipped entirely when nothing was produced.
    if vectors:
        await self.vector_store.add_vectors(
            vectors=vectors, ids=ids, metadata=metadatas
        )

    return len(chunks)

547 

548 async def query( 

549 self, 

550 query: str, 

551 k: int = 5, 

552 filter_metadata: dict[str, Any] | None = None, 

553 min_similarity: float = 0.0, 

554 merge_adjacent: bool = False, 

555 max_chunk_size: int | None = None, 

556 ) -> list[dict[str, Any]]: 

557 """Query knowledge base for relevant chunks. 

558 

559 Args: 

560 query: Query text to search for 

561 k: Number of results to return 

562 filter_metadata: Optional metadata filters 

563 min_similarity: Minimum similarity score (0-1) 

564 merge_adjacent: Whether to merge adjacent chunks with same heading 

565 max_chunk_size: Maximum size for merged chunks (uses merger config default if not specified) 

566 

567 Returns: 

568 List of result dictionaries with: 

569 - text: Chunk text 

570 - source: Source file 

571 - heading_path: Heading hierarchy 

572 - similarity: Similarity score 

573 - metadata: Full chunk metadata 

574 

575 Example: 

576 ```python 

577 results = await kb.query( 

578 "How do I configure the database?", 

579 k=3, 

580 merge_adjacent=True 

581 ) 

582 for result in results: 

583 print(f"[{result['similarity']:.2f}] {result['heading_path']}") 

584 print(result['text']) 

585 ``` 

586 """ 

587 import numpy as np 

588 

589 # Generate query embedding 

590 query_embedding = await self.embedding_provider.embed(query) 

591 

592 # Convert to numpy if needed 

593 if not isinstance(query_embedding, np.ndarray): 

594 query_embedding = np.array(query_embedding, dtype=np.float32) 

595 

596 # Search vector store 

597 search_results = await self.vector_store.search( 

598 query_vector=query_embedding, 

599 k=k, 

600 filter=filter_metadata, 

601 include_metadata=True, 

602 ) 

603 

604 # Format results 

605 results = [] 

606 for _chunk_id, similarity, chunk_metadata in search_results: 

607 if chunk_metadata and similarity >= min_similarity: 

608 results.append( 

609 { 

610 "text": chunk_metadata.get("text", ""), 

611 "source": chunk_metadata.get("source", ""), 

612 "heading_path": chunk_metadata.get("heading_path", ""), 

613 "similarity": similarity, 

614 "metadata": chunk_metadata, 

615 } 

616 ) 

617 

618 # Apply chunk merging if requested 

619 if merge_adjacent and results: 

620 # Update merger config if max_chunk_size specified 

621 if max_chunk_size is not None: 

622 merger = ChunkMerger(MergerConfig(max_merged_size=max_chunk_size)) 

623 else: 

624 merger = self.merger 

625 

626 merged_chunks = merger.merge(results) 

627 results = merger.to_result_list(merged_chunks) 

628 

629 return results 

630 

def format_context(
    self,
    results: list[dict[str, Any]],
    wrap_in_tags: bool = True,
) -> str:
    """Render search results as a context string for an LLM prompt.

    Thin convenience wrapper around the configured formatter.

    Args:
        results: Search results from query()
        wrap_in_tags: Whether to additionally pass the formatted text
            through the formatter's ``wrap_for_prompt`` (per the original
            documentation this wraps it in <knowledge_base> tags)

    Returns:
        Formatted context string
    """
    formatted = self.formatter.format(results)
    if not wrap_in_tags:
        return formatted
    return self.formatter.wrap_for_prompt(formatted)

651 

async def clear(self) -> None:
    """Remove every document from the knowledge base.

    Warning: This removes all stored chunks and embeddings.

    Raises:
        NotImplementedError: If the vector store has no ``clear`` attribute.
    """
    if not hasattr(self.vector_store, "clear"):
        raise NotImplementedError(
            "Vector store does not support clearing. "
            "Consider creating a new knowledge base with a fresh collection."
        )

    await self.vector_store.clear()

664 

async def save(self) -> None:
    """Persist the knowledge base through the vector store.

    Calls the vector store's ``save()`` when it has one; for stores
    without that attribute this is a silent no-op. Per the original
    documentation, persistence applies to stores such as FAISS.

    Example:
        ```python
        await kb.load_markdown_document("docs/api.md")
        await kb.save()  # Persist to disk
        ```
    """
    if not hasattr(self.vector_store, "save"):
        return

    await self.vector_store.save()

679 

async def close(self) -> None:
    """Close the knowledge base and release resources.

    This method:
    - Closes the vector store, if it has a ``close`` method (the original
      notes say the store saves itself on close when a persist path is
      set - that behaviour lives in the store, not here)
    - Closes the embedding provider, if it has a ``close`` method
      (releases HTTP sessions)

    The embedding provider is closed even when closing the vector store
    raises, so a failing store cannot leak the provider's sessions. The
    store's exception still propagates to the caller afterwards.

    Should be called when done using the knowledge base to prevent
    resource leaks (e.g., unclosed aiohttp sessions).

    Example:
        ```python
        kb = await RAGKnowledgeBase.from_config(config)
        try:
            await kb.load_markdown_document("docs/api.md")
            results = await kb.query("How do I configure?")
        finally:
            await kb.close()
        ```
    """
    try:
        if hasattr(self.vector_store, "close"):
            await self.vector_store.close()
    finally:
        # Runs on both the success and the failure path of the store close.
        if hasattr(self.embedding_provider, "close"):
            await self.embedding_provider.close()

708 

async def __aenter__(self) -> "RAGKnowledgeBase":
    """Enter the ``async with`` block.

    No setup happens here; the instance is handed back unchanged.

    Returns:
        This knowledge base, for binding with ``as``

    Example:
        ```python
        async with await RAGKnowledgeBase.from_config(config) as kb:
            await kb.load_markdown_document("docs/api.md")
            results = await kb.query("How do I configure?")
        # close() has been awaited at this point
        ```
    """
    return self

724 

725 async def __aexit__( 

726 self, 

727 exc_type: type[BaseException] | None, 

728 exc_val: BaseException | None, 

729 exc_tb: types.TracebackType | None, 

730 ) -> None: 

731 """Async context manager exit - ensures cleanup. 

732 

733 Args: 

734 exc_type: Exception type if an exception occurred 

735 exc_val: Exception value if an exception occurred 

736 exc_tb: Exception traceback if an exception occurred 

737 """ 

738 await self.close()