Coverage for src / dataknobs_bots / knowledge / rag.py: 13%
166 statements
« prev ^ index » next coverage.py v7.13.0, created at 2025-12-16 10:13 -0700
« prev ^ index » next coverage.py v7.13.0, created at 2025-12-16 10:13 -0700
1"""RAG (Retrieval-Augmented Generation) knowledge base implementation."""
3import types
4from pathlib import Path
5from typing import Any
7from dataknobs_xization import (
8 ChunkQualityConfig,
9 ContentTransformer,
10 HeadingInclusion,
11 chunk_markdown_tree,
12 parse_markdown,
13)
14from dataknobs_bots.knowledge.retrieval import (
15 ChunkMerger,
16 ContextFormatter,
17 FormatterConfig,
18 MergerConfig,
19)
class RAGKnowledgeBase:
    """RAG knowledge base using dataknobs-xization for chunking and vector search.

    This implementation:
    - Parses markdown documents using dataknobs-xization
    - Chunks documents intelligently based on structure
    - Stores chunks with embeddings in vector store
    - Provides semantic search for relevant context

    Attributes:
        vector_store: Vector store backend from dataknobs_data
        embedding_provider: LLM provider for generating embeddings
        chunking_config: Configuration for document chunking
        merger: ChunkMerger used by query() when merge_adjacent is requested
        formatter: ContextFormatter used by format_context()
    """

    def __init__(
        self,
        vector_store: Any,
        embedding_provider: Any,
        chunking_config: dict[str, Any] | None = None,
        merger_config: MergerConfig | None = None,
        formatter_config: FormatterConfig | None = None,
    ):
        """Initialize RAG knowledge base.

        Args:
            vector_store: Vector store backend instance
            embedding_provider: LLM provider with embed() method
            chunking_config: Configuration for chunking:
                - max_chunk_size: Maximum chunk size in characters
                - chunk_overlap: Overlap between chunks
                - combine_under_heading: Combine text under same heading
                - quality_filter: ChunkQualityConfig for filtering
                - generate_embeddings: Whether to generate enriched embedding text
              A missing or empty dict falls back to the built-in defaults.
            merger_config: Configuration for chunk merging (optional)
            formatter_config: Configuration for context formatting (optional)
        """
        self.vector_store = vector_store
        self.embedding_provider = embedding_provider
        # `or` (not `is None`) is deliberate: from_config passes {} when no
        # chunking section is present, and that must also select the defaults.
        self.chunking_config = chunking_config or {
            "max_chunk_size": 500,
            "chunk_overlap": 50,
            "combine_under_heading": True,
        }

        # Initialize merger and formatter
        self.merger = ChunkMerger(merger_config) if merger_config else ChunkMerger()
        self.formatter = (
            ContextFormatter(formatter_config) if formatter_config else ContextFormatter()
        )

    @classmethod
    async def from_config(cls, config: dict[str, Any]) -> "RAGKnowledgeBase":
        """Create RAG knowledge base from configuration.

        Args:
            config: Configuration dictionary with:
                - vector_store: Vector store configuration (required)
                - embedding_provider: LLM provider name (default "openai")
                - embedding_model: Model for embeddings
                  (default "text-embedding-ada-002")
                - chunking: Optional chunking configuration
                - merger: Optional keyword arguments for MergerConfig
                - formatter: Optional keyword arguments for FormatterConfig
                - documents_path: Optional path to load documents from
                - document_pattern: Optional glob pattern for documents
                  (default "**/*.md")

        Returns:
            Configured RAGKnowledgeBase instance

        Raises:
            KeyError: If ``config`` has no "vector_store" entry.

        Example:
            ```python
            config = {
                "vector_store": {
                    "backend": "faiss",
                    "dimensions": 1536,
                    "collection": "docs"
                },
                "embedding_provider": "openai",
                "embedding_model": "text-embedding-3-small",
                "chunking": {
                    "max_chunk_size": 500,
                    "chunk_overlap": 50
                },
                "documents_path": "./docs"
            }
            kb = await RAGKnowledgeBase.from_config(config)
            ```
        """
        from dataknobs_data.vector.stores import VectorStoreFactory
        from dataknobs_llm.llm import LLMProviderFactory

        # Create vector store
        vs_config = config["vector_store"]
        factory = VectorStoreFactory()
        vector_store = factory.create(**vs_config)
        await vector_store.initialize()

        # Create embedding provider
        llm_factory = LLMProviderFactory(is_async=True)
        embedding_provider = llm_factory.create(
            {
                "provider": config.get("embedding_provider", "openai"),
                "model": config.get("embedding_model", "text-embedding-ada-002"),
            }
        )
        await embedding_provider.initialize()

        # Optional merger / formatter configuration
        merger_config = MergerConfig(**config["merger"]) if "merger" in config else None
        formatter_config = (
            FormatterConfig(**config["formatter"]) if "formatter" in config else None
        )

        # Create instance
        kb = cls(
            vector_store=vector_store,
            embedding_provider=embedding_provider,
            chunking_config=config.get("chunking", {}),
            merger_config=merger_config,
            formatter_config=formatter_config,
        )

        # Load documents if path provided
        if "documents_path" in config:
            load_stats = await kb.load_documents_from_directory(
                config["documents_path"], config.get("document_pattern", "**/*.md")
            )
            # Per-file failures are collected, not raised. Surface them here so
            # a partially loaded knowledge base is not mistaken for a full one.
            failures = load_stats.get("errors") if isinstance(load_stats, dict) else None
            if failures:
                import logging

                log = logging.getLogger(__name__)
                for failure in failures:
                    log.warning(
                        "Failed to load %s: %s",
                        failure.get("file"),
                        failure.get("error"),
                    )

        return kb

    @staticmethod
    def _default_title(filepath: Path) -> str:
        """Derive a document title from a file name (``my_file`` -> ``My File``)."""
        return filepath.stem.replace("_", " ").title()

    @staticmethod
    def _to_vector(embedding: Any) -> Any:
        """Return ``embedding`` as a numpy array, converting to float32 if needed."""
        import numpy as np

        if isinstance(embedding, np.ndarray):
            return embedding
        return np.array(embedding, dtype=np.float32)

    def _build_quality_filter(self) -> Any:
        """Build the chunk quality filter from ``chunking_config``.

        Returns:
            The configured ChunkQualityConfig, one built from a dict of keyword
            arguments, or None when no usable "quality_filter" entry exists.
        """
        qf_config = self.chunking_config.get("quality_filter")
        if isinstance(qf_config, ChunkQualityConfig):
            return qf_config
        if isinstance(qf_config, dict):
            return ChunkQualityConfig(**qf_config)
        return None

    def _chunk_markdown(self, markdown_text: str) -> Any:
        """Parse ``markdown_text`` and split it into chunks per ``chunking_config``."""
        tree = parse_markdown(markdown_text)
        settings = self.chunking_config
        return chunk_markdown_tree(
            tree,
            max_chunk_size=settings.get("max_chunk_size", 500),
            chunk_overlap=settings.get("chunk_overlap", 50),
            heading_inclusion=HeadingInclusion.IN_METADATA,  # Keep headings in metadata only
            combine_under_heading=settings.get("combine_under_heading", True),
            quality_filter=self._build_quality_filter(),
            generate_embeddings=settings.get("generate_embeddings", True),
        )

    async def load_markdown_document(
        self, filepath: str | Path, metadata: dict[str, Any] | None = None
    ) -> int:
        """Load and chunk a markdown document.

        The file is read as UTF-8 and handed to the same pipeline used for
        JSON/YAML/CSV content, so all loaders produce identical chunk metadata.

        Args:
            filepath: Path to markdown file
            metadata: Optional metadata to attach to all chunks

        Returns:
            Number of chunks created

        Example:
            ```python
            num_chunks = await kb.load_markdown_document(
                "docs/api.md",
                metadata={"category": "api", "version": "1.0"}
            )
            ```
        """
        filepath = Path(filepath)
        markdown_text = filepath.read_text(encoding="utf-8")
        return await self._load_markdown_text(
            markdown_text,
            source=str(filepath),
            metadata=metadata,
        )

    async def load_documents_from_directory(
        self, directory: str | Path, pattern: str = "**/*.md"
    ) -> dict[str, Any]:
        """Load all markdown documents from a directory.

        A failure on one file is recorded and does not stop the remaining
        files from loading.

        Args:
            directory: Directory path containing documents
            pattern: Glob pattern for files to load (default: **/*.md)

        Returns:
            Dictionary with loading statistics:
                - total_files: Number of files processed successfully
                - total_chunks: Total chunks created
                - errors: List of {"file": ..., "error": ...} dicts

        Example:
            ```python
            results = await kb.load_documents_from_directory(
                "docs/",
                pattern="**/*.md"
            )
            print(f"Loaded {results['total_chunks']} chunks from {results['total_files']} files")
            ```
        """
        directory = Path(directory)
        results: dict[str, Any] = {"total_files": 0, "total_chunks": 0, "errors": []}

        # glob() yields paths in filesystem order; sort so repeated runs process
        # files in the same order.
        # NOTE(review): chunk IDs are "<file stem>_<index>", so two files with
        # the same stem in different sub-directories produce identical IDs.
        for filepath in sorted(directory.glob(pattern)):
            if not filepath.is_file():
                continue

            try:
                num_chunks = await self.load_markdown_document(
                    filepath, metadata={"filename": filepath.name}
                )
            except Exception as e:  # best-effort: record and keep going
                results["errors"].append({"file": str(filepath), "error": str(e)})
            else:
                results["total_files"] += 1
                results["total_chunks"] += num_chunks

        return results

    async def load_json_document(
        self,
        filepath: str | Path,
        metadata: dict[str, Any] | None = None,
        schema: str | None = None,
        transformer: ContentTransformer | None = None,
        title: str | None = None,
    ) -> int:
        """Load and chunk a JSON document by converting it to markdown.

        This method converts JSON data to markdown format using ContentTransformer,
        then processes it like any other markdown document.

        Args:
            filepath: Path to JSON file
            metadata: Optional metadata to attach to all chunks
            schema: Optional schema name (requires transformer with registered schema)
            transformer: Optional ContentTransformer instance with custom configuration
            title: Optional document title for the markdown; defaults to a
                title derived from the file name

        Returns:
            Number of chunks created

        Example:
            ```python
            # Generic conversion
            num_chunks = await kb.load_json_document(
                "data/patterns.json",
                metadata={"content_type": "patterns"}
            )

            # With custom schema
            transformer = ContentTransformer()
            transformer.register_schema("pattern", {
                "title_field": "name",
                "sections": [
                    {"field": "description", "heading": "Description"},
                    {"field": "example", "heading": "Example", "format": "code"}
                ]
            })
            num_chunks = await kb.load_json_document(
                "data/patterns.json",
                transformer=transformer,
                schema="pattern"
            )
            ```
        """
        import json

        filepath = Path(filepath)

        # Read JSON
        with open(filepath, encoding="utf-8") as f:
            data = json.load(f)

        # Convert to markdown
        if transformer is None:
            transformer = ContentTransformer()

        markdown_text = transformer.transform_json(
            data,
            schema=schema,
            title=title or self._default_title(filepath),
        )

        return await self._load_markdown_text(
            markdown_text,
            source=str(filepath),
            metadata=metadata,
        )

    async def load_yaml_document(
        self,
        filepath: str | Path,
        metadata: dict[str, Any] | None = None,
        schema: str | None = None,
        transformer: ContentTransformer | None = None,
        title: str | None = None,
    ) -> int:
        """Load and chunk a YAML document by converting it to markdown.

        The path itself is passed to the transformer's transform_yaml().

        Args:
            filepath: Path to YAML file
            metadata: Optional metadata to attach to all chunks
            schema: Optional schema name (requires transformer with registered schema)
            transformer: Optional ContentTransformer instance with custom configuration
            title: Optional document title for the markdown; defaults to a
                title derived from the file name

        Returns:
            Number of chunks created

        Example:
            ```python
            num_chunks = await kb.load_yaml_document(
                "data/config.yaml",
                metadata={"content_type": "configuration"}
            )
            ```
        """
        filepath = Path(filepath)

        # Convert to markdown
        if transformer is None:
            transformer = ContentTransformer()

        markdown_text = transformer.transform_yaml(
            filepath,
            schema=schema,
            title=title or self._default_title(filepath),
        )

        return await self._load_markdown_text(
            markdown_text,
            source=str(filepath),
            metadata=metadata,
        )

    async def load_csv_document(
        self,
        filepath: str | Path,
        metadata: dict[str, Any] | None = None,
        title: str | None = None,
        title_field: str | None = None,
        transformer: ContentTransformer | None = None,
    ) -> int:
        """Load and chunk a CSV document by converting it to markdown.

        Each row becomes a section with the first column (or title_field) as heading.

        Args:
            filepath: Path to CSV file
            metadata: Optional metadata to attach to all chunks
            title: Optional document title for the markdown; defaults to a
                title derived from the file name
            title_field: Column to use as section title (default: first column)
            transformer: Optional ContentTransformer instance with custom configuration

        Returns:
            Number of chunks created

        Example:
            ```python
            num_chunks = await kb.load_csv_document(
                "data/faq.csv",
                title="Frequently Asked Questions",
                title_field="question"
            )
            ```
        """
        filepath = Path(filepath)

        # Convert to markdown
        if transformer is None:
            transformer = ContentTransformer()

        markdown_text = transformer.transform_csv(
            filepath,
            title=title or self._default_title(filepath),
            title_field=title_field,
        )

        return await self._load_markdown_text(
            markdown_text,
            source=str(filepath),
            metadata=metadata,
        )

    async def _load_markdown_text(
        self,
        markdown_text: str,
        source: str,
        metadata: dict[str, Any] | None = None,
    ) -> int:
        """Internal method to load markdown text directly.

        Used by load_markdown_document, load_json_document, load_yaml_document,
        and load_csv_document. Chunks the text, embeds each chunk, and adds all
        vectors to the vector store in a single add_vectors() call.

        Args:
            markdown_text: Markdown content to load
            source: Source identifier for metadata; its file stem prefixes
                every chunk ID ("doc" is used when source is empty)
            metadata: Optional metadata to attach to all chunks. Keys that
                match built-in fields (e.g. "source") override them.

        Returns:
            Number of chunks created
        """
        chunks = self._chunk_markdown(markdown_text)

        # Generate a base ID from source
        source_stem = Path(source).stem if source else "doc"

        vectors = []
        ids = []
        metadatas = []

        for i, chunk in enumerate(chunks):
            info = chunk.metadata

            # Use embedding_text if available, otherwise use chunk text
            text_for_embedding = info.embedding_text or chunk.text
            embedding = await self.embedding_provider.embed(text_for_embedding)

            chunk_metadata = {
                "text": chunk.text,
                "source": source,
                "chunk_index": i,
                "heading_path": info.heading_display or info.get_heading_path(),
                "headings": info.headings,
                "heading_levels": info.heading_levels,
                "line_number": info.line_number,
                "chunk_size": info.chunk_size,
                "content_length": info.content_length,
            }

            # Merge with user metadata (user values win on key clashes)
            if metadata:
                chunk_metadata.update(metadata)

            vectors.append(self._to_vector(embedding))
            ids.append(f"{source_stem}_{i}")
            metadatas.append(chunk_metadata)

        # Batch insert into vector store
        if vectors:
            await self.vector_store.add_vectors(
                vectors=vectors, ids=ids, metadata=metadatas
            )

        return len(chunks)

    async def query(
        self,
        query: str,
        k: int = 5,
        filter_metadata: dict[str, Any] | None = None,
        min_similarity: float = 0.0,
        merge_adjacent: bool = False,
        max_chunk_size: int | None = None,
    ) -> list[dict[str, Any]]:
        """Query knowledge base for relevant chunks.

        Args:
            query: Query text to search for
            k: Number of results to request from the vector store
            filter_metadata: Optional metadata filters
            min_similarity: Minimum similarity score (0-1); results scoring
                lower are dropped, so fewer than k results may be returned
            merge_adjacent: Whether to merge adjacent chunks with same heading
            max_chunk_size: Maximum size for merged chunks. When given, a
                temporary ChunkMerger built from
                MergerConfig(max_merged_size=max_chunk_size) is used instead of
                the configured merger; otherwise the configured merger is used.

        Returns:
            List of result dictionaries with:
                - text: Chunk text
                - source: Source file
                - heading_path: Heading hierarchy
                - similarity: Similarity score
                - metadata: Full chunk metadata
            When merge_adjacent is set, the list is whatever the merger's
            to_result_list() produces.

        Example:
            ```python
            results = await kb.query(
                "How do I configure the database?",
                k=3,
                merge_adjacent=True
            )
            for result in results:
                print(f"[{result['similarity']:.2f}] {result['heading_path']}")
                print(result['text'])
            ```
        """
        # Generate query embedding
        query_embedding = self._to_vector(await self.embedding_provider.embed(query))

        # Search vector store
        search_results = await self.vector_store.search(
            query_vector=query_embedding,
            k=k,
            filter=filter_metadata,
            include_metadata=True,
        )

        # Format results, skipping hits without metadata or below the threshold
        results = [
            {
                "text": chunk_metadata.get("text", ""),
                "source": chunk_metadata.get("source", ""),
                "heading_path": chunk_metadata.get("heading_path", ""),
                "similarity": similarity,
                "metadata": chunk_metadata,
            }
            for _chunk_id, similarity, chunk_metadata in search_results
            if chunk_metadata and similarity >= min_similarity
        ]

        # Apply chunk merging if requested
        if merge_adjacent and results:
            if max_chunk_size is not None:
                merger = ChunkMerger(MergerConfig(max_merged_size=max_chunk_size))
            else:
                merger = self.merger

            merged_chunks = merger.merge(results)
            results = merger.to_result_list(merged_chunks)

        return results

    def format_context(
        self,
        results: list[dict[str, Any]],
        wrap_in_tags: bool = True,
    ) -> str:
        """Format search results for LLM context.

        Convenience method to format results using the configured formatter.

        Args:
            results: Search results from query()
            wrap_in_tags: Whether to wrap in <knowledge_base> tags

        Returns:
            Formatted context string
        """
        context = self.formatter.format(results)
        if wrap_in_tags:
            context = self.formatter.wrap_for_prompt(context)
        return context

    async def clear(self) -> None:
        """Clear all documents from the knowledge base.

        Warning: This removes all stored chunks and embeddings.

        Raises:
            NotImplementedError: If the vector store has no clear() method.
        """
        if not hasattr(self.vector_store, "clear"):
            raise NotImplementedError(
                "Vector store does not support clearing. "
                "Consider creating a new knowledge base with a fresh collection."
            )
        await self.vector_store.clear()

    async def save(self) -> None:
        """Save the knowledge base to persistent storage.

        This persists the vector store index and metadata to disk.
        Only applicable for vector stores that support persistence (e.g., FAISS);
        for stores without a save() method this is a no-op.

        Example:
            ```python
            await kb.load_markdown_document("docs/api.md")
            await kb.save()  # Persist to disk
            ```
        """
        if hasattr(self.vector_store, "save"):
            await self.vector_store.save()

    async def close(self) -> None:
        """Close the knowledge base and release resources.

        This method:
        - Closes the vector store, if it has a close() method (the store is
          expected to save itself on close when persistence is configured)
        - Closes the embedding provider, if it has a close() method
          (releases HTTP sessions)

        The embedding provider is closed even if closing the vector store
        raises; the vector store's exception then propagates to the caller.

        Should be called when done using the knowledge base to prevent
        resource leaks (e.g., unclosed aiohttp sessions).

        Example:
            ```python
            kb = await RAGKnowledgeBase.from_config(config)
            try:
                await kb.load_markdown_document("docs/api.md")
                results = await kb.query("How do I configure?")
            finally:
                await kb.close()
            ```
        """
        try:
            if hasattr(self.vector_store, "close"):
                await self.vector_store.close()
        finally:
            # Must run even when the store fails to close, otherwise the
            # provider's client session is leaked.
            if hasattr(self.embedding_provider, "close"):
                await self.embedding_provider.close()

    async def __aenter__(self) -> "RAGKnowledgeBase":
        """Async context manager entry.

        Returns:
            Self for use in async with statement

        Example:
            ```python
            async with await RAGKnowledgeBase.from_config(config) as kb:
                await kb.load_markdown_document("docs/api.md")
                results = await kb.query("How do I configure?")
            # Automatically closed
            ```
        """
        return self

    async def __aexit__(
        self,
        exc_type: type[BaseException] | None,
        exc_val: BaseException | None,
        exc_tb: types.TracebackType | None,
    ) -> None:
        """Async context manager exit - ensures cleanup.

        Always calls close(); exceptions raised inside the ``async with`` body
        are not suppressed.

        Args:
            exc_type: Exception type if an exception occurred
            exc_val: Exception value if an exception occurred
            exc_tb: Exception traceback if an exception occurred
        """
        await self.close()