"""
MDRetriever - Python implementation of MDRetriever.java

Processes Markdown documents and extracts sections with their metadata.
Each section is identified by a level 1 header (# Title [id]) and can have
keywords (lines starting with >) and content.
"""

import json
import re
import sqlite3
from contextlib import closing
from pathlib import Path
from typing import Any, Dict, List, Optional


class MDRetriever:
    """Split Markdown documents into sections and store them in SQLite.

    A section starts at a level 1 header (``# Title [id]``). Lines starting
    with ``>`` directly below the header are the section's keywords; the
    remaining lines up to the next level 1 header are its content. Every
    stored section is addressed as ``"<index>#<section id>"``.
    """

    MD_DOCS_COLLECTION = "@@@@@MD_DOCS@@@@@"

    # "[section-id]" marker inside a header line (word chars and hyphens).
    _SECTION_ID_RE = re.compile(r'\[([\w-]+)\]')
    # Same marker plus trailing whitespace, used to strip it from the title.
    _SECTION_ID_STRIP_RE = re.compile(r'\[([\w-]+)\]\s*')
    # Prefixes that open/close a fenced code block.
    _FENCE_MARKERS = ("```", "~~~")

    _UPSERT_SQL = """
        INSERT OR REPLACE INTO md_docs
        (id, title, keywords, content, source, index_name, position, body)
        VALUES (?, ?, ?, ?, ?, ?, ?, ?)
    """

    def __init__(self, db_path: str = "md_docs.db"):
        """
        Initialize MDRetriever and make sure the database schema exists.

        Args:
            db_path: Path to SQLite database file
        """
        self.db_path = db_path
        self._init_database()

    def _connect(self):
        """Return a context manager yielding a new connection that is always closed.

        Combine it with the connection's own context manager to also get
        commit-on-success / rollback-on-error:
        ``with self._connect() as conn, conn: ...``
        """
        return closing(sqlite3.connect(self.db_path))

    def _init_database(self):
        """Create the ``md_docs`` table and its indexes if they are missing."""
        with self._connect() as conn, conn:
            # One row per document section.
            conn.execute("""
                CREATE TABLE IF NOT EXISTS md_docs (
                    id TEXT PRIMARY KEY,
                    title TEXT NOT NULL,
                    keywords TEXT,
                    content TEXT NOT NULL,
                    source TEXT NOT NULL,
                    index_name TEXT NOT NULL,
                    position INTEGER NOT NULL,
                    body TEXT  -- JSONB equivalent: stores full document as JSON
                )
            """)
            # Indexes for lookups by index name and ordered listing.
            conn.execute("""
                CREATE INDEX IF NOT EXISTS idx_index_name ON md_docs(index_name)
            """)
            conn.execute("""
                CREATE INDEX IF NOT EXISTS idx_position ON md_docs(index_name, position)
            """)

    def process_document(self, index: str, resource: str):
        """
        Parse a Markdown file and replace the stored sections of ``index``.

        The file is read and parsed before the database is touched, and the
        old sections are deleted in the same transaction that inserts the new
        ones. A missing/unreadable file or a failing insert therefore leaves
        the previously stored sections intact.

        Args:
            index: Index identifier (e.g., "AGENT_ID/resource.md")
            resource: Path to the Markdown file

        Raises:
            Exception: Any error from reading the file or writing to the
                database is printed and re-raised unchanged.
        """
        try:
            with open(resource, 'r', encoding='utf-8') as f:
                sections = self._parse_sections(f)

            with self._connect() as conn, conn:
                conn.execute(
                    "DELETE FROM md_docs WHERE index_name = ?", (index,)
                )
                for section in sections:
                    self._store_section(
                        conn, index=index, resource=resource, **section
                    )
        except Exception as e:
            print(f"Error processing document {resource}: {e}")
            raise

    @classmethod
    def _parse_sections(cls, lines) -> List[Dict[str, Any]]:
        """
        Split Markdown lines into sections.

        Args:
            lines: Iterable of text lines (line endings included)

        Returns:
            One dict per section, in document order, with the keys
            ``section_id``, ``title``, ``keywords`` (or None), ``content``
            and ``position``. Sections without content are included here;
            they are filtered out when stored.
        """
        sections: List[Dict[str, Any]] = []
        title = None
        section_id = None
        keyword_lines: List[str] = []
        content_lines: List[str] = []
        collecting_keywords = True
        in_section = False
        open_fence = None  # marker that opened the current code block, if any
        position = 0

        for line in lines:
            trimmed = line.strip()

            # A fence line opens a block, or closes one opened with the SAME
            # marker. A "~~~" inside a "```" block (or vice versa) is plain
            # code and must not end the block.
            fence = next(
                (m for m in cls._FENCE_MARKERS if trimmed.startswith(m)), None
            )
            if fence is not None and open_fence in (None, fence):
                open_fence = fence if open_fence is None else None
                if in_section:
                    # A fence always belongs to the content, so it also ends
                    # keyword collection.
                    collecting_keywords = False
                    content_lines.append(line)
                continue

            # Headers are only recognised outside fenced code blocks.
            if open_fence is None and trimmed.startswith("# "):
                if in_section:
                    sections.append(cls._build_section(
                        section_id, title, keyword_lines, content_lines, position
                    ))
                    position += 1
                    keyword_lines = []
                    content_lines = []
                    collecting_keywords = True

                in_section = True
                title_line = trimmed[2:].strip()
                id_match = cls._SECTION_ID_RE.search(title_line)
                # Headers without an explicit [id] get a positional one.
                section_id = (
                    id_match.group(1) if id_match else f"section-{position}"
                )
                title = cls._SECTION_ID_STRIP_RE.sub('', title_line).strip()
                continue

            if not in_section:
                # Text before the first header belongs to no section.
                continue

            if not collecting_keywords:
                content_lines.append(line)
            elif not trimmed:
                # Leading blank lines are skipped; a blank line after at
                # least one keyword line ends the keyword block.
                if keyword_lines:
                    collecting_keywords = False
            elif trimmed.startswith(">"):
                keyword_lines.append(trimmed[1:].strip())
            else:
                # First ordinary line: there are no (more) keywords.
                collecting_keywords = False
                content_lines.append(line)

        if in_section:
            sections.append(cls._build_section(
                section_id, title, keyword_lines, content_lines, position
            ))
        return sections

    @staticmethod
    def _build_section(
        section_id: Optional[str],
        title: Optional[str],
        keyword_lines: List[str],
        content_lines: List[str],
        position: int
    ) -> Dict[str, Any]:
        """Assemble the parsed pieces of one section into a dict."""
        return {
            "section_id": section_id,
            "title": title,
            "keywords": ' '.join(keyword_lines).strip() if keyword_lines else None,
            "content": ''.join(content_lines).strip(),
            "position": position,
        }

    def _store_section(
        self,
        conn: sqlite3.Connection,
        index: str,
        section_id: str,
        title: str,
        keywords: Optional[str],
        content: str,
        resource: str,
        position: int
    ):
        """
        Upsert one section using an already open connection.

        Sections lacking a title, an id or non-blank content are skipped.
        The caller is responsible for committing.
        """
        if not (title and section_id and content and content.strip()):
            return

        doc_id = f"{index}#{section_id}"

        # Fall back to the title when the section declares no keywords.
        final_keywords = keywords if keywords and keywords.strip() else title

        # Full document object (JSONB equivalent) stored in the body column.
        doc_obj = {
            "$id": doc_id,
            "title": title,
            "keywords": final_keywords,
            "content": content,
            "source": resource,
            "index": index,
            "position": position
        }

        conn.execute(self._UPSERT_SQL, (
            doc_id,
            title,
            final_keywords,
            content,
            resource,
            index,
            position,
            json.dumps(doc_obj)
        ))

    def _flush_section(
        self,
        index: str,
        section_id: str,
        title: str,
        keywords: Optional[str],
        content: str,
        resource: str,
        position: int
    ):
        """
        Save a single section to the database in its own transaction.

        Args:
            index: Index identifier
            section_id: Section ID
            title: Section title
            keywords: Section keywords (or None)
            content: Section content
            resource: Source file path
            position: Position in document
        """
        with self._connect() as conn, conn:
            self._store_section(
                conn, index, section_id, title, keywords, content,
                resource, position
            )

    def get_document_by_id(self, doc_id: str) -> Optional[Dict[str, Any]]:
        """
        Retrieve a document by its ID.

        Args:
            doc_id: Document ID (format: "index#section-id")

        Returns:
            Document dictionary, or None if no row matches or the row has
            no stored body
        """
        with self._connect() as conn:
            row = conn.execute(
                "SELECT body FROM md_docs WHERE id = ?", (doc_id,)
            ).fetchone()

        # The body column is nullable, so guard before decoding.
        if row is None or row[0] is None:
            return None
        return json.loads(row[0])

    def drop_index(self, index: str) -> int:
        """
        Delete all documents with the given index.

        Args:
            index: Index identifier

        Returns:
            Number of deleted documents
        """
        with self._connect() as conn, conn:
            cursor = conn.execute(
                "DELETE FROM md_docs WHERE index_name = ?", (index,)
            )
            return cursor.rowcount

    def get_index_info(self, agent_id: str, md_resources: List[str]) -> str:
        """
        Get index information for an agent's markdown resources.

        Args:
            agent_id: Agent ID
            md_resources: List of markdown resource file names

        Returns:
            Fixed instruction text followed by a fenced JSON array holding
            the id, title and keywords of every stored section, ordered by
            index name and position. The array is empty when nothing matches.
        """
        indexes = [f"{agent_id}/{resource}" for resource in md_resources]

        rows = []
        if indexes:  # nothing to look up otherwise; avoids an empty IN ()
            placeholders = ','.join('?' * len(indexes))
            with self._connect() as conn:
                rows = conn.execute(f"""
                    SELECT body FROM md_docs
                    WHERE index_name IN ({placeholders})
                    ORDER BY index_name, position ASC
                """, indexes).fetchall()

        result = []
        for (body,) in rows:
            doc = json.loads(body)
            result.append({
                "id": doc.get("$id"),
                "title": doc.get("title"),
                "keywords": doc.get("keywords")
            })

        result_json = json.dumps(result, indent=2, ensure_ascii=False)

        return (
            "# IMPORTANTE\n"
            "Estas son las secciones de documentación para fundamentar tus respuestas y obtener instrucciones específicas. "
            "Sólo utiliza esta información cuando sea necesario para proporcionar respuestas precisas y relevantes. "
            "Utiliza los títulos y las keywords para identificar rápidamente los temas cubiertos en cada sección. "
            "Utiliza el id para recuperar el contenido completo de la sección usando la herramienta 'get_bpm_guide_content'."
            "\n```\n" + result_json + "\n```"
        )

