# spiderforce4ai/__init__.py

import asyncio
import aiohttp
import json
import logging
from typing import List, Dict, Union, Optional, Tuple
from dataclasses import dataclass, asdict
from urllib.parse import urljoin, urlparse
from pathlib import Path
import time
import xml.etree.ElementTree as ET
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime
import re
from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn, TaskProgressColumn
from rich.console import Console
import aiofiles
import httpx
import requests
from multiprocessing import Pool

# Shared rich console used for status output throughout this module.
console = Console()

def slugify(url: str) -> str:
    """Derive a filesystem-friendly name from a URL.

    Only the host (netloc) and path are used; the scheme, query string and
    fragment are discarded. Every character that is neither a word character
    nor a hyphen becomes an underscore, runs of underscores are collapsed to
    one, and leading/trailing underscores are trimmed.
    """
    pieces = urlparse(url)
    raw_name = pieces.netloc + pieces.path
    # First substitute unsafe characters, then squeeze repeated underscores.
    squeezed = re.sub(r'_+', '_', re.sub(r'[^\w\-]', '_', raw_name))
    return squeezed.strip('_')

@dataclass
class CrawlResult:
    """Outcome of crawling a single URL.

    Attributes are documented inline. ``timestamp`` is filled in
    automatically with the current local time (ISO 8601) when it is not
    supplied or is empty.
    """
    url: str  # The URL that was crawled
    status: str  # 'success' or 'failed'
    markdown: Optional[str] = None  # Converted content (set on success)
    error: Optional[str] = None  # Error description (set on failure)
    timestamp: Optional[str] = None  # ISO timestamp; defaulted in __post_init__
    config: Optional[Dict] = None  # Selector settings used for the request

    def __post_init__(self):
        # Any falsy timestamp (None or "") is replaced with "now".
        if not self.timestamp:
            self.timestamp = datetime.now().isoformat()

@dataclass
class CrawlConfig:
    """Settings that control how URLs are crawled and where output goes."""
    target_selector: Optional[str] = None  # Optional - specific element to target
    remove_selectors: Optional[List[str]] = None  # Optional - elements to remove
    remove_selectors_regex: Optional[List[str]] = None  # Optional - regex patterns for removal
    max_concurrent_requests: int = 1  # Default to single thread
    request_delay: float = 0.5  # Delay between requests
    timeout: int = 30  # Request timeout
    output_dir: Path = Path("spiderforce_reports")  # Default to spiderforce_reports in current directory
    webhook_url: Optional[str] = None  # Optional webhook endpoint
    webhook_timeout: int = 10  # Webhook timeout
    report_file: Optional[Path] = None  # Optional report file location

    def __post_init__(self):
        """Normalise selector lists and paths; create the output directory."""
        # Missing (or empty) selector collections become fresh empty lists.
        if not self.remove_selectors:
            self.remove_selectors = []
        if not self.remove_selectors_regex:
            self.remove_selectors_regex = []

        # Accept str or Path for output_dir and make sure it exists on disk.
        out_dir = Path(self.output_dir)
        out_dir.mkdir(parents=True, exist_ok=True)
        self.output_dir = out_dir

        # Default the report location to a file inside the output directory.
        self.report_file = (
            out_dir / "crawl_report.json"
            if self.report_file is None
            else Path(self.report_file)
        )

    def to_dict(self) -> Dict:
        """Return the selector options for the API payload, omitting unset ones."""
        selector_options = {
            "target_selector": self.target_selector,
            "remove_selectors": self.remove_selectors,
            "remove_selectors_regex": self.remove_selectors_regex,
        }
        # Only truthy options are sent to the service.
        return {key: value for key, value in selector_options.items() if value}

# Module level function for multiprocessing
def _process_url_parallel(args: Tuple[str, str, CrawlConfig]) -> CrawlResult:
    """Convert one URL via a blocking request; used as the worker for Pool.

    ``args`` is a ``(url, base_url, config)`` tuple so the function can be
    handed directly to ``Pool.imap_unordered``. Any exception is captured
    and reported as a failed ``CrawlResult`` rather than propagated.
    """
    url, base_url, config = args
    try:
        convert_endpoint = f"{base_url}/convert"
        request_body = {"url": url, **config.to_dict()}
        response = requests.post(convert_endpoint, json=request_body, timeout=config.timeout)

        if response.status_code == 200:
            markdown = response.text

            # Persist the markdown when an output directory is configured.
            if config.output_dir:
                destination = config.output_dir / f"{slugify(url)}.md"
                with open(destination, 'w', encoding='utf-8') as handle:
                    handle.write(markdown)

            # Throttle this worker after a successful conversion.
            if config.request_delay:
                time.sleep(config.request_delay)

            outcome = CrawlResult(
                url=url,
                status="success",
                markdown=markdown,
                config=config.to_dict(),
            )
        else:
            outcome = CrawlResult(
                url=url,
                status="failed",
                error=f"HTTP {response.status_code}: {response.text}",
                config=config.to_dict(),
            )
        return outcome

    except Exception as e:
        return CrawlResult(
            url=url,
            status="failed",
            error=str(e),
            config=config.to_dict(),
        )

class SpiderForce4AI:
    """Client for the SpiderForce4AI conversion service.

    Offers three ways to crawl: asyncio coroutines (``*_async``), blocking
    wrappers around those coroutines, and a multiprocessing-based sitemap
    crawl. Results of the asyncio-based crawls accumulate in
    ``self.crawl_results`` for the lifetime of the instance.
    """

    def __init__(self, base_url: str):
        """Store the service base URL (trailing slashes stripped)."""
        self.base_url = base_url.rstrip('/')
        self.session = None  # aiohttp session, created lazily
        self._executor = ThreadPoolExecutor()
        self.crawl_results: List[CrawlResult] = []

    async def _ensure_session(self):
        """Ensure aiohttp session exists."""
        if self.session is None or self.session.closed:
            self.session = aiohttp.ClientSession()

    async def _close_session(self):
        """Close aiohttp session."""
        if self.session and not self.session.closed:
            await self.session.close()

    def _run_sync(self, coro_fn, *args):
        """Run ``coro_fn(*args)`` in a fresh event loop and return its result.

        The aiohttp session is bound to the loop it was created in, and
        ``asyncio.run`` closes that loop on return. The session is therefore
        closed before the loop goes away so it is neither leaked nor reused
        on a dead loop by a later synchronous call.
        """
        async def runner():
            try:
                return await coro_fn(*args)
            finally:
                await self._close_session()

        return asyncio.run(runner())

    @staticmethod
    def _parse_sitemap_urls(sitemap_text: str) -> List[str]:
        """Extract the ``<loc>`` URLs from sitemap XML.

        Only ``loc`` elements in the same namespace as the root element are
        returned; a sitemap without any namespace is supported as well.
        Surrounding whitespace is stripped and empty entries are skipped.

        Raises:
            xml.etree.ElementTree.ParseError: if the text is not valid XML.
        """
        root = ET.fromstring(sitemap_text)
        if root.tag.startswith('{'):
            # Tag looks like "{namespace-uri}urlset"; reuse the URI for <loc>.
            namespace_uri = root.tag[1:].split('}', 1)[0]
            loc_tag = f"{{{namespace_uri}}}loc"
        else:
            loc_tag = "loc"
        return [
            element.text.strip()
            for element in root.iter(loc_tag)
            if element.text and element.text.strip()
        ]

    @staticmethod
    def _build_report(results: List[CrawlResult], config: CrawlConfig) -> Dict:
        """Assemble the JSON-serialisable crawl report for ``results``."""
        successful = [asdict(r) for r in results if r.status == "success"]
        failed = [asdict(r) for r in results if r.status == "failed"]
        return {
            "timestamp": datetime.now().isoformat(),
            "config": config.to_dict(),
            "results": {
                "successful": successful,
                "failed": failed
            },
            "summary": {
                "total": len(results),
                "successful": len(successful),
                "failed": len(failed)
            }
        }

    async def _save_markdown(self, url: str, markdown: str, output_dir: Path):
        """Write ``markdown`` to ``<output_dir>/<slugified url>.md``; return the path."""
        filename = f"{slugify(url)}.md"
        filepath = output_dir / filename
        async with aiofiles.open(filepath, 'w', encoding='utf-8') as f:
            await f.write(markdown)
        return filepath

    async def _send_webhook(self, result: CrawlResult, config: CrawlConfig):
        """POST ``result`` to ``config.webhook_url`` if one is configured.

        Delivery is best-effort: any failure is printed as a warning and
        never raised to the caller.
        """
        if not config.webhook_url:
            return

        payload = {
            "url": result.url,
            "status": result.status,
            "markdown": result.markdown if result.status == "success" else None,
            "error": result.error if result.status == "failed" else None,
            "timestamp": result.timestamp,
            "config": config.to_dict()
        }

        try:
            async with httpx.AsyncClient() as client:
                response = await client.post(
                    config.webhook_url,
                    json=payload,
                    timeout=config.webhook_timeout
                )
                response.raise_for_status()
        except Exception as e:
            console.print(f"[yellow]Warning: Failed to send webhook for {result.url}: {str(e)}[/yellow]")

    def _save_report_sync(self, results: List[CrawlResult], config: CrawlConfig) -> None:
        """Write a report for ``results`` to ``config.report_file`` (blocking)."""
        report = self._build_report(results, config)
        with open(config.report_file, 'w', encoding='utf-8') as f:
            json.dump(report, f, indent=2)

    async def _save_report(self, config: CrawlConfig):
        """Write a report of all accumulated ``self.crawl_results`` to JSON.

        Does nothing when ``config.report_file`` is not set.
        """
        if not config.report_file:
            return

        report = self._build_report(self.crawl_results, config)
        async with aiofiles.open(config.report_file, 'w', encoding='utf-8') as f:
            await f.write(json.dumps(report, indent=2))

    async def crawl_url_async(self, url: str, config: CrawlConfig) -> CrawlResult:
        """Crawl a single URL asynchronously.

        Posts the URL and selector options to ``<base_url>/convert``. On
        HTTP 200 the response body is the markdown, which is saved under
        ``config.output_dir``. Any other status or any exception yields a
        failed result instead of raising. The webhook (if configured) is
        notified for both successful and failed results, and the result is
        appended to ``self.crawl_results``.
        """
        await self._ensure_session()

        try:
            endpoint = f"{self.base_url}/convert"
            payload = {
                "url": url,
                **config.to_dict()
            }
            # aiohttp expects a ClientTimeout; a bare number is deprecated.
            timeout = aiohttp.ClientTimeout(total=config.timeout)

            async with self.session.post(endpoint, json=payload, timeout=timeout) as response:
                body = await response.text()
                if response.status != 200:
                    result = CrawlResult(
                        url=url,
                        status="failed",
                        error=f"HTTP {response.status}: {body}",
                        config=config.to_dict()
                    )
                else:
                    result = CrawlResult(
                        url=url,
                        status="success",
                        markdown=body,
                        config=config.to_dict()
                    )

                    if config.output_dir:
                        await self._save_markdown(url, body, config.output_dir)

        except Exception as e:
            result = CrawlResult(
                url=url,
                status="failed",
                # Some exceptions (e.g. timeouts) have an empty message.
                error=str(e) or type(e).__name__,
                config=config.to_dict()
            )

        # Notify for every outcome; the webhook payload carries the error
        # text for failed results.
        await self._send_webhook(result, config)
        self.crawl_results.append(result)
        return result

    def crawl_url(self, url: str, config: CrawlConfig) -> CrawlResult:
        """Synchronous version of crawl_url_async."""
        return self._run_sync(self.crawl_url_async, url, config)

    async def crawl_urls_async(self, urls: List[str], config: CrawlConfig) -> List[CrawlResult]:
        """Crawl multiple URLs asynchronously with progress bar.

        At most ``config.max_concurrent_requests`` crawls run at once, and
        each slot waits ``config.request_delay`` seconds after its crawl
        before being released. The report is saved and a summary printed
        when all URLs are done. Results are returned in the order of
        ``urls``.
        """
        await self._ensure_session()

        with Progress(
            SpinnerColumn(),
            TextColumn("[progress.description]{task.description}"),
            BarColumn(),
            TaskProgressColumn(),
            console=console
        ) as progress:
            task = progress.add_task("[cyan]Crawling URLs...", total=len(urls))

            async def crawl_with_progress(url):
                result = await self.crawl_url_async(url, config)
                progress.update(task, advance=1, description=f"[cyan]Crawled: {url}")
                return result

            semaphore = asyncio.Semaphore(config.max_concurrent_requests)

            async def crawl_with_semaphore(url):
                async with semaphore:
                    result = await crawl_with_progress(url)
                    # Hold the slot during the delay so requests are throttled.
                    await asyncio.sleep(config.request_delay)
                    return result

            results = await asyncio.gather(*[crawl_with_semaphore(url) for url in urls])

            # Save final report
            await self._save_report(config)

            # Print summary
            successful = len([r for r in results if r.status == "success"])
            failed = len([r for r in results if r.status == "failed"])
            console.print("\n[green]Crawling completed:[/green]")
            console.print(f"✓ Successful: {successful}")
            console.print(f"✗ Failed: {failed}")

            if config.report_file:
                console.print(f"📊 Report saved to: {config.report_file}")

            return results

    def crawl_urls(self, urls: List[str], config: CrawlConfig) -> List[CrawlResult]:
        """Synchronous version of crawl_urls_async."""
        return self._run_sync(self.crawl_urls_async, urls, config)

    async def crawl_sitemap_async(self, sitemap_url: str, config: CrawlConfig) -> List[CrawlResult]:
        """Crawl URLs from a sitemap asynchronously.

        Raises:
            Exception: re-raises whatever error occurred while fetching
                (including a non-success HTTP status) or parsing the sitemap,
                after printing a message.
        """
        await self._ensure_session()

        try:
            console.print(f"[cyan]Fetching sitemap from {sitemap_url}...[/cyan]")
            timeout = aiohttp.ClientTimeout(total=config.timeout)
            async with self.session.get(sitemap_url, timeout=timeout) as response:
                # Fail on HTTP errors instead of trying to parse an error page.
                response.raise_for_status()
                sitemap_text = await response.text()
        except Exception as e:
            console.print(f"[red]Error fetching sitemap: {str(e)}[/red]")
            raise

        try:
            urls = self._parse_sitemap_urls(sitemap_text)
            console.print(f"[green]Found {len(urls)} URLs in sitemap[/green]")
        except Exception as e:
            console.print(f"[red]Error parsing sitemap: {str(e)}[/red]")
            raise

        return await self.crawl_urls_async(urls, config)

    def crawl_sitemap(self, sitemap_url: str, config: CrawlConfig) -> List[CrawlResult]:
        """Synchronous version of crawl_sitemap_async."""
        return self._run_sync(self.crawl_sitemap_async, sitemap_url, config)

    def crawl_sitemap_parallel(self, sitemap_url: str, config: CrawlConfig) -> List[CrawlResult]:
        """Crawl sitemap URLs in parallel using multiprocessing (no asyncio required).

        Uses a process pool of ``config.max_concurrent_requests`` workers.
        Results are collected in completion order, not sitemap order, and
        are not added to ``self.crawl_results``.

        Raises:
            Exception: re-raises whatever error occurred while fetching or
                parsing the sitemap, after printing a message.
        """
        print(f"Fetching sitemap from {sitemap_url}...")

        # Fetch sitemap
        try:
            response = requests.get(sitemap_url, timeout=config.timeout)
            response.raise_for_status()
            sitemap_text = response.text
        except Exception as e:
            print(f"Error fetching sitemap: {str(e)}")
            raise

        # Parse sitemap
        try:
            urls = self._parse_sitemap_urls(sitemap_text)
            print(f"Found {len(urls)} URLs in sitemap")
        except Exception as e:
            print(f"Error parsing sitemap: {str(e)}")
            raise

        # Prepare arguments for parallel processing
        process_args = [(url, self.base_url, config) for url in urls]

        # Create process pool and execute crawls
        results = []

        with Pool(processes=config.max_concurrent_requests) as pool:
            with Progress(
                SpinnerColumn(),
                TextColumn("[progress.description]{task.description}"),
                BarColumn(),
                TextColumn("[progress.percentage]{task.percentage:>3.0f}%"),
                TextColumn("({task.completed}/{task.total})"),
            ) as progress:
                task = progress.add_task("Crawling URLs...", total=len(urls))

                for result in pool.imap_unordered(_process_url_parallel, process_args):
                    results.append(result)
                    status = "✓" if result.status == "success" else "✗"
                    # The description belongs to the task, so it must be set
                    # through update() to actually show up in the display.
                    progress.update(
                        task,
                        advance=1,
                        description=f"Last: {status} {result.url}"
                    )

        # Save final report
        if config.report_file:
            self._save_report_sync(results, config)
            print(f"\nReport saved to: {config.report_file}")

        # Print summary
        successful = len([r for r in results if r.status == "success"])
        failed = len([r for r in results if r.status == "failed"])
        print("\nCrawling completed:")
        print(f"✓ Successful: {successful}")
        print(f"✗ Failed: {failed}")

        return results

    async def __aenter__(self):
        """Async context manager entry."""
        await self._ensure_session()
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """Async context manager exit."""
        await self._close_session()

    def __enter__(self):
        """Sync context manager entry."""
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        """Sync context manager exit."""
        self._executor.shutdown(wait=True)