"""
LangChain tools for Olostep web scraping API.

Comprehensive tools matching all Olostep API capabilities:
- Scrapes: Extract content from single URLs
- Batches: Scrape multiple URLs in parallel
- Crawls: Autonomously discover and scrape entire websites
- Maps: Extract all URLs from a website
- Answers: AI-powered web search and question answering
"""

import os
import time
from typing import Literal, List, Dict, Any, Optional, Union
from langchain_core.tools import tool
from langchain_core.exceptions import LangChainException
import requests
import json


class OlostepClient:
    """Thin HTTP client for the Olostep REST API (``/v1`` endpoints).

    Each public method sends a single JSON request authenticated with a
    bearer token and returns the decoded JSON response body. Transport
    errors, non-2xx statuses and undecodable response bodies are re-raised
    as ``LangChainException`` with the underlying error attached as
    ``__cause__``.
    """

    def __init__(self, api_key: Optional[str] = None):
        """Store the API key and prepare the default request headers.

        Args:
            api_key: Olostep API key. When omitted or empty, the
                ``OLOSTEP_API_KEY`` environment variable is used instead.

        Raises:
            ValueError: If no key is passed and the environment variable is
                unset or empty.
        """
        self.api_key = api_key or os.getenv("OLOSTEP_API_KEY")
        if not self.api_key:
            raise ValueError("OLOSTEP_API_KEY environment variable is required")
        self.base_url = "https://api.olostep.com/v1"
        self.headers = {
            "Authorization": f"Bearer {self.api_key}",
            "Content-Type": "application/json",
            "Accept": "application/json"
        }

    @staticmethod
    def _with_options(payload: Dict[str, Any], **options: Any) -> Dict[str, Any]:
        """Add every truthy option to ``payload`` (in place) and return it.

        Falsy values (``None``, ``0``, ``""``, ``False``, empty lists) are
        skipped so that unset optional parameters are not sent to the API.
        """
        payload.update({key: value for key, value in options.items() if value})
        return payload

    def _request(
        self,
        method: str,
        path: str,
        failure: str,
        *,
        payload: Optional[Dict[str, Any]] = None,
        timeout: int = 120
    ) -> Dict[str, Any]:
        """Send one request to ``base_url + path`` and return the decoded JSON.

        Args:
            method: HTTP verb, e.g. ``"POST"`` or ``"GET"``.
            path: Endpoint path starting with ``/`` (appended to ``base_url``).
            failure: Prefix of the error message used when the call fails.
            payload: JSON body to send; ``None`` sends no body.
            timeout: Timeout in seconds handed to ``requests``.

        Raises:
            LangChainException: On any ``requests`` error, a non-2xx status,
                or a response body that is not valid JSON.
        """
        try:
            response = requests.request(
                method,
                f"{self.base_url}{path}",
                headers=self.headers,
                json=payload,
                timeout=timeout
            )
            response.raise_for_status()
            return response.json()
        # ValueError covers JSON decoding failures on requests versions whose
        # decode error does not derive from RequestException.
        except (requests.exceptions.RequestException, ValueError) as e:
            raise LangChainException(f"{failure}: {e}") from e

    def scrape(
        self,
        url: str,
        formats: Optional[List[str]] = None,
        country: Optional[str] = None,
        wait_before_scraping: int = 0,
        parser_id: Optional[str] = None
    ) -> Dict[str, Any]:
        """Scrape a single URL using the /v1/scrapes endpoint.

        Args:
            url: Page to scrape (sent as ``url_to_scrape``).
            formats: Requested output formats; defaults to ``["markdown"]``.
            country: Optional country code; omitted from the request when empty.
            wait_before_scraping: Wait time forwarded to the API; omitted when 0.
            parser_id: Optional parser identifier, sent as ``{"id": parser_id}``.

        Returns:
            The decoded JSON response.

        Raises:
            LangChainException: If the request fails.
        """
        payload = self._with_options(
            {"url_to_scrape": url, "formats": formats or ["markdown"]},
            country=country,
            wait_before_scraping=wait_before_scraping,
            parser={"id": parser_id} if parser_id else None,
        )
        return self._request(
            "POST", "/scrapes", "Olostep scrape failed", payload=payload, timeout=60
        )

    def batch_scrape(
        self,
        urls: List[Dict[str, str]],
        formats: Optional[List[str]] = None,
        country: Optional[str] = None,
        wait_before_scraping: int = 0,
        parser_id: Optional[str] = None
    ) -> Dict[str, Any]:
        """Scrape multiple URLs using the /v1/batches endpoint.

        Args:
            urls: Batch items, sent unchanged under the ``items`` key.
            formats: Optional output formats; omitted when empty.
            country: Optional country code; omitted when empty.
            wait_before_scraping: Wait time forwarded to the API; omitted when 0.
            parser_id: Optional parser identifier, sent as ``{"id": parser_id}``.

        Returns:
            The decoded JSON response describing the batch job.

        Raises:
            LangChainException: If the request fails.
        """
        payload = self._with_options(
            {"items": urls},
            formats=formats,
            country=country,
            wait_before_scraping=wait_before_scraping,
            parser={"id": parser_id} if parser_id else None,
        )
        return self._request(
            "POST", "/batches", "Olostep batch scrape failed", payload=payload
        )

    def answer(
        self,
        task: str,
        json_schema: Optional[Union[Dict, str]] = None
    ) -> Dict[str, Any]:
        """
        Search the web and get AI-powered answers using /v1/answers endpoint.

        Args:
            task: Question or task to search for
            json_schema: Optional JSON schema or string describing desired
                output format; sent under the ``json`` key when provided

        Returns:
            The decoded JSON response (answer with sources)

        Raises:
            LangChainException: If the request fails.
        """
        payload = self._with_options({"task": task}, json=json_schema)
        return self._request(
            "POST", "/answers", "Olostep answer failed", payload=payload
        )

    def create_map(
        self,
        url: str,
        search_query: Optional[str] = None,
        top_n: Optional[int] = None,
        include_urls: Optional[List[str]] = None,
        exclude_urls: Optional[List[str]] = None
    ) -> Dict[str, Any]:
        """
        Get all URLs from a website using /v1/maps endpoint.

        Args:
            url: Website URL to extract URLs from
            search_query: Optional search query to filter URLs
            top_n: Limit number of URLs returned (omitted when None or 0)
            include_urls: Glob patterns to include (e.g., ["/blog/**"])
            exclude_urls: Glob patterns to exclude (e.g., ["/admin/**"])

        Returns:
            The decoded JSON response containing the URLs found

        Raises:
            LangChainException: If the request fails.
        """
        payload = self._with_options(
            {"url": url},
            search_query=search_query,
            top_n=top_n,
            include_urls=include_urls,
            exclude_urls=exclude_urls,
        )
        return self._request(
            "POST", "/maps", "Olostep map creation failed", payload=payload
        )

    def start_crawl(
        self,
        start_url: str,
        max_pages: int = 100,
        include_urls: Optional[List[str]] = None,
        exclude_urls: Optional[List[str]] = None,
        max_depth: Optional[int] = None,
        include_external: bool = False
    ) -> Dict[str, Any]:
        """
        Start crawling a website using /v1/crawls endpoint.

        Args:
            start_url: Starting URL for the crawl
            max_pages: Maximum number of pages to crawl
            include_urls: Glob patterns to include
            exclude_urls: Glob patterns to exclude
            max_depth: Maximum depth to crawl (sent whenever it is not None,
                so an explicit 0 is forwarded)
            include_external: Include external URLs (sent only when True)

        Returns:
            The decoded JSON response with crawl job information

        Raises:
            LangChainException: If the request fails.
        """
        payload = self._with_options(
            {"start_url": start_url, "max_pages": max_pages},
            include_urls=include_urls,
            exclude_urls=exclude_urls,
        )
        # max_depth is tested against None rather than truthiness so that a
        # depth of 0 still reaches the API.
        if max_depth is not None:
            payload["max_depth"] = max_depth
        if include_external:
            payload["include_external"] = include_external
        return self._request(
            "POST", "/crawls", "Olostep crawl failed", payload=payload
        )

    def get_crawl_status(self, crawl_id: str) -> Dict[str, Any]:
        """Get status of a crawl job via GET /v1/crawls/{crawl_id}.

        Args:
            crawl_id: Identifier of the crawl job to look up.

        Returns:
            The decoded JSON response.

        Raises:
            LangChainException: If the request fails.
        """
        return self._request(
            "GET", f"/crawls/{crawl_id}", "Failed to get crawl status", timeout=30
        )


# LangChain Tools

@tool
def scrape_website(
    url: str,
    format: Literal["markdown", "html", "json", "text"] = "markdown",
    country: Optional[str] = None,
    wait_before_scraping: int = 0,
    parser: Optional[str] = None,
    api_key: Optional[str] = None
) -> str:
    """
    Scrape content from any website using Olostep's /v1/scrapes endpoint.

    Extract content in HTML, Markdown, JSON, or text format. Handles JavaScript
    rendering, anti-scraping measures, and supports specialized parsers for
    specific websites (Amazon, LinkedIn, Google Search, etc.).

    Args:
        url: Website URL to scrape (must include http:// or https://)
        format: Output format - markdown, html, json, or text
        country: Country code for location-specific content (e.g., 'US', 'GB', 'CA')
        wait_before_scraping: Wait time in milliseconds for JavaScript rendering (0-10000)
        parser: Optional parser ID for specialized extraction (e.g., '@olostep/amazon-product')
        api_key: Olostep API key (uses OLOSTEP_API_KEY env var if not provided)

    Returns:
        Scraped content in requested format with metadata

    Examples:
        >>> # Basic scraping
        >>> content = scrape_website("https://example.com")

        >>> # With JavaScript wait time
        >>> content = scrape_website("https://example.com", wait_before_scraping=2000)

        >>> # With specialized parser
        >>> content = scrape_website(
        ...     "https://amazon.com/product/123",
        ...     parser="@olostep/amazon-product"
        ... )
    """
    # NOTE: `@tool` publishes the docstring above as the tool's description,
    # so its wording is part of the runtime behaviour, not just documentation.
    try:
        api_response = OlostepClient(api_key).scrape(
            url=url,
            formats=[format],
            country=country,
            wait_before_scraping=wait_before_scraping,
            parser_id=parser,
        )

        # Everything of interest lives under the "result" key.
        scraped = api_response.get("result", {})

        # The payload for the requested format is stored as "<format>_content";
        # a dict payload is flattened to a pretty-printed JSON string.
        body = scraped.get(f"{format}_content", "")
        if isinstance(body, dict):
            body = json.dumps(body, indent=2)

        summary = {
            "content": body,
            "url": url,
            "format": format,
            "scrape_id": api_response.get("retrieve_id", ""),
            "metadata": scraped.get("page_metadata", {}),
        }

        # Surface the hosted copy of the content when the API reports one.
        hosted_key = f"{format}_hosted_url"
        if hosted_key in scraped:
            summary[f"{format}_url"] = scraped[hosted_key]

        return json.dumps(summary, indent=2)

    except Exception as e:
        # Any failure (missing key, HTTP error, malformed response) is
        # reported to the caller as a LangChainException.
        raise LangChainException(f"Failed to scrape {url}: {str(e)}")


@tool
def scrape_batch(
    urls: List[str],
    format: Literal["markdown", "html", "json", "text"] = "markdown",
    country: Optional[str] = None,
    wait_before_scraping: int = 0,
    parser: Optional[str] = None,
    api_key: Optional[str] = None
) -> str:
    """
    Scrape multiple websites in parallel using Olostep's /v1/batches endpoint.

    Process up to 10,000 URLs at once. Batch jobs typically complete in 5-8 minutes
    regardless of batch size. Perfect for large-scale data extraction, competitor
    analysis, or building comprehensive datasets.

    Args:
        urls: List of website URLs to scrape
        format: Output format for all URLs - markdown, html, json, or text
        country: Country code for location-specific content
        wait_before_scraping: Wait time in milliseconds for JavaScript rendering
        parser: Optional parser ID for specialized extraction
        api_key: Olostep API key (uses OLOSTEP_API_KEY env var if not provided)

    Returns:
        Batch job information with status and batch ID

    Examples:
        >>> # Scrape multiple websites
        >>> urls = [
        ...     "https://example1.com",
        ...     "https://example2.com",
        ...     "https://example3.com"
        ... ]
        >>> result = scrape_batch(urls)

        >>> # With custom options
        >>> result = scrape_batch(
        ...     urls,
        ...     format="json",
        ...     country="US",
        ...     wait_before_scraping=2000
        ... )
    """
    # NOTE: `@tool` publishes the docstring above as the tool's description,
    # so its wording is part of the runtime behaviour, not just documentation.
    try:
        client = OlostepClient(api_key)

        # Each URL becomes one batch item tagged with a positional custom_id.
        items = []
        for position, target in enumerate(urls):
            items.append({"url": target, "custom_id": f"url_{position}"})

        job = client.batch_scrape(
            urls=items,
            formats=[format],
            country=country,
            wait_before_scraping=wait_before_scraping,
            parser_id=parser,
        )

        summary = {
            # The identifier is looked up under "batch_id" first, then "id".
            "batch_id": job.get("batch_id", job.get("id", "")),
            "status": job.get("status", "in_progress"),
            "total_urls": len(urls),
            "format": format,
            "urls": urls,
        }

        # Echo back the optional settings only when they were supplied.
        for field, value in (("country", country), ("parser", parser)):
            if value:
                summary[field] = value

        return json.dumps(summary, indent=2)

    except Exception as e:
        # Any failure is reported to the caller as a LangChainException.
        raise LangChainException(f"Failed to scrape batch: {str(e)}")


@tool
def answer_question(
    task: str,
    json_schema: Optional[Union[Dict, str]] = None,
    api_key: Optional[str] = None
) -> str:
    """
    Search the web and get AI-powered answers using Olostep's /v1/answers endpoint.

    Ground your AI products on real-world data and facts. Ask a question or provide
    a data point to enrich, and get back structured answers with sources. Perfect
    for data enrichment, research, and building AI applications that need current
    web information.

    Args:
        task: Question or task to search for (e.g., "What is the latest book by J.K. Rowling?")
        json_schema: Optional JSON schema dict or string describing desired output format
                     Example: {"book_title": "", "author": "", "release_date": ""}
        api_key: Olostep API key (uses OLOSTEP_API_KEY env var if not provided)

    Returns:
        AI-generated answer in requested JSON format with sources

    Examples:
        >>> # Simple question
        >>> result = answer_question("What is the capital of France?")

        >>> # With JSON schema
        >>> result = answer_question(
        ...     "What is the latest book by J.K. Rowling?",
        ...     json_schema={"book_title": "", "author": "", "release_date": ""}
        ... )

        >>> # Data enrichment
        >>> result = answer_question(
        ...     "Find the CEO and headquarters of Stripe",
        ...     json_schema={"ceo_name": "", "headquarters": "", "founded_year": ""}
        ... )

        >>> # Returns NOT_FOUND for uncertain data
        >>> result = answer_question(
        ...     "How much did Olostep raise?",
        ...     json_schema={"amount": "", "investors": ""}
        ... )
    """
    # NOTE: `@tool` publishes the docstring above as the tool's description,
    # so its wording is part of the runtime behaviour, not just documentation.
    try:
        api_response = OlostepClient(api_key).answer(
            task=task,
            json_schema=json_schema,
        )

        details = api_response.get("result", {})
        answer = details.get("json_content", "")

        # A non-empty string answer is decoded as JSON when possible;
        # otherwise the raw string is passed through untouched.
        if answer and isinstance(answer, str):
            try:
                decoded = json.loads(answer)
            except json.JSONDecodeError:
                decoded = answer
            answer = decoded

        summary = {
            "answer": answer,
            "task": task,
            "sources": details.get("sources", []),
            "answer_id": api_response.get("id", ""),
        }
        return json.dumps(summary, indent=2)

    except Exception as e:
        # Any failure is reported to the caller as a LangChainException.
        raise LangChainException(f"Failed to get answer for task '{task}': {str(e)}")


@tool
def extract_urls(
    url: str,
    search_query: Optional[str] = None,
    top_n: Optional[int] = None,
    include_urls: Optional[List[str]] = None,
    exclude_urls: Optional[List[str]] = None,
    api_key: Optional[str] = None
) -> str:
    """
    Extract all URLs from a website using Olostep's /v1/maps endpoint.

    Discover all pages on a website for content discovery and site structure analysis.
    Perfect for preparing URLs for batch processing, SEO audits, or understanding
    website architecture. Can extract up to ~100,000 URLs in a single call.

    Args:
        url: Website URL to extract URLs from
        search_query: Optional search query to filter URLs (e.g., "blog")
        top_n: Limit the number of URLs returned
        include_urls: Glob patterns to include (e.g., ["/blog/**", "/product/**"])
        exclude_urls: Glob patterns to exclude (e.g., ["/admin/**", "/private/**"])
        api_key: Olostep API key (uses OLOSTEP_API_KEY env var if not provided)

    Returns:
        List of URLs found on the website

    Examples:
        >>> # Get all URLs from a website
        >>> result = extract_urls("https://example.com")

        >>> # Get only blog URLs
        >>> result = extract_urls(
        ...     "https://example.com",
        ...     include_urls=["/blog/**"]
        ... )

        >>> # Get product URLs, exclude admin
        >>> result = extract_urls(
        ...     "https://store.com",
        ...     include_urls=["/product/**"],
        ...     exclude_urls=["/admin/**"],
        ...     top_n=100
        ... )
    """
    # NOTE: `@tool` publishes the docstring above as the tool's description,
    # so its wording is part of the runtime behaviour, not just documentation.
    try:
        site_map = OlostepClient(api_key).create_map(
            url=url,
            search_query=search_query,
            top_n=top_n,
            include_urls=include_urls,
            exclude_urls=exclude_urls,
        )

        discovered = site_map.get("urls", [])

        summary = {
            # The identifier is looked up under "id" first, then "map_id".
            "map_id": site_map.get("id", site_map.get("map_id", "")),
            "url": url,
            "total_urls": len(discovered),
            "urls": discovered,
        }

        # Echo back the filters only when they were supplied.
        for field, value in (("search_query", search_query), ("top_n", top_n)):
            if value:
                summary[field] = value

        return json.dumps(summary, indent=2)

    except Exception as e:
        # Any failure is reported to the caller as a LangChainException.
        raise LangChainException(f"Failed to extract URLs from {url}: {str(e)}")


@tool
def crawl_website(
    start_url: str,
    max_pages: int = 100,
    include_urls: Optional[List[str]] = None,
    exclude_urls: Optional[List[str]] = None,
    max_depth: Optional[int] = None,
    include_external: bool = False,
    api_key: Optional[str] = None
) -> str:
    """
    Autonomously crawl and scrape entire websites using Olostep's /v1/crawls endpoint.

    Perfect for scraping documentation sites, blogs, or any website where you want
    to discover and extract content from all pages automatically. The crawler
    follows links and respects your filters.

    Args:
        start_url: Starting URL for the crawl
        max_pages: Maximum number of pages to crawl (default: 100)
        include_urls: Glob patterns to include (e.g., ["/**"] for all)
        exclude_urls: Glob patterns to exclude (e.g., ["/admin/**"])
        max_depth: Maximum depth to crawl from start_url
        include_external: Include external URLs (default: False)
        api_key: Olostep API key (uses OLOSTEP_API_KEY env var if not provided)

    Returns:
        Crawl job information with status and crawl ID

    Examples:
        >>> # Crawl entire website
        >>> result = crawl_website("https://docs.example.com")

        >>> # Crawl with filters
        >>> result = crawl_website(
        ...     "https://example.com",
        ...     max_pages=200,
        ...     include_urls=["/**"],
        ...     exclude_urls=["/admin/**", "/private/**"]
        ... )

        >>> # Limited depth crawl
        >>> result = crawl_website(
        ...     "https://blog.example.com",
        ...     max_pages=50,
        ...     max_depth=3
        ... )
    """
    # NOTE: `@tool` publishes the docstring above as the tool's description,
    # so its wording is part of the runtime behaviour, not just documentation.
    try:
        job = OlostepClient(api_key).start_crawl(
            start_url=start_url,
            max_pages=max_pages,
            include_urls=include_urls,
            exclude_urls=exclude_urls,
            max_depth=max_depth,
            include_external=include_external,
        )

        summary = {
            "crawl_id": job.get("id", ""),
            "status": job.get("status", "in_progress"),
            "start_url": start_url,
            "max_pages": max_pages,
            "pages_crawled": job.get("pages_crawled", 0),
        }

        # A depth of 0 is still reported; only an unset depth is left out.
        if max_depth is not None:
            summary["max_depth"] = max_depth
        # The external-links flag is reported only when it was switched on.
        if include_external:
            summary["include_external"] = include_external

        return json.dumps(summary, indent=2)

    except Exception as e:
        # Any failure is reported to the caller as a LangChainException.
        raise LangChainException(f"Failed to start crawl for {start_url}: {str(e)}")
