"""
Agent details scraper module.

Optimized scraper for fetching agent details (phone, address, email, website)
from Idealista with caching and credit optimization.

Credit optimization:
- Previous implementation: 4-7 calls x 25 credits = 100-175 credits per agent
- Optimized implementation: 0-3 calls x 1 credit = 0-3 credits per agent
  (cache hit: 0 calls; cache miss: 1 page fetch + up to 2 phone endpoint attempts)
"""

import json
import re
import asyncio
from typing import Dict, Optional, List, TYPE_CHECKING

from scrapfly import ScrapflyClient, ScrapeConfig
from bs4 import BeautifulSoup

from ..cache.agent_cache import AgentDetailsCache
from ..client.tiers import (
    get_config_for_tier,
    TIER_CACHED,
    BASE_HEADERS,
)
from ..utils.countries import CountryConfig, get_country

if TYPE_CHECKING:
    from ..utils.countries import CountryConfig


class AgentDetailsScraper:
    """
    Optimized scraper for agent details from Idealista.

    Credit usage per agent (assuming the CACHED tier costs 1 credit per call,
    as configured in ``client.tiers``):
    - With cache hit: 0 credits
    - Without cache: 1 page fetch + 1-2 phone endpoint attempts (the second
      phone endpoint is only tried when the first yields no phone numbers)

    Previous implementation used 4-7 calls x 25 credits = 100-175 credits per agent.

    Example:
        >>> from scrapfly import ScrapflyClient
        >>> client = ScrapflyClient(key="your_api_key")
        >>> scraper = AgentDetailsScraper(client)
        >>> details = await scraper.get_agent_details("12345", "https://idealista.com/pro/agent/")
    """

    # Email matcher. The TLD class is [A-Za-z]: a "|" inside a character class
    # is a literal pipe, not alternation, so it must not appear there.
    _EMAIL_RE = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')

    # Asset names such as "logo@2x.png" look like emails to the regex above.
    _ASSET_SUFFIXES = ('.png', '.jpg', '.jpeg', '.gif', '.svg', '.webp')

    # Substrings that mark an address as not belonging to the agent.
    _EMAIL_BLOCKLIST = ('idealista', 'example')

    def __init__(
        self,
        scrapfly_client: "ScrapflyClient",
        cache_ttl_hours: int = 24,
        country: "str | CountryConfig | None" = None,
    ):
        """
        Initialize the agent details scraper.

        Args:
            scrapfly_client: Configured Scrapfly client
            cache_ttl_hours: Cache TTL in hours (default 24)
            country: Country name or CountryConfig. Anything that is not a
                    CountryConfig (including None) is resolved via get_country().
        """
        self.scrapfly = scrapfly_client
        self.cache = AgentDetailsCache(ttl_hours=cache_ttl_hours)

        # Set up country configuration
        if isinstance(country, CountryConfig):
            self.country = country
        else:
            self.country = get_country(country)

        # Headers for AJAX phone requests
        self.ajax_headers = {
            **BASE_HEADERS,
            "accept": "application/json, text/javascript, */*; q=0.01",
            "content-type": "application/json",
            "x-requested-with": "XMLHttpRequest",
            "sec-fetch-dest": "empty",
            "sec-fetch-mode": "cors",
            "sec-fetch-site": "same-origin",
        }

    async def _fetch_phone_numbers(self, agent_id: str, referer_url: str) -> list[str]:
        """
        Fetch phone numbers for an agent from the AJAX endpoints.

        Endpoints are tried in priority order and the first one that returns a
        non-empty ``phones`` list wins. Any failure (request error, empty body,
        invalid JSON, unexpected payload shape) moves on to the next endpoint.

        Args:
            agent_id: Agent ID
            referer_url: URL to use as referer header

        Returns:
            List of phone numbers, or an empty list if no endpoint produced any
        """
        # Priority-ordered endpoints - stop on first success
        endpoints = [
            f"{self.country.domain}/ajax/offices/{agent_id}/contact-phones",
            f"{self.country.domain}/{self.country.language}/ajax/offices/{agent_id}/contact-phones",
        ]

        headers = self.ajax_headers.copy()
        headers["referer"] = referer_url

        tier_config = get_config_for_tier(TIER_CACHED)

        for endpoint in endpoints:
            try:
                result = await self.scrapfly.async_scrape(ScrapeConfig(
                    url=endpoint,
                    headers=headers,
                    **tier_config
                ))

                if not result.content:
                    continue

                try:
                    payload = json.loads(result.content)
                except json.JSONDecodeError:
                    continue

                # Only a JSON object carrying a non-empty list counts as success;
                # anything else (array, string, null, ...) is treated as "no data".
                phones = payload.get('phones') if isinstance(payload, dict) else None
                if isinstance(phones, list) and phones:
                    return phones
            except Exception as e:
                # Best effort: report and fall through to the next endpoint
                print(f"Phone endpoint {endpoint} failed: {e}")
                continue

        return []

    async def _fetch_page_details(self, agent_url: str) -> dict[str, str]:
        """
        Extract address, website, and email from a single page fetch.

        Does NOT follow external websites; only the agent page itself is
        fetched. Any error is reported via print and results in empty values.

        Args:
            agent_url: URL of the agent's page

        Returns:
            Dictionary with keys "address", "website", and "email" (each an
            empty string when not found or when the fetch failed)
        """
        address = ""
        website = ""
        email = ""

        try:
            tier_config = get_config_for_tier(TIER_CACHED)
            headers = BASE_HEADERS.copy()
            headers["referer"] = f"{self.country.domain}/"

            result = await self.scrapfly.async_scrape(ScrapeConfig(
                url=agent_url,
                headers=headers,
                **tier_config
            ))

            soup = BeautifulSoup(result.content, 'html.parser')

            address = self._extract_address(soup)

            # Website and email come from the same page (no external fetch)
            website, email = self._extract_website_and_email(soup, result.content)

        except Exception as e:
            print(f"Error fetching page details from {agent_url}: {e}")

        return {
            "address": address,
            "website": website,
            "email": email,
        }

    def _extract_address(self, soup: "BeautifulSoup") -> str:
        """
        Extract address from parsed page.

        Args:
            soup: BeautifulSoup parsed page

        Returns:
            Address string or empty string if not found
        """
        # Try location button first (most common pattern)
        location_button = soup.find('button', {'class': ['location', 'showMap', 'icon-location']})
        if location_button:
            location_texts = location_button.find_all('span', {'class': 'location-text'})
            if location_texts:
                return ' '.join(text.get_text(strip=True) for text in location_texts)

        # Fallback selectors
        for selector in ('office-address', 'address', 'location-address'):
            address_div = soup.find('div', {'class': selector})
            if address_div:
                return address_div.get_text(strip=True)

        return ""

    def _extract_website_and_email(
        self, soup: "BeautifulSoup", content: str
    ) -> tuple[str, str]:
        """
        Extract website URL and email from page content.

        Does NOT follow external URLs; only what is present on the page itself
        is used.

        Args:
            soup: BeautifulSoup parsed page
            content: Raw page content

        Returns:
            Tuple of (website_url, email); each is "" when nothing was found
        """
        website = ""
        email = ""

        # Find website link (first selector that matches wins)
        website_link = (
            soup.find('a', {'class': 'icon-new-tab'}) or
            soup.find('a', {'rel': 'nofollow noopener'}) or
            soup.find('a', string=re.compile(r'Go to.*website', re.I)) or
            soup.find('a', {'class': re.compile(r'website|external', re.I)})
        )

        if website_link:
            href = website_link.get('href', '')
            if href and not href.startswith('#'):
                website = href

        # First address on the page that is neither an asset filename
        # (e.g. "logo@2x.PNG") nor a blocklisted address.
        for candidate in self._EMAIL_RE.findall(content or ""):
            lowered = candidate.lower()
            if lowered.endswith(self._ASSET_SUFFIXES):
                continue
            if any(term in lowered for term in self._EMAIL_BLOCKLIST):
                continue
            email = candidate
            break

        return website, email

    async def get_agent_details(self, agent_id: str, agent_url: str) -> dict[str, str]:
        """
        Get all details for an agent with caching.

        API calls:
        - Cache hit: 0 calls
        - Cache miss: 1 page fetch + up to 2 phone endpoint attempts

        Args:
            agent_id: Agent ID
            agent_url: Agent page URL

        Returns:
            Dictionary with phone, address, agentWebsite, and email
        """
        # Check cache first
        cached = self.cache.get(agent_id)
        if cached:
            print(f"Cache hit for agent {agent_id}")
            return cached

        # Fetch page details and phone numbers in parallel
        page_details, phones = await asyncio.gather(
            self._fetch_page_details(agent_url),
            self._fetch_phone_numbers(agent_id, agent_url),
        )

        details = {
            "phone": phones[0] if phones else "",
            "address": page_details["address"],
            "agentWebsite": page_details["website"],
            "email": page_details["email"],
        }

        # Both fetchers swallow errors and return empty values, so an all-empty
        # result cannot be told apart from a failed scrape. Caching it would
        # hide the agent's real data until the TTL expires, so skip the cache.
        if any(details.values()):
            self.cache.set(agent_id, details)

        return details

    async def get_phone_numbers(self, agent_id: str, referer_url: str = "") -> list[str]:
        """
        Public method for fetching only phone numbers.

        Maintains backwards compatibility. Bypasses the details cache.

        Args:
            agent_id: Agent ID
            referer_url: Referer URL (optional; defaults to the country domain root)

        Returns:
            List of phone numbers
        """
        if not referer_url:
            referer_url = f"{self.country.domain}/"
        return await self._fetch_phone_numbers(agent_id, referer_url)

    async def get_address(self, agent_url: str) -> str:
        """
        Public method for fetching only address.

        Maintained for backwards compatibility but prefer get_agent_details.
        Performs a full page fetch and bypasses the details cache.

        Args:
            agent_url: Agent page URL

        Returns:
            Address string
        """
        details = await self._fetch_page_details(agent_url)
        return details["address"]

    async def get_website_and_email(self, agent_url: str) -> tuple[str, str]:
        """
        Public method for fetching website and email.

        Maintained for backwards compatibility but prefer get_agent_details.
        Performs a full page fetch and bypasses the details cache.

        Args:
            agent_url: Agent page URL

        Returns:
            Tuple of (website, email)
        """
        details = await self._fetch_page_details(agent_url)
        return details["website"], details["email"]
