"""
Portugal-specific agent scraper for Idealista.

Portugal uses HTML pages for agent listings instead of Spain's API approach.
URL pattern: /agencias-imobiliarias/{region}/imobiliarias
"""
from __future__ import annotations

import asyncio
import codecs
import json
import re
from dataclasses import dataclass, field
from typing import TYPE_CHECKING, Dict, List, Optional

from bs4 import BeautifulSoup
from bson import ObjectId
from loguru import logger
from scrapfly import ScrapflyClient, ScrapeConfig

if TYPE_CHECKING:
    from ..utils.countries import CountryConfig

from ..utils.countries import get_country
from ..utils.config import settings


@dataclass
class PortugalAgency:
    """One agency record scraped from an Idealista Portugal listing page.

    Only ``agency_id``, ``name`` and ``profile_url`` are required; every other
    field defaults to an empty string.
    """

    agency_id: str
    name: str
    profile_url: str
    location_id: str = ""
    operation: str = ""  # SALE, RENT
    logo: str = ""
    region_slug: str = ""
    region_name: str = ""

    def to_dict(self) -> Dict:
        """Build the dictionary that is serialised to the JSON output.

        ``agencyReference`` is whatever follows the last ``/pro/`` in the
        profile URL (trailing slashes removed), or an empty string when the
        URL has no ``/pro/`` segment. ``owner`` and ``superAgent`` both carry
        the same newly generated ObjectId, rendered as ``{"$oid": "<hex>"}``.

        Returns:
            A new dictionary; a fresh ObjectId is generated on every call.
        """
        url = self.profile_url
        if "/pro/" in url:
            reference = url.split("/pro/")[-1].rstrip("/")
        else:
            reference = ""

        # One id is generated per call and used for both reference fields.
        oid_text = str(ObjectId())

        record = {
            "firstname": self.name,
            "url": url,
            "agencyReference": reference,
            "logo": self.logo,
            "agentID": self.agency_id,
            "locationId": self.location_id,
            "operation": self.operation,
            "region": self.region_name,
            "regionSlug": self.region_slug,
            "owner": {"$oid": oid_text},
            "superAgent": {"$oid": oid_text},
        }
        return record


class PortugalAgentScraper:
    """
    Scrapes agencies from Portugal's HTML pages.

    Portugal uses a different structure than Spain:
    - No API endpoint for agents
    - Uses HTML pages at /agencias-imobiliarias/{region}/imobiliarias
    - Agency cards are <article class="zone-experts-agency-card">
    """

    def __init__(
        self,
        scrapfly: ScrapflyClient,
        country: CountryConfig | str | None = None
    ):
        """
        Initialize Portugal agent scraper.

        Args:
            scrapfly: Configured Scrapfly client
            country: Country configuration (should be Portugal)
        """
        self.scrapfly = scrapfly

        if isinstance(country, str) or country is None:
            self.country = get_country(country or "portugal")
        else:
            self.country = country

        if self.country.code != "pt":
            logger.warning(f"PortugalAgentScraper initialized with non-Portugal country: {self.country.code}")

    def _get_scrape_config(self, url: str) -> ScrapeConfig:
        """Create scrape config for Portugal pages."""
        return ScrapeConfig(
            url=url,
            asp=True,
            render_js=True,
            wait_for_selector="body",
            rendering_wait=3000,
            cache=False,
            country="PT",
            proxy_pool="public_residential_pool",
            headers={
                "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
                "accept-language": "pt-PT,pt;q=0.9,en;q=0.8",
            }
        )

    def _parse_agencies_from_html(
        self,
        html: str,
        region_slug: str,
        region_name: str
    ) -> List[PortugalAgency]:
        """
        Parse agencies from HTML page.

        Agency cards have this structure:
        <article class="zone-experts-agency-card"
                 data-microsite-url="https://www.idealista.pt/pro/{slug}/"
                 data-agency-id="440156312"
                 data-location-id="0-EU-PT-11"
                 data-operation-id="SALE">
        """
        agencies = []
        soup = BeautifulSoup(html, 'lxml')

        # Find all agency cards
        cards = soup.find_all('article', class_='zone-experts-agency-card')
        logger.debug(f"Found {len(cards)} agency cards in HTML")

        for card in cards:
            try:
                agency_id = card.get('data-agency-id', '')
                profile_url = card.get('data-microsite-url', '')
                location_id = card.get('data-location-id', '')
                operation = card.get('data-operation-id', '')

                # Get agency name - try multiple selectors
                name = ""
                name_elem = card.find(['h2', 'h3', 'a'], class_=lambda x: x and 'name' in str(x).lower())
                if name_elem:
                    name = name_elem.get_text(strip=True)
                else:
                    # Try to get from link text
                    link = card.find('a', href=lambda x: x and '/pro/' in str(x))
                    if link:
                        name = link.get_text(strip=True)

                # Get logo if available
                logo = ""
                img = card.find('img')
                if img:
                    logo = img.get('src', '') or img.get('data-src', '')

                if agency_id and profile_url:
                    agencies.append(PortugalAgency(
                        agency_id=agency_id,
                        name=name,
                        profile_url=profile_url,
                        location_id=location_id,
                        operation=operation,
                        logo=logo,
                        region_slug=region_slug,
                        region_name=region_name,
                    ))
            except Exception as e:
                logger.warning(f"Failed to parse agency card: {e}")
                continue

        return agencies

    def _get_total_pages(self, html: str) -> int:
        """Extract total number of pages from pagination."""
        soup = BeautifulSoup(html, 'lxml')

        # Look for pagination elements
        pagination = soup.find(class_=lambda x: x and 'pagination' in str(x).lower())
        if not pagination:
            return 1

        # Find all page number links
        page_links = pagination.find_all('a', href=True)
        max_page = 1

        for link in page_links:
            # Extract page number from URL or text
            href = link.get('href', '')
            text = link.get_text(strip=True)

            # Try to get page number from URL
            page_match = re.search(r'pagina-(\d+)', href)
            if page_match:
                page_num = int(page_match.group(1))
                max_page = max(max_page, page_num)
            elif text.isdigit():
                max_page = max(max_page, int(text))

        return max_page

    async def scrape_agencies_for_region(
        self,
        region_slug: str,
        region_name: str = "",
        max_pages: int = 0
    ) -> List[PortugalAgency]:
        """
        Fetch all agencies for a region from HTML pages.

        Args:
            region_slug: Region slug (e.g., "lisboa-distrito")
            region_name: Human-readable region name
            max_pages: Maximum pages to scrape (0 = all)

        Returns:
            List of PortugalAgency objects
        """
        agencies = []
        base_url = self.country.build_agents_url(region_slug)

        if not base_url:
            logger.error(f"Could not build agents URL for region: {region_slug}")
            return agencies

        region_name = region_name or region_slug.replace("-", " ").title()
        logger.info(f"Scraping agencies for {region_name}: {base_url}")

        try:
            # Fetch first page to get pagination info
            result = await self.scrapfly.async_scrape(self._get_scrape_config(base_url))

            if not result.content:
                logger.warning(f"No content received for {region_slug}")
                return agencies

            # Parse first page
            page_agencies = self._parse_agencies_from_html(
                result.content, region_slug, region_name
            )
            agencies.extend(page_agencies)
            logger.info(f"Found {len(page_agencies)} agencies on page 1 for {region_name}")

            # Get total pages
            total_pages = self._get_total_pages(result.content)
            if max_pages > 0:
                total_pages = min(total_pages, max_pages)

            logger.info(f"Total pages for {region_name}: {total_pages}")

            # Fetch remaining pages
            for page in range(2, total_pages + 1):
                page_url = f"{base_url}pagina-{page}.htm"
                try:
                    result = await self.scrapfly.async_scrape(self._get_scrape_config(page_url))
                    if result.content:
                        page_agencies = self._parse_agencies_from_html(
                            result.content, region_slug, region_name
                        )
                        agencies.extend(page_agencies)
                        logger.info(f"Found {len(page_agencies)} agencies on page {page} for {region_name}")

                    # Small delay between pages
                    await asyncio.sleep(1)
                except Exception as e:
                    logger.warning(f"Failed to fetch page {page} for {region_slug}: {e}")
                    continue

        except Exception as e:
            logger.error(f"Failed to scrape agencies for {region_slug}: {e}")

        return agencies

    async def scrape_all_regions(
        self,
        region_slugs: List[str] | None = None,
        max_pages_per_region: int = 0
    ) -> List[PortugalAgency]:
        """
        Fetch agencies from all Portugal regions.

        Args:
            region_slugs: Optional list of specific region slugs to scrape.
                         If None, scrapes all regions from config.
            max_pages_per_region: Maximum pages per region (0 = all)

        Returns:
            List of PortugalAgency objects from all regions
        """
        all_agencies = []

        # Get regions from config
        if region_slugs:
            regions = [
                r for r in self.country.get_regions()
                if r.slug in region_slugs
            ]
        else:
            regions = self.country.get_regions()

        if not regions:
            logger.error("No regions found for Portugal")
            return all_agencies

        logger.info(f"Scraping agencies from {len(regions)} Portugal regions")

        for region in regions:
            region_agencies = await self.scrape_agencies_for_region(
                region.slug,
                region.name,
                max_pages_per_region
            )
            all_agencies.extend(region_agencies)
            logger.info(f"Total: {len(region_agencies)} agencies from {region.name}")

            # Delay between regions
            await asyncio.sleep(2)

        logger.info(f"Completed: {len(all_agencies)} agencies from {len(regions)} regions")
        return all_agencies

    async def run(
        self,
        output_file: str = "portugal_agents.jsonl",
        region_slugs: List[str] | None = None,
        max_pages_per_region: int = 0
    ) -> None:
        """
        Run the full Portugal agent scraping process.

        Args:
            output_file: Path to output JSONL file
            region_slugs: Optional list of specific regions to scrape
            max_pages_per_region: Maximum pages per region (0 = all)
        """
        agencies = await self.scrape_all_regions(region_slugs, max_pages_per_region)

        # Write results to file
        with codecs.open(output_file, 'w', encoding='utf-8') as f:
            for agency in agencies:
                f.write(json.dumps(agency.to_dict(), ensure_ascii=False) + '\n')

        logger.info(f"Saved {len(agencies)} agencies to {output_file}")
