"""
Property listing scraper module.

Provides functions for scraping property listings from Idealista
agent pages, including pagination handling and URL extraction.
"""
from __future__ import annotations

import math
import re
import uuid
import asyncio
from typing import List, Tuple, Dict, TYPE_CHECKING
from urllib.parse import urljoin

from scrapfly import ScrapflyClient, ScrapeConfig
from bs4 import BeautifulSoup
from loguru import logger

if TYPE_CHECKING:
    from ..utils.countries import CountryConfig

from ..utils.countries import get_country


def extract_property_urls(
    html_content: str,
    base_pattern: str,
    country: CountryConfig | None = None
) -> List[str]:
    """
    Extract unique property URLs from a listing page.

    Anchors whose ``href`` matches ``{base_pattern}/inmueble/<digits>`` are
    collected and resolved to absolute URLs against the country's domain.

    Args:
        html_content: HTML content of the listing page
        base_pattern: Path prefix for matching property URLs
            (e.g., "/en/pro/agent-name"). It is interpolated into a regular
            expression as-is, so regex metacharacters in it are interpreted
            rather than matched literally.
        country: Country configuration whose ``domain`` is used as the base
            for relative links. When omitted, ``get_country()`` supplies
            the default configuration.

    Returns:
        List of unique property URLs, in order of first appearance
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    if country is None:
        country = get_country()
    base_url = country.domain

    href_pattern = re.compile(f"{base_pattern}/inmueble/\\d+")

    # Dict keys de-duplicate in O(1) per link while keeping first-seen order,
    # instead of scanning the growing result list for every anchor.
    unique_urls = dict.fromkeys(
        urljoin(base_url, link['href'])
        for link in soup.find_all('a', href=href_pattern)
    )
    return list(unique_urls)


def extract_property_count(html_content: str) -> int:
    """
    Extract total number of properties from page title.

    The title is expected to contain a phrase such as
    "45 houses ... for sale" or "1,234 houses ... for rent". Counts written
    with thousands separators ("1,234" / "1.234") are parsed in full rather
    than being truncated to their last digit group.

    Args:
        html_content: HTML content of the page

    Returns:
        Total property count, or 0 if not found
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    title = soup.find('title')
    if title is not None:
        title_text = title.text
        # First alternative: properly grouped thousands (e.g. "1,234").
        # Second alternative: a plain run of digits (e.g. "45").
        match = re.search(
            r'(\d{1,3}(?:[.,]\d{3})+|\d+)\s+houses?.*(?:for sale|for rent)',
            title_text
        )
        if match:
            # Drop the separators before converting to an integer.
            count = int(re.sub(r'[.,]', '', match.group(1)))
            logger.info(f"Found {count} properties in title: {title_text.strip()}")
            return count
    logger.warning("No property count found in title")
    return 0


def generate_pagination_urls(
    base_url: str,
    total_properties: int,
    page_size: int = 30
) -> List[str]:
    """
    Generate pagination URLs based on total properties.

    The first page is ``base_url`` itself; subsequent pages are
    ``<base_url>/pagina-<n>.htm`` for n = 2..total_pages.

    Args:
        base_url: Base URL for the listings page (with or without a
            trailing slash)
        total_properties: Total number of properties
        page_size: Number of listings shown per page. Defaults to 30.

    Returns:
        List of paginated URLs, always starting with ``base_url``

    Raises:
        ValueError: If ``page_size`` is not positive
    """
    if page_size <= 0:
        raise ValueError("page_size must be a positive integer")

    total_pages = math.ceil(total_properties / page_size)
    if total_pages <= 1:
        return [base_url]

    # Guarantee exactly one slash between the listing path and the page
    # file name, so a base URL without a trailing slash still paginates.
    page_root = base_url if base_url.endswith("/") else f"{base_url}/"

    return [base_url] + [
        f"{page_root}pagina-{page}.htm"
        for page in range(2, total_pages + 1)
    ]


async def scrape_property_type(
    scrapfly: ScrapflyClient,
    base_url: str,
    property_type: str,
    base_config: Dict,
    available_concurrency: int
) -> Tuple[List[str], int]:
    """
    Scrape properties for a specific type (rental or resale).

    The first listing page is fetched to read the total property count from
    its title; that same response is then reused as page 1, and the
    remaining pages are fetched concurrently in batches.

    Args:
        scrapfly: Scrapfly client instance
        base_url: Base URL for the agent (a trailing slash is tolerated)
        property_type: Type of properties ('alquiler' for rental, 'venta' for resale)
        base_config: Base configuration for Scrapfly requests; each request
            gets its own copy with a fresh ``session`` id
        available_concurrency: Number of concurrent requests allowed.
            Values below 1 are treated as 1.

    Returns:
        Tuple of (property_urls, total_count), where total_count is the
        count parsed from the page title, or ([], 0) when no count is found
    """
    url = f"{base_url.rstrip('/')}/{property_type}-viviendas/"
    logger.info(f"Starting {property_type} property scraping from: {url}")

    # Initial request to get property count. Awaited through the async
    # client so the event loop is not blocked while the request is in
    # flight (this coroutine is run alongside others via asyncio.gather).
    first_page = await scrapfly.async_scrape(ScrapeConfig(
        url=url,
        **{**base_config, "session": str(uuid.uuid4())}
    ))

    total_properties = extract_property_count(first_page.content)
    if total_properties == 0:
        logger.warning(f"No {property_type} properties found")
        return [], 0

    # Generate pagination URLs
    pagination_urls = generate_pagination_urls(url, total_properties)
    total_pages = len(pagination_urls)
    logger.info(f"Found {total_properties} {property_type} properties across {total_pages} pages")

    # The link pattern depends only on the listing URL, so compute it once.
    agent_match = re.search(r"/en/pro/([^/]+)/", url)
    base_pattern = f"/en/pro/{agent_match.group(1)}" if agent_match else "/en/pro"

    all_property_urls: List[str] = []

    # Page 1 is the page already fetched above; reuse it rather than
    # spending a second request on the same URL.
    first_page_urls = extract_property_urls(first_page.content, base_pattern)
    logger.info(f"Found {len(first_page_urls)} {property_type} properties on page")
    all_property_urls.extend(first_page_urls)

    remaining_urls = pagination_urls[1:]

    # Calculate concurrency based on available slots. Clamp to at least 1:
    # a zero step would make range() raise and a negative step would
    # silently skip every page.
    batch_size = max(1, min(len(remaining_urls), available_concurrency))
    logger.info(f"Using batch size of {batch_size} for {property_type} properties")

    # Process the remaining pages in batches
    for start in range(0, len(remaining_urls), batch_size):
        batch_urls = remaining_urls[start:start + batch_size]
        first_page_number = start + 2  # remaining_urls[0] is page 2
        last_page_number = first_page_number + len(batch_urls) - 1
        logger.info(
            f"Scraping {property_type} batch: pages {first_page_number} to {last_page_number}"
        )

        results = await asyncio.gather(*[
            scrapfly.async_scrape(ScrapeConfig(
                url=batch_url,
                **{**base_config, "session": str(uuid.uuid4())}
            ))
            for batch_url in batch_urls
        ])

        for result in results:
            property_urls = extract_property_urls(result.content, base_pattern)
            logger.info(f"Found {len(property_urls)} {property_type} properties on page")
            all_property_urls.extend(property_urls)

    logger.info(f"Completed {property_type} scraping. Total properties found: {len(all_property_urls)}")
    return all_property_urls, total_properties


async def scrape_agent_listings(
    scrapfly: ScrapflyClient,
    agent_url: str,
    base_config: Dict,
    concurrency: int = 25
) -> Dict:
    """
    Collect an agent's rental and resale listings in one call.

    One ``scrape_property_type`` coroutine is created per listing type and
    both are awaited together through ``asyncio.gather``.

    Args:
        scrapfly: Scrapfly client instance
        agent_url: Agent's base URL
        base_config: Base configuration for Scrapfly requests
        concurrency: Number of concurrent requests per type

    Returns:
        Dictionary keyed by 'rental' and 'resale'; each value holds the
        scraped 'listings' and the reported 'total_properties'
    """
    # (result key, URL slug) pairs, in the order the scrapes are created
    listing_kinds = (
        ("rental", "alquiler"),
        ("resale", "venta"),
    )

    # gather() returns results in the same order as the coroutines passed in
    outcomes = await asyncio.gather(*(
        scrape_property_type(scrapfly, agent_url, slug, base_config, concurrency)
        for _, slug in listing_kinds
    ))

    return {
        label: {
            "listings": outcome[0],
            "total_properties": outcome[1]
        }
        for (label, _), outcome in zip(listing_kinds, outcomes)
    }
