"""
Geocoding and location utilities.

This module provides location parsing and normalization utilities
for processing Idealista property listings.
"""

from typing import Dict, Optional, Tuple
import re
import unicodedata
from loguru import logger
from bs4 import BeautifulSoup


def _strip_leading_label(text: str, label: str) -> str:
    """Return *text* without *label* when, and only when, it starts with it."""
    if text.startswith(label):
        return text[len(label):]
    return text


def parse_location_html(html_content: str) -> Dict[str, str]:
    """
    Parse location information from Idealista HTML content.

    Selects every ``#headerMap .header-map-list`` element and reads the
    resulting list of texts from the last entry backwards, treating the
    last entry as the broadest level.

    Args:
        html_content: HTML string containing location information

    Returns:
        Dictionary containing the location components that could be found.
        It is empty when no matching elements exist. Possible keys:
        - country: Always "Spain" when at least one element matched
        - province: Text after the comma in the last entry
        - region: Text before the comma in the last entry
        - municipality: Second-to-last entry
        - district: Third-to-last entry, without a leading "District " label
        - urbanisation: Fourth-to-last entry, without a leading
          "Subdistrict " label

        ``province`` and ``region`` are only set when the last entry
        contains exactly one comma.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    location_items = soup.select('#headerMap .header-map-list')

    location_data: Dict[str, str] = {}
    if not location_items:
        return location_data

    # Entries are read from the end of the list: the last one is the
    # broadest level and earlier ones are progressively more specific.
    items = [item.get_text(strip=True) for item in location_items]

    # Last entry is expected to look like "<region>, <province>".
    region_parts = items[-1].split(',')
    if len(region_parts) == 2:
        location_data['province'] = region_parts[1].strip()
        location_data['region'] = region_parts[0].strip()

    # Municipality
    if len(items) >= 2:
        location_data['municipality'] = items[-2]

    # District: drop the label only when it leads the text, so a name that
    # merely contains the word "District" is left intact.
    if len(items) >= 3:
        location_data['district'] = _strip_leading_label(items[-3], 'District ')

    # Urbanisation/Subdistrict: same prefix-only handling as the district.
    if len(items) >= 4:
        location_data['urbanisation'] = _strip_leading_label(
            items[-4], 'Subdistrict '
        )

    # Default country for this case
    location_data['country'] = 'Spain'

    return location_data


def get_location_details(
    location_data: Dict[str, str]
) -> Tuple[Optional[str], Optional[str], Optional[str], Optional[str], Optional[str]]:
    """
    Turn parsed location data into a fixed-shape tuple.

    Looks up the individual components in ``location_data`` and joins the
    non-empty ones into a single comma-separated address, ordered from the
    most specific component to the broadest one.

    Args:
        location_data: Dictionary containing parsed location components

    Returns:
        Tuple containing (country, province, municipality, urbanisation, address).
        Components missing from ``location_data`` are ``None``; ``address``
        is ``None`` when no component has a truthy value.
    """
    country, province, municipality, urbanisation, district = (
        location_data.get(key)
        for key in ('country', 'province', 'municipality', 'urbanisation', 'district')
    )

    # Address order: urbanisation, district, municipality, province, country.
    # The district contributes to the address but is not returned on its own.
    ordered = (urbanisation, district, municipality, province, country)
    present = [component for component in ordered if component]

    address = ', '.join(present) or None

    return country, province, municipality, urbanisation, address


def parse_location(html_content: str) -> Dict[str, str]:
    """
    Parse location information from HTML content.

    Finds the first ``<span>`` whose class attribute contains
    ``main-info__title-minor`` and splits its text on commas, e.g.
    "Madrid, Comunidad de Madrid, Spain".

    Args:
        html_content: Raw HTML content string

    Returns:
        Dictionary with the keys ``country``, ``province``, ``municipality``,
        ``urbanisation`` and ``full_address``. Values default to an empty
        string; ``urbanisation`` is never populated by this function.
    """
    location = dict.fromkeys(
        ("country", "province", "municipality", "urbanisation", "full_address"),
        "",
    )

    try:
        found = re.search(
            r'<span[^>]*class="[^"]*main-info__title-minor[^"]*"[^>]*>([^<]+)</span>',
            html_content,
        )
        if found is not None:
            location_text = found.group(1).strip()
            pieces = [piece.strip() for piece in location_text.split(",")]

            # Comma-separated pieces map positionally; anything beyond the
            # third piece is ignored, and missing pieces keep their default.
            for key, piece in zip(("municipality", "province", "country"), pieces):
                location[key] = piece

            location["full_address"] = location_text

    except Exception as e:
        # Best effort: a failure is logged and the defaults are returned.
        logger.warning(f"Failed to parse location: {e}")

    return location


def parse_address(address: str) -> Dict[str, str]:
    """
    Split a full address string into positional components.

    The address is split on commas and the pieces are assigned, in order,
    to street, city, province and country. Assignment is purely positional,
    so "Calle Mayor 5, Madrid, Spain" yields "Spain" as the province and an
    empty country. Pieces beyond the fourth are ignored.

    Args:
        address: Full address string (e.g., "Calle Mayor 5, Madrid, Spain")

    Returns:
        Dictionary with street, city, province, country, postal_code.
        Every value defaults to an empty string; ``postal_code`` is the
        first standalone five-digit number found anywhere in ``address``.
    """
    field_order = ("street", "city", "province", "country")
    result = {name: "" for name in field_order}
    result["postal_code"] = ""

    if not address:
        return result

    pieces = (piece.strip() for piece in address.split(","))
    # zip stops at the shorter side, so missing pieces keep their default.
    for name, piece in zip(field_order, pieces):
        result[name] = piece

    # The postal code is searched in the whole string, independent of the
    # comma-separated position it appears in.
    postal_match = re.search(r"\b(\d{5})\b", address)
    if postal_match is not None:
        result["postal_code"] = postal_match.group(1)

    return result


def normalize_location(location: str) -> str:
    """
    Produce a canonical form of a location string for comparison.

    The string is decomposed (NFD), stripped of non-spacing marks such as
    accents, lowercased, and has every run of whitespace collapsed to a
    single space with leading/trailing whitespace removed.

    Args:
        location: Raw location string

    Returns:
        Normalized location string
    """
    # After NFD decomposition an accented letter is a base letter followed
    # by combining marks (category "Mn"); dropping those removes the accent.
    decomposed = unicodedata.normalize("NFD", location)
    base_chars = [ch for ch in decomposed if unicodedata.category(ch) != "Mn"]
    lowered = "".join(base_chars).lower()

    # split() with no argument discards all whitespace runs, including
    # those at the ends.
    words = lowered.split()
    return " ".join(words)


def locations_match(loc1: str, loc2: str) -> bool:
    """
    Tell whether two location strings are equal after normalization.

    Both strings are passed through ``normalize_location`` and the results
    are compared for exact equality.

    Args:
        loc1: First location string
        loc2: Second location string

    Returns:
        True if the normalized strings are identical, False otherwise
    """
    first = normalize_location(loc1)
    second = normalize_location(loc2)
    return first == second
