"""
HTML Parser module for cleaning and extracting content from Idealista property pages.

This module provides functionality to parse HTML content from Idealista listings,
extracting property details, images, and metadata using BeautifulSoup and regex patterns.

Performance optimizations:
- Uses lxml parser (10-100x faster than html.parser)
- Pre-compiled regex patterns cached at module level
- Single-pass image extraction
"""

import re
import json
import json5
from bs4 import BeautifulSoup
import logging
from typing import Dict, List, Optional, TYPE_CHECKING
import asyncio

# Import centralized cache layer for module-level config caching
from ..cache.config_loader import get_regex_patterns
from ..utils.countries import CountryConfig, get_country

if TYPE_CHECKING:
    from ..utils.countries import CountryConfig

# Configure logging for this module.
# NOTE(review): logging.basicConfig() targets the ROOT logger and runs as an
# import side effect; it does nothing when the root logger already has
# handlers. Library modules usually leave this to the application entry
# point - confirm this is intentional.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Regex pattern strings keyed by pattern name, fetched once at import time via
# get_regex_patterns(). Presumably the loader caches the underlying config so
# it is read only once per process - verify in cache.config_loader.
_PATTERNS: dict = get_regex_patterns()

# Cache of compiled patterns keyed by pattern name. Starts empty and is filled
# by _get_compiled_pattern(); a value of None records that the pattern is
# missing from _PATTERNS or failed to compile.
_COMPILED_PATTERNS: Dict[str, Optional[re.Pattern]] = {}


def _get_compiled_pattern(pattern_name: str) -> Optional[re.Pattern]:
    """Return the compiled regex registered under *pattern_name*, or None.

    The result is memoized in ``_COMPILED_PATTERNS``. ``None`` is cached as
    well, both when ``_PATTERNS`` has no (non-empty) entry for the name and
    when the pattern string fails to compile; the latter is logged as an
    error. Patterns are compiled without any regex flags.
    """
    # Fast path: the name has been resolved before (possibly to None).
    try:
        return _COMPILED_PATTERNS[pattern_name]
    except KeyError:
        pass

    compiled: Optional[re.Pattern] = None
    source = _PATTERNS.get(pattern_name)
    if source:
        try:
            compiled = re.compile(source)
        except re.error as exc:
            logger.error(f"Invalid regex pattern '{pattern_name}': {exc}")

    # Remember the outcome so a bad or missing pattern is reported only once.
    _COMPILED_PATTERNS[pattern_name] = compiled
    return compiled


# Matches the '/WEB_DETAIL.../0/' segment of a gallery image URL (non-greedy);
# clean_html_content() substitutes it to request a different image variant.
_IMAGE_URL_PATTERN = re.compile(r'/WEB_DETAIL.*?/0/')

# Warm the compiled-pattern cache at import time so that an invalid pattern is
# logged immediately rather than during the first parse.
for _pattern_name in (
    'area_pattern',
    'orientation_pattern',
    'year_pattern',
    'floor_pattern',
    'property_id_pattern',
    'multimedia_pattern',
    'utag_data_pattern',
):
    _get_compiled_pattern(_pattern_name)


def _extract_property_details(soup: 'BeautifulSoup', content: Dict) -> None:
    """Fill the feature fields of *content* from the details-property section.

    When the section exists, ``utilArea``, ``floor``, ``orientation``,
    ``completedYear`` and ``climateControl`` are added to *content* and
    ``basic_features`` is reset and filled with the text of every list item.
    When it does not exist, a warning is logged and *content* is untouched.
    """
    details_property = soup.select_one('div[class="details-property"]')
    if not details_property:
        logger.warning("Could not find details-property section")
        return

    logger.info("Found details-property section")
    content['utilArea'] = 0
    content['floor'] = None
    content['orientation'] = []
    content['completedYear'] = None
    content['climateControl'] = []
    content['basic_features'] = []

    # Pattern lookups are loop-invariant, so resolve them once up front.
    area_pattern = _get_compiled_pattern('area_pattern')
    orientation_pattern = _get_compiled_pattern('orientation_pattern')
    year_pattern = _get_compiled_pattern('year_pattern')
    floor_pattern = _get_compiled_pattern('floor_pattern')

    for section in details_property.find_all('div', class_='details-property_features'):
        # The nearest preceding <h2> names the section and decides how its
        # list items are interpreted.
        section_heading = section.find_previous('h2', class_='details-property-h2')
        section_name = section_heading.text.strip() if section_heading else ""
        logger.info("Processing section: %s", section_name)

        for li in section.find_all('li'):
            feature_text = li.text.strip()
            content['basic_features'].append(feature_text)

            if section_name == "Basic features":
                # Group 2 of area_pattern is stored as the area; the pattern
                # itself lives in external config (not visible here).
                area_match = area_pattern.search(feature_text) if area_pattern else None
                if area_match:
                    content['utilArea'] = float(area_match.group(2))
                    logger.info(
                        "Found utilArea: %s (matched from: %s)",
                        content['utilArea'], feature_text,
                    )

                orientation_match = (
                    orientation_pattern.search(feature_text) if orientation_pattern else None
                )
                if orientation_match:
                    content['orientation'] = [
                        direction.strip()
                        for direction in orientation_match.group(1).split(',')
                    ]
                    logger.info(
                        "Found orientation: %s (matched from: %s)",
                        content['orientation'], feature_text,
                    )

                year_match = year_pattern.search(feature_text) if year_pattern else None
                if year_match:
                    content['completedYear'] = int(year_match.group(1))
                    logger.info(
                        "Found completedYear: %s (matched from: %s)",
                        content['completedYear'], feature_text,
                    )

            elif section_name == "Building":
                floor_match = floor_pattern.search(feature_text) if floor_pattern else None
                if floor_match:
                    content['floor'] = floor_match.group(1)
                    logger.info(
                        "Found floor: %s (matched from: %s)",
                        content['floor'], feature_text,
                    )
                elif 'Ground floor' in feature_text:
                    # Fix: this fallback used to run only when floor_pattern
                    # was unavailable; it now runs whenever the pattern does
                    # not match the item, so ground-floor listings are
                    # recognised during normal operation.
                    content['floor'] = "Ground"
                    logger.info("Found floor: Ground (matched from: %s)", feature_text)

            elif section_name == "Amenities":
                if 'Air conditioning' in feature_text:
                    content['climateControl'].append('Air Conditioning')
                    logger.info("Added Air Conditioning to climateControl")
                elif 'Heat' in feature_text:
                    content['climateControl'].append('Heat')
                    logger.info("Added Heat to climateControl")

    logger.info("Completed processing details section")
    logger.info("Found: %s features", len(content['basic_features']))


def _extract_gallery_images(scripts, content: Dict) -> None:
    """Fill ``images``, ``image_urls`` and ``image_tags`` from adMultimediasInfo.

    Scans *scripts* for the first ``<script>`` that defines
    ``var adMultimediasInfo =`` and can be parsed; errors on one script are
    logged and the next candidate is tried. ``image_urls`` is always created.
    """
    content['image_urls'] = []

    # NOTE(review): _get_compiled_pattern() compiles without re.DOTALL -
    # confirm the configured pattern can match a multi-line script body.
    multimedia_pattern = _get_compiled_pattern('multimedia_pattern')

    # NOTE(review): resolved defensively in case the installed json5 package
    # does not export JSONDecodeError - TODO confirm. Looking the attribute up
    # inside the ``except`` clause would raise AttributeError while another
    # error is being handled and abort the whole page.
    decode_error = getattr(json5, 'JSONDecodeError', ValueError)

    for script in scripts:
        script_content = script.string
        if not script_content or 'var adMultimediasInfo =' not in script_content:
            continue

        if multimedia_pattern is None:
            # Without a usable pattern no script can be parsed; the previous
            # fallback (searching with the raw/empty pattern string) could
            # only fail, so report once and stop.
            logger.error("multimedia_pattern is missing or invalid; cannot extract images.")
            return

        try:
            match = multimedia_pattern.search(script_content)
            if not match:
                logger.error("adMultimediasInfo not found in script content.")
                continue

            image_info = json5.loads(match.group(1))
            if 'fullScreenGalleryPics' not in image_info:
                logger.error("fullScreenGalleryPics not found in adMultimediasInfo.")
                continue

            for img in image_info['fullScreenGalleryPics']:
                url = img.get('imageDataService')
                if url:
                    # Rewrite the '/WEB_DETAIL.../0/' segment, then keep both
                    # the full URL and its last path component (file name).
                    modified_url = _IMAGE_URL_PATTERN.sub('/WEB_DETAIL_TOP-XL-L/0/', url)
                    content['images'].append(modified_url.rsplit('/', 1)[-1])
                    content['image_urls'].append(modified_url)

                tag = img.get('tag')
                if tag:
                    content['image_tags'].append(tag)

            # The gallery was found and processed; later scripts are ignored.
            return

        except decode_error as e:
            logger.error("Error parsing image JSON: %s", e)
        except Exception as e:
            logger.error("Error processing images: %s", e)


def _extract_utag_data(scripts):
    """Return the parsed ``utag_data`` object found in *scripts*.

    Returns the value parsed by ``json5.loads`` from the first script that
    defines ``var utag_data =`` and matches the configured pattern, ``{}``
    when such a script exists but could not be parsed (or the pattern is
    unavailable), and ``None`` when no candidate script matched at all.
    """
    utag_data_pattern = _get_compiled_pattern('utag_data_pattern')
    result = None

    for script in scripts:
        script_content = script.string
        if not script_content or 'var utag_data =' not in script_content:
            continue

        if utag_data_pattern is None:
            # Same end result as before ({}), without the fallback search
            # that could only raise.
            logger.error("Error parsing utag_data: utag_data_pattern is missing or invalid")
            return {}

        try:
            match = utag_data_pattern.search(script_content)
            if match:
                return json5.loads(match.group(1))
        except Exception as e:
            logger.error("Error parsing utag_data: %s", e)
            # Remember the failure but keep looking at later scripts.
            result = {}

    return result


def clean_html_content(
    html_content: str,
    country: CountryConfig | None = None
) -> Optional[Dict]:
    """
    Clean and extract relevant content from an HTML string.

    Args:
        html_content: Raw HTML content string.
        country: Country configuration used to build the absolute property
            URL. When None, the value returned by ``get_country()`` is used
            (documented elsewhere as the IDEALISTA_DEFAULT_COUNTRY env var
            or 'spain').

    Returns:
        Dictionary with the extracted content, or None if any unexpected
        error occurs while parsing. The dictionary always contains
        ``titles``, ``paragraphs``, ``list_items``, ``images``,
        ``image_tags``, ``image_urls``, ``basic_features``, ``description``,
        ``propertyUrl``, ``propertyId``, ``location``, ``advertiser_name``,
        ``agency_ref``, ``development_name``, ``html_content``,
        ``agencyReference``, ``utag_data`` and ``development_website``.
        ``utilArea``, ``floor``, ``orientation``, ``completedYear`` and
        ``climateControl`` are present only when the page has a
        details-property section.

    Notes:
        - The document is parsed with BeautifulSoup's 'lxml' parser.
        - Regex patterns come from the module-level compiled-pattern cache.
        - A ground-floor listing item is now recognised whenever the floor
          pattern does not match it (previously only when the pattern was
          unavailable).
    """
    if country is None:
        country = get_country()
    try:
        soup = BeautifulSoup(html_content, 'lxml')
        logger.info("Successfully parsed HTML content with lxml")

        content = {
            'titles': [],
            'paragraphs': [],
            'list_items': [],
            'images': [],
            'image_tags': [],
            'basic_features': [],
            'description': '',
            'propertyUrl': '',
            'propertyId': None,
            'location': '',
            'advertiser_name': None,
            'agency_ref': None,
            'development_name': None,
            'html_content': html_content,  # Store raw content
            'agencyReference': None,
            'utag_data': None,
            'development_website': None
        }

        # Development name: text of the <title> tag.
        title_tag = soup.find('title')
        content['development_name'] = (
            title_tag.text.strip() if title_tag else 'Unknown Development'
        )

        # Advertiser name: <p class="advertiser-name"> inside
        # <div class="advertiser-info">.
        advertiser_elem = soup.find('div', class_='advertiser-info')
        name_elem = (
            advertiser_elem.find('p', class_='advertiser-name') if advertiser_elem else None
        )
        content['advertiser_name'] = (
            name_elem.text.strip() if name_elem else 'Unknown Advertiser'
        )

        # Description: first <p> inside <div class="comment">.
        description_div = soup.find('div', class_='comment')
        p_tag = description_div.find('p') if description_div else None
        content['description'] = (
            p_tag.get_text(separator=' ', strip=True) if p_tag else ''
        )

        # Features, area, floor, orientation, year and climate control.
        _extract_property_details(soup, content)

        # Property ID and relative URL from the canonical link.
        canonical_link = soup.find('link', {'rel': 'canonical'})
        property_id_pattern = _get_compiled_pattern('property_id_pattern')
        if canonical_link and property_id_pattern:
            property_id_match = property_id_pattern.search(canonical_link.get('href', ''))
            if property_id_match:
                content['propertyId'] = property_id_match.group(1)
                content['propertyUrl'] = f"/inmueble/{content['propertyId']}/"

        # Gallery images and tags from the adMultimediasInfo script variable.
        scripts = soup.find_all('script')
        _extract_gallery_images(scripts, content)

        # Agency reference (last path segment of the advertiser link).
        agency_link = soup.select_one('a.about-advertiser-name')
        if agency_link:
            href = agency_link.get('href', '')
            content['agencyReference'] = href.rstrip('/').split('/')[-1]
            # NOTE(review): this overwrites the '/inmueble/<id>/' URL set from
            # the canonical link with the advertiser link's href on the
            # country domain - confirm that is the intended propertyUrl.
            content['propertyUrl'] = f"{country.domain}{href}"

        # Analytics payload from the utag_data script variable.
        content['utag_data'] = _extract_utag_data(scripts)

        # Development website link, when present.
        website_link = soup.find('a', class_='link-withered icon-new-tab')
        if website_link:
            content['development_website'] = website_link.get('href')

        logger.info("Completed content extraction")
        return content

    except Exception as e:
        # Top-level boundary: a page that cannot be processed yields None
        # rather than propagating; the traceback is available at DEBUG level.
        logger.error("Error cleaning HTML content: %s", e)
        logger.debug("Error details: %s", e, exc_info=True)
        return None


class HTMLParser:
    """Async wrapper around ``clean_html_content`` for concurrent processing.

    Parsing is synchronous work, so each document is handed to a worker
    thread via ``asyncio.to_thread``. This keeps the event loop responsive
    and lets ``process_batch`` overlap documents instead of parsing them one
    after another inside the loop. Effective parallelism is bounded by the
    event loop's default thread pool.
    """

    def __init__(self, country: str | CountryConfig | None = None):
        """Initialize the parser for a country.

        Args:
            country: Country name or CountryConfig. Any value that is not
                already a CountryConfig (including None) is resolved through
                ``get_country``; per the original docs the default is the
                IDEALISTA_DEFAULT_COUNTRY env var or 'spain'.
        """
        if isinstance(country, CountryConfig):
            self.country = country
        else:
            self.country = get_country(country)
        logger.info(f"Initialized HTML parser (unlimited concurrency, country={self.country.code})")

    async def clean_html_content(self, html_content: str) -> Optional[Dict]:
        """Parse one HTML document without blocking the event loop.

        Args:
            html_content: Raw HTML content string.

        Returns:
            The dictionary produced by the module-level
            ``clean_html_content``, or None when parsing failed.
        """
        # Fix: the synchronous parser used to run directly inside this
        # coroutine, blocking the event loop for the whole parse.
        return await asyncio.to_thread(
            clean_html_content, html_content, country=self.country
        )

    async def process_batch(self, contents: List[str]) -> List[Optional[Dict]]:
        """Parse several HTML documents concurrently.

        Args:
            contents: Raw HTML strings.

        Returns:
            One result per input, in the same order as *contents*.
        """
        tasks = [self.clean_html_content(content) for content in contents]
        return await asyncio.gather(*tasks)
