"""
Property details scraper module.

Handles scraping and processing of individual property details
from Idealista, including content fetching, HTML parsing, and data transformation.
"""
from __future__ import annotations

import uuid
import json
import asyncio
import aiofiles
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Optional, Tuple

from scrapfly import ScrapflyClient, ScrapeConfig
from loguru import logger
from backoff import on_exception, expo

from ..parsing.html_parser import HTMLParser
from ..transform.properties import transform_to_mongodb_format
from ..session.manager import SessionManager
from ..cache.property_cache import PropertyProgressCache, PropertyContentCache
from ..utils.config import settings
from ..utils.paths import get_output_dir, get_cache_dir


class PropertyDetailsScraper:
    """
    Handles scraping and processing of individual property details.

    This scraper:
    - Fetches property pages from Idealista
    - Parses HTML to extract structured data
    - Transforms data to MongoDB format
    - Saves results to JSONL files
    - Tracks progress for resumable operations

    Example:
        >>> from scrapfly import ScrapflyClient
        >>> from src.session import SessionManager
        >>> client = ScrapflyClient(key="your_key")
        >>> session_mgr = SessionManager()
        >>> scraper = PropertyDetailsScraper(client, session_mgr, base_config={})
        >>> await scraper.process_property_batch(urls, "rental", agent_info)
    """

    class _RateLimitError(Exception):
        """Signals an HTTP 429 scrape result so the backoff decorator retries."""

        # The retry decorator's ``giveup`` predicate inspects this attribute.
        status_code = 429

    def __init__(
        self,
        scrapfly_client: ScrapflyClient,
        session_manager: SessionManager,
        base_config: Dict
    ):
        """
        Initialize the scraper with a Scrapfly client and session.

        Args:
            scrapfly_client: Configured Scrapfly client
            session_manager: Session manager for tracking sessions
            base_config: Base configuration for Scrapfly requests
        """
        self.client = scrapfly_client
        self.session_manager = session_manager
        # Shallow copy so per-request keys added later never leak into the caller's dict
        self.base_config = base_config.copy()
        # Only the last six characters of the API key are used as an identifier
        self.client_key = self.client.key[-6:]

        # Create necessary directories using centralized paths
        self.output_dir = get_output_dir()
        self.cache_dir = get_cache_dir()
        self.temp_dir = self.output_dir / "temp"

        for directory in [self.output_dir, self.cache_dir, self.temp_dir]:
            directory.mkdir(parents=True, exist_ok=True)

        # Initialize caches
        self.progress_cache = PropertyProgressCache(
            cache_dir=self.cache_dir,
            client_key=self.client_key
        )
        self.content_cache = PropertyContentCache(cache_dir=self.cache_dir)

        # Add results lock for concurrent writes
        self.results_lock = asyncio.Lock()

        # Concurrency control for Scrapfly requests - configurable via settings
        self.request_semaphore = asyncio.Semaphore(settings.max_concurrency)

        logger.info(
            f"Initialized PropertyDetailsScraper with client {self.client_key} "
            f"and concurrency limit {settings.max_concurrency}"
        )

        # Initialize HTML parser
        self.html_parser = HTMLParser()
        logger.info("Initialized HTML parser with no concurrency limit")

        # Image URLs tracking
        self.image_urls_file = self.output_dir / "image_urls.jsonl"
        self.image_urls_lock = asyncio.Lock()

    async def process_property_batch(
        self,
        property_urls: List[str],
        property_type: str,
        agent_info: Optional[Dict] = None
    ) -> None:
        """
        Process a batch of property URLs concurrently.

        URLs already marked as processed are skipped. Content that was fetched
        in an earlier run is read from the content cache; everything else is
        fetched concurrently. Errors are logged rather than raised, and the
        session is always released.

        Args:
            property_urls: List of property URLs to process
            property_type: Type of property ('rental' or 'resale')
            agent_info: Optional agent information dictionary
        """
        logger.info(f"Starting batch processing of {len(property_urls)} {property_type} properties")

        # Get a session for this property type
        session_id = await self.session_manager.get_session(
            worker_key=self.client_key,
            client_key=self.client.key,
            property_type=property_type
        )
        logger.info(f"Got session {session_id} for {property_type} properties")

        try:
            # Filter out already processed URLs
            new_urls = [
                url for url in property_urls
                if not self.progress_cache.is_processed(url)
            ]

            logger.info(f"Found {len(new_urls)} new URLs to process out of {len(property_urls)} total")

            if not new_urls:
                logger.info("All URLs in batch already processed")
                return

            # Fetch all contents first
            logger.info(f"Starting content fetch for {len(new_urls)} URLs")

            # Cached results are plain tuples and must be kept apart from the
            # fetch coroutines: asyncio.gather() only accepts awaitables.
            cached_results: List[Tuple[str, Optional[str]]] = []
            urls_to_fetch: List[str] = []

            for url in new_urls:
                if self.progress_cache.is_fetched(url):
                    content = await self.content_cache.get(url)
                    if content:
                        cached_results.append((url, content))
                        continue
                # Not fetched yet, or marked fetched but the cached content is gone
                urls_to_fetch.append(url)

            logger.info(
                f"Using {len(cached_results)} cached contents, "
                f"fetching {len(urls_to_fetch)} new"
            )

            # Fetch the remaining contents concurrently
            fetched_results = await asyncio.gather(
                *(self.fetch_property_content(url) for url in urls_to_fetch)
            )
            contents = cached_results + list(fetched_results)
            logger.info(f"Received {len(contents)} contents")

            # Clean and transform concurrently
            process_tasks = []
            for url, content in contents:
                if content:
                    process_tasks.append(
                        self.process_single_property(url, content, property_type, agent_info)
                    )
                else:
                    logger.warning(f"No content received for {url}")

            logger.info(f"Processing {len(process_tasks)} properties")
            await asyncio.gather(*process_tasks)
            await self.progress_cache.save()
            logger.info(f"Completed batch processing for {property_type}")

        except Exception as e:
            logger.error(f"Error in batch processing: {str(e)}")
            logger.exception("Full traceback:")
        finally:
            # Release the session when done
            logger.info(f"Releasing session {session_id}")
            await self.session_manager.release_session(session_id)

    async def process_single_property(
        self,
        url: str,
        content: str,
        property_type: str,
        agent_info: Optional[Dict] = None
    ) -> None:
        """
        Process a single property content.

        The URL is marked as processed only after the property record has
        actually been written; any failure marks the URL as failed and
        persists the progress cache. Exceptions are logged, not raised.

        Args:
            url: Property URL
            content: HTML content
            property_type: Type of property
            agent_info: Optional agent information
        """
        try:
            if self.progress_cache.is_processed(url):
                logger.info(f"Already processed: {url}")
                return

            logger.info(f"Starting processing for {url}")

            # Clean HTML content
            cleaned_data = await self.html_parser.clean_html_content(content)
            if not cleaned_data:
                raise ValueError("Failed to clean content")

            # Transform data
            transformed_data = transform_to_mongodb_format(
                cleaned_data,
                agent_info,
                cleaned_data.get('advertiser_name'),
                cleaned_data.get('development_name')
            )
            if not transformed_data:
                raise ValueError("Failed to transform data")

            # Save property data; save_property_data reports failure through
            # its return value, so it has to be checked before marking success.
            saved = await self.save_property_data(transformed_data, property_type)
            if not saved:
                raise ValueError("Failed to save property data")

            # Save image URLs separately
            if 'image_urls' in cleaned_data:
                image_payload = {**transformed_data, 'image_urls': cleaned_data['image_urls']}
                await self.save_image_urls(image_payload, property_type)

            # Mark as processed
            self.progress_cache.mark_processed(url)
            logger.info(f"Successfully processed {property_type} property: {url}")

        except Exception as e:
            logger.error(f"Error processing {url}: {e}")
            self.progress_cache.mark_failed(url)
            await self.progress_cache.save()

    async def fetch_property_content(self, url: str) -> Tuple[str, Optional[str]]:
        """
        Fetch property content using Scrapfly with backoff retry.

        Rate-limited (429) responses are retried by the decorated helper;
        any error that remains after that is logged here and reported as
        missing content, so this method never raises.

        Args:
            url: URL to fetch

        Returns:
            Tuple of (url, content) where content may be None on failure
        """
        try:
            return await self._fetch_with_retry(url)
        except Exception as e:
            logger.error(f"Error fetching {url}: {str(e)}")
            return url, None

    @on_exception(
        expo,
        exception=(Exception,),
        max_tries=5,
        # Retry only errors that carry status_code == 429; give up on the rest
        giveup=lambda e: getattr(e, 'status_code', None) != 429
    )
    async def _fetch_with_retry(self, url: str) -> Tuple[str, Optional[str]]:
        """
        Perform one scrape attempt; retried by the decorator on rate limits.

        Args:
            url: URL to fetch

        Returns:
            Tuple of (url, content); content is None for a non-429 failure

        Raises:
            _RateLimitError: When the scrape result reports status code 429
        """
        # The semaphore is held only for the request itself, so a slot is
        # free for other fetches while this one waits between retries.
        async with self.request_semaphore:
            logger.info(f"Fetching content for {url}")
            config = self.base_config.copy()
            config["session"] = str(uuid.uuid4())

            scrape_config = ScrapeConfig(url=url, **config)
            result = await self.client.async_scrape(scrape_config)

        if result.scrape_success:
            # Store first so a URL is never marked fetched without cached content
            await self.content_cache.store(url, result.content)
            self.progress_cache.mark_fetched(url)
            return url, result.content

        if result.status_code == 429:
            logger.warning(f"Rate limit hit for {url}, backing off...")
            raise self._RateLimitError(f"Rate limit hit for {url}")

        logger.error(f"Failed to fetch {url}: {result.status_code}")
        return url, None

    async def save_property_data(self, transformed_data: Dict, property_type: str) -> bool:
        """
        Validate property data and append it to the per-type JSONL file.

        The record is written as one JSON line to
        ``propli_<property_type>_properties.jsonl`` inside the scraper's
        output directory, under ``results_lock`` so concurrent tasks do not
        interleave writes.

        Args:
            transformed_data: Transformed property data
            property_type: Type of property

        Returns:
            True if saved successfully, False otherwise
        """
        try:
            # Validate data
            if not transformed_data or not isinstance(transformed_data, dict):
                logger.error("Invalid property data - not a complete dictionary")
                return False

            # Ensure required fields are present
            required_fields = ['_id', 'systemReference', 'owner']
            missing_fields = [f for f in required_fields if f not in transformed_data]
            if missing_fields:
                logger.error(f"Missing required fields: {missing_fields}")
                return False

            # Use the directory created in __init__ rather than a path
            # relative to the current working directory.
            output_file = self.output_dir / f"propli_{property_type}_properties.jsonl"
            json_line = json.dumps(transformed_data, ensure_ascii=False)

            # Take the lock before opening so open/write/close is one critical section
            async with self.results_lock:
                async with aiofiles.open(output_file, "a", encoding="utf-8") as f:
                    await f.write(json_line + "\n")
                    await f.flush()

            logger.info(f"Successfully saved property data to {output_file}")
            return True

        except Exception as e:
            logger.error(f"Error in save_property_data: {e}")
            return False

    async def save_image_urls(self, property_data: Dict, property_type: str) -> None:
        """
        Save image URLs with property information.

        Appends one JSON line to the image URLs file. Errors are logged and
        not raised.

        Args:
            property_data: Property data including image URLs
            property_type: Type of property
        """
        try:
            if 'image_urls' not in property_data:
                return

            # 'owner' may be absent or not a mapping; fall back to no agent id
            owner = property_data.get('owner')
            agent_id = owner.get('$oid') if isinstance(owner, dict) else None

            image_data = {
                'property_id': property_data.get('systemReference'),
                'property_type': property_type,
                'agent_id': agent_id,
                'image_urls': property_data.get('image_urls', []),
                'timestamp': datetime.now().isoformat()
            }

            async with self.image_urls_lock:
                async with aiofiles.open(self.image_urls_file, 'a', encoding='utf-8') as f:
                    await f.write(json.dumps(image_data, ensure_ascii=False) + '\n')
                    logger.info(
                        f"Saved {len(image_data['image_urls'])} image URLs "
                        f"for property {image_data['property_id']}"
                    )

        except Exception as e:
            logger.error(f"Error saving image URLs: {e}")