"""
Agent listing manager module.

Orchestrates the scraping of property listings from multiple real estate
agents using a pool of Scrapfly workers for concurrent processing.
"""
from __future__ import annotations

import json
import pickle
import asyncio
import aiofiles
from pathlib import Path
from typing import Dict, List, Set, Optional, Tuple, TYPE_CHECKING

from loguru import logger

if TYPE_CHECKING:
    from idealista_scraper.cli.config import ScraperConfig

from .worker import ScrapflyWorker, AgentProgress, create_workers_from_env
from ..cache.listing_cache import AgentListingsCache
from ..scraping.listing_scraper import scrape_agent_listings
from ..scraping.property_scraper import PropertyDetailsScraper
from ..session.manager import SessionManager
from ..cache.config_loader import get_scrapfly_configs
from ..utils.paths import get_output_dir, get_cache_dir


class AgentListingManager:
    """
    Manages multiple Scrapfly clients and distributes agent scraping tasks.

    This manager:
    - Coordinates multiple Scrapfly workers
    - Tracks agent processing progress
    - Manages caching of listings data
    - Filters already processed properties
    - Handles concurrent property scraping

    Example:
        >>> manager = AgentListingManager()
        >>> await manager.load_agent_urls("agents.jsonl")
        >>> await manager.run()
    """

    def __init__(self, config: Optional['ScraperConfig'] = None):
        """Initialize the manager with multiple Scrapfly clients.

        Construction performs synchronous file I/O: it loads the pickled
        progress map, the already-processed property URLs and the on-disk
        listings cache.

        Args:
            config: Optional ScraperConfig. It is stored on the instance as
                ``self.config``; nothing else in this class reads it.
        """
        self.config = config
        self.workers: List[ScrapflyWorker] = create_workers_from_env()
        # Items are (agent_url, agent_data) tuples.
        self.task_queue: asyncio.Queue = asyncio.Queue()
        self.completed_agents: Set[str] = set()
        # Agents currently being handled by a worker; workers only shut down
        # once the queue is empty AND this set is empty.
        self.in_progress: Set[str] = set()
        self.lock = asyncio.Lock()
        self.results_lock = asyncio.Lock()

        # Load configurations. Copy the base config before adding headers so
        # the mapping returned by get_scrapfly_configs() is not mutated.
        configs = get_scrapfly_configs()
        self.base_config = dict(configs.get("BASE_CONFIG", {}))
        self.base_config["headers"] = configs.get("BASE_HEADERS", {})

        # Cache and progress tracking
        self.cache_dir = get_cache_dir() / "agents"
        self.cache_dir.mkdir(parents=True, exist_ok=True)
        self.progress_file = self.cache_dir / "agent_progress.pkl"
        self.progress: Dict[str, AgentProgress] = self._load_progress()

        # Initialize session manager
        self.session_manager = SessionManager()

        # Initialize cache
        self.cache = AgentListingsCache()

        # Track processed property URLs
        self.processed_rental_urls: Set[str] = set()
        self.processed_resale_urls: Set[str] = set()
        self._load_processed_properties()

        # In-memory cache: agent OID -> property type -> cached JSON payload
        self.agent_cache: Dict = {}
        self._load_agent_cache()

        logger.info(f"Initialized AgentListingManager with {len(self.workers)} workers")

    def _load_progress(self) -> Dict[str, AgentProgress]:
        """Load progress from pickle file.

        Returns:
            The unpickled progress mapping, or an empty dict when the file
            does not exist or cannot be read.
        """
        try:
            if self.progress_file.exists():
                with open(self.progress_file, 'rb') as f:
                    # SECURITY NOTE: pickle.load can execute arbitrary code.
                    # This file must only ever be one written by
                    # _save_progress; never point it at untrusted data.
                    return pickle.load(f)
        except Exception as e:
            logger.error(f"Error loading progress: {e}")
        return {}

    async def _save_progress(self) -> None:
        """Save progress to pickle file.

        The data is written to a sibling temporary file and then moved over
        the real file, so an interrupted write cannot leave a truncated
        progress file behind. Errors are logged, not raised.
        """
        async with self.lock:
            tmp_file = self.progress_file.with_name(self.progress_file.name + ".tmp")
            try:
                with open(tmp_file, 'wb') as f:
                    pickle.dump(self.progress, f)
                tmp_file.replace(self.progress_file)
            except Exception as e:
                logger.error(f"Error saving progress: {e}")

    @staticmethod
    def _read_processed_urls(
        jsonl_file: Path,
        into: Optional[Set[str]] = None
    ) -> Set[str]:
        """Collect the ``propertyUrl`` values found in a JSONL file.

        Blank lines, lines that are not valid JSON, records that are not JSON
        objects, and records without a string ``propertyUrl`` are skipped.

        Args:
            jsonl_file: Path of the JSONL file to read (UTF-8).
            into: Optional set to add the URLs to. URLs are added as they are
                read, so those seen before an I/O error are kept.

        Returns:
            The set the URLs were added to (``into`` when given, otherwise a
            new set).

        Raises:
            OSError: If the file cannot be opened or read.
            UnicodeDecodeError: If the file is not valid UTF-8.
        """
        urls: Set[str] = set() if into is None else into
        with open(jsonl_file, 'r', encoding='utf-8') as f:
            for line in f:
                if not line.strip():
                    continue
                try:
                    record = json.loads(line)
                except json.JSONDecodeError:
                    continue
                # A line may hold valid JSON that is not an object (e.g. a
                # bare number); a membership test on it would raise.
                if not isinstance(record, dict):
                    continue
                url = record.get('propertyUrl')
                if isinstance(url, str):
                    urls.add(url)
        return urls

    def _load_processed_properties(self) -> None:
        """Load already processed properties from output files.

        Each output file is handled independently, so a failure reading one
        of them does not prevent the other from being loaded. Errors are
        logged, not raised.
        """
        output_dir = get_output_dir()
        sources = (
            ("rental", output_dir / "propli_rental_properties.jsonl", self.processed_rental_urls),
            ("resale", output_dir / "propli_resale_properties.jsonl", self.processed_resale_urls),
        )
        for label, jsonl_file, processed in sources:
            try:
                if not jsonl_file.exists():
                    continue
                self._read_processed_urls(jsonl_file, processed)
                logger.info(f"Loaded {len(processed)} processed {label} URLs")
            except Exception as e:
                logger.error(f"Error loading processed properties: {e}")

    def _load_agent_cache(self) -> None:
        """Load all agent cache data into memory at startup.

        Cache files are named ``<agent_oid>_<property_type>_listings.json``.
        A file that cannot be read or parsed is skipped with a warning so the
        remaining files are still loaded.
        """
        try:
            cache_dir = get_cache_dir() / "listings"
            if not cache_dir.exists():
                return

            for cache_file in cache_dir.glob("*_*_listings.json"):
                # The glob pattern guarantees at least two underscores.
                agent_oid, property_type = cache_file.stem.split('_')[:2]

                try:
                    with open(cache_file, 'r', encoding='utf-8') as f:
                        data = json.load(f)
                except (OSError, ValueError) as e:
                    # ValueError covers both invalid JSON and bad encoding.
                    logger.warning(f"Skipping unreadable cache file {cache_file}: {e}")
                    continue

                self.agent_cache.setdefault(agent_oid, {})[property_type] = data

            logger.info(f"Loaded cache for {len(self.agent_cache)} agents")
        except Exception as e:
            logger.error(f"Error loading agent cache: {e}")

    async def filter_unprocessed_urls(
        self,
        urls: List[str],
        property_type: str
    ) -> List[str]:
        """
        Filter out already processed URLs.

        Args:
            urls: List of property URLs
            property_type: 'rental' or 'resale'. Any value other than
                'rental' is treated as 'resale'.

        Returns:
            List of unprocessed URLs, in their original order
        """
        if property_type == "rental":
            label, processed = "rental", self.processed_rental_urls
        else:
            label, processed = "resale", self.processed_resale_urls

        unprocessed = [url for url in urls if url not in processed]
        logger.info(f"Filtered {label} URLs: {len(urls)} total, {len(unprocessed)} unprocessed")
        return unprocessed

    async def load_agent_urls(self, file_path: str = "property_agents.jsonl") -> None:
        """
        Load agent URLs into the task queue.

        Every agent that is not already complete according to the progress
        map is queued as an ``(agent_url, agent_data)`` tuple. Blank lines
        are ignored.

        Args:
            file_path: Path to the JSONL file with agent data

        Raises:
            Exception: Any error while reading or parsing the file (missing
                file, malformed JSON, record without a "url" key) is logged
                and re-raised.
        """
        try:
            async with aiofiles.open(file_path, 'r', encoding='utf-8') as f:
                async for line in f:
                    # Tolerate blank lines, e.g. a trailing newline.
                    if not line.strip():
                        continue

                    agent_data = json.loads(line)
                    agent_url = agent_data["url"]

                    # Unknown agents count as incomplete.
                    if self._is_agent_complete(agent_url):
                        continue

                    await self.task_queue.put((agent_url, agent_data))
                    if agent_url not in self.progress:
                        self.progress[agent_url] = AgentProgress(url=agent_url)

            logger.info(f"Loaded {self.task_queue.qsize()} agents that need processing")
        except Exception as e:
            logger.error(f"Error loading agent URLs: {e}")
            raise

    def _is_agent_complete(self, agent_url: str) -> bool:
        """Check if an agent's processing is complete.

        Args:
            agent_url: Agent's URL

        Returns:
            False when the agent has no progress entry, otherwise the result
            of that entry's ``is_complete()``.
        """
        agent_progress = self.progress.get(agent_url)
        if agent_progress is None:
            return False
        return agent_progress.is_complete()

    async def process_agent(
        self,
        worker: ScrapflyWorker,
        agent_url: str,
        agent_data: Dict
    ) -> None:
        """
        Process single agent with cache awareness.

        If the agent has an in-memory cache entry, only its not-yet-processed
        property URLs are handled; otherwise the agent's listings are scraped
        and cached first. Errors are logged and never propagate.

        Args:
            worker: Worker to use for scraping
            agent_url: Agent's URL
            agent_data: Agent's metadata; the agent OID is read from
                ``agent_data['owner']['$oid']``
        """
        # Only pair complete_task() with a start_task() that actually ran.
        started = False
        try:
            agent_oid = agent_data.get('owner', {}).get('$oid')
            if not agent_oid:
                logger.warning(f"Missing agent OID for {agent_url}")
                return

            async with self.lock:
                self.in_progress.add(agent_url)
            worker.start_task(agent_url)
            started = True

            # Check in-memory cache
            cached_data = self.agent_cache.get(agent_oid, {})

            if cached_data:
                # NOTE(review): this lookup expects entries wrapped in a
                # 'data' key, while freshly scraped listings are stored below
                # exactly as returned by scrape_agent_listings - confirm that
                # both shapes agree.
                rental_urls = cached_data.get('rental', {}).get('data', {}).get('listings', [])
                resale_urls = cached_data.get('resale', {}).get('data', {}).get('listings', [])

                unprocessed_rentals = await self.filter_unprocessed_urls(rental_urls, "rental")
                unprocessed_resales = await self.filter_unprocessed_urls(resale_urls, "resale")

                if not unprocessed_rentals and not unprocessed_resales:
                    logger.info(f"All properties already processed for agent {agent_oid}")
                    # Nothing left to do: record the agent as completed so a
                    # duplicate queue entry is skipped.
                    async with self.lock:
                        self.completed_agents.add(agent_url)
                    return

                listings_data = {
                    "rental": {"listings": unprocessed_rentals},
                    "resale": {"listings": unprocessed_resales}
                }
            else:
                # Scrape new data
                await worker.initialize_session(agent_url)
                listings_data = await scrape_agent_listings(
                    worker.client,
                    agent_url,
                    self.base_config
                )

                # Update cache
                self.agent_cache[agent_oid] = listings_data
                await self.cache.cache_agent_listings(agent_oid, listings_data)

            # Process properties
            if listings_data:
                await self._process_properties(worker, listings_data, agent_data)

            async with self.lock:
                self.completed_agents.add(agent_url)

            logger.info(f"Worker {worker.worker_id} completed agent: {agent_url}")

        except Exception as e:
            logger.error(f"Error processing agent {agent_url}: {e}")
        finally:
            # Plain set operation with no await in between, so no lock is
            # needed and this cleanup cannot be interrupted.
            self.in_progress.discard(agent_url)
            if started:
                worker.complete_task()

    async def _process_properties(
        self,
        worker: ScrapflyWorker,
        listings_data: Dict,
        agent_data: Dict
    ) -> None:
        """
        Process both rental and resale properties.

        Rentals are handled first, then resales. For each type the URLs are
        filtered against the already-processed sets before being handed to
        the property scraper.

        Args:
            worker: Worker to use
            listings_data: Listings data with rental and resale URLs, read as
                ``listings_data[<type>]['listings']``
            agent_data: Agent metadata, passed to the scraper as ``agent_info``
        """
        # Initialize property details scraper; copy so the sticky-proxy flag
        # does not leak into the shared base config.
        base_config = self.base_config.copy()
        base_config["session_sticky_proxy"] = True

        property_scraper = PropertyDetailsScraper(
            worker.client,
            self.session_manager,
            base_config=base_config
        )

        for property_type in ("rental", "resale"):
            listings = listings_data.get(property_type, {}).get("listings", [])
            if not listings:
                continue

            unprocessed = await self.filter_unprocessed_urls(listings, property_type)
            if not unprocessed:
                logger.info(f"All {property_type} properties already processed")
                continue

            logger.info(f"Processing {len(unprocessed)} new {property_type} properties")
            await property_scraper.process_property_batch(
                unprocessed,
                property_type,
                agent_info=agent_data
            )

    async def _process_worker(self, worker: ScrapflyWorker) -> None:
        """
        Continuously process tasks for a worker.

        The worker pulls ``(agent_url, agent_data)`` tuples from the queue
        and stops once the queue is empty and no agent is in progress.

        Args:
            worker: Worker to process tasks with
        """
        while True:
            try:
                # Get next task with timeout so the exit condition below is
                # re-evaluated periodically instead of blocking forever.
                task = await asyncio.wait_for(
                    self.task_queue.get(),
                    timeout=1.0
                )

                agent_url, agent_data = task

                # Check if already completed
                if agent_url in self.completed_agents:
                    self.task_queue.task_done()
                    continue

                # Process agent
                try:
                    await self.process_agent(worker, agent_url, agent_data)
                except Exception as e:
                    logger.error(f"Error processing agent {agent_url}: {e}")
                finally:
                    self.task_queue.task_done()

            except asyncio.TimeoutError:
                # No tasks available within timeout
                if self.task_queue.empty() and not self.in_progress:
                    logger.info(f"Worker {worker.worker_id} finishing - no more tasks")
                    break
                continue
            except Exception as e:
                logger.error(f"Error in worker {worker.worker_id}: {e}")
                continue

    async def run(self) -> None:
        """Run workers continuously until all tasks are complete.

        One asyncio task is started per worker and all of them are awaited.

        Raises:
            Exception: Any error escaping the worker tasks is logged and
                re-raised.
        """
        try:
            if not self.workers:
                logger.warning("No workers configured - queued agents will not be processed")

            # Start all workers
            worker_tasks = [
                asyncio.create_task(
                    self._process_worker(worker),
                    name=f"Worker_{worker.worker_id}"
                )
                for worker in self.workers
            ]

            # Wait for all workers to complete
            await asyncio.gather(*worker_tasks)
            logger.info("All tasks completed")

        except Exception as e:
            logger.error(f"Error in main run loop: {e}")
            raise


async def main(config: Optional['ScraperConfig'] = None):
    """Build an AgentListingManager, queue the agents and run the workers.

    Args:
        config: Optional ScraperConfig forwarded to the manager.
               When omitted, the manager is created with ``config=None``.
    """
    logger.info("Starting agent listings scraper with multiple workers")

    listing_manager = AgentListingManager(config=config)

    # Fill the task queue from the default agents file, then drain it.
    await listing_manager.load_agent_urls()
    await listing_manager.run()

    logger.info("All agents processed successfully")


if __name__ == "__main__":
    # Script entry point: run the scraper with config=None (main's default).
    asyncio.run(main())
