"""
Property scraping progress cache module.

Provides progress tracking and content caching for property scraping
operations. Progress is persisted to disk with pickle; fetched content is
cached as JSON documents.
"""

import asyncio
import glob
import hashlib
import json
import pickle
import uuid
from datetime import datetime
from pathlib import Path
from typing import Dict, Optional, Set

import aiofiles
from loguru import logger

from ..utils.paths import get_cache_dir


class PropertyProgressCache:
    """
    Tracks property scraping progress with disk persistence.

    Maintains sets of fetched, processed, and failed URLs to enable
    resumable scraping operations.

    Example:
        >>> cache = PropertyProgressCache(cache_dir="output/cache", client_key="abc123")
        >>> if url not in cache.processed_urls:
        ...     # Process the URL
        ...     cache.mark_processed(url)
        ...     await cache.save()
    """

    def __init__(self, cache_dir: Path | str | None = None, client_key: str = "default"):
        """
        Initialize the progress cache.

        Args:
            cache_dir: Directory for cache files
            client_key: Unique identifier for this client/worker
        """
        self.cache_dir = Path(cache_dir) if cache_dir else get_cache_dir()
        self.cache_dir.mkdir(parents=True, exist_ok=True)
        self.client_key = client_key
        self.progress_file = self.cache_dir / f"progress_{client_key}.pkl"
        self.lock = asyncio.Lock()

        # Load existing progress
        self._progress = self._load_progress()

    @property
    def fetched_urls(self) -> Set[str]:
        """Get set of fetched URLs."""
        return self._progress.get('fetched_urls', set())

    @property
    def processed_urls(self) -> Set[str]:
        """Get set of successfully processed URLs."""
        return self._progress.get('processed_urls', set())

    @property
    def failed_urls(self) -> Set[str]:
        """Get set of failed URLs."""
        return self._progress.get('failed_urls', set())

    def _load_progress(self) -> Dict:
        """Load progress from pickle file."""
        try:
            if self.progress_file.exists():
                with open(self.progress_file, 'rb') as f:
                    progress = pickle.load(f)
                    logger.info(f"Loaded progress: {len(progress.get('processed_urls', set()))} processed URLs")
                    return progress
        except Exception as e:
            logger.error(f"Error loading progress: {e}")
        return {
            'fetched_urls': set(),
            'processed_urls': set(),
            'failed_urls': set()
        }

    async def save(self) -> None:
        """Save progress to pickle file."""
        async with self.lock:
            try:
                with open(self.progress_file, 'wb') as f:
                    pickle.dump(self._progress, f)
            except Exception as e:
                logger.error(f"Error saving progress: {e}")

    def mark_fetched(self, url: str) -> None:
        """Mark a URL as fetched."""
        self._progress['fetched_urls'].add(url)

    def mark_processed(self, url: str) -> None:
        """Mark a URL as successfully processed."""
        self._progress['processed_urls'].add(url)

    def mark_failed(self, url: str) -> None:
        """Mark a URL as failed."""
        self._progress['failed_urls'].add(url)

    def is_processed(self, url: str) -> bool:
        """Check if a URL has been processed."""
        return url in self._progress['processed_urls']

    def is_fetched(self, url: str) -> bool:
        """Check if a URL has been fetched."""
        return url in self._progress['fetched_urls']

    def get_stats(self) -> Dict:
        """Get progress statistics."""
        return {
            'fetched': len(self._progress['fetched_urls']),
            'processed': len(self._progress['processed_urls']),
            'failed': len(self._progress['failed_urls']),
        }


class PropertyContentCache:
    """
    Caches fetched HTML content for properties.

    Stores content in JSON files to avoid re-fetching during retries.

    Example:
        >>> cache = PropertyContentCache(cache_dir="output/cache")
        >>> await cache.store(url, html_content)
        >>> content = await cache.get(url)
    """

    def __init__(self, cache_dir: Path | str | None = None):
        """
        Initialize the content cache.

        Args:
            cache_dir: Directory for cache files
        """
        self.cache_dir = Path(cache_dir) if cache_dir else get_cache_dir()
        self.cache_dir.mkdir(parents=True, exist_ok=True)

    async def store(self, url: str, content: str) -> str:
        """
        Cache fetched content for a URL.

        Args:
            url: The URL that was fetched
            content: The HTML content to cache

        Returns:
            Path to the cache file
        """
        cache_file = self.cache_dir / f"{uuid.uuid4()}.html"
        async with aiofiles.open(cache_file, 'w', encoding='utf-8') as f:
            await f.write(json.dumps({
                'url': url,
                'content': content,
                'timestamp': datetime.now().isoformat()
            }))
        return str(cache_file)

    async def get(self, url: str) -> Optional[str]:
        """
        Retrieve cached content for a URL.

        Args:
            url: The URL to look up

        Returns:
            Cached content or None if not found
        """
        cache_pattern = str(self.cache_dir / "*.html")
        for cache_file in glob.glob(cache_pattern):
            try:
                async with aiofiles.open(cache_file, 'r', encoding='utf-8') as f:
                    data = json.loads(await f.read())
                    if data['url'] == url:
                        return data['content']
            except Exception:
                continue
        return None

    def clear(self) -> int:
        """
        Clear all cached content.

        Returns:
            Number of files removed
        """
        count = 0
        for cache_file in self.cache_dir.glob("*.html"):
            cache_file.unlink()
            count += 1
        return count
