"""
URL deduplication module for tracking already-scraped URLs.

Features:
- URL normalization for consistent comparison
- Persistent tracking to disk
- Bloom filter option for memory-efficient large-scale deduplication
- Thread-safe operations
"""

import hashlib
import json
import logging
import os
import threading
import time
from pathlib import Path
from typing import Optional, Set
from urllib.parse import parse_qs, urlencode, urlparse, urlunparse

# Module-level logger, named after this module via __name__.
logger = logging.getLogger(__name__)


class URLNormalizer:
    """
    URL normalization utilities for consistent URL comparison.

    Handles:
    - Scheme normalization (lowercased; https assumed when missing)
    - Scheme-less input such as "example.com/page" (first segment is the host)
    - Host lowercasing and removal of a leading "www." (userinfo is preserved)
    - Path normalization (double slashes collapsed, leading slash enforced,
      empty path becomes "/")
    - Query parameter sorting and tracking-parameter filtering
    - Fragment removal
    """

    # Query parameters to exclude (typically session/tracking params)
    EXCLUDE_PARAMS = frozenset([
        'utm_source', 'utm_medium', 'utm_campaign', 'utm_term', 'utm_content',
        'fbclid', 'gclid', 'ref', 'source', 'sessionid', 'sid', '_ga', '_gid',
        'mc_cid', 'mc_eid', 'yclid', 'rb_clickid'
    ])

    @classmethod
    def normalize(
        cls,
        url: str,
        remove_fragments: bool = True,
        remove_tracking_params: bool = True,
        lowercase_path: bool = False
    ) -> str:
        """
        Normalize a URL for consistent comparison.

        Normalization is best-effort: if anything goes wrong while parsing
        or rebuilding, a warning is logged and the input is returned as-is.

        Args:
            url: The URL to normalize
            remove_fragments: Remove URL fragments (#...)
            remove_tracking_params: Remove common tracking parameters
                (names listed in EXCLUDE_PARAMS, matched case-insensitively)
            lowercase_path: Convert path to lowercase

        Returns:
            Normalized URL string, or the original ``url`` on failure
        """
        try:
            raw = url.strip()
            parsed = urlparse(raw)

            # urlparse() only recognises a host when it follows "//", so a
            # scheme-less URL such as "example.com/page" comes back entirely
            # in .path with an empty netloc. Re-parse it as a network-path
            # reference so the host is normalized like any other URL instead
            # of producing "https:///example.com/page".
            if not parsed.scheme and not parsed.netloc and raw[:1].isalnum():
                parsed = urlparse('//' + raw)

            # Normalize scheme (default to https when none was given)
            scheme = parsed.scheme.lower() or 'https'

            # Normalize host (lowercase, remove www). Userinfo
            # ("user:password@") is case-sensitive, so only the host[:port]
            # portion after the last "@" is touched.
            userinfo, at_sign, host_port = parsed.netloc.rpartition('@')
            host_port = host_port.lower()
            if host_port.startswith('www.'):
                host_port = host_port[4:]
            netloc = userinfo + at_sign + host_port

            # Normalize path
            path = parsed.path
            # Collapse runs of slashes into a single slash
            while '//' in path:
                path = path.replace('//', '/')
            # Ensure leading slash
            if path and not path.startswith('/'):
                path = '/' + path
            # An empty path is equivalent to the root; existing trailing
            # slashes are left untouched.
            if not path:
                path = '/'
            if lowercase_path:
                path = path.lower()

            # Normalize query parameters
            query_params = parse_qs(parsed.query, keep_blank_values=True)
            if remove_tracking_params:
                query_params = {
                    k: v for k, v in query_params.items()
                    if k.lower() not in cls.EXCLUDE_PARAMS
                }
            # Sort by parameter name; doseq=True expands each value list into
            # repeated key=value pairs, preserving the order of the values.
            query = urlencode(sorted(query_params.items()), doseq=True)

            # Handle fragment
            fragment = '' if remove_fragments else parsed.fragment

            # Rebuild URL
            return urlunparse((
                scheme,
                netloc,
                path,
                parsed.params,
                query,
                fragment
            ))
        except Exception as e:
            # Deliberate best-effort: never let a malformed URL break callers.
            logger.warning(f"Failed to normalize URL '{url}': {e}")
            return url

    @classmethod
    def get_hash(cls, url: str) -> str:
        """
        Get a hash of the normalized URL.

        The URL is normalized with the default options of ``normalize``
        before hashing, so equivalent URLs yield the same hash.

        Args:
            url: The URL to hash

        Returns:
            SHA256 hex digest of the normalized URL (UTF-8 encoded)
        """
        normalized = cls.normalize(url)
        return hashlib.sha256(normalized.encode('utf-8')).hexdigest()


class URLDeduplicator:
    """
    Track already-scraped URLs with persistence support.

    URLs are normalized with URLNormalizer and tracked both as normalized
    strings and as hashes; membership checks use the hash set for O(1)
    lookups. State is persisted to a JSON file for recovery between
    sessions, and all state changes are guarded by a re-entrant lock.

    Args:
        persistence_file: Path to persistence file (default: .cache/scraped_urls.json)
        auto_save_interval: Save to disk every N changes (default: 100)

    Example:
        dedup = URLDeduplicator()

        if not dedup.is_scraped(url):
            # scrape the URL
            dedup.mark_scraped(url)
    """

    def __init__(
        self,
        persistence_file: Optional[str] = None,
        auto_save_interval: int = 100
    ):
        # Re-entrant so that methods holding the lock can call save().
        self._lock = threading.RLock()
        self._scraped_urls: Set[str] = set()
        self._url_hashes: Set[str] = set()
        self._changes_since_save = 0
        self._auto_save_interval = auto_save_interval

        # Set up persistence file
        if persistence_file is None:
            base_dir = Path(__file__).parent.parent.parent
            self._persistence_file = base_dir / '.cache' / 'scraped_urls.json'
        else:
            self._persistence_file = Path(persistence_file)

        # Create directory if needed
        self._persistence_file.parent.mkdir(parents=True, exist_ok=True)

        # Load existing data
        self._load()

        logger.info(
            f"URLDeduplicator initialized with {len(self._scraped_urls)} URLs"
        )

    @staticmethod
    def _fingerprint(url: str) -> tuple:
        """Return ``(normalized_url, url_hash)`` for ``url``."""
        return URLNormalizer.normalize(url), URLNormalizer.get_hash(url)

    def _load(self) -> None:
        """
        Load scraped URLs from disk.

        A missing file leaves the state empty. Any failure while reading or
        decoding is logged and also results in an empty state.
        """
        try:
            if self._persistence_file.exists():
                with open(self._persistence_file, 'r', encoding='utf-8') as f:
                    data = json.load(f)

                self._scraped_urls = set(data.get('urls', []))
                self._url_hashes = set(data.get('hashes', []))

                # Rebuild hashes if missing
                if not self._url_hashes and self._scraped_urls:
                    self._url_hashes = {
                        URLNormalizer.get_hash(url)
                        for url in self._scraped_urls
                    }

                logger.info(
                    f"Loaded {len(self._scraped_urls)} scraped URLs from disk"
                )
        except Exception as e:
            logger.warning(f"Failed to load scraped URLs: {e}")
            self._scraped_urls = set()
            self._url_hashes = set()

    def save(self) -> bool:
        """
        Save scraped URLs to disk.

        The data is written to a sibling temp file and then moved over the
        persistence file, so a failed write does not truncate existing data.

        Returns:
            True if save was successful
        """
        # Append ".tmp" to the full file name instead of swapping the suffix:
        # with_suffix('.tmp') would collide for "urls.json" / "urls.txt" and
        # would equal the target itself for a file already named "*.tmp".
        temp_file = self._persistence_file.with_name(
            self._persistence_file.name + '.tmp'
        )

        with self._lock:
            try:
                data = {
                    'urls': list(self._scraped_urls),
                    'hashes': list(self._url_hashes),
                    'count': len(self._scraped_urls),
                    'saved_at': time.time()
                }

                # Write atomically with temp file
                with open(temp_file, 'w', encoding='utf-8') as f:
                    json.dump(data, f, indent=2)

                temp_file.replace(self._persistence_file)
                self._changes_since_save = 0

                logger.debug(f"Saved {len(self._scraped_urls)} URLs to disk")
                return True
            except Exception as e:
                logger.error(f"Failed to save scraped URLs: {e}")
                # Best-effort cleanup of a partially written temp file; done
                # while still holding the lock so a concurrent save() cannot
                # have its own temp file removed.
                try:
                    temp_file.unlink()
                except OSError:
                    pass
                return False

    def _maybe_auto_save(self) -> None:
        """Auto-save if enough changes have accumulated."""
        if self._changes_since_save >= self._auto_save_interval:
            self.save()

    def is_scraped(self, url: str) -> bool:
        """
        Check if a URL has already been scraped.

        Uses hash-based comparison for O(1) lookup.

        Args:
            url: The URL to check

        Returns:
            True if URL has been scraped
        """
        url_hash = URLNormalizer.get_hash(url)
        with self._lock:
            return url_hash in self._url_hashes

    def mark_scraped(self, url: str, save: bool = False) -> bool:
        """
        Mark a URL as scraped.

        Args:
            url: The URL that was scraped
            save: Force immediate save to disk; otherwise a save happens
                only once the auto-save interval is reached

        Returns:
            True if URL was newly added, False if already existed
        """
        normalized_url, url_hash = self._fingerprint(url)

        with self._lock:
            if url_hash in self._url_hashes:
                return False

            self._scraped_urls.add(normalized_url)
            self._url_hashes.add(url_hash)
            self._changes_since_save += 1

            if save:
                self.save()
            else:
                self._maybe_auto_save()

            return True

    def mark_scraped_batch(self, urls: list, save: bool = True) -> int:
        """
        Mark multiple URLs as scraped.

        Args:
            urls: List of URLs that were scraped
            save: Save to disk after batch (default: True); when False a
                save still happens once the auto-save interval is reached

        Returns:
            Number of newly added URLs
        """
        # Normalize and hash before taking the lock so other threads are not
        # blocked while the batch is being prepared.
        fingerprints = [self._fingerprint(url) for url in urls]

        added = 0
        with self._lock:
            for normalized_url, url_hash in fingerprints:
                if url_hash not in self._url_hashes:
                    self._scraped_urls.add(normalized_url)
                    self._url_hashes.add(url_hash)
                    added += 1

            self._changes_since_save += added

            if added > 0:
                if save:
                    self.save()
                else:
                    # Keep parity with mark_scraped(): unsaved changes must
                    # not accumulate past the auto-save interval.
                    self._maybe_auto_save()

        return added

    def remove(self, url: str, save: bool = False) -> bool:
        """
        Remove a URL from the scraped set.

        Args:
            url: The URL to remove
            save: Force immediate save to disk; otherwise a save happens
                only once the auto-save interval is reached

        Returns:
            True if URL was removed
        """
        normalized_url, url_hash = self._fingerprint(url)

        with self._lock:
            if url_hash not in self._url_hashes:
                return False

            self._scraped_urls.discard(normalized_url)
            self._url_hashes.discard(url_hash)
            self._changes_since_save += 1

            if save:
                self.save()
            else:
                # Removals count as changes, so they take part in auto-save
                # exactly like additions do.
                self._maybe_auto_save()

            return True

    def clear(self, save: bool = True) -> None:
        """
        Clear all scraped URLs.

        Args:
            save: Save empty state to disk (default: True)
        """
        with self._lock:
            self._scraped_urls.clear()
            self._url_hashes.clear()
            self._changes_since_save = 0

            if save:
                self.save()

        logger.info("Cleared all scraped URLs")

    def get_stats(self) -> dict:
        """
        Get deduplication statistics.

        Returns:
            Dict with 'total_urls', 'changes_since_save' and
            'persistence_file' keys
        """
        with self._lock:
            return {
                'total_urls': len(self._scraped_urls),
                'changes_since_save': self._changes_since_save,
                'persistence_file': str(self._persistence_file)
            }

    def __len__(self) -> int:
        """Return the number of scraped URLs."""
        with self._lock:
            return len(self._scraped_urls)

    def __contains__(self, url: str) -> bool:
        """Check if URL has been scraped."""
        return self.is_scraped(url)


# Module-level singleton instance; stays None until
# get_default_deduplicator() creates it on first use.
_default_deduplicator: Optional[URLDeduplicator] = None
# Held by get_default_deduplicator() while it creates the singleton.
_default_dedup_lock = threading.Lock()


def get_default_deduplicator(
    persistence_file: Optional[str] = None
) -> URLDeduplicator:
    """
    Return the shared module-level URLDeduplicator, creating it on demand.

    Args:
        persistence_file: Path to persistence file; only honoured by the
            call that actually creates the instance, ignored afterwards

    Returns:
        The default URLDeduplicator instance
    """
    global _default_deduplicator

    # Fast path: the instance already exists, no locking required.
    if _default_deduplicator is not None:
        return _default_deduplicator

    with _default_dedup_lock:
        # Check again under the lock: another thread may have created the
        # instance between the test above and acquiring the lock.
        if _default_deduplicator is None:
            _default_deduplicator = URLDeduplicator(
                persistence_file=persistence_file
            )

    return _default_deduplicator
