"""
MongoDB data cleaner module.

This module provides functionality to clean and transform data for MongoDB insertion,
including UUID to ObjectID conversion, UTF-8/BSON compatibility fixes, and data
structure validation.
"""

import json
import re
import uuid
from pathlib import Path
from typing import Any, Callable, Dict, List, Optional, Union

from bson import ObjectId


def is_uuid(value: str) -> bool:
    """
    Report whether a value can be parsed as a UUID.

    Parsing is delegated to ``uuid.UUID``. If the constructor raises
    ValueError, AttributeError or TypeError, the value is reported as
    "not a UUID" instead of letting the exception propagate.

    Args:
        value: Candidate value to test, normally a string

    Returns:
        True if ``uuid.UUID`` accepts the value, False otherwise
    """
    try:
        uuid.UUID(value)
    except (ValueError, AttributeError, TypeError):
        # Anything the constructor rejects is simply "not a UUID"
        return False
    else:
        return True


def convert_uuid_to_mongo_id(value: str) -> str:
    """
    Shorten a UUID string to a 24-character hex ID.

    Values that ``is_uuid`` does not recognise are handed back untouched.
    For a UUID, the hex form (no dashes) is taken and only its first
    24 characters are kept, which is the length of an ObjectID hex string.

    Args:
        value: String that may or may not be a UUID

    Returns:
        The first 24 hex characters of the UUID, or the original value
        when it is not a UUID
    """
    # Guard clause: non-UUID input passes through unchanged
    if not is_uuid(value):
        return value

    full_hex = uuid.UUID(value).hex
    return full_hex[:24]


def clean_string(s: Any) -> Any:
    """
    Sanitize a string so it is valid UTF-8 and safe for BSON encoding.

    Non-string input is returned as-is. For strings, three steps run in
    order: characters that cannot be encoded as UTF-8 are dropped, null
    bytes are removed, and the remaining control characters are stripped.
    Tab, line feed and carriage return are kept.

    Args:
        s: Value to sanitize (only strings are modified)

    Returns:
        The sanitized string, or the original value if it is not a string
    """
    if isinstance(s, str):
        # Round-trip through UTF-8, silently dropping unencodable characters
        utf8_safe = s.encode('utf-8', errors='ignore').decode('utf-8')

        # Null bytes are invalid in BSON
        without_nul = utf8_safe.replace('\x00', '')

        # Strip control characters, keeping \t (0x09), \n (0x0A), \r (0x0D)
        control_chars = re.compile(r'[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]')
        return control_chars.sub('', without_nul)

    return s


def _clean_value(value: Any) -> Any:
    """Recursively sanitize strings inside dicts and lists of any depth."""
    if isinstance(value, dict):
        return {key: _clean_value(item) for key, item in value.items()}
    if isinstance(value, list):
        return [_clean_value(item) for item in value]
    # clean_string sanitizes strings and returns every other type unchanged
    return clean_string(value)


def clean_dict(d: Any) -> Any:
    """
    Recursively clean all string values in a dictionary.

    Walks nested dictionaries and lists to any depth (including lists
    nested directly inside other lists) and applies ``clean_string`` to
    every string value found. Dictionary keys are left untouched, and
    non-string leaf values are kept as they are. The input is not
    modified; new dict and list containers are built for the result.

    Args:
        d: Dictionary to clean

    Returns:
        A new dictionary with all string values sanitized. If ``d`` is not
        a dictionary it is returned unchanged.
    """
    # Top-level contract: only dictionaries are processed
    if not isinstance(d, dict):
        return d
    return _clean_value(d)


def ensure_proper_id_format(value: Any) -> Dict[str, str]:
    """
    Normalize an ID into the ``{"$oid": "..."}`` dictionary form.

    Three kinds of input are handled:
    - a dict that already has a "$oid" key: a UUID stored under that key
      is shortened in place and the same dict object is returned
    - a string: wrapped as {"$oid": ...} after UUID shortening; strings
      that are not UUIDs are wrapped as-is, without further validation
    - anything else: replaced by a newly generated ObjectId

    Args:
        value: ID value to normalize (string, dict, or any other type)

    Returns:
        Dictionary in the form {"$oid": "..."}
    """
    if isinstance(value, dict) and "$oid" in value:
        current = value["$oid"]
        # Existing wrapper: only rewrite the payload when it is a UUID
        if is_uuid(current):
            value["$oid"] = convert_uuid_to_mongo_id(current)
        return value

    if isinstance(value, str):
        # convert_uuid_to_mongo_id leaves non-UUID strings unchanged
        return {"$oid": convert_uuid_to_mongo_id(value)}

    # Unsupported input: fall back to a fresh ObjectId
    return {"$oid": str(ObjectId())}


def ensure_pricing_structure(unit: Dict[str, Any]) -> Dict[str, Any]:
    """
    Ensure unit pricing structure has all required fields.

    Creates ``unit['pricing']['sales']`` when it is missing (or present
    but null), normalizes its ``_id`` through ``ensure_proper_id_format``
    and fills in any missing sales fields with default values. Fields that
    already exist keep their current values. The ``unit`` dictionary is
    modified in place and also returned.

    Defaults: every commission field and ``price`` default to 0;
    ``communityFees`` and ``IBIFees`` default to None.

    Args:
        unit: Unit data dictionary

    Returns:
        The same unit dictionary with a complete pricing structure
    """
    # A missing key and an explicit null are both treated as "absent"
    if unit.get('pricing') is None:
        unit['pricing'] = {}
    pricing = unit['pricing']

    if pricing.get('sales') is None:
        pricing['sales'] = {}
    sales = pricing['sales']

    # Only generate a new ObjectId when there is no _id to normalize
    raw_id = sales['_id'] if '_id' in sales else str(ObjectId())
    sales['_id'] = ensure_proper_id_format(raw_id)

    # Field order here determines the order in which missing keys are added
    default_fields = (
        ('salesTotalCommissionPercentage', 0),
        ('salesTotalCommission', 0),
        ('salesListingAgentCommissionPercentage', 0),
        ('salesListingAgentCommission', 0),
        ('salesSellingAgentCommissionPercentage', 0),
        # Was 4 previously; aligned with the other commission defaults
        ('salesSellingAgentCommission', 0),
        ('communityFees', None),
        ('IBIFees', None),
        ('price', 0),
    )
    for field_name, default_value in default_fields:
        sales.setdefault(field_name, default_value)

    return unit


class MongoCleaner:
    """
    MongoDB data cleaner with progress tracking and batch processing support.

    Provides a class-based interface for cleaning JSONL files, converting UUIDs
    to ObjectIDs, fixing BSON compatibility issues, and ensuring proper data structures.

    Example:
        >>> cleaner = MongoCleaner()
        >>> cleaner.convert_file(
        ...     "input.jsonl",
        ...     "output.jsonl",
        ...     progress_callback=lambda p: print(f"Progress: {p}%")
        ... )
    """

    def __init__(self):
        """Initialize the MongoCleaner with all counters at zero."""
        # Records written successfully during the most recent convert_file run
        self.success_count = 0
        # Records that failed during the most recent convert_file run
        self.error_count = 0
        # Non-empty input lines; only counted when a progress callback is used
        self.total_lines = 0

    def _count_lines(self, file_path: Union[str, Path]) -> int:
        """
        Count non-empty lines in a file for progress tracking.

        The file is opened the same way convert_file reads it (UTF-8,
        undecodable bytes ignored) so both passes see the same lines.

        Args:
            file_path: Path to the file

        Returns:
            Number of non-empty lines in the file
        """
        with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
            return sum(1 for line in f if line.strip())

    def clean_record(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """
        Clean a single record.

        Applies all cleaning transformations to a single data record:
        - String cleaning for UTF-8/BSON compatibility
        - UUID to ObjectID conversion for ``_id`` and ``ownerDetails._id``
          (values that are already dictionaries are left as they are)
        - Pricing structure validation for every unit

        Parts of the record that do not have the expected shape are
        skipped rather than raising: a non-dictionary record is returned
        as produced by ``clean_dict``, a non-dictionary ``ownerDetails``
        and a non-list ``units`` are left untouched, and non-dictionary
        entries inside ``units`` are kept as they are.

        Args:
            data: Data record to clean

        Returns:
            Cleaned data record
        """
        # Clean all string values
        data = clean_dict(data)

        # Nothing else applies to a record that is not a JSON object
        if not isinstance(data, dict):
            return data

        # Convert main _id if it's not already a dictionary
        if '_id' in data and not isinstance(data['_id'], dict):
            data['_id'] = ensure_proper_id_format(data['_id'])

        # Convert ownerDetails._id if present (ownerDetails may be null)
        owner = data.get('ownerDetails')
        if (
            isinstance(owner, dict)
            and '_id' in owner
            and not isinstance(owner['_id'], dict)
        ):
            owner['_id'] = ensure_proper_id_format(owner['_id'])

        # Process units pricing structure (units may be null or malformed)
        units = data.get('units')
        if isinstance(units, list):
            data['units'] = [
                ensure_pricing_structure(unit) if isinstance(unit, dict) else unit
                for unit in units
            ]

        return data

    def convert_file(
        self,
        input_path: Union[str, Path],
        output_path: Union[str, Path],
        progress_callback: Optional[Callable[[float], None]] = None,
        error_callback: Optional[Callable[[int, str, Exception], None]] = None,
    ) -> Dict[str, int]:
        """
        Convert UUID-based IDs to MongoDB ObjectID format and clean BSON issues.

        Processes a JSONL file line by line, applying all cleaning transformations
        and writing to an output file. Blank lines are skipped. A line that
        cannot be parsed or cleaned is counted as an error and reported, and
        processing continues with the next line. Progress is reported after
        every non-empty line, whether it succeeded or failed.

        Args:
            input_path: Path to input JSONL file
            output_path: Path to output JSONL file (parent directories are created)
            progress_callback: Optional callback function that receives progress percentage (0-100)
            error_callback: Optional callback function that receives (line_number, line_content, exception).
                When omitted, errors are printed to the console.

        Returns:
            Dictionary with processing statistics:
            - success_count: Number of successfully processed records
            - error_count: Number of failed records
            - total_processed: Total records attempted

        Raises:
            ValueError: If input_path and output_path refer to the same file
                (opening the output for writing would erase the input).

        Example:
            >>> def on_progress(percent):
            ...     print(f"Progress: {percent:.1f}%")
            >>>
            >>> def on_error(line_num, content, error):
            ...     print(f"Error at line {line_num}: {error}")
            >>>
            >>> stats = cleaner.convert_file(
            ...     "input.jsonl",
            ...     "output.jsonl",
            ...     progress_callback=on_progress,
            ...     error_callback=on_error
            ... )
        """
        # Reset counters so statistics describe this run only
        self.success_count = 0
        self.error_count = 0
        self.total_lines = 0

        # Convert to Path objects
        input_path = Path(input_path)
        output_path = Path(output_path)

        # Opening the output in 'w' mode truncates it; refuse to destroy the input
        if input_path.resolve() == output_path.resolve():
            raise ValueError("input_path and output_path must be different files")

        # Ensure output directory exists
        output_path.parent.mkdir(parents=True, exist_ok=True)

        # Count total lines for progress tracking (extra pass, only when needed)
        if progress_callback:
            self.total_lines = self._count_lines(input_path)

        current_line = 0

        with open(input_path, 'r', encoding='utf-8', errors='ignore') as infile, \
             open(output_path, 'w', encoding='utf-8') as outfile:

            for line_number, line in enumerate(infile, 1):
                stripped = line.strip()
                if not stripped:
                    continue

                current_line += 1

                try:
                    cleaned_data = self.clean_record(json.loads(stripped))
                    serialized = json.dumps(cleaned_data)
                except (
                    json.JSONDecodeError,
                    UnicodeError,
                    TypeError,
                    AttributeError,
                    KeyError,
                ) as e:
                    # One malformed record must not abort the whole file
                    self.error_count += 1

                    if error_callback:
                        error_callback(line_number, line, e)
                    else:
                        print(f"Error processing line {line_number}: {str(e)}")
                else:
                    # Write the cleaned line
                    outfile.write(serialized + '\n')
                    self.success_count += 1

                # Update progress for failed lines too, so it can reach 100%
                if progress_callback and self.total_lines > 0:
                    progress = (current_line / self.total_lines) * 100
                    progress_callback(progress)

        return self.get_stats()

    def get_stats(self) -> Dict[str, int]:
        """
        Get current processing statistics.

        Returns:
            Dictionary with success_count, error_count and total_processed
            (the sum of the two) for the most recent run
        """
        return {
            'success_count': self.success_count,
            'error_count': self.error_count,
            'total_processed': self.success_count + self.error_count,
        }

    def print_stats(self) -> None:
        """Print current processing statistics to console."""
        stats = self.get_stats()
        print("\nConversion Statistics:")
        print(f"Successfully processed records: {stats['success_count']}")
        print(f"Failed records: {stats['error_count']}")
        print(f"Total records processed: {stats['total_processed']}")

        # Skip the rate when nothing was processed to avoid dividing by zero
        if stats['total_processed'] > 0:
            success_rate = (stats['success_count'] / stats['total_processed']) * 100
            print(f"Success rate: {success_rate:.2f}%")


# Backward-compatible function for CLI usage
def convert_file(input_path: str, output_path: str) -> Dict[str, int]:
    """
    Clean a JSONL file and print a summary of the run.

    Thin wrapper kept for backward compatibility: it creates a
    MongoCleaner, runs its ``convert_file`` on the given paths without any
    callbacks, prints the resulting statistics to the console and returns
    them.

    Args:
        input_path: Path to input JSONL file
        output_path: Path to output JSONL file

    Returns:
        Dictionary with processing statistics, as returned by
        ``MongoCleaner.convert_file``
    """
    runner = MongoCleaner()
    result = runner.convert_file(input_path, output_path)
    # Report to the console before handing the numbers back to the caller
    runner.print_stats()
    return result
