"""
S3 image uploader with streaming support and progress tracking.

This module provides an async S3 uploader that downloads images from URLs
and streams them directly to S3 without saving to disk. Features include:
- Streaming uploads (no disk storage needed)
- Resume capability with progress tracking
- Batch processing with configurable concurrency
- Progress callbacks for CLI integration
- Comprehensive error handling and logging
- All configurable values loaded from src.utils.config.settings
"""

import asyncio
import aiohttp
import aioboto3
import json
import os
from urllib.parse import urlparse
from typing import List, Dict, Optional, Callable, Any
from pathlib import Path
from aiohttp.client import ClientTimeout
from boto3.s3.transfer import TransferConfig
import botocore.exceptions

from ..utils.config import settings
from ..utils.paths import get_output_dir


class S3ImageUploader:
    """
    Downloads images over HTTP and streams them into an S3 bucket.

    The uploader never writes image data to disk: the HTTP response body is
    handed to the S3 client as the upload source. It supports:
    - A global cap on simultaneous transfers (``max_concurrency``)
    - Progress tracking in a small JSON file so a run can be resumed
    - Batch processing of properties read from a JSONL file
    - Custom progress callbacks for integration

    Attributes:
        bucket_name: Name of the S3 bucket to upload to
        max_concurrency: Maximum number of concurrent upload operations
        session: aioboto3 session instance for AWS operations
        transfer_config: S3 transfer configuration passed to every upload
        progress_file: Path to the progress tracking JSON file
        semaphore: Asyncio semaphore held for the duration of each transfer
        progress_callback: Optional callback function for progress updates

    Environment Variables Required:
        AWS_ACCESS_KEY_ID: AWS access key for authentication
        AWS_SECRET_ACCESS_KEY: AWS secret key for authentication
        AWS_REGION: AWS region for S3 bucket
    """

    def __init__(
        self,
        bucket_name: str,
        max_concurrency: Optional[int] = None,
        progress_callback: Optional[Callable[[str, Any], None]] = None
    ):
        """
        Initialize the S3ImageUploader.

        Args:
            bucket_name: Name of the S3 bucket to upload images to
            max_concurrency: Maximum number of concurrent upload operations.
                           If None, uses settings.max_concurrency.
            progress_callback: Optional callback function called with progress updates
                             Signature: callback(event_type: str, data: Any) -> None
                             Event types: 'upload_start', 'upload_success', 'upload_error',
                                        'property_complete', 'batch_complete'

        Raises:
            ValueError: If required environment variables are not set
        """
        # Validate required environment variables
        aws_access_key = os.getenv('AWS_ACCESS_KEY_ID')
        aws_secret_key = os.getenv('AWS_SECRET_ACCESS_KEY')
        aws_region = os.getenv('AWS_REGION')

        if not all([aws_access_key, aws_secret_key, aws_region]):
            raise ValueError(
                "Required AWS credentials not found. Please set: "
                "AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY, AWS_REGION"
            )

        self.session = aioboto3.Session(
            aws_access_key_id=aws_access_key,
            aws_secret_access_key=aws_secret_key,
            region_name=aws_region
        )

        # Use provided value or fall back to settings
        concurrency = max_concurrency if max_concurrency is not None else settings.max_concurrency

        # Configure S3 transfer settings
        self.transfer_config = TransferConfig(
            multipart_threshold=1024 * 1024 * 5,  # 5MB threshold for multipart
            max_concurrency=concurrency,
            multipart_chunksize=1024 * 1024 * 5,  # 5MB chunks
            use_threads=True
        )

        self.bucket_name = bucket_name
        self.max_concurrency = concurrency
        self.progress_file = Path("upload_progress.json")
        # Held by upload_image() while a transfer is in flight.
        # NOTE(review): on Python < 3.10 asyncio primitives bind to the event
        # loop that is current at construction time, so on those versions the
        # uploader should be created inside the running loop.
        self.semaphore = asyncio.Semaphore(concurrency)
        self.progress_callback = progress_callback

    def _notify_progress(self, event_type: str, data: Any) -> None:
        """
        Send progress notification to callback if configured.

        A failing callback never interrupts the upload flow; the failure is
        logged instead of being dropped silently.

        Args:
            event_type: Type of progress event
            data: Event data to send to callback
        """
        if self.progress_callback:
            try:
                self.progress_callback(event_type, data)
            except Exception:
                import logging
                logging.getLogger(__name__).warning(
                    "Progress callback raised while handling %r event",
                    event_type,
                    exc_info=True
                )

    def _report_upload_error(self, logger, event: Dict[str, Any], error_msg: str) -> None:
        """
        Log an upload failure and emit the matching 'upload_error' event.

        Args:
            logger: Logger instance for recording operations
            event: Base event payload (property_id, image_url, s3_key)
            error_msg: Human-readable description of the failure

        Returns:
            Always None, so callers can ``return`` the call directly
        """
        logger.error(error_msg)
        self._notify_progress('upload_error', {**event, 'error': error_msg})
        return None

    def load_progress(self) -> int:
        """
        Load the progress from the tracking file.

        Returns:
            The last processed property index, or 0 if the progress file is
            missing, unreadable as JSON, or does not hold a non-negative
            integer under 'last_index'
        """
        try:
            progress_data = json.loads(self.progress_file.read_text(encoding='utf-8'))
        except FileNotFoundError:
            return 0
        except (json.JSONDecodeError, ValueError):
            # Corrupted progress file, start from beginning
            return 0

        # Valid JSON that is not the expected {"last_index": <int>} shape is
        # treated exactly like a corrupted file.
        if not isinstance(progress_data, dict):
            return 0
        last_index = progress_data.get('last_index', 0)
        if isinstance(last_index, bool) or not isinstance(last_index, int) or last_index < 0:
            return 0
        return last_index

    def save_progress(self, index: int) -> None:
        """
        Save the current progress to the tracking file.

        The data is written to a sibling temporary file and then moved over
        the real file, so an interrupted write cannot leave a truncated
        progress file behind.

        Args:
            index: Current processing index to save
        """
        payload = json.dumps({'last_index': index})
        tmp_path = self.progress_file.with_name(self.progress_file.name + '.tmp')
        tmp_path.write_text(payload, encoding='utf-8')
        tmp_path.replace(self.progress_file)

    async def upload_image(
        self,
        session: aiohttp.ClientSession,
        s3_client,
        property_id: str,
        image_url: str,
        logger
    ) -> Optional[str]:
        """
        Download and upload a single image to S3 using streaming.

        The image is stored under ``<property_id>/<file name from the URL path>``.
        The uploader's semaphore is held for the whole transfer, which is what
        enforces ``max_concurrency`` across all properties being processed.

        Args:
            session: aiohttp client session for HTTP requests
            s3_client: aioboto3 S3 client for uploads
            property_id: Unique identifier for the property
            image_url: URL of the image to download and upload
            logger: Logger instance for recording operations

        Returns:
            S3 key (path) of the uploaded image if successful, None otherwise
        """
        filename = Path(urlparse(image_url).path).name
        s3_key = f"{property_id}/{filename}"
        event = {
            'property_id': property_id,
            'image_url': image_url,
            's3_key': s3_key
        }

        self._notify_progress('upload_start', dict(event))

        if not filename:
            # Without a basename the key would be "<property_id>/" and every
            # such URL of the property would overwrite the same object.
            return self._report_upload_error(
                logger, event, f"Cannot derive a file name from URL {image_url}"
            )

        try:
            async with self.semaphore:
                async with session.get(image_url) as response:
                    if response.status != 200:
                        return self._report_upload_error(
                            logger,
                            event,
                            f"Failed to download {image_url}: HTTP {response.status}"
                        )

                    # Stream data directly to S3 without saving to disk.
                    # NOTE(review): every object is tagged image/jpeg regardless
                    # of the source format - confirm this is intended.
                    await s3_client.upload_fileobj(
                        Fileobj=response.content,
                        Bucket=self.bucket_name,
                        Key=s3_key,
                        Config=self.transfer_config,
                        ExtraArgs={'ContentType': 'image/jpeg'}
                    )
        except aiohttp.ClientError as e:
            return self._report_upload_error(
                logger, event, f"HTTP error for {image_url}: {e}"
            )
        except (botocore.exceptions.BotoCoreError, botocore.exceptions.ClientError) as e:
            # ClientError (service-side failures such as AccessDenied) does not
            # derive from BotoCoreError, so both must be listed.
            return self._report_upload_error(
                logger, event, f"S3 error for {s3_key}: {e}"
            )
        except asyncio.TimeoutError:
            # str() of a timeout is empty, so give it a message of its own.
            return self._report_upload_error(
                logger, event, f"Timed out while transferring {image_url}"
            )
        except Exception as e:
            return self._report_upload_error(
                logger, event, f"Unexpected error for {image_url}: {e}"
            )

        logger.info(f"Successfully uploaded {s3_key}")
        self._notify_progress('upload_success', dict(event))
        return s3_key

    async def process_property(
        self,
        session: aiohttp.ClientSession,
        s3_client,
        property_data: Dict,
        logger
    ) -> List[str]:
        """
        Process all images for a single property.

        All uploads of the property are scheduled at once; the number that
        actually run at the same time is bounded by the semaphore acquired in
        upload_image().

        Args:
            session: aiohttp client session for HTTP requests
            s3_client: aioboto3 S3 client for uploads
            property_data: Dictionary containing property information with keys:
                          - property_id: Unique property identifier
                          - image_urls: List of image URLs to process
            logger: Logger instance for recording operations

        Returns:
            List of successfully uploaded S3 keys (paths), in the order of
            the corresponding URLs
        """
        property_id = property_data['property_id']
        image_urls = property_data['image_urls']

        logger.info(f"Processing property {property_id} with {len(image_urls)} images")

        outcomes = await asyncio.gather(*[
            self.upload_image(session, s3_client, property_id, url, logger)
            for url in image_urls
        ])

        # upload_image() returns None for every failed transfer
        results = [s3_key for s3_key in outcomes if s3_key is not None]

        self._notify_progress('property_complete', {
            'property_id': property_id,
            'total_images': len(image_urls),
            'successful_uploads': len(results),
            'failed_uploads': len(image_urls) - len(results)
        })

        return results

    async def _process_batch(
        self,
        session: aiohttp.ClientSession,
        s3_client,
        batch_items: List[Any],
        end_index: int,
        logger,
        final: bool = False
    ) -> None:
        """
        Upload one batch of properties, then persist and announce progress.

        Args:
            session: aiohttp client session for HTTP requests
            s3_client: aioboto3 S3 client for uploads
            batch_items: ``(line_index, property_data)`` pairs making up the batch
            end_index: Progress index to save once the batch is done
            logger: Logger instance for recording operations
            final: True for the trailing, possibly partial, batch
        """
        # return_exceptions=True keeps one malformed record from aborting the
        # rest of the batch.
        outcomes = await asyncio.gather(
            *[
                self.process_property(session, s3_client, prop_data, logger)
                for _, prop_data in batch_items
            ],
            return_exceptions=True
        )
        for (line_index, _), outcome in zip(batch_items, outcomes):
            if isinstance(outcome, BaseException):
                logger.error(
                    f"Error processing property at line {line_index + 1}: {str(outcome)}"
                )

        try:
            self.save_progress(end_index)
        except OSError as e:
            # Losing a checkpoint only makes a later resume less precise.
            logger.error(f"Could not save progress at index {end_index}: {e}")

        payload = {
            'batch_end_index': end_index,
            'batch_size': len(batch_items)
        }
        if final:
            logger.info("Completed final batch")
            payload['final'] = True
        else:
            logger.info(f"Completed batch up to property {end_index}")
        self._notify_progress('batch_complete', payload)

    async def process_all_properties(
        self,
        jsonl_file: str,
        batch_size: Optional[int] = None,
        logger = None
    ) -> None:
        """
        Process all properties from a JSONL file in batches.

        Reads properties from a JSONL file and processes their images in batches.
        Supports resuming from the last saved progress point. Each line in the
        JSONL file should be a JSON object with 'property_id' and 'image_urls' keys.
        Blank lines are skipped; lines that are not valid JSON and records that
        cannot be processed are logged and skipped.

        Args:
            jsonl_file: Path to the JSONL file containing property data
            batch_size: Number of properties to process in each batch.
                       If None, uses settings.s3_batch_size.
            logger: Optional logger instance (creates default if not provided)

        Raises:
            FileNotFoundError: If the JSONL file doesn't exist
        """
        # Use a default logger if none provided
        if logger is None:
            import logging
            logging.basicConfig(
                level=logging.INFO,
                format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
            )
            logger = logging.getLogger(__name__)

        # Use provided value or fall back to settings
        batch = batch_size if batch_size is not None else settings.s3_batch_size

        start_index = self.load_progress()
        logger.info(f"Starting from index {start_index}")

        # Configure HTTP client with timeouts from settings
        # NOTE(review): sock_read reuses the connect timeout - confirm intended.
        timeout = ClientTimeout(
            total=settings.client_total_timeout,
            connect=settings.client_connect_timeout,
            sock_connect=settings.client_connect_timeout,
            sock_read=settings.client_connect_timeout
        )
        connector = aiohttp.TCPConnector(
            limit=settings.s3_connection_limit,
            ttl_dns_cache=300,
            use_dns_cache=True,
            limit_per_host=settings.s3_connection_limit
        )

        async with aiohttp.ClientSession(
            connector=connector,
            timeout=timeout,
            raise_for_status=True
        ) as session:
            async with self.session.client('s3') as s3_client:
                with open(jsonl_file, 'r', encoding='utf-8') as f:
                    batch_items = []
                    lines_seen = 0
                    for i, line in enumerate(f):
                        lines_seen = i + 1

                        # Skip already processed properties
                        if i < start_index:
                            continue

                        # Blank lines (e.g. a trailing newline) carry no record
                        if not line.strip():
                            continue

                        try:
                            property_data = json.loads(line)
                        except json.JSONDecodeError:
                            logger.error(f"Error parsing JSON at line {i + 1}")
                            continue

                        batch_items.append((i, property_data))

                        # Process when batch is full
                        if len(batch_items) >= batch:
                            await self._process_batch(
                                session, s3_client, batch_items, lines_seen, logger
                            )
                            batch_items = []

                    # Process any remaining properties in the final batch
                    if batch_items:
                        await self._process_batch(
                            session, s3_client, batch_items, lines_seen, logger,
                            final=True
                        )


async def upload_images_to_s3(
    jsonl_file: str,
    bucket_name: Optional[str] = None,
    max_concurrency: Optional[int] = None,
    batch_size: Optional[int] = None,
    progress_callback: Optional[Callable[[str, Any], None]] = None,
    logger = None
) -> None:
    """
    Convenience function to upload images from a JSONL file to S3.

    This is a high-level function that handles the complete upload workflow:
    1. Resolves the bucket name (argument first, then S3_BUCKET_NAME)
    2. Creates an S3ImageUploader instance
    3. Processes all properties from the JSONL file
    4. Logs the outcome and re-raises any failure

    Args:
        jsonl_file: Path to the JSONL file containing property data
        bucket_name: S3 bucket name. None or an empty string means
                    "use the S3_BUCKET_NAME environment variable".
        max_concurrency: Maximum number of concurrent uploads.
                        If None, the uploader falls back to settings.max_concurrency.
        batch_size: Number of properties to process per batch.
                   If None, the uploader falls back to settings.s3_batch_size.
        progress_callback: Optional callback for progress updates
        logger: Optional logger instance (creates default if not provided)

    Raises:
        ValueError: If no bucket name is given and S3_BUCKET_NAME is not set,
                   or if the uploader rejects its configuration
        FileNotFoundError: If the JSONL file doesn't exist

    Example:
        >>> import asyncio
        >>> from src.upload.s3 import upload_images_to_s3
        >>>
        >>> def progress_handler(event_type, data):
        ...     print(f"{event_type}: {data}")
        >>>
        >>> asyncio.run(upload_images_to_s3(
        ...     'output/image_urls.jsonl',
        ...     progress_callback=progress_handler
        ... ))
    """
    # An empty string is as unusable as None, so both take the env fallback
    # instead of reaching S3 with a blank bucket name.
    if not bucket_name:
        bucket_name = os.getenv('S3_BUCKET_NAME')
        if not bucket_name:
            raise ValueError(
                "S3_BUCKET_NAME must be provided or set as environment variable"
            )

    # Create default logger if not provided
    if logger is None:
        import logging
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
        )
        logger = logging.getLogger(__name__)

    # Create uploader and process all properties
    uploader = S3ImageUploader(
        bucket_name=bucket_name,
        max_concurrency=max_concurrency,
        progress_callback=progress_callback
    )

    try:
        await uploader.process_all_properties(
            jsonl_file=jsonl_file,
            batch_size=batch_size,
            logger=logger
        )
    except KeyboardInterrupt:
        logger.info("Process interrupted by user")
        raise
    except Exception as e:
        # Logged here for the operator, then re-raised for the caller
        logger.error(f"Upload process failed: {str(e)}")
        raise
    else:
        logger.info("Upload process completed successfully")


# Example usage for CLI integration
if __name__ == "__main__":
    import sys

    def simple_progress_callback(event_type: str, data: Any) -> None:
        """Print a one-line console summary for the events worth showing."""
        # Build the line first; events without a branch print nothing.
        if event_type == 'upload_success':
            line = f"✓ Uploaded: {data['s3_key']}"
        elif event_type == 'upload_error':
            line = f"✗ Failed: {data['s3_key']} - {data.get('error', 'Unknown error')}"
        elif event_type == 'property_complete':
            line = (
                f"Property {data['property_id']}: "
                f"{data['successful_uploads']}/{data['total_images']} uploaded"
            )
        elif event_type == 'batch_complete':
            line = f"Batch completed at index {data['batch_end_index']}"
        else:
            return
        print(line)

    # First command-line argument is the JSONL path; otherwise use the default
    try:
        jsonl_file = sys.argv[1]
    except IndexError:
        jsonl_file = 'output/image_urls.jsonl'

    # Drive the whole upload from a fresh event loop
    asyncio.run(upload_images_to_s3(
        jsonl_file=jsonl_file,
        progress_callback=simple_progress_callback
    ))
