import asyncio
import re
from typing import Any, Dict, List, Optional, Sequence, Tuple

import boto3
from botocore.client import Config
from botocore.exceptions import ClientError, EndpointConnectionError, NoCredentialsError

from mineru_flow.internal.common.logging import get_logger
from mineru_flow.internal.storage.base import (
    StorageAuthenticationError,
    StorageError,
    StorageFile,
    StorageNotFoundError,
    StorageOperator,
    StoragePermissionError,
)


class S3StorageOperator(StorageOperator):
    """Storage operator that relies on boto3 for S3-compatible services."""

    def __init__(
        self,
        *,
        endpoint: str,
        aws_access_key_id: str,
        aws_secret_access_key: str,
        region_name: Optional[str] = "us-east-1",
        **kwargs: Any,
    ):
        """Create an operator bound to one S3-compatible endpoint.

        Args:
            endpoint: Endpoint URL handed to boto3 as ``endpoint_url``; it is
                also bound to this operator's logger.
            aws_access_key_id: Access key id passed to the boto3 client.
            aws_secret_access_key: Secret access key passed to the boto3 client.
            region_name: Region passed to the boto3 client.
            **kwargs: Forwarded unchanged to the parent ``__init__``.
        """
        super().__init__(**kwargs)
        self.logger = get_logger("storage.s3").bind(endpoint=endpoint)

        # Path-style addressing, SigV4 signing and botocore's "standard"
        # retry mode configured with max_attempts=8.
        client_config = Config(
            s3={"addressing_style": "path"},
            signature_version="s3v4",
            retries={"max_attempts": 8, "mode": "standard"},
        )
        credentials = {
            "aws_access_key_id": aws_access_key_id,
            "aws_secret_access_key": aws_secret_access_key,
        }
        self._client = boto3.client(
            "s3",
            endpoint_url=endpoint,
            region_name=region_name,
            config=client_config,
            **credentials,
        )

    async def _read(self, path: str, *, max_bytes: Optional[int]) -> bytes:
        bucket, key = self._split_path(path)

        def _download() -> bytes:
            try:
                params: Dict[str, Any] = {"Bucket": bucket, "Key": key}
                if max_bytes is not None:
                    params["Range"] = f"bytes=0-{max_bytes - 1}"
                response = self._client.get_object(**params)
                try:
                    return response["Body"].read()
                finally:
                    response["Body"].close()
            except ClientError as exc:  # pragma: no cover - boto3 result
                raise exc

        try:
            data = await asyncio.to_thread(_download)
            self.logger.debug(
                "读取对象成功",
                bucket=bucket,
                key=key,
                max_bytes=max_bytes,
                bytes=len(data),
            )
            return data
        except ClientError as exc:
            storage_error = self._convert_client_error(exc, bucket, key)
            self.logger.error(
                "读取对象失败",
                bucket=bucket,
                key=key,
                error=str(storage_error),
            )
            raise storage_error
        except NoCredentialsError as exc:
            raise StorageAuthenticationError(
                "S3 credentials are not configured"
            ) from exc
        except EndpointConnectionError as exc:
            raise StorageError(f"Failed to reach S3 endpoint: {exc}") from exc

    async def _stream(self, path: str, *, chunk_size: int):
        bucket, key = self._split_path(path)

        def _open() -> Any:
            return self._client.get_object(Bucket=bucket, Key=key)["Body"]

        try:
            body = await asyncio.to_thread(_open)
            self.logger.debug("开始流式读取对象", bucket=bucket, key=key, chunk_size=chunk_size)
        except ClientError as exc:
            storage_error = self._convert_client_error(exc, bucket, key)
            self.logger.error(
                "流式读取对象失败", bucket=bucket, key=key, error=str(storage_error)
            )
            raise storage_error from exc
        except NoCredentialsError as exc:
            raise StorageAuthenticationError(
                "S3 credentials are not configured"
            ) from exc
        except EndpointConnectionError as exc:
            raise StorageError(f"Failed to reach S3 endpoint: {exc}") from exc

        try:
            while True:
                chunk = await asyncio.to_thread(body.read, chunk_size)
                if not chunk:
                    break
                yield chunk
        except ClientError as exc:
            storage_error = self._convert_client_error(exc, bucket, key)
            self.logger.error(
                "流式读取对象时失败", bucket=bucket, key=key, error=str(storage_error)
            )
            raise storage_error from exc
        finally:
            await asyncio.to_thread(body.close)

    async def _list(self, location: str, *, recursive: bool) -> Sequence[StorageFile]:
        """List the objects found at ``location``.

        When the key part of ``location`` does not end with ``/`` it is first
        probed as a single object; if that object does not exist, ``/`` is
        appended and the key is listed as a prefix instead.

        Args:
            location: ``s3://bucket/key-or-prefix`` (or ``s3a://``) to list.
            recursive: When false, ``Delimiter="/"`` is sent with the listing
                request. Only the ``Contents`` entries of each page are
                returned; ``CommonPrefixes`` are not read.

        Returns:
            One ``StorageFile`` per listed object. Keys ending in ``/`` are
            skipped.

        Raises:
            StorageNotFoundError: When nothing matches ``location``.
            StorageError: For a botocore ``ClientError`` (concrete type chosen
                by ``_convert_client_error``) or an unreachable endpoint.
            StorageAuthenticationError: When botocore finds no credentials.
        """
        bucket, key = self._split_path(location)

        if key and not key.endswith("/"):
            # The location may name exactly one object: try HEAD first and
            # fall back to a prefix listing when there is no such object.
            try:
                return [await self._head_object(bucket, key)]
            except StorageNotFoundError:
                key = f"{key}/"

        prefix = key
        params: Dict[str, Any] = {"Bucket": bucket, "Prefix": prefix}
        if not recursive:
            params["Delimiter"] = "/"

        def _collect() -> List[StorageFile]:
            paginator = self._client.get_paginator("list_objects_v2")
            files: List[StorageFile] = []
            for page in paginator.paginate(**params):
                for obj in page.get("Contents", []):
                    obj_key = obj["Key"]
                    if obj_key.endswith("/"):
                        # presumably zero-byte "folder" placeholder objects —
                        # they are not reported as files
                        continue
                    files.append(
                        StorageFile(
                            path=f"s3://{bucket}/{obj_key}",
                            name=obj_key.rsplit("/", 1)[-1],
                            size=obj.get("Size"),
                            metadata={"etag": obj.get("ETag")},
                        )
                    )
            return files

        try:
            files = await asyncio.to_thread(_collect)
        except ClientError as exc:
            storage_error = self._convert_client_error(exc, bucket, prefix)
            self.logger.error(
                "列出对象失败",
                bucket=bucket,
                prefix=prefix,
                error=str(storage_error),
            )
            raise storage_error from exc
        # Translate credential and connectivity failures the same way
        # _read and _stream do, instead of leaking raw botocore exceptions.
        except NoCredentialsError as exc:
            raise StorageAuthenticationError(
                "S3 credentials are not configured"
            ) from exc
        except EndpointConnectionError as exc:
            raise StorageError(f"Failed to reach S3 endpoint: {exc}") from exc
        else:
            self.logger.debug(
                "列出对象成功",
                bucket=bucket,
                prefix=prefix,
                recursive=recursive,
                count=len(files),
            )

        if not files:
            self.logger.warning(
                "未找到匹配对象", bucket=bucket, prefix=prefix, recursive=recursive
            )
            raise StorageNotFoundError(f"No files found for prefix '{location}'")
        return files

    async def _get_mime(self, path: str) -> Optional[str]:
        bucket, key = self._split_path(path)
        file_info = await self._head_object(bucket, key)
        return file_info.mime_type

    async def _head_object(self, bucket: str, key: str) -> StorageFile:
        """Fetch an object's metadata with ``head_object``.

        Args:
            bucket: Bucket name.
            key: Object key inside ``bucket``.

        Returns:
            A ``StorageFile`` whose size and MIME type come from the
            ``ContentLength`` and ``ContentType`` response fields, with the
            ``ETag`` and ``Metadata`` fields stored under ``metadata``.

        Raises:
            StorageError: For a botocore ``ClientError`` (concrete type chosen
                by ``_convert_client_error``) or an unreachable endpoint.
            StorageAuthenticationError: When botocore finds no credentials.
        """

        def _head() -> Dict[str, Any]:
            return self._client.head_object(Bucket=bucket, Key=key)

        try:
            response = await asyncio.to_thread(_head)
        except ClientError as exc:
            storage_error = self._convert_client_error(exc, bucket, key)
            self.logger.error(
                "获取对象元数据失败",
                bucket=bucket,
                key=key,
                error=str(storage_error),
            )
            raise storage_error from exc
        except NoCredentialsError as exc:
            raise StorageAuthenticationError(
                "S3 credentials are not configured"
            ) from exc
        # Same translation _read and _stream apply to connectivity failures.
        except EndpointConnectionError as exc:
            raise StorageError(f"Failed to reach S3 endpoint: {exc}") from exc
        else:
            self.logger.debug("获取对象元数据", bucket=bucket, key=key)

        return StorageFile(
            path=f"s3://{bucket}/{key}",
            name=key.rsplit("/", 1)[-1],
            size=response.get("ContentLength"),
            mime_type=response.get("ContentType"),
            metadata={
                "etag": response.get("ETag"),
                "metadata": response.get("Metadata", {}),
            },
        )

    async def presigned(self, path: str):
        bucket, key = self._split_path(path)
        params = {"Bucket": bucket, "Key": key}

        def _presigned():
            return self._client.generate_presigned_url("get_object", Params=params)

        try:
            url = await asyncio.to_thread(_presigned)
            self.logger.debug("生成预签名 URL", bucket=bucket, key=key)
            return url
        except ClientError as exc:
            storage_error = self._convert_client_error(exc, bucket, key)
            self.logger.error(
                "生成预签名 URL 失败",
                bucket=bucket,
                key=key,
                error=str(storage_error),
            )
            raise storage_error from exc

    def _split_path(self, path: str) -> Tuple[str, str]:
        _re_s3_path = re.compile("^s3a?://([^/]+)(?:/(.*))?$")
        m = _re_s3_path.match(path)
        if m is None:
            return "", ""
        return m.group(1), (m.group(2) or "")

    def _convert_client_error(
        self, exc: ClientError, bucket: str, key: str
    ) -> StorageError:
        """Translate a botocore ``ClientError`` into a storage exception.

        The exception is returned, not raised, so the caller can log it and
        chain it with ``raise ... from exc``.

        Args:
            exc: The error raised by the boto3 client.
            bucket: Bucket the failed request targeted (used in the message).
            key: Key or prefix the failed request targeted.

        Returns:
            ``StorageNotFoundError`` for a missing object or bucket,
            ``StoragePermissionError`` for access-denied responses,
            ``StorageAuthenticationError`` for rejected credentials, and a
            plain ``StorageError`` carrying the service message otherwise.
        """
        error = exc.response.get("Error", {})
        code = error.get("Code")
        location = f"s3://{bucket}/{key}"

        if code == "NoSuchBucket":
            return StorageNotFoundError(f"S3 bucket not found: s3://{bucket}")
        # The numeric codes are handled alongside the named ones; presumably
        # they come from body-less responses such as HEAD — verify.
        if code in {"404", "NoSuchKey", "NotFound"}:
            return StorageNotFoundError(f"S3 object not found: {location}")
        if code in {"403", "AccessDenied"}:
            return StoragePermissionError(f"Permission denied for {location}")
        if code in {
            "InvalidAccessKeyId",
            "SignatureDoesNotMatch",
            "ExpiredToken",
            "InvalidToken",
        }:
            # NOTE(review): assumes StorageAuthenticationError derives from
            # StorageError like the other storage exceptions — confirm in
            # storage.base.
            return StorageAuthenticationError(
                f"S3 authentication failed for {location}: {code}"
            )

        self.logger.debug(
            "S3 客户端异常",
            bucket=bucket,
            key=key,
            code=code,
            message=error.get("Message"),
        )
        return StorageError(error.get("Message") or str(exc))
