import asyncio
from typing import Any, Dict, List, Tuple

from mineru_flow.internal.common.exceptions import PhaseExecutionError
from mineru_flow.internal.processor.base import BasePhaseProcessor, TaskContext
from mineru_flow.internal.schema.state import Phase
from mineru_flow.internal.storage.base import StorageFile, StorageOperator

from .mineru_client import MinerUSaasClient, MinerUSelfhostClient, MineruClientError

# Accepted values for the "type" field of the MinerU config; checked by
# MinerUParserProcessor.validate_input.
valid_mineru_type = ["saas", "selfhost"]
# File extensions listed per MinerU deployment type.
# NOTE(review): nothing in the visible code reads this mapping, and its two
# lists look swapped relative to the suffix lists MinerUParserProcessor passes
# to list_files for each mode — confirm which one is authoritative.
supported_file_types = {
    "selfhost": [".pdf", ".doc", ".docx", ".ppt", ".pptx", ".png", ".jpg", ".jpeg"],
    "saas": [".pdf", ".png", ".jpg", ".jpeg"],
}


def create_storage_operator(config: Dict[str, Any]) -> StorageOperator:
    """Build the storage operator selected by ``config["type"]``.

    Each backend module is imported inside its own branch, so only the
    backend that is actually requested gets imported.

    Args:
        config: Storage settings. ``"type"`` selects the backend
            (``"local"``, ``"s3"`` or ``"url"``). ``"local"`` additionally
            reads ``"storagePath"``; ``"s3"`` reads ``"endpoint"``,
            ``"accessKeyId"``, ``"secretKeyId"`` and ``"region"``. Missing
            keys are passed on as ``None``.

    Returns:
        The operator instance for the requested backend.

    Raises:
        ValueError: If ``"type"`` is missing or not one of the values above.
    """
    kind = config.get("type")

    if kind == "local":
        from mineru_flow.internal.storage.local import LocalFileStorageOperator

        return LocalFileStorageOperator(base_path=config.get("storagePath"))

    if kind == "s3":
        from mineru_flow.internal.storage.s3 import S3StorageOperator

        s3_options = {
            "endpoint": config.get("endpoint"),
            "aws_access_key_id": config.get("accessKeyId"),
            "aws_secret_access_key": config.get("secretKeyId"),
            "region_name": config.get("region"),
        }
        return S3StorageOperator(**s3_options)

    if kind == "url":
        from mineru_flow.internal.storage.http import HTTPStorageOperator

        return HTTPStorageOperator()

    # Anything else (including a missing "type") is rejected.
    raise ValueError(f"Unsupported storage type: {kind}")


class MinerUParserProcessor(BasePhaseProcessor):
    """MinerU document parsing processor (supports SaaS and self-hosted modes).

    Handles ``Phase.PARSE``: lists the source files through a storage
    operator, sends them to the MinerU client matching the configured mode and
    saves one ``result.json`` artifact per parsed document.
    """

    # Extensions requested from storage when listing files for each mode.
    _SAAS_SUFFIXES: Tuple[str, ...] = (
        ".pdf",
        ".png",
        ".jpg",
        ".jpeg",
        ".doc",
        ".docx",
        ".ppt",
        ".pptx",
    )
    _SELFHOST_SUFFIXES: Tuple[str, ...] = (".pdf", ".png", ".jpg", ".jpeg")

    def __init__(self):
        """Initialise per-run state and register the processor for ``Phase.PARSE``."""
        # Both attributes are assigned by process() before any parsing starts.
        self._storage: StorageOperator = None
        # NOTE(review): annotated ``str`` but process() stores the object
        # produced by ``context.artifacts_dir / context.current_phase``.
        self._media_dir: str = None

        super().__init__(Phase.PARSE)

    async def validate_input(self, context: TaskContext) -> bool:
        """Check that the job config has usable MinerU and source sections.

        The first problem found is logged and ``False`` is returned.

        Args:
            context: Task context; ``context.config`` and ``context.job_id``
                are read.

        Returns:
            ``True`` when ``config["mineru"]`` exists with a valid ``type``
            and the fields that type requires (``baseUrl`` and ``apiKey`` for
            ``saas``, ``baseUrl`` for ``selfhost``), and ``config["source"]``
            exists; ``False`` otherwise.
        """
        config = context.config

        # The MinerU section is mandatory.
        if "mineru" not in config:
            self.logger.error("缺少 MinerU 配置", job_id=context.job_id)
            return False

        mineru_config = config["mineru"]
        mineru_type = mineru_config.get("type")

        if mineru_type not in valid_mineru_type:
            self.logger.error(
                "MinerU 类型配置错误", job_id=context.job_id, type=mineru_type
            )
            return False

        if mineru_type == "saas":
            for field in ("baseUrl", "apiKey"):
                if not mineru_config.get(field):
                    self.logger.error(
                        f"缺少必要的 SaaS 配置: {field}", job_id=context.job_id
                    )
                    return False
        elif mineru_type == "selfhost":
            if not mineru_config.get("baseUrl"):
                self.logger.error("缺少自托管服务器 baseUrl", job_id=context.job_id)
                return False

        # The source section tells us where the input files live.
        if "source" not in config:
            self.logger.error("缺少源配置", job_id=context.job_id)
            return False

        return True

    async def process(self, context: TaskContext) -> Dict[str, Any]:
        """Parse every source document and save one result artifact per file.

        Args:
            context: Task context providing ``config``, ``job_id``,
                ``current_phase`` and ``artifacts_dir``.

        Returns:
            A dict with ``"input"`` (filename, path, size and mime type of
            every listed file), ``"timestamp"`` (the event loop clock, see the
            note at the return statement) and ``"output"`` (paths of the saved
            ``result.json`` files).

        Raises:
            PhaseExecutionError: If validation fails, no files are found, the
                MinerU client reports an error, or any other exception occurs
                while parsing or saving.
            ValueError: If the source storage type is unsupported; the
                operator is created before the ``try`` block, so the error
                from ``create_storage_operator`` is not wrapped.
        """
        self.logger.info(
            "开始文档解析", job_id=context.job_id, phase=context.current_phase
        )

        if not await self.validate_input(context):
            raise PhaseExecutionError(
                "Input validation failed", context.current_phase, context.job_id
            )

        self._storage = create_storage_operator(context.config["source"])

        # Media extracted by the client is written below the phase directory.
        job_media_dir = context.artifacts_dir / context.current_phase
        job_media_dir.mkdir(parents=True, exist_ok=True)
        self._media_dir = job_media_dir

        try:
            mineru_config = context.config["mineru"]

            # validate_input only lets "saas" and "selfhost" through.
            if mineru_config["type"] == "saas":
                files, results = await self._parse_with_saas(context, mineru_config)
            else:
                files, results = await self._parse_with_selfhost(context, mineru_config)

            paths = await self._save_results(context, results)

            self.logger.info(
                "文档解析完成", job_id=context.job_id, files_count=len(paths)
            )

            return {
                "input": [
                    {
                        "filename": f.name,
                        "path": f.path,
                        "size": f.size,
                        "mime_type": f.mime_type,
                    }
                    for f in files
                ],
                # NOTE(review): this is the event loop's monotonic clock, not
                # wall-clock time — confirm that consumers expect that.
                "timestamp": asyncio.get_running_loop().time(),
                "output": paths,
            }

        except PhaseExecutionError:
            # Already in the shape callers expect; do not re-wrap.
            raise
        except MineruClientError as e:
            self.logger.error("Document parsing failed", job_id=context.job_id, error=str(e))
            raise PhaseExecutionError(
                f"Document parsing failed: {e}", context.current_phase, context.job_id
            ) from e
        except Exception as e:
            self.logger.exception(
                "Unexpected error during document parsing",
                exception=e,
                job_id=context.job_id,
            )
            raise PhaseExecutionError(
                "Document parsing failed due to an unexpected error",
                context.current_phase,
                context.job_id,
            ) from e
        finally:
            await self.cleanup(context)

    async def _save_results(self, context: TaskContext, results: List[Any]) -> List[str]:
        """Save each parse result as ``<media dir>/result.json`` and return the paths."""
        paths = []
        for result in results:
            # The per-document directory is named after the file name without
            # its last extension.
            basename = result.filename.rsplit(".", 1)[0]
            target_dir = self.prepare_media_dir(context, basename)
            file_path = f"{target_dir}/result.json"
            paths.append(file_path)

            await self.save_artifacts(
                context=context,
                result={
                    "filename": result.filename,
                    "content": result.content,
                    "content_list": result.content_list,
                    "images": result.images,
                    "html": result.html,
                    "latex": result.latex,
                },
                target_path=file_path,
            )
        return paths

    async def _list_source_files(
        self, context: TaskContext, suffixes: Tuple[str, ...]
    ) -> List[StorageFile]:
        """List the source files matching ``suffixes``, failing if there are none.

        The location comes from the ``source`` section of the job config, the
        same section the storage operator is built from. Mime types are
        requested because process() reports them for every input file.

        Raises:
            PhaseExecutionError: If no matching file is found.
        """
        source_config = context.config["source"]
        files = await self._storage.list_files(
            location=source_config.get("storagePath"),
            with_mime=True,
            suffixes=list(suffixes),
        )

        if not files:
            raise PhaseExecutionError(
                "No parseable files found", context.current_phase, context.job_id
            )

        self.logger.info(f"找到 {len(files)} 个文件待解析", job_id=context.job_id)
        return files

    async def _parse_with_saas(
        self, context: TaskContext, config: Dict[str, Any]
    ) -> Tuple[List[StorageFile], List[Any]]:
        """Parse the source files through the MinerU SaaS client.

        Args:
            context: Task context of the running job.
            config: The ``mineru`` section of the job config; ``baseUrl`` and
                ``apiKey`` are required, the remaining options have defaults.

        Returns:
            The listed source files and the client's parse results.

        Raises:
            PhaseExecutionError: If no parseable file is found.
        """
        self.logger.info("使用 SaaS 模式解析文档", job_id=context.job_id)
        files = await self._list_source_files(context, self._SAAS_SUFFIXES)

        async with MinerUSaasClient(
            config["baseUrl"],
            config["apiKey"],
            self._storage,
            media_output_dir=self._media_dir,
        ) as client:
            results = await client.parse_documents(
                files=files,
                enable_formula=config.get("enableFormula", True),
                enable_table=config.get("enableTable", True),
                language=config.get("language", "ch"),
                model_version=config.get("modelVersion", "pipeline"),
                extra_formats=config.get("extraFormats"),
                enable_ocr=config.get("enableOcr", False),
            )
            return files, results

    async def _parse_with_selfhost(
        self, context: TaskContext, config: Dict[str, Any]
    ) -> Tuple[List[StorageFile], List[Any]]:
        """Parse the source files through a self-hosted MinerU server.

        Args:
            context: Task context of the running job.
            config: The ``mineru`` section of the job config; ``baseUrl`` is
                required, the remaining options have defaults.

        Returns:
            The listed source files and the client's parse results.

        Raises:
            PhaseExecutionError: If no parseable file is found.
        """
        self.logger.info("使用自托管模式解析文档", job_id=context.job_id)
        # The listing is awaited and reads the location from the source config
        # (not the MinerU config), exactly as the SaaS path does.
        files = await self._list_source_files(context, self._SELFHOST_SUFFIXES)

        async with MinerUSelfhostClient(
            config["baseUrl"], self._storage, media_output_dir=self._media_dir
        ) as client:
            results = await client.parse_documents(
                files=files,
                server_url=config.get("serverUrl"),
                backend_type=config.get("backendType", "pipeline"),
                parse_method=config.get("parseMethod", "auto"),
                return_images=config.get("returnImages", True),
                language=config.get("language", "ch"),
                enable_formula=config.get("enableFormula", True),
                enable_table=config.get("enableTable", True),
                return_content_list=config.get("returnContentList", True),
                return_md=config.get("returnMd", True),
                start_page=config.get("startPage"),
                end_page=config.get("endPage"),
            )
            return files, results
