from __future__ import annotations

import json
import logging
from dataclasses import dataclass
from pathlib import Path
from typing import Iterable, Optional

from .chunking import build_chunks
from .file_dispatcher import FileDispatcher
from .heuristics import apply_rule_based_filter
from .llm_client import LLMClient, LLMResponse
from .pdf_parser import PDFParser


@dataclass
class PipelineStats:
    """Counters describing how far documents progressed through the pipeline.

    Every counter starts at zero and is incremented by ``Pipeline.run`` as a
    document clears the corresponding stage.
    """

    processed: int = 0  # documents taken from the input iterable
    parsed: int = 0  # documents for which the parser returned a truthy result
    candidates: int = 0  # documents flagged as candidates by the rule-based filter
    classified: int = 0  # candidates for which the LLM client returned a truthy response
    moved: int = 0  # documents for which the dispatcher returned a truthy result


class Pipeline:
    """Drive documents through parse -> rule-based filter -> LLM -> dispatch.

    For every input path the pipeline parses the document, applies the
    rule-based filter, asks the LLM client to classify documents the filter
    marks as candidates, and finally hands everything to the dispatcher.
    Progress is recorded in ``self.stats``.
    """

    def __init__(
        self,
        parser: PDFParser,
        llm_client: LLMClient,
        dispatcher: FileDispatcher,
        output_root: Path,
        activity_enabled: bool,
        min_total_pages: int = 20,
        min_keywords_hit: int = 3,
    ) -> None:
        """Store the collaborators and thresholds used by :meth:`run`.

        Args:
            parser: Object whose ``parse(path)`` yields a parsed document, or
                a falsy value when the document should be skipped.
            llm_client: Object exposing ``classify(chunks)`` and ``configs``.
            dispatcher: Object whose ``dispatch(...)`` receives the outcome
                for each parsed document.
            output_root: Stored on the instance; not read by the methods of
                this class.
            activity_enabled: Default activity flag passed to the dispatcher
                when no matching LLM configuration supplies one.
            min_total_pages: Forwarded to the rule-based filter.
            min_keywords_hit: Forwarded to the rule-based filter.
        """
        self.parser = parser
        self.llm_client = llm_client
        self.dispatcher = dispatcher
        self.output_root = output_root
        self.activity_enabled = activity_enabled
        self.min_total_pages = min_total_pages
        self.min_keywords_hit = min_keywords_hit
        # Created once here and never reset, so counts accumulate across
        # repeated calls to run() on the same instance.
        self.stats = PipelineStats()

    def run(self, documents: Iterable[Path]) -> PipelineStats:
        """Process every document and return the (cumulative) statistics.

        Args:
            documents: Paths of the documents to process.

        Returns:
            The pipeline's ``PipelineStats`` instance (the same object as
            ``self.stats``), updated in place.
        """
        for doc_path in documents:
            self.stats.processed += 1
            logging.info("Processing %s", doc_path)

            parsed = self.parser.parse(doc_path)
            if not parsed:
                # Nothing usable came out of the parser: skip this document.
                continue
            self.stats.parsed += 1

            heuristics = apply_rule_based_filter(
                parsed,
                self.min_total_pages,
                self.min_keywords_hit,
            )
            logging.debug("Heuristics for %s: %s", doc_path, heuristics)

            # Chunks are built for every parsed document because the
            # dispatcher receives them even when the LLM is not consulted.
            chunks = build_chunks(parsed)
            llm_response, activity = self._classify(doc_path, heuristics, chunks)

            if self.dispatcher.dispatch(parsed, heuristics, llm_response, activity, chunks):
                self.stats.moved += 1
        return self.stats

    def _classify(self, doc_path, heuristics, chunks) -> tuple[Optional[LLMResponse], bool]:
        """Return ``(response, activity)`` for one parsed document.

        Candidates are sent to the LLM client; when it returns a falsy
        result that result is forwarded unchanged together with the default
        activity flag. Non-candidates get a synthetic rule-based response.
        The activity value is presumably a bool (it is whatever the matching
        config's ``activity`` attribute holds) -- TODO confirm.
        """
        if not heuristics.candidate:
            logging.info(
                "Rule-based filter rejected %s (markers=%s, pages=%s)",
                doc_path,
                heuristics.markers_found,
                heuristics.page_count,
            )
            return self._rule_based_rejection(), self.activity_enabled

        self.stats.candidates += 1
        llm_response = self.llm_client.classify(chunks)
        if not llm_response:
            return llm_response, self.activity_enabled

        self.stats.classified += 1
        logging.info(
            "LLM (%s) classified %s: %s",
            llm_response.config_name,
            doc_path,
            llm_response.raw_content,
        )
        return llm_response, self._activity_for(llm_response.config_name)

    def _activity_for(self, config_name):
        """Return the activity flag of the first LLM config named *config_name*.

        Falls back to ``self.activity_enabled`` when no config matches.
        """
        for cfg in self.llm_client.configs:
            if cfg.name == config_name:
                return cfg.activity
        return self.activity_enabled

    @staticmethod
    def _rule_based_rejection() -> LLMResponse:
        """Build the synthetic response used for filter-rejected documents.

        The response is a negative verdict with certainty 1.0, attributed to
        the "rule-based" pseudo-configuration instead of a real LLM call.
        """
        reason = "Rule-based filter rejected the document"
        # Single source of truth for the verdict: the same values populate
        # the response fields and the serialized raw content.
        verdict = {
            "is_sustainability_report": False,
            "certainty": 1.0,
            "company": None,
            "year": None,
        }
        return LLMResponse(
            **verdict,
            raw_content=json.dumps({**verdict, "reason": reason}),
            config_name="rule-based",
            model="rule-based",
            api_base="heuristic",
            messages=[],
            raw_api_response={
                "source": "rule-based",
                "detail": reason,
            },
        )
