from __future__ import annotations

import logging
from dataclasses import dataclass
from pathlib import Path
from typing import List, Optional

try:  # optional dependency, required at runtime for PDF parsing
    from pypdf import PdfReader  # type: ignore
except ImportError as exc:  # pragma: no cover - defensive guard
    PdfReader = None  # type: ignore
    _PDF_IMPORT_ERROR = exc
else:
    _PDF_IMPORT_ERROR = None


@dataclass
class ParsedDocument:
    """Text extracted from the leading and trailing pages of one PDF file."""

    # Location of the PDF this record was built from.
    path: Path
    # Number of pages reported for the whole document.
    total_pages: int
    # Page texts gathered from the front of the document, in reading order.
    start_pages: List[str]
    # Page texts gathered from the back of the document; as filled in by
    # PDFParser.parse the last page comes first.
    end_pages: List[str]

    def get_start_pages(self) -> List[str]:
        """Return the stored leading-page texts (the same list object, not a copy)."""
        return self.start_pages

    def get_start_and_end_pages(self) -> List[str]:
        """Return up to three leading page texts followed by up to three trailing ones.

        A trailing text is left out when an identical string is already in the
        result, so a page captured from both ends of a short document appears
        only once. Duplicates among the leading texts themselves are kept.
        """
        combined: List[str] = list(self.start_pages[:3])
        for candidate in self.end_pages[:3]:
            if candidate in combined:
                continue
            combined.append(candidate)
        return combined


class PDFParser:
    """Pull the text of a PDF's leading and trailing pages using pypdf.

    Pages whose extracted text is empty after stripping whitespace are skipped
    and do not count towards the requested number of pages.
    """

    def __init__(self, start_pages: int = 6, end_pages: int = 3) -> None:
        """Configure how many non-empty pages to collect from each end.

        Args:
            start_pages: Maximum number of non-empty pages to keep, scanning
                forward from the first page.
            end_pages: Maximum number of non-empty pages to keep, scanning
                backward from the last page.

        Raises:
            RuntimeError: If pypdf could not be imported.
        """
        if PdfReader is None:  # pragma: no cover - runtime guard
            raise RuntimeError(
                "pypdf is required to parse PDFs. Install it with 'pip install pypdf'"
            ) from _PDF_IMPORT_ERROR
        self.start_pages = start_pages
        self.end_pages = end_pages

    def parse(self, pdf_path: Path) -> Optional[ParsedDocument]:
        """Read ``pdf_path`` and collect text from its first and last pages.

        Args:
            pdf_path: Location of the PDF file to read.

        Returns:
            A ``ParsedDocument`` holding up to ``self.start_pages`` non-empty
            page texts from the front (in reading order) and up to
            ``self.end_pages`` non-empty page texts from the back (last page
            first). Returns ``None`` when the file cannot be opened or its
            page count cannot be read; that failure is logged at error level.
        """
        try:
            reader = PdfReader(str(pdf_path))
            total_pages = len(reader.pages)
        except Exception as exc:  # pragma: no cover - parser guard
            logging.error("Failed to open PDF %s: %s", pdf_path, exc)
            return None

        def extract_page_text(idx: int) -> str:
            """Return the stripped text of page ``idx``, or "" if extraction fails."""
            try:
                # A falsy extraction result is normalised to an empty string.
                text = reader.pages[idx].extract_text() or ""
            except Exception as extract_exc:  # pragma: no cover - extraction guard
                # Best effort: a page that cannot be read is treated as blank.
                logging.debug(
                    "Failed to extract text from %s page %s: %s",
                    pdf_path,
                    idx,
                    extract_exc,
                )
                return ""
            return text.strip()

        def collect_non_empty(indices: range, wanted: int) -> List[str]:
            """Gather up to ``wanted`` non-empty page texts, visiting ``indices`` in order."""
            collected: List[str] = []
            if wanted <= 0:
                return collected
            # ``indices`` only ever contains valid page numbers, so the scan
            # never indexes past either end of the document.
            for idx in indices:
                page_text = extract_page_text(idx)
                if not page_text:
                    # Blank pages are skipped without using up the quota.
                    continue
                collected.append(page_text)
                if len(collected) >= wanted:
                    break
            return collected

        # Forward scan from the first page.
        start_pages = collect_non_empty(range(total_pages), self.start_pages)
        # Backward scan starting at the last valid index (total_pages - 1).
        end_pages = collect_non_empty(
            range(total_pages - 1, -1, -1), self.end_pages
        )

        return ParsedDocument(
            path=pdf_path,
            total_pages=total_pages,
            start_pages=start_pages,
            end_pages=end_pages,
        )
