from __future__ import annotations

import re
from typing import List

from .pdf_parser import ParsedDocument

# Default length, in characters, of each chunk produced by chunk_text.
CHUNK_SIZE = 1800
# Default number of characters shared between one chunk and the next.
CHUNK_OVERLAP = 200
# Default upper bound on how many chunks a single chunk_text call returns.
MAX_CHUNKS = 3


def chunk_text(
    text: str,
    chunk_size: int = CHUNK_SIZE,
    overlap: int = CHUNK_OVERLAP,
    max_chunks: int = MAX_CHUNKS,
) -> List[str]:
    """Split *text* into overlapping, whitespace-normalised character chunks.

    Every run of whitespace in *text* is collapsed to a single space and the
    result is stripped before slicing. Chunks are taken left to right; each
    chunk starts ``chunk_size - overlap`` characters after the previous one,
    so consecutive chunks share ``overlap`` characters. Slicing stops as soon
    as a chunk reaches the end of the text or ``max_chunks`` chunks exist.

    Args:
        text: Raw text to split. Empty or whitespace-only text yields ``[]``.
        chunk_size: Maximum length of each chunk; must be positive.
        overlap: Characters repeated between consecutive chunks; must satisfy
            ``0 <= overlap < chunk_size`` so every chunk advances.
        max_chunks: Maximum number of chunks to return. Values ``<= 0``
            yield ``[]``.

    Returns:
        The chunks in document order (at most ``max_chunks`` of them).

    Raises:
        ValueError: If ``chunk_size`` is not positive, or ``overlap`` is
            negative or not smaller than ``chunk_size``.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if not 0 <= overlap < chunk_size:
        # overlap >= chunk_size would make the window stall on the same
        # slice (duplicate chunks); a negative overlap would skip text.
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, got {overlap}"
        )
    if not text:
        return []

    cleaned = re.sub(r"\s+", " ", text).strip()
    total = len(cleaned)
    stride = chunk_size - overlap  # strictly positive thanks to the checks above

    chunks: List[str] = []
    for start in range(0, total, stride):
        if len(chunks) >= max_chunks:
            break
        end = min(start + chunk_size, total)
        chunks.append(cleaned[start:end])
        if end >= total:
            # This chunk already covers the tail; a further window would
            # only repeat characters that have been emitted.
            break
    return chunks


def build_chunks(parsed: ParsedDocument) -> List[str]:
    """Return text chunks built from a parsed document's start and end pages.

    The page texts returned by ``parsed.get_start_and_end_pages()`` are joined
    with a blank line between them and the combined text is handed to
    ``chunk_text`` using its default chunking parameters.
    """
    page_texts = parsed.get_start_and_end_pages()
    return chunk_text("\n\n".join(page_texts))
