from __future__ import annotations

from dataclasses import dataclass
from typing import List, Sequence, Tuple

from .pdf_parser import ParsedDocument

# Lower-case substrings (English and Italian) whose presence on a page hints
# at sustainability / non-financial reporting. Several entries are stems
# ("sustain", "sosten", "responsabil") so that inflected forms match too.
SUSTAINABILITY_KEYWORDS = {
    # Stems and generic terms.
    "sustain",
    "sosten",
    "responsibility",
    "responsabil",
    # Acronyms and regulations.
    "esg",
    "csr",
    "csrd",
    "nfrd",
    # "Non-financial" / "extra-financial" spellings, English.
    "non-financial",
    "nonfinancial",
    "non financial",
    "extra-financial",
    "extra financial",
    # "Non-financial" / "extra-financial" spellings, Italian.
    "non-finanziario",
    "non finanziario",
    "extrafinanziario",
    "extra-finanziario",
    # GRI reporting standards.
    "gri standard",
    "gri standards",
    "standard gri",
}

# Lower-case English phrases that typically appear in sustainability reports.
# Every entry ends with a comma: without one, Python silently concatenates
# adjacent string literals into a single (wrong) marker.
ENGLISH_MARKERS = {
    "sustainability",
    "sustainability report",
    "corporate social responsibility",
    "csr report",
    "non-financial disclosure",
    "gri index",
    "un sustainable development goals",
    "letter to stakeholders",
}

# Lower-case Italian phrases that typically appear in sustainability reports.
ITALIAN_MARKERS = {
    "sostenibilità",
    "gestione responsabile",
    "relazione sulla responsabilità sociale",
    "indice gri",
    # NOTE(review): plural "lettere"; the usual heading is the singular
    # "lettera agli stakeholder" - confirm which form is intended.
    "lettere agli stakeholder",
}

# Single lookup set combining the English markers, the Italian markers and
# the sustainability keywords.
KEYWORD_MARKERS = ENGLISH_MARKERS | ITALIAN_MARKERS | SUSTAINABILITY_KEYWORDS

# Frequent words used to score a text as English or Italian. The embedded
# spaces are part of each token: function words are delimited on both sides,
# while the last two entries of each list only require a leading space.
ENGLISH_LANGUAGE_TOKENS = [
    " the ",
    " and ",
    " of ",
    " report",
    " sustainability",
]
ITALIAN_LANGUAGE_TOKENS = [
    " il ",
    " la ",
    " di ",
    " che ",
    " rapporto",
    " sostenibilità",
]


@dataclass
class HeuristicResult:
    """Outcome of the rule-based sustainability-report heuristic for one document."""

    # Detected language label: "english", "italian" or "unknown".
    language: str
    # Distinct keywords/markers found in the sampled pages, sorted alphabetically.
    markers_found: List[str]
    # Total keyword/marker hits, counted once per marker per page.
    sustainability_hits: int
    # Total number of pages in the document.
    page_count: int
    # True when the document passed both the page-count and the hit-count thresholds.
    candidate: bool


def detect_language(pages: Sequence[str]) -> str:
    """Guess whether the given pages are written in English or Italian.

    The pages are merged into one lower-cased text and the occurrences of a
    few frequent words of each language are counted.

    Args:
        pages: Text of the pages to inspect.

    Returns:
        "unknown" when no token of either language occurs, otherwise
        "english" or "italian" depending on which language scored higher.
        Ties are resolved in favour of "english".
    """
    # The tokens are delimited by plain spaces (e.g. " the "). Collapse every
    # whitespace run (newlines, tabs, repeated spaces) into one space and pad
    # both ends, so words at line breaks and at the very start or end of the
    # text are counted as well.
    words = " ".join(pages).lower().split()
    text = f" {' '.join(words)} "

    eng_score = sum(text.count(token) for token in ENGLISH_LANGUAGE_TOKENS)
    ita_score = sum(text.count(token) for token in ITALIAN_LANGUAGE_TOKENS)
    if eng_score == 0 and ita_score == 0:
        return "unknown"
    return "english" if eng_score >= ita_score else "italian"


def find_markers(pages: Sequence[str]) -> Tuple[List[str], int]:
    """Collect the sustainability keywords/markers that occur in the pages.

    Matching is a case-insensitive substring search of every entry of
    ``KEYWORD_MARKERS`` against each page.

    Args:
        pages: Text of the pages to scan.

    Returns:
        A tuple ``(markers_found, sustainability_hits)``: the distinct
        markers seen on any page, sorted alphabetically, and the total hit
        count, where a marker counts once for every page it appears on.
    """
    markers_found: set[str] = set()
    sustainability_hits = 0
    for page_text in pages:
        lower_page = page_text.lower()
        # One pass per page yields both results: the page's distinct matches
        # feed the overall set, and their number is this page's hit count.
        page_matches = {marker for marker in KEYWORD_MARKERS if marker in lower_page}
        markers_found |= page_matches
        sustainability_hits += len(page_matches)
    return sorted(markers_found), sustainability_hits


def apply_rule_based_filter(parsed: ParsedDocument, min_total_pages: int = 20, min_keywords_hit: int = 3) -> HeuristicResult:
    """
    Decide heuristically whether a parsed document looks like a sustainability report.

    Only the pages returned by ``parsed.get_start_and_end_pages()`` are
    inspected for language and keyword/marker hits; the page-count check uses
    ``parsed.total_pages``.

    Args:
        parsed: The parsed document to evaluate
        min_total_pages: Smallest page count a candidate may have (default: 20)
        min_keywords_hit: Smallest number of keyword/marker hits a candidate may have (default: 3)

    Returns:
        HeuristicResult with the detected language, the markers found, the
        hit count, the page count and the final candidate flag
    """
    excerpt = parsed.get_start_and_end_pages()
    detected_language = detect_language(excerpt)
    found, hit_count = find_markers(excerpt)

    # A candidate must satisfy both thresholds.
    long_enough = parsed.total_pages >= min_total_pages
    enough_hits = hit_count >= min_keywords_hit
    is_candidate = long_enough and enough_hits

    return HeuristicResult(
        language=detected_language,
        markers_found=found,
        sustainability_hits=hit_count,
        page_count=parsed.total_pages,
        candidate=is_candidate,
    )
