from ragang.core.bases.datas.performance import Performance
from ragang.core.bases.abstracts.base_metric import BaseMetric
from ragang.adapters.llm_adapter import BaseLLMAdapter
from ragang.adapters.embedding_adapter import BaseEmbeddingAdapter
import re
import json
import numpy as np
import logging

logger = logging.getLogger(__name__)


class BaseBuiltinMetric(BaseMetric):
    """
    Shared base class for the built-in metrics defined in this module.

    Forwards ``param_src`` to ``BaseMetric.__init__`` and stores the optional adapters
    on the instance so subclasses can reach them through ``self``.

    :param param_src: Passed unchanged to ``BaseMetric.__init__``.
    :type param_src: list[str]
    :param llm_adapter: LLM adapter kept as ``self.llm_adapter`` (``None`` when omitted).
    :type llm_adapter: BaseLLMAdapter
    :param embedding_adapter: Embedding adapter kept as ``self.embedding_adapter`` (``None`` when omitted).
    :type embedding_adapter: BaseEmbeddingAdapter
    """

    def __init__(self, param_src: list[str], llm_adapter: BaseLLMAdapter = None, embedding_adapter: BaseEmbeddingAdapter = None):
        super().__init__(param_src)
        # Stored after the base initialiser has run, LLM adapter first.
        self.llm_adapter, self.embedding_adapter = llm_adapter, embedding_adapter


class A2RYNFaithfulnessMetric(BaseBuiltinMetric):
    """
    Generator faithfulness metric via Yes/No judgement on claims.

    :param llm_adapter: The LLM model to use.
    :type llm_adapter: BaseLLMAdapter
    :ivar llm_adapter: Stores the LLM model.
    :vartype llm_adapter: BaseLLMAdapter
    """

    def evaluate(self, ret_docs: list[str], gen: str) -> Performance:
        """
        Evaluates the faithfulness of a generated answer by breaking it into claims and verifying each against the provided documents.

        One LLM request splits the answer into numbered claims; one further request per claim asks
        for a "Groundedness:" verdict. Only the last verdict line of each response is used.
        A claim whose response has no usable verdict is counted as not grounded.

        :param ret_docs: A list of retrieved documents.
        :type ret_docs: list[str]
        :param gen: The generated answer.
        :type gen: str
        :returns: A Performance object with the faithfulness score calculated by # of faithful claims / # of total claims
                  (0.0 when no claims could be extracted).
        :rtype: Performance
        """
        # NOTE(review): the label says "Relevancy" although this is a faithfulness metric; it is kept
        # unchanged because downstream consumers may key on it - confirm before renaming.
        metric_name = "Yes/No Relevancy"

        prompt_claim_generation = (
            # todo : prompt needs to be refined. especially the rubric has to be specified more.
            """
            The following text is an answer generated by an LLM in response to a user question.

            Split the answer into **evaluation units**, where each unit represents a self-contained claim, explanation, or fact that can be **independently evaluated for factual accuracy or relevance**.

            Avoid splitting into single sentences unless necessary. If multiple sentences support the same point or explain a single concept, keep them together in one unit.

            Each unit should be meaningful and complete enough to be judged on its own.

            Do not include any text or explanation outside of the evaluation units.


            <Output format>
            1. Unit 1
            2. Unit 2
            ...
            n. Unit n

            <Example>
            Text:
            Large Language Models (LLMs) are trained on vast datasets composed of text from books, websites, and other sources. This training enables them to generate fluent and coherent responses. However, LLMs do not possess true understanding or consciousness. Instead, they rely on statistical patterns in language to predict likely next words. As a result, they can sometimes produce hallucinated or inaccurate content, especially on topics not well-covered in their training data.

            Output:
            1. Large Language Models (LLMs) are trained on vast datasets composed of text from books, websites, and other sources.  
            2. This training enables LLMs to generate fluent and coherent responses.  
            3. LLMs do not possess true understanding or consciousness; they rely on statistical patterns in language to predict likely next words.  
            4. Because of this, LLMs can sometimes produce hallucinated or inaccurate content, especially when the topic is not well covered in their training data.
            ---

            """
        )
        user_query = f"Text:\n{gen}\n\nOutput:\n"
        response = self.llm_adapter.request(prompt_claim_generation, user_query)
        # Keep only the lines that look like numbered list items ("1. ...").
        claim_list = [
            line.strip()
            for line in response["text"].split('\n')
            if re.match(r"^\d+\.\s+", line.strip())
        ]
        if not claim_list:
            # Without this guard the final division would raise ZeroDivisionError.
            logger.warning("No claims were extracted from the generated answer.")
            return Performance(score=0.0, unit="", metric=metric_name)

        prompt_claim_judgement = (  # todo : prompt needs to be refined. especially the rubric has to be specified more.
            """
            You are given a single claim generated by an LLM, and a set of retrieved documents that were provided as context during generation.

            Your task is to determine whether the claim is **grounded** in the retrieved documents — that is, whether it can be supported by the content of any of the documents.

            Instructions:
            - If the claim is **explicitly stated or clearly supported** by one or more of the retrieved documents, mark it as **"Grounded"**.
            - If the claim is **not supported**, **contradicted**, or **cannot be verified** by any of the documents, mark it as **"Not Grounded"**.
            - Briefly justify your decision by citing matching or missing content from the documents.

            <Output format>
            Claim: [insert claim]

            Justification:
            [Explain whether the claim is supported by the content. If grounded, mention the relevant document(s) and excerpt(s). If not, explain what's missing or unsupported.]

            Groundedness: [Grounded / Not Grounded]

            <Example>

            Claim:
            "GPT-4 was released in March 2023."

            Retrieved Documents:
            1. "OpenAI released GPT-4 in March 2023, with improved reasoning performance."
            2. "Unlike earlier models, GPT-4 can process both text and images."

            Justification:
            The first document supports the release date of GPT-4.

            Groundedness: Grounded

            ---

            <Now your input>

            """
        )
        grounded_count = 0
        for claim in claim_list:
            logger.debug("Claim: %s", claim)
            user_query = f"Claim:\n{claim}\n\nRetrieved Documents:\n{ret_docs}\n\nJustification:\n"
            response = self.llm_adapter.request(prompt_claim_judgement, user_query)
            try:
                response_lines = response["text"].strip().splitlines()
            except (KeyError, TypeError, AttributeError) as e:
                # A malformed response only affects this claim; earlier results are kept.
                logger.warning("Malformed judgement response, counting claim as not grounded: %s", e)
                continue

            # Scan from the end so the final verdict wins over any echoed format/example text.
            for line in reversed(response_lines):
                line = line.strip().lstrip("*#> ")  # tolerate markdown emphasis around the label
                if not line.startswith("Groundedness:"):
                    continue
                verdict = line.split(":", 1)[1].strip(" \t*\"'.[]").casefold()
                if verdict == "grounded":
                    logger.debug("Grounded")
                    grounded_count += 1
                else:
                    logger.debug("Not Grounded")
                # Stop at the first verdict found so one claim can never be counted twice.
                break
        return Performance(score=grounded_count / len(claim_list), unit="", metric=metric_name)


class A2RSimpleScoringFaithfulnessMetric(BaseBuiltinMetric):
    """
    Generator faithfulness metric via scoring on claims.

    :param llm_adapter: The LLM model to use.
    :type llm_adapter: BaseLLMAdapter
    :ivar llm_adapter: Stores the LLM model.
    :vartype llm_adapter: BaseLLMAdapter
    """

    def evaluate(self, ret_docs: list[str], gen: str) -> Performance:
        """
        Evaluates the faithfulness of a generated answer by scoring claims against the provided documents.

        One LLM request splits the answer into numbered claims; one further request per claim asks
        for a "Score:" line following the 0/2/4 rubric. The first integer on the last "Score:" line
        is used and clamped to the rubric range, so a stray value cannot push the result above 1.0.
        Claims whose response has no parsable score contribute 0.

        :param ret_docs: A list of retrieved documents.
        :type ret_docs: list[str]
        :param gen: The generated answer.
        :type gen: str
        :returns: A Performance object with the faithfulness score calculated by total score / possible highest score
                  (0 when no claims could be extracted).
        :rtype: Performance
        """
        max_claim_score = 4  # highest value of the rubric in the scoring prompt below

        prompt_claim_generation = (
            """
            The following text is an answer generated by an LLM in response to a user question.

            Split the answer into **evaluation units**, where each unit represents a self-contained claim, explanation, or fact that can be **independently evaluated for factual accuracy or relevance**.

            Avoid splitting into single sentences unless necessary. If multiple sentences support the same point or explain a single concept, keep them together in one unit.

            Each unit should be meaningful and complete enough to be judged on its own.

            Do not include any text or explanation outside of the evaluation units.

            <Output format>
            1. Unit 1
            2. Unit 2
            ...
            n. Unit n

            <Example>
            Text:
            Large Language Models (LLMs) are trained on vast datasets composed of text from books, websites, and other sources. This training enables them to generate fluent and coherent responses. However, LLMs do not possess true understanding or consciousness. Instead, they rely on statistical patterns in language to predict likely next words. As a result, they can sometimes produce hallucinated or inaccurate content, especially on topics not well-covered in their training data.

            Output:
            1. Large Language Models (LLMs) are trained on vast datasets composed of text from books, websites, and other sources.  
            2. This training enables LLMs to generate fluent and coherent responses.  
            3. LLMs do not possess true understanding or consciousness; they rely on statistical patterns in language to predict likely next words.  
            4. Because of this, LLMs can sometimes produce hallucinated or inaccurate content, especially when the topic is not well covered in their training data.
            ---
            """
        )
        user_query = f"Text:\n{gen}\n\nOutput:\n"
        response = self.llm_adapter.request(prompt_claim_generation, user_query)
        # Keep only the lines that look like numbered list items ("1. ...").
        claim_list = [
            line.strip()
            for line in response["text"].split('\n')
            if re.match(r"^\d+\.\s+", line.strip())
        ]

        prompt_claim_scoring = (
            """
            You are given a single claim generated by an LLM and a set of retrieved documents that were provided as context. Your task is to score the faithfulness of the claim based on the provided documents.

            **Scoring Rubric:**
            - **4 (Exactly Aligns):** The claim is fully supported by the information in the retrieved documents.
            - **2 (Partially Aligns):** The claim is partially supported by the documents but may contain some minor inaccuracies or unverified details.
            - **0 (Does Not Align):** The claim is inaccurate, contradicted by the documents, or completely unrelated to the provided context.

            **Instructions:**
            1.  Carefully read the claim and the retrieved documents.
            2.  Compare the information in the claim against the documents.
            3.  Provide a brief justification for your score.
            4.  Assign a score of 0, 2, or 4 based on the rubric.

            **Output Format:**
            Claim: [The claim being evaluated]

            Justification:
            [Your brief explanation for the score]

            Score: [0, 2, or 4]

            **Example:**
            Claim:
            "GPT-4 was released by OpenAI in March 2023 and can process both text and images."

            Retrieved Documents:
            1. "OpenAI released GPT-4 in March 2023, with improved reasoning performance."
            2. "Unlike earlier models, GPT-4 can process both text and images."

            Justification:
            Both parts of the claim—the release date and the multimodal capability—are explicitly supported by the provided documents.

            Score: 4
            ---
            """
        )

        total_score = 0
        for i, claim in enumerate(claim_list, start=1):
            user_query = f"Claim:\n{claim}\n\nRetrieved Documents:\n{ret_docs}\n\nJustification:\n"
            response = self.llm_adapter.request(prompt_claim_scoring, user_query)
            score = 0  # worst score is assumed whenever the response cannot be parsed
            try:
                response_lines = response["text"].strip().splitlines()
            except (KeyError, TypeError, AttributeError) as e:
                logger.warning("Malformed scoring response, assuming score 0: %s", e)
                response_lines = []

            # Scan from the end so the final score wins over any echoed format/example text.
            for line in reversed(response_lines):
                line = line.strip().lstrip("*#> ")  # tolerate markdown emphasis around the label
                if not line.startswith("Score:"):
                    continue
                # Accept decorated values such as "4/4", "[2]" or "2 (Partially Aligns)".
                number = re.search(r"-?\d+", line.split(":", 1)[1])
                if number:
                    # Clamp to the rubric range so the normalised score stays within [0, 1].
                    score = max(0, min(max_claim_score, int(number.group())))
                break
            total_score += score

            logger.debug("claim %d: %s score: %d", i, claim, score)

        final_score = total_score / (len(claim_list) * max_claim_score) if claim_list else 0
        return Performance(score=final_score, unit="score", metric="Simple Scoring Faithfulness")


class A2RHallucinationFaithfulnessMetric(BaseBuiltinMetric):
    """
    Generator hallucination metric via Yes/No judgement.

    :param llm_adapter: The LLM model to use.
    :type llm_adapter: BaseLLMAdapter
    :ivar llm_adapter: Stores the LLM model.
    :vartype llm_adapter: BaseLLMAdapter
    """

    def evaluate(self, query: str, ret_docs: list[str], gen: str) -> Performance:
        """
        Evaluates whether a generated answer is a hallucination based on a relevant text.

        The whole task (instructions, query, reference text and answer) is sent as the first
        argument of ``llm_adapter.request`` with an empty second argument. The reply is compared
        case-insensitively after stripping surrounding non-letter characters, so replies such as
        ``Factual.`` or ``"factual"`` are still recognised. Anything other than "factual" scores 0.

        :param query: The user's query.
        :type query: str
        :param ret_docs: A list of relevant documents.
        :type ret_docs: list[str]
        :param gen: The generated answer.
        :type gen: str
        :returns: A Performance object with a score of 1 for "factual" and 0 for "hallucinated".
        :rtype: Performance
        """
        prompt = f"""In this task, you will be presented with a query, a reference text and an answer. 
            The answer is generated to the question based on the reference text. The answer may contain false information. 
            You must use the reference text to determine if the answer to the question contains false information, if the answer is a hallucination of facts. 
            Your objective is to determine whether the answer text contains factual information and is not a hallucination. 
            A 'hallucination' refers to an answer that is not based on the reference text or assumes information that is not available in the reference text. 
            Your response should be a single word: either "factual" or "hallucinated", and it should not include any other text or characters. 
            "hallucinated" indicates that the answer provides factually inaccurate information to the query based on the reference text. 
            "factual" indicates that the answer to the question is correct relative to the reference text, and does not contain made up information. 
            Please read the query and reference text carefully before determining your response.
            # Query: {query}
            # Reference text: {ret_docs}
            # Answer: {gen}
            Is the answer above factual or hallucinated based on the query and reference text?"""

        response = self.llm_adapter.request(prompt, "")
        result = response["text"].strip().lower()
        # Drop quotes, punctuation or markdown wrapped around the single-word verdict; inner text
        # is left alone, so e.g. "not factual" still does not match.
        verdict = re.sub(r"^[^a-z]+|[^a-z]+$", "", result)

        score = 1 if verdict == "factual" else 0

        return Performance(score=score, unit="", metric="Hallucination (Factual/Hallucinated)")


class A2RTruthfulFaithfulnessMetric(BaseBuiltinMetric):
    """
    Generator faithfulness metric via truthfulness judgement on claims.

    :param llm_adapter: The LLM model to use.
    :type llm_adapter: BaseLLMAdapter
    :ivar llm_adapter: Stores the LLM model.
    :vartype llm_adapter: BaseLLMAdapter
    """

    def evaluate(self, ret_docs: list[str], gen: str) -> Performance:
        """
        Evaluates the truthfulness of a generated answer by checking for contradictions with the provided documents.

        One LLM request splits the answer into numbered claims; one further request per claim asks
        for a "Faithfulness:" verdict. Only the last verdict line of each response is used.
        A claim whose response has no usable verdict is counted as not truthful.

        :param ret_docs: A list of retrieved documents.
        :type ret_docs: list[str]
        :param gen: The generated answer.
        :type gen: str
        :returns: A Performance object with the truthfulness score calculated by # of truthful claims / # of total claims
                  (0.0 when no claims could be extracted).
        :rtype: Performance
        """
        # NOTE(review): the label says "Relevancy" although this is a truthfulness metric; it is kept
        # unchanged because downstream consumers may key on it - confirm before renaming.
        metric_name = "Yes/No Relevancy"

        prompt_claim_generation = (
            # todo : prompt needs to be refined. especially the rubric has to be specified more.
            """
            The following text is an answer generated by an LLM in response to a user question.

            Split the answer into **evaluation units**, where each unit represents a self-contained claim, explanation, or fact that can be **independently evaluated for factual accuracy or relevance**.

            Avoid splitting into single sentences unless necessary. If multiple sentences support the same point or explain a single concept, keep them together in one unit.

            Each unit should be meaningful and complete enough to be judged on its own.

            Do not include any text or explanation outside of the evaluation units.

            <Output format>
            1. Unit 1
            2. Unit 2
            ...
            n. Unit n

            <Example>
            Text:
            Large Language Models (LLMs) are trained on vast datasets composed of text from books, websites, and other sources. This training enables them to generate fluent and coherent responses. However, LLMs do not possess true understanding or consciousness. Instead, they rely on statistical patterns in language to predict likely next words. As a result, they can sometimes produce hallucinated or inaccurate content, especially on topics not well-covered in their training data.

            Output:
            1. Large Language Models (LLMs) are trained on vast datasets composed of text from books, websites, and other sources.  
            2. This training enables LLMs to generate fluent and coherent responses.  
            3. LLMs do not possess true understanding or consciousness; they rely on statistical patterns in language to predict likely next words.  
            4. Because of this, LLMs can sometimes produce hallucinated or inaccurate content, especially when the topic is not well covered in their training data.
            ---
            """
        )
        user_query = f"Text:\n{gen}\n\nOutput:\n"
        response = self.llm_adapter.request(prompt_claim_generation, user_query)
        # Keep only the lines that look like numbered list items ("1. ...").
        claim_list = [
            line.strip()
            for line in response["text"].split('\n')
            if re.match(r"^\d+\.\s+", line.strip())
        ]
        if not claim_list:
            # Without this guard the final division would raise ZeroDivisionError.
            logger.warning("No claims were extracted from the generated answer.")
            return Performance(score=0.0, unit="", metric=metric_name)

        prompt_claim_judgement = (  # todo : prompt needs to be refined. especially the rubric has to be specified more.
            """
            You are given a single claim generated by an LLM and a set of retrieved documents that were available during generation.

            Your task is to assess whether the claim is **truthful**, based on whether it **contradicts any facts** presented in the retrieved documents.

            Instructions:
            - A claim is considered **Truthful** if it does **not contradict any facts** in the retrieved documents.
            - A claim is **Not Truthful** if it directly contradicts, misrepresents, or states something clearly incompatible with the documents.
            - You do **not** need to verify that the documents explicitly support the claim — only that they **do not contradict** it.
            - Provide a brief justification based on the presence or absence of contradiction.

            <Output format>
            Claim: [insert claim]

            Justification:
            [State whether the claim contradicts anything in the documents. If it does not, say so. If it does, explain what and where.]

            Faithfulness: [Truthful / Not Truthful]

            <Example>

            Claim:
            "GPT-4 was released in March 2023."

            Retrieved Documents:
            1. "OpenAI released GPT-4 in March 2023, with improved reasoning performance."
            2. "Unlike earlier models, GPT-4 can process both text and images."

            Justification:
            The claim is consistent with the information in the first document and is not contradicted anywhere.

            Faithfulness: Truthful

            ---

            <Now your input>

            """
        )
        truthful_count = 0
        for claim in claim_list:
            logger.debug("Claim: %s", claim)
            user_query = f"Claim:\n{claim}\n\nRetrieved Documents:\n{ret_docs}\n\nJustification:\n"
            response = self.llm_adapter.request(prompt_claim_judgement, user_query)
            try:
                response_lines = response["text"].strip().splitlines()
            except (KeyError, TypeError, AttributeError) as e:
                # A malformed response only affects this claim; earlier results are kept.
                logger.warning("Malformed judgement response, counting claim as not truthful: %s", e)
                continue

            # Scan from the end so the final verdict wins over any echoed format/example text.
            for line in reversed(response_lines):
                line = line.strip().lstrip("*#> ")  # tolerate markdown emphasis around the label
                if not line.startswith("Faithfulness:"):
                    continue
                verdict = line.split(":", 1)[1].strip(" \t*\"'.[]").casefold()
                if verdict == "truthful":
                    logger.debug("Truthful")
                    truthful_count += 1
                else:
                    logger.debug("Not Truthful")
                # Stop at the first verdict found so one claim can never be counted twice.
                break
        return Performance(score=truthful_count / len(claim_list), unit="", metric=metric_name)


class A2RYNFaithfulnessMetricSingleCall(BaseBuiltinMetric):
    """
    (EXPERIMENTAL) Generator faithfulness metric via a single Yes/No judgement call on all claims.

    This approach is faster and more cost-effective than making an API call for each claim.
    However, for smaller models, a single large prompt might be challenging. The prompt and parsing
    have been optimized for robustness.

    :param llm_adapter: The LLM model to use.
    :type llm_adapter: BaseLLMAdapter
    :ivar llm_adapter: Stores the LLM model.
    :vartype llm_adapter: BaseLLMAdapter
    """

    @staticmethod
    def _first_json_array(text: str):
        """
        Returns the first JSON array that can be decoded from ``text``, or ``None`` if there is none.

        Decoding starts at each ``[`` in turn and uses a real JSON parser, so brackets inside
        string values (e.g. a "[1]" citation in a claim), markdown code fences and trailing
        commentary do not cut the array short.
        """
        decoder = json.JSONDecoder()
        start = text.find("[")
        while start != -1:
            try:
                parsed, _ = decoder.raw_decode(text, start)
            except json.JSONDecodeError:
                start = text.find("[", start + 1)
            else:
                return parsed
        return None

    def evaluate(self, ret_docs: list[str], gen: str, max_docs: int = 5, max_gen_chars: int = 2000) -> Performance:
        """
        Evaluates the faithfulness of a generated answer by extracting and verifying all claims in a single LLM call.

        To manage context size, inputs are truncated:
        - The number of documents is limited by `max_docs`.
        - The generated answer is limited by `max_gen_chars`.

        :param ret_docs: A list of retrieved documents (``None`` is treated as no documents).
        :type ret_docs: list[str]
        :param gen: The generated answer.
        :type gen: str
        :param max_docs: The maximum number of documents to include in the prompt.
        :type max_docs: int
        :param max_gen_chars: The maximum number of characters from the generated answer to evaluate.
        :type max_gen_chars: int
        :returns: A Performance object with the faithfulness score (grounded claims / returned claims),
                  0.0 for an empty answer or an empty evaluation list, or np.nan on failure.
        :rtype: Performance
        """
        metric_name = "Faithfulness (Single Call)"

        if not gen or not gen.strip():
            return Performance(score=0.0, unit="", metric=metric_name)

        # Truncate inputs to manage context window size
        truncated_gen = gen[:max_gen_chars]
        truncated_docs = (ret_docs or [])[:max_docs]

        prompt = (
            """
            You are an expert evaluator. Your task is to break down a generated answer into self-contained claims
            and evaluate if each claim is grounded in a set of retrieved documents.

            A claim is "Grounded" if supported by the documents. Otherwise, it is "Not Grounded".

            Your final output **must be a single, valid JSON array of objects**. Each object must have two keys:
            - "`claim`": The string containing the claim.
            - "`grounded`": A boolean value (`true` if grounded, `false` otherwise).

            **Do not include any other text or explanations outside of the JSON array.**

            <Example>
            <Generated Answer>
            The Eiffel Tower is 330 meters tall and was designed by Gustave Eiffel.

            <Retrieved Documents>
            1. "The Eiffel Tower stands at a height of 330 meters (1,083 ft)."
            2. "The famous tower in Paris was built for the 1889 World's Fair."

            <Output>
            [
                {"claim": "The Eiffel Tower is 330 meters tall.", "grounded": true},
                {"claim": "It was designed by Gustave Eiffel.", "grounded": false}
            ]
            """
        )

        formatted_docs = "\n".join([f"{i + 1}. {doc}" for i, doc in enumerate(truncated_docs, start=0)])
        user_query = f"<Generated Answer>\n{truncated_gen}\n\n<Retrieved Documents>\n{formatted_docs}\n\n<Output>\n"

        try:
            response = self.llm_adapter.request(prompt, user_query)
            response_text = response["text"].strip()
        except (KeyError, AttributeError, IndexError, TypeError) as e:
            # No response text exists on this path, so the handler must not try to log it.
            logger.error("Failed to parse LLM response structure for single-call faithfulness: %s", e)
            return Performance(score=np.nan, unit="", metric=metric_name)

        evaluations = self._first_json_array(response_text)
        if evaluations is None:
            logger.error("Failed to find a valid JSON array in the LLM response.")
            logger.debug("Malformed response: %s", response_text)
            return Performance(score=np.nan, unit="", metric=metric_name)

        if not evaluations:
            return Performance(score=0.0, unit="", metric=metric_name)

        if not all(isinstance(entry, dict) for entry in evaluations):
            # The prompt asks for objects; any other element type means the output is malformed.
            logger.error("JSON array in the LLM response contains non-object entries.")
            logger.debug("Malformed response: %s", response_text)
            return Performance(score=np.nan, unit="", metric=metric_name)

        # Only a literal JSON `true` counts as grounded.
        grounded_count = sum(1 for entry in evaluations if entry.get("grounded") is True)
        score = grounded_count / len(evaluations)

        return Performance(score=score, unit="", metric=metric_name)


class A2RHybridFaithfulnessMetric(BaseBuiltinMetric):
    """
    A robust faithfulness metric that separates claim extraction and judgment into two steps.
    It extracts all claims first, then evaluates them in batches to manage context size,
    providing a balance between performance and reliability.

    :param param_src: Passed unchanged to the base class initialiser.
    :type param_src: list[str]
    :param llm_adapter: The LLM model to use.
    :type llm_adapter: BaseLLMAdapter
    :param embedding_adapter: Embedding adapter stored on the instance; not used by this metric.
    :type embedding_adapter: BaseEmbeddingAdapter
    :param claims_batch_size: The number of claims to evaluate in a single LLM call.
    :type claims_batch_size: int
    """

    def __init__(self, param_src: list[str] = None, llm_adapter: BaseLLMAdapter = None,
                 embedding_adapter: BaseEmbeddingAdapter = None, claims_batch_size: int = 10):
        # The base initialiser takes (param_src, llm_adapter, embedding_adapter). The previous code
        # passed (llm_adapter, embedding_adapter), so every value landed one slot to the left.
        # Taking param_src first mirrors the sibling metrics and keeps the positional call
        # (param_src, llm_adapter) behaving as before.
        super().__init__(param_src, llm_adapter, embedding_adapter)
        self.claims_batch_size = claims_batch_size

    @staticmethod
    def _first_json_array(text: str):
        """
        Returns the first JSON array that can be decoded from ``text``, or ``None`` if there is none.

        Decoding starts at each ``[`` in turn and uses a real JSON parser, so brackets inside
        string values, markdown code fences and trailing commentary do not cut the array short.
        """
        decoder = json.JSONDecoder()
        start = text.find("[")
        while start != -1:
            try:
                parsed, _ = decoder.raw_decode(text, start)
            except json.JSONDecodeError:
                start = text.find("[", start + 1)
            else:
                return parsed
        return None

    def _extract_claims(self, ret_docs: list[str] = None, gen: str = None) -> list[str]:
        """
        Extracts claims from the generated text using an LLM call.

        :param ret_docs: Unused; kept so existing call sites keep working.
        :param gen: The generated answer to split into claims.
        :returns: The string entries of the JSON array returned by the LLM, or an empty list on failure.
        """
        prompt = (
            """
            You are a text analysis expert. Your task is to split the following answer into self-contained, 
            verifiable claims. Each claim should be a single, complete sentence or a meaningful unit that can be 
            judged for accuracy on its own.

            Provide the output as a single, valid JSON array of strings.
            **Do not include any other text, markdown formatting, or explanations outside of the JSON array.**

            <Example>
            <Answer>
            The Eiffel Tower, located in Paris, is 330 meters tall. It was designed by Gustave Eiffel.

            <Output>
            ```json
            [
                "The Eiffel Tower, located in Paris, is 330 meters tall.",
                "It was designed by Gustave Eiffel."
            ]
            ```
            ---

            """
        )
        user_query = f"<Answer>\n{gen}\n\n<Output>\n"

        try:
            response = self.llm_adapter.request(prompt, user_query)
            response_text = response["text"].strip()
        except (KeyError, AttributeError, TypeError) as e:
            # No response text exists on this path, so the handler must not try to log it.
            logger.error("Failed to extract claims: %s", e)
            return []

        claims = self._first_json_array(response_text)
        if claims is None:
            logger.error("Failed to find a valid JSON array in the claim extraction response.")
            logger.debug("Malformed response: %s", response_text)
            return []

        # Non-string entries are dropped rather than coerced.
        return [claim for claim in claims if isinstance(claim, str)]

    def _judge_claims_batch(self, claims: list[str], ret_docs: list[str]) -> list[dict]:
        """
        Judges a batch of claims for faithfulness against the provided documents.

        :param claims: The claims of this batch.
        :param ret_docs: The documents to judge against.
        :returns: The decoded evaluation list, or an empty list when the response is malformed
                  or does not contain exactly one evaluation per claim.
        """
        # Note the double curly braces `{{` and `}}` to escape them for the .format() method.
        prompt = (
            """
            You are an expert evaluator. For each claim in the <Claims to Evaluate> list, you must determine if it is grounded in the <Retrieved Documents>.
            A claim is "Grounded" if it is explicitly stated or clearly supported by the documents.

            Your final output **must be a single, valid JSON array of objects**. Each object must have two keys:
            - "`claim`": The exact string of the claim you evaluated.
            - "`grounded`": A boolean value (`true` if the claim is grounded, `false` otherwise).

            **Do not include any other text, markdown formatting, or explanations outside of the JSON array.**

            <Example>
            <Retrieved Documents>
            1. The sky is blue due to Rayleigh scattering.
            2. Water is composed of hydrogen and oxygen.

            <Claims to Evaluate>
            1. The sky's color is caused by Rayleigh scattering.
            2. The ocean is salty.

            <Output>
            ```json
            [
                {{
                    "claim": "The sky's color is caused by Rayleigh scattering.",
                    "grounded": true
                }},
                {{
                    "claim": "The ocean is salty.",
                    "grounded": false
                }}
            ]
            ```
            ---
            <Retrieved Documents>
            {formatted_docs}

            <Claims to Evaluate>
            {formatted_claims}

            <Output>
            """
        )

        formatted_docs = "\n".join([f"{i + 1}. {doc}" for i, doc in enumerate(ret_docs)])
        formatted_claims = "\n".join([f"{i + 1}. {claim}" for i, claim in enumerate(claims)])

        final_prompt = prompt.format(formatted_docs=formatted_docs, formatted_claims=formatted_claims)

        try:
            response = self.llm_adapter.request(final_prompt, "")
            response_text = response["text"].strip()
        except (KeyError, AttributeError, TypeError) as e:
            # No response text exists on this path, so the handler must not try to log it.
            logger.error("Failed to parse judgment response structure: %s", e)
            return []

        evaluations = self._first_json_array(response_text)
        if evaluations is None:
            logger.error("Failed to find a valid JSON array in the judgment response.")
            logger.debug("Malformed response: %s", response_text)
            return []

        if len(evaluations) != len(claims):
            logger.warning(f"Judgment mismatch: Expected {len(claims)} evaluations, but got {len(evaluations)}.")
            return []
        return evaluations

    def evaluate(self, ret_docs: list[str] = None, gen: str = None, max_docs: int = 5) -> Performance:
        """
        Evaluates faithfulness by extracting all claims and then judging them in batches.

        Claims in a batch that failed to be judged stay in the denominator, so they count as
        not grounded.

        :param ret_docs: A list of retrieved documents (``None`` is treated as no documents).
        :type ret_docs: list[str]
        :param gen: The generated answer.
        :type gen: str
        :param max_docs: The maximum number of documents to include for judgment.
        :type max_docs: int
        :returns: A Performance object with the faithfulness score (grounded claims / extracted claims),
                  0.0 when the answer is empty or yields no claims, or np.nan when every batch failed.
        :rtype: Performance
        """
        if not gen or not gen.strip():
            return Performance(score=0.0, unit="", metric="Hybrid Faithfulness")

        # Pass by keyword: the first positional parameter of _extract_claims is ret_docs, so a
        # positional call would leave gen as None and send the text "None" to the LLM.
        claims = self._extract_claims(gen=gen)
        if not claims:
            logger.warning("No claims were extracted from the generated answer.")
            return Performance(score=0.0, unit="", metric="Hybrid Faithfulness")

        all_evaluations = []
        truncated_docs = (ret_docs or [])[:max_docs]  # include only the first [max_docs] documents

        for i in range(0, len(claims), self.claims_batch_size):
            batch = claims[i:i + self.claims_batch_size]
            evaluations = self._judge_claims_batch(batch, truncated_docs)
            if evaluations:
                all_evaluations.extend(evaluations)
            else:
                logger.warning(f"A batch of {len(batch)} claims failed to be judged.")

        if not all_evaluations:
            logger.error("All claim judgment batches failed.")
            return Performance(score=np.nan, unit="", metric="Hybrid Faithfulness")

        # Only a literal JSON `true` on an object entry counts as grounded.
        grounded_count = sum(1 for e in all_evaluations if isinstance(e, dict) and e.get("grounded") is True)

        total_claims = len(claims)
        score = grounded_count / total_claims

        return Performance(score=score, unit="", metric="Hybrid Faithfulness")
