from collections.abc import Sequence

from ice.formatter.multi import format_multi
from ice.formatter.multi import stop
from ice.formatter.transform.value import numbered_list
from ice.formatter.transform.value import ValueTransform
from ice.recipes.experiments_and_arms.prompts.utils import get_part
from ice.recipes.experiments_and_arms.prompts.utils import start_last_example
from ice.recipes.experiments_and_arms.types import MultipartReasoningPrompt

# Template for a single worked example in the "can we count experiments?"
# few-shot prompt. It has four placeholders, filled in per example:
#   {paragraphs}  - the excerpts being judged
#   {reasoning}   - step-by-step discussion of the excerpts
#   {helpfulness} - which excerpts helped determine the number of experiments
#   {answer}      - the final verdict; the template asks for Yes, No, or Unclear
# The text is sent to the model as-is, so any wording change alters the prompt.
# Helpers in this module use fragments of this wording ("Let's think it over:",
# "Which excerpt", "understanding how many experiments were conducted in this
# study? ") as delimiters, so those phrases must be kept in sync with them.
CAN_WE_COUNT_EXPERIMENTS_EXAMPLE_TEMPLATE = """Do the following excerpts explain the methodology of the paper, including how many experiments were conducted, and what the trial arms in each experiment were? If the passages merely discuss multiple interventions but do not specify that they belong to distinct experiments, answer "Unclear".

Excerpts:

{paragraphs}

Let's think it over:

{reasoning}

Which excerpts, if any, were helpful in understanding how many experiments were conducted in this study? {helpfulness}

Therefore, do the above excerpts explain the methodology of the paper, including how many experiments were conducted, and what the trial arms in each experiment were? If we cannot tell from the excerpts, answer "Unclear". Answer Yes, No, or Unclear.

Answer: {answer}""".strip()


# Worked few-shot examples for the prompt. Each entry is a dict supplying the
# four template fields: `paragraphs` (wrapped with `numbered_list`),
# `reasoning`, `helpfulness`, and `answer`.
# There is currently one example: four excerpts whose verdict is "Unclear".
# The excerpt strings are raw paper text and are part of the prompt; they are
# intentionally left unedited (including any extraction artifacts).
CAN_WE_COUNT_EXPERIMENTS_EXAMPLES: list[
    dict[str, ValueTransform[Sequence[str]] | str]
] = [
    dict(
        paragraphs=numbered_list(
            [
                """We find that the effects of the workshop are significantly larger for the most disadvantaged. In particular, we show in Table 5 that the least educated, the least experienced, and those with the lowest expected earnings benefit the most from the interventions. For other dimensions, we are unable to find significant differences in response to treatment. 51  he size of the effects for the worst-off workers is substantial. For example, young people without tertiary education increase the earnings by almost 60 percent, while the low predicted earnings group experiences a 50 percent increase. This causes a large reduction in earning inequality: the earning gap between the low and the high earnings group drops from 142 percent to 54 percent and, strikingly, the gap between experienced and inexperienced workers is fully erased. Overall, these results illustrate the large equity gains that can be generated by helping young workers to access the labour market through improved signalling.""",
                """A.17""",
                """Note. In this table we report the intent-to-treat estimates of the direct and indirect effects of the transport intervention and the job application workshop on financial outcomes. These are obtained by OLS estimation of equation ( 1), weighting each observation by the inverse of the probability of being sampled. Below each coefficient estimate, we report the s.e. in parentheses and the q-value in brackets. We correct standard errors to allow for arbitrary correlation at the level of geographical clusters. q-values are obtained using the sharpened procedure of Benjamini et al. (2006). Changing number of observations due to missing values in the dependent variable. In the last three columns we report the mean outcome for the control group, the p-value from a F-test of the null hypothesis that transport subsidies and the job application workshop have the same effect, and the number of observations. ***p< 0.01, **p<0.05, *p<0.1. Note. In this table we report the intent-to-treat estimates of the direct and indirect effects of the transport intervention and the job application workshop on expectations, aspirations and reservation wages. These are obtained by OLS estimation of equation ( 1), weighting each observation by the inverse of the probability of being sampled. Below each coefficient estimate, we report the s.e. in parentheses and the q-value in brackets. We correct standard errors to allow for arbitrary correlation at the level of geographical clusters. q-values are obtained using the sharpened procedure of Benjamini et al. (2006). Changing number of observations due to missing values in the dependent variable. In the last three columns we report the mean outcome for the control group, the p-value from a F-test of the null hypothesis that transport subsidies and the job application workshop have the same effect, and the number of observations. ***p< 0.01, **p<0.05, *p<0.1. Note. 
In this table we report the intent-to-treat estimates of the direct and indirect effects of the transport intervention and the job application workshop on outcomes related to mobility. These are obtained by OLS estimation of equation ( 1), weighting each observation by the inverse of the probability of being sampled. Below each coefficient estimate, we report the s.e. in parentheses and the q-value in brackets. We correct standard errors to allow for arbitrary correlation at the level of geographical clusters. q-values are obtained using the sharpened procedure of Benjamini et al. (2006). Changing number of observations due to missing values in the dependent variable. In the last three columns we report the mean outcome for the control group, the p-value from a F-test of the null hypothesis that transport subsidies and the job application workshop have the same effect, and the number of observations. ***p< 0.01, **p<0.05, *p<0.1.""",
                """A.19 1), weighting each observation by the inverse of the probability of being sampled. Below each coefficient estimate, we report the s.e. in parentheses and the q-value in brackets. We correct standard errors to allow for arbitrary correlation at the level of geographical clusters. q-values are obtained using the sharpened procedure of Benjamini et al. (2006). Changing number of observations due to missing values in the dependent variable. In the last three columns we report the mean outcome for the control group, the p-value from a F-test of the null hypothesis that transport subsidies and the job application workshop have the same effect, and the number of observations. ***p< 0.01, **p<0.05, *p<0.1.""",
            ]
        ),
        reasoning="""Excerpt 1 discusses findings related to a workshop; this appears to be from a section of the paper reporting results, not a methodology section, and it's not clear whether this paper conducted multiple experiments or just one.

Excerpt 2 is just a reference and does not have any useful content.

Excerpt 3 is a note explaining some analytic methodology related to reporting outcomes from the job application workshop and the transport subsidy. However, it does not clarify whether these belong to separate experiments.

Excerpt 4 also explains some analytic methodology; because it seems to compare the effect sizes of the job application workshop and the transport subsidy together, it suggests that these might be two trial arms belonging to a single experiment, but it is ultimately unclear here.""",
        helpfulness="""Excerpts 3 and 4 were somewhat helpful. Excerpts 1 and 2 were not helpful.""",
        answer="Unclear",
    )
]


def make_can_we_count_experiments_prompt(num_shots: int) -> MultipartReasoningPrompt:
    """Build a prompt function for the "can we count experiments?" question.

    Args:
        num_shots: Upper bound on how many entries of
            ``CAN_WE_COUNT_EXPERIMENTS_EXAMPLES`` are placed before the final
            example. The list is sliced with ``[:num_shots]``, so a value
            larger than the number of available examples uses all of them.

    Returns:
        A function taking ``paragraphs`` plus optional ``helpfulness`` and
        ``reasoning`` strings and returning the rendered prompt text.
    """

    def can_we_count_experiments_prompt(
        paragraphs: Sequence[str],
        helpfulness: str | None = None,
        reasoning: str | None = None,
    ) -> str:
        """Render the worked examples followed by the example for `paragraphs`."""
        base_fields = start_last_example(
            helpfulness=helpfulness,
            reasoning=reasoning,
            pre_final="Excerpt 1",
        )
        # These two fields take precedence over whatever `start_last_example`
        # returned for the same keys. With no reasoning supplied (None or
        # empty), the slot is filled with `stop("Excerpt 1")` instead
        # (presumably ending the prompt there so the model continues from
        # "Excerpt 1" - confirm against `ice.formatter.multi.stop`).
        overrides = {
            "paragraphs": numbered_list(paragraphs),
            "reasoning": reasoning or stop("Excerpt 1"),
        }
        final_example = base_fields | overrides

        shots = CAN_WE_COUNT_EXPERIMENTS_EXAMPLES[:num_shots]
        rendered_examples = format_multi(
            CAN_WE_COUNT_EXPERIMENTS_EXAMPLE_TEMPLATE,
            shots + [final_example],
        )
        # Consecutive examples are separated by one blank line.
        return "\n\n".join(rendered_examples)

    return can_we_count_experiments_prompt


def get_helpfulness(response: str):
    """Extract the helpfulness statement from `response` using `get_part`.

    The leading delimiter is the tail of the template's helpfulness question
    (including its trailing space); the trailing delimiter is a blank line.
    The result is whatever `get_part` returns for those delimiters
    (presumably the text between them - confirm against `get_part`).
    """
    question_tail = "understanding how many experiments were conducted in this study? "
    blank_line = "\n\n"
    return get_part(response, question_tail, blank_line)


def get_reasoning(response: str):
    """Return the reasoning section of `response`, prefixed with "Excerpt 1 ".

    The section is obtained from `get_part` with "Let's think it over:" as the
    leading delimiter and "\\nWhich excerpt" as the trailing one. The literal
    "Excerpt 1 " is then prepended (presumably restoring the lead-in that the
    prompt already supplied to the model - confirm against the caller).
    """
    lead_in = "Excerpt 1 "
    extracted = get_part(response, "Let's think it over:", "\nWhich excerpt")
    return "".join([lead_in, extracted])


# Candidate completions for the final "Answer:" slot of the prompt. Each one
# starts with a space (presumably so it matches how the model continues
# directly after "Answer:" - confirm against the caller).
CAN_WE_COUNT_EXPERIMENTS_CHOICES = [
    " Yes",
    " No",
    " Unclear",
]

# The choice designated as "best" for this question.
CAN_WE_COUNT_EXPERIMENTS_BEST_CHOICE = " Yes"

# Stop sequence for the reasoning section: a blank line followed by the start
# of the template's "Which excerpts, if any, ..." question.
CAN_WE_COUNT_EXPERIMENTS_REASONING_STOP = (
    "\n\nWhich excerpt",
)
