Source code for langchain.smith.evaluation.string_run_evaluator

"""Run evaluator wrapper for string evaluators."""
from __future__ import annotations

from abc import abstractmethod
from typing import Any, Dict, List, Optional

from langsmith import EvaluationResult, RunEvaluator
from langsmith.schemas import DataType, Example, Run

from langchain.callbacks.manager import (
    AsyncCallbackManagerForChainRun,
    CallbackManagerForChainRun,
)
from langchain.chains.base import Chain
from langchain.evaluation.schema import StringEvaluator
from langchain.load.dump import dumps
from langchain.load.load import loads
from langchain.load.serializable import Serializable
from langchain.schema import RUN_KEY, messages_from_dict
from langchain.schema.messages import BaseMessage, get_buffer_string


def _get_messages_from_run_dict(messages: List[dict]) -> List[BaseMessage]:
    if not messages:
        return []
    first_message = messages[0]
    if "lc" in first_message:
        return [loads(dumps(message)) for message in messages]
    else:
        return messages_from_dict(messages)


[docs]class StringRunMapper(Serializable):
    """Extract items to evaluate from the run object."""

    @property
    def output_keys(self) -> List[str]:
        """The keys to extract from the run."""
        return ["prediction", "input"]

[docs]    @abstractmethod
    def map(self, run: Run) -> Dict[str, str]:
        """Maps the Run to a dictionary."""

[docs]    def __call__(self, run: Run) -> Dict[str, str]:
        """Maps the Run to a dictionary."""
        if not run.outputs:
            raise ValueError(f"Run {run.id} has no outputs to evaluate.")
        return self.map(run)


[docs]class LLMStringRunMapper(StringRunMapper):
    """Extract items to evaluate from the run object."""

[docs]    def serialize_chat_messages(self, messages: List[Dict]) -> str:
        """Extract the input messages from the run."""
        if isinstance(messages, list) and messages:
            if isinstance(messages[0], dict):
                chat_messages = _get_messages_from_run_dict(messages)
            elif isinstance(messages[0], list):
                # Runs from Tracer have messages as a list of lists of dicts
                chat_messages = _get_messages_from_run_dict(messages[0])
            else:
                raise ValueError(f"Could not extract messages to evaluate {messages}")
            return get_buffer_string(chat_messages)
        raise ValueError(f"Could not extract messages to evaluate {messages}")

[docs]    def serialize_inputs(self, inputs: Dict) -> str:
        if "prompts" in inputs:  # Should we even accept this?
            input_ = "\n\n".join(inputs["prompts"])
        elif "prompt" in inputs:
            input_ = inputs["prompt"]
        elif "messages" in inputs:
            input_ = self.serialize_chat_messages(inputs["messages"])
        else:
            raise ValueError("LLM Run must have either messages or prompts as inputs.")
        return input_

[docs]    def serialize_outputs(self, outputs: Dict) -> str:
        if not outputs.get("generations"):
            raise ValueError("Cannot evaluate LLM Run without generations.")
        generations: List[Dict] = outputs["generations"]
        if not generations:
            raise ValueError("Cannot evaluate LLM run with empty generations.")
        first_generation: Dict = generations[0]
        if isinstance(first_generation, list):
            # Runs from Tracer have generations as a list of lists of dicts
            # Whereas Runs from the API have a list of dicts
            first_generation = first_generation[0]
        if "message" in first_generation:
            output_ = self.serialize_chat_messages([first_generation["message"]])
        else:
            output_ = first_generation["text"]
        return output_

[docs]    def map(self, run: Run) -> Dict[str, str]:
        """Maps the Run to a dictionary."""
        if run.run_type != "llm":
            raise ValueError("LLM RunMapper only supports LLM runs.")
        elif not run.outputs:
            if run.error:
                raise ValueError(
                    f"Cannot evaluate errored LLM run {run.id}: {run.error}"
                )
            else:
                raise ValueError(
                    f"Run {run.id} has no outputs. Cannot evaluate this run."
                )
        else:
            try:
                inputs = self.serialize_inputs(run.inputs)
            except Exception as e:
                raise ValueError(
                    f"Could not parse LM input from run inputs {run.inputs}"
                ) from e
            try:
                output_ = self.serialize_outputs(run.outputs)
            except Exception as e:
                raise ValueError(
                    f"Could not parse LM prediction from run outputs {run.outputs}"
                ) from e
            return {"input": inputs, "prediction": output_}


[docs]class ChainStringRunMapper(StringRunMapper):
    """Extract items to evaluate from the run object from a chain."""

    input_key: Optional[str] = None
    """The key from the model Run's inputs to use as the eval input."""
    prediction_key: Optional[str] = None
    """The key from the model Run's outputs to use as the eval prediction."""

    def _get_key(self, source: Dict, key: Optional[str], which: str) -> str:
        if key is not None:
            return source[key]
        elif len(source) == 1:
            return next(iter(source.values()))
        else:
            raise ValueError(
                f"Could not map run {which} with multiple keys: "
                f"{source}\nPlease manually specify a {which}_key"
            )

[docs]    def map(self, run: Run) -> Dict[str, str]:
        """Maps the Run to a dictionary."""
        if not run.outputs:
            raise ValueError(f"Run {run.id} has no outputs to evaluate.")
        if run.run_type != "chain":
            raise ValueError("Chain RunMapper only supports Chain runs.")
        if self.input_key not in run.inputs:
            raise ValueError(f"Run {run.id} does not have input key {self.input_key}.")
        elif self.prediction_key not in run.outputs:
            raise ValueError(
                f"Run {run.id} does not have prediction key {self.prediction_key}."
            )
        else:
            input_ = self._get_key(run.inputs, self.input_key, "input")
            prediction = self._get_key(run.outputs, self.prediction_key, "prediction")
            return {
                "input": input_,
                "prediction": prediction,
            }


[docs]class ToolStringRunMapper(StringRunMapper):
    """Map an input to the tool."""

[docs]    def map(self, run: Run) -> Dict[str, str]:
        if not run.outputs:
            raise ValueError(f"Run {run.id} has no outputs to evaluate.")
        return {"input": run.inputs["input"], "prediction": run.outputs["output"]}


[docs]class StringExampleMapper(Serializable):
    """Map an example, or row in the dataset, to the inputs of an evaluation."""

    reference_key: Optional[str] = None

    @property
    def output_keys(self) -> List[str]:
        """The keys to extract from the run."""
        return ["reference"]

[docs]    def serialize_chat_messages(self, messages: List[Dict]) -> str:
        """Extract the input messages from the run."""
        chat_messages = _get_messages_from_run_dict(messages)
        return get_buffer_string(chat_messages)

[docs]    def map(self, example: Example) -> Dict[str, str]:
        """Maps the Example, or dataset row to a dictionary."""
        if not example.outputs:
            raise ValueError(
                f"Example {example.id} has no outputs to use as a reference."
            )
        if self.reference_key is None:
            if len(example.outputs) > 1:
                raise ValueError(
                    f"Example {example.id} has multiple outputs, so you must"
                    " specify a reference_key."
                )
            else:
                output = list(example.outputs.values())[0]
        elif self.reference_key not in example.outputs:
            raise ValueError(
                f"Example {example.id} does not have reference key"
                f" {self.reference_key}."
            )
        else:
            output = example.outputs[self.reference_key]
        return {
            "reference": self.serialize_chat_messages([output])
            if isinstance(output, dict) and output.get("type") and output.get("data")
            else str(output)
        }

[docs]    def __call__(self, example: Example) -> Dict[str, str]:
        """Maps the Run and Example to a dictionary."""
        if not example.outputs:
            raise ValueError(
                f"Example {example.id} has no outputs to use as areference label."
            )
        return self.map(example)


[docs]class StringRunEvaluatorChain(Chain, RunEvaluator):
    """Evaluate Run and optional examples."""

    run_mapper: StringRunMapper
    """Maps the Run to a dictionary with 'input' and 'prediction' strings."""
    example_mapper: Optional[StringExampleMapper] = None
    """Maps the Example (dataset row) to a dictionary
    with a 'reference' string."""
    name: str
    """The name of the evaluation metric."""
    string_evaluator: StringEvaluator
    """The evaluation chain."""

    @property
    def input_keys(self) -> List[str]:
        return ["run", "example"]

    @property
    def output_keys(self) -> List[str]:
        return ["feedback"]

    def _prepare_input(self, inputs: Dict[str, Any]) -> Dict[str, str]:
        run: Run = inputs["run"]
        example: Optional[Example] = inputs.get("example")
        evaluate_strings_inputs = self.run_mapper(run)
        if not self.string_evaluator.requires_input:
            # Hide warning about unused input
            evaluate_strings_inputs.pop("input", None)
        if example and self.example_mapper and self.string_evaluator.requires_reference:
            evaluate_strings_inputs.update(self.example_mapper(example))
        elif self.string_evaluator.requires_reference:
            raise ValueError(
                f"Evaluator {self.name} requires an reference"
                " example from the dataset,"
                f" but none was provided for run {run.id}."
            )
        return evaluate_strings_inputs

    def _prepare_output(self, output: Dict[str, Any]) -> Dict[str, Any]:
        evaluation_result = EvaluationResult(
            key=self.name, comment=output.get("reasoning"), **output
        )
        if RUN_KEY in output:
            # TODO: Not currently surfaced. Update
            evaluation_result.evaluator_info[RUN_KEY] = output[RUN_KEY]
        return {"feedback": evaluation_result}

    def _call(
        self,
        inputs: Dict[str, str],
        run_manager: Optional[CallbackManagerForChainRun] = None,
    ) -> Dict[str, Any]:
        """Call the evaluation chain."""
        evaluate_strings_inputs = self._prepare_input(inputs)
        _run_manager = run_manager or CallbackManagerForChainRun.get_noop_manager()
        callbacks = _run_manager.get_child()
        chain_output = self.string_evaluator.evaluate_strings(
            **evaluate_strings_inputs,
            callbacks=callbacks,
            include_run_info=True,
        )
        return self._prepare_output(chain_output)

    async def _acall(
        self,
        inputs: Dict[str, str],
        run_manager: AsyncCallbackManagerForChainRun | None = None,
    ) -> Dict[str, Any]:
        """Call the evaluation chain."""
        evaluate_strings_inputs = self._prepare_input(inputs)
        _run_manager = run_manager or AsyncCallbackManagerForChainRun.get_noop_manager()
        callbacks = _run_manager.get_child()
        chain_output = await self.string_evaluator.aevaluate_strings(
            **evaluate_strings_inputs,
            callbacks=callbacks,
            include_run_info=True,
        )
        return self._prepare_output(chain_output)

    def _prepare_evaluator_output(self, output: Dict[str, Any]) -> EvaluationResult:
        feedback: EvaluationResult = output["feedback"]
        if RUN_KEY not in feedback.evaluator_info:
            feedback.evaluator_info[RUN_KEY] = output[RUN_KEY]
        return feedback

[docs]    def evaluate_run(
        self, run: Run, example: Optional[Example] = None
    ) -> EvaluationResult:
        """Evaluate an example."""
        result = self({"run": run, "example": example}, include_run_info=True)
        return self._prepare_evaluator_output(result)

[docs]    async def aevaluate_run(
        self, run: Run, example: Optional[Example] = None
    ) -> EvaluationResult:
        """Evaluate an example."""
        result = await self.acall(
            {"run": run, "example": example}, include_run_info=True
        )
        return self._prepare_evaluator_output(result)

[docs]    @classmethod
    def from_run_and_data_type(
        cls,
        evaluator: StringEvaluator,
        run_type: str,
        data_type: DataType,
        input_key: Optional[str] = None,
        prediction_key: Optional[str] = None,
        reference_key: Optional[str] = None,
        tags: Optional[List[str]] = None,
    ) -> StringRunEvaluatorChain:
        """
        Create a StringRunEvaluatorChain from an evaluator and the run and dataset types.

        This method provides an easy way to instantiate a StringRunEvaluatorChain, by
        taking an evaluator and information about the type of run and the data.
        The method supports LLM and chain runs.

        Args:
            evaluator (StringEvaluator): The string evaluator to use.
            run_type (str): The type of run being evaluated.
                Supported types are LLM and Chain.
            data_type (DataType): The type of dataset used in the run.
            input_key (str, optional): The key used to map the input from the run.
            prediction_key (str, optional): The key used to map the prediction from the run.
            reference_key (str, optional): The key used to map the reference from the dataset.
            tags (List[str], optional): List of tags to attach to the evaluation chain.

        Returns:
            StringRunEvaluatorChain: The instantiated evaluation chain.

        Raises:
            ValueError: If the run type is not supported, or if the evaluator requires a
                reference from the dataset but the reference key is not provided.

        """  # noqa: E501

        # Configure how run inputs/predictions are passed to the evaluator
        if run_type == "llm":
            run_mapper: StringRunMapper = LLMStringRunMapper()
        elif run_type == "chain":
            run_mapper = ChainStringRunMapper(
                input_key=input_key, prediction_key=prediction_key
            )
        else:
            raise ValueError(
                f"Unsupported run type {run_type}. Expected one of 'llm' or 'chain'."
            )

        # Configure how example rows are fed as a reference string to the evaluator
        if reference_key is not None or data_type in (DataType.llm, DataType.chat):
            example_mapper = StringExampleMapper(reference_key=reference_key)
        elif evaluator.requires_reference:
            raise ValueError(
                f"Evaluator {evaluator.evaluation_name} requires a reference"
                " example from the dataset. Please specify the reference key from"
                " amongst the dataset outputs keys."
            )
        else:
            example_mapper = None
        return cls(
            name=evaluator.evaluation_name,
            run_mapper=run_mapper,
            example_mapper=example_mapper,
            string_evaluator=evaluator,
            tags=tags,
        )