diff --git a/python/langsmith/evaluation/llm_evaluator.py b/python/langsmith/evaluation/llm_evaluator.py
index b9fcbefe..b4c557e1 100644
--- a/python/langsmith/evaluation/llm_evaluator.py
+++ b/python/langsmith/evaluation/llm_evaluator.py
@@ -1,7 +1,8 @@
 """Contains the LLMEvaluator class for building LLM-as-a-judge evaluators."""
 
+import warnings
 from typing import Any, Callable, Dict, List, Optional, Tuple, Union, cast
-from uuid import uuid4
+from uuid import UUID, uuid4
 
 from pydantic import BaseModel
 
@@ -19,16 +20,47 @@ class CategoricalScoreConfig(BaseModel):
         description (str): Detailed description provided to the LLM judge of
             what this score evaluates.
         reasoning_key (Optional[str]): Key used to store the reasoning/explanation
-            for the score. Defaults to None.
+            for the score. Defaults to None, in which case no reasoning field is included in the LLM judge's output.
         reasoning_description (Optional[str]): Description provided to the LLM judge
             of what should be included in the reasoning. Defaults to None.
-    """
+            If None but reasoning_key is set, this defaults to "Think step-by-step about what the correct score should be."
+    """  # noqa: E501
 
     key: str
     choices: List[str]
     description: str
     reasoning_key: Optional[str] = None
     reasoning_description: Optional[str] = None
+    include_explanation: bool = False  # Deprecated
+    explanation_description: Optional[str] = None  # Deprecated
+
+    def __init__(self, **data):
+        """Initialize CategoricalScoreConfig."""
+        if data.get("include_explanation") and data.get("reasoning_key"):
+            raise ValueError(
+                "Cannot include both include_explanation and reasoning_key, "
+                "please just use reasoning_key - include_explanation has been deprecated"  # noqa: E501
+            )
+        if data.get("explanation_description") and data.get("reasoning_description"):
+            raise ValueError(
+                "Cannot include both explanation_description and reasoning_description, "  # noqa: E501
+                "please just use reasoning_description - explanation_description has been deprecated"  # noqa: E501
+            )
+        if data.get("include_explanation"):
+            warnings.warn(
+                "'include_explanation' is deprecated. Use 'reasoning_key=\"explanation\"' instead.",  # noqa: E501
+                DeprecationWarning,
+            )
+            data["reasoning_key"] = "explanation"
+
+        if "explanation_description" in data:
+            warnings.warn(
+                "'explanation_description' is deprecated. Use 'reasoning_description' instead.",  # noqa: E501
+                DeprecationWarning,
+            )
+            data["reasoning_description"] = data["explanation_description"]
+
+        super().__init__(**data)
 
 
 class ContinuousScoreConfig(BaseModel):
@@ -41,10 +73,11 @@ class ContinuousScoreConfig(BaseModel):
         description (str): Detailed description provided to the LLM judge of
             what this score evaluates.
         reasoning_key (Optional[str]): Key used to store the reasoning/explanation
-            for the score. Defaults to None.
+            for the score. Defaults to None, in which case no reasoning field is included in the LLM judge's output.
         reasoning_description (Optional[str]): Description provided to the LLM judge
             of what should be included in the reasoning. Defaults to None.
-    """
+            If None but reasoning_key is set, this defaults to "Think step-by-step about what the correct score should be."
+ """ # noqa: E501 key: str min: float = 0 @@ -52,6 +85,38 @@ class ContinuousScoreConfig(BaseModel): description: str reasoning_key: Optional[str] = None reasoning_description: Optional[str] = None + include_explanation: bool = False # Deprecated + explanation_description: Optional[str] = None # Deprecated + + def __init__(self, **data): + """Initialize ContinuousScoreConfig.""" + if data.get("include_explanation") and data.get("reasoning_key"): + raise ValueError( + "Cannot include both include_explanation and reasoning_key, " + "please just use reasoning_key - include_explanation has been deprecated" # noqa: E501 + ) + if data.get("explanation_description") and data.get("reasoning_description"): + raise ValueError( + "Cannot include both explanation_description and reasoning_description, " # noqa: E501 + "please just use reasoning_description - explanation_description has been deprecated" # noqa: E501 + ) + if data.get("include_explanation"): + warnings.warn( + "'include_explanation' is deprecated. Use \ + 'reasoning_key=\"explanation\"' instead.", + DeprecationWarning, + ) + data["reasoning_key"] = "explanation" + + if "explanation_description" in data: + warnings.warn( + "'explanation_description' is deprecated. \ + Use 'reasoning_description' instead.", + DeprecationWarning, + ) + data["reasoning_description"] = data["explanation_description"] + + super().__init__(**data) def _create_score_json_schema( @@ -63,17 +128,17 @@ def _create_score_json_schema( properties[score_config.reasoning_key] = { "type": "string", "description": ( - "The explanation for the score." + "Think step-by-step about what the correct score should be." if score_config.reasoning_description is None else score_config.reasoning_description ), } if isinstance(score_config, CategoricalScoreConfig): - properties["value"] = { + properties["category"] = { "type": "string", "enum": score_config.choices, - "description": f"The score for the evaluation, one of " + "description": f"The selected catgory for the evaluation, one of " f"{', '.join(score_config.choices)}.", } elif isinstance(score_config, ContinuousScoreConfig): @@ -87,25 +152,27 @@ def _create_score_json_schema( else: raise ValueError("Invalid score type. 
+    required_keys = (
+        [
+            (
+                "category"
+                if isinstance(score_config, CategoricalScoreConfig)
+                else "score"
+            ),
+            score_config.reasoning_key,
+        ]
+        if score_config.reasoning_key
+        else [
+            "category" if isinstance(score_config, CategoricalScoreConfig) else "score"
+        ]
+    )
+
     return {
         "title": score_config.key,
         "description": score_config.description,
         "type": "object",
         "properties": properties,
-        "required": (
-            [
-                (
-                    "value"
-                    if isinstance(score_config, CategoricalScoreConfig)
-                    else "score"
-                ),
-                score_config.reasoning_key,
-            ]
-            if score_config.reasoning_key
-            else [
-                "value" if isinstance(score_config, CategoricalScoreConfig) else "score"
-            ]
-        ),
+        "required": required_keys,
     }
@@ -247,7 +314,7 @@ def evaluate_run(
             dict,
             self.runnable.invoke(variables, config={"run_id": source_run_id}),
         )
-        return self._parse_output(output, str(source_run_id))
+        return self._parse_output(output, source_run_id)
 
     @warn_beta
     async def aevaluate_run(
@@ -261,7 +328,7 @@ async def aevaluate_run(
             await self.runnable.ainvoke(variables, config={"run_id": source_run_id}),
         )
 
-        return self._parse_output(output, str(source_run_id))
+        return self._parse_output(output, source_run_id)
 
     def _prepare_variables(self, run: Run, example: Optional[Example]) -> dict:
         """Prepare variables for model invocation."""
@@ -321,11 +388,11 @@ def _prepare_variables(self, run: Run, example: Optional[Example]) -> dict:
         return variables
 
     def _parse_output(
-        self, output: dict, source_run_id: str
+        self, output: dict, source_run_id: UUID
     ) -> Union[EvaluationResult, EvaluationResults]:
         """Parse the model output into an evaluation result."""
         if isinstance(self.score_config, CategoricalScoreConfig):
-            value = output["value"]
+            value = output["category"]
             explanation = output.get(self.score_config.reasoning_key, None)
             return EvaluationResult(
                 key=self.score_config.key,
diff --git a/python/tests/integration_tests/test_llm_evaluator.py b/python/tests/integration_tests/test_llm_evaluator.py
index a1739d60..9425ded0 100644
--- a/python/tests/integration_tests/test_llm_evaluator.py
+++ b/python/tests/integration_tests/test_llm_evaluator.py
@@ -25,17 +25,17 @@ def test_llm_evaluator_init() -> None:
         "description": "Whether the response is vague. Y for yes, N for no.",
         "type": "object",
         "properties": {
-            "value": {
+            "category": {
                 "type": "string",
                 "enum": ["Y", "N"],
-                "description": "The score for the evaluation, one of Y, N.",
+                "description": "The selected category for the evaluation, one of Y, N.",
             },
             "explanation": {
                 "type": "string",
-                "description": "The explanation for the score.",
+                "description": "Think step-by-step about what the correct score should be.",  # noqa: E501
             },
         },
-        "required": ["value", "explanation"],
+        "required": ["category", "explanation"],
     }
 
     # Try a continuous score
@@ -196,3 +196,83 @@ async def apredict(inputs: dict) -> dict:
         evaluators=[reference_accuracy, accuracy],
         experiment_prefix=__name__ + "::test_evaluate.aevaluate",
     )
+
+
+@pytest.mark.parametrize(
+    "config_class", [CategoricalScoreConfig, ContinuousScoreConfig]
+)
+def test_backwards_compatibility(config_class) -> None:
+    # Test include_explanation deprecation
+    with pytest.warns(DeprecationWarning, match="include_explanation.*reasoning_key"):
+        config = config_class(
+            key="test",
+            description="test description",
+            include_explanation=True,
+            **(
+                {"choices": ["Y", "N"]}
+                if config_class == CategoricalScoreConfig
+                else {}
+            ),
+        )
+    assert config.reasoning_key == "explanation"
+
+    # Test explanation_description deprecation
+    with pytest.warns(
+        DeprecationWarning, match="explanation_description.*reasoning_description"
+    ):
+        config = config_class(
+            key="test",
+            description="test description",
+            explanation_description="test explanation",
+            **(
+                {"choices": ["Y", "N"]}
+                if config_class == CategoricalScoreConfig
+                else {}
+            ),
+        )
+    assert config.reasoning_description == "test explanation"
+
+    # Test both deprecated fields together
+    with pytest.warns(DeprecationWarning) as warnings:
+        config = config_class(
+            key="test",
+            description="test description",
+            include_explanation=True,
+            explanation_description="test explanation",
+            **(
+                {"choices": ["Y", "N"]}
+                if config_class == CategoricalScoreConfig
+                else {}
+            ),
+        )
+    assert len(warnings) == 2  # Should show both deprecation warnings
+    assert config.reasoning_key == "explanation"
+    assert config.reasoning_description == "test explanation"
+
+    with pytest.raises(ValueError):
+        config = config_class(
+            key="test",
+            description="test description",
+            reasoning_key="custom_key",
+            reasoning_description="custom description",
+            explanation_description="old description",
+            **(
+                {"choices": ["Y", "N"]}
+                if config_class == CategoricalScoreConfig
+                else {}
+            ),
+        )
+
+    with pytest.raises(ValueError):
+        config = config_class(
+            key="test",
+            description="test description",
+            reasoning_key="custom_key",
+            include_explanation=True,
+            reasoning_description="custom description",
+            **(
+                {"choices": ["Y", "N"]}
+                if config_class == CategoricalScoreConfig
+                else {}
+            ),
+        )
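Not part of the patch itself, but a minimal migration sketch for reviewers, mirroring the behaviour exercised in `test_backwards_compatibility`. The score key, choices, and description strings are illustrative only; the import path is the module touched above.

```python
import warnings

from langsmith.evaluation.llm_evaluator import CategoricalScoreConfig

# New style: opt into a reasoning field by naming the key it should be stored
# under, and (optionally) describing what the judge should put there.
config = CategoricalScoreConfig(
    key="vagueness",
    choices=["Y", "N"],
    description="Whether the response is vague. Y for yes, N for no.",
    reasoning_key="reasoning",
    reasoning_description="Consider how specific the response is before scoring.",
)

# Legacy style: still accepted, but each deprecated kwarg emits a
# DeprecationWarning and is remapped onto the new fields.
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    legacy = CategoricalScoreConfig(
        key="vagueness",
        choices=["Y", "N"],
        description="Whether the response is vague. Y for yes, N for no.",
        include_explanation=True,
        explanation_description="Explain the score.",
    )

assert legacy.reasoning_key == "explanation"
assert legacy.reasoning_description == "Explain the score."
assert all(w.category is DeprecationWarning for w in caught)
```

With a reasoning key configured, the judge's structured output carries the selected label under "category" (previously "value") alongside the reasoning field, which is why the expected schema in `test_llm_evaluator_init` and `_parse_output` change together, and `_parse_output` now receives the source run id as a `UUID` rather than a string.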