Commit 37b6212

will comments

isahers1 committed Nov 12, 2024
1 parent 8a35c07 commit 37b6212

Showing 2 changed files with 177 additions and 30 deletions.
119 changes: 93 additions & 26 deletions python/langsmith/evaluation/llm_evaluator.py
@@ -1,7 +1,8 @@
"""Contains the LLMEvaluator class for building LLM-as-a-judge evaluators."""

import warnings
from typing import Any, Callable, Dict, List, Optional, Tuple, Union, cast
from uuid import uuid4
from uuid import UUID, uuid4

from pydantic import BaseModel

@@ -19,16 +20,47 @@ class CategoricalScoreConfig(BaseModel):
description (str): Detailed description provided to the LLM judge of what
this score evaluates.
reasoning_key (Optional[str]): Key used to store the reasoning/explanation
for the score. Defaults to None.
for the score. Defaults to None, which means no reasoning field is included in the LLM-judge output.
reasoning_description (Optional[str]): Description provided to the LLM judge
of what should be included in the reasoning. Defaults to None.
"""
If None but reasoning_key is set, this defaults to "Think step-by-step about what the correct score should be."
""" # noqa: E501

key: str
choices: List[str]
description: str
reasoning_key: Optional[str] = None
reasoning_description: Optional[str] = None
include_explanation: bool = False # Deprecated
explanation_description: Optional[str] = None # Deprecated

def __init__(self, **data):
"""Initialize CategoricalScoreConfig."""
if data.get("include_explanation") and data.get("reasoning_key"):
raise ValueError(
"Cannot include both include_explanation and reasoning_key, "
"please just use reasoning_key - include_explanation has been deprecated" # noqa: E501
)
if data.get("explanation_description") and data.get("reasoning_description"):
raise ValueError(
"Cannot include both explanation_description and reasoning_description, " # noqa: E501
"please just use reasoning_description - explanation_description has been deprecated" # noqa: E501
)
if data.get("include_explanation"):
warnings.warn(
"'include_explanation' is deprecated. Use 'reasoning_key=\"explanation\"' instead.", # noqa: E501
DeprecationWarning,
)
data["reasoning_key"] = "explanation"

if "explanation_description" in data:
warnings.warn(
"'explanation_description' is deprecated. Use 'reasoning_description' instead.", # noqa: E501
DeprecationWarning,
)
data["reasoning_description"] = data["explanation_description"]

super().__init__(**data)


class ContinuousScoreConfig(BaseModel):
@@ -41,17 +73,50 @@ class ContinuousScoreConfig(BaseModel):
description (str): Detailed description provided to the LLM judge of what
this score evaluates.
reasoning_key (Optional[str]): Key used to store the reasoning/explanation
for the score. Defaults to None.
for the score. Defaults to None, which means no reasoning field is included in the LLM-judge output.
reasoning_description (Optional[str]): Description provided to the LLM judge
of what should be included in the reasoning. Defaults to None.
"""
If None but reasoning_key is set, this defaults to "Think step-by-step about what the correct score should be."
""" # noqa: E501

key: str
min: float = 0
max: float = 1
description: str
reasoning_key: Optional[str] = None
reasoning_description: Optional[str] = None
include_explanation: bool = False # Deprecated
explanation_description: Optional[str] = None # Deprecated

def __init__(self, **data):
"""Initialize ContinuousScoreConfig."""
if data.get("include_explanation") and data.get("reasoning_key"):
raise ValueError(
"Cannot include both include_explanation and reasoning_key, "
"please just use reasoning_key - include_explanation has been deprecated" # noqa: E501
)
if data.get("explanation_description") and data.get("reasoning_description"):
raise ValueError(
"Cannot include both explanation_description and reasoning_description, " # noqa: E501
"please just use reasoning_description - explanation_description has been deprecated" # noqa: E501
)
if data.get("include_explanation"):
warnings.warn(
"'include_explanation' is deprecated. Use \
'reasoning_key=\"explanation\"' instead.",
DeprecationWarning,
)
data["reasoning_key"] = "explanation"

if "explanation_description" in data:
warnings.warn(
"'explanation_description' is deprecated. \
Use 'reasoning_description' instead.",
DeprecationWarning,
)
data["reasoning_description"] = data["explanation_description"]

super().__init__(**data)


def _create_score_json_schema(
@@ -63,17 +128,17 @@ def _create_score_json_schema(
properties[score_config.reasoning_key] = {
"type": "string",
"description": (
"The explanation for the score."
"Think step-by-step about what the correct score should be."
if score_config.reasoning_description is None
else score_config.reasoning_description
),
}

if isinstance(score_config, CategoricalScoreConfig):
properties["value"] = {
properties["category"] = {
"type": "string",
"enum": score_config.choices,
"description": f"The score for the evaluation, one of "
"description": f"The selected catgory for the evaluation, one of "
f"{', '.join(score_config.choices)}.",
}
elif isinstance(score_config, ContinuousScoreConfig):
@@ -87,25 +152,27 @@ def _create_score_json_schema(
else:
raise ValueError("Invalid score type. Must be 'categorical' or 'continuous'")

required_keys = (
[
(
"category"
if isinstance(score_config, CategoricalScoreConfig)
else "score"
),
score_config.reasoning_key,
]
if score_config.reasoning_key
else [
"category" if isinstance(score_config, CategoricalScoreConfig) else "score"
]
)

return {
"title": score_config.key,
"description": score_config.description,
"type": "object",
"properties": properties,
"required": (
[
(
"value"
if isinstance(score_config, CategoricalScoreConfig)
else "score"
),
score_config.reasoning_key,
]
if score_config.reasoning_key
else [
"value" if isinstance(score_config, CategoricalScoreConfig) else "score"
]
),
"required": required_keys,
}


@@ -247,7 +314,7 @@ def evaluate_run(
dict,
self.runnable.invoke(variables, config={"run_id": source_run_id}),
)
return self._parse_output(output, str(source_run_id))
return self._parse_output(output, source_run_id)

@warn_beta
async def aevaluate_run(
@@ -261,7 +328,7 @@ async def aevaluate_run(
await self.runnable.ainvoke(variables, config={"run_id": source_run_id}),
)

return self._parse_output(output, str(source_run_id))
return self._parse_output(output, source_run_id)

def _prepare_variables(self, run: Run, example: Optional[Example]) -> dict:
"""Prepare variables for model invocation."""
@@ -321,11 +388,11 @@ def _prepare_variables(self, run: Run, example: Optional[Example]) -> dict:
return variables

def _parse_output(
self, output: dict, source_run_id: str
self, output: dict, source_run_id: UUID
) -> Union[EvaluationResult, EvaluationResults]:
"""Parse the model output into an evaluation result."""
if isinstance(self.score_config, CategoricalScoreConfig):
value = output["value"]
value = output["category"]
explanation = output.get(self.score_config.reasoning_key, None)
return EvaluationResult(
key=self.score_config.key,
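For context, a minimal usage sketch of the new reasoning_key option and the deprecation shim. This is not part of the commit; it assumes CategoricalScoreConfig is importable from langsmith.evaluation.llm_evaluator and behaves as shown in the diff above, with the "vagueness" key, Y/N choices, and description taken from the test file below.

# Illustrative sketch only; mirrors the behavior added above.
import warnings

from langsmith.evaluation.llm_evaluator import CategoricalScoreConfig

# New-style config: ask the judge to explain its choice under an "explanation" field.
config = CategoricalScoreConfig(
    key="vagueness",
    choices=["Y", "N"],
    description="Whether the response is vague. Y for yes, N for no.",
    reasoning_key="explanation",
)
assert config.reasoning_key == "explanation"

# Legacy config: include_explanation is mapped to reasoning_key="explanation"
# and a DeprecationWarning is emitted.
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    legacy = CategoricalScoreConfig(
        key="vagueness",
        choices=["Y", "N"],
        description="Whether the response is vague. Y for yes, N for no.",
        include_explanation=True,
    )
assert legacy.reasoning_key == "explanation"
assert any(issubclass(w.category, DeprecationWarning) for w in caught)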
88 changes: 84 additions & 4 deletions python/tests/integration_tests/test_llm_evaluator.py
@@ -25,17 +25,17 @@ def test_llm_evaluator_init() -> None:
"description": "Whether the response is vague. Y for yes, N for no.",
"type": "object",
"properties": {
"value": {
"category": {
"type": "string",
"enum": ["Y", "N"],
"description": "The score for the evaluation, one of Y, N.",
"description": "The selected catgory for the evaluation, one of Y, N.",
},
"explanation": {
"type": "string",
"description": "The explanation for the score.",
"description": "Think step-by-step about what the correct score should be.", # noqa: E501
},
},
"required": ["value", "explanation"],
"required": ["category", "explanation"],
}

# Try a continuous score
@@ -196,3 +196,83 @@ async def apredict(inputs: dict) -> dict:
evaluators=[reference_accuracy, accuracy],
experiment_prefix=__name__ + "::test_evaluate.aevaluate",
)


@pytest.mark.parametrize(
"config_class", [CategoricalScoreConfig, ContinuousScoreConfig]
)
def test_backwards_compatibility(config_class) -> None:
# Test include_explanation deprecation
with pytest.warns(DeprecationWarning, match="include_explanation.*reasoning_key"):
config = config_class(
key="test",
description="test description",
include_explanation=True,
**(
{"choices": ["Y", "N"]}
if config_class == CategoricalScoreConfig
else {}
),
)
assert config.reasoning_key == "explanation"

# Test explanation_description deprecation
with pytest.warns(
DeprecationWarning, match="explanation_description.*reasoning_description"
):
config = config_class(
key="test",
description="test description",
explanation_description="test explanation",
**(
{"choices": ["Y", "N"]}
if config_class == CategoricalScoreConfig
else {}
),
)
assert config.reasoning_description == "test explanation"

# Test both deprecated fields together
with pytest.warns(DeprecationWarning) as warnings:
config = config_class(
key="test",
description="test description",
include_explanation=True,
explanation_description="test explanation",
**(
{"choices": ["Y", "N"]}
if config_class == CategoricalScoreConfig
else {}
),
)
assert len(warnings) == 2 # Should show both deprecation warnings
assert config.reasoning_key == "explanation"
assert config.reasoning_description == "test explanation"

with pytest.raises(ValueError):
config = config_class(
key="test",
description="test description",
reasoning_key="custom_key",
reasoning_description="custom description",
explanation_description="old description",
**(
{"choices": ["Y", "N"]}
if config_class == CategoricalScoreConfig
else {}
),
)

with pytest.raises(ValueError):
config = config_class(
key="test",
description="test description",
reasoning_key="custom_key",
include_explanation=True,
reasoning_description="custom description",
**(
{"choices": ["Y", "N"]}
if config_class == CategoricalScoreConfig
else {}
),
)
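
For reference, a short sketch of what a conforming judge response looks like under the renamed schema and how _parse_output reads it. This is not part of the commit; the field names follow the schema assertions in test_llm_evaluator_init and the _parse_output change in the first file, and the example explanation text is invented.

# Raw structured output from the judge for the "vagueness" config above.
judge_output = {
    "category": "N",  # previously returned under the "value" key
    "explanation": "The answer cites concrete figures rather than hedging.",
}

# _parse_output now reads the selected choice from output["category"] and the
# reasoning from output[reasoning_key]; before this change it read output["value"].
selected = judge_output["category"]
reasoning = judge_output.get("explanation")
assert selected == "N" and reasoning is not None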
