Commit 37b6212

will comments

isahers1 committed Nov 12, 2024
1 parent 8a35c07 commit 37b6212

Showing 2 changed files with 177 additions and 30 deletions.
119 changes: 93 additions & 26 deletions python/langsmith/evaluation/llm_evaluator.py
@@ -1,7 +1,8 @@
"""Contains the LLMEvaluator class for building LLM-as-a-judge evaluators."""

import warnings
from typing import Any, Callable, Dict, List, Optional, Tuple, Union, cast
from uuid import uuid4
from uuid import UUID, uuid4

from pydantic import BaseModel

@@ -19,16 +20,47 @@ class CategoricalScoreConfig(BaseModel):
description (str): Detailed description provided to the LLM judge of what
this score evaluates.
reasoning_key (Optional[str]): Key used to store the reasoning/explanation
for the score. Defaults to None.
for the score. Defaults to None, which means no reasoning field is included in the LLM-judge output.
reasoning_description (Optional[str]): Description provided to the LLM judge
of what should be included in the reasoning. Defaults to None.
"""
If None but reasoning_key is set, this defaults to "Think step-by-step about what the correct score should be."
""" # noqa: E501

key: str
choices: List[str]
description: str
reasoning_key: Optional[str] = None
reasoning_description: Optional[str] = None
include_explanation: bool = False # Deprecated
explanation_description: Optional[str] = None # Deprecated

def __init__(self, **data):
"""Initialize CategoricalScoreConfig."""
if data.get("include_explanation") and data.get("reasoning_key"):
raise ValueError(
"Cannot include both include_explanation and reasoning_key, "
"please just use reasoning_key - include_explanation has been deprecated" # noqa: E501
)
if data.get("explanation_description") and data.get("reasoning_description"):
raise ValueError(
"Cannot include both explanation_description and reasoning_description, " # noqa: E501
"please just use reasoning_description - explanation_description has been deprecated" # noqa: E501
)
if data.get("include_explanation"):
warnings.warn(
"'include_explanation' is deprecated. Use 'reasoning_key=\"explanation\"' instead.", # noqa: E501
DeprecationWarning,
)
data["reasoning_key"] = "explanation"

if "explanation_description" in data:
warnings.warn(
"'explanation_description' is deprecated. Use 'reasoning_description' instead.", # noqa: E501
DeprecationWarning,
)
data["reasoning_description"] = data["explanation_description"]

super().__init__(**data)


class ContinuousScoreConfig(BaseModel):
@@ -41,17 +73,50 @@ class ContinuousScoreConfig(BaseModel):
description (str): Detailed description provided to the LLM judge of what
this score evaluates.
reasoning_key (Optional[str]): Key used to store the reasoning/explanation
for the score. Defaults to None.
for the score. Defaults to None, which means no reasoning field is included in the LLM-judge output.
reasoning_description (Optional[str]): Description provided to the LLM judge
of what should be included in the reasoning. Defaults to None.
"""
If None but reasoning_key is set, this defaults to "Think step-by-step about what the correct score should be."
""" # noqa: E501

key: str
min: float = 0
max: float = 1
description: str
reasoning_key: Optional[str] = None
reasoning_description: Optional[str] = None
include_explanation: bool = False # Deprecated
explanation_description: Optional[str] = None # Deprecated

def __init__(self, **data):
"""Initialize ContinuousScoreConfig."""
if data.get("include_explanation") and data.get("reasoning_key"):
raise ValueError(
"Cannot include both include_explanation and reasoning_key, "
"please just use reasoning_key - include_explanation has been deprecated" # noqa: E501
)
if data.get("explanation_description") and data.get("reasoning_description"):
raise ValueError(
"Cannot include both explanation_description and reasoning_description, " # noqa: E501
"please just use reasoning_description - explanation_description has been deprecated" # noqa: E501
)
if data.get("include_explanation"):
warnings.warn(
"'include_explanation' is deprecated. Use \
'reasoning_key=\"explanation\"' instead.",
DeprecationWarning,
)
data["reasoning_key"] = "explanation"

if "explanation_description" in data:
warnings.warn(
"'explanation_description' is deprecated. \
Use 'reasoning_description' instead.",
DeprecationWarning,
)
data["reasoning_description"] = data["explanation_description"]

super().__init__(**data)


def _create_score_json_schema(
@@ -63,17 +128,17 @@ def _create_score_json_schema(
properties[score_config.reasoning_key] = {
"type": "string",
"description": (
"The explanation for the score."
"Think step-by-step about what the correct score should be."
if score_config.reasoning_description is None
else score_config.reasoning_description
),
}

if isinstance(score_config, CategoricalScoreConfig):
properties["value"] = {
properties["category"] = {
"type": "string",
"enum": score_config.choices,
"description": f"The score for the evaluation, one of "
"description": f"The selected catgory for the evaluation, one of "
f"{', '.join(score_config.choices)}.",
}
elif isinstance(score_config, ContinuousScoreConfig):
@@ -87,25 +152,27 @@ def _create_score_json_schema(
else:
raise ValueError("Invalid score type. Must be 'categorical' or 'continuous'")

required_keys = (
[
(
"category"
if isinstance(score_config, CategoricalScoreConfig)
else "score"
),
score_config.reasoning_key,
]
if score_config.reasoning_key
else [
"category" if isinstance(score_config, CategoricalScoreConfig) else "score"
]
)

return {
"title": score_config.key,
"description": score_config.description,
"type": "object",
"properties": properties,
"required": (
[
(
"value"
if isinstance(score_config, CategoricalScoreConfig)
else "score"
),
score_config.reasoning_key,
]
if score_config.reasoning_key
else [
"value" if isinstance(score_config, CategoricalScoreConfig) else "score"
]
),
"required": required_keys,
}


@@ -247,7 +314,7 @@ def evaluate_run(
dict,
self.runnable.invoke(variables, config={"run_id": source_run_id}),
)
return self._parse_output(output, str(source_run_id))
return self._parse_output(output, source_run_id)

@warn_beta
async def aevaluate_run(
@@ -261,7 +328,7 @@ async def aevaluate_run(
await self.runnable.ainvoke(variables, config={"run_id": source_run_id}),
)

return self._parse_output(output, str(source_run_id))
return self._parse_output(output, source_run_id)

def _prepare_variables(self, run: Run, example: Optional[Example]) -> dict:
"""Prepare variables for model invocation."""
@@ -321,11 +388,11 @@ def _prepare_variables(self, run: Run, example: Optional[Example]) -> dict:
return variables

def _parse_output(
self, output: dict, source_run_id: str
self, output: dict, source_run_id: UUID
) -> Union[EvaluationResult, EvaluationResults]:
"""Parse the model output into an evaluation result."""
if isinstance(self.score_config, CategoricalScoreConfig):
value = output["value"]
value = output["category"]
explanation = output.get(self.score_config.reasoning_key, None)
return EvaluationResult(
key=self.score_config.key,
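For context, a minimal usage sketch of the new reasoning_key option and the deprecation shim. This is not part of the commit; it assumes CategoricalScoreConfig is importable from langsmith.evaluation.llm_evaluator and behaves as shown in the diff above, with the "vagueness" key, Y/N choices, and description taken from the test file below.

# Illustrative sketch only; mirrors the behavior added above.
import warnings

from langsmith.evaluation.llm_evaluator import CategoricalScoreConfig

# New-style config: ask the judge to explain its choice under an "explanation" field.
config = CategoricalScoreConfig(
    key="vagueness",
    choices=["Y", "N"],
    description="Whether the response is vague. Y for yes, N for no.",
    reasoning_key="explanation",
)
assert config.reasoning_key == "explanation"

# Legacy config: include_explanation is mapped to reasoning_key="explanation"
# and a DeprecationWarning is emitted.
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    legacy = CategoricalScoreConfig(
        key="vagueness",
        choices=["Y", "N"],
        description="Whether the response is vague. Y for yes, N for no.",
        include_explanation=True,
    )
assert legacy.reasoning_key == "explanation"
assert any(issubclass(w.category, DeprecationWarning) for w in caught)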
88 changes: 84 additions & 4 deletions python/tests/integration_tests/test_llm_evaluator.py
@@ -25,17 +25,17 @@ def test_llm_evaluator_init() -> None:
"description": "Whether the response is vague. Y for yes, N for no.",
"type": "object",
"properties": {
"value": {
"category": {
"type": "string",
"enum": ["Y", "N"],
"description": "The score for the evaluation, one of Y, N.",
"description": "The selected catgory for the evaluation, one of Y, N.",
},
"explanation": {
"type": "string",
"description": "The explanation for the score.",
"description": "Think step-by-step about what the correct score should be.", # noqa: E501
},
},
"required": ["value", "explanation"],
"required": ["category", "explanation"],
}

# Try a continuous score
@@ -196,3 +196,83 @@ async def apredict(inputs: dict) -> dict:
evaluators=[reference_accuracy, accuracy],
experiment_prefix=__name__ + "::test_evaluate.aevaluate",
)


@pytest.mark.parametrize(
"config_class", [CategoricalScoreConfig, ContinuousScoreConfig]
)
def test_backwards_compatibility(config_class) -> None:
# Test include_explanation deprecation
with pytest.warns(DeprecationWarning, match="include_explanation.*reasoning_key"):
config = config_class(
key="test",
description="test description",
include_explanation=True,
**(
{"choices": ["Y", "N"]}
if config_class == CategoricalScoreConfig
else {}
),
)
assert config.reasoning_key == "explanation"

# Test explanation_description deprecation
with pytest.warns(
DeprecationWarning, match="explanation_description.*reasoning_description"
):
config = config_class(
key="test",
description="test description",
explanation_description="test explanation",
**(
{"choices": ["Y", "N"]}
if config_class == CategoricalScoreConfig
else {}
),
)
assert config.reasoning_description == "test explanation"

# Test both deprecated fields together
with pytest.warns(DeprecationWarning) as warnings:
config = config_class(
key="test",
description="test description",
include_explanation=True,
explanation_description="test explanation",
**(
{"choices": ["Y", "N"]}
if config_class == CategoricalScoreConfig
else {}
),
)
assert len(warnings) == 2 # Should show both deprecation warnings
assert config.reasoning_key == "explanation"
assert config.reasoning_description == "test explanation"

with pytest.raises(ValueError):
config = config_class(
key="test",
description="test description",
reasoning_key="custom_key",
reasoning_description="custom description",
explanation_description="old description",
**(
{"choices": ["Y", "N"]}
if config_class == CategoricalScoreConfig
else {}
),
)

with pytest.raises(ValueError):
config = config_class(
key="test",
description="test description",
reasoning_key="custom_key",
include_explanation=True,
reasoning_description="custom description",
**(
{"choices": ["Y", "N"]}
if config_class == CategoricalScoreConfig
else {}
),
)
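
For reference, a short sketch of what a conforming judge response looks like under the renamed schema and how _parse_output reads it. This is not part of the commit; the field names follow the schema assertions in test_llm_evaluator_init and the _parse_output change in the first file, and the example explanation text is invented.

# Raw structured output from the judge for the "vagueness" config above.
judge_output = {
    "category": "N",  # previously returned under the "value" key
    "explanation": "The answer cites concrete figures rather than hedging.",
}

# _parse_output now reads the selected choice from output["category"] and the
# reasoning from output[reasoning_key]; before this change it read output["value"].
selected = judge_output["category"]
reasoning = judge_output.get("explanation")
assert selected == "N" and reasoning is not None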
