
Commit 3cc5102
Merge branch 'main' into add_pdfminer
medsriha authored Apr 30, 2024
2 parents 0c5c54a + 2509eee commit 3cc5102
Showing 7 changed files with 181 additions and 108 deletions.
2 changes: 1 addition & 1 deletion haystack/components/evaluators/context_relevance.py
@@ -113,7 +113,7 @@ def __init__(
api_key=self.api_key,
)

@component.output_types(results=List[Dict[str, Any]])
@component.output_types(individual_scores=List[int], score=float, results=List[Dict[str, Any]])
def run(self, questions: List[str], contexts: List[List[str]]) -> Dict[str, Any]:
"""
Run the LLM evaluator.
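Note on the output_types change above: declaring individual_scores and score alongside results lets downstream pipeline components connect to the aggregate scores directly. A minimal sketch of reading them, assuming an OPENAI_API_KEY is available in the environment; the values shown in the comments are only illustrative:

from haystack.components.evaluators import ContextRelevanceEvaluator

evaluator = ContextRelevanceEvaluator()
result = evaluator.run(
    questions=["Who created the Python language?"],
    contexts=[["Python, created by Guido van Rossum, is a general-purpose language."]],
)
print(result["score"])              # mean relevance score over all inputs, e.g. 1.0
print(result["individual_scores"])  # one score per input question, e.g. [1]
print(result["results"])            # per-question detail dictionaries from the LLM judge
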
30 changes: 15 additions & 15 deletions haystack/components/evaluators/faithfulness.py
@@ -13,7 +13,7 @@
"inputs": {
"questions": "What is the capital of Germany and when was it founded?",
"contexts": ["Berlin is the capital of Germany and was founded in 1244."],
"responses": "The capital of Germany, Berlin, was founded in the 13th century.",
"predicted_answers": "The capital of Germany, Berlin, was founded in the 13th century.",
},
"outputs": {
"statements": ["Berlin is the capital of Germany.", "Berlin was founded in 1244."],
@@ -24,15 +24,15 @@
"inputs": {
"questions": "What is the capital of France?",
"contexts": ["Berlin is the capital of Germany."],
"responses": "Paris",
"predicted_answers": "Paris",
},
"outputs": {"statements": ["Paris is the capital of France."], "statement_scores": [0]},
},
{
"inputs": {
"questions": "What is the capital of Italy?",
"contexts": ["Rome is the capital of Italy."],
"responses": "Rome is the capital of Italy with more than 4 million inhabitants.",
"predicted_answers": "Rome is the capital of Italy with more than 4 million inhabitants.",
},
"outputs": {
"statements": ["Rome is the capital of Italy.", "Rome has more than 4 million inhabitants."],
@@ -60,9 +60,9 @@ class FaithfulnessEvaluator(LLMEvaluator):
"Python, created by Guido van Rossum in the late 1980s, is a high-level general-purpose programming language. Its design philosophy emphasizes code readability, and its language constructs aim to help programmers write clear, logical code for both small and large-scale software projects."
],
]
responses = ["Python is a high-level general-purpose programming language that was created by George Lucas."]
predicted_answers = ["Python is a high-level general-purpose programming language that was created by George Lucas."]
evaluator = FaithfulnessEvaluator()
result = evaluator.run(questions=questions, contexts=contexts, responses=responses)
result = evaluator.run(questions=questions, contexts=contexts, predicted_answers=predicted_answers)
print(result["individual_scores"])
# [0.5]
@@ -87,13 +87,13 @@ def __init__(
Optional few-shot examples conforming to the expected input and output format of FaithfulnessEvaluator.
Default examples will be used if none are provided.
Each example must be a dictionary with keys "inputs" and "outputs".
"inputs" must be a dictionary with keys "questions", "contexts", and "responses".
"inputs" must be a dictionary with keys "questions", "contexts", and "predicted_answers".
"outputs" must be a dictionary with "statements" and "statement_scores".
Expected format:
[{
"inputs": {
"questions": "What is the capital of Italy?", "contexts": ["Rome is the capital of Italy."],
"responses": "Rome is the capital of Italy with more than 4 million inhabitants.",
"predicted_answers": "Rome is the capital of Italy with more than 4 million inhabitants.",
},
"outputs": {
"statements": ["Rome is the capital of Italy.", "Rome has more than 4 million inhabitants."],
@@ -110,11 +110,11 @@ def __init__(
self.instructions = (
"Your task is to judge the faithfulness or groundedness of statements based "
"on context information. First, please extract statements from a provided "
"response to a question. Second, calculate a faithfulness score for each "
"statement made in the response. The score is 1 if the statement can be "
"predicted answer to a question. Second, calculate a faithfulness score for each "
"statement made in the predicted answer. The score is 1 if the statement can be "
"inferred from the provided context or 0 if it cannot be inferred."
)
self.inputs = [("questions", List[str]), ("contexts", List[List[str]]), ("responses", List[str])]
self.inputs = [("questions", List[str]), ("contexts", List[List[str]]), ("predicted_answers", List[str])]
self.outputs = ["statements", "statement_scores"]
self.examples = examples or _DEFAULT_EXAMPLES
self.api = api
@@ -129,24 +129,24 @@ def __init__(
api_key=self.api_key,
)

@component.output_types(results=List[Dict[str, Any]])
def run(self, questions: List[str], contexts: List[List[str]], responses: List[str]) -> Dict[str, Any]:
@component.output_types(individual_scores=List[int], score=float, results=List[Dict[str, Any]])
def run(self, questions: List[str], contexts: List[List[str]], predicted_answers: List[str]) -> Dict[str, Any]:
"""
Run the LLM evaluator.
:param questions:
A list of questions.
:param contexts:
A nested list of contexts that correspond to the questions.
:param responses:
A list of responses.
:param predicted_answers:
A list of predicted answers.
:returns:
A dictionary with the following outputs:
- `score`: Mean faithfulness score over all the provided input answers.
- `individual_scores`: A list of faithfulness scores for each input answer.
- `results`: A list of dictionaries with `statements` and `statement_scores` for each input answer.
"""
result = super().run(questions=questions, contexts=contexts, responses=responses)
result = super().run(questions=questions, contexts=contexts, predicted_answers=predicted_answers)

# calculate average statement faithfulness score per query
for res in result["results"]:
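Putting the renamed interface together: FaithfulnessEvaluator now takes predicted_answers instead of responses and, per the updated output_types decorator, exposes score and individual_scores next to results. A minimal usage sketch, assuming OPENAI_API_KEY is set; the scores in the comments are illustrative:

from haystack.components.evaluators import FaithfulnessEvaluator

questions = ["Who created the Python language?"]
contexts = [["Python, created by Guido van Rossum, is a general-purpose language."]]
predicted_answers = ["Python was created by Guido van Rossum."]  # formerly the `responses` argument

evaluator = FaithfulnessEvaluator()
result = evaluator.run(questions=questions, contexts=contexts, predicted_answers=predicted_answers)

print(result["score"])              # mean faithfulness score over all answers, e.g. 1.0
print(result["individual_scores"])  # one score per predicted answer, e.g. [1.0]
print(result["results"])            # per-answer `statements` and `statement_scores`
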
12 changes: 6 additions & 6 deletions haystack/components/evaluators/llm_evaluator.py
@@ -23,18 +23,18 @@ class LLMEvaluator:
from haystack.components.evaluators import LLMEvaluator
evaluator = LLMEvaluator(
instructions="Is this answer problematic for children?",
inputs=[("responses", List[str])],
inputs=[("predicted_answers", List[str])],
outputs=["score"],
examples=[
{"inputs": {"responses": "Damn, this is straight outta hell!!!"}, "outputs": {"score": 1}},
{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}},
{"inputs": {"predicted_answers": "Damn, this is straight outta hell!!!"}, "outputs": {"score": 1}},
{"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}},
],
)
RESPONSES = [
predicted_answers = [
"Football is the most popular sport with around 4 billion followers worldwide",
"Python language was created by Guido van Rossum.",
]
results = evaluator.run(responses=RESPONSES)
results = evaluator.run(predicted_answers=predicted_answers)
print(results)
# {'results': [{'score': 0}, {'score': 0}]}
```
@@ -199,7 +199,7 @@ def prepare_template(self) -> str:
The prompt template.
"""
inputs_section = (
"{" + ",".join([f'"{input_socket[0]}": {{{{ {input_socket[0]} }}}}' for input_socket in self.inputs]) + "}"
"{" + ", ".join([f'"{input_socket[0]}": {{{{ {input_socket[0]} }}}}' for input_socket in self.inputs]) + "}"
)

examples_section = "\n".join(
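The prepare_template tweak above only changes the separator used when rendering the prompt's inputs section (", " instead of ","). A self-contained sketch of that expression, using the faithfulness inputs as sample data:

from typing import List

inputs = [("questions", List[str]), ("contexts", List[List[str]]), ("predicted_answers", List[str])]

# Same join as prepare_template: each input name becomes a Jinja placeholder.
inputs_section = "{" + ", ".join(f'"{name}": {{{{ {name} }}}}' for name, _ in inputs) + "}"
print(inputs_section)
# {"questions": {{ questions }}, "contexts": {{ contexts }}, "predicted_answers": {{ predicted_answers }}}
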
2 changes: 1 addition & 1 deletion haystack/components/evaluators/sas_evaluator.py
@@ -183,7 +183,7 @@ def run(self, ground_truth_answers: List[str], predicted_answers: List[str]) ->

# Compute cosine-similarities
similarity_scores = [
util.cos_sim(p, l).cpu().numpy() for p, l in zip(predictions_embeddings, label_embeddings)
float(util.cos_sim(p, l).cpu().numpy()) for p, l in zip(predictions_embeddings, label_embeddings)
]

sas_score = np_mean(similarity_scores)
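The float() wrapper added above matters because util.cos_sim returns a 1x1 tensor; converting each value keeps similarity_scores a list of plain Python floats, so the mean is a plain scalar. A standalone sketch, assuming the sentence-transformers package is installed; the model name is only illustrative:

from numpy import mean as np_mean
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
predictions_embeddings = model.encode(["Berlin is the capital of Germany."], convert_to_tensor=True)
label_embeddings = model.encode(["Germany's capital is Berlin."], convert_to_tensor=True)

similarity_scores = [
    float(util.cos_sim(p, l).cpu().numpy())  # 1x1 array -> plain float
    for p, l in zip(predictions_embeddings, label_embeddings)
]
sas_score = float(np_mean(similarity_scores))
print(similarity_scores, sas_score)
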
6 changes: 3 additions & 3 deletions haystack/core/component/component.py
@@ -292,7 +292,7 @@ def set_input_types(self, instance, **types):
class MyComponent:
def __init__(self, value: int):
component.set_input_types(value_1=str, value_2=str)
component.set_input_types(self, value_1=str, value_2=str)
...
@component.output_types(output_1=int, output_2=str)
@@ -309,7 +309,7 @@ def run(self, **kwargs):
class MyComponent:
def __init__(self, value: int):
component.set_input_types(value_1=str, value_2=str)
component.set_input_types(self, value_1=str, value_2=str)
...
@component.output_types(output_1=int, output_2=str)
@@ -337,7 +337,7 @@ def set_output_types(self, instance, **types):
class MyComponent:
def __init__(self, value: int):
component.set_output_types(output_1=int, output_2=str)
component.set_output_types(self, output_1=int, output_2=str)
...
# no decorators here
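The docstring fixes above reflect that set_input_types and set_output_types are called with the component instance as their first argument. A minimal sketch of that pattern; the component below is hypothetical and only illustrates the call shape:

from typing import Any, Dict

from haystack import component


@component
class JoinStrings:
    def __init__(self):
        # Input/output sockets are declared on *this* instance, hence the explicit `self`.
        component.set_input_types(self, value_1=str, value_2=str)
        component.set_output_types(self, output=str)

    def run(self, **kwargs) -> Dict[str, Any]:
        return {"output": " ".join(str(v) for v in kwargs.values())}
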
46 changes: 29 additions & 17 deletions test/components/evaluators/test_faithfulness_evaluator.py
@@ -15,19 +15,23 @@ def test_init_default(self, monkeypatch):
assert component.generator.client.api_key == "test-api-key"
assert component.instructions == (
"Your task is to judge the faithfulness or groundedness of statements based "
"on context information. First, please extract statements from a provided "
"response to a question. Second, calculate a faithfulness score for each "
"statement made in the response. The score is 1 if the statement can be "
"on context information. First, please extract statements from a provided predicted "
"answer to a question. Second, calculate a faithfulness score for each "
"statement made in the predicted answer. The score is 1 if the statement can be "
"inferred from the provided context or 0 if it cannot be inferred."
)
assert component.inputs == [("questions", List[str]), ("contexts", List[List[str]]), ("responses", List[str])]
assert component.inputs == [
("questions", List[str]),
("contexts", List[List[str]]),
("predicted_answers", List[str]),
]
assert component.outputs == ["statements", "statement_scores"]
assert component.examples == [
{
"inputs": {
"questions": "What is the capital of Germany and when was it founded?",
"contexts": ["Berlin is the capital of Germany and was founded in 1244."],
"responses": "The capital of Germany, Berlin, was founded in the 13th century.",
"predicted_answers": "The capital of Germany, Berlin, was founded in the 13th century.",
},
"outputs": {
"statements": ["Berlin is the capital of Germany.", "Berlin was founded in 1244."],
@@ -38,15 +38,15 @@ def test_init_default(self, monkeypatch):
"inputs": {
"questions": "What is the capital of France?",
"contexts": ["Berlin is the capital of Germany."],
"responses": "Paris",
"predicted_answers": "Paris",
},
"outputs": {"statements": ["Paris is the capital of France."], "statement_scores": [0]},
},
{
"inputs": {
"questions": "What is the capital of Italy?",
"contexts": ["Rome is the capital of Italy."],
"responses": "Rome is the capital of Italy with more than 4 million inhabitants.",
"predicted_answers": "Rome is the capital of Italy with more than 4 million inhabitants.",
},
"outputs": {
"statements": ["Rome is the capital of Italy.", "Rome has more than 4 million inhabitants."],
@@ -65,15 +65,21 @@ def test_init_with_parameters(self):
api_key=Secret.from_token("test-api-key"),
api="openai",
examples=[
{"inputs": {"responses": "Damn, this is straight outta hell!!!"}, "outputs": {"custom_score": 1}},
{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"custom_score": 0}},
{
"inputs": {"predicted_answers": "Damn, this is straight outta hell!!!"},
"outputs": {"custom_score": 1},
},
{
"inputs": {"predicted_answers": "Football is the most popular sport."},
"outputs": {"custom_score": 0},
},
],
)
assert component.generator.client.api_key == "test-api-key"
assert component.api == "openai"
assert component.examples == [
{"inputs": {"responses": "Damn, this is straight outta hell!!!"}, "outputs": {"custom_score": 1}},
{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"custom_score": 0}},
{"inputs": {"predicted_answers": "Damn, this is straight outta hell!!!"}, "outputs": {"custom_score": 1}},
{"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"custom_score": 0}},
]

def test_from_dict(self, monkeypatch):
@@ -84,14 +94,16 @@ def test_from_dict(self, monkeypatch):
"init_parameters": {
"api_key": {"env_vars": ["OPENAI_API_KEY"], "strict": True, "type": "env_var"},
"api": "openai",
"examples": [{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}],
"examples": [
{"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}}
],
},
}
component = FaithfulnessEvaluator.from_dict(data)
assert component.api == "openai"
assert component.generator.client.api_key == "test-api-key"
assert component.examples == [
{"inputs": {"responses": "Football is the most popular sport."}, "outputs": {"score": 0}}
{"inputs": {"predicted_answers": "Football is the most popular sport."}, "outputs": {"score": 0}}
]

def test_run_calculates_mean_score(self, monkeypatch):
@@ -120,11 +132,11 @@ def generator_run(self, *args, **kwargs):
"programmers write clear, logical code for both small and large-scale software projects."
],
]
responses = [
predicted_answers = [
"Football is the most popular sport with around 4 billion followers worldwide.",
"Python is a high-level general-purpose programming language that was created by George Lucas.",
]
results = component.run(questions=questions, contexts=contexts, responses=responses)
results = component.run(questions=questions, contexts=contexts, predicted_answers=predicted_answers)
assert results == {
"individual_scores": [0.5, 1],
"results": [
@@ -148,9 +160,9 @@ def test_run_missing_parameters(self, monkeypatch):
def test_live_run(self):
questions = ["What is Python and who created it?"]
contexts = [["Python is a programming language created by Guido van Rossum."]]
responses = ["Python is a programming language created by George Lucas."]
predicted_answers = ["Python is a programming language created by George Lucas."]
evaluator = FaithfulnessEvaluator()
result = evaluator.run(questions=questions, contexts=contexts, responses=responses)
result = evaluator.run(questions=questions, contexts=contexts, predicted_answers=predicted_answers)

required_fields = {"individual_scores", "results", "score"}
assert all(field in result for field in required_fields)
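The updated test_run_calculates_mean_score exercises the renamed predicted_answers parameter without a live API call by replacing the evaluator's generator. A condensed sketch of that pattern; the stub class and its JSON payload below are hypothetical and only illustrate the idea:

from typing import Any, Dict

from haystack.components.evaluators import FaithfulnessEvaluator


class _StubGenerator:
    def run(self, *args: Any, **kwargs: Any) -> Dict[str, Any]:
        # Two statements, one grounded and one not, so the per-answer mean is 0.5.
        return {"replies": ['{"statements": ["s1", "s2"], "statement_scores": [1, 0]}']}


def test_run_with_stubbed_generator(monkeypatch):
    monkeypatch.setenv("OPENAI_API_KEY", "test-api-key")
    component = FaithfulnessEvaluator()
    monkeypatch.setattr(component, "generator", _StubGenerator())

    result = component.run(
        questions=["What is Python and who created it?"],
        contexts=[["Python is a programming language created by Guido van Rossum."]],
        predicted_answers=["Python was created by George Lucas."],
    )
    assert result["individual_scores"] == [0.5]
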
