feat: Add DocumentNDCGEvaluator component #8419

Merged · 9 commits · Oct 1, 2024
2 changes: 1 addition & 1 deletion docs/pydoc/config/evaluators_api.yml
@@ -7,7 +7,7 @@ loaders:
"context_relevance",
"document_map",
"document_mrr",
"document_recall",
"document_ndcg",
"document_recall",
"faithfulness",
"llm_evaluator",
2 changes: 2 additions & 0 deletions haystack/components/evaluators/__init__.py
@@ -6,6 +6,7 @@
from .context_relevance import ContextRelevanceEvaluator
from .document_map import DocumentMAPEvaluator
from .document_mrr import DocumentMRREvaluator
from .document_ndcg import DocumentNDCGEvaluator
from .document_recall import DocumentRecallEvaluator
from .faithfulness import FaithfulnessEvaluator
from .llm_evaluator import LLMEvaluator
@@ -16,6 +17,7 @@
"ContextRelevanceEvaluator",
"DocumentMAPEvaluator",
"DocumentMRREvaluator",
"DocumentNDCGEvaluator",
"DocumentRecallEvaluator",
"FaithfulnessEvaluator",
"LLMEvaluator",
133 changes: 133 additions & 0 deletions haystack/components/evaluators/document_ndcg.py
@@ -0,0 +1,133 @@
# SPDX-FileCopyrightText: 2022-present deepset GmbH <[email protected]>
#
# SPDX-License-Identifier: Apache-2.0

from math import log2
from typing import Any, Dict, List

from haystack import Document, component


@component
class DocumentNDCGEvaluator:
"""
Evaluator that calculates the normalized discounted cumulative gain (NDCG) of retrieved documents.

Each question can have multiple ground truth documents and multiple retrieved documents.
If the ground truth documents have relevance scores, the NDCG calculation uses these scores.
Otherwise, it assumes binary relevance of all ground truth documents.

Usage example:
```python
from haystack import Document
from haystack.components.evaluators import DocumentNDCGEvaluator

evaluator = DocumentNDCGEvaluator()
result = evaluator.run(
ground_truth_documents=[[Document(content="France", score=1.0), Document(content="Paris", score=0.5)]],
retrieved_documents=[[Document(content="France"), Document(content="Germany"), Document(content="Paris")]],
)
print(result["individual_scores"])
# [0.9502]
print(result["score"])
# 0.9502
```
"""

@component.output_types(score=float, individual_scores=List[float])
def run(
self, ground_truth_documents: List[List[Document]], retrieved_documents: List[List[Document]]
) -> Dict[str, Any]:
"""
Run the DocumentNDCGEvaluator on the given inputs.

`ground_truth_documents` and `retrieved_documents` must have the same length.
The list items within `ground_truth_documents` and `retrieved_documents` can differ in length.

:param ground_truth_documents:
Lists of expected documents, one list per question. Binary relevance is used if documents have no scores.
:param retrieved_documents:
Lists of retrieved documents, one list per question.
:returns:
A dictionary with the following outputs:
- `score` - The average of calculated scores.
- `individual_scores` - A list of numbers from 0.0 to 1.0 that represents the NDCG for each question.
"""
self.validate_inputs(ground_truth_documents, retrieved_documents)

individual_scores = []

for gt_docs, ret_docs in zip(ground_truth_documents, retrieved_documents):
dcg = self.calculate_dcg(gt_docs, ret_docs)
idcg = self.calculate_idcg(gt_docs)
ndcg = dcg / idcg if idcg > 0 else 0
individual_scores.append(ndcg)

score = sum(individual_scores) / len(ground_truth_documents)

return {"score": score, "individual_scores": individual_scores}

@staticmethod
def validate_inputs(gt_docs: List[List[Document]], ret_docs: List[List[Document]]):
"""
Validate the input parameters.

:param gt_docs:
The ground_truth_documents to validate.
:param ret_docs:
The retrieved_documents to validate.

:raises ValueError:
If ground_truth_documents or retrieved_documents is an empty list.
If the length of ground_truth_documents and retrieved_documents differs.
If any list of documents in ground_truth_documents contains a mix of documents with and without a score.
"""
if len(gt_docs) == 0 or len(ret_docs) == 0:
msg = "ground_truth_documents and retrieved_documents must be provided."
raise ValueError(msg)

if len(gt_docs) != len(ret_docs):
msg = "The length of ground_truth_documents and retrieved_documents must be the same."
raise ValueError(msg)

for docs in gt_docs:
if any(doc.score is not None for doc in docs) and any(doc.score is None for doc in docs):
msg = "Either none or all documents in each list of ground_truth_documents must have a score."
raise ValueError(msg)

@staticmethod
def calculate_dcg(gt_docs: List[Document], ret_docs: List[Document]) -> float:
"""
Calculate the discounted cumulative gain (DCG) of the retrieved documents.

:param gt_docs:
The ground truth documents.
:param ret_docs:
The retrieved documents.
:returns:
The discounted cumulative gain (DCG) of the retrieved
documents based on the ground truth documents.
"""
dcg = 0.0
relevant_id_to_score = {doc.id: doc.score if doc.score is not None else 1 for doc in gt_docs}
for i, doc in enumerate(ret_docs):
if doc.id in relevant_id_to_score: # TODO Related to https://github.com/deepset-ai/haystack/issues/8412
dcg += relevant_id_to_score[doc.id] / log2(i + 2) # i + 2 because i is 0-indexed
return dcg

@staticmethod
def calculate_idcg(gt_docs: List[Document]) -> float:
"""
Calculate the ideal discounted cumulative gain (IDCG) of the ground truth documents.

:param gt_docs:
The ground truth documents.
:returns:
The ideal discounted cumulative gain (IDCG) of the ground truth documents.
"""
idcg = 0.0
for i, doc in enumerate(sorted(gt_docs, key=lambda x: x.score if x.score is not None else 1, reverse=True)):
# If the document has a score, use it; otherwise, use 1 for binary relevance.
relevance = doc.score if doc.score is not None else 1
idcg += relevance / log2(i + 2) # i + 2 because i is 0-indexed
return idcg
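
For reference, here is a minimal standalone sketch (not part of the diff) of the DCG/IDCG arithmetic the component implements, reproducing the binary-relevance value asserted in `test_run_without_scores` further down:

```python
from math import log2

# Binary relevance: every ground truth document counts as relevance 1.
ground_truth = ["France", "Paris"]
retrieved = ["France", "Germany", "Paris"]  # ranked retriever output

# DCG: relevance discounted by log2(rank + 1), rank being the 1-based retrieved position.
dcg = sum(1 / log2(rank + 1) for rank, doc in enumerate(retrieved, start=1) if doc in ground_truth)

# IDCG: the DCG of an ideal ranking, i.e. all ground truth documents ranked first.
idcg = sum(1 / log2(rank + 1) for rank, _ in enumerate(ground_truth, start=1))

print(round(dcg / idcg, 4))  # 0.9197, the NDCG the evaluator returns for this case
```

The evaluator performs the same computation per question on `Document` objects, matching retrieved documents to ground truth documents by document id and using `doc.score` as graded relevance when it is set.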
@@ -0,0 +1,4 @@
---
features:
- |
Added a new DocumentNDCGEvaluator component, which is similar to DocumentMRREvaluator and useful for retrieval evaluation. It calculates the normalized discounted cumulative gain (NDCG), an evaluation metric that is useful when there are multiple ground truth relevant documents and the order in which they are retrieved is important.
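
Beyond the direct `run()` call shown in the class docstring, the evaluator can also sit in a Haystack `Pipeline` like any other component. A minimal sketch, where the component name and the documents are illustrative and not taken from this PR:

```python
from haystack import Document, Pipeline
from haystack.components.evaluators import DocumentNDCGEvaluator

# Illustrative inputs; in practice the retrieved documents come from a retrieval pipeline run.
ground_truth = [[Document(content="Paris is the capital of France.")]]
retrieved = [[Document(content="Berlin"), Document(content="Paris is the capital of France.")]]

eval_pipeline = Pipeline()
eval_pipeline.add_component("ndcg", DocumentNDCGEvaluator())

results = eval_pipeline.run(
    {"ndcg": {"ground_truth_documents": ground_truth, "retrieved_documents": retrieved}}
)
print(results["ndcg"]["score"])  # ~0.6309: the only relevant document was retrieved at rank 2
```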
202 changes: 202 additions & 0 deletions test/components/evaluators/test_document_ndcg.py
@@ -0,0 +1,202 @@
# SPDX-FileCopyrightText: 2022-present deepset GmbH <[email protected]>
#
# SPDX-License-Identifier: Apache-2.0
import pytest

from haystack import Document
from haystack.components.evaluators.document_ndcg import DocumentNDCGEvaluator


def test_run_with_scores():
evaluator = DocumentNDCGEvaluator()
result = evaluator.run(
ground_truth_documents=[
[
Document(content="doc1", score=3),
Document(content="doc2", score=2),
Document(content="doc3", score=3),
Document(content="doc6", score=2),
Document(content="doc7", score=3),
Document(content="doc8", score=2),
]
],
retrieved_documents=[
[
Document(content="doc1"),
Document(content="doc2"),
Document(content="doc3"),
Document(content="doc4"),
Document(content="doc5"),
]
],
)
assert result["individual_scores"][0] == pytest.approx(0.6592, abs=1e-4)
assert result["score"] == pytest.approx(0.6592, abs=1e-4)


def test_run_without_scores():
evaluator = DocumentNDCGEvaluator()
result = evaluator.run(
ground_truth_documents=[[Document(content="France"), Document(content="Paris")]],
retrieved_documents=[[Document(content="France"), Document(content="Germany"), Document(content="Paris")]],
)
assert result["individual_scores"][0] == pytest.approx(0.9197, abs=1e-4)
assert result["score"] == pytest.approx(0.9197, abs=1e-4)


def test_run_with_multiple_lists_of_docs():
evaluator = DocumentNDCGEvaluator()
result = evaluator.run(
ground_truth_documents=[
[Document(content="France"), Document(content="Paris")],
[
Document(content="doc1", score=3),
Document(content="doc2", score=2),
Document(content="doc3", score=3),
Document(content="doc6", score=2),
Document(content="doc7", score=3),
Document(content="doc8", score=2),
],
],
retrieved_documents=[
[Document(content="France"), Document(content="Germany"), Document(content="Paris")],
[
Document(content="doc1"),
Document(content="doc2"),
Document(content="doc3"),
Document(content="doc4"),
Document(content="doc5"),
],
],
)
assert result["individual_scores"][0] == pytest.approx(0.9197, abs=1e-4)
assert result["individual_scores"][1] == pytest.approx(0.6592, abs=1e-4)
assert result["score"] == pytest.approx(0.7895, abs=1e-4)


def test_run_with_different_lengths():
evaluator = DocumentNDCGEvaluator()
with pytest.raises(ValueError):
evaluator.run(
ground_truth_documents=[[Document(content="Berlin")]],
retrieved_documents=[[Document(content="Berlin")], [Document(content="London")]],
)
with pytest.raises(ValueError):
evaluator.run(
ground_truth_documents=[[Document(content="Berlin")], [Document(content="Paris")]],
retrieved_documents=[[Document(content="Berlin")]],
)


def test_run_with_mixed_documents_with_and_without_scores():
evaluator = DocumentNDCGEvaluator()
with pytest.raises(ValueError):
evaluator.run(
ground_truth_documents=[[Document(content="France", score=3), Document(content="Paris")]],
retrieved_documents=[[Document(content="France"), Document(content="Germany"), Document(content="Paris")]],
)


def test_run_empty_retrieved():
evaluator = DocumentNDCGEvaluator()
result = evaluator.run(ground_truth_documents=[[Document(content="France")]], retrieved_documents=[[]])
assert result["individual_scores"] == [0.0]
assert result["score"] == 0.0


def test_run_empty_ground_truth():
evaluator = DocumentNDCGEvaluator()
result = evaluator.run(ground_truth_documents=[[]], retrieved_documents=[[Document(content="France")]])
assert result["individual_scores"] == [0.0]
assert result["score"] == 0.0


def test_run_empty_retrieved_and_empty_ground_truth():
evaluator = DocumentNDCGEvaluator()
result = evaluator.run(ground_truth_documents=[[]], retrieved_documents=[[]])
assert result["individual_scores"] == [0.0]
assert result["score"] == 0.0


def test_run_no_retrieved():
evaluator = DocumentNDCGEvaluator()
with pytest.raises(ValueError):
evaluator.run(ground_truth_documents=[[Document(content="France")]], retrieved_documents=[])


def test_run_no_ground_truth():
evaluator = DocumentNDCGEvaluator()
with pytest.raises(ValueError):
evaluator.run(ground_truth_documents=[], retrieved_documents=[[Document(content="France")]])


def test_run_no_retrieved_and_no_ground_truth():
evaluator = DocumentNDCGEvaluator()
with pytest.raises(ValueError):
evaluator.run(ground_truth_documents=[], retrieved_documents=[])


def test_calculate_dcg_with_scores():
evaluator = DocumentNDCGEvaluator()
gt_docs = [
Document(content="doc1", score=3),
Document(content="doc2", score=2),
Document(content="doc3", score=3),
Document(content="doc4", score=0),
Document(content="doc5", score=1),
Document(content="doc6", score=2),
]
ret_docs = [
Document(content="doc1"),
Document(content="doc2"),
Document(content="doc3"),
Document(content="doc4"),
Document(content="doc5"),
Document(content="doc6"),
]
dcg = evaluator.calculate_dcg(gt_docs, ret_docs)
assert dcg == pytest.approx(6.8611, abs=1e-4)


def test_calculate_dcg_without_scores():
evaluator = DocumentNDCGEvaluator()
gt_docs = [Document(content="doc1"), Document(content="doc2")]
ret_docs = [Document(content="doc2"), Document(content="doc3"), Document(content="doc1")]
dcg = evaluator.calculate_dcg(gt_docs, ret_docs)
assert dcg == pytest.approx(1.5, abs=1e-4)


def test_calculate_dcg_empty():
evaluator = DocumentNDCGEvaluator()
gt_docs = [Document(content="doc1")]
ret_docs = []
dcg = evaluator.calculate_dcg(gt_docs, ret_docs)
assert dcg == 0


def test_calculate_idcg_with_scores():
evaluator = DocumentNDCGEvaluator()
gt_docs = [
Document(content="doc1", score=3),
Document(content="doc2", score=3),
Document(content="doc3", score=2),
Document(content="doc4", score=3),
Document(content="doc5", score=2),
Document(content="doc6", score=2),
]
idcg = evaluator.calculate_idcg(gt_docs)
assert idcg == pytest.approx(8.7403, abs=1e-4)


def test_calculate_idcg_without_scores():
evaluator = DocumentNDCGEvaluator()
gt_docs = [Document(content="doc1"), Document(content="doc2"), Document(content="doc3")]
idcg = evaluator.calculate_idcg(gt_docs)
assert idcg == pytest.approx(2.1309, abs=1e-4)


def test_calculate_idcg_empty():
evaluator = DocumentNDCGEvaluator()
gt_docs = []
idcg = evaluator.calculate_idcg(gt_docs)
assert idcg == 0