Skip to content

Commit

Permalink
fix: PyPDFToDocument correctly serializes custom converters, deprec…
Browse files Browse the repository at this point in the history
…ate `DefaultConverter` (#8430)

* fix: `PyPDFToDocument` correctly serializes custom converters, deprecate `DefaultConverter`

* Remove `auto` prefix from serde util function names, add unit tests
  • Loading branch information
shadeMe authored and julian-risch committed Oct 1, 2024
1 parent 3ae34b5 commit 0bf24c0
Show file tree
Hide file tree
Showing 5 changed files with 193 additions and 11 deletions.
27 changes: 21 additions & 6 deletions haystack/components/converters/pypdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,15 @@
# SPDX-License-Identifier: Apache-2.0

import io
import warnings
from pathlib import Path
from typing import Any, Dict, List, Optional, Protocol, Union

from haystack import Document, component, default_from_dict, default_to_dict, logging
from haystack.components.converters.utils import get_bytestream_from_source, normalize_metadata
from haystack.dataclasses import ByteStream
from haystack.lazy_imports import LazyImport
from haystack.utils.type_serialization import deserialize_type
from haystack.utils.base_serialization import deserialize_class_instance, serialize_class_instance

with LazyImport("Run 'pip install pypdf'") as pypdf_import:
from pypdf import PdfReader
Expand Down Expand Up @@ -40,6 +41,11 @@ class DefaultConverter:
The default converter class that extracts text from a PdfReader object's pages and returns a Document.
"""

def __init__(self):
warnings.warn(
"This class is deprecated and will be merged into `PyPDFToDocument` in 2.7.0.", DeprecationWarning
)

def convert(self, reader: "PdfReader") -> Document:
"""Extract text from the PDF and return a Document object with the text content."""
text = "\f".join(page.extract_text() for page in reader.pages)
Expand Down Expand Up @@ -86,7 +92,7 @@ def __init__(self, converter: Optional[PyPDFConverter] = None):
"""
pypdf_import.check()

self.converter = converter or DefaultConverter()
self.converter = converter

def to_dict(self):
"""
Expand All @@ -95,7 +101,9 @@ def to_dict(self):
:returns:
Dictionary with serialized data.
"""
return default_to_dict(self, converter=self.converter.to_dict())
return default_to_dict(
self, converter=(serialize_class_instance(self.converter) if self.converter is not None else None)
)

@classmethod
def from_dict(cls, data):
Expand All @@ -108,10 +116,15 @@ def from_dict(cls, data):
:returns:
Deserialized component.
"""
converter_class = deserialize_type(data["init_parameters"]["converter"]["type"])
data["init_parameters"]["converter"] = converter_class.from_dict(data["init_parameters"]["converter"])
custom_converter_data = data["init_parameters"]["converter"]
if custom_converter_data is not None:
data["init_parameters"]["converter"] = deserialize_class_instance(custom_converter_data)
return default_from_dict(cls, data)

def _default_convert(self, reader: "PdfReader") -> Document:
text = "\f".join(page.extract_text() for page in reader.pages)
return Document(content=text)

@component.output_types(documents=List[Document])
def run(
self,
Expand Down Expand Up @@ -145,7 +158,9 @@ def run(
continue
try:
pdf_reader = PdfReader(io.BytesIO(bytestream.data))
document = self.converter.convert(pdf_reader)
document = (
self._default_convert(pdf_reader) if self.converter is None else self.converter.convert(pdf_reader)
)
except Exception as e:
logger.warning(
"Could not read {source} and convert it to Document, skipping. {error}", source=source, error=e
Expand Down
54 changes: 54 additions & 0 deletions haystack/utils/base_serialization.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
# SPDX-FileCopyrightText: 2022-present deepset GmbH <[email protected]>
#
# SPDX-License-Identifier: Apache-2.0

from typing import Any, Dict

from haystack.core.errors import DeserializationError, SerializationError
from haystack.core.serialization import generate_qualified_class_name, import_class_by_name


def serialize_class_instance(obj: Any) -> Dict[str, Any]:
"""
Serializes an object that has a `to_dict` method into a dictionary.
:param obj:
The object to be serialized.
:returns:
A dictionary representation of the object.
:raises SerializationError:
If the object does not have a `to_dict` method.
"""
if not hasattr(obj, "to_dict"):
raise SerializationError(f"Object of class '{type(obj).__name__}' does not have a 'to_dict' method")

output = obj.to_dict()
return {"type": generate_qualified_class_name(type(obj)), "data": output}


def deserialize_class_instance(data: Dict[str, Any]) -> Any:
"""
Deserializes an object from a dictionary representation generated by `auto_serialize_class_instance`.
:param data:
The dictionary to deserialize from.
:returns:
The deserialized object.
:raises DeserializationError:
If the serialization data is malformed, the class type cannot be imported, or the
class does not have a `from_dict` method.
"""
if "type" not in data:
raise DeserializationError("Missing 'type' in serialization data")
if "data" not in data:
raise DeserializationError("Missing 'data' in serialization data")

try:
obj_class = import_class_by_name(data["type"])
except ImportError as e:
raise DeserializationError(f"Class '{data['type']}' not correctly imported") from e

if not hasattr(obj_class, "from_dict"):
raise DeserializationError(f"Class '{data['type']}' does not have a 'from_dict' method")

return obj_class.from_dict(data["data"])
7 changes: 7 additions & 0 deletions releasenotes/notes/pypdf-serde-fixes-f33c29830508ee01.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
---
deprecations:
- |
The `DefaultConverter` class used by the `PyPDFToDocument` component has been deprecated. Its functionality will be merged into the component in 2.7.0.
fixes:
- |
Fix the serialization of `PyPDFToDocument` component to prevent the default converter from being serialized unnecessarily.
49 changes: 44 additions & 5 deletions test/components/converters/test_pypdf_to_document.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import pytest

from haystack import Document, default_from_dict, default_to_dict
from haystack.components.converters.pypdf import DefaultConverter, PyPDFToDocument
from haystack.components.converters.pypdf import PyPDFToDocument
from haystack.dataclasses import ByteStream


Expand All @@ -16,29 +16,68 @@ def pypdf_converter():
return PyPDFToDocument()


class CustomConverter:
def convert(self, reader):
return Document(content="Custom converter")

def to_dict(self):
return {"key": "value", "more": False}

@classmethod
def from_dict(cls, data):
assert data == {"key": "value", "more": False}
return cls()


class TestPyPDFToDocument:
def test_init(self, pypdf_converter):
assert isinstance(pypdf_converter.converter, DefaultConverter)
assert pypdf_converter.converter is None

def test_init_params(self):
pypdf_converter = PyPDFToDocument(converter=CustomConverter())
assert isinstance(pypdf_converter.converter, CustomConverter)

def test_to_dict(self, pypdf_converter):
data = pypdf_converter.to_dict()
assert data == {
"type": "haystack.components.converters.pypdf.PyPDFToDocument",
"init_parameters": {"converter": None},
}

def test_to_dict_custom_converter(self):
pypdf_converter = PyPDFToDocument(converter=CustomConverter())
data = pypdf_converter.to_dict()
assert data == {
"type": "haystack.components.converters.pypdf.PyPDFToDocument",
"init_parameters": {
"converter": {"type": "haystack.components.converters.pypdf.DefaultConverter", "init_parameters": {}}
"converter": {
"data": {"key": "value", "more": False},
"type": "converters.test_pypdf_to_document.CustomConverter",
}
},
}

def test_from_dict(self):
data = {"type": "haystack.components.converters.pypdf.PyPDFToDocument", "init_parameters": {"converter": None}}
instance = PyPDFToDocument.from_dict(data)
assert isinstance(instance, PyPDFToDocument)
assert instance.converter is None

def test_from_dict_custom_converter(self):
pypdf_converter = PyPDFToDocument(converter=CustomConverter())
data = pypdf_converter.to_dict()
data = {
"type": "haystack.components.converters.pypdf.PyPDFToDocument",
"init_parameters": {
"converter": {"type": "haystack.components.converters.pypdf.DefaultConverter", "init_parameters": {}}
"converter": {
"data": {"key": "value", "more": False},
"type": "converters.test_pypdf_to_document.CustomConverter",
}
},
}
instance = PyPDFToDocument.from_dict(data)
assert isinstance(instance, PyPDFToDocument)
assert isinstance(instance.converter, DefaultConverter)
assert isinstance(instance.converter, CustomConverter)

@pytest.mark.integration
def test_run(self, test_files_path, pypdf_converter):
Expand Down
67 changes: 67 additions & 0 deletions test/utils/test_base_serialization.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
# SPDX-FileCopyrightText: 2022-present deepset GmbH <[email protected]>
#
# SPDX-License-Identifier: Apache-2.0
from haystack.core.errors import DeserializationError, SerializationError
import pytest

from haystack.utils.base_serialization import serialize_class_instance, deserialize_class_instance


class CustomClass:
def to_dict(self):
return {"key": "value", "more": False}

@classmethod
def from_dict(cls, data):
assert data == {"key": "value", "more": False}
return cls()


class CustomClassNoToDict:
@classmethod
def from_dict(cls, data):
assert data == {"key": "value", "more": False}
return cls()


class CustomClassNoFromDict:
def to_dict(self):
return {"key": "value", "more": False}


def test_serialize_class_instance():
result = serialize_class_instance(CustomClass())
assert result == {"data": {"key": "value", "more": False}, "type": "test_base_serialization.CustomClass"}


def test_serialize_class_instance_missing_method():
with pytest.raises(SerializationError, match="does not have a 'to_dict' method"):
serialize_class_instance(CustomClassNoToDict())


def test_deserialize_class_instance():
data = {"data": {"key": "value", "more": False}, "type": "test_base_serialization.CustomClass"}

result = deserialize_class_instance(data)
assert isinstance(result, CustomClass)


def test_deserialize_class_instance_invalid_data():
data = {"data": {"key": "value", "more": False}, "type": "test_base_serialization.CustomClass"}

with pytest.raises(DeserializationError, match="Missing 'type'"):
deserialize_class_instance({})

with pytest.raises(DeserializationError, match="Missing 'data'"):
deserialize_class_instance({"type": "test_base_serialization.CustomClass"})

with pytest.raises(
DeserializationError, match="Class 'test_base_serialization.CustomClass1' not correctly imported"
):
deserialize_class_instance({"type": "test_base_serialization.CustomClass1", "data": {}})

with pytest.raises(
DeserializationError,
match="Class 'test_base_serialization.CustomClassNoFromDict' does not have a 'from_dict' method",
):
deserialize_class_instance({"type": "test_base_serialization.CustomClassNoFromDict", "data": {}})

0 comments on commit 0bf24c0

Please sign in to comment.