Skip to content

Commit 54ae7e4

Browse files
authored
Serialization support for ParsingSettings.parse_pdf (#1216)
1 parent b52b250 commit 54ae7e4

File tree

2 files changed

+80
-2
lines changed

2 files changed

+80
-2
lines changed

src/paperqa/settings.py

Lines changed: 39 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,11 +34,14 @@
3434
BaseModel,
3535
ConfigDict,
3636
Field,
37+
SerializerFunctionWrapHandler,
3738
computed_field,
3839
field_validator,
40+
model_serializer,
3941
model_validator,
4042
)
4143
from pydantic.fields import FieldInfo
44+
from pydantic_core.core_schema import SerializationInfo
4245
from pydantic_settings import BaseSettings, CliSettingsSource, SettingsConfigDict
4346

4447
import paperqa.configs
@@ -309,8 +312,9 @@ class ParsingSettings(BaseModel):
309312
)
310313
parse_pdf: PDFParserFn = Field(
311314
default_factory=get_default_pdf_parser,
312-
description="Function to parse PDF.",
313-
exclude=True,
315+
description="Function to parse PDF, or a fully qualified name to import.",
316+
examples=["paperqa_docling.parse_pdf_to_pages"],
317+
exclude=True, # NOTE: a custom serializer is used below, so it's not excluded
314318
)
315319
configure_pdf_parser: Callable[[], Any] = Field(
316320
default=default_pdf_parser_configurator,
@@ -320,6 +324,39 @@ class ParsingSettings(BaseModel):
320324
),
321325
exclude=True,
322326
)
327+
328+
@field_validator("parse_pdf", mode="before")
@classmethod
def _resolve_parse_pdf(cls, v: str | PDFParserFn) -> PDFParserFn:
    """Turn a dotted-name string into the parser object it names.

    Non-string values are handed back untouched. Strings are looked up with
    ``locate`` and the result is checked against ``PDFParserFn``.

    Raises:
        ValueError: ``locate`` returned ``None`` for the given string.
        TypeError: The located object fails the ``isinstance`` check
            against ``PDFParserFn``.
    """
    # Anything that is not a string is assumed to already be a parser.
    if not isinstance(v, str):
        return v
    parser = locate(v)
    if parser is None:
        raise ValueError(f"Failed to locate PDF parser function {v!r}.")
    if isinstance(parser, PDFParserFn):
        return parser
    raise TypeError(f"Value {v!r} is not a PDF parser function.")
340+
341+
@model_serializer(mode="wrap")
def _custom_serializer(
    self, serializer: SerializerFunctionWrapHandler, info: SerializationInfo
) -> dict[str, Any]:
    """Serialize the model, re-adding parse_pdf when it has an importable name.

    parse_pdf is declared with ``exclude=True``, so the wrapped serializer
    leaves it out of the data. It is put back in two situations:

    - the stored value is already a string, or
    - the dump is in JSON mode and the callable is a module-level object, so
      that ``"<module>.<name>"`` names it.

    Callables without such a name (lambdas, functions nested inside other
    functions or classes, objects whose ``__module__`` is not a string) stay
    excluded rather than producing a dotted name that cannot be looked up
    again when the data is loaded.

    Args:
        serializer: Pydantic's default serializer for this model.
        info: Serialization context; only ``info.mode`` is consulted.

    Returns:
        The serialized field mapping, possibly with a ``"parse_pdf"`` entry.
    """
    data = serializer(self)
    parser = self.parse_pdf
    # NOTE: due to parse_pdf's exclude=True flag, it's not yet in this data.
    # Add it back only if it can safely be deserialized "over the network".
    if isinstance(parser, str):
        # Already JSON-compliant, so un-exclude it as is.
        data["parse_pdf"] = parser
        return data
    if info.mode != "json":
        return data
    module = getattr(parser, "__module__", None)
    name = getattr(parser, "__name__", None)
    if not isinstance(module, str) or not isinstance(name, str):
        return data
    # Lambdas are named "<lambda>" and nested definitions have a __qualname__
    # that differs from __name__; neither is reachable as "<module>.<name>".
    if name.isidentifier() and getattr(parser, "__qualname__", name) == name:
        data["parse_pdf"] = f"{module}.{name}"
    return data
359+
323360
chunking_algorithm: ChunkingOptions = Field(
324361
default=ChunkingOptions.SIMPLE_OVERLAP,
325362
deprecated="This field is deprecated and will be removed in version 6.",

tests/test_paperqa.py

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3287,6 +3287,47 @@ def test_reader_params_deprecation_warnings(recwarn: pytest.WarningsRecorder) ->
32873287
], "Expected clean settings to have no warnings"
32883288

32893289

3290+
def test_parse_pdf_string_resolution() -> None:
    """Check parse_pdf name resolution, serialization, and failure modes."""
    # Each case: (value given to ParsingSettings, parser expected after
    # validation, fully qualified name expected in the JSON-mode dump).
    cases = [
        # A valid string FQN
        (
            "paperqa_pymupdf.parse_pdf_to_pages",
            pymupdf_parse_pdf_to_pages,
            "paperqa_pymupdf.reader.parse_pdf_to_pages",
        ),
        # Another valid string FQN
        (
            "paperqa_pypdf.parse_pdf_to_pages",
            pypdf_parse_pdf_to_pages,
            "paperqa_pypdf.reader.parse_pdf_to_pages",
        ),
        # Directly passing a parser
        (
            pymupdf_parse_pdf_to_pages,
            pymupdf_parse_pdf_to_pages,
            "paperqa_pymupdf.reader.parse_pdf_to_pages",
        ),
    ]
    for given, expected_parser, expected_fqn in cases:
        settings = Settings(parsing=ParsingSettings(parse_pdf=given))
        assert settings.parsing.parse_pdf == expected_parser
        json_dump = settings.model_dump(mode="json")
        assert json_dump["parsing"]["parse_pdf"] == expected_fqn
        # The Python-mode dump keeps the field excluded.
        assert "parse_pdf" not in settings.model_dump()["parsing"]

    # A nonexistent FQN
    with pytest.raises(ValueError, match="Failed to locate"):
        Settings(parsing=ParsingSettings(parse_pdf="nonexistent.module.function"))

    # A valid FQN that is not a parser
    with pytest.raises(TypeError, match="not a PDF parser"):
        Settings(parsing=ParsingSettings(parse_pdf="os.path.sep"))
3329+
3330+
32903331
@pytest.mark.asyncio
32913332
@pytest.mark.parametrize("multimodal", [False, True])
32923333
async def test_reader_config_propagation(stub_data_dir: Path, multimodal: bool) -> None:

0 commit comments

Comments
 (0)