Serialization support for ParsingSettings.parse_pdf (#1216)

jamesbraza · web-flow · commit 54ae7e468150 · 2025-11-21T08:36:22.000-08:00
diff --git a/src/paperqa/settings.py b/src/paperqa/settings.py
@@ -34,11 +34,14 @@
     BaseModel,
     ConfigDict,
     Field,
+    SerializerFunctionWrapHandler,
     computed_field,
     field_validator,
+    model_serializer,
     model_validator,
 )
 from pydantic.fields import FieldInfo
+from pydantic_core.core_schema import SerializationInfo
 from pydantic_settings import BaseSettings, CliSettingsSource, SettingsConfigDict
 
 import paperqa.configs
@@ -309,8 +312,9 @@ class ParsingSettings(BaseModel):
     )
     parse_pdf: PDFParserFn = Field(
         default_factory=get_default_pdf_parser,
-        description="Function to parse PDF.",
-        exclude=True,
+        description="Function to parse PDF, or a fully qualified name to import.",
+        examples=["paperqa_docling.parse_pdf_to_pages"],
+        exclude=True,  # NOTE: the serializer below re-adds it as a FQN in JSON mode
     )
     configure_pdf_parser: Callable[[], Any] = Field(
         default=default_pdf_parser_configurator,
@@ -320,6 +324,39 @@ class ParsingSettings(BaseModel):
         ),
         exclude=True,
     )
+
+    @field_validator("parse_pdf", mode="before")
+    @classmethod
+    def _resolve_parse_pdf(cls, v: str | PDFParserFn) -> PDFParserFn:
+        """Import the parser a fully qualified name points to; pass others through."""
+        if not isinstance(v, str):
+            # Non-string input is returned untouched
+            return v
+        if (parser := locate(v)) is None:
+            raise ValueError(f"Failed to locate PDF parser function {v!r}.")
+        if not isinstance(parser, PDFParserFn):
+            raise TypeError(f"Value {v!r} is not a PDF parser function.")
+        return parser
+
+    @model_serializer(mode="wrap")
+    def _custom_serializer(
+        self, serializer: SerializerFunctionWrapHandler, info: SerializationInfo
+    ) -> dict[str, Any]:
+        """Re-add parse_pdf (excluded by default) when it can round-trip as a FQN."""
+        data = serializer(self)
+        if isinstance(self.parse_pdf, str):
+            # Already JSON-compliant, so let's un-exclude
+            data["parse_pdf"] = self.parse_pdf
+        elif info.mode == "json":
+            # NOTE: __qualname__ keeps class-nested parsers resolvable, while a
+            # "<lambda>" or "<locals>" segment can never be imported back by the
+            # validator, so those are left out just like nameless callables
+            module = getattr(self.parse_pdf, "__module__", None)
+            qualname = getattr(self.parse_pdf, "__qualname__", None)
+            if module and qualname and "<" not in qualname:
+                data["parse_pdf"] = f"{module}.{qualname}"
+        return data
+
     chunking_algorithm: ChunkingOptions = Field(
         default=ChunkingOptions.SIMPLE_OVERLAP,
         deprecated="This field is deprecated and will be removed in version 6.",
diff --git a/tests/test_paperqa.py b/tests/test_paperqa.py
@@ -3287,6 +3287,47 @@ def test_reader_params_deprecation_warnings(recwarn: pytest.WarningsRecorder) ->
     ], "Expected clean settings to have no warnings"
 
 
+def test_parse_pdf_string_resolution() -> None:
+    """Check parse_pdf accepts a FQN or a callable, and JSON-dumps as a FQN."""
+    # Each case: (value given to ParsingSettings, resolved parser, dumped FQN)
+    valid_cases = [
+        (
+            "paperqa_pymupdf.parse_pdf_to_pages",
+            pymupdf_parse_pdf_to_pages,
+            "paperqa_pymupdf.reader.parse_pdf_to_pages",
+        ),
+        (
+            "paperqa_pypdf.parse_pdf_to_pages",
+            pypdf_parse_pdf_to_pages,
+            "paperqa_pypdf.reader.parse_pdf_to_pages",
+        ),
+        (
+            # Directly passing a parser
+            pymupdf_parse_pdf_to_pages,
+            pymupdf_parse_pdf_to_pages,
+            "paperqa_pymupdf.reader.parse_pdf_to_pages",
+        ),
+    ]
+    for parse_pdf, expected_parser, expected_fqn in valid_cases:
+        settings = Settings(parsing=ParsingSettings(parse_pdf=parse_pdf))
+        assert settings.parsing.parse_pdf == expected_parser
+
+        # JSON mode dumps the parser as a FQN
+        json_parsing = settings.model_dump(mode="json")["parsing"]
+        assert json_parsing["parse_pdf"] == expected_fqn
+
+        # Python mode leaves the parser out of the dump
+        assert "parse_pdf" not in settings.model_dump()["parsing"]
+
+    # A FQN that cannot be located is rejected
+    with pytest.raises(ValueError, match="Failed to locate"):
+        Settings(parsing=ParsingSettings(parse_pdf="nonexistent.module.function"))
+
+    # A FQN that resolves to something other than a parser is rejected
+    with pytest.raises(TypeError, match="not a PDF parser"):
+        Settings(parsing=ParsingSettings(parse_pdf="os.path.sep"))
+
+
 @pytest.mark.asyncio
 @pytest.mark.parametrize("multimodal", [False, True])
 async def test_reader_config_propagation(stub_data_dir: Path, multimodal: bool) -> None: