3434 BaseModel ,
3535 ConfigDict ,
3636 Field ,
37+ SerializerFunctionWrapHandler ,
3738 computed_field ,
3839 field_validator ,
40+ model_serializer ,
3941 model_validator ,
4042)
4143from pydantic .fields import FieldInfo
44+ from pydantic_core .core_schema import SerializationInfo
4245from pydantic_settings import BaseSettings , CliSettingsSource , SettingsConfigDict
4346
4447import paperqa .configs
@@ -309,8 +312,9 @@ class ParsingSettings(BaseModel):
309312 )
# PDF parser callable. A fully qualified name given as a string is resolved
# to the callable by the `_resolve_parse_pdf` validator.
parse_pdf: PDFParserFn = Field(
    default_factory=get_default_pdf_parser,
    description="Function to parse PDF, or a fully qualified name to import.",
    examples=["paperqa_docling.parse_pdf_to_pages"],
    # NOTE: excluded from pydantic's default serialization; the model's
    # custom wrap serializer conditionally adds it back to the output.
    exclude=True,
)
315319 configure_pdf_parser : Callable [[], Any ] = Field (
316320 default = default_pdf_parser_configurator ,
@@ -320,6 +324,39 @@ class ParsingSettings(BaseModel):
320324 ),
321325 exclude = True ,
322326 )
327+
@field_validator("parse_pdf", mode="before")
@classmethod
def _resolve_parse_pdf(cls, v: str | PDFParserFn) -> PDFParserFn:
    """Turn a fully qualified name into the PDF parser it refers to.

    Args:
        v: Either a parser callable, or a dotted name to look up with
            ``locate``.

    Returns:
        ``v`` unchanged when it is not a string, otherwise the located object.

    Raises:
        ValueError: If ``locate`` finds nothing for the given name.
        TypeError: If the located object is not a ``PDFParserFn``.
    """
    # Anything that is not a name is passed through for normal validation.
    if not isinstance(v, str):
        return v

    parser = locate(v)
    if parser is None:
        raise ValueError(f"Failed to locate PDF parser function {v!r}.")
    if isinstance(parser, PDFParserFn):
        return parser
    raise TypeError(f"Value {v!r} is not a PDF parser function.")
340+
@model_serializer(mode="wrap")
def _custom_serializer(
    self, serializer: SerializerFunctionWrapHandler, info: SerializationInfo
) -> dict[str, Any]:
    """Serialize the model, re-adding ``parse_pdf`` when it can be restored.

    ``parse_pdf`` is declared with ``exclude=True``, so the wrapped serializer
    leaves it out. It is added back here in two cases:

    * the stored value is already a string, which is JSON-compliant as-is;
    * the output mode is ``"json"`` and the callable has a fully qualified
      name that ``locate`` resolves back to the very same object.

    Callables that cannot be looked up by name (lambdas, closures, bound
    methods, callable instances) stay excluded instead of being emitted as a
    name that would fail to resolve on deserialization.

    Args:
        serializer: Pydantic's default serialization handler for this model.
        info: Serialization context; only ``info.mode`` is consulted.

    Returns:
        The serialized data, with ``parse_pdf`` included when applicable.
    """
    # Function-scope import so this method does not rely on file-level names.
    from pydoc import ErrorDuringImport, locate

    # NOTE: due to parse_pdf's exclude=True flag, it's not yet in this data.
    data = serializer(self)
    parser = self.parse_pdf

    if isinstance(parser, str):
        # Already JSON-compliant, so un-exclude it.
        data["parse_pdf"] = parser
        return data
    if info.mode != "json":
        return data

    module = getattr(parser, "__module__", None)
    # __qualname__ (not __name__) keeps the enclosing class path, so
    # functions nested in classes still yield a resolvable dotted name.
    qualname = getattr(parser, "__qualname__", None)
    if not module or not qualname:
        return data

    fqn = f"{module}.{qualname}"
    try:
        # Only emit the name if it round-trips to this exact callable.
        round_trips = locate(fqn) is parser
    except ErrorDuringImport:
        round_trips = False
    if round_trips:
        data["parse_pdf"] = fqn
    return data
359+
323360 chunking_algorithm : ChunkingOptions = Field (
324361 default = ChunkingOptions .SIMPLE_OVERLAP ,
325362 deprecated = "This field is deprecated and will be removed in version 6." ,
0 commit comments