99from typing import Any , cast
1010
1111import pypdfium2 as pdfium
12+ from lmi .utils import gather_with_concurrency
1213from paperqa .types import ParsedMedia , ParsedMetadata , ParsedText
1314from paperqa .utils import ImpossibleParsingError
1415from tenacity import RetryError
@@ -31,6 +32,7 @@ async def parse_pdf_to_pages(
3132 full_page : bool = False ,
3233 dpi : int | None = 300 ,
3334 api_params : Mapping [str , Any ] | None = None ,
35+ concurrency : int | asyncio .Semaphore | None = 128 ,
3436 ** _ : Any ,
3537) -> ParsedText :
3638 """Parse a PDF using Nvidia's nemotron-parse VLM.
@@ -48,6 +50,10 @@ async def parse_pdf_to_pages(
4850 dpi: Optional DPI (dots per inch) for image resolution,
4951 if set as None then pypdfium2's default 1 scale will be employed.
5052 api_params: Optional parameters to pass to the nemotron-parse API.
53+ concurrency: Optional limit (an int or a semaphore) on concurrent processing of pages,
54+ used to put a ceiling on memory usage. Default is 128 to prioritize reader
55+ speed over memory, but not get obliterated by huge 1000-page PDFs.
56+ Set as None to disable concurrency limits, processing all pages at once.
5157 **_: Thrown away kwargs.
5258
5359 Returns:
@@ -203,9 +209,15 @@ async def process_page(
203209
204210 content : dict [str , str | tuple [str , list [ParsedMedia ]]] = {}
205211 total_length = count_media = 0
206- for page_num , page_content in await asyncio .gather (
207- * (process_page (i ) for i in range (start_page , end_page ))
208- ):
212+
213+ gather = (
214+ asyncio .gather (* (process_page (i ) for i in range (start_page , end_page )))
215+ if concurrency is None
216+ else gather_with_concurrency (
217+ concurrency , (process_page (i ) for i in range (start_page , end_page ))
218+ )
219+ )
220+ for page_num , page_content in await gather :
209221 content [page_num ] = page_content
210222 if parse_media :
211223 page_text , page_media = page_content # type: ignore[misc]
0 commit comments