Add parallel uploading of documents

Signed-off-by: Daniel Henry <iamdanhenry@gmail.com>
2026-03-31 16:23:10 -05:00
parent 29c790fdfd
commit 612fbe2055
4 changed files with 91 additions and 54 deletions
--- a/src/notebook_tools/pipeline.py
+++ b/src/notebook_tools/pipeline.py
@@ -6,19 +6,25 @@ Design goals:
 - Keep the pipeline readable and linear.
 - Return enough information (created ids) for the job API.
 - Avoid hidden side-effects (everything is passed in / returned).
+
+Upload strategy:
+- All per-page PDFs are uploaded to Paperless concurrently (each upload still polls until a doc id exists).
+- OCR (llama) runs one page at a time to respect VRAM.
+- Each page is PATCHed once that page's upload has finished and OCR for that page is done.
 """

 from __future__ import annotations

-from collections.abc import Awaitable, Callable
+import asyncio
 import io
 import logging
 import re
+from collections.abc import Awaitable, Callable

+from notebook_tools import pdf_utils
 from notebook_tools.llama_client import LlamaClient
 from notebook_tools.paperless_client import PaperlessClient
 from notebook_tools.settings import Settings
-from notebook_tools import pdf_utils
 from PIL import Image

 logger = logging.getLogger("notebook_tools.pipeline")
@@ -124,64 +130,86 @@ async def run_pipeline_for_paperless_document(
    if on_progress:
        await on_progress(0, total_pages)

+    # One small PDF per page (used by upload tasks).
+    page_pdfs = [pdf_utils.jpeg_to_pdf_bytes(jpeg_bytes=b) for b in jpegs]
+
+    # 3) Start all Paperless uploads in parallel (each task waits for ingest + document id).
+    conc = settings.paperless_upload_concurrency
+    upload_sem: asyncio.Semaphore | None = (
+        asyncio.Semaphore(conc) if conc and conc > 0 else None
+    )
+    logger.info(
+        "job_id=%s starting_parallel_uploads pages=%s concurrency=%s",
+        job_id,
+        total_pages,
+        conc if conc and conc > 0 else "unlimited",
+    )
+
+    async def _upload_one_page(idx_1based: int) -> int:
+        filename = f"job_{job_id}_page_{idx_1based}.pdf"
+        pdf_bytes = page_pdfs[idx_1based - 1]
+        logger.info("job_id=%s page=%s/%s upload_task_starting", job_id, idx_1based, total_pages)
+        if upload_sem is not None:
+            async with upload_sem:
+                return await paperless.upload_pdf(filename=filename, pdf_bytes=pdf_bytes)
+        return await paperless.upload_pdf(filename=filename, pdf_bytes=pdf_bytes)
+
+    upload_tasks: list[asyncio.Task[int]] = [
+        asyncio.create_task(_upload_one_page(i)) for i in range(1, total_pages + 1)
+    ]
+
    created_ids: list[int] = []

-    # 3) For each page: OCR -> convert to single-page PDF -> upload -> patch metadata.
-    for idx, jpeg_bytes in enumerate(jpegs, start=1):
-        logger.info("job_id=%s page=%s/%s starting", job_id, idx, total_pages)
-        # 3a) Page-number OCR (bottom corners only).
-        page_number = -1
-        for corner_jpeg in _crop_bottom_corner_jpegs(full_page_jpeg=jpeg_bytes):
-            candidate_text = await llama.ocr_jpeg(jpeg_bytes=corner_jpeg, prompt=PAGE_NUMBER_PROMPT)
-            parsed = _parse_page_number(candidate_text)
-            if parsed is not None:
-                # Only accept non-negative numbers, or -1. Anything else becomes unknown.
-                if parsed == -1 or parsed >= 0:
-                    page_number = parsed
-                    if page_number != -1:
-                        break
-        logger.info("job_id=%s page=%s detected_page_number=%s", job_id, idx, page_number)
+    # 4) OCR sequentially (VRAM), then await upload for that page + PATCH.
+    try:
+        for idx, jpeg_bytes in enumerate(jpegs, start=1):
+            logger.info("job_id=%s page=%s/%s ocr_starting", job_id, idx, total_pages)
+            # 4a) Page-number OCR (bottom corners only).
+            page_number = -1
+            for corner_jpeg in _crop_bottom_corner_jpegs(full_page_jpeg=jpeg_bytes):
+                candidate_text = await llama.ocr_jpeg(jpeg_bytes=corner_jpeg, prompt=PAGE_NUMBER_PROMPT)
+                parsed = _parse_page_number(candidate_text)
+                if parsed is not None:
+                    if parsed == -1 or parsed >= 0:
+                        page_number = parsed
+                        if page_number != -1:
+                            break
+            logger.info("job_id=%s page=%s detected_page_number=%s", job_id, idx, page_number)

-        # 3b) Full-page OCR for actual searchable text content.
-        logger.info("job_id=%s page=%s ocr_full_page", job_id, idx)
-        ocr_text = await llama.ocr_jpeg(jpeg_bytes=jpeg_bytes, prompt=ocr_prompt_override)
-        logger.info("job_id=%s page=%s ocr_chars=%s", job_id, idx, len(ocr_text))
+            # 4b) Full-page OCR for searchable content.
+            logger.info("job_id=%s page=%s ocr_full_page", job_id, idx)
+            ocr_text = await llama.ocr_jpeg(jpeg_bytes=jpeg_bytes, prompt=ocr_prompt_override)
+            logger.info("job_id=%s page=%s ocr_chars=%s", job_id, idx, len(ocr_text))

-        page_pdf = pdf_utils.jpeg_to_pdf_bytes(jpeg_bytes=jpeg_bytes)
-        logger.info("job_id=%s page=%s pdf_bytes=%s", job_id, idx, len(page_pdf))
+            logger.info("job_id=%s page=%s awaiting_upload_then_patch", job_id, idx)
+            new_id = await upload_tasks[idx - 1]
+            logger.info("job_id=%s page=%s uploaded_document_id=%s", job_id, idx, new_id)

-        # Upload the per-page PDF as a new Paperless document.
-        logger.info("job_id=%s page=%s uploading_to_paperless", job_id, idx)
-        new_id = await paperless.upload_pdf(filename=f"job_{job_id}_page_{idx}.pdf", pdf_bytes=page_pdf)
-        logger.info("job_id=%s page=%s uploaded_document_id=%s", job_id, idx, new_id)
+            custom_fields = [
+                {"field": settings.paperless_custom_field_notebook_id, "value": notebook_id},
+                {"field": settings.paperless_custom_field_notebook_page, "value": page_number},
+            ]

-        # Patch metadata:
-        # - content: OCR text so it becomes searchable in Paperless
-        # - custom_fields: notebook_id + notebook_page
-        # - document_type: per-page document type (Paperless id)
-        # - title
-        custom_fields = [
-            {"field": settings.paperless_custom_field_notebook_id, "value": notebook_id},
-            {"field": settings.paperless_custom_field_notebook_page, "value": page_number},
-        ]
+            title = f"Notebook {notebook_id} Page {page_number}"

-        # Per your request, title is always in this exact format.
-        # (We keep `title_prefix` in the API for now, but it is no longer used.)
-        title = f"Notebook {notebook_id} Page {page_number}"
+            logger.info("job_id=%s page=%s patching_document_id=%s", job_id, idx, new_id)
+            await paperless.patch_document(
+                document_id=new_id,
+                title=title,
+                content=ocr_text,
+                custom_fields=custom_fields,
+                document_type=settings.paperless_document_type_id,
+            )
+            logger.info("job_id=%s page=%s patched_document_id=%s", job_id, idx, new_id)

-        logger.info("job_id=%s page=%s patching_document_id=%s", job_id, idx, new_id)
-        await paperless.patch_document(
-            document_id=new_id,
-            title=title,
-            content=ocr_text,
-            custom_fields=custom_fields,
-            document_type=settings.paperless_document_type_id,
-        )
-        logger.info("job_id=%s page=%s patched_document_id=%s", job_id, idx, new_id)
-
-        created_ids.append(new_id)
-        if on_progress:
-            await on_progress(idx, total_pages)
+            created_ids.append(new_id)
+            if on_progress:
+                await on_progress(idx, total_pages)
+    except BaseException:
+        for t in upload_tasks:
+            if not t.done():
+                t.cancel()
+        await asyncio.gather(*upload_tasks, return_exceptions=True)
+        raise

    return {"created_document_ids": created_ids}
-