Enhance README with detailed service description, setup instructions, and example .env configuration for the FastAPI service that integrates with Paperless-ngx and llama.cpp for PDF processing.
This commit is contained in:
187
src/notebook_tools/pipeline.py
Normal file
187
src/notebook_tools/pipeline.py
Normal file
@@ -0,0 +1,187 @@
|
||||
"""OCR pipeline: Paperless PDF -> per-page OCR -> per-page PDFs -> Paperless uploads.
|
||||
|
||||
This module is where the "business logic" lives.
|
||||
|
||||
Design goals:
|
||||
- Keep the pipeline readable and linear.
|
||||
- Return enough information (created ids) for the job API.
|
||||
- Avoid hidden side-effects (everything is passed in / returned).
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from collections.abc import Awaitable, Callable
|
||||
import io
|
||||
import logging
|
||||
import re
|
||||
|
||||
from notebook_tools.llama_client import LlamaClient
|
||||
from notebook_tools.paperless_client import PaperlessClient
|
||||
from notebook_tools.settings import Settings
|
||||
from notebook_tools import pdf_utils
|
||||
from PIL import Image
|
||||
|
||||
# Module-level logger, namespaced under the package so log filtering /
# handler configuration can target this pipeline specifically.
logger = logging.getLogger("notebook_tools.pipeline")


# Prompt sent to the vision model when reading the handwritten page number
# from a cropped bottom-corner image (see _crop_bottom_corner_jpegs).
# The model is instructed to answer with a bare integer, or -1 when the
# number cannot be determined; _parse_page_number tolerates slight
# disobedience such as "Page 12".
PAGE_NUMBER_PROMPT = (
    "You are reading a handwritten page number in the bottom corner of a notebook page. "
    "Return ONLY the page number as an integer. If you cannot determine it, return -1. "
    "Do not output any other words."
)
|
||||
|
||||
|
||||
def _crop_bottom_corner_jpegs(
    *,
    full_page_jpeg: bytes,
    band_frac: float = 0.22,
    corner_frac: float = 0.35,
    jpeg_quality: int = 90,
) -> list[bytes]:
    """Return small JPEG crops from the bottom-left and bottom-right corners.

    Why crop?
    - It reduces visual clutter so the model focuses on the handwritten page number.
    - It reduces payload size, making OCR faster.

    The crop is based on percentages so it works across different page sizes.

    Args:
        full_page_jpeg: JPEG bytes of the full rendered page.
        band_frac: Fraction of the page height used for the bottom band.
        corner_frac: Fraction of the page width used for each corner crop.
        jpeg_quality: Pillow JPEG quality for the re-encoded crops.

    Returns:
        Two JPEG byte strings: [bottom-left crop, bottom-right crop].
    """

    img = Image.open(io.BytesIO(full_page_jpeg)).convert("RGB")
    w, h = img.size

    # Bottom band (e.g. last ~22% of page height). Clamp to at least one
    # pixel so degenerate (tiny) pages still yield a valid, non-empty crop —
    # int(h * band_frac) would be 0 for very small h and produce an empty image.
    band_h = max(1, int(h * band_frac))
    y0 = max(0, h - band_h)

    # Left/right corner width (e.g. ~35% of page width), clamped likewise.
    corner_w = max(1, int(w * corner_frac))

    crops = [
        img.crop((0, y0, corner_w, h)),  # bottom-left
        img.crop((w - corner_w, y0, w, h)),  # bottom-right
    ]

    out: list[bytes] = []
    for crop in crops:
        buf = io.BytesIO()
        crop.save(buf, format="JPEG", quality=jpeg_quality, optimize=True)
        out.append(buf.getvalue())
    return out
|
||||
|
||||
|
||||
def _parse_page_number(text: str) -> int | None:
|
||||
"""Try to parse an integer page number from a model response.
|
||||
|
||||
We accept:
|
||||
- '12'
|
||||
- 'Page 12' (if the model disobeys slightly)
|
||||
- '-1'
|
||||
"""
|
||||
|
||||
m = re.search(r"-?\d+", text)
|
||||
if not m:
|
||||
return None
|
||||
try:
|
||||
return int(m.group(0))
|
||||
except ValueError:
|
||||
return None
|
||||
|
||||
|
||||
async def run_pipeline_for_paperless_document(
    *,
    settings: Settings,
    paperless_document_id: int,
    notebook_id: str,
    job_id: str,
    on_progress: Callable[[int, int], Awaitable[None]] | None,
    ocr_prompt_override: str | None,
    title_prefix: str | None,
) -> dict[str, list[int]]:
    """Run the full OCR pipeline for one Paperless document id.

    Steps (linear, in order):
        1. Download the source PDF from Paperless.
        2. Render each PDF page to a JPEG.
        3. Per page: OCR the page number (corner crops), OCR the full page,
           re-wrap the page JPEG as a single-page PDF, upload it as a NEW
           Paperless document, then patch its metadata (title, content,
           custom fields, document type).

    Args:
        settings: Application settings (Paperless/llama endpoints, tokens,
            timeouts, render DPI, custom-field ids).
        paperless_document_id: Id of the source document in Paperless.
        notebook_id: Identifier stored on each created page document and
            embedded in its title.
        job_id: Job identifier used for logging and upload filenames.
        on_progress: Optional async callback invoked as
            ``await on_progress(pages_done, total_pages)`` — once with
            (0, total) before the loop and once per completed page.
        ocr_prompt_override: Prompt passed to the full-page OCR call.
            NOTE(review): passed through even when None — presumably the
            llama client substitutes a default prompt; confirm.
        title_prefix: Currently unused (kept for API compatibility).

    Returns:
        {"created_document_ids": [...]} where each id is a NEW Paperless document
        (one per page).
    """

    # Clients are constructed per call (no hidden shared state); timeouts and
    # polling intervals come from settings.
    paperless = PaperlessClient(
        base_url=str(settings.paperless_base_url),
        token=settings.paperless_token,
        task_timeout_s=settings.paperless_task_timeout_s,
        task_poll_interval_s=settings.paperless_task_poll_interval_s,
    )
    llama = LlamaClient(
        base_url=str(settings.llama_base_url),
        model=settings.llama_model,
        temperature=settings.ocr_temperature,
        max_tokens=settings.ocr_max_tokens,
    )

    # 1) Download the source PDF.
    logger.info("job_id=%s downloading paperless_document_id=%s", job_id, paperless_document_id)
    pdf_bytes = await paperless.download_document_pdf(document_id=paperless_document_id)
    logger.info("job_id=%s downloaded_pdf_bytes=%s", job_id, len(pdf_bytes))

    # 2) Render the PDF pages as JPEG images.
    logger.info("job_id=%s rendering_pages dpi=%s", job_id, settings.render_dpi)
    jpegs = pdf_utils.render_pdf_to_jpegs(pdf_bytes=pdf_bytes, dpi=settings.render_dpi)
    total_pages = len(jpegs)
    logger.info("job_id=%s rendered_pages=%s", job_id, total_pages)
    # Report 0/N up-front so callers can show progress before page 1 finishes.
    if on_progress:
        await on_progress(0, total_pages)

    created_ids: list[int] = []

    # 3) For each page: OCR -> convert to single-page PDF -> upload -> patch metadata.
    for idx, jpeg_bytes in enumerate(jpegs, start=1):
        logger.info("job_id=%s page=%s/%s starting", job_id, idx, total_pages)
        # 3a) Page-number OCR (bottom corners only). -1 means "unknown":
        # both our sentinel and the model's instructed "cannot determine" answer.
        page_number = -1
        for corner_jpeg in _crop_bottom_corner_jpegs(full_page_jpeg=jpeg_bytes):
            candidate_text = await llama.ocr_jpeg(jpeg_bytes=corner_jpeg, prompt=PAGE_NUMBER_PROMPT)
            parsed = _parse_page_number(candidate_text)
            if parsed is not None:
                # Only accept non-negative numbers, or -1. Anything else becomes unknown.
                if parsed == -1 or parsed >= 0:
                    page_number = parsed
            # Stop at the first corner that yields a usable page number.
            if page_number != -1:
                break
        logger.info("job_id=%s page=%s detected_page_number=%s", job_id, idx, page_number)

        # 3b) Full-page OCR for actual searchable text content.
        logger.info("job_id=%s page=%s ocr_full_page", job_id, idx)
        ocr_text = await llama.ocr_jpeg(jpeg_bytes=jpeg_bytes, prompt=ocr_prompt_override)
        logger.info("job_id=%s page=%s ocr_chars=%s", job_id, idx, len(ocr_text))

        # Re-wrap the rendered JPEG as a standalone single-page PDF.
        page_pdf = pdf_utils.jpeg_to_pdf_bytes(jpeg_bytes=jpeg_bytes)
        logger.info("job_id=%s page=%s pdf_bytes=%s", job_id, idx, len(page_pdf))

        # Upload the per-page PDF as a new Paperless document.
        logger.info("job_id=%s page=%s uploading_to_paperless", job_id, idx)
        new_id = await paperless.upload_pdf(filename=f"job_{job_id}_page_{idx}.pdf", pdf_bytes=page_pdf)
        logger.info("job_id=%s page=%s uploaded_document_id=%s", job_id, idx, new_id)

        # Patch metadata:
        # - content: OCR text so it becomes searchable in Paperless
        # - custom_fields: notebook_id + notebook_page
        # - document_type: per-page document type (Paperless id)
        # - title
        custom_fields = [
            {"field": settings.paperless_custom_field_notebook_id, "value": notebook_id},
            {"field": settings.paperless_custom_field_notebook_page, "value": page_number},
        ]

        # Per your request, title is always in this exact format.
        # (We keep `title_prefix` in the API for now, but it is no longer used.)
        # NOTE: an undetected page number yields "... Page -1" in the title.
        title = f"Notebook {notebook_id} Page {page_number}"

        logger.info("job_id=%s page=%s patching_document_id=%s", job_id, idx, new_id)
        await paperless.patch_document(
            document_id=new_id,
            title=title,
            content=ocr_text,
            custom_fields=custom_fields,
            document_type=settings.paperless_document_type_id,
        )
        logger.info("job_id=%s page=%s patched_document_id=%s", job_id, idx, new_id)

        # Record the new id only after the patch succeeded, then report progress.
        created_ids.append(new_id)
        if on_progress:
            await on_progress(idx, total_pages)

    return {"created_document_ids": created_ids}
|
||||
|
||||
Reference in New Issue
Block a user