Enhance README with detailed service description, setup instructions, and example .env configuration for the FastAPI service that integrates with Paperless-ngx and llama.cpp for PDF processing.

This commit is contained in:
2026-03-31 14:29:50 -05:00
parent facf6b26f0
commit 9b1705d82b
7 changed files with 699 additions and 0 deletions

19
.env.example Normal file
View File

@@ -0,0 +1,19 @@
PAPERLESS_BASE_URL="https://paperless.example.com"
PAPERLESS_TOKEN="paste-token-here"
LLAMA_BASE_URL="http://127.0.0.1:9292"
LLAMA_MODEL="ggml-model-q4_k_m"
PAPERLESS_CUSTOM_FIELD_NOTEBOOK_ID=1
PAPERLESS_CUSTOM_FIELD_NOTEBOOK_PAGE=2
# Document type id for each uploaded per-page document
PAPERLESS_DOCUMENT_TYPE_ID=3
PAPERLESS_TASK_TIMEOUT_S=600
PAPERLESS_TASK_POLL_INTERVAL_S=5.0
RENDER_DPI=200
OCR_MAX_TOKENS=1024
OCR_TEMPERATURE=0.0

View File

@@ -1,2 +1,56 @@
# notebook-tools
FastAPI service that:
- downloads PDFs from Paperless-ngx
- splits them into pages (JPEG)
- OCRs each page via your llama.cpp OpenAI-compatible endpoint
- converts each page back into a single-page PDF
- uploads **one Paperless document per page**
- patches each uploaded document with:
- `content` = OCR text
- custom fields `notebook_id` (field id 1) and `notebook_page` (field id 2)
- `document_type` = Paperless document type id (default **3**, configurable)
## Setup
Install deps:
```bash
uv sync
```
Create a `.env` file (example below) and **do not commit it**.
## Run locally
```bash
uv run uvicorn notebook_tools.api:app --reload --host 0.0.0.0 --port 8080
```
Then open the docs at:
- `http://127.0.0.1:8080/docs` (same machine)
- `http://<your-lan-ip>:8080/docs` (other machines on your network)
If other machines still can't connect, check your macOS firewall and any router/network rules.
## Example `.env`
```bash
PAPERLESS_BASE_URL="https://paperless.example.com"
PAPERLESS_TOKEN="paste-token-here"
LLAMA_BASE_URL="http://127.0.0.1:9292"
LLAMA_MODEL="ggml-model-q4_k_m"
# Custom field ids in Paperless
PAPERLESS_CUSTOM_FIELD_NOTEBOOK_ID=1
PAPERLESS_CUSTOM_FIELD_NOTEBOOK_PAGE=2
PAPERLESS_DOCUMENT_TYPE_ID=3
# Rendering / OCR knobs
RENDER_DPI=200
OCR_MAX_TOKENS=1024
OCR_TEMPERATURE=0.0
```

View File

@@ -0,0 +1,278 @@
"""Paperless-ngx REST API client.
We keep this client *thin*:
- It knows how to talk to Paperless (URLs, auth header, endpoints).
- It does NOT know about OCR or PDFs beyond "upload bytes".
This separation makes it easier to test and to swap out behavior later.
"""
from __future__ import annotations
import asyncio
import json
import logging
import re
from typing import Any
import httpx
from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_exponential_jitter
logger = logging.getLogger("notebook_tools.paperless")
class PaperlessError(RuntimeError):
    """Signals a non-2xx (or otherwise unusable) response from the Paperless API."""
def _auth_headers(token: str) -> dict[str, str]:
# Paperless uses token auth in the form: Authorization: Token <token>
return {"Authorization": f"Token {token}"}
def _raise_for_status(resp: httpx.Response) -> None:
    """Raise :class:`PaperlessError` for any non-2xx response.

    httpx has ``resp.raise_for_status()``, but Paperless error bodies (often
    JSON) usually say exactly what went wrong, so the body is included in the
    exception message instead.
    """
    status = resp.status_code
    if status < 200 or status >= 300:
        raise PaperlessError(f"Paperless API {status}: {resp.text}")
def _document_id_from_task_payload(item: dict[str, Any]) -> int | None:
"""Extract created document id from a Paperless task object.
Paperless 2.x often returns:
- ``related_document`` as a string ``\"10\"`` (not an int)
- ``result`` as a string like ``\"Success. New document id 10 created\"`` (not a dict)
We must handle both, or polling never completes.
"""
rd = item.get("related_document")
if rd is not None:
if isinstance(rd, int):
return rd
if isinstance(rd, str) and rd.strip().isdigit():
return int(rd.strip())
for key in ("document_id", "document"):
val = item.get(key)
if isinstance(val, int):
return val
if isinstance(val, str) and val.strip().isdigit():
return int(val.strip())
result = item.get("result")
if isinstance(result, dict):
nested = result.get("document_id") or result.get("document")
if isinstance(nested, int):
return nested
if isinstance(nested, str) and nested.strip().isdigit():
return int(nested.strip())
elif isinstance(result, str):
# e.g. "Success. New document id 10 created"
m = re.search(r"New document id\s+(\d+)", result, flags=re.IGNORECASE)
if m:
return int(m.group(1))
return None
class PaperlessClient:
    """Thin async client for the Paperless-ngx REST API.

    Responsibilities (and nothing more):
    - build URLs and the token auth header
    - download a document's original PDF
    - upload a PDF and wait for the async consumption task to yield an id
    - PATCH document metadata (title / content / custom_fields / document_type)

    Transient network failures (timeouts, connection errors) are retried with
    tenacity on the idempotent operations (download, patch).
    """

    def __init__(
        self,
        *,
        base_url: str,
        token: str,
        timeout_s: float = 60.0,
        task_timeout_s: int = 600,
        task_poll_interval_s: float = 5.0,
    ) -> None:
        # Strip any trailing slash once so _url() can blindly concatenate paths.
        self._base_url = base_url.rstrip("/")
        self._token = token
        # Single httpx timeout applied to every request this client makes.
        self._timeout = httpx.Timeout(timeout_s)
        # Time budget / cadence for polling /api/tasks/ after an upload.
        self._task_timeout_s = task_timeout_s
        self._task_poll_interval_s = task_poll_interval_s

    def _url(self, path: str) -> str:
        # `path` is expected to start with a slash, e.g. "/api/documents/".
        return f"{self._base_url}{path}"

    @retry(
        retry=retry_if_exception_type((httpx.TimeoutException, httpx.NetworkError)),
        wait=wait_exponential_jitter(initial=0.5, max=5.0),
        stop=stop_after_attempt(3),
        reraise=True,
    )
    async def download_document_pdf(self, *, document_id: int) -> bytes:
        """Download the original PDF bytes for a Paperless document.

        Raises:
            PaperlessError: on a non-2xx response.
        """
        async with httpx.AsyncClient(timeout=self._timeout, headers=_auth_headers(self._token)) as client:
            # Common Paperless endpoint:
            #   GET /api/documents/{id}/download/
            logger.info("Downloading document_id=%s", document_id)
            resp = await client.get(self._url(f"/api/documents/{document_id}/download/"))
            _raise_for_status(resp)
            return resp.content

    async def upload_pdf(self, *, filename: str, pdf_bytes: bytes) -> int:
        """Upload a new document (PDF) and return its new document id.

        NOTE: when Paperless answers with a task id instead of a document id,
        this call blocks (polling) for up to ``task_timeout_s`` seconds.

        Raises:
            PaperlessError: on a non-2xx response, an unrecognized response
                shape, or when the consumption task never yields an id.
        """
        async with httpx.AsyncClient(timeout=self._timeout, headers=_auth_headers(self._token)) as client:
            files = {
                "document": (filename, pdf_bytes, "application/pdf"),
            }
            # POST /api/documents/post_document/ returns JSON about the created document/task.
            logger.info("Uploading PDF filename=%s bytes=%s", filename, len(pdf_bytes))
            resp = await client.post(self._url("/api/documents/post_document/"), files=files)
            _raise_for_status(resp)
            data: Any = resp.json()
            # Paperless has had a few response shapes across versions.
            # We defensively handle the most common ones.
            if isinstance(data, dict):
                if "document" in data and isinstance(data["document"], int):
                    return int(data["document"])
                if "id" in data and isinstance(data["id"], int):
                    return int(data["id"])
                # Some versions return {"task_id": "<uuid>"}.
                if "task_id" in data and isinstance(data["task_id"], str):
                    logger.info("Upload returned task_id=%s", data["task_id"])
                    return await self._wait_for_task_document_id(client=client, task_id=data["task_id"])
            # Other versions return the task id directly as a JSON string: "<uuid>"
            if isinstance(data, str):
                logger.info("Upload returned task_id=%s", data)
                return await self._wait_for_task_document_id(client=client, task_id=data)
            raise PaperlessError(f"Unexpected upload response: {json.dumps(data)[:500]}")

    async def _wait_for_task_document_id(self, *, client: httpx.AsyncClient, task_id: str) -> int:
        """Poll Paperless' tasks endpoint until it yields a created document id.

        Why polling is needed:
        - `post_document` triggers async consumption in Paperless.
        - Many Paperless versions return a Celery task id (UUID) instead of a document id.

        This method makes the rest of the pipeline "feel" synchronous: upload_pdf()
        still returns a document id, it just waits for Paperless to finish processing.

        Raises:
            PaperlessError: if no document id appears within the time budget.
        """
        # This endpoint is documented/mentioned in Paperless discussions and commits:
        #   /api/tasks/?task_id=<uuid>
        last_payload: Any = None
        # We poll until a time budget is exceeded, because Paperless ingestion time varies a lot.
        # The 0.1 floor on the interval guards against a zero/negative divisor.
        max_attempts = max(1, int(self._task_timeout_s / max(self._task_poll_interval_s, 0.1)))
        for attempt in range(max_attempts):
            # INFO: every 5th poll + first. DEBUG: every poll (no duplicate INFO line).
            if logger.isEnabledFor(logging.DEBUG):
                logger.debug(
                    "Polling task_id=%s attempt=%s/%s (interval=%.1fs timeout=%ss)",
                    task_id,
                    attempt + 1,
                    max_attempts,
                    self._task_poll_interval_s,
                    self._task_timeout_s,
                )
            elif attempt == 0 or (attempt + 1) % 5 == 0:
                logger.info(
                    "Polling task_id=%s attempt=%s/%s (interval=%.1fs timeout=%ss)",
                    task_id,
                    attempt + 1,
                    max_attempts,
                    self._task_poll_interval_s,
                    self._task_timeout_s,
                )
            resp = await client.get(self._url("/api/tasks/"), params={"task_id": task_id})
            _raise_for_status(resp)
            last_payload = resp.json()
            # We expect a list (paginated or not). Handle both.
            items: list[dict[str, Any]] = []
            if isinstance(last_payload, dict) and "results" in last_payload and isinstance(last_payload["results"], list):
                items = [x for x in last_payload["results"] if isinstance(x, dict)]
            elif isinstance(last_payload, list):
                items = [x for x in last_payload if isinstance(x, dict)]
            # Find a matching task and extract document id.
            for item in items:
                # Match by Celery UUID (not the numeric DB id).
                if item.get("task_id") != task_id:
                    continue
                doc_id = _document_id_from_task_payload(item)
                if doc_id is not None:
                    logger.info("Task task_id=%s produced document_id=%s", task_id, doc_id)
                    return doc_id
                # If we have a numeric task pk, fetch detail — list view can lag; trailing slash required.
                numeric_task_pk = item.get("id")
                if isinstance(numeric_task_pk, int):
                    detail_resp = await client.get(self._url(f"/api/tasks/{numeric_task_pk}/"))
                    _raise_for_status(detail_resp)
                    detail = detail_resp.json()
                    if isinstance(detail, dict):
                        doc_id = _document_id_from_task_payload(detail)
                        if doc_id is not None:
                            logger.info("Task task_id=%s (detail) produced document_id=%s", task_id, doc_id)
                            return doc_id
            # Not ready yet; sleep a fixed interval (configurable).
            await asyncio.sleep(self._task_poll_interval_s)
        raise PaperlessError(
            f"Paperless upload task did not yield document id in time. task_id={task_id} last={json.dumps(last_payload)[:500]}"
        )

    @retry(
        retry=retry_if_exception_type((httpx.TimeoutException, httpx.NetworkError)),
        wait=wait_exponential_jitter(initial=0.5, max=5.0),
        stop=stop_after_attempt(3),
        reraise=True,
    )
    async def patch_document(
        self,
        *,
        document_id: int,
        title: str | None = None,
        content: str | None = None,
        custom_fields: list[dict[str, Any]] | None = None,
        document_type: int | None = None,
    ) -> None:
        """Update metadata on a document.

        Only the non-None keyword arguments are included in the PATCH payload,
        so callers can update fields selectively. For our use-case we mainly set:
        - title: a human-friendly per-page title
        - content: OCR text (so Paperless search works)
        - custom_fields: notebook_id + notebook_page
        - document_type: Paperless document type id (primary key)

        Raises:
            PaperlessError: on a non-2xx response.
        """
        payload: dict[str, Any] = {}
        if title is not None:
            payload["title"] = title
        if content is not None:
            payload["content"] = content
        if custom_fields is not None:
            payload["custom_fields"] = custom_fields
        if document_type is not None:
            payload["document_type"] = document_type
        async with httpx.AsyncClient(timeout=self._timeout, headers=_auth_headers(self._token)) as client:
            logger.info(
                "Patching document_id=%s set_title=%s set_content=%s set_custom_fields=%s set_document_type=%s",
                document_id,
                title is not None,
                content is not None,
                custom_fields is not None,
                document_type is not None,
            )
            resp = await client.patch(self._url(f"/api/documents/{document_id}/"), json=payload)
            _raise_for_status(resp)

View File

@@ -0,0 +1,187 @@
"""OCR pipeline: Paperless PDF -> per-page OCR -> per-page PDFs -> Paperless uploads.
This module is where the "business logic" lives.
Design goals:
- Keep the pipeline readable and linear.
- Return enough information (created ids) for the job API.
- Avoid hidden side-effects (everything is passed in / returned).
"""
from __future__ import annotations
from collections.abc import Awaitable, Callable
import io
import logging
import re
from notebook_tools.llama_client import LlamaClient
from notebook_tools.paperless_client import PaperlessClient
from notebook_tools.settings import Settings
from notebook_tools import pdf_utils
from PIL import Image
logger = logging.getLogger("notebook_tools.pipeline")
PAGE_NUMBER_PROMPT = (
"You are reading a handwritten page number in the bottom corner of a notebook page. "
"Return ONLY the page number as an integer. If you cannot determine it, return -1. "
"Do not output any other words."
)
def _crop_bottom_corner_jpegs(*, full_page_jpeg: bytes) -> list[bytes]:
    """Produce two small JPEG crops: the bottom-left and bottom-right corners.

    Why crop?
    - It reduces visual clutter so the model focuses on the handwritten page number.
    - It reduces payload size, making OCR faster.

    Geometry is percentage-based, so it works across different page sizes.
    """
    page = Image.open(io.BytesIO(full_page_jpeg)).convert("RGB")
    width, height = page.size
    # Bottom band: the last ~22% of the page height.
    band_top = max(0, height - int(height * 0.22))
    # Each corner crop spans 35% of the page width.
    corner_width = int(width * 0.35)
    boxes = (
        (0, band_top, corner_width, height),  # bottom-left
        (width - corner_width, band_top, width, height),  # bottom-right
    )
    encoded: list[bytes] = []
    for box in boxes:
        buf = io.BytesIO()
        page.crop(box).save(buf, format="JPEG", quality=90, optimize=True)
        encoded.append(buf.getvalue())
    return encoded
def _parse_page_number(text: str) -> int | None:
"""Try to parse an integer page number from a model response.
We accept:
- '12'
- 'Page 12' (if the model disobeys slightly)
- '-1'
"""
m = re.search(r"-?\d+", text)
if not m:
return None
try:
return int(m.group(0))
except ValueError:
return None
async def run_pipeline_for_paperless_document(
    *,
    settings: Settings,
    paperless_document_id: int,
    notebook_id: str,
    job_id: str,
    on_progress: Callable[[int, int], Awaitable[None]] | None,
    ocr_prompt_override: str | None,
    title_prefix: str | None,
) -> dict[str, list[int]]:
    """Run the full OCR pipeline for one Paperless document id.

    Steps: download PDF -> render pages to JPEG -> per page: page-number OCR
    (bottom corners), full-page OCR, JPEG->PDF, upload, PATCH metadata.

    Args:
        settings: Runtime configuration (URLs, token, custom field ids, knobs).
        paperless_document_id: Source Paperless document to download.
        notebook_id: Value stored in the notebook_id custom field and the title.
        job_id: Correlates log lines and generated upload filenames.
        on_progress: Optional async callback invoked as (pages_done, total_pages);
            called once with (0, total) before the per-page loop starts.
        ocr_prompt_override: Optional prompt passed to the full-page OCR call.
        title_prefix: Accepted for API compatibility; currently unused (the
            title format is fixed, see below).

    Returns:
        {"created_document_ids": [...]} where each id is a NEW Paperless document
        (one per page).
    """
    paperless = PaperlessClient(
        base_url=str(settings.paperless_base_url),
        token=settings.paperless_token,
        task_timeout_s=settings.paperless_task_timeout_s,
        task_poll_interval_s=settings.paperless_task_poll_interval_s,
    )
    llama = LlamaClient(
        base_url=str(settings.llama_base_url),
        model=settings.llama_model,
        temperature=settings.ocr_temperature,
        max_tokens=settings.ocr_max_tokens,
    )
    # 1) Download the source PDF.
    logger.info("job_id=%s downloading paperless_document_id=%s", job_id, paperless_document_id)
    pdf_bytes = await paperless.download_document_pdf(document_id=paperless_document_id)
    logger.info("job_id=%s downloaded_pdf_bytes=%s", job_id, len(pdf_bytes))
    # 2) Render the PDF pages as JPEG images.
    logger.info("job_id=%s rendering_pages dpi=%s", job_id, settings.render_dpi)
    jpegs = pdf_utils.render_pdf_to_jpegs(pdf_bytes=pdf_bytes, dpi=settings.render_dpi)
    total_pages = len(jpegs)
    logger.info("job_id=%s rendered_pages=%s", job_id, total_pages)
    if on_progress:
        await on_progress(0, total_pages)
    created_ids: list[int] = []
    # 3) For each page: OCR -> convert to single-page PDF -> upload -> patch metadata.
    for idx, jpeg_bytes in enumerate(jpegs, start=1):
        logger.info("job_id=%s page=%s/%s starting", job_id, idx, total_pages)
        # 3a) Page-number OCR (bottom corners only). -1 means "unknown".
        page_number = -1
        for corner_jpeg in _crop_bottom_corner_jpegs(full_page_jpeg=jpeg_bytes):
            candidate_text = await llama.ocr_jpeg(jpeg_bytes=corner_jpeg, prompt=PAGE_NUMBER_PROMPT)
            parsed = _parse_page_number(candidate_text)
            if parsed is not None:
                # Only accept non-negative numbers, or -1. Anything else becomes unknown.
                # NOTE(review): `parsed == -1 or parsed >= 0` rejects only values < -1;
                # confirm that is the intended filter.
                if parsed == -1 or parsed >= 0:
                    page_number = parsed
            # Stop at the first corner that yields a real page number.
            if page_number != -1:
                break
        logger.info("job_id=%s page=%s detected_page_number=%s", job_id, idx, page_number)
        # 3b) Full-page OCR for actual searchable text content.
        logger.info("job_id=%s page=%s ocr_full_page", job_id, idx)
        ocr_text = await llama.ocr_jpeg(jpeg_bytes=jpeg_bytes, prompt=ocr_prompt_override)
        logger.info("job_id=%s page=%s ocr_chars=%s", job_id, idx, len(ocr_text))
        page_pdf = pdf_utils.jpeg_to_pdf_bytes(jpeg_bytes=jpeg_bytes)
        logger.info("job_id=%s page=%s pdf_bytes=%s", job_id, idx, len(page_pdf))
        # Upload the per-page PDF as a new Paperless document.
        logger.info("job_id=%s page=%s uploading_to_paperless", job_id, idx)
        new_id = await paperless.upload_pdf(filename=f"job_{job_id}_page_{idx}.pdf", pdf_bytes=page_pdf)
        logger.info("job_id=%s page=%s uploaded_document_id=%s", job_id, idx, new_id)
        # Patch metadata:
        # - content: OCR text so it becomes searchable in Paperless
        # - custom_fields: notebook_id + notebook_page
        # - document_type: per-page document type (Paperless id)
        # - title
        custom_fields = [
            {"field": settings.paperless_custom_field_notebook_id, "value": notebook_id},
            {"field": settings.paperless_custom_field_notebook_page, "value": page_number},
        ]
        # Per your request, title is always in this exact format.
        # (We keep `title_prefix` in the API for now, but it is no longer used.)
        title = f"Notebook {notebook_id} Page {page_number}"
        logger.info("job_id=%s page=%s patching_document_id=%s", job_id, idx, new_id)
        await paperless.patch_document(
            document_id=new_id,
            title=title,
            content=ocr_text,
            custom_fields=custom_fields,
            document_type=settings.paperless_document_type_id,
        )
        logger.info("job_id=%s page=%s patched_document_id=%s", job_id, idx, new_id)
        created_ids.append(new_id)
        if on_progress:
            await on_progress(idx, total_pages)
    return {"created_document_ids": created_ids}

View File

@@ -0,0 +1,60 @@
"""App configuration loaded from environment variables.
Why this module exists:
- It's common to keep secrets (tokens) and instance-specific URLs out of code.
- Using one Settings object makes it easy to pass config to clients/pipeline.
"""
from __future__ import annotations
from pydantic import AnyHttpUrl, Field
from pydantic_settings import BaseSettings, SettingsConfigDict
class Settings(BaseSettings):
    """Runtime settings for the service, loaded from environment variables.

    Each field declares an UPPER_SNAKE_CASE env-var alias; defaults apply when
    the variable is unset. Required fields (no default) fail fast at startup.

    Tip: Put these in a local `.env` file while developing, but never commit secrets.
    """

    # Read `.env` if present; silently ignore unrelated env vars (extra="ignore").
    model_config = SettingsConfigDict(env_file=".env", env_file_encoding="utf-8", extra="ignore")
    # Paperless-ngx (required: URL + API token)
    paperless_base_url: AnyHttpUrl = Field(..., alias="PAPERLESS_BASE_URL")
    paperless_token: str = Field(..., alias="PAPERLESS_TOKEN")
    # llama.cpp (OpenAI-compatible); URL is required, model name has a default
    llama_base_url: AnyHttpUrl = Field(..., alias="LLAMA_BASE_URL")
    llama_model: str = Field("ggml-model-q4_k_m", alias="LLAMA_MODEL")
    # Custom field ids you already created in Paperless
    paperless_custom_field_notebook_id: int = Field(1, alias="PAPERLESS_CUSTOM_FIELD_NOTEBOOK_ID")
    paperless_custom_field_notebook_page: int = Field(2, alias="PAPERLESS_CUSTOM_FIELD_NOTEBOOK_PAGE")
    # Document type id applied to each per-page upload (PATCH document)
    paperless_document_type_id: int = Field(3, alias="PAPERLESS_DOCUMENT_TYPE_ID")
    # Rendering / OCR knobs
    render_dpi: int = Field(200, alias="RENDER_DPI")
    ocr_max_tokens: int = Field(1024, alias="OCR_MAX_TOKENS")
    ocr_temperature: float = Field(0.0, alias="OCR_TEMPERATURE")
    # Paperless async consumption polling
    #
    # Paperless may take minutes to ingest uploads depending on server load and OCR settings.
    # These settings control how long we wait for `/api/tasks/?task_id=...` to produce a document id.
    paperless_task_timeout_s: int = Field(600, alias="PAPERLESS_TASK_TIMEOUT_S")
    paperless_task_poll_interval_s: float = Field(5.0, alias="PAPERLESS_TASK_POLL_INTERVAL_S")
    # Logging
    log_level: str = Field("INFO", alias="LOG_LEVEL")
def get_settings() -> Settings:
    """Build a fresh Settings instance from the environment / `.env` file.

    NOTE(review): this is NOT cached — every call re-reads the environment.
    In FastAPI we typically call it once at startup so environment errors
    fail fast; add `functools.lru_cache` here if per-request reuse is needed.

    Raises:
        pydantic.ValidationError: if a required variable is missing or invalid.
    """
    return Settings()

View File

@@ -0,0 +1,28 @@
from __future__ import annotations
from notebook_tools.paperless_client import _document_id_from_task_payload
def test_related_document_string() -> None:
    """A string `related_document` (e.g. "10") must be coerced to an int."""
    payload = {
        "related_document": "10",
        "result": "Success. New document id 10 created",
    }
    assert _document_id_from_task_payload(payload) == 10
def test_result_only_string_message() -> None:
    """With no related_document, the id is parsed out of the result message."""
    payload = {"related_document": None, "result": "Success. New document id 42 created"}
    assert _document_id_from_task_payload(payload) == 42


test_result_string_only = test_result_only_string_message
def test_related_document_int() -> None:
    """An integer `related_document` is returned as-is."""
    payload = {"related_document": 7}
    assert _document_id_from_task_payload(payload) == 7

View File

@@ -0,0 +1,73 @@
from __future__ import annotations
import json
import respx
from httpx import Response
from notebook_tools.pipeline import run_pipeline_for_paperless_document
from notebook_tools.settings import Settings
@respx.mock
async def test_pipeline_smoke_single_page(monkeypatch) -> None:
    """Smoke test: one fake page flows download -> OCR -> upload -> patch.

    All HTTP traffic (Paperless + llama.cpp) is mocked with respx; PDF
    rendering helpers are monkeypatched so no real image/PDF work happens.
    """
    # We don't want to depend on real PDF rendering in this test.
    # Instead we monkeypatch the helpers to produce a controlled "one page" output.
    from notebook_tools import pdf_utils

    monkeypatch.setattr(pdf_utils, "render_pdf_to_jpegs", lambda *, pdf_bytes, dpi: [b"jpegbytes"])
    monkeypatch.setattr(pdf_utils, "jpeg_to_pdf_bytes", lambda *, jpeg_bytes: b"%PDF-1.4 fake")
    # Page-number OCR now crops the JPEG; since our fake jpeg bytes aren't a real image,
    # we bypass cropping in this smoke test and rely on the default "-1" fallback.
    import notebook_tools.pipeline as pipeline

    monkeypatch.setattr(pipeline, "_crop_bottom_corner_jpegs", lambda *, full_page_jpeg: [])
    # Mock Paperless download/upload/patch endpoints.
    respx.get("https://paperless.local/api/documents/123/download/").mock(
        return_value=Response(200, content=b"%PDF-1.4 source")
    )
    # post_document responds with a bare JSON string task id (one of the
    # real-world response shapes the client must handle).
    respx.post("https://paperless.local/api/documents/post_document/").mock(
        return_value=Response(200, json="34de1527-aade-499b-8a06-dd0174c9f233")
    )
    # When post_document returns a task id, the client polls /api/tasks/?task_id=<uuid>
    respx.get("https://paperless.local/api/tasks/").mock(
        return_value=Response(200, json=[{"id": 31, "task_id": "34de1527-aade-499b-8a06-dd0174c9f233", "related_document": 999}])
    )
    patch_route = respx.patch("https://paperless.local/api/documents/999/").mock(
        return_value=Response(200, json={})
    )
    # Mock llama OCR
    respx.post("http://llama.local/v1/chat/completions").mock(
        return_value=Response(200, json={"choices": [{"message": {"content": "OCR TEXT"}}]})
    )
    settings = Settings(
        PAPERLESS_BASE_URL="https://paperless.local",
        PAPERLESS_TOKEN="t",
        LLAMA_BASE_URL="http://llama.local",
        LLAMA_MODEL="m",
    )
    out = await run_pipeline_for_paperless_document(
        settings=settings,
        paperless_document_id=123,
        notebook_id="nb1",
        job_id="job1",
        on_progress=None,
        ocr_prompt_override=None,
        title_prefix="Notebook nb1",
    )
    assert out["created_document_ids"] == [999]
    assert patch_route.called
    # Inspect the PATCH payload: OCR text, fixed title format (page -1 since
    # corner OCR was bypassed), custom field ids/values, and document type.
    sent_patch = json.loads(patch_route.calls[0].request.content.decode("utf-8"))
    assert sent_patch["content"] == "OCR TEXT"
    assert sent_patch["title"] == "Notebook nb1 Page -1"
    assert sent_patch["custom_fields"][0]["field"] == 1
    assert sent_patch["custom_fields"][0]["value"] == "nb1"
    assert sent_patch["custom_fields"][1]["field"] == 2
    assert sent_patch["custom_fields"][1]["value"] == -1
    assert sent_patch["document_type"] == 3