From 9b1705d82b281279c735d16bfd32cce13657dba4 Mon Sep 17 00:00:00 2001 From: Daniel Henry Date: Tue, 31 Mar 2026 14:29:50 -0500 Subject: [PATCH] Enhance README with detailed service description, setup instructions, and example .env configuration for the FastAPI service that integrates with Paperless-ngx and llama.cpp for PDF processing. --- .env.example | 19 ++ README.md | 54 +++++ src/notebook_tools/paperless_client.py | 278 +++++++++++++++++++++++++ src/notebook_tools/pipeline.py | 187 +++++++++++++++++ src/notebook_tools/settings.py | 60 ++++++ tests/test_paperless_task_parse.py | 28 +++ tests/test_pipeline_smoke.py | 73 +++++++ 7 files changed, 699 insertions(+) create mode 100644 .env.example create mode 100644 src/notebook_tools/paperless_client.py create mode 100644 src/notebook_tools/pipeline.py create mode 100644 src/notebook_tools/settings.py create mode 100644 tests/test_paperless_task_parse.py create mode 100644 tests/test_pipeline_smoke.py diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..2b5503a --- /dev/null +++ b/.env.example @@ -0,0 +1,19 @@ +PAPERLESS_BASE_URL="https://paperless.example.com" +PAPERLESS_TOKEN="paste-token-here" + +LLAMA_BASE_URL="http://127.0.0.1:9292" +LLAMA_MODEL="ggml-model-q4_k_m" + +PAPERLESS_CUSTOM_FIELD_NOTEBOOK_ID=1 +PAPERLESS_CUSTOM_FIELD_NOTEBOOK_PAGE=2 + +# Document type id for each uploaded per-page document +PAPERLESS_DOCUMENT_TYPE_ID=3 + +PAPERLESS_TASK_TIMEOUT_S=600 +PAPERLESS_TASK_POLL_INTERVAL_S=5.0 + +RENDER_DPI=200 +OCR_MAX_TOKENS=1024 +OCR_TEMPERATURE=0.0 + diff --git a/README.md b/README.md index a87c971..aa23324 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,56 @@ # notebook-tools +FastAPI service that: +- downloads PDFs from Paperless-ngx +- splits them into pages (JPEG) +- OCRs each page via your llama.cpp OpenAI-compatible endpoint +- converts each page back into a single-page PDF +- uploads **one Paperless document per page** +- patches each uploaded document with: + - 
`content` = OCR text
+  - custom fields `notebook_id` (field id 1) and `notebook_page` (field id 2)
+  - `document_type` = Paperless document type id (default **3**, configurable)
+
+## Setup
+
+Install deps:
+
+```bash
+uv sync
+```
+
+Create a `.env` file (example below) and **do not commit it**.
+
+## Run locally
+
+```bash
+uv run uvicorn notebook_tools.api:app --reload --host 0.0.0.0 --port 8080
+```
+
+Then open the docs at:
+- `http://127.0.0.1:8080/docs` (same machine)
+- `http://<your-machine-ip>:8080/docs` (other machines on your network)
+
+If other machines still can’t connect, check your macOS firewall and any router/network rules.
+
+## Example `.env`
+
+```bash
+PAPERLESS_BASE_URL="https://paperless.example.com"
+PAPERLESS_TOKEN="paste-token-here"
+
+LLAMA_BASE_URL="http://127.0.0.1:9292"
+LLAMA_MODEL="ggml-model-q4_k_m"
+
+# Custom field ids in Paperless
+PAPERLESS_CUSTOM_FIELD_NOTEBOOK_ID=1
+PAPERLESS_CUSTOM_FIELD_NOTEBOOK_PAGE=2
+PAPERLESS_DOCUMENT_TYPE_ID=3
+
+# Rendering / OCR knobs
+RENDER_DPI=200
+OCR_MAX_TOKENS=1024
+OCR_TEMPERATURE=0.0
+```
+
+
diff --git a/src/notebook_tools/paperless_client.py b/src/notebook_tools/paperless_client.py
new file mode 100644
index 0000000..edb5859
--- /dev/null
+++ b/src/notebook_tools/paperless_client.py
@@ -0,0 +1,278 @@
+"""Paperless-ngx REST API client.
+
+We keep this client *thin*:
+- It knows how to talk to Paperless (URLs, auth header, endpoints).
+- It does NOT know about OCR or PDFs beyond "upload bytes".
+
+This separation makes it easier to test and to swap out behavior later.
+"""
+
+from __future__ import annotations
+
+import asyncio
+import json
+import logging
+import re
+from typing import Any
+
+import httpx
+from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_exponential_jitter
+
+logger = logging.getLogger("notebook_tools.paperless")
+
+
+class PaperlessError(RuntimeError):
+    """Raised for non-2xx responses from Paperless."""
+
+
+def _auth_headers(token: str) -> dict[str, str]:
+    # Paperless uses token auth in the form: Authorization: Token <token>
+    return {"Authorization": f"Token {token}"}
+
+
+def _raise_for_status(resp: httpx.Response) -> None:
+    """Raise a helpful error message.
+
+    httpx has resp.raise_for_status(), but we include the response body (often JSON)
+    because Paperless will usually tell you exactly what's wrong.
+    """
+
+    if 200 <= resp.status_code < 300:
+        return
+    body = resp.text
+    raise PaperlessError(f"Paperless API {resp.status_code}: {body}")
+
+
+def _document_id_from_task_payload(item: dict[str, Any]) -> int | None:
+    """Extract created document id from a Paperless task object.
+
+    Paperless 2.x often returns:
+    - ``related_document`` as a string ``\"10\"`` (not an int)
+    - ``result`` as a string like ``\"Success. New document id 10 created\"`` (not a dict)
+
+    We must handle both, or polling never completes.
+    """
+
+    rd = item.get("related_document")
+    if rd is not None:
+        if isinstance(rd, int):
+            return rd
+        if isinstance(rd, str) and rd.strip().isdigit():
+            return int(rd.strip())
+
+    for key in ("document_id", "document"):
+        val = item.get(key)
+        if isinstance(val, int):
+            return val
+        if isinstance(val, str) and val.strip().isdigit():
+            return int(val.strip())
+
+    result = item.get("result")
+    if isinstance(result, dict):
+        nested = result.get("document_id") or result.get("document")
+        if isinstance(nested, int):
+            return nested
+        if isinstance(nested, str) and nested.strip().isdigit():
+            return int(nested.strip())
+    elif isinstance(result, str):
+        # e.g. "Success. 
New document id 10 created" + m = re.search(r"New document id\s+(\d+)", result, flags=re.IGNORECASE) + if m: + return int(m.group(1)) + + return None + + +class PaperlessClient: + def __init__( + self, + *, + base_url: str, + token: str, + timeout_s: float = 60.0, + task_timeout_s: int = 600, + task_poll_interval_s: float = 5.0, + ) -> None: + self._base_url = base_url.rstrip("/") + self._token = token + self._timeout = httpx.Timeout(timeout_s) + self._task_timeout_s = task_timeout_s + self._task_poll_interval_s = task_poll_interval_s + + def _url(self, path: str) -> str: + return f"{self._base_url}{path}" + + @retry( + retry=retry_if_exception_type((httpx.TimeoutException, httpx.NetworkError)), + wait=wait_exponential_jitter(initial=0.5, max=5.0), + stop=stop_after_attempt(3), + reraise=True, + ) + async def download_document_pdf(self, *, document_id: int) -> bytes: + """Download the original PDF bytes for a Paperless document.""" + + async with httpx.AsyncClient(timeout=self._timeout, headers=_auth_headers(self._token)) as client: + # Common Paperless endpoint: + # GET /api/documents/{id}/download/ + logger.info("Downloading document_id=%s", document_id) + resp = await client.get(self._url(f"/api/documents/{document_id}/download/")) + _raise_for_status(resp) + return resp.content + + async def upload_pdf(self, *, filename: str, pdf_bytes: bytes) -> int: + """Upload a new document (PDF) and return its new document id.""" + + async with httpx.AsyncClient(timeout=self._timeout, headers=_auth_headers(self._token)) as client: + files = { + "document": (filename, pdf_bytes, "application/pdf"), + } + # POST /api/documents/post_document/ returns JSON about the created document/task. + logger.info("Uploading PDF filename=%s bytes=%s", filename, len(pdf_bytes)) + resp = await client.post(self._url("/api/documents/post_document/"), files=files) + _raise_for_status(resp) + + data: Any = resp.json() + + # Paperless has had a few response shapes across versions. 
+        # We defensively handle the most common ones.
+        if isinstance(data, dict):
+            if "document" in data and isinstance(data["document"], int):
+                return int(data["document"])
+            if "id" in data and isinstance(data["id"], int):
+                return int(data["id"])
+            # Some versions return {"task_id": "<uuid>"}.
+            if "task_id" in data and isinstance(data["task_id"], str):
+                logger.info("Upload returned task_id=%s", data["task_id"])
+                return await self._wait_for_task_document_id(client=client, task_id=data["task_id"])
+
+        # Other versions return the task id directly as a JSON string: "<uuid>"
+        if isinstance(data, str):
+            logger.info("Upload returned task_id=%s", data)
+            return await self._wait_for_task_document_id(client=client, task_id=data)
+
+        raise PaperlessError(f"Unexpected upload response: {json.dumps(data)[:500]}")
+
+    async def _wait_for_task_document_id(self, *, client: httpx.AsyncClient, task_id: str) -> int:
+        """Poll Paperless' tasks endpoint until it yields a created document id.
+
+        Why polling is needed:
+        - `post_document` triggers async consumption in Paperless.
+        - Many Paperless versions return a Celery task id (UUID) instead of a document id.
+
+        This method makes the rest of the pipeline "feel" synchronous: upload_pdf()
+        still returns a document id, it just waits for Paperless to finish processing.
+        """
+
+        # This endpoint is documented/mentioned in Paperless discussions and commits:
+        #   /api/tasks/?task_id=<uuid>
+        # We'll try a few times with a small backoff.
+        last_payload: Any = None
+        # We poll until a time budget is exceeded, because Paperless ingestion time varies a lot.
+        max_attempts = max(1, int(self._task_timeout_s / max(self._task_poll_interval_s, 0.1)))
+
+        for attempt in range(max_attempts):
+            # INFO: every 5th poll + first. DEBUG: every poll (no duplicate INFO line). 
+ if logger.isEnabledFor(logging.DEBUG): + logger.debug( + "Polling task_id=%s attempt=%s/%s (interval=%.1fs timeout=%ss)", + task_id, + attempt + 1, + max_attempts, + self._task_poll_interval_s, + self._task_timeout_s, + ) + elif attempt == 0 or (attempt + 1) % 5 == 0: + logger.info( + "Polling task_id=%s attempt=%s/%s (interval=%.1fs timeout=%ss)", + task_id, + attempt + 1, + max_attempts, + self._task_poll_interval_s, + self._task_timeout_s, + ) + resp = await client.get(self._url("/api/tasks/"), params={"task_id": task_id}) + _raise_for_status(resp) + last_payload = resp.json() + + # We expect a list (paginated or not). Handle both. + items: list[dict[str, Any]] = [] + if isinstance(last_payload, dict) and "results" in last_payload and isinstance(last_payload["results"], list): + items = [x for x in last_payload["results"] if isinstance(x, dict)] + elif isinstance(last_payload, list): + items = [x for x in last_payload if isinstance(x, dict)] + + # Find a matching task and extract document id. + for item in items: + # Match by Celery UUID (not the numeric DB id). + if item.get("task_id") != task_id: + continue + + doc_id = _document_id_from_task_payload(item) + if doc_id is not None: + logger.info("Task task_id=%s produced document_id=%s", task_id, doc_id) + return doc_id + + # If we have a numeric task pk, fetch detail — list view can lag; trailing slash required. + numeric_task_pk = item.get("id") + if isinstance(numeric_task_pk, int): + detail_resp = await client.get(self._url(f"/api/tasks/{numeric_task_pk}/")) + _raise_for_status(detail_resp) + detail = detail_resp.json() + if isinstance(detail, dict): + doc_id = _document_id_from_task_payload(detail) + if doc_id is not None: + logger.info("Task task_id=%s (detail) produced document_id=%s", task_id, doc_id) + return doc_id + + # Not ready yet; sleep a fixed interval (configurable). 
+ await asyncio.sleep(self._task_poll_interval_s) + + raise PaperlessError( + f"Paperless upload task did not yield document id in time. task_id={task_id} last={json.dumps(last_payload)[:500]}" + ) + + @retry( + retry=retry_if_exception_type((httpx.TimeoutException, httpx.NetworkError)), + wait=wait_exponential_jitter(initial=0.5, max=5.0), + stop=stop_after_attempt(3), + reraise=True, + ) + async def patch_document( + self, + *, + document_id: int, + title: str | None = None, + content: str | None = None, + custom_fields: list[dict[str, Any]] | None = None, + document_type: int | None = None, + ) -> None: + """Update metadata on a document. + + For our use-case we mainly set: +- title: a human-friendly per-page title +- content: OCR text (so Paperless search works) +- custom_fields: notebook_id + notebook_page +- document_type: Paperless document type id (primary key) + """ + + payload: dict[str, Any] = {} + if title is not None: + payload["title"] = title + if content is not None: + payload["content"] = content + if custom_fields is not None: + payload["custom_fields"] = custom_fields + if document_type is not None: + payload["document_type"] = document_type + + async with httpx.AsyncClient(timeout=self._timeout, headers=_auth_headers(self._token)) as client: + logger.info( + "Patching document_id=%s set_title=%s set_content=%s set_custom_fields=%s set_document_type=%s", + document_id, + title is not None, + content is not None, + custom_fields is not None, + document_type is not None, + ) + resp = await client.patch(self._url(f"/api/documents/{document_id}/"), json=payload) + _raise_for_status(resp) + diff --git a/src/notebook_tools/pipeline.py b/src/notebook_tools/pipeline.py new file mode 100644 index 0000000..8da19d1 --- /dev/null +++ b/src/notebook_tools/pipeline.py @@ -0,0 +1,187 @@ +"""OCR pipeline: Paperless PDF -> per-page OCR -> per-page PDFs -> Paperless uploads. + +This module is where the "business logic" lives. 
+ +Design goals: +- Keep the pipeline readable and linear. +- Return enough information (created ids) for the job API. +- Avoid hidden side-effects (everything is passed in / returned). +""" + +from __future__ import annotations + +from collections.abc import Awaitable, Callable +import io +import logging +import re + +from notebook_tools.llama_client import LlamaClient +from notebook_tools.paperless_client import PaperlessClient +from notebook_tools.settings import Settings +from notebook_tools import pdf_utils +from PIL import Image + +logger = logging.getLogger("notebook_tools.pipeline") + +PAGE_NUMBER_PROMPT = ( + "You are reading a handwritten page number in the bottom corner of a notebook page. " + "Return ONLY the page number as an integer. If you cannot determine it, return -1. " + "Do not output any other words." +) + + +def _crop_bottom_corner_jpegs(*, full_page_jpeg: bytes) -> list[bytes]: + """Return small JPEG crops from bottom-left and bottom-right corners. + + Why crop? + - It reduces visual clutter so the model focuses on the handwritten page number. + - It reduces payload size, making OCR faster. + + The crop is based on percentages so it works across different page sizes. + """ + + img = Image.open(io.BytesIO(full_page_jpeg)).convert("RGB") + w, h = img.size + + # Bottom band (e.g. last 20% of page height) + band_h = int(h * 0.22) + y0 = max(0, h - band_h) + + # Left/right corner width (e.g. 35% of page width) + corner_w = int(w * 0.35) + + crops = [ + img.crop((0, y0, corner_w, h)), # bottom-left + img.crop((w - corner_w, y0, w, h)), # bottom-right + ] + + out: list[bytes] = [] + for c in crops: + buf = io.BytesIO() + c.save(buf, format="JPEG", quality=90, optimize=True) + out.append(buf.getvalue()) + return out + + +def _parse_page_number(text: str) -> int | None: + """Try to parse an integer page number from a model response. 
+ + We accept: + - '12' + - 'Page 12' (if the model disobeys slightly) + - '-1' + """ + + m = re.search(r"-?\d+", text) + if not m: + return None + try: + return int(m.group(0)) + except ValueError: + return None + + +async def run_pipeline_for_paperless_document( + *, + settings: Settings, + paperless_document_id: int, + notebook_id: str, + job_id: str, + on_progress: Callable[[int, int], Awaitable[None]] | None, + ocr_prompt_override: str | None, + title_prefix: str | None, +) -> dict[str, list[int]]: + """Run the full OCR pipeline for one Paperless document id. + + Returns: + {"created_document_ids": [...]} where each id is a NEW Paperless document + (one per page). + """ + + paperless = PaperlessClient( + base_url=str(settings.paperless_base_url), + token=settings.paperless_token, + task_timeout_s=settings.paperless_task_timeout_s, + task_poll_interval_s=settings.paperless_task_poll_interval_s, + ) + llama = LlamaClient( + base_url=str(settings.llama_base_url), + model=settings.llama_model, + temperature=settings.ocr_temperature, + max_tokens=settings.ocr_max_tokens, + ) + + # 1) Download the source PDF. + logger.info("job_id=%s downloading paperless_document_id=%s", job_id, paperless_document_id) + pdf_bytes = await paperless.download_document_pdf(document_id=paperless_document_id) + logger.info("job_id=%s downloaded_pdf_bytes=%s", job_id, len(pdf_bytes)) + + # 2) Render the PDF pages as JPEG images. + logger.info("job_id=%s rendering_pages dpi=%s", job_id, settings.render_dpi) + jpegs = pdf_utils.render_pdf_to_jpegs(pdf_bytes=pdf_bytes, dpi=settings.render_dpi) + total_pages = len(jpegs) + logger.info("job_id=%s rendered_pages=%s", job_id, total_pages) + if on_progress: + await on_progress(0, total_pages) + + created_ids: list[int] = [] + + # 3) For each page: OCR -> convert to single-page PDF -> upload -> patch metadata. 
+ for idx, jpeg_bytes in enumerate(jpegs, start=1): + logger.info("job_id=%s page=%s/%s starting", job_id, idx, total_pages) + # 3a) Page-number OCR (bottom corners only). + page_number = -1 + for corner_jpeg in _crop_bottom_corner_jpegs(full_page_jpeg=jpeg_bytes): + candidate_text = await llama.ocr_jpeg(jpeg_bytes=corner_jpeg, prompt=PAGE_NUMBER_PROMPT) + parsed = _parse_page_number(candidate_text) + if parsed is not None: + # Only accept non-negative numbers, or -1. Anything else becomes unknown. + if parsed == -1 or parsed >= 0: + page_number = parsed + if page_number != -1: + break + logger.info("job_id=%s page=%s detected_page_number=%s", job_id, idx, page_number) + + # 3b) Full-page OCR for actual searchable text content. + logger.info("job_id=%s page=%s ocr_full_page", job_id, idx) + ocr_text = await llama.ocr_jpeg(jpeg_bytes=jpeg_bytes, prompt=ocr_prompt_override) + logger.info("job_id=%s page=%s ocr_chars=%s", job_id, idx, len(ocr_text)) + + page_pdf = pdf_utils.jpeg_to_pdf_bytes(jpeg_bytes=jpeg_bytes) + logger.info("job_id=%s page=%s pdf_bytes=%s", job_id, idx, len(page_pdf)) + + # Upload the per-page PDF as a new Paperless document. + logger.info("job_id=%s page=%s uploading_to_paperless", job_id, idx) + new_id = await paperless.upload_pdf(filename=f"job_{job_id}_page_{idx}.pdf", pdf_bytes=page_pdf) + logger.info("job_id=%s page=%s uploaded_document_id=%s", job_id, idx, new_id) + + # Patch metadata: + # - content: OCR text so it becomes searchable in Paperless + # - custom_fields: notebook_id + notebook_page + # - document_type: per-page document type (Paperless id) + # - title + custom_fields = [ + {"field": settings.paperless_custom_field_notebook_id, "value": notebook_id}, + {"field": settings.paperless_custom_field_notebook_page, "value": page_number}, + ] + + # Per your request, title is always in this exact format. + # (We keep `title_prefix` in the API for now, but it is no longer used.) 
+ title = f"Notebook {notebook_id} Page {page_number}" + + logger.info("job_id=%s page=%s patching_document_id=%s", job_id, idx, new_id) + await paperless.patch_document( + document_id=new_id, + title=title, + content=ocr_text, + custom_fields=custom_fields, + document_type=settings.paperless_document_type_id, + ) + logger.info("job_id=%s page=%s patched_document_id=%s", job_id, idx, new_id) + + created_ids.append(new_id) + if on_progress: + await on_progress(idx, total_pages) + + return {"created_document_ids": created_ids} + diff --git a/src/notebook_tools/settings.py b/src/notebook_tools/settings.py new file mode 100644 index 0000000..de329b1 --- /dev/null +++ b/src/notebook_tools/settings.py @@ -0,0 +1,60 @@ +"""App configuration loaded from environment variables. + +Why this module exists: +- It's common to keep secrets (tokens) and instance-specific URLs out of code. +- Using one Settings object makes it easy to pass config to clients/pipeline. +""" + +from __future__ import annotations + +from pydantic import AnyHttpUrl, Field +from pydantic_settings import BaseSettings, SettingsConfigDict + + +class Settings(BaseSettings): + """Runtime settings for the service. + + Tip: Put these in a local `.env` file while developing, but never commit secrets. 
+ """ + + model_config = SettingsConfigDict(env_file=".env", env_file_encoding="utf-8", extra="ignore") + + # Paperless-ngx + paperless_base_url: AnyHttpUrl = Field(..., alias="PAPERLESS_BASE_URL") + paperless_token: str = Field(..., alias="PAPERLESS_TOKEN") + + # llama.cpp (OpenAI-compatible) + llama_base_url: AnyHttpUrl = Field(..., alias="LLAMA_BASE_URL") + llama_model: str = Field("ggml-model-q4_k_m", alias="LLAMA_MODEL") + + # Custom field ids you already created in Paperless + paperless_custom_field_notebook_id: int = Field(1, alias="PAPERLESS_CUSTOM_FIELD_NOTEBOOK_ID") + paperless_custom_field_notebook_page: int = Field(2, alias="PAPERLESS_CUSTOM_FIELD_NOTEBOOK_PAGE") + + # Document type id applied to each per-page upload (PATCH document) + paperless_document_type_id: int = Field(3, alias="PAPERLESS_DOCUMENT_TYPE_ID") + + # Rendering / OCR knobs + render_dpi: int = Field(200, alias="RENDER_DPI") + ocr_max_tokens: int = Field(1024, alias="OCR_MAX_TOKENS") + ocr_temperature: float = Field(0.0, alias="OCR_TEMPERATURE") + + # Paperless async consumption polling + # + # Paperless may take minutes to ingest uploads depending on server load and OCR settings. + # These settings control how long we wait for `/api/tasks/?task_id=...` to produce a document id. + paperless_task_timeout_s: int = Field(600, alias="PAPERLESS_TASK_TIMEOUT_S") + paperless_task_poll_interval_s: float = Field(5.0, alias="PAPERLESS_TASK_POLL_INTERVAL_S") + + # Logging + log_level: str = Field("INFO", alias="LOG_LEVEL") + + +def get_settings() -> Settings: + """Create settings once per import. + + In FastAPI we typically create settings at startup so environment errors fail fast. 
+ """ + + return Settings() + diff --git a/tests/test_paperless_task_parse.py b/tests/test_paperless_task_parse.py new file mode 100644 index 0000000..a9d22e6 --- /dev/null +++ b/tests/test_paperless_task_parse.py @@ -0,0 +1,28 @@ +from __future__ import annotations + +from notebook_tools.paperless_client import _document_id_from_task_payload + + +def test_related_document_string() -> None: + assert ( + _document_id_from_task_payload( + { + "related_document": "10", + "result": "Success. New document id 10 created", + } + ) + == 10 + ) + + +def test_result_string_only() -> None: + assert ( + _document_id_from_task_payload( + {"related_document": None, "result": "Success. New document id 42 created"} + ) + == 42 + ) + + +def test_related_document_int() -> None: + assert _document_id_from_task_payload({"related_document": 7}) == 7 diff --git a/tests/test_pipeline_smoke.py b/tests/test_pipeline_smoke.py new file mode 100644 index 0000000..0f117db --- /dev/null +++ b/tests/test_pipeline_smoke.py @@ -0,0 +1,73 @@ +from __future__ import annotations + +import json + +import respx +from httpx import Response + +from notebook_tools.pipeline import run_pipeline_for_paperless_document +from notebook_tools.settings import Settings + + +@respx.mock +async def test_pipeline_smoke_single_page(monkeypatch) -> None: + # We don't want to depend on real PDF rendering in this test. + # Instead we monkeypatch the helpers to produce a controlled "one page" output. + from notebook_tools import pdf_utils + + monkeypatch.setattr(pdf_utils, "render_pdf_to_jpegs", lambda *, pdf_bytes, dpi: [b"jpegbytes"]) + monkeypatch.setattr(pdf_utils, "jpeg_to_pdf_bytes", lambda *, jpeg_bytes: b"%PDF-1.4 fake") + # Page-number OCR now crops the JPEG; since our fake jpeg bytes aren't a real image, + # we bypass cropping in this smoke test and rely on the default "-1" fallback. 
+ import notebook_tools.pipeline as pipeline + + monkeypatch.setattr(pipeline, "_crop_bottom_corner_jpegs", lambda *, full_page_jpeg: []) + + # Mock Paperless download/upload/patch endpoints. + respx.get("https://paperless.local/api/documents/123/download/").mock( + return_value=Response(200, content=b"%PDF-1.4 source") + ) + respx.post("https://paperless.local/api/documents/post_document/").mock( + return_value=Response(200, json="34de1527-aade-499b-8a06-dd0174c9f233") + ) + # When post_document returns a task id, the client polls /api/tasks/?task_id= + respx.get("https://paperless.local/api/tasks/").mock( + return_value=Response(200, json=[{"id": 31, "task_id": "34de1527-aade-499b-8a06-dd0174c9f233", "related_document": 999}]) + ) + patch_route = respx.patch("https://paperless.local/api/documents/999/").mock( + return_value=Response(200, json={}) + ) + + # Mock llama OCR + respx.post("http://llama.local/v1/chat/completions").mock( + return_value=Response(200, json={"choices": [{"message": {"content": "OCR TEXT"}}]}) + ) + + settings = Settings( + PAPERLESS_BASE_URL="https://paperless.local", + PAPERLESS_TOKEN="t", + LLAMA_BASE_URL="http://llama.local", + LLAMA_MODEL="m", + ) + + out = await run_pipeline_for_paperless_document( + settings=settings, + paperless_document_id=123, + notebook_id="nb1", + job_id="job1", + on_progress=None, + ocr_prompt_override=None, + title_prefix="Notebook nb1", + ) + + assert out["created_document_ids"] == [999] + assert patch_route.called + sent_patch = json.loads(patch_route.calls[0].request.content.decode("utf-8")) + assert sent_patch["content"] == "OCR TEXT" + assert sent_patch["title"] == "Notebook nb1 Page -1" + assert sent_patch["custom_fields"][0]["field"] == 1 + assert sent_patch["custom_fields"][0]["value"] == "nb1" + assert sent_patch["custom_fields"][1]["field"] == 2 + assert sent_patch["custom_fields"][1]["value"] == -1 + assert sent_patch["document_type"] == 3 +