# notebook-tools/tests/test_pipeline_smoke.py

from __future__ import annotations

import json

import respx
from httpx import Response

from notebook_tools.pipeline import run_pipeline_for_paperless_document
from notebook_tools.settings import Settings


@respx.mock
async def test_pipeline_smoke_single_page(monkeypatch) -> None:
    """Smoke-test the pipeline for a source document that yields one page.

    PDF rendering, JPEG-to-PDF conversion and corner cropping are replaced
    with stubs, and every Paperless / llama HTTP endpoint used here is mocked
    with respx. The test then checks the reported document ids and the body
    of the PATCH request sent for the newly created document.
    """
    from notebook_tools import pdf_utils
    import notebook_tools.pipeline as pipeline

    # Real PDF rendering is out of scope for this test, so the helpers are
    # swapped for stubs that yield a controlled, single-page result.
    def fake_render_pdf_to_jpegs(*, pdf_bytes, dpi):
        return [b"jpegbytes"]

    def fake_jpeg_to_pdf_bytes(*, jpeg_bytes):
        return b"%PDF-1.4 fake"

    # The fake JPEG bytes are not a decodable image, so the cropping step used
    # by page-number OCR is bypassed; the assertions below rely on the default
    # "-1" page-number fallback.
    def fake_crop_bottom_corner_jpegs(*, full_page_jpeg):
        return []

    monkeypatch.setattr(pdf_utils, "render_pdf_to_jpegs", fake_render_pdf_to_jpegs)
    monkeypatch.setattr(pdf_utils, "jpeg_to_pdf_bytes", fake_jpeg_to_pdf_bytes)
    monkeypatch.setattr(pipeline, "_crop_bottom_corner_jpegs", fake_crop_bottom_corner_jpegs)

    upload_task_id = "34de1527-aade-499b-8a06-dd0174c9f233"

    # --- Paperless: download of the source document -------------------------
    source_pdf = Response(200, content=b"%PDF-1.4 source")
    respx.get("https://paperless.local/api/documents/123/download/").mock(return_value=source_pdf)

    # --- Paperless: upload, which answers with a task id --------------------
    upload_reply = Response(200, json=upload_task_id)
    respx.post("https://paperless.local/api/documents/post_document/").mock(return_value=upload_reply)

    # --- Paperless: task lookup ---------------------------------------------
    # When post_document returns a task id, the client polls
    # /api/tasks/?task_id=<uuid> to find the related document.
    task_listing = [{"id": 31, "task_id": upload_task_id, "related_document": 999}]
    respx.get("https://paperless.local/api/tasks/").mock(
        return_value=Response(200, json=task_listing)
    )

    # --- Paperless: metadata PATCH on the created document -------------------
    patch_route = respx.patch("https://paperless.local/api/documents/999/").mock(
        return_value=Response(200, json={})
    )

    # --- llama OCR endpoint --------------------------------------------------
    ocr_reply = {"choices": [{"message": {"content": "OCR TEXT"}}]}
    respx.post("http://llama.local/v1/chat/completions").mock(
        return_value=Response(200, json=ocr_reply)
    )

    pipeline_kwargs = {
        "settings": Settings(
            PAPERLESS_BASE_URL="https://paperless.local",
            PAPERLESS_TOKEN="t",
            LLAMA_BASE_URL="http://llama.local",
            LLAMA_MODEL="m",
        ),
        "paperless_document_id": 123,
        "notebook_id": "nb1",
        "job_id": "job1",
        "on_progress": None,
        "ocr_prompt_override": None,
        "title_prefix": "Notebook nb1",
    }
    result = await run_pipeline_for_paperless_document(**pipeline_kwargs)

    assert result["created_document_ids"] == [999]
    assert patch_route.called

    # Decode the body of the first PATCH request and check what was sent.
    first_patch_request = patch_route.calls[0].request
    payload = json.loads(first_patch_request.content.decode("utf-8"))
    assert payload["content"] == "OCR TEXT"
    assert payload["title"] == "Notebook nb1 Page -1"

    first_field = payload["custom_fields"][0]
    assert (first_field["field"], first_field["value"]) == (1, "nb1")
    second_field = payload["custom_fields"][1]
    assert (second_field["field"], second_field["value"]) == (2, -1)

    assert payload["document_type"] == 3