Files
notebook-tools/tests/test_pipeline_smoke.py

74 lines
2.9 KiB
Python

from __future__ import annotations
import json
import respx
from httpx import Response
from notebook_tools.pipeline import run_pipeline_for_paperless_document
from notebook_tools.settings import Settings
@respx.mock
async def test_pipeline_smoke_single_page(monkeypatch) -> None:
    """Smoke-test the pipeline for a source document rendered as a single page.

    PDF rendering, JPEG-to-PDF conversion and page-number cropping are replaced
    with stubs, and every HTTP endpoint the pipeline talks to (Paperless
    download / upload / task polling / patch, plus the llama chat-completions
    OCR endpoint) is mocked with respx. The test then checks the created
    document ids returned by the pipeline and the JSON body of the PATCH
    request sent for the new document.
    """
    from notebook_tools import pdf_utils
    import notebook_tools.pipeline as pipeline_module

    # --- Stub out image/PDF helpers -------------------------------------
    # Real PDF rendering is out of scope here; the stubs yield exactly one
    # fake page so the pipeline processes a controlled single-page input.
    def fake_render_pdf_to_jpegs(*, pdf_bytes, dpi):
        return [b"jpegbytes"]

    def fake_jpeg_to_pdf_bytes(*, jpeg_bytes):
        return b"%PDF-1.4 fake"

    # The fake JPEG bytes are not a decodable image, so page-number cropping
    # is bypassed; with no crops the test relies on the "-1" page fallback.
    def fake_crop_bottom_corner_jpegs(*, full_page_jpeg):
        return []

    monkeypatch.setattr(pdf_utils, "render_pdf_to_jpegs", fake_render_pdf_to_jpegs)
    monkeypatch.setattr(pdf_utils, "jpeg_to_pdf_bytes", fake_jpeg_to_pdf_bytes)
    monkeypatch.setattr(
        pipeline_module, "_crop_bottom_corner_jpegs", fake_crop_bottom_corner_jpegs
    )

    # --- Mock Paperless endpoints ---------------------------------------
    task_uuid = "34de1527-aade-499b-8a06-dd0174c9f233"

    download_response = Response(200, content=b"%PDF-1.4 source")
    respx.get("https://paperless.local/api/documents/123/download/").mock(
        return_value=download_response
    )

    # The upload endpoint answers with a bare task id (a JSON string).
    upload_response = Response(200, json=task_uuid)
    respx.post("https://paperless.local/api/documents/post_document/").mock(
        return_value=upload_response
    )

    # Given a task id, the client polls /api/tasks/?task_id=<uuid> to learn
    # which document the task produced.
    task_entry = {"id": 31, "task_id": task_uuid, "related_document": 999}
    respx.get("https://paperless.local/api/tasks/").mock(
        return_value=Response(200, json=[task_entry])
    )

    patch_route = respx.patch("https://paperless.local/api/documents/999/").mock(
        return_value=Response(200, json={})
    )

    # --- Mock llama OCR --------------------------------------------------
    ocr_payload = {"choices": [{"message": {"content": "OCR TEXT"}}]}
    respx.post("http://llama.local/v1/chat/completions").mock(
        return_value=Response(200, json=ocr_payload)
    )

    # --- Run the pipeline ------------------------------------------------
    settings = Settings(
        PAPERLESS_BASE_URL="https://paperless.local",
        PAPERLESS_TOKEN="t",
        LLAMA_BASE_URL="http://llama.local",
        LLAMA_MODEL="m",
    )
    result = await run_pipeline_for_paperless_document(
        settings=settings,
        paperless_document_id=123,
        notebook_id="nb1",
        job_id="job1",
        on_progress=None,
        ocr_prompt_override=None,
        title_prefix="Notebook nb1",
    )

    # --- Verify the outcome ----------------------------------------------
    assert result["created_document_ids"] == [999]
    assert patch_route.called

    # Inspect the body of the first PATCH request sent to document 999.
    first_patch_request = patch_route.calls[0].request
    patch_body = json.loads(first_patch_request.content.decode("utf-8"))

    assert patch_body["content"] == "OCR TEXT"
    assert patch_body["title"] == "Notebook nb1 Page -1"

    # Custom fields are checked positionally: (field id, value) per entry.
    expected_custom_fields = ((1, "nb1"), (2, -1))
    for position, (field_id, field_value) in enumerate(expected_custom_fields):
        custom_field = patch_body["custom_fields"][position]
        assert custom_field["field"] == field_id
        assert custom_field["value"] == field_value

    assert patch_body["document_type"] == 3