74 lines
2.9 KiB
Python
74 lines
2.9 KiB
Python
from __future__ import annotations
|
|
|
|
import json
|
|
|
|
import respx
|
|
from httpx import Response
|
|
|
|
from notebook_tools.pipeline import run_pipeline_for_paperless_document
|
|
from notebook_tools.settings import Settings
|
|
|
|
|
|
@respx.mock
async def test_pipeline_smoke_single_page(monkeypatch) -> None:
    """Smoke-test the pipeline for a one-page document with all HTTP mocked.

    PDF rendering, JPEG-to-PDF conversion and corner cropping are replaced by
    stubs, the Paperless and llama endpoints are served by respx routes, and
    the PATCH body sent to the created document (id 999) is inspected.
    """
    from notebook_tools import pdf_utils
    import notebook_tools.pipeline as pipeline

    # Real PDF rendering is out of scope here: the stubs below yield exactly
    # one fake page and a fake single-page PDF.
    def _render_single_fake_page(*, pdf_bytes, dpi):
        return [b"jpegbytes"]

    def _fake_pdf_from_jpeg(*, jpeg_bytes):
        return b"%PDF-1.4 fake"

    # Page-number OCR crops the JPEG, but the fake bytes are not a decodable
    # image. Returning no crops bypasses that step, so the test relies on the
    # default "-1" page-number fallback.
    def _no_corner_crops(*, full_page_jpeg):
        return []

    monkeypatch.setattr(pdf_utils, "render_pdf_to_jpegs", _render_single_fake_page)
    monkeypatch.setattr(pdf_utils, "jpeg_to_pdf_bytes", _fake_pdf_from_jpeg)
    monkeypatch.setattr(pipeline, "_crop_bottom_corner_jpegs", _no_corner_crops)

    # --- Paperless: download / upload / task polling / patch ---------------
    task_uuid = "34de1527-aade-499b-8a06-dd0174c9f233"

    download_response = Response(200, content=b"%PDF-1.4 source")
    respx.get("https://paperless.local/api/documents/123/download/").mock(
        return_value=download_response
    )

    upload_response = Response(200, json=task_uuid)
    respx.post("https://paperless.local/api/documents/post_document/").mock(
        return_value=upload_response
    )

    # post_document answers with a task id, which the client then resolves by
    # polling /api/tasks/?task_id=<uuid>.
    task_listing = [{"id": 31, "task_id": task_uuid, "related_document": 999}]
    respx.get("https://paperless.local/api/tasks/").mock(
        return_value=Response(200, json=task_listing)
    )

    patch_route = respx.patch("https://paperless.local/api/documents/999/").mock(
        return_value=Response(200, json={})
    )

    # --- llama OCR ----------------------------------------------------------
    ocr_reply = {"choices": [{"message": {"content": "OCR TEXT"}}]}
    respx.post("http://llama.local/v1/chat/completions").mock(
        return_value=Response(200, json=ocr_reply)
    )

    settings = Settings(
        PAPERLESS_BASE_URL="https://paperless.local",
        PAPERLESS_TOKEN="t",
        LLAMA_BASE_URL="http://llama.local",
        LLAMA_MODEL="m",
    )

    result = await run_pipeline_for_paperless_document(
        settings=settings,
        paperless_document_id=123,
        notebook_id="nb1",
        job_id="job1",
        on_progress=None,
        ocr_prompt_override=None,
        title_prefix="Notebook nb1",
    )

    assert result["created_document_ids"] == [999]
    assert patch_route.called

    # Decode the JSON body of the first PATCH request that reached the route.
    raw_body = patch_route.calls[0].request.content
    payload = json.loads(raw_body.decode("utf-8"))

    assert payload["content"] == "OCR TEXT"
    assert payload["title"] == "Notebook nb1 Page -1"

    notebook_field = payload["custom_fields"][0]
    assert notebook_field["field"] == 1
    assert notebook_field["value"] == "nb1"

    page_field = payload["custom_fields"][1]
    assert page_field["field"] == 2
    assert page_field["value"] == -1

    assert payload["document_type"] == 3
|
|
|