from __future__ import annotations

import json

import respx
from httpx import Response

from notebook_tools.pipeline import run_pipeline_for_paperless_document
from notebook_tools.settings import Settings


@respx.mock
async def test_pipeline_smoke_single_page(monkeypatch) -> None:
    """Smoke-test the pipeline for a single-page document with all I/O mocked.

    PDF rendering, JPEG cropping, the Paperless HTTP endpoints and the llama
    OCR endpoint are replaced with canned stand-ins. The test then checks the
    returned document ids and the JSON body of the PATCH request sent for the
    created document.
    """
    # Real PDF rendering is out of scope here, so the helpers are swapped for
    # stubs that yield a controlled "one page" output.
    from notebook_tools import pdf_utils

    def _one_fake_page(*, pdf_bytes, dpi):
        return [b"jpegbytes"]

    def _fake_pdf(*, jpeg_bytes):
        return b"%PDF-1.4 fake"

    monkeypatch.setattr(pdf_utils, "render_pdf_to_jpegs", _one_fake_page)
    monkeypatch.setattr(pdf_utils, "jpeg_to_pdf_bytes", _fake_pdf)

    # Page-number OCR crops the JPEG, but the fake JPEG bytes are not a real
    # image. Cropping is bypassed so the test relies on the default "-1"
    # fallback.
    import notebook_tools.pipeline as pipeline

    def _no_crops(*, full_page_jpeg):
        return []

    monkeypatch.setattr(pipeline, "_crop_bottom_corner_jpegs", _no_crops)

    paperless_url = "https://paperless.local"
    llama_url = "http://llama.local"
    task_uuid = "34de1527-aade-499b-8a06-dd0174c9f233"

    # Paperless download / upload endpoints.
    respx.get("https://paperless.local/api/documents/123/download/").mock(
        return_value=Response(200, content=b"%PDF-1.4 source")
    )
    respx.post("https://paperless.local/api/documents/post_document/").mock(
        return_value=Response(200, json=task_uuid)
    )

    # When post_document returns a task id, the client polls
    # /api/tasks/?task_id=
    task_listing = [{"id": 31, "task_id": task_uuid, "related_document": 999}]
    respx.get("https://paperless.local/api/tasks/").mock(
        return_value=Response(200, json=task_listing)
    )

    # Paperless metadata update for the created document.
    patch_route = respx.patch("https://paperless.local/api/documents/999/").mock(
        return_value=Response(200, json={})
    )

    # Llama OCR endpoint.
    ocr_reply = {"choices": [{"message": {"content": "OCR TEXT"}}]}
    respx.post("http://llama.local/v1/chat/completions").mock(
        return_value=Response(200, json=ocr_reply)
    )

    settings = Settings(
        PAPERLESS_BASE_URL=paperless_url,
        PAPERLESS_TOKEN="t",
        LLAMA_BASE_URL=llama_url,
        LLAMA_MODEL="m",
    )

    result = await run_pipeline_for_paperless_document(
        settings=settings,
        paperless_document_id=123,
        notebook_id="nb1",
        job_id="job1",
        on_progress=None,
        ocr_prompt_override=None,
        title_prefix="Notebook nb1",
    )

    assert result["created_document_ids"] == [999]
    assert patch_route.called

    # Decode the body of the first PATCH request and verify each field.
    first_patch_request = patch_route.calls[0].request
    patch_body = json.loads(first_patch_request.content.decode("utf-8"))

    assert patch_body["content"] == "OCR TEXT"
    assert patch_body["title"] == "Notebook nb1 Page -1"

    expected_custom_fields = [(1, "nb1"), (2, -1)]
    for position, (field_id, field_value) in enumerate(expected_custom_fields):
        entry = patch_body["custom_fields"][position]
        assert entry["field"] == field_id
        assert entry["value"] == field_value

    assert patch_body["document_type"] == 3