Enhance README with detailed service description, setup instructions, and example .env configuration for the FastAPI service that integrates with Paperless-ngx and llama.cpp for PDF processing.

This commit is contained in:
2026-03-31 14:29:50 -05:00
parent facf6b26f0
commit 9b1705d82b
7 changed files with 699 additions and 0 deletions

View File

@@ -0,0 +1,187 @@
"""OCR pipeline: Paperless PDF -> per-page OCR -> per-page PDFs -> Paperless uploads.
This module is where the "business logic" lives.
Design goals:
- Keep the pipeline readable and linear.
- Return enough information (created ids) for the job API.
- Avoid hidden side-effects (everything is passed in / returned).
"""
from __future__ import annotations
from collections.abc import Awaitable, Callable
import io
import logging
import re
from notebook_tools.llama_client import LlamaClient
from notebook_tools.paperless_client import PaperlessClient
from notebook_tools.settings import Settings
from notebook_tools import pdf_utils
from PIL import Image
logger = logging.getLogger("notebook_tools.pipeline")

# Instruction sent with each bottom-corner crop. It asks the model for a bare
# integer (or -1 when unreadable) so the reply can be parsed by
# _parse_page_number without any extra cleanup.
PAGE_NUMBER_PROMPT = (
    "You are reading a handwritten page number in the bottom corner of a notebook page. "
    "Return ONLY the page number as an integer. If you cannot determine it, return -1. "
    "Do not output any other words."
)
def _crop_bottom_corner_jpegs(*, full_page_jpeg: bytes) -> list[bytes]:
    """Produce JPEG crops of the bottom-left and bottom-right page corners.

    Cropping serves two purposes:
    - It removes visual clutter so the model can focus on the handwritten
      page number.
    - It shrinks the OCR payload, which speeds up inference.

    Percentage-based geometry keeps the crops valid for any page size.
    """
    page = Image.open(io.BytesIO(full_page_jpeg)).convert("RGB")
    width, height = page.size
    # Restrict attention to the bottom band of the page (~last 22% of height).
    top = max(0, height - int(height * 0.22))
    # Each corner crop spans 35% of the page width.
    span = int(width * 0.35)
    boxes = (
        (0, top, span, height),                # bottom-left corner
        (width - span, top, width, height),    # bottom-right corner
    )
    encoded: list[bytes] = []
    for box in boxes:
        buffer = io.BytesIO()
        page.crop(box).save(buffer, format="JPEG", quality=90, optimize=True)
        encoded.append(buffer.getvalue())
    return encoded
def _parse_page_number(text: str) -> int | None:
"""Try to parse an integer page number from a model response.
We accept:
- '12'
- 'Page 12' (if the model disobeys slightly)
- '-1'
"""
m = re.search(r"-?\d+", text)
if not m:
return None
try:
return int(m.group(0))
except ValueError:
return None
async def run_pipeline_for_paperless_document(
    *,
    settings: Settings,
    paperless_document_id: int,
    notebook_id: str,
    job_id: str,
    on_progress: Callable[[int, int], Awaitable[None]] | None,
    ocr_prompt_override: str | None,
    title_prefix: str | None,
) -> dict[str, list[int]]:
    """Run the full OCR pipeline for one Paperless document id.

    Steps: download the source PDF, render every page to JPEG, OCR the
    handwritten page number from the bottom corners, OCR the full page for
    searchable text, convert each page back into a single-page PDF, upload it
    to Paperless as a new document, and patch its metadata.

    Args:
        settings: Service configuration (endpoints, tokens, OCR tuning).
        paperless_document_id: Source Paperless document to download and split.
        notebook_id: Written to a Paperless custom field and used in titles.
        job_id: Correlation id included in every log line and upload filename.
        on_progress: Optional async callback awaited as
            (pages_done, total_pages); called once with (0, total) up front.
        ocr_prompt_override: Prompt forwarded to the full-page OCR call.
            NOTE(review): forwarded even when None — presumably the llama
            client substitutes a default prompt in that case; confirm.
        title_prefix: Accepted for API compatibility but intentionally unused;
            titles always use the fixed "Notebook <id> Page <n>" format.

    Returns:
        {"created_document_ids": [...]} where each id is a NEW Paperless document
        (one per page).
    """
    paperless = PaperlessClient(
        base_url=str(settings.paperless_base_url),
        token=settings.paperless_token,
        task_timeout_s=settings.paperless_task_timeout_s,
        task_poll_interval_s=settings.paperless_task_poll_interval_s,
    )
    llama = LlamaClient(
        base_url=str(settings.llama_base_url),
        model=settings.llama_model,
        temperature=settings.ocr_temperature,
        max_tokens=settings.ocr_max_tokens,
    )
    # 1) Download the source PDF.
    logger.info("job_id=%s downloading paperless_document_id=%s", job_id, paperless_document_id)
    pdf_bytes = await paperless.download_document_pdf(document_id=paperless_document_id)
    logger.info("job_id=%s downloaded_pdf_bytes=%s", job_id, len(pdf_bytes))
    # 2) Render the PDF pages as JPEG images.
    logger.info("job_id=%s rendering_pages dpi=%s", job_id, settings.render_dpi)
    jpegs = pdf_utils.render_pdf_to_jpegs(pdf_bytes=pdf_bytes, dpi=settings.render_dpi)
    total_pages = len(jpegs)
    logger.info("job_id=%s rendered_pages=%s", job_id, total_pages)
    if on_progress:
        # Report 0/total immediately so callers can show a progress bar
        # before the first (slow) page completes.
        await on_progress(0, total_pages)
    created_ids: list[int] = []
    # 3) For each page: OCR -> convert to single-page PDF -> upload -> patch metadata.
    for idx, jpeg_bytes in enumerate(jpegs, start=1):
        logger.info("job_id=%s page=%s/%s starting", job_id, idx, total_pages)
        # 3a) Page-number OCR (bottom corners only). Try bottom-left first,
        # then bottom-right; stop as soon as a corner yields a real number.
        page_number = -1  # -1 means "unknown" throughout.
        for corner_jpeg in _crop_bottom_corner_jpegs(full_page_jpeg=jpeg_bytes):
            candidate_text = await llama.ocr_jpeg(jpeg_bytes=corner_jpeg, prompt=PAGE_NUMBER_PROMPT)
            parsed = _parse_page_number(candidate_text)
            if parsed is not None:
                # Only accept non-negative numbers, or -1. Anything else
                # (i.e. other negative values) is discarded as noise.
                if parsed == -1 or parsed >= 0:
                    page_number = parsed
            if page_number != -1:
                break
        logger.info("job_id=%s page=%s detected_page_number=%s", job_id, idx, page_number)
        # 3b) Full-page OCR for actual searchable text content.
        logger.info("job_id=%s page=%s ocr_full_page", job_id, idx)
        ocr_text = await llama.ocr_jpeg(jpeg_bytes=jpeg_bytes, prompt=ocr_prompt_override)
        logger.info("job_id=%s page=%s ocr_chars=%s", job_id, idx, len(ocr_text))
        # 3c) Wrap the rendered page image back into a single-page PDF.
        page_pdf = pdf_utils.jpeg_to_pdf_bytes(jpeg_bytes=jpeg_bytes)
        logger.info("job_id=%s page=%s pdf_bytes=%s", job_id, idx, len(page_pdf))
        # 3d) Upload the per-page PDF as a new Paperless document.
        logger.info("job_id=%s page=%s uploading_to_paperless", job_id, idx)
        new_id = await paperless.upload_pdf(filename=f"job_{job_id}_page_{idx}.pdf", pdf_bytes=page_pdf)
        logger.info("job_id=%s page=%s uploaded_document_id=%s", job_id, idx, new_id)
        # 3e) Patch metadata on the new document:
        # - content: OCR text so it becomes searchable in Paperless
        # - custom_fields: notebook_id + notebook_page
        # - document_type: per-page document type (Paperless id)
        # - title
        custom_fields = [
            {"field": settings.paperless_custom_field_notebook_id, "value": notebook_id},
            {"field": settings.paperless_custom_field_notebook_page, "value": page_number},
        ]
        # Title format is fixed by design; `title_prefix` remains in the API
        # for compatibility but is intentionally not used here.
        title = f"Notebook {notebook_id} Page {page_number}"
        logger.info("job_id=%s page=%s patching_document_id=%s", job_id, idx, new_id)
        await paperless.patch_document(
            document_id=new_id,
            title=title,
            content=ocr_text,
            custom_fields=custom_fields,
            document_type=settings.paperless_document_type_id,
        )
        logger.info("job_id=%s page=%s patched_document_id=%s", job_id, idx, new_id)
        created_ids.append(new_id)
        if on_progress:
            await on_progress(idx, total_pages)
    return {"created_document_ids": created_ids}