Enhance README with detailed service description, setup instructions, and example .env configuration for the FastAPI service that integrates with Paperless-ngx and llama.cpp for PDF processing.
This commit is contained in:
187
src/notebook_tools/pipeline.py
Normal file
187
src/notebook_tools/pipeline.py
Normal file
@@ -0,0 +1,187 @@
|
||||
"""OCR pipeline: Paperless PDF -> per-page OCR -> per-page PDFs -> Paperless uploads.
|
||||
|
||||
This module is where the "business logic" lives.
|
||||
|
||||
Design goals:
|
||||
- Keep the pipeline readable and linear.
|
||||
- Return enough information (created ids) for the job API.
|
||||
- Avoid hidden side-effects (everything is passed in / returned).
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from collections.abc import Awaitable, Callable
|
||||
import io
|
||||
import logging
|
||||
import re
|
||||
|
||||
from notebook_tools.llama_client import LlamaClient
|
||||
from notebook_tools.paperless_client import PaperlessClient
|
||||
from notebook_tools.settings import Settings
|
||||
from notebook_tools import pdf_utils
|
||||
from PIL import Image
|
||||
|
||||
# Module-level logger, namespaced under the package so log filtering /
# handler configuration can target this pipeline specifically.
logger = logging.getLogger("notebook_tools.pipeline")


# Prompt sent to the vision model when reading the handwritten page number
# from a cropped bottom-corner image (see _crop_bottom_corner_jpegs).
# The model is instructed to answer with a bare integer, or -1 when the
# number cannot be determined; _parse_page_number tolerates slight
# disobedience such as "Page 12".
PAGE_NUMBER_PROMPT = (
    "You are reading a handwritten page number in the bottom corner of a notebook page. "
    "Return ONLY the page number as an integer. If you cannot determine it, return -1. "
    "Do not output any other words."
)
|
||||
|
||||
|
||||
def _crop_bottom_corner_jpegs(
    *,
    full_page_jpeg: bytes,
    band_frac: float = 0.22,
    corner_frac: float = 0.35,
    jpeg_quality: int = 90,
) -> list[bytes]:
    """Return small JPEG crops from the bottom-left and bottom-right corners.

    Why crop?
    - It reduces visual clutter so the model focuses on the handwritten page number.
    - It reduces payload size, making OCR faster.

    The crop is based on percentages so it works across different page sizes.

    Args:
        full_page_jpeg: JPEG bytes of the full rendered page.
        band_frac: Fraction of the page height used for the bottom band.
        corner_frac: Fraction of the page width used for each corner crop.
        jpeg_quality: Pillow JPEG quality for the re-encoded crops.

    Returns:
        Two JPEG byte strings: [bottom-left crop, bottom-right crop].
    """

    img = Image.open(io.BytesIO(full_page_jpeg)).convert("RGB")
    w, h = img.size

    # Bottom band (e.g. last ~22% of page height). Clamp to at least one
    # pixel so degenerate (tiny) pages still yield a valid, non-empty crop —
    # int(h * band_frac) would be 0 for very small h and produce an empty image.
    band_h = max(1, int(h * band_frac))
    y0 = max(0, h - band_h)

    # Left/right corner width (e.g. ~35% of page width), clamped likewise.
    corner_w = max(1, int(w * corner_frac))

    crops = [
        img.crop((0, y0, corner_w, h)),  # bottom-left
        img.crop((w - corner_w, y0, w, h)),  # bottom-right
    ]

    out: list[bytes] = []
    for crop in crops:
        buf = io.BytesIO()
        crop.save(buf, format="JPEG", quality=jpeg_quality, optimize=True)
        out.append(buf.getvalue())
    return out
|
||||
|
||||
|
||||
def _parse_page_number(text: str) -> int | None:
|
||||
"""Try to parse an integer page number from a model response.
|
||||
|
||||
We accept:
|
||||
- '12'
|
||||
- 'Page 12' (if the model disobeys slightly)
|
||||
- '-1'
|
||||
"""
|
||||
|
||||
m = re.search(r"-?\d+", text)
|
||||
if not m:
|
||||
return None
|
||||
try:
|
||||
return int(m.group(0))
|
||||
except ValueError:
|
||||
return None
|
||||
|
||||
|
||||
async def run_pipeline_for_paperless_document(
    *,
    settings: Settings,
    paperless_document_id: int,
    notebook_id: str,
    job_id: str,
    on_progress: Callable[[int, int], Awaitable[None]] | None,
    ocr_prompt_override: str | None,
    title_prefix: str | None,
) -> dict[str, list[int]]:
    """Run the full OCR pipeline for one Paperless document id.

    Steps (linear, in order):
        1. Download the source PDF from Paperless.
        2. Render each PDF page to a JPEG.
        3. Per page: OCR the page number (corner crops), OCR the full page,
           re-wrap the page JPEG as a single-page PDF, upload it as a NEW
           Paperless document, then patch its metadata (title, content,
           custom fields, document type).

    Args:
        settings: Application settings (Paperless/llama endpoints, tokens,
            timeouts, render DPI, custom-field ids).
        paperless_document_id: Id of the source document in Paperless.
        notebook_id: Identifier stored on each created page document and
            embedded in its title.
        job_id: Job identifier used for logging and upload filenames.
        on_progress: Optional async callback invoked as
            ``await on_progress(pages_done, total_pages)`` — once with
            (0, total) before the loop and once per completed page.
        ocr_prompt_override: Prompt passed to the full-page OCR call.
            NOTE(review): passed through even when None — presumably the
            llama client substitutes a default prompt; confirm.
        title_prefix: Currently unused (kept for API compatibility).

    Returns:
        {"created_document_ids": [...]} where each id is a NEW Paperless document
        (one per page).
    """

    # Clients are constructed per call (no hidden shared state); timeouts and
    # polling intervals come from settings.
    paperless = PaperlessClient(
        base_url=str(settings.paperless_base_url),
        token=settings.paperless_token,
        task_timeout_s=settings.paperless_task_timeout_s,
        task_poll_interval_s=settings.paperless_task_poll_interval_s,
    )
    llama = LlamaClient(
        base_url=str(settings.llama_base_url),
        model=settings.llama_model,
        temperature=settings.ocr_temperature,
        max_tokens=settings.ocr_max_tokens,
    )

    # 1) Download the source PDF.
    logger.info("job_id=%s downloading paperless_document_id=%s", job_id, paperless_document_id)
    pdf_bytes = await paperless.download_document_pdf(document_id=paperless_document_id)
    logger.info("job_id=%s downloaded_pdf_bytes=%s", job_id, len(pdf_bytes))

    # 2) Render the PDF pages as JPEG images.
    logger.info("job_id=%s rendering_pages dpi=%s", job_id, settings.render_dpi)
    jpegs = pdf_utils.render_pdf_to_jpegs(pdf_bytes=pdf_bytes, dpi=settings.render_dpi)
    total_pages = len(jpegs)
    logger.info("job_id=%s rendered_pages=%s", job_id, total_pages)
    # Report 0/N up-front so callers can show progress before page 1 finishes.
    if on_progress:
        await on_progress(0, total_pages)

    created_ids: list[int] = []

    # 3) For each page: OCR -> convert to single-page PDF -> upload -> patch metadata.
    for idx, jpeg_bytes in enumerate(jpegs, start=1):
        logger.info("job_id=%s page=%s/%s starting", job_id, idx, total_pages)
        # 3a) Page-number OCR (bottom corners only). -1 means "unknown":
        # both our sentinel and the model's instructed "cannot determine" answer.
        page_number = -1
        for corner_jpeg in _crop_bottom_corner_jpegs(full_page_jpeg=jpeg_bytes):
            candidate_text = await llama.ocr_jpeg(jpeg_bytes=corner_jpeg, prompt=PAGE_NUMBER_PROMPT)
            parsed = _parse_page_number(candidate_text)
            if parsed is not None:
                # Only accept non-negative numbers, or -1. Anything else becomes unknown.
                if parsed == -1 or parsed >= 0:
                    page_number = parsed
            # Stop at the first corner that yields a usable page number.
            if page_number != -1:
                break
        logger.info("job_id=%s page=%s detected_page_number=%s", job_id, idx, page_number)

        # 3b) Full-page OCR for actual searchable text content.
        logger.info("job_id=%s page=%s ocr_full_page", job_id, idx)
        ocr_text = await llama.ocr_jpeg(jpeg_bytes=jpeg_bytes, prompt=ocr_prompt_override)
        logger.info("job_id=%s page=%s ocr_chars=%s", job_id, idx, len(ocr_text))

        # Re-wrap the rendered JPEG as a standalone single-page PDF.
        page_pdf = pdf_utils.jpeg_to_pdf_bytes(jpeg_bytes=jpeg_bytes)
        logger.info("job_id=%s page=%s pdf_bytes=%s", job_id, idx, len(page_pdf))

        # Upload the per-page PDF as a new Paperless document.
        logger.info("job_id=%s page=%s uploading_to_paperless", job_id, idx)
        new_id = await paperless.upload_pdf(filename=f"job_{job_id}_page_{idx}.pdf", pdf_bytes=page_pdf)
        logger.info("job_id=%s page=%s uploaded_document_id=%s", job_id, idx, new_id)

        # Patch metadata:
        # - content: OCR text so it becomes searchable in Paperless
        # - custom_fields: notebook_id + notebook_page
        # - document_type: per-page document type (Paperless id)
        # - title
        custom_fields = [
            {"field": settings.paperless_custom_field_notebook_id, "value": notebook_id},
            {"field": settings.paperless_custom_field_notebook_page, "value": page_number},
        ]

        # Per your request, title is always in this exact format.
        # (We keep `title_prefix` in the API for now, but it is no longer used.)
        # NOTE: an undetected page number yields "... Page -1" in the title.
        title = f"Notebook {notebook_id} Page {page_number}"

        logger.info("job_id=%s page=%s patching_document_id=%s", job_id, idx, new_id)
        await paperless.patch_document(
            document_id=new_id,
            title=title,
            content=ocr_text,
            custom_fields=custom_fields,
            document_type=settings.paperless_document_type_id,
        )
        logger.info("job_id=%s page=%s patched_document_id=%s", job_id, idx, new_id)

        # Record the new id only after the patch succeeded, then report progress.
        created_ids.append(new_id)
        if on_progress:
            await on_progress(idx, total_pages)

    return {"created_document_ids": created_ids}
|
||||
|
||||
Reference in New Issue
Block a user