Compare commits
4 Commits
29c790fdfd
...
main
| Author | SHA1 | Date | |
|---|---|---|---|
| 4820888ff9 | |||
| bcb4714778 | |||
| 7fec4bc575 | |||
| 612fbe2055 |
16
.dockerignore
Normal file
16
.dockerignore
Normal file
@@ -0,0 +1,16 @@
|
|||||||
|
.git
|
||||||
|
.github
|
||||||
|
.gitea
|
||||||
|
.venv
|
||||||
|
__pycache__
|
||||||
|
*.py[cod]
|
||||||
|
.pytest_cache
|
||||||
|
.ruff_cache
|
||||||
|
.coverage
|
||||||
|
htmlcov
|
||||||
|
.env
|
||||||
|
.env.*
|
||||||
|
!.env.example
|
||||||
|
tests
|
||||||
|
.cursor
|
||||||
|
*.plan.md
|
||||||
@@ -13,6 +13,9 @@ PAPERLESS_DOCUMENT_TYPE_ID=3
|
|||||||
PAPERLESS_TASK_TIMEOUT_S=600
|
PAPERLESS_TASK_TIMEOUT_S=600
|
||||||
PAPERLESS_TASK_POLL_INTERVAL_S=5.0
|
PAPERLESS_TASK_POLL_INTERVAL_S=5.0
|
||||||
|
|
||||||
|
# Max concurrent per-page uploads to Paperless (0 = unlimited)
|
||||||
|
PAPERLESS_UPLOAD_CONCURRENCY=0
|
||||||
|
|
||||||
RENDER_DPI=200
|
RENDER_DPI=200
|
||||||
OCR_MAX_TOKENS=1024
|
OCR_MAX_TOKENS=1024
|
||||||
OCR_TEMPERATURE=0.0
|
OCR_TEMPERATURE=0.0
|
||||||
|
|||||||
38
.github/workflows/build-docker.yml
vendored
Normal file
38
.github/workflows/build-docker.yml
vendored
Normal file
@@ -0,0 +1,38 @@
|
|||||||
|
name: Build and Publish Docker Image
|
||||||
|
|
||||||
|
on:
|
||||||
|
push:
|
||||||
|
branches:
|
||||||
|
- main
|
||||||
|
|
||||||
|
jobs:
|
||||||
|
build-and-push:
|
||||||
|
runs-on: ubuntu-latest
|
||||||
|
steps:
|
||||||
|
- name: Checkout code
|
||||||
|
uses: actions/checkout@v4
|
||||||
|
|
||||||
|
- name: Set up Docker Buildx
|
||||||
|
uses: docker/setup-buildx-action@v3
|
||||||
|
|
||||||
|
- name: Login to Docker Registry
|
||||||
|
uses: docker/login-action@v3
|
||||||
|
with:
|
||||||
|
registry: ${{ secrets.DOCKER_REGISTRY }}
|
||||||
|
username: ${{ secrets.DOCKER_USERNAME }}
|
||||||
|
password: ${{ secrets.DOCKER_PASSWORD }}
|
||||||
|
|
||||||
|
- name: Build and push
|
||||||
|
uses: docker/build-push-action@v5
|
||||||
|
with:
|
||||||
|
context: .
|
||||||
|
file: Dockerfile
|
||||||
|
push: true
|
||||||
|
tags: |
|
||||||
|
${{ secrets.DOCKER_REGISTRY }}/${{ secrets.DOCKER_USERNAME }}/notebook-tools:${{ gitea.sha }}
|
||||||
|
${{ secrets.DOCKER_REGISTRY }}/${{ secrets.DOCKER_USERNAME }}/notebook-tools:latest
|
||||||
|
labels: |
|
||||||
|
org.opencontainers.image.source=${{ gitea.server_url }}/${{ gitea.repository }}
|
||||||
|
org.opencontainers.image.description=Notebook tools — Paperless + llama.cpp OCR API
|
||||||
|
cache-from: type=gha
|
||||||
|
cache-to: type=gha,mode=max
|
||||||
19
Dockerfile
Normal file
19
Dockerfile
Normal file
@@ -0,0 +1,19 @@
|
|||||||
|
# syntax=docker/dockerfile:1
|
||||||
|
# Production image: uv sync (frozen lockfile), run FastAPI with uvicorn.
|
||||||
|
FROM ghcr.io/astral-sh/uv:python3.12-bookworm-slim
|
||||||
|
|
||||||
|
WORKDIR /app
|
||||||
|
|
||||||
|
ENV UV_COMPILE_BYTECODE=1 \
|
||||||
|
UV_LINK_MODE=copy
|
||||||
|
|
||||||
|
COPY pyproject.toml uv.lock README.md ./
|
||||||
|
COPY src ./src
|
||||||
|
|
||||||
|
RUN uv sync --frozen --no-dev
|
||||||
|
|
||||||
|
ENV PATH="/app/.venv/bin:$PATH"
|
||||||
|
|
||||||
|
EXPOSE 8080
|
||||||
|
|
||||||
|
CMD ["uvicorn", "notebook_tools.api:app", "--host", "0.0.0.0", "--port", "8080"]
|
||||||
50
README.md
50
README.md
@@ -5,7 +5,7 @@ FastAPI service that:
|
|||||||
- splits them into pages (JPEG)
|
- splits them into pages (JPEG)
|
||||||
- OCRs each page via your llama.cpp OpenAI-compatible endpoint
|
- OCRs each page via your llama.cpp OpenAI-compatible endpoint
|
||||||
- converts each page back into a single-page PDF
|
- converts each page back into a single-page PDF
|
||||||
- uploads **one Paperless document per page**
|
- uploads **one Paperless document per page** (uploads run **in parallel**, optionally capped by `PAPERLESS_UPLOAD_CONCURRENCY`; OCR stays **one page at a time** to limit VRAM use)
|
||||||
- patches each uploaded document with:
|
- patches each uploaded document with:
|
||||||
- `content` = OCR text
|
- `content` = OCR text
|
||||||
- custom fields `notebook_id` (field id 1) and `notebook_page` (field id 2)
|
- custom fields `notebook_id` (field id 1) and `notebook_page` (field id 2)
|
||||||
@@ -33,6 +33,51 @@ Then open the docs at:
|
|||||||
|
|
||||||
If other machines still can’t connect, check your macOS firewall and any router/network rules.
|
If other machines still can’t connect, check your macOS firewall and any router/network rules.
|
||||||
|
|
||||||
|
## Docker
|
||||||
|
|
||||||
|
Build and run (pass environment variables with `--env-file` or `-e`; the image does not contain a `.env` file, so the app only reads one if you mount it into the container):
|
||||||
|
|
||||||
|
```bash
|
||||||
|
docker build -t notebook-tools:local .
|
||||||
|
docker run --rm -p 8080:8080 --env-file .env notebook-tools:local
|
||||||
|
```
|
||||||
|
|
||||||
|
`LLAMA_BASE_URL` / `PAPERLESS_BASE_URL` must be reachable **from inside the container** (use `host.docker.internal` on Docker Desktop, or your LAN IP, not `127.0.0.1` for services on the host).
|
||||||
|
|
||||||
|
### Docker Compose
|
||||||
|
|
||||||
|
Save the following as `compose.yaml` in the same directory as your `.env` file:
|
||||||
|
|
||||||
|
```yaml
|
||||||
|
services:
|
||||||
|
notebook-tools:
|
||||||
|
image: git.danhenry.dev/daniel/notebook-tools:latest
|
||||||
|
ports:
|
||||||
|
- "8080:8080"
|
||||||
|
env_file:
|
||||||
|
- .env
|
||||||
|
# Lets the container reach services bound on the host (e.g. llama on :9292).
|
||||||
|
# Linux: requires Docker 20.10+ / Compose v2. Docker Desktop already provides host.docker.internal, so you can omit this there.
|
||||||
|
extra_hosts:
|
||||||
|
- "host.docker.internal:host-gateway"
|
||||||
|
```
|
||||||
|
|
||||||
|
```bash
|
||||||
|
docker compose pull && docker compose up
|
||||||
|
```
|
||||||
|
|
||||||
|
Log in to `git.danhenry.dev` first if the registry requires auth: `docker login git.danhenry.dev`.
|
||||||
|
|
||||||
|
For llama running **on the host**, set in `.env`:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
LLAMA_BASE_URL="http://host.docker.internal:9292"
|
||||||
|
```
|
||||||
|
|
||||||
|
`PAPERLESS_BASE_URL` can stay a normal `https://…` URL if the container has network access to it.
|
||||||
|
|
||||||
|
CI: on every push to `main`, [.github/workflows/build-docker.yml](.github/workflows/build-docker.yml) builds the image and pushes it tagged with the commit SHA and `latest`, using the same secrets pattern as your other Gitea repos (`DOCKER_REGISTRY`, `DOCKER_USERNAME`, `DOCKER_PASSWORD`). For Docker Hub, set `DOCKER_REGISTRY` to `docker.io`; do not leave it empty, because the workflow prefixes every image tag with it.
|
||||||
|
|
||||||
## Example `.env`
|
## Example `.env`
|
||||||
|
|
||||||
```bash
|
```bash
|
||||||
@@ -47,6 +92,9 @@ PAPERLESS_CUSTOM_FIELD_NOTEBOOK_ID=1
|
|||||||
PAPERLESS_CUSTOM_FIELD_NOTEBOOK_PAGE=2
|
PAPERLESS_CUSTOM_FIELD_NOTEBOOK_PAGE=2
|
||||||
PAPERLESS_DOCUMENT_TYPE_ID=3
|
PAPERLESS_DOCUMENT_TYPE_ID=3
|
||||||
|
|
||||||
|
# Optional: cap concurrent Paperless uploads (0 = unlimited)
|
||||||
|
PAPERLESS_UPLOAD_CONCURRENCY=4
|
||||||
|
|
||||||
# Rendering / OCR knobs
|
# Rendering / OCR knobs
|
||||||
RENDER_DPI=200
|
RENDER_DPI=200
|
||||||
OCR_MAX_TOKENS=1024
|
OCR_MAX_TOKENS=1024
|
||||||
|
|||||||
@@ -6,19 +6,25 @@ Design goals:
|
|||||||
- Keep the pipeline readable and linear.
|
- Keep the pipeline readable and linear.
|
||||||
- Return enough information (created ids) for the job API.
|
- Return enough information (created ids) for the job API.
|
||||||
- Avoid hidden side-effects (everything is passed in / returned).
|
- Avoid hidden side-effects (everything is passed in / returned).
|
||||||
|
|
||||||
|
Upload strategy:
|
||||||
|
- All per-page PDFs are uploaded to Paperless concurrently, optionally capped by `settings.paperless_upload_concurrency` (each upload still polls until a doc id exists).
|
||||||
|
- OCR (llama) runs one page at a time to respect VRAM.
|
||||||
|
- Each page is PATCHed once that page's upload has finished and OCR for that page is done.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
from collections.abc import Awaitable, Callable
|
import asyncio
|
||||||
import io
|
import io
|
||||||
import logging
|
import logging
|
||||||
import re
|
import re
|
||||||
|
from collections.abc import Awaitable, Callable
|
||||||
|
|
||||||
|
from notebook_tools import pdf_utils
|
||||||
from notebook_tools.llama_client import LlamaClient
|
from notebook_tools.llama_client import LlamaClient
|
||||||
from notebook_tools.paperless_client import PaperlessClient
|
from notebook_tools.paperless_client import PaperlessClient
|
||||||
from notebook_tools.settings import Settings
|
from notebook_tools.settings import Settings
|
||||||
from notebook_tools import pdf_utils
|
|
||||||
from PIL import Image
|
from PIL import Image
|
||||||
|
|
||||||
logger = logging.getLogger("notebook_tools.pipeline")
|
logger = logging.getLogger("notebook_tools.pipeline")
|
||||||
@@ -124,64 +130,86 @@ async def run_pipeline_for_paperless_document(
|
|||||||
if on_progress:
|
if on_progress:
|
||||||
await on_progress(0, total_pages)
|
await on_progress(0, total_pages)
|
||||||
|
|
||||||
|
# One small PDF per page (used by upload tasks).
|
||||||
|
page_pdfs = [pdf_utils.jpeg_to_pdf_bytes(jpeg_bytes=b) for b in jpegs]
|
||||||
|
|
||||||
|
# 3) Start all Paperless uploads in parallel (each task waits for ingest + document id).
|
||||||
|
conc = settings.paperless_upload_concurrency
|
||||||
|
upload_sem: asyncio.Semaphore | None = (
|
||||||
|
asyncio.Semaphore(conc) if conc and conc > 0 else None
|
||||||
|
)
|
||||||
|
logger.info(
|
||||||
|
"job_id=%s starting_parallel_uploads pages=%s concurrency=%s",
|
||||||
|
job_id,
|
||||||
|
total_pages,
|
||||||
|
conc if conc and conc > 0 else "unlimited",
|
||||||
|
)
|
||||||
|
|
||||||
|
async def _upload_one_page(idx_1based: int) -> int:
|
||||||
|
filename = f"job_{job_id}_page_{idx_1based}.pdf"
|
||||||
|
pdf_bytes = page_pdfs[idx_1based - 1]
|
||||||
|
logger.info("job_id=%s page=%s/%s upload_task_starting", job_id, idx_1based, total_pages)
|
||||||
|
if upload_sem is not None:
|
||||||
|
async with upload_sem:
|
||||||
|
return await paperless.upload_pdf(filename=filename, pdf_bytes=pdf_bytes)
|
||||||
|
return await paperless.upload_pdf(filename=filename, pdf_bytes=pdf_bytes)
|
||||||
|
|
||||||
|
upload_tasks: list[asyncio.Task[int]] = [
|
||||||
|
asyncio.create_task(_upload_one_page(i)) for i in range(1, total_pages + 1)
|
||||||
|
]
|
||||||
|
|
||||||
created_ids: list[int] = []
|
created_ids: list[int] = []
|
||||||
|
|
||||||
# 3) For each page: OCR -> convert to single-page PDF -> upload -> patch metadata.
|
# 4) OCR sequentially (VRAM), then await upload for that page + PATCH.
|
||||||
for idx, jpeg_bytes in enumerate(jpegs, start=1):
|
try:
|
||||||
logger.info("job_id=%s page=%s/%s starting", job_id, idx, total_pages)
|
for idx, jpeg_bytes in enumerate(jpegs, start=1):
|
||||||
# 3a) Page-number OCR (bottom corners only).
|
logger.info("job_id=%s page=%s/%s ocr_starting", job_id, idx, total_pages)
|
||||||
page_number = -1
|
# 4a) Page-number OCR (bottom corners only).
|
||||||
for corner_jpeg in _crop_bottom_corner_jpegs(full_page_jpeg=jpeg_bytes):
|
page_number = -1
|
||||||
candidate_text = await llama.ocr_jpeg(jpeg_bytes=corner_jpeg, prompt=PAGE_NUMBER_PROMPT)
|
for corner_jpeg in _crop_bottom_corner_jpegs(full_page_jpeg=jpeg_bytes):
|
||||||
parsed = _parse_page_number(candidate_text)
|
candidate_text = await llama.ocr_jpeg(jpeg_bytes=corner_jpeg, prompt=PAGE_NUMBER_PROMPT)
|
||||||
if parsed is not None:
|
parsed = _parse_page_number(candidate_text)
|
||||||
# Only accept non-negative numbers, or -1. Anything else becomes unknown.
|
if parsed is not None:
|
||||||
if parsed == -1 or parsed >= 0:
|
if parsed == -1 or parsed >= 0:
|
||||||
page_number = parsed
|
page_number = parsed
|
||||||
if page_number != -1:
|
if page_number != -1:
|
||||||
break
|
break
|
||||||
logger.info("job_id=%s page=%s detected_page_number=%s", job_id, idx, page_number)
|
logger.info("job_id=%s page=%s detected_page_number=%s", job_id, idx, page_number)
|
||||||
|
|
||||||
# 3b) Full-page OCR for actual searchable text content.
|
# 4b) Full-page OCR for searchable content.
|
||||||
logger.info("job_id=%s page=%s ocr_full_page", job_id, idx)
|
logger.info("job_id=%s page=%s ocr_full_page", job_id, idx)
|
||||||
ocr_text = await llama.ocr_jpeg(jpeg_bytes=jpeg_bytes, prompt=ocr_prompt_override)
|
ocr_text = await llama.ocr_jpeg(jpeg_bytes=jpeg_bytes, prompt=ocr_prompt_override)
|
||||||
logger.info("job_id=%s page=%s ocr_chars=%s", job_id, idx, len(ocr_text))
|
logger.info("job_id=%s page=%s ocr_chars=%s", job_id, idx, len(ocr_text))
|
||||||
|
|
||||||
page_pdf = pdf_utils.jpeg_to_pdf_bytes(jpeg_bytes=jpeg_bytes)
|
logger.info("job_id=%s page=%s awaiting_upload_then_patch", job_id, idx)
|
||||||
logger.info("job_id=%s page=%s pdf_bytes=%s", job_id, idx, len(page_pdf))
|
new_id = await upload_tasks[idx - 1]
|
||||||
|
logger.info("job_id=%s page=%s uploaded_document_id=%s", job_id, idx, new_id)
|
||||||
|
|
||||||
# Upload the per-page PDF as a new Paperless document.
|
custom_fields = [
|
||||||
logger.info("job_id=%s page=%s uploading_to_paperless", job_id, idx)
|
{"field": settings.paperless_custom_field_notebook_id, "value": notebook_id},
|
||||||
new_id = await paperless.upload_pdf(filename=f"job_{job_id}_page_{idx}.pdf", pdf_bytes=page_pdf)
|
{"field": settings.paperless_custom_field_notebook_page, "value": page_number},
|
||||||
logger.info("job_id=%s page=%s uploaded_document_id=%s", job_id, idx, new_id)
|
]
|
||||||
|
|
||||||
# Patch metadata:
|
title = f"Notebook {notebook_id} Page {page_number}"
|
||||||
# - content: OCR text so it becomes searchable in Paperless
|
|
||||||
# - custom_fields: notebook_id + notebook_page
|
|
||||||
# - document_type: per-page document type (Paperless id)
|
|
||||||
# - title
|
|
||||||
custom_fields = [
|
|
||||||
{"field": settings.paperless_custom_field_notebook_id, "value": notebook_id},
|
|
||||||
{"field": settings.paperless_custom_field_notebook_page, "value": page_number},
|
|
||||||
]
|
|
||||||
|
|
||||||
# Per your request, title is always in this exact format.
|
logger.info("job_id=%s page=%s patching_document_id=%s", job_id, idx, new_id)
|
||||||
# (We keep `title_prefix` in the API for now, but it is no longer used.)
|
await paperless.patch_document(
|
||||||
title = f"Notebook {notebook_id} Page {page_number}"
|
document_id=new_id,
|
||||||
|
title=title,
|
||||||
|
content=ocr_text,
|
||||||
|
custom_fields=custom_fields,
|
||||||
|
document_type=settings.paperless_document_type_id,
|
||||||
|
)
|
||||||
|
logger.info("job_id=%s page=%s patched_document_id=%s", job_id, idx, new_id)
|
||||||
|
|
||||||
logger.info("job_id=%s page=%s patching_document_id=%s", job_id, idx, new_id)
|
created_ids.append(new_id)
|
||||||
await paperless.patch_document(
|
if on_progress:
|
||||||
document_id=new_id,
|
await on_progress(idx, total_pages)
|
||||||
title=title,
|
except BaseException:
|
||||||
content=ocr_text,
|
for t in upload_tasks:
|
||||||
custom_fields=custom_fields,
|
if not t.done():
|
||||||
document_type=settings.paperless_document_type_id,
|
t.cancel()
|
||||||
)
|
await asyncio.gather(*upload_tasks, return_exceptions=True)
|
||||||
logger.info("job_id=%s page=%s patched_document_id=%s", job_id, idx, new_id)
|
raise
|
||||||
|
|
||||||
created_ids.append(new_id)
|
|
||||||
if on_progress:
|
|
||||||
await on_progress(idx, total_pages)
|
|
||||||
|
|
||||||
return {"created_document_ids": created_ids}
|
return {"created_document_ids": created_ids}
|
||||||
|
|
||||||
|
|||||||
@@ -46,6 +46,9 @@ class Settings(BaseSettings):
|
|||||||
paperless_task_timeout_s: int = Field(600, alias="PAPERLESS_TASK_TIMEOUT_S")
|
paperless_task_timeout_s: int = Field(600, alias="PAPERLESS_TASK_TIMEOUT_S")
|
||||||
paperless_task_poll_interval_s: float = Field(5.0, alias="PAPERLESS_TASK_POLL_INTERVAL_S")
|
paperless_task_poll_interval_s: float = Field(5.0, alias="PAPERLESS_TASK_POLL_INTERVAL_S")
|
||||||
|
|
||||||
|
# Max concurrent per-page uploads to Paperless (0 = unlimited). Limits load on the server.
|
||||||
|
paperless_upload_concurrency: int = Field(0, alias="PAPERLESS_UPLOAD_CONCURRENCY")
|
||||||
|
|
||||||
# Logging
|
# Logging
|
||||||
log_level: str = Field("INFO", alias="LOG_LEVEL")
|
log_level: str = Field("INFO", alias="LOG_LEVEL")
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user