From 4ce3eb0abd3e2f5ced96346fedccc72cd953cd2c Mon Sep 17 00:00:00 2001 From: Paul Walko Date: Fri, 12 Dec 2025 17:52:44 +0100 Subject: [PATCH] poller docker --- .gitea/workflows/build-push-poller.yaml | 30 +++++++++++++++++ poller/.dockerignore | 8 +++++ poller/.python-version | 1 - poller/Dockerfile | 19 +++++++++++ poller/README.md | 44 +++++++++++++++++++++++++ poller/main.py | 33 ++++++++++++------- poller/pyproject.toml | 2 +- 7 files changed, 124 insertions(+), 13 deletions(-) create mode 100644 .gitea/workflows/build-push-poller.yaml create mode 100644 poller/.dockerignore delete mode 100644 poller/.python-version create mode 100644 poller/Dockerfile diff --git a/.gitea/workflows/build-push-poller.yaml b/.gitea/workflows/build-push-poller.yaml new file mode 100644 index 0000000..2044103 --- /dev/null +++ b/.gitea/workflows/build-push-poller.yaml @@ -0,0 +1,30 @@ +name: Build and Push Poller Docker Image + +on: + push: + branches: + - main + +jobs: + build: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Login to Gitea Container Registry + uses: docker/login-action@v3 + with: + registry: git.seaturtle.pw + username: ${{ gitea.actor }} + password: ${{ secrets.ACTIONS_PUSH_TOKEN }} + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v3 + + - name: Build and push + uses: docker/build-push-action@v6 + with: + context: ./poller + push: true + tags: git.seaturtle.pw/cavepedia/cavepediav2-poller:latest diff --git a/poller/.dockerignore b/poller/.dockerignore new file mode 100644 index 0000000..c3437b0 --- /dev/null +++ b/poller/.dockerignore @@ -0,0 +1,8 @@ +.venv/ +__pycache__/ +*.pyc +*.pyo +.git +.gitignore +README.md +.python-version diff --git a/poller/.python-version b/poller/.python-version deleted file mode 100644 index 2c07333..0000000 --- a/poller/.python-version +++ /dev/null @@ -1 +0,0 @@ -3.11 diff --git a/poller/Dockerfile b/poller/Dockerfile new file mode 100644 index 0000000..b85c2fc --- 
/dev/null +++ b/poller/Dockerfile @@ -0,0 +1,19 @@ +# syntax=docker/dockerfile:1 + +FROM python:3.14-slim + +WORKDIR /app + +# Install uv for fast dependency management +COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/ + +# Copy dependency files +COPY pyproject.toml uv.lock ./ + +# Install dependencies +RUN uv sync --frozen --no-dev --no-install-project + +# Copy application code +COPY main.py ./ + +CMD ["uv", "run", "main.py"] diff --git a/poller/README.md b/poller/README.md index b387771..9263e40 100644 --- a/poller/README.md +++ b/poller/README.md @@ -12,3 +12,47 @@ Every 5 minutes, this polls for new documents as follows: * A temporary public S3 file link is generated using a presigned s3 url. 5. Checks the `embeddings` table for any rows that have been OCR'd, but do not have embeddings generated, then generates embeddings with cohere. * No batching is used with cohere. + +## Environment Variables + +| Variable | Required | Default | Description | +|----------|----------|---------|-------------| +| `COHERE_API_KEY` | Yes | - | Cohere API key for embeddings | +| `S3_ACCESS_KEY` | Yes | - | S3/MinIO access key | +| `S3_SECRET_KEY` | Yes | - | S3/MinIO secret key | +| `DB_PASSWORD` | Yes | - | PostgreSQL password | +| `ANTHROPIC_API_KEY` | Yes | - | Claude API key for OCR | +| `DB_HOST` | No | localhost | PostgreSQL host | +| `DB_PORT` | No | 5432 | PostgreSQL port | +| `DB_NAME` | No | cavepediav2_db | PostgreSQL database name | +| `DB_USER` | No | cavepediav2_user | PostgreSQL username | +| `S3_ENDPOINT` | No | https://s3.bigcavemaps.com | S3 endpoint URL | +| `S3_REGION` | No | eu | S3 region | + +## Development + +```bash +# Create .env file with required variables +cp .env.example .env + +# Install dependencies +uv sync + +# Run +uv run main.py +``` + +## Deployment + +The poller is automatically built and pushed to `git.seaturtle.pw/cavepedia/cavepediav2-poller:latest` on push to main. 
+ +```bash +docker run \ + -e COHERE_API_KEY="xxx" \ + -e S3_ACCESS_KEY="xxx" \ + -e S3_SECRET_KEY="xxx" \ + -e DB_PASSWORD="xxx" \ + -e DB_HOST="postgres" \ + -e ANTHROPIC_API_KEY="xxx" \ + git.seaturtle.pw/cavepedia/cavepediav2-poller:latest +``` diff --git a/poller/main.py b/poller/main.py index cad4a03..a7aa42e 100644 --- a/poller/main.py +++ b/poller/main.py @@ -27,26 +27,37 @@ logger.addHandler(logHandler) ##### -dotenv.load_dotenv('/home/pew/scripts-private/loser/cavepedia-v2/poller.env') +# Load .env file if it exists (for local dev) +dotenv.load_dotenv() -COHERE_API_KEY = os.getenv('COHERE_API_KEY') -S3_ACCESS_KEY = os.getenv('S3_ACCESS_KEY') -S3_SECRET_KEY = os.getenv('S3_SECRET_KEY') +# Cohere and S3 config (API keys are required; S3 endpoint and region have defaults) +COHERE_API_KEY = os.environ['COHERE_API_KEY'] +S3_ACCESS_KEY = os.environ['S3_ACCESS_KEY'] +S3_SECRET_KEY = os.environ['S3_SECRET_KEY'] +S3_ENDPOINT = os.environ.get('S3_ENDPOINT', 'https://s3.bigcavemaps.com') +S3_REGION = os.environ.get('S3_REGION', 'eu') + +# Database config +DB_HOST = os.environ.get('DB_HOST', 'localhost') +DB_PORT = int(os.environ.get('DB_PORT', '5432')) +DB_NAME = os.environ.get('DB_NAME', 'cavepediav2_db') +DB_USER = os.environ.get('DB_USER', 'cavepediav2_user') +DB_PASSWORD = os.environ['DB_PASSWORD'] s3 = boto3.client( 's3', aws_access_key_id=S3_ACCESS_KEY, aws_secret_access_key=S3_SECRET_KEY, - endpoint_url='https://s3.bigcavemaps.com', - region_name='eu', + endpoint_url=S3_ENDPOINT, + region_name=S3_REGION, ) co = cohere.ClientV2(api_key=COHERE_API_KEY) conn = psycopg.connect( - host='::1', - port=9030, - dbname='cavepediav2_db', - user='cavepediav2_user', - password='cavepediav2_pw', + host=DB_HOST, + port=DB_PORT, + dbname=DB_NAME, + user=DB_USER, + password=DB_PASSWORD, row_factory=dict_row, ) diff --git a/poller/pyproject.toml b/poller/pyproject.toml index a8fda24..f11cd42 100644 --- a/poller/pyproject.toml +++ b/poller/pyproject.toml @@ -3,7 +3,7 @@ name = "poller" version = "1.0.0" description = "Cavepedia 
v2 Poller" readme = "README.md" -requires-python = ">=3.11" +requires-python = ">=3.14" dependencies = [ "anthropic>=0.52.0", "boto3>=1.42.4",