diff --git a/poller/main.py b/poller/main.py index 5728198..3e43519 100644 --- a/poller/main.py +++ b/poller/main.py @@ -13,6 +13,17 @@ import numpy as np import os import psycopg import time +import logging +from pythonjsonlogger.json import JsonFormatter + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) +logHandler = logging.StreamHandler() +formatter = JsonFormatter("{asctime}{message}", style="{") +logHandler.setFormatter(formatter) +logger.addHandler(logHandler) + +##### dotenv.load_dotenv('/home/paul/scripts-private/lech/cavepedia-v2/poller.env') @@ -67,7 +78,7 @@ def split_pdfs(): key = record['s3']['object']['key'] key = unquote(key) - print(f'SPLITTING bucket: {bucket}, key: {key}') + logger.info(f'SPLITTING bucket: {bucket}, key: {key}') ##### get pdf ##### with s3.get_object(bucket, key) as obj: @@ -131,7 +142,7 @@ def process_events(): for row in rows: bucket = row['bucket'] key = row['key'] - print(f'PROCESSING bucket: {bucket}, key: {key}') + logger.info(f'PROCESSING bucket: {bucket}, key: {key}') # tier 1 limit: 4k tokens/min # single pdf = 2-3k tokens @@ -151,15 +162,15 @@ def process_events(): if e.type == 'overloaded_error': if attempt < max_retries - 1: sleep_time = retry_delay * (2 ** attempt) - print(f"Overload error. Retrying in {sleep_time:.2f} seconds...") + logger.info(f"Overload error. Retrying in {sleep_time:.2f} seconds...") time.sleep(sleep_time) else: - print('Max retries reached.') + logger.info('Max retries reached.') raise else: raise except Exception as e: - print(f"An unexpected error occurred: {e}") + logger.error(f"An unexpected error occurred: {e}") BACKOFF = True break @@ -173,6 +184,14 @@ def embed(text, input_type): ) return resp.embeddings.float[0] +def fix_pages(): + i = 766 + while i > 0: + conn.execute('UPDATE embeddings SET key = %s WHERE key = %s', (f'public/va/caves-of-virginia.pdf/page-{i}.pdf', f'public/va/caves-of-virginia.pdf/page-{i-1}.pdf')) + conn.commit() + i -= 1 + + if __name__ == '__main__': create_tables() while True: @@ -182,6 +201,6 @@ if __name__ == '__main__': process_events() if BACKOFF: - print('BACKOFF') + logger.info('BACKOFF') time.sleep(10 * 60) time.sleep(5 * 60) diff --git a/poller/pyproject.toml b/poller/pyproject.toml index 3ece2a9..9603ca2 100644 --- a/poller/pyproject.toml +++ b/poller/pyproject.toml @@ -13,5 +13,6 @@ dependencies = [ "psycopg[binary]>=3.2.9", "pypdf>=5.5.0", "python-dotenv>=1.1.0", + "python-json-logger>=3.3.0", "types-psycopg2>=2.9.21.20250516", ] diff --git a/poller/search.py b/poller/search.py index 7f93eb6..b174797 100644 --- a/poller/search.py +++ b/poller/search.py @@ -36,7 +36,7 @@ def embed(text, input_type): return resp.embeddings.float[0] def search(): - query = 'caves locations in bath county' + query = 'tazwell county caves' query_embedding = embed(query, 'search_query') rows = conn.execute('SELECT * FROM embeddings ORDER BY embedding <=> %s::vector LIMIT 5', (query_embedding,)).fetchall() diff --git a/poller/uv.lock b/poller/uv.lock index 7889297..9f0332a 100644 --- a/poller/uv.lock +++ b/poller/uv.lock @@ -552,6 +552,7 @@ dependencies = [ { name = "psycopg", extra = ["binary"] }, { name = "pypdf" }, { name = "python-dotenv" }, + { name = "python-json-logger" }, { name = "types-psycopg2" }, ] @@ -565,6 +566,7 @@ requires-dist = [ { name = "psycopg", extras = ["binary"], specifier = ">=3.2.9" }, { name = "pypdf", specifier = ">=5.5.0" }, { name = "python-dotenv", specifier = ">=1.1.0" }, + { name = "python-json-logger", specifier = ">=3.3.0" }, { name = "types-psycopg2", specifier = ">=2.9.21.20250516" }, ] @@ -763,6 +765,15 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/1e/18/98a99ad95133c6a6e2005fe89faedf294a748bd5dc803008059409ac9b1e/python_dotenv-1.1.0-py3-none-any.whl", hash = "sha256:d7c01d9e2293916c18baf562d95698754b0dbbb5e74d457c45d4f6561fb9d55d", size = 20256, upload-time = "2025-03-25T10:14:55.034Z" }, ] +[[package]] +name = "python-json-logger" +version = "3.3.0" +source = { registry = "https://pypi.org/simple" } +sdist = { url = "https://files.pythonhosted.org/packages/9e/de/d3144a0bceede957f961e975f3752760fbe390d57fbe194baf709d8f1f7b/python_json_logger-3.3.0.tar.gz", hash = "sha256:12b7e74b17775e7d565129296105bbe3910842d9d0eb083fc83a6a617aa8df84", size = 16642, upload-time = "2025-03-07T07:08:27.301Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/08/20/0f2523b9e50a8052bc6a8b732dfc8568abbdc42010aef03a2d750bdab3b2/python_json_logger-3.3.0-py3-none-any.whl", hash = "sha256:dd980fae8cffb24c13caf6e158d3d61c0d6d22342f932cb6e9deedab3d35eec7", size = 15163, upload-time = "2025-03-07T07:08:25.627Z" }, +] + [[package]] name = "pyyaml" version = "6.0.2"