logging
This commit is contained in:
@@ -13,6 +13,17 @@ import numpy as np
|
|||||||
import os
|
import os
|
||||||
import psycopg
|
import psycopg
|
||||||
import time
|
import time
|
||||||
|
import logging
|
||||||
|
from pythonjsonlogger.json import JsonFormatter
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
logger.setLevel(logging.INFO)
|
||||||
|
logHandler = logging.StreamHandler()
|
||||||
|
formatter = JsonFormatter("{asctime}{message}", style="{")
|
||||||
|
logHandler.setFormatter(formatter)
|
||||||
|
logger.addHandler(logHandler)
|
||||||
|
|
||||||
|
#####
|
||||||
|
|
||||||
dotenv.load_dotenv('/home/paul/scripts-private/lech/cavepedia-v2/poller.env')
|
dotenv.load_dotenv('/home/paul/scripts-private/lech/cavepedia-v2/poller.env')
|
||||||
|
|
||||||
@@ -67,7 +78,7 @@ def split_pdfs():
|
|||||||
key = record['s3']['object']['key']
|
key = record['s3']['object']['key']
|
||||||
key = unquote(key)
|
key = unquote(key)
|
||||||
|
|
||||||
print(f'SPLITTING bucket: {bucket}, key: {key}')
|
logger.info(f'SPLITTING bucket: {bucket}, key: {key}')
|
||||||
|
|
||||||
##### get pdf #####
|
##### get pdf #####
|
||||||
with s3.get_object(bucket, key) as obj:
|
with s3.get_object(bucket, key) as obj:
|
||||||
@@ -131,7 +142,7 @@ def process_events():
|
|||||||
for row in rows:
|
for row in rows:
|
||||||
bucket = row['bucket']
|
bucket = row['bucket']
|
||||||
key = row['key']
|
key = row['key']
|
||||||
print(f'PROCESSING bucket: {bucket}, key: {key}')
|
logger.info(f'PROCESSING bucket: {bucket}, key: {key}')
|
||||||
|
|
||||||
# tier 1 limit: 4k tokens/min
|
# tier 1 limit: 4k tokens/min
|
||||||
# single pdf = 2-3k tokens
|
# single pdf = 2-3k tokens
|
||||||
@@ -151,15 +162,15 @@ def process_events():
|
|||||||
if e.type == 'overloaded_error':
|
if e.type == 'overloaded_error':
|
||||||
if attempt < max_retries - 1:
|
if attempt < max_retries - 1:
|
||||||
sleep_time = retry_delay * (2 ** attempt)
|
sleep_time = retry_delay * (2 ** attempt)
|
||||||
print(f"Overload error. Retrying in {sleep_time:.2f} seconds...")
|
logger.info(f"Overload error. Retrying in {sleep_time:.2f} seconds...")
|
||||||
time.sleep(sleep_time)
|
time.sleep(sleep_time)
|
||||||
else:
|
else:
|
||||||
print('Max retries reached.')
|
logger.info('Max retries reached.')
|
||||||
raise
|
raise
|
||||||
else:
|
else:
|
||||||
raise
|
raise
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"An unexpected error occurred: {e}")
|
logger.error(f"An unexpected error occurred: {e}")
|
||||||
BACKOFF = True
|
BACKOFF = True
|
||||||
break
|
break
|
||||||
|
|
||||||
@@ -173,6 +184,14 @@ def embed(text, input_type):
|
|||||||
)
|
)
|
||||||
return resp.embeddings.float[0]
|
return resp.embeddings.float[0]
|
||||||
|
|
||||||
|
def fix_pages():
|
||||||
|
i = 766
|
||||||
|
while i > 0:
|
||||||
|
conn.execute('UPDATE embeddings SET key = %s WHERE key = %s', (f'public/va/caves-of-virginia.pdf/page-{i}.pdf', f'public/va/caves-of-virginia.pdf/page-{i-1}.pdf'))
|
||||||
|
conn.commit()
|
||||||
|
i -= 1
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
create_tables()
|
create_tables()
|
||||||
while True:
|
while True:
|
||||||
@@ -182,6 +201,6 @@ if __name__ == '__main__':
|
|||||||
process_events()
|
process_events()
|
||||||
|
|
||||||
if BACKOFF:
|
if BACKOFF:
|
||||||
print('BACKOFF')
|
logger.info('BACKOFF')
|
||||||
time.sleep(10 * 60)
|
time.sleep(10 * 60)
|
||||||
time.sleep(5 * 60)
|
time.sleep(5 * 60)
|
||||||
|
|||||||
@@ -13,5 +13,6 @@ dependencies = [
|
|||||||
"psycopg[binary]>=3.2.9",
|
"psycopg[binary]>=3.2.9",
|
||||||
"pypdf>=5.5.0",
|
"pypdf>=5.5.0",
|
||||||
"python-dotenv>=1.1.0",
|
"python-dotenv>=1.1.0",
|
||||||
|
"python-json-logger>=3.3.0",
|
||||||
"types-psycopg2>=2.9.21.20250516",
|
"types-psycopg2>=2.9.21.20250516",
|
||||||
]
|
]
|
||||||
|
|||||||
@@ -36,7 +36,7 @@ def embed(text, input_type):
|
|||||||
return resp.embeddings.float[0]
|
return resp.embeddings.float[0]
|
||||||
|
|
||||||
def search():
|
def search():
|
||||||
query = 'caves locations in bath county'
|
query = 'tazwell county caves'
|
||||||
query_embedding = embed(query, 'search_query')
|
query_embedding = embed(query, 'search_query')
|
||||||
|
|
||||||
rows = conn.execute('SELECT * FROM embeddings ORDER BY embedding <=> %s::vector LIMIT 5', (query_embedding,)).fetchall()
|
rows = conn.execute('SELECT * FROM embeddings ORDER BY embedding <=> %s::vector LIMIT 5', (query_embedding,)).fetchall()
|
||||||
|
|||||||
11
poller/uv.lock
generated
11
poller/uv.lock
generated
@@ -552,6 +552,7 @@ dependencies = [
|
|||||||
{ name = "psycopg", extra = ["binary"] },
|
{ name = "psycopg", extra = ["binary"] },
|
||||||
{ name = "pypdf" },
|
{ name = "pypdf" },
|
||||||
{ name = "python-dotenv" },
|
{ name = "python-dotenv" },
|
||||||
|
{ name = "python-json-logger" },
|
||||||
{ name = "types-psycopg2" },
|
{ name = "types-psycopg2" },
|
||||||
]
|
]
|
||||||
|
|
||||||
@@ -565,6 +566,7 @@ requires-dist = [
|
|||||||
{ name = "psycopg", extras = ["binary"], specifier = ">=3.2.9" },
|
{ name = "psycopg", extras = ["binary"], specifier = ">=3.2.9" },
|
||||||
{ name = "pypdf", specifier = ">=5.5.0" },
|
{ name = "pypdf", specifier = ">=5.5.0" },
|
||||||
{ name = "python-dotenv", specifier = ">=1.1.0" },
|
{ name = "python-dotenv", specifier = ">=1.1.0" },
|
||||||
|
{ name = "python-json-logger", specifier = ">=3.3.0" },
|
||||||
{ name = "types-psycopg2", specifier = ">=2.9.21.20250516" },
|
{ name = "types-psycopg2", specifier = ">=2.9.21.20250516" },
|
||||||
]
|
]
|
||||||
|
|
||||||
@@ -763,6 +765,15 @@ wheels = [
|
|||||||
{ url = "https://files.pythonhosted.org/packages/1e/18/98a99ad95133c6a6e2005fe89faedf294a748bd5dc803008059409ac9b1e/python_dotenv-1.1.0-py3-none-any.whl", hash = "sha256:d7c01d9e2293916c18baf562d95698754b0dbbb5e74d457c45d4f6561fb9d55d", size = 20256, upload-time = "2025-03-25T10:14:55.034Z" },
|
{ url = "https://files.pythonhosted.org/packages/1e/18/98a99ad95133c6a6e2005fe89faedf294a748bd5dc803008059409ac9b1e/python_dotenv-1.1.0-py3-none-any.whl", hash = "sha256:d7c01d9e2293916c18baf562d95698754b0dbbb5e74d457c45d4f6561fb9d55d", size = 20256, upload-time = "2025-03-25T10:14:55.034Z" },
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "python-json-logger"
|
||||||
|
version = "3.3.0"
|
||||||
|
source = { registry = "https://pypi.org/simple" }
|
||||||
|
sdist = { url = "https://files.pythonhosted.org/packages/9e/de/d3144a0bceede957f961e975f3752760fbe390d57fbe194baf709d8f1f7b/python_json_logger-3.3.0.tar.gz", hash = "sha256:12b7e74b17775e7d565129296105bbe3910842d9d0eb083fc83a6a617aa8df84", size = 16642, upload-time = "2025-03-07T07:08:27.301Z" }
|
||||||
|
wheels = [
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/08/20/0f2523b9e50a8052bc6a8b732dfc8568abbdc42010aef03a2d750bdab3b2/python_json_logger-3.3.0-py3-none-any.whl", hash = "sha256:dd980fae8cffb24c13caf6e158d3d61c0d6d22342f932cb6e9deedab3d35eec7", size = 15163, upload-time = "2025-03-07T07:08:25.627Z" },
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "pyyaml"
|
name = "pyyaml"
|
||||||
version = "6.0.2"
|
version = "6.0.2"
|
||||||
|
|||||||
Reference in New Issue
Block a user