2025-05-28 08:51:25 -04:00
parent af918aab19
commit 30f68a9d04
4 changed files with 83 additions and 42 deletions


@@ -7,9 +7,7 @@ up () {
     --detach \
     --name cp2-pg \
     --restart unless-stopped \
-    --env POSTGRES_DB=cavepediav2_db \
-    --env POSTGRES_PASSWORD=cavepediav2_pw \
-    --env POSTGRES_USER=cavepediav2_user \
+    --env-file $HOME/scripts-private/lech/cavepedia-v2/cp2-pg.env \
    --volume /mammoth/cp2/cp2-pg/data:/var/lib/postgresql/data:rw \
    --publish 127.0.0.1:4010:5432 \
    --network pew-net \
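The contents of cp2-pg.env are not part of this commit; presumably it carries the same three settings that were dropped from the command line, along the lines of:

    POSTGRES_DB=cavepediav2_db
    POSTGRES_USER=cavepediav2_user
    POSTGRES_PASSWORD=cavepediav2_pw

Switching to --env-file keeps the credentials out of the script itself, though they remain visible to anyone who can run docker inspect on the container.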

poller/getobject.py Normal file

@@ -0,0 +1,53 @@
from pgvector.psycopg import register_vector, Bit
from psycopg.rows import dict_row
from urllib.parse import unquote
from pypdf import PdfReader, PdfWriter
import anthropic
import cohere
import dotenv
import datetime
import io
import json
import minio
import numpy as np
import os
import psycopg
import time

import logging
from pythonjsonlogger.json import JsonFormatter

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
logHandler = logging.StreamHandler()
formatter = JsonFormatter("{asctime}{message}", style="{")
logHandler.setFormatter(formatter)
logger.addHandler(logHandler)

#####

dotenv.load_dotenv('/home/paul/scripts-private/lech/cavepedia-v2/poller.env')

COHERE_API_KEY = os.getenv('COHERE_API_KEY')
MINIO_ACCESS_KEY = os.getenv('MINIO_ACCESS_KEY')
MINIO_SECRET_KEY = os.getenv('MINIO_SECRET_KEY')

s3 = minio.Minio(
    's3.bigcavemaps.com',
    access_key=MINIO_ACCESS_KEY,
    secret_key=MINIO_SECRET_KEY,
    region='kansascity',
)

def getobject():
    bucket = 'cavepedia-v2'
    key = 'public/var/fyi/VAR-FYI 1982-01.pdf'

    with s3.get_object(bucket, key) as obj:
        with open('/tmp/file.pdf', 'wb') as f:
            while True:
                chunk = obj.read(1024)
                if not chunk:
                    break
                f.write(chunk)

if __name__ == '__main__':
    getobject()
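As a side note, if the only goal is to land the object in a local file, minio-py's fget_object should do the same download in one call. A minimal sketch using the bucket, key, and path hard-coded above:

    def getobject():
        bucket = 'cavepedia-v2'
        key = 'public/var/fyi/VAR-FYI 1982-01.pdf'
        # stream the object straight to a local path instead of chunking by hand
        s3.fget_object(bucket, key, '/tmp/file.pdf')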


@@ -47,9 +47,8 @@ conn = psycopg.connect(
     row_factory=dict_row,
 )
 
-BACKOFF = False
-
 ## init
+# events table is created by minio upon creation of the event destination
 def create_tables():
     commands = (
         "CREATE EXTENSION IF NOT EXISTS vector",
@@ -73,10 +72,11 @@ def split_pdfs():
     for row in rows:
         with conn.cursor() as cur:
-            for record in row['event_data']['Records']:
+            for record in row['value']['Records']:
                 bucket = record['s3']['bucket']['name']
                 key = record['s3']['object']['key']
                 key = unquote(key)
+                key = key.replace('+',' ')
 
                 logger.info(f'SPLITTING bucket: {bucket}, key: {key}')
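The event_data to value rename here (and the event_time to key rename below) looks like the MinIO Postgres event destination was switched from its "access" layout to its "namespace" layout, which stores one row per object under key/value columns; that is an inference from the column names, not something shown in the diff. Under that assumption a row would look roughly like the sketch below, which is also why the object key needs unquote() plus the '+' to space replacement, since keys in S3 event records arrive URL-encoded:

    # hypothetical shape of one row from the MinIO-managed events table
    row = {
        'key': 'cavepedia-v2/public/var/fyi/VAR-FYI+1982-01.pdf',
        'value': {'Records': [{'s3': {
            'bucket': {'name': 'cavepedia-v2'},
            'object': {'key': 'public/var/fyi/VAR-FYI+1982-01.pdf'},
        }}]},
    }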
@@ -100,10 +100,10 @@ def split_pdfs():
                 with io.BytesIO() as bs:
                     writer.write(bs)
                     bs.seek(0)
-                    s3.put_object('cavepedia-v2-pages', f'{key}/page-{i}.pdf', bs, len(bs.getvalue()))
-                    cur.execute('INSERT INTO embeddings (bucket, key) VALUES (%s, %s);', (f'{bucket}-pages', f'{key}/page-{i}.pdf'))
+                    s3.put_object(f'{bucket}-pages', f'{key}/page-{i + 1}.pdf', bs, len(bs.getvalue()))
+                    cur.execute('INSERT INTO embeddings (bucket, key) VALUES (%s, %s);', (f'{bucket}-pages', f'{key}/page-{i + 1}.pdf'))
 
-            cur.execute('DELETE FROM events WHERE event_time = %s', (row['event_time'],))
+            cur.execute('DELETE FROM events WHERE key = %s', (row['key'],))
             conn.commit()
 
 ## processing
@@ -137,42 +137,31 @@ def ocr(bucket, key):
     return message
 
 def process_events():
-    rows = conn.execute('SELECT * FROM embeddings WHERE embedding IS NULL')
+    rows = conn.execute("SELECT COUNT(*) FROM embeddings WHERE embedding IS NULL")
+    row = rows.fetchone()
+    logger.info(f'Found {row["count"]} ready to be processed')
+
+    rows = conn.execute("SELECT * FROM embeddings WHERE embedding IS NULL")
     for row in rows:
         bucket = row['bucket']
         key = row['key']
 
         logger.info(f'PROCESSING bucket: {bucket}, key: {key}')
 
-        # tier 1 limit: 4k tokens/min
-        # single pdf = 2-3k tokens
-        max_retries = 5
-        retry_delay = 30
-        for attempt in range(max_retries):
-            try:
-                ai_ocr = ocr(bucket, key)
-                text = ai_ocr.content[0].text
-                text = text.replace('\n',' ')
+        ## claude 4 sonnet ##
+        # tier 1 limit: 8k tokens/min
+        # tier 2: enough
+        # single pdf page: up to 2k tokens
+        try:
+            ai_ocr = ocr(bucket, key)
+            text = ai_ocr.content[0].text
             embedding=embed(text, 'search_document')
             conn.execute('UPDATE embeddings SET content = %s, embedding = %s::vector WHERE bucket = %s AND key = %s;', (text, embedding, bucket, key))
             conn.commit()
-                break
-            except anthropic.APIStatusError as e:
-                if e.type == 'overloaded_error':
-                    if attempt < max_retries - 1:
-                        sleep_time = retry_delay * (2 ** attempt)
-                        logger.info(f"Overload error. Retrying in {sleep_time:.2f} seconds...")
-                        time.sleep(sleep_time)
-                    else:
-                        logger.info('Max retries reached.')
-                        raise
-                else:
-                    raise
-            except Exception as e:
-                logger.error(f"An unexpected error occurred: {e}")
-                BACKOFF = True
-                break
+        except Exception as e:
+            logger.error(f"An unexpected error occurred: {e}")
+            return True
 
 ### embeddings
 def embed(text, input_type):
@@ -198,9 +187,10 @@ if __name__ == '__main__':
         BACKOFF = False
         split_pdfs()
-        process_events()
+        BACKOFF = process_events()
         if BACKOFF:
-            logger.info('BACKOFF')
-            time.sleep(10 * 60)
+            logger.info('backoff detected, sleeping an extra 5 minutes')
+            time.sleep(5 * 60)
+
+        logger.info('sleeping 5 minutes')
         time.sleep(5 * 60)
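Taking the new comments at face value (8k tokens/min on tier 1, up to roughly 2k tokens per page), the poller can only push about four OCR calls per minute before hitting the rate limit. The inline retry loop is gone: any API error now simply aborts the pass, process_events returns True, and the main loop sleeps an extra five minutes before trying again.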


@@ -36,7 +36,7 @@ def embed(text, input_type):
     return resp.embeddings.float[0]
 
 def search():
-    query = 'tazwell county caves'
+    query = 'links trip with not more than 2 people'
     query_embedding = embed(query, 'search_query')
 
     rows = conn.execute('SELECT * FROM embeddings ORDER BY embedding <=> %s::vector LIMIT 5', (query_embedding,)).fetchall()
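For context, <=> is pgvector's cosine-distance operator, so this query returns the five page records whose embeddings sit closest to the query embedding. If the distance itself is wanted alongside the hits, a variant like the following (a sketch, not part of the commit) would surface it:

    rows = conn.execute(
        'SELECT bucket, key, embedding <=> %s::vector AS distance '
        'FROM embeddings ORDER BY distance LIMIT 5',
        (query_embedding,),
    ).fetchall()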