backps
@@ -7,9 +7,7 @@ up () {
     --detach \
     --name cp2-pg \
     --restart unless-stopped \
-    --env POSTGRES_DB=cavepediav2_db \
-    --env POSTGRES_PASSWORD=cavepediav2_pw \
-    --env POSTGRES_USER=cavepediav2_user \
+    --env-file $HOME/scripts-private/lech/cavepedia-v2/cp2-pg.env \
     --volume /mammoth/cp2/cp2-pg/data:/var/lib/postgresql/data:rw \
     --publish 127.0.0.1:4010:5432 \
     --network pew-net \
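The three POSTGRES_* settings move out of the run script and into cp2-pg.env, loaded with --env-file, so the credentials no longer live in the script itself. Judging by the removed flags, the env file presumably carries the same three assignments, one KEY=value per line (docker's --env-file format takes no quoting and no export keyword):

    POSTGRES_DB=cavepediav2_db
    POSTGRES_USER=cavepediav2_user
    POSTGRES_PASSWORD=cavepediav2_pw

The values shown are simply the ones that were hard-coded before; presumably the password gets rotated now that it can change without editing the script.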
poller/getobject.py (new file, 53 lines)
@@ -0,0 +1,53 @@
+from pgvector.psycopg import register_vector, Bit
+from psycopg.rows import dict_row
+from urllib.parse import unquote
+from pypdf import PdfReader, PdfWriter
+import anthropic
+import cohere
+import dotenv
+import datetime
+import io
+import json
+import minio
+import numpy as np
+import os
+import psycopg
+import time
+import logging
+from pythonjsonlogger.json import JsonFormatter
+
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
+logHandler = logging.StreamHandler()
+formatter = JsonFormatter("{asctime}{message}", style="{")
+logHandler.setFormatter(formatter)
+logger.addHandler(logHandler)
+
+#####
+
+dotenv.load_dotenv('/home/paul/scripts-private/lech/cavepedia-v2/poller.env')
+
+COHERE_API_KEY = os.getenv('COHERE_API_KEY')
+MINIO_ACCESS_KEY = os.getenv('MINIO_ACCESS_KEY')
+MINIO_SECRET_KEY = os.getenv('MINIO_SECRET_KEY')
+
+s3 = minio.Minio(
+    's3.bigcavemaps.com',
+    access_key=MINIO_ACCESS_KEY,
+    secret_key=MINIO_SECRET_KEY,
+    region='kansascity',
+)
+
+def getobject():
+    bucket = 'cavepedia-v2'
+    key = 'public/var/fyi/VAR-FYI 1982-01.pdf'
+    with s3.get_object(bucket, key) as obj:
+        with open('/tmp/file.pdf', 'wb') as f:
+            while True:
+                chunk = obj.read(1024)
+                if not chunk:
+                    break
+                f.write(chunk)
+
+if __name__ == '__main__':
+    getobject()
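poller/getobject.py is a standalone smoke test: it streams one known PDF out of MinIO in 1 KiB chunks and writes it to /tmp/file.pdf (most of the imports are shared with the poller and unused here). The MinIO SDK can do the same in a single call; a minimal equivalent sketch reusing the s3 client defined above:

    # fget_object downloads the object straight to a local path and
    # handles response cleanup itself, so no manual read loop is needed.
    s3.fget_object('cavepedia-v2', 'public/var/fyi/VAR-FYI 1982-01.pdf', '/tmp/file.pdf')

The explicit read loop is still the right shape when the bytes should stay in memory, which is what the poller does when it keeps PDF pages in io.BytesIO buffers instead of files on disk.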
@@ -47,9 +47,8 @@ conn = psycopg.connect(
     row_factory=dict_row,
 )
 
-BACKOFF = False
-
 ## init
+# events table is created by minio upon creation of the event destination
 def create_tables():
     commands = (
         "CREATE EXTENSION IF NOT EXISTS vector",
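BACKOFF stops being a module-level flag here; process_events() now reports backoff through its return value (see the process_events() hunk below). The embeddings table that create_tables() manages sits mostly outside this hunk; a hypothetical sketch of the rest of the commands tuple, with columns inferred from the INSERT/UPDATE/SELECT statements elsewhere in the poller (the types and the vector dimension are assumptions, not taken from the source):

    commands = (
        "CREATE EXTENSION IF NOT EXISTS vector",
        # hypothetical: columns inferred from usage; 1024 matches Cohere's
        # embed v3 models, but the real dimension may differ
        """CREATE TABLE IF NOT EXISTS embeddings (
            bucket TEXT,
            key TEXT,
            content TEXT,
            embedding vector(1024)
        )""",
    )

The events table needs no DDL of its own: as the new comment notes, MinIO creates it when the PostgreSQL event destination is set up.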
@@ -73,10 +72,11 @@ def split_pdfs():
 
     for row in rows:
         with conn.cursor() as cur:
-            for record in row['event_data']['Records']:
+            for record in row['value']['Records']:
                 bucket = record['s3']['bucket']['name']
                 key = record['s3']['object']['key']
                 key = unquote(key)
+                key = key.replace('+',' ')
 
                 logger.info(f'SPLITTING bucket: {bucket}, key: {key}')
 
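Two things happen to event handling. First, row['event_data'] becomes row['value'] (and, in a later hunk, DELETE ... WHERE event_time becomes WHERE key), which matches the column layout of MinIO's namespace-format PostgreSQL event destination (key/value columns) rather than the access format (event_time/event_data); presumably the destination config changed accordingly. Second, object keys arrive URL-encoded the way S3 encodes them, with '+' standing in for spaces, and unquote() alone leaves those '+' characters in place; the added replace() finishes the decoding. A quick illustration with a hypothetical key:

    from urllib.parse import unquote, unquote_plus

    raw = 'public/var/fyi/VAR-FYI+1982-01.pdf'   # as it appears in the event record
    key = unquote(raw).replace('+', ' ')         # -> 'public/var/fyi/VAR-FYI 1982-01.pdf'
    assert key == unquote_plus(raw)              # one-call equivalent

unquote_plus() would do both steps at once and, because it converts '+' before percent-decoding, it also preserves a literal '+' that was encoded as %2B; the two-step version here would turn that into a space.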
@@ -100,10 +100,10 @@ def split_pdfs():
                     with io.BytesIO() as bs:
                         writer.write(bs)
                         bs.seek(0)
-                        s3.put_object('cavepedia-v2-pages', f'{key}/page-{i}.pdf', bs, len(bs.getvalue()))
-                        cur.execute('INSERT INTO embeddings (bucket, key) VALUES (%s, %s);', (f'{bucket}-pages', f'{key}/page-{i}.pdf'))
+                        s3.put_object(f'{bucket}-pages', f'{key}/page-{i + 1}.pdf', bs, len(bs.getvalue()))
+                        cur.execute('INSERT INTO embeddings (bucket, key) VALUES (%s, %s);', (f'{bucket}-pages', f'{key}/page-{i + 1}.pdf'))
 
-            cur.execute('DELETE FROM events WHERE event_time = %s', (row['event_time'],))
+            cur.execute('DELETE FROM events WHERE key = %s', (row['key'],))
             conn.commit()
 
 ## processing
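Two fixes here: pages now upload to a bucket derived from the source (f'{bucket}-pages') instead of the hard-coded 'cavepedia-v2-pages', so the object written and the row inserted can no longer point at different buckets, and the loop index is shifted so object names are 1-based like human page numbers. A hypothetical reconstruction of the surrounding per-page loop (the reader/writer setup sits above this hunk):

    reader = PdfReader(io.BytesIO(pdf_bytes))   # pdf_bytes: the source PDF, assumed fetched earlier
    for i, page in enumerate(reader.pages):     # i is 0-based
        writer = PdfWriter()
        writer.add_page(page)
        with io.BytesIO() as bs:
            writer.write(bs)
            bs.seek(0)
            # page-{i + 1}: the first page uploads as page-1.pdf, not page-0.pdf
            s3.put_object(f'{bucket}-pages', f'{key}/page-{i + 1}.pdf', bs, len(bs.getvalue()))

The DELETE switch from event_time to key assumes each object key appears at most once in the events table, which holds for a namespace-format destination where MinIO keeps one row per object.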
@@ -137,42 +137,31 @@ def ocr(bucket, key):
     return message
 
 def process_events():
-    rows = conn.execute('SELECT * FROM embeddings WHERE embedding IS NULL')
+    rows = conn.execute("SELECT COUNT(*) FROM embeddings WHERE embedding IS NULL")
+    row = rows.fetchone()
+    logger.info(f'Found {row["count"]} ready to be processed')
+
+    rows = conn.execute("SELECT * FROM embeddings WHERE embedding IS NULL")
 
     for row in rows:
         bucket = row['bucket']
         key = row['key']
         logger.info(f'PROCESSING bucket: {bucket}, key: {key}')
 
-        # tier 1 limit: 4k tokens/min
-        # single pdf = 2-3k tokens
-        max_retries = 5
-        retry_delay = 30
-        for attempt in range(max_retries):
+        ## claude 4 sonnet ##
+        # tier 1 limit: 8k tokens/min
+        # tier 2: enough
+        # single pdf page: up to 2k tokens
         try:
             ai_ocr = ocr(bucket, key)
             text = ai_ocr.content[0].text
-            text = text.replace('\n',' ')
 
             embedding=embed(text, 'search_document')
             conn.execute('UPDATE embeddings SET content = %s, embedding = %s::vector WHERE bucket = %s AND key = %s;', (text, embedding, bucket, key))
             conn.commit()
-            break
-        except anthropic.APIStatusError as e:
-            if e.type == 'overloaded_error':
-                if attempt < max_retries - 1:
-                    sleep_time = retry_delay * (2 ** attempt)
-                    logger.info(f"Overload error. Retrying in {sleep_time:.2f} seconds...")
-                    time.sleep(sleep_time)
-                else:
-                    logger.info('Max retries reached.')
-                    raise
-            else:
-                raise
         except Exception as e:
             logger.error(f"An unexpected error occurred: {e}")
-            BACKOFF = True
-            break
+            return True
 
 ### embeddings
 def embed(text, input_type):
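The Anthropic-specific retry ladder (up to five attempts with exponential backoff on overloaded_error) is removed rather than replaced: any failure now logs, aborts the pass, and signals backoff to the caller through the return value. Nothing is lost, because a failed row's embedding stays NULL and is picked up again on the next pass; note also that falling off the end of process_events() returns None, which is falsy, so the caller's `if BACKOFF:` check still behaves. A minimal runnable sketch of the new shape, with hypothetical stand-ins for the real handlers:

    def process_batch(items):
        # first failure aborts the pass; unfinished items stay queued
        for item in items:
            try:
                handle(item)        # stand-in for ocr() + embed() + UPDATE
            except Exception as exc:
                print(f'error: {exc}')
                return True         # caller sleeps longer, then retries
        # implicit None on success -> falsy -> no backoff

    def handle(item):               # hypothetical failing worker
        if item == 'bad':
            raise RuntimeError('simulated API failure')

    print(process_batch(['ok', 'bad', 'never reached']))   # -> True

One behavioural change rides along: the dropped text.replace('\n',' ') means OCR output now keeps its newlines in the stored content.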
@@ -198,9 +187,10 @@ if __name__ == '__main__':
         BACKOFF = False
 
         split_pdfs()
-        process_events()
+        BACKOFF = process_events()
 
         if BACKOFF:
-            logger.info('BACKOFF')
-            time.sleep(10 * 60)
+            logger.info('backoff detected, sleeping an extra 5 minutes')
+            time.sleep(5 * 60)
+        logger.info('sleeping 5 minutes')
         time.sleep(5 * 60)
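Combining this with the process_events() change, the main loop now reads as below (the while True: wrapper and anything outside the hunk are assumed):

    while True:
        BACKOFF = False

        split_pdfs()
        BACKOFF = process_events()

        if BACKOFF:
            logger.info('backoff detected, sleeping an extra 5 minutes')
            time.sleep(5 * 60)
        logger.info('sleeping 5 minutes')
        time.sleep(5 * 60)

A failed pass now waits ten minutes total (5 extra + 5 regular) before the next attempt instead of 10 + 5, and the log line finally says why it is sleeping. The initial BACKOFF = False has become redundant, since process_events() always overwrites it, but it is harmless.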
@@ -36,7 +36,7 @@ def embed(text, input_type):
     return resp.embeddings.float[0]
 
 def search():
-    query = 'tazwell county caves'
+    query = 'links trip with not more than 2 people'
    query_embedding = embed(query, 'search_query')
 
     rows = conn.execute('SELECT * FROM embeddings ORDER BY embedding <=> %s::vector LIMIT 5', (query_embedding,)).fetchall()
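For context: <=> is pgvector's cosine-distance operator, so ordering ascending by it returns the five nearest page embeddings first, and the new query string is just a different test prompt. Note the 'search_query' input_type here versus the poller's 'search_document': Cohere's embedding models are asymmetric and expect queries and documents to be embedded differently. A minimal sketch of how search() presumably consumes the rows (the printed columns come from the embeddings table used by the poller; the loop itself is illustrative):

    def search():
        query = 'links trip with not more than 2 people'
        query_embedding = embed(query, 'search_query')

        rows = conn.execute('SELECT * FROM embeddings ORDER BY embedding <=> %s::vector LIMIT 5', (query_embedding,)).fetchall()
        for row in rows:
            print(row['bucket'], row['key'])   # dict_row makes columns addressable by name

    search()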