2025-05-26 09:43:39 -04:00
parent aeae900cae
commit 1b8213a92e
3 changed files with 45 additions and 20 deletions


@@ -3,6 +3,4 @@
 https://min.io/docs/minio/linux/developers/python/API.html#presigned-get-object-bucket-name-object-name-expires-timedelta-days-7-response-headers-none-request-date-none-version-id-none-extra-query-params-none
 ## TODO
-- cavepedia-v2
-- split pdfs -> chunk and write to cavepedia-v2-pages
-- cohere embedding limits
+- claude exponential backoff
+TODO
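
The dropped "split pdfs" item corresponds to the split_pdfs() call in the worker's main loop below, whose body is not part of this diff. A minimal sketch of what that step could look like, assuming pypdf and the MinIO Python client; the endpoint, credentials, and function name are illustrative, and cavepedia-v2-pages is the target bucket named in the TODO:

# hypothetical sketch, not the repo's actual split_pdfs();
# assumes pypdf and minio, with illustrative endpoint/credentials
import io

from minio import Minio
from pypdf import PdfReader, PdfWriter

client = Minio('localhost:9000', access_key='minioadmin', secret_key='minioadmin', secure=False)

def split_pdf(bucket, key, out_bucket='cavepedia-v2-pages'):
    # pull the source pdf and write one object per page to the pages bucket
    data = client.get_object(bucket, key).read()
    reader = PdfReader(io.BytesIO(data))
    for i, page in enumerate(reader.pages):
        writer = PdfWriter()
        writer.add_page(page)
        buf = io.BytesIO()
        writer.write(buf)
        client.put_object(out_bucket, f'{key}.page-{i + 1}.pdf', io.BytesIO(buf.getvalue()), len(buf.getvalue()))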


@@ -36,11 +36,12 @@ conn = psycopg.connect(
     row_factory=dict_row,
 )
+BACKOFF = False
 
 ## init
 def create_tables():
     commands = (
         "CREATE EXTENSION IF NOT EXISTS vector",
+        "DROP TABLE IF EXISTS embeddings",
         """
         CREATE TABLE IF NOT EXISTS embeddings (
             bucket TEXT,
@@ -128,21 +129,39 @@ def process_events():
+    global BACKOFF
     rows = conn.execute('SELECT * FROM embeddings WHERE embedding IS NULL')
     for row in rows:
-        for record in row['event_data']['Records']:
-            bucket = record['s3']['bucket']['name']
-            key = record['s3']['object']['key']
-            print(f'PROCESSING event_time: {row["event_time"]}, bucket: {bucket}, key: {key}')
-            print()
-            ai_ocr = ocr(bucket, key)
-            text = ai_ocr.content[0].text
-            text = text.replace('\n', ' ')
-            embedding = embed(text, 'search_document')
-            with conn.cursor() as cur:
-                cur.execute('INSERT INTO embeddings (bucket, key, embedding) VALUES (%s, %s, %s::vector);', (bucket, key, embedding))
-                cur.execute('DELETE FROM events WHERE event_time = %s', (row['event_time'],))
-                conn.commit()
+        bucket = row['bucket']
+        key = row['key']
+        print(f'PROCESSING bucket: {bucket}, key: {key}')
+
+        # tier 1 limit: 4k tokens/min
+        # single pdf = 2-3k tokens
+        max_retries = 5
+        retry_delay = 30
+        for attempt in range(max_retries):
+            try:
+                ai_ocr = ocr(bucket, key)
+                text = ai_ocr.content[0].text
+                text = text.replace('\n', ' ')
+                embedding = embed(text, 'search_document')
+                conn.execute('UPDATE embeddings SET content = %s, embedding = %s::vector WHERE bucket = %s AND key = %s;', (text, embedding, bucket, key))
+                conn.commit()
+                break
+            except anthropic.APIStatusError as e:
+                if e.status_code == 529:  # overloaded_error
+                    if attempt < max_retries - 1:
+                        sleep_time = retry_delay * (2 ** attempt)
+                        print(f'Overload error. Retrying in {sleep_time:.2f} seconds...')
+                        time.sleep(sleep_time)
+                    else:
+                        print('Max retries reached.')
+                        raise
+                else:
+                    raise
+            except Exception as e:
+                print(f'An unexpected error occurred: {e}')
+                BACKOFF = True
+                break
 
 ### embeddings
 def embed(text, input_type):
@@ -156,5 +175,13 @@ def embed(text, input_type):
 if __name__ == '__main__':
     create_tables()
-    split_pdfs()
-    # process_events()
+    while True:
+        BACKOFF = False
+        split_pdfs()
+        process_events()
+        if BACKOFF:
+            print('BACKOFF')
+            time.sleep(10 * 60)
+        time.sleep(5 * 60)
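
The retry loop above sleeps retry_delay * (2 ** attempt) seconds after each overloaded response and re-raises on the final attempt instead of sleeping. A quick sketch of the resulting schedule; the helper name is illustrative:

# sleeps occur after attempts 0..3; attempt 4 re-raises instead of sleeping
def backoff_sleeps(retry_delay=30, max_retries=5):
    return [retry_delay * (2 ** attempt) for attempt in range(max_retries - 1)]

print(backoff_sleeps())  # [30, 60, 120, 240] -> up to 7.5 minutes of waiting per document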


@@ -36,7 +36,7 @@ def embed(text, input_type):
     return resp.embeddings.float[0]
 
 def search():
-    query = 'sex'
+    query = 'caves locations in bath county'
     query_embedding = embed(query, 'search_query')
     rows = conn.execute('SELECT * FROM embeddings ORDER BY embedding <=> %s::vector LIMIT 5', (query_embedding,)).fetchall()
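
<=> is pgvector's cosine-distance operator, so this query returns the five nearest documents first. A small sketch of also surfacing the distance itself, assuming this file's conn uses dict_row and its embed() is in scope; the column alias is illustrative:

# assumes the embeddings table, a dict_row connection, and embed() from this script
query_embedding = embed('caves locations in bath county', 'search_query')
rows = conn.execute(
    'SELECT bucket, key, embedding <=> %s::vector AS distance '
    'FROM embeddings ORDER BY distance LIMIT 5',
    (query_embedding,),
).fetchall()
for row in rows:
    print(row['bucket'], row['key'], round(row['distance'], 4))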