split pdf

This commit is contained in:
2025-05-26 08:54:13 -04:00
parent e0c6eef76d
commit aeae900cae
4 changed files with 67 additions and 16 deletions

View File

@@ -3,5 +3,6 @@
https://min.io/docs/minio/linux/developers/python/API.html#presigned-get-object-bucket-name-object-name-expires-timedelta-days-7-response-headers-none-request-date-none-version-id-none-extra-query-params-none
## TODO
- if pages > 100 -> chunk to cavepedia-v2-scratch -> collect content
- cavepedia-v2 ->
- split pdfs -> chunk and write to cavepedia-v2-pages ->
- TODO: handle Cohere embedding API limits (max texts per batch / rate limits)

View File

@@ -1,10 +1,12 @@
from pgvector.psycopg import register_vector, Bit
from psycopg.rows import dict_row
from urllib.parse import unquote
from pypdf import PdfReader, PdfWriter
import anthropic
import cohere
import dotenv
import datetime
import io
import json
import minio
import numpy as np
@@ -18,6 +20,12 @@ COHERE_API_KEY = os.getenv('COHERE_API_KEY')
MINIO_ACCESS_KEY = os.getenv('MINIO_ACCESS_KEY')
MINIO_SECRET_KEY = os.getenv('MINIO_SECRET_KEY')
s3 = minio.Minio(
's3.bigcavemaps.com',
access_key=MINIO_ACCESS_KEY,
secret_key=MINIO_SECRET_KEY,
region='kansascity',
)
co = cohere.ClientV2(COHERE_API_KEY)
conn = psycopg.connect(
host='127.0.0.1',
@@ -32,6 +40,7 @@ conn = psycopg.connect(
def create_tables():
commands = (
"CREATE EXTENSION IF NOT EXISTS vector",
"DROP TABLE IF EXISTS embeddings",
"""
CREATE TABLE IF NOT EXISTS embeddings (
bucket TEXT,
@@ -46,25 +55,53 @@ def create_tables():
conn.commit()
register_vector(conn)
## splitting
def split_pdfs():
    """Split each PDF referenced by a queued S3 event into single-page PDFs.

    For every row in the ``events`` table: download the source PDF from
    MinIO, write one single-page PDF per page to the pages bucket, record
    each page in the ``embeddings`` table, then delete the processed
    event. All database work is committed once at the end.
    """
    rows = conn.execute('SELECT * FROM events')
    for row in rows:
        with conn.cursor() as cur:
            for record in row['event_data']['Records']:
                bucket = record['s3']['bucket']['name']
                # S3 event notifications URL-encode object keys.
                key = unquote(record['s3']['object']['key'])
                print(f'SPLITTING bucket: {bucket}, key: {key}')
                ##### get pdf #####
                # Buffer the object in memory rather than a fixed /tmp path:
                # avoids leftover files and collisions between concurrent runs.
                pdf_buf = io.BytesIO()
                with s3.get_object(bucket, key) as obj:
                    while chunk := obj.read(64 * 1024):
                        pdf_buf.write(chunk)
                pdf_buf.seek(0)
                ##### split #####
                reader = PdfReader(pdf_buf)
                for i, page in enumerate(reader.pages):
                    writer = PdfWriter()
                    writer.add_page(page)
                    with io.BytesIO() as bs:
                        writer.write(bs)
                        size = bs.tell()
                        bs.seek(0)
                        # NOTE(review): the object is uploaded to the literal
                        # 'cavepedia-v2-pages' bucket but the DB row records
                        # f'{bucket}-pages' -- confirm these always match.
                        s3.put_object('cavepedia-v2-pages', f'{key}/page-{i}.pdf', bs, size)
                        cur.execute('INSERT INTO embeddings (bucket, key) VALUES (%s, %s);', (f'{bucket}-pages', f'{key}/page-{i}.pdf'))
            # Event fully processed; remove it from the queue table.
            cur.execute('DELETE FROM events WHERE event_time = %s', (row['event_time'],))
    conn.commit()
## processing
def get_presigned_url(bucket, key) -> str:
    """Return a time-limited presigned GET URL for `key` in `bucket`.

    Reuses the module-level MinIO client `s3` instead of constructing a
    duplicate client (same endpoint, credentials, and region) on every
    call, matching how the rest of the module talks to MinIO.
    """
    # Keys arriving from S3 event records are URL-encoded; decode before use.
    return s3.presigned_get_object(bucket, unquote(key))
def ocr(bucket, key):
url = get_presigned_url(bucket, key)
url = s3.presigned_get_object(bucket, unquote(key))
client = anthropic.Anthropic()
message = client.messages.create(
model='claude-sonnet-4-20250514',
max_tokens=1000,
max_tokens=4000,
temperature=1,
messages=[
{
@@ -88,7 +125,7 @@ def ocr(bucket, key):
return message
def process_events():
rows = conn.execute('SELECT * FROM events')
rows = conn.execute('SELECT * FROM embeddings WHERE embedding IS NULL')
for row in rows:
for record in row['event_data']['Records']:
@@ -119,4 +156,5 @@ def embed(text, input_type):
if __name__ == '__main__':
create_tables()
process_events()
split_pdfs()
# process_events()

View File

@@ -11,6 +11,7 @@ dependencies = [
"mypy>=1.15.0",
"pgvector>=0.4.1",
"psycopg[binary]>=3.2.9",
"pypdf>=5.5.0",
"python-dotenv>=1.1.0",
"types-psycopg2>=2.9.21.20250516",
]

11
poller/uv.lock generated
View File

@@ -550,6 +550,7 @@ dependencies = [
{ name = "mypy" },
{ name = "pgvector" },
{ name = "psycopg", extra = ["binary"] },
{ name = "pypdf" },
{ name = "python-dotenv" },
{ name = "types-psycopg2" },
]
@@ -562,6 +563,7 @@ requires-dist = [
{ name = "mypy", specifier = ">=1.15.0" },
{ name = "pgvector", specifier = ">=0.4.1" },
{ name = "psycopg", extras = ["binary"], specifier = ">=3.2.9" },
{ name = "pypdf", specifier = ">=5.5.0" },
{ name = "python-dotenv", specifier = ">=1.1.0" },
{ name = "types-psycopg2", specifier = ">=2.9.21.20250516" },
]
@@ -743,6 +745,15 @@ wheels = [
{ url = "https://files.pythonhosted.org/packages/32/56/8a7ca5d2cd2cda1d245d34b1c9a942920a718082ae8e54e5f3e5a58b7add/pydantic_core-2.33.2-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:329467cecfb529c925cf2bbd4d60d2c509bc2fb52a20c1045bf09bb70971a9c1", size = 2066757, upload-time = "2025-04-23T18:33:30.645Z" },
]
[[package]]
name = "pypdf"
version = "5.5.0"
source = { registry = "https://pypi.org/simple" }
sdist = { url = "https://files.pythonhosted.org/packages/e0/c8/543f8ae1cd9e182e9f979d9ab1df18e3445350471abadbdabc0166ae5741/pypdf-5.5.0.tar.gz", hash = "sha256:8ce6a18389f7394fd09a1d4b7a34b097b11c19088a23cfd09e5008f85893e254", size = 5021690, upload-time = "2025-05-11T14:00:42.043Z" }
wheels = [
{ url = "https://files.pythonhosted.org/packages/a1/4e/931b90b51e3ebc69699be926b3d5bfdabae2d9c84337fd0c9fb98adbf70c/pypdf-5.5.0-py3-none-any.whl", hash = "sha256:2f61f2d32dde00471cd70b8977f98960c64e84dd5ba0d070e953fcb4da0b2a73", size = 303371, upload-time = "2025-05-11T14:00:40.064Z" },
]
[[package]]
name = "python-dotenv"
version = "1.1.0"