split pdf
This commit is contained in:
@@ -3,5 +3,6 @@
|
|||||||
https://min.io/docs/minio/linux/developers/python/API.html#presigned-get-object-bucket-name-object-name-expires-timedelta-days-7-response-headers-none-request-date-none-version-id-none-extra-query-params-none
|
https://min.io/docs/minio/linux/developers/python/API.html#presigned-get-object-bucket-name-object-name-expires-timedelta-days-7-response-headers-none-request-date-none-version-id-none-extra-query-params-none
|
||||||
|
|
||||||
## TODO
|
## TODO
|
||||||
- if pages > 100 -> chunk to cavepedia-v2-scratch -> collect content
|
- cavepedia-v2 ->
|
||||||
|
- split pdfs -> chunk and write to cavepedia-v2-pages ->
|
||||||
- cohere embedding limits TODO
|
- cohere embedding limits TODO
|
||||||
|
|||||||
@@ -1,10 +1,12 @@
|
|||||||
from pgvector.psycopg import register_vector, Bit
|
from pgvector.psycopg import register_vector, Bit
|
||||||
from psycopg.rows import dict_row
|
from psycopg.rows import dict_row
|
||||||
from urllib.parse import unquote
|
from urllib.parse import unquote
|
||||||
|
from pypdf import PdfReader, PdfWriter
|
||||||
import anthropic
|
import anthropic
|
||||||
import cohere
|
import cohere
|
||||||
import dotenv
|
import dotenv
|
||||||
import datetime
|
import datetime
|
||||||
|
import io
|
||||||
import json
|
import json
|
||||||
import minio
|
import minio
|
||||||
import numpy as np
|
import numpy as np
|
||||||
@@ -18,6 +20,12 @@ COHERE_API_KEY = os.getenv('COHERE_API_KEY')
|
|||||||
MINIO_ACCESS_KEY = os.getenv('MINIO_ACCESS_KEY')
|
MINIO_ACCESS_KEY = os.getenv('MINIO_ACCESS_KEY')
|
||||||
MINIO_SECRET_KEY = os.getenv('MINIO_SECRET_KEY')
|
MINIO_SECRET_KEY = os.getenv('MINIO_SECRET_KEY')
|
||||||
|
|
||||||
|
s3 = minio.Minio(
|
||||||
|
's3.bigcavemaps.com',
|
||||||
|
access_key=MINIO_ACCESS_KEY,
|
||||||
|
secret_key=MINIO_SECRET_KEY,
|
||||||
|
region='kansascity',
|
||||||
|
)
|
||||||
co = cohere.ClientV2(COHERE_API_KEY)
|
co = cohere.ClientV2(COHERE_API_KEY)
|
||||||
conn = psycopg.connect(
|
conn = psycopg.connect(
|
||||||
host='127.0.0.1',
|
host='127.0.0.1',
|
||||||
@@ -32,6 +40,7 @@ conn = psycopg.connect(
|
|||||||
def create_tables():
|
def create_tables():
|
||||||
commands = (
|
commands = (
|
||||||
"CREATE EXTENSION IF NOT EXISTS vector",
|
"CREATE EXTENSION IF NOT EXISTS vector",
|
||||||
|
"DROP TABLE IF EXISTS embeddings",
|
||||||
"""
|
"""
|
||||||
CREATE TABLE IF NOT EXISTS embeddings (
|
CREATE TABLE IF NOT EXISTS embeddings (
|
||||||
bucket TEXT,
|
bucket TEXT,
|
||||||
@@ -46,25 +55,53 @@ def create_tables():
|
|||||||
conn.commit()
|
conn.commit()
|
||||||
register_vector(conn)
|
register_vector(conn)
|
||||||
|
|
||||||
|
## splitting
|
||||||
|
def split_pdfs():
|
||||||
|
rows = conn.execute('SELECT * FROM events')
|
||||||
|
|
||||||
|
for row in rows:
|
||||||
|
with conn.cursor() as cur:
|
||||||
|
for record in row['event_data']['Records']:
|
||||||
|
bucket = record['s3']['bucket']['name']
|
||||||
|
key = record['s3']['object']['key']
|
||||||
|
key = unquote(key)
|
||||||
|
|
||||||
|
print(f'SPLITTING bucket: {bucket}, key: {key}')
|
||||||
|
|
||||||
|
##### get pdf #####
|
||||||
|
with s3.get_object(bucket, key) as obj:
|
||||||
|
with open('/tmp/file.pdf', 'wb') as f:
|
||||||
|
while True:
|
||||||
|
chunk = obj.read(1024)
|
||||||
|
if not chunk:
|
||||||
|
break
|
||||||
|
f.write(chunk)
|
||||||
|
|
||||||
|
##### split #####
|
||||||
|
with open('/tmp/file.pdf', 'rb') as f:
|
||||||
|
reader = PdfReader(f)
|
||||||
|
|
||||||
|
for i in range(len(reader.pages)):
|
||||||
|
writer = PdfWriter()
|
||||||
|
writer.add_page(reader.pages[i])
|
||||||
|
|
||||||
|
with io.BytesIO() as bs:
|
||||||
|
writer.write(bs)
|
||||||
|
bs.seek(0)
|
||||||
|
s3.put_object('cavepedia-v2-pages', f'{key}/page-{i}.pdf', bs, len(bs.getvalue()))
|
||||||
|
cur.execute('INSERT INTO embeddings (bucket, key) VALUES (%s, %s);', (f'{bucket}-pages', f'{key}/page-{i}.pdf'))
|
||||||
|
|
||||||
|
cur.execute('DELETE FROM events WHERE event_time = %s', (row['event_time'],))
|
||||||
|
conn.commit()
|
||||||
|
|
||||||
## processing
|
## processing
|
||||||
def get_presigned_url(bucket, key) -> str:
|
|
||||||
client = minio.Minio(
|
|
||||||
's3.bigcavemaps.com',
|
|
||||||
access_key=MINIO_ACCESS_KEY,
|
|
||||||
secret_key=MINIO_SECRET_KEY,
|
|
||||||
region='kansascity',
|
|
||||||
)
|
|
||||||
|
|
||||||
url = client.presigned_get_object(bucket, unquote(key))
|
|
||||||
return url
|
|
||||||
|
|
||||||
def ocr(bucket, key):
|
def ocr(bucket, key):
|
||||||
url = get_presigned_url(bucket, key)
|
url = s3.presigned_get_object(bucket, unquote(key))
|
||||||
|
|
||||||
client = anthropic.Anthropic()
|
client = anthropic.Anthropic()
|
||||||
message = client.messages.create(
|
message = client.messages.create(
|
||||||
model='claude-sonnet-4-20250514',
|
model='claude-sonnet-4-20250514',
|
||||||
max_tokens=1000,
|
max_tokens=4000,
|
||||||
temperature=1,
|
temperature=1,
|
||||||
messages=[
|
messages=[
|
||||||
{
|
{
|
||||||
@@ -88,7 +125,7 @@ def ocr(bucket, key):
|
|||||||
return message
|
return message
|
||||||
|
|
||||||
def process_events():
|
def process_events():
|
||||||
rows = conn.execute('SELECT * FROM events')
|
rows = conn.execute('SELECT * FROM embeddings WHERE embedding IS NULL')
|
||||||
|
|
||||||
for row in rows:
|
for row in rows:
|
||||||
for record in row['event_data']['Records']:
|
for record in row['event_data']['Records']:
|
||||||
@@ -119,4 +156,5 @@ def embed(text, input_type):
|
|||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
create_tables()
|
create_tables()
|
||||||
process_events()
|
split_pdfs()
|
||||||
|
# process_events()
|
||||||
|
|||||||
@@ -11,6 +11,7 @@ dependencies = [
|
|||||||
"mypy>=1.15.0",
|
"mypy>=1.15.0",
|
||||||
"pgvector>=0.4.1",
|
"pgvector>=0.4.1",
|
||||||
"psycopg[binary]>=3.2.9",
|
"psycopg[binary]>=3.2.9",
|
||||||
|
"pypdf>=5.5.0",
|
||||||
"python-dotenv>=1.1.0",
|
"python-dotenv>=1.1.0",
|
||||||
"types-psycopg2>=2.9.21.20250516",
|
"types-psycopg2>=2.9.21.20250516",
|
||||||
]
|
]
|
||||||
|
|||||||
11
poller/uv.lock
generated
11
poller/uv.lock
generated
@@ -550,6 +550,7 @@ dependencies = [
|
|||||||
{ name = "mypy" },
|
{ name = "mypy" },
|
||||||
{ name = "pgvector" },
|
{ name = "pgvector" },
|
||||||
{ name = "psycopg", extra = ["binary"] },
|
{ name = "psycopg", extra = ["binary"] },
|
||||||
|
{ name = "pypdf" },
|
||||||
{ name = "python-dotenv" },
|
{ name = "python-dotenv" },
|
||||||
{ name = "types-psycopg2" },
|
{ name = "types-psycopg2" },
|
||||||
]
|
]
|
||||||
@@ -562,6 +563,7 @@ requires-dist = [
|
|||||||
{ name = "mypy", specifier = ">=1.15.0" },
|
{ name = "mypy", specifier = ">=1.15.0" },
|
||||||
{ name = "pgvector", specifier = ">=0.4.1" },
|
{ name = "pgvector", specifier = ">=0.4.1" },
|
||||||
{ name = "psycopg", extras = ["binary"], specifier = ">=3.2.9" },
|
{ name = "psycopg", extras = ["binary"], specifier = ">=3.2.9" },
|
||||||
|
{ name = "pypdf", specifier = ">=5.5.0" },
|
||||||
{ name = "python-dotenv", specifier = ">=1.1.0" },
|
{ name = "python-dotenv", specifier = ">=1.1.0" },
|
||||||
{ name = "types-psycopg2", specifier = ">=2.9.21.20250516" },
|
{ name = "types-psycopg2", specifier = ">=2.9.21.20250516" },
|
||||||
]
|
]
|
||||||
@@ -743,6 +745,15 @@ wheels = [
|
|||||||
{ url = "https://files.pythonhosted.org/packages/32/56/8a7ca5d2cd2cda1d245d34b1c9a942920a718082ae8e54e5f3e5a58b7add/pydantic_core-2.33.2-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:329467cecfb529c925cf2bbd4d60d2c509bc2fb52a20c1045bf09bb70971a9c1", size = 2066757, upload-time = "2025-04-23T18:33:30.645Z" },
|
{ url = "https://files.pythonhosted.org/packages/32/56/8a7ca5d2cd2cda1d245d34b1c9a942920a718082ae8e54e5f3e5a58b7add/pydantic_core-2.33.2-pp311-pypy311_pp73-win_amd64.whl", hash = "sha256:329467cecfb529c925cf2bbd4d60d2c509bc2fb52a20c1045bf09bb70971a9c1", size = 2066757, upload-time = "2025-04-23T18:33:30.645Z" },
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "pypdf"
|
||||||
|
version = "5.5.0"
|
||||||
|
source = { registry = "https://pypi.org/simple" }
|
||||||
|
sdist = { url = "https://files.pythonhosted.org/packages/e0/c8/543f8ae1cd9e182e9f979d9ab1df18e3445350471abadbdabc0166ae5741/pypdf-5.5.0.tar.gz", hash = "sha256:8ce6a18389f7394fd09a1d4b7a34b097b11c19088a23cfd09e5008f85893e254", size = 5021690, upload-time = "2025-05-11T14:00:42.043Z" }
|
||||||
|
wheels = [
|
||||||
|
{ url = "https://files.pythonhosted.org/packages/a1/4e/931b90b51e3ebc69699be926b3d5bfdabae2d9c84337fd0c9fb98adbf70c/pypdf-5.5.0-py3-none-any.whl", hash = "sha256:2f61f2d32dde00471cd70b8977f98960c64e84dd5ba0d070e953fcb4da0b2a73", size = 303371, upload-time = "2025-05-11T14:00:40.064Z" },
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "python-dotenv"
|
name = "python-dotenv"
|
||||||
version = "1.1.0"
|
version = "1.1.0"
|
||||||
|
|||||||
Reference in New Issue
Block a user