Skip to content

Commit ea8e346

Browse files
authored
Merge branch 'pgvector:master' into rtm-pgvector-python
2 parents bd70e8a + ca637bf commit ea8e346

3 files changed

Lines changed: 61 additions & 0 deletions

File tree

README.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ Or check out some examples:
3333
- [Hybrid search](https://github.com/pgvector/pgvector-python/blob/master/examples/hybrid_search/cross_encoder.py) with SentenceTransformers (cross-encoder)
3434
- [Sparse search](https://github.com/pgvector/pgvector-python/blob/master/examples/sparse_search/example.py) with Transformers
3535
- [Late interaction search](https://github.com/pgvector/pgvector-python/blob/master/examples/colbert/exact.py) with ColBERT
36+
- [Visual document retrieval](https://github.com/pgvector/pgvector-python/blob/master/examples/colpali/exact.py) with ColPali
3637
- [Image search](https://github.com/pgvector/pgvector-python/blob/master/examples/image_search/example.py) with PyTorch
3738
- [Image search](https://github.com/pgvector/pgvector-python/blob/master/examples/imagehash/example.py) with perceptual hashing
3839
- [Morgan fingerprints](https://github.com/pgvector/pgvector-python/blob/master/examples/rdkit/example.py) with RDKit

examples/colpali/exact.py

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
from colpali_engine.models import ColQwen2, ColQwen2Processor
2+
from colpali_engine.utils.torch_utils import get_torch_device
3+
from datasets import load_dataset
4+
from pgvector.psycopg import register_vector, Bit
5+
import psycopg
6+
import torch
7+
8+
# Connect in autocommit mode, so no explicit commit is issued anywhere in this script.
conn = psycopg.connect(dbname='pgvector_example', autocommit=True)

# Create the extension first, then register the vector types with this connection.
conn.execute('CREATE EXTENSION IF NOT EXISTS vector')
register_vector(conn)

# Recreate the table from scratch: each row stores an array of 128-bit vectors.
for statement in (
    'DROP TABLE IF EXISTS documents',
    'CREATE TABLE documents (id bigserial PRIMARY KEY, embeddings bit(128)[])',
):
    conn.execute(statement)
15+
# Define max_sim(document, query) on the server. For every query bit vector it
# takes the highest similarity against any document bit vector, where similarity
# is 1 - (distance from <~>) / (number of bits), and returns the sum of those
# per-query maxima.
# The derived table carries an explicit alias (query_vectors) because PostgreSQL
# releases before 16 reject an unaliased subquery in FROM.
conn.execute("""
CREATE OR REPLACE FUNCTION max_sim(document bit[], query bit[]) RETURNS double precision AS $$
    WITH queries AS (
        SELECT row_number() OVER () AS query_number, * FROM (SELECT unnest(query) AS query) AS query_vectors
    ),
    documents AS (
        SELECT unnest(document) AS document
    ),
    similarities AS (
        SELECT query_number, 1 - ((document <~> query) / bit_length(query)) AS similarity FROM queries CROSS JOIN documents
    ),
    max_similarities AS (
        SELECT MAX(similarity) AS max_similarity FROM similarities GROUP BY query_number
    )
    SELECT SUM(max_similarity) FROM max_similarities
$$ LANGUAGE SQL
""")
32+
33+
# Load the ColQwen2 checkpoint in bfloat16 on the auto-selected device, switch it
# to eval mode, and load the processor from the same checkpoint.
checkpoint = 'vidore/colqwen2-v1.0'
device = get_torch_device('auto')
model = ColQwen2.from_pretrained(checkpoint, torch_dtype=torch.bfloat16, device_map=device)
model = model.eval()
processor = ColQwen2Processor.from_pretrained(checkpoint)
36+
37+
38+
def generate_embeddings(processed):
    """Run the model on a processed batch and return the output as a NumPy array.

    Args:
        processed: Batch produced by the processor. It is moved to the model's
            device and unpacked into the model call as keyword arguments.

    Returns:
        The model output cast to float32 and converted with
        ``numpy(force=True)``.
    """
    # Inference only: no gradients are tracked for anything in this block.
    with torch.no_grad():
        batch = processed.to(model.device)
        output = model(**batch)
        # Cast to float32 before the NumPy conversion (the model is loaded in
        # bfloat16 elsewhere in this script).
        as_float32 = output.to(torch.float32)
        return as_float32.numpy(force=True)
41+
42+
43+
def binary_quantize(embedding):
    """Binary-quantize an embedding by the sign of each component.

    Components greater than zero map to True and all others to False; the
    resulting mask is wrapped in a pgvector ``Bit`` value.
    """
    positive_mask = embedding > 0
    return Bit(positive_mask)
45+
46+
47+
# Embed the first three test images and store one row per image, each row
# holding that image's list of binary-quantized vectors.
# (Named `images` rather than `input` so the builtin input() is not shadowed.)
images = load_dataset('vidore/docvqa_test_subsampled', split='test[:3]')['image']
for image in images:
    # The batch contains a single image, so take element [0] of the result.
    embeddings = [binary_quantize(e) for e in generate_embeddings(processor.process_images([image]))[0]]
    conn.execute('INSERT INTO documents (embeddings) VALUES (%s)', (embeddings,))

# Embed the text query the same way and rank stored documents by max_sim,
# highest score first.
query = 'dividend'
query_embeddings = [binary_quantize(e) for e in generate_embeddings(processor.process_queries([query]))[0]]
result = conn.execute('SELECT id, max_sim(embeddings, %s) AS max_sim FROM documents ORDER BY max_sim DESC LIMIT 5', (query_embeddings,)).fetchall()
for row in result:
    print(row)

examples/colpali/requirements.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
colpali-engine
2+
datasets
3+
pgvector
4+
psycopg[binary]

0 commit comments

Comments
 (0)