From ac1a543ab33a09efa2758f0179cea6a89257b601 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 15 Mar 2025 14:07:29 -0700 Subject: [PATCH 01/47] Improved validation for Bit constructor --- pgvector/bit.py | 4 +++- tests/test_bit.py | 5 +++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/pgvector/bit.py b/pgvector/bit.py index 4be7385..9a890a1 100644 --- a/pgvector/bit.py +++ b/pgvector/bit.py @@ -13,7 +13,9 @@ def __init__(self, value): elif value.dtype != np.bool: raise ValueError('expected dtype to be bool or uint8') else: - value = np.asarray(value, dtype=bool) + value = np.asarray(value) + if value.dtype != np.bool: + raise ValueError('expected dtype to be bool') if value.ndim != 1: raise ValueError('expected ndim to be 1') diff --git a/tests/test_bit.py b/tests/test_bit.py index 5e1bff2..0c661d0 100644 --- a/tests/test_bit.py +++ b/tests/test_bit.py @@ -7,6 +7,11 @@ class TestBit: def test_list(self): assert Bit([True, False, True]).to_list() == [True, False, True] + def test_list_int(self): + with pytest.raises(ValueError) as error: + Bit([254, 7, 0]) + assert str(error.value) == 'expected dtype to be bool' + def test_tuple(self): assert Bit((True, False, True)).to_list() == [True, False, True] From 900cbb38370eebfeebdd519482cfd1a30cf6e937 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 15 Mar 2025 14:14:46 -0700 Subject: [PATCH 02/47] Improved error message --- pgvector/bit.py | 2 +- tests/test_bit.py | 7 ++++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/pgvector/bit.py b/pgvector/bit.py index 9a890a1..a8feb55 100644 --- a/pgvector/bit.py +++ b/pgvector/bit.py @@ -15,7 +15,7 @@ def __init__(self, value): else: value = np.asarray(value) if value.dtype != np.bool: - raise ValueError('expected dtype to be bool') + raise ValueError('expected all elements to be boolean') if value.ndim != 1: raise ValueError('expected ndim to be 1') diff --git a/tests/test_bit.py b/tests/test_bit.py index 0c661d0..ae27359 100644 --- a/tests/test_bit.py +++ b/tests/test_bit.py @@ -7,10 +7,15 @@ class TestBit: def test_list(self): assert Bit([True, False, True]).to_list() == [True, False, True] + def test_list_none(self): + with pytest.raises(ValueError) as error: + Bit([True, None, True]) + assert str(error.value) == 'expected all elements to be boolean' + def test_list_int(self): with pytest.raises(ValueError) as error: Bit([254, 7, 0]) - assert str(error.value) == 'expected dtype to be bool' + assert str(error.value) == 'expected all elements to be boolean' def test_tuple(self): assert Bit((True, False, True)).to_list() == [True, False, True] From 534ec18683d4c5e3058ba14d7810d0d5df7d8c55 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 15 Mar 2025 14:34:18 -0700 Subject: [PATCH 03/47] Added support for bytes to Bit constructor --- CHANGELOG.md | 1 + pgvector/bit.py | 2 ++ tests/test_bit.py | 4 ++++ 3 files changed, 7 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index ebc165a..89e955a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,7 @@ - Added top-level `pgvector` package - Added support for pg8000 +- Added support for `bytes` to `Bit` constructor - Changed `globally` option to default to `False` for Psycopg 2 - Changed `arrays` option to default to `True` for Psycopg 2 - Fixed equality for `Vector`, `HalfVector`, `Bit`, and `SparseVector` classes diff --git a/pgvector/bit.py b/pgvector/bit.py index a8feb55..8766f65 100644 --- a/pgvector/bit.py +++ b/pgvector/bit.py @@ -6,6 +6,8 @@ class Bit: def __init__(self, value): if isinstance(value, str): self._value = self.from_text(value)._value + elif isinstance(value, bytes): + self._value = np.unpackbits(np.frombuffer(value, dtype=np.uint8)).astype(bool) else: if isinstance(value, np.ndarray): if value.dtype == np.uint8: diff --git a/tests/test_bit.py b/tests/test_bit.py index ae27359..571205f 100644 --- a/tests/test_bit.py +++ b/tests/test_bit.py @@ -23,6 +23,10 @@ def test_tuple(self): def test_str(self): assert Bit('101').to_list() == [True, False, True] + def test_bytes(self): + assert Bit(b'\xff\x00').to_list() == [True, True, True, True, True, True, True, True, False, False, False, False, False, False, False, False] + assert Bit(b'\xfe\x07').to_list() == [True, True, True, True, True, True, True, False, False, False, False, False, False, True, True, True] + def test_ndarray_uint8(self): arr = np.array([254, 7, 0], dtype=np.uint8) assert Bit(arr).to_text() == '111111100000011100000000' From 2d1b754773f8c4f41970b3f61b93b20460961f98 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 15 Mar 2025 14:54:26 -0700 Subject: [PATCH 04/47] Restored backwards compatibility of Bit constructor --- pgvector/bit.py | 15 ++++++--------- tests/test_bit.py | 18 ++++++++---------- 2 files changed, 14 insertions(+), 19 deletions(-) diff --git a/pgvector/bit.py b/pgvector/bit.py index 8766f65..935f0f0 100644 --- a/pgvector/bit.py +++ b/pgvector/bit.py @@ -1,5 +1,6 @@ import numpy as np from struct import pack, unpack_from +from warnings import warn class Bit: @@ -9,15 +10,11 @@ def __init__(self, value): elif isinstance(value, bytes): self._value = np.unpackbits(np.frombuffer(value, dtype=np.uint8)).astype(bool) else: - if isinstance(value, np.ndarray): - if value.dtype == np.uint8: - value = np.unpackbits(value).astype(bool) - elif value.dtype != np.bool: - raise ValueError('expected dtype to be bool or uint8') - else: - value = np.asarray(value) - if value.dtype != np.bool: - raise ValueError('expected all elements to be boolean') + value = np.asarray(value) + + if value.dtype != np.bool: + warn('expected elements to be boolean', stacklevel=2) + value = value.astype(bool) if value.ndim != 1: raise ValueError('expected ndim to be 1') diff --git a/tests/test_bit.py b/tests/test_bit.py index 571205f..a13f476 100644 --- a/tests/test_bit.py +++ b/tests/test_bit.py @@ -8,14 +8,12 @@ def test_list(self): assert Bit([True, False, True]).to_list() == [True, False, True] def test_list_none(self): - with pytest.raises(ValueError) as error: - Bit([True, None, True]) - assert str(error.value) == 'expected all elements to be boolean' + with pytest.warns(UserWarning, match='expected elements to be boolean'): + assert Bit([True, None, True]).to_text() == '101' def test_list_int(self): - with pytest.raises(ValueError) as error: - Bit([254, 7, 0]) - assert str(error.value) == 'expected all elements to be boolean' + with pytest.warns(UserWarning, match='expected elements to be boolean'): + assert Bit([254, 7, 0]).to_text() == '110' def test_tuple(self): assert Bit((True, False, True)).to_list() == [True, False, True] @@ -29,13 +27,13 @@ def test_bytes(self): def test_ndarray_uint8(self): arr = np.array([254, 7, 0], dtype=np.uint8) - assert Bit(arr).to_text() == '111111100000011100000000' + with pytest.warns(UserWarning, match='expected elements to be boolean'): + assert Bit(arr).to_text() == '110' def test_ndarray_uint16(self): arr = np.array([254, 7, 0], dtype=np.uint16) - with pytest.raises(ValueError) as error: - Bit(arr) - assert str(error.value) == 'expected dtype to be bool or uint8' + with pytest.warns(UserWarning, match='expected elements to be boolean'): + assert Bit(arr).to_text() == '110' def test_ndarray_same_object(self): arr = np.array([True, False, True]) From 2ce3f43e6693fec29e92fa84f7d46fefb96f98f0 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 15 Mar 2025 15:35:09 -0700 Subject: [PATCH 05/47] Improved internal representation of Bit class --- pgvector/bit.py | 47 +++++++++++++++++++++++++++-------------------- tests/test_bit.py | 10 +++++----- 2 files changed, 32 insertions(+), 25 deletions(-) diff --git a/pgvector/bit.py b/pgvector/bit.py index 935f0f0..72b8052 100644 --- a/pgvector/bit.py +++ b/pgvector/bit.py @@ -5,51 +5,58 @@ class Bit: def __init__(self, value): - if isinstance(value, str): - self._value = self.from_text(value)._value - elif isinstance(value, bytes): - self._value = np.unpackbits(np.frombuffer(value, dtype=np.uint8)).astype(bool) + if isinstance(value, bytes): + self._len = 8 * len(value) + self._data = value else: - value = np.asarray(value) + if isinstance(value, str): + value = [v != '0' for v in value] + else: + value = np.asarray(value) - if value.dtype != np.bool: - warn('expected elements to be boolean', stacklevel=2) - value = value.astype(bool) + if value.dtype != np.bool: + warn('expected elements to be boolean', stacklevel=2) + value = value.astype(bool) - if value.ndim != 1: - raise ValueError('expected ndim to be 1') + if value.ndim != 1: + raise ValueError('expected ndim to be 1') - self._value = value + self._len = len(value) + self._data = np.packbits(value).tobytes() def __repr__(self): return f'Bit({self.to_text()})' def __eq__(self, other): if isinstance(other, self.__class__): - return np.array_equal(self.to_numpy(), other.to_numpy()) + return self._len == other._len and self._data == other._data return False def to_list(self): - return self._value.tolist() + return self.to_numpy().tolist() def to_numpy(self): - return self._value + return np.unpackbits(np.frombuffer(self._data, dtype=np.uint8), count=self._len).astype(bool) def to_text(self): - return ''.join(self._value.astype(np.uint8).astype(str)) + return ''.join(format(v, '08b') for v in self._data)[:self._len] def to_binary(self): - return pack('>i', len(self._value)) + np.packbits(self._value).tobytes() + return pack('>i', self._len) + self._data @classmethod def from_text(cls, value): - return cls(np.asarray([v != '0' for v in value], dtype=bool)) + return cls(str(value)) @classmethod def from_binary(cls, value): - count = unpack_from('>i', value)[0] - buf = np.frombuffer(value, dtype=np.uint8, offset=4) - return cls(np.unpackbits(buf, count=count).astype(bool)) + if not isinstance(value, bytes): + raise ValueError('expected bytes') + + bit = cls.__new__(cls) + bit._len = unpack_from('>i', value)[0] + bit._data = value[4:] + return bit @classmethod def _to_db(cls, value): diff --git a/tests/test_bit.py b/tests/test_bit.py index a13f476..cf1275e 100644 --- a/tests/test_bit.py +++ b/tests/test_bit.py @@ -25,6 +25,11 @@ def test_bytes(self): assert Bit(b'\xff\x00').to_list() == [True, True, True, True, True, True, True, True, False, False, False, False, False, False, False, False] assert Bit(b'\xfe\x07').to_list() == [True, True, True, True, True, True, True, False, False, False, False, False, False, True, True, True] + def test_ndarray(self): + arr = np.array([True, False, True]) + assert Bit(arr).to_list() == [True, False, True] + assert np.array_equal(Bit(arr).to_numpy(), arr) + def test_ndarray_uint8(self): arr = np.array([254, 7, 0], dtype=np.uint8) with pytest.warns(UserWarning, match='expected elements to be boolean'): @@ -35,11 +40,6 @@ def test_ndarray_uint16(self): with pytest.warns(UserWarning, match='expected elements to be boolean'): assert Bit(arr).to_text() == '110' - def test_ndarray_same_object(self): - arr = np.array([True, False, True]) - assert Bit(arr).to_list() == [True, False, True] - assert Bit(arr).to_numpy() is arr - def test_ndim_two(self): with pytest.raises(ValueError) as error: Bit([[True, False], [True, False]]) From c2c17c2ab6365e55677bde47d1d13c63b4e87642 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 15 Mar 2025 16:02:46 -0700 Subject: [PATCH 06/47] Removed warning for result of np.unpackbits --- pgvector/bit.py | 4 +++- tests/test_bit.py | 4 ++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/pgvector/bit.py b/pgvector/bit.py index 72b8052..edfaec6 100644 --- a/pgvector/bit.py +++ b/pgvector/bit.py @@ -15,7 +15,9 @@ def __init__(self, value): value = np.asarray(value) if value.dtype != np.bool: - warn('expected elements to be boolean', stacklevel=2) + # allow result of np.unpackbits + if value.dtype != np.uint8 or np.any(value > 1): + warn('expected elements to be boolean', stacklevel=2) value = value.astype(bool) if value.ndim != 1: diff --git a/tests/test_bit.py b/tests/test_bit.py index cf1275e..ef049c7 100644 --- a/tests/test_bit.py +++ b/tests/test_bit.py @@ -30,6 +30,10 @@ def test_ndarray(self): assert Bit(arr).to_list() == [True, False, True] assert np.array_equal(Bit(arr).to_numpy(), arr) + def test_ndarray_unpackbits(self): + arr = np.unpackbits(np.array([254, 7, 0], dtype=np.uint8)) + assert Bit(arr).to_text() == '111111100000011100000000' + def test_ndarray_uint8(self): arr = np.array([254, 7, 0], dtype=np.uint8) with pytest.warns(UserWarning, match='expected elements to be boolean'): From 50fac76f7959a155444e46d9e11be42403b09b26 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 15 Mar 2025 16:04:10 -0700 Subject: [PATCH 07/47] Improved test --- tests/test_bit.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_bit.py b/tests/test_bit.py index ef049c7..5a71642 100644 --- a/tests/test_bit.py +++ b/tests/test_bit.py @@ -22,8 +22,8 @@ def test_str(self): assert Bit('101').to_list() == [True, False, True] def test_bytes(self): - assert Bit(b'\xff\x00').to_list() == [True, True, True, True, True, True, True, True, False, False, False, False, False, False, False, False] - assert Bit(b'\xfe\x07').to_list() == [True, True, True, True, True, True, True, False, False, False, False, False, False, True, True, True] + assert Bit(b'\xff\x00\xf0').to_text() == '111111110000000011110000' + assert Bit(b'\xfe\x07\x00').to_text() == '111111100000011100000000' def test_ndarray(self): arr = np.array([True, False, True]) From 92bb02a531fc012369ee20f065028aec230d5dcf Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 15 Mar 2025 16:05:17 -0700 Subject: [PATCH 08/47] Updated comment [skip ci] --- pgvector/bit.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pgvector/bit.py b/pgvector/bit.py index edfaec6..26a9d8d 100644 --- a/pgvector/bit.py +++ b/pgvector/bit.py @@ -15,7 +15,7 @@ def __init__(self, value): value = np.asarray(value) if value.dtype != np.bool: - # allow result of np.unpackbits + # skip warning for result of np.unpackbits if value.dtype != np.uint8 or np.any(value > 1): warn('expected elements to be boolean', stacklevel=2) value = value.astype(bool) From 4e22f9b26545f1b871cfba0fde21812ebc88ca84 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 15 Mar 2025 16:16:01 -0700 Subject: [PATCH 09/47] Updated warning message --- pgvector/bit.py | 2 +- tests/test_bit.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pgvector/bit.py b/pgvector/bit.py index 26a9d8d..e82b325 100644 --- a/pgvector/bit.py +++ b/pgvector/bit.py @@ -17,7 +17,7 @@ def __init__(self, value): if value.dtype != np.bool: # skip warning for result of np.unpackbits if value.dtype != np.uint8 or np.any(value > 1): - warn('expected elements to be boolean', stacklevel=2) + warn('elements should be boolean', stacklevel=2) value = value.astype(bool) if value.ndim != 1: diff --git a/tests/test_bit.py b/tests/test_bit.py index 5a71642..e920228 100644 --- a/tests/test_bit.py +++ b/tests/test_bit.py @@ -8,11 +8,11 @@ def test_list(self): assert Bit([True, False, True]).to_list() == [True, False, True] def test_list_none(self): - with pytest.warns(UserWarning, match='expected elements to be boolean'): + with pytest.warns(UserWarning, match='elements should be boolean'): assert Bit([True, None, True]).to_text() == '101' def test_list_int(self): - with pytest.warns(UserWarning, match='expected elements to be boolean'): + with pytest.warns(UserWarning, match='elements should be boolean'): assert Bit([254, 7, 0]).to_text() == '110' def test_tuple(self): @@ -36,12 +36,12 @@ def test_ndarray_unpackbits(self): def test_ndarray_uint8(self): arr = np.array([254, 7, 0], dtype=np.uint8) - with pytest.warns(UserWarning, match='expected elements to be boolean'): + with pytest.warns(UserWarning, match='elements should be boolean'): assert Bit(arr).to_text() == '110' def test_ndarray_uint16(self): arr = np.array([254, 7, 0], dtype=np.uint16) - with pytest.warns(UserWarning, match='expected elements to be boolean'): + with pytest.warns(UserWarning, match='elements should be boolean'): assert Bit(arr).to_text() == '110' def test_ndim_two(self): From 7a2dd806e79ad82960cc1a89159ca61f9a12a373 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 15 Mar 2025 16:20:17 -0700 Subject: [PATCH 10/47] Revert "Updated warning message" This reverts commit 4e22f9b26545f1b871cfba0fde21812ebc88ca84. --- pgvector/bit.py | 2 +- tests/test_bit.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/pgvector/bit.py b/pgvector/bit.py index e82b325..26a9d8d 100644 --- a/pgvector/bit.py +++ b/pgvector/bit.py @@ -17,7 +17,7 @@ def __init__(self, value): if value.dtype != np.bool: # skip warning for result of np.unpackbits if value.dtype != np.uint8 or np.any(value > 1): - warn('elements should be boolean', stacklevel=2) + warn('expected elements to be boolean', stacklevel=2) value = value.astype(bool) if value.ndim != 1: diff --git a/tests/test_bit.py b/tests/test_bit.py index e920228..5a71642 100644 --- a/tests/test_bit.py +++ b/tests/test_bit.py @@ -8,11 +8,11 @@ def test_list(self): assert Bit([True, False, True]).to_list() == [True, False, True] def test_list_none(self): - with pytest.warns(UserWarning, match='elements should be boolean'): + with pytest.warns(UserWarning, match='expected elements to be boolean'): assert Bit([True, None, True]).to_text() == '101' def test_list_int(self): - with pytest.warns(UserWarning, match='elements should be boolean'): + with pytest.warns(UserWarning, match='expected elements to be boolean'): assert Bit([254, 7, 0]).to_text() == '110' def test_tuple(self): @@ -36,12 +36,12 @@ def test_ndarray_unpackbits(self): def test_ndarray_uint8(self): arr = np.array([254, 7, 0], dtype=np.uint8) - with pytest.warns(UserWarning, match='elements should be boolean'): + with pytest.warns(UserWarning, match='expected elements to be boolean'): assert Bit(arr).to_text() == '110' def test_ndarray_uint16(self): arr = np.array([254, 7, 0], dtype=np.uint16) - with pytest.warns(UserWarning, match='elements should be boolean'): + with pytest.warns(UserWarning, match='expected elements to be boolean'): assert Bit(arr).to_text() == '110' def test_ndim_two(self): From 6bb6df8cce6d5b03e1a8a9b683ae37faaf12db7a Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 15 Mar 2025 16:35:04 -0700 Subject: [PATCH 11/47] Removed unreleased import --- pgvector/psycopg2/__init__.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/pgvector/psycopg2/__init__.py b/pgvector/psycopg2/__init__.py index b40c673..33e5124 100644 --- a/pgvector/psycopg2/__init__.py +++ b/pgvector/psycopg2/__init__.py @@ -1,11 +1,10 @@ from .register import register_vector # TODO remove -from .. import HalfVector, SparseVector, Vector +from .. import HalfVector, SparseVector __all__ = [ 'register_vector', - 'Vector', 'HalfVector', 'SparseVector' ] From a8f2a5f8428ae10d79be53c0367fc007eca4ab78 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 15 Mar 2025 17:53:02 -0700 Subject: [PATCH 12/47] Version bump to 0.4.0 [skip ci] --- CHANGELOG.md | 2 +- README.md | 2 +- pyproject.toml | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 89e955a..d0e2730 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.4.0 (unreleased) +## 0.4.0 (2025-03-15) - Added top-level `pgvector` package - Added support for pg8000 diff --git a/README.md b/README.md index 299753e..b6bc055 100644 --- a/README.md +++ b/README.md @@ -22,7 +22,7 @@ And follow the instructions for your database library: - [Psycopg 3](#psycopg-3) - [Psycopg 2](#psycopg-2) - [asyncpg](#asyncpg) -- [pg8000](#pg8000) [unreleased] +- [pg8000](#pg8000) - [Peewee](#peewee) Or check out some examples: diff --git a/pyproject.toml b/pyproject.toml index 0f291f5..b889f4b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "pgvector" -version = "0.3.6" +version = "0.4.0" description = "pgvector support for Python" readme = "README.md" authors = [ From e19df465f0745aef4240f5388b5ca765137397be Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Mon, 17 Mar 2025 15:58:59 -0700 Subject: [PATCH 13/47] Added basic RAG example [skip ci] --- .gitignore | 1 + README.md | 1 + examples/rag/example.py | 65 +++++++++++++++++++++++++++++++++++ examples/rag/requirements.txt | 3 ++ 4 files changed, 70 insertions(+) create mode 100644 examples/rag/example.py create mode 100644 examples/rag/requirements.txt diff --git a/.gitignore b/.gitignore index f7ff659..c55ff44 100644 --- a/.gitignore +++ b/.gitignore @@ -6,3 +6,4 @@ venv/ *.pyc __pycache__ .pytest_cache/ +examples/rag/README.md diff --git a/README.md b/README.md index b6bc055..24d9bb9 100644 --- a/README.md +++ b/README.md @@ -27,6 +27,7 @@ And follow the instructions for your database library: Or check out some examples: +- [Retrieval-augmented generation](https://github.com/pgvector/pgvector-python/blob/master/examples/rag/example.py) with Ollama - [Embeddings](https://github.com/pgvector/pgvector-python/blob/master/examples/openai/example.py) with OpenAI - [Binary embeddings](https://github.com/pgvector/pgvector-python/blob/master/examples/cohere/example.py) with Cohere - [Sentence embeddings](https://github.com/pgvector/pgvector-python/blob/master/examples/sentence_transformers/example.py) with SentenceTransformers diff --git a/examples/rag/example.py b/examples/rag/example.py new file mode 100644 index 0000000..4d5d307 --- /dev/null +++ b/examples/rag/example.py @@ -0,0 +1,65 @@ +# Run: +# ollama pull llama3.2 +# ollama pull nomic-embed-text +# ollama serve + +import numpy as np +import ollama +from pathlib import Path +from pgvector.psycopg import register_vector +import psycopg +import urllib.request + +query = 'What index types are supported?' +load_data = True + +conn = psycopg.connect(dbname='pgvector_example', autocommit=True) +conn.execute('CREATE EXTENSION IF NOT EXISTS vector') +register_vector(conn) + +if load_data: + # get data + url = 'https://raw.githubusercontent.com/pgvector/pgvector/refs/heads/master/README.md' + dest = Path(__file__).parent / 'README.md' + if not dest.exists(): + urllib.request.urlretrieve(url, dest) + + with open(dest, encoding='utf-8') as f: + doc = f.read() + + # generate chunks + # TODO improve chunking + # TODO remove markdown + chunks = doc.split('\n## ') + + # embed chunks + # nomic-embed-text has task instruction prefix + input = ['search_document: ' + chunk for chunk in chunks] + embeddings = ollama.embed(model='nomic-embed-text', input=input).embeddings + + # create table + conn.execute('DROP TABLE IF EXISTS chunks') + conn.execute('CREATE TABLE chunks (id bigserial PRIMARY KEY, content text, embedding vector(768))') + + # store chunks + cur = conn.cursor() + with cur.copy('COPY chunks (content, embedding) FROM STDIN WITH (FORMAT BINARY)') as copy: + copy.set_types(['text', 'vector']) + + for content, embedding in zip(chunks, embeddings): + copy.write_row([content, embedding]) + +# embed query +# nomic-embed-text has task instruction prefix +input = 'search_query: ' + query +embedding = ollama.embed(model='nomic-embed-text', input=input).embeddings[0] + +# retrieve chunks +result = conn.execute('SELECT content FROM chunks ORDER BY embedding <=> %s LIMIT 5', (np.array(embedding),)).fetchall() +context = '\n\n'.join([row[0] for row in result]) + +# get answer +# TODO improve prompt +prompt = f'Answer this question: {query}\n\n{context}' +response = ollama.generate(model='llama3.2', prompt=prompt).response +print(response) diff --git a/examples/rag/requirements.txt b/examples/rag/requirements.txt new file mode 100644 index 0000000..4eb5864 --- /dev/null +++ b/examples/rag/requirements.txt @@ -0,0 +1,3 @@ +ollama +pgvector +psycopg[binary] From 1901b9cc8ab1eaf3a7415e3424509381a3399ccc Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Mon, 24 Mar 2025 01:20:18 -0700 Subject: [PATCH 14/47] Improved test [skip ci] --- tests/test_sqlalchemy.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_sqlalchemy.py b/tests/test_sqlalchemy.py index 0d8d1ca..5aec977 100644 --- a/tests/test_sqlalchemy.py +++ b/tests/test_sqlalchemy.py @@ -43,7 +43,7 @@ def psycopg_connect(dbapi_connection, connection_record): psycopg_async_type_engine = create_async_engine('postgresql+psycopg://localhost/pgvector_python_test') @event.listens_for(psycopg_async_type_engine.sync_engine, "connect") - def connect(dbapi_connection, connection_record): + def psycopg_async_connect(dbapi_connection, connection_record): from pgvector.psycopg import register_vector_async dbapi_connection.run_async(register_vector_async) @@ -51,7 +51,7 @@ def connect(dbapi_connection, connection_record): asyncpg_type_engine = create_async_engine('postgresql+asyncpg://localhost/pgvector_python_test') @event.listens_for(asyncpg_type_engine.sync_engine, "connect") - def connect(dbapi_connection, connection_record): + def asyncpg_connect(dbapi_connection, connection_record): from pgvector.asyncpg import register_vector dbapi_connection.run_async(register_vector) From eb654016181b69e9ed06871c39d8df329614cb66 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Tue, 8 Apr 2025 22:42:07 -0700 Subject: [PATCH 15/47] Added ColBERT example for approximate search - #123 [skip ci] --- examples/colbert/approximate.py | 75 +++++++++++++++++++++++++++++++++ 1 file changed, 75 insertions(+) create mode 100644 examples/colbert/approximate.py diff --git a/examples/colbert/approximate.py b/examples/colbert/approximate.py new file mode 100644 index 0000000..0508d0f --- /dev/null +++ b/examples/colbert/approximate.py @@ -0,0 +1,75 @@ +# approach from section 3.6 in https://arxiv.org/abs/2004.12832 + +from colbert.infra import ColBERTConfig +from colbert.modeling.checkpoint import Checkpoint +from pgvector.psycopg import register_vector +import psycopg + +conn = psycopg.connect(dbname='pgvector_example', autocommit=True) + +conn.execute('CREATE EXTENSION IF NOT EXISTS vector') +register_vector(conn) + +conn.execute('DROP TABLE IF EXISTS documents') +conn.execute('DROP TABLE IF EXISTS document_embeddings') +conn.execute('CREATE TABLE documents (id bigserial PRIMARY KEY, content text)') +conn.execute('CREATE TABLE document_embeddings (id bigserial PRIMARY KEY, document_id bigint, embedding vector(128))') +conn.execute(""" +CREATE OR REPLACE FUNCTION max_sim(document vector[], query vector[]) RETURNS double precision AS $$ + WITH queries AS ( + SELECT row_number() OVER () AS query_number, * FROM (SELECT unnest(query) AS query) + ), + documents AS ( + SELECT unnest(document) AS document + ), + similarities AS ( + SELECT query_number, 1 - (document <=> query) AS similarity FROM queries CROSS JOIN documents + ), + max_similarities AS ( + SELECT MAX(similarity) AS max_similarity FROM similarities GROUP BY query_number + ) + SELECT SUM(max_similarity) FROM max_similarities +$$ LANGUAGE SQL +""") + +config = ColBERTConfig(doc_maxlen=220, query_maxlen=32) +checkpoint = Checkpoint('colbert-ir/colbertv2.0', colbert_config=config, verbose=0) + +input = [ + 'The dog is barking', + 'The cat is purring', + 'The bear is growling' +] +doc_embeddings = checkpoint.docFromText(input, keep_dims=False) +for content, embeddings in zip(input, doc_embeddings): + with conn.transaction(): + result = conn.execute('INSERT INTO documents (content) VALUES (%s) RETURNING id', (content,)).fetchone() + params = [] + for embedding in embeddings: + params.extend([result[0], embedding.numpy()]) + values = ', '.join(['(%s, %s)' for _ in embeddings]) + conn.execute(f'INSERT INTO document_embeddings (document_id, embedding) VALUES {values}', params) + +conn.execute('CREATE INDEX ON document_embeddings (document_id)') +conn.execute('CREATE INDEX ON document_embeddings USING hnsw (embedding vector_cosine_ops)') + +query = 'puppy' +query_embeddings = [e.numpy() for e in checkpoint.queryFromText([query])[0]] +approximate_stage = ' UNION ALL '.join(['(SELECT document_id FROM document_embeddings ORDER BY embedding <=> %s LIMIT 5)' for _ in query_embeddings]) +sql = f""" +WITH approximate_stage AS ( + {approximate_stage} +), +embeddings AS ( + SELECT document_id, array_agg(embedding) AS embeddings FROM document_embeddings + WHERE document_id IN (SELECT DISTINCT document_id FROM approximate_stage) + GROUP BY document_id +) +SELECT content, max_sim(embeddings, %s) AS max_sim FROM documents +INNER JOIN embeddings ON embeddings.document_id = documents.id +ORDER BY max_sim DESC LIMIT 10 +""" +params = [v for v in query_embeddings] + [query_embeddings] +result = conn.execute(sql, params).fetchall() +for row in result: + print(row) From 8718cdde9f91490b39a06293ec48d8f26193334b Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Tue, 8 Apr 2025 22:51:47 -0700 Subject: [PATCH 16/47] Updated comment [skip ci] --- examples/colbert/approximate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/colbert/approximate.py b/examples/colbert/approximate.py index 0508d0f..fc1d396 100644 --- a/examples/colbert/approximate.py +++ b/examples/colbert/approximate.py @@ -1,4 +1,4 @@ -# approach from section 3.6 in https://arxiv.org/abs/2004.12832 +# based on section 3.6 of https://arxiv.org/abs/2004.12832 from colbert.infra import ColBERTConfig from colbert.modeling.checkpoint import Checkpoint From 123f74343b03a7910b8b66de4fc33127f4696430 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Tue, 8 Apr 2025 22:53:10 -0700 Subject: [PATCH 17/47] Improved example [skip ci] --- examples/colbert/approximate.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/examples/colbert/approximate.py b/examples/colbert/approximate.py index fc1d396..290e66d 100644 --- a/examples/colbert/approximate.py +++ b/examples/colbert/approximate.py @@ -12,8 +12,10 @@ conn.execute('DROP TABLE IF EXISTS documents') conn.execute('DROP TABLE IF EXISTS document_embeddings') + conn.execute('CREATE TABLE documents (id bigserial PRIMARY KEY, content text)') conn.execute('CREATE TABLE document_embeddings (id bigserial PRIMARY KEY, document_id bigint, embedding vector(128))') + conn.execute(""" CREATE OR REPLACE FUNCTION max_sim(document vector[], query vector[]) RETURNS double precision AS $$ WITH queries AS ( @@ -69,7 +71,7 @@ INNER JOIN embeddings ON embeddings.document_id = documents.id ORDER BY max_sim DESC LIMIT 10 """ -params = [v for v in query_embeddings] + [query_embeddings] +params = query_embeddings + [query_embeddings] result = conn.execute(sql, params).fetchall() for row in result: print(row) From bef31a81ced1517f33c5fd960e7ba10f2fd5d8e2 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Tue, 8 Apr 2025 23:02:35 -0700 Subject: [PATCH 18/47] Improved ColBERT examples [skip ci] --- examples/colbert/approximate.py | 4 ++++ examples/colbert/exact.py | 4 ++++ examples/colbert/requirements.txt | 1 + 3 files changed, 9 insertions(+) diff --git a/examples/colbert/approximate.py b/examples/colbert/approximate.py index 290e66d..623f913 100644 --- a/examples/colbert/approximate.py +++ b/examples/colbert/approximate.py @@ -4,6 +4,10 @@ from colbert.modeling.checkpoint import Checkpoint from pgvector.psycopg import register_vector import psycopg +import warnings + +# ignore warnings from colbert +warnings.filterwarnings('ignore') conn = psycopg.connect(dbname='pgvector_example', autocommit=True) diff --git a/examples/colbert/exact.py b/examples/colbert/exact.py index 1c90b47..ceed2e3 100644 --- a/examples/colbert/exact.py +++ b/examples/colbert/exact.py @@ -2,6 +2,10 @@ from colbert.modeling.checkpoint import Checkpoint from pgvector.psycopg import register_vector import psycopg +import warnings + +# ignore warnings from colbert +warnings.filterwarnings('ignore') conn = psycopg.connect(dbname='pgvector_example', autocommit=True) diff --git a/examples/colbert/requirements.txt b/examples/colbert/requirements.txt index 4402ce8..54b2cb9 100644 --- a/examples/colbert/requirements.txt +++ b/examples/colbert/requirements.txt @@ -1,3 +1,4 @@ colbert-ai pgvector psycopg[binary] +transformers==4.49.0 From 208b11a893c6e5a672481847251bc13a72c84165 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Tue, 8 Apr 2025 23:08:09 -0700 Subject: [PATCH 19/47] Improved examples[skip ci] --- examples/colbert/approximate.py | 6 +++--- examples/colbert/exact.py | 6 +++--- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/examples/colbert/approximate.py b/examples/colbert/approximate.py index 623f913..14f1ce0 100644 --- a/examples/colbert/approximate.py +++ b/examples/colbert/approximate.py @@ -6,9 +6,6 @@ import psycopg import warnings -# ignore warnings from colbert -warnings.filterwarnings('ignore') - conn = psycopg.connect(dbname='pgvector_example', autocommit=True) conn.execute('CREATE EXTENSION IF NOT EXISTS vector') @@ -38,6 +35,9 @@ $$ LANGUAGE SQL """) +# ignore warnings from colbert +warnings.filterwarnings('ignore') + config = ColBERTConfig(doc_maxlen=220, query_maxlen=32) checkpoint = Checkpoint('colbert-ir/colbertv2.0', colbert_config=config, verbose=0) diff --git a/examples/colbert/exact.py b/examples/colbert/exact.py index ceed2e3..c1ca236 100644 --- a/examples/colbert/exact.py +++ b/examples/colbert/exact.py @@ -4,9 +4,6 @@ import psycopg import warnings -# ignore warnings from colbert -warnings.filterwarnings('ignore') - conn = psycopg.connect(dbname='pgvector_example', autocommit=True) conn.execute('CREATE EXTENSION IF NOT EXISTS vector') @@ -32,6 +29,9 @@ $$ LANGUAGE SQL """) +# ignore warnings from colbert +warnings.filterwarnings('ignore') + config = ColBERTConfig(doc_maxlen=220, query_maxlen=32) checkpoint = Checkpoint('colbert-ir/colbertv2.0', colbert_config=config, verbose=0) From 6ff9b8997e75632936230829bd557281c49e1891 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Tue, 8 Apr 2025 23:13:23 -0700 Subject: [PATCH 20/47] Updated ColBERT examples [skip ci] --- examples/colbert/approximate.py | 3 +-- examples/colbert/exact.py | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/examples/colbert/approximate.py b/examples/colbert/approximate.py index 14f1ce0..41f88b2 100644 --- a/examples/colbert/approximate.py +++ b/examples/colbert/approximate.py @@ -35,8 +35,7 @@ $$ LANGUAGE SQL """) -# ignore warnings from colbert -warnings.filterwarnings('ignore') +warnings.filterwarnings('ignore') # ignore warnings from colbert config = ColBERTConfig(doc_maxlen=220, query_maxlen=32) checkpoint = Checkpoint('colbert-ir/colbertv2.0', colbert_config=config, verbose=0) diff --git a/examples/colbert/exact.py b/examples/colbert/exact.py index c1ca236..e6a2936 100644 --- a/examples/colbert/exact.py +++ b/examples/colbert/exact.py @@ -29,8 +29,7 @@ $$ LANGUAGE SQL """) -# ignore warnings from colbert -warnings.filterwarnings('ignore') +warnings.filterwarnings('ignore') # ignore warnings from colbert config = ColBERTConfig(doc_maxlen=220, query_maxlen=32) checkpoint = Checkpoint('colbert-ir/colbertv2.0', colbert_config=config, verbose=0) From 3f9e9a20b9f08033e7dc4e61ff4c43b34951d2ec Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Tue, 15 Apr 2025 10:01:51 -0700 Subject: [PATCH 21/47] Updated Cohere example [skip ci] --- examples/cohere/example.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/cohere/example.py b/examples/cohere/example.py index 393d1e0..5ef4eec 100644 --- a/examples/cohere/example.py +++ b/examples/cohere/example.py @@ -9,12 +9,12 @@ register_vector(conn) conn.execute('DROP TABLE IF EXISTS documents') -conn.execute('CREATE TABLE documents (id bigserial PRIMARY KEY, content text, embedding bit(1024))') +conn.execute('CREATE TABLE documents (id bigserial PRIMARY KEY, content text, embedding bit(1536))') def embed(input, input_type): - co = cohere.Client() - response = co.embed(texts=input, model='embed-english-v3.0', input_type=input_type, embedding_types=['ubinary']) + co = cohere.ClientV2() + response = co.embed(texts=input, model='embed-v4.0', input_type=input_type, embedding_types=['ubinary']) return [np.unpackbits(np.array(embedding, dtype=np.uint8)) for embedding in response.embeddings.ubinary] From 713590a798190b34f4c43c4b097dbd61455113c3 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Mon, 21 Apr 2025 03:09:36 -0700 Subject: [PATCH 22/47] Fixed SparseVector constructor for SciPy sparse matrices - fixes #127 --- CHANGELOG.md | 4 ++++ pgvector/sparsevec.py | 2 +- tests/test_sparse_vector.py | 14 +++++++++++++- 3 files changed, 18 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d0e2730..1bbd73c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,7 @@ +## 0.4.1 (unreleased) + +- Fixed `SparseVector` constructor for SciPy sparse matrices + ## 0.4.0 (2025-03-15) - Added top-level `pgvector` package diff --git a/pgvector/sparsevec.py b/pgvector/sparsevec.py index 8df2dfd..895fbd0 100644 --- a/pgvector/sparsevec.py +++ b/pgvector/sparsevec.py @@ -85,7 +85,7 @@ def _from_sparse(self, value): if hasattr(value, 'coords'): # scipy 1.13+ - self._indices = value.coords[0].tolist() + self._indices = value.coords[-1].tolist() else: self._indices = value.col.tolist() self._values = value.data.tolist() diff --git a/tests/test_sparse_vector.py b/tests/test_sparse_vector.py index dff03dd..933cfff 100644 --- a/tests/test_sparse_vector.py +++ b/tests/test_sparse_vector.py @@ -1,7 +1,7 @@ import numpy as np from pgvector import SparseVector import pytest -from scipy.sparse import coo_array +from scipy.sparse import coo_array, csr_array, csr_matrix from struct import pack @@ -49,6 +49,18 @@ def test_dok_array(self): assert vec.to_list() == [1, 0, 2, 0, 3, 0] assert vec.indices() == [0, 2, 4] + def test_csr_array(self): + arr = csr_array(np.array([1, 0, 2, 0, 3, 0])) + vec = SparseVector(arr) + assert vec.to_list() == [1, 0, 2, 0, 3, 0] + assert vec.indices() == [0, 2, 4] + + def test_csr_matrix(self): + mat = csr_matrix(np.array([1, 0, 2, 0, 3, 0])) + vec = SparseVector(mat) + assert vec.to_list() == [1, 0, 2, 0, 3, 0] + assert vec.indices() == [0, 2, 4] + def test_repr(self): assert repr(SparseVector([1, 0, 2, 0, 3, 0])) == 'SparseVector({0: 1.0, 2: 2.0, 4: 3.0}, 6)' assert str(SparseVector([1, 0, 2, 0, 3, 0])) == 'SparseVector({0: 1.0, 2: 2.0, 4: 3.0}, 6)' From 76afd8ec3013ac58bb6cc60a1b5b705f157ea18b Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Mon, 21 Apr 2025 03:15:41 -0700 Subject: [PATCH 23/47] Added test for coo_matrix --- tests/test_sparse_vector.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/tests/test_sparse_vector.py b/tests/test_sparse_vector.py index 933cfff..0cf0a72 100644 --- a/tests/test_sparse_vector.py +++ b/tests/test_sparse_vector.py @@ -1,7 +1,7 @@ import numpy as np from pgvector import SparseVector import pytest -from scipy.sparse import coo_array, csr_array, csr_matrix +from scipy.sparse import coo_array, coo_matrix, csr_array, csr_matrix from struct import pack @@ -43,6 +43,12 @@ def test_coo_array_dimensions(self): SparseVector(coo_array(np.array([1, 0, 2, 0, 3, 0])), 6) assert str(error.value) == 'extra argument' + def test_coo_matrix(self): + mat = coo_matrix(np.array([1, 0, 2, 0, 3, 0])) + vec = SparseVector(mat) + assert vec.to_list() == [1, 0, 2, 0, 3, 0] + assert vec.indices() == [0, 2, 4] + def test_dok_array(self): arr = coo_array(np.array([1, 0, 2, 0, 3, 0])).todok() vec = SparseVector(arr) From 809287f92847e1c609a9c395891da76f674379ea Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Mon, 21 Apr 2025 03:20:20 -0700 Subject: [PATCH 24/47] Fixed CI --- tests/test_sparse_vector.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_sparse_vector.py b/tests/test_sparse_vector.py index 0cf0a72..d580f32 100644 --- a/tests/test_sparse_vector.py +++ b/tests/test_sparse_vector.py @@ -56,7 +56,7 @@ def test_dok_array(self): assert vec.indices() == [0, 2, 4] def test_csr_array(self): - arr = csr_array(np.array([1, 0, 2, 0, 3, 0])) + arr = csr_array(np.array([[1, 0, 2, 0, 3, 0]])) vec = SparseVector(arr) assert vec.to_list() == [1, 0, 2, 0, 3, 0] assert vec.indices() == [0, 2, 4] From f9d2073df5cce39f0691ead6f9e030516baac7f8 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 26 Apr 2025 11:56:00 -0700 Subject: [PATCH 25/47] Version bump to 0.4.1 [skip ci] --- CHANGELOG.md | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 1bbd73c..0ed80e3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.4.1 (unreleased) +## 0.4.1 (2025-04-26) - Fixed `SparseVector` constructor for SciPy sparse matrices diff --git a/pyproject.toml b/pyproject.toml index b889f4b..9395f9e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "pgvector" -version = "0.4.0" +version = "0.4.1" description = "pgvector support for Python" readme = "README.md" authors = [ From 7793bb069942fbcc2e77cf7349c59ffc28d8b6e0 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Wed, 21 May 2025 18:16:18 -0700 Subject: [PATCH 26/47] Improved example [skip ci] --- examples/loading/example.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/loading/example.py b/examples/loading/example.py index 0702129..7f3dce8 100644 --- a/examples/loading/example.py +++ b/examples/loading/example.py @@ -25,12 +25,12 @@ copy.set_types(['vector']) for i, embedding in enumerate(embeddings): + copy.write_row([embedding]) + # show progress if i % 10000 == 0: print('.', end='', flush=True) - copy.write_row([embedding]) - print('\nSuccess!') # create any indexes *after* loading initial data (skipping for this example) From 91088aacfadad37c9b8ea533b1e2b16b08d12ac4 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sun, 8 Jun 2025 16:28:24 -0700 Subject: [PATCH 27/47] Updated readme [skip ci] --- README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 24d9bb9..7c302b1 100644 --- a/README.md +++ b/README.md @@ -409,7 +409,7 @@ Enable the extension conn.execute('CREATE EXTENSION IF NOT EXISTS vector') ``` -Register the vector type with your connection +Register the types with your connection ```python from pgvector.psycopg import register_vector @@ -472,7 +472,7 @@ cur = conn.cursor() cur.execute('CREATE EXTENSION IF NOT EXISTS vector') ``` -Register the vector type with your connection or cursor +Register the types with your connection or cursor ```python from pgvector.psycopg2 import register_vector @@ -518,7 +518,7 @@ Enable the extension await conn.execute('CREATE EXTENSION IF NOT EXISTS vector') ``` -Register the vector type with your connection +Register the types with your connection ```python from pgvector.asyncpg import register_vector @@ -572,7 +572,7 @@ Enable the extension conn.run('CREATE EXTENSION IF NOT EXISTS vector') ``` -Register the vector type with your connection +Register the types with your connection ```python from pgvector.pg8000 import register_vector From ee3e71ca2c07a12a8332a3877c0ce14adc9a5da8 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Mon, 16 Jun 2025 15:36:53 -0700 Subject: [PATCH 28/47] Updated format for license identifier --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 9395f9e..0cfa183 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -10,7 +10,7 @@ readme = "README.md" authors = [ {name = "Andrew Kane", email = "andrew@ankane.org"} ] -license = {text = "MIT"} +license = "MIT" requires-python = ">= 3.9" dependencies = [ "numpy" From 33dee606229489c9ffb0cb5a1cd72bd4705ac618 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Tue, 2 Sep 2025 16:35:08 -0700 Subject: [PATCH 29/47] Added support for str objects for bit type with SQLAlchemy - #137 Co-authored-by: Giacomo rua --- CHANGELOG.md | 4 ++++ pgvector/sqlalchemy/bit.py | 12 ++++++++++++ tests/test_sqlalchemy.py | 5 +++++ 3 files changed, 21 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0ed80e3..f219b22 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,7 @@ +## 0.4.2 (unreleased) + +- Added support for `str` objects for `bit` type with SQLAlchemy + ## 0.4.1 (2025-04-26) - Fixed `SparseVector` constructor for SciPy sparse matrices diff --git a/pgvector/sqlalchemy/bit.py b/pgvector/sqlalchemy/bit.py index 0f83f3c..1ea85c3 100644 --- a/pgvector/sqlalchemy/bit.py +++ b/pgvector/sqlalchemy/bit.py @@ -14,6 +14,18 @@ def get_col_spec(self, **kw): return 'BIT' return 'BIT(%d)' % self.length + def bind_processor(self, dialect): + if dialect.__class__.__name__ == 'PGDialect_asyncpg': + import asyncpg + + def process(value): + if isinstance(value, str): + return asyncpg.BitString(value) + return value + return process + else: + return super().bind_processor(dialect) + class comparator_factory(UserDefinedType.Comparator): def hamming_distance(self, other): return self.op('<~>', return_type=Float)(other) diff --git a/tests/test_sqlalchemy.py b/tests/test_sqlalchemy.py index 5aec977..cd7bad8 100644 --- a/tests/test_sqlalchemy.py +++ b/tests/test_sqlalchemy.py @@ -596,6 +596,11 @@ async def test_bit(self, engine): item = await session.get(Item, 1) assert item.binary_embedding == embedding + if engine == asyncpg_engine: + session.add(Item(id=2, binary_embedding='101')) + item = await session.get(Item, 2) + assert item.binary_embedding == embedding + await engine.dispose() @pytest.mark.asyncio From dc9a8f959995f009649fd230139ca41193e0a801 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Wed, 3 Sep 2025 12:04:21 -0700 Subject: [PATCH 30/47] Added test for binary quantization with re-ranking --- tests/test_sqlalchemy.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/tests/test_sqlalchemy.py b/tests/test_sqlalchemy.py index cd7bad8..702eee1 100644 --- a/tests/test_sqlalchemy.py +++ b/tests/test_sqlalchemy.py @@ -528,6 +528,22 @@ def test_binary_quantize(self, engine): items = session.query(Item).order_by(distance).all() assert [v.id for v in items] == [2, 3, 1] + def test_binary_quantize_reranking(self, engine): + # recreate index (could also vacuum table) + binary_quantize_index.drop(setup_engine) + binary_quantize_index.create(setup_engine) + + with Session(engine) as session: + session.add(Item(id=1, embedding=[-1, -2, -3])) + session.add(Item(id=2, embedding=[1, -2, 3])) + session.add(Item(id=3, embedding=[1, 2, 3])) + session.commit() + + distance = func.cast(func.binary_quantize(Item.embedding), BIT(3)).hamming_distance(func.binary_quantize(func.cast([3, -1, 2], VECTOR(3)))) + subquery = session.query(Item).order_by(distance).limit(20).subquery() + items = session.query(subquery).order_by(subquery.c.embedding.cosine_distance([3, -1, 2])).limit(5).all() + assert [v.id for v in items] == [2, 3, 1] + @pytest.mark.parametrize('engine', array_engines) class TestSqlalchemyArray: From caf1a2e0dd7a1ba2ad0ca9f09b50516dcfffcdeb Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Wed, 3 Sep 2025 12:09:18 -0700 Subject: [PATCH 31/47] Added docs for binary quantization with SQLAlchemy [skip ci] --- README.md | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/README.md b/README.md index 7c302b1..bfec8bb 100644 --- a/README.md +++ b/README.md @@ -271,6 +271,38 @@ order = func.cast(Item.embedding, HALFVEC(3)).l2_distance([3, 1, 2]) session.scalars(select(Item).order_by(order).limit(5)) ``` +#### Binary Quantization + +Use expression indexing for binary quantization + +```python +from pgvector.sqlalchemy import BIT +from sqlalchemy.sql import func + +index = Index( + 'my_index', + func.cast(func.binary_quantize(Item.embedding), BIT(3)).label('embedding'), + postgresql_using='hnsw', + postgresql_with={'m': 16, 'ef_construction': 64}, + postgresql_ops={'embedding': 'bit_hamming_ops'} +) +``` + +Get the nearest neighbors by Hamming distance + +```python +order = func.cast(func.binary_quantize(Item.embedding), BIT(3)).hamming_distance(func.binary_quantize(func.cast([3, -1, 2], VECTOR(3)))) +session.scalars(select(Item).order_by(order).limit(5)) +``` + +Re-rank by the original vectors for better recall + +```python +order = func.cast(func.binary_quantize(Item.embedding), BIT(3)).hamming_distance(func.binary_quantize(func.cast([3, -1, 2], VECTOR(3)))) +subquery = session.query(Item).order_by(order).limit(20).subquery() +session.scalars(select(subquery).order_by(subquery.c.embedding.cosine_distance([3, -1, 2])).limit(5)) +``` + #### Arrays Add an array column From c820a53bfb46196551de3c3f59f81b192d890574 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Wed, 3 Sep 2025 12:11:38 -0700 Subject: [PATCH 32/47] Simplified examples [skip ci] --- README.md | 2 -- tests/test_sqlalchemy.py | 2 -- 2 files changed, 4 deletions(-) diff --git a/README.md b/README.md index bfec8bb..7cff86c 100644 --- a/README.md +++ b/README.md @@ -259,7 +259,6 @@ index = Index( 'my_index', func.cast(Item.embedding, HALFVEC(3)).label('embedding'), postgresql_using='hnsw', - postgresql_with={'m': 16, 'ef_construction': 64}, postgresql_ops={'embedding': 'halfvec_l2_ops'} ) ``` @@ -283,7 +282,6 @@ index = Index( 'my_index', func.cast(func.binary_quantize(Item.embedding), BIT(3)).label('embedding'), postgresql_using='hnsw', - postgresql_with={'m': 16, 'ef_construction': 64}, postgresql_ops={'embedding': 'bit_hamming_ops'} ) ``` diff --git a/tests/test_sqlalchemy.py b/tests/test_sqlalchemy.py index 702eee1..c59c12e 100644 --- a/tests/test_sqlalchemy.py +++ b/tests/test_sqlalchemy.py @@ -103,7 +103,6 @@ class Item(Base): 'sqlalchemy_orm_half_precision_index', func.cast(Item.embedding, HALFVEC(3)).label('embedding'), postgresql_using='hnsw', - postgresql_with={'m': 16, 'ef_construction': 64}, postgresql_ops={'embedding': 'halfvec_l2_ops'} ) half_precision_index.create(setup_engine) @@ -112,7 +111,6 @@ class Item(Base): 'sqlalchemy_orm_binary_quantize_index', func.cast(func.binary_quantize(Item.embedding), BIT(3)).label('embedding'), postgresql_using='hnsw', - postgresql_with={'m': 16, 'ef_construction': 64}, postgresql_ops={'embedding': 'bit_hamming_ops'} ) binary_quantize_index.create(setup_engine) From 1a72b7571adf3325174b383aca85bfb3a5b925fa Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 6 Sep 2025 00:12:48 -0700 Subject: [PATCH 33/47] Updated pgvector on CI --- .github/workflows/build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 4d4e8ed..d943ea0 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -19,7 +19,7 @@ jobs: dev-files: true - run: | cd /tmp - git clone --branch v0.8.0 https://github.com/pgvector/pgvector.git + git clone --branch v0.8.1 https://github.com/pgvector/pgvector.git cd pgvector make sudo make install From e211ba4029f204734f0c001fbb90f6a594d561ae Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Thu, 9 Oct 2025 23:19:50 -0700 Subject: [PATCH 34/47] Test with Python 3.14 on CI --- .github/workflows/build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index d943ea0..52ab712 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -6,7 +6,7 @@ jobs: strategy: fail-fast: false matrix: - python: [3.13, 3.9] + python: [3.14, 3.9] steps: - uses: actions/checkout@v4 - uses: actions/setup-python@v5 From e2986daf2b1533cc2c849f7e39350e31d57ac325 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Thu, 4 Dec 2025 16:47:23 -0800 Subject: [PATCH 35/47] Added support for Django 6 --- CHANGELOG.md | 1 + pgvector/django/extensions.py | 9 +++++++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index f219b22..62da0bb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,6 @@ ## 0.4.2 (unreleased) +- Added support for Django 6 - Added support for `str` objects for `bit` type with SQLAlchemy ## 0.4.1 (2025-04-26) diff --git a/pgvector/django/extensions.py b/pgvector/django/extensions.py index 0573f72..1d04739 100644 --- a/pgvector/django/extensions.py +++ b/pgvector/django/extensions.py @@ -1,6 +1,11 @@ +from django import VERSION from django.contrib.postgres.operations import CreateExtension class VectorExtension(CreateExtension): - def __init__(self): - self.name = 'vector' + if VERSION[0] >= 6: + def __init__(self, hints=None): + super().__init__('vector', hints=hints) + else: + def __init__(self): + self.name = 'vector' From 674f5ba3410c873d49f50fa9725b95d9db50c674 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Thu, 4 Dec 2025 16:50:18 -0800 Subject: [PATCH 36/47] Updated checkout action [skip ci] --- .github/workflows/build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 52ab712..34f15d5 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -8,7 +8,7 @@ jobs: matrix: python: [3.14, 3.9] steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v5 - uses: actions/setup-python@v5 with: python-version: ${{ matrix.python }} From 2968f258f9486531bd1340cbda4ff8fcaf06cdc1 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Thu, 4 Dec 2025 17:06:01 -0800 Subject: [PATCH 37/47] Version bump to 0.4.2 [skip ci] --- CHANGELOG.md | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 62da0bb..745335f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -## 0.4.2 (unreleased) +## 0.4.2 (2025-12-04) - Added support for Django 6 - Added support for `str` objects for `bit` type with SQLAlchemy diff --git a/pyproject.toml b/pyproject.toml index 0cfa183..6f91e04 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "pgvector" -version = "0.4.1" +version = "0.4.2" description = "pgvector support for Python" readme = "README.md" authors = [ From 05387da3c5ce0dc9f1d6ef238dcae118aa8176ea Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 6 Dec 2025 16:31:46 -0800 Subject: [PATCH 38/47] Updated examples [skip ci] --- README.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 7cff86c..a208ae0 100644 --- a/README.md +++ b/README.md @@ -177,10 +177,10 @@ session.execute(text('CREATE EXTENSION IF NOT EXISTS vector')) Add a vector column ```python -from pgvector.sqlalchemy import Vector +from pgvector.sqlalchemy import VECTOR class Item(Base): - embedding = mapped_column(Vector(3)) + embedding = mapped_column(VECTOR(3)) ``` Also supports `HALFVEC`, `BIT`, and `SPARSEVEC` @@ -306,11 +306,11 @@ session.scalars(select(subquery).order_by(subquery.c.embedding.cosine_distance([ Add an array column ```python -from pgvector.sqlalchemy import Vector +from pgvector.sqlalchemy import VECTOR from sqlalchemy import ARRAY class Item(Base): - embeddings = mapped_column(ARRAY(Vector(3))) + embeddings = mapped_column(ARRAY(VECTOR(3))) ``` And register the types with the underlying driver @@ -359,10 +359,10 @@ session.exec(text('CREATE EXTENSION IF NOT EXISTS vector')) Add a vector column ```python -from pgvector.sqlalchemy import Vector +from pgvector.sqlalchemy import VECTOR class Item(SQLModel, table=True): - embedding: Any = Field(sa_type=Vector(3)) + embedding: Any = Field(sa_type=VECTOR(3)) ``` Also supports `HALFVEC`, `BIT`, and `SPARSEVEC` From 6d8db07f74fd95b3673fd8149f3f805a15788f48 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 6 Dec 2025 16:34:08 -0800 Subject: [PATCH 39/47] Updated readme [skip ci] --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index a208ae0..36d6c06 100644 --- a/README.md +++ b/README.md @@ -345,7 +345,7 @@ from sqlalchemy import event @event.listens_for(engine, "connect") def connect(dbapi_connection, connection_record): - register_vector(dbapi_connection, arrays=True) + register_vector(dbapi_connection) ``` ## SQLModel From b34f1c994e843dd7468d600b0f0ff5dbb949ec61 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 6 Dec 2025 16:38:13 -0800 Subject: [PATCH 40/47] Updated examples [skip ci] --- examples/implicit/example.py | 6 +++--- examples/lightfm/example.py | 6 +++--- examples/surprise/example.py | 6 +++--- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/examples/implicit/example.py b/examples/implicit/example.py index f70eb8c..2cbf7c6 100644 --- a/examples/implicit/example.py +++ b/examples/implicit/example.py @@ -1,6 +1,6 @@ import implicit from implicit.datasets.movielens import get_movielens -from pgvector.sqlalchemy import Vector +from pgvector.sqlalchemy import VECTOR from sqlalchemy import create_engine, insert, select, text, Integer, String from sqlalchemy.orm import declarative_base, mapped_column, Session @@ -16,7 +16,7 @@ class User(Base): __tablename__ = 'user' id = mapped_column(Integer, primary_key=True) - factors = mapped_column(Vector(20)) + factors = mapped_column(VECTOR(20)) class Item(Base): @@ -24,7 +24,7 @@ class Item(Base): id = mapped_column(Integer, primary_key=True) title = mapped_column(String) - factors = mapped_column(Vector(20)) + factors = mapped_column(VECTOR(20)) Base.metadata.drop_all(engine) diff --git a/examples/lightfm/example.py b/examples/lightfm/example.py index fcb9027..65031c4 100644 --- a/examples/lightfm/example.py +++ b/examples/lightfm/example.py @@ -1,6 +1,6 @@ from lightfm import LightFM from lightfm.datasets import fetch_movielens -from pgvector.sqlalchemy import Vector +from pgvector.sqlalchemy import VECTOR from sqlalchemy import create_engine, insert, select, text, Float, Integer, String from sqlalchemy.orm import declarative_base, mapped_column, Session @@ -16,7 +16,7 @@ class User(Base): __tablename__ = 'user' id = mapped_column(Integer, primary_key=True) - factors = mapped_column(Vector(20)) + factors = mapped_column(VECTOR(20)) class Item(Base): @@ -24,7 +24,7 @@ class Item(Base): id = mapped_column(Integer, primary_key=True) title = mapped_column(String) - factors = mapped_column(Vector(20)) + factors = mapped_column(VECTOR(20)) bias = mapped_column(Float) diff --git a/examples/surprise/example.py b/examples/surprise/example.py index bd7d18d..e413bcf 100644 --- a/examples/surprise/example.py +++ b/examples/surprise/example.py @@ -1,4 +1,4 @@ -from pgvector.sqlalchemy import Vector +from pgvector.sqlalchemy import VECTOR from sqlalchemy import create_engine, insert, select, text, Integer from sqlalchemy.orm import declarative_base, mapped_column, Session from surprise import Dataset, SVD @@ -15,14 +15,14 @@ class User(Base): __tablename__ = 'user' id = mapped_column(Integer, primary_key=True) - factors = mapped_column(Vector(20)) + factors = mapped_column(VECTOR(20)) class Item(Base): __tablename__ = 'item' id = mapped_column(Integer, primary_key=True) - factors = mapped_column(Vector(20)) + factors = mapped_column(VECTOR(20)) Base.metadata.drop_all(engine) From e1dda975bdb2635f273cc3e6cc1b9c01780cec00 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Wed, 17 Dec 2025 12:20:29 -0800 Subject: [PATCH 41/47] Switched to getuser for tests --- tests/test_pg8000.py | 4 ++-- tests/test_sqlalchemy.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/test_pg8000.py b/tests/test_pg8000.py index 4d3e474..61fbc4c 100644 --- a/tests/test_pg8000.py +++ b/tests/test_pg8000.py @@ -1,10 +1,10 @@ +from getpass import getuser import numpy as np -import os from pgvector import HalfVector, SparseVector, Vector from pgvector.pg8000 import register_vector from pg8000.native import Connection -conn = Connection(os.environ["USER"], database='pgvector_python_test') +conn = Connection(getuser(), database='pgvector_python_test') conn.run('CREATE EXTENSION IF NOT EXISTS vector') conn.run('DROP TABLE IF EXISTS pg8000_items') diff --git a/tests/test_sqlalchemy.py b/tests/test_sqlalchemy.py index c59c12e..4e870cc 100644 --- a/tests/test_sqlalchemy.py +++ b/tests/test_sqlalchemy.py @@ -1,6 +1,6 @@ import asyncpg +from getpass import getuser import numpy as np -import os from pgvector import HalfVector, SparseVector, Vector from pgvector.sqlalchemy import VECTOR, HALFVEC, BIT, SPARSEVEC, avg, sum import pytest @@ -28,7 +28,7 @@ def psycopg2_connect(dbapi_connection, connection_record): register_vector(dbapi_connection) -pg8000_engine = create_engine(f'postgresql+pg8000://{os.environ["USER"]}@localhost/pgvector_python_test') +pg8000_engine = create_engine(f'postgresql+pg8000://{getuser()}@localhost/pgvector_python_test') if sqlalchemy_version > 1: psycopg_engine = create_engine('postgresql+psycopg://localhost/pgvector_python_test') From 5392e2cd3200574829610efedef1678dafbaa4d7 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Wed, 17 Dec 2025 13:13:06 -0800 Subject: [PATCH 42/47] Switched to dependency groups --- .github/workflows/build.yml | 2 +- README.md | 2 +- pyproject.toml | 15 +++++++++++++++ requirements.txt | 12 ------------ 4 files changed, 17 insertions(+), 14 deletions(-) delete mode 100644 requirements.txt diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 34f15d5..7d0225b 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -12,7 +12,7 @@ jobs: - uses: actions/setup-python@v5 with: python-version: ${{ matrix.python }} - - run: pip install -r requirements.txt + - run: pip install --group dev - uses: ankane/setup-postgres@v1 with: database: pgvector_python_test diff --git a/README.md b/README.md index 36d6c06..7671c00 100644 --- a/README.md +++ b/README.md @@ -807,7 +807,7 @@ To get started with development: ```sh git clone https://github.com/pgvector/pgvector-python.git cd pgvector-python -pip install -r requirements.txt +pip install --group dev createdb pgvector_python_test pytest ``` diff --git a/pyproject.toml b/pyproject.toml index 6f91e04..5716d05 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,6 +19,21 @@ dependencies = [ [project.urls] Homepage = "https://github.com/pgvector/pgvector-python" +[dependency-groups] +dev = [ + "asyncpg", + "Django", + "peewee", + "pg8000", + "psycopg[binary,pool]", + "psycopg2-binary", + "pytest", + "pytest-asyncio", + "scipy", + "SQLAlchemy[asyncio]>=2", + "sqlmodel>=0.0.12" +] + [tool.pytest.ini_options] asyncio_mode = "auto" asyncio_default_fixture_loop_scope = "function" diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index a13be06..0000000 --- a/requirements.txt +++ /dev/null @@ -1,12 +0,0 @@ -asyncpg -Django -numpy -peewee -pg8000 -psycopg[binary,pool] -psycopg2-binary -pytest -pytest-asyncio -scipy -SQLAlchemy[asyncio]>=2 -sqlmodel>=0.0.12 From 6c1fa981ad624f7ded69e3a60df8fa303466e19c Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Thu, 18 Dec 2025 21:52:46 -0800 Subject: [PATCH 43/47] Updated example to pyproject.toml [skip ci] --- README.md | 2 +- examples/citus/pyproject.toml | 11 +++++++++++ examples/citus/requirements.txt | 3 --- examples/cohere/pyproject.toml | 11 +++++++++++ examples/cohere/requirements.txt | 3 --- examples/colbert/pyproject.toml | 12 ++++++++++++ examples/colbert/requirements.txt | 4 ---- examples/colpali/pyproject.toml | 12 ++++++++++++ examples/colpali/requirements.txt | 4 ---- examples/gensim/pyproject.toml | 13 +++++++++++++ examples/gensim/requirements.txt | 5 ----- examples/hybrid_search/pyproject.toml | 11 +++++++++++ examples/hybrid_search/requirements.txt | 3 --- examples/image_search/pyproject.toml | 14 ++++++++++++++ examples/image_search/requirements.txt | 6 ------ examples/imagehash/pyproject.toml | 13 +++++++++++++ examples/imagehash/requirements.txt | 5 ----- examples/implicit/pyproject.toml | 13 +++++++++++++ examples/implicit/requirements.txt | 5 ----- examples/lightfm/pyproject.toml | 12 ++++++++++++ examples/lightfm/requirements.txt | 4 ---- examples/loading/pyproject.toml | 11 +++++++++++ examples/loading/requirements.txt | 3 --- examples/openai/pyproject.toml | 11 +++++++++++ examples/openai/requirements.txt | 3 --- examples/rag/pyproject.toml | 11 +++++++++++ examples/rag/requirements.txt | 3 --- examples/rdkit/pyproject.toml | 11 +++++++++++ examples/rdkit/requirements.txt | 3 --- examples/sentence_transformers/pyproject.toml | 11 +++++++++++ examples/sentence_transformers/requirements.txt | 3 --- examples/sparse_search/pyproject.toml | 13 +++++++++++++ examples/sparse_search/requirements.txt | 5 ----- examples/surprise/pyproject.toml | 12 ++++++++++++ examples/surprise/requirements.txt | 4 ---- 35 files changed, 203 insertions(+), 67 deletions(-) create mode 100644 examples/citus/pyproject.toml delete mode 100644 examples/citus/requirements.txt create mode 100644 examples/cohere/pyproject.toml delete mode 100644 examples/cohere/requirements.txt create mode 100644 examples/colbert/pyproject.toml delete mode 100644 examples/colbert/requirements.txt create mode 100644 examples/colpali/pyproject.toml delete mode 100644 examples/colpali/requirements.txt create mode 100644 examples/gensim/pyproject.toml delete mode 100644 examples/gensim/requirements.txt create mode 100644 examples/hybrid_search/pyproject.toml delete mode 100644 examples/hybrid_search/requirements.txt create mode 100644 examples/image_search/pyproject.toml delete mode 100644 examples/image_search/requirements.txt create mode 100644 examples/imagehash/pyproject.toml delete mode 100644 examples/imagehash/requirements.txt create mode 100644 examples/implicit/pyproject.toml delete mode 100644 examples/implicit/requirements.txt create mode 100644 examples/lightfm/pyproject.toml delete mode 100644 examples/lightfm/requirements.txt create mode 100644 examples/loading/pyproject.toml delete mode 100644 examples/loading/requirements.txt create mode 100644 examples/openai/pyproject.toml delete mode 100644 examples/openai/requirements.txt create mode 100644 examples/rag/pyproject.toml delete mode 100644 examples/rag/requirements.txt create mode 100644 examples/rdkit/pyproject.toml delete mode 100644 examples/rdkit/requirements.txt create mode 100644 examples/sentence_transformers/pyproject.toml delete mode 100644 examples/sentence_transformers/requirements.txt create mode 100644 examples/sparse_search/pyproject.toml delete mode 100644 examples/sparse_search/requirements.txt create mode 100644 examples/surprise/pyproject.toml delete mode 100644 examples/surprise/requirements.txt diff --git a/README.md b/README.md index 7671c00..95d5fbe 100644 --- a/README.md +++ b/README.md @@ -816,7 +816,7 @@ To run an example: ```sh cd examples/loading -pip install -r requirements.txt +pip install --group dev createdb pgvector_example python3 example.py ``` diff --git a/examples/citus/pyproject.toml b/examples/citus/pyproject.toml new file mode 100644 index 0000000..ee40a36 --- /dev/null +++ b/examples/citus/pyproject.toml @@ -0,0 +1,11 @@ +[project] +name = "example" +version = "0.1.0" +requires-python = ">= 3.9" + +[dependency-groups] +dev = [ + "numpy", + "pgvector", + "psycopg[binary]" +] diff --git a/examples/citus/requirements.txt b/examples/citus/requirements.txt deleted file mode 100644 index 1cf8ee9..0000000 --- a/examples/citus/requirements.txt +++ /dev/null @@ -1,3 +0,0 @@ -numpy -pgvector -psycopg[binary] diff --git a/examples/cohere/pyproject.toml b/examples/cohere/pyproject.toml new file mode 100644 index 0000000..f0c88b7 --- /dev/null +++ b/examples/cohere/pyproject.toml @@ -0,0 +1,11 @@ +[project] +name = "example" +version = "0.1.0" +requires-python = ">= 3.9" + +[dependency-groups] +dev = [ + "cohere", + "pgvector", + "psycopg[binary]" +] diff --git a/examples/cohere/requirements.txt b/examples/cohere/requirements.txt deleted file mode 100644 index 22fd056..0000000 --- a/examples/cohere/requirements.txt +++ /dev/null @@ -1,3 +0,0 @@ -cohere -pgvector -psycopg[binary] diff --git a/examples/colbert/pyproject.toml b/examples/colbert/pyproject.toml new file mode 100644 index 0000000..face4d2 --- /dev/null +++ b/examples/colbert/pyproject.toml @@ -0,0 +1,12 @@ +[project] +name = "example" +version = "0.1.0" +requires-python = ">= 3.9" + +[dependency-groups] +dev = [ + "colbert-ai", + "pgvector", + "psycopg[binary]", + "transformers==4.49.0" +] diff --git a/examples/colbert/requirements.txt b/examples/colbert/requirements.txt deleted file mode 100644 index 54b2cb9..0000000 --- a/examples/colbert/requirements.txt +++ /dev/null @@ -1,4 +0,0 @@ -colbert-ai -pgvector -psycopg[binary] -transformers==4.49.0 diff --git a/examples/colpali/pyproject.toml b/examples/colpali/pyproject.toml new file mode 100644 index 0000000..23fb23f --- /dev/null +++ b/examples/colpali/pyproject.toml @@ -0,0 +1,12 @@ +[project] +name = "example" +version = "0.1.0" +requires-python = ">= 3.9" + +[dependency-groups] +dev = [ + "colpali-engine", + "datasets", + "pgvector", + "psycopg[binary]" +] diff --git a/examples/colpali/requirements.txt b/examples/colpali/requirements.txt deleted file mode 100644 index 4cf770d..0000000 --- a/examples/colpali/requirements.txt +++ /dev/null @@ -1,4 +0,0 @@ -colpali-engine -datasets -pgvector -psycopg[binary] diff --git a/examples/gensim/pyproject.toml b/examples/gensim/pyproject.toml new file mode 100644 index 0000000..7a33423 --- /dev/null +++ b/examples/gensim/pyproject.toml @@ -0,0 +1,13 @@ +[project] +name = "example" +version = "0.1.0" +requires-python = ">= 3.9" + +[dependency-groups] +dev = [ + "gensim", + "numpy", + "pgvector", + "psycopg[binary]", + "scipy<1.13" +] diff --git a/examples/gensim/requirements.txt b/examples/gensim/requirements.txt deleted file mode 100644 index 15411cd..0000000 --- a/examples/gensim/requirements.txt +++ /dev/null @@ -1,5 +0,0 @@ -gensim -numpy -pgvector -psycopg[binary] -scipy<1.13 diff --git a/examples/hybrid_search/pyproject.toml b/examples/hybrid_search/pyproject.toml new file mode 100644 index 0000000..b5a904a --- /dev/null +++ b/examples/hybrid_search/pyproject.toml @@ -0,0 +1,11 @@ +[project] +name = "example" +version = "0.1.0" +requires-python = ">= 3.9" + +[dependency-groups] +dev = [ + "pgvector", + "psycopg[binary]", + "sentence-transformers" +] diff --git a/examples/hybrid_search/requirements.txt b/examples/hybrid_search/requirements.txt deleted file mode 100644 index 237dcd1..0000000 --- a/examples/hybrid_search/requirements.txt +++ /dev/null @@ -1,3 +0,0 @@ -pgvector -psycopg[binary] -sentence-transformers diff --git a/examples/image_search/pyproject.toml b/examples/image_search/pyproject.toml new file mode 100644 index 0000000..7644382 --- /dev/null +++ b/examples/image_search/pyproject.toml @@ -0,0 +1,14 @@ +[project] +name = "example" +version = "0.1.0" +requires-python = ">= 3.9" + +[dependency-groups] +dev = [ + "matplotlib", + "pgvector", + "psycopg[binary]", + "torch", + "torchvision", + "tqdm" +] diff --git a/examples/image_search/requirements.txt b/examples/image_search/requirements.txt deleted file mode 100644 index 3d82365..0000000 --- a/examples/image_search/requirements.txt +++ /dev/null @@ -1,6 +0,0 @@ -matplotlib -pgvector -psycopg[binary] -torch -torchvision -tqdm diff --git a/examples/imagehash/pyproject.toml b/examples/imagehash/pyproject.toml new file mode 100644 index 0000000..cf06c2b --- /dev/null +++ b/examples/imagehash/pyproject.toml @@ -0,0 +1,13 @@ +[project] +name = "example" +version = "0.1.0" +requires-python = ">= 3.9" + +[dependency-groups] +dev = [ + "datasets", + "imagehash", + "matplotlib", + "pgvector", + "psycopg[binary]" +] diff --git a/examples/imagehash/requirements.txt b/examples/imagehash/requirements.txt deleted file mode 100644 index e3971e6..0000000 --- a/examples/imagehash/requirements.txt +++ /dev/null @@ -1,5 +0,0 @@ -datasets -imagehash -matplotlib -pgvector -psycopg[binary] diff --git a/examples/implicit/pyproject.toml b/examples/implicit/pyproject.toml new file mode 100644 index 0000000..c03b187 --- /dev/null +++ b/examples/implicit/pyproject.toml @@ -0,0 +1,13 @@ +[project] +name = "example" +version = "0.1.0" +requires-python = ">= 3.9" + +[dependency-groups] +dev = [ + "h5py", + "implicit", + "pgvector", + "psycopg[binary]", + "SQLAlchemy" +] diff --git a/examples/implicit/requirements.txt b/examples/implicit/requirements.txt deleted file mode 100644 index 424abbd..0000000 --- a/examples/implicit/requirements.txt +++ /dev/null @@ -1,5 +0,0 @@ -h5py -implicit -pgvector -psycopg[binary] -SQLAlchemy diff --git a/examples/lightfm/pyproject.toml b/examples/lightfm/pyproject.toml new file mode 100644 index 0000000..c202058 --- /dev/null +++ b/examples/lightfm/pyproject.toml @@ -0,0 +1,12 @@ +[project] +name = "example" +version = "0.1.0" +requires-python = ">= 3.9" + +[dependency-groups] +dev = [ + "lightfm", + "pgvector", + "psycopg[binary]", + "SQLAlchemy" +] diff --git a/examples/lightfm/requirements.txt b/examples/lightfm/requirements.txt deleted file mode 100644 index cfa5f51..0000000 --- a/examples/lightfm/requirements.txt +++ /dev/null @@ -1,4 +0,0 @@ -lightfm -pgvector -psycopg[binary] -SQLAlchemy diff --git a/examples/loading/pyproject.toml b/examples/loading/pyproject.toml new file mode 100644 index 0000000..ee40a36 --- /dev/null +++ b/examples/loading/pyproject.toml @@ -0,0 +1,11 @@ +[project] +name = "example" +version = "0.1.0" +requires-python = ">= 3.9" + +[dependency-groups] +dev = [ + "numpy", + "pgvector", + "psycopg[binary]" +] diff --git a/examples/loading/requirements.txt b/examples/loading/requirements.txt deleted file mode 100644 index 1cf8ee9..0000000 --- a/examples/loading/requirements.txt +++ /dev/null @@ -1,3 +0,0 @@ -numpy -pgvector -psycopg[binary] diff --git a/examples/openai/pyproject.toml b/examples/openai/pyproject.toml new file mode 100644 index 0000000..3e6661a --- /dev/null +++ b/examples/openai/pyproject.toml @@ -0,0 +1,11 @@ +[project] +name = "example" +version = "0.1.0" +requires-python = ">= 3.9" + +[dependency-groups] +dev = [ + "openai", + "pgvector", + "psycopg[binary]" +] diff --git a/examples/openai/requirements.txt b/examples/openai/requirements.txt deleted file mode 100644 index 18587e2..0000000 --- a/examples/openai/requirements.txt +++ /dev/null @@ -1,3 +0,0 @@ -openai -pgvector -psycopg[binary] diff --git a/examples/rag/pyproject.toml b/examples/rag/pyproject.toml new file mode 100644 index 0000000..fa0dcfd --- /dev/null +++ b/examples/rag/pyproject.toml @@ -0,0 +1,11 @@ +[project] +name = "example" +version = "0.1.0" +requires-python = ">= 3.9" + +[dependency-groups] +dev = [ + "ollama", + "pgvector", + "psycopg[binary]" +] diff --git a/examples/rag/requirements.txt b/examples/rag/requirements.txt deleted file mode 100644 index 4eb5864..0000000 --- a/examples/rag/requirements.txt +++ /dev/null @@ -1,3 +0,0 @@ -ollama -pgvector -psycopg[binary] diff --git a/examples/rdkit/pyproject.toml b/examples/rdkit/pyproject.toml new file mode 100644 index 0000000..f8c035a --- /dev/null +++ b/examples/rdkit/pyproject.toml @@ -0,0 +1,11 @@ +[project] +name = "example" +version = "0.1.0" +requires-python = ">= 3.9" + +[dependency-groups] +dev = [ + "pgvector", + "psycopg[binary]", + "rdkit" +] diff --git a/examples/rdkit/requirements.txt b/examples/rdkit/requirements.txt deleted file mode 100644 index 85a3e4f..0000000 --- a/examples/rdkit/requirements.txt +++ /dev/null @@ -1,3 +0,0 @@ -pgvector -psycopg[binary] -rdkit diff --git a/examples/sentence_transformers/pyproject.toml b/examples/sentence_transformers/pyproject.toml new file mode 100644 index 0000000..b5a904a --- /dev/null +++ b/examples/sentence_transformers/pyproject.toml @@ -0,0 +1,11 @@ +[project] +name = "example" +version = "0.1.0" +requires-python = ">= 3.9" + +[dependency-groups] +dev = [ + "pgvector", + "psycopg[binary]", + "sentence-transformers" +] diff --git a/examples/sentence_transformers/requirements.txt b/examples/sentence_transformers/requirements.txt deleted file mode 100644 index 237dcd1..0000000 --- a/examples/sentence_transformers/requirements.txt +++ /dev/null @@ -1,3 +0,0 @@ -pgvector -psycopg[binary] -sentence-transformers diff --git a/examples/sparse_search/pyproject.toml b/examples/sparse_search/pyproject.toml new file mode 100644 index 0000000..7927c34 --- /dev/null +++ b/examples/sparse_search/pyproject.toml @@ -0,0 +1,13 @@ +[project] +name = "example" +version = "0.1.0" +requires-python = ">= 3.9" + +[dependency-groups] +dev = [ + "numpy", + "pgvector", + "psycopg[binary]", + "torch", + "transformers" +] diff --git a/examples/sparse_search/requirements.txt b/examples/sparse_search/requirements.txt deleted file mode 100644 index 3de81c7..0000000 --- a/examples/sparse_search/requirements.txt +++ /dev/null @@ -1,5 +0,0 @@ -numpy -pgvector -psycopg[binary] -torch -transformers diff --git a/examples/surprise/pyproject.toml b/examples/surprise/pyproject.toml new file mode 100644 index 0000000..94c6f13 --- /dev/null +++ b/examples/surprise/pyproject.toml @@ -0,0 +1,12 @@ +[project] +name = "example" +version = "0.1.0" +requires-python = ">= 3.9" + +[dependency-groups] +dev = [ + "pgvector", + "psycopg[binary]", + "scikit-surprise", + "SQLAlchemy" +] diff --git a/examples/surprise/requirements.txt b/examples/surprise/requirements.txt deleted file mode 100644 index cb2dca4..0000000 --- a/examples/surprise/requirements.txt +++ /dev/null @@ -1,4 +0,0 @@ -pgvector -psycopg[binary] -scikit-surprise -SQLAlchemy From 016478953c094d908096e775c4366309601a2a38 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Fri, 19 Dec 2025 16:35:14 -0800 Subject: [PATCH 44/47] Updated gitignore [skip ci] --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index c55ff44..5556c9f 100644 --- a/.gitignore +++ b/.gitignore @@ -6,4 +6,5 @@ venv/ *.pyc __pycache__ .pytest_cache/ +*.lock examples/rag/README.md From a3520fef143f01a93bc2e70b00070080d2996c29 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Wed, 7 Jan 2026 15:26:48 -0800 Subject: [PATCH 45/47] Updated CI --- .github/workflows/build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 7d0225b..7f46dd6 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -9,7 +9,7 @@ jobs: python: [3.14, 3.9] steps: - uses: actions/checkout@v5 - - uses: actions/setup-python@v5 + - uses: actions/setup-python@v6 with: python-version: ${{ matrix.python }} - run: pip install --group dev From 8bd3c61fc0954fc7d0e1060ce81a91315ca15010 Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Thu, 22 Jan 2026 10:20:14 -0800 Subject: [PATCH 46/47] Added tests for subqueries with SQLAlchemy - closes #147 --- tests/test_sqlalchemy.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/tests/test_sqlalchemy.py b/tests/test_sqlalchemy.py index 4e870cc..7558942 100644 --- a/tests/test_sqlalchemy.py +++ b/tests/test_sqlalchemy.py @@ -396,6 +396,20 @@ def test_sparsevec_l1_distance_orm(self, engine): items = session.scalars(select(Item).order_by(Item.sparse_embedding.l1_distance([1, 1, 1]))) assert [v.id for v in items] == [1, 3, 2] + def test_subquery(self, engine): + create_items() + with Session(engine) as session: + subquery = select(Item.embedding).filter_by(id=1).scalar_subquery() + items = session.query(Item).order_by(Item.embedding.l2_distance(subquery)).all() + assert [v.id for v in items] == [1, 3, 2] + + def test_subquery_orm(self, engine): + create_items() + with Session(engine) as session: + subquery = select(Item.embedding).filter_by(id=1).scalar_subquery() + items = session.scalars(select(Item).order_by(Item.embedding.l2_distance(subquery))) + assert [v.id for v in items] == [1, 3, 2] + def test_filter(self, engine): create_items() with Session(engine) as session: From 5b06584aaf1588b04441e622e28bd9b0aa09924a Mon Sep 17 00:00:00 2001 From: Andrew Kane Date: Sat, 28 Feb 2026 18:44:10 -0800 Subject: [PATCH 47/47] Updated pgvector on CI --- .github/workflows/build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 7f46dd6..85db5b6 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -19,7 +19,7 @@ jobs: dev-files: true - run: | cd /tmp - git clone --branch v0.8.1 https://github.com/pgvector/pgvector.git + git clone --branch v0.8.2 https://github.com/pgvector/pgvector.git cd pgvector make sudo make install