From 7f2709225a1a5a71b33522dafd354dc7159c358f Mon Sep 17 00:00:00 2001 From: dishaprakash <57954147+dishaprakash@users.noreply.github.com> Date: Fri, 6 Sep 2024 06:41:37 +0000 Subject: [PATCH 01/12] fix: replacing cosine_similarity and maximal_marginal_relevance local methods with the ones in langchain core. (#190) Co-authored-by: Averi Kitsch --- .../vectorstore.py | 77 +------------------ 1 file changed, 2 insertions(+), 75 deletions(-) diff --git a/src/langchain_google_cloud_sql_pg/vectorstore.py b/src/langchain_google_cloud_sql_pg/vectorstore.py index 964e9df9..cbe12767 100644 --- a/src/langchain_google_cloud_sql_pg/vectorstore.py +++ b/src/langchain_google_cloud_sql_pg/vectorstore.py @@ -22,7 +22,7 @@ import numpy as np from langchain_core.documents import Document from langchain_core.embeddings import Embeddings -from langchain_core.vectorstores import VectorStore +from langchain_core.vectorstores import VectorStore, utils from sqlalchemy.engine.row import RowMapping from .engine import PostgresEngine @@ -788,7 +788,7 @@ async def amax_marginal_relevance_search_with_score_by_vector( fetch_k = fetch_k if fetch_k else self.fetch_k lambda_mult = lambda_mult if lambda_mult else self.lambda_mult embedding_list = [json.loads(row[self.embedding_column]) for row in results] - mmr_selected = maximal_marginal_relevance( + mmr_selected = utils.maximal_marginal_relevance( np.array(embedding, dtype=np.float32), embedding_list, k=k, @@ -963,76 +963,3 @@ async def is_valid_index( """ results = await self.engine._afetch(query) return bool(len(results) == 1) - - -### The following is copied from langchain-community until it's moved into core - -Matrix = Union[List[List[float]], List[np.ndarray], np.ndarray] - - -def maximal_marginal_relevance( - query_embedding: np.ndarray, - embedding_list: list, - lambda_mult: float = 0.5, - k: int = 4, -) -> List[int]: - """Calculate maximal marginal relevance.""" - if min(k, len(embedding_list)) <= 0: - return [] - if query_embedding.ndim == 1: - query_embedding = np.expand_dims(query_embedding, axis=0) - similarity_to_query = cosine_similarity(query_embedding, embedding_list)[0] - most_similar = int(np.argmax(similarity_to_query)) - idxs = [most_similar] - selected = np.array([embedding_list[most_similar]]) - while len(idxs) < min(k, len(embedding_list)): - best_score = -np.inf - idx_to_add = -1 - similarity_to_selected = cosine_similarity(embedding_list, selected) - for i, query_score in enumerate(similarity_to_query): - if i in idxs: - continue - redundant_score = max(similarity_to_selected[i]) - equation_score = ( - lambda_mult * query_score - (1 - lambda_mult) * redundant_score - ) - if equation_score > best_score: - best_score = equation_score - idx_to_add = i - idxs.append(idx_to_add) - selected = np.append(selected, [embedding_list[idx_to_add]], axis=0) - return idxs - - -def cosine_similarity(X: Matrix, Y: Matrix) -> np.ndarray: - """Row-wise cosine similarity between two equal-width matrices.""" - if len(X) == 0 or len(Y) == 0: - return np.array([]) - - X = np.array(X) - Y = np.array(Y) - if X.shape[1] != Y.shape[1]: - raise ValueError( - f"Number of columns in X and Y must be the same. X has shape {X.shape} " - f"and Y has shape {Y.shape}." 
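For reference, the vendored helpers removed in this patch are drop-in replaced by the langchain-core versions (hence the new `VectorStore, utils` import above). A minimal sketch of calling the core helper directly — the embedding values are made up for illustration, and langchain-core >= 0.2.36 is assumed, which is exactly what PATCH 03/12 below pins:

import numpy as np
from langchain_core.vectorstores import utils

query = np.array([0.1, 0.2, 0.3], dtype=np.float32)  # illustrative query embedding
candidates = [[0.1, 0.2, 0.3], [0.9, 0.1, 0.0], [0.2, 0.2, 0.2]]

# Returns indices of up to k candidates, trading relevance against diversity:
# lambda_mult=1.0 is pure relevance, 0.0 is pure diversity.
selected = utils.maximal_marginal_relevance(query, candidates, lambda_mult=0.5, k=2)
print(selected)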
- ) - try: - import simsimd as simd # type: ignore - - X = np.array(X, dtype=np.float32) - Y = np.array(Y, dtype=np.float32) - Z = 1 - simd.cdist(X, Y, metric="cosine") - if isinstance(Z, float): - return np.array([Z]) - return Z - except ImportError: - X_norm = np.linalg.norm(X, axis=1) - Y_norm = np.linalg.norm(Y, axis=1) - # Ignore divide by zero errors run time warnings as those are handled below. - with np.errstate(divide="ignore", invalid="ignore"): - similarity = np.dot(X, Y.T) / np.outer(X_norm, Y_norm) - similarity[np.isnan(similarity) | np.isinf(similarity)] = 0.0 - return similarity - - -### End code from langchain-community From 5672e98412538fc8b93e44bb141e797bb36aa604 Mon Sep 17 00:00:00 2001 From: Averi Kitsch Date: Fri, 6 Sep 2024 14:27:18 -0700 Subject: [PATCH 02/12] chore(samples): update langchain on vertexai requirement testing (#202) --- samples/index_tuning_sample/requirements.txt | 4 ++-- .../prebuilt_langchain_agent_template.py | 6 +----- samples/langchain_on_vertexai/requirements.txt | 9 +++++---- .../retriever_agent_with_history_template.py | 7 +------ .../langchain_on_vertexai/retriever_chain_template.py | 6 +----- samples/requirements.txt | 9 +++++---- 6 files changed, 15 insertions(+), 26 deletions(-) diff --git a/samples/index_tuning_sample/requirements.txt b/samples/index_tuning_sample/requirements.txt index 0a7a2867..d59a3563 100644 --- a/samples/index_tuning_sample/requirements.txt +++ b/samples/index_tuning_sample/requirements.txt @@ -1,3 +1,3 @@ -langchain-google-cloud-sql-pg==0.7.0 -langchain==0.2.14 +langchain-community==0.2.16 +langchain-google-cloud-sql-pg==0.9.0 langchain-google-vertexai==1.0.10 \ No newline at end of file diff --git a/samples/langchain_on_vertexai/prebuilt_langchain_agent_template.py b/samples/langchain_on_vertexai/prebuilt_langchain_agent_template.py index 64bbda0c..f8c6ca71 100644 --- a/samples/langchain_on_vertexai/prebuilt_langchain_agent_template.py +++ b/samples/langchain_on_vertexai/prebuilt_langchain_agent_template.py @@ -99,11 +99,7 @@ def similarity_search(query: str) -> List[Document]: "temperature": 0.1, }, ), - requirements=[ - "google-cloud-aiplatform[reasoningengine,langchain]==1.57.0", - "langchain-google-cloud-sql-pg==0.6.1", - "langchain-google-vertexai==1.0.4", - ], + requirements="requirements.txt", display_name=DISPLAY_NAME, sys_version="3.11", extra_packages=["config.py"], diff --git a/samples/langchain_on_vertexai/requirements.txt b/samples/langchain_on_vertexai/requirements.txt index 95f38d0f..13db4cc8 100644 --- a/samples/langchain_on_vertexai/requirements.txt +++ b/samples/langchain_on_vertexai/requirements.txt @@ -1,4 +1,5 @@ -google-cloud-aiplatform[reasoningengine,langchain]==1.57.0 -langchain-google-cloud-sql-pg==0.6.1 -langchain-google-vertexai==1.0.4 -google-cloud-resource-manager==1.12.3 \ No newline at end of file +google-cloud-aiplatform[reasoningengine,langchain]==1.65.0 +google-cloud-resource-manager==1.12.5 +langchain-community==0.2.16 +langchain-google-cloud-sql-pg==0.9.0 +langchain-google-vertexai==1.0.10 \ No newline at end of file diff --git a/samples/langchain_on_vertexai/retriever_agent_with_history_template.py b/samples/langchain_on_vertexai/retriever_agent_with_history_template.py index 548d5e67..df8a1f8e 100644 --- a/samples/langchain_on_vertexai/retriever_agent_with_history_template.py +++ b/samples/langchain_on_vertexai/retriever_agent_with_history_template.py @@ -186,12 +186,7 @@ def query(self, input: str, session_id: str) -> str: user=USER, password=PASSWORD, ), - requirements=[ - 
"google-cloud-aiplatform[reasoningengine,langchain]==1.57.0", - "langchain-google-cloud-sql-pg==0.6.1", - "langchain-google-vertexai==1.0.4", - "langchainhub==0.1.20", - ], + requirements="requirements.txt", display_name=DISPLAY_NAME, sys_version="3.11", extra_packages=["config.py"], diff --git a/samples/langchain_on_vertexai/retriever_chain_template.py b/samples/langchain_on_vertexai/retriever_chain_template.py index d65842db..c032cb79 100644 --- a/samples/langchain_on_vertexai/retriever_chain_template.py +++ b/samples/langchain_on_vertexai/retriever_chain_template.py @@ -155,11 +155,7 @@ def query(self, input: str) -> str: user=USER, password=PASSWORD, ), - requirements=[ - "google-cloud-aiplatform[reasoningengine,langchain]==1.57.0", - "langchain-google-cloud-sql-pg==0.6.1", - "langchain-google-vertexai==1.0.4", - ], + requirements="requirements.txt", display_name=DISPLAY_NAME, sys_version="3.11", extra_packages=["config.py"], diff --git a/samples/requirements.txt b/samples/requirements.txt index 462950ce..13db4cc8 100644 --- a/samples/requirements.txt +++ b/samples/requirements.txt @@ -1,4 +1,5 @@ -google-cloud-aiplatform[reasoningengine,langchain] -langchain-google-vertexai -langchain-community -google-cloud-resource-manager \ No newline at end of file +google-cloud-aiplatform[reasoningengine,langchain]==1.65.0 +google-cloud-resource-manager==1.12.5 +langchain-community==0.2.16 +langchain-google-cloud-sql-pg==0.9.0 +langchain-google-vertexai==1.0.10 \ No newline at end of file From 0651231b7d77e0451ae769f78fe6dce3e724dec4 Mon Sep 17 00:00:00 2001 From: dishaprakash <57954147+dishaprakash@users.noreply.github.com> Date: Mon, 9 Sep 2024 17:58:25 +0000 Subject: [PATCH 03/12] fix: updating the minimum langchain core version to 0.2.36 (#205) --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index b6627534..83a3bc23 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -11,7 +11,7 @@ authors = [ dependencies = [ "cloud-sql-python-connector[asyncpg] >= 1.10.0, <2.0.0", - "langchain-core>=0.1.1, <1.0.0 ", + "langchain-core>=0.2.36, <1.0.0 ", "numpy>=1.24.4, <2.0.0", "pgvector>=0.2.5, <1.0.0", "SQLAlchemy[asyncio]>=2.0.25, <3.0.0" From de168427f9884f33332086b68308e1225ee9e952 Mon Sep 17 00:00:00 2001 From: Averi Kitsch Date: Tue, 10 Sep 2024 15:04:51 -0700 Subject: [PATCH 04/12] feat!: refactor to support both async and sync usage (#206) * feat: separate Async only interface (#186) * feat: separate Async only interface * add tests * remove afetch/aexecute * respond to comments * update tests * add close * fix test * add tests * clean up * test * remove connector closing * lint * lint * feat: refactor vector store to wrap async class (#187) * feat: separate Async only interface * add tests * remove afetch/aexecute * respond to comments * update tests * add close * fix test * add tests * clean up * test * remove connector closing * lint * lint * feat: refactor vector store to wrap async class * rebase * refactor * remove changes * fix * update tests * respond to comments * feat: refactor chat message history (#192) * feat: separate Async only interface * add tests * remove afetch/aexecute * respond to comments * update tests * add close * fix test * add tests * clean up * test * remove connector closing * lint * lint * feat: refactor vector store to wrap async class * rebase * refactor * remove changes * fix * update tests * feat: refactor chat message history * lint * feat: refactor loader (#193) * feat: separate Async only 
interface * add tests * remove afetch/aexecute * respond to comments * update tests * add close * fix test * add tests * clean up * test * remove connector closing * lint * lint * feat: refactor vector store to wrap async class * rebase * refactor * remove changes * fix * update tests * feat: refactor loader * lint * update async tests * lint * feat: add from_engine_args (#194) * feat: add from_engine_args * lint * Update test_engine.py * add support for loop * add chat tests * tests * add proxy * Debug * fix tests * clean up * clean up * use wget * fix version * debug * feat: ensure schema support for refactor * fix --- integration.cloudbuild.yaml | 21 +- .../async_chat_message_history.py | 148 +++ .../async_loader.py | 450 +++++++++ .../async_vectorstore.py | 875 ++++++++++++++++++ .../chat_message_history.py | 111 +-- src/langchain_google_cloud_sql_pg/engine.py | 422 ++++++--- src/langchain_google_cloud_sql_pg/loader.py | 344 +------ .../vectorstore.py | 660 +++++-------- tests/test_async_chatmessagehistory.py | 124 +++ tests/test_async_loader.py | 757 +++++++++++++++ ...ctorstore.py => test_async_vectorstore.py} | 191 ++-- tests/test_async_vectorstore_from_methods.py | 182 ++++ ...dex.py => test_async_vectorstore_index.py} | 21 +- tests/test_async_vectorstore_search.py | 270 ++++++ ...ehistory.py => test_chatmessagehistory.py} | 104 ++- ...st_postgresql_engine.py => test_engine.py} | 148 ++- ...st_postgresql_loader.py => test_loader.py} | 72 +- tests/test_vectorstore.py | 532 +++++++++++ ...ds.py => test_vectorstore_from_methods.py} | 88 +- tests/test_vectorstore_index.py | 223 +++++ ...e_search.py => test_vectorstore_search.py} | 81 +- 21 files changed, 4655 insertions(+), 1169 deletions(-) create mode 100644 src/langchain_google_cloud_sql_pg/async_chat_message_history.py create mode 100644 src/langchain_google_cloud_sql_pg/async_loader.py create mode 100644 src/langchain_google_cloud_sql_pg/async_vectorstore.py create mode 100644 tests/test_async_chatmessagehistory.py create mode 100644 tests/test_async_loader.py rename tests/{test_cloudsql_vectorstore.py => test_async_vectorstore.py} (67%) create mode 100644 tests/test_async_vectorstore_from_methods.py rename tests/{test_cloudsql_vectorstore_index.py => test_async_vectorstore_index.py} (88%) create mode 100644 tests/test_async_vectorstore_search.py rename tests/{test_postgresql_chatmessagehistory.py => test_chatmessagehistory.py} (60%) rename tests/{test_postgresql_engine.py => test_engine.py} (71%) rename tests/{test_postgresql_loader.py => test_loader.py} (93%) create mode 100644 tests/test_vectorstore.py rename tests/{test_cloudsql_vectorstore_from_methods.py => test_vectorstore_from_methods.py} (68%) create mode 100644 tests/test_vectorstore_index.py rename tests/{test_cloudsql_vectorstore_search.py => test_vectorstore_search.py} (79%) diff --git a/integration.cloudbuild.yaml b/integration.cloudbuild.yaml index 3bece7aa..957248be 100644 --- a/integration.cloudbuild.yaml +++ b/integration.cloudbuild.yaml @@ -23,16 +23,30 @@ steps: entrypoint: pip args: ["install", ".[test]", "--user"] + - id: proxy-install + name: alpine:3.10 + entrypoint: sh + args: + - -c + - | + wget -O /workspace/cloud_sql_proxy https://storage.googleapis.com/cloudsql-proxy/v1.37.0/cloud_sql_proxy.linux.386 + chmod +x /workspace/cloud_sql_proxy + - id: Run integration tests name: python:${_VERSION} - entrypoint: python - args: ["-m", "pytest", "--cov=langchain_google_cloud_sql_pg", "--cov-config=.coveragerc", "tests/"] + entrypoint: /bin/bash env: - 
"PROJECT_ID=$PROJECT_ID" - "INSTANCE_ID=$_INSTANCE_ID" - "DATABASE_ID=$_DATABASE_ID" - "REGION=$_REGION" + - "IP_ADDRESS=$_IP_ADDRESS" secretEnv: ["DB_USER", "DB_PASSWORD", "IAM_ACCOUNT"] + args: + - "-c" + - | + /workspace/cloud_sql_proxy -dir=/workspace -instances=${_INSTANCE_CONNECTION_NAME}=tcp:$_IP_ADDRESS:$_DATABASE_PORT & sleep 2; + python -m pytest --cov=langchain_google_cloud_sql_pg --cov-config=.coveragerc tests/ availableSecrets: secretManager: @@ -44,9 +58,12 @@ availableSecrets: env: "IAM_ACCOUNT" substitutions: + _INSTANCE_CONNECTION_NAME: ${PROJECT_ID}:${_REGION}:${_INSTANCE_ID} + _DATABASE_PORT: "5432" _DATABASE_ID: test-database _REGION: us-central1 _VERSION: "3.8" + _IP_ADDRESS: "127.0.0.1" options: dynamicSubstitutions: true diff --git a/src/langchain_google_cloud_sql_pg/async_chat_message_history.py b/src/langchain_google_cloud_sql_pg/async_chat_message_history.py new file mode 100644 index 00000000..a7873aae --- /dev/null +++ b/src/langchain_google_cloud_sql_pg/async_chat_message_history.py @@ -0,0 +1,148 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import annotations + +import json +from typing import List, Sequence + +from langchain_core.chat_history import BaseChatMessageHistory +from langchain_core.messages import BaseMessage, messages_from_dict +from sqlalchemy import text +from sqlalchemy.ext.asyncio import AsyncEngine + +from .engine import PostgresEngine + + +class AsyncPostgresChatMessageHistory(BaseChatMessageHistory): + """Chat message history stored in an Cloud SQL for PostgreSQL database.""" + + __create_key = object() + + def __init__( + self, + key: object, + pool: AsyncEngine, + session_id: str, + table_name: str, + schema_name: str = "public", + ): + """AsyncPostgresChatMessageHistory constructor. + + Args: + key (object): Key to prevent direct constructor usage. + engine (PostgresEngine): Database connection pool. + session_id (str): Retrieve the table content with this session ID. + table_name (str): Table name that stores the chat message history. + schema_name (str, optional): Database schema name of the chat message history table. Defaults to "public". + + Raises: + Exception: If constructor is directly called by the user. + """ + if key != AsyncPostgresChatMessageHistory.__create_key: + raise Exception( + "Only create class through 'create' or 'create_sync' methods!" + ) + self.pool = pool + self.session_id = session_id + self.table_name = table_name + self.schema_name = schema_name + + @classmethod + async def create( + cls, + engine: PostgresEngine, + session_id: str, + table_name: str, + schema_name: str = "public", + ) -> AsyncPostgresChatMessageHistory: + """Create a new AsyncPostgresChatMessageHistory instance. + + Args: + engine (PostgresEngine): Postgres engine to use. + session_id (str): Retrieve the table content with this session ID. + table_name (str): Table name that stores the chat message history. 
+            schema_name (str, optional): Database schema name for the chat message history table. Defaults to "public".
+
+        Raises:
+            IndexError: If the table provided does not contain required schema.
+
+        Returns:
+            AsyncPostgresChatMessageHistory: A newly created instance of AsyncPostgresChatMessageHistory.
+        """
+        table_schema = await engine._aload_table_schema(table_name, schema_name)
+        column_names = table_schema.columns.keys()
+
+        required_columns = ["id", "session_id", "data", "type"]
+
+        if not (all(x in column_names for x in required_columns)):
+            raise IndexError(
+                f"Table '{schema_name}'.'{table_name}' has incorrect schema. Got "
+                f"column names '{column_names}' but required column names "
+                f"'{required_columns}'.\nPlease create the table with the following schema:"
+                f"\nCREATE TABLE {schema_name}.{table_name} ("
+                "\n    id SERIAL PRIMARY KEY,"
+                "\n    session_id TEXT NOT NULL,"
+                "\n    data JSON NOT NULL,"
+                "\n    type TEXT NOT NULL"
+                "\n);"
+            )
+        return cls(cls.__create_key, engine._pool, session_id, table_name, schema_name)
+
+    async def aadd_message(self, message: BaseMessage) -> None:
+        """Append the message to the record in PostgreSQL."""
+        query = f"""INSERT INTO "{self.schema_name}"."{self.table_name}"(session_id, data, type)
+                    VALUES (:session_id, :data, :type);
+                 """
+        async with self.pool.connect() as conn:
+            await conn.execute(
+                text(query),
+                {
+                    "session_id": self.session_id,
+                    "data": json.dumps(message.dict()),
+                    "type": message.type,
+                },
+            )
+            await conn.commit()
+
+    async def aadd_messages(self, messages: Sequence[BaseMessage]) -> None:
+        """Append a list of messages to the record in PostgreSQL."""
+        for message in messages:
+            await self.aadd_message(message)
+
+    async def aclear(self) -> None:
+        """Clear session memory from PostgreSQL."""
+        query = f"""DELETE FROM "{self.schema_name}"."{self.table_name}" WHERE session_id = :session_id;"""
+        async with self.pool.connect() as conn:
+            await conn.execute(text(query), {"session_id": self.session_id})
+            await conn.commit()
+
+    async def _aget_messages(self) -> List[BaseMessage]:
+        """Retrieve the messages from PostgreSQL."""
+        query = f"""SELECT data, type FROM "{self.schema_name}"."{self.table_name}" WHERE session_id = :session_id ORDER BY id;"""
+        async with self.pool.connect() as conn:
+            result = await conn.execute(text(query), {"session_id": self.session_id})
+            result_map = result.mappings()
+            results = result_map.fetchall()
+        if not results:
+            return []
+
+        items = [{"data": result["data"], "type": result["type"]} for result in results]
+        messages = messages_from_dict(items)
+        return messages
+
+    def clear(self) -> None:
+        raise NotImplementedError(
+            "Sync methods are not implemented for AsyncPostgresChatMessageHistory. Use PostgresChatMessageHistory interface instead."
+        )
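As a usage note between the two new modules: a hedged sketch of driving the async history class above. The connection values are placeholders, and `afrom_instance`/`ainit_chat_history_table` are assumed from this package's `PostgresEngine`; treat it as an illustration rather than the canonical flow.

import asyncio

from langchain_core.messages import AIMessage, HumanMessage
from langchain_google_cloud_sql_pg import PostgresEngine
from langchain_google_cloud_sql_pg.async_chat_message_history import (
    AsyncPostgresChatMessageHistory,
)


async def main() -> None:
    engine = await PostgresEngine.afrom_instance(
        project_id="my-project",    # placeholder
        region="us-central1",       # placeholder
        instance="my-instance",     # placeholder
        database="my-db",           # placeholder
    )
    # Assumed engine helper that creates the id/session_id/data/type schema
    # that create() validates above.
    await engine.ainit_chat_history_table(table_name="message_store")

    history = await AsyncPostgresChatMessageHistory.create(
        engine, session_id="session-1", table_name="message_store"
    )
    await history.aadd_messages(
        [HumanMessage(content="hi"), AIMessage(content="hello")]
    )
    print(await history._aget_messages())
    await history.aclear()


asyncio.run(main())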
diff --git a/src/langchain_google_cloud_sql_pg/async_loader.py b/src/langchain_google_cloud_sql_pg/async_loader.py
new file mode 100644
index 00000000..90e94526
--- /dev/null
+++ b/src/langchain_google_cloud_sql_pg/async_loader.py
@@ -0,0 +1,450 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+import json
+from typing import Any, AsyncIterator, Callable, Dict, Iterable, List, Optional
+
+from langchain_core.document_loaders.base import BaseLoader
+from langchain_core.documents import Document
+from sqlalchemy import text
+from sqlalchemy.ext.asyncio import AsyncEngine
+
+from .engine import PostgresEngine
+
+DEFAULT_CONTENT_COL = "page_content"
+DEFAULT_METADATA_COL = "langchain_metadata"
+
+
+def text_formatter(row: dict, content_columns: List[str]) -> str:
+    """Text document formatter."""
+    return " ".join(str(row[column]) for column in content_columns if column in row)
+
+
+def csv_formatter(row: dict, content_columns: List[str]) -> str:
+    """CSV document formatter."""
+    return ", ".join(str(row[column]) for column in content_columns if column in row)
+
+
+def yaml_formatter(row: dict, content_columns: List[str]) -> str:
+    """YAML document formatter."""
+    return "\n".join(
+        f"{column}: {str(row[column])}" for column in content_columns if column in row
+    )
+
+
+def json_formatter(row: dict, content_columns: List[str]) -> str:
+    """JSON document formatter."""
+    dictionary = {}
+    for column in content_columns:
+        if column in row:
+            dictionary[column] = row[column]
+    return json.dumps(dictionary)
+
+
+def _parse_doc_from_row(
+    content_columns: Iterable[str],
+    metadata_columns: Iterable[str],
+    row: dict,
+    metadata_json_column: Optional[str] = DEFAULT_METADATA_COL,
+    formatter: Callable = text_formatter,
+) -> Document:
+    """Parse row into document."""
+    page_content = formatter(row, content_columns)
+    metadata: Dict[str, Any] = {}
+    # unnest metadata from langchain_metadata column
+    if metadata_json_column and row.get(metadata_json_column):
+        for k, v in row[metadata_json_column].items():
+            metadata[k] = v
+    # load metadata from other columns
+    for column in metadata_columns:
+        if column in row and column != metadata_json_column:
+            metadata[column] = row[column]
+
+    return Document(page_content=page_content, metadata=metadata)
+
+
+def _parse_row_from_doc(
+    doc: Document,
+    column_names: Iterable[str],
+    content_column: str = DEFAULT_CONTENT_COL,
+    metadata_json_column: Optional[str] = DEFAULT_METADATA_COL,
+) -> Dict:
+    """Parse document into a dictionary of rows."""
+    doc_metadata = doc.metadata.copy()
+    row: Dict[str, Any] = {content_column: doc.page_content}
+    for entry in doc.metadata:
+        if entry in column_names:
+            row[entry] = doc_metadata[entry]
+            del doc_metadata[entry]
+    # store extra metadata in langchain_metadata column in json format
+    if metadata_json_column:
+        row[metadata_json_column] = doc_metadata
+    return row
+
+
+class AsyncPostgresLoader(BaseLoader):
+    """Load documents from PostgreSQL.
+
+    Each document represents one row of the result. The `content_columns` are
+    written into the `page_content` of the document. The `metadata_columns` are
+    written into the `metadata` of the document. By default, the first column is
+    written into the `page_content` and everything else into the `metadata`.
+    """
+
+    __create_key = object()
+
+    def __init__(
+        self,
+        key: object,
+        pool: AsyncEngine,
+        query: str,
+        content_columns: List[str],
+        metadata_columns: List[str],
+        formatter: Callable,
+        metadata_json_column: Optional[str] = None,
+    ) -> None:
+        """AsyncPostgresLoader constructor.
+
+        Args:
+            key (object): Prevent direct constructor usage.
+ engine (PostgresEngine): AsyncEngine with pool connection to the postgres database + query (Optional[str], optional): SQL query. Defaults to None. + content_columns (Optional[List[str]], optional): Column that represent a Document's page_content. Defaults to the first column. + metadata_columns (Optional[List[str]], optional): Column(s) that represent a Document's metadata. Defaults to None. + formatter (Optional[Callable], optional): A function to format page content (OneOf: format, formatter). Defaults to None. + metadata_json_column (Optional[str], optional): Column to store metadata as JSON. Defaults to "langchain_metadata". + + + Raises: + Exception: If called directly by user. + """ + if key != AsyncPostgresLoader.__create_key: + raise Exception( + "Only create class through 'create' or 'create_sync' methods!" + ) + + self.pool = pool + self.query = query + self.content_columns = content_columns + self.metadata_columns = metadata_columns + self.formatter = formatter + self.metadata_json_column = metadata_json_column + + @classmethod + async def create( + cls, + engine: PostgresEngine, + query: Optional[str] = None, + table_name: Optional[str] = None, + schema_name: str = "public", + content_columns: Optional[List[str]] = None, + metadata_columns: Optional[List[str]] = None, + metadata_json_column: Optional[str] = None, + format: Optional[str] = None, + formatter: Optional[Callable] = None, + ) -> AsyncPostgresLoader: + """Create a new AsyncPostgresLoader instance. + + Args: + engine (PostgresEngine):AsyncEngine with pool connection to the postgres database + query (Optional[str], optional): SQL query. Defaults to None. + table_name (Optional[str], optional): Name of table to query. Defaults to None. + schema_name (str, optional): Database schema name of the table. Defaults to "public". + content_columns (Optional[List[str]], optional): Column that represent a Document's page_content. Defaults to the first column. + metadata_columns (Optional[List[str]], optional): Column(s) that represent a Document's metadata. Defaults to None. + metadata_json_column (Optional[str], optional): Column to store metadata as JSON. Defaults to "langchain_metadata". + format (Optional[str], optional): Format of page content (OneOf: text, csv, YAML, JSON). Defaults to 'text'. + formatter (Optional[Callable], optional): A function to format page content (OneOf: format, formatter). Defaults to None. 
+ + Returns: + AsyncPostgresLoader + """ + if table_name and query: + raise ValueError("Only one of 'table_name' or 'query' should be specified.") + if not table_name and not query: + raise ValueError( + "At least one of the parameters 'table_name' or 'query' needs to be provided" + ) + if format and formatter: + raise ValueError("Only one of 'format' or 'formatter' should be specified.") + + if format and format not in ["csv", "text", "JSON", "YAML"]: + raise ValueError("format must be type: 'csv', 'text', 'JSON', 'YAML'") + if formatter: + formatter = formatter + elif format == "csv": + formatter = csv_formatter + elif format == "YAML": + formatter = yaml_formatter + elif format == "JSON": + formatter = json_formatter + else: + formatter = text_formatter + + if not query: + query = f'SELECT * FROM "{schema_name}"."{table_name}"' + + async with engine._pool.connect() as connection: + result_proxy = await connection.execute(text(query)) + column_names = list(result_proxy.keys()) + # Select content or default to first column + content_columns = content_columns or [column_names[0]] + # Select metadata columns + metadata_columns = metadata_columns or [ + col for col in column_names if col not in content_columns + ] + # Check validity of metadata json column + if metadata_json_column and metadata_json_column not in column_names: + raise ValueError( + f"Column {metadata_json_column} not found in query result {column_names}." + ) + # use default metadata json column if not specified + if metadata_json_column and metadata_json_column in column_names: + metadata_json_column = metadata_json_column + elif DEFAULT_METADATA_COL in column_names: + metadata_json_column = DEFAULT_METADATA_COL + else: + metadata_json_column = None + + # check validity of other column + all_names = content_columns + metadata_columns + for name in all_names: + if name not in column_names: + raise ValueError( + f"Column {name} not found in query result {column_names}." + ) + return cls( + cls.__create_key, + engine._pool, + query, + content_columns, + metadata_columns, + formatter, + metadata_json_column, + ) + + async def aload(self) -> List[Document]: + """Load PostgreSQL data into Document objects.""" + return [doc async for doc in self.alazy_load()] + + async def alazy_load(self) -> AsyncIterator[Document]: + """Load PostgreSQL data into Document objects lazily.""" + async with self.pool.connect() as connection: + result_proxy = await connection.execute(text(self.query)) + # load document one by one + while True: + row = result_proxy.fetchone() + if not row: + break + + row_data = {} + column_names = self.content_columns + self.metadata_columns + column_names += ( + [self.metadata_json_column] if self.metadata_json_column else [] + ) + for column in column_names: + value = getattr(row, column) + row_data[column] = value + + yield _parse_doc_from_row( + self.content_columns, + self.metadata_columns, + row_data, + self.metadata_json_column, + self.formatter, + ) + + +class AsyncPostgresDocumentSaver: + """A class for saving langchain documents into a PostgreSQL database table.""" + + __create_key = object() + + def __init__( + self, + key: object, + pool: AsyncEngine, + table_name: str, + content_column: str, + schema_name: str = "public", + metadata_columns: List[str] = [], + metadata_json_column: Optional[str] = None, + ): + """AsyncPostgresDocumentSaver constructor. + + Args: + key (object): Prevent direct constructor usage. 
+            pool (AsyncEngine): Connection pool engine for the Postgres database.
+            table_name (str): Name of the table to write to.
+            content_column (str): Column that represents a Document's page_content.
+            schema_name (str, optional): Database schema name of the table. Defaults to "public".
+            metadata_columns (List[str], optional): Column(s) that represent a Document's metadata. Defaults to an empty list.
+            metadata_json_column (Optional[str], optional): Column to store metadata as JSON. Defaults to "langchain_metadata".
+
+        Raises:
+            Exception: If called directly by user.
+        """
+        if key != AsyncPostgresDocumentSaver.__create_key:
+            raise Exception(
+                "Only create class through 'create' or 'create_sync' methods!"
+            )
+        self.pool = pool
+        self.table_name = table_name
+        self.content_column = content_column
+        self.schema_name = schema_name
+        self.metadata_columns = metadata_columns
+        self.metadata_json_column = metadata_json_column
+
+    @classmethod
+    async def create(
+        cls,
+        engine: PostgresEngine,
+        table_name: str,
+        schema_name: str = "public",
+        content_column: str = DEFAULT_CONTENT_COL,
+        metadata_columns: List[str] = [],
+        metadata_json_column: Optional[str] = DEFAULT_METADATA_COL,
+    ) -> AsyncPostgresDocumentSaver:
+        """Create an AsyncPostgresDocumentSaver instance.
+
+        Args:
+            engine (PostgresEngine): Engine with a connection pool to the Postgres database.
+            table_name (str): Name of the table to write to.
+            schema_name (str, optional): Database schema name of the table. Defaults to "public".
+            content_column (str, optional): Column that represents a Document's page_content. Defaults to "page_content".
+            metadata_columns (List[str], optional): Column(s) that represent a Document's metadata. Defaults to an empty list.
+            metadata_json_column (Optional[str], optional): Column to store metadata as JSON. Defaults to "langchain_metadata".
+
+        Returns:
+            AsyncPostgresDocumentSaver
+        """
+        table_schema = await engine._aload_table_schema(table_name, schema_name)
+        column_names = table_schema.columns.keys()
+        if content_column not in column_names:
+            raise ValueError(f"Content column, {content_column}, does not exist.")
+
+        # Set metadata columns to all columns if not set
+        if len(metadata_columns) == 0:
+            metadata_columns = [
+                column
+                for column in column_names
+                if column != content_column and column != metadata_json_column
+            ]
+
+        # Check that the metadata columns exist
+        for column in metadata_columns:
+            if column not in column_names:
+                raise ValueError(f"Metadata column, {column}, does not exist.")
+
+        # Check and set the metadata json column
+        if (
+            metadata_json_column
+            and metadata_json_column != DEFAULT_METADATA_COL
+            and metadata_json_column not in column_names
+        ):
+            raise ValueError(
+                f"Metadata JSON column, {metadata_json_column}, does not exist."
+            )
+        elif metadata_json_column not in column_names:
+            metadata_json_column = None
+
+        return cls(
+            cls.__create_key,
+            engine._pool,
+            table_name,
+            content_column,
+            schema_name,
+            metadata_columns,
+            metadata_json_column,
+        )
+
+    async def aadd_documents(self, docs: List[Document]) -> None:
+        """
+        Save documents in the DocumentSaver table. A Document's metadata is added to
+        columns if found, or stored in the langchain_metadata JSON column.
+
+        Args:
+            docs (List[langchain_core.documents.Document]): a list of documents to be saved.
+ """ + + for doc in docs: + row = _parse_row_from_doc( + doc, + self.metadata_columns, + self.content_column, + self.metadata_json_column, + ) + for key, value in row.items(): + if isinstance(value, dict): + row[key] = json.dumps(value) + + # Create list of column names + insert_stmt = f'INSERT INTO "{self.schema_name}"."{self.table_name}"({self.content_column}' + values_stmt = f"VALUES (:{self.content_column}" + + # Add metadata + for metadata_column in self.metadata_columns: + if metadata_column in doc.metadata: + insert_stmt += f", {metadata_column}" + values_stmt += f", :{metadata_column}" + + # Add JSON column and/or close statement + insert_stmt += ( + f", {self.metadata_json_column})" if self.metadata_json_column else ")" + ) + if self.metadata_json_column: + values_stmt += f", :{self.metadata_json_column})" + else: + values_stmt += ")" + + query = insert_stmt + values_stmt + async with self.pool.connect() as conn: + await conn.execute(text(query), row) + await conn.commit() + + async def adelete(self, docs: List[Document]) -> None: + """ + Delete all instances of a document from the DocumentSaver table by matching the entire Document + object. + + Args: + docs (List[langchain_core.documents.Document]): a list of documents to be deleted. + """ + for doc in docs: + row = _parse_row_from_doc( + doc, + self.metadata_columns, + self.content_column, + self.metadata_json_column, + ) + # delete by matching all fields of document + where_conditions_list = [] + for key, value in row.items(): + if isinstance(value, dict): + where_conditions_list.append( + f"{key}::jsonb @> '{json.dumps(value)}'::jsonb" + ) + else: + # Handle simple key-value pairs + where_conditions_list.append(f"{key} = :{key}") + + where_conditions = " AND ".join(where_conditions_list) + stmt = f'DELETE FROM "{self.schema_name}"."{self.table_name}" WHERE {where_conditions};' + values = {} + for key, value in row.items(): + if type(value) is int: + values[key] = str(value) + else: + values[key] = value + async with self.pool.connect() as conn: + await conn.execute(text(stmt), values) + await conn.commit() diff --git a/src/langchain_google_cloud_sql_pg/async_vectorstore.py b/src/langchain_google_cloud_sql_pg/async_vectorstore.py new file mode 100644 index 00000000..f74a1020 --- /dev/null +++ b/src/langchain_google_cloud_sql_pg/async_vectorstore.py @@ -0,0 +1,875 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# TODO: Remove below import when minimum supported Python version is 3.10 +from __future__ import annotations + +import json +import uuid +from typing import Any, Callable, Iterable, List, Optional, Sequence, Tuple, Type + +import numpy as np +from langchain_core.documents import Document +from langchain_core.embeddings import Embeddings +from langchain_core.vectorstores import VectorStore, utils +from sqlalchemy import text +from sqlalchemy.engine.row import RowMapping +from sqlalchemy.ext.asyncio import AsyncEngine + +from .engine import PostgresEngine +from .indexes import ( + DEFAULT_DISTANCE_STRATEGY, + DEFAULT_INDEX_NAME_SUFFIX, + BaseIndex, + DistanceStrategy, + ExactNearestNeighbor, + QueryOptions, +) + + +class AsyncPostgresVectorStore(VectorStore): + """Google Cloud SQL for PostgreSQL Vector Store class""" + + __create_key = object() + + def __init__( + self, + key: object, + pool: AsyncEngine, + embedding_service: Embeddings, + table_name: str, + schema_name: str = "public", + content_column: str = "content", + embedding_column: str = "embedding", + metadata_columns: List[str] = [], + id_column: str = "langchain_id", + metadata_json_column: Optional[str] = "langchain_metadata", + distance_strategy: DistanceStrategy = DEFAULT_DISTANCE_STRATEGY, + k: int = 4, + fetch_k: int = 20, + lambda_mult: float = 0.5, + index_query_options: Optional[QueryOptions] = None, + ): + """AsyncPostgresVectorStore constructor. + Args: + key (object): Prevent direct constructor usage. + pool (PostgresEngine): Connection pool engine for managing connections to Postgres database. + embedding_service (Embeddings): Text embedding model to use. + table_name (str): Name of the existing table or the table to be created. + schema_name (str, optional): Database schema name of the table. Defaults to "public". + content_column (str): Column that represent a Document's page_content. Defaults to "content". + embedding_column (str): Column for embedding vectors. The embedding is generated from the document value. Defaults to "embedding". + metadata_columns (List[str]): Column(s) that represent a document's metadata. + id_column (str): Column that represents the Document's id. Defaults to "langchain_id". + metadata_json_column (str): Column to store metadata as JSON. Defaults to "langchain_metadata". + distance_strategy (DistanceStrategy): Distance strategy to use for vector similarity search. Defaults to COSINE_DISTANCE. + k (int): Number of Documents to return from search. Defaults to 4. + fetch_k (int): Number of Documents to fetch to pass to MMR algorithm. + lambda_mult (float): Number between 0 and 1 that determines the degree of diversity among the results with 0 corresponding to maximum diversity and 1 to minimum diversity. Defaults to 0.5. + index_query_options (QueryOptions): Index query option. + + + Raises: + Exception: If called directly by user. + """ + if key != AsyncPostgresVectorStore.__create_key: + raise Exception( + "Only create class through 'create' or 'create_sync' methods!" 
+ ) + + self.pool = pool + self.embedding_service = embedding_service + self.table_name = table_name + self.schema_name = schema_name + self.content_column = content_column + self.embedding_column = embedding_column + self.metadata_columns = metadata_columns + self.id_column = id_column + self.metadata_json_column = metadata_json_column + self.distance_strategy = distance_strategy + self.k = k + self.fetch_k = fetch_k + self.lambda_mult = lambda_mult + self.index_query_options = index_query_options + + @classmethod + async def create( + cls, + engine: PostgresEngine, + embedding_service: Embeddings, + table_name: str, + schema_name: str = "public", + content_column: str = "content", + embedding_column: str = "embedding", + metadata_columns: List[str] = [], + ignore_metadata_columns: Optional[List[str]] = None, + id_column: str = "langchain_id", + metadata_json_column: Optional[str] = "langchain_metadata", + distance_strategy: DistanceStrategy = DEFAULT_DISTANCE_STRATEGY, + k: int = 4, + fetch_k: int = 20, + lambda_mult: float = 0.5, + index_query_options: Optional[QueryOptions] = None, + ) -> AsyncPostgresVectorStore: + """Create a new AsyncPostgresVectorStore instance. + + Args: + engine (PostgresEngine): Connection pool engine for managing connections to Cloud SQL for PostgreSQL database. + embedding_service (Embeddings): Text embedding model to use. + table_name (str): Name of an existing table or table to be created. + schema_name (str, optional): Database schema name of the table. Defaults to "public". + content_column (str): Column that represent a Document's page_content. Defaults to "content". + embedding_column (str): Column for embedding vectors. The embedding is generated from the document value. Defaults to "embedding". + metadata_columns (List[str]): Column(s) that represent a document's metadata. + ignore_metadata_columns (List[str]): Column(s) to ignore in pre-existing tables for a document's metadata. Can not be used with metadata_columns. Defaults to None. + id_column (str): Column that represents the Document's id. Defaults to "langchain_id". + metadata_json_column (str): Column to store metadata as JSON. Defaults to "langchain_metadata". + distance_strategy (DistanceStrategy): Distance strategy to use for vector similarity search. Defaults to COSINE_DISTANCE. + k (int): Number of Documents to return from search. Defaults to 4. + fetch_k (int): Number of Documents to fetch to pass to MMR algorithm. + lambda_mult (float): Number between 0 and 1 that determines the degree of diversity among the results with 0 corresponding to maximum diversity and 1 to minimum diversity. Defaults to 0.5. + index_query_options (QueryOptions): Index query option. + + Returns: + AsyncPostgresVectorStore + """ + if metadata_columns and ignore_metadata_columns: + raise ValueError( + "Can not use both metadata_columns and ignore_metadata_columns." 
+ ) + # Get field type information + async with engine._pool.connect() as conn: + result = await conn.execute( + text( + f"SELECT column_name, data_type FROM information_schema.columns WHERE table_name = '{table_name}'AND table_schema = '{schema_name}'" + ) + ) + result_map = result.mappings() + results = result_map.fetchall() + + columns = {} + for field in results: + columns[field["column_name"]] = field["data_type"] + + # Check columns + if id_column not in columns: + raise ValueError(f"Id column, {id_column}, does not exist.") + if content_column not in columns: + raise ValueError(f"Content column, {content_column}, does not exist.") + content_type = columns[content_column] + if content_type != "text" and "char" not in content_type: + raise ValueError( + f"Content column, {content_column}, is type, {content_type}. It must be a type of character string." + ) + if embedding_column not in columns: + raise ValueError(f"Embedding column, {embedding_column}, does not exist.") + if columns[embedding_column] != "USER-DEFINED": + raise ValueError( + f"Embedding column, {embedding_column}, is not type Vector." + ) + + metadata_json_column = ( + None if metadata_json_column not in columns else metadata_json_column + ) + + # If using metadata_columns check to make sure column exists + for column in metadata_columns: + if column not in columns: + raise ValueError(f"Metadata column, {column}, does not exist.") + + # If using ignore_metadata_columns, filter out known columns and set known metadata columns + all_columns = columns + if ignore_metadata_columns: + for column in ignore_metadata_columns: + del all_columns[column] + + del all_columns[id_column] + del all_columns[content_column] + del all_columns[embedding_column] + metadata_columns = [k for k in all_columns.keys()] + + return cls( + cls.__create_key, + engine._pool, + embedding_service, + table_name, + schema_name, + content_column, + embedding_column, + metadata_columns, + id_column, + metadata_json_column, + distance_strategy, + k, + fetch_k, + lambda_mult, + index_query_options, + ) + + @property + def embeddings(self) -> Embeddings: + return self.embedding_service + + async def __aadd_embeddings( + self, + texts: Iterable[str], + embeddings: List[List[float]], + metadatas: Optional[List[dict]] = None, + ids: Optional[List[str]] = None, + **kwargs: Any, + ) -> List[str]: + """Add embeddings to the table.""" + if not ids: + ids = [str(uuid.uuid4()) for _ in texts] + if not metadatas: + metadatas = [{} for _ in texts] + # Insert embeddings + for id, content, embedding, metadata in zip(ids, texts, embeddings, metadatas): + metadata_col_names = ( + ", " + ", ".join(self.metadata_columns) + if len(self.metadata_columns) > 0 + else "" + ) + insert_stmt = f'INSERT INTO "{self.schema_name}"."{self.table_name}"({self.id_column}, {self.content_column}, {self.embedding_column}{metadata_col_names}' + values = {"id": id, "content": content, "embedding": str(embedding)} + values_stmt = "VALUES (:id, :content, :embedding" + + # Add metadata + extra = metadata + for metadata_column in self.metadata_columns: + if metadata_column in metadata: + values_stmt += f", :{metadata_column}" + values[metadata_column] = metadata[metadata_column] + del extra[metadata_column] + else: + values_stmt += ",null" + + # Add JSON column and/or close statement + insert_stmt += ( + f", {self.metadata_json_column})" if self.metadata_json_column else ")" + ) + if self.metadata_json_column: + values_stmt += ", :extra)" + values["extra"] = json.dumps(extra) + else: + values_stmt 
+= ")" + + query = insert_stmt + values_stmt + async with self.pool.connect() as conn: + await conn.execute(text(query), values) + await conn.commit() + + return ids + + async def aadd_texts( + self, + texts: Iterable[str], + metadatas: Optional[List[dict]] = None, + ids: Optional[List[str]] = None, + **kwargs: Any, + ) -> List[str]: + """Embed texts and add to the table.""" + embeddings = self.embedding_service.embed_documents(list(texts)) + ids = await self.__aadd_embeddings( + texts, embeddings, metadatas=metadatas, ids=ids, **kwargs + ) + return ids + + async def aadd_documents( + self, + documents: List[Document], + ids: Optional[List[str]] = None, + **kwargs: Any, + ) -> List[str]: + """Embed documents and add to the table""" + texts = [doc.page_content for doc in documents] + metadatas = [doc.metadata for doc in documents] + ids = await self.aadd_texts(texts, metadatas=metadatas, ids=ids, **kwargs) + return ids + + async def adelete( + self, + ids: Optional[List[str]] = None, + **kwargs: Any, + ) -> Optional[bool]: + """Delete records from the table.""" + if not ids: + return False + + id_list = ", ".join([f"'{id}'" for id in ids]) + query = f'DELETE FROM "{self.schema_name}"."{self.table_name}" WHERE {self.id_column} in ({id_list})' + async with self.pool.connect() as conn: + await conn.execute(text(query)) + await conn.commit() + return True + + @classmethod + async def afrom_texts( # type: ignore[override] + cls: Type[AsyncPostgresVectorStore], + texts: List[str], + embedding: Embeddings, + engine: PostgresEngine, + table_name: str, + schema_name: str = "public", + metadatas: Optional[List[dict]] = None, + ids: Optional[List[str]] = None, + content_column: str = "content", + embedding_column: str = "embedding", + metadata_columns: List[str] = [], + ignore_metadata_columns: Optional[List[str]] = None, + id_column: str = "langchain_id", + metadata_json_column: str = "langchain_metadata", + distance_strategy: DistanceStrategy = DEFAULT_DISTANCE_STRATEGY, + k: int = 4, + fetch_k: int = 20, + lambda_mult: float = 0.5, + index_query_options: Optional[QueryOptions] = None, + **kwargs: Any, + ) -> AsyncPostgresVectorStore: + """Create an AsyncPostgresVectorStore instance from texts. + Args: + texts (List[str]): Texts to add to the vector store. + embedding (Embeddings): Text embedding model to use. + engine (PostgresEngine): Connection pool engine for managing connections to Postgres database. + table_name (str): Name of the existing table or the table to be created. + schema_name (str, optional): Database schema name of the table. Defaults to "public". + metadatas (Optional[List[dict]]): List of metadatas to add to table records. + ids: (Optional[List[str]]): List of IDs to add to table records. + content_column (str): Column that represent a Document’s page_content. Defaults to "content". + embedding_column (str): Column for embedding vectors. The embedding is generated from the document value. Defaults to "embedding". + metadata_columns (List[str]): Column(s) that represent a document's metadata. + ignore_metadata_columns (List[str]): Column(s) to ignore in pre-existing tables for a document's metadata. Can not be used with metadata_columns. Defaults to None. + id_column (str): Column that represents the Document's id. Defaults to "langchain_id". + metadata_json_column (str): Column to store metadata as JSON. Defaults to "langchain_metadata". + distance_strategy (DistanceStrategy): Distance strategy to use for vector similarity search. Defaults to COSINE_DISTANCE. 
+ k (int): Number of Documents to return from search. Defaults to 4. + fetch_k (int): Number of Documents to fetch to pass to MMR algorithm. + lambda_mult (float): Number between 0 and 1 that determines the degree of diversity among the results with 0 corresponding to maximum diversity and 1 to minimum diversity. Defaults to 0.5. + index_query_options (QueryOptions): Index query option. + + Returns: + AsyncPostgresVectorStore + """ + vs = await cls.create( + engine, + embedding, + table_name, + schema_name, + content_column, + embedding_column, + metadata_columns, + ignore_metadata_columns, + id_column, + metadata_json_column, + distance_strategy, + k, + fetch_k, + lambda_mult, + index_query_options, + ) + await vs.aadd_texts(texts, metadatas=metadatas, ids=ids, **kwargs) + return vs + + @classmethod + async def afrom_documents( # type: ignore[override] + cls: Type[AsyncPostgresVectorStore], + documents: List[Document], + embedding: Embeddings, + engine: PostgresEngine, + table_name: str, + schema_name: str = "public", + ids: Optional[List[str]] = None, + content_column: str = "content", + embedding_column: str = "embedding", + metadata_columns: List[str] = [], + ignore_metadata_columns: Optional[List[str]] = None, + id_column: str = "langchain_id", + metadata_json_column: str = "langchain_metadata", + distance_strategy: DistanceStrategy = DEFAULT_DISTANCE_STRATEGY, + k: int = 4, + fetch_k: int = 20, + lambda_mult: float = 0.5, + index_query_options: Optional[QueryOptions] = None, + **kwargs: Any, + ) -> AsyncPostgresVectorStore: + """Create an AsyncPostgresVectorStore instance from documents. + + Args: + documents (List[Document]): Documents to add to the vector store. + embedding (Embeddings): Text embedding model to use. + engine (PostgresEngine): Connection pool engine for managing connections to Postgres database. + table_name (str): Name of the existing table or the table to be created. + schema_name (str, optional): Database schema name of the table. Defaults to "public". + metadatas (Optional[List[dict]]): List of metadatas to add to table records. + ids: (Optional[List[str]]): List of IDs to add to table records. + content_column (str): Column that represent a Document's page_content. Defaults to "content". + embedding_column (str): Column for embedding vectors. The embedding is generated from the document value. Defaults to "embedding". + metadata_columns (List[str]): Column(s) that represent a document's metadata. + ignore_metadata_columns (List[str]): Column(s) to ignore in pre-existing tables for a document's metadata. Can not be used with metadata_columns. Defaults to None. + id_column (str): Column that represents the Document's id. Defaults to "langchain_id". + metadata_json_column (str): Column to store metadata as JSON. Defaults to "langchain_metadata". + distance_strategy (DistanceStrategy): Distance strategy to use for vector similarity search. Defaults to COSINE_DISTANCE. + k (int): Number of Documents to return from search. Defaults to 4. + fetch_k (int): Number of Documents to fetch to pass to MMR algorithm. + lambda_mult (float): Number between 0 and 1 that determines the degree of diversity among the results with 0 corresponding to maximum diversity and 1 to minimum diversity. Defaults to 0.5. + index_query_options (QueryOptions): Index query option. 
+ + Returns: + AsyncPostgresVectorStore + """ + vs = await cls.create( + engine, + embedding, + table_name, + schema_name, + content_column, + embedding_column, + metadata_columns, + ignore_metadata_columns, + id_column, + metadata_json_column, + distance_strategy, + k, + fetch_k, + lambda_mult, + index_query_options, + ) + texts = [doc.page_content for doc in documents] + metadatas = [doc.metadata for doc in documents] + await vs.aadd_texts(texts, metadatas=metadatas, ids=ids, **kwargs) + return vs + + async def __query_collection( + self, + embedding: List[float], + k: Optional[int] = None, + filter: Optional[str] = None, + **kwargs: Any, + ) -> Sequence[RowMapping]: + """Perform similarity search query on the vector store table.""" + k = k if k else self.k + operator = self.distance_strategy.operator + search_function = self.distance_strategy.search_function + + filter = f"WHERE {filter}" if filter else "" + stmt = f"SELECT *, {search_function}({self.embedding_column}, '{embedding}') as distance FROM \"{self.schema_name}\".\"{self.table_name}\" {filter} ORDER BY {self.embedding_column} {operator} '{embedding}' LIMIT {k};" + if self.index_query_options: + async with self.pool.connect() as conn: + await conn.execute( + text(f"SET LOCAL {self.index_query_options.to_string()};") + ) + result = await conn.execute(text(stmt)) + result_map = result.mappings() + results = result_map.fetchall() + else: + async with self.pool.connect() as conn: + result = await conn.execute(text(stmt)) + result_map = result.mappings() + results = result_map.fetchall() + return results + + async def asimilarity_search( + self, + query: str, + k: Optional[int] = None, + filter: Optional[str] = None, + **kwargs: Any, + ) -> List[Document]: + """Return docs selected by similarity search on query.""" + embedding = self.embedding_service.embed_query(text=query) + + return await self.asimilarity_search_by_vector( + embedding=embedding, k=k, filter=filter, **kwargs + ) + + def _select_relevance_score_fn(self) -> Callable[[float], float]: + """Select a relevance function based on distance strategy.""" + # Calculate distance strategy provided in + # vectorstore constructor + if self.distance_strategy == DistanceStrategy.COSINE_DISTANCE: + return self._cosine_relevance_score_fn + if self.distance_strategy == DistanceStrategy.INNER_PRODUCT: + return self._max_inner_product_relevance_score_fn + elif self.distance_strategy == DistanceStrategy.EUCLIDEAN: + return self._euclidean_relevance_score_fn + + async def asimilarity_search_with_score( + self, + query: str, + k: Optional[int] = None, + filter: Optional[str] = None, + **kwargs: Any, + ) -> List[Tuple[Document, float]]: + """Return docs and distance scores selected by similarity search on query.""" + embedding = self.embedding_service.embed_query(query) + docs = await self.asimilarity_search_with_score_by_vector( + embedding=embedding, k=k, filter=filter, **kwargs + ) + return docs + + async def asimilarity_search_by_vector( + self, + embedding: List[float], + k: Optional[int] = None, + filter: Optional[str] = None, + **kwargs: Any, + ) -> List[Document]: + """Return docs selected by vector similarity search.""" + docs_and_scores = await self.asimilarity_search_with_score_by_vector( + embedding=embedding, k=k, filter=filter, **kwargs + ) + + return [doc for doc, _ in docs_and_scores] + + async def asimilarity_search_with_score_by_vector( + self, + embedding: List[float], + k: Optional[int] = None, + filter: Optional[str] = None, + **kwargs: Any, + ) -> List[Tuple[Document, 
float]]: + """Return docs and distance scores selected by vector similarity search.""" + results = await self.__query_collection( + embedding=embedding, k=k, filter=filter, **kwargs + ) + + documents_with_scores = [] + for row in results: + metadata = ( + row[self.metadata_json_column] + if self.metadata_json_column and row[self.metadata_json_column] + else {} + ) + for col in self.metadata_columns: + metadata[col] = row[col] + documents_with_scores.append( + ( + Document( + page_content=row[self.content_column], + metadata=metadata, + ), + row["distance"], + ) + ) + + return documents_with_scores + + async def amax_marginal_relevance_search( + self, + query: str, + k: Optional[int] = None, + fetch_k: Optional[int] = None, + lambda_mult: Optional[float] = None, + filter: Optional[str] = None, + **kwargs: Any, + ) -> List[Document]: + """Return docs selected using the maximal marginal relevance.""" + embedding = self.embedding_service.embed_query(text=query) + + return await self.amax_marginal_relevance_search_by_vector( + embedding=embedding, + k=k, + fetch_k=fetch_k, + lambda_mult=lambda_mult, + filter=filter, + **kwargs, + ) + + async def amax_marginal_relevance_search_by_vector( + self, + embedding: List[float], + k: Optional[int] = None, + fetch_k: Optional[int] = None, + lambda_mult: Optional[float] = None, + filter: Optional[str] = None, + **kwargs: Any, + ) -> List[Document]: + """Return docs selected using the maximal marginal relevance.""" + docs_and_scores = ( + await self.amax_marginal_relevance_search_with_score_by_vector( + embedding, + k=k, + fetch_k=fetch_k, + lambda_mult=lambda_mult, + filter=filter, + **kwargs, + ) + ) + + return [result[0] for result in docs_and_scores] + + async def amax_marginal_relevance_search_with_score_by_vector( + self, + embedding: List[float], + k: Optional[int] = None, + fetch_k: Optional[int] = None, + lambda_mult: Optional[float] = None, + filter: Optional[str] = None, + **kwargs: Any, + ) -> List[Tuple[Document, float]]: + """Return docs and distance scores selected using the maximal marginal relevance.""" + results = await self.__query_collection( + embedding=embedding, k=fetch_k, filter=filter, **kwargs + ) + + k = k if k else self.k + fetch_k = fetch_k if fetch_k else self.fetch_k + lambda_mult = lambda_mult if lambda_mult else self.lambda_mult + embedding_list = [json.loads(row[self.embedding_column]) for row in results] + mmr_selected = utils.maximal_marginal_relevance( + np.array(embedding, dtype=np.float32), + embedding_list, + k=k, + lambda_mult=lambda_mult, + ) + + documents_with_scores = [] + for row in results: + metadata = ( + row[self.metadata_json_column] + if self.metadata_json_column and row[self.metadata_json_column] + else {} + ) + for col in self.metadata_columns: + metadata[col] = row[col] + documents_with_scores.append( + ( + Document( + page_content=row[self.content_column], + metadata=metadata, + ), + row["distance"], + ) + ) + + return [r for i, r in enumerate(documents_with_scores) if i in mmr_selected] + + async def aapply_vector_index( + self, + index: BaseIndex, + name: Optional[str] = None, + concurrently: bool = False, + ) -> None: + """Create an index on the vector store table.""" + if isinstance(index, ExactNearestNeighbor): + await self.adrop_vector_index() + return + + filter = f"WHERE ({index.partial_indexes})" if index.partial_indexes else "" + params = "WITH " + index.index_options() + function = index.distance_strategy.index_function + if name is None: + if index.name == None: + index.name = 
self.table_name + DEFAULT_INDEX_NAME_SUFFIX + name = index.name + stmt = f'CREATE INDEX {"CONCURRENTLY" if concurrently else ""} {name} ON "{self.schema_name}"."{self.table_name}" USING {index.index_type} ({self.embedding_column} {function}) {params} {filter};' + if concurrently: + async with self.pool.connect() as conn: + await conn.execute(text("COMMIT")) + await conn.execute(text(stmt)) + else: + async with self.pool.connect() as conn: + await conn.execute(text(stmt)) + await conn.commit() + + async def areindex(self, index_name: Optional[str] = None) -> None: + """Re-index the vector store table.""" + index_name = index_name or self.table_name + DEFAULT_INDEX_NAME_SUFFIX + query = f"REINDEX INDEX {index_name};" + async with self.pool.connect() as conn: + await conn.execute(text(query)) + await conn.commit() + + async def adrop_vector_index( + self, + index_name: Optional[str] = None, + ) -> None: + """Drop the vector index.""" + index_name = index_name or self.table_name + DEFAULT_INDEX_NAME_SUFFIX + query = f"DROP INDEX IF EXISTS {index_name};" + async with self.pool.connect() as conn: + await conn.execute(text(query)) + await conn.commit() + + async def is_valid_index( + self, + index_name: Optional[str] = None, + ) -> bool: + """Check if index exists in the table.""" + index_name = index_name or self.table_name + DEFAULT_INDEX_NAME_SUFFIX + stmt = f""" + SELECT tablename, indexname + FROM pg_indexes + WHERE tablename = '{self.table_name}' AND schemaname = '{self.schema_name}' AND indexname = '{index_name}'; + """ + async with self.pool.connect() as conn: + result = await conn.execute(text(stmt)) + result_map = result.mappings() + results = result_map.fetchall() + + return bool(len(results) == 1) + + def similarity_search( + self, + query: str, + k: Optional[int] = None, + filter: Optional[str] = None, + **kwargs: Any, + ) -> List[Document]: + raise NotImplementedError( + "Sync methods are not implemented for AsyncPostgresVectorStore. Use PostgresVectorStore interface instead." + ) + + def add_texts( + self, + texts: Iterable[str], + metadatas: Optional[List[dict]] = None, + ids: Optional[List[str]] = None, + **kwargs: Any, + ) -> List[str]: + raise NotImplementedError( + "Sync methods are not implemented for AsyncPostgresVectorStore. Use PostgresVectorStore interface instead." + ) + + def add_documents( + self, + documents: List[Document], + ids: Optional[List[str]] = None, + **kwargs: Any, + ) -> List[str]: + raise NotImplementedError( + "Sync methods are not implemented for AsyncPostgresVectorStore. Use PostgresVectorStore interface instead." + ) + + def delete( + self, + ids: Optional[List[str]] = None, + **kwargs: Any, + ) -> Optional[bool]: + raise NotImplementedError( + "Sync methods are not implemented for AsyncPostgresVectorStore. Use PostgresVectorStore interface instead." + ) + + @classmethod + def from_texts( # type: ignore[override] + cls: Type[AsyncPostgresVectorStore], + texts: List[str], + embedding: Embeddings, + engine: PostgresEngine, + table_name: str, + metadatas: Optional[List[dict]] = None, + ids: Optional[List[str]] = None, + content_column: str = "content", + embedding_column: str = "embedding", + metadata_columns: List[str] = [], + ignore_metadata_columns: Optional[List[str]] = None, + id_column: str = "langchain_id", + metadata_json_column: str = "langchain_metadata", + **kwargs: Any, + ) -> AsyncPostgresVectorStore: + raise NotImplementedError( + "Sync methods are not implemented for AsyncPostgresVectorStore. Use PostgresVectorStore interface instead." 
+ ) + + @classmethod + def from_documents( # type: ignore[override] + cls: Type[AsyncPostgresVectorStore], + documents: List[Document], + embedding: Embeddings, + engine: PostgresEngine, + table_name: str, + ids: Optional[List[str]] = None, + content_column: str = "content", + embedding_column: str = "embedding", + metadata_columns: List[str] = [], + ignore_metadata_columns: Optional[List[str]] = None, + id_column: str = "langchain_id", + metadata_json_column: str = "langchain_metadata", + **kwargs: Any, + ) -> AsyncPostgresVectorStore: + raise NotImplementedError( + "Sync methods are not implemented for AsyncPostgresVectorStore. Use PostgresVectorStore interface instead." + ) + + def similarity_search_with_score( + self, + query: str, + k: Optional[int] = None, + filter: Optional[str] = None, + **kwargs: Any, + ) -> List[Tuple[Document, float]]: + raise NotImplementedError( + "Sync methods are not implemented for AsyncPostgresVectorStore. Use PostgresVectorStore interface instead." + ) + + def similarity_search_by_vector( + self, + embedding: List[float], + k: Optional[int] = None, + filter: Optional[str] = None, + **kwargs: Any, + ) -> List[Document]: + raise NotImplementedError( + "Sync methods are not implemented for AsyncPostgresVectorStore. Use PostgresVectorStore interface instead." + ) + + def similarity_search_with_score_by_vector( + self, + embedding: List[float], + k: Optional[int] = None, + filter: Optional[str] = None, + **kwargs: Any, + ) -> List[Tuple[Document, float]]: + raise NotImplementedError( + "Sync methods are not implemented for AsyncPostgresVectorStore. Use PostgresVectorStore interface instead." + ) + + def max_marginal_relevance_search( + self, + query: str, + k: Optional[int] = None, + fetch_k: Optional[int] = None, + lambda_mult: Optional[float] = None, + filter: Optional[str] = None, + **kwargs: Any, + ) -> List[Document]: + raise NotImplementedError( + "Sync methods are not implemented for AsyncPostgresVectorStore. Use PostgresVectorStore interface instead." + ) + + def max_marginal_relevance_search_by_vector( + self, + embedding: List[float], + k: Optional[int] = None, + fetch_k: Optional[int] = None, + lambda_mult: Optional[float] = None, + filter: Optional[str] = None, + **kwargs: Any, + ) -> List[Document]: + raise NotImplementedError( + "Sync methods are not implemented for AsyncPostgresVectorStore. Use PostgresVectorStore interface instead." + ) + + def max_marginal_relevance_search_with_score_by_vector( + self, + embedding: List[float], + k: Optional[int] = None, + fetch_k: Optional[int] = None, + lambda_mult: Optional[float] = None, + filter: Optional[str] = None, + **kwargs: Any, + ) -> List[Tuple[Document, float]]: + raise NotImplementedError( + "Sync methods are not implemented for AsyncPostgresVectorStore. Use PostgresVectorStore interface instead." 
+        )
diff --git a/src/langchain_google_cloud_sql_pg/chat_message_history.py b/src/langchain_google_cloud_sql_pg/chat_message_history.py
index 0150fa63..306dba15 100644
--- a/src/langchain_google_cloud_sql_pg/chat_message_history.py
+++ b/src/langchain_google_cloud_sql_pg/chat_message_history.py
@@ -14,32 +14,15 @@
 
 from __future__ import annotations
 
-import json
 from typing import List, Sequence
 
 from langchain_core.chat_history import BaseChatMessageHistory
 from langchain_core.messages import BaseMessage, messages_from_dict
 
+from .async_chat_message_history import AsyncPostgresChatMessageHistory
 from .engine import PostgresEngine
 
 
-async def _aget_messages(
-    engine: PostgresEngine,
-    session_id: str,
-    table_name: str,
-    schema_name: str = "public",
-) -> List[BaseMessage]:
-    """Retrieve the messages from PostgreSQL."""
-    query = f"""SELECT data, type FROM "{schema_name}"."{table_name}" WHERE session_id = :session_id ORDER BY id;"""
-    results = await engine._afetch(query, {"session_id": session_id})
-    if not results:
-        return []
-
-    items = [{"data": result["data"], "type": result["type"]} for result in results]
-    messages = messages_from_dict(items)
-    return messages
-
-
 class PostgresChatMessageHistory(BaseChatMessageHistory):
     """Chat message history stored in a Cloud SQL for PostgreSQL database."""
 
@@ -49,21 +32,14 @@ def __init__(
         self,
         key: object,
         engine: PostgresEngine,
-        session_id: str,
-        table_name: str,
-        messages: List[BaseMessage],
-        schema_name: str = "public",
+        history: AsyncPostgresChatMessageHistory,
     ):
         """PostgresChatMessageHistory constructor.
 
         Args:
             key (object): Key to prevent direct constructor usage.
             engine (PostgresEngine): Database connection pool.
-            session_id (str): Retrieve the table content with this session ID.
-            table_name (str): Table name that stores the chat message history.
-            messages (List[BaseMessage]): Messages to store.
-            schema_name (str, optional): Database schema name of the chat message history table. Defaults to "public".
-
+            history (AsyncPostgresChatMessageHistory): The native async implementation.
         Raises:
             Exception: If constructor is directly called by the user.
         """
@@ -71,11 +47,8 @@ def __init__(
             raise Exception(
                 "Only create class through 'create' or 'create_sync' methods!"
             )
-        self.engine = engine
-        self.session_id = session_id
-        self.table_name = table_name
-        self.messages = messages
-        self.schema_name = schema_name
+        self._engine = engine
+        self._history = history
 
     @classmethod
     async def create(
@@ -99,27 +72,11 @@ async def create(
         Returns:
             PostgresChatMessageHistory: A newly created instance of PostgresChatMessageHistory.
         """
-        table_schema = await engine._aload_table_schema(table_name, schema_name)
-        column_names = table_schema.columns.keys()
-
-        required_columns = ["id", "session_id", "data", "type"]
-
-        if not (all(x in column_names for x in required_columns)):
-            raise IndexError(
-                f"Table '{schema_name}'.'{table_name}' has incorrect schema. Got "
-                f"column names '{column_names}' but required column names "
-                f"'{required_columns}'.\nPlease create table with following schema:"
-                f"\nCREATE TABLE {schema_name}.{table_name} ("
-                "\n    id INT AUTO_INCREMENT PRIMARY KEY,"
-                "\n    session_id TEXT NOT NULL,"
-                "\n    data JSON NOT NULL,"
-                "\n    type TEXT NOT NULL"
-                "\n);"
-            )
-        messages = await _aget_messages(engine, session_id, table_name, schema_name)
-        return cls(
-            cls.__create_key, engine, session_id, table_name, messages, schema_name
+        coro = AsyncPostgresChatMessageHistory.create(
+            engine, session_id, table_name, schema_name
         )
+        history = await engine._run_as_async(coro)
+        return cls(cls.__create_key, engine, history)
 
     @classmethod
     def create_sync(
@@ -143,55 +100,37 @@ def create_sync(
         Returns:
             PostgresChatMessageHistory: A newly created instance of PostgresChatMessageHistory.
         """
-        coro = cls.create(engine, session_id, table_name, schema_name)
-        return engine._run_as_sync(coro)
+        coro = AsyncPostgresChatMessageHistory.create(
+            engine, session_id, table_name, schema_name
+        )
+        history = engine._run_as_sync(coro)
+        return cls(cls.__create_key, engine, history)
+
+    @property  # type: ignore[override]
+    def messages(self) -> List[BaseMessage]:
+        """Fetch the chat messages; the BaseChatMessageHistory abstraction requires this to be a property."""
+        return self._engine._run_as_sync(self._history._aget_messages())
 
     async def aadd_message(self, message: BaseMessage) -> None:
         """Append the message to the record in PostgreSQL"""
-        query = f"""INSERT INTO "{self.schema_name}"."{self.table_name}"(session_id, data, type)
-            VALUES (:session_id, :data, :type);
-        """
-        await self.engine._aexecute(
-            query,
-            {
-                "session_id": self.session_id,
-                "data": json.dumps(message.dict()),
-                "type": message.type,
-            },
-        )
-        self.messages = await _aget_messages(
-            self.engine, self.session_id, self.table_name, self.schema_name
-        )
+        await self._engine._run_as_async(self._history.aadd_message(message))
 
     def add_message(self, message: BaseMessage) -> None:
         """Append the message to the record in PostgreSQL"""
-        self.engine._run_as_sync(self.aadd_message(message))
+        self._engine._run_as_sync(self._history.aadd_message(message))
 
     async def aadd_messages(self, messages: Sequence[BaseMessage]) -> None:
         """Append a list of messages to the record in PostgreSQL"""
-        for message in messages:
-            await self.aadd_message(message)
+        await self._engine._run_as_async(self._history.aadd_messages(messages))
 
     def add_messages(self, messages: Sequence[BaseMessage]) -> None:
         """Append a list of messages to the record in PostgreSQL"""
-        self.engine._run_as_sync(self.aadd_messages(messages))
+        self._engine._run_as_sync(self._history.aadd_messages(messages))
 
     async def aclear(self) -> None:
         """Clear session memory from PostgreSQL"""
-        query = f"""DELETE FROM "{self.schema_name}"."{self.table_name}" WHERE session_id = :session_id;"""
-        await self.engine._aexecute(query, {"session_id": self.session_id})
-        self.messages = []
+        await self._engine._run_as_async(self._history.aclear())
 
     def clear(self) -> None:
         """Clear session memory from PostgreSQL"""
-        self.engine._run_as_sync(self.aclear())
-
-    async def async_messages(self) -> None:
-        """Retrieve the messages from Postgres."""
-        self.messages = await _aget_messages(
-            self.engine, self.session_id, self.table_name, self.schema_name
-        )
-
-    def sync_messages(self) -> None:
-        """Retrieve the messages from Postgres."""
-        self.engine._run_as_sync(self.async_messages())
+        self._engine._run_as_sync(self._history.aclear())
diff --git a/src/langchain_google_cloud_sql_pg/engine.py
b/src/langchain_google_cloud_sql_pg/engine.py index dc3137cd..3548f4af 100644 --- a/src/langchain_google_cloud_sql_pg/engine.py +++ b/src/langchain_google_cloud_sql_pg/engine.py @@ -15,25 +15,17 @@ from __future__ import annotations import asyncio +from concurrent.futures import Future from dataclasses import dataclass from threading import Thread -from typing import ( - TYPE_CHECKING, - Awaitable, - Dict, - List, - Optional, - Sequence, - TypeVar, - Union, -) +from typing import TYPE_CHECKING, Any, Awaitable, Dict, List, Optional, TypeVar, Union import aiohttp import google.auth # type: ignore import google.auth.transport.requests # type: ignore from google.cloud.sql.connector import Connector, IPTypes, RefreshStrategy from sqlalchemy import MetaData, Table, text -from sqlalchemy.engine.row import RowMapping +from sqlalchemy.engine import URL from sqlalchemy.exc import InvalidRequestError from sqlalchemy.ext.asyncio import AsyncEngine, create_async_engine @@ -114,17 +106,17 @@ class PostgresEngine: def __init__( self, key: object, - engine: AsyncEngine, + pool: AsyncEngine, loop: Optional[asyncio.AbstractEventLoop], thread: Optional[Thread], ): """PostgresEngine constructor. Args: - key(object): Prevent direct constructor usage. - engine(AsyncEngine): Async engine connection pool. + key (object): Prevent direct constructor usage. + pool (AsyncEngine): Async engine connection pool. loop (Optional[asyncio.AbstractEventLoop]): Async event loop used to create the engine. - thread (Optional[Thread] = None): Thread used to create the engine async. + thread (Optional[Thread]): Thread used to create the engine async. Raises: Exception: If the constructor is called directly by the user. @@ -133,62 +125,10 @@ def __init__( raise Exception( "Only create class through 'create' or 'create_sync' methods!" ) - self._engine = engine + self._pool = pool self._loop = loop self._thread = thread - @classmethod - def from_instance( - cls, - project_id: str, - region: str, - instance: str, - database: str, - user: Optional[str] = None, - password: Optional[str] = None, - ip_type: Union[str, IPTypes] = IPTypes.PUBLIC, - quota_project: Optional[str] = None, - iam_account_email: Optional[str] = None, - ) -> PostgresEngine: - """Create a PostgresEngine from a Postgres instance. - - Args: - project_id (str): GCP project ID. - region (str): Postgres instance region. - instance (str): Postgres instance name. - database (str): Database name. - user (Optional[str], optional): Postgres user name. Defaults to None. - password (Optional[str], optional): Postgres user password. Defaults to None. - ip_type (Union[str, IPTypes], optional): IP address type. Defaults to IPTypes.PUBLIC. - quota_project (Optional[str]): Project that provides quota for API calls. - iam_account_email (Optional[str], optional): IAM service account email. Defaults to None. - - Returns: - PostgresEngine: A newly created PostgresEngine instance. 
- """ - # Running a loop in a background thread allows us to support - # async methods from non-async environments - if cls._default_loop is None: - cls._default_loop = asyncio.new_event_loop() - cls._default_thread = Thread( - target=cls._default_loop.run_forever, daemon=True - ) - cls._default_thread.start() - coro = cls._create( - project_id, - region, - instance, - database, - ip_type, - user, - password, - loop=cls._default_loop, - thread=cls._default_thread, - quota_project=quota_project, - iam_account_email=iam_account_email, - ) - return asyncio.run_coroutine_threadsafe(coro, cls._default_loop).result() - @classmethod async def _create( cls, @@ -211,13 +151,13 @@ async def _create( region (str): Postgres instance region. instance (str): Postgres instance name. database (str): Database name. - ip_type (Union[str, IPTypes], optional): IP address type. Defaults to IPTypes.PUBLIC. - user (Optional[str], optional): Postgres user name. Defaults to None. - password (Optional[str], optional): Postgres user password. Defaults to None. + ip_type (Union[str, IPTypes]): IP address type. Defaults to IPTypes.PUBLIC. + user (Optional[str]): Postgres user name. Defaults to None. + password (Optional[str]): Postgres user password. Defaults to None. loop (Optional[asyncio.AbstractEventLoop]): Async event loop used to create the engine. - thread (Optional[Thread] = None): Thread used to create the engine async. + thread (Optional[Thread]): Thread used to create the engine async. quota_project (Optional[str]): Project that provides quota for API calls. - iam_account_email (Optional[str], optional): IAM service account email. Defaults to None. + iam_account_email (Optional[str]): IAM service account email. Defaults to None. Raises: ValueError: If only one of `user` and `password` is specified. @@ -275,7 +215,43 @@ async def getconn() -> asyncpg.Connection: return cls(cls.__create_key, engine, loop, thread) @classmethod - async def afrom_instance( + def __start_background_loop( + cls, + project_id: str, + region: str, + instance: str, + database: str, + user: Optional[str] = None, + password: Optional[str] = None, + ip_type: Union[str, IPTypes] = IPTypes.PUBLIC, + quota_project: Optional[str] = None, + iam_account_email: Optional[str] = None, + ) -> Future: + # Running a loop in a background thread allows us to support + # async methods from non-async environments + if cls._default_loop is None: + cls._default_loop = asyncio.new_event_loop() + cls._default_thread = Thread( + target=cls._default_loop.run_forever, daemon=True + ) + cls._default_thread.start() + coro = cls._create( + project_id, + region, + instance, + database, + ip_type, + user, + password, + loop=cls._default_loop, + thread=cls._default_thread, + quota_project=quota_project, + iam_account_email=iam_account_email, + ) + return asyncio.run_coroutine_threadsafe(coro, cls._default_loop) + + @classmethod + def from_instance( cls, project_id: str, region: str, @@ -303,73 +279,129 @@ async def afrom_instance( Returns: PostgresEngine: A newly created PostgresEngine instance. 
""" - return await cls._create( + future = cls.__start_background_loop( project_id, region, instance, database, + user, + password, ip_type, + quota_project=quota_project, + iam_account_email=iam_account_email, + ) + return future.result() + + @classmethod + async def afrom_instance( + cls, + project_id: str, + region: str, + instance: str, + database: str, + user: Optional[str] = None, + password: Optional[str] = None, + ip_type: Union[str, IPTypes] = IPTypes.PUBLIC, + quota_project: Optional[str] = None, + iam_account_email: Optional[str] = None, + ) -> PostgresEngine: + """Create a PostgresEngine from a Postgres instance. + + Args: + project_id (str): GCP project ID. + region (str): Postgres instance region. + instance (str): Postgres instance name. + database (str): Database name. + user (Optional[str], optional): Postgres user name. Defaults to None. + password (Optional[str], optional): Postgres user password. Defaults to None. + ip_type (Union[str, IPTypes], optional): IP address type. Defaults to IPTypes.PUBLIC. + quota_project (Optional[str]): Project that provides quota for API calls. + iam_account_email (Optional[str], optional): IAM service account email. Defaults to None. + + Returns: + PostgresEngine: A newly created PostgresEngine instance. + """ + future = cls.__start_background_loop( + project_id, + region, + instance, + database, user, password, + ip_type, quota_project=quota_project, iam_account_email=iam_account_email, ) + return await asyncio.wrap_future(future) @classmethod - def from_engine(cls, engine: AsyncEngine) -> PostgresEngine: + def from_engine( + cls, + engine: AsyncEngine, + loop: Optional[asyncio.AbstractEventLoop] = None, + ) -> PostgresEngine: """Create an PostgresEngine instance from an AsyncEngine.""" - return cls(cls.__create_key, engine, None, None) + return cls(cls.__create_key, engine, loop, None) - async def _aexecute(self, query: str, params: Optional[dict] = None) -> None: - """Execute a SQL query.""" - async with self._engine.connect() as conn: - await conn.execute(text(query), params) - await conn.commit() + @classmethod + def from_engine_args( + cls, + url: Union[str | URL], + **kwargs: Any, + ) -> PostgresEngine: + """Create an PostgresEngine instance from arguments. These parameters are pass directly into sqlalchemy's create_async_engine function. 
- async def _aexecute_outside_tx(self, query: str) -> None: - """Execute a SQL query in a new transaction.""" - async with self._engine.connect() as conn: - await conn.execute(text("COMMIT")) - await conn.execute(text(query)) + Args: + url (Union[str | URL]): the URL used to connect to a database + **kwargs (Any, optional): sqlalchemy `create_async_engine` arguments + + Raises: + ValueError: If `postgresql+asyncpg` is not specified as the PG driver + + Returns: + PostgresEngine + """ + # Running a loop in a background thread allows us to support + # async methods from non-async environments + if cls._default_loop is None: + cls._default_loop = asyncio.new_event_loop() + cls._default_thread = Thread( + target=cls._default_loop.run_forever, daemon=True + ) + cls._default_thread.start() - async def _afetch( - self, query: str, params: Optional[dict] = None - ) -> Sequence[RowMapping]: - """Fetch results from a SQL query.""" - async with self._engine.connect() as conn: - result = await conn.execute(text(query), params) - result_map = result.mappings() - result_fetch = result_map.fetchall() - - return result_fetch - - async def _afetch_with_query_options( - self, query: str, query_options: str - ) -> Sequence[RowMapping]: - """Set temporary database flags and fetch results from a SQL query.""" - async with self._engine.connect() as conn: - await conn.execute(text(query_options)) - result = await conn.execute(text(query)) - result_map = result.mappings() - result_fetch = result_map.fetchall() - - return result_fetch - - def _execute(self, query: str, params: Optional[dict] = None) -> None: - """Execute a SQL query.""" - return self._run_as_sync(self._aexecute(query, params)) - - def _fetch(self, query: str, params: Optional[dict] = None) -> Sequence[RowMapping]: - """Fetch results from a SQL query.""" - return self._run_as_sync(self._afetch(query, params)) + driver = "postgresql+asyncpg" + if (isinstance(url, str) and not url.startswith(driver)) or ( + isinstance(url, URL) and url.drivername != driver + ): + raise ValueError("Driver must be type 'postgresql+asyncpg'") + + engine = create_async_engine(url, **kwargs) + return cls(cls.__create_key, engine, cls._default_loop, cls._default_thread) + + async def _run_as_async(self, coro: Awaitable[T]) -> T: + """Run an async coroutine asynchronously""" + # If a loop has not been provided, attempt to run in current thread + if not self._loop: + return await coro + # Otherwise, run in the background thread + return await asyncio.wrap_future( + asyncio.run_coroutine_threadsafe(coro, self._loop) + ) def _run_as_sync(self, coro: Awaitable[T]) -> T: """Run an async coroutine synchronously""" if not self._loop: - raise Exception("Engine was initialized async.") + raise Exception( + "Engine was initialized without a background loop and cannot call sync methods." + ) return asyncio.run_coroutine_threadsafe(coro, self._loop).result() - async def ainit_vectorstore_table( + async def close(self) -> None: + """Dispose of connection pool""" + await self._pool.dispose() + + async def _ainit_vectorstore_table( self, table_name: str, vector_size: int, @@ -407,10 +439,16 @@ async def ainit_vectorstore_table( Raises: :class:`DuplicateTableError `: if table already exists and overwrite flag is not set. 
""" - await self._aexecute("CREATE EXTENSION IF NOT EXISTS vector") + async with self._pool.connect() as conn: + await conn.execute(text("CREATE EXTENSION IF NOT EXISTS vector")) + await conn.commit() if overwrite_existing: - await self._aexecute(f'DROP TABLE IF EXISTS "{schema_name}"."{table_name}"') + async with self._pool.connect() as conn: + await conn.execute( + text(f'DROP TABLE IF EXISTS "{schema_name}"."{table_name}"') + ) + await conn.commit() query = f"""CREATE TABLE "{schema_name}"."{table_name}"( "{id_column}" UUID PRIMARY KEY, @@ -423,7 +461,59 @@ async def ainit_vectorstore_table( query += f""",\n"{metadata_json_column}" JSON""" query += "\n);" - await self._aexecute(query) + async with self._pool.connect() as conn: + await conn.execute(text(query)) + await conn.commit() + + async def ainit_vectorstore_table( + self, + table_name: str, + vector_size: int, + schema_name: str = "public", + content_column: str = "content", + embedding_column: str = "embedding", + metadata_columns: List[Column] = [], + metadata_json_column: str = "langchain_metadata", + id_column: str = "langchain_id", + overwrite_existing: bool = False, + store_metadata: bool = True, + ) -> None: + """ + Create a table for saving of vectors to be used with PostgresVectorStore. + + Args: + table_name (str): The Postgres database table name. + vector_size (int): Vector size for the embedding model to be used. + schema_name (str): The schema name to store Postgres database table. + Default: "public". + content_column (str): Name of the column to store document content. + Default: "page_content". + embedding_column (str) : Name of the column to store vector embeddings. + Default: "embedding". + metadata_columns (List[Column]): A list of Columns to create for custom + metadata. Default: []. Optional. + metadata_json_column (str): The column to store extra metadata in JSON format. + Default: "langchain_metadata". Optional. + id_column (str): Name of the column to store ids. + Default: "langchain_id". Optional, + overwrite_existing (bool): Whether to drop existing table. Default: False. + store_metadata (bool): Whether to store metadata in the table. + Default: True. + """ + await self._run_as_async( + self._ainit_vectorstore_table( + table_name, + vector_size, + schema_name, + content_column, + embedding_column, + metadata_columns, + metadata_json_column, + id_column, + overwrite_existing, + store_metadata, + ) + ) def init_vectorstore_table( self, @@ -460,8 +550,8 @@ def init_vectorstore_table( store_metadata (bool): Whether to store metadata in the table. Default: True. """ - return self._run_as_sync( - self.ainit_vectorstore_table( + self._run_as_sync( + self._ainit_vectorstore_table( table_name, vector_size, schema_name, @@ -475,7 +565,7 @@ def init_vectorstore_table( ) ) - async def ainit_chat_history_table( + async def _ainit_chat_history_table( self, table_name: str, schema_name: str = "public" ) -> None: """Create a Cloud SQL table to store chat history. @@ -494,7 +584,24 @@ async def ainit_chat_history_table( data JSONB NOT NULL, type TEXT NOT NULL );""" - await self._aexecute(create_table_query) + async with self._pool.connect() as conn: + await conn.execute(text(create_table_query)) + await conn.commit() + + async def ainit_chat_history_table( + self, table_name: str, schema_name: str = "public" + ) -> None: + """Create a Cloud SQL table to store chat history. + + Args: + table_name (str): Table name to store chat history. 
+ + Returns: + None + """ + await self._run_as_async( + self._ainit_chat_history_table(table_name, schema_name) + ) def init_chat_history_table( self, table_name: str, schema_name: str = "public" @@ -509,13 +616,37 @@ def init_chat_history_table( Returns: None """ - return self._run_as_sync( - self.ainit_chat_history_table( + self._run_as_sync( + self._ainit_chat_history_table( table_name, schema_name, ) ) + async def _ainit_document_table( + self, + table_name: str, + schema_name: str = "public", + content_column: str = "page_content", + metadata_columns: List[Column] = [], + metadata_json_column: str = "langchain_metadata", + store_metadata: bool = True, + ) -> None: + query = f"""CREATE TABLE "{schema_name}"."{table_name}"( + {content_column} TEXT NOT NULL + """ + for column in metadata_columns: + nullable = "NOT NULL" if not column.nullable else "" + query += f',\n"{column.name}" {column.data_type} {nullable}' + metadata_json_column = metadata_json_column or "langchain_metadata" + if store_metadata: + query += f',\n"{metadata_json_column}" JSON' + query += "\n);" + + async with self._pool.connect() as conn: + await conn.execute(text(query)) + await conn.commit() + async def ainit_document_table( self, table_name: str, @@ -544,19 +675,16 @@ async def ainit_document_table( Raises: :class:`DuplicateTableError `: if table already exists. """ - - query = f"""CREATE TABLE "{schema_name}"."{table_name}"( - {content_column} TEXT NOT NULL - """ - for column in metadata_columns: - nullable = "NOT NULL" if not column.nullable else "" - query += f',\n"{column.name}" {column.data_type} {nullable}' - metadata_json_column = metadata_json_column or "langchain_metadata" - if store_metadata: - query += f',\n"{metadata_json_column}" JSON' - query += "\n);" - - await self._aexecute(query) + await self._run_as_async( + self._ainit_document_table( + table_name, + schema_name, + content_column, + metadata_columns, + metadata_json_column, + store_metadata, + ) + ) def init_document_table( self, @@ -575,13 +703,19 @@ def init_document_table( schema_name (str): The schema name to store PgSQL database table. Default: "public". content_column (str): Name of the column to store document content. + Default: "page_content". metadata_columns (List[sqlalchemy.Column]): A list of SQLAlchemy Columns to create for custom metadata. Optional. + metadata_json_column (str): The column to store extra metadata in JSON format. + Default: "langchain_metadata". Optional. store_metadata (bool): Whether to store extra metadata in a metadata column if not described in 'metadata' field list (Default: True). + + Raises: + :class:`DuplicateTableError `: if table already exists. """ - return self._run_as_sync( - self.ainit_document_table( + self._run_as_sync( + self._ainit_document_table( table_name, schema_name, content_column, @@ -602,7 +736,7 @@ async def _aload_table_schema( (sqlalchemy.Table): The loaded table. 
""" metadata = MetaData() - async with self._engine.connect() as conn: + async with self._pool.connect() as conn: try: await conn.run_sync( metadata.reflect, schema=schema_name, only=[table_name] diff --git a/src/langchain_google_cloud_sql_pg/loader.py b/src/langchain_google_cloud_sql_pg/loader.py index 39ee8935..b9f14a4b 100644 --- a/src/langchain_google_cloud_sql_pg/loader.py +++ b/src/langchain_google_cloud_sql_pg/loader.py @@ -14,95 +14,18 @@ from __future__ import annotations -import json -from typing import ( - Any, - AsyncIterator, - Callable, - Dict, - Iterable, - Iterator, - List, - Optional, -) - -import sqlalchemy +from typing import AsyncIterator, Callable, Iterator, List, Optional + from langchain_core.document_loaders.base import BaseLoader from langchain_core.documents import Document +from .async_loader import AsyncPostgresDocumentSaver, AsyncPostgresLoader from .engine import PostgresEngine DEFAULT_CONTENT_COL = "page_content" DEFAULT_METADATA_COL = "langchain_metadata" -def text_formatter(row: dict, content_columns: List[str]) -> str: - """txt document formatter.""" - return " ".join(str(row[column]) for column in content_columns if column in row) - - -def csv_formatter(row: dict, content_columns: List[str]) -> str: - """CSV document formatter.""" - return ", ".join(str(row[column]) for column in content_columns if column in row) - - -def yaml_formatter(row: dict, content_columns: List[str]) -> str: - """YAML document formatter.""" - return "\n".join( - f"{column}: {str(row[column])}" for column in content_columns if column in row - ) - - -def json_formatter(row: dict, content_columns: List[str]) -> str: - """JSON document formatter.""" - dictionary = {} - for column in content_columns: - if column in row: - dictionary[column] = row[column] - return json.dumps(dictionary) - - -def _parse_doc_from_row( - content_columns: Iterable[str], - metadata_columns: Iterable[str], - row: dict, - metadata_json_column: Optional[str] = DEFAULT_METADATA_COL, - formatter: Callable = text_formatter, -) -> Document: - """Parse row into document.""" - page_content = formatter(row, content_columns) - metadata: Dict[str, Any] = {} - # unnest metadata from langchain_metadata column - if metadata_json_column and row.get(metadata_json_column): - for k, v in row[metadata_json_column].items(): - metadata[k] = v - # load metadata from other columns - for column in metadata_columns: - if column in row and column != metadata_json_column: - metadata[column] = row[column] - - return Document(page_content=page_content, metadata=metadata) - - -def _parse_row_from_doc( - doc: Document, - column_names: Iterable[str], - content_column: str = DEFAULT_CONTENT_COL, - metadata_json_column: Optional[str] = DEFAULT_METADATA_COL, -) -> Dict: - """Parse document into a dictionary of rows.""" - doc_metadata = doc.metadata.copy() - row: Dict[str, Any] = {content_column: doc.page_content} - for entry in doc.metadata: - if entry in column_names: - row[entry] = doc_metadata[entry] - del doc_metadata[entry] - # store extra metadata in langchain_metadata column in json format - if metadata_json_column: - row[metadata_json_column] = doc_metadata - return row - - class PostgresLoader(BaseLoader): """Load documents from PostgreSQL`. 
@@ -115,14 +38,7 @@ class PostgresLoader(BaseLoader): __create_key = object() def __init__( - self, - key: object, - engine: PostgresEngine, - query: str, - content_columns: List[str], - metadata_columns: List[str], - formatter: Callable, - metadata_json_column: Optional[str] = None, + self, key: object, engine: PostgresEngine, loader: AsyncPostgresLoader ) -> None: """PostgresLoader constructor. @@ -144,12 +60,8 @@ def __init__( "Only create class through 'create' or 'create_sync' methods!" ) - self.engine = engine - self.query = query - self.content_columns = content_columns - self.metadata_columns = metadata_columns - self.formatter = formatter - self.metadata_json_column = metadata_json_column + self._engine = engine + self._loader = loader @classmethod async def create( @@ -180,71 +92,19 @@ async def create( Returns: PostgresLoader """ - if table_name and query: - raise ValueError("Only one of 'table_name' or 'query' should be specified.") - if not table_name and not query: - raise ValueError( - "At least one of the parameters 'table_name' or 'query' needs to be provided" - ) - if format and formatter: - raise ValueError("Only one of 'format' or 'formatter' should be specified.") - - if format and format not in ["csv", "text", "JSON", "YAML"]: - raise ValueError("format must be type: 'csv', 'text', 'JSON', 'YAML'") - if formatter: - formatter = formatter - elif format == "csv": - formatter = csv_formatter - elif format == "YAML": - formatter = yaml_formatter - elif format == "JSON": - formatter = json_formatter - else: - formatter = text_formatter - - if not query: - query = f'SELECT * FROM "{schema_name}"."{table_name}"' - stmt = sqlalchemy.text(query) - - async with engine._engine.connect() as connection: - result_proxy = await connection.execute(stmt) - - column_names = list(result_proxy.keys()) - # Select content or default to first column - content_columns = content_columns or [column_names[0]] - # Select metadata columns - metadata_columns = metadata_columns or [ - col for col in column_names if col not in content_columns - ] - # Check validity of metadata json column - if metadata_json_column and metadata_json_column not in column_names: - raise ValueError( - f"Column {metadata_json_column} not found in query result {column_names}." - ) - # use default metadata json column if not specified - if metadata_json_column and metadata_json_column in column_names: - metadata_json_column = metadata_json_column - elif DEFAULT_METADATA_COL in column_names: - metadata_json_column = DEFAULT_METADATA_COL - else: - metadata_json_column = None - - # check validity of other column - all_names = content_columns + metadata_columns - for name in all_names: - if name not in column_names: - raise ValueError( - f"Column {name} not found in query result {column_names}." 
- ) - return cls( - cls.__create_key, + coro = AsyncPostgresLoader.create( engine, query, + table_name, + schema_name, content_columns, metadata_columns, - formatter, metadata_json_column, + format, + formatter, ) + loader = await engine._run_as_async(coro) + return cls(cls.__create_key, engine, loader) @classmethod def create_sync( @@ -275,7 +135,7 @@ def create_sync( Returns: PostgresLoader """ - coro = cls.create( + coro = AsyncPostgresLoader.create( engine, query, table_name, @@ -286,56 +146,36 @@ def create_sync( format, formatter, ) - return engine._run_as_sync(coro) - - async def _collect_async_items(self, docs_generator): - """Exhause document generator into a list.""" - return [doc async for doc in docs_generator] + loader = engine._run_as_sync(coro) + return cls(cls.__create_key, engine, loader) def load(self) -> List[Document]: """Load PostgreSQL data into Document objects.""" - documents = self.engine._run_as_sync( - self._collect_async_items(self.alazy_load()) - ) - return documents + return self._engine._run_as_sync(self._loader.aload()) async def aload(self) -> List[Document]: """Load PostgreSQL data into Document objects.""" - return [doc async for doc in self.alazy_load()] + return await self._engine._run_as_async(self._loader.aload()) def lazy_load(self) -> Iterator[Document]: """Load PostgreSQL data into Document objects lazily.""" - yield from self.engine._run_as_sync( - self._collect_async_items(self.alazy_load()) - ) + iterator = self._loader.alazy_load() + while True: + try: + result = self._engine._run_as_sync(iterator.__anext__()) + yield result + except StopAsyncIteration: + break async def alazy_load(self) -> AsyncIterator[Document]: """Load PostgreSQL data into Document objects lazily.""" - stmt = sqlalchemy.text(self.query) - async with self.engine._engine.connect() as connection: - result_proxy = await connection.execute(stmt) - # load document one by one - while True: - row = result_proxy.fetchone() - if not row: - break - - row_data = {} - column_names = self.content_columns + self.metadata_columns - column_names += ( - [self.metadata_json_column] if self.metadata_json_column else [] - ) - for column in column_names: - value = getattr(row, column) - row_data[column] = value - - yield _parse_doc_from_row( - self.content_columns, - self.metadata_columns, - row_data, - self.metadata_json_column, - self.formatter, - ) + iterator = self._loader.alazy_load() + while True: + try: + result = await self._engine._run_as_async(iterator.__anext__()) + yield result + except StopAsyncIteration: + break class PostgresDocumentSaver: @@ -347,11 +187,7 @@ def __init__( self, key: object, engine: PostgresEngine, - table_name: str, - content_column: str, - schema_name: str = "public", - metadata_columns: List[str] = [], - metadata_json_column: Optional[str] = None, + saver: AsyncPostgresDocumentSaver, ): """PostgresDocumentSaver constructor. @@ -371,12 +207,8 @@ def __init__( raise Exception( "Only create class through 'create' or 'create_sync' methods!" 
) - self.engine = engine - self.table_name = table_name - self.content_column = content_column - self.schema_name = schema_name - self.metadata_columns = metadata_columns - self.metadata_json_column = metadata_json_column + self._engine = engine + self._saver = saver @classmethod async def create( @@ -401,42 +233,16 @@ async def create( Returns: PostgresDocumentSaver """ - table_schema = await engine._aload_table_schema(table_name, schema_name) - column_names = table_schema.columns.keys() - if content_column not in column_names: - raise ValueError(f"Content column, {content_column}, does not exist.") - - # Set metadata columns to all columns if not set - if len(metadata_columns) == 0: - metadata_columns = [ - column - for column in column_names - if column != content_column and column != metadata_json_column - ] - - # Check and set metadata json column - for column in metadata_columns: - if column not in column_names: - raise ValueError(f"Metadata column, {column}, does not exist.") - - if ( - metadata_json_column - and metadata_json_column != DEFAULT_METADATA_COL - and metadata_json_column not in column_names - ): - raise ValueError(f"Metadata JSON column, {column}, does not exist.") - elif metadata_json_column not in column_names: - metadata_json_column = None - - return cls( - cls.__create_key, + coro = AsyncPostgresDocumentSaver.create( engine, table_name, - content_column, schema_name, + content_column, metadata_columns, metadata_json_column, ) + saver = await engine._run_as_async(coro) + return cls(cls.__create_key, engine, saver) @classmethod def create_sync( @@ -461,7 +267,7 @@ def create_sync( Returns: PostgresDocumentSaver """ - coro = cls.create( + coro = AsyncPostgresDocumentSaver.create( engine, table_name, schema_name, @@ -469,7 +275,8 @@ def create_sync( metadata_columns, metadata_json_column, ) - return engine._run_as_sync(coro) + saver = engine._run_as_sync(coro) + return cls(cls.__create_key, engine, saver) async def aadd_documents(self, docs: List[Document]) -> None: """ @@ -479,39 +286,7 @@ async def aadd_documents(self, docs: List[Document]) -> None: Args: docs (List[langchain_core.documents.Document]): a list of documents to be saved. """ - - for doc in docs: - row = _parse_row_from_doc( - doc, - self.metadata_columns, - self.content_column, - self.metadata_json_column, - ) - for key, value in row.items(): - if isinstance(value, dict): - row[key] = json.dumps(value) - - # Create list of column names - insert_stmt = f'INSERT INTO "{self.schema_name}"."{self.table_name}"({self.content_column}' - values_stmt = f"VALUES (:{self.content_column}" - - # Add metadata - for metadata_column in self.metadata_columns: - if metadata_column in doc.metadata: - insert_stmt += f", {metadata_column}" - values_stmt += f", :{metadata_column}" - - # Add JSON column and/or close statement - insert_stmt += ( - f", {self.metadata_json_column})" if self.metadata_json_column else ")" - ) - if self.metadata_json_column: - values_stmt += f", :{self.metadata_json_column})" - else: - values_stmt += ")" - - query = insert_stmt + values_stmt - await self.engine._aexecute(query, row) + await self._engine._run_as_async(self._saver.aadd_documents(docs)) def add_documents(self, docs: List[Document]) -> None: """ @@ -521,7 +296,7 @@ def add_documents(self, docs: List[Document]) -> None: Args: docs (List[langchain_core.documents.Document]): a list of documents to be saved. 
""" - self.engine._run_as_sync(self.aadd_documents(docs)) + self._engine._run_as_sync(self._saver.aadd_documents(docs)) async def adelete(self, docs: List[Document]) -> None: """ @@ -531,34 +306,7 @@ async def adelete(self, docs: List[Document]) -> None: Args: docs (List[langchain_core.documents.Document]): a list of documents to be deleted. """ - for doc in docs: - row = _parse_row_from_doc( - doc, - self.metadata_columns, - self.content_column, - self.metadata_json_column, - ) - # delete by matching all fields of document - where_conditions_list = [] - for key, value in row.items(): - if isinstance(value, dict): - where_conditions_list.append( - f"{key}::jsonb @> '{json.dumps(value)}'::jsonb" - ) - else: - # Handle simple key-value pairs - where_conditions_list.append(f"{key} = :{key}") - - where_conditions = " AND ".join(where_conditions_list) - stmt = f'DELETE FROM "{self.schema_name}"."{self.table_name}" WHERE {where_conditions};' - values = {} - for key, value in row.items(): - if type(value) is int: - values[key] = str(value) - else: - values[key] = value - - await self.engine._aexecute(stmt, values) + await self._engine._run_as_async(self._saver.adelete(docs)) def delete(self, docs: List[Document]) -> None: """ @@ -568,4 +316,4 @@ def delete(self, docs: List[Document]) -> None: Args: docs (List[langchain_core.documents.Document]): a list of documents to be deleted. """ - self.engine._run_as_sync(self.adelete(docs)) + self._engine._run_as_sync(self._saver.adelete(docs)) diff --git a/src/langchain_google_cloud_sql_pg/vectorstore.py b/src/langchain_google_cloud_sql_pg/vectorstore.py index cbe12767..39b79de6 100644 --- a/src/langchain_google_cloud_sql_pg/vectorstore.py +++ b/src/langchain_google_cloud_sql_pg/vectorstore.py @@ -15,23 +15,19 @@ # TODO: Remove below import when minimum supported Python version is 3.10 from __future__ import annotations -import json -import uuid -from typing import Any, Callable, Iterable, List, Optional, Sequence, Tuple, Type, Union +from typing import Any, Callable, Iterable, List, Optional, Tuple, Type import numpy as np from langchain_core.documents import Document from langchain_core.embeddings import Embeddings -from langchain_core.vectorstores import VectorStore, utils -from sqlalchemy.engine.row import RowMapping +from langchain_core.vectorstores import VectorStore +from .async_vectorstore import AsyncPostgresVectorStore from .engine import PostgresEngine from .indexes import ( DEFAULT_DISTANCE_STRATEGY, - DEFAULT_INDEX_NAME_SUFFIX, BaseIndex, DistanceStrategy, - ExactNearestNeighbor, QueryOptions, ) @@ -42,41 +38,13 @@ class PostgresVectorStore(VectorStore): __create_key = object() def __init__( - self, - key: object, - engine: PostgresEngine, - embedding_service: Embeddings, - table_name: str, - schema_name: str = "public", - content_column: str = "content", - embedding_column: str = "embedding", - metadata_columns: List[str] = [], - id_column: str = "langchain_id", - metadata_json_column: Optional[str] = "langchain_metadata", - distance_strategy: DistanceStrategy = DEFAULT_DISTANCE_STRATEGY, - k: int = 4, - fetch_k: int = 20, - lambda_mult: float = 0.5, - index_query_options: Optional[QueryOptions] = None, + self, key: object, engine: PostgresEngine, vs: AsyncPostgresVectorStore ): """PostgresVectorStore constructor. Args: key (object): Prevent direct constructor usage. engine (PostgresEngine): Connection pool engine for managing connections to Postgres database. - embedding_service (Embeddings): Text embedding model to use. 
-            table_name (str): Name of the existing table or the table to be created.
-            schema_name (str, optional): Database schema name of the table. Defaults to "public".
-            content_column (str): Column that represent a Document’s page_content. Defaults to "content".
-            embedding_column (str): Column for embedding vectors. The embedding is generated from the document value. Defaults to "embedding".
-            metadata_columns (List[str]): Column(s) that represent a document's metadata.
-            id_column (str): Column that represents the Document's id. Defaults to "langchain_id".
-            metadata_json_column (str): Column to store metadata as JSON. Defaults to "langchain_metadata".
-            distance_strategy (DistanceStrategy): Distance strategy to use for vector similarity search. Defaults to COSINE_DISTANCE.
-            k (int): Number of Documents to return from search. Defaults to 4.
-            fetch_k (int): Number of Documents to fetch to pass to MMR algorithm.
-            lambda_mult (float): Number between 0 and 1 that determines the degree of diversity among the results with 0 corresponding to maximum diversity and 1 to minimum diversity. Defaults to 0.5.
-            index_query_options (QueryOptions): Index query option.
-
+            vs (AsyncPostgresVectorStore): The async-only VectorStore implementation.
 
         Raises:
             Exception: If called directly by user.
@@ -86,20 +54,8 @@ def __init__(
                 "Only create class through 'create' or 'create_sync' methods!"
             )
 
-        self.engine = engine
-        self.embedding_service = embedding_service
-        self.table_name = table_name
-        self.schema_name = schema_name
-        self.content_column = content_column
-        self.embedding_column = embedding_column
-        self.metadata_columns = metadata_columns
-        self.id_column = id_column
-        self.metadata_json_column = metadata_json_column
-        self.distance_strategy = distance_strategy
-        self.k = k
-        self.fetch_k = fetch_k
-        self.lambda_mult = lambda_mult
-        self.index_query_options = index_query_options
+        self._engine = engine
+        self.__vs = vs
 
     @classmethod
     async def create(
@@ -142,56 +98,7 @@ async def create(
         Returns:
             PostgresVectorStore
         """
-        if metadata_columns and ignore_metadata_columns:
-            raise ValueError(
-                "Can not use both metadata_columns and ignore_metadata_columns."
-            )
-        # Get field type information
-        stmt = f"SELECT column_name, data_type FROM information_schema.columns WHERE table_name = '{table_name}' AND table_schema = '{schema_name}'"
-        results = await engine._afetch(stmt)
-        columns = {}
-        for field in results:
-            columns[field["column_name"]] = field["data_type"]
-
-        # Check columns
-        if id_column not in columns:
-            raise ValueError(f"Id column, {id_column}, does not exist.")
-        if content_column not in columns:
-            raise ValueError(f"Content column, {content_column}, does not exist.")
-        content_type = columns[content_column]
-        if content_type != "text" and "char" not in content_type:
-            raise ValueError(
-                f"Content column, {content_column}, is type, {content_type}. It must be a type of character string."
-            )
-        if embedding_column not in columns:
-            raise ValueError(f"Embedding column, {embedding_column}, does not exist.")
-        if columns[embedding_column] != "USER-DEFINED":
-            raise ValueError(
-                f"Embedding column, {embedding_column}, is not type Vector."
- ) - - metadata_json_column = ( - None if metadata_json_column not in columns else metadata_json_column - ) - - # If using metadata_columns check to make sure column exists - for column in metadata_columns: - if column not in columns: - raise ValueError(f"Metadata column, {column}, does not exist.") - - # If using ignore_metadata_columns, filter out known columns and set known metadata columns - all_columns = columns - if ignore_metadata_columns: - for column in ignore_metadata_columns: - del all_columns[column] - - del all_columns[id_column] - del all_columns[content_column] - del all_columns[embedding_column] - metadata_columns = [k for k in all_columns.keys()] - - return cls( - cls.__create_key, + coro = AsyncPostgresVectorStore.create( engine, embedding_service, table_name, @@ -199,6 +106,7 @@ async def create( content_column, embedding_column, metadata_columns, + ignore_metadata_columns, id_column, metadata_json_column, distance_strategy, @@ -207,6 +115,8 @@ async def create( lambda_mult, index_query_options, ) + vs = await engine._run_as_async(coro) + return cls(cls.__create_key, engine, vs) @classmethod def create_sync( @@ -249,7 +159,7 @@ def create_sync( Returns: PostgresVectorStore """ - coro = cls.create( + coro = AsyncPostgresVectorStore.create( engine, embedding_service, table_name, @@ -266,62 +176,26 @@ def create_sync( lambda_mult, index_query_options, ) - return engine._run_as_sync(coro) + vs = engine._run_as_sync(coro) + return cls(cls.__create_key, engine, vs) @property def embeddings(self) -> Embeddings: - return self.embedding_service + return self.__vs.embedding_service - async def _aadd_embeddings( + async def aadd_texts( self, texts: Iterable[str], - embeddings: List[List[float]], metadatas: Optional[List[dict]] = None, ids: Optional[List[str]] = None, **kwargs: Any, ) -> List[str]: - """Add embeddings to the table.""" - if not ids: - ids = [str(uuid.uuid4()) for _ in texts] - if not metadatas: - metadatas = [{} for _ in texts] - # Insert embeddings - for id, content, embedding, metadata in zip(ids, texts, embeddings, metadatas): - metadata_col_names = ( - ", " + ", ".join(self.metadata_columns) - if len(self.metadata_columns) > 0 - else "" - ) - insert_stmt = f'INSERT INTO "{self.schema_name}"."{self.table_name}"({self.id_column}, {self.content_column}, {self.embedding_column}{metadata_col_names}' - values = {"id": id, "content": content, "embedding": str(embedding)} - values_stmt = "VALUES (:id, :content, :embedding" - - # Add metadata - extra = metadata - for metadata_column in self.metadata_columns: - if metadata_column in metadata: - values_stmt += f", :{metadata_column}" - values[metadata_column] = metadata[metadata_column] - del extra[metadata_column] - else: - values_stmt += ",null" - - # Add JSON column and/or close statement - insert_stmt += ( - f", {self.metadata_json_column})" if self.metadata_json_column else ")" - ) - if self.metadata_json_column: - values_stmt += ", :extra)" - values["extra"] = json.dumps(extra) - else: - values_stmt += ")" - - query = insert_stmt + values_stmt - await self.engine._aexecute(query, values) - - return ids + """Embed texts and add to the table.""" + return await self._engine._run_as_async( + self.__vs.aadd_texts(texts, metadatas, ids, **kwargs) + ) - async def aadd_texts( + def add_texts( self, texts: Iterable[str], metadatas: Optional[List[dict]] = None, @@ -329,11 +203,9 @@ async def aadd_texts( **kwargs: Any, ) -> List[str]: """Embed texts and add to the table.""" - embeddings = 
self.embedding_service.embed_documents(list(texts)) - ids = await self._aadd_embeddings( - texts, embeddings, metadatas=metadatas, ids=ids, **kwargs + return self._engine._run_as_sync( + self.__vs.aadd_texts(texts, metadatas, ids, **kwargs) ) - return ids async def aadd_documents( self, @@ -342,21 +214,8 @@ async def aadd_documents( **kwargs: Any, ) -> List[str]: """Embed documents and add to the table""" - texts = [doc.page_content for doc in documents] - metadatas = [doc.metadata for doc in documents] - ids = await self.aadd_texts(texts, metadatas=metadatas, ids=ids, **kwargs) - return ids - - def add_texts( - self, - texts: Iterable[str], - metadatas: Optional[List[dict]] = None, - ids: Optional[List[str]] = None, - **kwargs: Any, - ) -> List[str]: - """Embed texts and add to the table.""" - return self.engine._run_as_sync( - self.aadd_texts(texts, metadatas, ids, **kwargs) + return await self._engine._run_as_async( + self.__vs.aadd_documents(documents, ids, **kwargs) ) def add_documents( @@ -366,7 +225,9 @@ def add_documents( **kwargs: Any, ) -> List[str]: """Embed documents and add to the table.""" - return self.engine._run_as_sync(self.aadd_documents(documents, ids, **kwargs)) + return self._engine._run_as_sync( + self.__vs.aadd_documents(documents, ids, **kwargs) + ) async def adelete( self, @@ -374,13 +235,7 @@ async def adelete( **kwargs: Any, ) -> Optional[bool]: """Delete records from the table.""" - if not ids: - return False - - id_list = ", ".join([f"'{id}'" for id in ids]) - query = f'DELETE FROM "{self.schema_name}"."{self.table_name}" WHERE {self.id_column} in ({id_list})' - await self.engine._aexecute(query) - return True + return await self._engine._run_as_async(self.__vs.adelete(ids, **kwargs)) def delete( self, @@ -388,7 +243,7 @@ def delete( **kwargs: Any, ) -> Optional[bool]: """Delete records from the table.""" - return self.engine._run_as_sync(self.adelete(ids, **kwargs)) + return self._engine._run_as_sync(self.__vs.adelete(ids, **kwargs)) @classmethod async def afrom_texts( # type: ignore[override] @@ -406,7 +261,11 @@ async def afrom_texts( # type: ignore[override] ignore_metadata_columns: Optional[List[str]] = None, id_column: str = "langchain_id", metadata_json_column: str = "langchain_metadata", - **kwargs: Any, + distance_strategy: DistanceStrategy = DEFAULT_DISTANCE_STRATEGY, + k: int = 4, + fetch_k: int = 20, + lambda_mult: float = 0.5, + index_query_options: Optional[QueryOptions] = None, ) -> PostgresVectorStore: """Create an PostgresVectorStore instance from texts. Args: @@ -423,6 +282,11 @@ async def afrom_texts( # type: ignore[override] ignore_metadata_columns (List[str]): Column(s) to ignore in pre-existing tables for a document's metadata. Can not be used with metadata_columns. Defaults to None. id_column (str): Column that represents the Document's id. Defaults to "langchain_id". metadata_json_column (str): Column to store metadata as JSON. Defaults to "langchain_metadata". + distance_strategy (DistanceStrategy): Distance strategy to use for vector similarity search. Defaults to COSINE_DISTANCE. + k (int): Number of Documents to return from search. Defaults to 4. + fetch_k (int): Number of Documents to fetch to pass to MMR algorithm. + lambda_mult (float): Number between 0 and 1 that determines the degree of diversity among the results with 0 corresponding to maximum diversity and 1 to minimum diversity. Defaults to 0.5. + index_query_options (QueryOptions): Index query option. 
Returns: PostgresVectorStore @@ -438,8 +302,13 @@ async def afrom_texts( # type: ignore[override] ignore_metadata_columns, id_column, metadata_json_column, + distance_strategy, + k, + fetch_k, + lambda_mult, + index_query_options, ) - await vs.aadd_texts(texts, metadatas=metadatas, ids=ids, **kwargs) + await vs.aadd_texts(texts, metadatas=metadatas, ids=ids) return vs @classmethod @@ -457,7 +326,11 @@ async def afrom_documents( # type: ignore[override] ignore_metadata_columns: Optional[List[str]] = None, id_column: str = "langchain_id", metadata_json_column: str = "langchain_metadata", - **kwargs: Any, + distance_strategy: DistanceStrategy = DEFAULT_DISTANCE_STRATEGY, + k: int = 4, + fetch_k: int = 20, + lambda_mult: float = 0.5, + index_query_options: Optional[QueryOptions] = None, ) -> PostgresVectorStore: """Create an PostgresVectorStore instance from documents. @@ -475,6 +348,11 @@ async def afrom_documents( # type: ignore[override] ignore_metadata_columns (List[str]): Column(s) to ignore in pre-existing tables for a document's metadata. Can not be used with metadata_columns. Defaults to None. id_column (str): Column that represents the Document's id. Defaults to "langchain_id". metadata_json_column (str): Column to store metadata as JSON. Defaults to "langchain_metadata". + distance_strategy (DistanceStrategy): Distance strategy to use for vector similarity search. Defaults to COSINE_DISTANCE. + k (int): Number of Documents to return from search. Defaults to 4. + fetch_k (int): Number of Documents to fetch to pass to MMR algorithm. + lambda_mult (float): Number between 0 and 1 that determines the degree of diversity among the results with 0 corresponding to maximum diversity and 1 to minimum diversity. Defaults to 0.5. + index_query_options (QueryOptions): Index query option. Returns: PostgresVectorStore @@ -490,10 +368,13 @@ async def afrom_documents( # type: ignore[override] ignore_metadata_columns, id_column, metadata_json_column, + distance_strategy, + k, + fetch_k, + lambda_mult, + index_query_options, ) - texts = [doc.page_content for doc in documents] - metadatas = [doc.metadata for doc in documents] - await vs.aadd_texts(texts, metadatas=metadatas, ids=ids, **kwargs) + await vs.aadd_documents(documents, ids=ids) return vs @classmethod @@ -512,7 +393,11 @@ def from_texts( # type: ignore[override] ignore_metadata_columns: Optional[List[str]] = None, id_column: str = "langchain_id", metadata_json_column: str = "langchain_metadata", - **kwargs: Any, + distance_strategy: DistanceStrategy = DEFAULT_DISTANCE_STRATEGY, + k: int = 4, + fetch_k: int = 20, + lambda_mult: float = 0.5, + index_query_options: Optional[QueryOptions] = None, ) -> PostgresVectorStore: """Create an PostgresVectorStore instance from texts. Args: @@ -529,27 +414,34 @@ def from_texts( # type: ignore[override] ignore_metadata_columns (List[str]): Column(s) to ignore in pre-existing tables for a document's metadata. Can not be used with metadata_columns. Defaults to None. id_column (str): Column that represents the Document's id. Defaults to "langchain_id". metadata_json_column (str): Column to store metadata as JSON. Defaults to "langchain_metadata". + distance_strategy (DistanceStrategy): Distance strategy to use for vector similarity search. Defaults to COSINE_DISTANCE. + k (int): Number of Documents to return from search. Defaults to 4. + fetch_k (int): Number of Documents to fetch to pass to MMR algorithm. 
+ lambda_mult (float): Number between 0 and 1 that determines the degree of diversity among the results with 0 corresponding to maximum diversity and 1 to minimum diversity. Defaults to 0.5. + index_query_options (QueryOptions): Index query option. Returns: PostgresVectorStore """ - coro = cls.afrom_texts( - texts, - embedding, + vs = cls.create_sync( engine, + embedding, table_name, schema_name, - metadatas=metadatas, - content_column=content_column, - embedding_column=embedding_column, - metadata_columns=metadata_columns, - ignore_metadata_columns=ignore_metadata_columns, - metadata_json_column=metadata_json_column, - id_column=id_column, - ids=ids, - **kwargs, + content_column, + embedding_column, + metadata_columns, + ignore_metadata_columns, + id_column, + metadata_json_column, + distance_strategy, + k, + fetch_k, + lambda_mult, + index_query_options, ) - return engine._run_as_sync(coro) + vs.add_texts(texts, metadatas=metadatas, ids=ids) + return vs @classmethod def from_documents( # type: ignore[override] @@ -566,7 +458,11 @@ def from_documents( # type: ignore[override] ignore_metadata_columns: Optional[List[str]] = None, id_column: str = "langchain_id", metadata_json_column: str = "langchain_metadata", - **kwargs: Any, + distance_strategy: DistanceStrategy = DEFAULT_DISTANCE_STRATEGY, + k: int = 4, + fetch_k: int = 20, + lambda_mult: float = 0.5, + index_query_options: Optional[QueryOptions] = None, ) -> PostgresVectorStore: """Create an PostgresVectorStore instance from documents. @@ -584,51 +480,36 @@ def from_documents( # type: ignore[override] ignore_metadata_columns (List[str]): Column(s) to ignore in pre-existing tables for a document's metadata. Can not be used with metadata_columns. Defaults to None. id_column (str): Column that represents the Document's id. Defaults to "langchain_id". metadata_json_column (str): Column to store metadata as JSON. Defaults to "langchain_metadata". + distance_strategy (DistanceStrategy): Distance strategy to use for vector similarity search. Defaults to COSINE_DISTANCE. + k (int): Number of Documents to return from search. Defaults to 4. + fetch_k (int): Number of Documents to fetch to pass to MMR algorithm. + lambda_mult (float): Number between 0 and 1 that determines the degree of diversity among the results with 0 corresponding to maximum diversity and 1 to minimum diversity. Defaults to 0.5. + index_query_options (QueryOptions): Index query option. 
Returns: PostgresVectorStore """ - coro = cls.afrom_documents( - documents, - embedding, + vs = cls.create_sync( engine, + embedding, table_name, schema_name, - content_column=content_column, - embedding_column=embedding_column, - metadata_columns=metadata_columns, - ignore_metadata_columns=ignore_metadata_columns, - metadata_json_column=metadata_json_column, - id_column=id_column, - ids=ids, - **kwargs, + content_column, + embedding_column, + metadata_columns, + ignore_metadata_columns, + id_column, + metadata_json_column, + distance_strategy, + k, + fetch_k, + lambda_mult, + index_query_options, ) - return engine._run_as_sync(coro) - - async def __query_collection( - self, - embedding: List[float], - k: Optional[int] = None, - filter: Optional[str] = None, - **kwargs: Any, - ) -> Sequence[RowMapping]: - """Perform similarity search query on the vector store table.""" - k = k if k else self.k - operator = self.distance_strategy.operator - search_function = self.distance_strategy.search_function - - filter = f"WHERE {filter}" if filter else "" - stmt = f"SELECT *, {search_function}({self.embedding_column}, '{embedding}') as distance FROM \"{self.schema_name}\".\"{self.table_name}\" {filter} ORDER BY {self.embedding_column} {operator} '{embedding}' LIMIT {k};" - if self.index_query_options: - query_options_stmt = f"SET LOCAL {self.index_query_options.to_string()};" - results = await self.engine._afetch_with_query_options( - stmt, query_options_stmt - ) - else: - results = await self.engine._afetch(stmt) - return results + vs.add_documents(documents, ids=ids) + return vs - def similarity_search( + async def asimilarity_search( self, query: str, k: Optional[int] = None, @@ -636,11 +517,11 @@ def similarity_search( **kwargs: Any, ) -> List[Document]: """Return docs selected by similarity search on query.""" - return self.engine._run_as_sync( - self.asimilarity_search(query, k=k, filter=filter, **kwargs) + return await self._engine._run_as_async( + self.__vs.asimilarity_search(query, k, filter, **kwargs) ) - async def asimilarity_search( + def similarity_search( self, query: str, k: Optional[int] = None, @@ -648,21 +529,19 @@ async def asimilarity_search( **kwargs: Any, ) -> List[Document]: """Return docs selected by similarity search on query.""" - embedding = self.embedding_service.embed_query(text=query) - - return await self.asimilarity_search_by_vector( - embedding=embedding, k=k, filter=filter, **kwargs + return self._engine._run_as_sync( + self.__vs.asimilarity_search(query, k, filter, **kwargs) ) + # Required for (a)similarity_search_with_relevance_scores def _select_relevance_score_fn(self) -> Callable[[float], float]: """Select a relevance function based on distance strategy.""" - # Calculate distance strategy provided in - # vectorstore constructor - if self.distance_strategy == DistanceStrategy.COSINE_DISTANCE: + # Calculate distance strategy provided in vectorstore constructor + if self.__vs.distance_strategy == DistanceStrategy.COSINE_DISTANCE: return self._cosine_relevance_score_fn - if self.distance_strategy == DistanceStrategy.INNER_PRODUCT: + if self.__vs.distance_strategy == DistanceStrategy.INNER_PRODUCT: return self._max_inner_product_relevance_score_fn - elif self.distance_strategy == DistanceStrategy.EUCLIDEAN: + elif self.__vs.distance_strategy == DistanceStrategy.EUCLIDEAN: return self._euclidean_relevance_score_fn async def asimilarity_search_with_score( @@ -673,11 +552,21 @@ async def asimilarity_search_with_score( **kwargs: Any, ) -> List[Tuple[Document, float]]: 
"""Return docs and distance scores selected by similarity search on query.""" - embedding = self.embedding_service.embed_query(query) - docs = await self.asimilarity_search_with_score_by_vector( - embedding=embedding, k=k, filter=filter, **kwargs + return await self._engine._run_as_async( + self.__vs.asimilarity_search_with_score(query, k, filter, **kwargs) + ) + + def similarity_search_with_score( + self, + query: str, + k: Optional[int] = None, + filter: Optional[str] = None, + **kwargs: Any, + ) -> List[Tuple[Document, float]]: + """Return docs and distance scores selected by similarity search on query.""" + return self._engine._run_as_sync( + self.__vs.asimilarity_search_with_score(query, k, filter, **kwargs) ) - return docs async def asimilarity_search_by_vector( self, @@ -687,11 +576,21 @@ async def asimilarity_search_by_vector( **kwargs: Any, ) -> List[Document]: """Return docs selected by vector similarity search.""" - docs_and_scores = await self.asimilarity_search_with_score_by_vector( - embedding=embedding, k=k, filter=filter, **kwargs + return await self._engine._run_as_async( + self.__vs.asimilarity_search_by_vector(embedding, k, filter, **kwargs) ) - return [doc for doc, _ in docs_and_scores] + def similarity_search_by_vector( + self, + embedding: List[float], + k: Optional[int] = None, + filter: Optional[str] = None, + **kwargs: Any, + ) -> List[Document]: + """Return docs selected by vector similarity search.""" + return self._engine._run_as_sync( + self.__vs.asimilarity_search_by_vector(embedding, k, filter, **kwargs) + ) async def asimilarity_search_with_score_by_vector( self, @@ -701,30 +600,25 @@ async def asimilarity_search_with_score_by_vector( **kwargs: Any, ) -> List[Tuple[Document, float]]: """Return docs and distance scores selected by vector similarity search.""" - results = await self.__query_collection( - embedding=embedding, k=k, filter=filter, **kwargs + return await self._engine._run_as_async( + self.__vs.asimilarity_search_with_score_by_vector( + embedding, k, filter, **kwargs + ) ) - documents_with_scores = [] - for row in results: - metadata = ( - row[self.metadata_json_column] - if self.metadata_json_column and row[self.metadata_json_column] - else {} - ) - for col in self.metadata_columns: - metadata[col] = row[col] - documents_with_scores.append( - ( - Document( - page_content=row[self.content_column], - metadata=metadata, - ), - row["distance"], - ) + def similarity_search_with_score_by_vector( + self, + embedding: List[float], + k: Optional[int] = None, + filter: Optional[str] = None, + **kwargs: Any, + ) -> List[Tuple[Document, float]]: + """Return docs and distance scores selected by similarity search on vector.""" + return self._engine._run_as_sync( + self.__vs.asimilarity_search_with_score_by_vector( + embedding, k, filter, **kwargs ) - - return documents_with_scores + ) async def amax_marginal_relevance_search( self, @@ -736,20 +630,15 @@ async def amax_marginal_relevance_search( **kwargs: Any, ) -> List[Document]: """Return docs selected using the maximal marginal relevance.""" - embedding = self.embedding_service.embed_query(text=query) - - return await self.amax_marginal_relevance_search_by_vector( - embedding=embedding, - k=k, - fetch_k=fetch_k, - lambda_mult=lambda_mult, - filter=filter, - **kwargs, + return await self._engine._run_as_async( + self.__vs.amax_marginal_relevance_search( + query, k, fetch_k, lambda_mult, filter, **kwargs + ) ) - async def amax_marginal_relevance_search_by_vector( + def max_marginal_relevance_search( self, - 
embedding: List[float], + query: str, k: Optional[int] = None, fetch_k: Optional[int] = None, lambda_mult: Optional[float] = None, @@ -757,20 +646,13 @@ async def amax_marginal_relevance_search_by_vector( **kwargs: Any, ) -> List[Document]: """Return docs selected using the maximal marginal relevance.""" - docs_and_scores = ( - await self.amax_marginal_relevance_search_with_score_by_vector( - embedding, - k=k, - fetch_k=fetch_k, - lambda_mult=lambda_mult, - filter=filter, - **kwargs, + return self._engine._run_as_sync( + self.__vs.amax_marginal_relevance_search( + query, k, fetch_k, lambda_mult, filter, **kwargs ) ) - return [result[0] for result in docs_and_scores] - - async def amax_marginal_relevance_search_with_score_by_vector( + async def amax_marginal_relevance_search_by_vector( self, embedding: List[float], k: Optional[int] = None, @@ -778,82 +660,17 @@ async def amax_marginal_relevance_search_with_score_by_vector( lambda_mult: Optional[float] = None, filter: Optional[str] = None, **kwargs: Any, - ) -> List[Tuple[Document, float]]: - """Return docs and distance scores selected using the maximal marginal relevance.""" - results = await self.__query_collection( - embedding=embedding, k=fetch_k, filter=filter, **kwargs - ) - - k = k if k else self.k - fetch_k = fetch_k if fetch_k else self.fetch_k - lambda_mult = lambda_mult if lambda_mult else self.lambda_mult - embedding_list = [json.loads(row[self.embedding_column]) for row in results] - mmr_selected = utils.maximal_marginal_relevance( - np.array(embedding, dtype=np.float32), - embedding_list, - k=k, - lambda_mult=lambda_mult, - ) - - documents_with_scores = [] - for row in results: - metadata = ( - row[self.metadata_json_column] - if self.metadata_json_column and row[self.metadata_json_column] - else {} - ) - for col in self.metadata_columns: - metadata[col] = row[col] - documents_with_scores.append( - ( - Document( - page_content=row[self.content_column], - metadata=metadata, - ), - row["distance"], - ) - ) - - return [r for i, r in enumerate(documents_with_scores) if i in mmr_selected] - - def similarity_search_with_score( - self, - query: str, - k: Optional[int] = None, - filter: Optional[str] = None, - **kwargs: Any, - ) -> List[Tuple[Document, float]]: - """Return docs and distance scores selected by similarity search on query.""" - coro = self.asimilarity_search_with_score(query, k, filter=filter, **kwargs) - return self.engine._run_as_sync(coro) - - def similarity_search_by_vector( - self, - embedding: List[float], - k: Optional[int] = None, - filter: Optional[str] = None, - **kwargs: Any, ) -> List[Document]: - """Return docs selected by vector similarity search.""" - coro = self.asimilarity_search_by_vector(embedding, k, filter=filter, **kwargs) - return self.engine._run_as_sync(coro) - - def similarity_search_with_score_by_vector( - self, - embedding: List[float], - k: Optional[int] = None, - filter: Optional[str] = None, - **kwargs: Any, - ) -> List[Tuple[Document, float]]: - """Return docs and distance scores selected by similarity search on vector.""" - coro = self.asimilarity_search_with_score_by_vector( - embedding, k, filter=filter, **kwargs + """Return docs selected using the maximal marginal relevance.""" + return await self._engine._run_as_async( + self.__vs.amax_marginal_relevance_search_by_vector( + embedding, k, fetch_k, lambda_mult, filter, **kwargs + ) ) - return self.engine._run_as_sync(coro) - def max_marginal_relevance_search( + def max_marginal_relevance_search_by_vector( self, - query: str, + embedding: 
List[float], k: Optional[int] = None, fetch_k: Optional[int] = None, lambda_mult: Optional[float] = None, @@ -861,17 +678,13 @@ def max_marginal_relevance_search( **kwargs: Any, ) -> List[Document]: """Return docs selected using the maximal marginal relevance.""" - coro = self.amax_marginal_relevance_search( - query, - k, - filter=filter, - fetch_k=fetch_k, - lambda_mult=lambda_mult, - **kwargs, + return self._engine._run_as_sync( + self.__vs.amax_marginal_relevance_search_by_vector( + embedding, k, fetch_k, lambda_mult, filter, **kwargs + ) ) - return self.engine._run_as_sync(coro) - def max_marginal_relevance_search_by_vector( + async def amax_marginal_relevance_search_with_score_by_vector( self, embedding: List[float], k: Optional[int] = None, @@ -879,17 +692,13 @@ def max_marginal_relevance_search_by_vector( lambda_mult: Optional[float] = None, filter: Optional[str] = None, **kwargs: Any, - ) -> List[Document]: - """Return docs selected using the maximal marginal relevance.""" - coro = self.amax_marginal_relevance_search_by_vector( - embedding, - k, - filter=filter, - fetch_k=fetch_k, - lambda_mult=lambda_mult, - **kwargs, + ) -> List[Tuple[Document, float]]: + """Return docs and distance scores selected using the maximal marginal relevance.""" + return await self._engine._run_as_async( + self.__vs.amax_marginal_relevance_search_with_score_by_vector( + embedding, k, fetch_k, lambda_mult, filter, **kwargs + ) ) - return self.engine._run_as_sync(coro) def max_marginal_relevance_search_with_score_by_vector( self, @@ -901,15 +710,11 @@ def max_marginal_relevance_search_with_score_by_vector( **kwargs: Any, ) -> List[Tuple[Document, float]]: """Return docs and distance scores selected using the maximal marginal relevance.""" - coro = self.amax_marginal_relevance_search_with_score_by_vector( - embedding, - k, - filter=filter, - fetch_k=fetch_k, - lambda_mult=lambda_mult, - **kwargs, + return self._engine._run_as_sync( + self.__vs.amax_marginal_relevance_search_with_score_by_vector( + embedding, k, fetch_k, lambda_mult, filter, **kwargs + ) ) - return self.engine._run_as_sync(coro) async def aapply_vector_index( self, @@ -918,48 +723,55 @@ async def aapply_vector_index( concurrently: bool = False, ) -> None: """Create an index on the vector store table.""" - if isinstance(index, ExactNearestNeighbor): - await self.adrop_vector_index() - return - - filter = f"WHERE ({index.partial_indexes})" if index.partial_indexes else "" - params = "WITH " + index.index_options() - function = index.distance_strategy.index_function - if name is None: - if index.name == None: - index.name = self.table_name + DEFAULT_INDEX_NAME_SUFFIX - name = index.name - stmt = f'CREATE INDEX {"CONCURRENTLY" if concurrently else ""} {name} ON "{self.schema_name}"."{self.table_name}" USING {index.index_type} ({self.embedding_column} {function}) {params} {filter};' - if concurrently: - await self.engine._aexecute_outside_tx(stmt) - else: - await self.engine._aexecute(stmt) + return await self._engine._run_as_async( + self.__vs.aapply_vector_index(index, name, concurrently) + ) + + def apply_vector_index( + self, + index: BaseIndex, + name: Optional[str] = None, + concurrently: bool = False, + ) -> None: + """Create an index on the vector store table.""" + return self._engine._run_as_sync( + self.__vs.aapply_vector_index(index, name, concurrently) + ) async def areindex(self, index_name: Optional[str] = None) -> None: """Re-index the vector store table.""" - index_name = index_name or self.table_name + DEFAULT_INDEX_NAME_SUFFIX 
- query = f"REINDEX INDEX {index_name};" - await self.engine._aexecute(query) + return await self._engine._run_as_async(self.__vs.areindex(index_name)) + + def reindex(self, index_name: Optional[str] = None) -> None: + """Re-index the vector store table.""" + return self._engine._run_as_sync(self.__vs.areindex(index_name)) async def adrop_vector_index( self, index_name: Optional[str] = None, ) -> None: """Drop the vector index.""" - index_name = index_name or self.table_name + DEFAULT_INDEX_NAME_SUFFIX - query = f"DROP INDEX IF EXISTS {index_name};" - await self.engine._aexecute(query) + return await self._engine._run_as_async( + self.__vs.adrop_vector_index(index_name) + ) + + def drop_vector_index( + self, + index_name: Optional[str] = None, + ) -> None: + """Drop the vector index.""" + return self._engine._run_as_sync(self.__vs.adrop_vector_index(index_name)) - async def is_valid_index( + async def ais_valid_index( self, index_name: Optional[str] = None, ) -> bool: """Check if index exists in the table.""" - index_name = index_name or self.table_name + DEFAULT_INDEX_NAME_SUFFIX - query = f""" - SELECT tablename, indexname - FROM pg_indexes - WHERE tablename = '{self.table_name}' AND schemaname = '{self.schema_name}' AND indexname = '{index_name}'; - """ - results = await self.engine._afetch(query) - return bool(len(results) == 1) + return await self._engine._run_as_async(self.__vs.is_valid_index(index_name)) + + def is_valid_index( + self, + index_name: Optional[str] = None, + ) -> bool: + """Check if index exists in the table.""" + return self._engine._run_as_sync(self.__vs.is_valid_index(index_name)) diff --git a/tests/test_async_chatmessagehistory.py b/tests/test_async_chatmessagehistory.py new file mode 100644 index 00000000..b626674b --- /dev/null +++ b/tests/test_async_chatmessagehistory.py @@ -0,0 +1,124 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import os +import uuid +from typing import Any, Generator + +import pytest +import pytest_asyncio +from langchain_core.messages.ai import AIMessage +from langchain_core.messages.human import HumanMessage +from sqlalchemy import text + +from langchain_google_cloud_sql_pg import PostgresEngine +from langchain_google_cloud_sql_pg.async_chat_message_history import ( + AsyncPostgresChatMessageHistory, +) + +project_id = os.environ["PROJECT_ID"] +region = os.environ["REGION"] +instance_id = os.environ["INSTANCE_ID"] +db_name = os.environ["DATABASE_ID"] +table_name = "message_store" + str(uuid.uuid4()) +table_name_async = "message_store" + str(uuid.uuid4()) + + +async def aexecute(engine: PostgresEngine, query: str) -> None: + async with engine._pool.connect() as conn: + await conn.execute(text(query)) + await conn.commit() + + +@pytest_asyncio.fixture +async def async_engine(): + async_engine = await PostgresEngine.afrom_instance( + project_id=project_id, + region=region, + instance=instance_id, + database=db_name, + ) + await async_engine._ainit_chat_history_table(table_name=table_name_async) + yield async_engine + # use default table for AsyncPostgresChatMessageHistory + query = f'DROP TABLE IF EXISTS "{table_name_async}"' + await aexecute(async_engine, query) + await async_engine.close() + + +@pytest.mark.asyncio +async def test_chat_message_history_async( + async_engine: PostgresEngine, +) -> None: + history = await AsyncPostgresChatMessageHistory.create( + engine=async_engine, session_id="test", table_name=table_name_async + ) + msg1 = HumanMessage(content="hi!") + msg2 = AIMessage(content="whats up?") + await history.aadd_message(msg1) + await history.aadd_message(msg2) + messages = await history._aget_messages() + + # verify messages are correct + assert messages[0].content == "hi!" + assert type(messages[0]) is HumanMessage + assert messages[1].content == "whats up?" 
+ assert type(messages[1]) is AIMessage + + # verify clear() clears message history + await history.aclear() + assert len(await history._aget_messages()) == 0 + + +@pytest.mark.asyncio +async def test_chat_message_history_sync_messages( + async_engine: PostgresEngine, +) -> None: + history1 = await AsyncPostgresChatMessageHistory.create( + engine=async_engine, session_id="test", table_name=table_name_async + ) + history2 = await AsyncPostgresChatMessageHistory.create( + engine=async_engine, session_id="test", table_name=table_name_async + ) + msg1 = HumanMessage(content="hi!") + msg2 = AIMessage(content="whats up?") + await history1.aadd_message(msg1) + await history2.aadd_message(msg2) + + assert len(await history1._aget_messages()) == 2 + assert len(await history2._aget_messages()) == 2 + + # verify clear() clears message history + await history2.aclear() + assert len(await history2._aget_messages()) == 0 + + +@pytest.mark.asyncio +async def test_chat_table_async(async_engine): + with pytest.raises(ValueError): + await AsyncPostgresChatMessageHistory.create( + engine=async_engine, session_id="test", table_name="doesnotexist" + ) + + +@pytest.mark.asyncio +async def test_chat_schema_async(async_engine): + table_name = "test_table" + str(uuid.uuid4()) + await async_engine._ainit_document_table(table_name=table_name) + with pytest.raises(IndexError): + await AsyncPostgresChatMessageHistory.create( + engine=async_engine, session_id="test", table_name=table_name + ) + + query = f'DROP TABLE IF EXISTS "{table_name}"' + await aexecute(async_engine, query) diff --git a/tests/test_async_loader.py b/tests/test_async_loader.py new file mode 100644 index 00000000..c29a82f7 --- /dev/null +++ b/tests/test_async_loader.py @@ -0,0 +1,757 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
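The chat-history tests above pin down the AsyncPostgresChatMessageHistory contract: create() fails on a missing or mis-shaped table, aadd_message appends to a session, _aget_messages reads it back, and aclear empties it. The following usage sketch is distilled from those tests; it leans on the same private helpers the tests call (_ainit_chat_history_table, _aget_messages) and assumes the same PROJECT_ID/REGION/INSTANCE_ID/DATABASE_ID environment variables.

import asyncio
import os

from langchain_core.messages.ai import AIMessage
from langchain_core.messages.human import HumanMessage

from langchain_google_cloud_sql_pg import PostgresEngine
from langchain_google_cloud_sql_pg.async_chat_message_history import (
    AsyncPostgresChatMessageHistory,
)


async def main() -> None:
    engine = await PostgresEngine.afrom_instance(
        project_id=os.environ["PROJECT_ID"],
        region=os.environ["REGION"],
        instance=os.environ["INSTANCE_ID"],
        database=os.environ["DATABASE_ID"],
    )
    # The table must exist before create(); the tests provision it the same way.
    await engine._ainit_chat_history_table(table_name="message_store")
    history = await AsyncPostgresChatMessageHistory.create(
        engine=engine, session_id="demo", table_name="message_store"
    )
    await history.aadd_message(HumanMessage(content="hi!"))
    await history.aadd_message(AIMessage(content="hello"))
    print(await history._aget_messages())
    await history.aclear()
    await engine.close()


asyncio.run(main())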
+ +import json +import os +import uuid + +import pytest +import pytest_asyncio +from langchain_core.documents import Document +from sqlalchemy import text + +from langchain_google_cloud_sql_pg import Column, PostgresEngine +from langchain_google_cloud_sql_pg.async_loader import ( + AsyncPostgresDocumentSaver, + AsyncPostgresLoader, +) + +project_id = os.environ["PROJECT_ID"] +region = os.environ["REGION"] +instance_id = os.environ["INSTANCE_ID"] +db_name = os.environ["DATABASE_ID"] +table_name = "test-table" + str(uuid.uuid4()) + + +async def aexecute(engine: PostgresEngine, query: str) -> None: + async with engine._pool.connect() as conn: + await conn.execute(text(query)) + await conn.commit() + + +@pytest.mark.asyncio(scope="class") +class TestLoaderAsync: + + @pytest_asyncio.fixture(scope="class") + async def engine(self): + PostgresEngine._connector = None + engine = await PostgresEngine.afrom_instance( + project_id=project_id, + instance=instance_id, + region=region, + database=db_name, + ) + yield engine + + await engine.close() + + async def _collect_async_items(self, docs_generator): + """Collects items from an async generator.""" + docs = [] + async for doc in docs_generator: + docs.append(doc) + return docs + + async def _cleanup_table(self, engine): + await aexecute(engine, f'DROP TABLE IF EXISTS "{table_name}"') + + async def test_create_loader_with_invalid_parameters(self, engine): + with pytest.raises(ValueError): + await AsyncPostgresLoader.create( + engine=engine, + ) + with pytest.raises(ValueError): + + def fake_formatter(): + return None + + await AsyncPostgresLoader.create( + engine=engine, + table_name=table_name, + format="text", + formatter=fake_formatter, + ) + with pytest.raises(ValueError): + await AsyncPostgresLoader.create( + engine=engine, + table_name=table_name, + format="fake_format", + ) + + async def test_load_from_query_default(self, engine): + table_name = "test-table" + str(uuid.uuid4()) + query = f""" + CREATE TABLE IF NOT EXISTS "{table_name}" ( + fruit_id SERIAL PRIMARY KEY, + fruit_name VARCHAR(100) NOT NULL, + variety VARCHAR(50), + quantity_in_stock INT NOT NULL, + price_per_unit INT NOT NULL, + organic INT NOT NULL + ) + """ + await aexecute(engine, query) + + insert_query = f""" + INSERT INTO "{table_name}" ( + fruit_name, variety, quantity_in_stock, price_per_unit, organic + ) VALUES ('Apple', 'Granny Smith', 150, 1, 1); + """ + await aexecute(engine, insert_query) + + loader = await AsyncPostgresLoader.create( + engine=engine, + table_name=table_name, + ) + + documents = await self._collect_async_items(loader.alazy_load()) + + assert documents == [ + Document( + page_content="1", + metadata={ + "fruit_name": "Apple", + "variety": "Granny Smith", + "quantity_in_stock": 150, + "price_per_unit": 1, + "organic": 1, + }, + ) + ] + await aexecute(engine, f'DROP TABLE IF EXISTS "{table_name}"') + + async def test_load_from_query_customized_content_customized_metadata(self, engine): + await self._cleanup_table(engine) + query = f""" + CREATE TABLE IF NOT EXISTS "{table_name}" ( + fruit_id SERIAL PRIMARY KEY, + fruit_name VARCHAR(100) NOT NULL, + variety VARCHAR(50), + quantity_in_stock INT NOT NULL, + price_per_unit INT NOT NULL, + organic INT NOT NULL + ) + """ + await aexecute(engine, query) + + insert_query = f""" + INSERT INTO "{table_name}" (fruit_name, variety, quantity_in_stock, price_per_unit, organic) + VALUES ('Apple', 'Granny Smith', 150, 0.99, 1), + ('Banana', 'Cavendish', 200, 0.59, 0), + ('Orange', 'Navel', 80, 1.29, 1); + """ + await 
aexecute(engine, insert_query) + + loader = await AsyncPostgresLoader.create( + engine=engine, + query=f'SELECT * FROM "{table_name}";', + content_columns=[ + "fruit_name", + "variety", + "quantity_in_stock", + "price_per_unit", + "organic", + ], + metadata_columns=["fruit_id"], + ) + + documents = await self._collect_async_items(loader.alazy_load()) + + assert documents == [ + Document( + page_content="Apple Granny Smith 150 1 1", + metadata={"fruit_id": 1}, + ), + Document( + page_content="Banana Cavendish 200 1 0", + metadata={"fruit_id": 2}, + ), + Document( + page_content="Orange Navel 80 1 1", + metadata={"fruit_id": 3}, + ), + ] + + await self._cleanup_table(engine) + + async def test_load_from_query_customized_content_default_metadata(self, engine): + await self._cleanup_table(engine) + query = f""" + CREATE TABLE IF NOT EXISTS "{table_name}" ( + fruit_id SERIAL PRIMARY KEY, + fruit_name VARCHAR(100) NOT NULL, + variety VARCHAR(50), + quantity_in_stock INT NOT NULL, + price_per_unit INT NOT NULL, + organic INT NOT NULL + ) + """ + await aexecute(engine, query) + + insert_query = f""" + INSERT INTO "{table_name}" (fruit_name, variety, quantity_in_stock, price_per_unit, organic) + VALUES ('Apple', 'Granny Smith', 150, 1, 1); + """ + await aexecute(engine, insert_query) + + loader = await AsyncPostgresLoader.create( + engine=engine, + query=f'SELECT * FROM "{table_name}";', + content_columns=[ + "variety", + "quantity_in_stock", + "price_per_unit", + ], + ) + + documents = [] + async for docs in loader.alazy_load(): + documents.append(docs) + + assert documents == [ + Document( + page_content="Granny Smith 150 1", + metadata={ + "fruit_id": 1, + "fruit_name": "Apple", + "organic": 1, + }, + ) + ] + + loader = await AsyncPostgresLoader.create( + engine=engine, + query=f'SELECT * FROM "{table_name}";', + content_columns=[ + "variety", + "quantity_in_stock", + "price_per_unit", + ], + format="JSON", + ) + + documents = await self._collect_async_items(loader.alazy_load()) + + assert documents == [ + Document( + page_content='{"variety": "Granny Smith", "quantity_in_stock": 150, "price_per_unit": 1}', + metadata={ + "fruit_id": 1, + "fruit_name": "Apple", + "organic": 1, + }, + ) + ] + await self._cleanup_table(engine) + + async def test_load_from_query_default_content_customized_metadata(self, engine): + await self._cleanup_table(engine) + query = f""" + CREATE TABLE IF NOT EXISTS "{table_name}" ( + fruit_id SERIAL PRIMARY KEY, + fruit_name VARCHAR(100) NOT NULL, + variety VARCHAR(50), + quantity_in_stock INT NOT NULL, + price_per_unit INT NOT NULL, + organic INT NOT NULL + ) + """ + await aexecute(engine, query) + + insert_query = f""" + INSERT INTO "{table_name}" ( + fruit_name, + variety, + quantity_in_stock, + price_per_unit, + organic + ) VALUES ('Apple', 'Granny Smith', 150, 1, 1); + """ + await aexecute(engine, insert_query) + + loader = await AsyncPostgresLoader.create( + engine=engine, + query=f'SELECT * FROM "{table_name}";', + metadata_columns=["fruit_name", "organic"], + ) + + documents = await self._collect_async_items(loader.alazy_load()) + + assert documents == [ + Document( + page_content="1", + metadata={"fruit_name": "Apple", "organic": 1}, + ) + ] + await self._cleanup_table(engine) + + async def test_load_from_query_with_langchain_metadata(self, engine): + table_name = "test-table" + str(uuid.uuid4()) + query = f""" + CREATE TABLE IF NOT EXISTS "{table_name}"( + fruit_id SERIAL PRIMARY KEY, + fruit_name VARCHAR(100) NOT NULL, + variety VARCHAR(50), + quantity_in_stock 
INT NOT NULL, + price_per_unit INT NOT NULL, + langchain_metadata JSON NOT NULL + ) + """ + await aexecute(engine, query) + + metadata = json.dumps({"organic": 1}) + insert_query = f""" + INSERT INTO "{table_name}" + (fruit_name, variety, quantity_in_stock, price_per_unit, langchain_metadata) + VALUES ('Apple', 'Granny Smith', 150, 1, '{metadata}');""" + await aexecute(engine, insert_query) + + loader = await AsyncPostgresLoader.create( + engine=engine, + query=f'SELECT * FROM "{table_name}";', + metadata_columns=[ + "fruit_name", + "langchain_metadata", + ], + ) + + documents = await self._collect_async_items(loader.alazy_load()) + + assert documents == [ + Document( + page_content="1", + metadata={ + "fruit_name": "Apple", + "organic": 1, + }, + ) + ] + await aexecute(engine, f'DROP TABLE IF EXISTS "{table_name}"') + + async def test_load_from_query_with_json(self, engine): + + table_name = "test-table" + str(uuid.uuid4()) + query = f""" + CREATE TABLE IF NOT EXISTS "{table_name}"( + fruit_id SERIAL PRIMARY KEY, + fruit_name VARCHAR(100) NOT NULL, + variety JSON NOT NULL, + quantity_in_stock INT NOT NULL, + price_per_unit INT NOT NULL, + langchain_metadata JSON NOT NULL + ) + """ + await aexecute(engine, query) + + metadata = json.dumps({"organic": 1}) + variety = json.dumps({"type": "Granny Smith"}) + insert_query = f""" + INSERT INTO "{table_name}" + (fruit_name, variety, quantity_in_stock, price_per_unit, langchain_metadata) + VALUES ('Apple', '{variety}', 150, 1, '{metadata}');""" + await aexecute(engine, insert_query) + + loader = await AsyncPostgresLoader.create( + engine=engine, + query=f'SELECT * FROM "{table_name}";', + metadata_columns=[ + "variety", + ], + ) + + documents = await self._collect_async_items(loader.alazy_load()) + + assert documents == [ + Document( + page_content="1", + metadata={ + "variety": {"type": "Granny Smith"}, + "organic": 1, + }, + ) + ] + await aexecute(engine, f'DROP TABLE IF EXISTS "{table_name}"') + + async def test_load_from_query_customized_content_default_metadata_custom_formatter( + self, engine + ): + + table_name = "test-table" + str(uuid.uuid4()) + query = f""" + CREATE TABLE IF NOT EXISTS "{table_name}" ( + fruit_id SERIAL PRIMARY KEY, + fruit_name VARCHAR(100) NOT NULL, + variety VARCHAR(50), + quantity_in_stock INT NOT NULL, + price_per_unit INT NOT NULL, + organic INT NOT NULL + ) + """ + await aexecute(engine, query) + + insert_query = f""" + INSERT INTO "{table_name}" (fruit_name, variety, quantity_in_stock, price_per_unit, organic) + VALUES ('Apple', 'Granny Smith', 150, 1, 1); + """ + await aexecute(engine, insert_query) + + def my_formatter(row, content_columns): + return "-".join( + str(row[column]) for column in content_columns if column in row + ) + + loader = await AsyncPostgresLoader.create( + engine=engine, + query=f'SELECT * FROM "{table_name}";', + content_columns=[ + "variety", + "quantity_in_stock", + "price_per_unit", + ], + formatter=my_formatter, + ) + + documents = await self._collect_async_items(loader.alazy_load()) + + assert documents == [ + Document( + page_content="Granny Smith-150-1", + metadata={ + "fruit_id": 1, + "fruit_name": "Apple", + "organic": 1, + }, + ) + ] + await aexecute(engine, f'DROP TABLE IF EXISTS "{table_name}"') + + async def test_load_from_query_customized_content_default_metadata_custom_page_content_format( + self, engine + ): + await self._cleanup_table(engine) + query = f""" + CREATE TABLE IF NOT EXISTS "{table_name}" ( + fruit_id SERIAL PRIMARY KEY, + fruit_name VARCHAR(100) NOT NULL, + 
variety VARCHAR(50),
+            quantity_in_stock INT NOT NULL,
+            price_per_unit INT NOT NULL,
+            organic INT NOT NULL
+        )
+        """
+        await aexecute(engine, query)
+
+        insert_query = f"""
+        INSERT INTO "{table_name}" (fruit_name, variety, quantity_in_stock, price_per_unit, organic)
+        VALUES ('Apple', 'Granny Smith', 150, 1, 1);
+        """
+        await aexecute(engine, insert_query)
+
+        loader = await AsyncPostgresLoader.create(
+            engine=engine,
+            query=f'SELECT * FROM "{table_name}";',
+            content_columns=[
+                "variety",
+                "quantity_in_stock",
+                "price_per_unit",
+            ],
+            format="YAML",
+        )
+
+        documents = await self._collect_async_items(loader.alazy_load())
+
+        assert documents == [
+            Document(
+                page_content="variety: Granny Smith\nquantity_in_stock: 150\nprice_per_unit: 1",
+                metadata={
+                    "fruit_id": 1,
+                    "fruit_name": "Apple",
+                    "organic": 1,
+                },
+            )
+        ]
+
+        await self._cleanup_table(engine)
+
+    async def test_save_doc_with_default_metadata(self, engine):
+
+        await self._cleanup_table(engine)
+        await engine._ainit_document_table(table_name)
+        test_docs = [
+            Document(
+                page_content="Apple Granny Smith 150 0.99 1",
+                metadata={"fruit_id": 1},
+            ),
+            Document(
+                page_content="Banana Cavendish 200 0.59 0",
+                metadata={"fruit_id": 2},
+            ),
+            Document(
+                page_content="Orange Navel 80 1.29 1",
+                metadata={"fruit_id": 3},
+            ),
+        ]
+        saver = await AsyncPostgresDocumentSaver.create(
+            engine=engine, table_name=table_name
+        )
+        loader = await AsyncPostgresLoader.create(engine=engine, table_name=table_name)
+
+        await saver.aadd_documents(test_docs)
+        docs = await self._collect_async_items(loader.alazy_load())
+
+        assert docs == test_docs
+        assert (await engine._aload_table_schema(table_name)).columns.keys() == [
+            "page_content",
+            "langchain_metadata",
+        ]
+        await self._cleanup_table(engine)
+
+    @pytest.mark.parametrize("store_metadata", [True, False])
+    async def test_save_doc_with_customized_metadata(self, engine, store_metadata):
+        table_name = "test-table" + str(uuid.uuid4())
+        await engine._ainit_document_table(
+            table_name,
+            metadata_columns=[
+                Column("fruit_name", "VARCHAR"),
+                Column("organic", "BOOLEAN"),
+            ],
+            store_metadata=store_metadata,
+        )
+        test_docs = [
+            Document(
+                page_content="Granny Smith 150 0.99",
+                metadata={
+                    "fruit_id": 1,
+                    "fruit_name": "Apple",
+                    "organic": True,
+                },
+            ),
+        ]
+        saver = await AsyncPostgresDocumentSaver.create(
+            engine=engine, table_name=table_name
+        )
+        loader = await AsyncPostgresLoader.create(
+            engine=engine,
+            table_name=table_name,
+            metadata_columns=[
+                "fruit_name",
+                "organic",
+            ],
+        )
+
+        await saver.aadd_documents(test_docs)
+        docs = await self._collect_async_items(loader.alazy_load())
+
+        if store_metadata:
+            assert docs == test_docs
+            assert (await engine._aload_table_schema(table_name)).columns.keys() == [
+                "page_content",
+                "fruit_name",
+                "organic",
+                "langchain_metadata",
+            ]
+        else:
+            assert docs == [
+                Document(
+                    page_content="Granny Smith 150 0.99",
+                    metadata={"fruit_name": "Apple", "organic": True},
+                ),
+            ]
+            assert (await engine._aload_table_schema(table_name)).columns.keys() == [
+                "page_content",
+                "fruit_name",
+                "organic",
+            ]
+        await aexecute(engine, f'DROP TABLE IF EXISTS "{table_name}"')
+
+    async def test_save_doc_without_metadata(self, engine):
+        table_name = "test-table" + str(uuid.uuid4())
+        await engine._ainit_document_table(table_name, store_metadata=False)
+        test_docs = [
+            Document(
+                page_content="Granny Smith 150 0.99",
+                metadata={
+                    "fruit_id": 1,
+                    "fruit_name": "Apple",
+                    "organic": 1,
+                },
+            ),
+        ]
+        saver = await 
AsyncPostgresDocumentSaver.create( + engine=engine, table_name=table_name + ) + await saver.aadd_documents(test_docs) + + loader = await AsyncPostgresLoader.create( + engine=engine, + table_name=table_name, + ) + + docs = await self._collect_async_items(loader.alazy_load()) + + assert docs == [ + Document( + page_content="Granny Smith 150 0.99", + metadata={}, + ), + ] + assert (await engine._aload_table_schema(table_name)).columns.keys() == [ + "page_content", + ] + await aexecute(engine, f'DROP TABLE IF EXISTS "{table_name}"') + + async def test_delete_doc_with_default_metadata(self, engine): + table_name = "test-table" + str(uuid.uuid4()) + await engine._ainit_document_table(table_name) + + test_docs = [ + Document( + page_content="Apple Granny Smith 150 0.99 1", + metadata={"fruit_id": 1}, + ), + Document( + page_content="Banana Cavendish 200 0.59 0 1", + metadata={"fruit_id": 2}, + ), + ] + saver = await AsyncPostgresDocumentSaver.create( + engine=engine, table_name=table_name + ) + loader = await AsyncPostgresLoader.create(engine=engine, table_name=table_name) + + await saver.aadd_documents(test_docs) + docs = await self._collect_async_items(loader.alazy_load()) + assert docs == test_docs + + await saver.adelete(docs[:1]) + assert len(await self._collect_async_items(loader.alazy_load())) == 1 + + await saver.adelete(docs) + assert len(await self._collect_async_items(loader.alazy_load())) == 0 + await aexecute(engine, f'DROP TABLE IF EXISTS "{table_name}"') + + async def test_delete_doc_with_query(self, engine): + await self._cleanup_table(engine) + await engine._ainit_document_table( + table_name, + metadata_columns=[ + Column( + "fruit_name", + "VARCHAR", + ), + Column( + "organic", + "BOOLEAN", + ), + ], + store_metadata=True, + ) + + test_docs = [ + Document( + page_content="Granny Smith 150 0.99", + metadata={ + "fruit-id": 1, + "fruit_name": "Apple", + "organic": True, + }, + ), + Document( + page_content="Cavendish 200 0.59 0", + metadata={ + "fruit_id": 2, + "fruit_name": "Banana", + "organic": False, + }, + ), + Document( + page_content="Navel 80 1.29 1", + metadata={ + "fruit_id": 3, + "fruit_name": "Orange", + "organic": True, + }, + ), + ] + saver = await AsyncPostgresDocumentSaver.create( + engine=engine, table_name=table_name + ) + query = f"SELECT * FROM \"{table_name}\" WHERE fruit_name='Apple';" + loader = await AsyncPostgresLoader.create(engine=engine, query=query) + + await saver.aadd_documents(test_docs) + docs = await self._collect_async_items(loader.alazy_load()) + assert len(docs) == 1 + + await saver.adelete(docs) + assert len(await self._collect_async_items(loader.alazy_load())) == 0 + await self._cleanup_table(engine) + + @pytest.mark.parametrize("metadata_json_column", [None, "metadata_col_test"]) + async def test_delete_doc_with_customized_metadata( + self, engine, metadata_json_column + ): + table_name = "test-table" + str(uuid.uuid4()) + content_column = "content_col_test" + await engine._ainit_document_table( + table_name, + metadata_columns=[ + Column("fruit_name", "VARCHAR"), + Column("organic", "BOOLEAN"), + ], + content_column=content_column, + metadata_json_column=metadata_json_column, + ) + test_docs = [ + Document( + page_content="Granny Smith 150 0.99", + metadata={ + "fruit-id": 1, + "fruit_name": "Apple", + "organic": True, + }, + ), + Document( + page_content="Cavendish 200 0.59 0", + metadata={ + "fruit_id": 2, + "fruit_name": "Banana", + "organic": True, + }, + ), + ] + saver = await AsyncPostgresDocumentSaver.create( + engine=engine, + 
table_name=table_name, + content_column=content_column, + metadata_json_column=metadata_json_column, + ) + loader = await AsyncPostgresLoader.create( + engine=engine, + table_name=table_name, + content_columns=[content_column], + metadata_json_column=metadata_json_column, + ) + + await saver.aadd_documents(test_docs) + + docs = await loader.aload() + assert len(docs) == 2 + + await saver.adelete(docs[:1]) + assert len(await self._collect_async_items(loader.alazy_load())) == 1 + + await saver.adelete(docs) + assert len(await self._collect_async_items(loader.alazy_load())) == 0 + await aexecute(engine, f'DROP TABLE IF EXISTS "{table_name}"') diff --git a/tests/test_cloudsql_vectorstore.py b/tests/test_async_vectorstore.py similarity index 67% rename from tests/test_cloudsql_vectorstore.py rename to tests/test_async_vectorstore.py index 081853a8..52016149 100644 --- a/tests/test_cloudsql_vectorstore.py +++ b/tests/test_async_vectorstore.py @@ -14,13 +14,17 @@ import os import uuid +from typing import Sequence import pytest import pytest_asyncio from langchain_core.documents import Document from langchain_core.embeddings import DeterministicFakeEmbedding +from sqlalchemy import text +from sqlalchemy.engine.row import RowMapping -from langchain_google_cloud_sql_pg import Column, PostgresEngine, PostgresVectorStore +from langchain_google_cloud_sql_pg import Column, PostgresEngine +from langchain_google_cloud_sql_pg.async_vectorstore import AsyncPostgresVectorStore DEFAULT_TABLE = "test_table" + str(uuid.uuid4()) DEFAULT_TABLE_SYNC = "test_table_sync" + str(uuid.uuid4()) @@ -45,6 +49,20 @@ def get_env_var(key: str, desc: str) -> str: return v +async def aexecute(engine: PostgresEngine, query: str) -> None: + async with engine._pool.connect() as conn: + await conn.execute(text(query)) + await conn.commit() + + +async def afetch(engine: PostgresEngine, query: str) -> Sequence[RowMapping]: + async with engine._pool.connect() as conn: + result = await conn.execute(text(query)) + result_map = result.mappings() + result_fetch = result_map.fetchall() + return result_fetch + + @pytest.mark.asyncio(scope="class") class TestVectorStore: @pytest.fixture(scope="module") @@ -73,45 +91,23 @@ async def engine(self, db_project, db_region, db_instance, db_name): ) yield engine - - @pytest_asyncio.fixture(scope="class") - def engine_sync(self, db_project, db_region, db_instance, db_name): - engine = PostgresEngine.from_instance( - project_id=db_project, - instance=db_instance, - region=db_region, - database=db_name, - ) - yield engine - - @pytest_asyncio.fixture(scope="class") - def vs_sync(self, engine_sync): - engine_sync.init_vectorstore_table(DEFAULT_TABLE_SYNC, VECTOR_SIZE) - - vs = PostgresVectorStore.create_sync( - engine_sync, - embedding_service=embeddings_service, - table_name=DEFAULT_TABLE_SYNC, - ) - yield vs - engine_sync._execute(f'DROP TABLE IF EXISTS "{DEFAULT_TABLE_SYNC}"') - engine_sync._engine.dispose() + await aexecute(engine, f'DROP TABLE IF EXISTS "{DEFAULT_TABLE}"') + await aexecute(engine, f'DROP TABLE IF EXISTS "{CUSTOM_TABLE}"') + await engine.close() @pytest_asyncio.fixture(scope="class") async def vs(self, engine): - await engine.ainit_vectorstore_table(DEFAULT_TABLE, VECTOR_SIZE) - vs = await PostgresVectorStore.create( + await engine._ainit_vectorstore_table(DEFAULT_TABLE, VECTOR_SIZE) + vs = await AsyncPostgresVectorStore.create( engine, embedding_service=embeddings_service, table_name=DEFAULT_TABLE, ) yield vs - await engine._aexecute(f'DROP TABLE IF EXISTS "{DEFAULT_TABLE}"') - 
await engine._engine.dispose() @pytest_asyncio.fixture(scope="class") async def vs_custom(self, engine): - await engine.ainit_vectorstore_table( + await engine._ainit_vectorstore_table( CUSTOM_TABLE, VECTOR_SIZE, id_column="myid", @@ -120,7 +116,7 @@ async def vs_custom(self, engine): metadata_columns=[Column("page", "TEXT"), Column("source", "TEXT")], metadata_json_column="mymeta", ) - vs = await PostgresVectorStore.create( + vs = await AsyncPostgresVectorStore.create( engine, embedding_service=embeddings_service, table_name=CUSTOM_TABLE, @@ -131,11 +127,10 @@ async def vs_custom(self, engine): metadata_json_column="mymeta", ) yield vs - await engine._aexecute(f'DROP TABLE IF EXISTS "{CUSTOM_TABLE}"') async def test_init_with_constructor(self, engine): with pytest.raises(Exception): - PostgresVectorStore( + AsyncPostgresVectorStore( engine, embedding_service=embeddings_service, table_name=CUSTOM_TABLE, @@ -148,7 +143,7 @@ async def test_init_with_constructor(self, engine): async def test_post_init(self, engine): with pytest.raises(ValueError): - await PostgresVectorStore.create( + await AsyncPostgresVectorStore.create( engine, embedding_service=embeddings_service, table_name=CUSTOM_TABLE, @@ -162,58 +157,55 @@ async def test_post_init(self, engine): async def test_aadd_texts(self, engine, vs): ids = [str(uuid.uuid4()) for i in range(len(texts))] await vs.aadd_texts(texts, ids=ids) - results = await engine._afetch(f'SELECT * FROM "{DEFAULT_TABLE}"') + results = await afetch(engine, f'SELECT * FROM "{DEFAULT_TABLE}"') assert len(results) == 3 ids = [str(uuid.uuid4()) for i in range(len(texts))] await vs.aadd_texts(texts, metadatas, ids) - results = await engine._afetch(f'SELECT * FROM "{DEFAULT_TABLE}"') + results = await afetch(engine, f'SELECT * FROM "{DEFAULT_TABLE}"') assert len(results) == 6 - await engine._aexecute(f'TRUNCATE TABLE "{DEFAULT_TABLE}"') + await aexecute(engine, f'TRUNCATE TABLE "{DEFAULT_TABLE}"') async def test_aadd_texts_edge_cases(self, engine, vs): texts = ["Taylor's", '"Swift"', "best-friend"] ids = [str(uuid.uuid4()) for i in range(len(texts))] await vs.aadd_texts(texts, ids=ids) - results = await engine._afetch(f'SELECT * FROM "{DEFAULT_TABLE}"') + results = await afetch(engine, f'SELECT * FROM "{DEFAULT_TABLE}"') assert len(results) == 3 - await engine._aexecute(f'TRUNCATE TABLE "{DEFAULT_TABLE}"') + await aexecute(engine, f'TRUNCATE TABLE "{DEFAULT_TABLE}"') async def test_aadd_docs(self, engine, vs): ids = [str(uuid.uuid4()) for i in range(len(texts))] await vs.aadd_documents(docs, ids=ids) - results = await engine._afetch(f'SELECT * FROM "{DEFAULT_TABLE}"') - assert len(results) == 3 - await engine._aexecute(f'TRUNCATE TABLE "{DEFAULT_TABLE}"') - - async def test_aadd_embedding(self, engine, vs): - ids = [str(uuid.uuid4()) for i in range(len(texts))] - await vs._aadd_embeddings(texts, embeddings, metadatas, ids) - results = await engine._afetch(f'SELECT * FROM "{DEFAULT_TABLE}"') + results = await afetch(engine, f'SELECT * FROM "{DEFAULT_TABLE}"') assert len(results) == 3 - await engine._aexecute(f'TRUNCATE TABLE "{DEFAULT_TABLE}"') + await aexecute(engine, f'TRUNCATE TABLE "{DEFAULT_TABLE}"') - async def test_aadd_embedding_without_id(self, engine, vs): - await vs._aadd_embeddings(texts, embeddings, metadatas) - results = await engine._afetch(f'SELECT * FROM "{DEFAULT_TABLE}"') + async def test_aadd_docs_no_ids(self, engine, vs): + await vs.aadd_documents(docs) + results = await afetch(engine, f'SELECT * FROM "{DEFAULT_TABLE}"') assert len(results) == 3 - 
assert results[0]["langchain_id"] - await engine._aexecute(f'TRUNCATE TABLE "{DEFAULT_TABLE}"') + await aexecute(engine, f'TRUNCATE TABLE "{DEFAULT_TABLE}"') async def test_adelete(self, engine, vs): ids = [str(uuid.uuid4()) for i in range(len(texts))] await vs.aadd_texts(texts, ids=ids) - results = await engine._afetch(f'SELECT * FROM "{DEFAULT_TABLE}"') + results = await afetch(engine, f'SELECT * FROM "{DEFAULT_TABLE}"') assert len(results) == 3 # delete an ID await vs.adelete([ids[0]]) - results = await engine._afetch(f'SELECT * FROM "{DEFAULT_TABLE}"') + results = await afetch(engine, f'SELECT * FROM "{DEFAULT_TABLE}"') assert len(results) == 2 + # delete with no ids + result = await vs.adelete() + assert result == False + + ##### Custom Vector Store ##### async def test_aadd_texts_custom(self, engine, vs_custom): ids = [str(uuid.uuid4()) for i in range(len(texts))] await vs_custom.aadd_texts(texts, ids=ids) - results = await engine._afetch(f'SELECT * FROM "{CUSTOM_TABLE}"') + results = await afetch(engine, f'SELECT * FROM "{CUSTOM_TABLE}"') assert len(results) == 3 assert results[0]["mycontent"] == "foo" assert results[0]["myembedding"] @@ -222,9 +214,9 @@ async def test_aadd_texts_custom(self, engine, vs_custom): ids = [str(uuid.uuid4()) for i in range(len(texts))] await vs_custom.aadd_texts(texts, metadatas, ids) - results = await engine._afetch(f'SELECT * FROM "{CUSTOM_TABLE}"') + results = await afetch(engine, f'SELECT * FROM "{CUSTOM_TABLE}"') assert len(results) == 6 - await engine._aexecute(f'TRUNCATE TABLE "{CUSTOM_TABLE}"') + await aexecute(engine, f'TRUNCATE TABLE "{CUSTOM_TABLE}"') async def test_aadd_docs_custom(self, engine, vs_custom): ids = [str(uuid.uuid4()) for i in range(len(texts))] @@ -237,51 +229,32 @@ async def test_aadd_docs_custom(self, engine, vs_custom): ] await vs_custom.aadd_documents(docs, ids=ids) - results = await engine._afetch(f'SELECT * FROM "{CUSTOM_TABLE}"') + results = await afetch(engine, f'SELECT * FROM "{CUSTOM_TABLE}"') assert len(results) == 3 assert results[0]["mycontent"] == "foo" assert results[0]["myembedding"] assert results[0]["page"] == "0" assert results[0]["source"] == "google.com" - await engine._aexecute(f'TRUNCATE TABLE "{CUSTOM_TABLE}"') - - async def test_aadd_embedding_custom(self, engine, vs_custom): - ids = [str(uuid.uuid4()) for i in range(len(texts))] - await vs_custom._aadd_embeddings(texts, embeddings, metadatas, ids) - results = await engine._afetch(f'SELECT * FROM "{CUSTOM_TABLE}"') - assert len(results) == 3 - await engine._aexecute(f'TRUNCATE TABLE "{CUSTOM_TABLE}"') + await aexecute(engine, f'TRUNCATE TABLE "{CUSTOM_TABLE}"') async def test_adelete_custom(self, engine, vs_custom): ids = [str(uuid.uuid4()) for i in range(len(texts))] await vs_custom.aadd_texts(texts, ids=ids) - results = await engine._afetch(f'SELECT * FROM "{CUSTOM_TABLE}"') + results = await afetch(engine, f'SELECT * FROM "{CUSTOM_TABLE}"') content = [result["mycontent"] for result in results] assert len(results) == 3 assert "foo" in content # delete an ID await vs_custom.adelete([ids[0]]) - results = await engine._afetch(f'SELECT * FROM "{CUSTOM_TABLE}"') + results = await afetch(engine, f'SELECT * FROM "{CUSTOM_TABLE}"') content = [result["mycontent"] for result in results] assert len(results) == 2 assert "foo" not in content - async def test_add_docs(self, engine_sync, vs_sync): - ids = [str(uuid.uuid4()) for i in range(len(texts))] - vs_sync.add_documents(docs, ids=ids) - results = engine_sync._fetch(f'SELECT * FROM "{DEFAULT_TABLE_SYNC}"') - 
assert len(results) == 3 - - async def test_add_texts(self, engine_sync, vs_sync): - ids = [str(uuid.uuid4()) for i in range(len(texts))] - vs_sync.add_texts(texts, ids=ids) - results = engine_sync._fetch(f'SELECT * FROM "{DEFAULT_TABLE_SYNC}"') - assert len(results) == 6 - - async def test_ignore_metadata_columns(self, vs_custom): + async def test_ignore_metadata_columns(self, engine): column_to_ignore = "source" - vs = await PostgresVectorStore.create( - vs_custom.engine, + vs = await AsyncPostgresVectorStore.create( + engine, embedding_service=embeddings_service, table_name=CUSTOM_TABLE, ignore_metadata_columns=[column_to_ignore], @@ -292,10 +265,10 @@ async def test_ignore_metadata_columns(self, vs_custom): ) assert column_to_ignore not in vs.metadata_columns - async def test_create_vectorstore_with_invalid_parameters(self, vs_custom): + async def test_create_vectorstore_with_invalid_parameters_1(self, engine): with pytest.raises(ValueError): - await PostgresVectorStore.create( - vs_custom.engine, + await AsyncPostgresVectorStore.create( + engine, embedding_service=embeddings_service, table_name=CUSTOM_TABLE, id_column="myid", @@ -303,9 +276,11 @@ async def test_create_vectorstore_with_invalid_parameters(self, vs_custom): embedding_column="myembedding", metadata_columns=["random_column"], # invalid metadata column ) + + async def test_create_vectorstore_with_invalid_parameters_2(self, engine): with pytest.raises(ValueError): - await PostgresVectorStore.create( - vs_custom.engine, + await AsyncPostgresVectorStore.create( + engine, embedding_service=embeddings_service, table_name=CUSTOM_TABLE, id_column="myid", @@ -313,9 +288,11 @@ async def test_create_vectorstore_with_invalid_parameters(self, vs_custom): embedding_column="myembedding", metadata_columns=["random_column"], ) + + async def test_create_vectorstore_with_invalid_parameters_3(self, engine): with pytest.raises(ValueError): - await PostgresVectorStore.create( - vs_custom.engine, + await AsyncPostgresVectorStore.create( + engine, embedding_service=embeddings_service, table_name=CUSTOM_TABLE, id_column="myid", @@ -323,9 +300,11 @@ async def test_create_vectorstore_with_invalid_parameters(self, vs_custom): embedding_column="random_column", # invalid embedding column metadata_columns=["random_column"], ) + + async def test_create_vectorstore_with_invalid_parameters_4(self, engine): with pytest.raises(ValueError): - await PostgresVectorStore.create( - vs_custom.engine, + await AsyncPostgresVectorStore.create( + engine, embedding_service=embeddings_service, table_name=CUSTOM_TABLE, id_column="myid", @@ -334,4 +313,30 @@ async def test_create_vectorstore_with_invalid_parameters(self, vs_custom): metadata_columns=["random_column"], ) - # Need tests for store metadata=False + async def test_create_vectorstore_with_invalid_parameters_5(self, engine): + with pytest.raises(ValueError): + await AsyncPostgresVectorStore.create( + engine, + embedding_service=embeddings_service, + table_name=CUSTOM_TABLE, + id_column="myid", + content_column="mycontent", + embedding_column="langchain_id", + metadata_columns=["random_column"], + ignore_metadata_columns=[ + "one", + "two", + ], # invalid use of metadata_columns and ignore columns + ) + + async def test_create_vectorstore_with_init(self, engine): + with pytest.raises(Exception): + await AsyncPostgresVectorStore( + engine._pool, + embedding_service=embeddings_service, + table_name=CUSTOM_TABLE, + id_column="myid", + content_column="mycontent", + embedding_column="myembedding", + 
metadata_columns=["random_column"], # invalid metadata column + ) diff --git a/tests/test_async_vectorstore_from_methods.py b/tests/test_async_vectorstore_from_methods.py new file mode 100644 index 00000000..fd933508 --- /dev/null +++ b/tests/test_async_vectorstore_from_methods.py @@ -0,0 +1,182 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import uuid +from typing import Sequence + +import pytest +import pytest_asyncio +from langchain_core.documents import Document +from langchain_core.embeddings import DeterministicFakeEmbedding +from sqlalchemy import text +from sqlalchemy.engine.row import RowMapping + +from langchain_google_cloud_sql_pg import Column, PostgresEngine +from langchain_google_cloud_sql_pg.async_vectorstore import AsyncPostgresVectorStore + +DEFAULT_TABLE = "test_table" + str(uuid.uuid4()).replace("-", "_") +DEFAULT_TABLE_SYNC = "test_table_sync" + str(uuid.uuid4()).replace("-", "_") +CUSTOM_TABLE = "test_table_custom" + str(uuid.uuid4()).replace("-", "_") +VECTOR_SIZE = 768 + + +embeddings_service = DeterministicFakeEmbedding(size=VECTOR_SIZE) + +texts = ["foo", "bar", "baz"] +metadatas = [{"page": str(i), "source": "google.com"} for i in range(len(texts))] +docs = [ + Document(page_content=texts[i], metadata=metadatas[i]) for i in range(len(texts)) +] + +embeddings = [embeddings_service.embed_query(texts[i]) for i in range(len(texts))] + + +def get_env_var(key: str, desc: str) -> str: + v = os.environ.get(key) + if v is None: + raise ValueError(f"Must set env var {key} to: {desc}") + return v + + +async def aexecute(engine: PostgresEngine, query: str) -> None: + async with engine._pool.connect() as conn: + await conn.execute(text(query)) + await conn.commit() + + +async def afetch(engine: PostgresEngine, query: str) -> Sequence[RowMapping]: + async with engine._pool.connect() as conn: + result = await conn.execute(text(query)) + result_map = result.mappings() + result_fetch = result_map.fetchall() + return result_fetch + + +@pytest.mark.asyncio +class TestVectorStoreFromMethods: + @pytest.fixture(scope="module") + def db_project(self) -> str: + return get_env_var("PROJECT_ID", "project id for google cloud") + + @pytest.fixture(scope="module") + def db_region(self) -> str: + return get_env_var("REGION", "region for cloud sql instance") + + @pytest.fixture(scope="module") + def db_instance(self) -> str: + return get_env_var("INSTANCE_ID", "instance for cloud sql") + + @pytest.fixture(scope="module") + def db_name(self) -> str: + return get_env_var("DATABASE_ID", "database name on cloud sql instance") + + @pytest_asyncio.fixture + async def engine(self, db_project, db_region, db_instance, db_name): + engine = await PostgresEngine.afrom_instance( + project_id=db_project, + instance=db_instance, + region=db_region, + database=db_name, + ) + await engine._ainit_vectorstore_table(DEFAULT_TABLE, VECTOR_SIZE) + await engine._ainit_vectorstore_table( + CUSTOM_TABLE, + VECTOR_SIZE, + id_column="myid", + content_column="mycontent", + 
embedding_column="myembedding", + metadata_columns=[Column("page", "TEXT"), Column("source", "TEXT")], + store_metadata=False, + ) + yield engine + await aexecute(engine, f"DROP TABLE IF EXISTS {DEFAULT_TABLE}") + await aexecute(engine, f"DROP TABLE IF EXISTS {CUSTOM_TABLE}") + await engine.close() + + async def test_afrom_texts(self, engine): + ids = [str(uuid.uuid4()) for i in range(len(texts))] + await AsyncPostgresVectorStore.afrom_texts( + texts, + embeddings_service, + engine, + DEFAULT_TABLE, + metadatas=metadatas, + ids=ids, + ) + results = await afetch(engine, f"SELECT * FROM {DEFAULT_TABLE}") + assert len(results) == 3 + await aexecute(engine, f"TRUNCATE TABLE {DEFAULT_TABLE}") + + async def test_afrom_docs(self, engine): + ids = [str(uuid.uuid4()) for i in range(len(texts))] + await AsyncPostgresVectorStore.afrom_documents( + docs, + embeddings_service, + engine, + DEFAULT_TABLE, + ids=ids, + ) + results = await afetch(engine, f"SELECT * FROM {DEFAULT_TABLE}") + assert len(results) == 3 + await aexecute(engine, f"TRUNCATE TABLE {DEFAULT_TABLE}") + + async def test_afrom_texts_custom(self, engine): + ids = [str(uuid.uuid4()) for i in range(len(texts))] + await AsyncPostgresVectorStore.afrom_texts( + texts, + embeddings_service, + engine, + CUSTOM_TABLE, + ids=ids, + id_column="myid", + content_column="mycontent", + embedding_column="myembedding", + metadata_columns=["page", "source"], + ) + results = await afetch(engine, f"SELECT * FROM {CUSTOM_TABLE}") + assert len(results) == 3 + assert results[0]["mycontent"] == "foo" + assert results[0]["myembedding"] + assert results[0]["page"] is None + assert results[0]["source"] is None + + async def test_afrom_docs_custom(self, engine): + ids = [str(uuid.uuid4()) for i in range(len(texts))] + docs = [ + Document( + page_content=texts[i], + metadata={"page": str(i), "source": "google.com"}, + ) + for i in range(len(texts)) + ] + await AsyncPostgresVectorStore.afrom_documents( + docs, + embeddings_service, + engine, + CUSTOM_TABLE, + ids=ids, + id_column="myid", + content_column="mycontent", + embedding_column="myembedding", + metadata_columns=["page", "source"], + ) + + results = await afetch(engine, f"SELECT * FROM {CUSTOM_TABLE}") + assert len(results) == 3 + assert results[0]["mycontent"] == "foo" + assert results[0]["myembedding"] + assert results[0]["page"] == "0" + assert results[0]["source"] == "google.com" + await aexecute(engine, f"TRUNCATE TABLE {CUSTOM_TABLE}") diff --git a/tests/test_cloudsql_vectorstore_index.py b/tests/test_async_vectorstore_index.py similarity index 88% rename from tests/test_cloudsql_vectorstore_index.py rename to tests/test_async_vectorstore_index.py index 10baf13a..a3ff8c12 100644 --- a/tests/test_cloudsql_vectorstore_index.py +++ b/tests/test_async_vectorstore_index.py @@ -21,8 +21,10 @@ import pytest_asyncio from langchain_core.documents import Document from langchain_core.embeddings import DeterministicFakeEmbedding +from sqlalchemy import text -from langchain_google_cloud_sql_pg import PostgresEngine, PostgresVectorStore +from langchain_google_cloud_sql_pg import PostgresEngine +from langchain_google_cloud_sql_pg.async_vectorstore import AsyncPostgresVectorStore from langchain_google_cloud_sql_pg.indexes import ( DEFAULT_INDEX_NAME_SUFFIX, DistanceStrategy, @@ -54,6 +56,12 @@ def get_env_var(key: str, desc: str) -> str: return v +async def aexecute(engine: PostgresEngine, query: str) -> None: + async with engine._pool.connect() as conn: + await conn.execute(text(query)) + await conn.commit() + + 
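+# NOTE: a brief aside on the pattern above (not part of the upstream change
+# description): with the private engine._aexecute()/_afetch() helpers removed,
+# these tests issue raw SQL through the engine's underlying SQLAlchemy pool
+# instead, e.g.:
+#
+#     await aexecute(engine, f"DROP TABLE IF EXISTS {DEFAULT_TABLE}")
+#
+# engine._pool is an internal attribute, so this shortcut is test-only.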
@pytest.mark.asyncio(scope="class") class TestIndex: @pytest.fixture(scope="module") @@ -81,11 +89,13 @@ async def engine(self, db_project, db_region, db_instance, db_name): database=db_name, ) yield engine + await aexecute(engine, f"DROP TABLE IF EXISTS {DEFAULT_TABLE}") + await engine.close() @pytest_asyncio.fixture(scope="class") async def vs(self, engine): - await engine.ainit_vectorstore_table(DEFAULT_TABLE, VECTOR_SIZE) - vs = await PostgresVectorStore.create( + await engine._ainit_vectorstore_table(DEFAULT_TABLE, VECTOR_SIZE) + vs = await AsyncPostgresVectorStore.create( engine, embedding_service=embeddings_service, table_name=DEFAULT_TABLE, @@ -94,16 +104,12 @@ async def vs(self, engine): await vs.aadd_texts(texts, ids=ids) await vs.adrop_vector_index() yield vs - await engine._aexecute(f"DROP TABLE IF EXISTS {DEFAULT_TABLE}") - await engine._engine.dispose() - @pytest.mark.run(order=1) async def test_aapply_vector_index(self, vs): index = HNSWIndex() await vs.aapply_vector_index(index) assert await vs.is_valid_index(DEFAULT_INDEX_NAME) - @pytest.mark.run(order=2) async def test_areindex(self, vs): if not await vs.is_valid_index(DEFAULT_INDEX_NAME): index = HNSWIndex() @@ -112,7 +118,6 @@ async def test_areindex(self, vs): await vs.areindex(DEFAULT_INDEX_NAME) assert await vs.is_valid_index(DEFAULT_INDEX_NAME) - @pytest.mark.run(order=3) async def test_dropindex(self, vs): await vs.adrop_vector_index() result = await vs.is_valid_index(DEFAULT_INDEX_NAME) diff --git a/tests/test_async_vectorstore_search.py b/tests/test_async_vectorstore_search.py new file mode 100644 index 00000000..d918415a --- /dev/null +++ b/tests/test_async_vectorstore_search.py @@ -0,0 +1,270 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
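+# Integration tests for AsyncPostgresVectorStore search methods. A note on the
+# threshold tests below (an editorial gloss, assuming LangChain's default
+# relevance functions): with cosine distance the relevance score is typically
+# 1 - distance, so score_threshold=0.9 keeps only near-exact matches (the
+# "foo" document for the query "foo"), while score_threshold=0 keeps all rows.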
+
+import os
+import uuid
+
+import pytest
+import pytest_asyncio
+from langchain_core.documents import Document
+from langchain_core.embeddings import DeterministicFakeEmbedding
+from sqlalchemy import text
+
+from langchain_google_cloud_sql_pg import Column, PostgresEngine
+from langchain_google_cloud_sql_pg.async_vectorstore import AsyncPostgresVectorStore
+from langchain_google_cloud_sql_pg.indexes import DistanceStrategy, HNSWQueryOptions
+
+DEFAULT_TABLE = "test_table" + str(uuid.uuid4()).replace("-", "_")
+CUSTOM_TABLE = "test_table_custom" + str(uuid.uuid4()).replace("-", "_")
+VECTOR_SIZE = 768
+
+embeddings_service = DeterministicFakeEmbedding(size=VECTOR_SIZE)
+
+texts = ["foo", "bar", "baz", "boo"]
+ids = [str(uuid.uuid4()) for i in range(len(texts))]
+metadatas = [{"page": str(i), "source": "google.com"} for i in range(len(texts))]
+docs = [
+    Document(page_content=texts[i], metadata=metadatas[i]) for i in range(len(texts))
+]
+
+embeddings = [embeddings_service.embed_query("foo") for i in range(len(texts))]
+
+
+def get_env_var(key: str, desc: str) -> str:
+    v = os.environ.get(key)
+    if v is None:
+        raise ValueError(f"Must set env var {key} to: {desc}")
+    return v
+
+
+async def aexecute(
+    engine: PostgresEngine,
+    query: str,
+) -> None:
+    async with engine._pool.connect() as conn:
+        await conn.execute(text(query))
+        await conn.commit()
+
+
+@pytest.mark.asyncio(scope="class")
+class TestVectorStoreSearch:
+    @pytest.fixture(scope="module")
+    def db_project(self) -> str:
+        return get_env_var("PROJECT_ID", "project id for google cloud")
+
+    @pytest.fixture(scope="module")
+    def db_region(self) -> str:
+        return get_env_var("REGION", "region for cloud sql instance")
+
+    @pytest.fixture(scope="module")
+    def db_instance(self) -> str:
+        return get_env_var("INSTANCE_ID", "instance for cloud sql")
+
+    @pytest.fixture(scope="module")
+    def db_name(self) -> str:
+        return get_env_var("DATABASE_ID", "database name on cloud sql instance")
+
+    @pytest_asyncio.fixture(scope="class")
+    async def engine(self, db_project, db_region, db_instance, db_name):
+        engine = await PostgresEngine.afrom_instance(
+            project_id=db_project,
+            instance=db_instance,
+            region=db_region,
+            database=db_name,
+        )
+        yield engine
+        await aexecute(engine, f"DROP TABLE IF EXISTS {DEFAULT_TABLE}")
+        await aexecute(engine, f"DROP TABLE IF EXISTS {CUSTOM_TABLE}")
+        await engine.close()
+
+    @pytest_asyncio.fixture(scope="class")
+    async def vs(self, engine):
+        await engine._ainit_vectorstore_table(
+            DEFAULT_TABLE, VECTOR_SIZE, store_metadata=False
+        )
+        vs = await AsyncPostgresVectorStore.create(
+            engine,
+            embedding_service=embeddings_service,
+            table_name=DEFAULT_TABLE,
+        )
+        ids = [str(uuid.uuid4()) for i in range(len(texts))]
+        await vs.aadd_documents(docs, ids=ids)
+        yield vs
+
+    @pytest_asyncio.fixture(scope="class")
+    async def vs_custom(self, engine):
+        await engine._ainit_vectorstore_table(
+            CUSTOM_TABLE,
+            VECTOR_SIZE,
+            id_column="myid",
+            content_column="mycontent",
+            embedding_column="myembedding",
+            metadata_columns=[
+                Column("page", "TEXT"),
+                Column("source", "TEXT"),
+            ],
+            store_metadata=False,
+        )
+
+        vs_custom = await AsyncPostgresVectorStore.create(
+            engine,
+            embedding_service=embeddings_service,
+            table_name=CUSTOM_TABLE,
+            id_column="myid",
+            content_column="mycontent",
+            embedding_column="myembedding",
+            index_query_options=HNSWQueryOptions(ef_search=1),
+        )
+        await vs_custom.aadd_documents(docs, ids=ids)
+        yield vs_custom
+
+    async def test_asimilarity_search(self, vs):
+        results = await
vs.asimilarity_search("foo", k=1) + assert len(results) == 1 + assert results == [Document(page_content="foo")] + results = await vs.asimilarity_search("foo", k=1, filter="content = 'bar'") + assert results == [Document(page_content="bar")] + + async def test_asimilarity_search_score(self, vs): + results = await vs.asimilarity_search_with_score("foo") + assert len(results) == 4 + assert results[0][0] == Document(page_content="foo") + assert results[0][1] == 0 + + async def test_asimilarity_search_by_vector(self, vs): + embedding = embeddings_service.embed_query("foo") + results = await vs.asimilarity_search_by_vector(embedding) + assert len(results) == 4 + assert results[0] == Document(page_content="foo") + results = await vs.asimilarity_search_with_score_by_vector(embedding) + assert results[0][0] == Document(page_content="foo") + assert results[0][1] == 0 + + async def test_similarity_search_with_relevance_scores_threshold_cosine(self, vs): + score_threshold = {"score_threshold": 0} + results = await vs.asimilarity_search_with_relevance_scores( + "foo", **score_threshold + ) + assert len(results) == 4 + + score_threshold = {"score_threshold": 0.02} + results = await vs.asimilarity_search_with_relevance_scores( + "foo", **score_threshold + ) + assert len(results) == 2 + + score_threshold = {"score_threshold": 0.9} + results = await vs.asimilarity_search_with_relevance_scores( + "foo", **score_threshold + ) + assert len(results) == 1 + assert results[0][0] == Document(page_content="foo") + + score_threshold = {"score_threshold": 0.02} + vs.distance_strategy = DistanceStrategy.EUCLIDEAN + results = await vs.asimilarity_search_with_relevance_scores( + "foo", **score_threshold + ) + assert len(results) == 1 + + async def test_similarity_search_with_relevance_scores_threshold_euclidean( + self, engine + ): + vs = await AsyncPostgresVectorStore.create( + engine, + embedding_service=embeddings_service, + table_name=DEFAULT_TABLE, + distance_strategy=DistanceStrategy.EUCLIDEAN, + ) + + score_threshold = {"score_threshold": 0.9} + results = await vs.asimilarity_search_with_relevance_scores( + "foo", **score_threshold + ) + assert len(results) == 1 + assert results[0][0] == Document(page_content="foo") + + async def test_amax_marginal_relevance_search(self, vs): + results = await vs.amax_marginal_relevance_search("bar") + assert results[0] == Document(page_content="bar") + results = await vs.amax_marginal_relevance_search( + "bar", filter="content = 'boo'" + ) + assert results[0] == Document(page_content="boo") + + async def test_amax_marginal_relevance_search_vector(self, vs): + embedding = embeddings_service.embed_query("bar") + results = await vs.amax_marginal_relevance_search_by_vector(embedding) + assert results[0] == Document(page_content="bar") + + async def test_amax_marginal_relevance_search_vector_score(self, vs): + embedding = embeddings_service.embed_query("bar") + results = await vs.amax_marginal_relevance_search_with_score_by_vector( + embedding + ) + assert results[0][0] == Document(page_content="bar") + + results = await vs.amax_marginal_relevance_search_with_score_by_vector( + embedding, lambda_mult=0.75, fetch_k=10 + ) + assert results[0][0] == Document(page_content="bar") + + async def test_similarity_search(self, vs_custom): + results = await vs_custom.asimilarity_search("foo", k=1) + assert len(results) == 1 + assert results == [Document(page_content="foo")] + results = await vs_custom.asimilarity_search( + "foo", k=1, filter="mycontent = 'bar'" + ) + assert results == 
[Document(page_content="bar")] + + async def test_similarity_search_score(self, vs_custom): + results = await vs_custom.asimilarity_search_with_score("foo") + assert len(results) == 4 + assert results[0][0] == Document(page_content="foo") + assert results[0][1] == 0 + + async def test_similarity_search_by_vector(self, vs_custom): + embedding = embeddings_service.embed_query("foo") + results = await vs_custom.asimilarity_search_by_vector(embedding) + assert len(results) == 4 + assert results[0] == Document(page_content="foo") + results = await vs_custom.asimilarity_search_with_score_by_vector(embedding) + assert results[0][0] == Document(page_content="foo") + assert results[0][1] == 0 + + async def test_max_marginal_relevance_search(self, vs_custom): + results = await vs_custom.amax_marginal_relevance_search("bar") + assert results[0] == Document(page_content="bar") + results = await vs_custom.amax_marginal_relevance_search( + "bar", filter="mycontent = 'boo'" + ) + assert results[0] == Document(page_content="boo") + + async def test_max_marginal_relevance_search_vector(self, vs_custom): + embedding = embeddings_service.embed_query("bar") + results = await vs_custom.amax_marginal_relevance_search_by_vector(embedding) + assert results[0] == Document(page_content="bar") + + async def test_max_marginal_relevance_search_vector_score(self, vs_custom): + embedding = embeddings_service.embed_query("bar") + results = await vs_custom.amax_marginal_relevance_search_with_score_by_vector( + embedding + ) + assert results[0][0] == Document(page_content="bar") + + results = await vs_custom.amax_marginal_relevance_search_with_score_by_vector( + embedding, lambda_mult=0.75, fetch_k=10 + ) + assert results[0][0] == Document(page_content="bar") diff --git a/tests/test_postgresql_chatmessagehistory.py b/tests/test_chatmessagehistory.py similarity index 60% rename from tests/test_postgresql_chatmessagehistory.py rename to tests/test_chatmessagehistory.py index ea0b85ee..b0a9420a 100644 --- a/tests/test_postgresql_chatmessagehistory.py +++ b/tests/test_chatmessagehistory.py @@ -19,6 +19,7 @@ import pytest_asyncio from langchain_core.messages.ai import AIMessage from langchain_core.messages.human import HumanMessage +from sqlalchemy import text from langchain_google_cloud_sql_pg import PostgresChatMessageHistory, PostgresEngine @@ -28,10 +29,24 @@ db_name = os.environ["DATABASE_ID"] table_name = "message_store" + str(uuid.uuid4()) table_name_async = "message_store" + str(uuid.uuid4()) +user = os.environ["DB_USER"] +password = os.environ["DB_PASSWORD"] -@pytest.fixture(name="memory_engine") -def setup() -> Generator: +async def aexecute( + engine: PostgresEngine, + query: str, +) -> None: + async def run(engine, query): + async with engine._pool.connect() as conn: + await conn.execute(text(query)) + await conn.commit() + + await engine._run_as_async(run(engine, query)) + + +@pytest_asyncio.fixture +async def engine(): engine = PostgresEngine.from_instance( project_id=project_id, region=region, @@ -42,27 +57,29 @@ def setup() -> Generator: yield engine # use default table for PostgresChatMessageHistory query = f'DROP TABLE IF EXISTS "{table_name}"' - engine._execute(query) + await aexecute(engine, query) + await engine.close() @pytest_asyncio.fixture async def async_engine(): - engine = await PostgresEngine.afrom_instance( + async_engine = await PostgresEngine.afrom_instance( project_id=project_id, region=region, instance=instance_id, database=db_name, ) - await 
engine.ainit_chat_history_table(table_name=table_name_async) - yield engine + await async_engine.ainit_chat_history_table(table_name=table_name_async) + yield async_engine # use default table for PostgresChatMessageHistory - query = f'DROP TABLE IF EXISTS "{table_name}"' - await engine._aexecute(query) + query = f'DROP TABLE IF EXISTS "{table_name_async}"' + await aexecute(async_engine, query) + await async_engine.close() -def test_chat_message_history(memory_engine: PostgresEngine) -> None: +def test_chat_message_history(engine: PostgresEngine) -> None: history = PostgresChatMessageHistory.create_sync( - engine=memory_engine, session_id="test", table_name=table_name + engine=engine, session_id="test", table_name=table_name ) history.add_user_message("hi!") history.add_ai_message("whats up?") @@ -79,23 +96,24 @@ def test_chat_message_history(memory_engine: PostgresEngine) -> None: assert len(history.messages) == 0 -def test_chat_table(memory_engine: Any) -> None: +def test_chat_table(engine: Any) -> None: with pytest.raises(ValueError): PostgresChatMessageHistory.create_sync( - engine=memory_engine, session_id="test", table_name="doesnotexist" + engine=engine, session_id="test", table_name="doesnotexist" ) -def test_chat_schema(memory_engine: Any) -> None: +@pytest.mark.asyncio +async def test_chat_schema(engine: Any) -> None: doc_table_name = "test_table" + str(uuid.uuid4()) - memory_engine.init_document_table(table_name=doc_table_name) + engine.init_document_table(table_name=doc_table_name) with pytest.raises(IndexError): PostgresChatMessageHistory.create_sync( - engine=memory_engine, session_id="test", table_name=doc_table_name + engine=engine, session_id="test", table_name=doc_table_name ) query = f'DROP TABLE IF EXISTS "{doc_table_name}"' - memory_engine._execute(query) + await aexecute(engine, query) @pytest.mark.asyncio @@ -137,11 +155,8 @@ async def test_chat_message_history_sync_messages( await history1.aadd_message(msg1) await history2.aadd_message(msg2) - assert len(history1.messages) == 1 - assert len(history2.messages) == 2 - - await history1.async_messages() assert len(history1.messages) == 2 + assert len(history2.messages) == 2 # verify clear() clears message history await history2.aclear() @@ -166,4 +181,51 @@ async def test_chat_schema_async(async_engine): ) query = f'DROP TABLE IF EXISTS "{table_name}"' - await async_engine._aexecute(query) + await aexecute(async_engine, query) + + +@pytest.mark.asyncio +async def test_cross_env_chat_message_history(engine): + history = PostgresChatMessageHistory.create_sync( + engine=engine, session_id="test_cross", table_name=table_name + ) + await history.aadd_message(HumanMessage(content="hi!")) + messages = history.messages + assert messages[0].content == "hi!" + history.clear() + + history = await PostgresChatMessageHistory.create( + engine=engine, session_id="test_cross", table_name=table_name + ) + history.add_message(HumanMessage(content="hi!")) + messages = history.messages + assert messages[0].content == "hi!" 
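+    # round-trip verified in both directions: a sync-created history accepts
+    # async writes, and an async-created history accepts sync writes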
+ history.clear() + + +@pytest.mark.asyncio +async def test_from_engine_args_url(): + host = os.environ["IP_ADDRESS"] + port = "5432" + url = f"postgresql+asyncpg://{user}:{password}@{host}:{port}/{db_name}" + engine = PostgresEngine.from_engine_args(url) + table_name = "test_table" + str(uuid.uuid4()).replace("-", "_") + await engine.ainit_chat_history_table(table_name) + + history = PostgresChatMessageHistory.create_sync( + engine=engine, session_id="test_cross", table_name=table_name + ) + await history.aadd_message(HumanMessage(content="hi!")) + history.add_message(HumanMessage(content="bye!")) + assert len(history.messages) == 2 + await history.aclear() + + history2 = await PostgresChatMessageHistory.create( + engine=engine, session_id="test_cross", table_name=table_name + ) + await history2.aadd_message(HumanMessage(content="hi!")) + history2.add_message(HumanMessage(content="bye!")) + assert len(history2.messages) == 2 + history2.clear() + + await aexecute(engine, f"DROP TABLE {table_name}") diff --git a/tests/test_postgresql_engine.py b/tests/test_engine.py similarity index 71% rename from tests/test_postgresql_engine.py rename to tests/test_engine.py index 9ccd43a0..70e18802 100644 --- a/tests/test_postgresql_engine.py +++ b/tests/test_engine.py @@ -14,22 +14,29 @@ import os import uuid +from typing import Sequence import asyncpg # type: ignore import pytest import pytest_asyncio from google.cloud.sql.connector import Connector, IPTypes from langchain_core.embeddings import DeterministicFakeEmbedding -from sqlalchemy import VARCHAR +from sqlalchemy import VARCHAR, text +from sqlalchemy.engine import URL +from sqlalchemy.engine.row import RowMapping from sqlalchemy.ext.asyncio import create_async_engine +from sqlalchemy.pool import NullPool from langchain_google_cloud_sql_pg import Column, PostgresEngine DEFAULT_TABLE = "test_table" + str(uuid.uuid4()).replace("-", "_") CUSTOM_TABLE = "test_table_custom" + str(uuid.uuid4()).replace("-", "_") +DEFAULT_TABLE_SYNC = "test_table" + str(uuid.uuid4()).replace("-", "_") +CUSTOM_TABLE_SYNC = "test_table_custom" + str(uuid.uuid4()).replace("-", "_") VECTOR_SIZE = 768 embeddings_service = DeterministicFakeEmbedding(size=VECTOR_SIZE) +host = os.environ["IP_ADDRESS"] def get_env_var(key: str, desc: str) -> str: @@ -39,7 +46,30 @@ def get_env_var(key: str, desc: str) -> str: return v -@pytest.mark.asyncio +async def aexecute( + engine: PostgresEngine, + query: str, +) -> None: + async def run(engine, query): + async with engine._pool.connect() as conn: + await conn.execute(text(query)) + await conn.commit() + + await engine._run_as_async(run(engine, query)) + + +async def afetch(engine: PostgresEngine, query: str) -> Sequence[RowMapping]: + async def run(engine, query): + async with engine._pool.connect() as conn: + result = await conn.execute(text(query)) + result_map = result.mappings() + result_fetch = result_map.fetchall() + return result_fetch + + return await engine._run_as_async(run(engine, query)) + + +@pytest.mark.asyncio(scope="module") class TestEngineAsync: @pytest.fixture(scope="module") def db_project(self) -> str: @@ -69,7 +99,7 @@ def password(self) -> str: def iam_account(self) -> str: return get_env_var("IAM_ACCOUNT", "Cloud SQL IAM account email") - @pytest_asyncio.fixture + @pytest_asyncio.fixture(scope="class") async def engine(self, db_project, db_region, db_instance, db_name): engine = await PostgresEngine.afrom_instance( project_id=db_project, @@ -78,10 +108,9 @@ async def engine(self, db_project, db_region, db_instance, 
db_name): database=db_name, ) yield engine - await engine._engine.dispose() - - async def test_execute(self, engine): - await engine._aexecute("SELECT 1") + await aexecute(engine, f'DROP TABLE "{CUSTOM_TABLE}"') + await aexecute(engine, f'DROP TABLE "{DEFAULT_TABLE}"') + await engine.close() async def test_init_table(self, engine): await engine.ainit_vectorstore_table(DEFAULT_TABLE, VECTOR_SIZE) @@ -89,12 +118,7 @@ async def test_init_table(self, engine): content = "coffee" embedding = await embeddings_service.aembed_query(content) stmt = f"INSERT INTO {DEFAULT_TABLE} (langchain_id, content, embedding) VALUES ('{id}', '{content}','{embedding}');" - await engine._aexecute(stmt) - - async def test_fetch(self, engine): - results = await engine._afetch(f"SELECT * FROM {DEFAULT_TABLE}") - assert len(results) > 0 - await engine._aexecute(f"DROP TABLE {DEFAULT_TABLE}") + await aexecute(engine, stmt) async def test_init_table_custom(self, engine): await engine.ainit_vectorstore_table( @@ -107,7 +131,7 @@ async def test_init_table_custom(self, engine): store_metadata=True, ) stmt = f"SELECT column_name, data_type FROM information_schema.columns WHERE table_name = '{CUSTOM_TABLE}';" - results = await engine._afetch(stmt) + results = await afetch(engine, stmt) expected = [ {"column_name": "uuid", "data_type": "uuid"}, {"column_name": "my_embedding", "data_type": "USER-DEFINED"}, @@ -119,8 +143,6 @@ async def test_init_table_custom(self, engine): for row in results: assert row in expected - await engine._aexecute(f"DROP TABLE {CUSTOM_TABLE}") - async def test_password( self, db_project, @@ -140,7 +162,7 @@ async def test_password( password=password, ) assert engine - await engine._aexecute("SELECT 1") + await aexecute(engine, "SELECT 1") PostgresEngine._connector = None async def test_from_engine( @@ -172,7 +194,49 @@ async def getconn() -> asyncpg.Connection: ) engine = PostgresEngine.from_engine(engine) - await engine._aexecute("SELECT 1") + await aexecute(engine, "SELECT 1") + await engine.close() + + async def test_from_engine_args_url( + self, + db_name, + user, + password, + ): + port = "5432" + url = f"postgresql+asyncpg://{user}:{password}@{host}:{port}/{db_name}" + engine = PostgresEngine.from_engine_args( + url, + echo=True, + poolclass=NullPool, + ) + await aexecute(engine, "SELECT 1") + await engine.close() + + engine = PostgresEngine.from_engine_args( + URL.create("postgresql+asyncpg", user, password, host, port, db_name) + ) + await aexecute(engine, "SELECT 1") + await engine.close() + + async def test_from_engine_args_url_error( + self, + db_name, + user, + password, + ): + port = "5432" + url = f"postgresql+asyncpg://{user}:{password}@{host}:{port}/{db_name}" + with pytest.raises(TypeError): + engine = PostgresEngine.from_engine_args(url, random=False) + with pytest.raises(ValueError): + PostgresEngine.from_engine_args( + f"postgresql+pg8000://{user}:{password}@{host}:{port}/{db_name}", + ) + with pytest.raises(ValueError): + PostgresEngine.from_engine_args( + URL.create("postgresql+pg8000", user, password, host, port, db_name) + ) async def test_column(self, engine): with pytest.raises(ValueError): @@ -187,6 +251,7 @@ async def test_iam_account_override( db_region, db_name, iam_account, + engine, ): engine = await PostgresEngine.afrom_instance( project_id=db_project, @@ -196,12 +261,11 @@ async def test_iam_account_override( iam_account_email=iam_account, ) assert engine - await engine._aexecute("SELECT 1") - await engine._connector.close_async() - await engine._engine.dispose() + 
await aexecute(engine, "SELECT 1") + await engine.close() -@pytest.mark.asyncio +@pytest.mark.asyncio(scope="module") class TestEngineSync: @pytest.fixture(scope="module") def db_project(self) -> str: @@ -231,8 +295,8 @@ def password(self) -> str: def iam_account(self) -> str: return get_env_var("IAM_ACCOUNT", "Cloud SQL IAM account email") - @pytest_asyncio.fixture - def engine(self, db_project, db_region, db_instance, db_name): + @pytest_asyncio.fixture(scope="class") + async def engine(self, db_project, db_region, db_instance, db_name): engine = PostgresEngine.from_instance( project_id=db_project, instance=db_instance, @@ -240,27 +304,21 @@ def engine(self, db_project, db_region, db_instance, db_name): database=db_name, ) yield engine - engine._engine.dispose() - - async def test_execute(self, engine): - engine._execute("SELECT 1") + await aexecute(engine, f'DROP TABLE "{CUSTOM_TABLE_SYNC}"') + await aexecute(engine, f'DROP TABLE "{DEFAULT_TABLE_SYNC}"') + await engine.close() async def test_init_table(self, engine): - engine.init_vectorstore_table(DEFAULT_TABLE, VECTOR_SIZE) + engine.init_vectorstore_table(DEFAULT_TABLE_SYNC, VECTOR_SIZE) id = str(uuid.uuid4()) content = "coffee" embedding = await embeddings_service.aembed_query(content) - stmt = f"INSERT INTO {DEFAULT_TABLE} (langchain_id, content, embedding) VALUES ('{id}', '{content}','{embedding}');" - engine._execute(stmt) - - async def test_fetch(self, engine): - results = engine._fetch(f"SELECT * FROM {DEFAULT_TABLE}") - assert len(results) > 0 - engine._execute(f"DROP TABLE {DEFAULT_TABLE}") + stmt = f"INSERT INTO {DEFAULT_TABLE_SYNC} (langchain_id, content, embedding) VALUES ('{id}', '{content}','{embedding}');" + await aexecute(engine, stmt) async def test_init_table_custom(self, engine): engine.init_vectorstore_table( - CUSTOM_TABLE, + CUSTOM_TABLE_SYNC, VECTOR_SIZE, id_column="uuid", content_column="my-content", @@ -268,8 +326,8 @@ async def test_init_table_custom(self, engine): metadata_columns=[Column("page", "TEXT"), Column("source", "TEXT")], store_metadata=True, ) - stmt = f"SELECT column_name, data_type FROM information_schema.columns WHERE table_name = '{CUSTOM_TABLE}';" - results = engine._fetch(stmt) + stmt = f"SELECT column_name, data_type FROM information_schema.columns WHERE table_name = '{CUSTOM_TABLE_SYNC}';" + results = await afetch(engine, stmt) expected = [ {"column_name": "uuid", "data_type": "uuid"}, {"column_name": "my_embedding", "data_type": "USER-DEFINED"}, @@ -281,8 +339,6 @@ async def test_init_table_custom(self, engine): for row in results: assert row in expected - engine._execute(f"DROP TABLE {CUSTOM_TABLE}") - async def test_password( self, db_project, @@ -303,7 +359,7 @@ async def test_password( quota_project=db_project, ) assert engine - engine._execute("SELECT 1") + await aexecute(engine, "SELECT 1") PostgresEngine._connector = None async def test_engine_constructor_key( @@ -314,13 +370,14 @@ async def test_engine_constructor_key( with pytest.raises(Exception): PostgresEngine(key, engine) - def test_iam_account_override( + async def test_iam_account_override( self, db_project, db_instance, db_region, db_name, iam_account, + engine, ): engine = PostgresEngine.from_instance( project_id=db_project, @@ -330,6 +387,5 @@ def test_iam_account_override( iam_account_email=iam_account, ) assert engine - engine._execute("SELECT 1") - engine._connector.close() - engine._engine.dispose() + await aexecute(engine, "SELECT 1") + await engine.close() diff --git a/tests/test_postgresql_loader.py 
b/tests/test_loader.py similarity index 93% rename from tests/test_postgresql_loader.py rename to tests/test_loader.py index 8f4f5e35..87ac979c 100644 --- a/tests/test_postgresql_loader.py +++ b/tests/test_loader.py @@ -19,6 +19,7 @@ import pytest import pytest_asyncio from langchain_core.documents import Document +from sqlalchemy import text from langchain_google_cloud_sql_pg import ( Column, @@ -34,6 +35,18 @@ table_name = "test-table" + str(uuid.uuid4()) +async def aexecute( + engine: PostgresEngine, + query: str, +) -> None: + async def run(engine, query): + async with engine._pool.connect() as conn: + await conn.execute(text(query)) + await conn.commit() + + await engine._run_as_async(run(engine, query)) + + @pytest.mark.asyncio class TestLoaderAsync: @pytest_asyncio.fixture @@ -67,7 +80,7 @@ async def _collect_async_items(self, docs_generator): async def _cleanup_table(self, engine): query = f'DROP TABLE IF EXISTS "{table_name}"' - await engine._aexecute(query) + await aexecute(engine, query) async def test_create_loader_with_invalid_parameters(self, engine): with pytest.raises(ValueError): @@ -105,14 +118,14 @@ async def test_load_from_query_default(self, engine): organic INT NOT NULL ) """ - await engine._aexecute(query) + await aexecute(engine, query) insert_query = f""" INSERT INTO "{table_name}" ( fruit_name, variety, quantity_in_stock, price_per_unit, organic ) VALUES ('Apple', 'Granny Smith', 150, 1, 1); """ - await engine._aexecute(insert_query) + await aexecute(engine, insert_query) loader = await PostgresLoader.create( engine=engine, @@ -152,7 +165,7 @@ async def test_load_from_query_customized_content_customized_metadata(self, engi organic INT NOT NULL ) """ - await engine._aexecute(query) + await aexecute(engine, query) insert_query = f""" INSERT INTO "{table_name}" (fruit_name, variety, quantity_in_stock, price_per_unit, organic) @@ -160,7 +173,7 @@ async def test_load_from_query_customized_content_customized_metadata(self, engi ('Banana', 'Cavendish', 200, 0.59, 0), ('Orange', 'Navel', 80, 1.29, 1); """ - await engine._aexecute(insert_query) + await aexecute(engine, insert_query) loader = await PostgresLoader.create( engine=engine, @@ -208,13 +221,13 @@ async def test_load_from_query_customized_content_default_metadata(self, engine) organic INT NOT NULL ) """ - await engine._aexecute(query) + await aexecute(engine, query) insert_query = f""" INSERT INTO "{table_name}" (fruit_name, variety, quantity_in_stock, price_per_unit, organic) VALUES ('Apple', 'Granny Smith', 150, 1, 1); """ - await engine._aexecute(insert_query) + await aexecute(engine, insert_query) loader = await PostgresLoader.create( engine=engine, @@ -226,7 +239,9 @@ async def test_load_from_query_customized_content_default_metadata(self, engine) ], ) - documents = await self._collect_async_items(loader.alazy_load()) + documents = [] + for docs in loader.lazy_load(): + documents.append(docs) assert documents == [ Document( @@ -279,7 +294,7 @@ async def test_load_from_query_default_content_customized_metadata(self, engine) organic INT NOT NULL ) """ - await engine._aexecute(query) + await aexecute(engine, query) insert_query = f""" INSERT INTO "{table_name}" ( @@ -290,7 +305,7 @@ async def test_load_from_query_default_content_customized_metadata(self, engine) organic ) VALUES ('Apple', 'Granny Smith', 150, 1, 1); """ - await engine._aexecute(insert_query) + await aexecute(engine, insert_query) loader = await PostgresLoader.create( engine=engine, @@ -323,14 +338,14 @@ async def 
test_load_from_query_with_langchain_metadata(self, engine): langchain_metadata JSON NOT NULL ) """ - await engine._aexecute(query) + await aexecute(engine, query) metadata = json.dumps({"organic": 1}) insert_query = f""" INSERT INTO "{table_name}" (fruit_name, variety, quantity_in_stock, price_per_unit, langchain_metadata) VALUES ('Apple', 'Granny Smith', 150, 1, '{metadata}');""" - await engine._aexecute(insert_query) + await aexecute(engine, insert_query) loader = await PostgresLoader.create( engine=engine, @@ -369,7 +384,7 @@ async def test_load_from_query_with_json(self, engine): langchain_metadata JSON NOT NULL ) """ - await engine._aexecute(query) + await aexecute(engine, query) metadata = json.dumps({"organic": 1}) variety = json.dumps({"type": "Granny Smith"}) @@ -377,7 +392,7 @@ async def test_load_from_query_with_json(self, engine): INSERT INTO "{table_name}" (fruit_name, variety, quantity_in_stock, price_per_unit, langchain_metadata) VALUES ('Apple', '{variety}', 150, 1, '{metadata}');""" - await engine._aexecute(insert_query) + await aexecute(engine, insert_query) loader = await PostgresLoader.create( engine=engine, @@ -417,13 +432,13 @@ async def test_load_from_query_customized_content_default_metadata_custom_format organic INT NOT NULL ) """ - await engine._aexecute(query) + await aexecute(engine, query) insert_query = f""" INSERT INTO "{table_name}" (fruit_name, variety, quantity_in_stock, price_per_unit, organic) VALUES ('Apple', 'Granny Smith', 150, 1, 1); """ - await engine._aexecute(insert_query) + await aexecute(engine, insert_query) def my_formatter(row, content_columns): return "-".join( @@ -472,13 +487,13 @@ async def test_load_from_query_customized_content_default_metadata_custom_page_c organic INT NOT NULL ) """ - await engine._aexecute(query) + await aexecute(engine, query) insert_query = f""" INSERT INTO "{table_name}" (fruit_name, variety, quantity_in_stock, price_per_unit, organic) VALUES ('Apple', 'Granny Smith', 150, 1, 1); """ - await engine._aexecute(insert_query) + await aexecute(engine, insert_query) loader = await PostgresLoader.create( engine=engine, @@ -534,7 +549,9 @@ async def test_save_doc_with_default_metadata(self, engine): docs = await self._collect_async_items(loader.alazy_load()) assert docs == test_docs - assert (await engine._aload_table_schema(table_name)).columns.keys() == [ + assert ( + await engine._run_as_async(engine._aload_table_schema(table_name)) + ).columns.keys() == [ "page_content", "langchain_metadata", ] @@ -577,7 +594,9 @@ async def test_save_doc_with_customized_metadata(self, engine, store_metadata): if store_metadata: docs == test_docs - assert (await engine._aload_table_schema(table_name)).columns.keys() == [ + assert ( + await engine._run_as_async(engine._aload_table_schema(table_name)) + ).columns.keys() == [ "page_content", "fruit_name", "organic", @@ -590,7 +609,9 @@ async def test_save_doc_with_customized_metadata(self, engine, store_metadata): metadata={"fruit_name": "Apple", "organic": True}, ), ] - assert (await engine._aload_table_schema(table_name)).columns.keys() == [ + assert ( + await engine._run_as_async(engine._aload_table_schema(table_name)) + ).columns.keys() == [ "page_content", "fruit_name", "organic", @@ -628,7 +649,9 @@ async def test_save_doc_without_metadata(self, engine): metadata={}, ), ] - assert (await engine._aload_table_schema(table_name)).columns.keys() == [ + assert ( + await engine._run_as_async(engine._aload_table_schema(table_name)) + ).columns.keys() == [ "page_content", ] finally: @@ 
-782,7 +805,7 @@ async def test_delete_doc_with_customized_metadata( await saver.adelete(docs) assert len(await self._collect_async_items(loader.alazy_load())) == 0 - def test_sync_engine(self): + async def test_sync_engine(self): PostgresEngine._connector = None engine = PostgresEngine.from_instance( project_id=project_id, @@ -791,6 +814,7 @@ def test_sync_engine(self): database=db_name, ) assert engine + await engine.close() async def test_load_from_query_default_sync(self, sync_engine): try: @@ -815,7 +839,7 @@ async def test_load_from_query_default_sync(self, sync_engine): engine=sync_engine, query=f'SELECT * FROM "{table_name}";', ) - documents = loader.load() + documents = await loader.aload() assert documents == test_docs saver.delete(test_docs) diff --git a/tests/test_vectorstore.py b/tests/test_vectorstore.py new file mode 100644 index 00000000..7995cd63 --- /dev/null +++ b/tests/test_vectorstore.py @@ -0,0 +1,532 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import asyncio +import os +import uuid +from threading import Thread +from typing import Sequence + +import pytest +import pytest_asyncio +from google.cloud.sql.connector import Connector, IPTypes +from langchain_core.documents import Document +from langchain_core.embeddings import DeterministicFakeEmbedding +from sqlalchemy import text +from sqlalchemy.engine.row import RowMapping +from sqlalchemy.ext.asyncio import AsyncEngine, create_async_engine + +from langchain_google_cloud_sql_pg import Column, PostgresEngine, PostgresVectorStore + +DEFAULT_TABLE = "test_table" + str(uuid.uuid4()) +DEFAULT_TABLE_SYNC = "test_table_sync" + str(uuid.uuid4()) +CUSTOM_TABLE = "test-table-custom" + str(uuid.uuid4()) +VECTOR_SIZE = 768 + +embeddings_service = DeterministicFakeEmbedding(size=VECTOR_SIZE) +host = os.environ["IP_ADDRESS"] + +texts = ["foo", "bar", "baz"] +metadatas = [{"page": str(i), "source": "google.com"} for i in range(len(texts))] +docs = [ + Document(page_content=texts[i], metadata=metadatas[i]) for i in range(len(texts)) +] + +embeddings = [embeddings_service.embed_query(texts[i]) for i in range(len(texts))] + + +def get_env_var(key: str, desc: str) -> str: + v = os.environ.get(key) + if v is None: + raise ValueError(f"Must set env var {key} to: {desc}") + return v + + +async def aexecute( + engine: PostgresEngine, + query: str, +) -> None: + async def run(engine, query): + async with engine._pool.connect() as conn: + await conn.execute(text(query)) + await conn.commit() + + await engine._run_as_async(run(engine, query)) + + +async def afetch(engine: PostgresEngine, query: str) -> Sequence[RowMapping]: + async def run(engine, query): + async with engine._pool.connect() as conn: + result = await conn.execute(text(query)) + result_map = result.mappings() + result_fetch = result_map.fetchall() + return result_fetch + + return await engine._run_as_async(run(engine, query)) + + +@pytest.mark.asyncio(scope="class") +class TestVectorStore: + @pytest.fixture(scope="module") + def db_project(self) 
-> str: + return get_env_var("PROJECT_ID", "project id for google cloud") + + @pytest.fixture(scope="module") + def db_region(self) -> str: + return get_env_var("REGION", "region for cloud sql instance") + + @pytest.fixture(scope="module") + def db_instance(self) -> str: + return get_env_var("INSTANCE_ID", "instance for cloud sql") + + @pytest.fixture(scope="module") + def db_name(self) -> str: + return get_env_var("DATABASE_ID", "database name on cloud sql instance") + + @pytest.fixture(scope="module") + def user(self) -> str: + return get_env_var("DB_USER", "database user for cloud sql") + + @pytest.fixture(scope="module") + def password(self) -> str: + return get_env_var("DB_PASSWORD", "database password for cloud sql") + + @pytest_asyncio.fixture(scope="class") + async def engine(self, db_project, db_region, db_instance, db_name): + engine = await PostgresEngine.afrom_instance( + project_id=db_project, + instance=db_instance, + region=db_region, + database=db_name, + ) + + yield engine + await aexecute(engine, f'DROP TABLE IF EXISTS "{DEFAULT_TABLE}"') + await engine.close() + + @pytest_asyncio.fixture(scope="class") + async def vs(self, engine): + await engine.ainit_vectorstore_table(DEFAULT_TABLE, VECTOR_SIZE) + vs = await PostgresVectorStore.create( + engine, + embedding_service=embeddings_service, + table_name=DEFAULT_TABLE, + ) + yield vs + + @pytest_asyncio.fixture(scope="class") + async def engine_sync(self, db_project, db_region, db_instance, db_name): + engine_sync = PostgresEngine.from_instance( + project_id=db_project, + instance=db_instance, + region=db_region, + database=db_name, + ) + yield engine_sync + + await aexecute(engine_sync, f'DROP TABLE IF EXISTS "{DEFAULT_TABLE_SYNC}"') + await engine_sync.close() + + @pytest_asyncio.fixture(scope="class") + def vs_sync(self, engine_sync): + engine_sync.init_vectorstore_table(DEFAULT_TABLE_SYNC, VECTOR_SIZE) + + vs = PostgresVectorStore.create_sync( + engine_sync, + embedding_service=embeddings_service, + table_name=DEFAULT_TABLE_SYNC, + ) + yield vs + + @pytest_asyncio.fixture(scope="class") + async def vs_custom(self, engine): + await engine.ainit_vectorstore_table( + CUSTOM_TABLE, + VECTOR_SIZE, + id_column="myid", + content_column="mycontent", + embedding_column="myembedding", + metadata_columns=[Column("page", "TEXT"), Column("source", "TEXT")], + metadata_json_column="mymeta", + ) + vs = await PostgresVectorStore.create( + engine, + embedding_service=embeddings_service, + table_name=CUSTOM_TABLE, + id_column="myid", + content_column="mycontent", + embedding_column="myembedding", + metadata_columns=["page", "source"], + metadata_json_column="mymeta", + ) + yield vs + await aexecute(engine, f'DROP TABLE IF EXISTS "{CUSTOM_TABLE}"') + + async def test_init_with_constructor(self, engine): + with pytest.raises(Exception): + PostgresVectorStore( + engine, + embedding_service=embeddings_service, + table_name=CUSTOM_TABLE, + id_column="myid", + content_column="noname", + embedding_column="myembedding", + metadata_columns=["page", "source"], + metadata_json_column="mymeta", + ) + + async def test_post_init(self, engine): + with pytest.raises(ValueError): + await PostgresVectorStore.create( + engine, + embedding_service=embeddings_service, + table_name=CUSTOM_TABLE, + id_column="myid", + content_column="noname", + embedding_column="myembedding", + metadata_columns=["page", "source"], + metadata_json_column="mymeta", + ) + + async def test_aadd_texts(self, engine, vs): + ids = [str(uuid.uuid4()) for i in range(len(texts))] + await 
vs.aadd_texts(texts, ids=ids) + results = await afetch(engine, f'SELECT * FROM "{DEFAULT_TABLE}"') + assert len(results) == 3 + + ids = [str(uuid.uuid4()) for i in range(len(texts))] + await vs.aadd_texts(texts, metadatas, ids) + results = await afetch(engine, f'SELECT * FROM "{DEFAULT_TABLE}"') + assert len(results) == 6 + await aexecute(engine, f'TRUNCATE TABLE "{DEFAULT_TABLE}"') + + async def test_cross_env_add_texts(self, engine, vs): + ids = [str(uuid.uuid4()) for i in range(len(texts))] + vs.add_texts(texts, ids=ids) + results = await afetch(engine, f'SELECT * FROM "{DEFAULT_TABLE}"') + assert len(results) == 3 + vs.delete(ids) + + async def test_aadd_texts_edge_cases(self, engine, vs): + texts = ["Taylor's", '"Swift"', "best-friend"] + ids = [str(uuid.uuid4()) for i in range(len(texts))] + await vs.aadd_texts(texts, ids=ids) + results = await afetch(engine, f'SELECT * FROM "{DEFAULT_TABLE}"') + assert len(results) == 3 + await aexecute(engine, f'TRUNCATE TABLE "{DEFAULT_TABLE}"') + + async def test_aadd_docs(self, engine, vs): + ids = [str(uuid.uuid4()) for i in range(len(texts))] + await vs.aadd_documents(docs, ids=ids) + results = await afetch(engine, f'SELECT * FROM "{DEFAULT_TABLE}"') + assert len(results) == 3 + await aexecute(engine, f'TRUNCATE TABLE "{DEFAULT_TABLE}"') + + async def test_adelete(self, engine, vs): + ids = [str(uuid.uuid4()) for i in range(len(texts))] + await vs.aadd_texts(texts, ids=ids) + results = await afetch(engine, f'SELECT * FROM "{DEFAULT_TABLE}"') + assert len(results) == 3 + # delete an ID + await vs.adelete([ids[0]]) + results = await afetch(engine, f'SELECT * FROM "{DEFAULT_TABLE}"') + assert len(results) == 2 + + async def test_aadd_texts_custom(self, engine, vs_custom): + ids = [str(uuid.uuid4()) for i in range(len(texts))] + await vs_custom.aadd_texts(texts, ids=ids) + results = await afetch(engine, f'SELECT * FROM "{CUSTOM_TABLE}"') + assert len(results) == 3 + assert results[0]["mycontent"] == "foo" + assert results[0]["myembedding"] + assert results[0]["page"] is None + assert results[0]["source"] is None + + ids = [str(uuid.uuid4()) for i in range(len(texts))] + await vs_custom.aadd_texts(texts, metadatas, ids) + results = await afetch(engine, f'SELECT * FROM "{CUSTOM_TABLE}"') + assert len(results) == 6 + await aexecute(engine, f'TRUNCATE TABLE "{CUSTOM_TABLE}"') + + async def test_aadd_docs_custom(self, engine, vs_custom): + ids = [str(uuid.uuid4()) for i in range(len(texts))] + docs = [ + Document( + page_content=texts[i], + metadata={"page": str(i), "source": "google.com"}, + ) + for i in range(len(texts)) + ] + await vs_custom.aadd_documents(docs, ids=ids) + + results = await afetch(engine, f'SELECT * FROM "{CUSTOM_TABLE}"') + assert len(results) == 3 + assert results[0]["mycontent"] == "foo" + assert results[0]["myembedding"] + assert results[0]["page"] == "0" + assert results[0]["source"] == "google.com" + await aexecute(engine, f'TRUNCATE TABLE "{CUSTOM_TABLE}"') + + async def test_adelete_custom(self, engine, vs_custom): + ids = [str(uuid.uuid4()) for i in range(len(texts))] + await vs_custom.aadd_texts(texts, ids=ids) + results = await afetch(engine, f'SELECT * FROM "{CUSTOM_TABLE}"') + content = [result["mycontent"] for result in results] + assert len(results) == 3 + assert "foo" in content + # delete an ID + await vs_custom.adelete([ids[0]]) + results = await afetch(engine, f'SELECT * FROM "{CUSTOM_TABLE}"') + content = [result["mycontent"] for result in results] + assert len(results) == 2 + assert "foo" not in content + + 
async def test_add_docs(self, engine_sync, vs_sync): + ids = [str(uuid.uuid4()) for i in range(len(texts))] + vs_sync.add_documents(docs, ids=ids) + results = await afetch(engine_sync, f'SELECT * FROM "{DEFAULT_TABLE_SYNC}"') + assert len(results) == 3 + vs_sync.delete(ids) + + async def test_add_texts(self, engine_sync, vs_sync): + ids = [str(uuid.uuid4()) for i in range(len(texts))] + vs_sync.add_texts(texts, ids=ids) + results = await afetch(engine_sync, f'SELECT * FROM "{DEFAULT_TABLE_SYNC}"') + assert len(results) == 3 + await vs_sync.adelete(ids) + + async def test_cross_env(self, engine_sync, vs_sync): + ids = [str(uuid.uuid4()) for i in range(len(texts))] + await vs_sync.aadd_texts(texts, ids=ids) + results = await afetch(engine_sync, f'SELECT * FROM "{DEFAULT_TABLE_SYNC}"') + assert len(results) == 3 + await vs_sync.adelete(ids) + + async def test_create_vectorstore_with_invalid_parameters(self, engine): + with pytest.raises(ValueError): + await PostgresVectorStore.create( + engine, + embedding_service=embeddings_service, + table_name=CUSTOM_TABLE, + id_column="myid", + content_column="mycontent", + embedding_column="myembedding", + metadata_columns=["random_column"], # invalid metadata column + ) + with pytest.raises(ValueError): + await PostgresVectorStore.create( + engine, + embedding_service=embeddings_service, + table_name=CUSTOM_TABLE, + id_column="myid", + content_column="langchain_id", # invalid content column type + embedding_column="myembedding", + metadata_columns=["random_column"], + ) + with pytest.raises(ValueError): + await PostgresVectorStore.create( + engine, + embedding_service=embeddings_service, + table_name=CUSTOM_TABLE, + id_column="myid", + content_column="mycontent", + embedding_column="random_column", # invalid embedding column + metadata_columns=["random_column"], + ) + with pytest.raises(ValueError): + await PostgresVectorStore.create( + engine, + embedding_service=embeddings_service, + table_name=CUSTOM_TABLE, + id_column="myid", + content_column="mycontent", + embedding_column="langchain_id", # invalid embedding column data type + metadata_columns=["random_column"], + ) + + async def test_from_engine( + self, + db_project, + db_region, + db_instance, + db_name, + user, + password, + ): + async with Connector() as connector: + + async def getconn(): + conn = await connector.connect_async( # type: ignore + f"{db_project}:{db_region}:{db_instance}", + "asyncpg", + user=user, + password=password, + db=db_name, + enable_iam_auth=False, + ip_type=IPTypes.PUBLIC, + ) + return conn + + engine = create_async_engine( + "postgresql+asyncpg://", + async_creator=getconn, + ) + + engine = PostgresEngine.from_engine(engine) + table_name = "test_table" + str(uuid.uuid4()).replace("-", "_") + await engine.ainit_vectorstore_table(table_name, VECTOR_SIZE) + vs = await PostgresVectorStore.create( + engine, + embedding_service=embeddings_service, + table_name=table_name, + ) + await vs.aadd_texts(["foo"]) + results = await afetch(engine, f"SELECT * FROM {table_name}") + assert len(results) == 1 + + await aexecute(engine, f"DROP TABLE {table_name}") + + async def test_from_engine_loop_connector( + self, + db_project, + db_region, + db_instance, + db_name, + user, + password, + ): + async def init_connection_pool(connector: Connector) -> AsyncEngine: + async def getconn(): + conn = await connector.connect_async( + f"{db_project}:{db_region}:{db_instance}", + "asyncpg", + user=user, + password=password, + db=db_name, + enable_iam_auth=False, + ip_type="PUBLIC", + ) + return 
conn + + pool = create_async_engine( + "postgresql+asyncpg://", + async_creator=getconn, + ) + return pool + + loop = asyncio.new_event_loop() + thread = Thread(target=loop.run_forever, daemon=True) + thread.start() + + connector = Connector(loop=loop) + coro = init_connection_pool(connector) + pool = asyncio.run_coroutine_threadsafe(coro, loop).result() + engine = PostgresEngine.from_engine(pool, loop) + table_name = "test_table" + str(uuid.uuid4()).replace("-", "_") + await engine.ainit_vectorstore_table(table_name, VECTOR_SIZE) + vs = await PostgresVectorStore.create( + engine, + embedding_service=embeddings_service, + table_name=table_name, + ) + await vs.aadd_texts(["foo"]) + vs.add_texts(["foo"]) + results = await afetch(engine, f"SELECT * FROM {table_name}") + assert len(results) == 2 + + await aexecute(engine, f"TRUNCATE TABLE {table_name}") + + vs = PostgresVectorStore.create_sync( + engine, + embedding_service=embeddings_service, + table_name=table_name, + ) + await vs.aadd_texts(["foo"]) + vs.add_texts(["foo"]) + results = await afetch(engine, f"SELECT * FROM {table_name}") + assert len(results) == 2 + + await aexecute(engine, f"DROP TABLE {table_name}") + + async def test_from_engine_args_url( + self, + db_name, + user, + password, + ): + port = "5432" + url = f"postgresql+asyncpg://{user}:{password}@{host}:{port}/{db_name}" + engine = PostgresEngine.from_engine_args(url) + table_name = "test_table" + str(uuid.uuid4()).replace("-", "_") + await engine.ainit_vectorstore_table(table_name, VECTOR_SIZE) + vs = await PostgresVectorStore.create( + engine, + embedding_service=embeddings_service, + table_name=table_name, + ) + await vs.aadd_texts(["foo"]) + vs.add_texts(["foo"]) + results = await afetch(engine, f"SELECT * FROM {table_name}") + assert len(results) == 2 + + await aexecute(engine, f"TRUNCATE TABLE {table_name}") + vs = PostgresVectorStore.create_sync( + engine, + embedding_service=embeddings_service, + table_name=table_name, + ) + await vs.aadd_texts(["foo"]) + vs.add_texts(["bar"]) + results = await afetch(engine, f"SELECT * FROM {table_name}") + assert len(results) == 2 + await aexecute(engine, f"DROP TABLE {table_name}") + + async def test_from_engine_loop( + self, + db_name, + user, + password, + ): + port = "5432" + url = f"postgresql+asyncpg://{user}:{password}@{host}:{port}/{db_name}" + + loop = asyncio.new_event_loop() + thread = Thread(target=loop.run_forever, daemon=True) + thread.start() + pool = create_async_engine(url) + engine = PostgresEngine.from_engine(pool, loop) + + table_name = "test_table" + str(uuid.uuid4()).replace("-", "_") + await engine.ainit_vectorstore_table(table_name, VECTOR_SIZE) + vs = await PostgresVectorStore.create( + engine, + embedding_service=embeddings_service, + table_name=table_name, + ) + await vs.aadd_texts(["foo"]) + vs.add_texts(["foo"]) + results = await afetch(engine, f"SELECT * FROM {table_name}") + assert len(results) == 2 + + await aexecute(engine, f"TRUNCATE TABLE {table_name}") + vs = PostgresVectorStore.create_sync( + engine, + embedding_service=embeddings_service, + table_name=table_name, + ) + await vs.aadd_texts(["foo"]) + vs.add_texts(["bar"]) + results = await afetch(engine, f"SELECT * FROM {table_name}") + assert len(results) == 2 + await aexecute(engine, f"DROP TABLE {table_name}") diff --git a/tests/test_cloudsql_vectorstore_from_methods.py b/tests/test_vectorstore_from_methods.py similarity index 68% rename from tests/test_cloudsql_vectorstore_from_methods.py rename to tests/test_vectorstore_from_methods.py 
index e7e143cb..8b161f6f 100644 --- a/tests/test_cloudsql_vectorstore_from_methods.py +++ b/tests/test_vectorstore_from_methods.py @@ -14,11 +14,15 @@ import os import uuid +from typing import Sequence import pytest import pytest_asyncio from langchain_core.documents import Document from langchain_core.embeddings import DeterministicFakeEmbedding +from sqlalchemy import VARCHAR, text +from sqlalchemy.engine.row import RowMapping +from sqlalchemy.ext.asyncio import create_async_engine from langchain_google_cloud_sql_pg import Column, PostgresEngine, PostgresVectorStore @@ -46,6 +50,29 @@ def get_env_var(key: str, desc: str) -> str: return v +async def aexecute( + engine: PostgresEngine, + query: str, +) -> None: + async def run(engine, query): + async with engine._pool.connect() as conn: + await conn.execute(text(query)) + await conn.commit() + + await engine._run_as_async(run(engine, query)) + + +async def afetch(engine: PostgresEngine, query: str) -> Sequence[RowMapping]: + async def run(engine, query): + async with engine._pool.connect() as conn: + result = await conn.execute(text(query)) + result_map = result.mappings() + result_fetch = result_map.fetchall() + return result_fetch + + return await engine._run_as_async(run(engine, query)) + + @pytest.mark.asyncio class TestVectorStoreFromMethods: @pytest.fixture(scope="module") @@ -83,12 +110,12 @@ async def engine(self, db_project, db_region, db_instance, db_name): store_metadata=False, ) yield engine - await engine._aexecute(f"DROP TABLE IF EXISTS {DEFAULT_TABLE}") - await engine._aexecute(f"DROP TABLE IF EXISTS {CUSTOM_TABLE}") - await engine._engine.dispose() + await aexecute(engine, f"DROP TABLE IF EXISTS {DEFAULT_TABLE}") + await aexecute(engine, f"DROP TABLE IF EXISTS {CUSTOM_TABLE}") + await engine.close() @pytest_asyncio.fixture - def engine_sync(self, db_project, db_region, db_instance, db_name): + async def engine_sync(self, db_project, db_region, db_instance, db_name): engine = PostgresEngine.from_instance( project_id=db_project, instance=db_instance, @@ -98,9 +125,8 @@ def engine_sync(self, db_project, db_region, db_instance, db_name): engine.init_vectorstore_table(DEFAULT_TABLE_SYNC, VECTOR_SIZE) yield engine - engine._execute(f"DROP TABLE IF EXISTS {DEFAULT_TABLE_SYNC}") - - engine._engine.dispose() + await aexecute(engine, f"DROP TABLE IF EXISTS {DEFAULT_TABLE_SYNC}") + await engine.close() async def test_afrom_texts(self, engine): ids = [str(uuid.uuid4()) for i in range(len(texts))] @@ -112,9 +138,9 @@ async def test_afrom_texts(self, engine): metadatas=metadatas, ids=ids, ) - results = await engine._afetch(f"SELECT * FROM {DEFAULT_TABLE}") + results = await afetch(engine, f"SELECT * FROM {DEFAULT_TABLE}") assert len(results) == 3 - await engine._aexecute(f"TRUNCATE TABLE {DEFAULT_TABLE}") + await aexecute(engine, f"TRUNCATE TABLE {DEFAULT_TABLE}") async def test_from_texts(self, engine_sync): ids = [str(uuid.uuid4()) for i in range(len(texts))] @@ -126,9 +152,9 @@ async def test_from_texts(self, engine_sync): metadatas=metadatas, ids=ids, ) - results = engine_sync._fetch(f"SELECT * FROM {DEFAULT_TABLE_SYNC}") + results = await afetch(engine_sync, f"SELECT * FROM {DEFAULT_TABLE_SYNC}") assert len(results) == 3 - engine_sync._execute(f"TRUNCATE TABLE {DEFAULT_TABLE_SYNC}") + await aexecute(engine_sync, f"TRUNCATE TABLE {DEFAULT_TABLE_SYNC}") async def test_afrom_docs(self, engine): ids = [str(uuid.uuid4()) for i in range(len(texts))] @@ -139,9 +165,9 @@ async def test_afrom_docs(self, engine): DEFAULT_TABLE, ids=ids, ) - 
results = await engine._afetch(f"SELECT * FROM {DEFAULT_TABLE}") + results = await afetch(engine, f"SELECT * FROM {DEFAULT_TABLE}") assert len(results) == 3 - await engine._aexecute(f"TRUNCATE TABLE {DEFAULT_TABLE}") + await aexecute(engine, f"TRUNCATE TABLE {DEFAULT_TABLE}") async def test_from_docs(self, engine_sync): ids = [str(uuid.uuid4()) for i in range(len(texts))] @@ -152,9 +178,35 @@ async def test_from_docs(self, engine_sync): DEFAULT_TABLE_SYNC, ids=ids, ) - results = engine_sync._fetch(f"SELECT * FROM {DEFAULT_TABLE_SYNC}") + results = await afetch(engine_sync, f"SELECT * FROM {DEFAULT_TABLE_SYNC}") + assert len(results) == 3 + await aexecute(engine_sync, f"TRUNCATE TABLE {DEFAULT_TABLE_SYNC}") + + async def test_afrom_docs_cross_env(self, engine_sync): + ids = [str(uuid.uuid4()) for i in range(len(texts))] + await PostgresVectorStore.afrom_documents( + docs, + embeddings_service, + engine_sync, + DEFAULT_TABLE_SYNC, + ids=ids, + ) + results = await afetch(engine_sync, f"SELECT * FROM {DEFAULT_TABLE_SYNC}") + assert len(results) == 3 + await aexecute(engine_sync, f"TRUNCATE TABLE {DEFAULT_TABLE_SYNC}") + + async def test_from_docs_cross_env(self, engine, engine_sync): + ids = [str(uuid.uuid4()) for i in range(len(texts))] + PostgresVectorStore.from_documents( + docs, + embeddings_service, + engine, + DEFAULT_TABLE_SYNC, + ids=ids, + ) + results = await afetch(engine, f"SELECT * FROM {DEFAULT_TABLE_SYNC}") assert len(results) == 3 - engine_sync._execute(f"TRUNCATE TABLE {DEFAULT_TABLE_SYNC}") + await aexecute(engine, f"TRUNCATE TABLE {DEFAULT_TABLE_SYNC}") async def test_afrom_texts_custom(self, engine): ids = [str(uuid.uuid4()) for i in range(len(texts))] @@ -169,7 +221,7 @@ async def test_afrom_texts_custom(self, engine): embedding_column="myembedding", metadata_columns=["page", "source"], ) - results = await engine._afetch(f"SELECT * FROM {CUSTOM_TABLE}") + results = await afetch(engine, f"SELECT * FROM {CUSTOM_TABLE}") assert len(results) == 3 assert results[0]["mycontent"] == "foo" assert results[0]["myembedding"] @@ -197,10 +249,10 @@ async def test_afrom_docs_custom(self, engine): metadata_columns=["page", "source"], ) - results = await engine._afetch(f"SELECT * FROM {CUSTOM_TABLE}") + results = await afetch(engine, f"SELECT * FROM {CUSTOM_TABLE}") assert len(results) == 3 assert results[0]["mycontent"] == "foo" assert results[0]["myembedding"] assert results[0]["page"] == "0" assert results[0]["source"] == "google.com" - await engine._aexecute(f"TRUNCATE TABLE {CUSTOM_TABLE}") + await aexecute(engine, f"TRUNCATE TABLE {CUSTOM_TABLE}") diff --git a/tests/test_vectorstore_index.py b/tests/test_vectorstore_index.py new file mode 100644 index 00000000..7c240061 --- /dev/null +++ b/tests/test_vectorstore_index.py @@ -0,0 +1,223 @@ +# Copyright 2024 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ + +import os +import sys +import uuid + +import pytest +import pytest_asyncio +from langchain_core.documents import Document +from langchain_core.embeddings import DeterministicFakeEmbedding +from sqlalchemy import text + +from langchain_google_cloud_sql_pg import PostgresEngine, PostgresVectorStore +from langchain_google_cloud_sql_pg.indexes import ( + DEFAULT_INDEX_NAME_SUFFIX, + DistanceStrategy, + HNSWIndex, + IVFFlatIndex, +) + +DEFAULT_TABLE = "test_table" + str(uuid.uuid4()).replace("-", "_") +CUSTOM_TABLE = "test_table_custom" + str(uuid.uuid4()).replace("-", "_") +DEFAULT_INDEX_NAME = DEFAULT_TABLE + DEFAULT_INDEX_NAME_SUFFIX +VECTOR_SIZE = 768 + +embeddings_service = DeterministicFakeEmbedding(size=VECTOR_SIZE) + +texts = ["foo", "bar", "baz"] +ids = [str(uuid.uuid4()) for i in range(len(texts))] +metadatas = [{"page": str(i), "source": "google.com"} for i in range(len(texts))] +docs = [ + Document(page_content=texts[i], metadata=metadatas[i]) for i in range(len(texts)) +] + +embeddings = [embeddings_service.embed_query("foo") for i in range(len(texts))] + + +def get_env_var(key: str, desc: str) -> str: + v = os.environ.get(key) + if v is None: + raise ValueError(f"Must set env var {key} to: {desc}") + return v + + +async def aexecute( + engine: PostgresEngine, + query: str, +) -> None: + async def run(engine, query): + async with engine._pool.connect() as conn: + await conn.execute(text(query)) + await conn.commit() + + await engine._run_as_async(run(engine, query)) + + +@pytest.mark.asyncio(scope="class") +class TestIndex: + @pytest.fixture(scope="module") + def db_project(self) -> str: + return get_env_var("PROJECT_ID", "project id for google cloud") + + @pytest.fixture(scope="module") + def db_region(self) -> str: + return get_env_var("REGION", "region for cloud sql instance") + + @pytest.fixture(scope="module") + def db_instance(self) -> str: + return get_env_var("INSTANCE_ID", "instance for cloud sql") + + @pytest.fixture(scope="module") + def db_name(self) -> str: + return get_env_var("DATABASE_ID", "instance for cloud sql") + + @pytest_asyncio.fixture(scope="class") + async def engine(self, db_project, db_region, db_instance, db_name): + engine = PostgresEngine.from_instance( + project_id=db_project, + instance=db_instance, + region=db_region, + database=db_name, + ) + yield engine + await aexecute(engine, f"DROP TABLE IF EXISTS {DEFAULT_TABLE}") + await engine.close() + + @pytest_asyncio.fixture(scope="class") + async def vs(self, engine): + engine.init_vectorstore_table(DEFAULT_TABLE, VECTOR_SIZE) + vs = PostgresVectorStore.create_sync( + engine, + embedding_service=embeddings_service, + table_name=DEFAULT_TABLE, + ) + + vs.add_texts(texts, ids=ids) + vs.drop_vector_index() + yield vs + + async def test_aapply_vector_index(self, vs): + index = HNSWIndex() + vs.apply_vector_index(index) + assert vs.is_valid_index(DEFAULT_INDEX_NAME) + + async def test_areindex(self, vs): + if not vs.is_valid_index(DEFAULT_INDEX_NAME): + index = HNSWIndex() + vs.apply_vector_index(index) + vs.reindex() + vs.reindex(DEFAULT_INDEX_NAME) + assert vs.is_valid_index(DEFAULT_INDEX_NAME) + + async def test_dropindex(self, vs): + vs.drop_vector_index() + result = vs.is_valid_index(DEFAULT_INDEX_NAME) + assert not result + + async def test_aapply_vector_index_ivfflat(self, vs): + index = IVFFlatIndex(distance_strategy=DistanceStrategy.EUCLIDEAN) + vs.apply_vector_index(index, concurrently=True) + assert vs.is_valid_index(DEFAULT_INDEX_NAME) + index = IVFFlatIndex( + name="secondindex", + 
distance_strategy=DistanceStrategy.INNER_PRODUCT, + ) + vs.apply_vector_index(index) + assert vs.is_valid_index("secondindex") + vs.drop_vector_index("secondindex") + + async def test_is_valid_index(self, vs): + is_valid = vs.is_valid_index("invalid_index") + assert is_valid == False + + +@pytest.mark.asyncio(scope="class") +class TestAsyncIndex: + @pytest.fixture(scope="module") + def db_project(self) -> str: + return get_env_var("PROJECT_ID", "project id for google cloud") + + @pytest.fixture(scope="module") + def db_region(self) -> str: + return get_env_var("REGION", "region for cloud sql instance") + + @pytest.fixture(scope="module") + def db_instance(self) -> str: + return get_env_var("INSTANCE_ID", "instance for cloud sql") + + @pytest.fixture(scope="module") + def db_name(self) -> str: + return get_env_var("DATABASE_ID", "instance for cloud sql") + + @pytest_asyncio.fixture(scope="class") + async def engine(self, db_project, db_region, db_instance, db_name): + engine = await PostgresEngine.afrom_instance( + project_id=db_project, + instance=db_instance, + region=db_region, + database=db_name, + ) + yield engine + await aexecute(engine, f"DROP TABLE IF EXISTS {DEFAULT_TABLE}") + await engine.close() + + @pytest_asyncio.fixture(scope="class") + async def vs(self, engine): + await engine.ainit_vectorstore_table(DEFAULT_TABLE, VECTOR_SIZE) + vs = await PostgresVectorStore.create( + engine, + embedding_service=embeddings_service, + table_name=DEFAULT_TABLE, + ) + + await vs.aadd_texts(texts, ids=ids) + await vs.adrop_vector_index() + yield vs + + async def test_aapply_vector_index(self, vs): + index = HNSWIndex() + await vs.aapply_vector_index(index) + assert await vs.ais_valid_index(DEFAULT_INDEX_NAME) + + async def test_areindex(self, vs): + if not await vs.ais_valid_index(DEFAULT_INDEX_NAME): + index = HNSWIndex() + await vs.aapply_vector_index(index) + await vs.areindex() + await vs.areindex(DEFAULT_INDEX_NAME) + assert await vs.ais_valid_index(DEFAULT_INDEX_NAME) + + async def test_dropindex(self, vs): + await vs.adrop_vector_index() + result = await vs.ais_valid_index(DEFAULT_INDEX_NAME) + assert not result + + async def test_aapply_vector_index_ivfflat(self, vs): + index = IVFFlatIndex(distance_strategy=DistanceStrategy.EUCLIDEAN) + await vs.aapply_vector_index(index, concurrently=True) + assert await vs.ais_valid_index(DEFAULT_INDEX_NAME) + index = IVFFlatIndex( + name="secondindex", + distance_strategy=DistanceStrategy.INNER_PRODUCT, + ) + await vs.aapply_vector_index(index) + assert await vs.ais_valid_index("secondindex") + await vs.adrop_vector_index("secondindex") + + async def test_is_valid_index(self, vs): + is_valid = await vs.ais_valid_index("invalid_index") + assert is_valid == False diff --git a/tests/test_cloudsql_vectorstore_search.py b/tests/test_vectorstore_search.py similarity index 79% rename from tests/test_cloudsql_vectorstore_search.py rename to tests/test_vectorstore_search.py index 65c6d8bc..f2c1cb17 100644 --- a/tests/test_cloudsql_vectorstore_search.py +++ b/tests/test_vectorstore_search.py @@ -19,12 +19,14 @@ import pytest_asyncio from langchain_core.documents import Document from langchain_core.embeddings import DeterministicFakeEmbedding +from sqlalchemy import text from langchain_google_cloud_sql_pg import Column, PostgresEngine, PostgresVectorStore from langchain_google_cloud_sql_pg.indexes import DistanceStrategy, HNSWQueryOptions DEFAULT_TABLE = "test_table" + str(uuid.uuid4()).replace("-", "_") CUSTOM_TABLE = "test_table_custom" + 
str(uuid.uuid4()).replace("-", "_") +CUSTOM_TABLE_SYNC = "test_table_sync" + str(uuid.uuid4()).replace("-", "_") VECTOR_SIZE = 768 embeddings_service = DeterministicFakeEmbedding(size=VECTOR_SIZE) @@ -46,6 +48,18 @@ def get_env_var(key: str, desc: str) -> str: return v +async def aexecute( + engine: PostgresEngine, + query: str, +) -> None: + async def run(engine, query): + async with engine._pool.connect() as conn: + await conn.execute(text(query)) + await conn.commit() + + await engine._run_as_async(run(engine, query)) + + @pytest.mark.asyncio(scope="class") class TestVectorStoreSearch: @pytest.fixture(scope="module") @@ -73,6 +87,8 @@ async def engine(self, db_project, db_region, db_instance, db_name): database=db_name, ) yield engine + await aexecute(engine, f"DROP TABLE IF EXISTS {DEFAULT_TABLE}") + await engine.close() @pytest_asyncio.fixture(scope="class") async def vs(self, engine): @@ -87,11 +103,9 @@ async def vs(self, engine): ids = [str(uuid.uuid4()) for i in range(len(texts))] await vs.aadd_documents(docs, ids=ids) yield vs - await engine._aexecute(f"DROP TABLE IF EXISTS {DEFAULT_TABLE}") - await engine._engine.dispose() @pytest_asyncio.fixture(scope="class") - def engine_sync(self, db_project, db_region, db_instance, db_name): + async def engine_sync(self, db_project, db_region, db_instance, db_name): engine = PostgresEngine.from_instance( project_id=db_project, instance=db_instance, @@ -99,6 +113,8 @@ def engine_sync(self, db_project, db_region, db_instance, db_name): database=db_name, ) yield engine + await aexecute(engine, f"DROP TABLE IF EXISTS {CUSTOM_TABLE}") + await engine.close() @pytest_asyncio.fixture(scope="class") async def vs_custom(self, engine_sync): @@ -126,8 +142,6 @@ async def vs_custom(self, engine_sync): ) vs_custom.add_documents(docs, ids=ids) yield vs_custom - engine_sync._aexecute(f"DROP TABLE IF EXISTS {CUSTOM_TABLE}") - engine_sync._engine.dispose() async def test_asimilarity_search(self, vs): results = await vs.asimilarity_search("foo", k=1) @@ -213,6 +227,63 @@ async def test_amax_marginal_relevance_search_vector_score(self, vs): ) assert results[0][0] == Document(page_content="bar") + +class TestVectorStoreSearchSync: + @pytest.fixture(scope="module") + def db_project(self) -> str: + return get_env_var("PROJECT_ID", "project id for google cloud") + + @pytest.fixture(scope="module") + def db_region(self) -> str: + return get_env_var("REGION", "region for cloud sql instance") + + @pytest.fixture(scope="module") + def db_instance(self) -> str: + return get_env_var("INSTANCE_ID", "instance for cloud sql") + + @pytest.fixture(scope="module") + def db_name(self) -> str: + return get_env_var("DATABASE_ID", "instance for cloud sql") + + @pytest_asyncio.fixture(scope="class") + async def engine_sync(self, db_project, db_region, db_instance, db_name): + engine = PostgresEngine.from_instance( + project_id=db_project, + instance=db_instance, + region=db_region, + database=db_name, + ) + yield engine + await aexecute(engine, f"DROP TABLE IF EXISTS {CUSTOM_TABLE_SYNC}") + await engine.close() + + @pytest.fixture(scope="class") + def vs_custom(self, engine_sync): + engine_sync.init_vectorstore_table( + CUSTOM_TABLE_SYNC, + VECTOR_SIZE, + id_column="myid", + content_column="mycontent", + embedding_column="myembedding", + metadata_columns=[ + Column("page", "TEXT"), + Column("source", "TEXT"), + ], + store_metadata=False, + ) + + vs_custom = PostgresVectorStore.create_sync( + engine_sync, + embedding_service=embeddings_service, + table_name=CUSTOM_TABLE_SYNC, + 
id_column="myid", + content_column="mycontent", + embedding_column="myembedding", + index_query_options=HNSWQueryOptions(ef_search=1), + ) + vs_custom.add_documents(docs, ids=ids) + yield vs_custom + def test_similarity_search(self, vs_custom): results = vs_custom.similarity_search("foo", k=1) assert len(results) == 1 From 9709a9aa686f79698cbf4cd0b8a43fab4a9eb90d Mon Sep 17 00:00:00 2001 From: Averi Kitsch Date: Tue, 10 Sep 2024 15:14:33 -0700 Subject: [PATCH 05/12] chore: release 0.10.0 (#210) Release-As: 0.10.0 --- samples/langchain_on_vertexai/README.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/samples/langchain_on_vertexai/README.md b/samples/langchain_on_vertexai/README.md index 63974b05..3c6ea3dc 100644 --- a/samples/langchain_on_vertexai/README.md +++ b/samples/langchain_on_vertexai/README.md @@ -30,5 +30,3 @@ Build and deploy an Agent with RAG tool and Memory | [retriever_agent_with_histo 1. Use [`create_embeddings.py`](create_embeddings.py) to add data to your vector store. Learn more at [Deploying a RAG Application with Cloud SQL for Postgres to LangChain on Vertex AI](https://github.com/GoogleCloudPlatform/generative-ai/blob/main/gemini/reasoning-engine/tutorial_cloud_sql_pg_rag_agent.ipynb). - - From 04b48ef1f60cdb07bda70ac48d4598796c2b0f3f Mon Sep 17 00:00:00 2001 From: Averi Kitsch Date: Wed, 11 Sep 2024 14:55:22 -0700 Subject: [PATCH 06/12] Update DEVELOPER.md (#200) Co-authored-by: Wenxin Du <117315983+duwenxin99@users.noreply.github.com> --- DEVELOPER.md | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/DEVELOPER.md b/DEVELOPER.md index 9e9ef215..a2089553 100644 --- a/DEVELOPER.md +++ b/DEVELOPER.md @@ -18,7 +18,7 @@ Learn more by reading [How should I write my commits?](https://github.com/google ### Run tests locally -1. Set environment variables for `INSTANCE_ID`, `DATABASE_ID`, `REGION`, `DB_USER`, `DB_PASSWORD` +1. Set environment variables for `INSTANCE_ID`, `DATABASE_ID`, `REGION`, `DB_USER`, `DB_PASSWORD`, `IAM_ACCOUNT`. 1. Run pytest to automatically run all tests: @@ -26,6 +26,14 @@ Learn more by reading [How should I write my commits?](https://github.com/google pytest ``` +Notes: + +* Tests use both IAM and built-in authentication. + * Learn how to set up a built-in databases user at [Cloud SQL built-in database authentication](https://cloud.google.com/sql/docs/postgres/built-in-authentication). + * Local tests will run against your `gcloud` credentials. Use `gcloud` to login with your personal account or a service account. This account will be used to run IAM tests. Learn how to set up access to the database at [Manage users with IAM database authentication](https://cloud.google.com/sql/docs/postgres/add-manage-iam-users). The "IAM_ACCOUNT" environment variable is also used to test authentication to override the local account. A personal account or a service account can be used for this test. + * You may need to grant access to the public schema for your new database user: `GRANT ALL ON SCHEMA public TO myaccount@example.com;` + + ### CI Platform Setup Cloud Build is used to run tests against Google Cloud resources in test project: langchain-cloud-sql-testing. @@ -121,4 +129,4 @@ The kokoro docs pipeline runs when a new release is created. 
See `.kokoro/` for [triggers]: https://console.cloud.google.com/cloud-build/triggers?e=13802955&project=langchain-cloud-sql-testing [vectorstore]: https://github.com/googleapis/langchain-google-cloud-sql-pg-python/tree/main/docs/vector_store.ipynb [loader]: https://github.com/googleapis/langchain-google-cloud-sql-pg-python/tree/main/docs/document_loader.ipynb -[history]: https://github.com/googleapis/langchain-google-cloud-sql-pg-python/tree/main/docs/chat_message_history.ipynb \ No newline at end of file +[history]: https://github.com/googleapis/langchain-google-cloud-sql-pg-python/tree/main/docs/chat_message_history.ipynb From 42baac64c9c5edba9e2ed53c983155579805fd5b Mon Sep 17 00:00:00 2001 From: Mend Renovate Date: Thu, 12 Sep 2024 00:32:29 +0200 Subject: [PATCH 07/12] chore(deps): update python-nonmajor (#201) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This PR contains the following updates: | Package | Change | Age | Adoption | Passing | Confidence | |---|---|---|---|---|---| | [SQLAlchemy](https://www.sqlalchemy.org) ([changelog](https://docs.sqlalchemy.org/en/latest/changelog/)) | `==2.0.32` -> `==2.0.34` | [![age](https://developer.mend.io/api/mc/badges/age/pypi/SQLAlchemy/2.0.34?slim=true)](https://docs.renovatebot.com/merge-confidence/) | [![adoption](https://developer.mend.io/api/mc/badges/adoption/pypi/SQLAlchemy/2.0.34?slim=true)](https://docs.renovatebot.com/merge-confidence/) | [![passing](https://developer.mend.io/api/mc/badges/compatibility/pypi/SQLAlchemy/2.0.32/2.0.34?slim=true)](https://docs.renovatebot.com/merge-confidence/) | [![confidence](https://developer.mend.io/api/mc/badges/confidence/pypi/SQLAlchemy/2.0.32/2.0.34?slim=true)](https://docs.renovatebot.com/merge-confidence/) | | [cloud-sql-python-connector](https://redirect.github.com/GoogleCloudPlatform/cloud-sql-python-connector) | `==1.10.0` -> `==1.12.0` | [![age](https://developer.mend.io/api/mc/badges/age/pypi/cloud-sql-python-connector/1.12.0?slim=true)](https://docs.renovatebot.com/merge-confidence/) | [![adoption](https://developer.mend.io/api/mc/badges/adoption/pypi/cloud-sql-python-connector/1.12.0?slim=true)](https://docs.renovatebot.com/merge-confidence/) | [![passing](https://developer.mend.io/api/mc/badges/compatibility/pypi/cloud-sql-python-connector/1.10.0/1.12.0?slim=true)](https://docs.renovatebot.com/merge-confidence/) | [![confidence](https://developer.mend.io/api/mc/badges/confidence/pypi/cloud-sql-python-connector/1.10.0/1.12.0?slim=true)](https://docs.renovatebot.com/merge-confidence/) | | [google-cloud-aiplatform](https://redirect.github.com/googleapis/python-aiplatform) | `==1.57.0` -> `==1.65.0` | [![age](https://developer.mend.io/api/mc/badges/age/pypi/google-cloud-aiplatform/1.65.0?slim=true)](https://docs.renovatebot.com/merge-confidence/) | [![adoption](https://developer.mend.io/api/mc/badges/adoption/pypi/google-cloud-aiplatform/1.65.0?slim=true)](https://docs.renovatebot.com/merge-confidence/) | [![passing](https://developer.mend.io/api/mc/badges/compatibility/pypi/google-cloud-aiplatform/1.57.0/1.65.0?slim=true)](https://docs.renovatebot.com/merge-confidence/) | [![confidence](https://developer.mend.io/api/mc/badges/confidence/pypi/google-cloud-aiplatform/1.57.0/1.65.0?slim=true)](https://docs.renovatebot.com/merge-confidence/) | | [google-cloud-resource-manager](https://redirect.github.com/googleapis/google-cloud-python/tree/main/packages/google-cloud-resource-manager) 
([source](https://redirect.github.com/googleapis/google-cloud-python)) | `==1.12.3` -> `==1.12.5` | [![age](https://developer.mend.io/api/mc/badges/age/pypi/google-cloud-resource-manager/1.12.5?slim=true)](https://docs.renovatebot.com/merge-confidence/) | [![adoption](https://developer.mend.io/api/mc/badges/adoption/pypi/google-cloud-resource-manager/1.12.5?slim=true)](https://docs.renovatebot.com/merge-confidence/) | [![passing](https://developer.mend.io/api/mc/badges/compatibility/pypi/google-cloud-resource-manager/1.12.3/1.12.5?slim=true)](https://docs.renovatebot.com/merge-confidence/) | [![confidence](https://developer.mend.io/api/mc/badges/confidence/pypi/google-cloud-resource-manager/1.12.3/1.12.5?slim=true)](https://docs.renovatebot.com/merge-confidence/) | | [langchain](https://redirect.github.com/langchain-ai/langchain) ([changelog](https://redirect.github.com/langchain-ai/langchain/releases?q=tag%3A%22langchain%3D%3D0%22&expanded=true)) | `==0.2.14` -> `==0.2.16` | [![age](https://developer.mend.io/api/mc/badges/age/pypi/langchain/0.2.16?slim=true)](https://docs.renovatebot.com/merge-confidence/) | [![adoption](https://developer.mend.io/api/mc/badges/adoption/pypi/langchain/0.2.16?slim=true)](https://docs.renovatebot.com/merge-confidence/) | [![passing](https://developer.mend.io/api/mc/badges/compatibility/pypi/langchain/0.2.14/0.2.16?slim=true)](https://docs.renovatebot.com/merge-confidence/) | [![confidence](https://developer.mend.io/api/mc/badges/confidence/pypi/langchain/0.2.14/0.2.16?slim=true)](https://docs.renovatebot.com/merge-confidence/) | | [langchain-core](https://redirect.github.com/langchain-ai/langchain) ([changelog](https://redirect.github.com/langchain-ai/langchain/releases?q=tag%3A%22langchain-core%3D%3D0%22&expanded=true)) | `==0.2.35` -> `==0.2.38` | [![age](https://developer.mend.io/api/mc/badges/age/pypi/langchain-core/0.2.38?slim=true)](https://docs.renovatebot.com/merge-confidence/) | [![adoption](https://developer.mend.io/api/mc/badges/adoption/pypi/langchain-core/0.2.38?slim=true)](https://docs.renovatebot.com/merge-confidence/) | [![passing](https://developer.mend.io/api/mc/badges/compatibility/pypi/langchain-core/0.2.35/0.2.38?slim=true)](https://docs.renovatebot.com/merge-confidence/) | [![confidence](https://developer.mend.io/api/mc/badges/confidence/pypi/langchain-core/0.2.35/0.2.38?slim=true)](https://docs.renovatebot.com/merge-confidence/) | | [langchain-google-cloud-sql-pg](https://redirect.github.com/googleapis/langchain-google-cloud-sql-pg-python) ([changelog](https://redirect.github.com/googleapis/langchain-google-cloud-sql-pg-python/blob/main/CHANGELOG.md)) | `==0.6.1` -> `==0.9.0` | [![age](https://developer.mend.io/api/mc/badges/age/pypi/langchain-google-cloud-sql-pg/0.9.0?slim=true)](https://docs.renovatebot.com/merge-confidence/) | [![adoption](https://developer.mend.io/api/mc/badges/adoption/pypi/langchain-google-cloud-sql-pg/0.9.0?slim=true)](https://docs.renovatebot.com/merge-confidence/) | [![passing](https://developer.mend.io/api/mc/badges/compatibility/pypi/langchain-google-cloud-sql-pg/0.6.1/0.9.0?slim=true)](https://docs.renovatebot.com/merge-confidence/) | [![confidence](https://developer.mend.io/api/mc/badges/confidence/pypi/langchain-google-cloud-sql-pg/0.6.1/0.9.0?slim=true)](https://docs.renovatebot.com/merge-confidence/) | | [langchain-google-cloud-sql-pg](https://redirect.github.com/googleapis/langchain-google-cloud-sql-pg-python) 
([changelog](https://redirect.github.com/googleapis/langchain-google-cloud-sql-pg-python/blob/main/CHANGELOG.md)) | `==0.7.0` -> `==0.9.0` | [![age](https://developer.mend.io/api/mc/badges/age/pypi/langchain-google-cloud-sql-pg/0.9.0?slim=true)](https://docs.renovatebot.com/merge-confidence/) | [![adoption](https://developer.mend.io/api/mc/badges/adoption/pypi/langchain-google-cloud-sql-pg/0.9.0?slim=true)](https://docs.renovatebot.com/merge-confidence/) | [![passing](https://developer.mend.io/api/mc/badges/compatibility/pypi/langchain-google-cloud-sql-pg/0.7.0/0.9.0?slim=true)](https://docs.renovatebot.com/merge-confidence/) | [![confidence](https://developer.mend.io/api/mc/badges/confidence/pypi/langchain-google-cloud-sql-pg/0.7.0/0.9.0?slim=true)](https://docs.renovatebot.com/merge-confidence/) | | [langchain-google-vertexai](https://redirect.github.com/langchain-ai/langchain-google) | `==1.0.4` -> `==1.0.10` | [![age](https://developer.mend.io/api/mc/badges/age/pypi/langchain-google-vertexai/1.0.10?slim=true)](https://docs.renovatebot.com/merge-confidence/) | [![adoption](https://developer.mend.io/api/mc/badges/adoption/pypi/langchain-google-vertexai/1.0.10?slim=true)](https://docs.renovatebot.com/merge-confidence/) | [![passing](https://developer.mend.io/api/mc/badges/compatibility/pypi/langchain-google-vertexai/1.0.4/1.0.10?slim=true)](https://docs.renovatebot.com/merge-confidence/) | [![confidence](https://developer.mend.io/api/mc/badges/confidence/pypi/langchain-google-vertexai/1.0.4/1.0.10?slim=true)](https://docs.renovatebot.com/merge-confidence/) | | [numpy](https://numpy.org) ([source](https://redirect.github.com/numpy/numpy), [changelog](https://numpy.org/doc/stable/release)) | `==1.24.4` -> `==1.26.4` | [![age](https://developer.mend.io/api/mc/badges/age/pypi/numpy/1.26.4?slim=true)](https://docs.renovatebot.com/merge-confidence/) | [![adoption](https://developer.mend.io/api/mc/badges/adoption/pypi/numpy/1.26.4?slim=true)](https://docs.renovatebot.com/merge-confidence/) | [![passing](https://developer.mend.io/api/mc/badges/compatibility/pypi/numpy/1.24.4/1.26.4?slim=true)](https://docs.renovatebot.com/merge-confidence/) | [![confidence](https://developer.mend.io/api/mc/badges/confidence/pypi/numpy/1.24.4/1.26.4?slim=true)](https://docs.renovatebot.com/merge-confidence/) | --- ### Release Notes
GoogleCloudPlatform/cloud-sql-python-connector (cloud-sql-python-connector) ### [`v1.12.0`](https://redirect.github.com/GoogleCloudPlatform/cloud-sql-python-connector/blob/HEAD/CHANGELOG.md#1120-2024-08-13) [Compare Source](https://redirect.github.com/GoogleCloudPlatform/cloud-sql-python-connector/compare/v1.11.0...v1.12.0) ##### Features - use non-blocking disk read/writes ([#​1142](https://redirect.github.com/GoogleCloudPlatform/cloud-sql-python-connector/issues/1142)) ([d2fd465](https://redirect.github.com/GoogleCloudPlatform/cloud-sql-python-connector/commit/d2fd46523fae041ec8b4e907e4d0eeb204f0aa8f)) ##### Dependencies - update python non-major dependencies ([#​1149](https://redirect.github.com/GoogleCloudPlatform/cloud-sql-python-connector/issues/1149)) ([2e27c05](https://redirect.github.com/GoogleCloudPlatform/cloud-sql-python-connector/commit/2e27c05acb2d579127b892a7ed5860a9404c9f53)) ### [`v1.11.0`](https://redirect.github.com/GoogleCloudPlatform/cloud-sql-python-connector/blob/HEAD/CHANGELOG.md#1110-2024-07-10) [Compare Source](https://redirect.github.com/GoogleCloudPlatform/cloud-sql-python-connector/compare/v1.10.0...v1.11.0) ##### Features - invalidate cache on bad connection info and IP lookup ([#​1118](https://redirect.github.com/GoogleCloudPlatform/cloud-sql-python-connector/issues/1118)) ([672dc4e](https://redirect.github.com/GoogleCloudPlatform/cloud-sql-python-connector/commit/672dc4e8b853e5b2ac1b44da889466b6693381da)) ##### Bug Fixes - let DNS resolve outside connector ([#​1120](https://redirect.github.com/GoogleCloudPlatform/cloud-sql-python-connector/issues/1120)) ([d321f79](https://redirect.github.com/GoogleCloudPlatform/cloud-sql-python-connector/commit/d321f79dffebb7c630ba4f4c14dd36aa1a532798)) - retry 50x errors with exponential backoff ([#​1125](https://redirect.github.com/GoogleCloudPlatform/cloud-sql-python-connector/issues/1125)) ([2da9128](https://redirect.github.com/GoogleCloudPlatform/cloud-sql-python-connector/commit/2da9128cb1e46e713fe79094f1903f79e5a4cb64))
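
These connector releases land in the same `connect_async` / `async_creator` code path that the new `test_from_engine` tests above exercise. For reference, that pattern looks like this in isolation (a minimal sketch; the instance connection name, credentials, and table name are placeholders, not values from this repo):

```python
import asyncio

from google.cloud.sql.connector import Connector, IPTypes
from sqlalchemy.ext.asyncio import create_async_engine

from langchain_google_cloud_sql_pg import PostgresEngine


async def main() -> None:
    async with Connector() as connector:

        async def getconn():
            # The connector hands back an asyncpg connection to the instance.
            return await connector.connect_async(
                "my-project:us-central1:my-instance",  # placeholder
                "asyncpg",
                user="my-user",  # placeholder credentials
                password="my-password",
                db="my-database",
                ip_type=IPTypes.PUBLIC,
            )

        # SQLAlchemy owns the pool; the connector only creates connections.
        pool = create_async_engine("postgresql+asyncpg://", async_creator=getconn)

        # Wrap the pool so the LangChain integration can use it.
        engine = PostgresEngine.from_engine(pool)
        await engine.ainit_vectorstore_table("my_table", 768)
        await engine.close()


asyncio.run(main())
```
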
googleapis/python-aiplatform (google-cloud-aiplatform) ### [`v1.65.0`](https://redirect.github.com/googleapis/python-aiplatform/blob/HEAD/CHANGELOG.md#1650-2024-09-04) [Compare Source](https://redirect.github.com/googleapis/python-aiplatform/compare/v1.64.0...v1.65.0) ##### ⚠ BREAKING CHANGES - Tokenization - Deprecated `ComputeTokenResult.token_info_list` in favor of `ComputeTokenResult.tokens_info` ##### Features - Add support for system instruction and tools in tokenization. ([72fcc06](https://redirect.github.com/googleapis/python-aiplatform/commit/72fcc063ed4a086da0ad37ec2ac58860d4e79051)) - Add vector search alpha to rag retrieval for hybrid search ranking ([6624ebe](https://redirect.github.com/googleapis/python-aiplatform/commit/6624ebe22726942dd70781122352f47268ee2dee)) - Adding Weaviate Vector DB option for RAG corpuses to SDK ([9b28202](https://redirect.github.com/googleapis/python-aiplatform/commit/9b28202a1bb17f54e042301d6cdac3b6aa826797)) - GenAI - Added `system_instruction` and `tools` support to `GenerativeModel.count_tokens` ([50fca69](https://redirect.github.com/googleapis/python-aiplatform/commit/50fca693b2f3b1a0b61867dc136be5a468fb2b2f)) - GenAI - Added Llama3 support in GenAI batch prediction ([6166152](https://redirect.github.com/googleapis/python-aiplatform/commit/6166152844dc0078f7a5a02355ef3555cc428cfa)) - PrivateEndpoint.stream_raw_predict ([197f333](https://redirect.github.com/googleapis/python-aiplatform/commit/197f333be5a075d41f98b762cd933cd2e89cecae)) - Support reserved_ip_ranges for VPC network in Ray on Vertex cluster ([36a56b9](https://redirect.github.com/googleapis/python-aiplatform/commit/36a56b99f9e53d19d80c2bff3bf55c208988c518)) - Tokenization - Deprecated `ComputeTokenResult.token_info_list` in favor of `ComputeTokenResult.tokens_info` ([efbcb54](https://redirect.github.com/googleapis/python-aiplatform/commit/efbcb54e0d5df4d65a79e60afdbc5c328538aef6)) ##### Bug Fixes - Tensorboard - Fixed bug in tensorboard uploader where profile logs are not uploaded from nested run directories. ([37627de](https://redirect.github.com/googleapis/python-aiplatform/commit/37627de4ec12b8c51abf41524b9e7e3adf2dab54)) - Tokenizers - Fixed `Tokenizer.compute_tokens` ([c29fa5d](https://redirect.github.com/googleapis/python-aiplatform/commit/c29fa5d98fc1c1550c039e84ff2f5725818c2231)) ### [`v1.64.0`](https://redirect.github.com/googleapis/python-aiplatform/blob/HEAD/CHANGELOG.md#1640-2024-08-27) [Compare Source](https://redirect.github.com/googleapis/python-aiplatform/compare/v1.63.0...v1.64.0) ##### Features - Endpoint - Add support for Prediction dedicated endpoint. predict/rawPredict/streamRawPredict can use dedicated DNS to access the dedicated endpoint. ([3d68777](https://redirect.github.com/googleapis/python-aiplatform/commit/3d687777d39b00280c22d2a14ddde3ba644febf7)) - GenAI - Added the model Distillation feature (private preview) ([a0d4ff2](https://redirect.github.com/googleapis/python-aiplatform/commit/a0d4ff20ceb1c48806d1711fdb2691dc34f9f1db)) - Grounding - Allow initialization of `grounding.VertexAISearch` with full resource name or data store ID, project ID, and location. 
([f334321](https://redirect.github.com/googleapis/python-aiplatform/commit/f334321694bb3be1a421ee19a33fc973e5455da9)) - Evaluation - Make Rouge class available in base level init ([aed82a1](https://redirect.github.com/googleapis/python-aiplatform/commit/aed82a1bd5d8942ded4dd325a4eb1a5f73bc50c7)) - Feature Store - Read for online store w/private service connect ([7af80c6](https://redirect.github.com/googleapis/python-aiplatform/commit/7af80c624e026cfe8dda2d1644467a517f49b78f)) - Ray - Support autoscaling in Ray on Vertex ([961da42](https://redirect.github.com/googleapis/python-aiplatform/commit/961da429683db113a822300342484aaf1128cfc0)) ##### Bug Fixes - Fix error in tensorboard uploader thrown when time_series_id is None ([d59a052](https://redirect.github.com/googleapis/python-aiplatform/commit/d59a0522ddc2131ab39b052e742a6472f84e0a5a)) - Evaluation - Fix typo in prompt templates: ([c8fa7a8](https://redirect.github.com/googleapis/python-aiplatform/commit/c8fa7a8cf53165354fa89e38fffc0ef4a821e211)) ##### Documentation - **samples:** Adding code sample for vector search create streaming index ([71464e7](https://redirect.github.com/googleapis/python-aiplatform/commit/71464e7d2a57fa6770d2fcb7c5c0e669055c4cdb)) ### [`v1.63.0`](https://redirect.github.com/googleapis/python-aiplatform/blob/HEAD/CHANGELOG.md#1630-2024-08-20) [Compare Source](https://redirect.github.com/googleapis/python-aiplatform/compare/v1.62.0...v1.63.0) ##### Features - A new field `satisfies_pzs` is added to message `.google.cloud.aiplatform.v1.BatchPredictionJob` ([#​4192](https://redirect.github.com/googleapis/python-aiplatform/issues/4192)) ([6919037](https://redirect.github.com/googleapis/python-aiplatform/commit/6919037e9513e922e9ffe197e68a99fb343c4fff)) - Add advanced PDF parsing option for RAG file import ([6e1dc06](https://redirect.github.com/googleapis/python-aiplatform/commit/6e1dc0658ffd875f4a3bbcab62976e15e997102e)) - Add multithreading for custom metric computation. ([2c93fc1](https://redirect.github.com/googleapis/python-aiplatform/commit/2c93fc17b4a76623209b7699a73d4e6c9c27cc81)) - Add progress bar for generating inference. ([b78714f](https://redirect.github.com/googleapis/python-aiplatform/commit/b78714f3cf8fc22f7caa193a7398efe3626c2c5f)) - Add progress bar to custom metrics. ([3974aec](https://redirect.github.com/googleapis/python-aiplatform/commit/3974aec92595870b6f33ecd016763f59d6630898)) - Add Prompt class support for configs and Prompt.generate_content wrapper ([7f1e031](https://redirect.github.com/googleapis/python-aiplatform/commit/7f1e0313842546b7e911ee3ef06d7193deb64a91)) - GenAI - Added seed parameter to the GenerationConfig class ([9f1e073](https://redirect.github.com/googleapis/python-aiplatform/commit/9f1e0739d837b3110b40b8806514ca3e49e2b1da)) - GenAI - Added the `Candidate.avg_logprobs` property ([de80695](https://redirect.github.com/googleapis/python-aiplatform/commit/de80695ad2359361a698cffbde2336417297ef35)) - GenAI - Released the `Prompt` feature to Public Preview ([64eeab8](https://redirect.github.com/googleapis/python-aiplatform/commit/64eeab8b3404e87c0cc19fb6862ff51ec8b95954)) - GenAI Evaluation: Add generic model-based `PointwiseMetric` and `PairwiseMetric` classes that allow customizing metric prompt templates. Add `PointwiseMetricPromptTemplate`, `PairwiseMetricPromptTemplate` classes to help formulate and customize metric prompt templates. Add `metric_column_mapping` parameter to `EvalTask` for metric prompt template input variable name mapping. 
([fd38b49](https://redirect.github.com/googleapis/python-aiplatform/commit/fd38b49231bd1b35af57056b9e69a5427ddb114c)) - GenAI Evaluation: Open source model-based metric prompt template examples for Gemini 1.5 Pro autorater. Add `MetricPromptTemplateExamples` class to help retrieve model-based metric prompt templates. ([fd38b49](https://redirect.github.com/googleapis/python-aiplatform/commit/fd38b49231bd1b35af57056b9e69a5427ddb114c)) - GenAI Evaluation: Release GenAI Evaluation SDK GA features to `vertexai.preview` module. ([fd38b49](https://redirect.github.com/googleapis/python-aiplatform/commit/fd38b49231bd1b35af57056b9e69a5427ddb114c)) - Publish GenAI Evaluation SDK GA features to `vertexai.evaluation` module. Switch GenAI Evaluation Service client to v1 version. ([45e4251](https://redirect.github.com/googleapis/python-aiplatform/commit/45e42516fbc47db1c44a7669f2730a1590a992c2)) ##### Bug Fixes - Add support of display_name to create_cached_content in python SDK ([ecc2d54](https://redirect.github.com/googleapis/python-aiplatform/commit/ecc2d54a84c03f7d06e987fba5f5c67fba109ce0)) - Always upload the pickled object and dependencies tarball when creating ReasoningEngine ([34ef5a3](https://redirect.github.com/googleapis/python-aiplatform/commit/34ef5a35bd91aea53c89650c20962dae29b3a535)) - Remove grouding attribution ([f6ece65](https://redirect.github.com/googleapis/python-aiplatform/commit/f6ece65d8f2933ddcb4ec1a08784f8d2c365f2de)) ##### Documentation - Update Prompt docstring for batch prompting ([e96b6e6](https://redirect.github.com/googleapis/python-aiplatform/commit/e96b6e6f0574b96fc4b61f99c671ef6646fc4956)) ### [`v1.62.0`](https://redirect.github.com/googleapis/python-aiplatform/blob/HEAD/CHANGELOG.md#1620-2024-08-13) [Compare Source](https://redirect.github.com/googleapis/python-aiplatform/compare/v1.61.0...v1.62.0) ##### Features - Add metadata to evaluation result. ([375095e](https://redirect.github.com/googleapis/python-aiplatform/commit/375095e72cc4f43611710372a1e36753a891a710)) - Add Prompt class for multimodal prompt templating ([1bdc235](https://redirect.github.com/googleapis/python-aiplatform/commit/1bdc235ea64f8d63ce9d60d88cb873ee341d3ff9)) - Add support for query method in Vertex AI Extension SDK ([0008735](https://redirect.github.com/googleapis/python-aiplatform/commit/0008735968606a716add88072cff76f2fc552d7b)) - Add support for reservation affinity in custom training jobs. ([802609b](https://redirect.github.com/googleapis/python-aiplatform/commit/802609b1f5e5d8d41a77dafb5b1a2dbf01f2bd30)) - Add support for strategy in custom training jobs. 
([a076191](https://redirect.github.com/googleapis/python-aiplatform/commit/a076191b8726363e1f7c47ef8343eb86cebf9918)) - Adding spot, reservation_affinity to Vertex SDK ([3e785bd](https://redirect.github.com/googleapis/python-aiplatform/commit/3e785bd9c9d3d11197ef930f563ee96231a67d84)) - Support api keys in initializer and create_client ([7404f67](https://redirect.github.com/googleapis/python-aiplatform/commit/7404f679246e41e0009ec2d49f05d669eb357f71)) - Support creating optimized online store with private service connect ([659ba3f](https://redirect.github.com/googleapis/python-aiplatform/commit/659ba3f287f9aa78840d4b9b9ca216002d5f1e6a)) - Support disable Cloud logging in Ray on Vertex ([accaa97](https://redirect.github.com/googleapis/python-aiplatform/commit/accaa9750d98b7a37b08da3bd2058d9cdd03bd5c)) - Support PSC-Interface in Ray on Vertex ([accaa97](https://redirect.github.com/googleapis/python-aiplatform/commit/accaa9750d98b7a37b08da3bd2058d9cdd03bd5c)) ##### Bug Fixes - Added credentials, project, and location on PipelineJobSchedule init ([281c171](https://redirect.github.com/googleapis/python-aiplatform/commit/281c1710afc6cac49c02d926bee7a6c43b6ef851)) - Avoid breakage of langchain from orjson 3.10.7 ([c990f73](https://redirect.github.com/googleapis/python-aiplatform/commit/c990f73845f38e58ba2dddb372ad2f84d4a05479)) - Deprecate disable_attribution in GoogleSearchRetrieval. ([c68d559](https://redirect.github.com/googleapis/python-aiplatform/commit/c68d559b9d0fd7288b6775f57d05f474f5f7920a)) ##### Documentation - Update the docstring for compute_tokens method. ([849e8d4](https://redirect.github.com/googleapis/python-aiplatform/commit/849e8d409e4838cad0a020231b806b0c9ef587ce)) ### [`v1.61.0`](https://redirect.github.com/googleapis/python-aiplatform/blob/HEAD/CHANGELOG.md#1610-2024-08-05) [Compare Source](https://redirect.github.com/googleapis/python-aiplatform/compare/v1.60.0...v1.61.0) ##### Features - Add a warning message for scheduled deprecation of Coherence metric class ([7f238fb](https://redirect.github.com/googleapis/python-aiplatform/commit/7f238fb3cebc44893b4e6959a77743cc4d96138e)) - Add deprecation messages for all model-based metric classes ([71c0fd3](https://redirect.github.com/googleapis/python-aiplatform/commit/71c0fd397139a95b6045f898e906ce11b2e7e8ce)) - Add support for task type (CODE_RETRIEVAL_QUERY) through get_embeddings. ([f2ce1e4](https://redirect.github.com/googleapis/python-aiplatform/commit/f2ce1e4caea9f344e39fc3232f697b1a6ea4f99a)) - Add system_instruction to LangchainAgent template. ([c71c3dd](https://redirect.github.com/googleapis/python-aiplatform/commit/c71c3ddbfeaa577dfce683b3299d94e77d1c4895)) - Adding Slack and Jira data connector for RAG to SDK ([d92e7c9](https://redirect.github.com/googleapis/python-aiplatform/commit/d92e7c91d280dd417d2c2a2cf5abc36592888593)) - Allow protobuf 5.x ([ce9cd5d](https://redirect.github.com/googleapis/python-aiplatform/commit/ce9cd5def14597822c1d071e438cf63b6d4ba3ca)) - LVM - Release `ImageGenerationModel` to GA ([718c199](https://redirect.github.com/googleapis/python-aiplatform/commit/718c1997778310b6898344b2e5a34513e7a82e5f)) - Support "update" for reasoning engine. ([b73ef3e](https://redirect.github.com/googleapis/python-aiplatform/commit/b73ef3eaa2d88dbc8071e3a4f0c7da934683fc2a)) - Update Rapid Evaluation Service QPS. Add a customizable evaluation service QPS parameter. 
([9ee9289](https://redirect.github.com/googleapis/python-aiplatform/commit/9ee9289fbe5face719515e453d4f81648b44e7b1)) ##### Documentation - Change init sample to use vertexai ([829e0f6](https://redirect.github.com/googleapis/python-aiplatform/commit/829e0f6fd286cf2de2ac307a836305766473faef)) - Make small fixes to file import documentation ([f7d65c3](https://redirect.github.com/googleapis/python-aiplatform/commit/f7d65c32948c54bcf3a6927639f2173b556bb310)) ### [`v1.60.0`](https://redirect.github.com/googleapis/python-aiplatform/blob/HEAD/CHANGELOG.md#1600-2024-07-24) [Compare Source](https://redirect.github.com/googleapis/python-aiplatform/compare/v1.59.0...v1.60.0) ##### Features - Add preflight validations to PipelineJob submit and run methods. ([c5a3535](https://redirect.github.com/googleapis/python-aiplatform/commit/c5a35354485a577dd5477449bc4bdcd7866a8df4)) - Add support for langchain v0.2+ package versions in default installation ([259b638](https://redirect.github.com/googleapis/python-aiplatform/commit/259b638300054e13b2dfe1d3f32d7126bbb18b15)) - GenAI - Added tokenization support via `GenerativeModel.compute_tokens` ([cfe0cc6](https://redirect.github.com/googleapis/python-aiplatform/commit/cfe0cc62cbf8dc12b4b021377ddd613d1072fe95)) - GenAI - ContextCaching - allow from_cached_content to take the cached_content resource name ([8f53902](https://redirect.github.com/googleapis/python-aiplatform/commit/8f53902b933f3abf0c9a222f45c2fa7ada727505)) - Make count_tokens generally-available at TextEmbeddingModel. ([efb8413](https://redirect.github.com/googleapis/python-aiplatform/commit/efb84134094ab87d6a2fac48a1f2f1b2199c1818)) ##### Bug Fixes - Avoid throw error when Part.text is empty in modality content checks ([bbd4a49](https://redirect.github.com/googleapis/python-aiplatform/commit/bbd4a49d398052ba2c20e09f8f052e6f766d8fca)) - Correct logit_bias type annotation to accept keys as strings ([2676d25](https://redirect.github.com/googleapis/python-aiplatform/commit/2676d25e62d1bda68b0fbef274d0e669a6670415)) - Create FV embedding dimensions sample - dimensions should be an int ([2aa221e](https://redirect.github.com/googleapis/python-aiplatform/commit/2aa221ec994fea63bd73e2cfe760a314b990e4b0)) - Fix the sync option for Model Monitor job creation ([22151e2](https://redirect.github.com/googleapis/python-aiplatform/commit/22151e29e752dd8f27188046f8c8866c004ca196)) - Include DeploymentResourcePool class in aiplatform top-level sdk module ([ecc4f09](https://redirect.github.com/googleapis/python-aiplatform/commit/ecc4f09054b3e314b51ebf622b7241a5ab4ff072)) - Overriding the current TracerProvider when enabling tracing ([1476c10](https://redirect.github.com/googleapis/python-aiplatform/commit/1476c10f2fd91c02cd98799564a33ede742bc6e0)) - Pass the project ID from vertexai.init to CloudTraceSpanExporter when enable_tracing=True for LangchainAgent ([3ec043e](https://redirect.github.com/googleapis/python-aiplatform/commit/3ec043eefb053739f767d5199b1941bbc3c49120)) ##### Documentation - GenAI - Update README.md for Vertex Generative AI SDK for Python to add subsections to the right nav. 
([42af742](https://redirect.github.com/googleapis/python-aiplatform/commit/42af742d808abdca56b84b3381388a36c8454f1b)) ### [`v1.59.0`](https://redirect.github.com/googleapis/python-aiplatform/blob/HEAD/CHANGELOG.md#1590-2024-07-09) [Compare Source](https://redirect.github.com/googleapis/python-aiplatform/compare/v1.58.0...v1.59.0) ##### Features - Add model and contents fields to ComputeTokensRequest v1 ([f6e7b9c](https://redirect.github.com/googleapis/python-aiplatform/commit/f6e7b9c0f1656edba0c69d02475c2a7337fefb99)) - Add model and contents fields to ComputeTokensRequest v1beta1 ([f6e7b9c](https://redirect.github.com/googleapis/python-aiplatform/commit/f6e7b9c0f1656edba0c69d02475c2a7337fefb99)) - Add role field to TokensInfo v1 ([f6e7b9c](https://redirect.github.com/googleapis/python-aiplatform/commit/f6e7b9c0f1656edba0c69d02475c2a7337fefb99)) - Add role field to TokensInfo v1beta1 ([f6e7b9c](https://redirect.github.com/googleapis/python-aiplatform/commit/f6e7b9c0f1656edba0c69d02475c2a7337fefb99)) - GenAI - Tuning - Released the Supervised Fine Tuning feature o GA ([ae47639](https://redirect.github.com/googleapis/python-aiplatform/commit/ae47639c1dc03a89d83c8de1609aaa25af9a1368)) ##### Bug Fixes - **deps:** Require proto-plus 1.22.3 ([4131e65](https://redirect.github.com/googleapis/python-aiplatform/commit/4131e6583799d16b5032fecd73d4539fb05f0cd1)) - Offline store - set application name + remove session param ([7395665](https://redirect.github.com/googleapis/python-aiplatform/commit/7395665bcd847a62b25392d98848a6fb130f5286)) ### [`v1.58.0`](https://redirect.github.com/googleapis/python-aiplatform/blob/HEAD/CHANGELOG.md#1580-2024-07-03) [Compare Source](https://redirect.github.com/googleapis/python-aiplatform/compare/v1.57.0...v1.58.0) ##### Features - Add deploy_metadata to PublisherModel.Deploy v1 ([71e41c8](https://redirect.github.com/googleapis/python-aiplatform/commit/71e41c8eeb0e081d67660161a31f6a228d7b0502)) - Add deploy_metadata to PublisherModel.Deploy v1beta1 ([b5c3cdd](https://redirect.github.com/googleapis/python-aiplatform/commit/b5c3cdd737acd695301c9a564d8f91371288f9f1)) - Add display tuning job button for Ipython environments when getting an existing job ([872b455](https://redirect.github.com/googleapis/python-aiplatform/commit/872b455bcdda59d73d7060aaaa20a0b0e86e8cbb)) - Add private_service_connect_config and service_attachment fields to DedicatedServingEndpoint v1 ([71e41c8](https://redirect.github.com/googleapis/python-aiplatform/commit/71e41c8eeb0e081d67660161a31f6a228d7b0502)) - Add satisfies_pzs and satisfies_pzi fields to Model v1 ([71e41c8](https://redirect.github.com/googleapis/python-aiplatform/commit/71e41c8eeb0e081d67660161a31f6a228d7b0502)) - Add satisfies_pzs and satisfies_pzi fields to Model v1beta1 ([b5c3cdd](https://redirect.github.com/googleapis/python-aiplatform/commit/b5c3cdd737acd695301c9a564d8f91371288f9f1)) - Add satisfies_pzs and satisfies_pzi fields to Tensorboard v1 ([71e41c8](https://redirect.github.com/googleapis/python-aiplatform/commit/71e41c8eeb0e081d67660161a31f6a228d7b0502)) - Add satisfies_pzs and satisfies_pzi fields to Tensorboard v1beta1 ([b5c3cdd](https://redirect.github.com/googleapis/python-aiplatform/commit/b5c3cdd737acd695301c9a564d8f91371288f9f1)) - Add UpdateDeploymentResourcePool method to DeploymentResourcePoolService v1 ([71e41c8](https://redirect.github.com/googleapis/python-aiplatform/commit/71e41c8eeb0e081d67660161a31f6a228d7b0502)) - Add UpdateDeploymentResourcePool method to DeploymentResourcePoolService v1beta1 
([b5c3cdd](https://redirect.github.com/googleapis/python-aiplatform/commit/b5c3cdd737acd695301c9a564d8f91371288f9f1)) - Add use_effective_order field to BleuSpec v1beta1 ([b5c3cdd](https://redirect.github.com/googleapis/python-aiplatform/commit/b5c3cdd737acd695301c9a564d8f91371288f9f1)) - GenAI - Evaluation - Add a progress bar for evaluation service requests ([bbffb0d](https://redirect.github.com/googleapis/python-aiplatform/commit/bbffb0d5bfe0509399c801d849311a6201caa633)) - GenAI - Evaluation - Implement rate limiter and refactor parallelization for online evaluation service requests ([bbffb0d](https://redirect.github.com/googleapis/python-aiplatform/commit/bbffb0d5bfe0509399c801d849311a6201caa633)) - GenAI - Evaluation - Return partial evaluation results with error logging ([bbffb0d](https://redirect.github.com/googleapis/python-aiplatform/commit/bbffb0d5bfe0509399c801d849311a6201caa633)) - Migrate DeploymentResourcePool and associated functionality to V1 namespace for GA launch of model co-hosting. ([1474d98](https://redirect.github.com/googleapis/python-aiplatform/commit/1474d988fa63cbbb3b200634719bc245cab6a448)) ##### Bug Fixes - `IndexConfig` - use TreeAhConfig as default `algorithm_config`. ([341d287](https://redirect.github.com/googleapis/python-aiplatform/commit/341d287719cabdaa1041cdefe9b65b77f1e4bc3b)) - LVM - Update `Video.load_from_file()` to support storage.googleapis.com links ([b63f960](https://redirect.github.com/googleapis/python-aiplatform/commit/b63f9600f743067ae97103dfd43f4392b9f6de56)) ##### Documentation - Update comments of AutoscalingSpec v1 ([71e41c8](https://redirect.github.com/googleapis/python-aiplatform/commit/71e41c8eeb0e081d67660161a31f6a228d7b0502)) - Update comments of AutoscalingSpec v1beta1 ([b5c3cdd](https://redirect.github.com/googleapis/python-aiplatform/commit/b5c3cdd737acd695301c9a564d8f91371288f9f1)) - Update import paths for Gemini README ([46b3042](https://redirect.github.com/googleapis/python-aiplatform/commit/46b30425e8c86588256bf75f857078caeb9d7dee))
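
Of the SDK changes above, the `count_tokens` update is the easiest to try in isolation. A hedged sketch (model name, project, and location are placeholders; it assumes an authenticated `vertexai` environment):

```python
import vertexai
from vertexai.generative_models import GenerativeModel

vertexai.init(project="my-project", location="us-central1")  # placeholders

# Per the 1.65.0 notes, the system instruction now factors into token counts.
model = GenerativeModel(
    "gemini-1.5-pro",
    system_instruction="Answer in one short sentence.",
)
print(model.count_tokens("How do HNSW indexes work?").total_tokens)
```
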
googleapis/google-cloud-python (google-cloud-resource-manager) ### [`v1.12.5`](https://redirect.github.com/googleapis/google-cloud-python/releases/tag/google-cloud-resource-manager-v1.12.5): google-cloud-resource-manager: v1.12.5 [Compare Source](https://redirect.github.com/googleapis/google-cloud-python/compare/google-cloud-resource-manager-v1.12.4...google-cloud-resource-manager-v1.12.5) ##### Bug Fixes - Retry and timeout values do not propagate in requests during pagination ([52db52e](https://redirect.github.com/googleapis/google-cloud-python/commit/52db52ea05c6883b07956d323fdd1d3029806374)) ### [`v1.12.4`](https://redirect.github.com/googleapis/google-cloud-python/releases/tag/google-cloud-resource-manager-v1.12.4): google-cloud-resource-manager: v1.12.4 [Compare Source](https://redirect.github.com/googleapis/google-cloud-python/compare/google-cloud-resource-manager-v1.12.3...google-cloud-resource-manager-v1.12.4) ##### Bug Fixes - Allow Protobuf 5.x ([#​12869](https://redirect.github.com/googleapis/google-cloud-python/issues/12869)) ([e42edbc](https://redirect.github.com/googleapis/google-cloud-python/commit/e42edbcf7f4d8ed66b6645c96a01c55fb8cd7666))
googleapis/langchain-google-cloud-sql-pg-python (langchain-google-cloud-sql-pg)

### [`v0.9.0`](https://redirect.github.com/googleapis/langchain-google-cloud-sql-pg-python/blob/HEAD/CHANGELOG.md#090-2024-09-05)

[Compare Source](https://redirect.github.com/googleapis/langchain-google-cloud-sql-pg-python/compare/v0.8.0...v0.9.0)

##### Features

- Add support for custom schema names ([#191](https://redirect.github.com/googleapis/langchain-google-cloud-sql-pg-python/issues/191)) ([1e0566a](https://redirect.github.com/googleapis/langchain-google-cloud-sql-pg-python/commit/1e0566af98bf24c711315a791336ba212d240acd))

### [`v0.8.0`](https://redirect.github.com/googleapis/langchain-google-cloud-sql-pg-python/blob/HEAD/CHANGELOG.md#080-2024-09-04)

[Compare Source](https://redirect.github.com/googleapis/langchain-google-cloud-sql-pg-python/compare/v0.7.0...v0.8.0)

##### Features

- Add table name to default index name ([#171](https://redirect.github.com/googleapis/langchain-google-cloud-sql-pg-python/issues/171)) ([8e61bc7](https://redirect.github.com/googleapis/langchain-google-cloud-sql-pg-python/commit/8e61bc779bc8f803e40e76aaeffdb93c35a5c90f))
- Remove langchain-community dependency ([#172](https://redirect.github.com/googleapis/langchain-google-cloud-sql-pg-python/issues/172)) ([b4f40bb](https://redirect.github.com/googleapis/langchain-google-cloud-sql-pg-python/commit/b4f40bb389b40853e3deed37e1385a7866741231))

##### Bug Fixes

- Add caching for background loop/thread ([#184](https://redirect.github.com/googleapis/langchain-google-cloud-sql-pg-python/issues/184)) ([1489f81](https://redirect.github.com/googleapis/langchain-google-cloud-sql-pg-python/commit/1489f818c1d62bfee5c5a3bab42d380556662e82))
- Fix QueryOptions not applied to similarity search bug ([#185](https://redirect.github.com/googleapis/langchain-google-cloud-sql-pg-python/issues/185)) ([e5dca97](https://redirect.github.com/googleapis/langchain-google-cloud-sql-pg-python/commit/e5dca973d625c4df4c3e741a3ad8e95be0cd1472))
- Fixed extra char in requirements.txt ([#196](https://redirect.github.com/googleapis/langchain-google-cloud-sql-pg-python/issues/196)) ([50dc32f](https://redirect.github.com/googleapis/langchain-google-cloud-sql-pg-python/commit/50dc32f8ae476c98e3ed38a153096551ce02d340))

##### Documentation

- Add index choosing guide ([#178](https://redirect.github.com/googleapis/langchain-google-cloud-sql-pg-python/issues/178)) ([e96ffb6](https://redirect.github.com/googleapis/langchain-google-cloud-sql-pg-python/commit/e96ffb6dc99425e4dafb8ac13730eed253e74c4e))
- Added vector store initialization from documents ([#174](https://redirect.github.com/googleapis/langchain-google-cloud-sql-pg-python/issues/174)) ([eb2eac3](https://redirect.github.com/googleapis/langchain-google-cloud-sql-pg-python/commit/eb2eac303f64e809e6f3fc9bc3307be163602a4e))
- Update README.md to fix 404 links to templates ([#182](https://redirect.github.com/googleapis/langchain-google-cloud-sql-pg-python/issues/182)) ([f10ae6c](https://redirect.github.com/googleapis/langchain-google-cloud-sql-pg-python/commit/f10ae6c9a8645874a5ab64e846ec540aeddf977a))

### [`v0.7.0`](https://redirect.github.com/googleapis/langchain-google-cloud-sql-pg-python/blob/HEAD/CHANGELOG.md#070-2024-07-23)

[Compare Source](https://redirect.github.com/googleapis/langchain-google-cloud-sql-pg-python/compare/v0.6.1...v0.7.0)

##### Features

- Add similarity search score threshold select function ([#157](https://redirect.github.com/googleapis/langchain-google-cloud-sql-pg-python/issues/157)) ([71789f0](https://redirect.github.com/googleapis/langchain-google-cloud-sql-pg-python/commit/71789f06a9702ee2e037b084a88c1258b7232a4b))
- Added example for document saver ([#164](https://redirect.github.com/googleapis/langchain-google-cloud-sql-pg-python/issues/164)) ([13b909e](https://redirect.github.com/googleapis/langchain-google-cloud-sql-pg-python/commit/13b909e1fbc518728103ae6de0a1d8c462df8144))
- Auto-generate IDs upon adding embeddings ([#158](https://redirect.github.com/googleapis/langchain-google-cloud-sql-pg-python/issues/158)) ([a364514](https://redirect.github.com/googleapis/langchain-google-cloud-sql-pg-python/commit/a3645147f3d7fe0958d0420f948cf6afb8eb215b))
- Support IAM account override ([#160](https://redirect.github.com/googleapis/langchain-google-cloud-sql-pg-python/issues/160)) ([2de3cba](https://redirect.github.com/googleapis/langchain-google-cloud-sql-pg-python/commit/2de3cbae40d267a7b038a7b421999b5bb60c03d8))

##### Bug Fixes

- Add key to engine constructor ([c12ded9](https://redirect.github.com/googleapis/langchain-google-cloud-sql-pg-python/commit/c12ded92abcb6a44e374f7b00afc1e17588e0688))
- Rename inner product distance search function to inner_product ([#168](https://redirect.github.com/googleapis/langchain-google-cloud-sql-pg-python/issues/168)) ([c5641c3](https://redirect.github.com/googleapis/langchain-google-cloud-sql-pg-python/commit/c5641c305e4c63d09f24c88dba679bcf1a4040b2))

##### Documentation

- Add docstring to all methods ([#163](https://redirect.github.com/googleapis/langchain-google-cloud-sql-pg-python/issues/163)) ([61413f1](https://redirect.github.com/googleapis/langchain-google-cloud-sql-pg-python/commit/61413f10d9cb074a1fc82a742000827285208750))
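The documentation entry above, "Added vector store initialization from documents", maps onto the `afrom_documents` constructor exercised by the patches later in this series. As a rough, illustrative sketch only — the connection values are placeholders, and the fake embedder simply keeps the example self-contained — building a store straight from documents looks something like this:

```python
from langchain_core.documents import Document
from langchain_core.embeddings import DeterministicFakeEmbedding
from langchain_google_cloud_sql_pg import PostgresEngine, PostgresVectorStore


async def demo() -> None:
    # Placeholder connection details -- substitute real values.
    engine = await PostgresEngine.afrom_instance(
        project_id="my-project",
        region="us-central1",
        instance="my-instance",
        database="my-database",
    )
    # Deterministic 768-dim embedder, as used by the library's own tests.
    embedding = DeterministicFakeEmbedding(size=768)

    # Create the backing table, then initialize the store from documents.
    await engine.ainit_vectorstore_table(table_name="my_docs", vector_size=768)
    docs = [Document(page_content="hello world", metadata={"source": "example"})]
    store = await PostgresVectorStore.afrom_documents(
        docs,
        embedding,
        engine,
        table_name="my_docs",
    )
```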
numpy/numpy (numpy)

### [`v1.26.4`](https://redirect.github.com/numpy/numpy/releases/tag/v1.26.4)

[Compare Source](https://redirect.github.com/numpy/numpy/compare/v1.26.3...v1.26.4)

### NumPy 1.26.4 Release Notes

NumPy 1.26.4 is a maintenance release that fixes bugs and regressions discovered after the 1.26.3 release. The Python versions supported by this release are 3.9-3.12. This is the last planned release in the 1.26.x series.

#### Contributors

A total of 13 people contributed to this release. People with a "+" by their names contributed a patch for the first time.

- Charles Harris
- Elliott Sales de Andrade
- Lucas Colley +
- Mark Ryan +
- Matti Picus
- Nathan Goldbaum
- Ola x Nilsson +
- Pieter Eendebak
- Ralf Gommers
- Sayed Adel
- Sebastian Berg
- Stefan van der Walt
- Stefano Rivera

#### Pull requests merged

A total of 19 pull requests were merged for this release.

- [#25323](https://redirect.github.com/numpy/numpy/pull/25323): BUG: Restore missing asstr import
- [#25523](https://redirect.github.com/numpy/numpy/pull/25523): MAINT: prepare 1.26.x for further development
- [#25539](https://redirect.github.com/numpy/numpy/pull/25539): BUG: `numpy.array_api`: fix `linalg.cholesky` upper decomp...
- [#25584](https://redirect.github.com/numpy/numpy/pull/25584): CI: Bump azure pipeline timeout to 120 minutes
- [#25585](https://redirect.github.com/numpy/numpy/pull/25585): MAINT, BLD: Fix unused inline functions warnings on clang
- [#25599](https://redirect.github.com/numpy/numpy/pull/25599): BLD: include fix for MinGW platform detection
- [#25618](https://redirect.github.com/numpy/numpy/pull/25618): TST: Fix test_numeric on riscv64
- [#25619](https://redirect.github.com/numpy/numpy/pull/25619): BLD: fix building for windows ARM64
- [#25620](https://redirect.github.com/numpy/numpy/pull/25620): MAINT: add `newaxis` to `__all__` in `numpy.array_api`
- [#25630](https://redirect.github.com/numpy/numpy/pull/25630): BUG: Use large file fallocate on 32 bit linux platforms
- [#25643](https://redirect.github.com/numpy/numpy/pull/25643): TST: Fix test_warning_calls on Python 3.12
- [#25645](https://redirect.github.com/numpy/numpy/pull/25645): TST: Bump pytz to 2023.3.post1
- [#25658](https://redirect.github.com/numpy/numpy/pull/25658): BUG: Fix AVX512 build flags on Intel Classic Compiler
- [#25670](https://redirect.github.com/numpy/numpy/pull/25670): BLD: fix potential issue with escape sequences in `__config__.py`
- [#25718](https://redirect.github.com/numpy/numpy/pull/25718): CI: pin cygwin python to 3.9.16-1 and fix typing tests \[skip...
- [#25720](https://redirect.github.com/numpy/numpy/pull/25720): MAINT: Bump cibuildwheel to v2.16.4
- [#25748](https://redirect.github.com/numpy/numpy/pull/25748): BLD: unvendor meson-python on 1.26.x and upgrade to meson-python...
- [#25755](https://redirect.github.com/numpy/numpy/pull/25755): MAINT: Include header defining backtrace
- [#25756](https://redirect.github.com/numpy/numpy/pull/25756): BUG: Fix np.quantile(\[Fraction(2,1)], 0.5) ([#24711](https://redirect.github.com/numpy/numpy/issues/24711))
### [`v1.26.3`](https://redirect.github.com/numpy/numpy/compare/v1.26.2...v1.26.3)

[Compare Source](https://redirect.github.com/numpy/numpy/compare/v1.26.2...v1.26.3)

### [`v1.26.2`](https://redirect.github.com/numpy/numpy/releases/tag/v1.26.2): 1.26.2 release

[Compare Source](https://redirect.github.com/numpy/numpy/compare/v1.26.1...v1.26.2)

### NumPy 1.26.2 Release Notes

NumPy 1.26.2 is a maintenance release that fixes bugs and regressions discovered after the 1.26.1 release. The 1.26.x release series is the last planned minor release series before NumPy 2.0. The Python versions supported by this release are 3.9-3.12.

#### Contributors

A total of 13 people contributed to this release. People with a "+" by their names contributed a patch for the first time.

- [@stefan6419846](https://redirect.github.com/stefan6419846)
- [@thalassemia](https://redirect.github.com/thalassemia) +
- Andrew Nelson
- Charles Bousseau +
- Charles Harris
- Marcel Bargull +
- Mark Mentovai +
- Matti Picus
- Nathan Goldbaum
- Ralf Gommers
- Sayed Adel
- Sebastian Berg
- William Ayd +

#### Pull requests merged

A total of 25 pull requests were merged for this release.

- [#24814](https://redirect.github.com/numpy/numpy/pull/24814): MAINT: align test_dispatcher s390x targets with \_umath_tests_mtargets
- [#24929](https://redirect.github.com/numpy/numpy/pull/24929): MAINT: prepare 1.26.x for further development
- [#24955](https://redirect.github.com/numpy/numpy/pull/24955): ENH: Add Cython enumeration for NPY_FR_GENERIC
- [#24962](https://redirect.github.com/numpy/numpy/pull/24962): REL: Remove Python upper version from the release branch
- [#24971](https://redirect.github.com/numpy/numpy/pull/24971): BLD: Use the correct Python interpreter when running tempita.py
- [#24972](https://redirect.github.com/numpy/numpy/pull/24972): MAINT: Remove unhelpful error replacements from `import_array()`
- [#24977](https://redirect.github.com/numpy/numpy/pull/24977): BLD: use classic linker on macOS, the new one in XCode 15 has...
- [#25003](https://redirect.github.com/numpy/numpy/pull/25003): BLD: musllinux_aarch64 \[wheel build]
- [#25043](https://redirect.github.com/numpy/numpy/pull/25043): MAINT: Update mailmap
- [#25049](https://redirect.github.com/numpy/numpy/pull/25049): MAINT: Update meson build infrastructure.
- [#25071](https://redirect.github.com/numpy/numpy/pull/25071): MAINT: Split up .github/workflows to match main
- [#25083](https://redirect.github.com/numpy/numpy/pull/25083): BUG: Backport fix build on ppc64 when the baseline set to Power9...
- [#25093](https://redirect.github.com/numpy/numpy/pull/25093): BLD: Fix features.h detection for Meson builds \[1.26.x Backport]
- [#25095](https://redirect.github.com/numpy/numpy/pull/25095): BUG: Avoid intp conversion regression in Cython 3 (backport)
- [#25107](https://redirect.github.com/numpy/numpy/pull/25107): CI: remove obsolete jobs, and move macOS and conda Azure jobs...
- [#25108](https://redirect.github.com/numpy/numpy/pull/25108): CI: Add linux_qemu action and remove travis testing.
- [#25112](https://redirect.github.com/numpy/numpy/pull/25112): MAINT: Update .spin/cmds.py from main.
- [#25113](https://redirect.github.com/numpy/numpy/pull/25113): DOC: Visually divide main license and bundled licenses in wheels
- [#25115](https://redirect.github.com/numpy/numpy/pull/25115): MAINT: Add missing `noexcept` to shuffle helpers
- [#25116](https://redirect.github.com/numpy/numpy/pull/25116): DOC: Fix license identifier for OpenBLAS
- [#25117](https://redirect.github.com/numpy/numpy/pull/25117): BLD: improve detection of Netlib libblas/libcblas/liblapack
- [#25118](https://redirect.github.com/numpy/numpy/pull/25118): MAINT: Make bitfield integers unsigned
- [#25119](https://redirect.github.com/numpy/numpy/pull/25119): BUG: Make n a long int for np.random.multinomial
- [#25120](https://redirect.github.com/numpy/numpy/pull/25120): BLD: change default of the `allow-noblas` option to true.
- [#25121](https://redirect.github.com/numpy/numpy/pull/25121): BUG: ensure passing `np.dtype` to itself doesn't crash
---

### Configuration

📅 **Schedule**: Branch creation - At any time (no schedule defined), Automerge - At any time (no schedule defined).

🚦 **Automerge**: Disabled by config. Please merge this manually once you are satisfied.

♻ **Rebasing**: Never, or you tick the rebase/retry checkbox.

👻 **Immortal**: This PR will be recreated if closed unmerged. Get [config help](https://redirect.github.com/renovatebot/renovate/discussions) if that's undesired.

---

- [ ] If you want to rebase/retry this PR, check this box

---

This PR was generated by [Mend Renovate](https://mend.io/renovate/). View the [repository job log](https://developer.mend.io/github/googleapis/langchain-google-cloud-sql-pg-python).
---
 requirements.txt                               | 6 +++---
 samples/langchain_on_vertexai/requirements.txt | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index c6a36c54..ec4b6dd2 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,6 +1,6 @@
-cloud-sql-python-connector[asyncpg]==1.10.0
-langchain-core==0.2.35
+cloud-sql-python-connector[asyncpg]==1.12.0
+langchain-core==0.2.38
 numpy==1.24.4; python_version<='3.8'
 numpy==1.26.4; python_version>'3.8'
 pgvector==0.3.2
-SQLAlchemy[asyncio]==2.0.32
+SQLAlchemy[asyncio]==2.0.34
diff --git a/samples/langchain_on_vertexai/requirements.txt b/samples/langchain_on_vertexai/requirements.txt
index 13db4cc8..5996cbf4 100644
--- a/samples/langchain_on_vertexai/requirements.txt
+++ b/samples/langchain_on_vertexai/requirements.txt
@@ -2,4 +2,4 @@ google-cloud-aiplatform[reasoningengine,langchain]==1.65.0
 google-cloud-resource-manager==1.12.5
 langchain-community==0.2.16
 langchain-google-cloud-sql-pg==0.9.0
-langchain-google-vertexai==1.0.10
\ No newline at end of file
+langchain-google-vertexai==1.0.10

From ce7d0cc55d78d12f7f1dc60e333a1f9b1127d8d0 Mon Sep 17 00:00:00 2001
From: Jack Wotherspoon
Date: Thu, 12 Sep 2024 15:45:33 -0400
Subject: [PATCH 08/12] ci: use v2 Cloud SQL Proxy (#211)

---
 integration.cloudbuild.yaml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/integration.cloudbuild.yaml b/integration.cloudbuild.yaml
index 957248be..e685c3bb 100644
--- a/integration.cloudbuild.yaml
+++ b/integration.cloudbuild.yaml
@@ -29,8 +29,8 @@ steps:
     args:
       - -c
       - |
-        wget -O /workspace/cloud_sql_proxy https://storage.googleapis.com/cloudsql-proxy/v1.37.0/cloud_sql_proxy.linux.386
-        chmod +x /workspace/cloud_sql_proxy
+        wget -O /workspace/cloud-sql-proxy https://storage.googleapis.com/cloud-sql-connectors/cloud-sql-proxy/v2.13.0/cloud-sql-proxy.linux.386
+        chmod +x /workspace/cloud-sql-proxy
 
   - id: Run integration tests
     name: python:${_VERSION}
@@ -45,7 +45,7 @@ steps:
     args:
      - "-c"
      - |
-        /workspace/cloud_sql_proxy -dir=/workspace -instances=${_INSTANCE_CONNECTION_NAME}=tcp:$_IP_ADDRESS:$_DATABASE_PORT & sleep 2;
+        /workspace/cloud-sql-proxy ${_INSTANCE_CONNECTION_NAME} --port $_DATABASE_PORT & sleep 2;
         python -m pytest --cov=langchain_google_cloud_sql_pg --cov-config=.coveragerc tests/
 
 availableSecrets:

From 7ef9335a45578273e9ffc0921f60a1c6cc3e89ed Mon Sep 17 00:00:00 2001
From: Anubhav Dhawan
Date: Fri, 13 Sep 2024 12:22:07 +0530
Subject: [PATCH 09/12] docs: Update sample python notebooks to reflect the
 support for custom schema. (#204)

Specifying custom database schema names is now supported by all library
methods. This commit highlights the same in the python notebooks.
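As a condensed sketch of the pattern the notebook diffs below demonstrate — illustrative only, with placeholder connection details, and assuming the synchronous `from_instance` constructor used elsewhere in the docs — the `schema_name` keyword threads through both the table-initialization helper and the class constructor (the default schema is "public"):

```python
from langchain_google_cloud_sql_pg import PostgresChatMessageHistory, PostgresEngine

# Placeholder connection details -- substitute real values.
engine = PostgresEngine.from_instance(
    project_id="my-project",
    region="us-central1",
    instance="my-instance",
    database="my-database",
)

# Create the chat-history table inside a custom schema instead of "public".
engine.init_chat_history_table(table_name="message_store", schema_name="my_schema")

history = PostgresChatMessageHistory.create_sync(
    engine,
    session_id="test_session",
    table_name="message_store",
    schema_name="my_schema",
)
history.add_user_message("hi!")
history.add_ai_message("whats up?")
```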
---
 docs/chat_message_history.ipynb | 27 +++++++++++++++++++--
 docs/document_loader.ipynb      | 42 +++++++++++++++++++++++++++++----
 docs/vector_store.ipynb         | 24 +++++++++++++++++++
 3 files changed, 87 insertions(+), 6 deletions(-)

diff --git a/docs/chat_message_history.ipynb b/docs/chat_message_history.ipynb
index e9493dee..02e6f04f 100644
--- a/docs/chat_message_history.ipynb
+++ b/docs/chat_message_history.ipynb
@@ -287,6 +287,24 @@
     "engine.init_chat_history_table(table_name=TABLE_NAME)"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "345b76b8",
+   "metadata": {},
+   "source": [
+    "#### Optional Tip: 💡\n",
+    "You can also specify a schema name by passing `schema_name` wherever you pass `table_name`. Eg:\n",
+    "\n",
+    "```python\n",
+    "SCHEMA_NAME=\"my_schema\"\n",
+    "\n",
+    "engine.init_chat_history_table(\n",
+    "    table_name=TABLE_NAME,\n",
+    "    schema_name=SCHEMA_NAME      # Default: \"public\"\n",
+    ")\n",
+    "```"
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "zSYQTYf3UfOi",
@@ -300,7 +318,8 @@
    "\n",
    "1. `engine` - An instance of a `PostgresEngine` engine.\n",
    "1. `session_id` - A unique identifier string that specifies an id for the session.\n",
-   "1. `table_name` : The name of the table within the Cloud SQL database to store the chat message history."
+   "1. `table_name` : The name of the table within the Cloud SQL database to store the chat message history.\n",
+   "1. `schema_name` : The name of the database schema containing the chat message history table."
   ]
  },
@@ -315,7 +334,10 @@
    "from langchain_google_cloud_sql_pg import PostgresChatMessageHistory\n",
    "\n",
    "history = PostgresChatMessageHistory.create_sync(\n",
-   "    engine, session_id=\"test_session\", table_name=TABLE_NAME\n",
+   "    engine,\n",
+   "    session_id=\"test_session\",\n",
+   "    table_name=TABLE_NAME,\n",
+   "    # schema_name=SCHEMA_NAME,\n",
    ")\n",
    "history.add_user_message(\"hi!\")\n",
    "history.add_ai_message(\"whats up?\")"
@@ -456,6 +478,7 @@
    "        engine,\n",
    "        session_id=session_id,\n",
    "        table_name=TABLE_NAME,\n",
+   "        # schema_name=SCHEMA_NAME,\n",
    "    ),\n",
    "    input_messages_key=\"question\",\n",
    "    history_messages_key=\"history\",\n",
diff --git a/docs/document_loader.ipynb b/docs/document_loader.ipynb
index 84db4ae4..7be30a9b 100644
--- a/docs/document_loader.ipynb
+++ b/docs/document_loader.ipynb
@@ -257,6 +257,25 @@
     ")"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Optional Tip: 💡\n",
+    "You can also specify a schema name by passing `schema_name` wherever you pass `table_name`. Eg:\n",
+    "\n",
+    "```python\n",
+    "SCHEMA_NAME=\"my_schema\"\n",
+    "\n",
+    "await engine.ainit_document_table(\n",
+    "    table_name=TABLE_NAME,\n",
+    "    schema_name=SCHEMA_NAME      # Default: \"public\"\n",
+    "    \n",
+    "    ...\n",
+    ")\n",
+    "```"
+   ]
+  },
  {
   "cell_type": "markdown",
   "metadata": {
@@ -277,7 +296,11 @@
    "from langchain_google_cloud_sql_pg import PostgresLoader\n",
    "\n",
    "# Creating a basic PostgreSQL object\n",
-   "loader = await PostgresLoader.create(engine, table_name=TABLE_NAME)"
+   "loader = await PostgresLoader.create(\n",
+   "    engine,\n",
+   "    table_name=TABLE_NAME,\n",
+   "    # schema_name=SCHEMA_NAME,\n",
+   ")"
@@ -304,7 +327,11 @@
    "from langchain_google_cloud_sql_pg import PostgresLoader\n",
    "\n",
    "# Creating a basic PostgresLoader object\n",
-   "loader = await PostgresLoader.create(engine, table_name=TABLE_NAME)\n",
+   "loader = await PostgresLoader.create(\n",
+   "    engine,\n",
+   "    table_name=TABLE_NAME,\n",
+   "    # schema_name=SCHEMA_NAME,\n",
+   ")\n",
    "\n",
    "docs = await loader.aload()\n",
    "print(docs)"
@@ -328,6 +355,7 @@
    "loader = await PostgresLoader.create(\n",
    "    engine,\n",
    "    table_name=TABLE_NAME,\n",
+   "    # schema_name=SCHEMA_NAME,\n",
    "    content_columns=[\"product_name\"],  # Optional\n",
    "    metadata_columns=[\"id\"],  # Optional\n",
    ")\n",
@@ -356,6 +384,7 @@
    "loader = await PostgresLoader.create(\n",
    "    engine,\n",
    "    table_name=TABLE_NAME,\n",
+   "    # schema_name=SCHEMA_NAME,\n",
    "    content_columns=[\"product_name\", \"description\"],\n",
    "    format=\"YAML\",\n",
    ")\n",
@@ -383,6 +412,7 @@
    "saver = await PostgresDocumentSaver.create(\n",
    "    engine,\n",
    "    table_name=TABLE_NAME,\n",
+   "    # schema_name=SCHEMA_NAME,\n",
    "    content_column=\"product_name\",\n",
    "    metadata_columns=[\"description\", \"content\"],\n",
    "    metadata_json_column=\"metadata\",\n",
@@ -427,7 +457,7 @@
    "metadata": {},
    "source": [
     "### Load the documents with PostgresLoader\n",
-    "PostgresLoader can be used with `TABLE_NAME` to query and load the whole table."
+    "PostgresLoader can be used with `TABLE_NAME` (and optionally `SCHEMA_NAME`) to query and load the whole table."
    ]
   },
@@ -436,7 +466,11 @@
    "metadata": {},
    "outputs": [],
    "source": [
-   "loader = await PostgresLoader.create(engine, table_name=TABLE_NAME)\n",
+   "loader = await PostgresLoader.create(\n",
+   "    engine,\n",
+   "    table_name=TABLE_NAME,\n",
+   "    # schema_name=SCHEMA_NAME,\n",
+   ")\n",
    "docs = await loader.aload()\n",
    "\n",
    "print(docs)"
diff --git a/docs/vector_store.ipynb b/docs/vector_store.ipynb
index 60839763..a253b3a6 100644
--- a/docs/vector_store.ipynb
+++ b/docs/vector_store.ipynb
@@ -258,6 +258,25 @@
     ")"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "#### Optional Tip: 💡\n",
+    "You can also specify a schema name by passing `schema_name` wherever you pass `table_name`. Eg:\n",
+    "\n",
+    "```python\n",
+    "SCHEMA_NAME=\"my_schema\"\n",
+    "\n",
+    "await engine.ainit_vectorstore_table(\n",
+    "    table_name=TABLE_NAME,\n",
+    "    schema_name=SCHEMA_NAME,  # Default: \"public\"\n",
+    "    \n",
+    "    ...\n",
+    ")\n",
+    "```"
+   ]
+  },
  {
   "cell_type": "markdown",
   "metadata": {},
@@ -322,6 +341,7 @@
    "store = await PostgresVectorStore.create(  # Use .create() to initialize an async vector store\n",
    "    engine=engine,\n",
    "    table_name=TABLE_NAME,\n",
+   "    # schema_name=SCHEMA_NAME,\n",
    "    embedding_service=embedding,\n",
    ")"
@@ -365,6 +385,7 @@
    "    ids=ids,\n",
    "    engine=engine,\n",
    "    table_name=TABLE_NAME,\n",
+   "    # schema_name=SCHEMA_NAME,\n",
    "    embedding_service=embedding,\n",
    ")"
@@ -515,9 +536,11 @@
    "\n",
    "# Set table name\n",
    "TABLE_NAME = \"vectorstore_custom\"\n",
+   "# SCHEMA_NAME = \"my_schema\"\n",
    "\n",
    "await engine.ainit_vectorstore_table(\n",
    "    table_name=TABLE_NAME,\n",
+   "    # schema_name=SCHEMA_NAME,\n",
    "    vector_size=768,  # VertexAI model: textembedding-gecko@latest\n",
    "    metadata_columns=[Column(\"len\", \"INTEGER\")],\n",
    ")\n",
@@ -527,6 +550,7 @@
    "custom_store = await PostgresVectorStore.create(\n",
    "    engine=engine,\n",
    "    table_name=TABLE_NAME,\n",
+   "    # schema_name=SCHEMA_NAME,\n",
    "    embedding_service=embedding,\n",
    "    metadata_columns=[\"len\"],\n",
    "    # Connect to a existing VectorStore by customizing the table schema:\n",

From ffaa87fd864d1c3ffeb00a34370af9e986a37cf5 Mon Sep 17 00:00:00 2001
From: dishaprakash <57954147+dishaprakash@users.noreply.github.com>
Date: Tue, 17 Sep 2024 05:14:49 +0000
Subject: [PATCH 10/12] feat: allow non-uuid data types for vectorstore
 primary key (#209)

* feat: allow non-uuid data types for vectorstore primary key

* Update src/langchain_google_cloud_sql_pg/engine.py

* Update src/langchain_google_cloud_sql_pg/engine.py

* Update src/langchain_google_cloud_sql_pg/engine.py

---------

Co-authored-by: Averi Kitsch
---
 .../async_vectorstore.py                     | 53 +++++++++----
 src/langchain_google_cloud_sql_pg/engine.py  | 27 ++++---
 .../vectorstore.py                           | 78 ++++++++++++++-----
 tests/test_async_vectorstore_from_methods.py | 40 ++++++++++
 tests/test_engine.py                         | 52 +++++++++++++
 tests/test_vectorstore_from_methods.py       | 69 +++++++++++++++-
 6 files changed, 272 insertions(+), 47 deletions(-)

diff --git a/src/langchain_google_cloud_sql_pg/async_vectorstore.py b/src/langchain_google_cloud_sql_pg/async_vectorstore.py
index f74a1020..fcf92dbf 100644
--- a/src/langchain_google_cloud_sql_pg/async_vectorstore.py
+++ b/src/langchain_google_cloud_sql_pg/async_vectorstore.py
@@ -226,10 +226,14 @@ async def __aadd_embeddings(
         texts: Iterable[str],
         embeddings: List[List[float]],
         metadatas: Optional[List[dict]] = None,
-        ids: Optional[List[str]] = None,
+        ids: Optional[List] = None,
         **kwargs: Any,
     ) -> List[str]:
-        """Add embeddings to the table."""
+        """Add embeddings to the table.
+
+        Raises:
+            :class:`InvalidTextRepresentationError`: if the `ids` data type does not match that of the `id_column`.
+        """
         if not ids:
             ids = [str(uuid.uuid4()) for _ in texts]
         if not metadatas:
@@ -276,10 +280,14 @@ async def aadd_texts(
         self,
         texts: Iterable[str],
         metadatas: Optional[List[dict]] = None,
-        ids: Optional[List[str]] = None,
+        ids: Optional[List] = None,
         **kwargs: Any,
     ) -> List[str]:
-        """Embed texts and add to the table."""
+        """Embed texts and add to the table.
+
+        Raises:
+            :class:`InvalidTextRepresentationError`: if the `ids` data type does not match that of the `id_column`.
+        """
         embeddings = self.embedding_service.embed_documents(list(texts))
         ids = await self.__aadd_embeddings(
             texts, embeddings, metadatas=metadatas, ids=ids, **kwargs
@@ -289,10 +297,14 @@ async def aadd_documents(
         self,
         documents: List[Document],
-        ids: Optional[List[str]] = None,
+        ids: Optional[List] = None,
         **kwargs: Any,
     ) -> List[str]:
-        """Embed documents and add to the table"""
+        """Embed documents and add to the table.
+
+        Raises:
+            :class:`InvalidTextRepresentationError`: if the `ids` data type does not match that of the `id_column`.
+        """
         texts = [doc.page_content for doc in documents]
         metadatas = [doc.metadata for doc in documents]
         ids = await self.aadd_texts(texts, metadatas=metadatas, ids=ids, **kwargs)
@@ -300,10 +312,14 @@ async def adelete(
         self,
-        ids: Optional[List[str]] = None,
+        ids: Optional[List] = None,
         **kwargs: Any,
     ) -> Optional[bool]:
-        """Delete records from the table."""
+        """Delete records from the table.
+
+        Raises:
+            :class:`InvalidTextRepresentationError`: if the `ids` data type does not match that of the `id_column`.
+        """
         if not ids:
             return False
@@ -323,7 +339,7 @@ async def afrom_texts(  # type: ignore[override]
         table_name: str,
         schema_name: str = "public",
         metadatas: Optional[List[dict]] = None,
-        ids: Optional[List[str]] = None,
+        ids: Optional[List] = None,
         content_column: str = "content",
         embedding_column: str = "embedding",
         metadata_columns: List[str] = [],
@@ -338,6 +354,7 @@ async def afrom_texts(  # type: ignore[override]
         **kwargs: Any,
     ) -> AsyncPostgresVectorStore:
         """Create an AsyncPostgresVectorStore instance from texts.
+
         Args:
             texts (List[str]): Texts to add to the vector store.
             embedding (Embeddings): Text embedding model to use.
@@ -358,6 +375,9 @@ async def afrom_texts(  # type: ignore[override]
             lambda_mult (float): Number between 0 and 1 that determines the degree of diversity among the results with 0 corresponding to maximum diversity and 1 to minimum diversity. Defaults to 0.5.
             index_query_options (QueryOptions): Index query option.
 
+        Raises:
+            :class:`InvalidTextRepresentationError`: if the `ids` data type does not match that of the `id_column`.
+
         Returns:
             AsyncPostgresVectorStore
         """
@@ -389,7 +409,7 @@ async def afrom_documents(  # type: ignore[override]
         engine: PostgresEngine,
         table_name: str,
         schema_name: str = "public",
-        ids: Optional[List[str]] = None,
+        ids: Optional[List] = None,
         content_column: str = "content",
         embedding_column: str = "embedding",
         metadata_columns: List[str] = [],
@@ -425,6 +445,9 @@ async def afrom_documents(  # type: ignore[override]
             lambda_mult (float): Number between 0 and 1 that determines the degree of diversity among the results with 0 corresponding to maximum diversity and 1 to minimum diversity. Defaults to 0.5.
             index_query_options (QueryOptions): Index query option.
 
+        Raises:
+            :class:`InvalidTextRepresentationError`: if the `ids` data type does not match that of the `id_column`.
+
         Returns:
             AsyncPostgresVectorStore
         """
@@ -735,7 +758,7 @@ def add_texts(
         self,
         texts: Iterable[str],
         metadatas: Optional[List[dict]] = None,
-        ids: Optional[List[str]] = None,
+        ids: Optional[List] = None,
         **kwargs: Any,
     ) -> List[str]:
         raise NotImplementedError(
@@ -745,7 +768,7 @@ def add_documents(
         self,
         documents: List[Document],
-        ids: Optional[List[str]] = None,
+        ids: Optional[List] = None,
         **kwargs: Any,
     ) -> List[str]:
         raise NotImplementedError(
@@ -754,7 +777,7 @@ def delete(
         self,
-        ids: Optional[List[str]] = None,
+        ids: Optional[List] = None,
         **kwargs: Any,
     ) -> Optional[bool]:
         raise NotImplementedError(
@@ -769,7 +792,7 @@ def from_texts(  # type: ignore[override]
         engine: PostgresEngine,
         table_name: str,
         metadatas: Optional[List[dict]] = None,
-        ids: Optional[List[str]] = None,
+        ids: Optional[List] = None,
         content_column: str = "content",
         embedding_column: str = "embedding",
         metadata_columns: List[str] = [],
@@ -789,7 +812,7 @@ def from_documents(  # type: ignore[override]
         embedding: Embeddings,
         engine: PostgresEngine,
         table_name: str,
-        ids: Optional[List[str]] = None,
+        ids: Optional[List] = None,
         content_column: str = "content",
         embedding_column: str = "embedding",
         metadata_columns: List[str] = [],
diff --git a/src/langchain_google_cloud_sql_pg/engine.py b/src/langchain_google_cloud_sql_pg/engine.py
index 3548f4af..33b7a83f 100644
--- a/src/langchain_google_cloud_sql_pg/engine.py
+++ b/src/langchain_google_cloud_sql_pg/engine.py
@@ -410,7 +410,7 @@ async def _ainit_vectorstore_table(
         embedding_column: str = "embedding",
         metadata_columns: List[Column] = [],
         metadata_json_column: str = "langchain_metadata",
-        id_column: str = "langchain_id",
+        id_column: Union[str, Column] = "langchain_id",
         overwrite_existing: bool = False,
         store_metadata: bool = True,
     ) -> None:
@@ -430,14 +430,14 @@ async def _ainit_vectorstore_table(
                 metadata. Default: []. Optional.
             metadata_json_column (str): The column to store extra metadata in JSON format.
                 Default: "langchain_metadata". Optional.
-            id_column (str): Name of the column to store ids.
-                Default: "langchain_id". Optional,
+            id_column (Union[str, Column]): Column to store ids.
+                Default: "langchain_id" column name with data type UUID. Optional.
             overwrite_existing (bool): Whether to drop existing table. Default: False.
             store_metadata (bool): Whether to store metadata in the table. Default: True.
-
         Raises:
             :class:`DuplicateTableError`: if table already exists and overwrite flag is not set.
+            :class:`UndefinedObjectError`: if the data type of the id column is not a PostgreSQL data type.
""" async with self._pool.connect() as conn: await conn.execute(text("CREATE EXTENSION IF NOT EXISTS vector")) @@ -450,8 +450,11 @@ async def _ainit_vectorstore_table( ) await conn.commit() + id_data_type = "UUID" if isinstance(id_column, str) else id_column.data_type + id_column_name = id_column if isinstance(id_column, str) else id_column.name + query = f"""CREATE TABLE "{schema_name}"."{table_name}"( - "{id_column}" UUID PRIMARY KEY, + "{id_column_name}" {id_data_type} PRIMARY KEY, "{content_column}" TEXT NOT NULL, "{embedding_column}" vector({vector_size}) NOT NULL""" for column in metadata_columns: @@ -474,7 +477,7 @@ async def ainit_vectorstore_table( embedding_column: str = "embedding", metadata_columns: List[Column] = [], metadata_json_column: str = "langchain_metadata", - id_column: str = "langchain_id", + id_column: Union[str, Column] = "langchain_id", overwrite_existing: bool = False, store_metadata: bool = True, ) -> None: @@ -494,8 +497,8 @@ async def ainit_vectorstore_table( metadata. Default: []. Optional. metadata_json_column (str): The column to store extra metadata in JSON format. Default: "langchain_metadata". Optional. - id_column (str): Name of the column to store ids. - Default: "langchain_id". Optional, + id_column (Union[str, Column]) : Column to store ids. + Default: "langchain_id" column name with data type UUID. Optional. overwrite_existing (bool): Whether to drop existing table. Default: False. store_metadata (bool): Whether to store metadata in the table. Default: True. @@ -524,7 +527,7 @@ def init_vectorstore_table( embedding_column: str = "embedding", metadata_columns: List[Column] = [], metadata_json_column: str = "langchain_metadata", - id_column: str = "langchain_id", + id_column: Union[str, Column] = "langchain_id", overwrite_existing: bool = False, store_metadata: bool = True, ) -> None: @@ -544,11 +547,13 @@ def init_vectorstore_table( metadata. Default: []. Optional. metadata_json_column (str): The column to store extra metadata in JSON format. Default: "langchain_metadata". Optional. - id_column (str): Name of the column to store ids. - Default: "langchain_id". Optional, + id_column (Union[str, Column]) : Column to store ids. + Default: "langchain_id" column name with data type UUID. Optional. overwrite_existing (bool): Whether to drop existing table. Default: False. store_metadata (bool): Whether to store metadata in the table. Default: True. + Raises: + :class:`UndefinedObjectError `: if the `ids` data type does not match that of the `id_column`. """ self._run_as_sync( self._ainit_vectorstore_table( diff --git a/src/langchain_google_cloud_sql_pg/vectorstore.py b/src/langchain_google_cloud_sql_pg/vectorstore.py index 39b79de6..109e5d9a 100644 --- a/src/langchain_google_cloud_sql_pg/vectorstore.py +++ b/src/langchain_google_cloud_sql_pg/vectorstore.py @@ -187,10 +187,14 @@ async def aadd_texts( self, texts: Iterable[str], metadatas: Optional[List[dict]] = None, - ids: Optional[List[str]] = None, + ids: Optional[List] = None, **kwargs: Any, ) -> List[str]: - """Embed texts and add to the table.""" + """Embed texts and add to the table. + + Raises: + :class:`InvalidTextRepresentationError `: if the `ids` data type does not match that of the `id_column`. 
+        """
         return await self._engine._run_as_async(
             self.__vs.aadd_texts(texts, metadatas, ids, **kwargs)
         )
@@ -199,10 +203,14 @@ def add_texts(
         self,
         texts: Iterable[str],
         metadatas: Optional[List[dict]] = None,
-        ids: Optional[List[str]] = None,
+        ids: Optional[List] = None,
         **kwargs: Any,
     ) -> List[str]:
-        """Embed texts and add to the table."""
+        """Embed texts and add to the table.
+
+        Raises:
+            :class:`InvalidTextRepresentationError`: if the `ids` data type does not match that of the `id_column`.
+        """
         return self._engine._run_as_sync(
             self.__vs.aadd_texts(texts, metadatas, ids, **kwargs)
         )
@@ -210,10 +218,14 @@ async def aadd_documents(
         self,
         documents: List[Document],
-        ids: Optional[List[str]] = None,
+        ids: Optional[List] = None,
         **kwargs: Any,
     ) -> List[str]:
-        """Embed documents and add to the table"""
+        """Embed documents and add to the table.
+
+        Raises:
+            :class:`InvalidTextRepresentationError`: if the `ids` data type does not match that of the `id_column`.
+        """
         return await self._engine._run_as_async(
             self.__vs.aadd_documents(documents, ids, **kwargs)
         )
@@ -221,28 +233,40 @@ def add_documents(
         self,
         documents: List[Document],
-        ids: Optional[List[str]] = None,
+        ids: Optional[List] = None,
         **kwargs: Any,
     ) -> List[str]:
-        """Embed documents and add to the table."""
+        """Embed documents and add to the table.
+
+        Raises:
+            :class:`InvalidTextRepresentationError`: if the `ids` data type does not match that of the `id_column`.
+        """
         return self._engine._run_as_sync(
             self.__vs.aadd_documents(documents, ids, **kwargs)
         )
 
     async def adelete(
         self,
-        ids: Optional[List[str]] = None,
+        ids: Optional[List] = None,
         **kwargs: Any,
     ) -> Optional[bool]:
-        """Delete records from the table."""
+        """Delete records from the table.
+
+        Raises:
+            :class:`InvalidTextRepresentationError`: if the `ids` data type does not match that of the `id_column`.
+        """
         return await self._engine._run_as_async(self.__vs.adelete(ids, **kwargs))
 
     def delete(
         self,
-        ids: Optional[List[str]] = None,
+        ids: Optional[List] = None,
         **kwargs: Any,
     ) -> Optional[bool]:
-        """Delete records from the table."""
+        """Delete records from the table.
+
+        Raises:
+            :class:`InvalidTextRepresentationError`: if the `ids` data type does not match that of the `id_column`.
+        """
         return self._engine._run_as_sync(self.__vs.adelete(ids, **kwargs))
 
     @classmethod
@@ -254,7 +278,7 @@ async def afrom_texts(  # type: ignore[override]
         table_name: str,
         schema_name: str = "public",
         metadatas: Optional[List[dict]] = None,
-        ids: Optional[List[str]] = None,
+        ids: Optional[List] = None,
         content_column: str = "content",
         embedding_column: str = "embedding",
         metadata_columns: List[str] = [],
@@ -268,6 +292,7 @@ async def afrom_texts(  # type: ignore[override]
         index_query_options: Optional[QueryOptions] = None,
     ) -> PostgresVectorStore:
         """Create an PostgresVectorStore instance from texts.
+
         Args:
             texts (List[str]): Texts to add to the vector store.
             embedding (Embeddings): Text embedding model to use.
             engine (PostgresEngine): Connection pool engine for managing connections to Cloud SQL database.
             table_name (str): Name of the existing table or the table to be created.
             schema_name (str, optional): Database schema name of the table. Defaults to "public".
             metadatas (Optional[List[dict]]): List of metadatas to add to table records.
-            ids: (Optional[List[str]]): List of IDs to add to table records.
+            ids: (Optional[List]): List of IDs to add to table records.
             content_column (str): Column that represent a Document’s page_content. Defaults to "content".
             embedding_column (str): Column for embedding vectors. The embedding is generated from the document value. Defaults to "embedding".
             metadata_columns (List[str]): Column(s) that represent a document's metadata.
@@ -288,6 +313,9 @@ async def afrom_texts(  # type: ignore[override]
             lambda_mult (float): Number between 0 and 1 that determines the degree of diversity among the results with 0 corresponding to maximum diversity and 1 to minimum diversity. Defaults to 0.5.
             index_query_options (QueryOptions): Index query option.
 
+        Raises:
+            :class:`InvalidTextRepresentationError`: if the `ids` data type does not match that of the `id_column`.
+
         Returns:
             PostgresVectorStore
         """
@@ -319,7 +347,7 @@ async def afrom_documents(  # type: ignore[override]
         engine: PostgresEngine,
         table_name: str,
         schema_name: str = "public",
-        ids: Optional[List[str]] = None,
+        ids: Optional[List] = None,
         content_column: str = "content",
         embedding_column: str = "embedding",
         metadata_columns: List[str] = [],
@@ -341,7 +369,7 @@ async def afrom_documents(  # type: ignore[override]
             table_name (str): Name of the existing table or the table to be created.
             schema_name (str, optional): Database schema name of the table. Defaults to "public".
             metadatas (Optional[List[dict]]): List of metadatas to add to table records.
-            ids: (Optional[List[str]]): List of IDs to add to table records.
+            ids: (Optional[List]): List of IDs to add to table records.
             content_column (str): Column that represent a Document’s page_content. Defaults to "content".
             embedding_column (str): Column for embedding vectors. The embedding is generated from the document value. Defaults to "embedding".
             metadata_columns (List[str]): Column(s) that represent a document's metadata.
@@ -354,6 +382,9 @@ async def afrom_documents(  # type: ignore[override]
             lambda_mult (float): Number between 0 and 1 that determines the degree of diversity among the results with 0 corresponding to maximum diversity and 1 to minimum diversity. Defaults to 0.5.
             index_query_options (QueryOptions): Index query option.
 
+        Raises:
+            :class:`InvalidTextRepresentationError`: if the `ids` data type does not match that of the `id_column`.
+
         Returns:
             PostgresVectorStore
         """
@@ -386,7 +417,7 @@ def from_texts(  # type: ignore[override]
         table_name: str,
         schema_name: str = "public",
         metadatas: Optional[List[dict]] = None,
-        ids: Optional[List[str]] = None,
+        ids: Optional[List] = None,
         content_column: str = "content",
         embedding_column: str = "embedding",
         metadata_columns: List[str] = [],
@@ -400,6 +431,7 @@ def from_texts(  # type: ignore[override]
         index_query_options: Optional[QueryOptions] = None,
     ) -> PostgresVectorStore:
         """Create an PostgresVectorStore instance from texts.
+
         Args:
             texts (List[str]): Texts to add to the vector store.
             embedding (Embeddings): Text embedding model to use.
             engine (PostgresEngine): Connection pool engine for managing connections to Cloud SQL database.
             table_name (str): Name of the existing table or the table to be created.
             schema_name (str, optional): Database schema name of the table. Defaults to "public".
             metadatas (Optional[List[dict]]): List of metadatas to add to table records.
-            ids: (Optional[List[str]]): List of IDs to add to table records.
+            ids: (Optional[List]): List of IDs to add to table records.
             content_column (str): Column that represent a Document’s page_content. Defaults to "content".
             embedding_column (str): Column for embedding vectors. The embedding is generated from the document value. Defaults to "embedding".
             metadata_columns (List[str]): Column(s) that represent a document's metadata.
@@ -420,6 +452,9 @@ def from_texts(  # type: ignore[override]
             lambda_mult (float): Number between 0 and 1 that determines the degree of diversity among the results with 0 corresponding to maximum diversity and 1 to minimum diversity. Defaults to 0.5.
             index_query_options (QueryOptions): Index query option.
 
+        Raises:
+            :class:`InvalidTextRepresentationError`: if the `ids` data type does not match that of the `id_column`.
+
         Returns:
             PostgresVectorStore
         """
@@ -451,7 +486,7 @@ def from_documents(  # type: ignore[override]
         engine: PostgresEngine,
         table_name: str,
         schema_name: str = "public",
-        ids: Optional[List[str]] = None,
+        ids: Optional[List] = None,
         content_column: str = "content",
         embedding_column: str = "embedding",
         metadata_columns: List[str] = [],
@@ -473,7 +508,7 @@ def from_documents(  # type: ignore[override]
             table_name (str): Name of the existing table or the table to be created.
             schema_name (str, optional): Database schema name of the table. Defaults to "public".
             metadatas (Optional[List[dict]]): List of metadatas to add to table records.
-            ids: (Optional[List[str]]): List of IDs to add to table records.
+            ids: (Optional[List]): List of IDs to add to table records.
             content_column (str): Column that represent a Document’s page_content. Defaults to "content".
             embedding_column (str): Column for embedding vectors. The embedding is generated from the document value. Defaults to "embedding".
             metadata_columns (List[str]): Column(s) that represent a document's metadata.
@@ -486,6 +521,9 @@ def from_documents(  # type: ignore[override]
             lambda_mult (float): Number between 0 and 1 that determines the degree of diversity among the results with 0 corresponding to maximum diversity and 1 to minimum diversity. Defaults to 0.5.
             index_query_options (QueryOptions): Index query option.
 
+        Raises:
+            :class:`InvalidTextRepresentationError`: if the `ids` data type does not match that of the `id_column`.
+
         Returns:
             PostgresVectorStore
         """
diff --git a/tests/test_async_vectorstore_from_methods.py b/tests/test_async_vectorstore_from_methods.py
index fd933508..59274f6a 100644
--- a/tests/test_async_vectorstore_from_methods.py
+++ b/tests/test_async_vectorstore_from_methods.py
@@ -29,6 +29,9 @@
 DEFAULT_TABLE = "test_table" + str(uuid.uuid4()).replace("-", "_")
 DEFAULT_TABLE_SYNC = "test_table_sync" + str(uuid.uuid4()).replace("-", "_")
 CUSTOM_TABLE = "test_table_custom" + str(uuid.uuid4()).replace("-", "_")
+CUSTOM_TABLE_WITH_INT_ID = "test_table_custom_with_int_id" + str(uuid.uuid4()).replace(
+    "-", "_"
+)
 VECTOR_SIZE = 768
@@ -100,9 +103,19 @@ async def engine(self, db_project, db_region, db_instance, db_name):
             metadata_columns=[Column("page", "TEXT"), Column("source", "TEXT")],
             store_metadata=False,
         )
+        await engine._ainit_vectorstore_table(
+            CUSTOM_TABLE_WITH_INT_ID,
+            VECTOR_SIZE,
+            id_column=Column(name="integer_id", data_type="INTEGER", nullable=False),
+            content_column="mycontent",
+            embedding_column="myembedding",
+            metadata_columns=[Column("page", "TEXT"), Column("source", "TEXT")],
+            store_metadata=False,
+        )
         yield engine
         await aexecute(engine, f"DROP TABLE IF EXISTS {DEFAULT_TABLE}")
         await aexecute(engine, f"DROP TABLE IF EXISTS {CUSTOM_TABLE}")
+        await aexecute(engine, f"DROP TABLE IF EXISTS {CUSTOM_TABLE_WITH_INT_ID}")
         await engine.close()
 
     async def test_afrom_texts(self, engine):
@@ -180,3 +193,30 @@ async def test_afrom_docs_custom(self, engine):
         assert results[0]["page"] == "0"
         assert results[0]["source"] == "google.com"
         await aexecute(engine, f"TRUNCATE TABLE {CUSTOM_TABLE}")
+
+    async def test_afrom_docs_custom_with_int_id(self, engine):
+        ids = [i for i in range(len(texts))]
+        docs = [
+            Document(
+                page_content=texts[i],
+                metadata={"page": str(i), "source": "google.com"},
+            )
+            for i in range(len(texts))
+        ]
+        await AsyncPostgresVectorStore.afrom_documents(
+            docs,
+            embeddings_service,
+            engine,
+            CUSTOM_TABLE_WITH_INT_ID,
+            ids=ids,
+            id_column="integer_id",
+            content_column="mycontent",
+            embedding_column="myembedding",
+            metadata_columns=["page", "source"],
+        )
+
+        results = await afetch(engine, f"SELECT * FROM {CUSTOM_TABLE_WITH_INT_ID}")
+        assert len(results) == 3
+        for row in results:
+            assert isinstance(row["integer_id"], int)
+        await aexecute(engine, f"TRUNCATE TABLE {CUSTOM_TABLE_WITH_INT_ID}")
diff --git a/tests/test_engine.py b/tests/test_engine.py
index 70e18802..5e117b0e 100644
--- a/tests/test_engine.py
+++ b/tests/test_engine.py
@@ -31,8 +31,12 @@
 DEFAULT_TABLE = "test_table" + str(uuid.uuid4()).replace("-", "_")
 CUSTOM_TABLE = "test_table_custom" + str(uuid.uuid4()).replace("-", "_")
+INT_ID_CUSTOM_TABLE = "test_table_custom_int_id" + str(uuid.uuid4()).replace("-", "_")
 DEFAULT_TABLE_SYNC = "test_table" + str(uuid.uuid4()).replace("-", "_")
 CUSTOM_TABLE_SYNC = "test_table_custom" + str(uuid.uuid4()).replace("-", "_")
+INT_ID_CUSTOM_TABLE_SYNC = "test_table_custom_int_id" + str(uuid.uuid4()).replace(
+    "-", "_"
+)
 VECTOR_SIZE = 768
 embeddings_service = DeterministicFakeEmbedding(size=VECTOR_SIZE)
@@ -110,6 +114,7 @@ async def engine(self, db_project, db_region, db_instance, db_name):
         yield engine
         await aexecute(engine, f'DROP TABLE "{CUSTOM_TABLE}"')
         await aexecute(engine, f'DROP TABLE "{DEFAULT_TABLE}"')
+        await aexecute(engine, f'DROP TABLE "{INT_ID_CUSTOM_TABLE}"')
         await engine.close()
 
     async def test_init_table(self, engine):
@@ -143,6 +148,29 @@ async def test_init_table_custom(self, engine):
         for row in results:
             assert row in expected
 
test_init_table_with_int_id(self, engine): + await engine.ainit_vectorstore_table( + INT_ID_CUSTOM_TABLE, + VECTOR_SIZE, + id_column=Column(name="integer_id", data_type="INTEGER", nullable="False"), + content_column="my-content", + embedding_column="my_embedding", + metadata_columns=[Column("page", "TEXT"), Column("source", "TEXT")], + store_metadata=True, + ) + stmt = f"SELECT column_name, data_type FROM information_schema.columns WHERE table_name = '{INT_ID_CUSTOM_TABLE}';" + results = await afetch(engine, stmt) + expected = [ + {"column_name": "integer_id", "data_type": "integer"}, + {"column_name": "my_embedding", "data_type": "USER-DEFINED"}, + {"column_name": "langchain_metadata", "data_type": "json"}, + {"column_name": "my-content", "data_type": "text"}, + {"column_name": "page", "data_type": "text"}, + {"column_name": "source", "data_type": "text"}, + ] + for row in results: + assert row in expected + async def test_password( self, db_project, @@ -306,6 +334,7 @@ async def engine(self, db_project, db_region, db_instance, db_name): yield engine await aexecute(engine, f'DROP TABLE "{CUSTOM_TABLE_SYNC}"') await aexecute(engine, f'DROP TABLE "{DEFAULT_TABLE_SYNC}"') + await aexecute(engine, f'DROP TABLE "{INT_ID_CUSTOM_TABLE_SYNC}"') await engine.close() async def test_init_table(self, engine): @@ -339,6 +368,29 @@ async def test_init_table_custom(self, engine): for row in results: assert row in expected + async def test_init_table_with_int_id(self, engine): + engine.init_vectorstore_table( + INT_ID_CUSTOM_TABLE_SYNC, + VECTOR_SIZE, + id_column=Column(name="integer_id", data_type="INTEGER", nullable=False), + content_column="my-content", + embedding_column="my_embedding", + metadata_columns=[Column("page", "TEXT"), Column("source", "TEXT")], + store_metadata=True, + ) + stmt = f"SELECT column_name, data_type FROM information_schema.columns WHERE table_name = '{INT_ID_CUSTOM_TABLE_SYNC}';" + results = await afetch(engine, stmt) + expected = [ + {"column_name": "integer_id", "data_type": "integer"}, + {"column_name": "my_embedding", "data_type": "USER-DEFINED"}, + {"column_name": "langchain_metadata", "data_type": "json"}, + {"column_name": "my-content", "data_type": "text"}, + {"column_name": "page", "data_type": "text"}, + {"column_name": "source", "data_type": "text"}, + ] + for row in results: + assert row in expected + async def test_password( self, db_project, diff --git a/tests/test_vectorstore_from_methods.py b/tests/test_vectorstore_from_methods.py index 8b161f6f..fadf8fc1 100644 --- a/tests/test_vectorstore_from_methods.py +++ b/tests/test_vectorstore_from_methods.py @@ -29,6 +29,12 @@ DEFAULT_TABLE = "test_table" + str(uuid.uuid4()).replace("-", "_") DEFAULT_TABLE_SYNC = "test_table_sync" + str(uuid.uuid4()).replace("-", "_") CUSTOM_TABLE = "test_table_custom" + str(uuid.uuid4()).replace("-", "_") +CUSTOM_TABLE_WITH_INT_ID = "test_table_with_int_id" + str(uuid.uuid4()).replace( + "-", "_" +) +CUSTOM_TABLE_WITH_INT_ID_SYNC = "test_table_with_int_id" + str(uuid.uuid4()).replace( + "-", "_" +) VECTOR_SIZE = 768 @@ -109,9 +115,19 @@ async def engine(self, db_project, db_region, db_instance, db_name): metadata_columns=[Column("page", "TEXT"), Column("source", "TEXT")], store_metadata=False, ) + await engine.ainit_vectorstore_table( + CUSTOM_TABLE_WITH_INT_ID, + VECTOR_SIZE, + id_column=Column(name="integer_id", data_type="INTEGER", nullable="False"), + content_column="mycontent", + embedding_column="myembedding", + metadata_columns=[Column("page", "TEXT"), Column("source", 
"TEXT")], + store_metadata=False, + ) yield engine await aexecute(engine, f"DROP TABLE IF EXISTS {DEFAULT_TABLE}") await aexecute(engine, f"DROP TABLE IF EXISTS {CUSTOM_TABLE}") + await aexecute(engine, f"DROP TABLE IF EXISTS {CUSTOM_TABLE_WITH_INT_ID}") await engine.close() @pytest_asyncio.fixture @@ -123,9 +139,18 @@ async def engine_sync(self, db_project, db_region, db_instance, db_name): database=db_name, ) engine.init_vectorstore_table(DEFAULT_TABLE_SYNC, VECTOR_SIZE) - + engine.init_vectorstore_table( + CUSTOM_TABLE_WITH_INT_ID_SYNC, + VECTOR_SIZE, + id_column=Column(name="integer_id", data_type="INTEGER", nullable="False"), + content_column="mycontent", + embedding_column="myembedding", + metadata_columns=[Column("page", "TEXT"), Column("source", "TEXT")], + store_metadata=False, + ) yield engine await aexecute(engine, f"DROP TABLE IF EXISTS {DEFAULT_TABLE_SYNC}") + await aexecute(engine, f"DROP TABLE IF EXISTS {CUSTOM_TABLE_WITH_INT_ID_SYNC}") await engine.close() async def test_afrom_texts(self, engine): @@ -256,3 +281,45 @@ async def test_afrom_docs_custom(self, engine): assert results[0]["page"] == "0" assert results[0]["source"] == "google.com" await aexecute(engine, f"TRUNCATE TABLE {CUSTOM_TABLE}") + + async def test_afrom_texts_custom_with_int_id(self, engine): + ids = [i for i in range(len(texts))] + await PostgresVectorStore.afrom_texts( + texts, + embeddings_service, + engine, + CUSTOM_TABLE_WITH_INT_ID, + metadatas=metadatas, + ids=ids, + id_column="integer_id", + content_column="mycontent", + embedding_column="myembedding", + metadata_columns=["page", "source"], + ) + results = await afetch(engine, f"SELECT * FROM {CUSTOM_TABLE_WITH_INT_ID}") + assert len(results) == 3 + for row in results: + assert isinstance(row["integer_id"], int) + await aexecute(engine, f"TRUNCATE TABLE {CUSTOM_TABLE_WITH_INT_ID}") + + async def test_from_texts_custom_with_int_id(self, engine_sync): + ids = [i for i in range(len(texts))] + PostgresVectorStore.from_texts( + texts, + embeddings_service, + engine_sync, + CUSTOM_TABLE_WITH_INT_ID_SYNC, + metadatas=metadatas, + ids=ids, + id_column="integer_id", + content_column="mycontent", + embedding_column="myembedding", + metadata_columns=["page", "source"], + ) + results = await afetch( + engine_sync, f"SELECT * FROM {CUSTOM_TABLE_WITH_INT_ID_SYNC}" + ) + assert len(results) == 3 + for row in results: + assert isinstance(row["integer_id"], int) + await aexecute(engine_sync, f"TRUNCATE TABLE {CUSTOM_TABLE_WITH_INT_ID_SYNC}") From c801959de8f7cef6f21506ef7fb9cb8a2e9f2f1b Mon Sep 17 00:00:00 2001 From: Mend Renovate Date: Tue, 17 Sep 2024 21:48:47 +0200 Subject: [PATCH 11/12] chore(deps): update github actions (#213) --- .github/workflows/lint.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 3e5996ce..f3b29a86 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -31,10 +31,10 @@ jobs: steps: - name: Checkout Repository - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4 + uses: actions/checkout@692973e3d937129bcbf40652eb9f2f61becf3332 # v4 - name: Setup Python - uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0 + uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0 with: python-version: "3.11" From e9a853450571631d33cdc1fce93bb1da0fa5625f Mon Sep 17 00:00:00 2001 From: "release-please[bot]" <55107282+release-please[bot]@users.noreply.github.com> Date: Tue, 17 Sep 2024 
15:38:23 -0700 Subject: [PATCH 12/12] chore(main): release 0.10.0 (#203) * chore(main): release 0.10.0 * Update CHANGELOG.md * Update CHANGELOG.md --------- Co-authored-by: release-please[bot] <55107282+release-please[bot]@users.noreply.github.com> Co-authored-by: Averi Kitsch --- CHANGELOG.md | 28 ++++++++++++++++++++ src/langchain_google_cloud_sql_pg/version.py | 2 +- 2 files changed, 29 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b62c7d1d..a5bc737a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,33 @@ # Changelog +## [0.10.0](https://github.com/googleapis/langchain-google-cloud-sql-pg-python/compare/v0.9.0...v0.10.0) (2024-09-17) + + +### ⚠ BREAKING CHANGES + +* support async and sync versions of indexing methods +* remove _aexecute(), _execute(), _afetch(), and _fetch() methods + +### Features + +* Add from_engine_args method ([de16842](https://github.com/googleapis/langchain-google-cloud-sql-pg-python/commit/de168427f9884f33332086b68308e1225ee9e952)) +* Add support for sync from_engine ([de16842](https://github.com/googleapis/langchain-google-cloud-sql-pg-python/commit/de168427f9884f33332086b68308e1225ee9e952)) +* Allow non-uuid data types for vectorstore primary key ([#209](https://github.com/googleapis/langchain-google-cloud-sql-pg-python/issues/209)) ([ffaa87f](https://github.com/googleapis/langchain-google-cloud-sql-pg-python/commit/ffaa87fd864d1c3ffeb00a34370af9e986a37cf5)) +* Refactor to support both async and sync usage ([de16842](https://github.com/googleapis/langchain-google-cloud-sql-pg-python/commit/de168427f9884f33332086b68308e1225ee9e952)) + + +### Bug Fixes + +* Replacing cosine_similarity and maximal_marginal_relevance local methods with the ones in langchain core. ([#190](https://github.com/googleapis/langchain-google-cloud-sql-pg-python/issues/190)) ([7f27092](https://github.com/googleapis/langchain-google-cloud-sql-pg-python/commit/7f2709225a1a5a71b33522dafd354dc7159c358f)) +* Support async and sync versions of indexing methods ([de16842](https://github.com/googleapis/langchain-google-cloud-sql-pg-python/commit/de168427f9884f33332086b68308e1225ee9e952)) +* Updating the minimum langchain core version to 0.2.36 ([#205](https://github.com/googleapis/langchain-google-cloud-sql-pg-python/issues/205)) ([0651231](https://github.com/googleapis/langchain-google-cloud-sql-pg-python/commit/0651231b7d77e0451ae769f78fe6dce3e724dec4)) + + +### Documentation + +* Update sample python notebooks to reflect the support for custom schema. ([#204](https://github.com/googleapis/langchain-google-cloud-sql-pg-python/issues/204)) ([7ef9335](https://github.com/googleapis/langchain-google-cloud-sql-pg-python/commit/7ef9335a45578273e9ffc0921f60a1c6cc3e89ed)) + + ## [0.9.0](https://github.com/googleapis/langchain-google-cloud-sql-pg-python/compare/v0.8.0...v0.9.0) (2024-09-05) diff --git a/src/langchain_google_cloud_sql_pg/version.py b/src/langchain_google_cloud_sql_pg/version.py index ba03825a..00f17d64 100644 --- a/src/langchain_google_cloud_sql_pg/version.py +++ b/src/langchain_google_cloud_sql_pg/version.py @@ -13,4 +13,4 @@ # See the License for the specific language governing permissions and # limitations under the License. # -__version__ = "0.9.0" +__version__ = "0.10.0"
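
The int-id tests above exercise the headline 0.10.0 change, support for non-UUID
primary keys, end to end. Below is a minimal, self-contained sketch of that flow,
not part of the patches themselves: the connection values ("my-project",
"us-central1", "my-instance", "my-db") and the table name "int_id_docs" are
placeholders, while the entry points (PostgresEngine.afrom_instance,
ainit_vectorstore_table, PostgresVectorStore.afrom_texts) are the same ones the
tests call.

import asyncio

from langchain_core.embeddings import DeterministicFakeEmbedding

from langchain_google_cloud_sql_pg import Column, PostgresEngine, PostgresVectorStore

VECTOR_SIZE = 768


async def main() -> None:
    # Placeholder connection values; substitute your own instance details.
    engine = await PostgresEngine.afrom_instance(
        project_id="my-project",
        region="us-central1",
        instance="my-instance",
        database="my-db",
    )

    # Create a table whose primary key is an INTEGER rather than the default
    # UUID column; nullable takes a boolean, and a key column should be NOT NULL.
    await engine.ainit_vectorstore_table(
        "int_id_docs",  # placeholder table name
        VECTOR_SIZE,
        id_column=Column(name="integer_id", data_type="INTEGER", nullable=False),
    )

    # With a non-UUID key, ids are supplied explicitly and the id column is
    # named, mirroring test_afrom_texts_custom_with_int_id above.
    await PostgresVectorStore.afrom_texts(
        ["foo", "bar", "baz"],
        DeterministicFakeEmbedding(size=VECTOR_SIZE),
        engine,
        "int_id_docs",
        ids=[1, 2, 3],
        id_column="integer_id",
    )

    await engine.close()


asyncio.run(main())

One detail worth calling out: Column.nullable expects a boolean. Passing the
string "False" would be truthy and silently leave the column nullable, which is
why the tests pass nullable=False.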