diff --git a/.coveragerc b/.coveragerc index 964dbb39..2c634d52 100644 --- a/.coveragerc +++ b/.coveragerc @@ -5,4 +5,4 @@ omit = [report] show_missing = true -fail_under = 82 \ No newline at end of file +fail_under = 90 \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md index 9d548423..5539b0ee 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,27 @@ # Changelog +## [0.8.0](https://github.com/googleapis/langchain-google-cloud-sql-pg-python/compare/v0.7.0...v0.8.0) (2024-09-04) + + +### Features + +* Add table name to default index name ([#171](https://github.com/googleapis/langchain-google-cloud-sql-pg-python/issues/171)) ([8e61bc7](https://github.com/googleapis/langchain-google-cloud-sql-pg-python/commit/8e61bc779bc8f803e40e76aaeffdb93c35a5c90f)) +* Remove langchain-community dependency ([#172](https://github.com/googleapis/langchain-google-cloud-sql-pg-python/issues/172)) ([b4f40bb](https://github.com/googleapis/langchain-google-cloud-sql-pg-python/commit/b4f40bb389b40853e3deed37e1385a7866741231)) + + +### Bug Fixes + +* Add caching for background loop/thread ([#184](https://github.com/googleapis/langchain-google-cloud-sql-pg-python/issues/184)) ([1489f81](https://github.com/googleapis/langchain-google-cloud-sql-pg-python/commit/1489f818c1d62bfee5c5a3bab42d380556662e82)) +* Fix QueryOptions not applied to similarity search bug ([#185](https://github.com/googleapis/langchain-google-cloud-sql-pg-python/issues/185)) ([e5dca97](https://github.com/googleapis/langchain-google-cloud-sql-pg-python/commit/e5dca973d625c4df4c3e741a3ad8e95be0cd1472)) +* Fixed extra char in requirements.txt ([#196](https://github.com/googleapis/langchain-google-cloud-sql-pg-python/issues/196)) ([50dc32f](https://github.com/googleapis/langchain-google-cloud-sql-pg-python/commit/50dc32f8ae476c98e3ed38a153096551ce02d340)) + + +### Documentation + +* Add index choosing guide ([#178](https://github.com/googleapis/langchain-google-cloud-sql-pg-python/issues/178)) 
([e96ffb6](https://github.com/googleapis/langchain-google-cloud-sql-pg-python/commit/e96ffb6dc99425e4dafb8ac13730eed253e74c4e)) +* Added vector store initialization from documents ([#174](https://github.com/googleapis/langchain-google-cloud-sql-pg-python/issues/174)) ([eb2eac3](https://github.com/googleapis/langchain-google-cloud-sql-pg-python/commit/eb2eac303f64e809e6f3fc9bc3307be163602a4e)) +* Update README.md to fix 404 links to templates ([#182](https://github.com/googleapis/langchain-google-cloud-sql-pg-python/issues/182)) ([f10ae6c](https://github.com/googleapis/langchain-google-cloud-sql-pg-python/commit/f10ae6c9a8645874a5ab64e846ec540aeddf977a)) + ## [0.7.0](https://github.com/googleapis/langchain-google-cloud-sql-pg-python/compare/v0.6.1...v0.7.0) (2024-07-23) diff --git a/DEVELOPER.md b/DEVELOPER.md index 4118a0da..9e9ef215 100644 --- a/DEVELOPER.md +++ b/DEVELOPER.md @@ -28,7 +28,7 @@ Learn more by reading [How should I write my commits?](https://github.com/google ### CI Platform Setup -Cloud Build is used to run tests against Google Cloud resources in test project: langchain-alloydb-testing. +Cloud Build is used to run tests against Google Cloud resources in test project: langchain-cloud-sql-testing. Each test has a corresponding Cloud Build trigger, see [all triggers][triggers]. These tests are registered as required tests in `.github/sync-repo-settings.yaml`. 
@@ -41,7 +41,7 @@ name: pg-integration-test-pr-py38 description: Run integration tests on PR for Python 3.8 filename: integration.cloudbuild.yaml github: - name: langchain-google-alloydb-pg-python + name: langchain-google-cloud-sql-pg-python owner: googleapis pullRequest: branch: .* diff --git a/docs/_static/index_choosing_decision_tree.png b/docs/_static/index_choosing_decision_tree.png new file mode 100644 index 00000000..a4ea368a Binary files /dev/null and b/docs/_static/index_choosing_decision_tree.png differ diff --git a/docs/how_to_choose_an_index_guide.md b/docs/how_to_choose_an_index_guide.md new file mode 100644 index 00000000..9d42bc33 --- /dev/null +++ b/docs/how_to_choose_an_index_guide.md @@ -0,0 +1,43 @@ +# How to Choose a Nearest-Neighbor Index Guide + +## Introduction + +When leveraging vector search in your application, selecting the right algorithm and accurately measuring recall are pivotal for enhancing search efficiency and result relevancy. This guide is crafted to assist developers in navigating through these crucial processes with ease, specifically for those who are developing with LangChain Python. To find out what is a good index for your vector database, the first decision you need to make is to choose from KNN or ANN. + +## KNN vs. ANN + +K-Nearest Neighbors (KNN) and Approximate Nearest Neighbors (ANN) are both nearest neighbor algorithms used for vector similarity search. KNN is a brute force algorithm that guarantees perfect recall at the cost of speed. ANN algorithms offer significantly faster speeds, but with less perfect recall. + +For example, you have an item description and you want to search for the top 3 most similar items in the database. KNN checks every single item in the database and finds the top 3. On the other hand, ANN uses an algorithm to guess where the top 3 items are, and search only those vectors within the calculated scope, sacrificing some accuracy for speed. 
While KNN guarantees you get the most similar items, the computational cost is too high for scalability. ANN, although faster and good for a quick search, might occasionally miss an item that would have been selected. + +![Index Choosing Decision Tree](_static/index_choosing_decision_tree.png) + +## Choosing between KNN and ANN + +In the context of vector search indexing, choosing between KNN and ANN is a trade-off between precision and efficiency. For applications with a small dataset (less than 10k) or requiring absolute precision, such as in legal or academic research where every possible relevant result must be identified, KNN is preferable despite its higher computational demands. In contrast, in scenarios where speed is crucial, resources are limited, and a high accuracy is still required–which is the case for most commercial usage–ANN is the better choice. + +If you decide to adopt ANN indexing for your application’s vector similarity search, an important question arises: which algorithm to choose for ANN’s similarity approximation? + +## Choosing the Right ANN Algorithm + +When selecting an indexing algorithm for your application’s ANN vector search, it's essential to consider your specific needs and the characteristics of your dataset. There are two major categories of ANN algorithms: + +- Graph-based algorithms are good at handling complex, high-dimensional data, offering faster search speeds by navigating through a network of interconnected data points. They are especially useful when the dataset is relatively larger, as they can efficiently traverse this network to find close matches. However, the memory usage and index building time could also grow significantly as the dataset grows compared to tree-based indexes. Example: +HNSW (through the pgvector extension) + +- Tree-based algorithms organize data in a structured, hierarchical manner, making them efficient for lower-dimensional datasets. 
They offer a structured and often more resource-efficient approach to partitioning space and finding neighbors, but their performance degrades when the embeddings have high dimensionality but low information density. Example: +IVFFlat (through the pgvector extension) + +Here is a comparison table between Graph-based and Tree-based Indexing algorithms: + +| Feature | Graph-based algorithm | Tree-based algorithm | +|---------|-----------------------|----------------------| +|Latency |Generally offers higher search efficiency, especially in high-dimensional spaces due to its ability to skip over irrelevant regions of the graph. Write latency is generally higher.| Efficiency depends on dataset distribution characteristics.| +| Accuracy | Can achieve high levels of accuracy by adjusting the graph's complexity (e.g., the number of edges per node), allowing for fine-tuning based on the dataset.| Accuracy is influenced by the tree's depth and branching factor. While very accurate in lower dimensions, accuracy decreases on embeddings with high dimensionality but low information density.| +| Examples | HNSW (through pgvector) | IVFFlat (through pgvector)| +| Index Creation Time| Slower | Faster| +| Memory used | More | Less | + +## Next Step + +By carefully evaluating your requirements for accuracy, computational resources, and scalability, you can select the optimal indexing approach for your vector search application. Once you've made the choice for your indexing algorithm, turn to [this guide](https://github.com/googleapis/langchain-google-cloud-sql-pg-python/blob/main/samples/index_tuning_sample/README.md) as your next step for evaluating indexing performance and fine-tuning your indexes as needed. 
diff --git a/docs/index.rst b/docs/index.rst index 8982ab15..33ec4547 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -10,6 +10,14 @@ API Reference langchain_google_cloud_sql_pg/loader langchain_google_cloud_sql_pg/history + +How to Choose a Nearest-Neighbor Index Guide +-------------------------------------------- +.. toctree:: + :maxdepth: 2 + + how_to_choose_an_index_guide.md + Changelog --------- .. toctree:: diff --git a/docs/vector_store.ipynb b/docs/vector_store.ipynb index 8800fff5..60839763 100644 --- a/docs/vector_store.ipynb +++ b/docs/vector_store.ipynb @@ -1,548 +1,591 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Google Cloud SQL for PostgreSQL\n", - "\n", - "> [Cloud SQL](https://cloud.google.com/sql) is a fully managed relational database service that offers high performance, seamless integration, and impressive scalability. It offers PostgreSQL, PostgreSQL, and SQL Server database engines. Extend your database application to build AI-powered experiences leveraging Cloud SQL's Langchain integrations.\n", - "\n", - "This notebook goes over how to use `Cloud SQL for PostgreSQL` to store vector embeddings with the `PostgresVectorStore` class.\n", - "\n", - "Learn more about the package on [GitHub](https://github.com/googleapis/langchain-google-cloud-sql-pg-python/).\n", - "\n", - "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/googleapis/langchain-google-cloud-sql-pg-python/blob/main/docs/vector_store.ipynb)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Before you begin\n", - "\n", - "To run this notebook, you will need to do the following:\n", - "\n", - " * [Create a Google Cloud Project](https://developers.google.com/workspace/guides/create-project)\n", - " * [Enable the Cloud SQL Admin API.](https://console.cloud.google.com/flows/enableapi?apiid=sqladmin.googleapis.com)\n", - " * [Create a Cloud SQL 
instance.](https://cloud.google.com/sql/docs/postgres/connect-instance-auth-proxy#create-instance)\n", - " * [Create a Cloud SQL database.](https://cloud.google.com/sql/docs/postgres/create-manage-databases)\n", - " * [Add a User to the database.](https://cloud.google.com/sql/docs/postgres/create-manage-users)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "IR54BmgvdHT_" - }, - "source": [ - "### 🦜🔗 Library Installation\n", - "Install the integration library, `langchain-google-cloud-sql-pg`, and the library for the embedding service, `langchain-google-vertexai`." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 1000 - }, - "id": "0ZITIDE160OD", - "outputId": "e184bc0d-6541-4e0a-82d2-1e216db00a2d" - }, - "outputs": [], - "source": [ - "%pip install --upgrade --quiet langchain-google-cloud-sql-pg langchain-google-vertexai" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "v40bB_GMcr9f" - }, - "source": [ - "**Colab only:** Uncomment the following cell to restart the kernel or use the button to restart the kernel. For Vertex AI Workbench you can restart the terminal using the button on top." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "v6jBDnYnNM08", - "metadata": { - "id": "v6jBDnYnNM08" - }, - "outputs": [], - "source": [ - "# # Automatically restart kernel after installs so that your environment can access the new packages\n", - "# import IPython\n", - "\n", - "# app = IPython.Application.instance()\n", - "# app.kernel.do_shutdown(True)" - ] - }, - { - "cell_type": "markdown", - "id": "yygMe6rPWxHS", - "metadata": { - "id": "yygMe6rPWxHS" - }, - "source": [ - "### 🔐 Authentication\n", - "Authenticate to Google Cloud as the IAM user logged into this notebook in order to access your Google Cloud Project.\n", - "\n", - "* If you are using Colab to run this notebook, use the cell below and continue.\n", - "* If you are using Vertex AI Workbench, check out the setup instructions [here](https://github.com/GoogleCloudPlatform/generative-ai/tree/main/setup-env)." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "PTXN1_DSXj2b", - "metadata": { - "id": "PTXN1_DSXj2b" - }, - "outputs": [], - "source": [ - "from google.colab import auth\n", - "\n", - "auth.authenticate_user()" - ] - }, - { - "cell_type": "markdown", - "id": "NEvB9BoLEulY", - "metadata": { - "id": "NEvB9BoLEulY" - }, - "source": [ - "### ☁ Set Your Google Cloud Project\n", - "Set your Google Cloud project so that you can leverage Google Cloud resources within this notebook.\n", - "\n", - "If you don't know your project ID, try the following:\n", - "\n", - "* Run `gcloud config list`.\n", - "* Run `gcloud projects list`.\n", - "* See the support page: [Locate the project ID](https://support.google.com/googleapi/answer/7014113)." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "gfkS3yVRE4_W", - "metadata": { - "cellView": "form", - "id": "gfkS3yVRE4_W" - }, - "outputs": [], - "source": [ - "# @markdown Please fill in the value below with your Google Cloud project ID and then run the cell.\n", - "\n", - "PROJECT_ID = \"my-project-id\" # @param {type:\"string\"}\n", - "\n", - "# Set the project id\n", - "!gcloud config set project {PROJECT_ID}" - ] - }, - { - "cell_type": "markdown", - "id": "f8f2830ee9ca1e01", - "metadata": { - "id": "f8f2830ee9ca1e01" - }, - "source": [ - "## Basic Usage" - ] - }, - { - "cell_type": "markdown", - "id": "OMvzMWRrR6n7", - "metadata": { - "id": "OMvzMWRrR6n7" - }, - "source": [ - "### Set Cloud SQL database values\n", - "Find your database values, in the [Cloud SQL Instances page](https://console.cloud.google.com/sql?_ga=2.223735448.2062268965.1707700487-2088871159.1707257687)." - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "irl7eMFnSPZr", - "metadata": { - "id": "irl7eMFnSPZr" - }, - "outputs": [], - "source": [ - "# @title Set Your Values Here { display-mode: \"form\" }\n", - "REGION = \"us-central1\" # @param {type: \"string\"}\n", - "INSTANCE = \"my-pg-instance\" # @param {type: \"string\"}\n", - "DATABASE = \"my-database\" # @param {type: \"string\"}\n", - "TABLE_NAME = \"vector_store\" # @param {type: \"string\"}" - ] - }, - { - "cell_type": "markdown", - "id": "QuQigs4UoFQ2", - "metadata": { - "id": "QuQigs4UoFQ2" - }, - "source": [ - "### PostgresEngine Connection Pool\n", - "\n", - "One of the requirements and arguments to establish Cloud SQL as a vector store is a `PostgresEngine` object. The `PostgresEngine` configures a connection pool to your Cloud SQL database, enabling successful connections from your application and following industry best practices.\n", - "\n", - "To create a `PostgresEngine` using `PostgresEngine.from_instance()` you need to provide only 4 things:\n", - "\n", - "1. 
`project_id` : Project ID of the Google Cloud Project where the Cloud SQL instance is located.\n", - "1. `region` : Region where the Cloud SQL instance is located.\n", - "1. `instance` : The name of the Cloud SQL instance.\n", - "1. `database` : The name of the database to connect to on the Cloud SQL instance.\n", - "\n", - "By default, [IAM database authentication](https://cloud.google.com/sql/docs/postgres/iam-authentication#iam-db-auth) will be used as the method of database authentication. This library uses the IAM principal belonging to the [Application Default Credentials (ADC)](https://cloud.google.com/docs/authentication/application-default-credentials) sourced from the envionment.\n", - "\n", - "For more informatin on IAM database authentication please see:\n", - "\n", - "* [Configure an instance for IAM database authentication](https://cloud.google.com/sql/docs/postgres/create-edit-iam-instances)\n", - "* [Manage users with IAM database authentication](https://cloud.google.com/sql/docs/postgres/add-manage-iam-users)\n", - "\n", - "Optionally, [built-in database authentication](https://cloud.google.com/sql/docs/postgres/built-in-authentication) using a username and password to access the Cloud SQL database can also be used. Just provide the optional `user` and `password` arguments to `PostgresEngine.from_instance()`:\n", - "\n", - "* `user` : Database user to use for built-in database authentication and login\n", - "* `password` : Database password to use for built-in database authentication and login.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\"**Note**: This tutorial demonstrates the async interface. 
All async methods have corresponding sync methods.\"" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from langchain_google_cloud_sql_pg import PostgresEngine\n", - "\n", - "engine = await PostgresEngine.afrom_instance(\n", - " project_id=PROJECT_ID, region=REGION, instance=INSTANCE, database=DATABASE\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "D9Xs2qhm6X56" - }, - "source": [ - "### Initialize a table\n", - "The `PostgresVectorStore` class requires a database table. The `PostgresEngine` engine has a helper method `init_vectorstore_table()` that can be used to create a table with the proper schema for you." - ] - }, - { - "cell_type": "code", - "execution_count": 34, - "metadata": { - "id": "avlyHEMn6gzU" - }, - "outputs": [], - "source": [ - "from langchain_google_cloud_sql_pg import PostgresEngine\n", - "\n", - "await engine.ainit_vectorstore_table(\n", - " table_name=TABLE_NAME,\n", - " vector_size=768, # Vector size for VertexAI model(textembedding-gecko@latest)\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Create an embedding class instance\n", - "\n", - "You can use any [LangChain embeddings model](https://python.langchain.com/docs/integrations/text_embedding/).\n", - "You may need to enable Vertex AI API to use `VertexAIEmbeddings`. We recommend setting the embedding model's version for production, learn more about the [Text embeddings models](https://cloud.google.com/vertex-ai/docs/generative-ai/model-reference/text-embeddings)." 
- ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "5utKIdq7KYi5", - "metadata": { - "id": "5utKIdq7KYi5" - }, - "outputs": [], - "source": [ - "# enable Vertex AI API\n", - "!gcloud services enable aiplatform.googleapis.com" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "Vb2RJocV9_LQ", - "outputId": "37f5dc74-2512-47b2-c135-f34c10afdcf4" - }, - "outputs": [], - "source": [ - "from langchain_google_vertexai import VertexAIEmbeddings\n", - "\n", - "embedding = VertexAIEmbeddings(\n", - " model_name=\"textembedding-gecko@latest\", project=PROJECT_ID\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "e1tl0aNx7SWy" - }, - "source": [ - "### Initialize a default PostgresVectorStore" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "z-AZyzAQ7bsf" - }, - "outputs": [], - "source": [ - "from langchain_google_cloud_sql_pg import PostgresVectorStore\n", - "\n", - "store = await PostgresVectorStore.create( # Use .create() to initialize an async vector store\n", - " engine=engine,\n", - " table_name=TABLE_NAME,\n", - " embedding_service=embedding,\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Add texts" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import uuid\n", - "\n", - "all_texts = [\"Apples and oranges\", \"Cars and airplanes\", \"Pineapple\", \"Train\", \"Banana\"]\n", - "metadatas = [{\"len\": len(t)} for t in all_texts]\n", - "ids = [str(uuid.uuid4()) for _ in all_texts]\n", - "\n", - "await store.aadd_texts(all_texts, metadatas=metadatas, ids=ids)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Delete texts" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "await store.adelete([ids[1]])" - 
] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Search for documents" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "query = \"I'd like a fruit.\"\n", - "docs = await store.asimilarity_search(query)\n", - "print(docs)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Search for documents by vector" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "query_vector = embedding.embed_query(query)\n", - "docs = await store.asimilarity_search_by_vector(query_vector, k=2)\n", - "print(docs)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Add a Index\n", - "Speed up vector search queries by applying a vector index. Learn more about [vector indexes](https://cloud.google.com/blog/products/databases/faster-similarity-search-performance-with-pgvector-indexes)." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from langchain_google_cloud_sql_pg.indexes import IVFFlatIndex\n", - "\n", - "index = IVFFlatIndex()\n", - "await store.aapply_vector_index(index)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Re-index" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "await store.areindex() # Re-index using default index name" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Remove an index" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "await store.aadrop_vector_index() # Delete index using default name" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Create a custom Vector Store\n", - "A Vector Store can take advantage of relational data to filter similarity searches.\n", - "\n", - 
"Create a table with custom metadata columns." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from langchain_google_cloud_sql_pg import Column\n", - "\n", - "# Set table name\n", - "TABLE_NAME = \"vectorstore_custom\"\n", - "\n", - "await engine.ainit_vectorstore_table(\n", - " table_name=TABLE_NAME,\n", - " vector_size=768, # VertexAI model: textembedding-gecko@latest\n", - " metadata_columns=[Column(\"len\", \"INTEGER\")],\n", - ")\n", - "\n", - "\n", - "# Initialize PostgresVectorStore\n", - "custom_store = await PostgresVectorStore.create(\n", - " engine=engine,\n", - " table_name=TABLE_NAME,\n", - " embedding_service=embedding,\n", - " metadata_columns=[\"len\"],\n", - " # Connect to a existing VectorStore by customizing the table schema:\n", - " # id_column=\"uuid\",\n", - " # content_column=\"documents\",\n", - " # embedding_column=\"vectors\",\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Search for documents with metadata filter" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import uuid\n", - "\n", - "# Add texts to the Vector Store\n", - "all_texts = [\"Apples and oranges\", \"Cars and airplanes\", \"Pineapple\", \"Train\", \"Banana\"]\n", - "metadatas = [{\"len\": len(t)} for t in all_texts]\n", - "ids = [str(uuid.uuid4()) for _ in all_texts]\n", - "await custom_store.aadd_texts(all_texts, metadatas=metadatas, ids=ids)\n", - "\n", - "# Use filter on search\n", - "docs = await custom_store.asimilarity_search_by_vector(query_vector, filter=\"len >= 6\")\n", - "\n", - "print(docs)" - ] - } - ], - "metadata": { + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Google Cloud SQL for PostgreSQL\n", + "\n", + "> [Cloud SQL](https://cloud.google.com/sql) is a fully managed relational database service that offers high performance, seamless integration, and 
impressive scalability. It offers PostgreSQL, PostgreSQL, and SQL Server database engines. Extend your database application to build AI-powered experiences leveraging Cloud SQL's Langchain integrations.\n", + "\n", + "This notebook goes over how to use `Cloud SQL for PostgreSQL` to store vector embeddings with the `PostgresVectorStore` class.\n", + "\n", + "Learn more about the package on [GitHub](https://github.com/googleapis/langchain-google-cloud-sql-pg-python/).\n", + "\n", + "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/googleapis/langchain-google-cloud-sql-pg-python/blob/main/docs/vector_store.ipynb)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Before you begin\n", + "\n", + "To run this notebook, you will need to do the following:\n", + "\n", + " * [Create a Google Cloud Project](https://developers.google.com/workspace/guides/create-project)\n", + " * [Enable the Cloud SQL Admin API.](https://console.cloud.google.com/flows/enableapi?apiid=sqladmin.googleapis.com)\n", + " * [Create a Cloud SQL instance.](https://cloud.google.com/sql/docs/postgres/connect-instance-auth-proxy#create-instance)\n", + " * [Create a Cloud SQL database.](https://cloud.google.com/sql/docs/postgres/create-manage-databases)\n", + " * [Add a User to the database.](https://cloud.google.com/sql/docs/postgres/create-manage-users)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "IR54BmgvdHT_" + }, + "source": [ + "### 🦜🔗 Library Installation\n", + "Install the integration library, `langchain-google-cloud-sql-pg`, and the library for the embedding service, `langchain-google-vertexai`." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { "colab": { - "provenance": [], - "toc_visible": true - }, - "kernelspec": { - "display_name": "Python 3", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.5" - } - }, - "nbformat": 4, - "nbformat_minor": 0 + "base_uri": "https://localhost:8080/", + "height": 1000 + }, + "id": "0ZITIDE160OD", + "outputId": "e184bc0d-6541-4e0a-82d2-1e216db00a2d" + }, + "outputs": [], + "source": [ + "%pip install --upgrade --quiet langchain-google-cloud-sql-pg langchain-google-vertexai" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "v40bB_GMcr9f" + }, + "source": [ + "**Colab only:** Uncomment the following cell to restart the kernel or use the button to restart the kernel. For Vertex AI Workbench you can restart the terminal using the button on top." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "v6jBDnYnNM08", + "metadata": { + "id": "v6jBDnYnNM08" + }, + "outputs": [], + "source": [ + "# # Automatically restart kernel after installs so that your environment can access the new packages\n", + "# import IPython\n", + "\n", + "# app = IPython.Application.instance()\n", + "# app.kernel.do_shutdown(True)" + ] + }, + { + "cell_type": "markdown", + "id": "yygMe6rPWxHS", + "metadata": { + "id": "yygMe6rPWxHS" + }, + "source": [ + "### 🔐 Authentication\n", + "Authenticate to Google Cloud as the IAM user logged into this notebook in order to access your Google Cloud Project.\n", + "\n", + "* If you are using Colab to run this notebook, use the cell below and continue.\n", + "* If you are using Vertex AI Workbench, check out the setup instructions [here](https://github.com/GoogleCloudPlatform/generative-ai/tree/main/setup-env)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "PTXN1_DSXj2b", + "metadata": { + "id": "PTXN1_DSXj2b" + }, + "outputs": [], + "source": [ + "from google.colab import auth\n", + "\n", + "auth.authenticate_user()" + ] + }, + { + "cell_type": "markdown", + "id": "NEvB9BoLEulY", + "metadata": { + "id": "NEvB9BoLEulY" + }, + "source": [ + "### ☁ Set Your Google Cloud Project\n", + "Set your Google Cloud project so that you can leverage Google Cloud resources within this notebook.\n", + "\n", + "If you don't know your project ID, try the following:\n", + "\n", + "* Run `gcloud config list`.\n", + "* Run `gcloud projects list`.\n", + "* See the support page: [Locate the project ID](https://support.google.com/googleapi/answer/7014113)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "gfkS3yVRE4_W", + "metadata": { + "cellView": "form", + "id": "gfkS3yVRE4_W" + }, + "outputs": [], + "source": [ + "# @markdown Please fill in the value below with your Google Cloud project ID and then run the cell.\n", + "\n", + "PROJECT_ID = \"my-project-id\" # @param {type:\"string\"}\n", + "\n", + "# Set the project id\n", + "!gcloud config set project {PROJECT_ID}" + ] + }, + { + "cell_type": "markdown", + "id": "f8f2830ee9ca1e01", + "metadata": { + "id": "f8f2830ee9ca1e01" + }, + "source": [ + "## Basic Usage" + ] + }, + { + "cell_type": "markdown", + "id": "OMvzMWRrR6n7", + "metadata": { + "id": "OMvzMWRrR6n7" + }, + "source": [ + "### Set Cloud SQL database values\n", + "Find your database values, in the [Cloud SQL Instances page](https://console.cloud.google.com/sql?_ga=2.223735448.2062268965.1707700487-2088871159.1707257687)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "irl7eMFnSPZr", + "metadata": { + "id": "irl7eMFnSPZr" + }, + "outputs": [], + "source": [ + "# @title Set Your Values Here { display-mode: \"form\" }\n", + "REGION = \"us-central1\" # @param {type: \"string\"}\n", + "INSTANCE = \"my-pg-instance\" # @param {type: \"string\"}\n", + "DATABASE = \"my-database\" # @param {type: \"string\"}\n", + "TABLE_NAME = \"vector_store\" # @param {type: \"string\"}" + ] + }, + { + "cell_type": "markdown", + "id": "QuQigs4UoFQ2", + "metadata": { + "id": "QuQigs4UoFQ2" + }, + "source": [ + "### PostgresEngine Connection Pool\n", + "\n", + "One of the requirements and arguments to establish Cloud SQL as a vector store is a `PostgresEngine` object. The `PostgresEngine` configures a connection pool to your Cloud SQL database, enabling successful connections from your application and following industry best practices.\n", + "\n", + "To create a `PostgresEngine` using `PostgresEngine.from_instance()` you need to provide only 4 things:\n", + "\n", + "1. `project_id` : Project ID of the Google Cloud Project where the Cloud SQL instance is located.\n", + "1. `region` : Region where the Cloud SQL instance is located.\n", + "1. `instance` : The name of the Cloud SQL instance.\n", + "1. `database` : The name of the database to connect to on the Cloud SQL instance.\n", + "\n", + "By default, [IAM database authentication](https://cloud.google.com/sql/docs/postgres/iam-authentication#iam-db-auth) will be used as the method of database authentication. 
This library uses the IAM principal belonging to the [Application Default Credentials (ADC)](https://cloud.google.com/docs/authentication/application-default-credentials) sourced from the environment.\n", + "\n", + "For more information on IAM database authentication please see:\n", + "\n", + "* [Configure an instance for IAM database authentication](https://cloud.google.com/sql/docs/postgres/create-edit-iam-instances)\n", + "* [Manage users with IAM database authentication](https://cloud.google.com/sql/docs/postgres/add-manage-iam-users)\n", + "\n", + "Optionally, [built-in database authentication](https://cloud.google.com/sql/docs/postgres/built-in-authentication) using a username and password to access the Cloud SQL database can also be used. Just provide the optional `user` and `password` arguments to `PostgresEngine.from_instance()`:\n", + "\n", + "* `user` : Database user to use for built-in database authentication and login\n", + "* `password` : Database password to use for built-in database authentication and login.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\"**Note**: This tutorial demonstrates the async interface. All async methods have corresponding sync methods.\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain_google_cloud_sql_pg import PostgresEngine\n", + "\n", + "engine = await PostgresEngine.afrom_instance(\n", + " project_id=PROJECT_ID, region=REGION, instance=INSTANCE, database=DATABASE\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "D9Xs2qhm6X56" + }, + "source": [ + "### Initialize a table\n", + "The `PostgresVectorStore` class requires a database table. The `PostgresEngine` engine has a helper method `init_vectorstore_table()` that can be used to create a table with the proper schema for you."
+ ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": { + "id": "avlyHEMn6gzU" + }, + "outputs": [], + "source": [ + "from langchain_google_cloud_sql_pg import PostgresEngine\n", + "\n", + "await engine.ainit_vectorstore_table(\n", + " table_name=TABLE_NAME,\n", + " vector_size=768, # Vector size for VertexAI model(textembedding-gecko@latest)\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create an embedding class instance\n", + "\n", + "You can use any [LangChain embeddings model](https://python.langchain.com/docs/integrations/text_embedding/).\n", + "You may need to enable Vertex AI API to use `VertexAIEmbeddings`. We recommend setting the embedding model's version for production, learn more about the [Text embeddings models](https://cloud.google.com/vertex-ai/docs/generative-ai/model-reference/text-embeddings)." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "5utKIdq7KYi5", + "metadata": { + "id": "5utKIdq7KYi5" + }, + "outputs": [], + "source": [ + "# enable Vertex AI API\n", + "!gcloud services enable aiplatform.googleapis.com" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "Vb2RJocV9_LQ", + "outputId": "37f5dc74-2512-47b2-c135-f34c10afdcf4" + }, + "outputs": [], + "source": [ + "from langchain_google_vertexai import VertexAIEmbeddings\n", + "\n", + "embedding = VertexAIEmbeddings(\n", + " model_name=\"textembedding-gecko@latest\", project=PROJECT_ID\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "e1tl0aNx7SWy" + }, + "source": [ + "### Initialize a default PostgresVectorStore" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "z-AZyzAQ7bsf" + }, + "outputs": [], + "source": [ + "from langchain_google_cloud_sql_pg import PostgresVectorStore\n", + "\n", + "store = await PostgresVectorStore.create( # Use .create() to 
initialize an async vector store\n", + " engine=engine,\n", + " table_name=TABLE_NAME,\n", + " embedding_service=embedding,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Initialize Vector Store with documents\n", + "\n", + "This is a great way to get started quickly. However, the default method is recommended for most applications to avoid accidentally adding duplicate documents." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain_core.documents import Document\n", + "import uuid\n", + "\n", + "docs = [\n", + " Document(\n", + " page_content=\"Red Apple\",\n", + " metadata={\"description\": \"red\", \"content\": \"1\", \"category\": \"fruit\"},\n", + " ),\n", + " Document(\n", + " page_content=\"Banana Cavendish\",\n", + " metadata={\"description\": \"yellow\", \"content\": \"2\", \"category\": \"fruit\"},\n", + " ),\n", + " Document(\n", + " page_content=\"Orange Navel\",\n", + " metadata={\"description\": \"orange\", \"content\": \"3\", \"category\": \"fruit\"},\n", + " ),\n", + "]\n", + "ids = [str(uuid.uuid4()) for i in range(len(docs))]\n", + "\n", + "store_with_documents = await PostgresVectorStore.afrom_documents(\n", + " documents=docs,\n", + " ids=ids,\n", + " engine=engine,\n", + " table_name=TABLE_NAME,\n", + " embedding_service=embedding,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Add texts" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import uuid\n", + "\n", + "all_texts = [\"Apples and oranges\", \"Cars and airplanes\", \"Pineapple\", \"Train\", \"Banana\"]\n", + "metadatas = [{\"len\": len(t)} for t in all_texts]\n", + "ids = [str(uuid.uuid4()) for _ in all_texts]\n", + "\n", + "await store.aadd_texts(all_texts, metadatas=metadatas, ids=ids)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + 
"### Delete texts" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "await store.adelete([ids[1]])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Search for documents" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "query = \"I'd like a fruit.\"\n", + "docs = await store.asimilarity_search(query)\n", + "print(docs)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Search for documents by vector" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "query_vector = embedding.embed_query(query)\n", + "docs = await store.asimilarity_search_by_vector(query_vector, k=2)\n", + "print(docs)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Add an Index\n", + "Speed up vector search queries by applying a vector index. Learn more about [vector indexes](https://cloud.google.com/blog/products/databases/faster-similarity-search-performance-with-pgvector-indexes)."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain_google_cloud_sql_pg.indexes import IVFFlatIndex\n", + "\n", + "index = IVFFlatIndex()\n", + "await store.aapply_vector_index(index)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Re-index" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "await store.areindex() # Re-index using default index name" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Remove an index" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "await store.adrop_vector_index() # Delete index using default name" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Create a custom Vector Store\n", + "A Vector Store can take advantage of relational data to filter similarity searches.\n", + "\n", + "Create a table with custom metadata columns."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain_google_cloud_sql_pg import Column\n", + "\n", + "# Set table name\n", + "TABLE_NAME = \"vectorstore_custom\"\n", + "\n", + "await engine.ainit_vectorstore_table(\n", + " table_name=TABLE_NAME,\n", + " vector_size=768, # VertexAI model: textembedding-gecko@latest\n", + " metadata_columns=[Column(\"len\", \"INTEGER\")],\n", + ")\n", + "\n", + "\n", + "# Initialize PostgresVectorStore\n", + "custom_store = await PostgresVectorStore.create(\n", + " engine=engine,\n", + " table_name=TABLE_NAME,\n", + " embedding_service=embedding,\n", + " metadata_columns=[\"len\"],\n", + " # Connect to an existing VectorStore by customizing the table schema:\n", + " # id_column=\"uuid\",\n", + " # content_column=\"documents\",\n", + " # embedding_column=\"vectors\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Search for documents with metadata filter" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import uuid\n", + "\n", + "# Add texts to the Vector Store\n", + "all_texts = [\"Apples and oranges\", \"Cars and airplanes\", \"Pineapple\", \"Train\", \"Banana\"]\n", + "metadatas = [{\"len\": len(t)} for t in all_texts]\n", + "ids = [str(uuid.uuid4()) for _ in all_texts]\n", + "await custom_store.aadd_texts(all_texts, metadatas=metadatas, ids=ids)\n", + "\n", + "# Use filter on search\n", + "docs = await custom_store.asimilarity_search_by_vector(query_vector, filter=\"len >= 6\")\n", + "\n", + "print(docs)" + ] + } + ], + "metadata": { + "colab": { + "provenance": [], + "toc_visible": true + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter":
"python", + "pygments_lexer": "ipython3", + "version": "3.11.5" + } + }, + "nbformat": 4, + "nbformat_minor": 0 } diff --git a/pyproject.toml b/pyproject.toml index 8facf43d..b6627534 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -12,7 +12,6 @@ authors = [ dependencies = [ "cloud-sql-python-connector[asyncpg] >= 1.10.0, <2.0.0", "langchain-core>=0.1.1, <1.0.0 ", - "langchain-community>=0.0.18, <0.3.0", "numpy>=1.24.4, <2.0.0", "pgvector>=0.2.5, <1.0.0", "SQLAlchemy[asyncio]>=2.0.25, <3.0.0" @@ -41,11 +40,11 @@ Changelog = "https://github.com/googleapis/langchain-google-cloud-sql-pg-python/ [project.optional-dependencies] test = [ - "black[jupyter]==24.4.2", + "black[jupyter]==24.8.0", "isort==5.13.2", - "mypy==1.10.1", - "pytest-asyncio==0.23.7", - "pytest==8.2.2", + "mypy==1.11.2", + "pytest-asyncio==0.24.0", + "pytest==8.3.2", "pytest-cov==5.0.0" ] @@ -62,6 +61,8 @@ profile = "black" [tool.mypy] python_version = 3.8 warn_unused_configs = true +disallow_incomplete_defs = true + exclude = [ 'docs/*', 'noxfile.py' diff --git a/requirements.txt b/requirements.txt index ba25b966..c6a36c54 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,6 @@ cloud-sql-python-connector[asyncpg]==1.10.0 -langchain-core==0.2.12 -langchain-community==0.2.6 -numpy===1.24.4; python_version<='3.8' +langchain-core==0.2.35 +numpy==1.24.4; python_version<='3.8' numpy==1.26.4; python_version>'3.8' -pgvector==0.3.0 -SQLAlchemy[asyncio]==2.0.31 +pgvector==0.3.2 +SQLAlchemy[asyncio]==2.0.32 diff --git a/samples/index_tuning_sample/requirements.txt b/samples/index_tuning_sample/requirements.txt index 48cf49be..0a7a2867 100644 --- a/samples/index_tuning_sample/requirements.txt +++ b/samples/index_tuning_sample/requirements.txt @@ -1,3 +1,3 @@ -langchain-google-cloud-sql-pg==0.4.1 -langchain==0.1.8 -langchain-google-vertexai==1.0.6 \ No newline at end of file +langchain-google-cloud-sql-pg==0.7.0 +langchain==0.2.14 +langchain-google-vertexai==1.0.10 \ No newline at end of file 
diff --git a/samples/langchain_on_vertexai/README.md b/samples/langchain_on_vertexai/README.md index 27bad116..63974b05 100644 --- a/samples/langchain_on_vertexai/README.md +++ b/samples/langchain_on_vertexai/README.md @@ -9,9 +9,9 @@ Use the following templates to deploy Retrieval Augmented Generation (RAG) appli Description | Sample ----------- | ------ -Deploy a pre-built `LangchainAgent` with custom RAG tool | [prebuilt_langchain_agent.py](prebuilt_langchain_agent.py) -Build and deploy a question-answering RAG application | [retriever_chain.py](retriever_chain.p) -Build and deploy an Agent with RAG tool and Memory | [retriever_agent_with_history.py](retriever_agent_with_history.py) +Deploy a pre-built `LangchainAgent` with custom RAG tool | [prebuilt_langchain_agent_template.py](prebuilt_langchain_agent_template.py) +Build and deploy a question-answering RAG application | [retriever_chain_template.py](retriever_chain_template.py) +Build and deploy an Agent with RAG tool and Memory | [retriever_agent_with_history_template.py](retriever_agent_with_history_template.py) ## Before you begin diff --git a/samples/requirements.txt b/samples/requirements.txt index b90131e5..462950ce 100644 --- a/samples/requirements.txt +++ b/samples/requirements.txt @@ -1,4 +1,4 @@ google-cloud-aiplatform[reasoningengine,langchain] langchain-google-vertexai -langchain +langchain-community google-cloud-resource-manager \ No newline at end of file diff --git a/src/langchain_google_cloud_sql_pg/chat_message_history.py b/src/langchain_google_cloud_sql_pg/chat_message_history.py index 76dae6c5..4ce9f5f0 100644 --- a/src/langchain_google_cloud_sql_pg/chat_message_history.py +++ b/src/langchain_google_cloud_sql_pg/chat_message_history.py @@ -44,7 +44,7 @@ class PostgresChatMessageHistory(BaseChatMessageHistory): def __init__( self, - key, + key: object, engine: PostgresEngine, session_id: str, table_name: str, @@ -77,7 +77,7 @@ async def create( engine: PostgresEngine, session_id: str, 
table_name: str, - ): + ) -> PostgresChatMessageHistory: """Create a new PostgresChatMessageHistory instance. Args: @@ -117,7 +117,7 @@ def create_sync( engine: PostgresEngine, session_id: str, table_name: str, - ): + ) -> PostgresChatMessageHistory: """Create a new PostgresChatMessageHistory instance. Args: diff --git a/src/langchain_google_cloud_sql_pg/engine.py b/src/langchain_google_cloud_sql_pg/engine.py index 70eebbb8..da284af9 100644 --- a/src/langchain_google_cloud_sql_pg/engine.py +++ b/src/langchain_google_cloud_sql_pg/engine.py @@ -17,13 +17,23 @@ import asyncio from dataclasses import dataclass from threading import Thread -from typing import TYPE_CHECKING, Awaitable, Dict, List, Optional, TypeVar, Union +from typing import ( + TYPE_CHECKING, + Awaitable, + Dict, + List, + Optional, + Sequence, + TypeVar, + Union, +) import aiohttp import google.auth # type: ignore import google.auth.transport.requests # type: ignore from google.cloud.sql.connector import Connector, IPTypes, RefreshStrategy from sqlalchemy import MetaData, Table, text +from sqlalchemy.engine.row import RowMapping from sqlalchemy.exc import InvalidRequestError from sqlalchemy.ext.asyncio import AsyncEngine, create_async_engine @@ -97,6 +107,8 @@ class PostgresEngine: """A class for managing connections to a Cloud SQL for Postgres database.""" _connector: Optional[Connector] = None + _default_loop: Optional[asyncio.AbstractEventLoop] = None + _default_thread: Optional[Thread] = None __create_key = object() def __init__( @@ -156,9 +168,12 @@ def from_instance( """ # Running a loop in a background thread allows us to support # async methods from non-async environments - loop = asyncio.new_event_loop() - thread = Thread(target=loop.run_forever, daemon=True) - thread.start() + if cls._default_loop is None: + cls._default_loop = asyncio.new_event_loop() + cls._default_thread = Thread( + target=cls._default_loop.run_forever, daemon=True + ) + cls._default_thread.start() coro = cls._create( 
project_id, region, @@ -167,12 +182,12 @@ def from_instance( ip_type, user, password, - loop=loop, - thread=thread, + loop=cls._default_loop, + thread=cls._default_thread, quota_project=quota_project, iam_account_email=iam_account_email, ) - return asyncio.run_coroutine_threadsafe(coro, loop).result() + return asyncio.run_coroutine_threadsafe(coro, cls._default_loop).result() @classmethod async def _create( @@ -218,7 +233,7 @@ async def _create( ) if cls._connector is None: cls._connector = Connector( - loop=asyncio.get_event_loop(), + loop=loop, user_agent=USER_AGENT, quota_project=quota_project, refresh_strategy=RefreshStrategy.LAZY, @@ -305,19 +320,21 @@ def from_engine(cls, engine: AsyncEngine) -> PostgresEngine: """Create an PostgresEngine instance from an AsyncEngine.""" return cls(cls.__create_key, engine, None, None) - async def _aexecute(self, query: str, params: Optional[dict] = None): + async def _aexecute(self, query: str, params: Optional[dict] = None) -> None: """Execute a SQL query.""" async with self._engine.connect() as conn: await conn.execute(text(query), params) await conn.commit() - async def _aexecute_outside_tx(self, query: str): - """Execute a SQL query.""" + async def _aexecute_outside_tx(self, query: str) -> None: + """Execute a SQL query in a new transaction.""" async with self._engine.connect() as conn: await conn.execute(text("COMMIT")) await conn.execute(text(query)) - async def _afetch(self, query: str, params: Optional[dict] = None): + async def _afetch( + self, query: str, params: Optional[dict] = None + ) -> Sequence[RowMapping]: """Fetch results from a SQL query.""" async with self._engine.connect() as conn: result = await conn.execute(text(query), params) @@ -326,11 +343,23 @@ async def _afetch(self, query: str, params: Optional[dict] = None): return result_fetch - def _execute(self, query: str, params: Optional[dict] = None): + async def _afetch_with_query_options( + self, query: str, query_options: str + ) -> 
Sequence[RowMapping]: + """Set temporary database flags and fetch results from a SQL query.""" + async with self._engine.connect() as conn: + await conn.execute(text(query_options)) + result = await conn.execute(text(query)) + result_map = result.mappings() + result_fetch = result_map.fetchall() + + return result_fetch + + def _execute(self, query: str, params: Optional[dict] = None) -> None: """Execute a SQL query.""" return self._run_as_sync(self._aexecute(query, params)) - def _fetch(self, query: str, params: Optional[dict] = None): + def _fetch(self, query: str, params: Optional[dict] = None) -> Sequence[RowMapping]: """Fetch results from a SQL query.""" return self._run_as_sync(self._afetch(query, params)) @@ -439,7 +468,7 @@ def init_vectorstore_table( ) ) - async def ainit_chat_history_table(self, table_name) -> None: + async def ainit_chat_history_table(self, table_name: str) -> None: """Create a Cloud SQL table to store chat history. Args: @@ -456,7 +485,7 @@ async def ainit_chat_history_table(self, table_name) -> None: );""" await self._aexecute(create_table_query) - def init_chat_history_table(self, table_name) -> None: + def init_chat_history_table(self, table_name: str) -> None: """Create a Cloud SQL table to store chat history. 
Args: diff --git a/src/langchain_google_cloud_sql_pg/indexes.py b/src/langchain_google_cloud_sql_pg/indexes.py index b4dfafc2..b5616a8c 100644 --- a/src/langchain_google_cloud_sql_pg/indexes.py +++ b/src/langchain_google_cloud_sql_pg/indexes.py @@ -34,12 +34,12 @@ class DistanceStrategy(StrategyMixin, enum.Enum): DEFAULT_DISTANCE_STRATEGY = DistanceStrategy.COSINE_DISTANCE -DEFAULT_INDEX_NAME = "langchainvectorindex" +DEFAULT_INDEX_NAME_SUFFIX: str = "langchainvectorindex" @dataclass class BaseIndex(ABC): - name: str = DEFAULT_INDEX_NAME + name: Optional[str] = None index_type: str = "base" distance_strategy: DistanceStrategy = field( default_factory=lambda: DistanceStrategy.COSINE_DISTANCE diff --git a/src/langchain_google_cloud_sql_pg/loader.py b/src/langchain_google_cloud_sql_pg/loader.py index 93261a77..92dd7941 100644 --- a/src/langchain_google_cloud_sql_pg/loader.py +++ b/src/langchain_google_cloud_sql_pg/loader.py @@ -27,7 +27,7 @@ ) import sqlalchemy -from langchain_community.document_loaders.base import BaseLoader +from langchain_core.document_loaders.base import BaseLoader from langchain_core.documents import Document from .engine import PostgresEngine @@ -36,24 +36,24 @@ DEFAULT_METADATA_COL = "langchain_metadata" -def text_formatter(row, content_columns) -> str: +def text_formatter(row: dict, content_columns: List[str]) -> str: """txt document formatter.""" return " ".join(str(row[column]) for column in content_columns if column in row) -def csv_formatter(row, content_columns) -> str: +def csv_formatter(row: dict, content_columns: List[str]) -> str: """CSV document formatter.""" return ", ".join(str(row[column]) for column in content_columns if column in row) -def yaml_formatter(row, content_columns) -> str: +def yaml_formatter(row: dict, content_columns: List[str]) -> str: """YAML document formatter.""" return "\n".join( f"{column}: {str(row[column])}" for column in content_columns if column in row ) -def json_formatter(row, content_columns) -> str: 
+def json_formatter(row: dict, content_columns: List[str]) -> str: """JSON document formatter.""" dictionary = {} for column in content_columns: @@ -116,7 +116,7 @@ class PostgresLoader(BaseLoader): def __init__( self, - key, + key: object, engine: PostgresEngine, query: str, content_columns: List[str], @@ -162,7 +162,7 @@ async def create( metadata_json_column: Optional[str] = None, format: Optional[str] = None, formatter: Optional[Callable] = None, - ): + ) -> PostgresLoader: """Create a new PostgresLoader instance. Args: @@ -255,7 +255,7 @@ def create_sync( metadata_json_column: Optional[str] = None, format: Optional[str] = None, formatter: Optional[Callable] = None, - ): + ) -> PostgresLoader: """Create a new PostgresLoader instance. Args: @@ -340,7 +340,7 @@ class PostgresDocumentSaver: def __init__( self, - key, + key: object, engine: PostgresEngine, table_name: str, content_column: str, @@ -378,7 +378,7 @@ async def create( content_column: str = DEFAULT_CONTENT_COL, metadata_columns: List[str] = [], metadata_json_column: Optional[str] = DEFAULT_METADATA_COL, - ): + ) -> PostgresDocumentSaver: """Create an PostgresDocumentSaver instance. Args: @@ -435,7 +435,7 @@ def create_sync( content_column: str = DEFAULT_CONTENT_COL, metadata_columns: List[str] = [], metadata_json_column: str = DEFAULT_METADATA_COL, - ): + ) -> PostgresDocumentSaver: """Create an PostgresDocumentSaver instance. Args: @@ -555,29 +555,3 @@ def delete(self, docs: List[Document]) -> None: docs (List[langchain_core.documents.Document]): a list of documents to be deleted. """ self.engine._run_as_sync(self.adelete(docs)) - - async def _aload_table_schema(self) -> sqlalchemy.Table: - """ - Load table schema from existing table in PgSQL database. - - Returns: - (sqlalchemy.Table): The loaded table. 
- """ - metadata = sqlalchemy.MetaData() - async with self.engine._engine.connect() as conn: - await conn.run_sync(metadata.reflect, only=[self.table_name]) - - table = sqlalchemy.Table(self.table_name, metadata) - # Extract the schema information - schema = [] - for column in table.columns: - schema.append( - { - "name": column.name, - "type": column.type.python_type, - "max_length": getattr(column.type, "length", None), - "nullable": not column.nullable, - } - ) - - return metadata.tables[self.table_name] diff --git a/src/langchain_google_cloud_sql_pg/vectorstore.py b/src/langchain_google_cloud_sql_pg/vectorstore.py index d634cec7..a4a0b53a 100644 --- a/src/langchain_google_cloud_sql_pg/vectorstore.py +++ b/src/langchain_google_cloud_sql_pg/vectorstore.py @@ -17,17 +17,18 @@ import json import uuid -from typing import Any, Callable, Iterable, List, Optional, Tuple, Type, Union +from typing import Any, Callable, Iterable, List, Optional, Sequence, Tuple, Type, Union import numpy as np from langchain_core.documents import Document from langchain_core.embeddings import Embeddings from langchain_core.vectorstores import VectorStore +from sqlalchemy.engine.row import RowMapping from .engine import PostgresEngine from .indexes import ( DEFAULT_DISTANCE_STRATEGY, - DEFAULT_INDEX_NAME, + DEFAULT_INDEX_NAME_SUFFIX, BaseIndex, DistanceStrategy, ExactNearestNeighbor, @@ -42,7 +43,7 @@ class PostgresVectorStore(VectorStore): def __init__( self, - key, + key: object, engine: PostgresEngine, embedding_service: Embeddings, table_name: str, @@ -114,7 +115,7 @@ async def create( fetch_k: int = 20, lambda_mult: float = 0.5, index_query_options: Optional[QueryOptions] = None, - ): + ) -> PostgresVectorStore: """Create a new PostgresVectorStore instance. 
Args: @@ -182,7 +183,7 @@ async def create( del all_columns[id_column] del all_columns[content_column] del all_columns[embedding_column] - metadata_columns = [k for k, _ in all_columns.keys()] + metadata_columns = [k for k in all_columns.keys()] return cls( cls.__create_key, @@ -218,7 +219,7 @@ def create_sync( fetch_k: int = 20, lambda_mult: float = 0.5, index_query_options: Optional[QueryOptions] = None, - ): + ) -> PostgresVectorStore: """Create a new PostgresVectorStore instance. Args: @@ -496,7 +497,7 @@ def from_texts( # type: ignore[override] id_column: str = "langchain_id", metadata_json_column: str = "langchain_metadata", **kwargs: Any, - ): + ) -> PostgresVectorStore: """Create an PostgresVectorStore instance from texts. Args: texts (List[str]): Texts to add to the vector store. @@ -589,7 +590,7 @@ async def __query_collection( k: Optional[int] = None, filter: Optional[str] = None, **kwargs: Any, - ) -> List[Any]: + ) -> Sequence[RowMapping]: """Perform similarity search query on the vector store table.""" k = k if k else self.k operator = self.distance_strategy.operator @@ -598,10 +599,12 @@ async def __query_collection( filter = f"WHERE {filter}" if filter else "" stmt = f"SELECT *, {search_function}({self.embedding_column}, '{embedding}') as distance FROM \"{self.table_name}\" {filter} ORDER BY {self.embedding_column} {operator} '{embedding}' LIMIT {k};" if self.index_query_options: - await self.engine._aexecute( - f"SET LOCAL {self.index_query_options.to_string()};" + query_options_stmt = f"SET LOCAL {self.index_query_options.to_string()};" + results = await self.engine._afetch_with_query_options( + stmt, query_options_stmt ) - results = await self.engine._afetch(stmt) + else: + results = await self.engine._afetch(stmt) return results def similarity_search( @@ -901,31 +904,37 @@ async def aapply_vector_index( filter = f"WHERE ({index.partial_indexes})" if index.partial_indexes else "" params = "WITH " + index.index_options() function = 
index.distance_strategy.index_function - name = name or index.name + if name is None: + if index.name == None: + index.name = self.table_name + DEFAULT_INDEX_NAME_SUFFIX + name = index.name stmt = f'CREATE INDEX {"CONCURRENTLY" if concurrently else ""} {name} ON "{self.table_name}" USING {index.index_type} ({self.embedding_column} {function}) {params} {filter};' if concurrently: await self.engine._aexecute_outside_tx(stmt) else: await self.engine._aexecute(stmt) - async def areindex(self, index_name: str = DEFAULT_INDEX_NAME) -> None: + async def areindex(self, index_name: Optional[str] = None) -> None: """Re-index the vector store table.""" + index_name = index_name or self.table_name + DEFAULT_INDEX_NAME_SUFFIX query = f"REINDEX INDEX {index_name};" await self.engine._aexecute(query) async def adrop_vector_index( self, - index_name: str = DEFAULT_INDEX_NAME, + index_name: Optional[str] = None, ) -> None: """Drop the vector index.""" + index_name = index_name or self.table_name + DEFAULT_INDEX_NAME_SUFFIX query = f"DROP INDEX IF EXISTS {index_name};" await self.engine._aexecute(query) async def is_valid_index( self, - index_name: str = DEFAULT_INDEX_NAME, + index_name: Optional[str] = None, ) -> bool: """Check if index exists in the table.""" + index_name = index_name or self.table_name + DEFAULT_INDEX_NAME_SUFFIX query = f""" SELECT tablename, indexname FROM pg_indexes diff --git a/src/langchain_google_cloud_sql_pg/version.py b/src/langchain_google_cloud_sql_pg/version.py index 85217a3e..74efebbe 100644 --- a/src/langchain_google_cloud_sql_pg/version.py +++ b/src/langchain_google_cloud_sql_pg/version.py @@ -13,4 +13,4 @@ # See the License for the specific language governing permissions and # limitations under the License. 
# -__version__ = "0.7.0" +__version__ = "0.8.0" diff --git a/tests/test_cloudsql_vectorstore.py b/tests/test_cloudsql_vectorstore.py index bda136f8..081853a8 100644 --- a/tests/test_cloudsql_vectorstore.py +++ b/tests/test_cloudsql_vectorstore.py @@ -17,8 +17,8 @@ import pytest import pytest_asyncio -from langchain_community.embeddings import DeterministicFakeEmbedding from langchain_core.documents import Document +from langchain_core.embeddings import DeterministicFakeEmbedding from langchain_google_cloud_sql_pg import Column, PostgresEngine, PostgresVectorStore @@ -133,6 +133,19 @@ async def vs_custom(self, engine): yield vs await engine._aexecute(f'DROP TABLE IF EXISTS "{CUSTOM_TABLE}"') + async def test_init_with_constructor(self, engine): + with pytest.raises(Exception): + PostgresVectorStore( + engine, + embedding_service=embeddings_service, + table_name=CUSTOM_TABLE, + id_column="myid", + content_column="noname", + embedding_column="myembedding", + metadata_columns=["page", "source"], + metadata_json_column="mymeta", + ) + async def test_post_init(self, engine): with pytest.raises(ValueError): await PostgresVectorStore.create( @@ -265,4 +278,60 @@ async def test_add_texts(self, engine_sync, vs_sync): results = engine_sync._fetch(f'SELECT * FROM "{DEFAULT_TABLE_SYNC}"') assert len(results) == 6 + async def test_ignore_metadata_columns(self, vs_custom): + column_to_ignore = "source" + vs = await PostgresVectorStore.create( + vs_custom.engine, + embedding_service=embeddings_service, + table_name=CUSTOM_TABLE, + ignore_metadata_columns=[column_to_ignore], + id_column="myid", + content_column="mycontent", + embedding_column="myembedding", + metadata_json_column="mymeta", + ) + assert column_to_ignore not in vs.metadata_columns + + async def test_create_vectorstore_with_invalid_parameters(self, vs_custom): + with pytest.raises(ValueError): + await PostgresVectorStore.create( + vs_custom.engine, + embedding_service=embeddings_service, + table_name=CUSTOM_TABLE, + 
id_column="myid", + content_column="mycontent", + embedding_column="myembedding", + metadata_columns=["random_column"], # invalid metadata column + ) + with pytest.raises(ValueError): + await PostgresVectorStore.create( + vs_custom.engine, + embedding_service=embeddings_service, + table_name=CUSTOM_TABLE, + id_column="myid", + content_column="langchain_id", # invalid content column type + embedding_column="myembedding", + metadata_columns=["random_column"], + ) + with pytest.raises(ValueError): + await PostgresVectorStore.create( + vs_custom.engine, + embedding_service=embeddings_service, + table_name=CUSTOM_TABLE, + id_column="myid", + content_column="mycontent", + embedding_column="random_column", # invalid embedding column + metadata_columns=["random_column"], + ) + with pytest.raises(ValueError): + await PostgresVectorStore.create( + vs_custom.engine, + embedding_service=embeddings_service, + table_name=CUSTOM_TABLE, + id_column="myid", + content_column="mycontent", + embedding_column="langchain_id", # invalid embedding column data type + metadata_columns=["random_column"], + ) + # Need tests for store metadata=False diff --git a/tests/test_cloudsql_vectorstore_from_methods.py b/tests/test_cloudsql_vectorstore_from_methods.py index e0439c97..e7e143cb 100644 --- a/tests/test_cloudsql_vectorstore_from_methods.py +++ b/tests/test_cloudsql_vectorstore_from_methods.py @@ -17,8 +17,8 @@ import pytest import pytest_asyncio -from langchain_community.embeddings import DeterministicFakeEmbedding from langchain_core.documents import Document +from langchain_core.embeddings import DeterministicFakeEmbedding from langchain_google_cloud_sql_pg import Column, PostgresEngine, PostgresVectorStore diff --git a/tests/test_cloudsql_vectorstore_index.py b/tests/test_cloudsql_vectorstore_index.py index bb70b6d7..10baf13a 100644 --- a/tests/test_cloudsql_vectorstore_index.py +++ b/tests/test_cloudsql_vectorstore_index.py @@ -19,12 +19,12 @@ import pytest import pytest_asyncio -from 
langchain_community.embeddings import DeterministicFakeEmbedding from langchain_core.documents import Document +from langchain_core.embeddings import DeterministicFakeEmbedding from langchain_google_cloud_sql_pg import PostgresEngine, PostgresVectorStore from langchain_google_cloud_sql_pg.indexes import ( - DEFAULT_INDEX_NAME, + DEFAULT_INDEX_NAME_SUFFIX, DistanceStrategy, HNSWIndex, IVFFlatIndex, @@ -32,6 +32,7 @@ DEFAULT_TABLE = "test_table" + str(uuid.uuid4()).replace("-", "_") CUSTOM_TABLE = "test_table_custom" + str(uuid.uuid4()).replace("-", "_") +DEFAULT_INDEX_NAME = DEFAULT_TABLE + DEFAULT_INDEX_NAME_SUFFIX VECTOR_SIZE = 768 embeddings_service = DeterministicFakeEmbedding(size=VECTOR_SIZE) @@ -54,10 +55,6 @@ def get_env_var(key: str, desc: str) -> str: @pytest.mark.asyncio(scope="class") -@pytest.mark.skipif( - sys.version_info != (3, 11), - reason="To prevent index clashes only run on python3.11 or higher", -) class TestIndex: @pytest.fixture(scope="module") def db_project(self) -> str: @@ -100,11 +97,13 @@ async def vs(self, engine): await engine._aexecute(f"DROP TABLE IF EXISTS {DEFAULT_TABLE}") await engine._engine.dispose() + @pytest.mark.run(order=1) async def test_aapply_vector_index(self, vs): index = HNSWIndex() await vs.aapply_vector_index(index) assert await vs.is_valid_index(DEFAULT_INDEX_NAME) + @pytest.mark.run(order=2) async def test_areindex(self, vs): if not await vs.is_valid_index(DEFAULT_INDEX_NAME): index = HNSWIndex() @@ -113,6 +112,7 @@ async def test_areindex(self, vs): await vs.areindex(DEFAULT_INDEX_NAME) assert await vs.is_valid_index(DEFAULT_INDEX_NAME) + @pytest.mark.run(order=3) async def test_dropindex(self, vs): await vs.adrop_vector_index() result = await vs.is_valid_index(DEFAULT_INDEX_NAME) @@ -129,3 +129,7 @@ async def test_aapply_vector_index_ivfflat(self, vs): await vs.aapply_vector_index(index) assert await vs.is_valid_index("secondindex") await vs.adrop_vector_index("secondindex") + + async def 
test_is_valid_index(self, vs): + is_valid = await vs.is_valid_index("invalid_index") + assert is_valid == False diff --git a/tests/test_cloudsql_vectorstore_search.py b/tests/test_cloudsql_vectorstore_search.py index 536a3f10..65c6d8bc 100644 --- a/tests/test_cloudsql_vectorstore_search.py +++ b/tests/test_cloudsql_vectorstore_search.py @@ -17,11 +17,11 @@ import pytest import pytest_asyncio -from langchain_community.embeddings import DeterministicFakeEmbedding from langchain_core.documents import Document +from langchain_core.embeddings import DeterministicFakeEmbedding from langchain_google_cloud_sql_pg import Column, PostgresEngine, PostgresVectorStore -from langchain_google_cloud_sql_pg.indexes import HNSWQueryOptions, IVFFlatQueryOptions +from langchain_google_cloud_sql_pg.indexes import DistanceStrategy, HNSWQueryOptions DEFAULT_TABLE = "test_table" + str(uuid.uuid4()).replace("-", "_") CUSTOM_TABLE = "test_table_custom" + str(uuid.uuid4()).replace("-", "_") @@ -151,7 +151,7 @@ async def test_asimilarity_search_by_vector(self, vs): assert results[0][0] == Document(page_content="foo") assert results[0][1] == 0 - async def test_similarity_search_with_relevance_scores_threshold(self, vs): + async def test_similarity_search_with_relevance_scores_threshold_cosine(self, vs): score_threshold = {"score_threshold": 0} results = await vs.asimilarity_search_with_relevance_scores( "foo", **score_threshold @@ -171,6 +171,23 @@ async def test_similarity_search_with_relevance_scores_threshold(self, vs): assert len(results) == 1 assert results[0][0] == Document(page_content="foo") + async def test_similarity_search_with_relevance_scores_threshold_euclidean( + self, engine + ): + vs = await PostgresVectorStore.create( + engine, + embedding_service=embeddings_service, + table_name=DEFAULT_TABLE, + distance_strategy=DistanceStrategy.EUCLIDEAN, + ) + + score_threshold = {"score_threshold": 0.9} + results = await vs.asimilarity_search_with_relevance_scores( + "foo", 
**score_threshold + ) + assert len(results) == 1 + assert results[0][0] == Document(page_content="foo") + async def test_amax_marginal_relevance_search(self, vs): results = await vs.amax_marginal_relevance_search("bar") assert results[0] == Document(page_content="bar") diff --git a/tests/test_postgresql_chatmessagehistory.py b/tests/test_postgresql_chatmessagehistory.py index ab61c6d2..ea0b85ee 100644 --- a/tests/test_postgresql_chatmessagehistory.py +++ b/tests/test_postgresql_chatmessagehistory.py @@ -79,14 +79,14 @@ def test_chat_message_history(memory_engine: PostgresEngine) -> None: assert len(history.messages) == 0 -def test_chat_table(memory_engine: Any): +def test_chat_table(memory_engine: Any) -> None: with pytest.raises(ValueError): PostgresChatMessageHistory.create_sync( engine=memory_engine, session_id="test", table_name="doesnotexist" ) -def test_chat_schema(memory_engine: Any): +def test_chat_schema(memory_engine: Any) -> None: doc_table_name = "test_table" + str(uuid.uuid4()) memory_engine.init_document_table(table_name=doc_table_name) with pytest.raises(IndexError): diff --git a/tests/test_postgresql_engine.py b/tests/test_postgresql_engine.py index a5790f44..9ccd43a0 100644 --- a/tests/test_postgresql_engine.py +++ b/tests/test_postgresql_engine.py @@ -19,7 +19,7 @@ import pytest import pytest_asyncio from google.cloud.sql.connector import Connector, IPTypes -from langchain_community.embeddings import DeterministicFakeEmbedding +from langchain_core.embeddings import DeterministicFakeEmbedding from sqlalchemy import VARCHAR from sqlalchemy.ext.asyncio import create_async_engine diff --git a/tests/test_postgresql_loader.py b/tests/test_postgresql_loader.py index d295c027..8f4f5e35 100644 --- a/tests/test_postgresql_loader.py +++ b/tests/test_postgresql_loader.py @@ -69,6 +69,29 @@ async def _cleanup_table(self, engine): query = f'DROP TABLE IF EXISTS "{table_name}"' await engine._aexecute(query) + async def 
test_create_loader_with_invalid_parameters(self, engine): + with pytest.raises(ValueError): + await PostgresLoader.create( + engine=engine, + ) + with pytest.raises(ValueError): + + def fake_formatter(): + return None + + await PostgresLoader.create( + engine=engine, + table_name=table_name, + format="text", + formatter=fake_formatter, + ) + with pytest.raises(ValueError): + await PostgresLoader.create( + engine=engine, + table_name=table_name, + format="fake_format", + ) + async def test_load_from_query_default(self, engine): try: await self._cleanup_table(engine) @@ -216,6 +239,30 @@ async def test_load_from_query_customized_content_default_metadata(self, engine) ) ] + loader = await PostgresLoader.create( + engine=engine, + query=f'SELECT * FROM "{table_name}";', + content_columns=[ + "variety", + "quantity_in_stock", + "price_per_unit", + ], + format="JSON", + ) + + documents = await self._collect_async_items(loader.alazy_load()) + + assert documents == [ + Document( + page_content='{"variety": "Granny Smith", "quantity_in_stock": 150, "price_per_unit": 1}', + metadata={ + "fruit_id": 1, + "fruit_name": "Apple", + "organic": 1, + }, + ) + ] + finally: await self._cleanup_table(engine)