From ec8c596573ac9c9ec19dd6d9a71e590a339b71b3 Mon Sep 17 00:00:00 2001 From: Yassin Nouh <70436855+YassinNouh21@users.noreply.github.com> Date: Wed, 23 Apr 2025 22:21:00 +0200 Subject: [PATCH 1/2] feat: Add Milvus tutorial with Feast integration Signed-off-by: Yassin Nouh <70436855+YassinNouh21@users.noreply.github.com> --- .../online_store/milvus_tutorial/README.md | 66 ++++++ .../milvus_tutorial/docker-compose.yml | 31 +++ .../milvus_tutorial/feature_store.yaml | 16 ++ .../milvus_tutorial/milvus_example.py | 191 ++++++++++++++++++ 4 files changed, 304 insertions(+) create mode 100644 examples/online_store/milvus_tutorial/README.md create mode 100644 examples/online_store/milvus_tutorial/docker-compose.yml create mode 100644 examples/online_store/milvus_tutorial/feature_store.yaml create mode 100644 examples/online_store/milvus_tutorial/milvus_example.py diff --git a/examples/online_store/milvus_tutorial/README.md b/examples/online_store/milvus_tutorial/README.md new file mode 100644 index 00000000000..961d497d56e --- /dev/null +++ b/examples/online_store/milvus_tutorial/README.md @@ -0,0 +1,66 @@ +# Milvus Tutorial with Feast + +This tutorial demonstrates how to use Milvus as a vector database backend for Feast. You'll learn how to set up Milvus, create embeddings, store them in Feast, and perform similarity searches. + +## Prerequisites + +- Python 3.8+ +- Docker (for running Milvus) +- Feast installed (`pip install 'feast[milvus]'`) + +## Setup + +1. Start Milvus containers with Docker Compose: + +```bash +docker compose up -d +``` + +This will start one container for each of the three Compose services: +- `milvus`: The Milvus server +- `etcd`: For metadata storage +- `minio`: For object storage + +2. 
Wait until all containers are healthy (this may take a minute or two): + +```bash +docker ps +``` + +## Project Structure + +``` +milvus_tutorial/ +├── README.md +├── feature_store.yaml # Feast configuration +├── docker-compose.yml # Docker Compose configuration for Milvus +├── data/ # Data directory +│ └── sample_data.parquet # Sample data with embeddings (generated by the script) +└── milvus_example.py # Example script +``` + +## Tutorial Steps + +1. Configure Feast with Milvus +2. Generate sample data with embeddings +3. Define feature views +4. Register and apply feature definitions +5. Perform vector similarity search + +Run the complete example: + +```bash +python milvus_example.py +``` + +## How It Works + +This tutorial demonstrates: + +- Setting up Milvus as a vector database +- Configuring Feast to use Milvus as the online store +- Generating embeddings for text data +- Storing embeddings in Feast feature views +- Performing vector similarity searches using Feast's retrieval API + +Milvus is a powerful vector database designed for efficient similarity searches, making it an excellent choice for applications like semantic search and recommendation systems. 
diff --git a/examples/online_store/milvus_tutorial/docker-compose.yml b/examples/online_store/milvus_tutorial/docker-compose.yml new file mode 100644 index 00000000000..c450a44f256 --- /dev/null +++ b/examples/online_store/milvus_tutorial/docker-compose.yml @@ -0,0 +1,31 @@ +version: "3.9" + +services: + etcd: + image: quay.io/coreos/etcd:v3.5.18 + command: > + etcd -advertise-client-urls=http://etcd:2379 + -listen-client-urls http://0.0.0.0:2379 + volumes: ["./volumes/etcd:/etcd"] + healthcheck: + test: ["CMD", "etcdctl", "endpoint", "health"] + interval: 30s + + minio: + image: minio/minio:RELEASE.2023-03-20T20-16-18Z + environment: + MINIO_ACCESS_KEY: minioadmin + MINIO_SECRET_KEY: minioadmin + command: server /data --console-address ":9001" + volumes: ["./volumes/minio:/data"] + ports: ["9000:9000", "9001:9001"] + + milvus: + image: milvusdb/milvus:v2.5.10 + command: ["milvus", "run", "standalone"] + environment: + ETCD_ENDPOINTS: etcd:2379 + MINIO_ADDRESS: minio:9000 + depends_on: [etcd, minio] + volumes: ["./volumes/milvus:/var/lib/milvus"] + ports: ["19530:19530", "9091:9091"] diff --git a/examples/online_store/milvus_tutorial/feature_store.yaml b/examples/online_store/milvus_tutorial/feature_store.yaml new file mode 100644 index 00000000000..6cde51529ad --- /dev/null +++ b/examples/online_store/milvus_tutorial/feature_store.yaml @@ -0,0 +1,16 @@ +project: milvus_tutorial +provider: local +registry: data/registry.db +online_store: + type: milvus + host: localhost + port: 19530 + vector_enabled: true + embedding_dim: 384 + index_type: "FLAT" + metric_type: "L2" + +offline_store: + type: file + +entity_key_serialization_version: 3 \ No newline at end of file diff --git a/examples/online_store/milvus_tutorial/milvus_example.py b/examples/online_store/milvus_tutorial/milvus_example.py new file mode 100644 index 00000000000..dc5cb646c60 --- /dev/null +++ b/examples/online_store/milvus_tutorial/milvus_example.py @@ -0,0 +1,191 @@ +# Milvus Tutorial with Feast +# 
+# This example demonstrates how to use Milvus +# as a vector database backend for Feast. + +import os +import subprocess +from datetime import datetime, timedelta + +import pandas as pd + +# For generating embeddings +try: + from sentence_transformers import SentenceTransformer +except ImportError: + print("Installing sentence_transformers...") + subprocess.check_call(["pip", "install", "sentence-transformers"]) + from sentence_transformers import SentenceTransformer + +from feast import FeatureStore, Entity, FeatureView, Field, FileSource +from feast.data_format import ParquetFormat +from feast.types import Float32, Array, String +from feast.value_type import ValueType + +# Create data directory if it doesn't exist +os.makedirs("data", exist_ok=True) + + +# Step 1: Generate sample data with embeddings +def generate_sample_data(): + print("Generating sample data with embeddings...") + + # Sample product data + products = [ + {"id": 1, "name": "Smartphone", + "description": "A high-end smartphone with advanced camera features and long battery life."}, + {"id": 2, "name": "Laptop", + "description": "Powerful laptop with fast processor and high-resolution display for professional use."}, + {"id": 3, "name": "Headphones", + "description": "Wireless noise-cancelling headphones with premium sound quality."}, + {"id": 4, "name": "Smartwatch", + "description": "Fitness tracking smartwatch with heart rate monitoring and sleep analysis."}, + {"id": 5, "name": "Tablet", + "description": "Lightweight tablet with vibrant display perfect for reading and browsing."}, + {"id": 6, "name": "Camera", + "description": "Professional digital camera with high-resolution sensor and interchangeable lenses."}, + {"id": 7, "name": "Speaker", + "description": "Bluetooth speaker with rich bass and long battery life for outdoor use."}, + {"id": 8, "name": "Gaming Console", + "description": "Next-generation gaming console with 4K graphics and fast loading times."}, + {"id": 9, "name": 
"E-reader", + "description": "E-ink display reader with backlight for comfortable reading in any lighting condition."}, + {"id": 10, "name": "Smart TV", + "description": "4K smart television with built-in streaming apps and voice control."} + ] + + # Create DataFrame + df = pd.DataFrame(products) + + # Generate embeddings using sentence-transformers + model = SentenceTransformer('all-MiniLM-L6-v2') # Small, fast model with 384-dim embeddings + embeddings = model.encode(df['description'].tolist()) + + # Add embeddings and timestamp to DataFrame + df['embedding'] = embeddings.tolist() + df['event_timestamp'] = datetime.now() - timedelta(days=1) + df['created_timestamp'] = datetime.now() - timedelta(days=1) + + # Save to parquet file + parquet_path = "data/sample_data.parquet" + df.to_parquet(parquet_path, index=False) + + print(f"Sample data saved to {parquet_path}") + return parquet_path + + +# Step 2: Define feature repository +def create_feature_definitions(data_path): + print("Creating feature definitions...") + + product = Entity( + name="product_id", + description="Product ID", + join_keys=["id"], + value_type=ValueType.INT64, + ) + + source = FileSource( + file_format=ParquetFormat(), + path=data_path, + timestamp_field="event_timestamp", + created_timestamp_column="created_timestamp", + ) + + # Define feature view with vector embeddings + product_embeddings = FeatureView( + name="product_embeddings", + entities=[product], + ttl=timedelta(days=30), + schema=[ + Field( + name="embedding", + dtype=Array(Float32), + vector_index=True, # Mark as vector field + ), + Field(name="name", dtype=String), + Field(name="description", dtype=String), + ], + source=source, + online=True, + ) + + return product, product_embeddings + + +def setup_feature_store(product, product_embeddings): + print("Setting up feature store...") + + store = FeatureStore(repo_path=".") + + store.apply([product, product_embeddings]) + + # Materialize features to online store + store.materialize( 
+ start_date=datetime.now() - timedelta(days=2), + end_date=datetime.now(), + ) + + print("Feature store setup complete") + return store + + +# Step 4: Perform vector similarity search +def perform_similarity_search(store, query_text: str, top_k: int = 3): + print(f"\nPerforming similarity search for: '{query_text}'") + + # Generate embedding for query text + model = SentenceTransformer('all-MiniLM-L6-v2') + query_embedding = model.encode(query_text).tolist() + + # Perform similarity search using vector embeddings with version 2 API + try: + results = store.retrieve_online_documents_v2( + features=["product_embeddings:embedding", "product_embeddings:name", "product_embeddings:description"], + query=query_embedding, + top_k=top_k, + distance_metric="L2" + ).to_df() + + # Print results + print(f"\nTop {top_k} similar products:") + for i, row in results.iterrows(): + print(f"\n{i + 1}. Name: {row['product_embeddings__name']}") + print(f" Description: {row['product_embeddings__description']}") + print(f" Distance: {row['distance']}") + + return results + except Exception as e: + print(f"Error performing search: {e}") + return None + + +# Main function to run the example +def main(): + print("=== Milvus Tutorial with Feast ===") + + # Check if Milvus is running + print("\nEnsure Milvus is running:") + print("docker compose up -d") + + input("\nPress Enter to continue once Milvus is ready...") + + # Generate sample data + data_path = generate_sample_data() + + # Create feature definitions + product, product_embeddings = create_feature_definitions(data_path) + + # Setup feature store + store = setup_feature_store(product, product_embeddings) + + # Perform similarity searches + perform_similarity_search(store, "wireless audio device with good sound", top_k=3) + perform_similarity_search(store, "portable computing device for work", top_k=3) + + print("\n=== Tutorial Complete ===") + print("You've successfully set up Milvus with Feast and performed vector similarity 
searches!") + + +if __name__ == "__main__": + main() From efda2e49323e390e35dccca88bc48ef0debdf46e Mon Sep 17 00:00:00 2001 From: Yassin Nouh <70436855+YassinNouh21@users.noreply.github.com> Date: Wed, 23 Apr 2025 22:51:29 +0200 Subject: [PATCH 2/2] Update examples/online_store/milvus_tutorial/README.md Co-authored-by: Francisco Arceo Signed-off-by: Yassin Nouh <70436855+YassinNouh21@users.noreply.github.com> --- examples/online_store/milvus_tutorial/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/online_store/milvus_tutorial/README.md b/examples/online_store/milvus_tutorial/README.md index 961d497d56e..cbe2c6178cc 100644 --- a/examples/online_store/milvus_tutorial/README.md +++ b/examples/online_store/milvus_tutorial/README.md @@ -4,7 +4,7 @@ This tutorial demonstrates how to use Milvus as a vector database backend for Fe ## Prerequisites -- Python 3.8+ +- Python 3.10+ - Docker (for running Milvus) - Feast installed (`pip install 'feast[milvus]'`)