Skip to content
Merged
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
test: Add keyword and hybrid search tests for Milvus online store
Signed-off-by: Yassin Nouh <70436855+YassinNouh21@users.noreply.github.com>
Signed-off-by: yassinnouh21 <yassinnouh21@gmail.com>
  • Loading branch information
YassinNouh21 committed Mar 31, 2025
commit f4420b074e0b1365cdb09d1c739904628f3f0789
173 changes: 173 additions & 0 deletions sdk/python/tests/unit/online_store/test_online_retrieval.py
Original file line number Diff line number Diff line change
Expand Up @@ -1484,3 +1484,176 @@ def test_milvus_native_from_feast_data() -> None:

# Clean up the collection
client.drop_collection(collection_name=COLLECTION_NAME)


def test_milvus_keyword_search() -> None:
"""
Test retrieving documents from the Milvus online store using keyword search.
"""
random.seed(42)
n = 10 # number of samples
vector_length = 10
runner = CliRunner()
with runner.local_repo(
example_repo_py=get_example_repo("example_rag_feature_repo.py"),
offline_store="file",
online_store="milvus",
apply=False,
teardown=False,
) as store:
from datetime import timedelta

from feast import Entity, FeatureView, Field, FileSource
from feast.types import Array, Float32, Int64, String, UnixTimestamp

rag_documents_source = FileSource(
path="data/embedded_documents.parquet",
timestamp_field="event_timestamp",
created_timestamp_column="created_timestamp",
)

item = Entity(
name="item_id",
join_keys=["item_id"],
value_type=ValueType.INT64,
)
author = Entity(
name="author_id",
join_keys=["author_id"],
value_type=ValueType.STRING,
)

document_embeddings = FeatureView(
name="text_documents",
entities=[item, author],
schema=[
Field(
name="vector",
dtype=Array(Float32),
vector_index=True,
vector_search_metric="COSINE",
),
Field(name="item_id", dtype=Int64),
Field(name="author_id", dtype=String),
Field(name="content", dtype=String),
Field(name="title", dtype=String),
Field(name="created_timestamp", dtype=UnixTimestamp),
Field(name="event_timestamp", dtype=UnixTimestamp),
],
source=rag_documents_source,
ttl=timedelta(hours=24),
)

store.apply([rag_documents_source, item, document_embeddings])

# Write some data with specific text content for keyword search
document_embeddings_fv = store.get_feature_view(name="text_documents")
provider = store._get_provider()

contents = [
"Feast is an open source feature store for machine learning",
"Feature stores solve the problem of coordinating features for training and serving",
"Milvus is a vector database that can be used with Feast",
"Keyword search uses BM25 algorithm for relevance ranking",
"Vector search uses embeddings for semantic similarity",
"Python is a popular programming language for machine learning",
"Feast supports multiple storage backends for online and offline use cases",
"Online stores are used for low-latency feature serving",
"Offline stores are used for batch feature retrieval during training",
"Feast enables data scientists to define, manage, and share features",
]

titles = [
"Introduction to Feast",
"Feature Store Benefits",
"Using Milvus with Feast",
"Keyword Search Fundamentals",
"Vector Search Overview",
"Python for ML",
"Feast Storage Options",
"Online Serving with Feast",
"Offline Training Support",
"Feast for Data Scientists",
]

item_keys = [
EntityKeyProto(
join_keys=["item_id", "author_id"],
entity_values=[
ValueProto(int64_val=i),
ValueProto(string_val=f"author_{i}"),
],
)
for i in range(n)
]
data = []
for i, item_key in enumerate(item_keys):
data.append(
(
item_key,
{
"vector": ValueProto(
float_list_val=FloatListProto(
val=np.random.random(vector_length)
)
),
"content": ValueProto(string_val=contents[i]),
"title": ValueProto(string_val=titles[i]),
},
_utc_now(),
_utc_now(),
)
)

provider.online_write_batch(
config=store.config,
table=document_embeddings_fv,
data=data,
progress=None,
)

# Test keyword search for "Milvus"
result_milvus = store.retrieve_online_documents_v2(
features=[
"text_documents:content",
"text_documents:title",
],
query_string="Milvus",
top_k=3,
).to_dict()

# Verify that documents containing "Milvus" are returned
assert len(result_milvus["content"]) > 0
assert any("Milvus" in content for content in result_milvus["content"])

# Test keyword search for "machine learning"
result_ml = store.retrieve_online_documents_v2(
features=[
"text_documents:content",
"text_documents:title",
],
query_string="machine learning",
top_k=3,
).to_dict()

# Verify that documents containing "machine learning" are returned
assert len(result_ml["content"]) > 0
assert any("machine learning" in content.lower() for content in result_ml["content"])

# Test hybrid search (vector + keyword)
query_embedding = np.random.random(vector_length).tolist()
result_hybrid = store.retrieve_online_documents_v2(
features=[
"text_documents:content",
"text_documents:title",
"text_documents:vector",
],
query=query_embedding,
query_string="Feast",
top_k=3,
).to_dict()

# Verify hybrid search results
assert len(result_hybrid["content"]) > 0
assert any("Feast" in content for content in result_hybrid["content"])
assert len(result_hybrid["vector"]) > 0