Skip to content

Commit 2623df8

Browse files
updated to implement full text search
Signed-off-by: Francisco Javier Arceo <farceo@redhat.com>
1 parent 61f5050 commit 2623df8

File tree

9 files changed

+231
-52
lines changed

9 files changed

+231
-52
lines changed

sdk/python/feast/feature_server.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,7 @@ class GetOnlineFeaturesRequest(BaseModel):
7575
features: Optional[List[str]] = None
7676
full_feature_names: bool = False
7777
query_embedding: Optional[List[float]] = None
78+
query_string: Optional[str] = None
7879

7980

8081
def _get_features(request: GetOnlineFeaturesRequest, store: "feast.FeatureStore"):
@@ -195,6 +196,7 @@ async def retrieve_online_documents(
195196
entity_rows=request.entities,
196197
full_feature_names=request.full_feature_names,
197198
query=request.query_embedding,
199+
query_string=request.query_string,
198200
)
199201

200202
response = await run_in_threadpool(

sdk/python/feast/feature_store.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1867,6 +1867,7 @@ def retrieve_online_documents_v2(
18671867
top_k: int,
18681868
features: List[str],
18691869
distance_metric: Optional[str] = "L2",
1870+
query_string: Optional[str] = None,
18701871
) -> OnlineResponse:
18711872
"""
18721873
Retrieves the top k closest document features. Note, embeddings are a subset of features.
@@ -1878,6 +1879,7 @@ def retrieve_online_documents_v2(
18781879
query: The query to retrieve the closest document features for.
18791880
top_k: The number of closest document features to retrieve.
18801881
distance_metric: The distance metric to use for retrieval.
1882+
query_string: The query string to retrieve the closest document features using keyword search (bm25).
18811883
"""
18821884
if isinstance(query, str):
18831885
raise ValueError(
@@ -1919,6 +1921,7 @@ def retrieve_online_documents_v2(
19191921
query,
19201922
top_k,
19211923
distance_metric,
1924+
query_string,
19221925
)
19231926

19241927
def _retrieve_from_online_store(
@@ -1988,6 +1991,7 @@ def _retrieve_from_online_store_v2(
19881991
query: List[float],
19891992
top_k: int,
19901993
distance_metric: Optional[str],
1994+
query_string: Optional[str],
19911995
) -> OnlineResponse:
19921996
"""
19931997
Search and return document features from the online document store.
@@ -2003,6 +2007,7 @@ def _retrieve_from_online_store_v2(
20032007
query=query,
20042008
top_k=top_k,
20052009
distance_metric=distance_metric,
2010+
query_string=query_string,
20062011
)
20072012

20082013
entity_key_dict: Dict[str, List[ValueProto]] = {}

sdk/python/feast/infra/online_stores/milvus_online_store/milvus.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -463,6 +463,7 @@ def retrieve_online_documents_v2(
463463
embedding: List[float],
464464
top_k: int,
465465
distance_metric: Optional[str] = None,
466+
query_string: Optional[str] = None,
466467
) -> List[
467468
Tuple[
468469
Optional[datetime],

sdk/python/feast/infra/online_stores/online_store.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -439,6 +439,7 @@ def retrieve_online_documents_v2(
439439
embedding: List[float],
440440
top_k: int,
441441
distance_metric: Optional[str] = None,
442+
query_string: Optional[str] = None,
442443
) -> List[
443444
Tuple[
444445
Optional[datetime],
@@ -456,6 +457,7 @@ def retrieve_online_documents_v2(
456457
requested_features: The list of features whose embeddings should be used for retrieval.
457458
embedding: The embeddings to use for retrieval.
458459
top_k: The number of documents to retrieve.
460+
query_string: The query string to search for using keyword search (bm25) (optional)
459461
460462
Returns:
461463
object: A list of top k closest documents to the specified embedding. Each item in the list is a tuple

sdk/python/feast/infra/online_stores/sqlite.py

Lines changed: 141 additions & 48 deletions
Original file line numberDiff line numberDiff line change
@@ -40,7 +40,7 @@
4040
from feast.protos.feast.types.Value_pb2 import Value as ValueProto
4141
from feast.repo_config import FeastConfigBaseModel, RepoConfig
4242
from feast.type_map import feast_value_type_to_python_type
43-
from feast.types import FEAST_VECTOR_TYPES
43+
from feast.types import FEAST_VECTOR_TYPES, PrimitiveFeastType
4444
from feast.utils import (
4545
_build_retrieve_online_document_record,
4646
_serialize_vector_to_float_list,
@@ -442,6 +442,7 @@ def retrieve_online_documents_v2(
442442
query: List[float],
443443
top_k: int,
444444
distance_metric: Optional[str] = None,
445+
query_string: Optional[str] = None,
445446
) -> List[
446447
Tuple[
447448
Optional[datetime],
@@ -458,72 +459,135 @@ def retrieve_online_documents_v2(
458459
query: Query embedding to search for
459460
top_k: Number of items to return
460461
distance_metric: Distance metric to use (optional)
462+
query_string: The query string to search for using keyword search (bm25) (optional)
461463
Returns:
462464
List of tuples containing the event timestamp, entity key, and feature values
463465
"""
464466
online_store = config.online_store
465467
if not isinstance(online_store, SqliteOnlineStoreConfig):
466468
raise ValueError("online_store must be SqliteOnlineStoreConfig")
467-
if not online_store.vector_enabled:
468-
raise ValueError("Vector search is not enabled in the online store config")
469+
if not online_store.vector_enabled and not online_store.text_search_enabled:
470+
raise ValueError(
471+
"You must enable either vector search or text search in the online store config"
472+
)
469473

470474
conn = self._get_conn(config)
471475
cur = conn.cursor()
472476

473-
if not online_store.vector_len:
477+
if online_store.vector_enabled and not online_store.vector_len:
474478
raise ValueError("vector_len is not configured in the online store config")
475479

476-
query_embedding_bin = serialize_f32(query, online_store.vector_len) # type: ignore
477480
table_name = _table_id(config.project, table)
478481
vector_field = _get_vector_field(table)
479482

480-
cur.execute(
481-
f"""
482-
CREATE VIRTUAL TABLE IF NOT EXISTS vec_table using vec0(
483-
vector_value float[{online_store.vector_len}]
484-
);
485-
"""
486-
)
483+
if online_store.vector_enabled:
484+
query_embedding_bin = serialize_f32(query, online_store.vector_len) # type: ignore
485+
cur.execute(
486+
f"""
487+
CREATE VIRTUAL TABLE IF NOT EXISTS vec_table using vec0(
488+
vector_value float[{online_store.vector_len}]
489+
);
490+
"""
491+
)
492+
cur.execute(
493+
f"""
494+
INSERT INTO vec_table (rowid, vector_value)
495+
select rowid, vector_value from {table_name}
496+
where feature_name = "{vector_field}"
497+
"""
498+
)
499+
elif online_store.text_search_enabled:
500+
string_field_list = [
501+
f.name for f in table.features if f.dtype == PrimitiveFeastType.STRING
502+
]
503+
string_fields = ", ".join(string_field_list)
504+
# TODO: swap this for a value configurable in each Field()
505+
BM25_DEFAULT_WEIGHTS = ", ".join(
506+
[
507+
str(1.0)
508+
for f in table.features
509+
if f.dtype == PrimitiveFeastType.STRING
510+
]
511+
)
512+
cur.execute(
513+
f"""
514+
CREATE VIRTUAL TABLE IF NOT EXISTS search_table using fts5(
515+
entity_key, fv_rowid, {string_fields}, tokenize="porter unicode61"
516+
);
517+
"""
518+
)
519+
insert_query = _generate_bm25_search_insert_query(
520+
table_name, string_field_list
521+
)
522+
cur.execute(insert_query)
487523

488-
cur.execute(
489-
f"""
490-
INSERT INTO vec_table (rowid, vector_value)
491-
select rowid, vector_value from {table_name}
492-
where feature_name = "{vector_field}"
493-
"""
494-
)
524+
else:
525+
raise ValueError(
526+
"Neither vector search nor text search are enabled in the online store config"
527+
)
495528

496-
cur.execute(
497-
f"""
529+
if online_store.vector_enabled:
530+
cur.execute(
531+
f"""
532+
select
533+
fv2.entity_key,
534+
fv2.feature_name,
535+
fv2.value,
536+
fv.vector_value,
537+
f.distance,
538+
fv.event_ts,
539+
fv.created_ts
540+
from (
541+
select
542+
rowid,
543+
vector_value,
544+
distance
545+
from vec_table
546+
where vector_value match ?
547+
order by distance
548+
limit ?
549+
) f
550+
left join {table_name} fv
551+
on f.rowid = fv.rowid
552+
left join {table_name} fv2
553+
on fv.entity_key = fv2.entity_key
554+
where fv2.feature_name != "{vector_field}"
555+
""",
556+
(
557+
query_embedding_bin,
558+
top_k,
559+
),
560+
)
561+
elif online_store.text_search_enabled:
562+
cur.execute(
563+
f"""
498564
select
499-
fv2.entity_key,
500-
fv2.feature_name,
501-
fv2.value,
565+
fv.entity_key,
566+
fv.feature_name,
567+
fv.value,
502568
fv.vector_value,
503569
f.distance,
504570
fv.event_ts,
505571
fv.created_ts
506-
from (
507-
select
508-
rowid,
509-
vector_value,
510-
distance
511-
from vec_table
512-
where vector_value match ?
513-
order by distance
514-
limit ?
515-
) f
516-
left join {table_name} fv
517-
on f.rowid = fv.rowid
518-
left join {table_name} fv2
519-
on fv.entity_key = fv2.entity_key
520-
where fv2.feature_name != "{vector_field}"
521-
""",
522-
(
523-
query_embedding_bin,
524-
top_k,
525-
),
526-
)
572+
from {table_name} fv
573+
inner join (
574+
select
575+
fv_rowid,
576+
entity_key,
577+
{string_fields},
578+
bm25(search_table, {BM25_DEFAULT_WEIGHTS}) as distance
579+
from search_table
580+
where search_table match ? order by distance limit ?
581+
) f
582+
on f.entity_key = fv.entity_key
583+
""",
584+
(query_string, top_k),
585+
)
586+
587+
else:
588+
raise ValueError(
589+
"Neither vector search nor text search are enabled in the online store config"
590+
)
527591

528592
rows = cur.fetchall()
529593
results: List[
@@ -557,9 +621,10 @@ def retrieve_online_documents_v2(
557621
feature_val.ParseFromString(value_bin)
558622
entity_dict[entity_key]["entity_key_proto"] = entity_key_proto
559623
entity_dict[entity_key][feature_name] = feature_val
560-
entity_dict[entity_key][vector_field] = _serialize_vector_to_float_list(
561-
vector_value
562-
)
624+
if online_store.vector_enabled:
625+
entity_dict[entity_key][vector_field] = _serialize_vector_to_float_list(
626+
vector_value
627+
)
563628
entity_dict[entity_key]["distance"] = ValueProto(float_val=distance)
564629
entity_dict[entity_key]["event_ts"] = event_ts
565630
entity_dict[entity_key]["created_ts"] = created_ts
@@ -706,3 +771,31 @@ def _get_vector_field(table: FeatureView) -> str:
706771
)
707772
vector_field: str = vector_fields[0].name
708773
return vector_field
774+
775+
776+
def _generate_bm25_search_insert_query(
777+
table_name: str, string_field_list: List[str]
778+
) -> str:
779+
"""
780+
Generates an SQL insertion query for the given table and string fields.
781+
782+
Args:
783+
table_name (str): The name of the table to select data from.
784+
string_field_list (List[str]): The list of string fields to be used in the insertion.
785+
786+
Returns:
787+
str: The generated SQL insertion query.
788+
"""
789+
_string_fields = ", ".join(string_field_list)
790+
query = f"INSERT INTO search_table (entity_key, fv_rowid, {_string_fields})\nSELECT\n\tDISTINCT fv0.entity_key,\n\tfv0.rowid as fv_rowid"
791+
from_query = f"\nFROM (select rowid, * from {table_name} where feature_name = '{string_field_list[0]}') fv0"
792+
793+
for i, string_field in enumerate(string_field_list):
794+
query += f"\n\t,fv{i}.value as {string_field}"
795+
if i > 0:
796+
from_query += (
797+
f"\nLEFT JOIN (select rowid, * from {table_name} where feature_name = '{string_field}') fv{i}"
798+
+ f"\n\tON fv0.entity_key = fv{i}.entity_key"
799+
)
800+
801+
return query + from_query

sdk/python/feast/infra/passthrough_provider.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -321,6 +321,7 @@ def retrieve_online_documents_v2(
321321
query: List[float],
322322
top_k: int,
323323
distance_metric: Optional[str] = None,
324+
query_string: Optional[str] = None,
324325
) -> List:
325326
result = []
326327
if self.online_store:
@@ -331,6 +332,7 @@ def retrieve_online_documents_v2(
331332
query,
332333
top_k,
333334
distance_metric,
335+
query_string,
334336
)
335337
return result
336338

sdk/python/feast/infra/provider.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -459,6 +459,7 @@ def retrieve_online_documents_v2(
459459
query: List[float],
460460
top_k: int,
461461
distance_metric: Optional[str] = None,
462+
query_string: Optional[str] = None,
462463
) -> List[
463464
Tuple[
464465
Optional[datetime],
@@ -476,6 +477,7 @@ def retrieve_online_documents_v2(
476477
requested_features: the requested document feature names.
477478
query: The query embedding to search for.
478479
top_k: The number of documents to return.
480+
query_string: The query string to search for using keyword search (bm25) (optional)
479481
480482
Returns:
481483
A list of dictionaries, where each dictionary contains the datetime, entitykey, and a dictionary

sdk/python/tests/foo_provider.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -172,6 +172,7 @@ def retrieve_online_documents_v2(
172172
query: List[float],
173173
top_k: int,
174174
distance_metric: Optional[str] = None,
175+
query_string: Optional[str] = None,
175176
) -> List[
176177
Tuple[
177178
Optional[datetime],

0 commit comments

Comments
 (0)