Skip to content

Commit f7b7d37

Browse files
committed
add possibility to push down ClickHouse deduplication to the offline store
Signed-off-by: lukas.valatka <lukas.valatka@cast.ai>
1 parent a6fcf5e commit f7b7d37

File tree

2 files changed

+31
-0
lines changed

2 files changed

+31
-0
lines changed

sdk/python/feast/infra/offline_stores/contrib/clickhouse_offline_store/clickhouse.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import contextlib
2+
import logging
23
import re
34
from dataclasses import asdict
45
from datetime import datetime
@@ -32,6 +33,8 @@
3233
from feast.infra.utils.clickhouse.connection_utils import get_client
3334
from feast.saved_dataset import SavedDatasetStorage
3435

36+
logger = logging.getLogger(__name__)
37+
3538

3639
class ClickhouseOfflineStoreConfig(ClickhouseConfig):
3740
type: Literal["clickhouse"] = "clickhouse"
@@ -205,6 +208,29 @@ def pull_all_from_table_or_query(
205208
assert isinstance(config.offline_store, ClickhouseOfflineStoreConfig)
206209
assert isinstance(data_source, ClickhouseSource)
207210

211+
# https://github.com/feast-dev/feast/issues/5707
212+
# Not an ideal solution, but the least invasive change to the existing codebase
213+
if config.offline_store.deduplicate_pushdown:
214+
logger.warning(
215+
"""
216+
deduplicate_pushdown optimization is set to True in ClickhouseOfflineStoreConfig.
217+
Calling pull_latest_from_table_or_query instead of pull_all_from_table_or_query.
218+
This results in multiple times more efficient materialization jobs for large datasets.
219+
However, it comes with a caveat - can't compute historical features (just latest values available) with Feast compute engines
220+
Use with caution and ensure your use case aligns with this behavior.
221+
"""
222+
)
223+
return ClickhouseOfflineStore.pull_latest_from_table_or_query(
224+
config=config,
225+
data_source=data_source,
226+
join_key_columns=join_key_columns,
227+
feature_name_columns=feature_name_columns,
228+
timestamp_field=timestamp_field,
229+
created_timestamp_column=created_timestamp_column,
230+
start_date=start_date,
231+
end_date=end_date,
232+
)
233+
208234
from_expression = data_source.get_table_query_string()
209235

210236
timestamp_fields = [timestamp_field]

sdk/python/feast/infra/utils/clickhouse/clickhouse_config.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,4 +11,9 @@ class ClickhouseConfig(FeastConfigBaseModel):
1111
password: StrictStr
1212
use_temporary_tables_for_entity_df: bool = True
1313

14+
# https://github.com/feast-dev/feast/issues/5707
15+
# We observed that for large materialization jobs, it's multiple times more efficient to
16+
# push down deduplication to the ClickHouse side rather than doing it in Feast (regardless of the compute engine used)
17+
deduplicate_pushdown: bool = False
18+
1419
model_config = ConfigDict(frozen=True)

0 commit comments

Comments
 (0)