Skip to content

Commit 3abfba5

Browse files
committed
test: add integration test for non-entity retrieval
Signed-off-by: yassinnouh21 <yassinnouh21@gmail.com>
1 parent 7a4b365 commit 3abfba5

File tree

1 file changed

+107
-0
lines changed

1 file changed

+107
-0
lines changed

sdk/python/tests/integration/offline_store/test_universal_historical_retrieval.py

Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -728,3 +728,110 @@ def test_historical_features_field_mapping(
728728
actual_df,
729729
sort_by=["driver_id"],
730730
)
731+
732+
733+
@pytest.mark.integration
@pytest.mark.universal_offline_stores(only=["file"])
def test_historical_features_non_entity_retrieval(environment):
    """Test get_historical_features with entity_df=None using start_date/end_date.

    This exercises the non-entity retrieval path where a synthetic entity_df is
    generated internally. Regression test for the bug where start_date was used
    instead of end_date for min_event_timestamp in the synthetic entity_df.

    The test writes three observations (two days ago, one day ago, now) for each
    of two drivers, registers a feature view over them, and queries the window
    ``[now - 3 days, now + 1 hour]`` without supplying an entity dataframe.

    Args:
        environment: Test environment fixture; this test reads its
            ``feature_store`` and ``data_source_creator`` attributes.
    """
    store = environment.feature_store

    # Truncate to the hour so the fixture timestamps are round values.
    now = datetime.now().replace(microsecond=0, second=0, minute=0)
    two_days_ago = now - timedelta(days=2)
    one_day_ago = now - timedelta(days=1)

    driver_stats_df = pd.DataFrame(
        data=[
            {
                "driver_id": 1001,
                "avg_daily_trips": 10,
                "event_timestamp": two_days_ago,
                "created": two_days_ago,
            },
            {
                "driver_id": 1001,
                "avg_daily_trips": 20,
                "event_timestamp": one_day_ago,
                "created": one_day_ago,
            },
            {
                "driver_id": 1001,
                "avg_daily_trips": 30,
                "event_timestamp": now,
                "created": now,
            },
            {
                "driver_id": 1002,
                "avg_daily_trips": 100,
                "event_timestamp": two_days_ago,
                "created": two_days_ago,
            },
            {
                "driver_id": 1002,
                "avg_daily_trips": 200,
                "event_timestamp": one_day_ago,
                "created": one_day_ago,
            },
            {
                "driver_id": 1002,
                "avg_daily_trips": 300,
                "event_timestamp": now,
                "created": now,
            },
        ]
    )

    # The query window strictly contains every row written above.
    start_date = now - timedelta(days=3)
    end_date = now + timedelta(hours=1)

    # Nanosecond clock plus a random suffix keeps the destination name unique
    # across repeated or parallel runs.
    driver_stats_data_source = environment.data_source_creator.create_data_source(
        df=driver_stats_df,
        destination_name=f"test_driver_stats_{time.time_ns()}_{random.randint(1000, 9999)}",
        timestamp_field="event_timestamp",
        created_timestamp_column="created",
    )

    driver_entity = Entity(name="driver", join_keys=["driver_id"])
    driver_fv = FeatureView(
        name="driver_stats",
        entities=[driver_entity],
        schema=[Field(name="avg_daily_trips", dtype=Int32)],
        source=driver_stats_data_source,
    )

    store.apply([driver_entity, driver_fv])

    offline_job = store.get_historical_features(
        entity_df=None,
        features=["driver_stats:avg_daily_trips"],
        full_feature_names=False,
        start_date=start_date,
        end_date=end_date,
    )

    actual_df = offline_job.to_df()

    assert not actual_df.empty, "Result should not be empty"
    assert "avg_daily_trips" in actual_df.columns

    actual_driver_ids = set(actual_df["driver_id"].tolist())
    assert 1001 in actual_driver_ids, "driver 1001 should be in results"
    assert 1002 in actual_driver_ids, "driver 1002 should be in results"

    # Verify timestamps fall within the requested range.
    # Both sides are normalised to UTC-aware timestamps first: pandas refuses to
    # compare tz-naive with tz-aware values, and start_date/end_date are naive
    # here while the returned column may be tz-aware. Naive values are read as
    # UTC on both sides, so the comparison stays in a single frame of reference.
    # NOTE(review): presumably the store also treats naive timestamps as UTC --
    # confirm against the offline store implementation.
    range_start = pd.to_datetime(start_date, utc=True)
    range_end = pd.to_datetime(end_date, utc=True)
    for ts_val in pd.to_datetime(actual_df["event_timestamp"], utc=True):
        assert ts_val >= range_start, f"Timestamp {ts_val} before start_date"
        assert ts_val <= range_end, f"Timestamp {ts_val} after end_date"

    # The latest features must be present -- this is the critical regression check.
    # With the old bug (using start_date instead of end_date), the synthetic
    # entity_df carried the wrong event-timestamp bound, causing the latest rows
    # to be missed.
    # NOTE(review): the docstring names min_event_timestamp while this comment
    # originally said max_event_timestamp -- confirm which bound was affected.
    actual_trips = set(actual_df["avg_daily_trips"].tolist())
    assert 30 in actual_trips, "Latest trip value 30 for driver 1001 should be present"
    assert 300 in actual_trips, "Latest trip value 300 for driver 1002 should be present"

0 commit comments

Comments
 (0)