Skip to content
Merged
Prev Previous commit
Next Next commit
test: add integration test for non-entity retrieval
Signed-off-by: yassinnouh21 <yassinnouh21@gmail.com>
  • Loading branch information
YassinNouh21 committed Mar 16, 2026
commit 271825e3225dd4825e1285988ae421febcba3122
Original file line number Diff line number Diff line change
Expand Up @@ -728,3 +728,115 @@ def test_historical_features_field_mapping(
actual_df,
sort_by=["driver_id"],
)


@pytest.mark.integration
@pytest.mark.universal_offline_stores(only=["file"])
def test_historical_features_non_entity_retrieval(environment):
    """Test get_historical_features with entity_df=None using start_date/end_date.

    Exercises the non-entity retrieval path, where Feast builds a synthetic
    entity_df internally from the requested time window. Acts as a regression
    test for the bug where start_date was mistakenly used instead of end_date
    for min_event_timestamp in that synthetic entity_df, which dropped the
    newest feature rows.
    """
    store = environment.feature_store

    # Truncate to the hour so the fixture timestamps are stable for the run.
    now = datetime.now().replace(microsecond=0, second=0, minute=0)
    two_days_ago = now - timedelta(days=2)
    one_day_ago = now - timedelta(days=1)

    # Three daily snapshots per driver; trips grow by the driver's base amount
    # each day (1001: 10/20/30, 1002: 100/200/300), matching row order
    # driver-by-driver from oldest to newest.
    driver_stats_df = pd.DataFrame(
        data=[
            {
                "driver_id": driver_id,
                "avg_daily_trips": base_trips * day_index,
                "event_timestamp": event_ts,
                "created": event_ts,
            }
            for driver_id, base_trips in ((1001, 10), (1002, 100))
            for day_index, event_ts in enumerate(
                (two_days_ago, one_day_ago, now), start=1
            )
        ]
    )

    # Window comfortably covers every fixture row, including the "now" rows.
    start_date = now - timedelta(days=3)
    end_date = now + timedelta(hours=1)

    driver_stats_data_source = environment.data_source_creator.create_data_source(
        df=driver_stats_df,
        destination_name=f"test_driver_stats_{int(time.time_ns())}_{random.randint(1000, 9999)}",
        timestamp_field="event_timestamp",
        created_timestamp_column="created",
    )

    driver_entity = Entity(name="driver", join_keys=["driver_id"])
    driver_fv = FeatureView(
        name="driver_stats",
        entities=[driver_entity],
        schema=[Field(name="avg_daily_trips", dtype=Int32)],
        source=driver_stats_data_source,
    )

    store.apply([driver_entity, driver_fv])

    # entity_df=None triggers the synthetic-entity-df path under test.
    retrieval_job = store.get_historical_features(
        entity_df=None,
        features=["driver_stats:avg_daily_trips"],
        full_feature_names=False,
        start_date=start_date,
        end_date=end_date,
    )

    result_df = retrieval_job.to_df()

    assert not result_df.empty, "Result should not be empty"
    assert "avg_daily_trips" in result_df.columns

    returned_driver_ids = set(result_df["driver_id"].tolist())
    assert 1001 in returned_driver_ids, "driver 1001 should be in results"
    assert 1002 in returned_driver_ids, "driver 1002 should be in results"

    # Verify timestamps fall within the requested range.
    # Strip tz info to avoid tz-naive vs tz-aware comparison issues.
    window_start = pd.Timestamp(start_date).tz_localize(None)
    window_end = pd.Timestamp(end_date).tz_localize(None)
    for raw_ts in result_df["event_timestamp"]:
        ts_val = pd.Timestamp(raw_ts).tz_localize(None)
        assert ts_val >= window_start, f"Timestamp {ts_val} before start_date"
        assert ts_val <= window_end, f"Timestamp {ts_val} after end_date"

    # The latest features must be present -- this is the critical regression check.
    # With the old bug (using start_date instead of end_date), the synthetic entity_df
    # had wrong max_event_timestamp causing the latest rows to be missed.
    returned_trip_values = set(result_df["avg_daily_trips"].tolist())
    assert 30 in returned_trip_values, "Latest trip value 30 for driver 1001 should be present"
    assert 300 in returned_trip_values, (
        "Latest trip value 300 for driver 1002 should be present"
    )
Loading