Skip to content
Merged
Prev Previous commit
Next Next commit
test: add integration test for non-entity retrieval
Signed-off-by: yassinnouh21 <yassinnouh21@gmail.com>
  • Loading branch information
YassinNouh21 committed Mar 16, 2026
commit 271825e3225dd4825e1285988ae421febcba3122
Original file line number Diff line number Diff line change
Expand Up @@ -728,3 +728,115 @@ def test_historical_features_field_mapping(
actual_df,
sort_by=["driver_id"],
)


@pytest.mark.integration
@pytest.mark.universal_offline_stores(only=["file"])
def test_historical_features_non_entity_retrieval(environment):
    """Test get_historical_features with entity_df=None using start_date/end_date.

    Exercises the non-entity retrieval path, where Feast builds a synthetic
    entity_df internally from the requested time window. Acts as a regression
    test for the bug where start_date was mistakenly used instead of end_date
    for min_event_timestamp in that synthetic entity_df, which dropped the
    newest feature rows.
    """
    store = environment.feature_store

    # Truncate to the hour so the fixture timestamps are stable for the run.
    now = datetime.now().replace(microsecond=0, second=0, minute=0)
    two_days_ago = now - timedelta(days=2)
    one_day_ago = now - timedelta(days=1)

    # Three daily snapshots per driver; trips grow by the driver's base amount
    # each day (1001: 10/20/30, 1002: 100/200/300), matching row order
    # driver-by-driver from oldest to newest.
    driver_stats_df = pd.DataFrame(
        data=[
            {
                "driver_id": driver_id,
                "avg_daily_trips": base_trips * day_index,
                "event_timestamp": event_ts,
                "created": event_ts,
            }
            for driver_id, base_trips in ((1001, 10), (1002, 100))
            for day_index, event_ts in enumerate(
                (two_days_ago, one_day_ago, now), start=1
            )
        ]
    )

    # Window comfortably covers every fixture row, including the "now" rows.
    start_date = now - timedelta(days=3)
    end_date = now + timedelta(hours=1)

    driver_stats_data_source = environment.data_source_creator.create_data_source(
        df=driver_stats_df,
        destination_name=f"test_driver_stats_{int(time.time_ns())}_{random.randint(1000, 9999)}",
        timestamp_field="event_timestamp",
        created_timestamp_column="created",
    )

    driver_entity = Entity(name="driver", join_keys=["driver_id"])
    driver_fv = FeatureView(
        name="driver_stats",
        entities=[driver_entity],
        schema=[Field(name="avg_daily_trips", dtype=Int32)],
        source=driver_stats_data_source,
    )

    store.apply([driver_entity, driver_fv])

    # entity_df=None triggers the synthetic-entity-df path under test.
    retrieval_job = store.get_historical_features(
        entity_df=None,
        features=["driver_stats:avg_daily_trips"],
        full_feature_names=False,
        start_date=start_date,
        end_date=end_date,
    )

    result_df = retrieval_job.to_df()

    assert not result_df.empty, "Result should not be empty"
    assert "avg_daily_trips" in result_df.columns

    returned_driver_ids = set(result_df["driver_id"].tolist())
    assert 1001 in returned_driver_ids, "driver 1001 should be in results"
    assert 1002 in returned_driver_ids, "driver 1002 should be in results"

    # Verify timestamps fall within the requested range.
    # Strip tz info to avoid tz-naive vs tz-aware comparison issues.
    window_start = pd.Timestamp(start_date).tz_localize(None)
    window_end = pd.Timestamp(end_date).tz_localize(None)
    for raw_ts in result_df["event_timestamp"]:
        ts_val = pd.Timestamp(raw_ts).tz_localize(None)
        assert ts_val >= window_start, f"Timestamp {ts_val} before start_date"
        assert ts_val <= window_end, f"Timestamp {ts_val} after end_date"

    # The latest features must be present -- this is the critical regression check.
    # With the old bug (using start_date instead of end_date), the synthetic entity_df
    # had wrong max_event_timestamp causing the latest rows to be missed.
    returned_trip_values = set(result_df["avg_daily_trips"].tolist())
    assert 30 in returned_trip_values, "Latest trip value 30 for driver 1001 should be present"
    assert 300 in returned_trip_values, (
        "Latest trip value 300 for driver 1002 should be present"
    )
Loading