@@ -728,3 +728,110 @@ def test_historical_features_field_mapping(
728728 actual_df ,
729729 sort_by = ["driver_id" ],
730730 )
731+
732+
@pytest.mark.integration
@pytest.mark.universal_offline_stores(only=["file"])
def test_historical_features_non_entity_retrieval(environment):
    """Check get_historical_features when called with entity_df=None and a date window.

    With no entity dataframe, retrieval is driven purely by start_date/end_date
    (the non-entity path, where a synthetic entity_df is generated internally).
    This guards against the regression in which start_date was used instead of
    end_date for min_event_timestamp in that synthetic entity_df.
    """
    store = environment.feature_store

    # Hour-aligned reference instant plus two earlier daily snapshots.
    now = datetime.now().replace(microsecond=0, second=0, minute=0)
    two_days_ago = now - timedelta(days=2)
    one_day_ago = now - timedelta(days=1)

    # Query window: starts before the oldest row and ends after the newest one.
    start_date = now - timedelta(days=3)
    end_date = now + timedelta(hours=1)

    # Each driver gets one row per snapshot, oldest first; "created" mirrors
    # "event_timestamp" for every row.
    snapshots = (two_days_ago, one_day_ago, now)
    trips_by_driver = {
        1001: (10, 20, 30),
        1002: (100, 200, 300),
    }
    rows = [
        {
            "driver_id": driver_id,
            "avg_daily_trips": trips,
            "event_timestamp": snapshot,
            "created": snapshot,
        }
        for driver_id, trip_series in trips_by_driver.items()
        for trips, snapshot in zip(trip_series, snapshots)
    ]
    driver_stats_df = pd.DataFrame(data=rows)

    # Nanosecond clock plus a random suffix keeps the destination name unique.
    destination = f"test_driver_stats_{int(time.time_ns())}_{random.randint(1000, 9999)}"
    driver_stats_data_source = environment.data_source_creator.create_data_source(
        df=driver_stats_df,
        destination_name=destination,
        timestamp_field="event_timestamp",
        created_timestamp_column="created",
    )

    driver_entity = Entity(name="driver", join_keys=["driver_id"])
    driver_fv = FeatureView(
        name="driver_stats",
        entities=[driver_entity],
        schema=[Field(name="avg_daily_trips", dtype=Int32)],
        source=driver_stats_data_source,
    )
    store.apply([driver_entity, driver_fv])

    # Non-entity retrieval: no entity_df, only the date window.
    actual_df = store.get_historical_features(
        entity_df=None,
        features=["driver_stats:avg_daily_trips"],
        full_feature_names=False,
        start_date=start_date,
        end_date=end_date,
    ).to_df()

    assert not actual_df.empty, "Result should not be empty"
    assert "avg_daily_trips" in actual_df.columns

    returned_driver_ids = set(actual_df["driver_id"].tolist())
    assert 1001 in returned_driver_ids, "driver 1001 should be in results"
    assert 1002 in returned_driver_ids, "driver 1002 should be in results"

    # Every returned event timestamp must lie inside the requested window.
    window_start = pd.Timestamp(start_date)
    window_end = pd.Timestamp(end_date)
    for raw_ts in actual_df["event_timestamp"]:
        ts_val = pd.Timestamp(raw_ts)
        assert ts_val >= window_start, f"Timestamp {ts_val} before start_date"
        assert ts_val <= window_end, f"Timestamp {ts_val} after end_date"

    # Critical regression check: the rows stamped `now` carry the latest values
    # (30 and 300); the old bug caused exactly these latest rows to be missed.
    returned_trips = set(actual_df["avg_daily_trips"].tolist())
    assert 30 in returned_trips, "Latest trip value 30 for driver 1001 should be present"
    assert 300 in returned_trips, "Latest trip value 300 for driver 1002 should be present"
0 commit comments