feast-dev
diff --git a/‎sdk/python/feast/feature_store.py‎
Lines changed: 127 additions & 31 deletions b/‎sdk/python/feast/feature_store.py‎
Lines changed: 127 additions & 31 deletions
diff --git a/‎sdk/python/feast/mlflow_integration/__init__.py‎
Lines changed: 2 additions & 6 deletions b/‎sdk/python/feast/mlflow_integration/__init__.py‎
Lines changed: 2 additions & 6 deletions
diff --git a/‎sdk/python/feast/mlflow_integration/config.py‎
Lines changed: 6 additions & 19 deletions b/‎sdk/python/feast/mlflow_integration/config.py‎
Lines changed: 6 additions & 19 deletions
diff --git a/‎sdk/python/feast/mlflow_integration/entity_df_builder.py‎
Lines changed: 19 additions & 29 deletions b/‎sdk/python/feast/mlflow_integration/entity_df_builder.py‎
Lines changed: 19 additions & 29 deletions
@@ -14,6 +14,7 @@
 import asyncio
 import copy
 import itertools
+import logging
 import os
 import time
 import warnings
@@ -109,18 +110,25 @@
 _mlflow_log_fn_loaded = False
 
 
+_logger = logging.getLogger(__name__)
+
+
 def _get_mlflow_log_fn():
     """Lazy-import mlflow logger only when MLflow integration is configured."""
     global _mlflow_log_fn, _mlflow_log_fn_loaded
     if not _mlflow_log_fn_loaded:
-        _mlflow_log_fn_loaded = True
+        # The loaded flag is set per outcome below instead of up front, so
+        # only a definitive result (success or ImportError) is cached.
         try:
             from feast.mlflow_integration.logger import (
                 log_feature_retrieval_to_mlflow,
             )
 
             _mlflow_log_fn = log_feature_retrieval_to_mlflow
-        except Exception:
+            _mlflow_log_fn_loaded = True
+        except ImportError:
+            # The logger cannot be imported: cache the miss so later calls
+            # return None without attempting the import again.
+            _mlflow_log_fn_loaded = True
+            _mlflow_log_fn = None
+        except Exception as e:
+            # Any other failure leaves the loaded flag unset, so the import
+            # is attempted (and this warning emitted) again on the next call.
+            _logger.warning("MLflow auto-log import failed (will retry): %s", e)
             _mlflow_log_fn = None
     return _mlflow_log_fn
 
@@ -213,19 +221,20 @@ def __init__(
         # Initialize feature service cache for performance optimization
         self._feature_service_cache = {}
 
+        # Cache for _resolve_feature_service_name lookups
+        self._fs_name_cache: Dict[frozenset, Optional[str]] = {}
+
         # Configure MLflow tracking URI globally from config
         self._init_mlflow_tracking()
 
     def _init_mlflow_tracking(self):
         """Configure MLflow globally from feature_store.yaml.
 
-        Sets the tracking URI and experiment name so the user never needs
-        to call mlflow.set_tracking_uri() or mlflow.set_experiment() in
-        their scripts.  The experiment is named after the Feast project.
+        Sets the tracking URI and experiment name.
+        The experiment is named after the Feast project.
 
         When no tracking_uri is specified, defaults to http://127.0.0.1:5000
-        (a local MLflow tracking server). This ensures that train.py,
-        predict.py, feast ui, and the MLflow UI all share the same backend.
+        (a local MLflow tracking server).
         """
         try:
             mlflow_cfg = self.config.mlflow
@@ -242,24 +251,92 @@ def _init_mlflow_tracking(self):
         except Exception as e:
             warnings.warn(f"Failed to configure MLflow tracking: {e}")
 
-    def _resolve_feature_service_name(
-        self, feature_refs: List[str]
-    ) -> Optional[str]:
-        """Try to find a feature service that covers the given feature refs."""
+    def _resolve_feature_service_name(self, feature_refs: List[str]) -> Optional[str]:
+        """Find the best-matching feature service for the given feature refs.
+
+        Resolution: exact match wins immediately; otherwise the smallest
+        superset (fewest extra features) is returned.  Results are cached
+        per FeatureStore instance for O(1) repeated lookups.
+
+        Args:
+            feature_refs: Feature references compared against each service's
+                ``"<projection name>:<feature name>"`` strings.  Order and
+                duplicates are ignored.
+
+        Returns:
+            The name of the matching feature service, or None when no
+            service covers all refs or when the lookup raises (the error is
+            logged at DEBUG level and nothing is cached in that case).
+        """
         try:
-            ref_set = set(feature_refs)
+            ref_key = frozenset(feature_refs)
+            if ref_key in self._fs_name_cache:
+                return self._fs_name_cache[ref_key]
+
+            best_match = None
+            best_extra = float("inf")
+
             for fs in self.registry.list_feature_services(
                 self.project, allow_cache=True
             ):
-                fs_refs = set()
-                for proj in fs.feature_view_projections:
-                    for feat in proj.features:
-                        fs_refs.add(f"{proj.name}:{feat.name}")
-                if ref_set == fs_refs or ref_set.issubset(fs_refs):
+                fs_refs = frozenset(
+                    f"{p.name}:{f.name}"
+                    for p in fs.feature_view_projections
+                    for f in p.features
+                )
+                if ref_key == fs_refs:
+                    self._fs_name_cache[ref_key] = fs.name
                     return fs.name
-        except Exception:
-            pass
-        return None
+                if ref_key.issubset(fs_refs):
+                    extra = len(fs_refs) - len(ref_key)
+                    # Strict "<" keeps the first-listed service among
+                    # supersets with the same number of extra features.
+                    if extra < best_extra:
+                        best_match = fs.name
+                        best_extra = extra
+
+            # A miss (None) is cached as well as a hit.
+            # NOTE(review): no eviction is visible in this hunk -- confirm
+            # the cache is reset when feature services change in the registry.
+            self._fs_name_cache[ref_key] = best_match
+            return best_match
+        except Exception as e:
+            _logger.debug("Failed to resolve feature service name: %s", e)
+            return None
+
+    def _auto_log_entity_df_info(self, entity_df, start_date=None, end_date=None):
+        """Record how the entity set was specified on the active MLflow run.
+
+        Best-effort: any failure is logged at DEBUG level and swallowed, and
+        nothing is logged when there is no active MLflow run.
+
+        Handles three entity_df types:
+        - pd.DataFrame: logs row count and column names as params, and saves
+          the frame as an ``entity_df.parquet`` artifact when it has at most
+          100k rows
+        - str (SQL query): logs the query as a param
+        - None (range-based): logs start_date/end_date when either is given
+
+        Param values longer than 490 characters are cut to 487 characters
+        followed by ``...``.
+
+        Args:
+            entity_df: The entity DataFrame, SQL query string, or None.
+            start_date: Start of the retrieval range, logged via ``str()``.
+            end_date: End of the retrieval range, logged via ``str()``.
+        """
+
+        def _clip(value, limit=490):
+            # Shorten long param values; presumably sized to stay under
+            # MLflow's param-value length limit -- TODO confirm.
+            return value if len(value) <= limit else value[: limit - 3] + "..."
+
+        try:
+            import mlflow
+
+            # Read the active run once so the None check and the run_id
+            # lookup always refer to the same run object.
+            run = mlflow.active_run()
+            if run is None:
+                return
+            run_id = run.info.run_id
+            tracking_uri = self.config.mlflow.tracking_uri or "http://127.0.0.1:5000"
+            client = mlflow.MlflowClient(tracking_uri=tracking_uri)
+
+            if isinstance(entity_df, str):
+                client.log_param(run_id, "feast.entity_df_query", _clip(entity_df))
+                client.set_tag(run_id, "feast.entity_df_type", "sql")
+
+            elif isinstance(entity_df, pd.DataFrame):
+                client.set_tag(run_id, "feast.entity_df_type", "dataframe")
+                client.log_param(run_id, "feast.entity_df_rows", str(len(entity_df)))
+                # Column labels are not guaranteed to be strings; stringify
+                # them so the join cannot raise and abort the remaining logging.
+                cols = _clip(",".join(map(str, entity_df.columns)))
+                client.log_param(run_id, "feast.entity_df_columns", cols)
+
+                max_rows = 100_000
+                if len(entity_df) <= max_rows:
+                    import tempfile
+
+                    with tempfile.TemporaryDirectory() as tmp_dir:
+                        path = os.path.join(tmp_dir, "entity_df.parquet")
+                        entity_df.to_parquet(path, index=False)
+                        # Upload through the same client as the params and
+                        # tags so everything lands on one tracking server.
+                        client.log_artifact(run_id, path)
+
+            elif entity_df is None and (start_date or end_date):
+                client.set_tag(run_id, "feast.entity_df_type", "range")
+                if start_date:
+                    client.log_param(run_id, "feast.start_date", str(start_date))
+                if end_date:
+                    client.log_param(run_id, "feast.end_date", str(end_date))
+
+        except Exception as e:
+            _logger.debug("Failed to log entity_df info to MLflow: %s", e)
 
     def _init_openlineage_emitter(self) -> Optional[Any]:
         """Initialize OpenLineage emitter if configured and enabled."""
@@ -1572,11 +1649,18 @@ def get_historical_features(
             _log_fn = _get_mlflow_log_fn()
             if _log_fn is not None:
                 _duration = time.monotonic() - _retrieval_start
-                _entity_count = (
-                    len(entity_df) if isinstance(entity_df, pd.DataFrame) else 0
-                )
+                if isinstance(entity_df, pd.DataFrame):
+                    _entity_count = len(entity_df)
+                elif isinstance(entity_df, str):
+                    _entity_count = -1
+                else:
+                    _entity_count = 0
                 _fs = features if isinstance(features, FeatureService) else None
-                _fs_name = features.name if isinstance(features, FeatureService) else self._resolve_feature_service_name(_feature_refs)
+                _fs_name = (
+                    features.name
+                    if isinstance(features, FeatureService)
+                    else self._resolve_feature_service_name(_feature_refs)
+                )
                 _log_fn(
                     feature_refs=_feature_refs,
                     entity_count=_entity_count,
@@ -1588,6 +1672,11 @@ def get_historical_features(
                     tracking_uri=self.config.mlflow.tracking_uri,
                 )
 
+                if self.config.mlflow.auto_log_entity_df:
+                    self._auto_log_entity_df_info(
+                        entity_df, start_date=start_date, end_date=end_date
+                    )
+
         return job
 
     def create_saved_dataset(
@@ -2739,13 +2828,21 @@ def get_online_features(
                 _feature_refs = utils._get_features(
                     self.registry, self.project, features, allow_cache=True
                 )
-                _entity_count = (
-                    len(entity_rows)
-                    if isinstance(entity_rows, list)
-                    else 0
-                )
+                if isinstance(entity_rows, list):
+                    _entity_count = len(entity_rows)
+                elif isinstance(entity_rows, Mapping):
+                    try:
+                        _entity_count = len(next(iter(entity_rows.values())))
+                    except Exception:
+                        _entity_count = 0
+                else:
+                    _entity_count = 0
                 _fs = features if isinstance(features, FeatureService) else None
-                _fs_name = features.name if isinstance(features, FeatureService) else self._resolve_feature_service_name(_feature_refs)
+                _fs_name = (
+                    features.name
+                    if isinstance(features, FeatureService)
+                    else self._resolve_feature_service_name(_feature_refs)
+                )
                 _log_fn(
                     feature_refs=_feature_refs,
                     entity_count=_entity_count,
@@ -2756,7 +2853,6 @@ def get_online_features(
                     project=self.project,
                     tracking_uri=self.config.mlflow.tracking_uri,
                 )
-
         return response
 
     async def get_online_features_async(
 
@@ -1,9 +1,8 @@
 """
 MLflow integration for Feast Feature Store.
 
-This module provides seamless integration between Feast and MLflow for
-automatic experiment tracking of feature retrieval operations. When enabled
-in feature_store.yaml, feature metadata is logged automatically to MLflow
+This module provides seamless integration between Feast and MLflow. When enabled
+in feature_store.yaml, feature metadata is logged to MLflow
 during get_historical_features and get_online_features calls.
 
 Usage:
@@ -17,9 +16,6 @@
             tracking_uri: http://localhost:5000
             auto_log: true
 
-    Then use Feast normally - feature retrieval metadata is logged automatically
-    to any active MLflow run.
-
     For advanced use cases, the module also provides:
     - resolve_feature_service_from_model_uri: Map an MLflow model to its Feast
       feature service.
 
@@ -6,32 +6,19 @@
 
 
 class MlflowConfig(FeastBaseModel):
-    """Configuration for MLflow integration.
-
-    This enables automatic logging of feature retrieval metadata to MLflow
-    during get_historical_features and get_online_features calls.
-
-    Example configuration in feature_store.yaml:
-        mlflow:
-            enabled: true
-            tracking_uri: http://localhost:5000
-            auto_log: true
-    """
-
+    """Settings for the ``mlflow`` section of feature_store.yaml."""
+
     enabled: StrictBool = False
     """ bool: Whether MLflow integration is enabled. Defaults to False. """
 
     tracking_uri: Optional[StrictStr] = None
-    """ str: MLflow tracking URI. If not set, uses MLflow's default
-        (MLFLOW_TRACKING_URI env var or local ./mlruns). """
+    """ str: MLflow tracking URI. If not set, defaults to
+        http://127.0.0.1:5000 (local MLflow tracking server).
+        Set explicitly for remote/shared MLflow deployments. """
 
     auto_log: StrictBool = True
     """ bool: Automatically log feature retrieval metadata to the active
         MLflow run when get_historical_features or get_online_features is
         called. Defaults to True. """
 
-    auto_log_dataset: StrictBool = False
-    """ bool: When True, the training DataFrame produced by
-        get_historical_features().to_df() is logged as an MLflow dataset
-        input on the active run. Defaults to False because the DataFrame
-        can be large. """
+    # NOTE(review): this field replaces ``auto_log_dataset``; configs that
+    # still set the old key may need migrating -- confirm how FeastBaseModel
+    # treats unknown fields.
+    auto_log_entity_df: StrictBool = False
+    """ bool: When True, the input entity_df (or SQL query) is recorded in
+        the MLflow run. Defaults to False. """
@@ -18,18 +18,22 @@ def get_entity_df_from_mlflow_run(
     run_id: str,
     tracking_uri: Optional[str] = None,
     timestamp_column: str = "event_timestamp",
+    max_rows: Optional[int] = None,
 ) -> pd.DataFrame:
-    """Build an entity DataFrame from an MLflow run's artifacts or params.
+    """Build an entity DataFrame from an MLflow run's artifacts.
 
     Convention: the run should have an artifact named ``entity_df.parquet``
-    (or ``entity_df.csv``).  Alternatively, a run param
-    ``feast.entity_df_path`` pointing to a local/remote file path.
+    (or ``entity_df.csv``), saved automatically when
+    ``auto_log_entity_df: true`` is set in ``feature_store.yaml``.
 
     Args:
         run_id: The MLflow run ID.
         tracking_uri: Optional MLflow tracking URI.
         timestamp_column: Expected name of the timestamp column in the
             entity DataFrame.
+        max_rows: Optional limit on number of rows to return.  The
+            artifact is still read in full; only the first ``max_rows``
+            rows are kept, so this does not bound peak memory while loading.
 
     Returns:
         A ``pd.DataFrame`` suitable for passing to
@@ -47,50 +51,33 @@ def get_entity_df_from_mlflow_run(
             "mlflow is not installed. Install with: pip install feast[mlflow]"
         )
 
-    if tracking_uri:
-        mlflow.set_tracking_uri(tracking_uri)
-
-    client = mlflow.MlflowClient()
+    client = mlflow.MlflowClient(tracking_uri=tracking_uri)
 
     try:
-        run = client.get_run(run_id)
+        client.get_run(run_id)
     except MlflowException as e:
         raise FeastMlflowEntityDfError(f"Run '{run_id}' not found: {e}")
 
     # Strategy 1: artifact entity_df.parquet
     df = _try_artifact(client, run_id, "entity_df.parquet", "parquet")
     if df is not None:
+        if max_rows is not None:
+            df = df.head(max_rows)
         _validate_timestamp_col(df, timestamp_column)
         return df
 
     # Strategy 2: artifact entity_df.csv
     df = _try_artifact(client, run_id, "entity_df.csv", "csv")
     if df is not None:
+        if max_rows is not None:
+            df = df.head(max_rows)
         _validate_timestamp_col(df, timestamp_column)
         return df
 
-    # Strategy 3: run param feast.entity_df_path
-    params = run.data.params
-    path = params.get("feast.entity_df_path")
-    if path:
-        try:
-            if path.endswith(".parquet"):
-                df = pd.read_parquet(path)
-            else:
-                df = pd.read_csv(path)
-            _validate_timestamp_col(df, timestamp_column)
-            return df
-        except FeastMlflowEntityDfError:
-            raise
-        except Exception as e:
-            raise FeastMlflowEntityDfError(
-                f"Could not load entity df from param path '{path}': {e}"
-            )
-
     raise FeastMlflowEntityDfError(
         f"No entity data found for run '{run_id}'. "
-        f"Expected artifact 'entity_df.parquet' or 'entity_df.csv', "
-        f"or param 'feast.entity_df_path'."
+        f"Expected artifact 'entity_df.parquet' or 'entity_df.csv'. "
+        f"Ensure auto_log_entity_df is enabled in feature_store.yaml."
     )
 
 
@@ -101,7 +88,10 @@ def _try_artifact(client, run_id: str, artifact_name: str, fmt: str):
         if fmt == "parquet":
             return pd.read_parquet(local_path)
         return pd.read_csv(local_path)
-    except Exception:
+    except Exception as e:
+        _logger.debug(
+            "Artifact '%s' not found for run '%s': %s", artifact_name, run_id, e
+        )
         return None