---
title: Scaling the Feast Feature Server on Kubernetes
description: The Feast Operator now supports horizontal scaling with static replicas, HPA autoscaling, and external autoscalers like KEDA — enabling production-grade, high-availability feature serving.
date: 2026-02-21
authors: ["Nikhil Kathole"]
---

# Scaling the Feast Feature Server on Kubernetes

As ML systems move from experimentation to production, the feature server often becomes a critical bottleneck. A single-replica deployment might handle development traffic, but production workloads — real-time inference, batch scoring, multiple consuming services — demand the ability to scale horizontally.

We're excited to announce that the Feast Operator now supports **horizontal scaling** for the FeatureStore deployment, giving teams the tools to run Feast at production scale on Kubernetes.

# The Problem: Single-Replica Limitations

By default, the Feast Operator deploys a single-replica Deployment. This works well for getting started, but presents challenges as traffic grows:

- **Single point of failure** — one pod crash means downtime for all feature consumers
- **Throughput ceiling** — a single pod can only handle so many concurrent requests
- **No elasticity** — traffic spikes (model retraining, batch inference) can overwhelm the server
- **Rolling updates cause downtime** — the default `Recreate` strategy tears down the old pod before starting a new one

Teams have been manually patching Deployments or creating external HPAs, but this bypasses the operator's reconciliation loop and can lead to configuration drift.
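
For example, a common workaround has been to scale the operator-managed Deployment by hand (the Deployment name below is illustrative; check what the operator created in your namespace):

```bash
# Manually bump the replica count on the operator-managed Deployment
kubectl scale deployment/feast-my-project --replicas=3

# Or bolt an HPA onto the Deployment out-of-band
kubectl autoscale deployment/feast-my-project --min=2 --max=10 --cpu-percent=70
```

Neither change is recorded in the FeatureStore CR, so it is invisible to the operator and drifts from the spec you actually version-control.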

# The Solution: Native Scaling Support

The Feast Operator now supports three scaling modes through a new `scaling` field in the FeatureStore CR:

## 1. Static Replicas

The simplest approach — set a fixed number of replicas:

```yaml
apiVersion: feast.dev/v1
kind: FeatureStore
metadata:
  name: production-feast
spec:
  feastProject: my_project
  services:
    scaling:
      replicas: 3
    onlineStore:
      persistence:
        store:
          type: postgres
          secretRef:
            name: feast-data-stores
    registry:
      local:
        persistence:
          store:
            type: sql
            secretRef:
              name: feast-data-stores
```

This gives you high availability and load distribution with a predictable resource footprint. The operator automatically switches the Deployment strategy to `RollingUpdate`, ensuring zero-downtime deployments.
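
Once the CR is applied, you can confirm that the operator set the replica count and flipped the strategy (the Deployment name here assumes the operator's default naming for a CR called `production-feast`):

```bash
# Expect "RollingUpdate 3" once reconciliation has completed
kubectl get deployment feast-production-feast -o jsonpath='{.spec.strategy.type} {.spec.replicas}{"\n"}'
```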

## 2. HPA Autoscaling

For workloads with variable traffic patterns, the operator can create and manage a `HorizontalPodAutoscaler` directly:

```yaml
apiVersion: feast.dev/v1
kind: FeatureStore
metadata:
  name: autoscaled-feast
spec:
  feastProject: my_project
  services:
    scaling:
      autoscaling:
        minReplicas: 2
        maxReplicas: 10
        metrics:
          - type: Resource
            resource:
              name: cpu
              target:
                type: Utilization
                averageUtilization: 70
    onlineStore:
      persistence:
        store:
          type: postgres
          secretRef:
            name: feast-data-stores
      server:
        resources:
          requests:
            cpu: 200m
            memory: 256Mi
          limits:
            cpu: "1"
            memory: 1Gi
    registry:
      local:
        persistence:
          store:
            type: sql
            secretRef:
              name: feast-data-stores
```

The operator creates the HPA as an owned resource — it's automatically cleaned up if you remove the autoscaling configuration or delete the FeatureStore CR. If no custom metrics are specified, the operator defaults to **80% CPU utilization**.
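
If you just want CPU-based autoscaling with that default, the `metrics` block can be omitted entirely:

```yaml
services:
  scaling:
    autoscaling:
      minReplicas: 2
      maxReplicas: 10
      # no metrics specified, so the operator falls back to 80% average CPU utilization
```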

## 3. External Autoscalers (KEDA, Custom HPAs)

For teams using [KEDA](https://keda.sh) or other external autoscalers, the operator is designed to stay out of the way. When no `scaling` field is set, the operator preserves whatever replica count an external controller sets on the Deployment.

To use KEDA, configure the FeatureStore with an explicit `RollingUpdate` strategy and DB-backed persistence, then create a KEDA `ScaledObject` targeting the Feast Deployment:

```yaml
apiVersion: feast.dev/v1
kind: FeatureStore
metadata:
  name: keda-feast
spec:
  feastProject: my_project
  services:
    deploymentStrategy:
      type: RollingUpdate
    onlineStore:
      persistence:
        store:
          type: postgres
          secretRef:
            name: feast-data-stores
    registry:
      local:
        persistence:
          store:
            type: sql
            secretRef:
              name: feast-data-stores
---
apiVersion: keda.sh/v1alpha1
kind: ScaledObject
metadata:
  name: feast-scaledobject
spec:
  scaleTargetRef:
    name: feast-keda-feast  # matches the Feast deployment name
  minReplicaCount: 2
  maxReplicaCount: 10
  triggers:
    - type: prometheus
      metadata:
        serverAddress: http://prometheus.monitoring.svc:9090
        metricName: http_requests_total
        query: sum(rate(http_requests_total{service="feast"}[2m]))
        threshold: "100"
```

This gives you the full power of KEDA's 50+ event-driven triggers while the operator manages the rest of the Feast deployment lifecycle.
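
After applying both resources, KEDA takes over replica management; you can watch it work with standard tooling (KEDA creates and owns an HPA behind the scenes):

```bash
# ScaledObject status, plus the HPA that KEDA manages for it
kubectl get scaledobject feast-scaledobject
kubectl get hpa

# Replica count as KEDA scales the Feast Deployment up and down
kubectl get deployment feast-keda-feast -w
```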

# Safety First: Persistence Validation

Not all persistence backends are safe for multi-replica deployments. File-based stores like SQLite, DuckDB, and local `registry.db` use single-writer file locks that don't work across pods.

The operator enforces this at reconciliation time — if you configure scaling with file-based persistence, you'll get a clear error:

```
horizontal scaling (replicas > 1 or autoscaling) requires DB-backed persistence
for all enabled services. File-based persistence (SQLite, DuckDB, registry.db)
is incompatible with multiple replicas
```

This validation applies to all enabled services (online store, offline store, and registry). Object-store-backed registry paths (`s3://` and `gs://`) are treated as safe since they support concurrent readers.

| Persistence Type | Compatible with Scaling? |
|---|---|
| PostgreSQL / MySQL | Yes |
| Redis | Yes |
| Cassandra | Yes |
| SQL-based Registry | Yes |
| S3/GCS Registry | Yes |
| SQLite | No |
| DuckDB | No |
| Local `registry.db` | No |
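
For example, a CR like the following would be rejected at reconciliation time, assuming it relies on the default file-based online store and registry (illustrative snippet):

```yaml
apiVersion: feast.dev/v1
kind: FeatureStore
metadata:
  name: invalid-scaling
spec:
  feastProject: my_project
  services:
    scaling:
      replicas: 3
    # No DB-backed persistence is configured, so the online store and registry
    # fall back to file-based defaults and the validation above fails.
```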

# How It Works Under the Hood

The implementation adds three key behaviors to the operator's reconciliation loop:

**1. Replica management** — When static replicas are configured, the operator sets them on the Deployment. When HPA is configured, the operator leaves the `replicas` field unset so the HPA controller can manage it. When neither is configured, existing replicas are preserved (supporting external autoscalers).

**2. Deployment strategy** — The operator automatically switches from `Recreate` (the default for single-replica) to `RollingUpdate` when scaling is enabled. This prevents the "kill-all-pods-then-start-new-ones" behavior that would cause downtime during scaling events. Users can always override this with an explicit `deploymentStrategy` in the CR.

**3. HPA lifecycle** — The operator creates, updates, and deletes the HPA as an owned resource tied to the FeatureStore CR. Removing the `autoscaling` configuration automatically cleans up the HPA.
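
As a sketch of the override mentioned in point 2, an explicit strategy can be set alongside `scaling` (this assumes the CR's `deploymentStrategy` field mirrors the standard Kubernetes Deployment strategy type):

```yaml
services:
  scaling:
    replicas: 3
  deploymentStrategy:
    type: RollingUpdate
    rollingUpdate:
      maxSurge: 1        # bring one extra pod up before taking an old one down
      maxUnavailable: 0  # never dip below the desired replica count during a rollout
```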

The scaling status is reported back on the FeatureStore status:

```yaml
status:
  scalingStatus:
    currentReplicas: 3
    desiredReplicas: 3
```

# What About TLS, CronJobs, and Services?

Scaling is designed to work seamlessly with existing operator features:

- **TLS** — Each pod mounts the same TLS secret. OpenShift service-serving certificates work automatically since they're bound to the Service, not individual pods.
- **Kubernetes Services** — The Service's label selector already matches all pods in the Deployment, so load balancing across replicas works out of the box.
- **CronJobs** — The `feast apply` and `feast materialize-incremental` CronJobs run `kubectl exec` against a single pod. Since DB-backed persistence is required for scaling, all pods share the same state — it doesn't matter which pod the CronJob runs against.
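
To confirm that requests are being spread across all replicas, you can inspect the endpoints behind the operator-managed Service (the label selector below follows the convention used elsewhere in this post; your Service labels may differ):

```bash
# Each ready pod should appear as an address on the online-serving Service
kubectl get endpoints -l app.kubernetes.io/managed-by=feast -o wide
```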

# Getting Started

**1. Ensure DB-backed persistence** for all enabled services (online store, offline store, registry).

**2. Add a `scaling` field** to your FeatureStore CR:

```yaml
services:
  scaling:
    replicas: 3          # static replicas
    # -- OR --
    # autoscaling:       # HPA
    #   minReplicas: 2
    #   maxReplicas: 10
```

**3. Apply** the updated CR:

```bash
kubectl apply -f my-featurestore.yaml
```

**4. Verify** the scaling:

```bash
# Check pods
kubectl get pods -l app.kubernetes.io/managed-by=feast

# Check HPA (if using autoscaling)
kubectl get hpa

# Check FeatureStore status
kubectl get feast -o yaml
```

# Learn More

- [Scaling Feast documentation](https://docs.feast.dev/how-to-guides/scaling-feast)
- [Feast on Kubernetes guide](https://docs.feast.dev/how-to-guides/feast-on-kubernetes)
- [FeatureStore CRD API reference](https://github.com/feast-dev/feast/blob/master/infra/feast-operator/docs/api/markdown/ref.md)
- [Sample CRs for static scaling and HPA](https://github.com/feast-dev/feast/tree/master/infra/feast-operator/config/samples)
- Join the [Feast Slack](https://slack.feast.dev) to share feedback and ask questions

We're excited to see teams scale their feature serving infrastructure with confidence. Try it out and let us know how it works for your use case!