Skip to content

Commit cb2c1c6

Browse files
committed
docs: Added blog post on horizontal scaling
Signed-off-by: ntkathole <nikhilkathole2683@gmail.com>
1 parent 26069d0 commit cb2c1c6

File tree

3 files changed

+326
-9
lines changed

3 files changed

+326
-9
lines changed

infra/feast-operator/internal/controller/services/scaling_test.go

Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -192,6 +192,36 @@ var _ = Describe("Horizontal Scaling", func() {
192192
}
193193
Expect(isFilePersistence(featureStore)).To(BeFalse())
194194
})
195+
196+
It("should return true when no registry is configured (implicit file-based default)", func() {
197+
featureStore.Status.Applied.Services.Registry = nil
198+
Expect(isFilePersistence(featureStore)).To(BeTrue())
199+
})
200+
201+
It("should return true when local registry has no persistence configured", func() {
202+
featureStore.Status.Applied.Services.Registry = &feastdevv1.Registry{
203+
Local: &feastdevv1.LocalRegistryConfig{},
204+
}
205+
Expect(isFilePersistence(featureStore)).To(BeTrue())
206+
})
207+
208+
It("should return true when local registry has empty persistence", func() {
209+
featureStore.Status.Applied.Services.Registry = &feastdevv1.Registry{
210+
Local: &feastdevv1.LocalRegistryConfig{
211+
Persistence: &feastdevv1.RegistryPersistence{},
212+
},
213+
}
214+
Expect(isFilePersistence(featureStore)).To(BeTrue())
215+
})
216+
217+
It("should return false when remote registry is configured", func() {
218+
featureStore.Status.Applied.Services.Registry = &feastdevv1.Registry{
219+
Remote: &feastdevv1.RemoteRegistryConfig{
220+
Hostname: ptr("registry.example.com"),
221+
},
222+
}
223+
Expect(isFilePersistence(featureStore)).To(BeFalse())
224+
})
195225
})
196226

197227
Describe("validateScaling", func() {
@@ -229,6 +259,28 @@ var _ = Describe("Horizontal Scaling", func() {
229259
}
230260
Expect(feast.validateScaling()).To(Succeed())
231261
})
262+
263+
It("should reject scaling when no registry is configured (implicit file-based default)", func() {
264+
featureStore.Status.Applied.Services.Scaling = &feastdevv1.ScalingConfig{
265+
Replicas: ptr(int32(3)),
266+
}
267+
featureStore.Status.Applied.Services.Registry = nil
268+
err := feast.validateScaling()
269+
Expect(err).To(HaveOccurred())
270+
Expect(err.Error()).To(ContainSubstring("DB-backed persistence"))
271+
})
272+
273+
It("should succeed with scaling and remote registry", func() {
274+
featureStore.Status.Applied.Services.Scaling = &feastdevv1.ScalingConfig{
275+
Replicas: ptr(int32(3)),
276+
}
277+
featureStore.Status.Applied.Services.Registry = &feastdevv1.Registry{
278+
Remote: &feastdevv1.RemoteRegistryConfig{
279+
Hostname: ptr("registry.example.com"),
280+
},
281+
}
282+
Expect(feast.validateScaling()).To(Succeed())
283+
})
232284
})
233285

234286
Describe("validateScaling rejects file-based persistence with scaling", func() {

infra/feast-operator/internal/controller/services/util.go

Lines changed: 21 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -525,17 +525,29 @@ func isFilePersistence(featureStore *feastdevv1.FeatureStore) bool {
525525
return true
526526
}
527527

528-
if IsLocalRegistry(featureStore) &&
529-
services.Registry.Local.Persistence != nil &&
530-
services.Registry.Local.Persistence.FilePersistence != nil {
531-
// S3/GS-backed registry file persistence is safe for multi-replica
532-
if services.Registry.Local.Persistence.FilePersistence.Path != "" {
533-
path := services.Registry.Local.Persistence.FilePersistence.Path
534-
if strings.HasPrefix(path, "s3://") || strings.HasPrefix(path, "gs://") {
535-
return false
528+
// When no registry is configured, the deployment defaults to a file-based
529+
// registry (registry.db). Only a remote registry or an explicit local
530+
// registry with DB persistence is safe for multi-replica.
531+
if services.Registry == nil {
532+
return true
533+
}
534+
if isRemoteRegistry(featureStore) {
535+
return false
536+
}
537+
if IsLocalRegistry(featureStore) {
538+
if services.Registry.Local.Persistence == nil ||
539+
services.Registry.Local.Persistence.DBPersistence == nil {
540+
// S3/GS-backed registry file persistence is safe for multi-replica
541+
if services.Registry.Local.Persistence != nil &&
542+
services.Registry.Local.Persistence.FilePersistence != nil &&
543+
services.Registry.Local.Persistence.FilePersistence.Path != "" {
544+
path := services.Registry.Local.Persistence.FilePersistence.Path
545+
if strings.HasPrefix(path, "s3://") || strings.HasPrefix(path, "gs://") {
546+
return false
547+
}
536548
}
549+
return true
537550
}
538-
return true
539551
}
540552

541553
return false
Lines changed: 253 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,253 @@
1+
---
2+
title: Scaling the Feast Feature Server on Kubernetes
3+
description: The Feast Operator now supports horizontal scaling with static replicas, HPA autoscaling, and external autoscalers like KEDA — enabling production-grade, high-availability feature serving.
4+
date: 2026-02-21
5+
authors: ["Nikhil Kathole"]
6+
---
7+
8+
# Scaling the Feast Feature Server on Kubernetes
9+
10+
As ML systems move from experimentation to production, the feature server often becomes a critical bottleneck. A single-replica deployment might handle development traffic, but production workloads — real-time inference, batch scoring, multiple consuming services — demand the ability to scale horizontally.
11+
12+
We're excited to announce that the Feast Operator now supports **horizontal scaling** for the FeatureStore deployment, giving teams the tools to run Feast at production scale on Kubernetes.
13+
14+
# The Problem: Single-Replica Limitations
15+
16+
By default, the Feast Operator deploys a single-replica Deployment. This works well for getting started, but presents challenges as traffic grows:
17+
18+
- **Single point of failure** — one pod crash means downtime for all feature consumers
19+
- **Throughput ceiling** — a single pod can only handle so many concurrent requests
20+
- **No elasticity** — traffic spikes (model retraining, batch inference) can overwhelm the server
21+
- **Rolling updates cause downtime** — the default `Recreate` strategy tears down the old pod before starting a new one
22+
23+
Teams have been manually patching Deployments or creating external HPAs, but this bypasses the operator's reconciliation loop and can lead to configuration drift.
24+
25+
# The Solution: Native Scaling Support
26+
27+
The Feast Operator now supports three scaling modes through a new `scaling` field in the FeatureStore CR:
28+
29+
## 1. Static Replicas
30+
31+
The simplest approach — set a fixed number of replicas:
32+
33+
```yaml
34+
apiVersion: feast.dev/v1
35+
kind: FeatureStore
36+
metadata:
37+
name: production-feast
38+
spec:
39+
feastProject: my_project
40+
services:
41+
scaling:
42+
replicas: 3
43+
onlineStore:
44+
persistence:
45+
store:
46+
type: postgres
47+
secretRef:
48+
name: feast-data-stores
49+
registry:
50+
local:
51+
persistence:
52+
store:
53+
type: sql
54+
secretRef:
55+
name: feast-data-stores
56+
```
57+
58+
This gives you high availability and load distribution with a predictable resource footprint. The operator automatically switches the Deployment strategy to `RollingUpdate`, ensuring zero-downtime deployments.
59+
60+
## 2. HPA Autoscaling
61+
62+
For workloads with variable traffic patterns, the operator can create and manage a `HorizontalPodAutoscaler` directly:
63+
64+
```yaml
65+
apiVersion: feast.dev/v1
66+
kind: FeatureStore
67+
metadata:
68+
name: autoscaled-feast
69+
spec:
70+
feastProject: my_project
71+
services:
72+
scaling:
73+
autoscaling:
74+
minReplicas: 2
75+
maxReplicas: 10
76+
metrics:
77+
- type: Resource
78+
resource:
79+
name: cpu
80+
target:
81+
type: Utilization
82+
averageUtilization: 70
83+
onlineStore:
84+
persistence:
85+
store:
86+
type: postgres
87+
secretRef:
88+
name: feast-data-stores
89+
server:
90+
resources:
91+
requests:
92+
cpu: 200m
93+
memory: 256Mi
94+
limits:
95+
cpu: "1"
96+
memory: 1Gi
97+
registry:
98+
local:
99+
persistence:
100+
store:
101+
type: sql
102+
secretRef:
103+
name: feast-data-stores
104+
```
105+
106+
The operator creates the HPA as an owned resource — it's automatically cleaned up if you remove the autoscaling configuration or delete the FeatureStore CR. If no custom metrics are specified, the operator defaults to **80% CPU utilization**.
107+
108+
## 3. External Autoscalers (KEDA, Custom HPAs)
109+
110+
For teams using [KEDA](https://keda.sh) or other external autoscalers, the operator is designed to stay out of the way. When no `scaling` field is set, the operator preserves whatever replica count an external controller sets on the Deployment.
111+
112+
To use KEDA, configure the FeatureStore with an explicit `RollingUpdate` strategy and DB-backed persistence, then create a KEDA `ScaledObject` targeting the Feast Deployment:
113+
114+
```yaml
115+
apiVersion: feast.dev/v1
116+
kind: FeatureStore
117+
metadata:
118+
name: keda-feast
119+
spec:
120+
feastProject: my_project
121+
services:
122+
deploymentStrategy:
123+
type: RollingUpdate
124+
onlineStore:
125+
persistence:
126+
store:
127+
type: postgres
128+
secretRef:
129+
name: feast-data-stores
130+
registry:
131+
local:
132+
persistence:
133+
store:
134+
type: sql
135+
secretRef:
136+
name: feast-data-stores
137+
---
138+
apiVersion: keda.sh/v1alpha1
139+
kind: ScaledObject
140+
metadata:
141+
name: feast-scaledobject
142+
spec:
143+
scaleTargetRef:
144+
name: feast-keda-feast # matches the Feast deployment name
145+
minReplicaCount: 2
146+
maxReplicaCount: 10
147+
triggers:
148+
- type: prometheus
149+
metadata:
150+
serverAddress: http://prometheus.monitoring.svc:9090
151+
metricName: http_requests_total
152+
query: sum(rate(http_requests_total{service="feast"}[2m]))
153+
threshold: "100"
154+
```
155+
156+
This gives you the full power of KEDA's 50+ event-driven triggers while the operator manages the rest of the Feast deployment lifecycle.
157+
158+
# Safety First: Persistence Validation
159+
160+
Not all persistence backends are safe for multi-replica deployments. File-based stores like SQLite, DuckDB, and local `registry.db` use single-writer file locks that don't work across pods.
161+
162+
The operator enforces this at reconciliation time — if you configure scaling with file-based persistence, you'll get a clear error:
163+
164+
```
165+
horizontal scaling (replicas > 1 or autoscaling) requires DB-backed persistence
166+
for all enabled services. File-based persistence (SQLite, DuckDB, registry.db)
167+
is incompatible with multiple replicas
168+
```
169+
170+
This validation applies to all enabled services (online store, offline store, and registry). Object-store-backed registry paths (`s3://` and `gs://`) are treated as safe since they support concurrent readers.
171+
172+
| Persistence Type | Compatible with Scaling? |
173+
|---|---|
174+
| PostgreSQL / MySQL | Yes |
175+
| Redis | Yes |
176+
| Cassandra | Yes |
177+
| SQL-based Registry | Yes |
178+
| S3/GCS Registry | Yes |
179+
| SQLite | No |
180+
| DuckDB | No |
181+
| Local `registry.db` | No |
182+
183+
# How It Works Under the Hood
184+
185+
The implementation adds three key behaviors to the operator's reconciliation loop:
186+
187+
**1. Replica management** — When static replicas are configured, the operator sets them on the Deployment. When HPA is configured, the operator leaves the `replicas` field unset so the HPA controller can manage it. When neither is configured, existing replicas are preserved (supporting external autoscalers).
188+
189+
**2. Deployment strategy** — The operator automatically switches from `Recreate` (the default for single-replica) to `RollingUpdate` when scaling is enabled. This prevents the "kill-all-pods-then-start-new-ones" behavior that would cause downtime during rollouts and updates. Users can always override this with an explicit `deploymentStrategy` in the CR.
190+
191+
**3. HPA lifecycle** — The operator creates, updates, and deletes the HPA as an owned resource tied to the FeatureStore CR. Removing the `autoscaling` configuration automatically cleans up the HPA.
192+
193+
The scaling status is reported back on the FeatureStore status:
194+
195+
```yaml
196+
status:
197+
scalingStatus:
198+
currentReplicas: 3
199+
desiredReplicas: 3
200+
```
201+
202+
# What About TLS, CronJobs, and Services?
203+
204+
Scaling is designed to work seamlessly with existing operator features:
205+
206+
- **TLS** — Each pod mounts the same TLS secret. OpenShift service-serving certificates work automatically since they're bound to the Service, not individual pods.
207+
- **Kubernetes Services** — The Service's label selector already matches all pods in the Deployment, so load balancing across replicas works out of the box.
208+
- **CronJobs** — The `feast apply` and `feast materialize-incremental` CronJobs use `kubectl exec` to run commands in a single pod. Since DB-backed persistence is required for scaling, all pods share the same state — it doesn't matter which pod the CronJob runs against.
209+
210+
# Getting Started
211+
212+
**1. Ensure DB-backed persistence** for all enabled services (online store, offline store, registry).
213+
214+
**2. Add a `scaling` field** to your FeatureStore CR:
215+
216+
```yaml
217+
services:
218+
scaling:
219+
replicas: 3 # static replicas
220+
# -- OR --
221+
# autoscaling: # HPA
222+
# minReplicas: 2
223+
# maxReplicas: 10
224+
```
225+
226+
**3. Apply** the updated CR:
227+
228+
```bash
229+
kubectl apply -f my-featurestore.yaml
230+
```
231+
232+
**4. Verify** the scaling:
233+
234+
```bash
235+
# Check pods
236+
kubectl get pods -l app.kubernetes.io/managed-by=feast
237+
238+
# Check HPA (if using autoscaling)
239+
kubectl get hpa
240+
241+
# Check FeatureStore status
242+
kubectl get feast -o yaml
243+
```
244+
245+
# Learn More
246+
247+
- [Scaling Feast documentation](https://docs.feast.dev/how-to-guides/scaling-feast)
248+
- [Feast on Kubernetes guide](https://docs.feast.dev/how-to-guides/feast-on-kubernetes)
249+
- [FeatureStore CRD API reference](https://github.com/feast-dev/feast/blob/master/infra/feast-operator/docs/api/markdown/ref.md)
250+
- [Sample CRs for static scaling and HPA](https://github.com/feast-dev/feast/tree/master/infra/feast-operator/config/samples)
251+
- Join the [Feast Slack](https://slack.feast.dev) to share feedback and ask questions
252+
253+
We're excited to see teams scale their feature serving infrastructure with confidence. Try it out and let us know how it works for your use case!

0 commit comments

Comments
 (0)