Skip to content

Commit f376ea2

Browse files
committed
feat: Horizontal scaling support for the Feast operator
Signed-off-by: ntkathole <nikhilkathole2683@gmail.com>
1 parent 406ebcb commit f376ea2

File tree

16 files changed

+5457
-1760
lines changed

16 files changed

+5457
-1760
lines changed

.secrets.baseline

Lines changed: 10 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,10 @@
9090
{
9191
"path": "detect_secrets.filters.allowlist.is_line_allowlisted"
9292
},
93+
{
94+
"path": "detect_secrets.filters.common.is_baseline_file",
95+
"filename": ".secrets.baseline"
96+
},
9397
{
9498
"path": "detect_secrets.filters.common.is_ignored_due_to_verification_policies",
9599
"min_level": 2
@@ -930,7 +934,7 @@
930934
"filename": "infra/feast-operator/api/v1/featurestore_types.go",
931935
"hashed_secret": "44e17306b837162269a410204daaa5ecee4ec22c",
932936
"is_verified": false,
933-
"line_number": 657
937+
"line_number": 692
934938
}
935939
],
936940
"infra/feast-operator/api/v1/zz_generated.deepcopy.go": [
@@ -939,21 +943,21 @@
939943
"filename": "infra/feast-operator/api/v1/zz_generated.deepcopy.go",
940944
"hashed_secret": "f914fc9324de1bec1ad13dec94a8ea2ddb41fc87",
941945
"is_verified": false,
942-
"line_number": 615
946+
"line_number": 658
943947
},
944948
{
945949
"type": "Secret Keyword",
946950
"filename": "infra/feast-operator/api/v1/zz_generated.deepcopy.go",
947951
"hashed_secret": "44e17306b837162269a410204daaa5ecee4ec22c",
948952
"is_verified": false,
949-
"line_number": 1123
953+
"line_number": 1206
950954
},
951955
{
952956
"type": "Secret Keyword",
953957
"filename": "infra/feast-operator/api/v1/zz_generated.deepcopy.go",
954958
"hashed_secret": "c2028031c154bbe86fd69bef740855c74b927dcf",
955959
"is_verified": false,
956-
"line_number": 1128
960+
"line_number": 1211
957961
}
958962
],
959963
"infra/feast-operator/api/v1alpha1/featurestore_types.go": [
@@ -1152,7 +1156,7 @@
11521156
"filename": "infra/feast-operator/internal/controller/services/services.go",
11531157
"hashed_secret": "36dc326eb15c7bdd8d91a6b87905bcea20b637d1",
11541158
"is_verified": false,
1155-
"line_number": 164
1159+
"line_number": 178
11561160
}
11571161
],
11581162
"infra/feast-operator/internal/controller/services/tls_test.go": [
@@ -1535,5 +1539,5 @@
15351539
}
15361540
]
15371541
},
1538-
"generated_at": "2026-02-19T06:53:49Z"
1542+
"generated_at": "2026-02-21T16:33:24Z"
15391543
}

docs/how-to-guides/feast-on-kubernetes.md

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,9 @@ spec:
6565
> _More advanced FeatureStore CR examples can be found in the feast-operator [samples directory](../../infra/feast-operator/config/samples)._
6666
6767
{% hint style="success" %}
68-
Important note: Scaling a Feature Store Deployment should only be done if the configured data store(s) will support it.
68+
**Scaling:** The Feast Operator supports horizontal scaling via static replicas, HPA autoscaling, or external autoscalers like [KEDA](https://keda.sh). Scaling requires DB-backed persistence for all enabled services.
6969
70-
Please check the how-to guide for some specific recommendations on [how to scale Feast](./scaling-feast.md).
70+
See the [Horizontal Scaling with the Feast Operator](./scaling-feast.md#horizontal-scaling-with-the-feast-operator) guide for configuration details, or check the general recommendations on [how to scale Feast](./scaling-feast.md).
7171
{% endhint %}
72+
73+
> _Sample scaling CRs are available at [`v1_featurestore_scaling_static.yaml`](../../infra/feast-operator/config/samples/v1_featurestore_scaling_static.yaml) and [`v1_featurestore_scaling_hpa.yaml`](../../infra/feast-operator/config/samples/v1_featurestore_scaling_hpa.yaml)._

docs/how-to-guides/scaling-feast.md

Lines changed: 154 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,4 +23,157 @@ However, this process does not scale for large data sets, since it's executed on
2323
Feast supports pluggable [Compute Engines](../getting-started/components/compute-engine.md), that allow the materialization process to be scaled up.
2424
Aside from the local process, Feast supports a [Lambda-based materialization engine](https://rtd.feast.dev/en/master/#alpha-lambda-based-engine), and a [Bytewax-based materialization engine](https://rtd.feast.dev/en/master/#bytewax-engine).
2525

26-
Users may also be able to build an engine to scale up materialization using existing infrastructure in their organizations.
26+
Users may also be able to build an engine to scale up materialization using existing infrastructure in their organizations.
27+
28+
### Horizontal Scaling with the Feast Operator
29+
30+
When running Feast on Kubernetes with the [Feast Operator](./feast-on-kubernetes.md), you can horizontally scale the FeatureStore deployment by adding a `scaling` field to the `services` section of the FeatureStore CR.
31+
32+
**Prerequisites:** Horizontal scaling requires **DB-backed persistence** for all enabled services (online store, offline store, and registry). File-based persistence (SQLite, DuckDB, `registry.db`) is incompatible with multiple replicas because these backends do not support concurrent access from multiple pods.
33+
34+
#### Static Replicas
35+
36+
Set a fixed number of replicas:
37+
38+
```yaml
39+
apiVersion: feast.dev/v1
40+
kind: FeatureStore
41+
metadata:
42+
name: sample-scaling
43+
spec:
44+
feastProject: my_project
45+
services:
46+
scaling:
47+
replicas: 3
48+
onlineStore:
49+
persistence:
50+
store:
51+
type: postgres
52+
secretRef:
53+
name: feast-data-stores
54+
registry:
55+
local:
56+
persistence:
57+
store:
58+
type: sql
59+
secretRef:
60+
name: feast-data-stores
61+
```
62+
63+
#### Autoscaling with HPA
64+
65+
Configure a HorizontalPodAutoscaler to dynamically scale based on metrics:
66+
67+
```yaml
68+
apiVersion: feast.dev/v1
69+
kind: FeatureStore
70+
metadata:
71+
name: sample-autoscaling
72+
spec:
73+
feastProject: my_project
74+
services:
75+
scaling:
76+
autoscaling:
77+
minReplicas: 2
78+
maxReplicas: 10
79+
metrics:
80+
- type: Resource
81+
resource:
82+
name: cpu
83+
target:
84+
type: Utilization
85+
averageUtilization: 70
86+
onlineStore:
87+
persistence:
88+
store:
89+
type: postgres
90+
secretRef:
91+
name: feast-data-stores
92+
server:
93+
resources:
94+
requests:
95+
cpu: 200m
96+
memory: 256Mi
97+
registry:
98+
local:
99+
persistence:
100+
store:
101+
type: sql
102+
secretRef:
103+
name: feast-data-stores
104+
```
105+
106+
{% hint style="info" %}
107+
When autoscaling is configured, the operator automatically sets the deployment strategy to `RollingUpdate` (instead of the default `Recreate`) to ensure zero-downtime scaling. You can override this by explicitly setting `deploymentStrategy` in the CR.
108+
{% endhint %}
109+
110+
#### Validation Rules
111+
112+
The operator enforces the following rules:
113+
- `replicas` and `autoscaling` are **mutually exclusive** -- you cannot set both.
114+
- Scaling with `replicas > 1` or any `autoscaling` config is **rejected** if any enabled service uses file-based persistence.
115+
- S3 (`s3://`) and GCS (`gs://`) backed registry file persistence is allowed with scaling, since these object stores support concurrent readers.
116+
117+
#### Using KEDA (Kubernetes Event-Driven Autoscaling)
118+
119+
[KEDA](https://keda.sh) is also supported as an external autoscaler. Rather than using the built-in `autoscaling` field, you can create a KEDA `ScaledObject` that targets the Feast deployment directly.
120+
121+
When using KEDA, do **not** set the `scaling.autoscaling` field -- KEDA manages its own HPA. The operator will preserve the replica count set by KEDA since it does not override externally managed replicas.
122+
123+
There are a few things you must configure manually when using KEDA:
124+
125+
1. **Set the deployment strategy to `RollingUpdate`** -- The operator defaults to `Recreate` when no `scaling` config is present, which causes downtime on scale events. Override it explicitly:
126+
127+
```yaml
128+
apiVersion: feast.dev/v1
129+
kind: FeatureStore
130+
metadata:
131+
name: sample-keda
132+
spec:
133+
feastProject: my_project
134+
services:
135+
deploymentStrategy:
136+
type: RollingUpdate
137+
onlineStore:
138+
persistence:
139+
store:
140+
type: postgres
141+
secretRef:
142+
name: feast-data-stores
143+
registry:
144+
local:
145+
persistence:
146+
store:
147+
type: sql
148+
secretRef:
149+
name: feast-data-stores
150+
```
151+
152+
2. **Ensure DB-backed persistence** -- The operator's persistence validation only applies when the built-in `scaling` field is used. With KEDA, you are responsible for ensuring all enabled services use DB-backed persistence (not SQLite, DuckDB, or local `registry.db`).
153+
154+
3. **Create a KEDA `ScaledObject`** targeting the Feast deployment:
155+
156+
```yaml
157+
apiVersion: keda.sh/v1alpha1
158+
kind: ScaledObject
159+
metadata:
160+
name: feast-scaledobject
161+
spec:
162+
scaleTargetRef:
163+
name: feast-sample-keda # must match the Feast deployment name
164+
minReplicaCount: 2
165+
maxReplicaCount: 10
166+
triggers:
167+
- type: prometheus
168+
metadata:
169+
serverAddress: http://prometheus.monitoring.svc:9090
170+
metricName: http_requests_total
171+
query: sum(rate(http_requests_total{service="feast"}[2m]))
172+
threshold: "100"
173+
```
174+
175+
{% hint style="warning" %}
176+
KEDA-created HPAs are not owned by the Feast operator. The operator will not interfere with them, but it also will not clean them up if the FeatureStore CR is deleted. You must manage the KEDA `ScaledObject` lifecycle independently.
177+
{% endhint %}
178+
179+
For the full API reference, see the [FeatureStore CRD reference](../../infra/feast-operator/docs/api/markdown/ref.md).

infra/feast-operator/api/v1/featurestore_types.go

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@ package v1
1818

1919
import (
2020
appsv1 "k8s.io/api/apps/v1"
21+
autoscalingv2 "k8s.io/api/autoscaling/v2"
2122
batchv1 "k8s.io/api/batch/v1"
2223
corev1 "k8s.io/api/core/v1"
2324
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
@@ -301,6 +302,40 @@ type FeatureStoreServices struct {
301302
DisableInitContainers bool `json:"disableInitContainers,omitempty"`
302303
// Volumes specifies the volumes to mount in the FeatureStore deployment. A corresponding `VolumeMount` should be added to whichever feast service(s) require access to said volume(s).
303304
Volumes []corev1.Volume `json:"volumes,omitempty"`
305+
// Scaling configures horizontal scaling for the FeatureStore deployment.
306+
// Requires DB-backed persistence for all enabled services when replicas > 1 or autoscaling is configured.
307+
Scaling *ScalingConfig `json:"scaling,omitempty"`
308+
}
309+
310+
// ScalingConfig configures horizontal scaling for the FeatureStore deployment.
311+
// +kubebuilder:validation:XValidation:rule="!has(self.replicas) || !has(self.autoscaling)",message="replicas and autoscaling are mutually exclusive."
312+
type ScalingConfig struct {
313+
// Replicas is the static number of pod replicas. Mutually exclusive with autoscaling.
314+
// +kubebuilder:validation:Minimum=1
315+
// +optional
316+
Replicas *int32 `json:"replicas,omitempty"`
317+
// Autoscaling configures a HorizontalPodAutoscaler for the FeatureStore deployment.
318+
// Mutually exclusive with replicas.
319+
// +optional
320+
Autoscaling *AutoscalingConfig `json:"autoscaling,omitempty"`
321+
}
322+
323+
// AutoscalingConfig defines HPA settings for the FeatureStore deployment.
324+
type AutoscalingConfig struct {
325+
// MinReplicas is the lower limit for the number of replicas. Defaults to 1.
326+
// +kubebuilder:validation:Minimum=1
327+
// +optional
328+
MinReplicas *int32 `json:"minReplicas,omitempty"`
329+
// MaxReplicas is the upper limit for the number of replicas. Required.
330+
// +kubebuilder:validation:Minimum=1
331+
MaxReplicas int32 `json:"maxReplicas"`
332+
// Metrics contains the specifications used to calculate the desired replica count.
333+
// If not set, defaults to 80% CPU utilization.
334+
// +optional
335+
Metrics []autoscalingv2.MetricSpec `json:"metrics,omitempty"`
336+
// Behavior configures the scaling behavior of the target.
337+
// +optional
338+
Behavior *autoscalingv2.HorizontalPodAutoscalerBehavior `json:"behavior,omitempty"`
304339
}
305340

306341
// OfflineStore configures the offline store service
@@ -690,6 +725,16 @@ type FeatureStoreStatus struct {
690725
FeastVersion string `json:"feastVersion,omitempty"`
691726
Phase string `json:"phase,omitempty"`
692727
ServiceHostnames ServiceHostnames `json:"serviceHostnames,omitempty"`
728+
// ScalingStatus reports the current scaling state of the FeatureStore deployment.
729+
ScalingStatus *ScalingStatus `json:"scalingStatus,omitempty"`
730+
}
731+
732+
// ScalingStatus reports the observed scaling state.
733+
type ScalingStatus struct {
734+
// CurrentReplicas is the current number of pod replicas.
735+
CurrentReplicas int32 `json:"currentReplicas,omitempty"`
736+
// DesiredReplicas is the desired number of pod replicas.
737+
DesiredReplicas int32 `json:"desiredReplicas,omitempty"`
693738
}
694739

695740
// ServiceHostnames defines the service hostnames in the format of <domain>:<port>, e.g. example.svc.cluster.local:80

0 commit comments

Comments
 (0)