From efa1561fea71238ec7d05d536c833254d2e0a0a8 Mon Sep 17 00:00:00 2001 From: Suresh Kumar Anaparti Date: Fri, 22 May 2026 12:59:01 +0530 Subject: [PATCH 1/3] HA config dynamic flag changes and some improvements --- .../com/cloud/ha/HighAvailabilityManager.java | 10 ++-------- .../java/org/apache/cloudstack/ha/HAManager.java | 16 ++++++++-------- 2 files changed, 10 insertions(+), 16 deletions(-) diff --git a/engine/components-api/src/main/java/com/cloud/ha/HighAvailabilityManager.java b/engine/components-api/src/main/java/com/cloud/ha/HighAvailabilityManager.java index 53bfcce27038..c91182f79d32 100644 --- a/engine/components-api/src/main/java/com/cloud/ha/HighAvailabilityManager.java +++ b/engine/components-api/src/main/java/com/cloud/ha/HighAvailabilityManager.java @@ -39,7 +39,7 @@ public interface HighAvailabilityManager extends Manager { "Force High-Availability to happen even if the VM says no.", true, Cluster); ConfigKey HAWorkers = new ConfigKey<>("Advanced", Integer.class, "ha.workers", "5", - "The number of High-Availability worker threads.", true, Cluster); + "The number of High-Availability worker threads.", false, Cluster); ConfigKey InvestigateRetryInterval = new ConfigKey<>("Advanced", Integer.class, "investigate.retry.interval", "60", "The time (in seconds) between VM pings when the agent is disconnected.", true, Cluster); @@ -59,12 +59,6 @@ public interface HighAvailabilityManager extends Manager { + "that were successful and is now ready to be purged from the database (table: op_ha_work).", true, Cluster); - ConfigKey MaxRetries = new ConfigKey<>("Advanced", Integer.class, "max.retries", - "5", "The number of times to try a restart for the different Work-Types: " - + "Migrating - VMs off of a host, Destroy - a VM, Stop - a VM for storage pool migration purposes," - + " CheckStop - checks if a VM has been stopped, ForceStop - force a VM to stop even if the " - + "states don't allow it, Destroy - a VM and HA - restart a VM.", true, Cluster); - ConfigKey TimeToSleep = new ConfigKey<>("Advanced", Long.class, "time.to.sleep", "60", "The time in seconds to sleep before checking the database (table: op_ha_work) " + "for new working types (Migration, Stop, CheckStop, ForceStop, Destroy and HA), if no work items are found.", @@ -76,7 +70,7 @@ public interface HighAvailabilityManager extends Manager { true, Cluster); ConfigKey KvmHAFenceHostIfHeartbeatFailsOnStorage = new ConfigKey<>("Advanced", Boolean.class, "kvm.ha.fence.on.storage.heartbeat.failure", "false", - "Proceed fencing the host even the heartbeat failed for only one storage pool", false, ConfigKey.Scope.Zone); + "Proceed fencing the host even the heartbeat failed for only one storage pool", true, ConfigKey.Scope.Zone); enum WorkType { Migration, // Migrating VMs off of a host. diff --git a/server/src/main/java/org/apache/cloudstack/ha/HAManager.java b/server/src/main/java/org/apache/cloudstack/ha/HAManager.java index 068230c6673d..1aa20c66df06 100644 --- a/server/src/main/java/org/apache/cloudstack/ha/HAManager.java +++ b/server/src/main/java/org/apache/cloudstack/ha/HAManager.java @@ -28,43 +28,43 @@ public interface HAManager extends HAConfigManager { ConfigKey MaxConcurrentHealthCheckOperations = new ConfigKey<>("Advanced", Integer.class, "ha.max.concurrent.health.check.operations", "50", - "The number of concurrent health check operations per management server. This setting determines the size of the thread pool consuming the HEALTH CHECK queue.", true); + "The number of concurrent health check operations per management server. This setting determines the size of the thread pool consuming the HEALTH CHECK queue.", false); ConfigKey MaxPendingHealthCheckOperations = new ConfigKey<>("Advanced", Integer.class, "ha.max.pending.health.check.operations", "5000", - "The number of pending health check operations per management server. This setting determines the size of the HEALTH CHECK queue.", true); + "The number of pending health check operations per management server. This setting determines the size of the HEALTH CHECK queue.", false); ConfigKey MaxConcurrentActivityCheckOperations = new ConfigKey<>("Advanced", Integer.class, "ha.max.concurrent.activity.check.operations", "25", "The number of concurrent activity check operations per management server. This setting determines the size of the thread pool consuming the ACTIVITY CHECK queue.", - true); + false); ConfigKey MaxPendingActivityCheckOperations = new ConfigKey<>("Advanced", Integer.class, "ha.max.pending.activity.check.operations", "2500", - "The number of pending activity check operations per management server. This setting determines the size of the size of the ACTIVITY CHECK queue.", true); + "The number of pending activity check operations per management server. This setting determines the size of the ACTIVITY CHECK queue.", false); ConfigKey MaxConcurrentRecoveryOperations = new ConfigKey<>("Advanced", Integer.class, "ha.max.concurrent.recovery.operations", "25", - "The number of concurrent recovery operations per management server.", true); + "The number of concurrent recovery operations per management server. This setting determines the size of the thread pool consuming the RECOVERY queue.", false); ConfigKey MaxPendingRecoveryOperations = new ConfigKey<>("Advanced", Integer.class, "ha.max.pending.recovery.operations", "2500", - "The number of pending recovery operations per management server. This setting determines the size of the size of the RECOVERY queue.", true); + "The number of pending recovery operations per management server. This setting determines the size of the RECOVERY queue.", false); ConfigKey MaxConcurrentFenceOperations = new ConfigKey<>("Advanced", Integer.class, "ha.max.concurrent.fence.operations", "25", - "The number of concurrent fence operations per management server.", true); + "The number of concurrent fence operations per management server. This setting determines the size of the thread pool consuming the FENCE queue.", false); ConfigKey MaxPendingFenceOperations = new ConfigKey<>("Advanced", Integer.class, "ha.max.pending.fence.operations", "2500", - "The number of pending fence operations per management server. This setting determines the size of the size of the FENCE queue.", true); + "The number of pending fence operations per management server. This setting determines the size of the FENCE queue.", false); boolean transitionHAState(final HAConfig.Event event, final HAConfig haConfig); From 4d00921f3fe7d5c64902e9fe9ca906c04600d85d Mon Sep 17 00:00:00 2001 From: Suresh Kumar Anaparti Date: Fri, 22 May 2026 13:33:39 +0530 Subject: [PATCH 2/3] config name update in javadoc --- .../main/java/com/cloud/ha/HighAvailabilityManagerImpl.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/server/src/main/java/com/cloud/ha/HighAvailabilityManagerImpl.java b/server/src/main/java/com/cloud/ha/HighAvailabilityManagerImpl.java index 755de00dec26..a139a3d8f5ff 100644 --- a/server/src/main/java/com/cloud/ha/HighAvailabilityManagerImpl.java +++ b/server/src/main/java/com/cloud/ha/HighAvailabilityManagerImpl.java @@ -112,8 +112,8 @@ * the VM. * * @config {@table || Param Name | Description | Values | Default || || workers | number of worker threads to spin off to do the - * processing | int | 1 || || time.to.sleep | Time to sleep if no work items are found | seconds | 60 || || max.retries - * | number of times to retry start | int | 5 || || time.between.failure | Time elapsed between failures before we + * processing | int | 1 || || time.to.sleep | Time to sleep if no work items are found | seconds | 60 || || vm.ha.migration.max.retries + * | number of times to retry start and other HA work items | int | 5 || || time.between.failure | Time elapsed between failures before we * consider it as another retry | seconds | 3600 || || time.between.cleanup | Time to wait before the cleanup thread * runs | seconds | 86400 || || force.ha | Force HA to happen even if the VM says no | boolean | false || || * ha.retry.wait | time to wait before retrying the work item | seconds | 120 || || stop.retry.wait | time to wait From 4e7a0ecffa3b9b21afb546e9b8ab7e8731d92cdd Mon Sep 17 00:00:00 2001 From: Suresh Kumar Anaparti Date: Fri, 22 May 2026 13:36:13 +0530 Subject: [PATCH 3/3] Check config 'kvm.ha.fence.on.storage.heartbeat.failure' value in zone scope before global --- .../org/apache/cloudstack/kvm/ha/KVMHostActivityChecker.java | 4 ++-- server/src/main/java/com/cloud/ha/KVMFencer.java | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHostActivityChecker.java b/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHostActivityChecker.java index af7441c4fd29..f59cbd065927 100644 --- a/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHostActivityChecker.java +++ b/plugins/hypervisors/kvm/src/main/java/org/apache/cloudstack/kvm/ha/KVMHostActivityChecker.java @@ -123,7 +123,7 @@ public Status getHostAgentStatus(Host host) { private Status checkHostStatusWithSameHost(Host host) { Status hostStatus; - boolean reportFailureIfOneStorageIsDown = HighAvailabilityManager.KvmHAFenceHostIfHeartbeatFailsOnStorage.value(); + boolean reportFailureIfOneStorageIsDown = HighAvailabilityManager.KvmHAFenceHostIfHeartbeatFailsOnStorage.valueIn(host.getDataCenterId()); final CheckOnHostCommand cmd = new CheckOnHostCommand(host, reportFailureIfOneStorageIsDown); try { logger.debug("Checking {} status...", host.toString()); @@ -150,7 +150,7 @@ private Status checkHostStatusWithSameHost(Host host) { private Status checkHostStatusWithNeighbourHosts(Host host) { Status hostStatusFromNeighbour = Status.Unknown; - boolean reportFailureIfOneStorageIsDown = HighAvailabilityManager.KvmHAFenceHostIfHeartbeatFailsOnStorage.value(); + boolean reportFailureIfOneStorageIsDown = HighAvailabilityManager.KvmHAFenceHostIfHeartbeatFailsOnStorage.valueIn(host.getDataCenterId()); final CheckOnHostCommand cmd = new CheckOnHostCommand(host, reportFailureIfOneStorageIsDown); List neighbors = resourceManager.listHostsInClusterByStatus(host.getClusterId(), Status.Up); for (HostVO neighbor : neighbors) { diff --git a/server/src/main/java/com/cloud/ha/KVMFencer.java b/server/src/main/java/com/cloud/ha/KVMFencer.java index 4a6606b09cc3..fbabb2b939ab 100644 --- a/server/src/main/java/com/cloud/ha/KVMFencer.java +++ b/server/src/main/java/com/cloud/ha/KVMFencer.java @@ -80,7 +80,7 @@ public Boolean fenceOff(VirtualMachine vm, Host host) { List hosts = _resourceMgr.listAllHostsInCluster(host.getClusterId()); FenceCommand fence = new FenceCommand(vm, host); - fence.setReportCheckFailureIfOneStorageIsDown(HighAvailabilityManager.KvmHAFenceHostIfHeartbeatFailsOnStorage.value()); + fence.setReportCheckFailureIfOneStorageIsDown(HighAvailabilityManager.KvmHAFenceHostIfHeartbeatFailsOnStorage.valueIn(host.getDataCenterId())); int i = 0; for (HostVO h : hosts) {