Skip to content

Commit 6837333

Browse files
author
Anthony Xu
committed
check_heartbeat and pingtest execute through ssh, not XAPI, because XAPI may hang when master host is down
1 parent 1b34959 commit 6837333

5 files changed

Lines changed: 78 additions & 48 deletions

File tree

plugins/hypervisors/xen/src/com/cloud/hypervisor/xen/resource/CitrixResourceBase.java

Lines changed: 33 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -406,17 +406,28 @@ protected boolean pingdomr(Connection conn, String host, String port) {
406406

407407
}
408408

409-
protected boolean pingXenServer() {
409+
protected boolean pingXAPI() {
410410
Connection conn = getConnection();
411+
try {
412+
Host host = Host.getByUuid(conn, _host.uuid);
413+
if( !host.getEnabled(conn) ) {
414+
s_logger.debug("Host " + _host.ip + " is not enabled!");
415+
return false;
416+
}
417+
} catch (Exception e) {
418+
s_logger.debug("cannot get host enabled status, host " + _host.ip + " due to " + e.toString(), e);
419+
return false;
420+
}
411421
try {
412422
callHostPlugin(conn, "echo", "main");
413-
return true;
414423
} catch (Exception e) {
415424
s_logger.debug("cannot ping host " + _host.ip + " due to " + e.toString(), e);
425+
return false;
416426
}
417-
return false;
427+
return true;
418428
}
419429

430+
420431
protected String logX(XenAPIObject obj, String msg) {
421432
return new StringBuilder("Host ").append(_host.ip).append(" ").append(obj.toWireString()).append(": ").append(msg).toString();
422433
}
@@ -2006,12 +2017,24 @@ protected Answer execute(ModifySshKeysCommand cmd) {
20062017
}
20072018

20082019
private boolean doPingTest(Connection conn, final String computingHostIp) {
2009-
String args = "-h " + computingHostIp;
2010-
String result = callHostPlugin(conn, "vmops", "pingtest", "args", args);
2011-
if (result == null || result.isEmpty()) {
2020+
com.trilead.ssh2.Connection sshConnection = new com.trilead.ssh2.Connection(_host.ip, 22);
2021+
try {
2022+
sshConnection.connect(null, 60000, 60000);
2023+
if (!sshConnection.authenticateWithPassword(_username, _password.peek())) {
2024+
throw new CloudRuntimeException("Unable to authenticate");
2025+
}
2026+
2027+
String cmd = "ping -c 2 " + computingHostIp;
2028+
if (!SSHCmdHelper.sshExecuteCmd(sshConnection, cmd)) {
2029+
throw new CloudRuntimeException("Cannot ping host " + computingHostIp + " from host " + _host.ip);
2030+
}
2031+
return true;
2032+
} catch (Exception e) {
2033+
s_logger.warn("Catch exception " + e.toString(), e);
20122034
return false;
2035+
} finally {
2036+
sshConnection.close();
20132037
}
2014-
return true;
20152038
}
20162039

20172040
protected CheckOnHostAnswer execute(CheckOnHostCommand cmd) {
@@ -2238,7 +2261,7 @@ protected Storage.StorageResourceType getStorageResourceType() {
22382261
}
22392262

22402263
protected CheckHealthAnswer execute(CheckHealthCommand cmd) {
2241-
boolean result = pingXenServer();
2264+
boolean result = pingXAPI();
22422265
return new CheckHealthAnswer(cmd, result);
22432266
}
22442267

@@ -4341,9 +4364,9 @@ protected StartupStorageCommand initializeLocalSR(Connection conn) {
43414364
@Override
43424365
public PingCommand getCurrentStatus(long id) {
43434366
try {
4344-
if (!pingXenServer()) {
4367+
if (!pingXAPI()) {
43454368
Thread.sleep(1000);
4346-
if (!pingXenServer()) {
4369+
if (!pingXAPI()) {
43474370
s_logger.warn(" can not ping xenserver " + _host.uuid);
43484371
return null;
43494372
}

plugins/hypervisors/xen/src/com/cloud/hypervisor/xen/resource/XenServer56FP1Resource.java

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -73,12 +73,10 @@ protected List<File> getPatchFiles() {
7373
protected FenceAnswer execute(FenceCommand cmd) {
7474
Connection conn = getConnection();
7575
try {
76-
String result = callHostPluginPremium(conn, "check_heartbeat", "host", cmd.getHostGuid(), "interval", Integer.toString(_heartbeatInterval * 2));
77-
if (!result.contains("> DEAD <")) {
76+
if (check_heartbeat(cmd.getHostGuid())) {
7877
s_logger.debug("Heart beat is still going so unable to fence");
7978
return new FenceAnswer(cmd, false, "Heartbeat is still going on unable to fence");
8079
}
81-
8280
Set<VM> vms = VM.getByNameLabel(conn, cmd.getVmName());
8381
for (VM vm : vms) {
8482
Set<VDI> vdis = new HashSet<VDI>();

plugins/hypervisors/xen/src/com/cloud/hypervisor/xen/resource/XenServer56Resource.java

Lines changed: 41 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,7 @@
2828
import com.cloud.resource.ServerResource;
2929
import com.cloud.utils.exception.CloudRuntimeException;
3030
import com.cloud.utils.script.Script;
31+
import com.cloud.utils.ssh.SSHCmdHelper;
3132
import com.xensource.xenapi.Connection;
3233
import com.xensource.xenapi.Host;
3334
import com.xensource.xenapi.Network;
@@ -208,15 +209,37 @@ protected NetworkUsageAnswer execute(NetworkUsageCommand cmd) {
208209
}
209210
}
210211

212+
protected Boolean check_heartbeat(String hostuuid) {
213+
com.trilead.ssh2.Connection sshConnection = new com.trilead.ssh2.Connection(_host.ip, 22);
214+
try {
215+
sshConnection.connect(null, 60000, 60000);
216+
if (!sshConnection.authenticateWithPassword(_username, _password.peek())) {
217+
throw new CloudRuntimeException("Unable to authenticate");
218+
}
219+
220+
String shcmd = "/opt/cloud/bin/check_heartbeat.sh " + hostuuid + " "
221+
+ Integer.toString(_heartbeatInterval * 2);
222+
if (!SSHCmdHelper.sshExecuteCmd(sshConnection, shcmd)) {
223+
s_logger.debug("Heart beat is gone so dead.");
224+
return false;
225+
}
226+
s_logger.debug("Heart beat is still going");
227+
return true;
228+
} catch (Exception e) {
229+
s_logger.debug("health check failed due to catch exception " + e.toString());
230+
return null;
231+
} finally {
232+
sshConnection.close();
233+
}
234+
}
235+
211236
protected FenceAnswer execute(FenceCommand cmd) {
212237
Connection conn = getConnection();
213238
try {
214-
String result = callHostPluginPremium(conn, "check_heartbeat", "host", cmd.getHostGuid(), "interval", Integer.toString(_heartbeatInterval * 2));
215-
if (!result.contains("> DEAD <")) {
239+
if (check_heartbeat(cmd.getHostGuid())) {
216240
s_logger.debug("Heart beat is still going so unable to fence");
217241
return new FenceAnswer(cmd, false, "Heartbeat is still going on unable to fence");
218242
}
219-
220243
Set<VM> vms = VM.getByNameLabel(conn, cmd.getVmName());
221244
for (VM vm : vms) {
222245
synchronized (_cluster.intern()) {
@@ -236,6 +259,7 @@ protected FenceAnswer execute(FenceCommand cmd) {
236259
}
237260
}
238261

262+
239263
@Override
240264
protected boolean transferManagementNetwork(Connection conn, Host host, PIF src, PIF.Record spr, PIF dest) throws XmlRpcException, XenAPIException {
241265
dest.reconfigureIp(conn, spr.ipConfigurationMode, spr.IP, spr.netmask, spr.gateway, spr.DNS);
@@ -269,33 +293,29 @@ protected boolean transferManagementNetwork(Connection conn, Host host, PIF src,
269293

270294
@Override
271295
public StartupCommand[] initialize() {
272-
pingXenServer();
296+
pingXAPI();
273297
StartupCommand[] cmds = super.initialize();
274298
return cmds;
275299
}
276300

301+
277302
@Override
278303
protected CheckOnHostAnswer execute(CheckOnHostCommand cmd) {
279-
try {
280-
Connection conn = getConnection();
281-
String result = callHostPluginPremium(conn, "check_heartbeat", "host", cmd.getHost().getGuid(), "interval", Integer.toString(_heartbeatInterval * 2));
282-
if (result == null) {
283-
return new CheckOnHostAnswer(cmd, "Unable to call plugin");
284-
}
285-
if (result.contains("> DEAD <")) {
286-
s_logger.debug("Heart beat is gone so dead.");
287-
return new CheckOnHostAnswer(cmd, false, "Heart Beat is done");
288-
} else if (result.contains("> ALIVE <")) {
289-
s_logger.debug("Heart beat is still going");
290-
return new CheckOnHostAnswer(cmd, true, "Heartbeat is still going");
291-
}
292-
return new CheckOnHostAnswer(cmd, null, "Unable to determine");
293-
} catch (Exception e) {
294-
s_logger.warn("Unable to fence", e);
295-
return new CheckOnHostAnswer(cmd, e.getMessage());
304+
Boolean alive = check_heartbeat(cmd.getHost().getGuid());
305+
String msg = "";
306+
if (alive == null) {
307+
msg = " cannot determine ";
308+
} else if ( alive == true) {
309+
msg = "Heart beat is still going";
310+
} else {
311+
msg = "Heart beat is gone so dead.";
296312
}
313+
s_logger.debug(msg);
314+
return new CheckOnHostAnswer(cmd, alive, msg);
315+
297316
}
298317

318+
299319
public XenServer56Resource() {
300320
super();
301321
}

scripts/vm/hypervisor/xenserver/check_heartbeat.sh

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,3 +72,4 @@ do
7272
done
7373

7474
echo "=====> DEAD <======"
75+
exit 1

scripts/vm/hypervisor/xenserver/vmopspremium

Lines changed: 2 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -123,18 +123,7 @@ def setup_heartbeat_file(session, args):
123123
txt = ''
124124
return txt
125125

126-
@echo
127-
def check_heartbeat(session, args):
128-
host = args['host']
129-
interval = args['interval']
130-
try:
131-
cmd = ["bash", "/opt/cloud/bin/check_heartbeat.sh", host, interval]
132-
txt = util.pread2(cmd)
133-
except:
134-
txt=''
135-
return txt
136-
137-
126+
138127
@echo
139128
def heartbeat(session, args):
140129
host = args['host']
@@ -156,5 +145,4 @@ def asmonitor(session, args):
156145
return 'fail'
157146

158147
if __name__ == "__main__":
159-
XenAPIPlugin.dispatch({"forceShutdownVM":forceShutdownVM, "upgrade_snapshot":upgrade_snapshot, "create_privatetemplate_from_snapshot":create_privatetemplate_from_snapshot, "copy_vhd_to_secondarystorage":copy_vhd_to_secondarystorage, "copy_vhd_from_secondarystorage":copy_vhd_from_secondarystorage, "setup_heartbeat_sr":setup_heartbeat_sr, "setup_heartbeat_file":setup_heartbeat_file, "check_heartbeat":check_heartbeat, "heartbeat": heartbeat, "asmonitor": asmonitor})
160-
148+
XenAPIPlugin.dispatch({"forceShutdownVM":forceShutdownVM, "upgrade_snapshot":upgrade_snapshot, "create_privatetemplate_from_snapshot":create_privatetemplate_from_snapshot, "copy_vhd_to_secondarystorage":copy_vhd_to_secondarystorage, "copy_vhd_from_secondarystorage":copy_vhd_from_secondarystorage, "setup_heartbeat_sr":setup_heartbeat_sr, "setup_heartbeat_file":setup_heartbeat_file, "heartbeat": heartbeat, "asmonitor": asmonitor})

0 commit comments

Comments
 (0)