Skip to content

Commit 0be4a68

Browse files
author
Jayapal
committed
CLOUDSTACK-5164 Unmonit for 30 minutes for a failed process
1 parent ab2c38c commit 0be4a68

2 files changed

Lines changed: 161 additions & 40 deletions

File tree

systemvm/patches/debian/config/opt/cloud/bin/monitor_service.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,7 @@ crontab -l | grep -v monitorServices.py | crontab -
6464
create_config $config
6565

6666
#add cron job
67-
(crontab -l ;echo -e "SHELL=/bin/bash\nPATH=/usr/local/sbin:/usr/local/bin:/sbin:/bin:/usr/sbin:/usr/bin\n */1 * * * * /usr/bin/python /root/monitorServices.py") | crontab -
67+
(crontab -l ;echo -e "SHELL=/bin/bash\nPATH=/usr/local/sbin:/usr/local/bin:/sbin:/bin:/usr/sbin:/usr/bin\n */3 * * * * /usr/bin/python /root/monitorServices.py") | crontab -
6868

6969

7070
unlock_exit 0 $lock $locked

systemvm/patches/debian/config/root/monitorServices.py

Lines changed: 160 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -19,14 +19,13 @@
1919

2020

2121

22-
__author__ = 'jayapalreddy'
2322

2423
from ConfigParser import SafeConfigParser
2524
from subprocess import *
2625
from os import path
2726
import time
27+
import os
2828

29-
monitor_log='/var/log/monitor.log'
3029
class StatusCodes:
3130
SUCCESS = 0
3231
FAILED = 1
@@ -35,42 +34,58 @@ class StatusCodes:
3534
STOPPED = 4
3635
STARTING = 5
3736

38-
class log:
37+
class Log:
    """
    Severity labels handed to raisealert() as its first argument.
    Each label is its own name spelled as a string.
    """
    INFO, ALERT, CRIT, NOTIF = 'INFO', 'ALERT', 'CRIT', 'NOTIF'
4342

44-
43+
class Config:
    """
    Tunables and file locations shared by the monitoring functions.
    """
    # --- files ---
    # debug log target
    MONITOR_LOG = '/var/log/monitor.log'
    # file recording the processes that are temporarily not monitored
    UNMONIT_PS_FILE = '/etc/unmonit_psList.txt'

    # --- timing / retries ---
    # minutes a failed process stays unmonitored before it is checked again
    MONIT_AFTER_MINS = 30
    # seconds passed to time.sleep() between retries
    SLEEP_SEC = 1
    # exclusive upper bound of the retry loop: range(1, RETRY_ITERATIONS)
    RETRY_ITERATIONS = 10
    # retry-loop index threshold compared against the loop counter
    RETRY_FOR_RESTART = 5
4550

4651

4752
def getConfig( config_file_path = "/etc/monitor.conf" ):
    """
    Load the definitions of the processes to be monitored.

    config_file_path -- path of the INI-style config file; every section
                        describes one process.

    Returns a dict mapping each section name to a dict of that section's
    option names and (string) values. When the parser finds no sections
    the result is an empty dict.
    """
    cfg = SafeConfigParser()
    cfg.read( config_file_path )

    sections = {}
    for section_name in cfg.sections():
        # items() yields (name, value) pairs, which is exactly what dict() takes
        sections[section_name] = dict(cfg.items(section_name))

    return sections
6371

6472
def printd (msg, debug=False):
    """
    Debug-message hook.

    msg   -- object to log; it is converted with str().
    debug -- when true, msg plus a newline is appended to
             Config.MONITOR_LOG. Defaults to False, in which case the
             call does nothing, exactly like the previous unconditional
             early return.

    Always returns 0.
    """
    if not debug:
        return 0

    try:
        # append mode positions every write at the end of the file
        with open(Config.MONITOR_LOG, 'a') as log_file:
            log_file.write(str(msg) + "\n")
    except (IOError, OSError):
        # debug logging is best effort; it must never abort monitoring
        pass

    return 0
7285

7386
def raisealert(severity, msg, process_name=None):
87+
""" Writes the alert message"""
88+
7489
#timeStr=str(time.ctime())
7590
if process_name is not None:
7691
log = '['+severity +']'+" " + '['+process_name+']' + " " + msg +"\n"
@@ -82,9 +97,12 @@ def raisealert(severity, msg, process_name=None):
8297

8398

8499
def isPidMatchPidFile(pidfile, pids):
100+
""" Compares the running process pid with the pid in pid file.
101+
If a process with multiple pids then it matches with pid file
102+
"""
85103

86104
if pids is None or isinstance(pids,list) != True or len(pids) == 0:
87-
print "Invalid Arguments"
105+
printd ("Invalid Arguments")
88106
return StatusCodes.FAILED
89107
if not path.isfile(pidfile):
90108
#It seems there is no pid file for this service
@@ -100,12 +118,18 @@ def isPidMatchPidFile(pidfile, pids):
100118

101119

102120
inp = fd.read()
121+
122+
if not inp:
123+
fd.close()
124+
return StatusCodes.FAILED
125+
103126
printd("file content "+str(inp))
104127
printd(pids)
105128
tocheck_pid = inp.strip()
106129
for item in pids:
107130
if str(tocheck_pid) == item.strip():
108131
printd("pid file matched")
132+
fd.close()
109133
return StatusCodes.SUCCESS
110134

111135
fd.close()
@@ -114,19 +138,22 @@ def isPidMatchPidFile(pidfile, pids):
114138

115139

116140
def checkProcessStatus( process ):
141+
"""
142+
Check the process running status, if not running tries to restart
143+
"""
117144
process_name = process.get('processname')
118145
service_name = process.get('servicename')
119146
pidfile = process.get('pidfile')
120147
#temp_out = None
121148
restartFailed=False
122-
pidFileMatched=1
149+
pidFileMatched=False
150+
pids=''
123151
cmd=''
124152
if process_name is None:
125-
print "\n Invalid Process Name"
153+
printd ("\n Invalid Process Name")
126154
return StatusCodes.INVALID_INP
127155
else:
128-
msg="checking the process " + process_name
129-
printd(msg)
156+
printd("checking the process " + process_name)
130157
cmd = 'pidof ' + process_name
131158
printd(cmd)
132159
#cmd = 'service ' + process_name + ' status'
@@ -136,49 +163,48 @@ def checkProcessStatus( process ):
136163

137164
#check there is only one pid or not
138165
if exitStatus == 0:
166+
pids = temp_out.split(' ')
139167
msg="pids: " +temp_out;
140168
printd(msg)
141-
pids = temp_out.split(' ')
142169

143170
#there is more than one process so match the pid file
144-
#if not matched set pidFileMatched=0
171+
#if not matched set pidFileMatched=False
145172
printd("Checking pid file")
146173
if isPidMatchPidFile(pidfile, pids) == StatusCodes.SUCCESS:
147-
pidFileMatched = 1;
174+
pidFileMatched = True;
148175
else:
149-
pidFileMatched = 0;
176+
pidFileMatched = False;
150177

151-
printd(pidFileMatched)
152-
if exitStatus == 0 and pidFileMatched == 1:
178+
if exitStatus == 0 and pidFileMatched == True:
153179
printd("The process is running ....")
154180
return StatusCodes.RUNNING
155181
else:
156182
printd('exit status:'+str(exitStatus))
157183
msg="The process " + process_name +" is not running trying recover "
158184
printd(msg)
159185
#Retry the process state for few seconds
160-
for i in range(1,10):
186+
for i in range(1, Config.RETRY_ITERATIONS):
161187
pout = Popen(cmd, shell=True, stdout=PIPE)
162188
exitStatus = pout.wait()
163189
temp_out = pout.communicate()[0]
164190

165-
if i < 5: # this is just for trying few more times
191+
if i < Config.RETRY_FOR_RESTART: # this is just for trying few more times
166192
if exitStatus == 0:
167193
pids = temp_out.split(' ')
168194

169195
if isPidMatchPidFile(pidfile, pids) == StatusCodes.SUCCESS:
170-
pidFileMatched = 1;
196+
pidFileMatched = True;
171197
printd("pid file is matched ...")
172-
raisealert(log.ALERT, "The process detected as running", process_name)
198+
raisealert(Log.ALERT, "The process detected as running", process_name)
173199
break
174200
else:
175201
printd("pid file is not matched ...")
176-
pidFileMatched = 0;
202+
pidFileMatched = False;
203+
time.sleep(Config.SLEEP_SEC)
177204
continue
178-
time.sleep(1)
179205
else:
180206
msg="The process " +process_name+" is not running trying recover "
181-
raisealert(log.INFO,process_name,msg)
207+
raisealert(Log.INFO,process_name,msg)
182208

183209
if service_name == 'apache2':
184210
# Killing apache2 process with this the main service will not start
@@ -189,7 +215,7 @@ def checkProcessStatus( process ):
189215

190216
cmd = 'service ' + service_name + ' restart'
191217

192-
time.sleep(1)
218+
time.sleep(Config.SLEEP_SEC)
193219
#return_val= check_call(cmd , shell=True)
194220

195221
cout = Popen(cmd, shell=True, stdout=PIPE, stderr=STDOUT)
@@ -198,37 +224,135 @@ def checkProcessStatus( process ):
198224
if return_val == 0:
199225
printd("The process" + process_name +" recovered successfully ")
200226
msg="The process " +process_name+" is recovered successfully "
201-
raisealert(log.INFO,msg,process_name)
227+
raisealert(Log.INFO,msg,process_name)
202228

203229
break;
204230
else:
205231
#retry restarting the process for few tries
206232
printd("process restart failing trying again ....")
207233
restartFailed=True
208-
time.sleep(1)
234+
time.sleep(Config.SLEEP_SEC)
209235
continue
210236
#for end here
211237

212238
if restartFailed == True:
213239
msg="The process %s recover failed "%process_name
214-
raisealert(log.ALERT,process_name,msg)
240+
raisealert(Log.ALERT,process_name,msg)
215241

216242
printd("Restart failed after number of retries")
217243
return StatusCodes.STOPPED
218244

219245
return StatusCodes.RUNNING
220246

221-
def raiseAlert( process_name ):
222-
print "process name %s is raised "%process_name
223247

224248
def monitProcess( processes_info ):
    """
    Monitors the processes which got from the config file.

    processes_info -- dict mapping a process label to its properties
                      dict (the properties are handed to
                      checkProcessStatus()).

    A process that is not RUNNING after checkProcessStatus() is recorded
    in Config.UNMONIT_PS_FILE with the current time and is skipped on
    later runs until Config.MONIT_AFTER_MINS minutes have passed.

    Returns StatusCodes.INVALID_INP when processes_info is empty,
    otherwise None.
    """
    if len( processes_info ) == 0:
        printd("Invalid Input")
        return StatusCodes.INVALID_INP

    # process label -> time stamp (seconds) at which it was found down
    dict_unmonit = {}

    if not path.isfile(Config.UNMONIT_PS_FILE):
        printd('Unmonit File not exist')
    else:
        #load the dictionary with unmonit process list
        loaded = loadPsFromUnMonitFile()
        # the loader may report failure with a status code instead of a
        # dict; treat that as "nothing is unmonitored"
        if isinstance(loaded, dict):
            dict_unmonit = loaded

    #time for noting process down time
    csec = int(time.time())

    # entries to persist: still-suppressed processes plus new failures
    umonit_update = {}

    for process, properties in processes_info.items():
        printd ("checking the process %s \n" %process)

        if process in dict_unmonit:
            ts = dict_unmonit[process]
            try:
                elapsed_sec = csec - int(ts)
            except (TypeError, ValueError):
                # unreadable time stamp: fall through and check the process
                elapsed_sec = None

            if elapsed_sec is not None:
                printd("Time difference=%s" %str(elapsed_sec))
                tmin = elapsed_sec // 60

                #skip the process if its time stamp is less than Config.MONIT_AFTER_MINS
                if tmin < Config.MONIT_AFTER_MINS:
                    raisealert(Log.ALERT, "The %s get monitor after %s minutes " %(process, Config.MONIT_AFTER_MINS))
                    printd('process will be monitored after %s min' %(str(int(Config.MONIT_AFTER_MINS) - tmin)))
                    # keep the original time stamp so the entry survives a
                    # rewrite of the file triggered by another failure
                    umonit_update[process] = ts
                    continue

        if checkProcessStatus( properties) != StatusCodes.RUNNING:
            printd( "\n Process %s is not Running"%process)
            #add this process into unmonit list
            printd ("updating the process for unmonit %s\n" %process)
            umonit_update[process] = csec

    #if dict is not empty write to file else delete it
    if umonit_update:
        writePsListToUnmonitFile(umonit_update)
    elif path.isfile(Config.UNMONIT_PS_FILE):
        # nothing is suppressed and nothing failed: the file is stale
        printd("Removing the file %s" %Config.UNMONIT_PS_FILE)
        os.remove(Config.UNMONIT_PS_FILE)
302+
303+
304+
305+
def loadPsFromUnMonitFile():
    """
    Read the unmonitored-process list from Config.UNMONIT_PS_FILE.

    The file holds comma separated "name:timestamp" entries, as written
    by writePsListToUnmonitFile().

    Returns a dict mapping process name to its time stamp string. The
    dict is empty when the file cannot be read or has no usable entry,
    so the result can always be used as a dict by the caller.
    """
    dict_unmonit = {}

    try:
        # the with-block closes the file on every path
        with open(Config.UNMONIT_PS_FILE) as fd:
            ps = fd.read()
    except (IOError, OSError):
        printd("Failed to open file %s " %(Config.UNMONIT_PS_FILE))
        return dict_unmonit

    if not ps:
        printd("File %s content is empty " %Config.UNMONIT_PS_FILE)
        return dict_unmonit

    printd(ps)
    for entry in ps.split(','):
        # split on the last ':' - the time stamp never contains one
        name, sep, stamp = entry.strip().rpartition(':')
        if not sep or not name:
            # empty piece after the trailing ',' or a malformed entry
            continue
        dict_unmonit[name] = stamp

    return dict_unmonit
330+
331+
332+
def writePsListToUnmonitFile(umonit_update):
    """
    Overwrite Config.UNMONIT_PS_FILE with the given process list.

    umonit_update -- dict mapping process name to the time stamp at
                     which it was found down.

    Every entry is written as "name:timestamp," (note the trailing
    comma after each entry, including the last one).

    Returns StatusCodes.FAILED when the file cannot be opened,
    otherwise None.
    """
    printd("Write updated unmonit list to file")
    line = ''.join(["%s:%s," % (name, umonit_update[name]) for name in umonit_update])
    printd(line)

    try:
        fd = open(Config.UNMONIT_PS_FILE,'w')
    except (IOError, OSError):
        printd("Failed to open file %s " %Config.UNMONIT_PS_FILE)
        return StatusCodes.FAILED

    try:
        fd.write(line)
    finally:
        # release the handle even when the write itself fails
        fd.close()
346+
347+
348+
def is_emtpy(struct):
    """
    Tell whether the given struct is empty.

    Returns True when struct is falsy (empty container, None, 0, ''),
    False otherwise.
    """
    return not struct
232356

233357
def main():
234358
'''
@@ -238,14 +362,11 @@ def main():
238362
printd("monitoring started")
239363
temp_dict = getConfig()
240364

241-
'''
242-
Step2: Get Previous Run Log
243-
'''
244365

245366
'''
246-
Step3: Monitor and Raise Alert
367+
Step2: Monitor and Raise Alert
247368
'''
248-
#raisealert(log.INFO, 'Monit started')
369+
#raisealert(Log.INFO, 'Monit started')
249370
monitProcess( temp_dict )
250371

251372

0 commit comments

Comments
 (0)