1919
2020
2121
22- __author__ = 'jayapalreddy'
2322
2423from ConfigParser import SafeConfigParser
2524from subprocess import *
2625from os import path
2726import time
27+ import os
2828
29- monitor_log = '/var/log/monitor.log'
3029class StatusCodes :
3130 SUCCESS = 0
3231 FAILED = 1
@@ -35,42 +34,58 @@ class StatusCodes:
3534 STOPPED = 4
3635 STARTING = 5
3736
class Log:
    """Severity labels handed to raisealert() as its first argument."""
    INFO = 'INFO'
    ALERT = 'ALERT'
    CRIT = 'CRIT'
    NOTIF = 'NOTIF'
4342
44-
class Config:
    """Tunable settings shared by the monitoring functions in this file."""
    # minutes a process is left unmonitored after it was recorded as down
    MONIT_AFTER_MINS = 30
    # pause, in seconds, handed to time.sleep() between retry rounds
    SLEEP_SEC = 1
    # upper bound of the retry loop, which runs over range(1, RETRY_ITERATIONS)
    RETRY_ITERATIONS = 10
    # retry rounds below this index only re-check the pid; later rounds
    # restart the service
    RETRY_FOR_RESTART = 5
    # path of the monitor log file
    MONITOR_LOG = '/var/log/monitor.log'
    # file listing the processes that are temporarily not monitored
    UNMONIT_PS_FILE = '/etc/unmonit_psList.txt'
4550
4651
def getConfig(config_file_path="/etc/monitor.conf"):
    """
    Load the monitored-process definitions from an INI-style config file.

    Every section of the file becomes one entry of the result, keyed by the
    section name and holding a dict of that section's option/value pairs.

    config_file_path -- path of the config file to parse.

    Returns a dict of dicts; it is empty when the parser finds no sections.
    """
    cfg_parser = SafeConfigParser()
    cfg_parser.read(config_file_path)

    sections = {}
    for section_name in cfg_parser.sections():
        sections[section_name] = dict(cfg_parser.items(section_name))
    return sections
6371
def printd(msg):
    """
    Debug-message hook; debug output is switched off.

    msg -- the message (any object) a caller wants to trace.

    Always returns 0 and produces no output.
    """
    # printd() is called unconditionally all over this file, so while
    # debugging is off it must stay a cheap, side-effect free no-op.
    # Statements placed after this return can never execute, so none are
    # kept here.
    return 0
7285
7386def raisealert (severity , msg , process_name = None ):
87+ """ Writes the alert message"""
88+
7489 #timeStr=str(time.ctime())
7590 if process_name is not None :
7691 log = '[' + severity + ']' + " " + '[' + process_name + ']' + " " + msg + "\n "
@@ -82,9 +97,12 @@ def raisealert(severity, msg, process_name=None):
8297
8398
8499def isPidMatchPidFile (pidfile , pids ):
100+ """ Compares the running process pid with the pid in pid file.
101+ If a process with multiple pids then it matches with pid file
102+ """
85103
86104 if pids is None or isinstance (pids ,list ) != True or len (pids ) == 0 :
87- print "Invalid Arguments"
105+ printd ( "Invalid Arguments" )
88106 return StatusCodes .FAILED
89107 if not path .isfile (pidfile ):
90108 #It seems there is no pid file for this service
@@ -100,12 +118,18 @@ def isPidMatchPidFile(pidfile, pids):
100118
101119
102120 inp = fd .read ()
121+
122+ if not inp :
123+ fd .close ()
124+ return StatusCodes .FAILED
125+
103126 printd ("file content " + str (inp ))
104127 printd (pids )
105128 tocheck_pid = inp .strip ()
106129 for item in pids :
107130 if str (tocheck_pid ) == item .strip ():
108131 printd ("pid file matched" )
132+ fd .close ()
109133 return StatusCodes .SUCCESS
110134
111135 fd .close ()
@@ -114,19 +138,22 @@ def isPidMatchPidFile(pidfile, pids):
114138
115139
116140def checkProcessStatus ( process ):
141+ """
142+ Check the process running status, if not running tries to restart
143+ """
117144 process_name = process .get ('processname' )
118145 service_name = process .get ('servicename' )
119146 pidfile = process .get ('pidfile' )
120147 #temp_out = None
121148 restartFailed = False
122- pidFileMatched = 1
149+ pidFileMatched = False
150+ pids = ''
123151 cmd = ''
124152 if process_name is None :
125- print "\n Invalid Process Name"
153+ printd ( "\n Invalid Process Name" )
126154 return StatusCodes .INVALID_INP
127155 else :
128- msg = "checking the process " + process_name
129- printd (msg )
156+ printd ("checking the process " + process_name )
130157 cmd = 'pidof ' + process_name
131158 printd (cmd )
132159 #cmd = 'service ' + process_name + ' status'
@@ -136,49 +163,48 @@ def checkProcessStatus( process ):
136163
137164 #check there is only one pid or not
138165 if exitStatus == 0 :
166+ pids = temp_out .split (' ' )
139167 msg = "pids: " + temp_out ;
140168 printd (msg )
141- pids = temp_out .split (' ' )
142169
143170 #there is more than one process so match the pid file
144- #if not matched set pidFileMatched=0
171+ #if not matched set pidFileMatched=False
145172 printd ("Checking pid file" )
146173 if isPidMatchPidFile (pidfile , pids ) == StatusCodes .SUCCESS :
147- pidFileMatched = 1 ;
174+ pidFileMatched = True ;
148175 else :
149- pidFileMatched = 0 ;
176+ pidFileMatched = False ;
150177
151- printd (pidFileMatched )
152- if exitStatus == 0 and pidFileMatched == 1 :
178+ if exitStatus == 0 and pidFileMatched == True :
153179 printd ("The process is running ...." )
154180 return StatusCodes .RUNNING
155181 else :
156182 printd ('exit status:' + str (exitStatus ))
157183 msg = "The process " + process_name + " is not running trying recover "
158184 printd (msg )
159185 #Retry the process state for few seconds
160- for i in range (1 ,10 ):
186+ for i in range (1 , Config . RETRY_ITERATIONS ):
161187 pout = Popen (cmd , shell = True , stdout = PIPE )
162188 exitStatus = pout .wait ()
163189 temp_out = pout .communicate ()[0 ]
164190
165- if i < 5 : # this is just for trying few more times
191+ if i < Config . RETRY_FOR_RESTART : # this is just for trying few more times
166192 if exitStatus == 0 :
167193 pids = temp_out .split (' ' )
168194
169195 if isPidMatchPidFile (pidfile , pids ) == StatusCodes .SUCCESS :
170- pidFileMatched = 1 ;
196+ pidFileMatched = True ;
171197 printd ("pid file is matched ..." )
172- raisealert (log .ALERT , "The process detected as running" , process_name )
198+ raisealert (Log .ALERT , "The process detected as running" , process_name )
173199 break
174200 else :
175201 printd ("pid file is not matched ..." )
176- pidFileMatched = 0 ;
202+ pidFileMatched = False ;
203+ time .sleep (Config .SLEEP_SEC )
177204 continue
178- time .sleep (1 )
179205 else :
180206 msg = "The process " + process_name + " is not running trying recover "
181- raisealert (log .INFO ,process_name ,msg )
207+ raisealert (Log .INFO ,process_name ,msg )
182208
183209 if service_name == 'apache2' :
184210 # Killing apache2 process with this the main service will not start
@@ -189,7 +215,7 @@ def checkProcessStatus( process ):
189215
190216 cmd = 'service ' + service_name + ' restart'
191217
192- time .sleep (1 )
218+ time .sleep (Config . SLEEP_SEC )
193219 #return_val= check_call(cmd , shell=True)
194220
195221 cout = Popen (cmd , shell = True , stdout = PIPE , stderr = STDOUT )
@@ -198,37 +224,135 @@ def checkProcessStatus( process ):
198224 if return_val == 0 :
199225 printd ("The process" + process_name + " recovered successfully " )
200226 msg = "The process " + process_name + " is recovered successfully "
201- raisealert (log .INFO ,msg ,process_name )
227+ raisealert (Log .INFO ,msg ,process_name )
202228
203229 break ;
204230 else :
205231 #retry restarting the process for few tries
206232 printd ("process restart failing trying again ...." )
207233 restartFailed = True
208- time .sleep (1 )
234+ time .sleep (Config . SLEEP_SEC )
209235 continue
210236 #for end here
211237
212238 if restartFailed == True :
213239 msg = "The process %s recover failed " % process_name
214- raisealert (log .ALERT ,process_name ,msg )
240+ raisealert (Log .ALERT ,process_name ,msg )
215241
216242 printd ("Restart failed after number of retries" )
217243 return StatusCodes .STOPPED
218244
219245 return StatusCodes .RUNNING
220246
221- def raiseAlert ( process_name ):
222- print "process name %s is raised " % process_name
223247
def monitProcess( processes_info ):
    """
    Check every configured process and maintain the unmonit back-off file.

    processes_info -- dict mapping a process label to its properties dict
                      (the structure getConfig() returns).

    A process that is not running after checkProcessStatus() is recorded in
    Config.UNMONIT_PS_FILE together with the current time, and is then
    skipped until Config.MONIT_AFTER_MINS minutes have passed.

    Returns StatusCodes.INVALID_INP when processes_info is empty, otherwise
    None.
    """
    if not processes_info:
        printd("Invalid Input")
        return StatusCodes.INVALID_INP

    dict_unmonit = {}
    if not path.isfile(Config.UNMONIT_PS_FILE):
        printd('Unmonit File not exist')
    else:
        #load the dictionary with unmonit process list
        loaded = loadPsFromUnMonitFile()
        # The loader signals failure with a status code instead of a dict;
        # treat that as "no process is currently unmonitored".
        if isinstance(loaded, dict):
            dict_unmonit = loaded

    #time for noting process down time (whole seconds since the epoch)
    csec = int(time.time())

    # entries that must be present in the unmonit file after this run
    umonit_update = {}

    for process, properties in processes_info.items():
        printd("checking the process %s \n " % process)

        #skip the process if its time stamp is younger than Config.MONIT_AFTER_MINS
        if process in dict_unmonit:
            ts = dict_unmonit[process]
            try:
                elapsed_sec = csec - int(ts)
            except (TypeError, ValueError):
                # unreadable time stamp: ignore it and check the process
                elapsed_sec = None

            if elapsed_sec is not None:
                printd("Time difference=%s" % str(elapsed_sec))
                tmin = elapsed_sec // 60

                if tmin < Config.MONIT_AFTER_MINS:
                    raisealert(Log.ALERT, "The %s get monitor after %s minutes " % (process, Config.MONIT_AFTER_MINS))
                    printd('process will be monitored after %s min' % (str(int(Config.MONIT_AFTER_MINS) - tmin)))
                    # Carry the entry over with its original time stamp so
                    # that rewriting the file below does not end the
                    # back-off of this process early.
                    umonit_update[process] = ts
                    continue

        if checkProcessStatus(properties) != StatusCodes.RUNNING:
            printd("\n Process %s is not Running" % process)
            #add this process into unmonit list
            printd("updating the process for unmonit %s\n " % process)
            umonit_update[process] = csec

    #if dict is not empty write to file else delete it
    if umonit_update:
        writePsListToUnmonitFile(umonit_update)
    elif path.isfile(Config.UNMONIT_PS_FILE):
        # nothing is down and nothing is in back-off: the file is obsolete
        printd("Removing the file %s" % Config.UNMONIT_PS_FILE)
        os.remove(Config.UNMONIT_PS_FILE)
302+
303+
304+
def loadPsFromUnMonitFile():
    """
    Read Config.UNMONIT_PS_FILE into a dict.

    The file is expected to hold comma-separated "name:timestamp" entries,
    the format writePsListToUnmonitFile() produces.

    Returns a dict mapping each name to its time stamp (kept as a string),
    or StatusCodes.FAILED when the file cannot be read or is empty.
    """
    try:
        # 'with' closes the file on every path, including the early returns
        with open(Config.UNMONIT_PS_FILE) as fd:
            ps = fd.read()
    except (IOError, OSError):
        printd("Failed to open file %s " % (Config.UNMONIT_PS_FILE))
        return StatusCodes.FAILED

    if not ps:
        printd("File %s content is empty " % Config.UNMONIT_PS_FILE)
        return StatusCodes.FAILED

    printd(ps)

    dict_unmonit = {}
    for entry in ps.split(','):
        # the trailing comma (and any stray blank chunk) yields empty entries
        if not entry.strip():
            continue
        fields = entry.split(':')
        if len(fields) < 2:
            # no "name:timestamp" pair; skip instead of raising IndexError
            printd("Ignoring malformed unmonit entry %s" % entry)
            continue
        dict_unmonit[fields[0]] = fields[1]

    return dict_unmonit
330+
331+
def writePsListToUnmonitFile(umonit_update):
    """
    Overwrite Config.UNMONIT_PS_FILE with the given unmonit entries.

    umonit_update -- dict mapping a process name to the time stamp at which
                     it was recorded as down.

    Each entry is written as "name:timestamp," on a single line.

    Returns StatusCodes.FAILED when the file cannot be opened, otherwise
    None. An error raised while writing is propagated to the caller.
    """
    printd("Write updated unmonit list to file")
    line = ''.join(
        str(name) + ":" + str(stamp) + ','
        for name, stamp in umonit_update.items()
    )
    printd(line)

    try:
        fd = open(Config.UNMONIT_PS_FILE, 'w')
    except (IOError, OSError):
        printd("Failed to open file %s " % Config.UNMONIT_PS_FILE)
        return StatusCodes.FAILED

    try:
        fd.write(line)
    finally:
        # release the handle even when the write itself fails
        fd.close()
346+
347+
def is_emtpy(struct):
    """
    Tell whether the given object is empty.

    Returns True when struct is falsy (e.g. an empty container or None),
    False otherwise.
    """
    return not struct
232356
233357def main ():
234358 '''
@@ -238,14 +362,11 @@ def main():
238362 printd ("monitoring started" )
239363 temp_dict = getConfig ()
240364
241- '''
242- Step2: Get Previous Run Log
243- '''
244365
245366 '''
246- Step3 : Monitor and Raise Alert
367+ Step2 : Monitor and Raise Alert
247368 '''
248- #raisealert(log .INFO, 'Monit started')
369+ #raisealert(Log .INFO, 'Monit started')
249370 monitProcess ( temp_dict )
250371
251372
0 commit comments