44
55from kubernetes import client
66
# Default timeout, in seconds, used by execPodCommand for its kubectl call.
7+ DEFAULT_EXEC_TIMEOUT = 300
78
8- def run_command (cmd , cwd = None , check = True ):
9+
10+ def run_command (cmd , cwd = None , check = True , timeout = None ):
911 print (f"Running command: { ' ' .join (cmd )} " )
10- result = subprocess .run (cmd , cwd = cwd , capture_output = True , text = True , check = True )
12+ result = subprocess .run (
13+ cmd , cwd = cwd , capture_output = True , text = True , check = True , timeout = timeout
14+ )
1115 if check and result .returncode != 0 :
1216 print (result .stdout )
1317 print (result .stderr )
@@ -30,14 +34,23 @@ def delete_namespace(api_instance, namespace_name):
3034 api_instance .delete_namespace (namespace_name )
3135
3236
def run_kubectl_command(args, timeout=None):
    """Run ``kubectl`` with *args* and return its stripped standard output.

    Every failure mode is reported with a printed message and signalled to
    the caller by returning ``None``; no exception from launching or running
    the process is propagated.

    Args:
        args: List of argument strings appended after ``kubectl``.
        timeout: Seconds to wait for the process, forwarded to
            ``subprocess.run``. ``None`` means no time limit.

    Returns:
        The captured stdout with surrounding whitespace stripped, or ``None``
        if the command timed out, exited with a non-zero status, or could not
        be started at all.
    """
    try:
        result = subprocess.run(
            ["kubectl"] + args,
            capture_output=True,
            text=True,
            check=True,
            timeout=timeout,
        )
    except subprocess.TimeoutExpired:
        print(f"TIMEOUT executing 'kubectl {' '.join(args)} ' after {timeout} s")
        return None
    except subprocess.CalledProcessError as e:
        print(f"Error executing 'kubectl {' '.join(args)} ': {e} ")
        if e.stderr:
            print(f" stderr: {e.stderr.strip()} ")
        return None
    except OSError as e:
        # kubectl is missing from PATH or is not executable. Treat it like
        # any other failed invocation so best-effort callers keep going.
        print(f"Error executing 'kubectl {' '.join(args)} ': {e} ")
        return None
    return result.stdout.strip()
4255
4356
@@ -236,8 +249,116 @@ def applyFeastProject(namespace, feast_project):
236249 return apply_output
237250
238251
def execPodCommand(namespace, podName, command_args, timeout=DEFAULT_EXEC_TIMEOUT):
    """Execute a command inside a pod via ``kubectl exec`` and return its output.

    Args:
        namespace: Namespace passed to kubectl with ``-n``.
        podName: Name of the pod to exec into.
        command_args: List of strings placed after the ``--`` separator.
        timeout: Timeout forwarded to ``run_kubectl_command``.

    Returns:
        Whatever ``run_kubectl_command`` returns for the exec invocation.
    """
    kubectl_args = ["exec", podName, "-n", namespace, "--"] + command_args
    exec_output = run_kubectl_command(kubectl_args, timeout=timeout)
    # The result is echoed before being handed back to the caller.
    print("Output of args apply:\n ", exec_output)
    return exec_output
257+
258+
def _kubectl_print(args, timeout=30):
    """Run a kubectl command, echo any non-empty output, and return it.

    Used by the diagnostics dump; the value from ``run_kubectl_command`` is
    returned unchanged whether or not anything was printed.
    """
    text = run_kubectl_command(args, timeout=timeout)
    if not text:
        # Nothing to show (falsy result from run_kubectl_command).
        return text
    print(text)
    return text
265+
266+
def dump_kubernetes_diagnostics(namespace):
    """Print a diagnostic report for *namespace* to help debug failures.

    The report consists of a sequence of kubectl queries, each printed under
    its own heading: pod status, descriptions of pods whose phase is not
    "Running", services and endpoints, ingress, ``feast`` resources,
    non-Normal events, the last 50 log lines of every pod, and the last 30
    log lines of the ingress-nginx controller.
    """
    banner = "=" * 60
    ns_flag = ["-n", namespace]
    pod_list_prefix = ["get", "pods", *ns_flag, "--no-headers", "-o"]

    print(f"\n {banner} ")
    print(f" KUBERNETES DIAGNOSTICS FOR NAMESPACE: {namespace} ")
    print(f"{banner} \n ")

    print("--- Pod status ---")
    _kubectl_print(["get", "pods", *ns_flag, "-o", "wide"])

    print("\n --- Pod descriptions (non-Running) ---")
    phase_listing = run_kubectl_command(
        pod_list_prefix + ["custom-columns=NAME:.metadata.name,STATUS:.status.phase"],
        timeout=30,
    )
    # A falsy listing (None or empty string) yields no rows to inspect.
    for row in (phase_listing or "").splitlines():
        fields = row.split()
        if len(fields) != 2:
            continue
        pod, phase = fields
        if phase == "Running":
            continue
        print(f"\n Describing non-running pod: {pod} ")
        _kubectl_print(["describe", "pod", pod, *ns_flag])

    print("\n --- Services and endpoints ---")
    for resource in ("svc", "endpoints"):
        _kubectl_print(["get", resource, *ns_flag])

    print("\n --- Ingress ---")
    _kubectl_print(["get", "ingress", *ns_flag, "-o", "wide"])
    _kubectl_print(["describe", "ingress", *ns_flag])

    print("\n --- FeatureStore CRs ---")
    _kubectl_print(["get", "feast", *ns_flag, "-o", "wide"])

    print("\n --- Warning/Error events ---")
    _kubectl_print(
        [
            "get",
            "events",
            *ns_flag,
            "--sort-by=.lastTimestamp",
            "--field-selector=type!=Normal",
        ]
    )

    print("\n --- Pod logs (last 50 lines each) ---")
    name_listing = run_kubectl_command(
        pod_list_prefix + ["custom-columns=:metadata.name"],
        timeout=30,
    )
    for raw_name in (name_listing or "").splitlines():
        pod = raw_name.strip()
        if pod:
            print(f"\n --- Logs for pod: {pod} ---")
            _kubectl_print(["logs", pod, *ns_flag, "--tail=50", "--all-containers"])

    print("\n --- Ingress controller logs (last 30 lines) ---")
    _kubectl_print(
        [
            "logs",
            "-n",
            "ingress-nginx",
            "-l",
            "app.kubernetes.io/component=controller",
            "--tail=30",
        ]
    )

    print(f"\n {banner} ")
    print(" END DIAGNOSTICS")
    print(f"{banner} \n ")
0 commit comments