55 For more info, refer to the Event Driven TechSupport & CoreDump Mgmt HLD
66"""
77import os
8- import time
98import argparse
109import syslog
11- import re
1210from swsscommon .swsscommon import SonicV2Connector
1311from utilities_common .auto_techsupport_helper import *
1412
15- # Explicitly pass this to the subprocess invoking techsupport
16- ENV_VAR = os .environ
17- PATH_PREV = ENV_VAR ["PATH" ] if "PATH" in ENV_VAR else ""
18- ENV_VAR ["PATH" ] = "/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:" + PATH_PREV
19-
2013
2114def handle_coredump_cleanup (dump_name , db ):
2215 _ , num_bytes = get_stats (os .path .join (CORE_DUMP_DIR , CORE_DUMP_PTRN ))
@@ -49,8 +42,6 @@ def __init__(self, core_name, container_name, db):
4942 self .core_name = core_name
5043 self .container = container_name
5144 self .db = db
52- self .proc_mp = {}
53- self .core_ts_map = {}
5445
5546 def handle_core_dump_creation_event (self ):
5647 if self .db .get (CFG_DB , AUTO_TS , CFG_STATE ) != "enabled" :
@@ -66,112 +57,8 @@ def handle_core_dump_creation_event(self):
6657 syslog .syslog (syslog .LOG_NOTICE , msg .format (self .container , self .core_name ))
6758 return
6859
69- global_cooloff = self .db .get (CFG_DB , AUTO_TS , COOLOFF )
70- container_cooloff = self .db .get (CFG_DB , FEATURE_KEY , COOLOFF )
71-
72- try :
73- global_cooloff = float (global_cooloff )
74- except ValueError :
75- global_cooloff = 0.0
76-
77- try :
78- container_cooloff = float (container_cooloff )
79- except ValueError :
80- container_cooloff = 0.0
81-
82- cooloff_passed = self .verify_rate_limit_intervals (global_cooloff , container_cooloff )
83- if cooloff_passed :
84- since_cfg = self .get_since_arg ()
85- new_file = self .invoke_ts_cmd (since_cfg )
86- if new_file :
87- self .write_to_state_db (int (time .time ()), new_file )
88-
89- def write_to_state_db (self , timestamp , ts_dump ):
90- name = strip_ts_ext (ts_dump )
91- key = TS_MAP + "|" + name
92- self .db .set (STATE_DB , key , CORE_DUMP , self .core_name )
93- self .db .set (STATE_DB , key , TIMESTAMP , str (timestamp ))
94- self .db .set (STATE_DB , key , CONTAINER , self .container )
95-
96- def get_since_arg (self ):
97- since_cfg = self .db .get (CFG_DB , AUTO_TS , CFG_SINCE )
98- if not since_cfg :
99- return SINCE_DEFAULT
100- rc , _ , stderr = subprocess_exec (["date" , "--date={}" .format (since_cfg )], env = ENV_VAR )
101- if rc == 0 :
102- return since_cfg
103- return SINCE_DEFAULT
104-
105- def parse_ts_dump_name (self , ts_stdout ):
106- """ Figure out the ts_dump name from the techsupport stdout """
107- matches = re .findall (TS_PTRN , ts_stdout )
108- if matches :
109- return matches [- 1 ]
110- syslog .syslog (syslog .LOG_ERR , "stdout of the 'show techsupport' cmd doesn't have the dump name" )
111- return ""
112-
113- def invoke_ts_cmd (self , since_cfg , num_retry = 0 ):
114- cmd_opts = ["show" , "techsupport" , "--silent" , "--since" , since_cfg ]
115- cmd = " " .join (cmd_opts )
116- rc , stdout , stderr = subprocess_exec (cmd_opts , env = ENV_VAR )
117- new_dump = ""
118- if rc == EXT_LOCKFAIL :
119- syslog .syslog (syslog .LOG_NOTICE , "Another instance of techsupport running, aborting this. stderr: {}" .format (stderr ))
120- elif rc == EXT_RETRY :
121- if num_retry <= MAX_RETRY_LIMIT :
122- return self .invoke_ts_cmd (since_cfg , num_retry + 1 )
123- else :
124- syslog .syslog (syslog .LOG_ERR , "MAX_RETRY_LIMIT for show techsupport invocation exceeded, stderr: {}" .format (stderr ))
125- elif rc != EXT_SUCCESS :
126- syslog .syslog (syslog .LOG_ERR , "show techsupport failed with exit code {}, stderr: {}" .format (rc , stderr ))
127- else : # EXT_SUCCESS
128- new_dump = self .parse_ts_dump_name (stdout ) # Parse the dump name
129- if not new_dump :
130- syslog .syslog (syslog .LOG_ERR , "{} was run, but no techsupport dump is found" .format (cmd ))
131- else :
132- syslog .syslog (syslog .LOG_INFO , "{} is successful, {} is created" .format (cmd , new_dump ))
133- return new_dump
134-
135- def verify_rate_limit_intervals (self , global_cooloff , container_cooloff ):
136- """Verify both the global and per-container rate_limit_intervals have passed"""
137- curr_ts_list = get_ts_dumps (True )
138- if global_cooloff and curr_ts_list :
139- last_ts_dump_creation = os .path .getmtime (curr_ts_list [- 1 ])
140- if time .time () - last_ts_dump_creation < global_cooloff :
141- msg = "Global rate_limit_interval period has not passed. Techsupport Invocation is skipped. Core: {}"
142- syslog .syslog (syslog .LOG_INFO , msg .format (self .core_name ))
143- return False
144-
145- self .parse_ts_map ()
146- if container_cooloff and self .container in self .core_ts_map :
147- last_creation_time = self .core_ts_map [self .container ][0 ][0 ]
148- if time .time () - last_creation_time < container_cooloff :
149- msg = "Per Container rate_limit_interval for {} has not passed. Techsupport Invocation is skipped. Core: {}"
150- syslog .syslog (syslog .LOG_INFO , msg .format (self .container , self .core_name ))
151- return False
152- return True
153-
154- def parse_ts_map (self ):
155- """Build a map from container name to a sorted list of (creation_time, ts_dump) tuples"""
156- ts_keys = self .db .keys (STATE_DB , TS_MAP + "*" )
157- if not ts_keys :
158- return
159- for ts_key in ts_keys :
160- data = self .db .get_all (STATE_DB , ts_key )
161- if not data :
162- continue
163- container_name = data .get (CONTAINER , "" )
164- creation_time = data .get (TIMESTAMP , "" )
165- try :
166- creation_time = int (creation_time )
167- except Exception :
168- continue # if the creation time is invalid, skip the entry
169- ts_dump = ts_key .split ("|" )[- 1 ]
170- if container_name and container_name not in self .core_ts_map :
171- self .core_ts_map [container_name ] = []
172- self .core_ts_map [container_name ].append ((int (creation_time ), ts_dump ))
173- for container_name in self .core_ts_map :
174- self .core_ts_map [container_name ].sort ()
60+ invoke_ts_command_rate_limited (self .db , EVENT_TYPE_CORE , {CORE_DUMP : self .core_name }, self .container )
61+
17562
17663def main ():
17764 parser = argparse .ArgumentParser (description = 'Auto Techsupport Invocation and CoreDump Mgmt Script' )
0 commit comments