Skip to content

Commit 7aeb49f

Browse files
yozhao101 and yxieca authored
[Monit] Sanity check for Monit services (#2501)
Description of PR Summary: This PR aims to add Monit service into sanity check. Specifically before running each pytest script, sanity check will determine whether the Monit is running and at the same time determine whether the services which were monitored by Monit are running correctly or not. Fixes # (issue) Type of change [ x] Testbed and Framework(new/improvement) Approach What is the motivation for this PR? This PR aims to add Monit service into sanity check. Specifically before running each pytest script, sanity check will determine whether the Monit is running and at the same time determine whether the services which were monitored by Monit are running correctly or not. How did you do it? I added a new function check_monit(...) in checks.py to achieve the purpose. In this function, it will first call a common function get_monit_services_status(...) in devices.py to get the metadata (service name, service status and service type) of services which are monitored by Monit. Then it will determine whether Monit is running or not and whether each service which was monitored by Monit is running or not. How did you verify/test it? I tested this change against the lab device str-dx010-acs-1. Any platform specific information? N/A Supported testbed topology if it's a new test case? N/A Signed-off-by: Yong Zhao <[email protected]> Co-authored-by: Ying Xie <[email protected]>
1 parent 099204e commit 7aeb49f

File tree

3 files changed

+108
-2
lines changed

3 files changed

+108
-2
lines changed

tests/common/devices.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -403,6 +403,34 @@ def critical_services_fully_started(self):
403403
logging.debug("Status of critical services: %s" % str(result))
404404
return all(result.values())
405405

406+
def get_monit_services_status(self):
    """
    @summary: Get metadata (service name, service status and service type) of services
              which were monitored by Monit, by running "sudo monit status" on the
              device and parsing its output.

              Monit's output lists each service as a header line such as
              "Process 'snmp'" followed by indented attribute lines; the parser
              pairs each "status ..." attribute line with the header line just
              above it ("monitoring status ..." lines are deliberately skipped).
    @return: A dictionary in which key is the service name and values are service status
             and service type. Empty if the command failed (e.g. Monit not running).
    """
    monit_services_status = {}

    services_status_result = self.shell("sudo monit status", module_ignore_errors=True)

    # A non-zero exit code typically means Monit itself is not running;
    # return an empty dict so callers can treat that case explicitly.
    if services_status_result["rc"] != 0:
        return monit_services_status

    stdout_lines = services_status_result["stdout_lines"]
    for index, service_info in enumerate(stdout_lines):
        if "status" not in service_info or "monitoring status" in service_info:
            continue

        # The preceding line names the service, e.g. "Filesystem 'root-overlay'".
        # Guard index 0 explicitly: a plain [index - 1] would wrap to the last line.
        service_type_name = stdout_lines[index - 1] if index > 0 else ""
        fields = service_type_name.split("'")
        if len(fields) < 2:
            # Unexpected/malformed header (e.g. the "Monit ... uptime" banner);
            # skip it instead of raising IndexError on fields[1].
            continue

        service_type = fields[0].strip()
        service_name = fields[1].strip()
        # Everything after the literal "status" keyword is the status text.
        service_status = service_info[service_info.find("status") + len("status"):].strip()

        monit_services_status[service_name] = {
            "service_status": service_status,
            "service_type": service_type,
        }

    return monit_services_status
433+
406434
def get_critical_group_and_process_lists(self, container_name):
407435
"""
408436
@summary: Get critical group and process lists by parsing the

tests/common/plugins/sanity_check/checks.py

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77

88
logger = logging.getLogger(__name__)
99
SYSTEM_STABILIZE_MAX_TIME = 300
10+
MONIT_STABILIZE_MAX_TIME = 420
1011
OMEM_THRESHOLD_BYTES=10485760 # 10MB
1112

1213
def check_services(dut):
@@ -171,6 +172,80 @@ def check_dbmemory(dut):
171172
logger.info("Done checking database memory")
172173
return check_result
173174

175+
def check_monit_services_status(check_result, monit_services_status):
    """
    @summary: Validate that every service monitored by Monit reports the status
              expected for its service type. Services in "Not monitored" status
              are skipped, since they were deliberately (temporarily) excluded
              from monitoring.
    @return: The check_result dictionary, with "services_status" filled in and
             "failed" set to True if any monitored service is unhealthy.
    """
    # Healthy status expected for each Monit service type; types not listed
    # here are never treated as failures.
    healthy_status_by_type = {
        "Filesystem": "Accessible",
        "Process": "Running",
        "Program": "Status ok",
    }

    check_result["services_status"] = {}
    for name, info in monit_services_status.items():
        status = info["service_status"]
        check_result["services_status"][name] = status

        if status == "Not monitored":
            # Deliberately unmonitored; not a sanity-check failure.
            continue

        expected = healthy_status_by_type.get(info["service_type"])
        if expected is not None and status != expected:
            check_result["failed"] = True

    return check_result
193+
194+
def check_monit(dut):
    """
    @summary: Sanity-check Monit: confirm the daemon is running and that every
              service it monitors is in the correct status. While networking
              has been up for less than MONIT_STABILIZE_MAX_TIME seconds, the
              check retries periodically because Monit and the services it
              watches may still be settling after boot.
    @return: A dictionary contains the testing result (failed or not failed)
             and the status of each service; "failed_reason" is set when Monit
             itself is not running.
    """
    logger.info("Checking status of each Monit service...")
    networking_uptime = dut.get_networking_uptime().seconds
    # No retry window remains once networking has been up long enough.
    timeout = max((MONIT_STABILIZE_MAX_TIME - networking_uptime), 0)
    interval = 20
    logger.info("networking_uptime = {} seconds, timeout = {} seconds, interval = {} seconds" \
                .format(networking_uptime, timeout, interval))

    check_result = {"failed": False, "check_item": "monit"}

    if timeout == 0:
        # System is past the stabilization window: a single query decides.
        services_status = dut.get_monit_services_status()
        if not services_status:
            logger.info("Monit was not running.")
            check_result["failed"] = True
            check_result["failed_reason"] = "Monit was not running"
            logger.info("Checking status of each Monit service was done!")
            return check_result

        check_result = check_monit_services_status(check_result, services_status)
    else:
        # Still inside the stabilization window: poll until healthy or timeout.
        start_time = time.time()
        elapsed = 0
        monit_available = False
        while elapsed < timeout:
            check_result["failed"] = False
            services_status = dut.get_monit_services_status()
            if not services_status:
                wait(interval, msg="Monit was not started and wait {} seconds to retry. Remaining time: {}." \
                     .format(interval, timeout - elapsed))
                elapsed = time.time() - start_time
                continue

            monit_available = True
            check_result = check_monit_services_status(check_result, services_status)
            if not check_result["failed"]:
                # All monitored services healthy; stop polling.
                break
            wait(interval, msg="Services were not monitored and wait {} seconds to retry. Remaining time: {}. Services status: {}" \
                 .format(interval, timeout - elapsed, str(check_result["services_status"])))
            elapsed = time.time() - start_time

        if not monit_available:
            # Monit never came up within the retry window.
            logger.info("Monit was not running.")
            check_result["failed"] = True
            check_result["failed_reason"] = "Monit was not running"

    logger.info("Checking status of each Monit service was done!")
    return check_result
248+
174249
def check_processes(dut):
175250
logger.info("Checking process status on %s..." % dut.hostname)
176251

@@ -193,6 +268,7 @@ def check_processes(dut):
193268
start = time.time()
194269
elapsed = 0
195270
while elapsed < timeout:
271+
check_result["failed"] = False
196272
processes_status = dut.all_critical_process_status()
197273
check_result["processes_status"] = processes_status
198274
check_result["services_status"] = {}
@@ -228,6 +304,8 @@ def do_checks(duthosts, check_items):
228304
elif item == "bgp":
229305
if dut in duthosts.frontend_nodes:
230306
results[dut.hostname].append(check_bgp_status(dut))
307+
elif item == "monit":
308+
results[dut.hostname].append(check_monit(dut))
231309

232310
return results
233311

tests/common/plugins/sanity_check/constants.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -21,5 +21,5 @@
2121
"adaptive": {"cmd": None, "reboot": False, "adaptive": True, 'recover_wait': 30},
2222
} # All supported recover methods
2323

24-
SUPPORTED_CHECK_ITEMS = ["services", "interfaces", "dbmemory", "processes", "bgp"] # Supported checks
25-
DEFAULT_CHECK_ITEMS = ["services", "interfaces", "dbmemory", "processes", "bgp"] # Default checks
24+
# "monit" is included so the sanity check also verifies the Monit daemon and
# the services it monitors, by default and on request.
SUPPORTED_CHECK_ITEMS = ["services", "interfaces", "dbmemory", "processes", "bgp", "monit"] # Supported checks
DEFAULT_CHECK_ITEMS = ["services", "interfaces", "dbmemory", "processes", "bgp", "monit"] # Default checks

0 commit comments

Comments
 (0)