watchdog.py 20.3 KB
Newer Older
1
import os
L
Lucas Meneghel Rodrigues 已提交
2 3
import re
import time
4
import logging
Y
yama 已提交
5
import random
6
import platform
7

8 9
from avocado.utils import process
from virttest import error_context
10 11
from virttest import utils_misc
from virttest import env_process
12
from virttest import utils_test
13 14 15
from virttest import data_dir

from aexpect.exceptions import ShellTimeoutError
16 17


18
@error_context.context_aware
19
def run(test, params, env):
20 21
    """
    Configure watchdog, crash the guest and check if watchdog_action occurs.
22

23 24 25
    Test Step:
        1. see every function step
    Params:
L
Lucas Meneghel Rodrigues 已提交
26 27 28
        :param test: QEMU test object.
        :param params: Dictionary with test parameters.
        :param env: Dictionary with the test environment.
29 30
    """

31 32
    timeout = int(params.get("login_timeout", '360'))
    relogin_timeout = int(params.get("relogin_timeout", '240'))
33
    vm_arch_name = params["vm_arch_name"]
34

L
Lucas Meneghel Rodrigues 已提交
35
    watchdog_device_type = params.get("watchdog_device_type", "i6300esb")
36 37 38
    watchdog_action = params.get("watchdog_action", "reset")
    trigger_cmd = params.get("trigger_cmd", "echo c > /dev/watchdog")

L
Lucas Meneghel Rodrigues 已提交
39
    # internal function
40
    def _watchdog_device_check(test, session, watchdog_device):
        """
        Check that the watchdog device has been found and initialized
        successfully in the guest; report a failure/error otherwise.

        Steps:
        1. For ib700 and diag288, load the guest driver with modprobe.
        2. For i6300esb, rescan the PCI bus and look for the device in the
           ``lspci`` output; fail the test if it is not listed.
        3. Run the module check command (``module_check_cmd`` param, by
           default a ``dmesg | grep`` for the ``dmesg_info`` param) and
           raise a test error if its exit status is not 0.

        :param test: QEMU test object, used to report fail/error.
        :param session: guest connect session.
        :param watchdog_device: watchdog device type, e.g. "i6300esb".
        """
        # when using ib700 or diag288, need modprobe its driver manually.
        manual_modules = {"ib700": "ib700wdt", "diag288": "diag288_wdt"}
        if watchdog_device in manual_modules:
            session.cmd("modprobe %s" % manual_modules[watchdog_device])

        # when WDT is 6300esb need check pci info
        if watchdog_device == "i6300esb":
            error_context.context("checking pci info to ensure have WDT"
                                  " device", logging.info)
            session.cmd("echo 1 > /sys/bus/pci/rescan")
            lspci_output = session.cmd_output("lspci")
            # Empty lspci output is handled as "device not found". Searching
            # unconditionally keeps wdt_pci_info always bound, so the info
            # log below cannot hit an unbound local any more.
            wdt_pci_info = re.findall(r".*6300ESB Watchdog Timer",
                                      lspci_output or "")
            if not wdt_pci_info:
                test.fail("Can not find watchdog pci")
            logging.info("Found watchdog pci device : %s" % wdt_pci_info)

        # checking watchdog init info using dmesg
        error_context.context("Checking watchdog load info", logging.info)
        dmesg_info = params.get("dmesg_info", "(i6300ESB|ib700wdt).*init")
        module_check_cmd = params.get("module_check_cmd",
                                      "dmesg | grep -i '%s' " % dmesg_info)
        (status, output) = session.cmd_status_output(module_check_cmd)
        if status != 0:
            error_msg = "Wactchdog device '%s' load/initialization failed "
            test.error(error_msg % watchdog_device)
        logging.info("Watchdog device '%s' add and init successfully"
                     % watchdog_device)
        logging.debug("Init info : '%s'" % output)
75

76 77 78 79 80 81 82
    def _trigger_watchdog(session, trigger_cmd=None):
        """
        Send the command that triggers the watchdog to the guest.

        Nothing is sent when trigger_cmd is None.

        :param session: guest connect session.
        :param trigger_cmd: command line that triggers the watchdog.
        """
        if trigger_cmd is None:
            return
        context_msg = "Trigger Watchdog action using:'%s'." % trigger_cmd
        error_context.context(context_msg, logging.info)
        session.sendline(trigger_cmd)
87

88
    def _action_check(test, session, watchdog_action):
        """
        Check whether or not the watchdog action occurred. If the action did
        not occur the test is failed.

        :param test: QEMU test object, used to report failures.
        :param session: guest connect session.
        :param watchdog_action: watchdog action that is expected to happen.
        """
        # when watchdog action is pause, shutdown, reset, poweroff
        # the vm session will lost responsive

        def check_guest_reboot(pattern):
            """
            Poll the serial console until pattern appears in the output that
            was produced after o_before was captured.

            :param pattern: string searched for in the new console output.
            :return: True if found within vm.REBOOT_TIMEOUT seconds,
                     False otherwise.
            """
            start_time = time.time()
            while (time.time() - start_time) < vm.REBOOT_TIMEOUT:
                console_output = vm.serial_console.get_output()
                # Cut the already known output off as a prefix. str.strip()
                # takes a set of characters, not a prefix, so it would also
                # remove characters of the new output we are searching.
                if console_output.startswith(o_before):
                    console_output = console_output[len(o_before):]
                if pattern in console_output:
                    return True
                # do not spin on the console without any pause
                time.sleep(1)
            return False

        response_timeout = int(params.get("response_timeout", '240'))
        error_context.context("Check whether or not watchdog action '%s' took"
                              " effect" % watchdog_action, logging.info)
        if watchdog_action == "inject-nmi":
            if vm_arch_name in ("x86_64", "i686"):
                if not utils_misc.wait_for(
                        lambda: "NMI received" in session.cmd_output("dmesg"),
                        response_timeout, 0, 1):
                    test.fail("Guest didn't receive dmesg with 'NMI received',"
                              "after action '%s'." % watchdog_action)
                msg = session.cmd_output("dmesg").splitlines()[-8:]
                logging.info("Guest received dmesg info: %s" % msg)
            elif vm_arch_name in ("ppc64", "ppc64le"):
                rebooted = check_guest_reboot(params["guest_reboot_pattern"])
                if not rebooted:
                    test.fail("Guest isn't rebooted after watchdog action '%s'"
                              % watchdog_action)
                logging.info("Try to login the guest after reboot")
                # the responsiveness check below uses this new session
                session = vm.wait_for_login(timeout=timeout)

        # wait_for returns a false value when the session never stopped
        # responding within response_timeout
        if not utils_misc.wait_for(lambda: not session.is_responsive(),
                                   response_timeout, 0, 1):
            if watchdog_action in ("none", "debug", "inject-nmi"):
                logging.info("OK, the guest session is responsive still")
            else:
                txt = "It seems action '%s' took no" % watchdog_action
                txt += " effect, guest is still responsive."
                test.fail(txt)

        # when action is poweroff or shutdown(without no-shutdown option),
        # the vm will dead, and qemu exit.
        # The others the vm monitor still responsive, can report the vm status.
        if (watchdog_action == "poweroff" or (watchdog_action == "shutdown" and
                                              params.get("disable_shutdown") != "yes")):
            if not utils_misc.wait_for(lambda: vm.is_dead(),
                                       response_timeout, 0, 1):
                txt = "It seems '%s' action took no effect, " % watchdog_action
                txt += "guest is still alive!"
                test.fail(txt)
        else:
            # monitor status expected for each action; anything not listed
            # is expected to leave the guest running
            expected_status = {"pause": "paused", "shutdown": "shutdown"}
            f_param = expected_status.get(watchdog_action, "running")

            if not utils_misc.wait_for(
                    lambda: vm.monitor.verify_status(f_param),
                    response_timeout, 0, 1):
                logging.debug("Monitor status is:%s" % vm.monitor.get_status())
                txt = "It seems action '%s' took no effect" % watchdog_action
                txt += " , Wrong monitor status!"
                test.fail(txt)

        # when the action is reset, need can relogin the guest.
        if watchdog_action == "reset":
            logging.info("Try to login the guest after reboot")
            vm.wait_for_login(timeout=relogin_timeout)
        logging.info("Watchdog action '%s' come into effect." %
                     watchdog_action)
162 163 164 165 166 167 168 169

    def check_watchdog_support():
        """
        Check that the host qemu binary supports the wanted watchdog device.

        Test Step:
        1. Run the qemu binary with "-watchdog '?'" (prefixed with
           "-M <machine>" when platform.machine() contains "aarch"), or with
           the suffix given by the watchdog_type_check param.
        2. Cancel the test unless watchdog_device_type shows up in the
           command output.
        """
        qemu_binary = utils_misc.get_qemu_binary(params)

        default_arg = " -watchdog '?'"
        if "aarch" in platform.machine():
            # machine_type is expected in the "<prefix>:<machine>" form
            machine_type = params.get("machine_type").split(':', 1)[1]
            default_arg = " -M %s%s" % (machine_type, default_arg)
        qemu_cmd = qemu_binary + params.get("watchdog_type_check", default_arg)

        # check the host support watchdog types.
        error_context.context("Checking whether or not the host support"
                              " WDT '%s'" % watchdog_device_type, logging.info)
        raw_output = process.system_output("%s 2>&1" % qemu_cmd, shell=True)
        supported_devices = raw_output.decode()

        if not supported_devices:
            test.cancel("No watchdog device supported by the host!")
        elif re.search(watchdog_device_type, supported_devices, re.I):
            logging.info("The host support '%s' type watchdog device" %
                         watchdog_device_type)
        else:
            logging.info("The host support watchdog device type is: '%s'"
                         % supported_devices)
            test.cancel("watdog %s isn't supported" % watchdog_device_type)
195 196 197 198 199 200 201 202

    def guest_boot_with_watchdog():
        """
        Check that the guest can boot with the watchdog device.

        Test Step:
        1. Boot guest with watchdog device
        2. Check the watchdog device has been initialized successfully in
           the guest
        """
        _watchdog_device_check(test, session,
                               watchdog_device=watchdog_device_type)
204 205 206 207 208 209 210 211 212 213 214

    def watchdog_action_test():
        """
        Watchdog action test.

        Test Step:
        1. Boot guest with watchdog device
        2. Check the watchdog device has been initialized successfully in
           the guest
        3. Trigger the watchdog action through opening /dev/watchdog
        4. Ensure watchdog_action takes effect.
        """
        _watchdog_device_check(test, session,
                               watchdog_device=watchdog_device_type)
        _trigger_watchdog(session, trigger_cmd=trigger_cmd)
        _action_check(test, session, watchdog_action=watchdog_action)
218

219 220 221 222 223 224 225 226 227 228 229 230 231 232
    def magic_close_support():
        """
        Magic close of the watchdog.

        Test Step:
        1. Boot guest with watchdog device
        2. Check the watchdog device has been initialized successfully in
           the guest
        3. Inside guest, trigger the watchdog action
        4. Inside guest, before the heartbeat expires, send the magic close
           command
        5. Wait for the heartbeat timeout and check that the watchdog action
           did not happen.
        """
        response_timeout = int(params.get("response_timeout", '240'))
        magic_cmd = params.get("magic_close_cmd", "echo V > /dev/watchdog")

        _watchdog_device_check(test, session, watchdog_device_type)
        _trigger_watchdog(session, trigger_cmd)

        # magic close
        error_context.context("Magic close is start", logging.info)
        _trigger_watchdog(session, magic_cmd)

        # a session that stops responding means the action fired anyway
        session_lost = utils_misc.wait_for(
            lambda: not session.is_responsive(), response_timeout, 0, 1)
        if session_lost:
            test.fail("Watchdog action took effect, magic close FAILED")
        logging.info("Magic close took effect.")
245 246 247 248 249 250 251

    def migration_when_wdt_timeout():
        """
        Migration while the watchdog timeout is pending.

        Test Step:
        1. Boot guest with watchdog device
        2. Check the watchdog device has been initialized successfully in
           the guest
        3. Start VM with watchdog device, action reset|pause
        4. Inside RHEL guest, trigger watchdog
        5. Before WDT timeout, do vm migration
        6. After migration, check the watchdog action takes effect
        """
        mig_timeout = float(params.get("mig_timeout", "3600"))
        mig_protocol = params.get("migration_protocol", "tcp")
        # 2 when the mig_cancel param is "yes", 0 otherwise
        mig_cancel_delay = 2 if params.get("mig_cancel") == "yes" else 0

        _watchdog_device_check(test, session, watchdog_device_type)
        _trigger_watchdog(session, trigger_cmd)

        error_context.context("Do migration(protocol:%s),Watchdog have"
                              " been triggered." % mig_protocol, logging.info)
        # migrate in a thread so the action check runs while it is ongoing
        migrate_thread = utils_misc.InterruptedThread(
            vm.migrate, (mig_timeout, mig_protocol, mig_cancel_delay))
        migrate_thread.start()
        _action_check(test, session, watchdog_action)
        migrate_thread.join(timeout=mig_timeout)
272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288

    def hotplug_unplug_watchdog_device():
        """
        Hotplug/unplug watchdog device.

        Test Step:
        1. Start VM with "-watchdog-action pause" CLI option
        2. Add WDT via monitor
        3. Trigger watchdog action in guest
        4. Remove WDT device through monitor cmd "device_del"
        5. Resume and relogin the guest, check the device has been removed.
        """
        wdt_pci_pattern = ".*6300ESB Watchdog Timer"
        device_id = "watchdog"

        # the device must not be there before the hotplug
        session = vm.wait_for_login(timeout=timeout)
        lspci_before = session.cmd_output("lspci")
        if lspci_before and re.findall(wdt_pci_pattern, lspci_before):
            test.fail("Can find watchdog pci")

        plug_watchdog_device = params.get("plug_watchdog_device", "i6300esb")
        machine_type = params.get("machine_type")
        watchdog_device_add = ("device_add driver=%s, id=%s"
                               % (plug_watchdog_device, device_id))
        if machine_type == "q35":
            watchdog_device_add += ",bus=pcie-pci-bridge-0,addr=0x1f"
        watchdog_device_del = "device_del id=%s" % device_id

        error_context.context(("Hotplug watchdog device '%s'" %
                               plug_watchdog_device), logging.info)
        vm.monitor.send_args_cmd(watchdog_device_add)

        # wait watchdog device init
        time.sleep(5)
        _watchdog_device_check(test, session, plug_watchdog_device)
        _trigger_watchdog(session, trigger_cmd)
        _action_check(test, session, watchdog_action)

        error_context.context("Hot unplug watchdog device", logging.info)
        vm.monitor.send_args_cmd(watchdog_device_del)

        error_context.context("Resume the guest, check the WDT have"
                              " been removed", logging.info)
        vm.resume()
        session = vm.wait_for_login(timeout=timeout)
        lspci_after = session.cmd_output("lspci")
        if lspci_after:
            if re.findall(wdt_pci_pattern, lspci_after):
                test.fail("Oops, find watchdog pci, unplug failed")
            logging.info("The WDT remove successfully")

323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346
    def stop_cont_test():
        """
        Check that the emulated watchdog device works properly together with
        the stop/continue operation: no WATCHDOG event while the VM is
        paused, and a WATCHDOG event after it is resumed.
        """

        def watchdog_event():
            # event lookup shared by both waits below
            return vm.monitor.get_event("WATCHDOG")

        response_timeout = int(params.get("response_timeout", '240'))
        _watchdog_device_check(test, session, watchdog_device_type)
        vm.monitor.clear_event("WATCHDOG")
        _trigger_watchdog(session, trigger_cmd)

        vm.pause()
        fired_while_paused = utils_misc.wait_for(watchdog_event,
                                                 timeout=response_timeout)
        if fired_while_paused:
            test.fail("Watchdog action '%s' still took effect after pausing "
                      "VM." % watchdog_action)
        logging.info("Watchdog action '%s' didn't take effect after pausing "
                     "VM, it is expected." % watchdog_action)

        vm.resume()
        fired_after_resume = utils_misc.wait_for(watchdog_event,
                                                 timeout=response_timeout)
        if not fired_after_resume:
            test.fail("Watchodg action '%s' didn't take effect after resuming "
                      "VM." % watchdog_action)
        _action_check(test, session, watchdog_action)

347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382
    def watchdog_test_suit():
        """
        Run watchdog-test-framework to verify the function of emulated watchdog
        devices.

        The framework sources (``watchdog_test_lib`` param, looked up under the
        deps dir) are copied to /home in the guest, built with ``make`` and
        started as ``./watchdog-test --yes``.

        Test steps of the framework are as follows:
        1) Set up the watchdog with a 30 second timeout.
        2) Ping the watchdog for 60 seconds.  During this time the guest should
        run normally.
        3) Stop pinging the watchdog and just count up.  If the virtual watchdog
        device is set correctly, then the watchdog action (eg. pause) should
        happen around the 30 second mark.

        The test passes only when the command times out and the last counter
        value printed is between 27 and 32; any other outcome fails the test.
        """

        _watchdog_device_check(test, session, watchdog_device_type)
        watchdog_test_lib = params["watchdog_test_lib"]
        src_path = os.path.join(data_dir.get_deps_dir(), watchdog_test_lib)
        test_dir = os.path.basename(watchdog_test_lib)
        # remove leftovers of a previous run before copying a fresh tree
        session.cmd_output("rm -rf /home/%s" % test_dir)
        vm.copy_files_to(src_path, "/home")
        # the "cd" is issued in the same session that later runs
        # ./watchdog-test with a relative path
        session.cmd_output("cd /home/%s && make" % test_dir)
        try:
            # A ShellTimeoutError is the expected outcome here.
            # NOTE(review): presumably the guest stops answering once the
            # watchdog action fires - confirm.
            session.cmd_output("./watchdog-test --yes &", timeout=130)
        except ShellTimeoutError:
            # To judge if watchdog action happens after 30s
            # NOTE(review): assumes the last output line is the counter
            # followed by dots, e.g. "30..." - confirm against the framework
            o = session.get_output().splitlines()[-1]
            if 27 <= int(o.rstrip("...")) <= 32:
                _action_check(test, session, watchdog_action)
            else:
                test.fail("Watchdog action doesn't happen after 30s.")
        else:
            # the command returned within the timeout
            test.fail("Watchdog test suit doesn't run successfully.")
        finally:
            # always resume the VM before the cleanup commands are sent
            vm.resume()
            session.cmd_output("pkill watchdog-test")
            session.cmd_output("rm -rf /home/%s" % test_dir)

Y
yama 已提交
383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442
    def heartbeat_test():
        """
        Heartbeat test for i6300esb.

        Test steps:
        1. Start VM with "-watchdog-action pause" CLI option
        2. Set heartbeat value and reload the i6300esb module
        3. Trigger watchdog action through open /dev/watchdog
        4. Ensure watchdog_action takes effect after $heartbeat.

        Params read: del_module_cmd, reload_module_cmd (formatted with the
        heartbeat value), heartbeat ("random_value" or an expression) and
        dmesg_cmd.
        """
        del_module_cmd = params["del_module_cmd"]
        reload_module_cmd = params["reload_module_cmd"]
        _watchdog_device_check(test, session, watchdog_device_type)
        error_context.context("set heartbeat value and reload the i6300esb "
                              "module", logging.info)
        session.cmd(del_module_cmd)
        heartbeat = params["heartbeat"]
        if heartbeat == "random_value":
            heartbeat = random.randint(1, 20)
        else:
            # NOTE(review): eval() executes arbitrary code taken from the
            # test configuration; kept because the accepted value syntax is
            # not visible here. Consider int() if only integers are used.
            heartbeat = eval(heartbeat)
        dmesg_cmd = params["dmesg_cmd"]
        session.cmd(dmesg_cmd)
        session.cmd_output(reload_module_cmd % heartbeat)
        if heartbeat < -2147483648 or heartbeat > 2147483647:
            # outside the signed 32 bit range: dmesg must report it invalid
            o = session.cmd_output("dmesg | grep -i 'i6300esb.*invalid'")
            if o:
                logging.info("Heartbeat value %s is out of range, it is "
                             "expected." % heartbeat)
            else:
                test.fail("No invalid heartbeat info in dmesg.")
        elif -2147483648 <= heartbeat < 1 or 2046 < heartbeat <= 2147483647:
            # 32 bit value outside 1..2046: the default of 30s is expected
            o = session.cmd_output("dmesg | grep -i 'heartbeat=30'")
            if not o:
                test.fail("Heartbeat value isn't default 30 sec in dmesg, it "
                          "should be.")
            heartbeat = 30
        elif 1 <= heartbeat <= 2046:
            o = session.cmd_output("dmesg | grep -i 'heartbeat=%s'" % heartbeat)
            if not o:
                test.fail("Heartbeat value isn't %s sec in dmesg" % heartbeat)
        # skipped only for the out-of-range values handled in the first
        # branch above; every other case has a usable heartbeat by now
        if heartbeat <= 2147483647 and heartbeat > -2147483648:
            _watchdog_device_check(test, session, watchdog_device_type)
            _trigger_watchdog(session, trigger_cmd)
            error_context.context("Watchdog will fire after %s s" % heartbeat,
                                  logging.info)
            start_time = time.time()
            # allow 2 seconds of slack on top of the heartbeat
            end_time = start_time + float(heartbeat) + 2
            while not vm.monitor.verify_status("paused"):
                if time.time() > end_time:
                    # the original message ran "take" and "effect" together
                    test.fail("Monitor status is:%s, watchdog action '%s' didn't take"
                              " effect" % (vm.monitor.get_status(), watchdog_action))
                time.sleep(1)
            guest_pause_time = time.time() - start_time
            if abs(guest_pause_time - float(heartbeat)) <= 2:
                logging.info("Watchdog action '%s' took effect after '%s's." %
                             (watchdog_action, guest_pause_time))
            else:
                # report the real direction of the deviation; the action can
                # also fire late (the poll loop sleeps 1s between checks)
                if guest_pause_time < float(heartbeat):
                    deviation = "earlier"
                else:
                    deviation = "later"
                test.fail("Watchdog action '%s' took effect after '%s's, it is %s"
                          " than expected." % (watchdog_action, guest_pause_time,
                                               deviation))

L
Lucas Meneghel Rodrigues 已提交
443
    # main procedure
444
    test_type = params.get("test_type")
    check_watchdog_support()

    error_context.context("'%s' test starting ... " % test_type, logging.info)
    error_context.context("Boot VM with WDT(Device:'%s', Action:'%s'),"
                          " and try to login" %
                          (watchdog_device_type, watchdog_action),
                          logging.info)
    # the VM is started here, after the host support check above
    params["start_vm"] = "yes"
    env_process.preprocess_vm(test, params, env, params.get("main_vm"))
    vm = env.get_vm(params["main_vm"])
    session = vm.wait_for_login(timeout=timeout)

    # console output captured before the watchdog is triggered; read by
    # check_guest_reboot() in _action_check for inject-nmi on ppc64
    if (watchdog_action == "inject-nmi" and vm_arch_name in ("ppc64", "ppc64le")):
        o_before = vm.serial_console.get_output()

    if params.get("setup_runlevel") == "yes":
        error_context.context("Setup the runlevel for guest", logging.info)
        utils_test.qemu.setup_runlevel(params, session)

    # The test function is looked up by name among the local definitions.
    # Require a callable so that a test_type naming a plain local variable
    # (e.g. "params") is reported as unknown instead of raising TypeError.
    test_running = locals().get(test_type)
    if callable(test_running):
        test_running()
    else:
        test.error("Oops test %s doesn't exist, have a check please."
                   % test_type)