From 76d6efa58e3b24ccea70c080af6818ffdcf8ba5f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Luk=C3=A1=C5=A1=20Doktor?= <ldoktor@redhat.com>
Date: Tue, 25 Apr 2017 16:04:03 +0200
Subject: [PATCH] runner: Increase test postprocess timeout
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Currently there is a hardcoded test-postprocess timeout consisting of
two cycle-timeouts which is something between 1 and 2 seconds. This is
not sufficient on heavily loaded machines so this patch increases the
timeouts to:

 1s: when test was interrupted (ctrl+c/timeout)
10s: when the process died but the status was not yet delivered
60s: when test reported status but the process did not finish

taking into account the current available test/job timeout.

As those deadlines are significantly longer, users could have noticed a
frozen throbber, which is why I propagate the results_dispatcher and
handle the throbber also during the postprocess, which makes this patch
a bit more complicated than one would expect. I used "progress = False"
to distinguish between running test and postprocessing it.

Signed-off-by: Lukáš Doktor <ldoktor@redhat.com>
---
 avocado/core/runner.py             | 48 ++++++++++++++++++++----------
 selftests/functional/test_basic.py | 10 +++++--
 2 files changed, 40 insertions(+), 18 deletions(-)

diff --git a/avocado/core/runner.py b/avocado/core/runner.py
index 97028dcc..c9910366 100644
--- a/avocado/core/runner.py
+++ b/avocado/core/runner.py
@@ -39,6 +39,13 @@ from ..utils import stacktrace
 TEST_LOG = logging.getLogger("avocado.test")
 APP_LOG = logging.getLogger("avocado.app")
 
+#: when test was interrupted (ctrl+c/timeout)
+TIMEOUT_TEST_INTERRUPTED = 1
+#: when the process died but the status was not yet delivered
+TIMEOUT_PROCESS_DIED = 10
+#: when test reported status but the process did not finish
+TIMEOUT_PROCESS_ALIVE = 60
+
 
 def add_runner_failure(test_state, new_status, message):
     """
@@ -194,32 +201,38 @@ class TestStatus(object):
                                       " see overall job.log for details.")
         return test_state
 
-    def finish(self, proc, started, timeout, step):
+    def finish(self, proc, started, step, deadline, result_dispatcher):
         """
         Wait for the test process to finish and report status or error status
         if unable to obtain the status till deadline.
 
         :param proc: The test's process
         :param started: Time when the test started
-        :param timeout: Timeout for waiting on status
         :param first: Delay before first check
         :param step: Step between checks for the status
+        :param deadline: Test execution deadline
+        :param result_dispatcher: Result dispatcher (for test_progress
+               notifications)
         """
         # Wait for either process termination or test status
-        wait.wait_for(lambda: not proc.is_alive() or self.status, timeout, 0,
+        wait.wait_for(lambda: not proc.is_alive() or self.status, 1, 0,
                       step)
         if self.status:     # status exists, wait for process to finish
-            if not wait.wait_for(lambda: not proc.is_alive(), timeout, 0,
+            deadline = min(deadline, time.time() + TIMEOUT_PROCESS_ALIVE)
+            while time.time() < deadline:
+                result_dispatcher.map_method('test_progress', False)
+                if wait.wait_for(lambda: not proc.is_alive(), 1, 0,
                                  step):
-                err = "Test reported status but did not finish"
-            else:   # Test finished and reported status, pass
-                return self._add_status_failures(self.status)
+                    return self._add_status_failures(self.status)
+            err = "Test reported status but did not finish"
         else:   # proc finished, wait for late status delivery
-            if not wait.wait_for(lambda: self.status, timeout, 0, step):
-                err = "Test died without reporting the status."
-            else:
-                # Status delivered after the test process finished, pass
-                return self._add_status_failures(self.status)
+            deadline = min(deadline, time.time() + TIMEOUT_PROCESS_DIED)
+            while time.time() < deadline:
+                result_dispatcher.map_method('test_progress', False)
+                if wait.wait_for(lambda: self.status, 1, 0, step):
+                    # Status delivered after the test process finished, pass
+                    return self._add_status_failures(self.status)
+            err = "Test died without reporting the status."
         # At this point there were failures, fill the new test status
         TEST_LOG.debug("Original status: %s", str(self.status))
         test_state = self.early_status
@@ -432,9 +445,14 @@ class TestRunner(object):
                         stage_2_msg_displayed = True
                     os.kill(proc.pid, signal.SIGKILL)
 
-        # Get/update the test status
-        test_state = test_status.finish(proc, time_started, cycle_timeout,
-                                        step)
+        # Get/update the test status (decrease timeout on abort)
+        if abort_reason:
+            finish_deadline = TIMEOUT_TEST_INTERRUPTED
+        else:
+            finish_deadline = deadline
+        test_state = test_status.finish(proc, time_started, step,
+                                        finish_deadline,
+                                        result_dispatcher)
 
         # Try to log the timeout reason to test's results and update test_state
         if abort_reason:
diff --git a/selftests/functional/test_basic.py b/selftests/functional/test_basic.py
index 10c1cb1f..3e693db9 100644
--- a/selftests/functional/test_basic.py
+++ b/selftests/functional/test_basic.py
@@ -87,7 +87,7 @@ import time
 class MyTest(Test):
     def test(self):
          self.runner_queue.put({"running": False})
-         time.sleep(60)
+         time.sleep(70)
 '''
 
 
@@ -257,7 +257,7 @@ class RunnerOperationTest(unittest.TestCase):
                                     REPORTS_STATUS_AND_HANG,
                                     "hanged_test_with_status") as tst:
             res = process.run("%s run --sysinfo=off --job-results-dir %s %s "
-                              "--json -" % (AVOCADO, self.tmpdir, tst),
+                              "--json - --job-timeout 1" % (AVOCADO, self.tmpdir, tst),
                               ignore_status=True)
             self.assertEqual(res.exit_status, exit_codes.AVOCADO_TESTS_FAIL)
             results = json.loads(res.stdout)
@@ -266,7 +266,11 @@ class RunnerOperationTest(unittest.TestCase):
                                                "ERROR", res))
             self.assertIn("Test reported status but did not finish",
                           results["tests"][0]["fail_reason"])
-            self.assertLess(res.duration, 40, "Test execution took too long, "
+            # Currently it should finish up to 1s after the job-timeout
+            # but the prep and postprocess could take a bit longer on
+            # some environments, so let's just check it does not take
+            # > 60s, which is the deadline for force-finishing the test.
+            self.assertLess(res.duration, 55, "Test execution took too long, "
                             "which is likely because the hanged test was not "
                             "interrupted. Results:\n%s" % res)
 
-- 
GitLab