Unverified · Commit 14adf489 · authored by Thomas Strömberg, committed by GitHub

Merge pull request #5782 from tstromberg/test-timeout

Improve parallel start scheduling and autoset parallelism
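In short, this PR drops the hard-coded PARALLEL_COUNT from each CI job script, raises the suite timeout from 60m to 70m, and replaces the MaybeSlowParallel back-off hack with a WaitForStartSlot helper: each test reserves the next start slot, spaced apart by a new -start-offset flag (default 30s), before running minikube start. A minimal standalone sketch of the slot-reservation idea (illustrative names only; the real helper appears in the diff below):

	package main

	import (
		"fmt"
		"sync"
		"time"
	)

	var (
		offset     = 30 * time.Second // mirrors the new -start-offset default
		startTimes []time.Time        // reserved start slots, newest last
		mu         sync.Mutex
	)

	// reserveSlot returns the earliest time the caller may run "minikube start":
	// now, or offset after the most recently reserved slot, whichever is later.
	func reserveSlot() time.Time {
		mu.Lock()
		defer mu.Unlock()
		wakeup := time.Now()
		if n := len(startTimes); n > 0 {
			if next := startTimes[n-1].Add(offset); wakeup.Before(next) {
				wakeup = next
			}
		}
		startTimes = append(startTimes, wakeup)
		return wakeup
	}

	func main() {
		for i := 0; i < 3; i++ {
			fmt.Printf("test %d may start at %s\n", i, reserveSlot().Format(time.RFC3339))
		}
	}

Because slots are reserved under a mutex, concurrent tests serialize their cluster starts without capping overall test parallelism, which is why the fixed PARALLEL_COUNT knob becomes unnecessary.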
@@ -23,8 +23,6 @@
 # EXTRA_START_ARGS: additional flags to pass into minikube start
 # EXTRA_ARGS: additional flags to pass into minikube
 # JOB_NAME: the name of the logfile and check name to update on github
-# PARALLEL_COUNT: number of tests to run in parallel

 readonly TEST_ROOT="${HOME}/minikube-integration"
 readonly TEST_HOME="${TEST_ROOT}/${OS_ARCH}-${VM_DRIVER}-${MINIKUBE_LOCATION}-$$-${COMMIT}"
@@ -243,7 +241,7 @@ if [ "$(uname)" != "Darwin" ]; then
   docker build -t gcr.io/k8s-minikube/gvisor-addon:2 -f testdata/gvisor-addon-Dockerfile ./testdata
 fi

-readonly LOAD=$(uptime | egrep -o "load average.*: [0-9]" | cut -d" " -f3)
+readonly LOAD=$(uptime | egrep -o "load average.*: [0-9]+" | cut -d" " -f3)
 if [[ "${LOAD}" -gt 2 ]]; then
   echo ""
   echo "********************** LOAD WARNING ********************************"

@@ -255,21 +253,18 @@ if [[ "${LOAD}" -gt 2 ]]; then
     top -b -n1 | head -n 15
   fi
   echo "********************** LOAD WARNING ********************************"
-  echo ""
   echo "Sleeping 30s to see if load goes down ...."
   sleep 30
   uptime
 fi

 echo ""
 echo ">> Starting ${E2E_BIN} at $(date)"
 set -x
 ${SUDO_PREFIX}${E2E_BIN} \
   -minikube-start-args="--vm-driver=${VM_DRIVER} ${EXTRA_START_ARGS}" \
   -expected-default-driver="${EXPECTED_DEFAULT_DRIVER}" \
-  -test.timeout=60m \
-  -test.parallel=${PARALLEL_COUNT} \
+  -test.timeout=70m \
   ${EXTRA_TEST_ARGS} \
   -binary="${MINIKUBE_BIN}" && result=$? || result=$?
 set +x
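One subtle fix above: the old egrep pattern ended in [0-9], which matches only the first digit of the load average, so a load of 12 was extracted as "1" and slipped past the -gt 2 warning check. A small Go demonstration of the difference (the shell pipeline's cut -d" " -f3 then pulls out the number):

	package main

	import (
		"fmt"
		"regexp"
	)

	func main() {
		// Sample `uptime` tail on a busy host; the integer part has two digits.
		line := "load average: 12.04, 9.50, 7.20"
		old := regexp.MustCompile(`load average.*: [0-9]`)    // stops at the first digit
		fixed := regexp.MustCompile(`load average.*: [0-9]+`) // takes the whole integer part
		fmt.Println(old.FindString(line))   // "load average: 1"  -> cut yields "1"
		fmt.Println(fixed.FindString(line)) // "load average: 12" -> cut yields "12"
	}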
@@ -28,7 +28,6 @@ set -e
 OS_ARCH="linux-amd64"
 VM_DRIVER="kvm2"
 JOB_NAME="KVM_Linux"
-PARALLEL_COUNT=4
 EXPECTED_DEFAULT_DRIVER="kvm2"

 # We pick kvm as our gvisor testbed because it is fast & reliable
@@ -30,7 +30,6 @@ OS_ARCH="linux-amd64"
 VM_DRIVER="none"
 JOB_NAME="none_Linux"
 EXTRA_ARGS="--bootstrapper=kubeadm"
-PARALLEL_COUNT=1
 EXPECTED_DEFAULT_DRIVER="kvm2"
 SUDO_PREFIX="sudo -E "
@@ -28,7 +28,6 @@ set -e
 OS_ARCH="linux-amd64"
 VM_DRIVER="virtualbox"
 JOB_NAME="VirtualBox_Linux"
-PARALLEL_COUNT=4
 EXPECTED_DEFAULT_DRIVER="kvm2"

 # Download files and set permissions
@@ -31,7 +31,6 @@ VM_DRIVER="hyperkit"
 JOB_NAME="HyperKit_macOS"
 EXTRA_ARGS="--bootstrapper=kubeadm"
 EXTRA_START_ARGS=""
-PARALLEL_COUNT=3
 EXPECTED_DEFAULT_DRIVER="hyperkit"
@@ -29,7 +29,6 @@ OS_ARCH="darwin-amd64"
 VM_DRIVER="virtualbox"
 JOB_NAME="VirtualBox_macOS"
 EXTRA_ARGS="--bootstrapper=kubeadm"
-PARALLEL_COUNT=3

 # hyperkit behaves better, so it has higher precedence.
 # Assumes that hyperkit is also installed on the VirtualBox CI host.
 EXPECTED_DEFAULT_DRIVER="hyperkit"
@@ -36,8 +36,8 @@ import (
 // TestAddons tests addons that require no special environment -- in parallel
 func TestAddons(t *testing.T) {
-	MaybeSlowParallel(t)
+	MaybeParallel(t)
+	WaitForStartSlot(t)
 	profile := UniqueProfileName("addons")
 	ctx, cancel := context.WithTimeout(context.Background(), 40*time.Minute)
 	defer CleanupWithLogs(t, profile, cancel)
@@ -30,7 +30,8 @@ func TestDockerFlags(t *testing.T) {
 	if NoneDriver() {
 		t.Skip("skipping: none driver does not support ssh or bundle docker")
 	}
-	MaybeSlowParallel(t)
+	MaybeParallel(t)
+	WaitForStartSlot(t)
 	profile := UniqueProfileName("docker-flags")
 	ctx, cancel := context.WithTimeout(context.Background(), 20*time.Minute)
@@ -27,7 +27,9 @@ import (
 )

 func TestGuestEnvironment(t *testing.T) {
-	MaybeSlowParallel(t)
+	MaybeParallel(t)
+	WaitForStartSlot(t)
 	profile := UniqueProfileName("guest")
 	ctx, cancel := context.WithTimeout(context.Background(), 15*time.Minute)
 	defer CleanupWithLogs(t, profile, cancel)
@@ -34,8 +34,8 @@ func TestGvisorAddon(t *testing.T) {
 		t.Skip("skipping test because --gvisor=false")
 	}
-	MaybeSlowParallel(t)
+	MaybeParallel(t)
+	WaitForStartSlot(t)
 	profile := UniqueProfileName("gvisor")
 	ctx, cancel := context.WithTimeout(context.Background(), 60*time.Minute)
 	defer func() {
@@ -42,8 +42,10 @@ import (
 )

 var (
-	antiRaceCounter = 0
-	antiRaceMutex   = &sync.Mutex{}
+	// startTimes is a list of startup times, to guarantee --start-offset
+	startTimes = []time.Time{}
+	// startTimesMutex is a lock to update startTimes without a race condition
+	startTimesMutex = &sync.Mutex{}
 )

 // RunResult stores the result of a cmd.Run call
@@ -330,25 +332,32 @@ func MaybeParallel(t *testing.T) {
 	t.Parallel()
 }

-// MaybeSlowParallel is a terrible workaround for tests which start clusters in a race-filled world
-// TODO: Try removing this hack once certificates are deployed per-profile
-func MaybeSlowParallel(t *testing.T) {
-	// NoneDriver shouldn't parallelize "minikube start"
+// WaitForStartSlot enforces --start-offset to avoid startup race conditions
+func WaitForStartSlot(t *testing.T) {
+	// Not parallel
 	if NoneDriver() {
 		return
 	}

-	antiRaceMutex.Lock()
-	antiRaceCounter++
-	antiRaceMutex.Unlock()
+	wakeup := time.Now()
+	startTimesMutex.Lock()
+	if len(startTimes) > 0 {
+		nextStart := startTimes[len(startTimes)-1].Add(*startOffset)
+		// Ignore nextStart if it is in the past - to guarantee offset for next caller
+		if time.Now().Before(nextStart) {
+			wakeup = nextStart
+		}
+	}
+	startTimes = append(startTimes, wakeup)
+	startTimesMutex.Unlock()

-	if antiRaceCounter > 0 {
-		// Slow enough to offset start, but not slow to be a major source of delay
-		penalty := time.Duration(5*antiRaceCounter) * time.Second
-		t.Logf("MaybeSlowParallel: Sleeping %s to avoid start race ...", penalty)
-		time.Sleep(penalty)
+	if time.Now().Before(wakeup) {
+		d := time.Until(wakeup)
+		t.Logf("Waiting for start slot at %s (sleeping %s) ...", wakeup, d)
+		time.Sleep(d)
+	} else {
+		t.Logf("No need to wait for start slot, it is already %s", time.Now())
 	}
-	t.Parallel()
 }

 // killProcessFamily kills a pid and all of its children
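For reference, the adoption pattern used by every converted test is two calls at the top of the function, plus another WaitForStartSlot before any later minikube start (as TestStartStop and TestVersionUpgrade do below). A hedged sketch, assuming it lives in the same package as the helpers; TestExample, the profile name, and the timeout are illustrative, not part of this PR:

	package integration

	import (
		"context"
		"testing"
		"time"
	)

	// TestExample shows the adoption pattern: opt into parallelism,
	// then block until a start slot has been reserved.
	func TestExample(t *testing.T) {
		MaybeParallel(t)    // t.Parallel(), except for drivers that must run serially
		WaitForStartSlot(t) // sleep until this test's --start-offset slot arrives

		profile := UniqueProfileName("example")
		ctx, cancel := context.WithTimeout(context.Background(), 10*time.Minute)
		defer CleanupWithLogs(t, profile, cancel)
		_ = ctx // a real test passes ctx to Run(t, exec.CommandContext(ctx, ...))
	}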
@@ -33,6 +33,7 @@ var defaultDriver = flag.String("expected-default-driver", "", "Expected default
 var forceProfile = flag.String("profile", "", "force tests to run against a particular profile")
 var cleanup = flag.Bool("cleanup", true, "cleanup failed test run")
 var enableGvisor = flag.Bool("gvisor", false, "run gvisor integration test (slow)")
+var startOffset = flag.Duration("start-offset", 30*time.Second, "how much time to offset between cluster starts")
 var postMortemLogs = flag.Bool("postmortem-logs", true, "show logs after a failed test run")

 // Paths to files - normally set for CI
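Because the offset is an ordinary flag.Duration, a CI job can widen the spacing on slow hardware without code changes. A hypothetical invocation (the binary names are placeholders; the real ones come from E2E_BIN and MINIKUBE_BIN in common.sh above):

	./e2e-linux-amd64 -start-offset=1m -test.timeout=70m -binary=./minikube-linux-amd64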
@@ -75,7 +75,8 @@ func TestStartStop(t *testing.T) {
 	for _, tc := range tests {
 		tc := tc
 		t.Run(tc.name, func(t *testing.T) {
-			MaybeSlowParallel(t)
+			MaybeParallel(t)
+			WaitForStartSlot(t)

 			if !strings.Contains(tc.name, "docker") && NoneDriver() {
 				t.Skipf("skipping %s - incompatible with none driver", t.Name())

@@ -136,6 +137,7 @@ func TestStartStop(t *testing.T) {
 				t.Errorf("status = %q; want = %q", got, state.Stopped)
 			}

+			WaitForStartSlot(t)
 			rr, err = Run(t, exec.CommandContext(ctx, Target(), startArgs...))
 			if err != nil {
 				// Explicit fatal so that failures don't move directly to deletion
@@ -39,9 +39,10 @@ import (
 // the oldest supported k8s version and then runs the current head minikube
 // and it tries to upgrade from the older supported k8s to the newest supported k8s
 func TestVersionUpgrade(t *testing.T) {
+	MaybeParallel(t)
+	WaitForStartSlot(t)
 	profile := UniqueProfileName("vupgrade")
 	ctx, cancel := context.WithTimeout(context.Background(), 55*time.Minute)
-	MaybeSlowParallel(t)

 	defer CleanupWithLogs(t, profile, cancel)

@@ -89,6 +90,7 @@ func TestVersionUpgrade(t *testing.T) {
 		t.Errorf("status = %q; want = %q", got, state.Stopped.String())
 	}

+	WaitForStartSlot(t)
 	args = append([]string{"start", "-p", profile, fmt.Sprintf("--kubernetes-version=%s", constants.NewestKubernetesVersion), "--alsologtostderr", "-v=1"}, StartArgs()...)
 	rr, err = Run(t, exec.CommandContext(ctx, Target(), args...))
 	if err != nil {