Unverified commit fe3674f9, authored by Thomas Strömberg, committed by GitHub

Merge pull request #7104 from tstromberg/apiserver-always

Reveal more deployment problems with less delay
@@ -32,7 +32,7 @@ import (
const (
// number of problems per log to output
numberOfProblems = 5
numberOfProblems = 10
)
var (
......
@@ -34,18 +34,28 @@ import (
"k8s.io/apimachinery/pkg/util/wait"
"k8s.io/client-go/kubernetes"
kconst "k8s.io/kubernetes/cmd/kubeadm/app/constants"
"k8s.io/minikube/pkg/minikube/bootstrapper"
"k8s.io/minikube/pkg/minikube/command"
"k8s.io/minikube/pkg/minikube/cruntime"
"k8s.io/minikube/pkg/minikube/logs"
)
// APIServerProcess waits for the apiserver to become healthy, returning an error if it does not
func APIServerProcess(runner command.Runner, start time.Time, timeout time.Duration) error {
// WaitForAPIServerProcess waits for the apiserver to become healthy, returning an error if it does not
func WaitForAPIServerProcess(r cruntime.Manager, bs bootstrapper.Bootstrapper, cr command.Runner, start time.Time, timeout time.Duration) error {
glog.Infof("waiting for apiserver process to appear ...")
minLogTime := kconst.APICallRetryInterval * 10
err := wait.PollImmediate(time.Millisecond*500, timeout, func() (bool, error) {
if time.Since(start) > timeout {
return false, fmt.Errorf("cluster wait timed out during process check")
}
if _, ierr := apiServerPID(runner); ierr != nil {
if time.Since(start) > minLogTime {
announceProblems(r, bs, cr)
time.Sleep(kconst.APICallRetryInterval * 5)
}
if _, ierr := apiServerPID(cr); ierr != nil {
return false, nil
}
return true, nil
@@ -67,14 +77,21 @@ func apiServerPID(cr command.Runner) (int, error) {
return strconv.Atoi(s)
}
// SystemPods verifies that the pods essential to a running kubernetes cluster are running
func SystemPods(client *kubernetes.Clientset, start time.Time, timeout time.Duration) error {
// WaitForSystemPods verifies that the pods essential to a running kubernetes cluster are running
func WaitForSystemPods(r cruntime.Manager, bs bootstrapper.Bootstrapper, cr command.Runner, client *kubernetes.Clientset, start time.Time, timeout time.Duration) error {
glog.Info("waiting for kube-system pods to appear ...")
pStart := time.Now()
minLogTime := kconst.APICallRetryInterval * 10
podList := func() (bool, error) {
if time.Since(start) > timeout {
return false, fmt.Errorf("cluster wait timed out during pod check")
}
if time.Since(start) > minLogTime {
announceProblems(r, bs, cr)
time.Sleep(kconst.APICallRetryInterval * 5)
}
// Wait for any system pod, as waiting for the apiserver may block until etcd is up
pods, err := client.CoreV1().Pods("kube-system").List(meta.ListOptions{})
if err != nil {
@@ -94,15 +111,22 @@ func SystemPods(client *kubernetes.Clientset, start time.Time, timeout time.Duration) error {
return nil
}
// APIServerIsRunning waits until the apiserver reports a running status
func APIServerIsRunning(start time.Time, ip string, port int, timeout time.Duration) error {
// WaitForHealthyAPIServer waits until the apiserver reports a healthy status
func WaitForHealthyAPIServer(r cruntime.Manager, bs bootstrapper.Bootstrapper, cr command.Runner, start time.Time, ip string, port int, timeout time.Duration) error {
glog.Infof("waiting for apiserver healthz status ...")
hStart := time.Now()
minLogTime := kconst.APICallRetryInterval * 10
healthz := func() (bool, error) {
if time.Since(start) > timeout {
return false, fmt.Errorf("cluster wait timed out during healthz check")
}
if time.Since(start) > minLogTime {
announceProblems(r, bs, cr)
time.Sleep(kconst.APICallRetryInterval * 5)
}
status, err := apiServerHealthz(net.ParseIP(ip), port)
if err != nil {
glog.Warningf("status: %v", err)
@@ -121,6 +145,15 @@ func APIServerIsRunning(start time.Time, ip string, port int, timeout time.Duration) error {
return nil
}
// announceProblems checks for problems, and slows polling down if any are found
func announceProblems(r cruntime.Manager, bs bootstrapper.Bootstrapper, cr command.Runner) {
problems := logs.FindProblems(r, bs, cr)
if len(problems) > 0 {
logs.OutputProblems(problems, 5)
time.Sleep(kconst.APICallRetryInterval * 15)
}
}
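
The three wait functions above all share one loop shape: poll the check every 500ms, and once the elapsed time passes minLogTime, surface known problems and back off before polling again. Below is a minimal, self-contained sketch of that pattern; pollWithAnnounce, check, and announce are hypothetical stand-ins for the real health checks and announceProblems, and the 2-second backoff stands in for kconst.APICallRetryInterval * 5.

package main

import (
	"fmt"
	"time"
)

// pollWithAnnounce polls check every 500ms until it succeeds or timeout
// elapses. Once the wait exceeds minLogTime, announce is called and the
// loop slows down so problem output is not flooded.
func pollWithAnnounce(check func() bool, announce func(), timeout, minLogTime time.Duration) error {
	start := time.Now()
	for time.Since(start) <= timeout {
		if time.Since(start) > minLogTime {
			announce()
			time.Sleep(2 * time.Second) // extra backoff while problems are being reported
		}
		if check() {
			return nil
		}
		time.Sleep(500 * time.Millisecond)
	}
	return fmt.Errorf("cluster wait timed out after %s", timeout)
}

func main() {
	ready := time.Now().Add(2 * time.Second)
	err := pollWithAnnounce(
		func() bool { return time.Now().After(ready) }, // "healthy" after 2s
		func() { fmt.Println("still waiting; reporting known problems") },
		10*time.Second,
		time.Second,
	)
	fmt.Println("result:", err)
}
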
// APIServerStatus returns apiserver status in libmachine style state.State
func APIServerStatus(cr command.Runner, ip net.IP, port int) (state.State, error) {
glog.Infof("Checking apiserver status ...")
@@ -175,6 +208,10 @@ func apiServerHealthz(ip net.IP, port int) (state.State, error) {
if err != nil {
return state.Stopped, nil
}
if resp.StatusCode == http.StatusUnauthorized {
glog.Errorf("%s returned code %d (unauthorized). Please ensure that your apiserver authorization settings make sense!", url, resp.StatusCode)
return state.Error, nil
}
if resp.StatusCode != http.StatusOK {
glog.Warningf("%s response: %+v", url, resp)
return state.Error, nil
......
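
Taken together, the healthz logic above maps the probe result onto a libmachine state: a connection error means Stopped, a 401 is flagged loudly because it usually indicates apiserver authorization misconfiguration rather than a transient startup failure, any other non-200 code is Error, and 200 is Running. Here is a hedged, self-contained sketch of that mapping; probeHealthz and the plain string states are stand-ins for apiServerHealthz and state.State, and the insecure TLS transport is an assumption for a local probe, not necessarily what minikube configures.

package main

import (
	"crypto/tls"
	"fmt"
	"net"
	"net/http"
	"time"
)

// probeHealthz maps the /healthz probe result onto a coarse state string:
// connection failure -> Stopped, 401 -> Error (authorization problem),
// any other non-200 -> Error, 200 -> Running.
func probeHealthz(ip net.IP, port int) string {
	url := fmt.Sprintf("https://%s/healthz", net.JoinHostPort(ip.String(), fmt.Sprint(port)))
	client := &http.Client{
		// Assumption: a local probe skips certificate verification.
		Transport: &http.Transport{TLSClientConfig: &tls.Config{InsecureSkipVerify: true}},
		Timeout:   2 * time.Second,
	}
	resp, err := client.Get(url)
	if err != nil {
		return "Stopped"
	}
	defer resp.Body.Close()
	if resp.StatusCode == http.StatusUnauthorized {
		fmt.Printf("%s returned 401; check apiserver authorization settings\n", url)
		return "Error"
	}
	if resp.StatusCode != http.StatusOK {
		return "Error"
	}
	return "Running"
}

func main() {
	fmt.Println(probeHealthz(net.ParseIP("127.0.0.1"), 8443))
}
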
@@ -260,7 +260,12 @@ func (k *Bootstrapper) WaitForCluster(cfg config.ClusterConfig, timeout time.Duration) error {
if err != nil {
return err
}
if err := kverify.APIServerProcess(k.c, start, timeout); err != nil {
cr, err := cruntime.New(cruntime.Config{Type: cfg.KubernetesConfig.ContainerRuntime, Runner: k.c})
if err != nil {
return err
}
if err := kverify.WaitForAPIServerProcess(cr, k, k.c, start, timeout); err != nil {
return err
}
@@ -273,7 +278,8 @@ func (k *Bootstrapper) WaitForCluster(cfg config.ClusterConfig, timeout time.Duration) error {
return errors.Wrapf(err, "get host-bind port %d for container %s", port, cfg.Name)
}
}
if err := kverify.APIServerIsRunning(start, ip, port, timeout); err != nil {
if err := kverify.WaitForHealthyAPIServer(cr, k, k.c, start, ip, port, timeout); err != nil {
return err
}
@@ -282,7 +288,7 @@ func (k *Bootstrapper) WaitForCluster(cfg config.ClusterConfig, timeout time.Duration) error {
return errors.Wrap(err, "get k8s client")
}
if err := kverify.SystemPods(c, start, timeout); err != nil {
if err := kverify.WaitForSystemPods(cr, k, k.c, c, start, timeout); err != nil {
return errors.Wrap(err, "waiting for system pods")
}
return nil
@@ -329,8 +335,13 @@ func (k *Bootstrapper) restartCluster(cfg config.ClusterConfig) error {
}
}
cr, err := cruntime.New(cruntime.Config{Type: cfg.KubernetesConfig.ContainerRuntime, Runner: k.c})
if err != nil {
return err
}
// We must ensure that the apiserver is healthy before proceeding
if err := kverify.APIServerProcess(k.c, time.Now(), kconst.DefaultControlPlaneTimeout); err != nil {
if err := kverify.WaitForAPIServerProcess(cr, k, k.c, time.Now(), kconst.DefaultControlPlaneTimeout); err != nil {
return errors.Wrap(err, "apiserver healthz")
}
@@ -349,7 +360,7 @@ func (k *Bootstrapper) restartCluster(cfg config.ClusterConfig) error {
return errors.Wrap(err, "getting k8s client")
}
if err := kverify.SystemPods(client, time.Now(), kconst.DefaultControlPlaneTimeout); err != nil {
if err := kverify.WaitForSystemPods(cr, k, k.c, client, time.Now(), kconst.DefaultControlPlaneTimeout); err != nil {
return errors.Wrap(err, "system pods")
}
......
@@ -35,8 +35,33 @@ import (
"k8s.io/minikube/pkg/minikube/out"
)
// rootCauseRe is a regular expression that matches known failure root causes
var rootCauseRe = regexp.MustCompile(`^error: |eviction manager: pods.* evicted|unknown flag: --|forbidden.*no providers available|eviction manager:.*evicted|tls: bad certificate|kubelet.*no API client|kubelet.*No api server|STDIN.*127.0.0.1:8080|failed to create listener|address already in use|unable to evict any pods|eviction manager: unexpected error`)
// rootCauses are regular expressions that match known failures
var rootCauses = []string{
`^error: `,
`eviction manager: pods.* evicted`,
`unknown flag: --`,
`forbidden.*no providers available`,
`eviction manager:.*evicted`,
`tls: bad certificate`,
`kubelet.*no API client`,
`kubelet.*No api server`,
`STDIN.*127.0.0.1:8080`,
`failed to create listener`,
`address already in use`,
`unable to evict any pods`,
`eviction manager: unexpected error`,
`Resetting AnonymousAuth to false`,
`CrashLoopBackOff`,
`Unable to register node.*forbidden`,
`Failed to initialize CSINodeInfo.*forbidden`,
`Failed to admit pod`,
`failed to "StartContainer"`,
`kubelet.*forbidden.*cannot \w+ resource`,
`leases.*forbidden.*cannot \w+ resource`,
}
// rootCauseRe combines rootCauses into a single regex
var rootCauseRe = regexp.MustCompile(strings.Join(rootCauses, "|"))
// ignoreCauseRe is a regular expression that matches spurious errors to not surface
var ignoreCauseRe = regexp.MustCompile("error: no objects passed to apply")
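
To see how the combined expression classifies log lines, here is a minimal runnable sketch over a subset of the patterns above; isProblem is a hypothetical helper, so the matching logic in this package may differ in detail.

package main

import (
	"fmt"
	"regexp"
	"strings"
)

// A subset of the root-cause patterns above, joined exactly as rootCauseRe is.
var rootCauses = []string{
	`^error: `,
	`tls: bad certificate`,
	`CrashLoopBackOff`,
	`kubelet.*forbidden.*cannot \w+ resource`,
}

var (
	rootCauseRe   = regexp.MustCompile(strings.Join(rootCauses, "|"))
	ignoreCauseRe = regexp.MustCompile("error: no objects passed to apply")
)

// isProblem (hypothetical helper) reports whether a log line matches a known
// root cause and is not on the ignore list.
func isProblem(line string) bool {
	return rootCauseRe.MatchString(line) && !ignoreCauseRe.MatchString(line)
}

func main() {
	fmt.Println(isProblem("error: no objects passed to apply"))           // false: ignored
	fmt.Println(isProblem("remote error: tls: bad certificate"))          // true
	fmt.Println(isProblem("back-off 10s ... CrashLoopBackOff container")) // true
}

The ignore list wins over a root-cause match, which is exactly what the no-objects-passed-to-apply case in the test table below expects.
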
@@ -44,6 +69,7 @@ var ignoreCauseRe = regexp.MustCompile("error: no objects passed to apply")
// importantPods are a list of pods to retrieve logs for, in addition to the bootstrapper logs.
var importantPods = []string{
"kube-apiserver",
"etcd",
"coredns",
"kube-scheduler",
"kube-proxy",
......
@@ -36,6 +36,19 @@ func TestIsProblem(t *testing.T) {
{"no-objects-passed-to-apply #4010", false, "error: no objects passed to apply"},
{"bad-certificate #4251", true, "log.go:172] http: TLS handshake error from 127.0.0.1:49200: remote error: tls: bad certificate"},
{"ephemeral-eviction #5355", true, " eviction_manager.go:419] eviction manager: unexpected error when attempting to reduce ephemeral-storage pressure: wanted to free 9223372036854775807 bytes, but freed 0 bytes space with errors in image deletion"},
{"anonymous-auth", true, "AnonymousAuth is not allowed with the AlwaysAllow authorizer. Resetting AnonymousAuth to false. You should use a different authorizer"},
{"disk-pressure #7073", true, "eviction_manager.go:159] Failed to admit pod kindnet-jpzzf_kube-system(b63b1ee0-0fc6-428f-8e67-e357464f579c) - node has conditions: [DiskPressure]"},
{"csi timeout", true, `Failed to initialize CSINodeInfo: error updating CSINode annotation: timed out waiting for the condition; caused by: csinodes.storage.k8s.io "m01" is forbidden: User "system:node:m01" cannot get resource "csinodes" in API group "storage.k8s.io" at the cluster scope`},
{"node registration permissions", true, `Unable to register node "m01" with API server: nodes is forbidden: User "system:node:m01" cannot create resource "nodes" in API group "" at the cluster scope`},
{"regular kubelet refused", false, `kubelet_node_status.go:92] Unable to register node "m01" with API server: Post https://localhost:8443/api/v1/nodes: dial tcp 127.0.0.1:8443: connect: connection refused`},
{"regular csi refused", false, `Failed to initialize CSINodeInfo: error updating CSINode annotation: timed out waiting for the condition; caused by: Get https://localhost:8443/apis/storage.k8s.io/v1/csinodes/m01: dial tcp 127.0.0.1:8443: connect: connection refused`},
{"apiserver crashloop", true, `pod_workers.go:191] Error syncing pod 9f8ee739bd14e8733f807eb2be99768f ("kube-apiserver-m01_kube-system(9f8ee739bd14e8733f807eb2be99768f)"), skipping: failed to "StartContainer" for "kube-apiserver" with CrashLoopBackOff: "back-off 10s restarting failed container=kube-apiserver pod=kube-apiserver-m01_kube-system(9f8ee739bd14e8733f807eb2be99768f)`},
{"kubelet node timeout", false, `failed to ensure node lease exists, will retry in 6.4s, error: Get https://localhost:8443/apis/coordination.k8s.io/v1/namespaces/kube-node-lease/leases/m01?timeout=10s: dial tcp 127.0.0.1:8443: connect: connection refused`},
{"rbac misconfiguration", true, `leases.coordination.k8s.io "m01" is forbidden: User "system:node:m01" cannot get resource "leases" in API group "coordination.k8s.io" in the namespace "kube-node-lease"`},
{"regular controller init", false, `error retrieving resource lock kube-system/kube-controller-manager: endpoints "kube-controller-manager" is forbidden: User "system:kube-controller-manager" cannot get resource "endpoints" in API group "" in the namespace "kube-system"`},
{"regular scheduler services init", false, ` k8s.io/client-go/informers/factory.go:135: Failed to list *v1.Service: services is forbidden: User "system:kube-scheduler" cannot list resource "services" in API group "" at the cluster scope`},
{"regular scheduler nodes init", false, `k8s.io/client-go/informers/factory.go:135: Failed to list *v1.Node: nodes is forbidden: User "system:kube-scheduler" cannot list resource "nodes" in API group "" at the cluster scope`},
{"kubelet rbac fail", true, `k8s.io/kubernetes/pkg/kubelet/kubelet.go:526: Failed to list *v1.Node: nodes "m01" is forbidden: User "system:node:m01" cannot list resource "nodes" in API group "" at the cluster scope`},
}
for _, tc := range tests {
t.Run(tc.name, func(t *testing.T) {
......