提交 6cbd794c 编写于 作者: L liqingping

feat: ignore pod and service AlreadyExsists error

上级 38671351
......@@ -54,10 +54,8 @@ func (r *DIJobReconciler) reconcileReplicas(ctx context.Context, job *div1alpha1
envs[dicommon.CoordinatorURLEnv] = coorurl
diutil.AddEnvsToPod(coorpod, envs)
if coordinator == nil {
if err := r.createPodAndService(ctx, job, coorpod, coorsvc); err != nil {
return err
}
if err := r.createPodAndService(ctx, job, coorpod, coorsvc); err != nil {
return err
}
// update job status
......
......@@ -45,7 +45,7 @@ func (r *DIJobReconciler) reconcilePodsAndServices(ctx context.Context, job *div
gpus := int(rs.GPU.Value())
diutil.AddPortsToService(svc, gpus, port)
}
if err := r.createService(ctx, job, svc); err != nil {
if err := r.createService(ctx, job, svc); err != nil && !errors.IsAlreadyExists(err) {
return err
}
}
......
......@@ -150,15 +150,6 @@ func (s *DIServer) createReplicas(
replicaType string,
agtemplate *corev1.PodTemplateSpec) ([]string, error) {
var defaultPort int32
switch replicaType {
case dicommon.CollectorName:
defaultPort = dicommon.DefaultCollectorPort
case dicommon.LearnerName:
defaultPort = dicommon.DefaultLearnerPort
default:
}
results := []string{}
// create pods and services
for i := 0; i < resources.Replicas; i++ {
......@@ -209,7 +200,7 @@ func (s *DIServer) createReplicas(
// build ddp learner pod
pod, svc, _, err = buildDDPLearnerPodAndService(template, ownRefer, aggOwnRefer,
jobName, namespace, replicaType, defaultPort, *replicaResource, volumes)
jobName, namespace, replicaType, *replicaResource, volumes)
if err != nil {
return results, err
}
......@@ -275,7 +266,7 @@ func (s *DIServer) createReplicas(
// build ddp learner pod
pod, svc, _, err = buildDDPLearnerPodAndService(template, ownRefer, aggOwnRefer,
jobName, namespace, replicaType, defaultPort, resources, volumes)
jobName, namespace, replicaType, resources, volumes)
if err != nil {
return results, err
}
......@@ -327,7 +318,7 @@ func buildDDPLearnerPodAndService(template *corev1.PodTemplateSpec,
ownRefer metav1.OwnerReference,
aggOwnRefer metav1.OwnerReference,
jobName, namespace, replicaType string,
defaultPort int32, resources commontypes.ResourceQuantity, volumes []corev1.Volume) (*corev1.Pod, *corev1.Service, int32, error) {
resources commontypes.ResourceQuantity, volumes []corev1.Volume) (*corev1.Pod, *corev1.Service, int32, error) {
pod, svc, port, err := diutil.BuildPodAndService(template.DeepCopy(), ownRefer, jobName,
namespace, dicommon.DDPLearnerName, volumes)
if err != nil {
......
......@@ -7,6 +7,7 @@ import (
mapset "github.com/deckarep/golang-set"
corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/errors"
k8serrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/apis/meta/v1/unstructured"
......@@ -125,7 +126,7 @@ func (s *DIServer) getServiceByKey(key string) (*corev1.Service, error) {
func (s *DIServer) createPodAndService(namespace string, pod *corev1.Pod, service *corev1.Service) (*corev1.Pod, error) {
// create pod
newpod, err := s.createPod(namespace, pod)
if err != nil {
if err != nil && !errors.IsAlreadyExists(err) {
return nil, err
}
......@@ -143,7 +144,7 @@ func (s *DIServer) createPodAndService(namespace string, pod *corev1.Pod, servic
service.OwnerReferences = append(service.OwnerReferences, ownRefer)
// create service
if err := s.createService(namespace, service); err != nil {
if err := s.createService(namespace, service); err != nil && !errors.IsAlreadyExists(err) {
return newpod, err
}
return newpod, nil
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册