Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
唯有杜康TM
ms-operator
提交
5922cdda
M
ms-operator
项目概览
唯有杜康TM
/
ms-operator
与 Fork 源项目一致
Fork自
MindSpore / ms-operator
通知
1
Star
0
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
M
ms-operator
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
5922cdda
编写于
4月 27, 2020
作者:
Y
YedongLiu
提交者:
Gitee
4月 27, 2020
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
delete unrelated comment and fix variable names
上级
e0a3f13c
变更
8
显示空白变更内容
内联
并排
Showing
8 changed file
with
80 addition
and
121 deletion
+80
-121
cmd/ms-operator.v1/app/options/options.go
cmd/ms-operator.v1/app/options/options.go
+1
-1
pkg/apis/mindspore/helper/helper.go
pkg/apis/mindspore/helper/helper.go
+1
-2
pkg/apis/mindspore/v1/types.go
pkg/apis/mindspore/v1/types.go
+1
-4
pkg/apis/mindspore/validation/validation.go
pkg/apis/mindspore/validation/validation.go
+1
-1
pkg/controller/controller.go
pkg/controller/controller.go
+14
-17
pkg/trainer/replicas.go
pkg/trainer/replicas.go
+61
-67
pkg/trainer/training.go
pkg/trainer/training.go
+1
-23
pkg/util/k8sutil/k8sutil.go
pkg/util/k8sutil/k8sutil.go
+0
-6
未找到文件。
cmd/ms-operator.v1/app/options/options.go
浏览文件 @
5922cdda
...
...
@@ -37,7 +37,7 @@ func NewServerOption() *ServerOption {
// AddFlags adds flags for a specific CMServer to the specified FlagSet
func
(
s
*
ServerOption
)
AddFlags
(
fs
*
flag
.
FlagSet
)
{
// chaos level will be removed once we have a formal tool to inject failures.
fs
.
IntVar
(
&
s
.
ChaosLevel
,
"chaos-level"
,
-
1
,
"DO NOT USE IN PRODUCTION - level of chaos injected into the
PyTorch
Job created by the operator."
)
fs
.
IntVar
(
&
s
.
ChaosLevel
,
"chaos-level"
,
-
1
,
"DO NOT USE IN PRODUCTION - level of chaos injected into the
MS
Job created by the operator."
)
fs
.
BoolVar
(
&
s
.
PrintVersion
,
"version"
,
false
,
"Show version and quit"
)
fs
.
DurationVar
(
&
s
.
GCInterval
,
"gc-interval"
,
10
*
time
.
Minute
,
"GC interval"
)
fs
.
StringVar
(
&
s
.
ControllerConfigFile
,
"controller-config-file"
,
""
,
"Path to file containing the controller config."
)
...
...
pkg/apis/mindspore/helper/helper.go
浏览文件 @
5922cdda
...
...
@@ -106,8 +106,7 @@ func ConfigureAcceleratorsForMSJobSpec(c *msv1.MSJobSpec, accelerators map[strin
// Cleanup cleans up user passed spec, e.g. defaulting, transforming fields.
// TODO: move this to admission controller
func
Cleanup
(
c
*
msv1
.
MSJobSpec
)
{
// TODO(jlewi): Add logic to cleanup user provided spec; e.g. by filling in defaults.
// We should have default container images so user doesn't have to provide these.
}
func
CRDName
()
string
{
...
...
pkg/apis/mindspore/v1/types.go
浏览文件 @
5922cdda
...
...
@@ -45,7 +45,6 @@ type MSJob struct {
}
type
MSJobSpec
struct
{
// TODO(jlewi): Can we we get rid of this and use some value from Kubernetes or a random ide.
RuntimeId
string
// ReplicaSpecs specifies the MS replicas to run.
...
...
@@ -81,11 +80,9 @@ const (
const
(
DefaultMSContainer
string
=
"mindspore"
DefaultMSImage
string
=
"mindspore/mindspore:v0.1.0"
DefaultMSImage
string
=
"mindspore/mindspore:v0.1.0
-alpha
"
)
// TODO(jlewi): We probably want to add a name field. This would allow us to have more than 1 type of each worker.
// This might be useful if you wanted to have a separate set of workers to do eval.
type
MSReplicaSpec
struct
{
// Replicas is the number of desired replicas.
// This is a pointer to distinguish between explicit zero and unspecified.
...
...
pkg/apis/mindspore/validation/validation.go
浏览文件 @
5922cdda
...
...
@@ -57,7 +57,7 @@ func ValidateMSJobSpec(c *msv1.MSJobSpec) error {
}
if
!
isValidReplicaType
{
return
fmt
.
Errorf
(
"
tf
ReplicaSpec.MSReplicaType is %v but must be one of %v"
,
r
.
MSReplicaType
,
validReplicaTypes
)
return
fmt
.
Errorf
(
"
ms
ReplicaSpec.MSReplicaType is %v but must be one of %v"
,
r
.
MSReplicaType
,
validReplicaTypes
)
}
for
_
,
c
:=
range
r
.
Template
.
Spec
.
Containers
{
...
...
pkg/controller/controller.go
浏览文件 @
5922cdda
...
...
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
// Package controller provides a Kubernetes controller for a
TensorFlow
job resource.
// Package controller provides a Kubernetes controller for a
MindSpore
job resource.
package
controller
import
(
...
...
@@ -85,9 +85,9 @@ type Controller struct {
syncHandler
func
(
jobKey
string
)
(
bool
,
error
)
}
func
New
(
kubeClient
kubernetes
.
Interface
,
APIExtclient
apiextensionsclient
.
Interface
,
tf
JobClient
msjobclient
.
Interface
,
config
msv1
.
ControllerConfig
,
tf
JobInformerFactory
informers
.
SharedInformerFactory
)
(
*
Controller
,
error
)
{
tfJobInformer
:=
tf
JobInformerFactory
.
Kubeflow
()
.
V1
()
.
MSJobs
()
func
New
(
kubeClient
kubernetes
.
Interface
,
APIExtclient
apiextensionsclient
.
Interface
,
ms
JobClient
msjobclient
.
Interface
,
config
msv1
.
ControllerConfig
,
ms
JobInformerFactory
informers
.
SharedInformerFactory
)
(
*
Controller
,
error
)
{
msJobInformer
:=
ms
JobInformerFactory
.
Kubeflow
()
.
V1
()
.
MSJobs
()
kubeflowscheme
.
AddToScheme
(
scheme
.
Scheme
)
log
.
Debug
(
"Creating event broadcaster"
)
...
...
@@ -99,22 +99,21 @@ func New(kubeClient kubernetes.Interface, APIExtclient apiextensionsclient.Inter
controller
:=
&
Controller
{
KubeClient
:
kubeClient
,
APIExtclient
:
APIExtclient
,
MSJobClient
:
tf
JobClient
,
MSJobClient
:
ms
JobClient
,
WorkQueue
:
workqueue
.
NewNamedRateLimitingQueue
(
workqueue
.
DefaultControllerRateLimiter
(),
"MSjobs"
),
recorder
:
recorder
,
// TODO(jlewi)): What to do about cluster.Cluster?
jobs
:
make
(
map
[
string
]
*
trainer
.
TrainingJob
),
config
:
config
,
}
log
.
Info
(
"Setting up event handlers"
)
// Set up an event handler for when Foo resources change
tf
JobInformer
.
Informer
()
.
AddEventHandler
(
ms
JobInformer
.
Informer
()
.
AddEventHandler
(
cache
.
FilteringResourceEventHandler
{
FilterFunc
:
func
(
obj
interface
{})
bool
{
switch
t
:=
obj
.
(
type
)
{
case
*
msv1
.
MSJob
:
log
.
Debugf
(
"filter
tf
job name: %v"
,
t
.
Name
)
log
.
Debugf
(
"filter
ms
job name: %v"
,
t
.
Name
)
return
true
default
:
return
false
...
...
@@ -129,8 +128,8 @@ func New(kubeClient kubernetes.Interface, APIExtclient apiextensionsclient.Inter
},
})
controller
.
MSJobLister
=
tf
JobInformer
.
Lister
()
controller
.
MSJobSynced
=
tf
JobInformer
.
Informer
()
.
HasSynced
controller
.
MSJobLister
=
ms
JobInformer
.
Lister
()
controller
.
MSJobSynced
=
ms
JobInformer
.
Informer
()
.
HasSynced
controller
.
syncHandler
=
controller
.
syncMSJob
return
controller
,
nil
...
...
@@ -216,7 +215,7 @@ func (c *Controller) syncMSJob(key string) (bool, error) {
return
false
,
fmt
.
Errorf
(
"invalid job key %q: either namespace or name is missing"
,
key
)
}
tf
Job
,
err
:=
c
.
MSJobLister
.
MSJobs
(
ns
)
.
Get
(
name
)
ms
Job
,
err
:=
c
.
MSJobLister
.
MSJobs
(
ns
)
.
Get
(
name
)
if
err
!=
nil
{
if
apierrors
.
IsNotFound
(
err
)
{
...
...
@@ -228,8 +227,8 @@ func (c *Controller) syncMSJob(key string) (bool, error) {
// Create a new TrainingJob if there is no TrainingJob stored for it in the jobs map or if the UID's don't match.
// The UID's won't match in the event we deleted the job and then recreated the job with the same name.
if
cJob
,
ok
:=
c
.
jobs
[
key
];
!
ok
||
cJob
.
UID
()
!=
tf
Job
.
UID
{
nc
,
err
:=
trainer
.
NewJob
(
c
.
KubeClient
,
c
.
MSJobClient
,
c
.
recorder
,
tf
Job
,
&
c
.
config
)
if
cJob
,
ok
:=
c
.
jobs
[
key
];
!
ok
||
cJob
.
UID
()
!=
ms
Job
.
UID
{
nc
,
err
:=
trainer
.
NewJob
(
c
.
KubeClient
,
c
.
MSJobClient
,
c
.
recorder
,
ms
Job
,
&
c
.
config
)
if
err
!=
nil
{
return
false
,
err
...
...
@@ -243,15 +242,13 @@ func (c *Controller) syncMSJob(key string) (bool, error) {
return
false
,
err
}
tfJob
,
err
=
c
.
MSJobClient
.
KubeflowV1
()
.
MSJobs
(
tfJob
.
ObjectMeta
.
Namespace
)
.
Get
(
tf
Job
.
ObjectMeta
.
Name
,
metav1
.
GetOptions
{})
msJob
,
err
=
c
.
MSJobClient
.
KubeflowV1
()
.
MSJobs
(
msJob
.
ObjectMeta
.
Namespace
)
.
Get
(
ms
Job
.
ObjectMeta
.
Name
,
metav1
.
GetOptions
{})
if
err
!=
nil
{
return
false
,
err
}
// TODO(jlewi): This logic will need to change when/if we get rid of phases and move to conditions. At that
// case we should forget about a job when the appropriate condition is reached.
if
tfJob
.
Status
.
Phase
==
msv1
.
MSJobPhaseCleanUp
{
if
msJob
.
Status
.
Phase
==
msv1
.
MSJobPhaseCleanUp
{
return
true
,
nil
}
return
false
,
nil
...
...
pkg/trainer/replicas.go
浏览文件 @
5922cdda
...
...
@@ -29,9 +29,8 @@ import (
"k8s.io/client-go/kubernetes"
"k8s.io/client-go/tools/record"
torchv1alpha
1
"gitee.com/mindspore/ms-operator/pkg/apis/mindspore/v1"
msv
1
"gitee.com/mindspore/ms-operator/pkg/apis/mindspore/v1"
"gitee.com/mindspore/ms-operator/pkg/util/k8sutil"
// TOOO(jlewi): Rename to apiErrors
"gitee.com/mindspore/ms-operator/pkg/apis/mindspore/helper"
"gitee.com/mindspore/ms-operator/pkg/util"
)
...
...
@@ -47,53 +46,53 @@ type MSReplicaSet struct {
recorder
record
.
EventRecorder
// Job is a pointer to the TrainingJob to which this replica belongs.
Job
*
TrainingJob
Spec
torchv1alpha
1
.
MSReplicaSpec
Spec
msv
1
.
MSReplicaSpec
}
// MSReplicas is an interface for managing a set of replicas.
type
MSReplicaSetInterface
interface
{
Create
()
error
Delete
()
error
GetStatus
()
(
torchv1alpha
1
.
MSReplicaStatus
,
error
)
GetStatus
()
(
msv
1
.
MSReplicaStatus
,
error
)
}
// MSConfig is a struct representing the
TensorFlow
config. This struct is turned into an environment
// which is used by
TensorFlow
processes to configure themselves.
// MSConfig is a struct representing the
MindSpore
config. This struct is turned into an environment
// which is used by
MindSpore
processes to configure themselves.
type
MSConfig
struct
{
Cluster
ClusterSpec
`json:"cluster"`
Task
TaskSpec
`json:"task"`
Environment
string
`json:"environment"`
}
func
NewMSReplicaSet
(
clientSet
kubernetes
.
Interface
,
recorder
record
.
EventRecorder
,
tfReplicaSpec
torchv1alpha
1
.
MSReplicaSpec
,
job
*
TrainingJob
)
(
*
MSReplicaSet
,
error
)
{
if
tfReplicaSpec
.
MSReplicaType
==
torchv1alpha1
.
MASTER
&&
*
tf
ReplicaSpec
.
Replicas
!=
1
{
func
NewMSReplicaSet
(
clientSet
kubernetes
.
Interface
,
recorder
record
.
EventRecorder
,
msReplicaSpec
msv
1
.
MSReplicaSpec
,
job
*
TrainingJob
)
(
*
MSReplicaSet
,
error
)
{
if
msReplicaSpec
.
MSReplicaType
==
msv1
.
MASTER
&&
*
ms
ReplicaSpec
.
Replicas
!=
1
{
return
nil
,
errors
.
New
(
"The MASTER must have Replicas = 1"
)
}
if
tf
ReplicaSpec
.
MasterPort
==
nil
{
return
nil
,
errors
.
New
(
"
tf
ReplicaSpec.MasterPort can't be nil."
)
if
ms
ReplicaSpec
.
MasterPort
==
nil
{
return
nil
,
errors
.
New
(
"
ms
ReplicaSpec.MasterPort can't be nil."
)
}
// Make sure the replica type is valid.
validReplicaTypes
:=
[]
torchv1alpha1
.
MSReplicaType
{
torchv1alpha1
.
MASTER
,
torchv1alpha
1
.
WORKER
}
validReplicaTypes
:=
[]
msv1
.
MSReplicaType
{
msv1
.
MASTER
,
msv
1
.
WORKER
}
isValidReplicaType
:=
false
for
_
,
t
:=
range
validReplicaTypes
{
if
t
==
tf
ReplicaSpec
.
MSReplicaType
{
if
t
==
ms
ReplicaSpec
.
MSReplicaType
{
isValidReplicaType
=
true
break
}
}
if
!
isValidReplicaType
{
return
nil
,
fmt
.
Errorf
(
"
tfReplicaSpec.MSReplicaType is %v but must be one of %v"
,
tf
ReplicaSpec
.
MSReplicaType
,
validReplicaTypes
)
return
nil
,
fmt
.
Errorf
(
"
msReplicaSpec.MSReplicaType is %v but must be one of %v"
,
ms
ReplicaSpec
.
MSReplicaType
,
validReplicaTypes
)
}
return
&
MSReplicaSet
{
ClientSet
:
clientSet
,
recorder
:
recorder
,
Job
:
job
,
Spec
:
tf
ReplicaSpec
,
Spec
:
ms
ReplicaSpec
,
},
nil
}
...
...
@@ -108,7 +107,7 @@ func (s *MSReplicaSet) Labels() KubernetesLabels {
"ms_job_name"
:
s
.
Job
.
job
.
ObjectMeta
.
Name
})
}
func
(
s
*
MSReplicaSet
)
Create
(
config
*
torchv1alpha
1
.
ControllerConfig
,
worldSize
int32
)
error
{
func
(
s
*
MSReplicaSet
)
Create
(
config
*
msv
1
.
ControllerConfig
,
worldSize
int32
)
error
{
// Create services
err
:=
s
.
SyncServices
()
if
err
!=
nil
{
...
...
@@ -137,7 +136,7 @@ func (s *MSReplicaSet) CreateServiceWithIndex(index int32) (*v1.Service, error)
Selector
:
taskLabels
,
Ports
:
[]
v1
.
ServicePort
{
{
Name
:
"
tf
-port"
,
Name
:
"
ms
-port"
,
Port
:
*
s
.
Spec
.
MasterPort
,
},
},
...
...
@@ -173,7 +172,7 @@ func (s *MSReplicaSet) CreatePodWithIndex(index int32, worldSize int32) (*v1.Pod
masterAddr
=
"localhost"
}
rank
:=
strconv
.
Itoa
(
int
(
index
))
tf
Config
:=
MSConfig
{
ms
Config
:=
MSConfig
{
Cluster
:
s
.
Job
.
ClusterSpec
(),
Task
:
TaskSpec
{
Type
:
strings
.
ToLower
(
string
(
s
.
Spec
.
MSReplicaType
)),
...
...
@@ -183,27 +182,26 @@ func (s *MSReplicaSet) CreatePodWithIndex(index int32, worldSize int32) (*v1.Pod
Environment
:
"cloud"
,
}
tfConfigJson
,
err
:=
json
.
Marshal
(
tf
Config
)
msConfigJson
,
err
:=
json
.
Marshal
(
ms
Config
)
if
err
!=
nil
{
log
.
Errorf
(
"Job: %v serializing
tfConfig: %v return error; %v"
,
s
.
Job
.
job
.
ObjectMeta
.
Name
,
util
.
Pformat
(
tf
Config
),
err
)
log
.
Errorf
(
"Job: %v serializing
msConfig: %v return error; %v"
,
s
.
Job
.
job
.
ObjectMeta
.
Name
,
util
.
Pformat
(
ms
Config
),
err
)
return
nil
,
err
}
// TODO(jose5918) Do not need TF_CONFIG but leaving for POC
// Add TF_CONFIG environment variable.
// Add MS_CONFIG environment variable.
for
i
,
_
:=
range
pod
.
Spec
.
Containers
{
// We can't get c in the loop variable because that would be by value so our modifications
// wouldn't have any effect.
c
:=
&
pod
.
Spec
.
Containers
[
i
]
if
c
.
Name
!=
torchv1alpha
1
.
DefaultMSContainer
{
if
c
.
Name
!=
msv
1
.
DefaultMSContainer
{
continue
}
if
len
(
c
.
Env
)
==
0
{
c
.
Env
=
make
([]
v1
.
EnvVar
,
0
)
}
c
.
Env
=
append
(
c
.
Env
,
v1
.
EnvVar
{
Name
:
"
TF
_CONFIG"
,
Value
:
string
(
tf
ConfigJson
),
Name
:
"
MS
_CONFIG"
,
Value
:
string
(
ms
ConfigJson
),
})
c
.
Env
=
append
(
c
.
Env
,
v1
.
EnvVar
{
Name
:
"MASTER_PORT"
,
...
...
@@ -258,7 +256,6 @@ func (s *MSReplicaSet) Delete() error {
}
// Services doesn't support DeleteCollection so we delete them individually.
// TODO(jlewi): We should check if this has changed with K8s 1.8 or other releases.
for
index
:=
int32
(
0
);
index
<
*
s
.
Spec
.
Replicas
;
index
++
{
log
.
V
(
1
)
.
Infof
(
"Deleting Service %v:%v"
,
s
.
Job
.
job
.
ObjectMeta
.
Namespace
,
s
.
genName
((
index
)))
err
=
s
.
ClientSet
.
CoreV1
()
.
Services
(
s
.
Job
.
job
.
ObjectMeta
.
Namespace
)
.
Delete
(
s
.
genName
(
index
),
&
meta_v1
.
DeleteOptions
{})
...
...
@@ -269,17 +266,17 @@ func (s *MSReplicaSet) Delete() error {
}
}
// If the ConfigMap for the default
parameter serv
er exists, we delete it
log
.
Infof
(
"Get ConfigMaps %v:%v"
,
s
.
Job
.
job
.
ObjectMeta
.
Namespace
,
s
.
default
PS
ConfigMapName
())
_
,
err
=
s
.
ClientSet
.
CoreV1
()
.
ConfigMaps
(
s
.
Job
.
job
.
ObjectMeta
.
Namespace
)
.
Get
(
s
.
default
PS
ConfigMapName
(),
meta_v1
.
GetOptions
{})
// If the ConfigMap for the default
mast
er exists, we delete it
log
.
Infof
(
"Get ConfigMaps %v:%v"
,
s
.
Job
.
job
.
ObjectMeta
.
Namespace
,
s
.
default
Master
ConfigMapName
())
_
,
err
=
s
.
ClientSet
.
CoreV1
()
.
ConfigMaps
(
s
.
Job
.
job
.
ObjectMeta
.
Namespace
)
.
Get
(
s
.
default
Master
ConfigMapName
(),
meta_v1
.
GetOptions
{})
if
err
!=
nil
{
if
!
k8sutil
.
IsKubernetesResourceNotFoundError
(
err
)
{
log
.
Errorf
(
"Error deleting ConfigMap %v; %v"
,
s
.
default
PS
ConfigMapName
(),
err
)
log
.
Errorf
(
"Error deleting ConfigMap %v; %v"
,
s
.
default
Master
ConfigMapName
(),
err
)
failures
=
true
}
}
else
{
log
.
Infof
(
"Delete ConfigMaps %v:%v"
,
s
.
Job
.
job
.
ObjectMeta
.
Namespace
,
s
.
default
PS
ConfigMapName
())
err
=
s
.
ClientSet
.
CoreV1
()
.
ConfigMaps
(
s
.
Job
.
job
.
ObjectMeta
.
Namespace
)
.
Delete
(
s
.
default
PS
ConfigMapName
(),
&
meta_v1
.
DeleteOptions
{})
log
.
Infof
(
"Delete ConfigMaps %v:%v"
,
s
.
Job
.
job
.
ObjectMeta
.
Namespace
,
s
.
default
Master
ConfigMapName
())
err
=
s
.
ClientSet
.
CoreV1
()
.
ConfigMaps
(
s
.
Job
.
job
.
ObjectMeta
.
Namespace
)
.
Delete
(
s
.
default
Master
ConfigMapName
(),
&
meta_v1
.
DeleteOptions
{})
if
err
!=
nil
{
log
.
Errorf
(
"There was a problem deleting the ConfigMaps; %v"
,
err
)
failures
=
true
...
...
@@ -293,7 +290,7 @@ func (s *MSReplicaSet) Delete() error {
}
// replicaStatusFromPodList returns a status from a list of pods for a job.
func
replicaStatusFromPodList
(
l
v1
.
PodList
,
name
string
)
torchv1alpha
1
.
ReplicaState
{
func
replicaStatusFromPodList
(
l
v1
.
PodList
,
name
string
)
msv
1
.
ReplicaState
{
var
latest
*
v1
.
Pod
for
_
,
i
:=
range
l
.
Items
{
if
latest
==
nil
{
...
...
@@ -306,10 +303,10 @@ func replicaStatusFromPodList(l v1.PodList, name string) torchv1alpha1.ReplicaSt
}
if
latest
==
nil
{
return
torchv1alpha
1
.
ReplicaStateRunning
return
msv
1
.
ReplicaStateRunning
}
var
tf
State
v1
.
ContainerState
var
ms
State
v1
.
ContainerState
for
_
,
i
:=
range
latest
.
Status
.
ContainerStatuses
{
if
i
.
Name
!=
name
{
...
...
@@ -317,45 +314,45 @@ func replicaStatusFromPodList(l v1.PodList, name string) torchv1alpha1.ReplicaSt
}
// We need to decide whether to use the current state or the previous termination state.
tf
State
=
i
.
State
ms
State
=
i
.
State
// If the container previously terminated we will look at the termination to decide whether it is a retryable
// or permanenent error.
if
i
.
LastTerminationState
.
Terminated
!=
nil
{
tf
State
=
i
.
LastTerminationState
ms
State
=
i
.
LastTerminationState
}
}
if
tfState
.
Running
!=
nil
||
tf
State
.
Waiting
!=
nil
{
return
torchv1alpha
1
.
ReplicaStateRunning
if
msState
.
Running
!=
nil
||
ms
State
.
Waiting
!=
nil
{
return
msv
1
.
ReplicaStateRunning
}
if
tf
State
.
Terminated
!=
nil
{
if
tf
State
.
Terminated
.
ExitCode
==
0
{
return
torchv1alpha
1
.
ReplicaStateSucceeded
if
ms
State
.
Terminated
!=
nil
{
if
ms
State
.
Terminated
.
ExitCode
==
0
{
return
msv
1
.
ReplicaStateSucceeded
}
if
isRetryableTerminationState
(
tf
State
.
Terminated
)
{
if
isRetryableTerminationState
(
ms
State
.
Terminated
)
{
// Since its a retryable error just return RUNNING.
// We can just let Kubernetes restart the container to retry.
return
torchv1alpha
1
.
ReplicaStateRunning
return
msv
1
.
ReplicaStateRunning
}
return
torchv1alpha
1
.
ReplicaStateFailed
return
msv
1
.
ReplicaStateFailed
}
return
torchv1alpha
1
.
ReplicaStateUnknown
return
msv
1
.
ReplicaStateUnknown
}
func
(
s
*
MSReplicaSet
)
GetSingleReplicaStatus
(
index
int32
)
torchv1alpha
1
.
ReplicaState
{
func
(
s
*
MSReplicaSet
)
GetSingleReplicaStatus
(
index
int32
)
msv
1
.
ReplicaState
{
p
,
err
:=
s
.
ClientSet
.
CoreV1
()
.
Pods
(
s
.
Job
.
job
.
ObjectMeta
.
Namespace
)
.
Get
(
s
.
genName
(
index
),
meta_v1
.
GetOptions
{})
if
err
!=
nil
{
return
torchv1alpha
1
.
ReplicaStateUnknown
return
msv
1
.
ReplicaStateUnknown
}
if
v1
.
PodSucceeded
==
p
.
Status
.
Phase
{
return
torchv1alpha
1
.
ReplicaStateSucceeded
return
msv
1
.
ReplicaStateSucceeded
}
labels
:=
s
.
Labels
()
...
...
@@ -363,33 +360,30 @@ func (s *MSReplicaSet) GetSingleReplicaStatus(index int32) torchv1alpha1.Replica
selector
,
err
:=
labels
.
ToSelector
()
if
err
!=
nil
{
log
.
Errorf
(
"labels.ToSelector() error; %v"
,
err
)
return
torchv1alpha
1
.
ReplicaStateFailed
return
msv
1
.
ReplicaStateFailed
}
// TODO(jlewi): Handle errors. We need to get the pod and looking at recent container exits.
l
,
err
:=
s
.
ClientSet
.
CoreV1
()
.
Pods
(
s
.
Job
.
job
.
ObjectMeta
.
Namespace
)
.
List
(
meta_v1
.
ListOptions
{
// TODO(jlewi): Why isn't the label selector working?
LabelSelector
:
selector
,
})
if
err
!=
nil
{
// TODO(jlewi): Are there errors that should be treated as retryable errors?
return
torchv1alpha1
.
ReplicaStateFailed
return
msv1
.
ReplicaStateFailed
}
status
:=
replicaStatusFromPodList
(
*
l
,
torchv1alpha
1
.
DefaultMSContainer
)
status
:=
replicaStatusFromPodList
(
*
l
,
msv
1
.
DefaultMSContainer
)
return
status
}
// Status returns the status of the replica set.
func
(
s
*
MSReplicaSet
)
GetStatus
()
(
torchv1alpha
1
.
MSReplicaStatus
,
error
)
{
status
:=
torchv1alpha
1
.
MSReplicaStatus
{
func
(
s
*
MSReplicaSet
)
GetStatus
()
(
msv
1
.
MSReplicaStatus
,
error
)
{
status
:=
msv
1
.
MSReplicaStatus
{
MSReplicaType
:
s
.
Spec
.
MSReplicaType
,
State
:
torchv1alpha
1
.
ReplicaStateUnknown
,
ReplicasStates
:
make
(
map
[
torchv1alpha
1
.
ReplicaState
]
int
),
State
:
msv
1
.
ReplicaStateUnknown
,
ReplicasStates
:
make
(
map
[
msv
1
.
ReplicaState
]
int
),
}
increment
:=
func
(
state
torchv1alpha
1
.
ReplicaState
)
{
increment
:=
func
(
state
msv
1
.
ReplicaState
)
{
v
,
ok
:=
status
.
ReplicasStates
[
state
]
if
ok
{
status
.
ReplicasStates
[
state
]
=
v
+
1
...
...
@@ -405,20 +399,20 @@ func (s *MSReplicaSet) GetStatus() (torchv1alpha1.MSReplicaStatus, error) {
// Determine the overall status for the replica set based on the status of the individual
// replicas.
// If any of the replicas failed mark the set as failed.
if
_
,
ok
:=
status
.
ReplicasStates
[
torchv1alpha
1
.
ReplicaStateFailed
];
ok
{
status
.
State
=
torchv1alpha
1
.
ReplicaStateFailed
if
_
,
ok
:=
status
.
ReplicasStates
[
msv
1
.
ReplicaStateFailed
];
ok
{
status
.
State
=
msv
1
.
ReplicaStateFailed
return
status
,
nil
}
// If any replicas are RUNNING mark it as RUNNING.
if
_
,
ok
:=
status
.
ReplicasStates
[
torchv1alpha
1
.
ReplicaStateRunning
];
ok
{
status
.
State
=
torchv1alpha
1
.
ReplicaStateRunning
if
_
,
ok
:=
status
.
ReplicasStates
[
msv
1
.
ReplicaStateRunning
];
ok
{
status
.
State
=
msv
1
.
ReplicaStateRunning
return
status
,
nil
}
// If all of the replicas succeeded consider it success.
if
v
,
ok
:=
status
.
ReplicasStates
[
torchv1alpha
1
.
ReplicaStateSucceeded
];
ok
&&
int32
(
v
)
==
*
s
.
Spec
.
Replicas
{
status
.
State
=
torchv1alpha
1
.
ReplicaStateSucceeded
if
v
,
ok
:=
status
.
ReplicasStates
[
msv
1
.
ReplicaStateSucceeded
];
ok
&&
int32
(
v
)
==
*
s
.
Spec
.
Replicas
{
status
.
State
=
msv
1
.
ReplicaStateSucceeded
return
status
,
nil
}
...
...
@@ -515,7 +509,7 @@ func (s *MSReplicaSet) SyncServices() error {
}
func
(
s
*
MSReplicaSet
)
genName
(
index
int32
)
string
{
// Truncate
tf
job name to 40 characters
// Truncate
ms
job name to 40 characters
// The whole job name should be compliant with the DNS_LABEL spec, up to a max length of 63 characters
// Thus genName(40 chars)-replicaType(6 chars)-runtimeId(4 chars)-index(4 chars), also leaving some spaces
// See https://github.com/kubernetes/community/blob/master/contributors/design-proposals/architecture/identifiers.md
...
...
@@ -527,7 +521,7 @@ func (s *MSReplicaSet) genPodName(index int32) string {
return
s
.
genName
(
index
)
+
"-"
+
util
.
RandString
(
5
)
}
func
(
s
*
MSReplicaSet
)
default
PS
ConfigMapName
()
string
{
func
(
s
*
MSReplicaSet
)
default
Master
ConfigMapName
()
string
{
return
fmt
.
Sprintf
(
"cm-ps-%v"
,
s
.
Job
.
job
.
Spec
.
RuntimeId
)
}
pkg/trainer/training.go
浏览文件 @
5922cdda
...
...
@@ -34,8 +34,6 @@ import (
"gitee.com/mindspore/ms-operator/pkg/util"
)
// TODO(jlewi): We should switch a New pattern and make trainingJob private so we can
// ensure correctness on creation.
type
TrainingJob
struct
{
job
*
msv1
.
MSJob
...
...
@@ -55,9 +53,7 @@ type TrainingJob struct {
memberCounter
int
}
// TODO(jose5918): We don't really need the cluster spec for this operator but no harm in leaving it for POC
// ClusterSpec represents a cluster TensorFlow specification.
// https://www.tensorflow.org/deploy/distributed#create_a_tftrainclusterspec_to_describe_the_cluster
// ClusterSpec represents a cluster MindSpore specification.
// It is a map from job names to network addresses.
type
ClusterSpec
map
[
string
][]
string
...
...
@@ -110,7 +106,6 @@ func (j *TrainingJob) ClusterSpec() ClusterSpec {
// createResources creates all the replicas if requested
func
(
j
*
TrainingJob
)
createResources
(
config
*
msv1
.
ControllerConfig
)
error
{
// TODO(jose5918) Need to figure out where it is best to add worldSize logic
// Get MS worldSize by adding replicas
worldSize
:=
int32
(
0
)
for
_
,
r
:=
range
j
.
Replicas
{
...
...
@@ -144,7 +139,6 @@ func (j *TrainingJob) GetStatus() (msv1.State, []*msv1.MSReplicaStatus, error) {
replicaStatuses
:=
make
([]
*
msv1
.
MSReplicaStatus
,
0
)
// The state for each replica.
// TODO(jlewi): We will need to modify this code if we want to allow multiples of a given type of replica.
replicaSetStates
:=
make
(
map
[
msv1
.
MSReplicaType
]
msv1
.
ReplicaState
)
for
_
,
r
:=
range
j
.
Replicas
{
...
...
@@ -176,8 +170,6 @@ func (j *TrainingJob) GetStatus() (msv1.State, []*msv1.MSReplicaStatus, error) {
// isRetryableTerminationState returns true if a container terminated in a state
// that we consider retryable.
func
isRetryableTerminationState
(
s
*
v1
.
ContainerStateTerminated
)
bool
{
// TODO(jlewi): Need to match logic in
// https://cs.corp.google.com/piper///depot/google3/cloud/ml/beta/job/training_job_state_util.cc?l=88
if
s
.
Reason
==
"OOMKilled"
{
// If the user's process causes an OOM and Docker kills the container,
// the termination reason of ContainerState will be specified to
...
...
@@ -189,8 +181,6 @@ func isRetryableTerminationState(s *v1.ContainerStateTerminated) bool {
return
false
}
// TODO(jlewi): Should we use the exit code reported in the termination
// log message and not the ExitCode reported by the container.
if
s
.
ExitCode
>=
0
&&
s
.
ExitCode
<=
127
{
// For the exit_code in [0, 127]:
...
...
@@ -271,19 +261,12 @@ func (j *TrainingJob) setupReplicas() error {
}
func
(
j
*
TrainingJob
)
Delete
()
{
// TODO(jlewi): Delete is what should cause us to delete the Pods.
// we shouldn't delete the pods when the jobs finish because leaving the pods
// allows us to get the logs from the pods after the job finishes.
//
log
.
Infof
(
"MSJob %v deleted by the user"
,
j
.
fullname
())
// TODO(jlewi): This logic is probably insufficient.
if
j
.
job
.
Status
.
Phase
!=
msv1
.
MSJobPhaseCleanUp
{
j
.
status
.
Phase
=
msv1
.
MSJobPhaseCleanUp
}
// TODO(jlewi): Does it make sense to explicitly delete the resources? Should
// we just rely on K8s garbage collection to delete the resources before
// deleting MSJob?
if
cErr
:=
j
.
deleteResources
();
cErr
!=
nil
{
log
.
Errorf
(
"trainingJob.deleteResources() error; %v"
,
cErr
)
}
...
...
@@ -331,9 +314,6 @@ func (j *TrainingJob) Reconcile(config *msv1.ControllerConfig) error {
return
err
}
// TODO(jlewi): Can we determine from the CRD status whether we should
// Create the resources or not? We need to ensure the resources exist so for
// now we always call Create.
if
j
.
job
.
Status
.
Phase
==
msv1
.
MSJobPhaseCreating
||
j
.
job
.
Status
.
Phase
==
msv1
.
MSJobPhaseRunning
{
// We call Create to make sure all the resources exist and are running.
if
cErr
:=
j
.
createResources
(
config
);
cErr
!=
nil
{
...
...
@@ -354,7 +334,6 @@ func (j *TrainingJob) Reconcile(config *msv1.ControllerConfig) error {
log
.
Errorf
(
"GetStatus() for job %v returned error: %v"
,
j
.
job
.
ObjectMeta
.
Name
,
err
)
return
err
}
// TODO(jlewi): We should update the Phase if we detect the job is done.
if
state
==
msv1
.
StateFailed
{
log
.
Errorf
(
"Master failed Job: %v."
,
j
.
job
.
ObjectMeta
.
Name
)
j
.
status
.
Phase
=
msv1
.
MSJobPhaseDone
...
...
@@ -367,7 +346,6 @@ func (j *TrainingJob) Reconcile(config *msv1.ControllerConfig) error {
log
.
Infof
(
"Job %v status=%v"
,
j
.
job
.
ObjectMeta
.
Name
,
util
.
Pformat
(
j
.
status
))
}
}
// TODO(jose5918) Need to figure out where it is best to add worldSize logic
// Get MS worldSize by adding replicas
worldSize
:=
int32
(
0
)
for
_
,
r
:=
range
j
.
Replicas
{
...
...
pkg/util/k8sutil/k8sutil.go
浏览文件 @
5922cdda
...
...
@@ -33,8 +33,6 @@ import (
const
RecommendedConfigPathEnvVar
=
"KUBECONFIG"
// TODO(jlewi): I think this function is used to add an owner to a resource. I think we we should use this
// method to ensure all resources created for the TFJob are owned by the TFJob.
func
addOwnerRefToObject
(
o
metav1
.
Object
,
r
metav1
.
OwnerReference
)
{
o
.
SetOwnerReferences
(
append
(
o
.
GetOwnerReferences
(),
r
))
}
...
...
@@ -99,15 +97,11 @@ func JobListOpt(clusterName string) metav1.ListOptions {
func
LabelsForJob
(
jobName
string
)
map
[
string
]
string
{
return
map
[
string
]
string
{
// TODO(jlewi): Need to set appropriate labels for TF.
"ms_job"
:
jobName
,
"app"
:
msv1
.
AppLabel
,
}
}
// TODO(jlewi): CascadeDeletOptions are part of garbage collection policy.
// Do we want to use this? See
// https://kubernetes.io/docs/concepts/workloads/controllers/garbage-collection/
func
CascadeDeleteOptions
(
gracePeriodSeconds
int64
)
*
metav1
.
DeleteOptions
{
return
&
metav1
.
DeleteOptions
{
GracePeriodSeconds
:
func
(
t
int64
)
*
int64
{
return
&
t
}(
gracePeriodSeconds
),
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录