提交 c984d49d 编写于 作者: X xulongteng

fix train image

上级 39d65815
apiVersion: batch.volcano.sh/v1alpha1 apiVersion: batch.volcano.sh/v1alpha1
kind: Job kind: Job
metadata: metadata:
name: edl-demo name: edl-demo
spec: spec:
minAvailable: 4 minAvailable: 4
schedulerName: volcano schedulerName: volcano
policies: policies:
- event: PodEvicted - event: PodEvicted
action: RestartJob action: RestartJob
- event: PodFailed - event: PodFailed
action: RestartJob action: RestartJob
tasks: tasks:
- replicas: 2 - replicas: 2
name: pserver name: pserver
template: template:
metadata: metadata:
labels: labels:
paddle-job-pserver: fluid-ctr paddle-job-pserver: fluid-ctr
spec: spec:
imagePullSecrets: imagePullSecrets:
- name: default-secret - name: default-secret
volumes: volumes:
- hostPath: - hostPath:
path: /home/work/ path: /home/work/
type: "" type: ""
name: seqdata name: seqdata
containers: containers:
- image: sivanzcw/edldemo:v1 - image: wangjiawei1993/edldemo:v12
command: command:
- paddle_k8s - paddle_k8s
- start_fluid - start_fluid
imagePullPolicy: IfNotPresent imagePullPolicy: IfNotPresent
name: pserver name: pserver
volumeMounts: volumeMounts:
- mountPath: /mnt/seqdata - mountPath: /data
name: seqdata name: seqdata
resources: resources:
limits: limits:
cpu: 10 cpu: 10
memory: 30Gi memory: 30Gi
ephemeral-storage: 10Gi ephemeral-storage: 10Gi
requests: requests:
cpu: 1 cpu: 1
memory: 100M memory: 100M
ephemeral-storage: 1Gi ephemeral-storage: 1Gi
env: env:
- name: GLOG_v - name: GLOG_v
value: "0" value: "0"
- name: GLOG_logtostderr - name: GLOG_logtostderr
value: "1" value: "1"
- name: TOPOLOGY - name: TOPOLOGY
value: "" value: ""
- name: TRAINER_PACKAGE - name: TRAINER_PACKAGE
value: /workspace value: /workspace
- name: PADDLE_INIT_NICS - name: PADDLE_INIT_NICS
value: eth2 value: eth2
- name: NAMESPACE - name: NAMESPACE
valueFrom: valueFrom:
fieldRef: fieldRef:
apiVersion: v1 apiVersion: v1
fieldPath: metadata.namespace fieldPath: metadata.namespace
- name: POD_IP - name: POD_IP
valueFrom: valueFrom:
fieldRef: fieldRef:
apiVersion: v1 apiVersion: v1
fieldPath: status.podIP fieldPath: status.podIP
- name: POD_NAME - name: POD_NAME
valueFrom: valueFrom:
fieldRef: fieldRef:
apiVersion: v1 apiVersion: v1
fieldPath: metadata.name fieldPath: metadata.name
- name: PADDLE_CURRENT_IP - name: PADDLE_CURRENT_IP
valueFrom: valueFrom:
fieldRef: fieldRef:
apiVersion: v1 apiVersion: v1
fieldPath: status.podIP fieldPath: status.podIP
- name: PADDLE_JOB_NAME - name: PADDLE_JOB_NAME
value: fluid-ctr value: fluid-ctr
- name: PADDLE_IS_LOCAL - name: PADDLE_IS_LOCAL
value: "0" value: "0"
- name: PADDLE_TRAINERS_NUM - name: PADDLE_TRAINERS_NUM
value: "2" value: "2"
- name: PADDLE_PSERVERS_NUM - name: PADDLE_PSERVERS_NUM
value: "2" value: "2"
- name: FLAGS_rpc_deadline - name: FLAGS_rpc_deadline
value: "36000000" value: "36000000"
- name: ENTRY - name: ENTRY
value: cd /workspace/ctr && python train.py --is_local 0 --cloud_train 1 value: cd /workspace/ctr && python train.py --is_local 0 --cloud_train 1
- name: PADDLE_PORT - name: PADDLE_PORT
value: "30236" value: "30240"
- name: LD_LIBRARY_PATH - name: LD_LIBRARY_PATH
value: /usr/local/lib:/usr/local/nvidia/lib64:/usr/local/rdma/lib64:/usr/lib64/mlnx_ofed/valgrind value: /usr/local/lib:/usr/local/nvidia/lib64:/usr/local/rdma/lib64:/usr/lib64/mlnx_ofed/valgrind
- name: PADDLE_TRAINING_ROLE - name: PADDLE_TRAINING_ROLE
value: PSERVER value: PSERVER
- name: TRAINING_ROLE - name: TRAINING_ROLE
value: PSERVER value: PSERVER
restartPolicy: OnFailure restartPolicy: OnFailure
- replicas: 2 - replicas: 2
policies: policies:
- event: TaskCompleted - event: TaskCompleted
action: CompleteJob action: CompleteJob
name: trainer name: trainer
template: template:
metadata: metadata:
labels: labels:
paddle-job: fluid-ctr paddle-job: fluid-ctr
spec: spec:
imagePullSecrets: nodeSelector:
- name: default-secret nodeType: model
volumes: imagePullSecrets:
- hostPath: - name: default-secret
path: /home/work/ volumes:
type: "" - hostPath:
name: seqdata path: /home/work/data
containers: type: ""
- image: sivanzcw/edldemo:v1 name: seqdata
command: containers:
- paddle_k8s - image: wangjiawei1993/edldemo:v12
- start_fluid command:
imagePullPolicy: IfNotPresent - paddle_k8s
name: trainer - start_fluid
volumeMounts: imagePullPolicy: IfNotPresent
- mountPath: /mnt/seqdata name: trainer
name: seqdata volumeMounts:
resources: - mountPath: /data
limits: name: seqdata
cpu: 10 resources:
memory: 30Gi limits:
ephemeral-storage: 10Gi cpu: 10
requests: memory: 30Gi
cpu: 1 ephemeral-storage: 10Gi
memory: 100M requests:
ephemeral-storage: 10Gi cpu: 1
env: memory: 100M
- name: GLOG_v ephemeral-storage: 10Gi
value: "0" env:
- name: GLOG_logtostderr - name: GLOG_v
value: "1" value: "0"
- name: TOPOLOGY - name: GLOG_logtostderr
- name: TRAINER_PACKAGE value: "1"
value: /workspace - name: TOPOLOGY
- name: PADDLE_INIT_NICS - name: TRAINER_PACKAGE
value: eth2 value: /workspace
- name: CPU_NUM - name: PADDLE_INIT_NICS
value: "2" value: eth2
- name: NAMESPACE - name: CPU_NUM
valueFrom: value: "2"
fieldRef: - name: NAMESPACE
apiVersion: v1 valueFrom:
fieldPath: metadata.namespace fieldRef:
- name: POD_IP apiVersion: v1
valueFrom: fieldPath: metadata.namespace
fieldRef: - name: POD_IP
apiVersion: v1 valueFrom:
fieldPath: status.podIP fieldRef:
- name: POD_NAME apiVersion: v1
valueFrom: fieldPath: status.podIP
fieldRef: - name: POD_NAME
apiVersion: v1 valueFrom:
fieldPath: metadata.name fieldRef:
- name: PADDLE_CURRENT_IP apiVersion: v1
valueFrom: fieldPath: metadata.name
fieldRef: - name: PADDLE_CURRENT_IP
apiVersion: v1 valueFrom:
fieldPath: status.podIP fieldRef:
- name: PADDLE_JOB_NAME apiVersion: v1
value: fluid-ctr fieldPath: status.podIP
- name: PADDLE_IS_LOCAL - name: PADDLE_JOB_NAME
value: "0" value: fluid-ctr
- name: FLAGS_rpc_deadline - name: PADDLE_IS_LOCAL
value: "36000000" value: "0"
- name: PADDLE_PORT - name: FLAGS_rpc_deadline
value: "30236" value: "36000000"
- name: PADDLE_PSERVERS_NUM - name: PADDLE_PORT
value: "2" value: "30240"
- name: PADDLE_TRAINERS_NUM - name: PADDLE_PSERVERS_NUM
value: "2" value: "2"
- name: PADDLE_TRAINING_ROLE - name: PADDLE_TRAINERS_NUM
value: TRAINER value: "2"
- name: TRAINING_ROLE - name: PADDLE_TRAINING_ROLE
value: TRAINER value: TRAINER
- name: LD_LIBRARY_PATH - name: TRAINING_ROLE
value: /usr/local/lib:/usr/local/nvidia/lib64:/usr/local/rdma/lib64:/usr/lib64/mlnx_ofed/valgrind value: TRAINER
- name: ENTRY - name: LD_LIBRARY_PATH
value: cd /workspace/ctr && python train.py --is_local 0 --cloud_train 1 value: /usr/local/lib:/usr/local/nvidia/lib64:/usr/local/rdma/lib64:/usr/lib64/mlnx_ofed/valgrind
restartPolicy: OnFailure - name: ENTRY
value: (/postprocess &) && cd /workspace/ctr && python train.py --is_local 0 --cloud_train 1
restartPolicy: OnFailure
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册