提交 39238142 · 作者: wangjiawei04

update latest yamls

上级 ca3a6719
...@@ -26,7 +26,7 @@ spec: ...@@ -26,7 +26,7 @@ spec:
type: "" type: ""
name: seqdata name: seqdata
containers: containers:
- image: wangjiawei1993/edldemo:v19 - image: hub.baidubce.com/ctr/edldemo:latest
command: command:
- paddle_k8s - paddle_k8s
- start_fluid - start_fluid
...@@ -116,7 +116,7 @@ spec: ...@@ -116,7 +116,7 @@ spec:
type: "" type: ""
name: seqdata name: seqdata
containers: containers:
- image: wangjiawei1993/edldemo:v19 - image: hub.baidubce.com/ctr/edldemo:latest
command: command:
- paddle_k8s - paddle_k8s
- start_fluid - start_fluid
...@@ -206,7 +206,7 @@ spec: ...@@ -206,7 +206,7 @@ spec:
type: "" type: ""
name: seqdata name: seqdata
containers: containers:
- image: wangjiawei1993/edldemo:v19 - image: hub.baidubce.com/ctr/edldemo:latest
command: command:
- paddle_k8s - paddle_k8s
- start_fluid - start_fluid
......
...@@ -7,7 +7,7 @@ metadata: ...@@ -7,7 +7,7 @@ metadata:
spec: spec:
containers: containers:
- name: cube-0 - name: cube-0
image: wangjiawei1993/cube:v11 image: hub.baidubce.com/ctr/cube:latest
workingDir: /cube workingDir: /cube
command: ['/bin/bash'] command: ['/bin/bash']
args: ['start.sh'] args: ['start.sh']
...@@ -28,7 +28,7 @@ metadata: ...@@ -28,7 +28,7 @@ metadata:
spec: spec:
containers: containers:
- name: cube-1 - name: cube-1
image: wangjiawei1993/cube:v11 image: hub.baidubce.com/ctr/cube:latest
workingDir: /cube workingDir: /cube
command: ['/bin/bash'] command: ['/bin/bash']
args: ['start.sh'] args: ['start.sh']
......
...@@ -12,7 +12,7 @@ spec: ...@@ -12,7 +12,7 @@ spec:
name: file-home name: file-home
containers: containers:
- name: file-server - name: file-server
image: halverneus/static-file-server image: hub.baidubce.com/ctr/file-server:latest
ports: ports:
- containerPort: 8080 - containerPort: 8080
volumeMounts: volumeMounts:
......
...@@ -12,7 +12,7 @@ spec: ...@@ -12,7 +12,7 @@ spec:
name: file-home name: file-home
containers: containers:
- name: file-server - name: file-server
image: halverneus/static-file-server image: hub.baidubce.com/ctr/file-server:latest
ports: ports:
- containerPort: 8080 - containerPort: 8080
volumeMounts: volumeMounts:
...@@ -37,7 +37,6 @@ spec: ...@@ -37,7 +37,6 @@ spec:
--- ---
apiVersion: v1 apiVersion: v1
kind: Pod kind: Pod
metadata: metadata:
...@@ -47,7 +46,7 @@ metadata: ...@@ -47,7 +46,7 @@ metadata:
spec: spec:
containers: containers:
- name: cube-0 - name: cube-0
image: wangjiawei1993/cube:v11 image: hub.baidubce.com/ctr/cube:latest
workingDir: /cube workingDir: /cube
command: ['/bin/bash'] command: ['/bin/bash']
args: ['start.sh'] args: ['start.sh']
...@@ -68,7 +67,7 @@ metadata: ...@@ -68,7 +67,7 @@ metadata:
spec: spec:
containers: containers:
- name: cube-1 - name: cube-1
image: wangjiawei1993/cube:v11 image: hub.baidubce.com/ctr/cube:latest
workingDir: /cube workingDir: /cube
command: ['/bin/bash'] command: ['/bin/bash']
args: ['start.sh'] args: ['start.sh']
...@@ -123,7 +122,7 @@ metadata: ...@@ -123,7 +122,7 @@ metadata:
spec: spec:
containers: containers:
- name: cube-transfer - name: cube-transfer
image: wangjiawei1993/cube-transfer:v18 image: hub.baidubce.com/ctr/cube-transfer:latest
workingDir: / workingDir: /
env: env:
- name: POD_IP - name: POD_IP
...@@ -150,7 +149,7 @@ metadata: ...@@ -150,7 +149,7 @@ metadata:
spec: spec:
containers: containers:
- name: paddleserving - name: paddleserving
image: wangjiawei1993/paddleserving:v7-debug image: hub.baidubce.com/ctr/paddleserving:latest
workingDir: /serving workingDir: /serving
command: ['/bin/bash'] command: ['/bin/bash']
args: ['run.sh'] args: ['run.sh']
...@@ -183,7 +182,7 @@ metadata: ...@@ -183,7 +182,7 @@ metadata:
spec: spec:
containers: containers:
- name: pdservingclient - name: pdservingclient
image: wangjiawei1993/pdservingclient:v4 image: hub.baidubce.com/ctr/pdservingclient:latest
workingDir: / workingDir: /
command: ['bash'] command: ['bash']
args: ['nonstop.sh'] args: ['nonstop.sh']
...@@ -196,7 +195,7 @@ kind: Job ...@@ -196,7 +195,7 @@ kind: Job
metadata: metadata:
name: edl-demo name: edl-demo
spec: spec:
minAvailable: 6 minAvailable: 4
schedulerName: volcano schedulerName: volcano
policies: policies:
- event: PodEvicted - event: PodEvicted
...@@ -204,7 +203,7 @@ spec: ...@@ -204,7 +203,7 @@ spec:
- event: PodFailed - event: PodFailed
action: RestartJob action: RestartJob
tasks: tasks:
- replicas: 3 - replicas: 2
name: pserver name: pserver
template: template:
metadata: metadata:
...@@ -219,7 +218,7 @@ spec: ...@@ -219,7 +218,7 @@ spec:
type: "" type: ""
name: seqdata name: seqdata
containers: containers:
- image: wangjiawei1993/edldemo:v19 - image: hub.baidubce.com/ctr/edldemo:latest
command: command:
- paddle_k8s - paddle_k8s
- start_fluid - start_fluid
...@@ -273,9 +272,9 @@ spec: ...@@ -273,9 +272,9 @@ spec:
- name: PADDLE_IS_LOCAL - name: PADDLE_IS_LOCAL
value: "0" value: "0"
- name: PADDLE_TRAINERS_NUM - name: PADDLE_TRAINERS_NUM
value: "3" value: "2"
- name: PADDLE_PSERVERS_NUM - name: PADDLE_PSERVERS_NUM
value: "3" value: "2"
- name: FLAGS_rpc_deadline - name: FLAGS_rpc_deadline
value: "36000000" value: "36000000"
- name: ENTRY - name: ENTRY
...@@ -309,7 +308,7 @@ spec: ...@@ -309,7 +308,7 @@ spec:
type: "" type: ""
name: seqdata name: seqdata
containers: containers:
- image: wangjiawei1993/edldemo:v19 - image: hub.baidubce.com/ctr/edldemo:latest
command: command:
- paddle_k8s - paddle_k8s
- start_fluid - start_fluid
...@@ -368,9 +367,9 @@ spec: ...@@ -368,9 +367,9 @@ spec:
- name: PADDLE_PORT - name: PADDLE_PORT
value: "30240" value: "30240"
- name: PADDLE_PSERVERS_NUM - name: PADDLE_PSERVERS_NUM
value: "3" value: "2"
- name: PADDLE_TRAINERS_NUM - name: PADDLE_TRAINERS_NUM
value: "3" value: "2"
- name: PADDLE_TRAINING_ROLE - name: PADDLE_TRAINING_ROLE
value: TRAINER value: TRAINER
- name: TRAINING_ROLE - name: TRAINING_ROLE
...@@ -381,7 +380,7 @@ spec: ...@@ -381,7 +380,7 @@ spec:
value: (/postprocess &) && cd /workspace/ctr && python train.py --is_local 0 --cloud_train 1 value: (/postprocess &) && cd /workspace/ctr && python train.py --is_local 0 --cloud_train 1
restartPolicy: OnFailure restartPolicy: OnFailure
- replicas: 2 - replicas: 1
policies: policies:
- event: TaskCompleted - event: TaskCompleted
action: CompleteJob action: CompleteJob
...@@ -399,7 +398,7 @@ spec: ...@@ -399,7 +398,7 @@ spec:
type: "" type: ""
name: seqdata name: seqdata
containers: containers:
- image: wangjiawei1993/edldemo:v19 - image: hub.baidubce.com/ctr/edldemo:latest
command: command:
- paddle_k8s - paddle_k8s
- start_fluid - start_fluid
...@@ -458,9 +457,9 @@ spec: ...@@ -458,9 +457,9 @@ spec:
- name: PADDLE_PORT - name: PADDLE_PORT
value: "30240" value: "30240"
- name: PADDLE_PSERVERS_NUM - name: PADDLE_PSERVERS_NUM
value: "3" value: "2"
- name: PADDLE_TRAINERS_NUM - name: PADDLE_TRAINERS_NUM
value: "3" value: "2"
- name: PADDLE_TRAINING_ROLE - name: PADDLE_TRAINING_ROLE
value: TRAINER value: TRAINER
- name: TRAINING_ROLE - name: TRAINING_ROLE
...@@ -470,3 +469,4 @@ spec: ...@@ -470,3 +469,4 @@ spec:
- name: ENTRY - name: ENTRY
value: (/postprocess &) && cd /workspace/ctr && python train.py --is_local 0 --cloud_train 1 value: (/postprocess &) && cd /workspace/ctr && python train.py --is_local 0 --cloud_train 1
restartPolicy: OnFailure restartPolicy: OnFailure
...@@ -7,7 +7,7 @@ metadata: ...@@ -7,7 +7,7 @@ metadata:
spec: spec:
containers: containers:
- name: pdservingclient - name: pdservingclient
image: wangjiawei1993/pdservingclient:v4 image: hub.baidubce.com/ctr/pdservingclient:latest
workingDir: / workingDir: /
command: ['bash'] command: ['bash']
args: ['nonstop.sh'] args: ['nonstop.sh']
......
...@@ -7,7 +7,7 @@ metadata: ...@@ -7,7 +7,7 @@ metadata:
spec: spec:
containers: containers:
- name: paddleserving - name: paddleserving
image: wangjiawei1993/paddleserving:v7-debug image: hub.baidubce.com/ctr/paddleserving:latest
workingDir: /serving workingDir: /serving
command: ['/bin/bash'] command: ['/bin/bash']
args: ['run.sh'] args: ['run.sh']
......
...@@ -7,7 +7,7 @@ metadata: ...@@ -7,7 +7,7 @@ metadata:
spec: spec:
containers: containers:
- name: cube-transfer - name: cube-transfer
image: wangjiawei1993/cube-transfer:v18 image: hub.baidubce.com/ctr/cube-transfer:latest
workingDir: / workingDir: /
env: env:
- name: POD_IP - name: POD_IP
......
---
# Volcano Job "edl-demo": runs two tasks, `pserver` and `trainer`, that share
# the same container image and start command and differ in their role
# environment variables (PSERVER vs TRAINER), labels, and volume path.
apiVersion: batch.volcano.sh/v1alpha1
kind: Job
metadata:
  name: edl-demo
spec:
  # Equals the total replica count of the two tasks below (2 + 2).
  minAvailable: 4
  schedulerName: volcano
  # Job-level policies: restart the whole job on pod eviction or failure.
  policies:
    - event: PodEvicted
      action: RestartJob
    - event: PodFailed
      action: RestartJob
  tasks:
    # ---- parameter-server task ----
    - replicas: 2
      name: pserver
      template:
        metadata:
          labels:
            paddle-job-pserver: fluid-ctr
        spec:
          imagePullSecrets:
            - name: default-secret
          volumes:
            # NOTE(review): the trainer task mounts /home/work/data instead;
            # confirm the two different host paths are intentional.
            - hostPath:
                path: /home/work/
                type: ""
              name: seqdata
          containers:
            # NOTE(review): other manifests touched in the same change use
            # hub.baidubce.com/ctr/edldemo:latest -- confirm whether this
            # image reference should follow.
            - image: wangjiawei1993/edldemo:v19
              command:
                - paddle_k8s
                - start_fluid
              imagePullPolicy: IfNotPresent
              name: pserver
              volumeMounts:
                - mountPath: /data
                  name: seqdata
              resources:
                limits:
                  cpu: 10
                  memory: 30Gi
                  ephemeral-storage: 10Gi
                requests:
                  cpu: 1
                  memory: 100M
                  ephemeral-storage: 1Gi
              env:
                - name: GLOG_v
                  value: "0"
                - name: GLOG_logtostderr
                  value: "1"
                - name: TOPOLOGY
                  value: ""
                - name: TRAINER_PACKAGE
                  value: /workspace
                - name: PADDLE_INIT_NICS
                  value: eth2
                # Downward-API values injected from the running pod.
                - name: NAMESPACE
                  valueFrom:
                    fieldRef:
                      apiVersion: v1
                      fieldPath: metadata.namespace
                - name: POD_IP
                  valueFrom:
                    fieldRef:
                      apiVersion: v1
                      fieldPath: status.podIP
                - name: POD_NAME
                  valueFrom:
                    fieldRef:
                      apiVersion: v1
                      fieldPath: metadata.name
                - name: PADDLE_CURRENT_IP
                  valueFrom:
                    fieldRef:
                      apiVersion: v1
                      fieldPath: status.podIP
                - name: PADDLE_JOB_NAME
                  value: fluid-ctr
                - name: PADDLE_IS_LOCAL
                  value: "0"
                # Presumably these counts must match the `replicas` of the
                # trainer and pserver tasks -- verify when scaling.
                - name: PADDLE_TRAINERS_NUM
                  value: "2"
                - name: PADDLE_PSERVERS_NUM
                  value: "2"
                - name: FLAGS_rpc_deadline
                  value: "36000000"
                - name: ENTRY
                  value: cd /workspace/ctr && python train.py --is_local 0 --cloud_train 1
                - name: PADDLE_PORT
                  value: "30240"
                - name: LD_LIBRARY_PATH
                  value: /usr/local/lib:/usr/local/nvidia/lib64:/usr/local/rdma/lib64:/usr/lib64/mlnx_ofed/valgrind
                - name: PADDLE_TRAINING_ROLE
                  value: PSERVER
                - name: TRAINING_ROLE
                  value: PSERVER
          restartPolicy: OnFailure
    # ---- trainer task ----
    - replicas: 2
      # Task-level policy: completing this task completes the whole job.
      policies:
        - event: TaskCompleted
          action: CompleteJob
      name: trainer
      template:
        metadata:
          labels:
            paddle-job: fluid-ctr
        spec:
          # Trainers are restricted to nodes labelled nodeType=model.
          nodeSelector:
            nodeType: model
          imagePullSecrets:
            - name: default-secret
          volumes:
            - hostPath:
                path: /home/work/data
                type: ""
              name: seqdata
          containers:
            # NOTE(review): other manifests touched in the same change use
            # hub.baidubce.com/ctr/edldemo:latest -- confirm whether this
            # image reference should follow.
            - image: wangjiawei1993/edldemo:v19
              command:
                - paddle_k8s
                - start_fluid
              imagePullPolicy: IfNotPresent
              name: trainer
              volumeMounts:
                - mountPath: /data
                  name: seqdata
              resources:
                limits:
                  cpu: 10
                  memory: 30Gi
                  ephemeral-storage: 10Gi
                requests:
                  cpu: 1
                  memory: 100M
                  # NOTE(review): pserver requests 1Gi here; confirm 10Gi
                  # is intended for trainers.
                  ephemeral-storage: 10Gi
              env:
                - name: GLOG_v
                  value: "0"
                - name: GLOG_logtostderr
                  value: "1"
                # Explicit empty string, matching the pserver task; an
                # omitted value defaults to "" in Kubernetes.
                - name: TOPOLOGY
                  value: ""
                - name: TRAINER_PACKAGE
                  value: /workspace
                - name: PADDLE_INIT_NICS
                  value: eth2
                - name: CPU_NUM
                  value: "2"
                # Downward-API values injected from the running pod.
                - name: NAMESPACE
                  valueFrom:
                    fieldRef:
                      apiVersion: v1
                      fieldPath: metadata.namespace
                - name: POD_IP
                  valueFrom:
                    fieldRef:
                      apiVersion: v1
                      fieldPath: status.podIP
                - name: POD_NAME
                  valueFrom:
                    fieldRef:
                      apiVersion: v1
                      fieldPath: metadata.name
                - name: PADDLE_CURRENT_IP
                  valueFrom:
                    fieldRef:
                      apiVersion: v1
                      fieldPath: status.podIP
                - name: PADDLE_JOB_NAME
                  value: fluid-ctr
                - name: PADDLE_IS_LOCAL
                  value: "0"
                - name: FLAGS_rpc_deadline
                  value: "36000000"
                - name: PADDLE_PORT
                  value: "30240"
                # Presumably these counts must match the `replicas` of the
                # pserver and trainer tasks -- verify when scaling.
                - name: PADDLE_PSERVERS_NUM
                  value: "2"
                - name: PADDLE_TRAINERS_NUM
                  value: "2"
                - name: PADDLE_TRAINING_ROLE
                  value: TRAINER
                - name: TRAINING_ROLE
                  value: TRAINER
                - name: LD_LIBRARY_PATH
                  value: /usr/local/lib:/usr/local/nvidia/lib64:/usr/local/rdma/lib64:/usr/lib64/mlnx_ofed/valgrind
                # Starts /postprocess in the background before training.
                - name: ENTRY
                  value: (/postprocess &) && cd /workspace/ctr && python train.py --is_local 0 --cloud_train 1
          restartPolicy: OnFailure
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册