# k8s.yaml.template - 6.0 KB
# Committed by Jinhua Liang

---
# Volcano batch Job template. Tokens of the form <$ NAME $> are placeholders
# that are expected to be filled in by the template renderer before the
# manifest is submitted.
apiVersion: batch.volcano.sh/v1alpha1
kind: Job
metadata:
  # Quoted so a numeric- or boolean-looking job name still renders as a string.
  name: "<$ JOB_NAME $>"
spec:
  # Left unquoted on purpose: the rendered value must be an integer.
  minAvailable: <$ TOTAL_POD_NUM $>
  schedulerName: volcano
  # Job-level policies: restart the whole job when a pod is evicted or fails.
  policies:
    - event: PodEvicted
      action: RestartJob
    - event: PodFailed
      action: RestartJob
  # One entry per task; the task definitions follow.
  tasks:
    # Parameter-server task: PSERVER_NUM replicas whose containers run with
    # TRAINING_ROLE / PADDLE_TRAINING_ROLE set to PSERVER.
    - replicas: <$ PSERVER_NUM $>
      name: pserver
      template:
        metadata:
          labels:
            paddle-job-pserver: paddle-rec
        spec:
          imagePullSecrets:
            - name: default-secret
          containers:
            # Quoted so the rendered image reference is always a string.
            - image: "<$ IMAGE $>"
              command:
                - '/bin/bash'
              args:
                - "-c"
                # Abort on the first failing command and trace each command
                # (set -ex), then run the start script found under the
                # model-config mount path.
                - |
                  set -ex
                  sh /usr/paddlerec/set_k8s_env.sh start_fluid
              imagePullPolicy: Always
              volumeMounts:
                - name: model-config
                  mountPath: "/usr/paddlerec"
              name: pserver
              resources:
                # Limits are templated; requests are fixed at a small baseline.
                limits:
                  cpu: <$ CPU_NUM $>
                  memory: <$ MEMORY $>
                  ephemeral-storage: <$ STORAGE $>
                requests:
                  cpu: 1
                  memory: 1Gi
                  ephemeral-storage: 1Gi
              env:
                - name: GLOG_v
                  value: "<$ GLOG_V $>"
                - name: GLOG_logtostderr
                  value: "1"
                - name: TOPOLOGY
                  value: ""
                - name: TRAINER_PACKAGE
                  value: /root/paddlejob
                - name: PADDLE_INIT_NICS
                  value: eth2
                # The next four variables are filled from the pod's own fields
                # (namespace, IP, name) via fieldRef.
                - name: NAMESPACE
                  valueFrom:
                    fieldRef:
                      apiVersion: v1
                      fieldPath: metadata.namespace
                - name: POD_IP
                  valueFrom:
                    fieldRef:
                      apiVersion: v1
                      fieldPath: status.podIP
                - name: POD_NAME
                  valueFrom:
                    fieldRef:
                      apiVersion: v1
                      fieldPath: metadata.name
                - name: PADDLE_CURRENT_IP
                  valueFrom:
                    fieldRef:
                      apiVersion: v1
                      fieldPath: status.podIP
                - name: PADDLE_JOB_NAME
                  value: paddle-rec
                - name: PADDLE_IS_LOCAL
                  value: "0"
                - name: PADDLE_TRAINERS_NUM
                  value: "<$ TRAINER_NUM $>"
                - name: PADDLE_PSERVERS_NUM
                  value: "<$ PSERVER_NUM $>"
                - name: FLAGS_rpc_deadline
                  value: "100000"
                # NOTE(review): ENTRY passes "-r WORKER" here exactly as in the
                # trainer task; presumably -r is independent of the
                # PSERVER/TRAINER role - confirm.
                - name: ENTRY
                  value: python -m paddlerec.run -m /usr/paddlerec/config.yaml -r WORKER
                - name: PADDLE_PORT
                  value: "30240"
                - name: PADDLE_TRAINING_ROLE
                  value: PSERVER
                - name: TRAINING_ROLE
                  value: PSERVER
          volumes:
            - name: model-config
              configMap:
                name: modelconfig
                # Decimal 511 == octal 0777. Written in decimal because a bare
                # 0777 is octal only under YAML 1.1; a YAML 1.2 parser reads
                # it as decimal 777, which is outside the valid mode range.
                defaultMode: 511
          restartPolicy: OnFailure

    # Trainer task: TRAINER_NUM replicas whose containers run with
    # TRAINING_ROLE / PADDLE_TRAINING_ROLE set to TRAINER.
    - replicas: <$ TRAINER_NUM $>
      # Task-level policy: mark the whole job complete once this task completes.
      policies:
        - event: TaskCompleted
          action: CompleteJob
      name: trainer
      template:
        metadata:
          labels:
            paddle-job: paddle-rec
        spec:
          imagePullSecrets:
            - name: default-secret
          containers:
            # Quoted so the rendered image reference is always a string.
            - image: "<$ IMAGE $>"
              command:
                - '/bin/bash'
              args:
                - "-c"
                # Abort on the first failing command and trace each command
                # (set -ex), then run the start script found under the
                # model-config mount path.
                - |
                  set -ex
                  sh /usr/paddlerec/set_k8s_env.sh start_fluid
              imagePullPolicy: Always
              volumeMounts:
                - name: model-config
                  mountPath: "/usr/paddlerec"
              name: trainer
              resources:
                # Limits are templated; requests are fixed at a small baseline.
                limits:
                  cpu: <$ CPU_NUM $>
                  memory: <$ MEMORY $>
                  ephemeral-storage: <$ STORAGE $>
                requests:
                  cpu: 1
                  memory: 1Gi
                  ephemeral-storage: 1Gi
              env:
                - name: GLOG_v
                  value: "<$ GLOG_V $>"
                - name: GLOG_logtostderr
                  value: "1"
                - name: TRAINER_PACKAGE
                  value: /root/paddlejob
                - name: PADDLE_INIT_NICS
                  value: eth2
                # The next four variables are filled from the pod's own fields
                # (namespace, IP, name) via fieldRef.
                - name: NAMESPACE
                  valueFrom:
                    fieldRef:
                      apiVersion: v1
                      fieldPath: metadata.namespace
                - name: POD_IP
                  valueFrom:
                    fieldRef:
                      apiVersion: v1
                      fieldPath: status.podIP
                - name: POD_NAME
                  valueFrom:
                    fieldRef:
                      apiVersion: v1
                      fieldPath: metadata.name
                - name: PADDLE_CURRENT_IP
                  valueFrom:
                    fieldRef:
                      apiVersion: v1
                      fieldPath: status.podIP
                - name: PADDLE_JOB_NAME
                  value: paddle-rec
                - name: PADDLE_IS_LOCAL
                  value: "0"
                # NOTE(review): the pserver task in this file sets
                # FLAGS_rpc_deadline to "100000"; confirm that the much
                # smaller trainer value is intended.
                - name: FLAGS_rpc_deadline
                  value: "3600"
                - name: PADDLE_PORT
                  value: "30240"
                - name: PADDLE_PSERVERS_NUM
                  value: "<$ PSERVER_NUM $>"
                - name: PADDLE_TRAINERS_NUM
                  value: "<$ TRAINER_NUM $>"
                - name: PADDLE_TRAINING_ROLE
                  value: TRAINER
                - name: TRAINING_ROLE
                  value: TRAINER
                - name: ENTRY
                  value: python -m paddlerec.run -m /usr/paddlerec/config.yaml -r WORKER
          volumes:
            - name: model-config
              configMap:
                name: modelconfig
                # Decimal 511 == octal 0777. Written in decimal because a bare
                # 0777 is octal only under YAML 1.1; a YAML 1.2 parser reads
                # it as decimal 777, which is outside the valid mode range.
                defaultMode: 511
          restartPolicy: OnFailure