# k8s.yaml.template
# Committed by Jinhua Liang.
# Volcano batch Job template. Every `<$ NAME $>` placeholder is expected to be
# filled in by the template renderer before the manifest is applied.
apiVersion: batch.volcano.sh/v1alpha1
kind: Job
metadata:
  # Quoted so a numeric- or boolean-looking job name still parses as a string.
  name: "<$ JOB_NAME $>"
spec:
  # Left unquoted on purpose: this field has to render as an integer.
  minAvailable: <$ TOTAL_POD_NUM $>
  schedulerName: volcano
  # Job-level policies: both a pod eviction and a pod failure map to RestartJob.
  policies:
    - event: PodEvicted
      action: RestartJob
    - event: PodFailed
      action: RestartJob
  # The task groups of the job follow as list items.
  tasks:
    # Parameter-server task group; replica count comes from PSERVER_NUM.
    - name: pserver
      replicas: <$ PSERVER_NUM $>
      template:
        metadata:
          labels:
            paddle-job-pserver: paddle-rec
        spec:
          restartPolicy: OnFailure
          imagePullSecrets:
            - name: default-secret
          containers:
            - name: pserver
              # Quoted so the rendered image reference is always a string.
              image: "<$ IMAGE $>"
              imagePullPolicy: Always
              # Runs the env-setup script shipped in the mounted config volume.
              command:
                - '/bin/bash'
              args:
                - "-c"
                - |
                  set -ex
                  sh /usr/paddlerec/set_k8s_env.sh start_fluid
              volumeMounts:
                - name: model-config
                  mountPath: "/usr/paddlerec"
              resources:
                # Limits are templated; requests are fixed at 1 CPU / 1Gi / 1Gi.
                limits:
                  cpu: <$ CPU_NUM $>
                  memory: <$ MEMORY $>
                  ephemeral-storage: <$ STORAGE $>
                requests:
                  cpu: 1
                  memory: 1Gi
                  ephemeral-storage: 1Gi
              env:
                # Logging.
                - name: GLOG_v
                  value: "<$ GLOG_V $>"
                - name: GLOG_logtostderr
                  value: "1"
                - name: TOPOLOGY
                  value: ""
                - name: TRAINER_PACKAGE
                  value: /root/paddlejob
                - name: PADDLE_INIT_NICS
                  value: eth2
                # Pod identity, read from the pod's own fields via fieldRef.
                - name: NAMESPACE
                  valueFrom:
                    fieldRef:
                      apiVersion: v1
                      fieldPath: metadata.namespace
                - name: POD_IP
                  valueFrom:
                    fieldRef:
                      apiVersion: v1
                      fieldPath: status.podIP
                - name: POD_NAME
                  valueFrom:
                    fieldRef:
                      apiVersion: v1
                      fieldPath: metadata.name
                - name: PADDLE_CURRENT_IP
                  valueFrom:
                    fieldRef:
                      apiVersion: v1
                      fieldPath: status.podIP
                # Cluster shape and role.
                - name: PADDLE_JOB_NAME
                  value: paddle-rec
                - name: PADDLE_IS_LOCAL
                  value: "0"
                - name: PADDLE_TRAINERS_NUM
                  value: "<$ TRAINER_NUM $>"
                - name: PADDLE_PSERVERS_NUM
                  value: "<$ PSERVER_NUM $>"
                # NOTE(review): the trainer task sets this to "3600" - confirm
                # the difference is intentional.
                - name: FLAGS_rpc_deadline
                  value: "100000"
                # NOTE(review): `-r WORKER` is also what the trainer task uses;
                # presumably unrelated to TRAINING_ROLE - confirm.
                - name: ENTRY
                  value: python -m paddlerec.run -m /usr/paddlerec/config.yaml -r WORKER
                - name: PADDLE_PORT
                  value: "30240"
                - name: PADDLE_TRAINING_ROLE
                  value: PSERVER
                - name: TRAINING_ROLE
                  value: PSERVER
          volumes:
            - name: model-config
              configMap:
                name: modelconfig
                # Decimal 511 == octal 0777. A bare 0777 is octal under
                # YAML 1.1 but decimal 777 under YAML 1.2; 511 is unambiguous.
                defaultMode: 511

    # Trainer task group; replica count comes from TRAINER_NUM.
    - name: trainer
      replicas: <$ TRAINER_NUM $>
      # Task-level policy: a TaskCompleted event maps to CompleteJob.
      policies:
        - event: TaskCompleted
          action: CompleteJob
      template:
        metadata:
          labels:
            paddle-job: paddle-rec
        spec:
          restartPolicy: OnFailure
          imagePullSecrets:
            - name: default-secret
          containers:
            - name: trainer
              # Quoted so the rendered image reference is always a string.
              image: "<$ IMAGE $>"
              imagePullPolicy: Always
              # Runs the env-setup script shipped in the mounted config volume.
              command:
                - '/bin/bash'
              args:
                - "-c"
                - |
                  set -ex
                  sh /usr/paddlerec/set_k8s_env.sh start_fluid
              volumeMounts:
                - name: model-config
                  mountPath: "/usr/paddlerec"
              resources:
                # Limits are templated; requests are fixed at 1 CPU / 1Gi / 1Gi.
                limits:
                  cpu: <$ CPU_NUM $>
                  memory: <$ MEMORY $>
                  ephemeral-storage: <$ STORAGE $>
                requests:
                  cpu: 1
                  memory: 1Gi
                  ephemeral-storage: 1Gi
              env:
                # Logging.
                - name: GLOG_v
                  value: "<$ GLOG_V $>"
                - name: GLOG_logtostderr
                  value: "1"
                - name: TRAINER_PACKAGE
                  value: /root/paddlejob
                - name: PADDLE_INIT_NICS
                  value: eth2
                # Pod identity, read from the pod's own fields via fieldRef.
                - name: NAMESPACE
                  valueFrom:
                    fieldRef:
                      apiVersion: v1
                      fieldPath: metadata.namespace
                - name: POD_IP
                  valueFrom:
                    fieldRef:
                      apiVersion: v1
                      fieldPath: status.podIP
                - name: POD_NAME
                  valueFrom:
                    fieldRef:
                      apiVersion: v1
                      fieldPath: metadata.name
                - name: PADDLE_CURRENT_IP
                  valueFrom:
                    fieldRef:
                      apiVersion: v1
                      fieldPath: status.podIP
                # Cluster shape and role.
                - name: PADDLE_JOB_NAME
                  value: paddle-rec
                - name: PADDLE_IS_LOCAL
                  value: "0"
                # NOTE(review): the pserver task sets this to "100000" -
                # confirm the difference is intentional.
                - name: FLAGS_rpc_deadline
                  value: "3600"
                - name: PADDLE_PORT
                  value: "30240"
                - name: PADDLE_PSERVERS_NUM
                  value: "<$ PSERVER_NUM $>"
                - name: PADDLE_TRAINERS_NUM
                  value: "<$ TRAINER_NUM $>"
                - name: PADDLE_TRAINING_ROLE
                  value: TRAINER
                - name: TRAINING_ROLE
                  value: TRAINER
                - name: ENTRY
                  value: python -m paddlerec.run -m /usr/paddlerec/config.yaml -r WORKER
          volumes:
            - name: model-config
              configMap:
                name: modelconfig
                # Decimal 511 == octal 0777. A bare 0777 is octal under
                # YAML 1.1 but decimal 777 under YAML 1.2; 511 is unambiguous.
                defaultMode: 511