---
# trainer.yaml
# Kubernetes batch Job that starts the trainer pods of the "vgg16v2job"
# PaddlePaddle job via `paddle_k8s start_trainer v2`.
apiVersion: batch/v1
kind: Job
metadata:
  name: vgg16v2job-trainer
spec:
  # 20 pods run at once and the Job completes after 20 successful pods.
  # NOTE(review): TRAINERS and PADDLE_INIT_NUM_GRADIENT_SERVERS below carry
  # the same number; presumably all three must change together - confirm.
  parallelism: 20
  completions: 20
  template:
    metadata:
      labels:
        # Same value as the PADDLE_JOB_NAME env var below.
        paddle-job: vgg16v2job
    spec:
      imagePullSecrets:
      - name: job-registry-secret
      # Pods use the node's network namespace.
      # NOTE(review): together with the fixed PADDLE_INIT_PORT this may
      # conflict if two trainer pods land on one node - confirm scheduling.
      hostNetwork: true
      containers:
      - name: trainer
        image: "registry.baidu.com/paddlepaddle/rawjob:vgg16"
        # Always re-pull, so a re-pushed "vgg16" tag is picked up.
        imagePullPolicy: Always
        command: ["paddle_k8s", "start_trainer", "v2"]
        # Env values are quoted where they look numeric: Kubernetes requires
        # env var values to be strings.
        env:
        - name: PADDLE_JOB_NAME
          value: vgg16v2job
        - name: TRAINERS
          value: "20"
        - name: PSERVERS
          value: "10"
        - name: TOPOLOGY
          value: ""
        # Shell command line used as the training entry point.
        - name: ENTRY
          value: "cd /workspace && MKL_NUM_THREADS=1 python /workspace/vgg16.py"
        - name: TRAINER_PACKAGE
          value: "/workspace"
        - name: PADDLE_INIT_PORT
          value: "30236"
        # NOTE(review): "xgbe0" looks like a host NIC name; verify it exists
        # on every node this Job can be scheduled to.
        - name: PADDLE_INIT_NICS
          value: "xgbe0"
        - name: PADDLE_INIT_TRAINER_COUNT
          value: "1"
        - name: PADDLE_INIT_PORTS_NUM
          value: "1"
        - name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
          value: "1"
        - name: PADDLE_INIT_NUM_GRADIENT_SERVERS
          value: "20"
        - name: PADDLE_INIT_NUM_PASSES
          value: "1"
        - name: PADDLE_INIT_USE_GPU
          value: "0"
        # NOTE(review): NVIDIA library path is set although
        # PADDLE_INIT_USE_GPU is "0" - confirm whether it is still needed.
        - name: LD_LIBRARY_PATH
          value: "/usr/local/nvidia/lib64"
        # Downward API: expose the pod's own namespace to the container.
        - name: NAMESPACE
          valueFrom:
            fieldRef:
              fieldPath: "metadata.namespace"
        # Requests equal limits for both memory and CPU.
        resources:
          requests:
            memory: 40Gi
            cpu: 2
          limits:
            memory: 40Gi
            cpu: 2
      # Failed containers are not restarted in place.
      restartPolicy: Never