---
# v2_trainer.yaml
#
# Kubernetes batch Job that starts the "trainer" pods of the vgg16v2job
# benchmark. Each pod runs `paddle_k8s start_trainer v2` and is configured
# entirely through the environment variables listed below.
apiVersion: batch/v1
kind: Job
metadata:
  name: vgg16v2job-trainer
spec:
  # Run 20 pods at the same time; the Job is done after 20 successful pods.
  # NOTE(review): presumably these must stay in sync with TRAINERS and
  # PADDLE_INIT_NUM_GRADIENT_SERVERS below (both "20") - confirm.
  parallelism: 20
  completions: 20
  template:
    metadata:
      labels:
        paddle-job: vgg16v2job
    spec:
      imagePullSecrets:
        - name: job-registry-secret
      # Pods use the node's network namespace instead of a pod-private one.
      # NOTE(review): with a fixed PADDLE_INIT_PORT, two pods of this Job on
      # the same node may collide on that port - verify scheduling constraints.
      hostNetwork: true
      containers:
        - name: trainer
          image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
          imagePullPolicy: Always
          command: ["paddle_k8s", "start_trainer", "v2"]
          env:
            - name: PADDLE_JOB_NAME
              value: "vgg16v2job"
            - name: BATCH_SIZE
              value: "256"
            - name: TRAINERS
              value: "20"
            - name: PSERVERS
              value: "10"
            - name: TOPOLOGY
              value: ""
            # Shell command line handed to the container through ENTRY.
            - name: ENTRY
              value: "cd /workspace && MKL_NUM_THREADS=1 python /workspace/vgg16_v2.py"
            - name: TRAINER_PACKAGE
              value: "/workspace"
            - name: PADDLE_INIT_PORT
              value: "30236"
            - name: PADDLE_INIT_NICS
              value: "xgbe0"
            - name: PADDLE_INIT_TRAINER_COUNT
              value: "2"
            - name: PADDLE_INIT_PORTS_NUM
              value: "1"
            - name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
              value: "1"
            - name: PADDLE_INIT_NUM_GRADIENT_SERVERS
              value: "20"
            - name: PADDLE_INIT_NUM_PASSES
              value: "2"
            # NOTE(review): "0" presumably selects CPU-only training - confirm.
            - name: PADDLE_INIT_USE_GPU
              value: "0"
            - name: LD_LIBRARY_PATH
              value: "/usr/local/lib:/usr/local/nvidia/lib64"
            # Filled in by Kubernetes with the namespace the pod runs in.
            - name: NAMESPACE
              valueFrom:
                fieldRef:
                  fieldPath: "metadata.namespace"
          # Requests are set equal to limits for both memory and CPU.
          resources:
            requests:
              memory: 40Gi
              cpu: 2
            limits:
              memory: 40Gi
              cpu: 2
      # Containers are not restarted in place after they exit.
      restartPolicy: Never