# trainer.yaml (1.7 KB) - committed by typhoonzero
---
# Kubernetes batch Job that launches the trainer pods of the "vgg16job"
# PaddlePaddle job (see the image name and the paddle-job label below).
apiVersion: batch/v1
kind: Job
metadata:
  name: vgg16job-trainer
spec:
  # Run 20 pods concurrently; the Job is complete after 20 successful pods.
  # TRAINERS and PADDLE_INIT_NUM_GRADIENT_SERVERS below carry the same count
  # (presumably they must be kept in sync with these fields - confirm).
  parallelism: 20
  completions: 20
  template:
    metadata:
      labels:
        paddle-job: vgg16job
    spec:
      imagePullSecrets:
      - name: job-registry-secret
      # Pods share the node's network namespace instead of a pod-private one.
      hostNetwork: true
      containers:
      - name: trainer
        image: "registry.baidu.com/paddlepaddle/rawjob:vgg16"
        imagePullPolicy: Always
        command: ["paddle_k8s", "start_trainer", "v2"]
        # All values are quoted so they stay strings, as env vars require.
        env:
        - name: PADDLE_JOB_NAME
          value: vgg16job
        - name: TRAINERS
          value: "20"
        - name: PSERVERS
          value: "10"
        - name: TOPOLOGY
          value: ""
        # Shell command line for the training script; presumably executed by
        # the paddle_k8s entrypoint - confirm against that tool.
        - name: ENTRY
          value: "cd /workspace && MKL_NUM_THREADS=1 python /workspace/vgg16.py"
        - name: TRAINER_PACKAGE
          value: "/workspace"
        - name: PADDLE_INIT_PORT
          value: "30236"
        - name: PADDLE_INIT_NICS
          value: "xgbe0"
        - name: PADDLE_INIT_TRAINER_COUNT
          value: "1"
        - name: PADDLE_INIT_PORTS_NUM
          value: "1"
        - name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
          value: "1"
        - name: PADDLE_INIT_NUM_GRADIENT_SERVERS
          value: "20"
        - name: PADDLE_INIT_NUM_PASSES
          value: "1"
        - name: PADDLE_INIT_USE_GPU
          value: "0"
        # NOTE(review): an NVIDIA library path is set although
        # PADDLE_INIT_USE_GPU is "0" - presumably a harmless leftover; confirm.
        - name: LD_LIBRARY_PATH
          value: "/usr/local/nvidia/lib64"
        # Filled in from the pod's own metadata.namespace via a fieldRef.
        - name: NAMESPACE
          valueFrom:
            fieldRef:
              fieldPath: "metadata.namespace"
        # Requests equal limits for both memory and CPU.
        resources:
          requests:
            memory: 40Gi
            cpu: 2
          limits:
            memory: 40Gi
            cpu: 2
      # Containers are not restarted in place after they exit.
      restartPolicy: Never