# fluid_pserver.yaml (committed by typhoonzero)
---
# ReplicaSet that runs the parameter-server (TRAINING_ROLE=PSERVER) pods of
# the "vgg16job" PaddlePaddle Fluid benchmark.
#
# apps/v1 is used instead of extensions/v1beta1, which is no longer served
# for ReplicaSet as of Kubernetes 1.16. apps/v1 requires an explicit
# spec.selector; it is set to the pod-template label, which is what the old
# API version defaulted to.
apiVersion: apps/v1
kind: ReplicaSet
metadata:
  name: vgg16job-pserver
spec:
  # Number of pserver pods. The PSERVERS env var below carries the same
  # count; presumably the two must be kept in sync - confirm with the job
  # launcher.
  replicas: 10
  selector:
    matchLabels:
      paddle-job-pserver: vgg16job
  template:
    metadata:
      labels:
        paddle-job-pserver: vgg16job
    spec:
      # Pods use the node's network namespace, so the port declared below is
      # opened directly on the node.
      hostNetwork: true
      imagePullSecrets:
      - name: job-registry-secret
      containers:
      - name: pserver
        image: "registry.baidu.com/paddlepaddle/fluid_benchmark:vgg16"
        imagePullPolicy: Always
        ports:
        # Same port number as PADDLE_INIT_PORT below.
        - name: jobport-30236
          containerPort: 30236
        env:
        - name: PADDLE_JOB_NAME
          value: vgg16job
        - name: MKL_NUM_THREADS
          value: "1"
        - name: TRAINING_ROLE
          value: "PSERVER"
        # TRAINERS has the same value as PADDLE_INIT_NUM_GRADIENT_SERVERS.
        - name: TRAINERS
          value: "20"
        # Same value as spec.replicas above.
        - name: PSERVERS
          value: "10"
        - name: TOPOLOGY
          value: ""
        # Command line for the benchmark script; presumably executed by the
        # container command below - confirm against paddle_k8s.
        - name: ENTRY
          value: "MKL_NUM_THREADS=1 python /workspace/vgg16_fluid.py --local 0"
        - name: TRAINER_PACKAGE
          value: "/workspace"
        - name: PADDLE_INIT_PORT
          value: "30236"
        - name: PADDLE_INIT_NICS
          value: "xgbe0"
        - name: PADDLE_INIT_TRAINER_COUNT
          value: "1"
        - name: PADDLE_INIT_PORTS_NUM
          value: "1"
        - name: PADDLE_INIT_PORTS_NUM_FOR_SPARSE
          value: "1"
        - name: PADDLE_INIT_NUM_GRADIENT_SERVERS
          value: "20"
        - name: PADDLE_INIT_NUM_PASSES
          value: "1"
        - name: PADDLE_INIT_USE_GPU
          value: "0"
        - name: LD_LIBRARY_PATH
          value: "/usr/local/lib:/usr/local/nvidia/lib64"
        # Downward API: expose the pod's own namespace and IP to the process.
        - name: NAMESPACE
          valueFrom:
            fieldRef:
              fieldPath: "metadata.namespace"
        - name: POD_IP
          valueFrom:
            fieldRef:
              fieldPath: "status.podIP"
        command: ["paddle_k8s", "start_fluid"]
        # Requests and limits are identical for both CPU and memory.
        resources:
          requests:
            memory: 10Gi
            cpu: 4
          limits:
            memory: 10Gi
            cpu: 4