---
# Kubernetes batch Job that starts the trainer ("worker") pods for the
# vgg16job-tf TensorFlow job. Each pod runs `tf_k8s start_tf` with the
# settings passed through the environment variables below.
apiVersion: batch/v1
kind: Job
metadata:
  name: vgg16job-tf-trainer
spec:
  # Keep parallelism/completions equal to TRAINERS_NUM in the env section.
  parallelism: 20
  completions: 20
  template:
    metadata:
      labels:
        tf-job-trainer: vgg16job-tf
    spec:
      imagePullSecrets:
        - name: job-registry-secret
      # Pods use the node's network namespace instead of a pod-private one.
      hostNetwork: true
      containers:
        - name: trainer
          image: "registry.baidu.com/paddlepaddle/fluid_benchmark_tf:vgg16"
          imagePullPolicy: Always
          command: ["tf_k8s", "start_tf"]
          ports:
            - name: jobport-30236
              containerPort: 30236
          env:
            # NOTE(review): PORT is "32036" while the declared containerPort
            # (and its name) is 30236 -- the digits look transposed. Left
            # unchanged because the value the entrypoint and any companion
            # manifests expect is not visible here; confirm and align.
            - name: PORT
              value: "32036"
            - name: JOB_NAME
              value: vgg16job-tf
            - name: TF_JOB_NAME
              value: "worker"
            - name: ENTRY
              value: "python vgg16_tf.py"
            - name: PSERVERS_NUM
              value: "10"
            - name: BATCH_SIZE
              value: "128"
            # Must match spec.parallelism / spec.completions above.
            - name: TRAINERS_NUM
              value: "20"
            - name: TRAINER_PACKAGE
              value: "/workspace"
            - name: NUM_PASSES
              value: "1"
            # Injected from the pod's own metadata/status via the Downward API.
            - name: NAMESPACE
              valueFrom:
                fieldRef:
                  fieldPath: "metadata.namespace"
            - name: POD_IP
              valueFrom:
                fieldRef:
                  fieldPath: "status.podIP"
          # Requests are set equal to limits for both memory and CPU.
          resources:
            requests:
              memory: 40Gi
              cpu: 2
            limits:
              memory: 40Gi
              cpu: 2
      # Failed trainer containers are not restarted in place.
      restartPolicy: Never