# WIP example for distributed training
apiVersion: "kubeflow.org/v1"
kind: "MSJob"
metadata:
  name: "msjob-mnist"
spec:
  backend: "tcp"
  masterPort: "23456"
  replicaSpecs:
    - replicas: 1
      replicaType: MASTER
      template:
        spec:
          containers:
            - image: mindspore/mindspore-cpu:0.1.0-alpha
              imagePullPolicy: IfNotPresent
              name: msjob-mnist
              command: ["/bin/bash", "-c", "python /tmp/test/MNIST/lenet.py"]
              volumeMounts:
                - name: training-result
                  mountPath: /tmp/result
                - name: ms-mnist-local-file
                  mountPath: /tmp/test
          restartPolicy: OnFailure
          volumes:
            - name: training-result
              emptyDir: {}
            # Backs the ms-mnist-local-file mount above. The original example
            # never declared this volume for the MASTER replica; it is assumed
            # here to use the same hostPath as the WORKER replica below.
            - name: ms-mnist-local-file
              hostPath:
                path: /root/gopath/src/gitee.com/mindspore/ms-operator/examples
            # Declared but not mounted by any container in this WIP example.
            - name: entrypoint
              configMap:
                name: dist-train
                defaultMode: 0755
      restartPolicy: OnFailure
    - replicas: 3
      replicaType: WORKER
      template:
        spec:
          containers:
            - image: mindspore/mindspore-cpu:0.1.0-alpha
              imagePullPolicy: IfNotPresent
              name: msjob-mnist
              command: ["/bin/bash", "-c", "python /tmp/test/MNIST/lenet.py"]
              volumeMounts:
                - name: training-result
                  mountPath: /tmp/result
                - name: ms-mnist-local-file
                  mountPath: /tmp/test
          restartPolicy: OnFailure
          volumes:
            - name: training-result
              emptyDir: {}
            # The original example placed this hostPath directly under the
            # container's volumeMounts, which is not valid pod-spec YAML; it
            # belongs here as a volume, mounted at /tmp/test so the command's
            # /tmp/test/MNIST/lenet.py path resolves.
            - name: ms-mnist-local-file
              hostPath:
                path: /root/gopath/src/gitee.com/mindspore/ms-operator/examples
            # Declared but not mounted by any container in this WIP example.
            - name: entrypoint
              configMap:
                name: dist-train
                defaultMode: 0755
      restartPolicy: OnFailure
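
# Usage sketch: submitting this manifest, assuming the ms-operator controller
# is already running in the cluster, the MSJob CRD is registered, and the
# dist-train ConfigMap exists. The file name below is hypothetical.
#
#   kubectl apply -f msjob-mnist.yaml       # submit the MSJob
#   kubectl get msjob msjob-mnist           # check job status via the CRD
#   kubectl get pods | grep msjob-mnist     # inspect the master/worker pods
#
# Pod names and labels depend on the operator's naming scheme, so the grep
# above is a loose filter rather than an exact selector.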