# trainer.py
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

def _build_trainer():
    """Assemble the Kubernetes ``batch/v1`` Job manifest for the trainer.

    The manifest is put together bottom-up (container, then pod spec, then
    Job) so that each nesting level can be read on its own.  Every call
    creates fresh dicts and lists; no sub-section is shared with another.

    NOTE(review): "jobname", "jobname-pserver", the empty "image" and
    containerPort 1 look like placeholders that the consumer of this
    template overwrites -- confirm against the caller.
    """
    # Requests and limits carry the same values but are two distinct dicts,
    # so editing one later does not silently change the other.
    resources = {
        "requests": {
            "memory": "10Gi",
            "cpu": "4",
        },
        "limits": {
            "memory": "10Gi",
            "cpu": "4",
        },
    }

    # Privileged mode is what lets the container set rlimit.
    # TODO(wuyi): use the specific capability below instead of privileged;
    # running privileged makes all GPU devices visible in the container.
    #     "capabilities": {
    #         "add": ["SYS_RESOURCE"]
    #     }
    security_context = {"privileged": True}

    container = {
        "name": "trainer",
        "image": "",
        "imagePullPolicy": "Always",
        "securityContext": security_context,
        "ports": [{"name": "jobport-1", "containerPort": 1}],
        "env": [],
        "command": ["paddle_k8s", "start_trainer", "v2"],
        "resources": resources,
    }

    pod_spec = {
        "hostNetwork": True,
        "imagePullSecrets": [{"name": "job-registry-secret"}],
        "restartPolicy": "Never",
        "containers": [container],
    }

    pod_template = {
        "metadata": {"labels": {"paddle-job": "jobname"}},
        "spec": pod_spec,
    }

    return {
        "apiVersion": "batch/v1",
        "kind": "Job",
        "metadata": {"name": "jobname-pserver"},
        "spec": {
            "parallelism": 4,
            "completions": 4,
            "template": pod_template,
        },
    }


# Module-level Job template, built once at import time.
trainer = _build_trainer()