diff --git a/core/engine/cluster/cloud/k8s_cpu_job.sh.template b/core/engine/cluster/cloud/k8s_cpu_job.sh.template
index c5203fcad76b28b5a48de62067b46f4ed5bf1696..2889cd1d55008f22b7e9fb854019f996a4746f8c 100644
--- a/core/engine/cluster/cloud/k8s_cpu_job.sh.template
+++ b/core/engine/cluster/cloud/k8s_cpu_job.sh.template
@@ -9,7 +9,7 @@ job_name=<$ JOB_NAME $>
 group_name="<$ GROUP_NAME $>"
 job_version="paddle-fluid-v1.7.1"
 start_cmd="<$ START_CMD $>"
-wall_time="10:00:00"
+wall_time="2000:00:00"
 
 k8s_priority=<$ K8S_PRIORITY $>
 k8s_trainers=<$ K8S_TRAINERS $>
diff --git a/core/engine/cluster/cloud/k8s_job.sh.template b/core/engine/cluster/cloud/k8s_job.sh.template
index 9886f11aebbbe547ed1fb433a35c653e2a77f6f3..8314e9efd0ec349bb00e28605386e34dfc601102 100644
--- a/core/engine/cluster/cloud/k8s_job.sh.template
+++ b/core/engine/cluster/cloud/k8s_job.sh.template
@@ -9,7 +9,7 @@ job_name=<$ JOB_NAME $>
 group_name="<$ GROUP_NAME $>"
 job_version="paddle-fluid-v1.7.1"
 start_cmd="<$ START_CMD $>"
-wall_time="10:00:00"
+wall_time="2000:00:00"
 
 k8s_priority=<$ K8S_PRIORITY $>
 k8s_trainers=<$ K8S_TRAINERS $>
diff --git a/core/engine/cluster/cloud/mpi_job.sh.template b/core/engine/cluster/cloud/mpi_job.sh.template
index 46d68d2130d591c86f4a587000498c139c1e74aa..b3a3c20a02094cca68c96f527bf29d3150996228 100644
--- a/core/engine/cluster/cloud/mpi_job.sh.template
+++ b/core/engine/cluster/cloud/mpi_job.sh.template
@@ -9,7 +9,7 @@ job_name=<$ JOB_NAME $>
 group_name=<$ GROUP_NAME $>
 job_version="paddle-fluid-v1.7.1"
 start_cmd="<$ START_CMD $>"
-wall_time="2:00:00"
+wall_time="2000:00:00"
 
 # 你的ak/sk(可在paddlecloud web页面【个人中心】处获取)
 ak=<$ AK $>
diff --git a/doc/distributed_train.md b/doc/distributed_train.md
index 0f63678220adb81d68dc181dc4d517a40823155e..e27e1f7eb1d2519eedc670d0021e09a1b0bfd17a 100644
--- a/doc/distributed_train.md
+++ b/doc/distributed_train.md
@@ -35,9 +35,9 @@
 
 分布式运行首先需要更改`config.yaml`,主要调整以下内容:
 
-- workspace: 调整为在远程点运行时的工作目录,一般设置为`"./"`即可
-- runner_class: 从单机的"train"调整为"cluster_train",单机训练->分布式训练(例外情况,k8s上单机单卡训练仍然为train)
-- fleet_mode: 选则参数服务器模式(ps),抑或GPU的all-reduce模式(collective)
+- workspace: 调整为在远程节点运行时的工作目录,一般设置为`"./"`即可
+- runner_class: 从单机的"train"调整为"cluster_train",单机训练->分布式训练(例外情况,k8s上单机单卡训练仍然为train,后续支持)
+- fleet_mode: 选择参数服务器模式(ps),或者GPU的all-reduce模式(collective)
 - distribute_strategy: 可选项,选择分布式训练的策略,目前只在参数服务器模式下生效,可选项:`sync、asycn、half_async、geo`
 
 配置选项具体参数,可以参考[yaml配置说明](./yaml.md)
@@ -306,7 +306,7 @@ dataset:
 - name: dataloader_train
   batch_size: 2
   type: DataLoader
-  data_path: "{workspace}/train_data"
+  data_path: "{workspace}/afs/挂载数据文件夹的路径"
   sparse_slots: "click 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26"
   dense_slots: "dense_var:13"
 
@@ -390,7 +390,7 @@ dataset:
 - name: dataloader_train
   batch_size: 2
   type: DataLoader
-  data_path: "{workspace}/train_data"
+  data_path: "{workspace}/afs/挂载数据文件夹的路径"
   sparse_slots: "click 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26"
   dense_slots: "dense_var:13"
 
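
For orientation, the documentation changes above amount to a `config.yaml` roughly like the following. This is a minimal sketch rather than a file from the repo: the runner name `ps_cluster` is made up, the key names (`class`, `fleet_mode`, `distribute_strategy`, `data_path`) follow the snippets shown in `doc/distributed_train.md`, and the afs path is a placeholder for wherever the data folder is mounted.

```yaml
workspace: "./"                      # working directory on the remote node

runner:
- name: ps_cluster                   # hypothetical runner name for this sketch
  class: cluster_train               # single-machine "train" -> distributed "cluster_train"
  fleet_mode: ps                     # parameter-server mode; "collective" for GPU all-reduce
  distribute_strategy: async         # only effective in ps mode: sync / async / half_async / geo
  epochs: 2

dataset:
- name: dataloader_train
  batch_size: 2
  type: DataLoader
  data_path: "{workspace}/afs/<mounted-data-folder>"   # placeholder: the afs mount point of the data
  sparse_slots: "click 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26"
  dense_slots: "dense_var:13"
```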