add cluster support (#136)

* add cluster support

add cluster support (#136)
* add cluster support
15c57177 · Chengmo · GitHub · b8e17866 · 15c57177 · 15c57177
15 changed file
--- a/core/engine/cluster/cloud/before_hook_cpu.sh.template
+++ b/core/engine/cluster/cloud/before_hook_cpu.sh.template
+echo "Run before_hook.sh ..."
+
+wget https://paddlerec.bj.bcebos.com/whl/PaddleRec.tar.gz
+
+tar -xf PaddleRec.tar.gz
+
+cd PaddleRec
+
+python setup.py install
+
+pip uninstall -y paddlepaddle
+
+pip install paddlepaddle-gpu==<$ PADDLEPADDLE_VERSION $> --index-url=http://pip.baidu.com/pypi/simple --trusted-host pip.baidu.com
+
+echo "End before_hook.sh ..."
--- a/core/engine/cluster/cloud/before_hook_gpu.sh.template
+++ b/core/engine/cluster/cloud/before_hook_gpu.sh.template
+echo "Run before_hook.sh ..."
+
+wget https://paddlerec.bj.bcebos.com/whl/PaddleRec.tar.gz
+
+tar -xf PaddleRec.tar.gz
+
+cd PaddleRec
+
+python setup.py install
+
+pip uninstall -y paddlepaddle
+
+pip install paddlepaddle-gpu==<$ PADDLEPADDLE_VERSION $>.post107 --index-url=http://pip.baidu.com/pypi/simple --trusted-host pip.baidu.com
+
+echo "End before_hook.sh ..."
--- a/core/engine/cluster/cloud/cluster.sh
+++ b/core/engine/cluster/cloud/cluster.sh
@@ -16,23 +16,13 @@

 ###################################################
 # Usage: submit.sh
-# Description: run mpi submit client implement
+# Description: run paddlecloud submit client implement
 ###################################################

 # ---------------------------------------------------------------------------- #
 #                            variable define                                   #
 # ---------------------------------------------------------------------------- #

-#-----------------------------------------------------------------------------------------------------------------
-#fun : package
-#param : N/A
-#return : 0 -- success; not 0 -- failure
-#-----------------------------------------------------------------------------------------------------------------
-function package_hook() {
-  g_run_stage="package"
-  package
-}
-
 #-----------------------------------------------------------------------------------------------------------------
 #fun : before hook submit to cluster
 #param : N/A
@@ -40,17 +30,106 @@ function package_hook() {
 #-----------------------------------------------------------------------------------------------------------------
 function _before_submit() {
  echo "before_submit"
-  before_submit_hook
+  
+  if [ ${DISTRIBUTE_MODE} == "PS_CPU_MPI" ]; then
+    _gen_cpu_before_hook
+    _gen_mpi_config
+    _gen_mpi_job
+    _gen_end_hook
+  elif [ ${DISTRIBUTE_MODE} == "COLLECTIVE_GPU_K8S" ]; then
+    _gen_gpu_before_hook
+    _gen_k8s_config
+    _gen_k8s_job
+    _gen_end_hook
+  fi
+  
+}
+
+function _gen_mpi_config() {
+  echo "gen mpi_config.ini"
+  sed -e "s#<$ FS_NAME $>#$FS_NAME#g" \
+      -e "s#<$ FS_UGI $>#$FS_UGI#g" \
+      -e "s#<$ TRAIN_DATA_PATH $>#$TRAIN_DATA_PATH#g" \
+      -e "s#<$ TEST_DATA_PATH $>#$TEST_DATA_PATH#g" \
+      -e "s#<$ OUTPUT_PATH $>#$OUTPUT_PATH#g" \
+      -e "s#<$ THIRDPARTY_PATH $>#$THIRDPARTY_PATH#g" \
+      -e "s#<$ CPU_NUM $>#$max_thread_num#g" \
+      -e "s#<$ FLAGS_communicator_is_sgd_optimizer $>#$FLAGS_communicator_is_sgd_optimizer#g" \
+      -e "s#<$ FLAGS_communicator_send_queue_size $>#$FLAGS_communicator_send_queue_size#g" \
+      -e "s#<$ FLAGS_communicator_thread_pool_size $>#$FLAGS_communicator_thread_pool_size#g" \
+      -e "s#<$ FLAGS_communicator_max_merge_var_num $>#$FLAGS_communicator_max_merge_var_num#g" \
+      -e "s#<$ FLAGS_communicator_max_send_grad_num_before_recv $>#$FLAGS_communicator_max_send_grad_num_before_recv#g" \
+      -e "s#<$ FLAGS_communicator_fake_rpc $>#$FLAGS_communicator_fake_rpc#g" \
+      -e "s#<$ FLAGS_rpc_retry_times $>#$FLAGS_rpc_retry_times#g" \
+      ${abs_dir}/cloud/mpi_config.ini.template >${PWD}/config.ini
+}
+
+function _gen_k8s_config() {
+  echo "gen k8s_config.ini"
+  sed -e "s#<$ FS_NAME $>#$FS_NAME#g" \
+      -e "s#<$ FS_UGI $>#$FS_UGI#g" \
+      -e "s#<$ AFS_REMOTE_MOUNT_POINT $>#$AFS_REMOTE_MOUNT_POINT#g" \
+      -e "s#<$ OUTPUT_PATH $>#$OUTPUT_PATH#g" \
+      -e "s#<$ CPU_NUM $>#$max_thread_num#g" \
+      -e "s#<$ FLAGS_communicator_is_sgd_optimizer $>#$FLAGS_communicator_is_sgd_optimizer#g" \
+      -e "s#<$ FLAGS_communicator_send_queue_size $>#$FLAGS_communicator_send_queue_size#g" \
+      -e "s#<$ FLAGS_communicator_thread_pool_size $>#$FLAGS_communicator_thread_pool_size#g" \
+      -e "s#<$ FLAGS_communicator_max_merge_var_num $>#$FLAGS_communicator_max_merge_var_num#g" \
+      -e "s#<$ FLAGS_communicator_max_send_grad_num_before_recv $>#$FLAGS_communicator_max_send_grad_num_before_recv#g" \
+      -e "s#<$ FLAGS_communicator_fake_rpc $>#$FLAGS_communicator_fake_rpc#g" \
+      -e "s#<$ FLAGS_rpc_retry_times $>#$FLAGS_rpc_retry_times#g" \
+      ${abs_dir}/cloud/k8s_config.ini.template >${PWD}/config.ini
+}
+
+function _gen_cpu_before_hook() {
+  echo "gen cpu before_hook.sh"
+  sed -e "s#<$ PADDLEPADDLE_VERSION $>#$PADDLE_VERSION#g" \
+    ${abs_dir}/cloud/before_hook_cpu.sh.template >${PWD}/before_hook.sh
 }

+function _gen_gpu_before_hook() {
+  echo "gen gpu before_hook.sh"
+  sed -e "s#<$ PADDLEPADDLE_VERSION $>#$PADDLE_VERSION#g" \
+    ${abs_dir}/cloud/before_hook_gpu.sh.template >${PWD}/before_hook.sh
+}
+
+function _gen_end_hook() {
+  echo "gen end_hook.sh"
+  cp ${abs_dir}/cloud/end_hook.sh.template ${PWD}/end_hook.sh
+}
+
+function _gen_mpi_job() {
+  echo "gen mpi_job.sh"
+  sed -e "s#<$ GROUP_NAME $>#$GROUP_NAME#g" \
+      -e "s#<$ AK $>#$AK#g" \
+      -e "s#<$ SK $>#$SK#g" \
+      -e "s#<$ MPI_PRIORITY $>#$PRIORITY#g" \
+      -e "s#<$ MPI_NODES $>#$MPI_NODES#g" \
+      -e "s#<$ START_CMD $>#$START_CMD#g" \
+      ${abs_dir}/cloud/mpi_job.sh.template >${PWD}/job.sh
+}
+
+function _gen_k8s_job() {
+  echo "gen k8s_job.sh"
+  sed -e "s#<$ GROUP_NAME $>#$GROUP_NAME#g" \
+      -e "s#<$ AK $>#$AK#g" \
+      -e "s#<$ SK $>#$SK#g" \
+      -e "s#<$ K8S_PRIORITY $>#$PRIORITY#g" \
+      -e "s#<$ K8S_TRAINERS $>#$K8S_TRAINERS#g" \
+      -e "s#<$ K8S_GPU_CARD $>#$K8S_GPU_CARD#g" \
+      -e "s#<$ START_CMD $>#$START_CMD#g" \
+      ${abs_dir}/cloud/k8s_job.sh.template >${PWD}/job.sh
+}
+
+
+
 #-----------------------------------------------------------------------------------------------------------------
 #fun : after hook submit to cluster
 #param : N/A
 #return : 0 -- success; not 0 -- failure
 #-----------------------------------------------------------------------------------------------------------------
 function _after_submit() {
-  echo "after_submit"
-  after_submit_hook
+  echo "end submit"
 }

 #-----------------------------------------------------------------------------------------------------------------
@@ -60,23 +139,18 @@ function _after_submit() {
 #-----------------------------------------------------------------------------------------------------------------
 function _submit() {
  g_run_stage="submit"
+  sh job.sh
+}

-  cd ${engine_temp_path}
-
-  paddlecloud job --ak ${engine_submit_ak} --sk ${engine_submit_sk} train --cluster-name ${engine_submit_cluster} \
-    --job-version ${engine_submit_version} \
-    --mpi-priority ${engine_submit_priority} \
-    --mpi-wall-time 300:59:00 \
-    --mpi-nodes ${engine_submit_nodes} --is-standalone 0 \
-    --mpi-memory 110Gi \
-    --job-name ${engine_submit_jobname} \
-    --start-cmd "${g_run_cmd}" \
-    --group-name ${engine_submit_group} \
-    --job-conf ${engine_submit_config} \
-    --files ${g_submitfiles} \
-    --json
-
-  cd -
+function package_hook() {
+  cur_time=`date  +"%Y%m%d%H%M"`
+  new_job_name="${JOB_NAME}_${cur_time}"
+  export JOB_NAME=${new_job_name}
+  export job_file_path="${PWD}/${new_job_name}"
+  mkdir ${job_file_path}
+  cp $FILES ${job_file_path}/
+  cd ${job_file_path}
+  echo "The task submission folder is generated at ${job_file_path}"
 }

 function submit_hook() {
@@ -86,8 +160,6 @@ function submit_hook() {
 }

 function main() {
-  source ${engine_submit_scrpit}
-
  package_hook
  submit_hook
 }

--- a/core/engine/cluster/cloud/end_hook.sh.template
+++ b/core/engine/cluster/cloud/end_hook.sh.template
+echo "Run before_hook.sh ..."
\ No newline at end of file
--- a/core/engine/cluster/cloud/k8s_config.ini.template
+++ b/core/engine/cluster/cloud/k8s_config.ini.template
+# 必须涵盖的参数
+fs_name=<$ FS_NAME $>
+fs_ugi=<$ FS_UGI $>
+
+# 模型输出目录
+output_path=<$ OUTPUT_PATH $>
+# ===================
+# 以下是新增参数
+# ===================
+# 挂载 afs 的开关
+mount_afs="true"
+
+# afs 路径的远端挂载点
+AFS_REMOTE_MOUNT_POINT=<$ AFS_REMOTE_MOUNT_POINT $>
+
+# 作业运行环境的本地挂载点，/root/paddlejob/workspace/env_run/是一个固定路径，是平台运行时workspace的路径
+afs_local_mount_point="/root/paddlejob/workspace/env_run/afs/"
+# 可以访问运行时默认文件夹下的 ./afs/ 目录拿到挂载目录的文件
+# 新k8s afs挂载帮助文档: http://wiki.baidu.com/pages/viewpage.action?pageId=906443193
+
+PADDLE_PADDLEREC_ROLE=WORKER
+CPU_NUM=<$ CPU_NUM $>
+GLOG_v=0
+
+FLAGS_communicator_is_sgd_optimizer=<$ FLAGS_communicator_is_sgd_optimizer $>
+FLAGS_communicator_send_queue_size=<$ FLAGS_communicator_send_queue_size $>
+FLAGS_communicator_thread_pool_size=<$ FLAGS_communicator_thread_pool_size $>
+FLAGS_communicator_max_merge_var_num=<$ FLAGS_communicator_max_merge_var_num $>
+FLAGS_communicator_max_send_grad_num_before_recv=<$ FLAGS_communicator_max_send_grad_num_before_recv $>
+FLAGS_communicator_fake_rpc=<$ FLAGS_communicator_fake_rpc $>
+FLAGS_rpc_retry_times=<$ FLAGS_rpc_retry_times $>
\ No newline at end of file
--- a/core/engine/cluster/cloud/k8s_job.sh.template
+++ b/core/engine/cluster/cloud/k8s_job.sh.template
+#!/bin/bash
+###############################################################
+##                  注意-- 注意--注意                          ##
+##                 K8S NCCL2多机作业作业示例                    ##
+###############################################################
+job_name=${JOB_NAME}
+
+# 作业参数
+group_name="<$ GROUP_NAME $>"               
+job_version="paddle-fluid-v1.7.1"
+start_cmd="<$ START_CMD $>"
+wall_time="10:00:00"
+
+k8s_priority=<$ K8S_PRIORITY $>
+k8s_trainers=<$ K8S_TRAINERS $>
+k8s_gpu_cards=<$ K8S_GPU_CARD $>
+
+# 你的ak/sk（可在paddlecloud web页面【个人中心】处获取）
+ak=<$ AK $>
+sk=<$ SK $>
+
+paddlecloud job --ak ${ak} --sk ${sk} \
+        train --job-name ${job_name} \
+        --group-name ${group_name} \
+        --job-conf config.ini \
+        --start-cmd "${start_cmd}" \
+        --files ./*  \
+        --job-version ${job_version}  \
+        --k8s-trainers ${k8s_trainers} \
+        --k8s-gpu-cards ${k8s_gpu_cards} \
+        --k8s-priority ${k8s_priority} \
+        --wall-time ${wall_time} \
+        --is-standalone 0 \
+        --distribute-job-type "NCCL2" \
+        --json
\ No newline at end of file
--- a/core/engine/cluster/cloud/mpi_config.ini.template
+++ b/core/engine/cluster/cloud/mpi_config.ini.template
+#type of storage cluster
+storage_type="hdfs"
+
+#attention: files for training should be put on hdfs
+force_reuse_output_path="True"
+
+# 可以替换成自己的hdfs集群
+fs_name=<$ FS_NAME $>
+fs_ugi=<$ FS_UGI $>
+
+FLAGS_rpc_deadline=300000
+
+##train data path on hdfs
+train_data_path=<$ TRAIN_DATA_PATH $>
+test_data_path=<$ TEST_DATA_PATH $>
+output_path=<$ OUTPUT_PATH $>
+thirdparty_path=<$ THIRDPARTY_PATH $>
+
+PADDLE_PADDLEREC_ROLE=WORKER
+CPU_NUM=<$ CPU_NUM $>
+GLOG_v=0
+
+FLAGS_communicator_is_sgd_optimizer=<$ FLAGS_communicator_is_sgd_optimizer $>
+FLAGS_communicator_send_queue_size=<$ FLAGS_communicator_send_queue_size $>
+FLAGS_communicator_thread_pool_size=<$ FLAGS_communicator_thread_pool_size $>
+FLAGS_communicator_max_merge_var_num=<$ FLAGS_communicator_max_merge_var_num $>
+FLAGS_communicator_max_send_grad_num_before_recv=<$ FLAGS_communicator_max_send_grad_num_before_recv $>
+FLAGS_communicator_fake_rpc=<$ FLAGS_communicator_fake_rpc $>
+FLAGS_rpc_retry_times=<$ FLAGS_rpc_retry_times $>
--- a/core/engine/cluster/cloud/mpi_job.sh.template
+++ b/core/engine/cluster/cloud/mpi_job.sh.template
+#!/bin/bash
+###############################################################
+##                  注意--注意--注意                         ##
+##                  MPI 类型作业演示                         ##
+###############################################################
+job_name=${JOB_NAME}
+
+# 作业参数
+group_name=<$ GROUP_NAME $>
+job_version="paddle-fluid-v1.7.1"
+start_cmd="<$ START_CMD $>"
+wall_time="2:00:00"
+
+# 你的ak/sk（可在paddlecloud web页面【个人中心】处获取）
+ak=<$ AK $>
+sk=<$ SK $>
+
+paddlecloud job --ak ${ak} --sk ${sk} \
+    train \
+    --job-name ${job_name} \
+    --mpi-priority <$ MPI_PRIORITY $> \
+    --group-name ${group_name} \
+    --mpi-wall-time ${wall_time} \
+    --mpi-nodes <$ MPI_NODES $> \
+    --is-standalone 0 \
+    --permission group \
+    --job-version ${job_version} \
+    --job-conf config.ini \
+    --start-cmd "${start_cmd}" \
+    --files ./*  \
+    --json
--- a/core/engine/cluster/cluster.py
+++ b/core/engine/cluster/cluster.py
@@ -18,6 +18,7 @@ from __future__ import unicode_literals
 import copy
 import os
 import subprocess
+import warnings

 from paddlerec.core.engine.engine import Engine
 from paddlerec.core.factory import TrainerFactory
@@ -26,24 +27,35 @@ from paddlerec.core.utils import envs

 class ClusterEngine(Engine):
    def __init_impl__(self):
+        self.role = envs.get_runtime_environ("engine_role")
+        if self.role == "WORKER":
+            return
+
        abs_dir = os.path.dirname(os.path.abspath(__file__))
+        os.environ["abs_dir"] = str(abs_dir)

-        backend = envs.get_runtime_environ("engine_backend")
-        if not backend:
-            backend = ""
-        backend = backend.upper()
-        if backend == "PADDLECLOUD":
+        self.backend = envs.get_runtime_environ("backend")
+        if not self.backend:
+            self.backend = ""
+        self.backend = self.backend.upper()
+        if self.backend == "PADDLECLOUD":
            self.submit_script = os.path.join(abs_dir, "cloud/cluster.sh")
-        elif backend == "KUBERNETES":
+        elif self.backend == "KUBERNETES":
            self.submit_script = os.path.join(abs_dir, "k8s/cluster.sh")
        else:
-            raise ValueError("{} can not be supported now".format(backend))
+            raise ValueError("{} can not be supported now".format(
+                self.backend))

    def start_worker_procs(self):
        trainer = TrainerFactory.create(self.trainer)
        trainer.run()

    def start_master_procs(self):
+        if self.backend == "PADDLECLOUD":
+            self.paddlecloud_env_check()
+        elif self.backend == "KUBERNETES":
+            self.kubernetes_env_check()
+
        default_env = os.environ.copy()
        current_env = copy.copy(default_env)
        current_env.pop("http_proxy", None)
@@ -55,21 +67,229 @@ class ClusterEngine(Engine):

    @staticmethod
    def workspace_replace():
-        workspace = envs.get_runtime_environ("engine_workspace")
+        workspace = envs.get_runtime_environ("workspace")

        for k, v in os.environ.items():
            v = v.replace("{workspace}", workspace)
            os.environ[k] = str(v)

    def run(self):
-        role = envs.get_runtime_environ("engine_role")
-
-        if role == "MASTER":
+        if self.role == "MASTER":
            self.start_master_procs()

-        elif role == "WORKER":
+        elif self.role == "WORKER":
            self.start_worker_procs()

        else:
            raise ValueError("role {} error, must in MASTER/WORKER".format(
-                role))
+                self.role))
+
+    def paddlecloud_env_check(self):
+        # get fleet mode
+        fleet_mode = envs.get_runtime_environ("fleet_mode")
+        # get device
+        device = envs.get_runtime_environ("device")
+        # get cluster type
+        cluster_type = envs.get_runtime_environ("cluster_type")
+
+        cluster_env_check_tool = None
+        if cluster_type.upper() == "MPI":
+            if device == "CPU" and fleet_mode == "PS":
+                cluster_env_check_tool = PaddleCloudMpiEnv()
+            else:
+                raise ValueError(
+                    "Paddlecloud with Mpi don't support GPU training, check your config"
+                )
+        elif cluster_type.upper() == "K8S":
+            if fleet_mode == "PS":
+                if device == "CPU":
+                    raise ValueError(
+                        "PS-CPU on paddlecloud is not supported at this time, comming soon"
+                    )
+                elif device == "GPU":
+                    raise ValueError(
+                        "PS-GPU on paddlecloud is not supported at this time, comming soon"
+                    )
+            if fleet_mode == "COLLECTIVE":
+                if device == "GPU":
+                    cluster_env_check_tool = CloudCollectiveEnv()
+                elif device == "CPU":
+                    raise ValueError(
+                        "Unexpected config -> device: CPU with fleet_mode: Collective, check your config"
+                    )
+        else:
+            raise ValueError("cluster_type {} error, must in MPI/K8S".format(
+                cluster_type))
+
+        cluster_env_check_tool.env_check()
+        cluster_env_check_tool.env_set()
+
+    def kubernetes_env_check(self):
+        pass
+
+
+class ClusterEnvBase(object):
+    def __init__(self):
+        # get backend env
+        backend_yaml = envs.get_runtime_environ("backend_yaml")
+        _env = envs.load_yaml(backend_yaml)
+        self.backend_env = envs.flatten_environs(_env, ".")
+        self.cluster_env = {}
+
+    def env_check(self):
+        # check common env
+        # fs_name & fs_ugi
+        self.cluster_env["FS_NAME"] = self.backend_env.get("config.fs_name",
+                                                           "")
+        self.cluster_env["FS_UGI"] = self.backend_env.get("config.fs_ugi", "")
+        if self.cluster_env["FS_NAME"] == "" or self.cluster_env[
+                "FS_UGI"] == "":
+            raise ValueError(
+                "No -- FS_UGI or FS_NAME -- found in your backend.yaml, please check."
+            )
+
+        # output_path
+        self.cluster_env["OUTPUT_PATH"] = self.backend_env.get(
+            "config.output_path", "")
+        if self.cluster_env["OUTPUT_PATH"] == "":
+            warnings.warn(
+                "Job output_path not set! Please check your backend yaml.",
+                category=UserWarning,
+                stacklevel=2)
+
+        # paddle_version
+        self.cluster_env["PADDLE_VERSION"] = self.backend_env.get(
+            "config.paddle_version", "1.7.2")
+
+        # communicator
+        self.cluster_env[
+            "FLAGS_communicator_is_sgd_optimizer"] = self.backend_env.get(
+                "config.communicator.FLAGS_communicator_is_sgd_optimizer", 0)
+        self.cluster_env[
+            "FLAGS_communicator_send_queue_size"] = self.backend_env.get(
+                "config.communicator.FLAGS_communicator_send_queue_size", 5)
+        self.cluster_env[
+            "FLAGS_communicator_thread_pool_size"] = self.backend_env.get(
+                "config.communicator.FLAGS_communicator_thread_pool_size", 32)
+        self.cluster_env[
+            "FLAGS_communicator_max_merge_var_num"] = self.backend_env.get(
+                "config.communicator.FLAGS_communicator_max_merge_var_num", 5)
+        self.cluster_env[
+            "FLAGS_communicator_max_send_grad_num_before_recv"] = self.backend_env.get(
+                "config.communicator.FLAGS_communicator_max_send_grad_num_before_recv",
+                5)
+        self.cluster_env["FLAGS_communicator_fake_rpc"] = self.backend_env.get(
+            "config.communicator.FLAGS_communicator_fake_rpc", 0)
+        self.cluster_env["FLAGS_rpc_retry_times"] = self.backend_env.get(
+            "config.communicator.FLAGS_rpc_retry_times", 3)
+
+        # ak & sk
+        self.cluster_env["AK"] = self.backend_env.get("submit.ak", "")
+        self.cluster_env["SK"] = self.backend_env.get("submit.sk", "")
+        if self.cluster_env["AK"] == "" or self.cluster_env["SK"] == "":
+            raise ValueError(
+                "No -- AK or SK -- found in your backend.yaml, please check.")
+
+        # priority
+        self.cluster_env["PRIORITY"] = self.backend_env.get("submit.priority",
+                                                            "high")
+
+        # job name
+        self.cluster_env["JOB_NAME"] = self.backend_env.get(
+            "submit.job_name", "PaddleRecClusterJob")
+
+        # group
+        self.cluster_env["GROUP_NAME"] = self.backend_env.get("submit.group",
+                                                              "paddle")
+
+        # start_cmd
+        self.cluster_env["START_CMD"] = self.backend_env.get(
+            "submit.start_cmd", "python -m paddlerec.run -m config.yaml")
+
+        # files
+        self.cluster_env["FILES"] = self.backend_env.get("submit.files", "")
+        if self.cluster_env["FILES"] == "":
+            raise ValueError(
+                "No -- files -- found in your backend.yaml, please check.")
+
+    def env_set(self):
+        envs.set_runtime_environs(self.cluster_env)
+        flattens = envs.flatten_environs(self.cluster_env)
+        print(envs.pretty_print_envs(flattens, ("Cluster Envs", "Value")))
+
+
+class PaddleCloudMpiEnv(ClusterEnvBase):
+    def __init__(self):
+        super(PaddleCloudMpiEnv, self).__init__()
+
+    def env_check(self):
+        super(PaddleCloudMpiEnv, self).env_check()
+
+        # check mpi env
+
+        self.cluster_env["DISTRIBUTE_MODE"] = "PS_CPU_MPI"
+
+        # train_data_path
+        self.cluster_env["TRAIN_DATA_PATH"] = self.backend_env.get(
+            "config.train_data_path", "")
+        if self.cluster_env["TRAIN_DATA_PATH"] == "":
+            raise ValueError(
+                "No -- TRAIN_DATA_PATH -- found in your backend.yaml, please check."
+            )
+        # test_data_path
+        self.cluster_env["TEST_DATA_PATH"] = self.backend_env.get(
+            "config.test_data_path", "")
+        if self.cluster_env["TEST_DATA_PATH"] == "":
+            warnings.warn(
+                "Job test_data_path not set! Please check your backend yaml.",
+                category=UserWarning,
+                stacklevel=2)
+
+        # thirdparty_path
+        self.cluster_env["THIRDPARTY_PATH"] = self.backend_env.get(
+            "config.thirdparty_path", "")
+        if self.cluster_env["THIRDPARTY_PATH"] == "":
+            warnings.warn(
+                "Job thirdparty_path not set! Please check your backend yaml.",
+                category=UserWarning,
+                stacklevel=2)
+
+        # nodes
+        self.cluster_env["MPI_NODES"] = self.backend_env.get("submit.nodes", 1)
+
+
+class PaddleCloudK8sEnv(ClusterEnvBase):
+    def __init__(self):
+        super(PaddleCloudK8sEnv, self).__init__()
+
+    def env_check(self):
+        super(PaddleCloudK8sEnv, self).env_check()
+
+        # check afs_remote_mount_point
+        self.cluster_env["AFS_REMOTE_MOUNT_POINT"] = self.backend_env.get(
+            "config.afs_remote_mount_point", "")
+        if self.cluster_env["AFS_REMOTE_MOUNT_POINT"] == "":
+            warnings.warn(
+                "Job afs_remote_mount_point not set! Please check your backend yaml.",
+                category=UserWarning,
+                stacklevel=2)
+        warnings.warn(
+            "The remote mount point will be mounted to the ./afs/",
+            category=UserWarning,
+            stacklevel=2)
+
+
+class CloudCollectiveEnv(PaddleCloudK8sEnv):
+    def __init__(self):
+        super(CloudCollectiveEnv, self).__init__()
+
+    def env_check(self):
+        super(CloudCollectiveEnv, self).env_check()
+
+        self.cluster_env["DISTRIBUTE_MODE"] = "COLLECTIVE_GPU_K8S"
+        self.cluster_env["K8S_TRAINERS"] = self.backend_env.get(
+            "submit.k8s_trainers", 1)
+        self.cluster_env["K8S_GPU_CARD"] = self.backend_env.get(
+            "submit.k8s_gpu_card", 1)
+        self.cluster_env["K8S_CPU_CORES"] = self.backend_env.get(
+            "submit.k8s_cpu_cores", 1)
--- a/core/trainers/framework/dataset.py
+++ b/core/trainers/framework/dataset.py
@@ -118,6 +118,7 @@ class QueueDataset(DatasetBase):
        dataset.set_batch_size(batch_size)
        dataset.set_pipe_command(pipe_cmd)
        train_data_path = envs.get_global_env(name + "data_path")
+
        file_list = [
            os.path.join(train_data_path, x)
            for x in os.listdir(train_data_path)
@@ -125,7 +126,7 @@ class QueueDataset(DatasetBase):
        if context["engine"] == EngineMode.LOCAL_CLUSTER:
            file_list = split_files(file_list, context["fleet"].worker_index(),
                                    context["fleet"].worker_num())
-
+        print("File_list: {}".format(file_list))
        dataset.set_filelist(file_list)
        for model_dict in context["phases"]:
            if model_dict["dataset_name"] == dataset_name:

--- a/core/utils/dataloader_instance.py
+++ b/core/utils/dataloader_instance.py
@@ -42,7 +42,7 @@ def dataloader_by_name(readerclass,
    if context["engine"] == EngineMode.LOCAL_CLUSTER:
        files = split_files(files, context["fleet"].worker_index(),
                            context["fleet"].worker_num())
-        print("file_list : {}".format(files))
+    print("file_list : {}".format(files))

    reader = reader_class(yaml_file)
    reader.init()

--- a/models/rank/dnn/backend.yaml
+++ b/models/rank/dnn/backend.yaml
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+workspace: "./"
+
+backend: "PaddleCloud"
+cluster_type: k8s # k8s 可选
+
+config:
+  fs_name: "afs://xxx.com"
+  fs_ugi: "usr,pwd"
+  output_path: "" # 填远程地址，如afs:/user/your/path/ 则此处填 /user/your/path
+  
+  # for mpi
+  train_data_path: "" # 填远程地址，如afs:/user/your/path/ 则此处填 /user/your/path
+  test_data_path: "" # 填远程地址，如afs:/user/your/path/ 则此处填 /user/your/path
+  thirdparty_path: "" # 填远程地址，如afs:/user/your/path/ 则此处填 /user/your/path
+  paddle_version: "1.7.2" # 填写paddle官方版本号 >= 1.7.2
+
+  # for k8s
+  afs_remote_mount_point: "" # 填远程地址，如afs:/user/your/path/ 则此处填 /user/your/path
+
+  # paddle分布式底层超参，无特殊需求不理不改
+  communicator:
+    FLAGS_communicator_is_sgd_optimizer: 0
+    FLAGS_communicator_send_queue_size: 5
+    FLAGS_communicator_thread_pool_size: 32
+    FLAGS_communicator_max_merge_var_num: 5
+    FLAGS_communicator_max_send_grad_num_before_recv: 5
+    FLAGS_communicator_fake_rpc: 0
+    FLAGS_rpc_retry_times: 3
+  
+submit:
+  ak: ""
+  sk: ""
+  priority: "high"
+  job_name: "PaddleRec_CTR"
+  group: ""
+  start_cmd: "python -m paddlerec.run -m ./config.yaml"
+  files: ./*.py ./*.yaml
+
+  # for mpi ps-cpu
+  nodes: 2
+  
+  # for k8s gpu        
+  k8s_trainers: 2
+  k8s_gpu_card: 1
+  
--- a/models/rank/dnn/config.yaml
+++ b/models/rank/dnn/config.yaml
@@ -80,6 +80,28 @@ runner:
  init_model_path: "increment_dnn" # load model path
  phases: [phase2]

+- name: ps_cluster
+  class: cluster_train
+  epochs: 2
+  device: cpu
+  fleet_mode: ps
+  save_checkpoint_interval: 1 # save model interval of epochs
+  save_checkpoint_path: "increment_dnn" # save checkpoint path
+  init_model_path: "" # load model path
+  print_interval: 1
+  phases: [phase1]
+
+- name: collective_cluster
+  class: cluster_train
+  epochs: 2
+  device: gpu
+  fleet_mode: collective
+  save_checkpoint_interval: 1 # save model interval of epochs
+  save_checkpoint_path: "increment_dnn" # save checkpoint path
+  init_model_path: "" # load model path
+  print_interval: 1
+  phases: [phase1]
+
 # runner will run all the phase in each epoch
 phase:
 - name: phase1

--- a/run.py
+++ b/run.py
@@ -38,7 +38,7 @@ def engine_registry():
    engines["TRANSPILER"]["TRAIN"] = single_train_engine
    engines["TRANSPILER"]["INFER"] = single_infer_engine
    engines["TRANSPILER"]["LOCAL_CLUSTER_TRAIN"] = local_cluster_engine
-    engines["TRANSPILER"]["CLUSTER"] = cluster_engine
+    engines["TRANSPILER"]["CLUSTER_TRAIN"] = cluster_engine
    engines["PSLIB"]["TRAIN"] = local_mpi_engine
    engines["PSLIB"]["LOCAL_CLUSTER_TRAIN"] = local_mpi_engine
    engines["PSLIB"]["CLUSTER_TRAIN"] = cluster_mpi_engine
@@ -111,8 +111,8 @@ def get_engine(args, running_config, mode):

    engine = running_config.get(engine_class, None)
    if engine is None:
-        raise ValueError("not find {} in yaml, please check".format(
-            mode, engine_class))
+        raise ValueError("not find {} in engine_class , please check".format(
+            engine))
    device = running_config.get(engine_device, None)

    engine = engine.upper()
@@ -262,15 +262,48 @@ def single_infer_engine(args):
 def cluster_engine(args):
    def master():
        from paddlerec.core.engine.cluster.cluster import ClusterEngine
-        _envs = envs.load_yaml(args.backend)
-        flattens = envs.flatten_environs(_envs, "_")
+
+        # Get fleet_mode & device
+        run_extras = get_all_inters_from_yaml(args.model, ["runner."])
+        mode = envs.get_runtime_environ("mode")
+        fleet_class = ".".join(["runner", mode, "fleet_mode"])
+        device_class = ".".join(["runner", mode, "device"])
+        fleet_mode = run_extras.get(fleet_class, "ps")
+        device = run_extras.get(device_class, "cpu")
+        device = device.upper()
+        fleet_mode = fleet_mode.upper()
+
+        if fleet_mode == "COLLECTIVE" and device != "GPU":
+            raise ValueError("COLLECTIVE can not be used without GPU")
+
+        # Get Thread nums
+        model_envs = envs.load_yaml(args.model)
+        phases_class = ".".join(["runner", mode, "phases"])
+        phase_names = run_extras.get(phases_class)
+        phases = []
+        all_phases = model_envs.get("phase")
+        if phase_names is None:
+            phases = all_phases
+        else:
+            for phase in all_phases:
+                if phase["name"] in phase_names:
+                    phases.append(phase)
+
+        thread_num = []
+        for phase in phases:
+            thread_num.append(int(phase["thread_num"]))
+        max_thread_num = max(thread_num)
+
+        backend_envs = envs.load_yaml(args.backend)
+        flattens = envs.flatten_environs(backend_envs, "_")
        flattens["engine_role"] = "MASTER"
        flattens["engine_mode"] = envs.get_runtime_environ("mode")
        flattens["engine_run_config"] = args.model
-        flattens["engine_temp_path"] = tempfile.mkdtemp()
+        flattens["max_thread_num"] = max_thread_num
+        flattens["fleet_mode"] = fleet_mode
+        flattens["device"] = device
+        flattens["backend_yaml"] = args.backend
        envs.set_runtime_environs(flattens)
-        ClusterEngine.workspace_replace()
-        print(envs.pretty_print_envs(flattens, ("Submit Envs", "Value")))

        launch = ClusterEngine(None, args.model)
        return launch
@@ -278,40 +311,29 @@ def cluster_engine(args):
    def worker(mode):
        if not mode:
            raise ValueError("mode: {} can not be recognized")
+        from paddlerec.core.engine.cluster.cluster import ClusterEngine

        run_extras = get_all_inters_from_yaml(args.model, ["runner."])

        trainer_class = ".".join(["runner", mode, "trainer_class"])
        fleet_class = ".".join(["runner", mode, "fleet_mode"])
        device_class = ".".join(["runner", mode, "device"])
-        selected_gpus_class = ".".join(["runner", mode, "selected_gpus"])
        strategy_class = ".".join(["runner", mode, "distribute_strategy"])
-        worker_class = ".".join(["runner", mode, "worker_num"])
-        server_class = ".".join(["runner", mode, "server_num"])
-
        trainer = run_extras.get(trainer_class, "GeneralTrainer")
        fleet_mode = run_extras.get(fleet_class, "ps")
        device = run_extras.get(device_class, "cpu")
-        selected_gpus = run_extras.get(selected_gpus_class, "0")
        distributed_strategy = run_extras.get(strategy_class, "async")
-        worker_num = run_extras.get(worker_class, 1)
-        server_num = run_extras.get(server_class, 1)
        executor_mode = "train"

        device = device.upper()
        fleet_mode = fleet_mode.upper()
-
        if fleet_mode == "COLLECTIVE" and device != "GPU":
-            raise ValueError("COLLECTIVE can not be used with GPU")
+            raise ValueError("COLLECTIVE can not be used without GPU")

        cluster_envs = {}

-        if device == "GPU":
-            cluster_envs["selected_gpus"] = selected_gpus
-
-        cluster_envs["server_num"] = server_num
-        cluster_envs["worker_num"] = worker_num
        cluster_envs["fleet_mode"] = fleet_mode
+        cluster_envs["engine_role"] = "WORKER"
        cluster_envs["train.trainer.trainer"] = trainer
        cluster_envs["train.trainer.engine"] = "cluster"
        cluster_envs["train.trainer.executor_mode"] = executor_mode
@@ -321,15 +343,15 @@ def cluster_engine(args):
        cluster_envs["train.trainer.platform"] = envs.get_platform()
        print("launch {} engine with cluster to with model: {}".format(
            trainer, args.model))
-        set_runtime_envs(cluster_envs, args.model)

-        trainer = TrainerFactory.create(args.model)
-        return trainer
+        set_runtime_envs(cluster_envs, args.model)
+        launch = ClusterEngine(None, args.model)
+        return launch

    role = os.getenv("PADDLE_PADDLEREC_ROLE", "MASTER")

    if role == "WORKER":
-        mode = os.getenv("PADDLE_PADDLEREC_MODE", None)
+        mode = os.getenv("mode", None)
        return worker(mode)
    else:
        return master()

--- a/setup.py
+++ b/setup.py
-# coding=utf8
+# -*- coding: utf-8 -*-
+
 # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -69,7 +70,7 @@ def build(dirname):
        'Criteo_data/sample_data/train/*'
    ]

-    engine_copy = ['*/*.sh']
+    engine_copy = ['*/*.sh', '*/*.template']
    for package in packages:
        if package.startswith("paddlerec.models."):
            package_data[package] = models_copy