Merge pull request #150 from MrChengmo/fix_distributed

Fix distributed

Merge pull request #150 from MrChengmo/fix_distributed
Fix distributed
52649a21 · wuzhihua · GitHub · 6eef2a3f · b1fdf253 · 52649a21
6 changed files
--- a/core/engine/cluster/cloud/cluster.sh
+++ b/core/engine/cluster/cloud/cluster.sh
@@ -59,6 +59,7 @@ function _gen_mpi_config() {
      -e "s#<$ OUTPUT_PATH $>#$OUTPUT_PATH#g" \
      -e "s#<$ THIRDPARTY_PATH $>#$THIRDPARTY_PATH#g" \
      -e "s#<$ CPU_NUM $>#$max_thread_num#g" \
+      -e "s#<$ USE_PYTHON3 $>#$USE_PYTHON3#g" \
      -e "s#<$ FLAGS_communicator_is_sgd_optimizer $>#$FLAGS_communicator_is_sgd_optimizer#g" \
      -e "s#<$ FLAGS_communicator_send_queue_size $>#$FLAGS_communicator_send_queue_size#g" \
      -e "s#<$ FLAGS_communicator_thread_pool_size $>#$FLAGS_communicator_thread_pool_size#g" \
@@ -76,6 +77,7 @@ function _gen_k8s_config() {
      -e "s#<$ AFS_REMOTE_MOUNT_POINT $>#$AFS_REMOTE_MOUNT_POINT#g" \
      -e "s#<$ OUTPUT_PATH $>#$OUTPUT_PATH#g" \
      -e "s#<$ CPU_NUM $>#$max_thread_num#g" \
+      -e "s#<$ USE_PYTHON3 $>#$USE_PYTHON3#g" \
      -e "s#<$ FLAGS_communicator_is_sgd_optimizer $>#$FLAGS_communicator_is_sgd_optimizer#g" \
      -e "s#<$ FLAGS_communicator_send_queue_size $>#$FLAGS_communicator_send_queue_size#g" \
      -e "s#<$ FLAGS_communicator_thread_pool_size $>#$FLAGS_communicator_thread_pool_size#g" \

--- a/core/engine/cluster/cloud/k8s_config.ini.template
+++ b/core/engine/cluster/cloud/k8s_config.ini.template
@@ -19,6 +19,7 @@ afs_local_mount_point="/root/paddlejob/workspace/env_run/afs/"
 # 新k8s afs挂载帮助文档: http://wiki.baidu.com/pages/viewpage.action?pageId=906443193

 PADDLE_PADDLEREC_ROLE=WORKER
+use_python3=<$ USE_PYTHON3 $>
 CPU_NUM=<$ CPU_NUM $>
 GLOG_v=0


--- a/core/engine/cluster/cloud/mpi_config.ini.template
+++ b/core/engine/cluster/cloud/mpi_config.ini.template
@@ -17,6 +17,7 @@ output_path=<$ OUTPUT_PATH $>
 thirdparty_path=<$ THIRDPARTY_PATH $>

 PADDLE_PADDLEREC_ROLE=WORKER
+use_python3=<$ USE_PYTHON3 $>
 CPU_NUM=<$ CPU_NUM $>
 GLOG_v=0


--- a/core/engine/cluster/cluster.py
+++ b/core/engine/cluster/cluster.py
@@ -159,23 +159,30 @@ class ClusterEnvBase(object):
        self.cluster_env["PADDLE_VERSION"] = self.backend_env.get(
            "config.paddle_version", "1.7.2")

+        # python_version
+        self.cluster_env["USE_PYTHON3"] = self.backend_env.get(
+            "config.use_python3", "0")
+
        # communicator
+        max_thread_num = int(envs.get_runtime_environ("max_thread_num"))
        self.cluster_env[
            "FLAGS_communicator_is_sgd_optimizer"] = self.backend_env.get(
                "config.communicator.FLAGS_communicator_is_sgd_optimizer", 0)
        self.cluster_env[
            "FLAGS_communicator_send_queue_size"] = self.backend_env.get(
-                "config.communicator.FLAGS_communicator_send_queue_size", 5)
+                "config.communicator.FLAGS_communicator_send_queue_size",
+                max_thread_num)
        self.cluster_env[
            "FLAGS_communicator_thread_pool_size"] = self.backend_env.get(
                "config.communicator.FLAGS_communicator_thread_pool_size", 32)
        self.cluster_env[
            "FLAGS_communicator_max_merge_var_num"] = self.backend_env.get(
-                "config.communicator.FLAGS_communicator_max_merge_var_num", 5)
+                "config.communicator.FLAGS_communicator_max_merge_var_num",
+                max_thread_num)
        self.cluster_env[
            "FLAGS_communicator_max_send_grad_num_before_recv"] = self.backend_env.get(
                "config.communicator.FLAGS_communicator_max_send_grad_num_before_recv",
-                5)
+                max_thread_num)
        self.cluster_env["FLAGS_communicator_fake_rpc"] = self.backend_env.get(
            "config.communicator.FLAGS_communicator_fake_rpc", 0)
        self.cluster_env["FLAGS_rpc_retry_times"] = self.backend_env.get(

--- a/doc/distributed_train.md
+++ b/doc/distributed_train.md
@@ -69,6 +69,12 @@ dataset:
  data_path: "{workspace}/data/sample_data/train"
  sparse_slots: "click 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26"
  dense_slots: "dense_var:13"
+
+phase:
+- name: phase1
+  model: "{workspace}/model.py"
+  dataset_name: dataloader_train 
+  thread_num: 1
 ```

 分布式的训练配置可以改为：
@@ -101,6 +107,13 @@ dataset:
  data_path: "{workspace}/train_data"
  sparse_slots: "click 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26"
  dense_slots: "dense_var:13"
+
+phase:
+- name: phase1
+  model: "{workspace}/model.py"
+  dataset_name: dataloader_train 
+  # 分布式训练节点的CPU_NUM环境变量与thread_num相等，多个phase时，取最大的thread_num
+  thread_num: 1
 ```

 除此之外，还需关注数据及模型加载的路径，一般而言：
@@ -120,6 +133,8 @@ cluster_type: mpi # k8s 可选
 config:
  # 填写任务运行的paddle官方版本号 >= 1.7.2， 默认1.7.2
  paddle_version: "1.7.2" 
+  # 是否使用PaddleCloud运行环境下的Python3，默认使用python2
+  use_python3: 1

  # hdfs/afs的配置信息填写
  fs_name: "afs://xxx.com"
@@ -140,11 +155,13 @@ config:

  # paddle参数服务器分布式底层超参，无特殊需求不理不改
  communicator:
+    # 使用SGD优化器时，建议设置为1
    FLAGS_communicator_is_sgd_optimizer: 0
+    # 以下三个变量默认都等于训练时的线程数：CPU_NUM
    FLAGS_communicator_send_queue_size: 5
-    FLAGS_communicator_thread_pool_size: 32
    FLAGS_communicator_max_merge_var_num: 5
    FLAGS_communicator_max_send_grad_num_before_recv: 5
+    FLAGS_communicator_thread_pool_size: 32
    FLAGS_communicator_fake_rpc: 0
    FLAGS_rpc_retry_times: 3
  
@@ -175,12 +192,12 @@ submit:
  # for k8s gpu        
  # k8s gpu 模式下，训练节点数，及每个节点上的GPU卡数
  k8s_trainers: 2
-  k8s-cpu-cores: 4
+  k8s_cpu_cores: 4
  k8s_gpu_card: 1

  # for k8s ps-cpu
  k8s_trainers: 2
-  k8s-cpu-cores: 4
+  k8s_cpu_cores: 4
  k8s_ps_num: 2
  k8s_ps_cores: 4
  
@@ -232,7 +249,7 @@ phase:
 再新增`backend.yaml`
 ```yaml
 backend: "PaddleCloud"
-cluster_type: mpi 
+cluster_type: mpi # k8s 可选

 config:
  paddle_version: "1.7.2" 
@@ -317,7 +334,7 @@ phase:

 ```yaml
 backend: "PaddleCloud"
-cluster_type: k8s # k8s 可选
+cluster_type: k8s # mpi 可选

 config:
  # 填写任务运行的paddle官方版本号 >= 1.7.2， 默认1.7.2
@@ -357,7 +374,7 @@ submit:
  # for k8s gpu        
  # k8s gpu 模式下，训练节点数，及每个节点上的GPU卡数
  k8s_trainers: 2
-  k8s-cpu-cores: 4
+  k8s_cpu_cores: 4
  k8s_gpu_card: 1
 ```

@@ -399,7 +416,7 @@ phase:
 再新增`backend.yaml`
 ```yaml
 backend: "PaddleCloud"
-cluster_type: k8s # k8s 可选
+cluster_type: k8s # mpi 可选

 config:
  # 填写任务运行的paddle官方版本号 >= 1.7.2， 默认1.7.2
@@ -439,7 +456,7 @@ submit:
  # for k8s ps-cpu
  # k8s ps-cpu 模式下，训练节点数，参数服务器节点数，及每个节点上的cpu核心数及内存限制
  k8s_trainers: 2
-  k8s-cpu-cores: 4
+  k8s_cpu_cores: 4
  k8s_ps_num: 2
  k8s_ps_cores: 4
 ```

--- a/doc/yaml.md
+++ b/doc/yaml.md
-# PaddleRec yaml配置说明
+# PaddleRec config.yaml配置说明

 ## 全局变量

@@ -12,31 +12,31 @@

 ## runner变量

-|             名称              |     类型     |                     取值                      | 是否必须 |                               作用描述                               |
-| :---------------------------: | :----------: | :-------------------------------------------: | :------: | :------------------------------------------------------------------: |
-|             name              |    string    |                     任意                      |    是    |                            指定runner名称                            |
+|             名称              |     类型     |                           取值                            | 是否必须 |                               作用描述                               |
+| :---------------------------: | :----------: | :-------------------------------------------------------: | :------: | :------------------------------------------------------------------: |
+|             name              |    string    |                           任意                            |    是    |                            指定runner名称                            |
 |             class             |    string    | train(默认) / infer / local_cluster_train / cluster_train |    是    |           指定运行runner的类别（单机/分布式， 训练/预测）            |
-|            device             |    string    |                cpu(默认) / gpu                |    否    |                             程序执行设备                             |
-|          fleet_mode           |    string    |         ps(默认) / pslib / collective         |    否    |                            分布式运行模式                            |
-|         selected_gpus         |    string    |                   "0"(默认)                   |    否    | 程序运行GPU卡号，若以"0,1"的方式指定多卡，则会默认启用collective模式 |
-|          worker_num           |     int      |                    1(默认)                    |    否    |                     参数服务器模式下worker的数量                     |
-|          server_num           |     int      |                    1(默认)                    |    否    |                     参数服务器模式下server的数量                     |
-|      distribute_strategy      |    string    |        async(默认)/sync/half_async/geo        |    否    |                    参数服务器模式下训练模式的选择                    |
-|            epochs             |     int      |                     >= 1                      |    否    |                           模型训练迭代轮数                           |
-|            phases             | list[string] |            由phase name组成的list             |    否    |                  当前runner的训练过程列表，顺序执行                  |
-|        init_model_path        |    string    |                     路径                      |    否    |                            初始化模型地址                            |
-|   save_checkpoint_interval    |     int      |                     >= 1                      |    否    |                          Save参数的轮数间隔                          |
-|     save_checkpoint_path      |    string    |                     路径                      |    否    |                            Save参数的地址                            |
-|    save_inference_interval    |     int      |                     >= 1                      |    否    |                        Save预测模型的轮数间隔                        |
-|      save_inference_path      |    string    |                     路径                      |    否    |                          Save预测模型的地址                          |
-| save_inference_feed_varnames  | list[string] |           组网中指定Variable的name            |    否    |                        预测模型的入口变量name                        |
-| save_inference_fetch_varnames | list[string] |           组网中指定Variable的name            |    否    |                        预测模型的出口变量name                        |
-|        print_interval         |     int      |                     >= 1                      |    否    |                        训练指标打印batch间隔                         |
-|      instance_class_path      |    string    |                     路径                      |    否    |                     自定义instance流程实现的地址                     |
-|      network_class_path       |    string    |                     路径                      |    否    |                     自定义network流程实现的地址                      |
-|      startup_class_path       |    string    |                     路径                      |    否    |                     自定义startup流程实现的地址                      |
-|       runner_class_path       |    string    |                     路径                      |    否    |                      自定义runner流程实现的地址                      |
-|      terminal_class_path      |    string    |                     路径                      |    否    |                     自定义terminal流程实现的地址                     |
+|            device             |    string    |                      cpu(默认) / gpu                      |    否    |                             程序执行设备                             |
+|          fleet_mode           |    string    |               ps(默认) / pslib / collective               |    否    |                            分布式运行模式                            |
+|         selected_gpus         |    string    |                         "0"(默认)                         |    否    | 程序运行GPU卡号，若以"0,1"的方式指定多卡，则会默认启用collective模式 |
+|          worker_num           |     int      |                          1(默认)                          |    否    |                     参数服务器模式下worker的数量                     |
+|          server_num           |     int      |                          1(默认)                          |    否    |                     参数服务器模式下server的数量                     |
+|      distribute_strategy      |    string    |              async(默认)/sync/half_async/geo              |    否    |                    参数服务器模式下训练模式的选择                    |
+|            epochs             |     int      |                           >= 1                            |    否    |                           模型训练迭代轮数                           |
+|            phases             | list[string] |                  由phase name组成的list                   |    否    |                  当前runner的训练过程列表，顺序执行                  |
+|        init_model_path        |    string    |                           路径                            |    否    |                            初始化模型地址                            |
+|   save_checkpoint_interval    |     int      |                           >= 1                            |    否    |                          Save参数的轮数间隔                          |
+|     save_checkpoint_path      |    string    |                           路径                            |    否    |                            Save参数的地址                            |
+|    save_inference_interval    |     int      |                           >= 1                            |    否    |                        Save预测模型的轮数间隔                        |
+|      save_inference_path      |    string    |                           路径                            |    否    |                          Save预测模型的地址                          |
+| save_inference_feed_varnames  | list[string] |                 组网中指定Variable的name                  |    否    |                        预测模型的入口变量name                        |
+| save_inference_fetch_varnames | list[string] |                 组网中指定Variable的name                  |    否    |                        预测模型的出口变量name                        |
+|        print_interval         |     int      |                           >= 1                            |    否    |                        训练指标打印batch间隔                         |
+|      instance_class_path      |    string    |                           路径                            |    否    |                     自定义instance流程实现的地址                     |
+|      network_class_path       |    string    |                           路径                            |    否    |                     自定义network流程实现的地址                      |
+|      startup_class_path       |    string    |                           路径                            |    否    |                     自定义startup流程实现的地址                      |
+|       runner_class_path       |    string    |                           路径                            |    否    |                      自定义runner流程实现的地址                      |
+|      terminal_class_path      |    string    |                           路径                            |    否    |                     自定义terminal流程实现的地址                     |



@@ -70,3 +70,55 @@
 | optimizer.learning_rate | float  |       > 0        |    否    |         指定学习率          |
 |           reg           | float  |       > 0        |    否    | L2正则化参数，只在SGD下生效 |
 |         others          |   /    |        /         |    /     |   由各个模型组网独立指定    |
+
+
+# PaddleRec backend.yaml配置说明
+
+## 全局变量
+
+ |     名称     |  类型  |      取值       | 是否必须 |                     作用描述                     |
+ | :----------: | :----: | :-------------: | :------: | :----------------------------------------------: |
+ |   backend    | string | paddlecloud/k8s |    是    | 使用PaddleCloud平台提交，还是在公有云K8S集群提交 |
+ | cluster_type | string |     mpi/k8s     |    是    |        指定运行的计算集群： mpi 还是 k8s         |
+
+ ## config
+ |          名称          |  类型  |                  取值                   | 是否必须 |                                           作用描述                                           |
+ | :--------------------: | :----: | :-------------------------------------: | :------: | :------------------------------------------------------------------------------------------: |
+ |     paddle_version     | string | paddle官方版本号，如1.7.2/1.8.0/1.8.3等 |    否    |                           指定运行训练使用的Paddle版本，默认1.7.2                            |
+ |      use_python3       |  int   |               0（默认）/1               |    否    |                                 指定是否使用python3进行训练                                  |
+ |        fs_name         | string |             "afs://xxx.com"             |    是    |                                   hdfs/afs集群名称所需配置                                   |
+ |         fs_ugi         | string |                "usr,pwd"                |    是    |                                   hdfs/afs集群密钥所需配置                                   |
+ |      output_path       | string |            "/user/your/path"            |    否    |                                      任务输出的远程目录                                      |
+ |    train_data_path     | string |            "/user/your/path"            |    是    | mpi集群下指定训练数据路径，paddlecloud会自动将数据分片并下载到工作目录的`./train_data`文件夹 |
+ |     test_data_path     | string |            "/user/your/path"            |    否    |             mpi集群下指定测试数据路径，会自动下载到工作目录的`./test_data`文件夹             |
+ |    thirdparty_path     | string |            "/user/your/path"            |    否    |           mpi集群下指定thirdparty路径，会自动下载到工作目录的`./thirdparty`文件夹            |
+ | afs_remote_mount_point | string |            "/user/your/path"            |    是    |                  k8s集群下指定远程路径的地址，会挂载到工作目录的`./afs/`下                   |
+ 
+ ### config.communicator
+ |                       名称                       | 类型  |      取值      | 是否必须 |                        作用描述                        |
+ | :----------------------------------------------: | :---: | :------------: | :------: | :----------------------------------------------------: |
+ |       FLAGS_communicator_is_sgd_optimizer        |  int  |  0（默认）/1   |    否    | 异步分布式训练时的多线程的梯度融合方式是否使用SGD模式  |
+ |        FLAGS_communicator_send_queue_size        |  int  | 线程数（默认） |    否    |               分布式训练时发送队列的大小               |
+ |       FLAGS_communicator_max_merge_var_num       |  int  | 线程数（默认） |    否    |        分布式训练多线程梯度融合时，线程数的配置        |
+ | FLAGS_communicator_max_send_grad_num_before_recv |  int  | 线程数（默认） |    否    | 分布式训练使用独立recv参数线程时，与send的步调配置超参 |
+ |       FLAGS_communicator_thread_pool_size        |  int  |   32（默认）   |    否    |        分布式训练时，多线程发送参数的线程池大小        |
+ |           FLAGS_communicator_fake_rpc            |  int  |  0（默认）/1   |    否    |              分布式训练时，选择不进行通信              |
+ |              FLAGS_rpc_retry_times               |  int  |   3（默认）    |    否    |            分布式训练时，GRPC的失败重试次数            |
+
+
+## submit
+|     名称      |  类型  |            取值             | 是否必须 |                         作用描述                         |
+| :-----------: | :----: | :-------------------------: | :------: | :------------------------------------------------------: |
+|      ak       | string | PaddleCloud平台提供的ak密钥 |    是    |                   paddlecloud用户配置                    |
+|      sk       | string | PaddleCloud平台提供的sk密钥 |    否    |                   paddlecloud用户配置                    |
+|   priority    | string |    normal/high/very_high    |    否    |                        任务优先级                        |
+|   job_name    | string |            任意             |    是    |                         任务名称                         |
+|     group     | string |     计算资源所在组名称      |    是    |                          组名称                          |
+|   start_cmd   | string |            任意             |    是    | 启动命令，默认`python -m paddlerec.run -m ./config.yaml` |
+|     files     | string |            任意             |    是    |         随任务提交上传的文件，给出相对或绝对路径         |
+|     nodes     |  int   |        >=1（默认1）         |    否    |                    mpi集群下的节点数                     |
+| k8s_trainers  |  int   |        >=1（默认1）         |    否    |                 k8s集群下worker的节点数                  |
+| k8s_cpu_cores |  int   |        >=1（默认1）         |    否    |                 k8s集群下worker的CPU核数                 |
+| k8s_gpu_card  |  int   |        >=1（默认1）         |    否    |                 k8s集群下worker的GPU卡数                 |
+|  k8s_ps_num   |  int   |        >=1（默认1）         |    否    |                 k8s集群下server的节点数                  |
+| k8s_ps_cores  |  int   |        >=1（默认1）         |    否    |                 k8s集群下server的CPU核数                 |