diff --git a/doc/fluid/user_guides/howto/training/fleet_api_howto_cn.rst b/doc/fluid/user_guides/howto/training/fleet_api_howto_cn.rst
index cdd8fab071b4d03db2f35040866249bedc7adbe2..92ffc39422c0ffa817e881dcd293f54cf6f01a65 100644
--- a/doc/fluid/user_guides/howto/training/fleet_api_howto_cn.rst
+++ b/doc/fluid/user_guides/howto/training/fleet_api_howto_cn.rst
@@ -1,5 +1,3 @@
-.. _fleet_api_howto_cn:
-
 Distributed training with FleetAPI
 ==================================
 
@@ -16,8 +14,8 @@ Fleet API quick start examples
 
 The following uses one model to illustrate the two most common Fleet API usage scenarios, so that users have a template for getting started quickly. The source code for the quick-start examples can be found at `Fleet Quick
 Start `__.
 
-Suppose we define an MLP network as follows:
+- Suppose we define an MLP network as follows:
 
 .. code:: python
 
@@ -32,7 +30,7 @@
         avg_cost = fluid.layers.mean(x=cost)
         return avg_cost
 
-Define a reader that generates data in memory as follows:
+- Define a reader that generates data in memory as follows:
 
 .. code:: python
 
@@ -42,32 +40,30 @@
         return {"x": np.random.random(size=(128, 32)).astype('float32'),
                 "y": np.random.randint(2, size=(128, 1)).astype('int64')}
 
-Single-node trainer definition
->>>>>>>>>>>>>>>
+- Single-node trainer definition
 
 .. code:: python
 
     import paddle.fluid as fluid
     from nets import mlp
     from utils import gen_data
 
     input_x = fluid.layers.data(name="x", shape=[32], dtype='float32')
     input_y = fluid.layers.data(name="y", shape=[1], dtype='int64')
 
     cost = mlp(input_x, input_y)
     optimizer = fluid.optimizer.SGD(learning_rate=0.01)
     optimizer.minimize(cost)
     place = fluid.CUDAPlace(0)
 
     exe = fluid.Executor(place)
     exe.run(fluid.default_startup_program())
     step = 1001
     for i in range(step):
         cost_val = exe.run(feed=gen_data(), fetch_list=[cost.name])
         print("step%d cost=%f" % (i, cost_val[0]))
 
-Parameter Server training
->>>>>>>>>>>>>>>
+- Parameter Server training
 
 The parameter server approach is well suited to parallel training of simple models on large-scale data. Building on the single-node model definition, an example of training with a Parameter
 Server is given below:
@@ -107,8 +103,7 @@ Server is given below:
         print("worker_index: %d, step%d cost = %f" % (
             fleet.worker_index(), i, cost_val[0]))
 
-Collective training
->>>>>>>>>>>>>>>
+- Collective training
 
 Collective training is typically used for multi-node, multi-GPU training and is most common with complex models. Building on the single-node model definition above, an example of distributed
 training with the Collective method is given below:
@@ -147,24 +142,24 @@ training with the Collective method is given below:
 More examples
 -------------
 
-`Click-through rate prediction <>`__
+`Click-through rate prediction `__
 
-`Semantic matching <>`__
+`Semantic matching `__
 
-`Vector learning <>`__
+`Vector learning `__
 
-`Image classification based on Resnet50 <>`__
+`Image classification based on Resnet50 `__
 
-`Machine translation based on Transformer <>`__
+`Machine translation based on Transformer `__
 
-`Semantic representation learning based on Bert <>`__
+`Semantic representation learning based on Bert `__
 
 Fleet API reference
 -------------------
 
 Fleet API methods
->>>>>>>>>>>>>>>
+>>>>>>>>>>>>>>>>>
 
 -  init(role\_maker=None)
 
    -  Initializes fleet; it must be called before any other fleet interface and defines the multi-node environment configuration
@@ -187,7 +182,7 @@ Fleet API methods
    -  Decorator for the distributed optimization algorithm: it wraps a single-node optimizer with a distributed training strategy and returns a distributed optimizer
 
 RoleMaker
->>>>>>>>>>>>>>>
+>>>>>>>>>>>>
 
 -  MPISymetricRoleMaker
 
@@ -224,7 +219,7 @@ RoleMaker
 
    -  Launch method:
 
-      .. code:: python
+      .. code:: shell
 
           python -m paddle.distributed.launch_ps --worker_num 2 --server_num 2 trainer.py
 
@@ -240,7 +235,7 @@ RoleMaker
 
    -  Launch method:
 
-      .. code:: python
+      .. code:: shell
 
           python -m paddle.distributed.launch trainer.py
 
@@ -263,9 +258,9 @@ RoleMaker
               server_endpoints=pserver_endpoints)
           fleet.init(role)
 
 Strategy
->>>>>>>>>>>>>>>
+>>>>>>>>>>>>
 
 -  Parameter Server Training
 
    -  Sync\_mode
@@ -274,7 +269,7 @@ Strategy
 
 -  ReduceGrad
 
 Fleet Mode
->>>>>>>>>>>>>>>
+>>>>>>>>>>>>
 
 -  Parameter Server Training
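
Taken together, the parameter-server snippets above compose into a single trainer file. The sketch below only assembles pieces already shown in this diff (the ``mlp`` network, the ``gen_data`` reader, ``PaddleCloudRoleMaker``, ``fleet.distributed_optimizer``); the server/worker branch (``fleet.is_server``, ``fleet.init_server``, ``fleet.run_server``, ``fleet.init_worker``, ``fleet.stop_worker``, ``fleet.main_program``) follows the usual Fleet 1.x flow and is an assumption to verify against the installed Paddle version, not something stated in this diff.

.. code:: python

    # Minimal parameter-server sketch assembled from the snippets in this
    # diff. The is_server/is_worker branch reflects the common Fleet 1.x
    # flow and should be checked against the Paddle version in use.
    import paddle.fluid as fluid
    from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet
    from paddle.fluid.incubate.fleet.base import role_maker
    from nets import mlp        # the MLP defined earlier in this document
    from utils import gen_data  # the in-memory reader defined earlier

    role = role_maker.PaddleCloudRoleMaker()
    fleet.init(role)  # must be called before any other fleet interface

    input_x = fluid.layers.data(name="x", shape=[32], dtype='float32')
    input_y = fluid.layers.data(name="y", shape=[1], dtype='int64')
    cost = mlp(input_x, input_y)

    # Wrap the single-node optimizer to obtain a distributed one.
    optimizer = fleet.distributed_optimizer(
        fluid.optimizer.SGD(learning_rate=0.01))
    optimizer.minimize(cost)

    if fleet.is_server():
        fleet.init_server()
        fleet.run_server()  # blocks, serving parameters to the workers
    elif fleet.is_worker():
        fleet.init_worker()
        exe = fluid.Executor(fluid.CPUPlace())
        exe.run(fluid.default_startup_program())
        for i in range(1001):
            cost_val = exe.run(program=fleet.main_program,
                               feed=gen_data(),
                               fetch_list=[cost.name])
            print("worker_index: %d, step%d cost = %f"
                  % (fleet.worker_index(), i, cost_val[0]))
        fleet.stop_worker()

Launched with ``python -m paddle.distributed.launch_ps --worker_num 2 --server_num 2 trainer.py``, each spawned process enters the branch matching the role assigned to it.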
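For the Collective scenario, the analogous single-file sketch is below. The device selection via the ``FLAGS_selected_gpus`` environment variable is an assumption about what ``paddle.distributed.launch`` exports for each worker process, and ``fleet.main_program`` carries the same Fleet 1.x caveat as above.

.. code:: python

    # Minimal Collective (multi-GPU) sketch based on the snippets in this
    # diff. FLAGS_selected_gpus is assumed to be set by the launcher.
    import os

    import paddle.fluid as fluid
    from paddle.fluid.incubate.fleet.collective import fleet
    from paddle.fluid.incubate.fleet.base import role_maker
    from nets import mlp
    from utils import gen_data

    role = role_maker.PaddleCloudRoleMaker(is_collective=True)
    fleet.init(role)

    input_x = fluid.layers.data(name="x", shape=[32], dtype='float32')
    input_y = fluid.layers.data(name="y", shape=[1], dtype='int64')
    cost = mlp(input_x, input_y)

    optimizer = fleet.distributed_optimizer(
        fluid.optimizer.SGD(learning_rate=0.01))
    optimizer.minimize(cost)

    # Each launched process drives the GPU assigned to it by the launcher.
    gpu_id = int(os.getenv("FLAGS_selected_gpus", "0"))
    exe = fluid.Executor(fluid.CUDAPlace(gpu_id))
    exe.run(fluid.default_startup_program())
    for i in range(1001):
        cost_val = exe.run(program=fleet.main_program,
                           feed=gen_data(),
                           fetch_list=[cost.name])
        print("worker_index: %d, step%d cost = %f"
              % (fleet.worker_index(), i, cost_val[0]))

This file would be started on each node with ``python -m paddle.distributed.launch trainer.py``, as described in the PaddleCloudRoleMaker launch method above.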