提交 45a8c275 编写于 作者: T typhoonzero

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into develop

......@@ -156,6 +156,7 @@ include(rdma) # set rdma libraries
include(flags) # set paddle compile flags
include(version) # set PADDLE_VERSION
include(coveralls) # set code coverage
include(inference_lib) # add paddle fluid inference libraries
include_directories("${PADDLE_SOURCE_DIR}")
......
......@@ -28,9 +28,3 @@ endif()
add_dependencies(eigen3 extern_eigen3)
LIST(APPEND external_project_dependencies eigen3)
IF(NOT WITH_C_API AND WITH_FLUID)
INSTALL(FILES ${EIGEN_INCLUDE_DIR}/Eigen/Core DESTINATION third_party/eigen3/Eigen)
INSTALL(DIRECTORY ${EIGEN_INCLUDE_DIR}/Eigen/src DESTINATION third_party/eigen3/Eigen)
INSTALL(DIRECTORY ${EIGEN_INCLUDE_DIR}/unsupported/Eigen DESTINATION third_party/eigen3/unsupported)
ENDIF()
......@@ -52,7 +52,7 @@ ADD_DEPENDENCIES(gflags extern_gflags)
LIST(APPEND external_project_dependencies gflags)
IF(WITH_C_API OR WITH_FLUID)
IF(WITH_C_API)
INSTALL(DIRECTORY ${GFLAGS_INCLUDE_DIR} DESTINATION third_party/gflags)
IF(ANDROID)
INSTALL(FILES ${GFLAGS_LIBRARIES} DESTINATION third_party/gflags/lib/${ANDROID_ABI})
......
......@@ -68,7 +68,7 @@ LINK_LIBRARIES(glog gflags)
LIST(APPEND external_project_dependencies glog)
IF(WITH_C_API OR WITH_FLUID)
IF(WITH_C_API)
INSTALL(DIRECTORY ${GLOG_INCLUDE_DIR} DESTINATION third_party/glog)
IF(ANDROID)
INSTALL(FILES ${GLOG_LIBRARIES} DESTINATION third_party/glog/lib/${ANDROID_ABI})
......
......@@ -250,7 +250,7 @@ IF(NOT PROTOBUF_FOUND)
SET(PROTOBUF_PROTOC_LIBRARY ${extern_protobuf_PROTOC_LIBRARY}
CACHE FILEPATH "protoc library." FORCE)
IF(WITH_C_API OR WITH_FLUID)
IF(WITH_C_API)
INSTALL(DIRECTORY ${PROTOBUF_INCLUDE_DIR} DESTINATION third_party/protobuf)
IF(ANDROID)
INSTALL(FILES ${PROTOBUF_LITE_LIBRARY} DESTINATION third_party/protobuf/lib/${ANDROID_ABI})
......
# make package for paddle fluid shared and static library
function(copy TARGET)
set(options "")
set(oneValueArgs "")
set(multiValueArgs SRCS DSTS DEPS)
cmake_parse_arguments(copy_lib "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
list(LENGTH copy_lib_SRCS copy_lib_SRCS_len)
list(LENGTH copy_lib_DSTS copy_lib_DSTS_len)
if(NOT ${copy_lib_SRCS_len} EQUAL ${copy_lib_DSTS_len})
message(FATAL_ERROR "${TARGET} source numbers are not equal to destination numbers")
endif()
math(EXPR len "${copy_lib_SRCS_len} - 1")
add_custom_target(${TARGET} DEPENDS ${copy_lib_DEPS})
foreach(index RANGE ${len})
list(GET copy_lib_SRCS ${index} src)
list(GET copy_lib_DSTS ${index} dst)
add_custom_command(TARGET ${TARGET} PRE_BUILD COMMAND mkdir -p "${dst}")
if(IS_DIRECTORY ${src})
add_custom_command(TARGET ${TARGET} PRE_BUILD COMMAND cp -r "${src}" "${dst}")
else()
add_custom_command(TARGET ${TARGET} PRE_BUILD COMMAND cp "${src}" "${dst}")
endif()
endforeach()
endfunction()
# third party
set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/eigen3")
copy(eigen3_lib
SRCS ${EIGEN_INCLUDE_DIR}/Eigen/Core ${EIGEN_INCLUDE_DIR}/Eigen/src ${EIGEN_INCLUDE_DIR}/unsupported/Eigen
DSTS ${dst_dir}/Eigen ${dst_dir}/Eigen ${dst_dir}/unsupported
)
set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/gflags")
copy(gflags_lib
SRCS ${GFLAGS_INCLUDE_DIR} ${GFLAGS_LIBRARIES}
DSTS ${dst_dir} ${dst_dir}/lib
)
set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/glog")
copy(glog_lib
SRCS ${GLOG_INCLUDE_DIR} ${GLOG_LIBRARIES}
DSTS ${dst_dir} ${dst_dir}/lib
)
IF(NOT PROTOBUF_FOUND)
set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/protobuf")
copy(protobuf_lib
SRCS ${PROTOBUF_INCLUDE_DIR} ${PROTOBUF_LITE_LIBRARY}
DSTS ${dst_dir} ${dst_dir}/lib
)
ENDIF(NOT PROTOBUF_FOUND)
# paddle fluid module
set(src_dir "${PADDLE_SOURCE_DIR}/paddle")
set(dst_dir "${CMAKE_INSTALL_PREFIX}/paddle")
set(module "framework")
copy(framework_lib DEPS framework_py_proto
SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/details/*.h ${PADDLE_BINARY_DIR}/paddle/framework/framework.pb.h
DSTS ${dst_dir}/${module} ${dst_dir}/${module}/details ${dst_dir}/${module}
)
set(module "memory")
copy(memory_lib
SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/detail/*.h
DSTS ${dst_dir}/${module} ${dst_dir}/${module}/detail
)
set(module "inference")
copy(inference_lib DEPENDS paddle_fluid_shared
SRCS ${src_dir}/${module}/*.h ${PADDLE_BINARY_DIR}/paddle/inference/libpaddle_fluid.so
DSTS ${dst_dir}/${module} ${dst_dir}/${module}
)
set(module "platform")
copy(platform_lib
SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/dynload/*.h ${src_dir}/${module}/details/*.h
DSTS ${dst_dir}/${module} ${dst_dir}/${module}/dynload ${dst_dir}/${module}/details
)
set(module "string")
copy(string_lib
SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/tinyformat/*.h
DSTS ${dst_dir}/${module} ${dst_dir}/${module}/tinyformat
)
add_custom_target(inference_lib_dist DEPENDS
inference_lib framework_lib memory_lib platform_lib string_lib
gflags_lib glog_lib protobuf_lib eigen3_lib)
......@@ -13,7 +13,7 @@ PaddlePaddle提供pip和Docker的安装方式:
pip_install_cn.rst
docker_install_cn.rst
../../howto/dev/build_cn.md
build_cn.md
编译流程
++++++++
......
......@@ -13,7 +13,7 @@ You can choose either pip or Docker to complete your install:
pip_install_en.rst
docker_install_en.rst
../../howto/dev/build_en.md
build_en.md
Build from Source
......
# C++ Data Feeding
In training with Paddle V2 API, data feeding wholly dependents on Python code. To get rid of the Python environment and achieve the goal of "wrapping the whole training by a while loop op" in Paddle Fluid, a C++ data feeding mechanism is required.
In this document we show the fundamental design of C++ data feeding process, which includes the data reading, shuffling and batching.
## Reader
A new concept named 'Reader' is introduced. `Reader` is a series of inherited classes which can be hold by our `Variable` and they are used to read or process file data.
### `ReaderBase`
`ReaderBase` is the abstract base class of all readers. It defines the all readers' interfaces.
```cpp
class ReaderBase {
public:
explicit ReaderBase(const std::vector<DDim>& shapes) : shapes_(shapes) {
PADDLE_ENFORCE(!shapes_.empty());
}
// Read the next batch of data. (A 'batch' can be only one instance)
virtual void ReadNext(std::vector<LoDTensor>* out) = 0;
// Show whether the next bacth exists.
virtual bool HasNext() const = 0;
// Reinitialize the reader and read the file from the begin.
virtual void ReInit() = 0;
// Get a certain read in data's shape.
DDim shape(size_t idx) const;
// Get shapes of all read in data.
std::vector<DDim> shapes() const { return shapes_; }
// Set shapes of read in data.
void set_shapes(const std::vector<DDim>& shapes) { shapes_ = shapes; }
virtual ~ReaderBase() {}
protected:
std::vector<DDim> shapes_;
};
```
### `FileReader` and `DecoratedReader`
These two classes are derived from the `ReaderBase` and will further be derived by respective specific readers. That is to say, in our design, there are two kinds of readers: file readers and decorated readers. A file reader reads from a file of some specific format, and yield only one instance of data at a time. e.g. RecordIO reader, jpg reader, .... A decorated reader takes another reader(both file reader and decorated reader are OK) as its 'underlying reader'. It gets data from its underlying reader, does some process on them(shuffling, or batching), then yields processed data. The output data of a decorated reader can be a single instance or a batch. `ShuffleReader` and `BatchReader` are both decorated readers.
All the readers share exactly the same interfaces defined in `ReaderBase`. So they can be decorated for more than one time: We can **shuffle** a reader's outputs and then **batch** the shuffle outputs. The interface consistency also allows related ops use readers without knowing what they are exactly.
### `ReaderHolder`
Different readers belong to different class types. It leads to a problem: How can we drop them into `Variable`s and fetch them out by a unified method? For example, if a Variable holds a `BatchReader`, we can not get it by the following code:
```cpp
var->Get<ReaderBase>("batch_reader");
```
we have to write:
```cpp
var->Get<BatchReader>("batch_reader");
```
This requires each time getting a reader from a variable we must know the reader's type exactly. It is nearly impossible.
To solve this problem, we introduce `ReaderHolder` as a wrapper. It acts as an empty decorator of `ReaderBase`, which erases reader's type. With `ReaderHolder` we are able to fetch all types of readers by `var->Get<ReaderHolder>("...")` and regard the obtained object as a reader.
## Related Operators
To create and invoke readers, some now ops are introduced:
### `CreateReaderOp`
Each reader has its creating op. File readers' creating ops have no input and yield the created file reader as its output. Decorated readers' creating ops take the underlying readers as inputs and then yield new decorated readers.
### `ReadOp`
A reader is only a Variable. It cannot trigger the reading process by itself. So we add the `ReadOp` to execute it. A `ReadOp` takes a reader Variable as its input. Each time it runs, it invokes the reader‘s `ReadNext()` function and gets a new batch of data(or only one instance of data, if we use file reader directly). The output data of a reader are in the form of `std::vector<LoDTenosr>`, so the `ReadOp` also needs to split the vector and move LoDTensors to their respective output Variables.
../../CONTRIBUTING.md
\ No newline at end of file
开发标准
========
.. toctree::
:maxdepth: 1
contribute_to_paddle_cn.md
write_docs_cn.rst
Development
------------
.. toctree::
:maxdepth: 1
new_layer_en.rst
contribute_to_paddle_en.md
write_docs_en.rst
##################
如何贡献/修改文档
##################
#############
如何贡献文档
#############
PaddlePaddle的文档包括英文文档 ``doc`` 和中文文档 ``doc_cn`` 两个部分。文档都是通过 `cmake`_ 驱动 `sphinx`_ 编译生成,生成后的文档分别存储在编译目录的 ``doc`` 和 ``doc_cn`` 两个子目录下。
也可以利用PaddlePaddle 工具来编译文档,这个情况下所有的文件会存在整理过的的文件目录 .ppo_workspace/content 下
......
##################
########################
Contribute Documentation
##################
########################
PaddlePaddle supports English documentation ``doc`` and Chinese documentation ``doc_cn``.
Both are compiled by `cmake`_ and `sphinx`_ , the compiled documentations will be stored under ``doc`` and ``doc_cn`` directories.
......
......@@ -4,7 +4,7 @@
PaddlePaddle是源于百度的一个深度学习平台。PaddlePaddle为深度学习研究人员提供了丰富的API,可以轻松地完成神经网络配置,模型训练等任务。
这里将介绍PaddlePaddle的基本使用概念,并且展示了如何利用PaddlePaddle来解决一个经典的线性回归问题。
在使用该文档之前,请参考 `安装文档 <../build_and_install/index_cn.html>`_ 完成PaddlePaddle的安装。
在使用该文档之前,请参考 `安装文档 <../../build_and_install/index_cn.html>`_ 完成PaddlePaddle的安装。
配置网络
......
新手入门
============
.. _quick_install:
快速安装
++++++++
PaddlePaddle支持使用pip快速安装,目前支持CentOS 6以上, Ubuntu 14.04以及MacOS 10.12,并安装有Python2.7。
执行下面的命令完成快速安装,版本为cpu_avx_openblas:
.. code-block:: bash
pip install paddlepaddle
如果需要安装支持GPU的版本(cuda7.5_cudnn5_avx_openblas),需要执行:
.. code-block:: bash
pip install paddlepaddle-gpu
更详细的安装和编译方法参考:
.. toctree::
:maxdepth: 1
build_and_install/index_cn.rst
.. _quick_start:
快速开始
++++++++
创建一个 housing.py 并粘贴此Python代码:
.. code-block:: python
import paddle.v2 as paddle
# Initialize PaddlePaddle.
paddle.init(use_gpu=False, trainer_count=1)
# Configure the neural network.
x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(13))
y_predict = paddle.layer.fc(input=x, size=1, act=paddle.activation.Linear())
# Infer using provided test data.
probs = paddle.infer(
output_layer=y_predict,
parameters=paddle.dataset.uci_housing.model(),
input=[item for item in paddle.dataset.uci_housing.test()()])
for i in xrange(len(probs)):
print 'Predicted price: ${:,.2f}'.format(probs[i][0] * 1000)
执行 :code:`python housing.py` 瞧! 它应该打印出预测住房数据的清单。
.. toctree::
:maxdepth: 1
quickstart_cn.rst
concepts/use_concepts_cn.rst
GET STARTED
============
.. _quick_install:
Quick Install
----------------------
You can use pip to install PaddlePaddle with a single command, supports
CentOS 6 above, Ubuntu 14.04 above or MacOS 10.12, with Python 2.7 installed.
Simply run the following command to install, the version is cpu_avx_openblas:
.. code-block:: bash
pip install paddlepaddle
If you need to install GPU version (cuda7.5_cudnn5_avx_openblas), run:
.. code-block:: bash
pip install paddlepaddle-gpu
For more details about installation and build:
.. toctree::
:maxdepth: 1
build_and_install/index_en.rst
.. _quick_start:
Quick Start
++++++++
Create a new file called housing.py, and paste this Python
code:
.. code-block:: python
import paddle.v2 as paddle
# Initialize PaddlePaddle.
paddle.init(use_gpu=False, trainer_count=1)
# Configure the neural network.
x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(13))
y_predict = paddle.layer.fc(input=x, size=1, act=paddle.activation.Linear())
# Infer using provided test data.
probs = paddle.infer(
output_layer=y_predict,
parameters=paddle.dataset.uci_housing.model(),
input=[item for item in paddle.dataset.uci_housing.test()()])
for i in xrange(len(probs)):
print 'Predicted price: ${:,.2f}'.format(probs[i][0] * 1000)
Run :code:`python housing.py` and voila! It should print out a list of predictions
for the test housing data.
quickstart_en.rst
快速开始
========
快速安装
--------
PaddlePaddle支持使用pip快速安装,目前支持CentOS 6以上, Ubuntu 14.04以及MacOS 10.12,并安装有Python2.7。
执行下面的命令完成快速安装,版本为cpu_avx_openblas:
.. code-block:: bash
pip install paddlepaddle
如果需要安装支持GPU的版本(cuda7.5_cudnn5_avx_openblas),需要执行:
.. code-block:: bash
pip install paddlepaddle-gpu
更详细的安装和编译方法参考::ref:`install_steps` 。
快速使用
--------
创建一个 housing.py 并粘贴此Python代码:
.. code-block:: python
import paddle.v2 as paddle
# Initialize PaddlePaddle.
paddle.init(use_gpu=False, trainer_count=1)
# Configure the neural network.
x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(13))
y_predict = paddle.layer.fc(input=x, size=1, act=paddle.activation.Linear())
# Infer using provided test data.
probs = paddle.infer(
output_layer=y_predict,
parameters=paddle.dataset.uci_housing.model(),
input=[item for item in paddle.dataset.uci_housing.test()()])
for i in xrange(len(probs)):
print 'Predicted price: ${:,.2f}'.format(probs[i][0] * 1000)
执行 :code:`python housing.py` 瞧! 它应该打印出预测住房数据的清单。
Quick Start
============
Quick Install
-------------
You can use pip to install PaddlePaddle with a single command, supports
CentOS 6 above, Ubuntu 14.04 above or MacOS 10.12, with Python 2.7 installed.
Simply run the following command to install, the version is cpu_avx_openblas:
.. code-block:: bash
pip install paddlepaddle
If you need to install GPU version (cuda7.5_cudnn5_avx_openblas), run:
.. code-block:: bash
pip install paddlepaddle-gpu
For more details about installation and build: :ref:`install_steps` .
Quick Use
---------
Create a new file called housing.py, and paste this Python
code:
.. code-block:: python
import paddle.v2 as paddle
# Initialize PaddlePaddle.
paddle.init(use_gpu=False, trainer_count=1)
# Configure the neural network.
x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(13))
y_predict = paddle.layer.fc(input=x, size=1, act=paddle.activation.Linear())
# Infer using provided test data.
probs = paddle.infer(
output_layer=y_predict,
parameters=paddle.dataset.uci_housing.model(),
input=[item for item in paddle.dataset.uci_housing.test()()])
for i in xrange(len(probs)):
print 'Predicted price: ${:,.2f}'.format(probs[i][0] * 1000)
Run :code:`python housing.py` and voila! It should print out a list of predictions
for the test housing data.
## 编译 PaddlePaddle 预测库
## 安装与编译C-API预测库
### 概述
......
PaddlePaddle C-API
C-API预测库
==================
.. toctree::
......
## C-API 使用流程
## C-API使用流程
这篇文档介绍 PaddlePaddle C-API 整体使用流程。
......
# 分布式训练
## 概述
本文将介绍如何使用PaddlePaddle在不同的集群框架下完成分布式训练。分布式训练架构如下图所示:
<img src="https://user-images.githubusercontent.com/13348433/31772175-5f419eca-b511-11e7-9db7-5231fe3d9ccb.png" width="500">
- 数据分片(Data shard): 用于训练神经网络的数据,被切分成多个部分,每个部分分别给每个trainer使用。
- 计算节点(Trainer): 每个trainer启动后读取切分好的一部分数据,开始神经网络的“前馈”和“后馈”计算,并和参数服务器通信。在完成一定量数据的训练后,上传计算得出的梯度(gradients),然后下载优化更新后的神经网络参数(parameters)。
- 参数服务器(Parameter server):每个参数服务器只保存整个神经网络所有参数的一部分。参数服务器接收从计算节点上传的梯度,并完成参数优化更新,再将更新后的参数下发到每个计算节点。
这样,通过计算节点和参数服务器的分布式协作,可以完成神经网络的SGD方法的训练。PaddlePaddle可以同时支持同步随机梯度下降(SGD)和异步随机梯度下降。
在使用同步SGD训练神经网络时,PaddlePaddle使用同步屏障(barrier),使梯度的提交和参数的更新按照顺序方式执行。在异步SGD中,则并不会等待所有trainer提交梯度才更新参数,这样极大地提高了计算的并行性:参数服务器之间不相互依赖,并行地接收梯度和更新参数,参数服务器也不会等待计算节点全部都提交梯度之后才开始下一步,计算节点之间也不会相互依赖,并行地执行模型的训练。可以看出,虽然异步SGD方式会提高参数更新并行度, 但是并不能保证参数同步更新,在任意时间某一台参数服务器上保存的参数可能比另一台要更新,与同步SGD相比,梯度会有噪声。
## 环境准备
1. 准备您的计算集群。计算集群通常由一组(几台到几千台规模)的Linux服务器组成。服务器之间可以通过局域网(LAN)联通,每台服务器具有集群中唯一的IP地址(或者可被DNS解析的主机名)。集群中的每台计算机通常被成为一个“节点”。
1. 我们需要在集群的所有节点上安装 PaddlePaddle。 如果要启用GPU,还需要在节点上安装对应的GPU驱动以及CUDA。PaddlePaddle的安装可以参考[build_and_install](http://www.paddlepaddle.org/docs/develop/documentation/zh/getstarted/build_and_install/index_cn.html)的多种安装方式。我们推荐使用[Docker](http://www.paddlepaddle.org/docs/develop/documentation/zh/getstarted/build_and_install/docker_install_cn.html)安装方式来快速安装PaddlePaddle。
安装完成之后,执行下面的命令可以查看已经安装的版本(docker安装方式可以进入docker容器执行:`docker run -it paddlepaddle/paddle:[tag] /bin/bash`):
```bash
$ paddle version
PaddlePaddle 0.10.0, compiled with
with_avx: ON
with_gpu: OFF
with_double: OFF
with_python: ON
with_rdma: OFF
with_timer: OFF
```
## 启动参数说明
下面以`doc/howto/usage/cluster/src/word2vec`中的代码作为实例,介绍使用PaddlePaddle v2 API完成分布式训练。
下面以`doc/howto/cluster/src/word2vec`中的代码作为实例,介绍使用PaddlePaddle v2 API完成分布式训练。
## 启动参数说明
### 启动参数服务器
执行以下的命令启动一个参数服务器并等待和计算节点的数据交互
```bash
......@@ -167,22 +133,3 @@ test.txt-00002
- `train_data_dir`:包含训练数据的目录,可以是从分布式存储挂载过来的,也可以是在任务启动前下载到本地的。
- `test_data_dir`:包含测试数据集的目录。
## 使用分布式计算平台或工具
PaddlePaddle可以使用多种分布式计算平台构建分布式计算任务,包括:
- [Kubernetes](http://kubernetes.io) Google开源的容器集群的调度框架,支持大规模集群生产环境的完整集群方案。
- [OpenMPI](https://www.open-mpi.org) 成熟的高性能并行计算框架。
- [Fabric](http://www.fabfile.org) 集群管理工具。可以使用`Fabric`编写集群任务提交和管理脚本。
对于不同的集群平台,会分别介绍集群作业的启动和停止方法。这些例子都可以在[cluster_train_v2](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/scripts/cluster_train_v2)找到。
在使用分布式计算平台进行训练时,任务被调度在集群中时,分布式计算平台通常会通过API或者环境变量提供任务运行需要的参数,比如节点的ID、IP和任务节点个数等。
## 在不同集群中运行
- [fabric集群](fabric_cn.md)
- [openmpi集群](openmpi_cn.md)
- [kubernetes单机](k8s_cn.md)
- [kubernetes distributed分布式](k8s_distributed_cn.md)
- [AWS上运行kubernetes集群训练](k8s_aws_cn.md)
# Distributed Training
## Introduction
In this article, we'll explain how to run distributed training jobs with PaddlePaddle on different types of clusters. The diagram below shows the main architecture of a distributed trainning job:
<img src="https://user-images.githubusercontent.com/13348433/31772146-41523d84-b511-11e7-8a12-a69fd136c283.png" width="500">
- Data shard: training data will be split into multiple partitions, trainers use the partitions of the whole dataset to do the training job.
- Trainer: each trainer reads the data shard, and train the neural network. Then the trainer will upload calculated "gradients" to parameter servers, and wait for parameters to be optimized on the parameter server side. When that finishes, the trainer download optimized parameters and continues its training.
- Parameter server: every parameter server stores part of the whole neural network model data. They will do optimization calculations when gradients are uploaded from trainers, and then send updated parameters to trainers.
PaddlePaddle can support both synchronize stochastic gradient descent (SGD) and asynchronous SGD.
When training with synchronize SGD, PaddlePaddle uses an internal "synchronize barrier" which makes gradients update and parameter download in strict order. On the other hand, asynchronous SGD won't wait for all trainers to finish upload at a single step, this will increase the parallelism of distributed training: parameter servers do not depend on each other, they'll do parameter optimization concurrently. Parameter servers will not wait for trainers, so trainers will also do their work concurrently. But asynchronous SGD will introduce more randomness and noises in the gradient.
## Preparations
1. Prepare your computer cluster. It's normally a bunch of Linux servers connected by LAN. Each server will be assigned a unique IP address. The computers in the cluster can be called "nodes".
2. Install PaddlePaddle on every node. If you are going to take advantage of GPU cards, you'll also need to install proper driver and CUDA libraries. To install PaddlePaddle please read [this build and install](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/build_and_install/index_en.html) document. We strongly recommend using [Docker installation](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/build_and_install/docker_install_en.html).
After installation, you can check the version by typing the below command (run a docker container if using docker: `docker run -it paddlepaddle/paddle:[tag] /bin/bash`):
```bash
$ paddle version
PaddlePaddle 0.10.0rc, compiled with
with_avx: ON
with_gpu: OFF
with_double: OFF
with_python: ON
with_rdma: OFF
with_timer: OFF
```
We'll take `doc/howto/usage/cluster/src/word2vec` as an example to introduce distributed training using PaddlePaddle v2 API.
## Command-line arguments
We'll take `doc/howto/cluster/src/word2vec` as an example to introduce distributed training using PaddlePaddle v2 API.
### Starting parameter server
Type the below command to start a parameter server which will wait for trainers to connect:
......@@ -171,21 +138,3 @@ Your workspace may looks like:
- `train_data_dir`: containing training data. Mount from storage service or copy trainning data to here.
- `test_data_dir`: containing testing data.
## Use cluster platforms or cluster management tools
PaddlePaddle supports running jobs on several platforms including:
- [Kubernetes](http://kubernetes.io) open-source system for automating deployment, scaling, and management of containerized applications from Google.
- [OpenMPI](https://www.open-mpi.org) Mature high performance parallel computing framework.
- [Fabric](http://www.fabfile.org) A cluster management tool. Write scripts to submit jobs or manage the cluster.
We'll introduce cluster job management on these platforms. The examples can be found under [cluster_train_v2](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/scripts/cluster_train_v2).
These cluster platforms provide API or environment variables for training processes, when the job is dispatched to different nodes. Like node ID, IP or total number of nodes etc.
## Use different clusters
- [fabric](fabric_en.md)
- [openmpi](openmpi_en.md)
- [kubernetes](k8s_en.md)
- [kubernetes on AWS](k8s_aws_en.md)
分布式训练
==========
本节将介绍如何使用PaddlePaddle在不同的集群框架下完成分布式训练。分布式训练架构如下图所示:
.. image:: src/ps_cn.png
:width: 500
- 数据分片(Data shard): 用于训练神经网络的数据,被切分成多个部分,每个部分分别给每个trainer使用。
- 计算节点(Trainer): 每个trainer启动后读取切分好的一部分数据,开始神经网络的“前馈”和“后馈”计算,并和参数服务器通信。在完成一定量数据的训练后,上传计算得出的梯度(gradients),然后下载优化更新后的神经网络参数(parameters)。
- 参数服务器(Parameter server):每个参数服务器只保存整个神经网络所有参数的一部分。参数服务器接收从计算节点上传的梯度,并完成参数优化更新,再将更新后的参数下发到每个计算节点。
这样,通过计算节点和参数服务器的分布式协作,可以完成神经网络的SGD方法的训练。PaddlePaddle可以同时支持同步随机梯度下降(SGD)和异步随机梯度下降。
在使用同步SGD训练神经网络时,PaddlePaddle使用同步屏障(barrier),使梯度的提交和参数的更新按照顺序方式执行。在异步SGD中,则并不会等待所有trainer提交梯度才更新参数,这样极大地提高了计算的并行性:参数服务器之间不相互依赖,并行地接收梯度和更新参数,参数服务器也不会等待计算节点全部都提交梯度之后才开始下一步,计算节点之间也不会相互依赖,并行地执行模型的训练。可以看出,虽然异步SGD方式会提高参数更新并行度, 但是并不能保证参数同步更新,在任意时间某一台参数服务器上保存的参数可能比另一台要更新,与同步SGD相比,梯度会有噪声。
.. toctree::
:maxdepth: 1
preparations_cn.md
cmd_argument_cn.md
multi_cluster/index_cn.rst
Distributed Training
====================
In this section, we'll explain how to run distributed training jobs with PaddlePaddle on different types of clusters. The diagram below shows the main architecture of a distributed trainning job:
.. image:: src/ps_en.png
:width: 500
- Data shard: training data will be split into multiple partitions, trainers use the partitions of the whole dataset to do the training job.
- Trainer: each trainer reads the data shard, and train the neural network. Then the trainer will upload calculated "gradients" to parameter servers, and wait for parameters to be optimized on the parameter server side. When that finishes, the trainer download optimized parameters and continues its training.
- Parameter server: every parameter server stores part of the whole neural network model data. They will do optimization calculations when gradients are uploaded from trainers, and then send updated parameters to trainers.
PaddlePaddle can support both synchronize stochastic gradient descent (SGD) and asynchronous SGD.
When training with synchronize SGD, PaddlePaddle uses an internal "synchronize barrier" which makes gradients update and parameter download in strict order. On the other hand, asynchronous SGD won't wait for all trainers to finish upload at a single step, this will increase the parallelism of distributed training: parameter servers do not depend on each other, they'll do parameter optimization concurrently. Parameter servers will not wait for trainers, so trainers will also do their work concurrently. But asynchronous SGD will introduce more randomness and noises in the gradient.
.. toctree::
:maxdepth: 1
preparations_en.md
cmd_argument_en.md
multi_cluster/index_en.rst
在不同集群中运行
================
PaddlePaddle可以使用多种分布式计算平台构建分布式计算任务,包括:
- `Kubernetes <http://kubernetes.io>`_ Google开源的容器集群的调度框架,支持大规模集群生产环境的完整集群方案。
- `OpenMPI <https://www.open-mpi.org>`_ 成熟的高性能并行计算框架。
- `Fabric <http://www.fabfile.org>`_ 集群管理工具。可以使用`Fabric`编写集群任务提交和管理脚本。
对于不同的集群平台,会分别介绍集群作业的启动和停止方法。这些例子都可以在 `cluster_train_v2 <https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/scripts/cluster_train_v2>`_ 找到。
在使用分布式计算平台进行训练时,任务被调度在集群中时,分布式计算平台通常会通过API或者环境变量提供任务运行需要的参数,比如节点的ID、IP和任务节点个数等。
.. toctree::
:maxdepth: 1
fabric_cn.md
openmpi_cn.md
k8s_cn.md
k8s_distributed_cn.md
k8s_aws_cn.md
Use different clusters
======================
PaddlePaddle supports running jobs on several platforms including:
- `Kubernetes <http://kubernetes.io>`_ open-source system for automating deployment, scaling, and management of containerized applications from Google.
- `OpenMPI <https://www.open-mpi.org>`_ Mature high performance parallel computing framework.
- `Fabric <http://www.fabfile.org>`_ A cluster management tool. Write scripts to submit jobs or manage the cluster.
We'll introduce cluster job management on these platforms. The examples can be found under `cluster_train_v2 <https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/scripts/cluster_train_v2>`_ .
These cluster platforms provide API or environment variables for training processes, when the job is dispatched to different nodes. Like node ID, IP or total number of nodes etc.
.. toctree::
:maxdepth: 1
fabric_en.md
openmpi_en.md
k8s_en.md
k8s_aws_en.md
## 环境准备
1. 准备您的计算集群。计算集群通常由一组(几台到几千台规模)的Linux服务器组成。服务器之间可以通过局域网(LAN)联通,每台服务器具有集群中唯一的IP地址(或者可被DNS解析的主机名)。集群中的每台计算机通常被成为一个“节点”。
1. 我们需要在集群的所有节点上安装 PaddlePaddle。 如果要启用GPU,还需要在节点上安装对应的GPU驱动以及CUDA。PaddlePaddle的安装可以参考[build_and_install](http://www.paddlepaddle.org/docs/develop/documentation/zh/getstarted/build_and_install/index_cn.html)的多种安装方式。我们推荐使用[Docker](http://www.paddlepaddle.org/docs/develop/documentation/zh/getstarted/build_and_install/docker_install_cn.html)安装方式来快速安装PaddlePaddle。
安装完成之后,执行下面的命令可以查看已经安装的版本(docker安装方式可以进入docker容器执行:`docker run -it paddlepaddle/paddle:[tag] /bin/bash`):
```bash
$ paddle version
PaddlePaddle 0.10.0, compiled with
with_avx: ON
with_gpu: OFF
with_double: OFF
with_python: ON
with_rdma: OFF
with_timer: OFF
```
## Preparations
1. Prepare your computer cluster. It's normally a bunch of Linux servers connected by LAN. Each server will be assigned a unique IP address. The computers in the cluster can be called "nodes".
2. Install PaddlePaddle on every node. If you are going to take advantage of GPU cards, you'll also need to install proper driver and CUDA libraries. To install PaddlePaddle please read [this build and install](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/build_and_install/index_en.html) document. We strongly recommend using [Docker installation](http://www.paddlepaddle.org/docs/develop/documentation/en/getstarted/build_and_install/docker_install_en.html).
After installation, you can check the version by typing the below command (run a docker container if using docker: `docker run -it paddlepaddle/paddle:[tag] /bin/bash`):
```bash
$ paddle version
PaddlePaddle 0.10.0rc, compiled with
with_avx: ON
with_gpu: OFF
with_double: OFF
with_python: ON
with_rdma: OFF
with_timer: OFF
```
.. _cmd_line_index:
设置命令行参数
命令行参数设置
===============
.. toctree::
......
../../../CONTRIBUTING.md
\ No newline at end of file
进阶指南
进阶使用
========
使用说明
--------
.. toctree::
:maxdepth: 1
usage/cmd_parameter/index_cn.rst
usage/cluster/cluster_train_cn.md
usage/capi/index_cn.rst
开发标准
--------
.. toctree::
:maxdepth: 1
dev/contribute_to_paddle_cn.md
dev/write_docs_cn.rst
模型配置
--------
.. toctree::
:maxdepth: 1
deep_model/rnn/index_cn.rst
性能优化
--------
.. toctree::
:maxdepth: 1
cmd_parameter/index_cn.rst
cluster/index_cn.rst
capi/index_cn.rst
rnn/index_cn.rst
optimization/gpu_profiling_cn.rst
HOW TO
=======
Usage
-------
.. toctree::
:maxdepth: 1
usage/cmd_parameter/index_en.rst
usage/cluster/cluster_train_en.md
Development
------------
.. toctree::
:maxdepth: 1
dev/new_layer_en.rst
dev/contribute_to_paddle_en.md
dev/write_docs_en.rst
Configuration
-------------
.. toctree::
:maxdepth: 1
deep_model/rnn/index_en.rst
Optimization
-------------
.. toctree::
:maxdepth: 1
cmd_parameter/index_en.rst
cluster/index_en.rst
rnn/index_en.rst
optimization/gpu_profiling_en.rst
==================
GPU性能分析与调优
==================
============
GPU性能调优
============
.. contents::
......
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册