Commit b1a35bfb authored by _青葱

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into fluid_api

Merge branch develop
@@ -53,8 +53,7 @@ option(WITH_COVERAGE "Compile PaddlePaddle with code coverage" OFF)
option(COVERALLS_UPLOAD "Package code coverage data to coveralls" OFF)
option(ON_TRAVIS "Exclude special unit test on Travis CI" OFF)
option(WITH_C_API "Compile PaddlePaddle with C-API(Prediction)" OFF)
option(WITH_FLUID_ONLY "Compile PaddlePaddle fluid only" OFF)
option(WITH_GOLANG "Compile PaddlePaddle with GOLANG" OFF)
option(GLIDE_INSTALL "Download and install go dependencies " ON)
option(USE_NNPACK "Compile PaddlePaddle with NNPACK library" OFF)
@@ -109,7 +108,7 @@ if (WITH_C_API AND WITH_PYTHON)
endif()
if (WITH_C_API)
set(WITH_FLUID_ONLY OFF CACHE STRING "Disable install fluid when compile the C_API" FORCE)
endif()
if(MOBILE_INFERENCE)
@@ -147,6 +146,7 @@ include(external/cares)
include(external/grpc)
include(external/snappy) # download snappy
include(external/snappystream)
include(external/threadpool)
include(cudnn) # set cudnn libraries, must before configure
include(cupti)
...
INCLUDE(ExternalProject)
SET(THREADPOOL_SOURCE_DIR ${THIRD_PARTY_PATH}/threadpool)
SET(THREADPOOL_INCLUDE_DIR ${THREADPOOL_SOURCE_DIR}/src/extern_threadpool)
INCLUDE_DIRECTORIES(${THREADPOOL_INCLUDE_DIR})
ExternalProject_Add(
extern_threadpool
${EXTERNAL_PROJECT_LOG_ARGS}
GIT_REPOSITORY "https://github.com/progschj/ThreadPool.git"
GIT_TAG 9a42ec1329f259a5f4881a291db1dcb8f2ad9040
PREFIX ${THREADPOOL_SOURCE_DIR}
UPDATE_COMMAND ""
CONFIGURE_COMMAND ""
BUILD_COMMAND ""
INSTALL_COMMAND ""
TEST_COMMAND ""
)
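# add_dependencies() cannot be applied to INTERFACE libraries before
# CMake 3.3, so fall back to a dummy static library on older CMake.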
if (${CMAKE_VERSION} VERSION_LESS "3.3.0")
set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/threadpool_dummy.c)
file(WRITE ${dummyfile} "const char *dummy_threadpool = \"${dummyfile}\";")
add_library(simple_threadpool STATIC ${dummyfile})
else()
add_library(simple_threadpool INTERFACE)
endif()
add_dependencies(simple_threadpool extern_threadpool)
LIST(APPEND external_project_dependencies simple_threadpool)
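For reference, the vendored [progschj/ThreadPool](https://github.com/progschj/ThreadPool) is a single-header library; a minimal usage sketch (assuming `ThreadPool.h` is on the include path) looks like this:
```c++
#include <iostream>
#include "ThreadPool.h"

int main() {
  ThreadPool pool(4);  // spawn four worker threads
  // enqueue() schedules a callable and returns a std::future for its result.
  auto result = pool.enqueue([](int x) { return 2 * x; }, 21);
  std::cout << result.get() << std::endl;  // prints 42
  return 0;
}
```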
digraph G {
subgraph cluster_init {
label="Initialization"
startup_program [label="startup", shape=box]
node_w_g0 [label="W\nGPU0"]
startup_program -> node_w_g0 [label="Initialize"]
node_w_g1 [label="W\nGPU1"]
node_w_g0 -> node_w_g1 [label="broadcast"]
}
subgraph cluster_train {
label="forward_backward"
subgraph cluster_gpu0 {
label="GPU0"
fc_0 [label="fc\nGPU0", shape=box]
hidden_0 [label="hidden\nGPU0"]
node_w_g0 -> fc_0
fc_0 -> hidden_0
loss0 [label="loss\nGPU0"]
hidden_0 -> loss0 [label="many ops omitted"]
scale_loss_0 [label="scale_loss_gradient\nGPU0", shape=box]
loss_g0 [label="loss_grad\nGPU0"]
scale_loss_0->loss_g0
fc_g_0 [label="w_grad\nGPU0", shape=box]
loss0 -> fc_g_0
loss_g0 -> fc_g_0
hidden_0 -> fc_g_0
}
subgraph cluster_gpu1 {
label="GPU1"
fc_1 [label="fc\nGPU1", shape=box]
hidden_1 [label="hidden\nGPU1"]
node_w_g1 -> fc_1
fc_1 -> hidden_1
loss1 [label="loss\nGPU1"]
hidden_1 -> loss1 [label="many ops omitted"]
scale_loss_1 [label="scale_loss_gradient\nGPU1", shape=box]
loss_g1 [label="loss_grad\nGPU1"]
scale_loss_1->loss_g1
fc_g_1 [label="w_grad\nGPU1", shape=box]
loss1 -> fc_g_1
loss_g1 -> fc_g_1
hidden_1 -> fc_g_1
}
}
all_reduce_w [label="Merge Gradients(AllReduce)", shape=box]
fc_g_0 -> all_reduce_w
fc_g_1 -> all_reduce_w
fc_g_0_merged [label="w_grad\nMerged\nGPU0"]
fc_g_1_merged [label="w_grad\nMerged\nGPU1"]
all_reduce_w -> fc_g_0_merged
all_reduce_w -> fc_g_1_merged
subgraph cluster_optimization {
label="Optimization"
subgraph cluster_opt_gpu0 {
label="GPU0"
sgd_0 [label="SGD Op\nGPU0", shape=box]
fc_g_0_merged -> sgd_0
node_w_g0 -> sgd_0
optimized_w_0 [label="Optimized W\nGPU0"]
sgd_0 -> optimized_w_0
}
subgraph cluster_opt_gpu1 {
label="GPU1"
sgd_1 [label="SGD Op\nGPU1", shape=box]
fc_g_1_merged -> sgd_1
node_w_g1 -> sgd_1
optimized_w_1 [label="Optimized W\nGPU1"]
sgd_1 -> optimized_w_1
}
}
}
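The Graphviz source above is the diagram for the overview figure (`images/parallel_executor_overview.png`) referenced in the design doc below; with Graphviz installed, it can be rendered via `dot -Tpng`.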
# ParallelExecutor
## Background
Neural network models are defined as a `ProgramDesc` in Fluid. A `ProgramDesc` can be executed by an interpreter (i.e., the `executor` concept in Fluid). The instructions or operators in a `Program` are executed, and the results are fetched on the Python side.
The executor is a very naive interpreter: it runs operators one by one. We could use `Parallel.Do` to support data parallelism; however, since `ProgramDesc` lacks device information, it is not possible to optimize the performance of `Parallel.Do`.
We want a `ProgramDesc` that can run on different nodes, so it is better not to embed device information in `ProgramDesc`. Instead, we can write a high-performance interpreter that holds an alternative intermediate representation of `ProgramDesc` and makes full use of multiple GPUs.
ParallelExecutor is an interpreter of `ProgramDesc` which will [out-of-order execute](https://en.wikipedia.org/wiki/Out-of-order_execution) the `Program` in data-parallel mode and maximize the utilization of multiple GPUs.
## Overview of multi-GPU logic
The ParallelExecutor takes the startup program and the main program as inputs. The parameters are initialized on `GPU0` by the startup program and then broadcast to the other GPUs. The main program is duplicated onto each GPU. The gradients are merged during each iteration, and each device optimizes its parameters independently. Since the gradients on each device are merged before parameter optimization, the parameters stay identical on every device and need not be broadcast again.
![alt](images/parallel_executor_overview.png)
There are several optimizations in this logic.
1. We use an alternate representation in ParallelExecutor, because device information is critical for performance optimization.
2. The execution is out-of-order, i.e., an operator is executed whenever its inputs are ready.
   * A GPU is a high-performance device; a single CPU thread cannot keep one GPU fully occupied, so a thread pool is used to execute operators.
   * Out-of-order execution also helps transpilers generate `ProgramDesc`: a transpiler does not need to worry about the best execution order for performance.
3. Computation, gradient merging, and data fetching use different streams.
The performance of `ResNeXt152` on a `TitanX` with `batch_size=12` is shown below.
| Number of GPUs | 1 | 2 | 3 | 4|
| --- | --- | --- | --- | --- |
| Image/Sec | 17.9906 | 25.771 | 36.911 | 48.8428 |
| Speed Up | N/A | 1.43247029 | 2.05168255 | 2.71490667 |
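For example, the 2-GPU speedup in the table is 25.771 / 17.9906 ≈ 1.43, i.e., throughput relative to a single GPU.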
## Static single assignment Graph
[Static single assignment form](https://en.wikipedia.org/wiki/Static_single_assignment_form) (`SSA` for short) is a common form for compiler optimization. To implement concurrent execution, we use an `SSA` graph as an intermediate representation of `ProgramDesc`.
The `Program` is a directed acyclic graph of operators, but it is not in `SSA` form since a variable can be assigned multiple times. We enforce single assignment by adding a version number to each variable, and parse the `Program` into an `SSA` graph. ParallelExecutor also duplicates the `Program` onto multiple devices, so we additionally attach a device ID to each variable and insert `NCCLAllReduce` operators into the graph.
The data structure of `SSA` graph is:
```c++
struct VarHandleBase {
  OpHandleBase* generated_op_;
  vector<OpHandleBase*> pending_ops_;
  string name;
  Place place;
  size_t version;
};
struct OpHandleBase {
  vector<VarHandleBase*> inputs_;
  vector<VarHandleBase*> outputs_;
};
struct SSAGraph {
  // vars on each device:
  // * each map in the vector holds the vars of one device.
  // * a map maps a variable name to the handles of that
  //   variable's different versions.
  vector<std::unordered_map<string, vector<VarHandleBase>>> vars_;
  // All ops
  vector<OpHandleBase> ops_;
};
```
The variable handles are wrappers of `Variable`s, and the operator handles are wrappers of `OperatorBase`. Some op handles are not backed by an `OperatorBase`; for example, `NCCLAllReduceOpHandle` is not, because it uses extra device contexts.
When the `ProgramDesc` is converted into an `SSA` graph, the [data hazard](https://en.wikipedia.org/wiki/Hazard_(computer_architecture)) problem also needs to be taken care of. Dummy variables, which represent dependencies between operators, are inserted into the SSA graph to resolve it.
## Execute SSA Graph
The SSA graph can be executed out of order by an approximate [topological sorting](https://en.wikipedia.org/wiki/Topological_sorting) algorithm. The algorithm is:
1. Maintain a map from each operator to the number of its inputs that are not yet ready.
2. For every variable that is not generated by an operator, i.e., `var.generated_op == nullptr`, decrease the pending-input count of each of its pending operators.
3. Whenever an operator's pending-input count drops to zero, run it.
4. After running an operator, mark its output variables as generated and repeat step 2 until all variables are generated.
Operators can be run asynchronously: a thread pool executes the `SSA` graph, as sketched below.
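A minimal single-threaded sketch of this scheduling loop, written against the simplified handle types above (the `Run()` method and the inline execution are assumptions; the real executor dispatches ready operators to the thread pool):
```c++
#include <queue>
#include <unordered_map>

// Run() is assumed on OpHandleBase for this sketch.
void Execute(SSAGraph *graph) {
  std::unordered_map<OpHandleBase *, size_t> pending;  // op -> inputs not ready
  std::queue<OpHandleBase *> ready;

  // Step 1: record how many inputs each operator is waiting for.
  for (auto &op : graph->ops_) {
    pending[&op] = op.inputs_.size();
    if (op.inputs_.empty()) ready.push(&op);
  }
  // Step 2: variables generated by no operator are ready from the start.
  for (auto &device_vars : graph->vars_) {
    for (auto &versions : device_vars) {
      for (auto &var : versions.second) {
        if (var.generated_op_ != nullptr) continue;
        for (auto *op : var.pending_ops_) {
          if (--pending[op] == 0) ready.push(op);
        }
      }
    }
  }
  // Steps 3 & 4: run ready operators and mark their outputs as generated.
  while (!ready.empty()) {
    auto *op = ready.front();
    ready.pop();
    op->Run();  // the real executor enqueues this into the thread pool
    for (auto *out : op->outputs_) {
      for (auto *next : out->pending_ops_) {
        if (--pending[next] == 0) ready.push(next);
      }
    }
  }
}
```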
## Synchronize GPU Kernels
The GPU is a non-blocking device. Different streams need to be synchronized when switching between them. In the current implementation, the synchronization is based on the following algorithm:
1. Each `OpHandle` records the `DeviceContext` that it uses.
2. In `OpHandle::Run`, if the `DeviceContext` of the current operator differs from the `DeviceContext` of any input variable, wait for the operator that generated that input variable.
The `wait` is implemented by one of two strategies:
1. Invoke `DeviceContext->Wait()`, which waits until all operators on that device context have completed.
2. Use `cudaStreamWaitEvent` to send an event to the stream. This is a non-blocking call; the waiting is performed on the GPU.
Generally, `cudaStreamWaitEvent` has better performance. However, the `DeviceContext->Wait()` strategy is easier to debug. The strategy can be changed at runtime.
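A hedged sketch of the second strategy, assuming `producer_stream` and `consumer_stream` are existing `cudaStream_t` handles (the real implementation wraps this in the device-context machinery):
```c++
// Producer side: record an event marking the point in producer_stream
// that must complete before dependent work may start.
cudaEvent_t event;
cudaEventCreateWithFlags(&event, cudaEventDisableTiming);
cudaEventRecord(event, producer_stream);

// Consumer side: work submitted to consumer_stream after this call waits
// on the GPU for the event; the CPU is not blocked.
cudaStreamWaitEvent(consumer_stream, event, 0);
```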
## What's next?
* Merging gradients of dense parameters has been implemented; merging gradients of sparse parameters has not.
* The CPU version of ParallelExecutor has not been implemented. The out-of-order logic would make CPU computation faster, too.
* A better strategy for merging gradients can be introduced. We could shrink the gradients from `float32` to `int8` or `int4` while merging. This would significantly speed up multi-GPU training without much loss of precision.
* Combine with the multi-node implementation. Thanks to out-of-order execution, send and recv can be blocking operators, and the transpiler does not need to worry about their best placement.
Gradient Update Algorithm
-------------------------

.. toctree::
   :maxdepth: 1

   parameter_average.md
@@ -2,7 +2,7 @@ A few months ago when we were trying to replace CMake with Bazel, @emailweixu su
Here are some initial thoughts. Your comments are welcome!
# Required CMake Function
I think we need only the following few CMake functions to make a project description mean and clean:
@@ -25,7 +25,7 @@ Also,
- to describe external dependencies, we need `external_library`.
- to build shared libraries, we need `shared_library`.
## An Example Project
Suppose that we have the aforementioned functions defined in our `/cmake` directory. The following example `CMakeLists.txt` describes a project including the following source files:
@@ -102,11 +102,11 @@ shared_library(api
```
## Implementation
As the above example CMakeLists.txt executes, each function invocation adds "nodes" to a dependency graph. It also uses this graph to generate CMake commands including `add_executable`, `add_dependencies`, `target_link_libraries`, and `add_test`.
## Using Package Manager For Go
Building Go binaries and libraries need to satisfy their dependencies, generally
we can do `go get ./...` to download and compile all external dependencies. The
@@ -122,7 +122,7 @@ problems are:
at many cloud file hosting services, so users who want to compile paddle by themselves can
download this "vendor" package from a mirror site.
### Choose A Suitable Tool
As mentioned by @wangkuiyi, [here](https://github.com/golang/go/wiki/PackageManagementTools)
is a list of dozens of Go package managers. We choose the tool using the following principles:
@@ -140,7 +140,7 @@ management tool has been started at: https://github.com/golang/dep to resolve
such problems, but it's currently at Alpha stage. So the best choice now is
glide obviously.
### Manage Go Packages
- Dependencies: `go/glide.yaml` will store the dependencies and their versions which
are directly imported by paddle. `go/glide.lock` will store all dependencies recursively
...
@@ -113,7 +113,7 @@ To solve this problem, we introduce `ReaderHolder` as a wrapper. It acts as an e
To create and invoke readers, some new ops are introduced:
### Operators That Create Readers
Each reader has its creation op. File readers' creation ops have no input and yield the created file reader as their output. Decorated readers' creation ops take the underlying readers as inputs and then yield new decorated readers.
@@ -153,19 +153,52 @@ double_buffer_reader = create_double_buffer_op(batch_reader)
The forwarding ops of the corresponding `main_program` would be like this:
```
not_completed = true
pass_count = 0
while_op(not_completed) {
    has_next = has_next_op(double_buffer_reader)
    if_else_op(has_next) {
        batch_data = read_op(double_buffer_reader)
        ... (subsequent training ops)
    } else {
        reset_op(double_buffer_reader)
        increase_op(pass_count)
        not_completed = less_than_op(pass_count, required_pass_num)
    }
}
```
A few important considerations for these programs are as follows:
1. `not_completed`, `pass_count` and other variables shown above are all Fluid Variables.
2. The multiple\_reader is the batch\_reader's underlying reader, and the batch\_reader is the double\_buffer\_reader's underlying reader. `read_op`, `has_next_op` and other reader related ops will only invoke the top-most reader. In this case, it's the double\_buffer\_reader.
3. All readers exist in both `startup_program` and `main_program`. And they are persistable.
### Simplify Configuration by MultiPassReader
The Program configuration mentioned above is complicated. Users need to be very familiar with the concepts of Program and Block to avoid making mistakes in their code. To make the usage of C++ readers friendlier to new users, we introduce `MultiPassReader`.
`MultiPassReader` is a decorated reader. A multi-pass reader is used to continuously yield data for several training passes. It takes the number of passes to run as one of its attributes (`pass_num`) and maintains a counter to record how many passes it has completed. Each time its underlying reader reaches EOF, the multi-pass reader checks whether it has completed the given number of passes. If not, the underlying reader is re-initialized and starts a new pass automatically. Before the whole training completes, the return value of the MultiPassReader's `HasNext()` is always `true`.
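The pass-counting logic described above can be sketched as follows (a simplified illustration rather than the actual Fluid class; the `Reader` interface and member names are assumptions):
```c++
#include <cstddef>

struct Reader {  // minimal reader interface assumed for this sketch
  virtual bool HasNext() = 0;
  virtual void ReInit() = 0;
  virtual ~Reader() = default;
};

class MultiPassReader : public Reader {
 public:
  MultiPassReader(Reader* underlying, size_t pass_num)
      : reader_(underlying), pass_num_(pass_num), pass_count_(0) {}

  bool HasNext() override {
    if (reader_->HasNext()) return true;
    // The underlying reader reached EOF, so one pass has finished.
    if (++pass_count_ < pass_num_) {
      reader_->ReInit();  // re-initialize and start the next pass
      return true;
    }
    return false;  // the given number of passes is complete
  }

  void ReInit() override {
    pass_count_ = 0;
    reader_->ReInit();
  }

 private:
  Reader* reader_;     // the underlying reader
  size_t pass_num_;    // the 'pass_num' attribute
  size_t pass_count_;  // passes completed so far
};
```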
With `MultiPassReader`, the startup program would be like this:
```
multiple_reader = open_files_op(...)
batch_reader = create_batch_reader_op(multiple_reader)
multi_pass_reader = create_multi_pass_reader_op(batch_reader)
double_buffer_reader = create_double_buffer_op(multi_pass_reader)
... (other initializers)
```
The forwarding part of the corresponding `main_program` would be like this:
```
not_completed = true
while_op(not_completed) {
    batch_data = read_op(double_buffer_reader)
    ... (subsequent training ops)
    not_completed = has_next_op(double_buffer_reader)
}
```
Core Concepts
-------------

.. toctree::
   :maxdepth: 1

   README.md
   cpp_data_feeding.md
   functions_operators_layers.md
   program.md
   variable.md
   var_desc.md
   tensor.md
   tensor_array.md
   lod_tensor.md
   block.md
   scope.md
   executor.md
@@ -30,7 +30,7 @@ Scope is an association of a name to variable. All variables belong to `Scope`.
A variable cannot belong to many scopes. If you want to use variables from a parent scope, you can use `parent scope`.
1. Scope should destruct all Variables inside it when it is itself destructed. Users can never store a `Variable` pointer somewhere else.
Because a Variable can only be obtained from a Scope, when destroying a Scope we also need to destroy all the Variables in it. If a user stores a `Variable` pointer in a private data member or some global variable, the pointer will become invalid when the associated `Scope` is destroyed.
@@ -78,7 +78,7 @@ In `Scope` class, there is a private data member called `parent_`. `parent_` is
A local scope is very useful when we implement a Recurrent Neural Network. Each timestep of an RNN should be a `Net`. Each `Net` of a timestep (`StepNet` for short) should use an independent local scope, just like variables in a while loop are inside a local scope in programming languages. By using a single `StepNet` and changing the local scope, we can implement an RNN easily.
## Interface Design
```cpp
class Variable {
...
# Design Doc: Var_desc
## Background
PaddlePaddle divides the description of neural network computation into two stages: compile time and runtime. At compile time, the neural network computation is described as a `ProgramDesc` whereas at runtime an `Executor` interprets the `ProgramDesc` to compute the operations.
...
Concurrent Programming
----------------------

.. toctree::
   :maxdepth: 1

   concurrent_programming.md
   parallel_do.md
Data Type
---------

.. toctree::
   :maxdepth: 1

   float16.md
# Design Doc: Distributed Lookup Table Operator
A lookup table operator in PaddlePaddle where the table could be out
of the memory of a computer.
...
Distributed Training
--------------------

.. toctree::
   :maxdepth: 1

   distributed_architecture.md
   distributed_lookup_table_design.md
   parameter_server.md
Dynamic RNN
-----------

.. toctree::
   :maxdepth: 1

   rnn.md
   rnn_design.md
@@ -99,7 +99,7 @@ private:
- Since propagation is implemented by copying a `shared_ptr`, the framework only needs to pass `lod_start_pos` once.
2. It is fully transparent to ops that are unaware of `lod_start_pos`.
3. Producer ops that need to modify `lod_start_pos` can update their own `lod_start_pos` data at `Run` time.
The detailed design is split into the following three subsections.
@@ -189,7 +189,7 @@ struct SortedSeqItem {
std::vector<SortedSeqItem> sorted_seqs;
```
to track the positions of sequences after sorting, and add a new interface:
```c++
std::vector<SortedSeqItem> SortBySeqLen(const LODTensor& tensor);
@@ -233,7 +233,10 @@ x x
- Concatenate each sequence into a regular mini-batch representation.
## References
[Tensorflow Bucketing](https://www.tensorflow.org/versions/r0.12/api_docs/python/contrib.training/bucketing)
[mxnet Bucketing](http://mxnet.io/how_to/bucketing.html)
[variable length input in RNN scenario](https://discuss.pytorch.org/t/about-the-variable-length-input-in-rnn-scenario/345/5)
[Level of details](https://en.wikipedia.org/wiki/Level_of_detail)
Execution Process
-----------------

.. toctree::
   :maxdepth: 1

   switch.md
   if_else_op.md
# Design Doc: Switch
## Background
Many programming languages provide `switch` as a generalization of `if-elif-else`. We want to add it to Fluid.
@@ -19,7 +19,7 @@ with switch() as switch:
fluid.print("Case 3")
```
## The Semantics
1. A `switch` control-flow checks cases one-by-one.
1. The condition of each case is a boolean value, which is a scalar. This differs from the `fluid.if_else` control-flow, whose condition could be a vector of boolean values.
...
Design
------

.. toctree::
   :maxdepth: 1

   motivation/index_en.rst
   execution/index_en.rst
   concepts/index_en.rst
   data_type/index_en.rst
   memory/index_en.rst
   muti_devices/index_en.rst
   dynamic_rnn/index_en.rst
   concurrent/index_en.rst
   algorithm/index_en.rst
   network/index_en.rst
   modules/index_en.rst
   interface/index_en.rst
   dist_train/index_en.rst
Multi-Language Interface
------------------------

TBD
Memory Management
-----------------

.. toctree::
   :maxdepth: 1

   memory_optimization.md
# Evaluator Design
## Problem Statement
During training or inference, we provide an evaluation function to measure the model performance, for example, accuracy, precision, etc. In the operator based framework design, the data passes through the network pipeline batch by batch. As a result, inside the operator, we only calculate the metrics for one minibatch. Thus, we need to provide a mechanism to calculate the metrics for each N pass/batch the user wants.
## Evaluator Design
Currently, every operation is expressed in the graph. We divide the evaluator process into three steps.
1. Initialize the metric state and add it into the block.
@@ -14,11 +14,11 @@ Currently, every operation is expressed in the graph. We divide the evaluator pr
3. Merge the mini-batch statistics to form the evaluation result for multiple mini-batches. When it comes to distributed training/Multi-GPU training, aggregate the value from different devices.
## Implementation
This design is shown in the Python API.
Each metric operator needs to calculate the metric statistic and return the batch-aware states. The Python side is responsible for accumulating the states for each pass.
```python
class Evaluator(object):
    """
@@ -32,7 +32,7 @@ class Evaluator(object):
    The initialization of Evaluator should be responsible for:
    create metric states and append to the main_program
    """
    pass

    def _update_ops(self, input, label, **kwargs):
@@ -40,14 +40,14 @@ class Evaluator(object):
        Add mini-batch evaluator calculate operators to the main_program.
        Add increment operator to accumulate the metric states.
        """

    def reset(self, executor, reset_program=None):
        """
        Reset metric states at the begin of each pass/user specified batch number.
        Execute the reset_program to reset the states.
        """

    def eval(self, executor, eval_program=None):
        """
...
Code Structure and Important Modules
------------------------------------

.. toctree::
   :maxdepth: 1

   backward.md
   python_api.md
   regularization.md
   infer_var_type.md
   optimizer.md
   prune.md
   register_grad_op.md
   net_op_design.md
# Network Design
`Network` is the container and controller of a set of operators.
Users can build a real network from a `NetDesc`, which is a protobuf message,
and use `Network.Run()` to run all the operators in the network.
A network object knows all Operators belonging to this network. Variables,
which are inputs and outputs of these operators,
are created and managed by a hierarchy of Scope objects.
## API
### Net
To make the `Network` extendable, a base class is defined like this
```c++
@@ -43,8 +43,8 @@ class Net {
};
```
All network implementations should build networks from a protobuf message which
describes the structure of a real network; the `Run` method should be implemented by
all implementations to offer a universal method to forward or backward compute a network.
`Net::Create` is a method of factory pattern and can be implemented like
@@ -64,7 +64,7 @@ std::unique<Net> Net::Create(const NetDesc& def) {
```
Network is designed as the container of operators. To make it more extendable,
we decouple it from the related variable resources.
`Run(Scope* scope)` takes the scope as an argument so that it can run in different scopes.
@@ -80,7 +80,7 @@ if (net) {
}
```
### `PlainNet` as a simple implementation of `BaseNet`
A very basic implementation is as follows. All it does is simply run every operator in sequence.
@@ -211,9 +211,9 @@ class NetBuilder final {
}
```
### Compatibility with RNN
Benefitting from the decoupling of `PlainNet.Run` and `Scope`, `PlainNet` is compatible with a future RNN design;
for example, we can implement a simple recurrent neural network as follows
...
# Optimizer Design
## The Problem
A PaddlePaddle program, or a block, is a sequence of operators operating on variables. A training program needs to do three kinds of work:
@@ -19,7 +19,7 @@ It's true that users should be able to create all these operators manually by ca
In this design, we propose a high-level API that automatically derives the optimisation pass and operators from the forward pass.
## High-level Python API to describe the training process
1. Users write code to describe the network:
@@ -54,7 +54,7 @@ In this design, we propose a high-level API that automatically derives the optim
sess.run(target= opt_op_list, ...)
```
### Optimizer Python interface:
```python
class Optimizer(object):
...
Design Motivations and Goals
----------------------------

.. toctree::
   :maxdepth: 1

   api.md
   refactorization.md
   fluid.md
   fluid_compiler.md
@@ -97,13 +97,13 @@ Compile Time -> IR -> Runtime
---
## Operator/OpWithKernel/OpKernel
![class_diagram](http://api.paddlepaddle.org/graphviz?dot=https://gist.githubusercontent.com/reyoung/53df507f6749762675dff3e7ce53372f/raw/49caf1fb70820fb4a6c217634317c9306f361f36/op_op_with_kern_class_diagram.dot)
---
## Operator
![class_diagram](http://api.paddlepaddle.org/graphviz?dot=https://gist.githubusercontent.com/reyoung/53df507f6749762675dff3e7ce53372f/raw/dd598e8f1976f5759f58af5e5ef94738a6b2e661/op.dot)
* `Operator` is the fundamental building block of the user interface.
@@ -113,7 +113,7 @@ Compile Time -> IR -> Runtime
---
## OpWithKernel/Kernel
![class_diagram](http://api.paddlepaddle.org/graphviz?dot=https://gist.githubusercontent.com/reyoung/53df507f6749762675dff3e7ce53372f/raw/9d7f4eba185cf41c8e2fbfb40ae21890dbddcd39/op_with_kernel.dot)
@@ -124,7 +124,7 @@ Compile Time -> IR -> Runtime
---
## Why separate Kernel and Operator
* Separate GPU and CPU code.
* Make Paddle capable of running without GPU.
@@ -132,7 +132,7 @@ Compile Time -> IR -> Runtime
* For example, the same multiplication op can have different kernel implementations such as an FP16 kernel, an FP32 kernel, an MKL kernel, and an Eigen kernel.
---
## Libraries for Kernel development
* `Eigen::Tensor` contains basic math and element-wise functions.
* Note that `Eigen::Tensor` has broadcast implementation.
@@ -143,16 +143,16 @@ Compile Time -> IR -> Runtime
* Hand-writing `GPUKernel` and `CPU` code
* Do not write in header (`.h`) files. CPU Kernel should be in cpp source (`.cc`) and GPU kernels should be in cuda (`.cu`) files. (GCC cannot compile GPU code.)
---
## Operator Registration
### Why is registration necessary?
We need a method to build mappings between Op type names and Op classes.
### How is registration implemented?
By maintaining a map whose key is the type name and whose value is the corresponding Op constructor.
---
## The Registry Map
### `OpInfoMap`
@@ -166,7 +166,7 @@ Maintaining a map, whose key is the type name and the value is the corresponding
- **`checker`**: Used to check attributes.
---
## Related Concepts
### Op_Maker
Its constructor takes `proto` and `checker`. They are completed during Op_Maker's construction. ([ScaleOpMaker](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/scale_op.cc#L37))
@@ -178,7 +178,7 @@ REGISTER_OP_WITHOUT_GRADIENT(op_type, op_class, op_maker_class)
```
---
## Registration Process
1. Write an Op class and its gradient Op class, if required.
2. Write an Op maker class. In the constructor of this class, describe the inputs, outputs and attributes of the operator.
3. Invoke the macro `REGISTER_OP`. This macro will
@@ -186,13 +186,13 @@ REGISTER_OP_WITHOUT_GRADIENT(op_type, op_class, op_maker_class)
2. Using the completed `proto` and `checker`, it will add a new key-value pair to the `OpInfoMap`
---
## Backward Module (1/2)
### Create Backward Operator
- Mapping from forward Op to backward Op
![backward](https://gist.githubusercontent.com/dzhwinter/a6fbd4623ee76c459f7f94591fd1abf0/raw/61026ab6e518e66bde66a889bc42557a1fccff33/backward.png)
---
## Backward Module (2/2)
### Build Backward Network
- **Input**: a graph of forward operators
- **Output**: a graph of backward operators
@@ -205,7 +205,7 @@ REGISTER_OP_WITHOUT_GRADIENT(op_type, op_class, op_maker_class)
---
## Scope, Variable, Tensor
* `Tensor` is an n-dimension array with type.
* Only dims and data pointers are stored in `Tensor`.
@@ -218,8 +218,8 @@ REGISTER_OP_WITHOUT_GRADIENT(op_type, op_class, op_maker_class)
* `Scope` has a hierarchical structure. The local scope can get variables from its parent scope.
---
## Block (in design)
### The difference between original RNNOp and Block
- As an operator is more intuitive than `RNNOp`,
- Offers a new interface `Eval(targets)` to deduce the minimal block to `Run`,
- Fits the compile-time/ runtime separation design paradigm.
@@ -227,7 +227,7 @@ REGISTER_OP_WITHOUT_GRADIENT(op_type, op_class, op_maker_class)
- When the graph executes, a Block with `BlockDesc` is passed. It then creates `Op` and `Var` instances and then invokes `Run`.
---
## Milestone
- Take Paddle/books as the main line; the requirements of the models motivate framework refactoring.
- Model migration
  - Framework development gives **priority support** to model migration, for example,
@@ -240,7 +240,7 @@ REGISTER_OP_WITHOUT_GRADIENT(op_type, op_class, op_maker_class)
- Accept imperfection, concentrate on solving the specific problem at the right price.
---
## Control the migration quality
- Compare the performance of migrated models with old ones.
- Follow the Google C++ style guide.
- Build the automatic workflow of generating Python/C++ documentations.
...
Multi-Device Support
--------------------

.. toctree::
   :maxdepth: 1

   operator_kernel_type.md
   kernel_selection.md
   kernel_hint_design.md
# Problem
In PaddlePaddle's [Design](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/switch_kernel.md), one Operator may have multiple kernels. Users may have some personal preference to choose a certain type of kernel for an operator, such as `force_cpu` to choose a CPU kernel, or `use_cudnn` to choose a CUDNN kernel, so we need to provide a way for users to do this.
In the current design, we use KernelType to describe one kernel.
...
# Background
Every operator has many kernels because there are multiple data types, places, data layouts, and library types that Fluid supports. We use the `OpKernelType` to describe the kernel types that operators can hold.
The `OpKernelType` is as follows:
...
Complex Network Design
----------------------

.. toctree::
   :maxdepth: 1

   sequence_decoder.md
@@ -45,11 +45,11 @@ API documentation must be written in reStructuredText; for details of the format, see [link
- Python API Definition
  - Format:
    [Python API Definition]
  - Example
    ```
    fc(input,
       size,
@@ -63,19 +63,19 @@ API documentation must be written in reStructuredText; for details of the format, see [link
    ```
- Function Description
  - Format
    This part should contain the following items (in the order they are written in the documentation):
    [Function Description]
    [Formula]
    [Symbols' Descriptions if necessary]
    [References if necessary]
  - Example
    [Function Description]
@@ -119,18 +119,18 @@ API documentation must be written in reStructuredText; for details of the format, see [link
    [References if necessary]
    Since fc has no references that need to be listed, this item is omitted. In other cases, the corresponding references and links must be given explicitly; take layer_norm as an example:
    ```
    Refer to `Layer Normalization <https://arxiv.org/pdf/1607.06450v1.pdf>`_ for more details.
    ```
- Args Description
  - Format
    \[Arg's Name\][(Data Type, Default Value)][Description]
  - Example
    Some of fc's arguments are annotated as follows:
@@ -145,35 +145,35 @@ API documentation must be written in reStructuredText; for details of the format, see [link
    ```
- Returns
  - Format
    [Name][Shape]
  - Example
    ```
    Returns:
        A tensor variable storing the transformation result.
    ```
    When the return value is a tuple containing multiple items, each item should be introduced in order; take dynamic_lstm as an example:
    ```
    Returns:
        A tuple containing:
            The hidden state of LSTM whose shape is (T X D).
            The cell state of LSTM whose shape is (T X D).
    ```
- Raises
  - Format
    [Exception Type][Condition]
  - Example
    ```
    Raises:
        ValueError: If the rank of the input is less than 2.
@@ -182,7 +182,7 @@ API documentation must be written in reStructuredText; for details of the format, see [link
- Note
  - Format
    [Note]
  - Example
@@ -198,15 +198,15 @@ API documentation must be written in reStructuredText; for details of the format, see [link
    2. When num_heads == 1, scaled_dot_product_attention has no learnable
       parameters.
    ```
- Examples
  - Format
    \[Python Code Snippet]
  - Example
    ```
    Examples:
        .. code-block:: python
...
Development
-----------

.. toctree::
   :maxdepth: 1

   new_op_en.md
   new_op_kernel_en.md
   use_eigen_en.md
   name_convention.md
   support_new_device.md
   releasing_process.md
   op_markdown_format.md
# Operator's Parameter Name Convention
To make the operator document itself more clear, we recommend operator names obey the listed conventions.
## OpProtoMaker names
When defining an operator in Paddle, a corresponding [OpProtoMaker](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/operator.h#L170) (TODO: OpProtoMaker Doc) needs to be defined. All the Input/Output and Attributes will be written into the [OpProto](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto#L61), and will be used in the client language to create the operator.
@@ -20,7 +20,7 @@ When defining an operator in Paddle, a corresponding [OpProtoMaker](https://gith
- Order.
  - Follow the order of Input/Output, then Attribute, then Comments. See the example in best practice.
## Best Practice
Here we give some examples to show how these rules will be used.
...
## Add Kernels for a New Device # Add Kernels for a New Device
### Background ## Background
PaddlePaddle Fluid have hundreds of operators. Each operator could have one or more kernels. A kernel is an implementation of the operator for a certain device, which could be a hardware device, e.g., the CUDA GPU, or a library that utilizes a device, e.g., Intel MKL that makes full use of the Xeon CPU. PaddlePaddle Fluid have hundreds of operators. Each operator could have one or more kernels. A kernel is an implementation of the operator for a certain device, which could be a hardware device, e.g., the CUDA GPU, or a library that utilizes a device, e.g., Intel MKL that makes full use of the Xeon CPU.
[This document](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/new_op_en.md) explains how to add an operator, and its kernels. The kernels of an operator are indexed by a C++ type [`OpKernelType`](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/operator_kernel_type.md). An operator chooses the right kernel at runtime. This choosing mechanism is described [here](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/switch_kernel.md). [This document](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/new_op_en.md) explains how to add an operator, and its kernels. The kernels of an operator are indexed by a C++ type [`OpKernelType`](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/operator_kernel_type.md). An operator chooses the right kernel at runtime. This choosing mechanism is described [here](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/switch_kernel.md).
### Write Kernels for A New Device ## Write Kernels for A New Device
#### Add A New Device ### Add A New Device
For some historical reaons, we misuse the word *library* for *device*. For example, we call the deivce type by *library type*. An example is the header file [`library_type.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/library_type.h#L24). We will correct this ASAP. For some historical reaons, we misuse the word *library* for *device*. For example, we call the deivce type by *library type*. An example is the header file [`library_type.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/library_type.h#L24). We will correct this ASAP.
...@@ -23,7 +23,7 @@ enum class LibraryType { ...@@ -23,7 +23,7 @@ enum class LibraryType {
``` ```
### Add A New [Place](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/place.h#L53)
If you have a new kind of device, you first need to add a new kind of [`Place`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/place.h#L53). For example `CUDAPlace`:
...@@ -45,7 +45,7 @@ struct CUDAPlace {
typedef boost::variant<CUDAPlace, CPUPlace> Place;
```
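As a sketch, a place for a hypothetical FPGA backend could look like the following (the name `FPGAPlace` is invented for illustration; it mirrors the shape of `CUDAPlace` above):

```cpp
// Sketch only: a hypothetical place type for a new device.
struct FPGAPlace {
  FPGAPlace() : device(0) {}
  explicit FPGAPlace(int d) : device(d) {}

  bool operator==(const FPGAPlace &o) const { return device == o.device; }
  bool operator!=(const FPGAPlace &o) const { return !(*this == o); }

  int device;
};

// The new place must also be added to the Place variant:
typedef boost::variant<CUDAPlace, FPGAPlace, CPUPlace> Place;
```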
### Add [device context](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/device_context.h#L37)
After a new kind of device is added, you should add a corresponding [DeviceContext](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/device_context.h#L37) for it.
```cpp
...@@ -58,7 +58,7 @@ class DeviceContext {
};
```
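A corresponding device context for the hypothetical `FPGAPlace` above might be sketched as follows (illustrative only; the real subclasses are declared in `device_context.h`):

```cpp
// Sketch only: a device context for the hypothetical FPGA backend.
class FPGADeviceContext : public DeviceContext {
 public:
  explicit FPGADeviceContext(FPGAPlace place) : place_(place) {}

  Place GetPlace() const override { return place_; }

  // Block until all queued work on the device has finished.
  void Wait() const override { /* device-specific synchronization */ }

 private:
  FPGAPlace place_;
};
```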
### Implement a new [OpKernel](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/operator.h#L351) for your device
Detailed documentation can be found in [`new_op_and_kernel`](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/dev/new_op_en.md).
...@@ -85,7 +85,7 @@ class OpKernel : public OpKernelBase {
```
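For instance, a kernel for the hypothetical device could be sketched like this (the names are invented for illustration; the `Compute` signature follows `operator.h`):

```cpp
// Sketch only: an element-wise kernel for the hypothetical FPGA backend.
template <typename T>
class FPGAMulKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    // Fetch inputs and outputs from the execution context, then dispatch
    // the actual computation to the device-specific library here.
  }
};
```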
### Register the OpKernel to the framework
After writing the components described above, we should register the kernel to the framework.
...@@ -107,7 +107,7 @@ take [`conv2d`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/oper
REGISTER_OP_KERNEL(conv2d, CPU, paddle::platform::CPUPlace,
                   paddle::operators::GemmConvKernel<paddle::platform::CPUDeviceContext, float>,
                   paddle::operators::GemmConvKernel<paddle::platform::CPUDeviceContext, double>);

REGISTER_OP_KERNEL(conv2d, CUDNN, ::paddle::platform::CUDAPlace,
                   paddle::operators::CUDNNConvOpKernel<float>,
                   paddle::operators::CUDNNConvOpKernel<double>);
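Registering a kernel for a new device follows the same pattern; a hypothetical registration for the sketches above might read:

```cpp
// Sketch only: FPGA, FPGAPlace, and FPGAMulKernel are hypothetical names.
REGISTER_OP_KERNEL(mul, FPGA, ::paddle::platform::FPGAPlace,
                   paddle::operators::FPGAMulKernel<float>,
                   paddle::operators::FPGAMulKernel<double>);
```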
......
...@@ -15,26 +15,26 @@ The signature of the operator.
Each section mentioned above has been covered in further detail in the rest of the document.
## PaddlePaddle Operator Name
This should be in all lowercase; if the name has multiple words, separate them with underscores. For example:
`array to lod tensor` should be written as `array_to_lod_tensor`.
This naming convention should be standard across all PaddlePaddle operators.
## Standard Operator Name
This is the standard name of the operator as used in the community. The general standard is usually:
- Standard abbreviations like `SGD` are written in all capital letters.
- Operator names that have multiple words inside a single word use `camelCase` (capitalize word boundaries inside of a word).
- Keep numbers inside a word as is, with no boundary delimiters.
- Follow the name of the operator with the keyword: `Activation Operator.`
## Operator description
This section should describe what the operator does, including the operation performed, the literature where it was first introduced, and other important details. The relevant paper or article, including a hyperlink, should be cited in this section.
## LaTeX equation
This section should contain an overall equation of the update or operation that the operator performs. The variables used in the equation should follow the naming convention of operators as described [here](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/operators/name_convention.md). Multiple words within a variable name should be separated by an underscore (`_`).
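For example, an SGD-style update written under this convention might look like the following (an illustrative sketch, not the documented equation of any particular operator):

```latex
% Illustrative only: multi-word variable names are separated by underscores.
param\_out = param - learning\_rate \cdot grad
```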
## The signature
This section describes the signature of the operator: a list of Inputs and Outputs, each with a short description of what the variable represents and its type. The variable names follow the `CamelCase` naming convention. The proposed format is:
`Section :
VariableName : (VariableType) VariableDescription
......
# How to Use Eigen in Paddle
A neural network is essentially a compute graph. The data needed for the computation is stored in `Tensor`s, and the computation itself is described by `Operator`s. At execution time, an `Operator` calls the `Compute` interface of its corresponding `OpKernel` to operate on the `Tensor`s.
## The Eigen Tensor Module
The Eigen Tensor module provides strong support for element-wise computation, and code written once with it can run on both the CPU and the GPU. However, Eigen Tensor is a module still under development, so its tests may be incomplete and its documentation sparse.
For a detailed introduction to the Eigen Tensor module, please refer to [doc 1](https://github.com/RLovelett/eigen/blob/master/unsupported/Eigen/CXX11/src/Tensor/README.md) and [doc 2](https://bitbucket.org/eigen/eigen/src/default/unsupported/Eigen/CXX11/src/Tensor/README.md).
## paddle::framework::Tensor
Paddle's Tensor is defined in the framework directory; its main interface is as follows:
...@@ -20,14 +20,14 @@ class Tensor {
  /*! Return a pointer to mutable memory block. */
  template <typename T>
  inline T* data();

  /**
   * @brief   Return a pointer to mutable memory block.
   * @note    If not exist, then allocation.
   */
  template <typename T>
  inline T* mutable_data(platform::Place place);

  /**
   * @brief   Return a pointer to mutable memory block.
   *
...@@ -38,17 +38,17 @@ class Tensor {
   */
  template <typename T>
  inline T* mutable_data(DDim dims, platform::Place place);

  /*! Resize the dimensions of the memory block. */
  inline Tensor& Resize(const DDim& dims);

  /*! Return the dimensions of the memory block. */
  inline const DDim& dims() const;

 private:
  /*! holds the memory block if allocated. */
  std::shared_ptr<Placeholder> holder_;

  /*! points to dimensions of memory block. */
  DDim dim_;
};
...@@ -129,7 +129,7 @@ `From` is an interface provided by the EigenTensor template that converts a paddle::framework
## Implementing the Computation
To perform the computation, the EigenTensor on the left-hand side of the assignment must call the `device` interface. Note that operations between EigenTensors only change the data held by the original Tensor; they never change the shape information of the original Tensor.
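A minimal sketch of this pattern (variable names such as `x_tensor` are invented for illustration):

```cpp
// Sketch only: flatten Paddle tensors into Eigen views, then run an
// element-wise add on the kernel's device. Only the data changes; the
// shape information of the original Tensors is untouched.
auto x = framework::EigenVector<T>::Flatten(x_tensor);
auto y = framework::EigenVector<T>::Flatten(y_tensor);
auto z = framework::EigenVector<T>::Flatten(z_tensor);
auto& place = *context.template device_context<DeviceContext>().eigen_device();
z.device(place) = x + y;
```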
......
# How to use Eigen in Paddle
Essentially, a neural network is a compute graph. The data needed for the computation is stored in `Tensor`s, and its computation procedure is described by `Operator`s. An `Operator` calls the `Compute` interface in its corresponding `OpKernel` and operates on the `Tensor`.
## Eigen Tensor Module
The Eigen Tensor module supports powerful element-wise computation. In addition, a piece of code written using it can be run on both the CPU and the GPU.
...@@ -12,7 +12,7 @@ Note that Eigen Tensor is still being actively developed, so its tests are not c
For details on Eigen Tensor module, please see [doc 1](https://github.com/RLovelett/eigen/blob/master/unsupported/Eigen/CXX11/src/Tensor/README.md) and [doc 2](https://bitbucket.org/eigen/eigen/src/default/unsupported/Eigen/CXX11/src/Tensor/README.md).
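To get a feel for the module's API outside of Paddle, here is a small standalone sketch (illustrative only):

```cpp
#include <unsupported/Eigen/CXX11/Tensor>

int main() {
  Eigen::Tensor<float, 2> a(2, 3), b(2, 3);
  a.setConstant(1.0f);
  b.setConstant(2.0f);
  // Element-wise add; with a GPU device object, the same expression can
  // be dispatched to the GPU via .device(...).
  Eigen::Tensor<float, 2> c = a + b;
  return 0;
}
```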
## paddle::framework::Tensor
Paddle's Tensor is defined in the framework directory with the following interface:
...@@ -105,7 +105,7 @@ void Compute(const framework::ExecutionContext& context) const override {
```
## Converting paddle::framework::Tensor to EigenTensor
As shown above, in actual computation, we need to transform the input and output `Tensor`s into formats Eigen supports. We show some functions in [eigen.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/eigen.h) to implement the transformation from `paddle::framework::Tensor` to `EigenTensor/EigenMatrix/EigenVector/EigenScalar`.
...@@ -129,7 +129,7 @@ For more transformations, see the [unit tests](https://github.com/PaddlePaddle/P
## Implementing Computation
While computing, the device interface is needed from the EigenTensors on the left hand side of the assignments. Note that the computation between EigenTensors only changes the data in the original Tensor and does not change the shape information associated with the Tensor.
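A short sketch combining the conversion and a device-dispatched computation (variable names are invented for illustration; the tensors are assumed to be 2-D):

```cpp
// Sketch only: view two Tensors as EigenMatrix, then scale element-wise
// on the kernel's device.
auto in = framework::EigenMatrix<T>::From(in_tensor);
auto out = framework::EigenMatrix<T>::From(out_tensor);
auto& place = *context.template device_context<DeviceContext>().eigen_device();
out.device(place) = in * static_cast<T>(2);
```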
......
Getting Started
===============

If you want to quickly learn how to use PaddlePaddle, please refer to the following guide:

..  toctree::
  :maxdepth: 1

  quickstart_cn.rst

When building applications with PaddlePaddle, you need to understand some basic concepts.
Here, a linear regression example is used to walk through the PaddlePaddle workflow in detail, including the data format, model configuration, and training.

..  toctree::
  :maxdepth: 1

  concepts/use_concepts_cn.rst
GET STARTED
============

If you want to quickly know how to use PaddlePaddle, please refer to the following guide:

..  toctree::
  :maxdepth: 1

  quickstart_en.rst

While using PaddlePaddle to build applications, please understand some basic concepts.
Here is an example of linear regression. It introduces the workflow of PaddlePaddle, including data format, model configuration, and training.

..  toctree::
  :maxdepth: 1

  concepts/index_en.rst
../../v2/getstarted/quickstart_cn.rst
\ No newline at end of file
../../v2/getstarted/quickstart_en.rst
\ No newline at end of file
Advanced Usage
--------------

..  toctree::
  :maxdepth: 1

  optimization/index_cn.rst
HOW TO
------------

..  toctree::
  :maxdepth: 1

  optimization/index_en.rst
../../../../../benchmark/cluster/README.md
\ No newline at end of file
Benchmark
------------

..  toctree::
  :maxdepth: 1

  vgg16/README.md
  README.md
Benchmark
------------

..  toctree::
  :maxdepth: 1

  vgg16/README.md
  README.md
../../../../../../benchmark/cluster/vgg16/README.md
\ No newline at end of file
...@@ -8,7 +8,7 @@ PaddlePaddle users generally write deep-learning programs by calling the Python API. Most
* Profiling mixed Python and C++ code

# Profiling Python Code

### Generating the Profiling File
......
...@@ -14,7 +14,7 @@ the profiling and tuning of
1. the Python code and
1. the mixture of Python and C++ code.

# Profiling the Python Code

### Generate the Performance Profiling File
...@@ -81,7 +81,7 @@ focus on. We can sort above profiling file by tottime:
We can see that the most time-consuming function is the `built-in
method run`, which is a C++ function in `libpaddle.so`. We will
explain how to profile C++ code in the next section. At this
moment, let's look into the third function `sync_with_cpp`, which is a
Python function. We can click it to understand more about it:
......
Performance Optimization
------------------------

..  toctree::
  :maxdepth: 1

  timeline.md
  cpu_profiling_cn.md
  benchmark/index_cn.rst
Performance Optimization
---------------------------

..  toctree::
  :maxdepth: 1

  timeline.md
  cpu_profiling_en.md
  benchmark/index_en.rst
# How to use the timeline tool to profile
1. Add `with profiler.profiler(...)` to the main training loop. After running, the code will generate a profile record file `/tmp/profile`. **Warning**: Please do not run too many batches when using the profiler to record timeline information, as the profile record grows with the number of batches.
......
...@@ -5,8 +5,8 @@
  :maxdepth: 1

  getstarted/index_cn.rst
  build_and_install/index_cn.rst
  design/index_cn.rst
  howto/index_cn.rst
  dev/index_cn.rst
  faq/index_cn.rst
...@@ -5,8 +5,8 @@
  :maxdepth: 1

  getstarted/index_en.rst
  build_and_install/index_en.rst
  design/index_en.rst
  howto/index_en.rst
  dev/index_en.rst
  faq/index_en.rst
Multilingual Interface
----------------------

..  toctree::
  :maxdepth: 1

  00.why_plain_c.md
Multilingual Interface
-----------------------

..  toctree::
  :maxdepth: 1

  00.why_plain_c.md
...@@ -44,7 +44,7 @@ The relationship among MKL, MKLML, and MKL-DNN is shown in the following table:
| Name    | Open Source | License     | Description |
| :------ | :---------- | :---------- | :---------- |
| MKL     | No          | Proprietary | Accelerates math processing routines |
| MKLML   | No          | Proprietary | A small package of MKL, especially for machine learning |
| MKL-DNN | Yes         | Apache 2.0  | Accelerates primitive processing routines, especially for deep neural networks |
...@@ -89,7 +89,7 @@ PaddlePaddle/Paddle
### CMake
`CMakeLists.txt` provides a master MKL switch: `WITH_MKL`, which decides whether MKLML and MKL-DNN are used at build time:
- `WITH_MKLML` controls whether the MKLML library is used.
  When `WITH_MKL` is on, MKLML is automatically used as PaddlePaddle's CBLAS and LAPACK library, and Intel OpenMP is enabled to improve MKLML's performance.
  At build time, the corresponding headers and libraries are placed under `build/third_party/install/mklml/*`.
  The MKLML libraries are currently all dynamic libraries, mainly `libiomp5.so` and `libmklml_intel.so`.
...@@ -172,7 +172,7 @@ if use_mkldnn
  self.layer_type = mkldnn_*
```
All MKL-DNN `layer_type`s start with *mkldnn_*; this is enforced when the corresponding `MKLDNN*Layer` registers its layer, so the two kinds are easy to distinguish.
In addition, a `use_mkldnn` flag is added to `paddle/utils.Flags` to select whether MKL-DNN related functionality is used.
......
...@@ -139,3 +139,77 @@ PaddlePaddle uses AVX SIMD instructions to improve CPU execution efficiency, so using the wrong bina
    touch ../extern_mklml-stamp/extern_mklml-download
    // 4. Then just continue the build
9. Cannot install numpy and other Python packages on Mac due to permission errors
----------------------------------------------------------------------------------

macOS strictly protects the permissions of its bundled Python and packages, so it is best not to install into the system Python. We recommend using virtualenv to create a fresh Python environment instead.

The basic idea of virtualenv is to make a complete copy of the environment Python needs to run. Multiple copies can be created on one machine, and you can switch freely between them, which amounts to having several isolated Python environments that do not interfere with each other.

The following briefly describes how to use virtualenv to create a dedicated Python environment for Paddle:

Install virtualenv:
::::::::::::::::

virtualenv itself is a Python package and can be installed with pip:

..  code-block:: bash

    sudo -H pip install virtualenv

Since virtualenv must be installed into the system Python, sudo privileges are required.

Create a new Python environment:
:::::::::::::::::::

..  code-block:: bash

    virtualenv --no-site-packages paddle

The --no-site-packages flag means that no existing third-party packages are copied, producing a completely clean Python environment. The trailing "paddle" is the name we give this new environment.

After this step, a directory named paddle (or whatever name you chose) appears in the current directory. It holds all the files needed to run a Python environment.

Activate the environment:
::::::::::::::::

..  code-block:: bash

    source paddle/bin/activate

Afterwards the prompt is prefixed with (paddle), indicating that the Python environment named 'paddle' is active. Running which python shows that the Python under the newly created paddle directory is now in use.

In this environment we can install, use, and develop Paddle freely, without worrying about affecting the system Python.

Deactivate the environment:
:::::::::::::::

Simply run:

..  code-block:: bash

    deactivate

The (paddle) prefix disappears from the prompt.

Automatically activating a Python environment:
::::::::::::::::

If we use Paddle often, running source paddle/bin/activate every time we open a terminal is tedious. To simplify this, we can edit the terminal configuration file so that a given Python environment is activated automatically whenever the terminal starts.

Run:

..  code-block:: bash

    vi ~/.bash_profile

to open the terminal configuration file, and add the following line at the end:

..  code-block:: bash

    source paddle/bin/activate

Save and close the file.

Now the Python environment named 'paddle' is activated automatically every time a terminal is opened.
...@@ -2,4 +2,80 @@
Model Configuration
###################

..  contents::
1. How to deal with error :code:`Duplicated layer name`
----------------------------------------------------------

The general reason for this error is that users may have set the same value for the attribute :code:`name` in different layers. Find the :code:`name` attributes with the same value in different layers and set them to different values.

2. How to use :code:`paddle.layer.memory`'s attribute :code:`name`
----------------------------------------------------------------------

* :code:`paddle.layer.memory` is used to get the output of a layer's last timestep; the layer is specified by the attribute :code:`name`. Thus, :code:`paddle.layer.memory` is associated with the layer that has the same value of the attribute :code:`name`, and uses the output of that layer's last timestep as the input of its current timestep.

* All PaddlePaddle layers have a unique name, set by the attribute :code:`name`. PaddlePaddle sets it automatically when the user does not set it explicitly. :code:`paddle.layer.memory` is not a real layer; its own name is set by the attribute :code:`memory_name`, which PaddlePaddle also sets automatically when not given. :code:`paddle.layer.memory`'s attribute :code:`name` specifies the layer it is associated with, and needs to be set explicitly by the user.
3. What is the difference between the two ways of using dropout
-----------------------------------------------------------------

* There are two ways to use dropout in PaddlePaddle:

  * Set the :code:`drop_rate` parameter in the layer's :code:`layer_attr` attribute. Take :code:`paddle.layer.fc` as an example:

  ..  code-block:: python

      fc = paddle.layer.fc(input=input, layer_attr=paddle.attr.ExtraLayerAttribute(drop_rate=0.5))

  * Use the :code:`paddle.layer.dropout` layer. Take :code:`paddle.layer.fc` as an example:

  ..  code-block:: python

      fc = paddle.layer.fc(input=input)
      drop_fc = paddle.layer.dropout(input=fc, dropout_rate=0.5)

* :code:`paddle.layer.dropout` actually uses the :code:`paddle.layer.add_to` layer and sets :code:`drop_rate` as in the first method. This method is very memory intensive.

* PaddlePaddle implements dropout in the activation function rather than in the layer.

* :code:`paddle.layer.lstmemory`, :code:`paddle.layer.grumemory`, and :code:`paddle.layer.recurrent` implement the activation of their output in an unusual way, so we cannot use dropout on them by setting :code:`drop_rate`. To use dropout for these layers, we could use the second method, i.e., :code:`paddle.layer.dropout`.
4. The differences between different recurrent layers
--------------------------------------------------------

Take LSTM as an example. There are several kinds of recurrent layers in PaddlePaddle:

* :code:`paddle.layer.lstmemory`
* :code:`paddle.networks.simple_lstm`
* :code:`paddle.networks.lstmemory_group`
* :code:`paddle.networks.bidirectional_lstm`

By implementation, recurrent layers can be classified into two types:

1. Recurrent layers implemented with recurrent_group:

  * With this type of recurrent layer, users can access the intermediate values calculated by the recurrent unit within a timestep (e.g., hidden states, memory cells, etc.).
  * :code:`paddle.networks.lstmemory_group` belongs to this type.

2. Recurrent layers implemented as a complete operation:

  * With this type of recurrent layer, users can only access its output values.
  * :code:`paddle.layer.lstmemory`, :code:`paddle.networks.simple_lstm`, and :code:`paddle.networks.bidirectional_lstm` belong to this type.

Implementing a recurrent layer as a complete operation allows CPU and GPU computations to be optimized, so the second type is more efficient than the first. In practice, we recommend the second type of recurrent layer unless there is a need to access the intermediate variables of the LSTM.

In addition, PaddlePaddle also contains an LSTM computation unit: :code:`paddle.networks.lstmemory_unit`:

* Unlike the recurrent layers described above, :code:`paddle.networks.lstmemory_unit` defines the computation of an LSTM unit within a single timestep. It is not a complete recurrent layer, nor can it receive sequence data as input.
* :code:`paddle.networks.lstmemory_unit` can only be used as a step function inside recurrent_group.
5. Can Softmax's calculation dimension be specified?
--------------------------------------------------------------------

No. PaddlePaddle's softmax cannot have its calculation dimension specified; it is always calculated by rows.
In image tasks with NCHW layout, if you need to calculate softmax along the C dimension, you could use :code:`paddle.layer.switch_order` to change the dimension order, that is, convert NCHW to NHWC, then reshape and calculate softmax.

6. Does PaddlePaddle support variable-dimensional data inputs?
----------------------------------------------------------------

PaddlePaddle provides :code:`paddle.data_type.dense_array` to support variable-dimensional data input. Simply set the dimension of the data layer to a value larger than the dimension of the input data; the declared dimension only serves as a placeholder.
if(NOT WITH_FLUID_ONLY)
  add_subdirectory(cuda)
  add_subdirectory(function)
  add_subdirectory(utils)
......
add_subdirectory(details)
# ddim lib
proto_library(framework_proto SRCS framework.proto)
...@@ -87,6 +88,9 @@ cc_library(feed_fetch_method SRCS feed_fetch_method.cc DEPS lod_tensor scope glo
cc_library(executor SRCS executor.cc DEPS op_registry device_context scope
framework_proto backward glog lod_rank_table feed_fetch_method)
cc_library(parallel_executor SRCS parallel_executor.cc DEPS multi_devices_graph_builder threaded_ssa_graph_executor)
cc_library(prune SRCS prune.cc DEPS framework_proto)
cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context)
cc_test(var_type_inference_test SRCS var_type_inference_test.cc DEPS op_registry
......
...@@ -87,6 +87,21 @@ class ChannelImpl : public paddle::framework::Channel<T> {
    return value;
  }
  std::shared_ptr<QueueMessage> get_first_message(
      std::deque<std::shared_ptr<QueueMessage>> &queue, ChannelAction action) {
    while (!queue.empty()) {
      // Check whether this message was added by Select
      // If this was added by Select then execute the callback
      // to check if you can execute this message. The callback
      // can return false if some other case was executed in Select.
      // In that case just discard this QueueMessage and process next.
      std::shared_ptr<QueueMessage> m = queue.front();
      queue.pop_front();
      if (m->callback == nullptr || m->callback(action)) return m;
    }
    return nullptr;
  }
  size_t cap_;
  std::recursive_mutex mu_;
  bool closed_;
...@@ -131,36 +146,21 @@ void ChannelImpl<T>::Send(T *item) {
  // If there is a receiver, directly pass the value we want
  // to send to the receiver, bypassing the channel buffer if any
  if (!recvq.empty()) {
    std::shared_ptr<QueueMessage> m =
        get_first_message(recvq, ChannelAction::SEND);

    if (m != nullptr) {
      *(m->data) = std::move(*item);
      m->Notify();
      lock.unlock();
      send_return();
      return;
    } else {
      lock.unlock();
      Send(item);
      send_return();
      return;
    }
  }
  // Unbuffered channel will always bypass this
...@@ -201,32 +201,34 @@ bool ChannelImpl<T>::Receive(T *item) {
  }

  // If there is a sender, directly receive the value we want
  // from the sender. In case of a buffered channel, read from
  // buffer and move front of send queue to the buffer
  if (!sendq.empty()) {
    std::shared_ptr<QueueMessage> m =
        get_first_message(sendq, ChannelAction::RECEIVE);
    if (buf_.size() > 0) {
      // Case 1 : Channel is Buffered
      // Do Data transfer from front of buffer
      // and add a QueueMessage to the buffer
      *item = std::move(buf_.front());
      buf_.pop_front();
      // If first message from sendq is not null
      // add it to the buffer and notify it
      if (m != nullptr) {
        // Copy to buffer
        buf_.push_back(std::move(*(m->data)));
        m->Notify();
      }  // Ignore if there is no first message
    } else {
      // Case 2: Channel is Unbuffered
      // Do data transfer from front of SendQ
      // If front is nullptr, then recursively call itself
      if (m != nullptr) {
        *item = std::move(*(m->data));
        m->Notify();
      } else
        return recv_return(Receive(item));
    }
    lock.unlock();
    return recv_return(true);
  }
......
...@@ -36,23 +36,25 @@ TEST(Channel, ChannelCapacityTest) {
  delete ch;
}

void RecevingOrderEqualToSendingOrder(Channel<int> *ch, int num_items) {
  unsigned sum_send = 0;
  std::thread t([&]() {
    for (int i = 0; i < num_items; i++) {
      ch->Send(&i);
      sum_send += i;
    }
  });
  std::this_thread::sleep_for(std::chrono::milliseconds(200));
  for (int i = 0; i < num_items; i++) {
    int recv = -1;
    EXPECT_EQ(ch->Receive(&recv), true);
    EXPECT_EQ(recv, i);
  }
  std::this_thread::sleep_for(std::chrono::milliseconds(200));
  CloseChannel(ch);
  t.join();
  unsigned expected_sum = (num_items * (num_items - 1)) / 2;
  EXPECT_EQ(sum_send, expected_sum);
  delete ch;
}
...@@ -185,12 +187,28 @@ TEST(Channel, ConcurrentSendNonConcurrentReceiveWithSufficientBufferSize) {
TEST(Channel, RecevingOrderEqualToSendingOrderWithUnBufferedChannel) {
  auto ch = MakeChannel<int>(0);
  RecevingOrderEqualToSendingOrder(ch, 20);
}

TEST(Channel, RecevingOrderEqualToSendingOrderWithBufferedChannel1) {
  // Test that Receive Order is same as Send Order when number of items
  // sent is less than size of buffer
  auto ch = MakeChannel<int>(10);
  RecevingOrderEqualToSendingOrder(ch, 5);
}

TEST(Channel, RecevingOrderEqualToSendingOrderWithBufferedChannel2) {
  // Test that Receive Order is same as Send Order when number of items
  // sent is equal to size of buffer
  auto ch = MakeChannel<int>(10);
  RecevingOrderEqualToSendingOrder(ch, 10);
}

TEST(Channel, RecevingOrderEqualToSendingOrderWithBufferedChannel3) {
  // Test that Receive Order is same as Send Order when number of items
  // sent is greater than the size of buffer
  auto ch = MakeChannel<int>(10);
  RecevingOrderEqualToSendingOrder(ch, 20);
}

void ChannelCloseUnblocksReceiversTest(Channel<int> *ch) {
......
cc_library(var_handle SRCS var_handle.cc DEPS place)
cc_library(op_handle_base SRCS op_handle_base.cc DEPS var_handle device_context)
cc_library(scale_loss_grad_op_handle SRCS scale_loss_grad_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory)
cc_library(fetch_op_handle SRCS fetch_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory)
nv_library(nccl_all_reduce_op_handle SRCS nccl_all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory
dynload_cuda)
cc_library(computation_op_handle SRCS computation_op_handle.cc DEPS framework_proto scope place operator op_registry)
cc_library(ssa_graph SRCS ssa_graph.cc DEPS var_handle op_handle_base)
cc_library(ssa_graph_builder SRCS ssa_graph_builder.cc DEPS ssa_graph)
if(WITH_GPU)
set(multi_devices_graph_builder_deps nccl_all_reduce_op_handle)
else()
set(multi_devices_graph_builder_deps)
endif()
cc_library(multi_devices_graph_builder SRCS multi_devices_graph_builder.cc DEPS ssa_graph_builder computation_op_handle
scale_loss_grad_op_handle ${multi_devices_graph_builder_deps})
cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ssa_graph)
cc_library(threaded_ssa_graph_executor SRCS threaded_ssa_graph_executor.cc DEPS fetch_op_handle ssa_graph_executor scope
simple_threadpool device_context)
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/computation_op_handle.h"
namespace paddle {
namespace framework {
namespace details {
ComputationOpHandle::ComputationOpHandle(const OpDesc &op_desc, Scope *scope,
                                         platform::Place place)
    : op_(framework::OpRegistry::CreateOp(op_desc)),
      scope_(scope),
      place_(place) {}

void ComputationOpHandle::RunImpl() {
  auto *cur_ctx = dev_ctxes_[place_];
  for (auto *in : inputs_) {
    bool need_wait =
        in->generated_op_ && in->generated_op_->dev_ctxes_[place_] != cur_ctx;
    if (need_wait) {
      in->generated_op_->Wait(cur_ctx);
    }
  }

  op_->Run(*scope_->FindVar("@TMP_SCOPE@")->Get<Scope *>(), place_);
}
std::string ComputationOpHandle::Name() const { return op_->Type(); }
} // namespace details
} // namespace framework
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/details/op_handle_base.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/platform/device_context.h"
namespace paddle {
namespace framework {
namespace details {
struct ComputationOpHandle : public OpHandleBase {
  std::unique_ptr<OperatorBase> op_;
  Scope *scope_;
  platform::Place place_;

  ComputationOpHandle(const OpDesc &op_desc, Scope *scope,
                      platform::Place place);

  std::string Name() const override;

 protected:
  void RunImpl() override;
};
} // namespace details
} // namespace framework
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/fetch_op_handle.h"
namespace paddle {
namespace framework {
namespace details {
FetchOpHandle::FetchOpHandle(FeedFetchList *data, size_t offset,
                             std::vector<Scope *> *local_scopes)
    : data_(data), offset_(offset), local_scopes_(local_scopes) {}

FetchOpHandle::~FetchOpHandle() {
  for (auto *input_var : inputs_) {
    input_var->pending_ops_.erase(this);
  }
}

void FetchOpHandle::Wait(platform::DeviceContext *waited_dev) {
  PADDLE_THROW("Nobody should wait FetchOp. Unexpected Error");
}
}
void FetchOpHandle::WaitAndMergeCPUTensors() const {
  std::vector<const LoDTensor *> tensors_ptr;
  tensors_ptr.reserve(tensors_.size());
  for (auto &t : tensors_) {
    tensors_ptr.emplace_back(&t);
  }
  data_->at(offset_).MergeLoDTensor(tensors_ptr, platform::CPUPlace());
}

void FetchOpHandle::RunImpl() {
  auto cpu_ctx =
      platform::DeviceContextPool::Instance().Get(platform::CPUPlace());
  for (auto *input : inputs_) {
    auto *var = static_cast<VarHandle *>(input);
    var->generated_op_->Wait(cpu_ctx);
  }

  tensors_.resize(inputs_.size());
  auto *var = static_cast<VarHandle *>(inputs_[0]);
  auto &var_name = var->name_;
  platform::CPUPlace cpu;
  auto &scopes = *local_scopes_;

  for (size_t i = 0; i < scopes.size(); ++i) {
    auto &scope = scopes[i];
    auto &t = scope->FindVar(var_name)->Get<framework::LoDTensor>();
    if (platform::is_gpu_place(var->place_)) {
#ifdef PADDLE_WITH_CUDA
      TensorCopy(t, cpu, *dev_ctxes_[t.place()], &tensors_[i]);
      dev_ctxes_[t.place()]->Wait();
#endif
    } else {
      tensors_[i].ShareDataWith(t);
      tensors_[i].set_lod(t.lod());
    }
  }

  this->WaitAndMergeCPUTensors();
}
std::string FetchOpHandle::Name() const { return "Fetch"; }
} // namespace details
} // namespace framework
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/details/op_handle_base.h"
#include "paddle/fluid/framework/feed_fetch_type.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/platform/device_context.h"
namespace paddle {
namespace framework {
namespace details {
struct FetchOpHandle : public OpHandleBase {
  FeedFetchList *data_;
  size_t offset_;
  std::vector<Scope *> *local_scopes_;
  std::vector<LoDTensor> tensors_;

  FetchOpHandle(FeedFetchList *data, size_t offset,
                std::vector<Scope *> *local_scopes);

  ~FetchOpHandle();

  void Wait(platform::DeviceContext *waited_dev) override;

  void WaitAndMergeCPUTensors() const;

  std::string Name() const override;

 protected:
  void RunImpl() override;
};
} // namespace details
} // namespace framework
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/details/multi_devices_graph_builder.h"
#include "paddle/fluid/framework/details/computation_op_handle.h"
#include "paddle/fluid/framework/details/scale_loss_grad_op_handle.h"
#include "paddle/fluid/framework/scope.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/framework/details/nccl_all_reduce_op_handle.h"
#endif
namespace paddle {
namespace framework {
namespace details {
#ifdef PADDLE_WITH_CUDA
MultiDevSSAGraphBuilder::MultiDevSSAGraphBuilder(
    const std::vector<platform::Place> &places,
    const std::string &loss_var_name,
    const std::unordered_set<std::string> &params,
    const std::vector<Scope *> &local_scopes,
    platform::NCCLContextMap *nccl_ctxs)
    : loss_var_name_(loss_var_name),
      places_(places),
      local_scopes_(local_scopes),
      nccl_ctxs_(nccl_ctxs) {
#else
MultiDevSSAGraphBuilder::MultiDevSSAGraphBuilder(
    const std::vector<platform::Place> &places,
    const std::string &loss_var_name,
    const std::unordered_set<std::string> &params,
    const std::vector<Scope *> &local_scopes)
    : loss_var_name_(loss_var_name),
      places_(places),
      local_scopes_(local_scopes) {
#endif
  for (auto &p : params) {
    grad_names_.insert(GradVarName(p));
  }
}
std::unique_ptr<SSAGraph> MultiDevSSAGraphBuilder::Build(
    const ProgramDesc &program) const {
  auto graph = new SSAGraph();
  SSAGraph &result = *graph;
  result.vars_.resize(places_.size());

  bool is_forwarding = true;
  for (auto *op : program.Block(0).AllOps()) {
    bool change_forward = false;
    if (!is_forwarding) {
      // FIXME(yy): Do not hard code like this
      if (op->OutputArgumentNames().size() == 1 &&
          op->OutputArgumentNames()[0] == GradVarName(loss_var_name_)) {
        continue;  // Drop fill 1. for backward coeff;
      }
    }

    for (size_t i = 0; i < places_.size(); ++i) {
      auto &p = places_[i];
      auto *s = local_scopes_[i];

      result.ops_.emplace_back(new ComputationOpHandle(*op, s, p));
      auto *op_handle = result.ops_.back().get();
      op_handle->dev_ctxes_[p] = const_cast<platform::DeviceContext *>(
          platform::DeviceContextPool::Instance().Get(p));

      auto var_names = op->InputArgumentNames();

      for (auto &each_var_name : var_names) {
        VarHandle *var =
            CreateOrGetLatestVarHandle(&result, each_var_name, p, i);
        op_handle->AddInput(var);
      }

      var_names = op->OutputArgumentNames();

      for (auto &each_var_name : var_names) {
        CreateOpOutput(&result, op_handle, each_var_name, p, i);
      }

      if (is_forwarding) {
        if (var_names.size() == 1 && var_names[0] == loss_var_name_) {
          // Insert ScaleCost OpHandle
#ifdef PADDLE_WITH_CUDA
          auto *communication_dev_ctx = nccl_ctxs_->DevCtx(p);
#else
          auto *communication_dev_ctx =
              platform::DeviceContextPool::Instance().Get(platform::CPUPlace());
#endif

          op_handle = new ScaleLossGradOpHandle(local_scopes_.size(), s, p,
                                                communication_dev_ctx);
          result.ops_.emplace_back(op_handle);

          // FIXME: Currently ScaleLossGradOp only use device_count as scale
          // factor. So it does not depend on any other operators.
          // VarHandle *loss = GetVarHandle(loss_var_name, place);
          // loss->pending_ops_.emplace_back(op_handle);
          // op_handle->inputs_.emplace_back(loss);

          CreateOpOutput(&result, op_handle, GradVarName(loss_var_name_), p, i);
          change_forward = true;
        }
      }
    }

    if (change_forward) {
      is_forwarding = false;
    }

    if (!is_forwarding) {
      auto var_names = op->OutputArgumentNames();
      for (auto &og : var_names) {
        if (grad_names_.count(og) != 0) {  // is param grad
          // Insert NCCL AllReduce Op
#ifdef PADDLE_WITH_CUDA
          result.ops_.emplace_back(
              new NCCLAllReduceOpHandle(local_scopes_, places_, *nccl_ctxs_));
          auto *op_handle = result.ops_.back().get();

          for (size_t i = 0; i < places_.size(); ++i) {
            auto &p = places_[i];
            auto &vars = result.vars_[i][og];

            if (vars.empty()) {  // This device has no data. continue.
              continue;
            }
            auto *prev_grad = &vars[vars.size() - 1];
            op_handle->AddInput(prev_grad);

            auto &var = vars[vars.size()];
            var.place_ = p;
            var.name_ = og;
            var.version_ = vars.size() - 1;

            op_handle->AddOutput(&var);
          }
#else
          PADDLE_ENFORCE("Not implemented");
#endif
        }
      }
    }
  }

  /*
    Dependency graph has been constructed. However, there are still data
    hazards that need to be handled.
   */
  PolishGraphToSupportDataHazards(&result);

  if (VLOG_IS_ON(10)) {
    std::ostringstream sout;
    PrintGraphviz(*graph, sout);
    VLOG(10) << sout.str();
  }

  return std::unique_ptr<SSAGraph>(graph);
}

}  // namespace details
}  // namespace framework
}  // namespace paddle
(The remaining file diffs in this commit are collapsed.)