提交 f3dcd006 编写于 作者: Y Yibing Liu

Merge branch 'develop' of upstream into ctc_edit_distance_dev

......@@ -20,8 +20,10 @@ set(PADDLE_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR})
include(system)
project(paddle CXX C Go)
message(STATUS "CXX compiler: " ${CMAKE_CXX_COMPILER} ", version: " ${CMAKE_CXX_COMPILER_VERSION})
message(STATUS "C compiler: " ${CMAKE_C_COMPILER} ", version: " ${CMAKE_C_COMPILER_VERSION})
message(STATUS "CXX compiler: ${CMAKE_CXX_COMPILER}, version: "
"${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION}")
message(STATUS "C compiler: ${CMAKE_C_COMPILER}, version: "
"${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}")
find_package(Sphinx)
if(NOT CMAKE_CROSSCOMPILING)
......
......@@ -7,11 +7,11 @@ Machine:
System: CentOS release 6.3 (Final), Docker 1.12.1.
PaddlePaddle: (TODO: will rerun after 0.11.0)
- paddlepaddle/paddle:latest (for MKLML and MKL-DNN)
PaddlePaddle:
- paddlepaddle/paddle:0.11.0 (for MKLML and MKL-DNN)
- MKL-DNN tag v0.11
- MKLML 2018.0.1.20171007
- paddlepaddle/paddle:latest-openblas (for OpenBLAS)
- paddlepaddle/paddle:0.11.0-openblas (for OpenBLAS)
- OpenBLAS v0.2.20
On each machine, we will test and compare the performance of training on single node using MKL-DNN / MKLML / OpenBLAS respectively.
......@@ -56,15 +56,15 @@ Input image size - 3 * 224 * 224, Time: images/second
<img src="figs/googlenet-cpu-train.png" width="500">
- Alexnet
- AlexNet
| BatchSize | 64 | 128 | 256 |
|--------------|--------| ------ | -------|
| OpenBLAS | 2.13 | 2.45 | 2.68 |
| OpenBLAS | 45.62 | 72.79 | 107.22 |
| MKLML | 66.37 | 105.60 | 144.04 |
| MKL-DNN | 399.00 | 498.94 | 626.53 |
chart TBD
<img src="figs/alexnet-cpu-train.png" width="500">
#### Inference
Test on batch size 1, 2, 4, 8, 16 on Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz
......@@ -72,27 +72,41 @@ Test on batch size 1, 2, 4, 8, 16 on Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz
| BatchSize | 1 | 2 | 4 | 8 | 16 |
|-----------|-------|-------|-------|-------|-------|
| OpenBLAS | 1.07 | 1.08 | 1.06 | 0.88 | 0.65 |
| OpenBLAS | 1.10 | 1.96 | 3.62 | 3.63 | 2.25 |
| MKLML | 5.58 | 9.80 | 15.15 | 21.21 | 28.67 |
| MKL-DNN | 75.07 | 88.64 | 82.58 | 92.29 | 96.75 |
<img src="figs/vgg-cpu-infer.png" width="500">
- ResNet-50
| BatchSize | 1 | 2 | 4 | 8 | 16 |
|-----------|-------|--------|--------|--------|--------|
| OpenBLAS | 3.35 | 3.19 | 3.09 | 2.55 | 1.96 |
| OpenBLAS | 3.31 | 6.72 | 11.59 | 13.17 | 9.27 |
| MKLML | 6.33 | 12.02 | 22.88 | 40.53 | 63.09 |
| MKL-DNN | 107.83| 148.84 | 177.78 | 189.35 | 217.69 |
<img src="figs/resnet-cpu-infer.png" width="500">
- GoogLeNet
| BatchSize | 1 | 2 | 4 | 8 | 16 |
|-----------|--------|--------|--------|--------|--------|
| OpenBLAS | 12.04 | 11.31 | 10.00 | 9.07 | 4.34 |
| OpenBLAS | 12.06 | 23.56 | 34.48 | 36.45 | 23.12 |
| MKLML | 22.74 | 41.56 | 81.22 | 133.47 | 210.53 |
| MKL-DNN | 175.10 | 272.92 | 450.70 | 512.00 | 600.94 |
<img src="figs/googlenet-cpu-infer.png" width="500">
- AlexNet
| BatchSize | 1 | 2 | 4 | 8 | 16 |
|-----------|--------|--------|--------|--------|--------|
| OpenBLAS | 3.53 | 6.23 | 15.04 | 26.06 | 31.62 |
| MKLML | 21.32 | 36.55 | 73.06 | 131.15 | 192.77 |
| MKL-DNN | 442.91 | 656.41 | 719.10 | 847.68 | 850.51 |
<img src="figs/alexnet-cpu-infer.png" width="500">
### Laptop
TBD
benchmark/figs/resnet-cpu-train.png

19.8 KB | W: | H:

benchmark/figs/resnet-cpu-train.png

17.6 KB | W: | H:

benchmark/figs/resnet-cpu-train.png
benchmark/figs/resnet-cpu-train.png
benchmark/figs/resnet-cpu-train.png
benchmark/figs/resnet-cpu-train.png
  • 2-up
  • Swipe
  • Onion skin
benchmark/figs/vgg-cpu-train.png

17.9 KB | W: | H:

benchmark/figs/vgg-cpu-train.png

16.7 KB | W: | H:

benchmark/figs/vgg-cpu-train.png
benchmark/figs/vgg-cpu-train.png
benchmark/figs/vgg-cpu-train.png
benchmark/figs/vgg-cpu-train.png
  • 2-up
  • Swipe
  • Onion skin
......@@ -19,7 +19,11 @@ args = {
'num_samples': num_samples
}
define_py_data_sources2(
"train.list", None, module="provider", obj="process", args=args)
"train.list" if not is_infer else None,
"test.list" if is_infer else None,
module="provider",
obj="process",
args=args)
settings(
batch_size=batch_size,
......
# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import argparse
import matplotlib.pyplot as plt
def parse_args():
    """Parse the command-line options of the log plotting script.

    Options:
        --file_path / -f (str): the path of the log file (None when omitted).
        --sample_rate / -s (float): the rate to take samples from the log,
            default 1.0.
        --log_period / -p (int): the period of the log, default 1.

    Returns:
        The argparse namespace holding ``file_path``, ``sample_rate`` and
        ``log_period``.
    """
    # Each entry is (flags, keyword settings) for one add_argument() call.
    option_specs = (
        (('--file_path', '-f'),
         dict(type=str, help='the path of the log file')),
        (('--sample_rate', '-s'),
         dict(type=float,
              default=1.0,
              help='the rate to take samples from log')),
        (('--log_period', '-p'),
         dict(type=int, default=1, help='the period of log')),
    )

    cli = argparse.ArgumentParser('Parse Log')
    for flags, settings in option_specs:
        cli.add_argument(*flags, **settings)
    return cli.parse_args()
def parse_file(file_name):
    """Extract loss and accuracy values from a training log file.

    A line is used only if, after stripping surrounding whitespace, it starts
    with 'pass' and splits into exactly five space-separated fields. The third
    field carries the loss and the fourth the error; for each of them the last
    character is dropped and the text after the final '=' is parsed as a float.

    Args:
        file_name: path of the log file to read.

    Returns:
        A tuple ``(loss, accuracy)`` of two float lists, one entry per
        accepted line, where each accuracy is ``1.0 - error``.
    """

    def trailing_value(field):
        # Drop the field's last character, then parse what follows the last '='.
        return float(field[:-1].split('=')[-1])

    loss = []
    accuracy = []
    with open(file_name) as log:
        for raw in log:
            text = raw.strip()
            fields = text.split(' ')
            if text.startswith('pass') and len(fields) == 5:
                loss.append(trailing_value(fields[2]))
                accuracy.append(1.0 - trailing_value(fields[3]))
    return loss, accuracy
def sample(metric, sample_rate):
    """Down-sample a metric list by keeping one value per sampling interval.

    The interval is ``int(1.0 / sample_rate)``; the values at indices
    0, interval, 2 * interval, ... are kept, for ``len(metric) // interval``
    values in total.

    Args:
        metric: list of values to sample from.
        sample_rate: fraction of values to keep; must be greater than zero
            (a zero rate raises ZeroDivisionError).

    Returns:
        A list with the sampled values. When the interval is larger than the
        number of values, only the first value is returned (or an empty list
        for empty input).
    """
    interval = int(1.0 / sample_rate)
    if interval > len(metric):
        return metric[:1]
    # Floor division keeps the count an int: on Python 3 a plain `/` yields a
    # float, which range() rejects with a TypeError.
    num = len(metric) // interval
    return [metric[interval * i] for i in range(num)]
def plot_metric(metric,
                batch_id,
                graph_title,
                line_style='b-',
                line_label='y',
                line_num=1):
    """Plot one or several metric curves against the batch ids and save them.

    Args:
        metric: the values to plot when ``line_num`` is 1; otherwise an
            indexable collection holding one value series per curve.
        batch_id: x-axis values shared by every curve.
        graph_title: used as the figure title, the y-axis label and the stem
            of the output file name.
        line_style: matplotlib format string, or one per curve when
            ``line_num`` is not 1.
        line_label: legend label, or one per curve when ``line_num`` is not 1.
        line_num: number of curves to draw.

    The figure is written to ``graph_title + '.jpg'`` and then closed.
    """
    plt.figure()
    plt.title(graph_title)

    if line_num == 1:
        curves = [(metric, line_style, line_label)]
    else:
        # Lazy on purpose: each curve is looked up right before it is drawn.
        curves = ((metric[k], line_style[k], line_label[k])
                  for k in range(line_num))
    for values, style, label in curves:
        plt.plot(batch_id, values, style, label=label)

    plt.xlabel('batch')
    plt.ylabel(graph_title)
    plt.legend()
    plt.savefig(graph_title + '.jpg')
    plt.close()
def main():
    """Parse a training log and save sampled loss and accuracy curves.

    Reads the command-line options, checks the sample rate, parses the log
    file, rebuilds the batch index of every record from the log period, and
    plots the sampled loss and accuracy series (one figure each).
    """
    args = parse_args()
    rate = args.sample_rate
    assert 0. < rate <= 1.0, "The sample rate should in the range (0, 1]."

    loss, accuracy = parse_file(args.file_path)
    # The i-th parsed record corresponds to batch i * log_period.
    batch = [args.log_period * i for i in range(len(loss))]
    batch_sample = sample(batch, rate)

    # (series, title/label, matplotlib line style) for each figure.
    figures = (
        (loss, 'loss', 'b-'),
        (accuracy, 'accuracy', 'g-'),
    )
    for series, title, style in figures:
        plot_metric(
            sample(series, rate),
            batch_sample,
            title,
            line_style=style,
            line_label=title)


if __name__ == '__main__':
    main()
......@@ -8,15 +8,20 @@ function clock_to_seconds() {
}
function infer() {
unset OMP_NUM_THREADS MKL_NUM_THREADS OMP_DYNAMIC KMP_AFFINITY
export OPENBLAS_MAIN_FREE=1
topology=$1
layer_num=$2
bs=$3
thread=`nproc`
if [ $thread -gt $bs ]; then
thread=$bs
trainers=`nproc`
if [ $trainers -gt $bs ]; then
trainers=$bs
fi
log="logs/infer-${topology}-${layer_num}-${thread}openblas-${bs}.log"
log="logs/infer-${topology}-${layer_num}-${trainers}openblas-${bs}.log"
threads=$((`nproc` / trainers))
if [ $threads -eq 0 ]; then
threads=1
fi
export OPENBLAS_NUM_THREADS=$threads
models_in="models/${topology}-${layer_num}/pass-00000/"
if [ ! -d $models_in ]; then
......@@ -28,7 +33,7 @@ function infer() {
--config="${topology}.py" \
--use_mkldnn=False \
--use_gpu=False \
--trainer_count=$thread \
--trainer_count=$trainers \
--log_period=$log_period \
--config_args="batch_size=${bs},layer_num=${layer_num},is_infer=True,num_samples=256" \
--init_model_path=$models_in \
......
set -e
function train() {
unset OMP_NUM_THREADS MKL_NUM_THREADS OMP_DYNAMIC KMP_AFFINITY
export OPENBLAS_NUM_THREADS=1
topology=$1
layer_num=$2
bs=$3
......
......@@ -19,7 +19,7 @@ ExternalProject_Add(
if (${CMAKE_VERSION} VERSION_LESS "3.3.0")
set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/eigen3_dummy.c)
file(WRITE ${dummyfile} "const char * dummy_eigen3 = \"${dummyfile}\";")
file(WRITE ${dummyfile} "const char *dummy_eigen3 = \"${dummyfile}\";")
add_library(eigen3 STATIC ${dummyfile})
else()
add_library(eigen3 INTERFACE)
......
......@@ -63,9 +63,30 @@ ExternalProject_Add(
-DMKLROOT:PATH=${MKLML_ROOT}
)
ADD_LIBRARY(mkldnn SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET mkldnn PROPERTY IMPORTED_LOCATION ${MKLDNN_LIB})
ADD_DEPENDENCIES(mkldnn ${MKLDNN_PROJECT})
ADD_LIBRARY(shared_mkldnn SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET shared_mkldnn PROPERTY IMPORTED_LOCATION ${MKLDNN_LIB})
ADD_DEPENDENCIES(shared_mkldnn ${MKLDNN_PROJECT})
MESSAGE(STATUS "MKLDNN library: ${MKLDNN_LIB}")
add_definitions(-DPADDLE_WITH_MKLDNN)
LIST(APPEND external_project_dependencies mkldnn)
LIST(APPEND external_project_dependencies shared_mkldnn)
# Generate a static dummy target to track mkldnn dependencies
# for cc_library(xxx SRCS xxx.c DEPS mkldnn).
# The generated symbol is named dummy_mkldnn rather than a generic "dummy",
# following the per-target naming used by the other dummy sources
# (dummy_eigen3, dummy_cblas, dummy_<target>).
SET(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/mkldnn_dummy.c)
FILE(WRITE ${dummyfile} "const char *dummy_mkldnn = \"${dummyfile}\";")
ADD_LIBRARY(mkldnn STATIC ${dummyfile})
# Link the real MKL-DNN library plus the MKLML and IOMP libraries it is used with.
TARGET_LINK_LIBRARIES(mkldnn ${MKLDNN_LIB} ${MKLML_LIB} ${MKLML_IOMP_LIB})
# Make sure the external MKL-DNN project is built before this target.
ADD_DEPENDENCIES(mkldnn ${MKLDNN_PROJECT})
# Copy the real so.0 lib to the install dir so that it can be directly
# contained in the wheel or capi package.
SET(MKLDNN_SHARED_LIB ${MKLDNN_INSTALL_DIR}/libmkldnn.so.0)
# Use CMake's built-in copy instead of the platform `cp` tool, and VERBATIM so
# that argument escaping does not depend on the platform.
ADD_CUSTOM_COMMAND(OUTPUT ${MKLDNN_SHARED_LIB}
        COMMAND ${CMAKE_COMMAND} -E copy ${MKLDNN_LIB} ${MKLDNN_SHARED_LIB}
        DEPENDS mkldnn
        VERBATIM)
# Part of the default build (ALL), so the copy is always produced.
ADD_CUSTOM_TARGET(mkldnn_shared_lib ALL DEPENDS ${MKLDNN_SHARED_LIB})
IF(WITH_C_API)
INSTALL(FILES ${MKLDNN_SHARED_LIB} DESTINATION lib)
ENDIF()
......@@ -66,3 +66,7 @@ ADD_LIBRARY(mklml SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET mklml PROPERTY IMPORTED_LOCATION ${MKLML_LIB})
ADD_DEPENDENCIES(mklml ${MKLML_PROJECT})
LIST(APPEND external_project_dependencies mklml)
IF(WITH_C_API)
INSTALL(FILES ${MKLML_LIB} ${MKLML_IOMP_LIB} DESTINATION lib)
ENDIF()
......@@ -30,23 +30,21 @@ IF(NOT ${CBLAS_FOUND})
CACHE FILEPATH "openblas library." FORCE)
SET(OPENBLAS_CC "${CMAKE_C_COMPILER} -Wno-unused-but-set-variable -Wno-unused-variable")
SET(OPENBLAS_COMMIT "v0.2.20")
IF(CMAKE_CROSSCOMPILING)
SET(OPTIONAL_ARGS HOSTCC=${HOST_C_COMPILER})
GET_FILENAME_COMPONENT(CROSS_SUFFIX ${CMAKE_C_COMPILER} DIRECTORY)
SET(CROSS_SUFFIX ${CROSS_SUFFIX}/)
IF(ANDROID)
# arm_soft_fp_abi branch of OpenBLAS to support softfp
# https://github.com/xianyi/OpenBLAS/tree/arm_soft_fp_abi
SET(OPENBLAS_COMMIT "b5c96fcfcdc82945502a2303116a64d89985daf5")
IF(ANDROID_ABI MATCHES "^armeabi(-v7a)?$")
# use softfp
SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV7 ARM_SOFTFP_ABI=1 USE_THREAD=0)
ELSEIF(ANDROID_ABI STREQUAL "arm64-v8a")
SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV8 BINARY=64 USE_THREAD=0)
ENDIF()
ELSEIF(IOS)
IF(CMAKE_OSX_ARCHITECTURES MATCHES "arm64")
SET(OPENBLAS_COMMIT "b5c96fcfcdc82945502a2303116a64d89985daf5")
SET(OPENBLAS_CC "${OPENBLAS_CC} ${CMAKE_C_FLAGS} -isysroot ${CMAKE_OSX_SYSROOT}")
SET(OPENBLAS_CC "${OPENBLAS_CC} -arch arm64")
SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV8 BINARY=64 USE_THREAD=0 CROSS_SUFFIX=${CROSS_SUFFIX})
......@@ -56,14 +54,12 @@ IF(NOT ${CBLAS_FOUND})
ENDIF()
ELSEIF(RPI)
# use hardfp
SET(OPENBLAS_COMMIT "v0.2.20")
SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV7 USE_THREAD=0)
ENDIF()
ELSE()
IF(APPLE)
SET(OPENBLAS_CC "${CMAKE_C_COMPILER} -isysroot ${CMAKE_OSX_SYSROOT}")
ENDIF()
SET(OPENBLAS_COMMIT "v0.2.20")
SET(OPTIONAL_ARGS "")
IF(CMAKE_SYSTEM_PROCESSOR MATCHES "^x86(_64)?$")
SET(OPTIONAL_ARGS DYNAMIC_ARCH=1 NUM_THREADS=64)
......@@ -113,7 +109,7 @@ INCLUDE_DIRECTORIES(${CBLAS_INC_DIR})
# FIXME(gangliao): generate cblas target to track all high performance
# linear algebra libraries for cc_library(xxx SRCS xxx.c DEPS cblas)
SET(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/cblas_dummy.c)
FILE(WRITE ${dummyfile} "const char * dummy = \"${dummyfile}\";")
FILE(WRITE ${dummyfile} "const char *dummy_cblas = \"${dummyfile}\";")
ADD_LIBRARY(cblas STATIC ${dummyfile})
TARGET_LINK_LIBRARIES(cblas ${CBLAS_LIBRARIES})
......
......@@ -63,7 +63,7 @@ ExternalProject_Add(
MESSAGE(STATUS "warp-ctc library: ${WARPCTC_LIBRARIES}")
INCLUDE_DIRECTORIES(${WARPCTC_INCLUDE_DIR})
ADD_LIBRARY(warpctc STATIC IMPORTED GLOBAL)
ADD_LIBRARY(warpctc SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET warpctc PROPERTY IMPORTED_LOCATION ${WARPCTC_LIBRARIES})
ADD_DEPENDENCIES(warpctc extern_warpctc)
......
......@@ -120,7 +120,7 @@ function(merge_static_libs TARGET_NAME)
DEPENDS ${libs})
# Generate dummy staic lib
file(WRITE ${target_SRCS} "const char *dummy = \"${target_SRCS}\";")
file(WRITE ${target_SRCS} "const char *dummy_${TARGET_NAME} = \"${target_SRCS}\";")
add_library(${TARGET_NAME} STATIC ${target_SRCS})
target_link_libraries(${TARGET_NAME} ${libs_deps})
......@@ -160,7 +160,7 @@ function(merge_static_libs TARGET_NAME)
DEPENDS ${libs} ${target_OBJS})
# Generate dummy staic lib
file(WRITE ${target_SRCS} "const char *dummy = \"${target_SRCS}\";")
file(WRITE ${target_SRCS} "const char *dummy_${TARGET_NAME} = \"${target_SRCS}\";")
add_library(${TARGET_NAME} STATIC ${target_SRCS})
target_link_libraries(${TARGET_NAME} ${libs_deps})
......@@ -324,7 +324,7 @@ function(go_library TARGET_NAME)
)
# Add dummy code to support `make target_name` under Terminal Command
file(WRITE ${dummyfile} "const char * dummy = \"${dummyfile}\";")
file(WRITE ${dummyfile} "const char *dummy_${TARGET_NAME} = \"${dummyfile}\";")
if (go_library_SHARED OR go_library_shared)
add_library(${TARGET_NAME} SHARED ${dummyfile})
else()
......
......@@ -252,6 +252,11 @@ first_seq
.. autoclass:: paddle.v2.layer.first_seq
:noindex:
sub_seq
---------
.. autoclass:: paddle.v2.layer.sub_seq
:noindex:
concat
------
.. autoclass:: paddle.v2.layer.concat
......
......@@ -68,12 +68,6 @@ scale
:noindex:
reshape
---------
.. autofunction:: paddle.v2.fluid.layers.reshape
:noindex:
transpose
---------
.. autofunction:: paddle.v2.fluid.layers.transpose
......@@ -313,6 +307,12 @@ sequence_expand
:noindex:
gru_unit
--------
.. autofunction:: paddle.v2.fluid.layers.gru_unit
:noindex:
lstm_unit
---------
.. autofunction:: paddle.v2.fluid.layers.lstm_unit
......
# Backward Building
## Motivation
In Neural Network, most models are solved by the backpropagation algorithm(known as **BP**) at present. Technically, BP calculates the gradient of the loss function, then propagates it back through the networks following the chain rule. However, when configuring the model structure, users do not need to define the backward part. So a mechanism is required by the framework which can complete the model's backward part automatically according to the given forward part.
When implementing a specific `op`, the developer is also asked to implement its backward version, called `grad_op`. A `grad_op` takes gradients of its corresponding `op`'s outputs, and calculates gradients of the `op`'s inputs. During the building of a model's backward part, the framework creates each forward `op`'s `grad_op`, and then strings them together in the reverse order of the forward part. In this way, gradients spread from the end to the beginning of the model, in other words, from the loss to the parameters.
## Challenges
The motivation of backward building is apparent. However, implementing it correctly is not so easy. In the **Fluid** design, a deep learning model is described by `Program`, `Block`, `Op` and `Variable`. The `Block` itself can be nested. It means that the `op`s and `variable`s are scattered across different blocks rather than all being gathered in a single graph. Our backward building algorithm shall visit blocks in recursive order and be able to insert `grad_op`s and newly created `variable`s into the right place.
## Usage
Although the whole algorithm is comprised of many functions, only one is exposed as API:
```python
def append_backward(loss, parameter_list=None, no_grad_set=None):
"""
Append backward part to main_program
Args:
loss(Variable): The variable generated by the cost function.
parameter_list(list): Parameters that need to be updated by optimizers.
If None, it means all parameters need to be updated.
no_grad_set(set): Variables that have no gradients in Block 0.
If None, the set will be generated inside the function and
contains all variables with `step_gradient=True` from all blocks.
Return:
(list[Variable]): list of (parameters, gradients) pair.
"""
```
By invoking this API, the framework appends the backward part to the program where the `loss` is. It takes three arguments. `loss` means the final loss value. It must be a scalar and is usually the output of the loss layer. It is also where the gradient is generated and backpropagation starts. `parameter_list` marks all parameters that need updating. If it's `None`, all parameters will be updated by optimizers. `no_grad_set` marks variables without gradient. If all outputs of some `grad_op` are in `no_grad_set`, the `grad_op` will not be run.
This API will be invoked automatically before optimizer building.
As a result, in most cases, users do not need to invoke the API by themselves to append backward part.
## Implementation
The implementation of backward building algorithm is in `backward.py` file. The whole algorithm can be divided into two independent parts: creating `grad_op`s and creating new variables.
### Creating `grad_op`s
The creating of `grad_op`s is implemented by:
```python
def _append_backward_ops_(target,
block,
target_block,
no_grad_dict,
grad_to_var):
"""
Create all grad ops, and insert them into given block
Args:
target(Variable): the target variable of forward pass
block(Block): the block where forward ops are
target_block(Block): the block which is going to hold new generated grad ops
no_grad_dict(dict):
key(int) block index
val(set) a set of variable names. These variables have no gradient
grad_to_var(dict)(output argument):
key(str): grad variable name
val(str): corresponding forward variable name
"""
```
Given a `block`, the function traverses all `op`s in this block in reverse order, gets the corresponding `grad_op` from the C++ core via `core.get_grad_op_desc()`, and then appends it to `target_block`.
However, some specific `op`s (e.g. `while_op`, `if_else_op`) can hold their own sub-blocks. Since these sub-blocks contain `op`s as well, the `grad_op` creation should be recursive.
During the reverse traversal, we check each `op` whether it has an attribute named `sub_block`. If so, it means there is a sub-block and we need to deal with it first. After creating a new block whose father is the one in `op`'s attribute, we invoke `_append_backward_ops_()` recursively, assigning the new block to parameter `target_block` and the one in `op`'s attribute to `block`. The *pseudo-code* shows this process:
```
******* pseudo-code ********
for op in reversed(block.ops):
if op has an attribute named 'sub_block':
Get the sub-block(`s_block`) from op's attribute.
Create a new block(`grad_s_block`), whose father is `s_block`.
Invoke _append_backward_ops_(), with `block=s_block` and `target_block=grad_s_block`
Invoke `core.get_grad_op_desc()` to get op's grad_op.
Insert name correspondings between variables and their gradients of the grad_op to grad_to_var
Assign grad_s_block to grad_op as its 'sub_block' attribute.
Append grad_op to current target_block.
```
The first invoking of `_append_backward_ops_()` is initiated by `append_backward()`, in which parameters `block` and `target_block` are all assigned with root block(the block with index 0).
### Corner Cases of `grad_op` Creating
In the previous section, we showed the regular process of `grad_op` creation. However, in some corner cases, the conventional algorithm is not enough to get the correct result and additional handling is required. These additional processes run after the algorithm mentioned above and make some special adjustments to its output `grad_op`s.
#### Shared Variables
If a variable is read by more than one `op` in the forward pass, its gradient is likely to be written by more than one `grad_op` in the following backward pass. To make the gradient result be the sum of all `grad_op`s' outputs instead of only the output of the last one to run, we assign each output to a temporary variable and then add a `sum_op` to add them up.
For debugging convenience, if the final gradient name is `w@GRAD`, its corresponding temporary variables will be named `w@GRAD@RENAME@0`, `w@GRAD@RENAME@1`...
See function `_addup_repetitive_outputs_` in `backward.py` for implementation details.
#### No Gradient Variables
In our framework, variables can be marked as *no_gradient*, it means that the gradient of this variable is unnecessary and can be considered as zero in model training. Apparently, when all the outputs of some `grad_op` are marked as *no_gradient*, the `grad_op` itself can be skipped in backward pass.
Another situation is all the gradient inputs of some `grad_op` are marked as *no_gradient*, which means all of them can be considered as zeros. For `grad_op`s are in essence the propagation of gradients, all the outputs are definitely zeros when all gradient inputs are zeros. Therefore the `grad_op` can also be skipped.
It should be noted that all these zero gradients still need to be created and initialized by something, otherwise the following `grad_op`s that take these gradients as inputs run the risk of using uninitialized memory. In our code, we employ `fill_zeros_like_op` to initialize them as all zeros.
These features are implemented in function `_remove_no_grad_branch_`. It checks newly created `grad_op`s one-by-one, removes those that can be skipped and inserts `fill_zeros_like_op` when it is necessary. We can get the `no_grad_set` from the `_append_backward_ops_` argument `no_grad_dict` or generate it on the fly by scanning all variables' `no_gradient` attribute(True or False).
### Creating Backward Variables
Up to now, we have completed all creating and adjusting jobs of `grad_op`s. However, backward variables have not been created. Now they are only represented by `grad_op`'s input and output arguments. The backward variable creating job will be done by:
```python
def _append_backward_vars_(block,
start_op_idx,
grad_to_var,
grad_info_map):
"""
Create new variables required by backward pass.
Args:
block(Block): the block where new variables will be created
start_op_idx(int): Only variables required by ops in block.ops[start_op_idx : ] will be created
grad_to_var(dict):
key(str): grad variable name
val(str): corresponding forward variable name
In most cases, this dict is generated by _append_backward_ops_()
grad_info_map(dict)(output argument):
key(str): forward variable name
val(tuple): a tuple of (str, int), str is the corresponding grad name, int is the block index
"""
```
Given a `block`, this function traverses all the `grad_op`s in it(The argument `start_op_idx` indicates where the grad_op sequence starts.) and creates all the uncreated outputs. The *pseudo-code* shows this process:
```
for op in block.ops[start_op_idx : ]:
if op has an attribute named 'sub_block':
Get the sub-block(`s_block`) from op's attribute.
Invoke _append_backward_vars_(), with `block=s_block`
for var_name in op.all_output_names():
if block.has_var_recursive(var_name) or var_name is the name of empty variable:
continue
create a new variable named 'var_name' in block
if grad_to_var.has_key(var_name):
set grad_info_map[grad_to_var[var_name]] as a tuple of (var_name, block)
do op's var type inference
do op's shape inference
```
# Design Doc: Concurrent Programming with Fluid
With PaddlePaddle Fluid, users describe a program rather than a model. The program is a [`ProgramDesc`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto) protobuf message. TensorFlow/MxNet/Caffe2 applications generate protobuf messages too, but their protobuf messages represent the model, a graph of operators, but not the program that trains/uses the model.
Many know that when we program TensorFlow, we can specify the device on which each operator runs. This allows us to create a concurrent/parallel AI application. An interesting question is **how does a `ProgramDesc` represent a concurrent program?**
The answer relies on the fact that a `ProgramDesc` is similar to an abstract syntax tree (AST) that describes a program. So users just program a concurrent program that they do with any concurrent programming language, e.g., [Go](https://golang.org).
## An Analogy
The following table compares concepts in Fluid and Go
| Go | Fluid |
|----|-------|
|user-defined functions | [layers](https://github.com/PaddlePaddle/Paddle/tree/develop/python/paddle/v2/fluid) |
| control-flow and built-in functions | [intrinsics/operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators) |
| goroutines, channels | [class ThreadPool](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/framework/thread_pool.h) |
| runtime | [class Executor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/executor.h) |
## An Example Concurrent Program
To review all the above concepts in an example, let us take a simple program and write its distributed version.
Suppose that we want to parallelize a naive Fluid program (written in Go and calling Fluid's Go binding) that multiplies two tensors.
```go
import "fluid"
func paddlepaddle() {
X = fluid.read(...)
W = fluid.Tensor(...)
Y = fluid.mult(X, W)
}
```
Please be aware that the Fluid's Go binding provides the default `main` function, which calls the `paddlepaddle` function, which, in this case, is defined in above program and creates the following `ProgramDesc` message.
```protobuf
message ProgramDesc {
block[0] = Block {
vars = [X, W, Y],
ops = [
read(output = X)
assign(input = ..., output = W)
mult(input = {X, W}, output = Y)
],
}
}
```
Then, the default `main` function calls `fluid.run()`, which creates an instance of the [`class Executor`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/executor.h) and calls `Executor.Run(block[0])`, where `block[0]` is the first and only block defined in above `ProgramDesc` message.
The default `main` function is defined as follows:
```go
func main() {
paddlepaddle()
fluid.run()
}
```
## The Concurrent Version
By parallelizing the above program, we could support a very big tensor X by splitting it into small pieces {x_1, x_2, ...} and sending each piece to a worker process/node for parallel multiplication.
In this case, we can write a transpiler that takes a `ProgramDesc` message that represents the above example program and outputs two `ProgramDesc` messages, one for running on the master process/node, and the other one for worker processes/nodes.
### The Master Program
The master program could look like the following:
```protobuf
message ProgramDesc {
block[0] = Block {
vars = [X, L, Y],
ops = [
read(output = X)
kube_get_workers_addrs(output = L)
Y = tensor_array(len(L))
parallel_for(input = X, output = Y,
attrs = {L, block_id(1)}) # referring to block 1
]
}
block[1] = Block {
parent = 0,
vars = [x, y, index],
ops = [
slice(input = [X, index], output = x) # index is initialized by parallel_for
send(input = x, attrs = L[index])
recv(outputs = y, attrs = L[index])
assign(input = y, output = Y[index])
]
}
}
```
The equivalent Fluid program (calling the Go binding) is:
```go
func main() { //// block 0
X = fluid.read(...)
L = fluid.k8s.get_worker_addrs()
Y = fluid.tensor_array(len(L))
fluid.parallel_for(X, L,
func(index int) { //// block 1
x = X[index]
fluid.send(L[index], x)
y = fluid.recv(L[index])
Y[index] = y
})
}
```
An explanation of the above program:
- `fluid.k8s` is a package that provides access to Kubernetes API.
- `fluid.k8s.get_worker_addrs` returns the list of IP and ports of all pods of the current job except for the current one (the master pod).
- `fluid.tensor_array` creates a [tensor array](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/lod_tensor_array.h). `fluid.parallel_for` creates a `ParallelFor` intrinsic, which, when executed,
1. creates `len(L)` scopes, each for the concurrent running of the sub-block (block 1 in this case), and initializes a variable named "index" in the scope to an integer value in the range `[0, len(L)-1]`, and
2. creates `len(L)` threads by calling into the `ThreadPool` singleton, each thread
1. creates an Executor instance, and
2. calls `Executor.Run(block)`, where `block` is block 1 as explained above.
1. Please be aware that block 1 is a sub-block of block 0, so ops in block 1 could refer to variables defined in block 0.
### The Worker Program
The worker program looks like
```go
func main() {
W = Tensor(...)
x = fluid.listen_and_do(
fluid.k8s.self_addr(),
func(input Tensor) {
output = fluid.mult(input, W)
})
}
```
where
- `fluid.listen_and_do` creates a `ListenAndDo` intrinsic, which, when executed,
1. listens on the current pod's IP address, as returned by `fluid.k8s.self_addr()`,
2. once a connection is established,
1. creates a scope of two parameters, "input" and "output",
2. reads a [Fluid variable](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/variable.h) and saves it into "input",
3. creates an Executor instance and calls `Executor.Run(block)`, where the block is generated by running the lambda specified as the second parameter of `fluid.listen_and_do`.
## Summarization
From the above example, we see that:
1. Fluid enables the imperative programming paradigm by:
1. letting users describe a program, but not a model (a sequence of layers, or a graph of operators), and
2. calling the `fluid.run` function that runs the program implicitly.
1. The program is described as a `ProgramDesc` protobuf message.
2. Function `Executor.Run` takes a block, instead of a `ProgramDesc`, as its parameter.
3. `fluid.run` calls `Executor.Run` to run the first block in the `ProgramDesc` message.
4. `Executor.Run`'s implementation is extremely simple -- it doesn't plan the execution nor create threads; instead, it runs on the current thread and executes intrinsics/operators' `Run` method sequentially as they appear in the `Block.ops` array.
5. Intrinsics/operators' `Run` method might create threads. For example, the `ListenAndDo` operator creates a thread to handle each incoming request.
6. Threads are not necessarily OS threads; instead, they could be [green threads](https://en.wikipedia.org/wiki/Green_threads) managed by ThreadPool. Multiple green threads might run on the same OS thread. An example of green threads is Go's [goroutines](https://tour.golang.org/concurrency/1).
......@@ -52,8 +52,9 @@ The IR for PaddlePaddle after refactoring is called a `Block`, it specifies the
The user can not directly specify the parameter update rule for the parameter server in the Python module, since the parameter server does not use the same computation definition as the trainer. Instead, the update rule is baked inside the parameter server. The user can not specify the update rule explicitly.
This could be fixed by making the parameter server run the same computation definition as the trainer (the user's Python module). For a detailed explanation, refer to this document -
[Design Doc: Operation Graph Based Parameter Server](./parameter_server.md)
This could be fixed by making the parameter server also run an IR, which can be different from the one on the trainer side.
For a detailed explanation, refer to this document -
[Design Doc: Parameter Server](./parameter_server.md)
## Distributed Training Architecture
......@@ -61,68 +62,111 @@ The revamped distributed training architecture can address the above discussed l
<img src="src/distributed_architecture.png"/>
The major components in the architecture are: *PaddlePaddle Python*, *PaddlePaddle converter* and *PaddlePaddle runtime*.
The major components are: *Python API*, *Distribute Transpiler* and *Remote Executor*.
### PaddlePaddle Python
### Python API
PaddlePaddle Python is the Python library that user's Python code invokes, to read the data. build the neural network topology, start training, etc.
Python API is the Python library that user's Python code invokes, to read the data, build the neural network topology, and start training, etc.
```Python
paddle.init()
input = paddle.op.recordIO("/home/data/mnist.recordio") # file stored on the cluster
img, label = input[0], input[1]
hidden = paddle.layer.fc(input=img, size=200, act=paddle.activation.Tanh())
prediction = paddle.layer.fc(input=img, size=10, act=paddle.activation.Softmax())
cost = paddle.layer.classification_cost(input=prediction, label=label)
optimizer = paddle.optimizer.SGD(cost, learning_rate=0.01)
session = paddle.session.NewRemote(num_trainer=3, num_ps=2, GPU_per_trainer=1)
for i in range(1000):
_, cost_val = session.eval(targets=[cost, optimizer])
print cost_val
images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
...
predict = fluid.layers.fc(input=conv_pool_2, size=10, act="softmax")
cost = fluid.layers.cross_entropy(input=predict, label=label)
avg_cost = fluid.layers.mean(x=cost)
optimizer = fluid.optimizer.Adam(learning_rate=0.01)
optimizer.minimize(avg_cost)
train_reader = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.mnist.train(), buf_size=500),
batch_size=BATCH_SIZE)
place = fluid.CPUPlace()
exe = fluid.Executor(place)
for pass_id in range(10):
for data in train_reader():
loss, acc = exe.run(trainer_prog,
feed=feeder.feed(data),
fetch_list=[avg_cost])
```
The above code is what a typical Python trainer code is, the neural network topology is built using the helper functions such as `paddle.layer.fc`. Training is done by calling `session.eval` iteratively.
#### session.eval
As shown in the graph, `session.eval` sends the IR and the evaluation inputs or targets to the PaddlePaddle cluster for evaluation.
The targets can be any variable in the computation graph. When the target is say, the `optimizer` variable, the neural network will be optimized once. When the target is the `cost` variable, `session.eval` returns the cost value. Based on what the target is, an appropriate action is taken.
The Python `session` is a wrapper of the C++ `Session` class. For more information about `Session`, refer to this document - [Design Doc: Session](./session.md).
### PaddlePaddle Converter
The PaddlePaddle converter automatically converts the IR in the request (IR and evaluation inputs/targets) from PaddlePaddle Python to partitioned IRs and dispatches the new IRs and evaluation inputs/targets to different PaddlePaddle runtimes. Below are the steps that are followed :
1. Add a `feed` OP that feeds the eval inputs, and a `fetch` OP that fetches the eval targets to the IR.
2. Extract a new computation (sub)graph with the `feed` and `fetch` OPs as the boundary. The runtime does not need to run the OP that is not dependent on the `fetch` OP.
3. Optimize the computation graph.
4. Place the OPs in the graph onto different devices on different PaddlePaddle runtime according to a placement algorithm and the device constraints specified by the user.
5. Partition the graph according to runtime boundaries and add `send` / `recv` OP pair on the runtime boundaries.
The code above is a typical local training program; the "Training Program" is built using helper functions such as
`fluid.layers.fc`. The training is done by calling `Executor.run`
iteratively.
For more details, the implementation of IR is [Program](../program.md), and `ProgramDesc` is the protobuf type.
[Executor](../executor.md) simply runs the `ProgramDesc`. For local training you generally use
`Executor` to run the program locally. For any kind of distributed training, you can use
`RemoteExecutor` to specify desired distributed training method with some optional arguments.
### Distributed Transpiler
The Distributed Transpiler automatically converts the IR (in protobuf format) to partitioned IRs. Then
the Remote Executor dispatches the new IRs to Remote Executors across the cluster.
Below are the steps that are followed :
1. Users only need to change `Executor` to `RemoteExecutor` to turn a local program into a distributed program.
1. `RemoteExecutor` calls `Distributed Transpiler` to "transpile" user's program to several IRs representing a
distributed training program:
1. Parse configurations from `RemoteExecutor`.
1. Determine the type of distributed program, which can be DataParallelism, ModelParallelism or Streaming.
1. Partition the `ProgramDesc` according to the type and add a `send` / `recv` OP pair on the boundaries. Taking the
DataParallelism type as an example, it removes the optimization operators and adds a `send` OP to the
"trainer" role, then adds the optimization operators to the parameter server role within the `recv` OP.
1. Dispatch the partitioned graph to different `RemoteExecutor` in the cluster.
1. `RemoteExecutor` on each node runs the received `ProgramDesc` until the end.
### RemoteExecutor
As shown in the graph, `RemoteExecutor.run` sends the IR to the cluster for execution.
You can also use the `fetch_list` parameter to interactively fetch variables back to the local machine for
log printing.
The Python `RemoteExecutor` is derived from `Executor` class.
```python
exe = RemoteExecutor(
feed=feeder.feed(data),
fetch_list=[avg_cost],
job_desc=JobDesc(
jobname,
num_trainer,
num_pserver,
cpu_per_trainer,
gpu_per_trainer,
mem_per_trainer,
cpu_per_pserver,
mem_per_pserver
))
for data in train_reader():
loss, acc = exe.run(trainer_prog,
feed=feeder.feed(data),
fetch_list=[avg_cost])
```
6. Dispatch the partitioned graph to different PaddlePaddle runtimes.
The `JobDesc` object describes the resource specification of the distributed job to run in the
cluster environment.
7. PaddlePaddle runtimes with the `fetch` OP reports evaluation results back to the converter, the converter reports the evaluation results back to the PaddlePaddle Python.
<img src="src/remote_executor.png"/>
The output IRs will be cached to optimize the conversion latency.
`RemoteExecutor.run` sends the `ProgramDesc` and
[TrainingJob](https://github.com/PaddlePaddle/cloud/blob/develop/doc/autoscale/README.md#training-job-resource)
to a server in the cluster which executes `RemoteExecutor.listen`. This server is responsible
for starting the final Kubernetes Jobs that run the different roles of the `ProgramDesc`.
#### Placement Algorithm
### Placement Algorithm
Our first implementation will only support "trainer-parameter server" placement: the parameters, initializers, and optimizers are all placed on the PaddlePaddle runtimes with the parameter server role. Everything else will be placed on the PaddlePaddle runtimes with the trainer role. This has the same functionality as the "trainer-parameter server" architecture of PaddlePaddle v0.10.0, but is more generic and flexible.
In the future, a more general placement algorithm should be implemented, which makes placements according to the input IR, and a model of device computation time and device communication time. Model parallelism requires the generic placement algorithm.
### PaddlePaddle Runtime
The PaddlePaddle runtime owns multiple devices (e.g., CPUs, GPUs) and runs the IR. The runtime does not need to do OP placement since it is already done by the converter.
### Local Training Architecture
The local training architecture will be the same as the distributed training architecture, the difference is that everything runs locally, and there is just one PaddlePaddle runtime:
......@@ -132,9 +176,18 @@ The local training architecture will be the same as the distributed training arc
### Training Data
In PaddlePaddle v0.10.0, training data is typically read with a [data reader](../reader/README.md) from Python. This approach is no longer efficient when training in a distributed fashion since the Python process no longer runs on the same node with the trainer processes. The Python reader will need to read from the distributed filesystem (assuming it has the required access) and send to the trainers, doubling the network traffic.
When doing distributed training, the user can still use Python data reader: the training data are sent with `session.eval`. However this should be used for debugging purpose only. The users are encouraged to use the read data OPs.
In PaddlePaddle v0.10.0, training data is typically read
with [data reader](../reader/README.md) from Python. This approach is
no longer efficient when training distributedly since the Python
process no longer runs on the same node with the trainer processes,
the Python reader will need to read from the distributed filesystem
(assuming it has the access) and send to the trainers, doubling the
network traffic.
When doing distributed training, the user can still use Python data
reader: the training data are sent with `Executor.run`. However, this should
be used for debugging purposes only. The users are encouraged to use
the read data OPs.
## References:
......
# Design Doc: Operation Graph Based Parameter Server
# Design Doc: Parameter Server
## Abstract
......@@ -10,7 +10,7 @@ different purposes.
## Background
The previous implementations of the parameter server does not run a
subgraph. parameter initialization, optimizer computation, network
fluid sub-program. Parameter initialization, optimizer computation, network
communication and checkpointing are implemented twice on both the
trainer and the parameter server.
......@@ -23,10 +23,10 @@ server becomes a natural extension.
## Design
### Graph Converter
### Distributed Transpiler
The *graph converter* converts the user-defined operation (OP) graph
into subgraphs to be scheduled on different nodes with the following
The *Distributed Transpiler* converts the user-defined fluid program
into sub-programs to be scheduled on different nodes with the following
steps:
1. OP placement: the OPs will be placed on different nodes according
......@@ -34,7 +34,6 @@ steps:
time. Currently we will use a simple heuristic that puts parameter
variable on parameter server workers and everything else on trainer
workers.
1. Add communication OPs to enable the communication between nodes.
We will need these OPs: *Send*, *Recv*, *Enqueue*, *Dequeue*.
......@@ -48,8 +47,8 @@ After converting:
<img src="src/dist-graph.png" width="700"/>
1. The parameter variable W and it's optimizer subgraph are placed on the parameter server.
1. Operators are added to the subgraphs.
1. The parameter variable W and its optimizer program are placed on the parameter server.
1. Operators are added to the program.
- *Send* sends data to the connected *Recv* operator. The
scheduler on the receive node will only schedule *Recv* operator
to run when the *Send* operator has run (the *Send* OP will mark
......@@ -64,39 +63,30 @@ After converting:
### Benefits
- Model parallelism becomes easier to implement: it's an extension to
the trainer - parameter server approach. we already have the
communication OPs, but need to extend the graph converter's
placement functionality.
the trainer - parameter server approach. We can have several "Transpilers"
to achieve different goals.
- User-defined optimizer is easier to add - user can now express it as
a subgraph.
a sub-program.
- No more duplication logic inside the trainer and the parameter
server mentioned in the background section.
### Challenges
- It might be hard for the graph converter to cut a general graph
(without any hint for which subgraph is the optimizer). We may need
to label which subgraph inside the OP graph is the optimizer.
- It's important to balance the parameter shards on multiple
parameter servers. If a single parameter is very big (some
word-embedding, fully connected, softmax layer), we need to
automatically partition the single parameter onto different
parameter servers when possible (only element-wise optimizer depends
on the parameter variable).
- In the "Async SGD" figure, the "W" variable on the parameter server
could be read and written concurrently. See
[here](https://github.com/PaddlePaddle/Paddle/pull/6394) for more
details about concurrent program in fluid.
### Discussion
- In the "Async SGD" figure, the "W" variable on the parameter server
could be read and written concurrently, what is our locking strategy?
E.g., each variable has a lock cpp method to be invoked by every
OP, or, have a lock OP.
- Can the Enqueue OP be implemented under our current tensor design
(puts the input tensor into the queue tensor)?
- *Dequeue* OP will have variable numbers of output (depends on the
`min_count` attribute), does our current design support it? (similar
question for the *Add* OP)
......
# Error Clip
## Overview
Error clip is widely used in model training to prevent gradient exploding. It takes some specific rules to adjust variables' gradients and prevent them from being too large. With it, values of a gradient will be checked before they are taken by the next `grad_op` and be shrunk if necessary.
## Usage
Users are allowed to assign different error clip methods or attributes to different `Variable`s. Users can specify it as a parameter of `Variable`'s constructor:
```python
var = framework.Variable(..., error_clip=myErrorClip, ...)
```
The default value of `error_clip` is `None`, which means no error clip is employed. When it's not `None`, it should take an object of `BaseErrorClipAttr`'s derived class. So far, `BaseErrorClipAttr` has only one derived class: `ErrorClipByValue`, whose constructor is:
```python
ErrorClipByValue(max, min=None)
```
`max` and `min` represent the maximal and minimal clip threshold respectively. In backward pass, all values of `var`'s gradient greater than `max` or less than `min` will be clipped to `max` and `min` respectively. When the `min` is None, the minimal threshold will be assigned with `-max` automatically.
So we can enable the error clip with threshold `[-5.0, 5.0]` for variable `var` by:
```python
var = framework.Variable(..., error_clip=ErrorClipByValue(max=5.0), ...)
```
## Implementation
The `BaseErrorClipAttr` and its derived class `ErrorClipByValue` are defined in *clip.py*.
```python
class BaseErrorClipAttr(object):
def append_clip_op(self, block, grad_name):
raise NotImplementedError()
class ErrorClipByValue(BaseErrorClipAttr):
def __init__(self, max, min=None):
max = float(max)
if min is None:
min = -max
else:
min = float(min)
self.max = max
self.min = min
def append_clip_op(self, block, grad_name):
block.append_op(
type="clip",
inputs={"X": grad_name},
outputs={"Out": grad_name},
attrs={"min": self.min,
"max": self.max})
```
The `BaseErrorClipAttr` has one main member function: `append_clip_op(self, block, grad_name)`.
This function is used to create a `clip_op` and append it to the end of the given `block`. Because different error clip algorithms require different `clip_op`s, the function is defined as virtual in the base class. All derived classes must implement their own versions of this function.
These `clip_op`s should be inserted after `grad_op`s whose output gradients need to be clipped. It is equivalent to appending some `clip_op`s to the end of the target block every time a new `grad_op` is added.
```python
for op_desc in grad_op_descs:
new_op_desc = target_block.desc.append_op()
new_op_desc.copy_from(op_desc)
callback(block=target_block, context=grad_to_var)
```
Here we employ a callback function to do this kind of job. In the `_append_backward_ops_` function, each time after a `grad_op` is added to the `target_block`, a callback function is invoked. The logic of `clip_op` appending can be implemented inside the callback function.
The callback function for `clip_op` appending is defined in *clip.py*:
```python
def error_clip_callback(block, context):
# the context is a grad_to_var map
grad_to_var = context
op_desc = block.desc.op(block.desc.op_size() - 1)
for grad_n in filter(lambda n: grad_to_var.has_key(n),
op_desc.output_arg_names()):
fwd_var = block.var_recursive(grad_to_var[grad_n])
error_clip = getattr(fwd_var, "error_clip", None)
if error_clip is not None:
error_clip.append_clip_op(block, grad_n)
```
This function takes a `block` and a `context`(which is actually a grad\_to\_var map) as inputs. It checks each output of the last `OpDesc` in the `block`. Notice that the last `OpDesc` of the `block` must be a `grad_op` and its outputs must be some forward variables' gradients. If an output gradient's corresponding forward variable has an attribute of `error_clip`, `error_clip_callback` will call the `error_clip`'s `append_clip_op` function to append the required `clip_op` into the `block`.
# Memory Optimization
## Problem
In a lecture, Andrew Ng attributes the recent success of AI to a combination of these:
- availability of Big Data
- supercomputing power to process this Big Data over very large neural networks
- modern algorithms
Following graph shows the details:
![](images/deep_learning.png)
A larger model usually brings better performance. However, GPU memory is limited. For example, the memory size of a GTX TITAN X is only 12GB. To train complex and large models, we have to take care of memory usage. Besides, memory optimization is also necessary for both online and mobile inference.
## Solution
### Basic Strategy
There are some basic strategies to make memory optimization, including in-place operation and memory sharing.
#### In-place Operation
In a relu activation operator:
$y = \max(x, 0)$
If the variable x is not used in any other operator, we can make an in-place operation. In other words, the memory block of variable y and variable x are the same. In-place operation will save 50% memory occupancy immediately.
#### Memory Sharing
Not all operators support in-place operations. Memory sharing is a more general strategy.
Following is an example:
```
a = op1(b, c);
d = op2(a)
e = op3(d, f)
```
In this case, variable a is no longer used, and op2 does not support in-place operation. After op2 finished, we can put the memory of variable a to a memory pool. Then, variable e can share the memory of variable a from the pool.
### Live Variable Analysis
It's not enough to only have some basic strategies. The prerequisite of memory optimization is to know if a variable is still "live" after an operation.
In our design, the neural network topology is defined as a program. Luckily, [live variable analysis](https://en.wikipedia.org/wiki/Live_variable_analysis) is a classic problem in compilers which can be used in many stages, such as register allocation.
In compilers, the front end of the compilers translates programs into an intermediate language with an unbounded number of temporaries. This program must run on a machine with a bounded number of registers. Two temporaries a and b can fit into the same register, if a and b are never "in use" at the same time. Thus, many temporaries can fit in few registers; if they don't all fit, the excess temporaries can be kept in memory.
Therefore, the compiler needs to analyze the intermediate-representation program to determine which temporaries are in use at the same time. We say a variable is "live" if it holds a value that may be needed in the future, so this analysis is called liveness analysis.
We can learn these techniques from compilers. There are mainly two stages in live variable analysis:
- construct a control flow graph
- solve the dataflow equations
#### Control Flow Graph
To perform analyses on a program, it is often useful to make a control flow graph. A [control flow graph](https://en.wikipedia.org/wiki/Control_flow_graph) (CFG) in computer science is a representation, using graph notation, of all paths that might be traversed through a program during its execution. Each statement in the program is a node in the flow graph; if statement x can be followed by statement y, there is an edge from x to y.
Following is the flow graph for a simple loop.
![](images/control_flow_graph.png)
#### Dataflow Analysis
Liveness of variables "flows" around the edges of the control flow graph; determining the live range of each variable is an example of a dataflow problem. [Dataflow analysis](https://en.wikipedia.org/wiki/Data-flow_analysis) is a technique for gathering information about the possible set of values calculated at various points in a computer program.
A simple way to perform data-flow analysis of programs is to set up dataflow equations for each node of the control flow graph and solve them by repeatedly calculating the output from the input locally at each node until the whole system stabilizes.
- Flow Graph Terminology
A flow graph node has out-edges that lead to successor nodes, and in-edges that come from predecessor nodes. The set *pred[n]* is all the predecessors of node n, and *succ[n]* is the set of successors.
In former control flow graph, the out-edges of node 5 are 5 --> 6 and 5 --> 2, and *succ[5]* = {2, 6}. The in-edges of 2 are 5 --> 2 and 1 --> 2, and *pred[2]* = {1, 5}.
- Uses and Defs
An assignment to a variable or temporary defines that variable. An occurrence of a variable on the right-hand side of an assignment (or in other expressions) uses the variable. We can speak of the *def* of a variable as the set of graph nodes that define it, or the *def* of a graph node as the set of variables that it defines; and similarly for the *use* of a variable or graph node. In the former control flow graph, *def(3)* = {c}, *use(3)* = {b, c}.
- Liveness
A variable is *live* on an edge if there is a directed path from that edge to a *use* of the variable that does not go through any *def*. A variable is *live-in* at a node if it is live on any of the in-edges of that node; it is *live-out* at a node if it is live on any of the out-edges of the node.
The calculation of liveness can be solved by iteration until a fixed point is reached. Following is the recursive formula:
![](images/dataflow_equations.png)
### Memory optimization transpiler
Finally, we combine the basic strategies with the liveness analysis techniques learned from compilers to implement our memory optimization transpiler.
#### add in-place attribute
In-place is a built-in attribute of an operator. Since we treat in-place and other operators differently, we have to add an in-place attribute for every operator.
#### construct control flow graph
Following is the ProgramDesc protobuf of [machine translation](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/fluid/tests/book/test_machine_translation.py) example.
- Block0:
```
lookup_table
mul
...
while(sub-block idx 1)
...
array_to_lod_tensor
cross_entropy
...
while_grad(sub-block idx 2)
read_from_array
array_to_lod_tensor
...
```
- Block1
```
read_from_array
read_from_array
...
write_to_array
increment
write_to_array
less_than
```
- Block2
```
read_from_array
increment
...
write_to_array
write_to_array
```
We can transfer all the operators and variables in ProgramDesc to build a control flow graph.
```python
class ControlFlowGraph(object):
def __init__(self, Program):
self._sucessors = defaultdict(set)
self._presucessors = defaultdict(set)
self._uses = defaultdict(set)
self._defs = defaultdict(set)
self._live_in = defaultdict(set)
self._live_out = defaultdict(set)
self._program = Program
def build(self):
pass
def dataflow_analysis(self):
pass
def memory_optimization(self):
pass
def get_program(self):
return self._program
```
#### make dataflow analysis
We follow guide from compilers and try to solve the dataflow equation to get liveness of every variable. If the live-in of an operator node is different from the live-out, then we can make memory sharing.
For example:
```
a = op1(b, c);
d = op2(a)
e = op3(d, f)
```
The dataflow analysis result is:
```
live_in(op1) = {b, c, f}
live_out(op1) = {a, f}
live_in(op2) = {a, f}
live_out(op2) = {d, f}
live_in(op3) = {d, f}
live_out(op3) = {}
```
After op1, we can process variable b and variable c; After op2, we can process variable a. After op3, we can process variable d and variable f.
#### memory sharing policy
A memory pool will be maintained in the stage of memory optimization. Each operator node will be scanned to determine whether memory optimization can be done. If an operator satisfies the requirement, the following policy will be taken to handle input/output variables.
```
if op.support_inplace():
i --> pool
pool --> o
else:
pool --> o
i --> pool
```
## Reference
- [Lecture Notes From Artificial Intelligence Is The New Electricity By Andrew Ng](https://manavsehgal.com/lecture-notes-from-artificial-intelligence-is-the-new-electricity-by-andrew-ng-4712dcbf26e5)
- Modern compiler implementation in ML, by Andrew W. Appel
- [Optimizing Memory Consumption in Deep learning](https://mxnet.incubator.apache.org/architecture/note_memory.html)
## Introduction
There are many performance analysis tools for [different programming languages and different software frameworks](https://en.wikipedia.org/wiki/List_of_performance_analysis_tools). Most popular deep learning frameworks use several programming languages and adapt to heterogeneous platforms. Like most of them, PaddlePaddle uses C++, CUDA and Python as its basic programming languages so that it can run on CPU and GPU devices. The [`nvprof` tool](http://docs.nvidia.com/cuda/profiler-users-guide/index.html#nvprof-overview) is usually used to analyse CUDA programs. We have [a document](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/howto/optimization/cpu_profiling.md) on profiling CPU and Python programs with [yep](https://pypi.python.org/pypi/yep) and [Google's perftools](https://github.com/google/pprof), but these tools profile only the CPU and Python parts of a program. For [PaddlePaddle fluid](https://github.com/PaddlePaddle/Paddle/blob/develop/doc/design/fluid.md), however, the operator is the basic computing unit, and developers usually want to collect the time of each operator and locate bottlenecks. `nvprof` usually collects the timeline of CUDA-related activities on both CPU and GPU, including kernel execution, memory transfers, memory set and CUDA API calls, as well as events or metrics for CUDA kernels. `yep` and `Google's perftools` can't collect the timeline of CUDA programs. None of these tools can collect time at the operator level, so we designed this profiling tool.
## Architecture
The work flow for most tasks is as follows. Each operator will run many times across all the iterations, so the profiler must collect the total time of each operator during the iterations. Moreover, developers may sometimes want to collect a more detailed time span inside an operator or record a time span elsewhere; this requires the profiler to support recording nested time spans. In order to speed up training, all deep learning frameworks support parallel computing, including multiple threads on CPU and multiple GPUs, so the profiler must be able to collect the timeline for each thread. In addition, the profiler itself occupies certain resources, so it must be easy for developers to enable or disable. Finally, the profiler should present a human-readable report.
```python
for i in xrange(M): # M is the iteration number
for op in operator_lists: # The `operator_lists` contains all the operators in the network.
op.run();
```
In summary, the profiler should have the following features:
- records time span in loop.
- supports nested time span.
- supports multiple threads/multiple GPUs.
- supports to be enabled and disabled by users.
But how do we record the time for a mixed C++ and CUDA program? There are many C++ APIs to get the current calendar time in the host program. But for the GPU, CUDA kernels may be executed concurrently if they are in different [streams](http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#streams), and CUDA kernels are asynchronous with respect to the host program if there is no synchronization after the kernels. CUDA provides [events](http://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#events) to monitor the device and perform accurate timing. Inspired by PyTorch and CUDA events, we also design and apply events to record the timeline, then summarize and present statistics based on these events.
The overall flow is shown as the following figure.
<img src="./images/profiler.png" align="center"/><br/>
### Event
In the above work flow, a pair of events is needed before and after the piece of code whose time is to be collected. So each event has a flag to mark whether it is a starting event or an ending event. Besides these two kinds of events, sometimes only a marker with a text message is needed, for example, a marker to specify where profiling starts or ends. There are three kinds of events:
```c++
enum EventKind {
kMark,
kPushRange,
kPopRange};
```
- kMark: only a marker without time range.
- kPushRange: mark the starting event for time range.
- kPopRange: mark the ending event for time range.
For the CPU code, the events only need to record the current time. For the CUDA code, the [event management functions of CUDA](http://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__EVENT.html#group__CUDART__EVENT) are used. For many pieces of code, an event list is used to record each piece.
```c++
class Event {
public:
// The DeviceContext is used to get current CUDA stream.
Event(EventKind kind, std::string name, uint32_t thread_id,
const platform::DeviceContext* dev_ctx = nullptr);
double CpuElapsedUs(const Event& e) const;
double CudaElapsedUs(const Event& e) const;
private:
EventKind kind_;
std::string name_;
uint32_t thread_id_;
int64_t cpu_ns_;
#ifdef PADDLE_WITH_CUDA
cudaEvent_t event_ = nullptr;
int device_ = -1;
#endif
};
struct EventList {
std::forward_list<std::vector<Event>> event_blocks;
};
```
As mentioned above, there is no need to record the timeline when disabling the profiler. So there is a global state to enable or disable the profiler.
```c++
enum ProfilerState {
kDisabled,
kCPU,
kCUDA
};
ProfilerState g_state;
```
- kDisabled: the disabled state.
- kCPU: CPU profiling state.
- kCUDA: GPU profiling state.
A pair of starting and ending events are pushed to event lists in constructor and destructor of `RecordEvent`. So the timeline is recorded for the code in the lifecycle of an object of `RecordEvent`.
```c++
struct RecordEvent {
explicit RecordEvent(const std::string name,
platform::DeviceContext* dev_ctx = nullptr) {
if (kState == ProfilerState::kDisabled) return;
// push the starting event to the event lists.
}
~RecordEvent() {
if (kState == ProfilerState::kDisabled) return;
// push the ending event to the event lists.
}
};
```
# Design Doc: Session
## Abstract
The *session* object encapsulates the environment in which the
computation graph is executed.
We will have the *local* session and *remote* session, they offer the
same [interface](#interface). The local session encapsulates the local
runtime environment and the remote session encapsulates the cluster
runtime environment.
The local runtime environment contains:
1. computation devices (i.e., CPU, GPU) handles, and
1. the [scope](../scope.md) which holds all variables.
The remote runtime environment contains:
1. computation devices (i.e., CPU and GPU on node 0, 1) in a cluster,
and
1. the distributed [scope](../scope.md) in a cluster which holds all
variables.
The user can create a remote session on Paddle Cloud and evaluate the
computation graph with it. In this way, the user can control the
remote computation resource in a cluster from his local computer.
## Background
The current design has an implicit global session in which
`paddle.eval()` is executed. The pain point is:
Since the user is not able to explicitly switch between runtime
environments, the user cannot run a topology in two independent
environments.
For example, in reinforcement learning, the user may want to have a
stale model for inference and a fresh model for training, and only
replace the stale model with the fresh model periodically.
Furthermore, we have no concept that encapsulates a remote environment
that executes a computation graph.
We need the session object to address the above issues.
## Session
A session is an object that owns the runtime environment. All
computations are executed through `session.eval()`.
### Interface
```python
eval(
targets,
feed_dict=None,
)
```
Evaluates the target Operations or Variables in `targets`.
- *targets*: the evaluation targets. Can be a single Operation or
Variable, or a list with the Operations or Variables as
elements. The value returned by `eval()` has the same shape as the
`targets` argument.
The PaddlePaddle program is represented by
the [ProgramDesc](../design/program.md), `eval()` will infer the
ProgramDesc from the given targets and run the PaddlePaddle
program. Please
see
[this graph](./distributed_architecture.md#local-training-architecture) for
the detailed illustration for the local session
and
[this graph](./distributed_architecture.md#distributed-training-architecture) for
the detailed illustration for the remote session.
- *feed_dict*: a dictionary that contains the tensors which override
the edges of the computation graph.
feed_dict not only can provide the input data, it can override any
OP's input as well:
```python
a = pd.constant(2.0, name="a")
b = pd.variable(name="b")
c = pd.mul(a,b)
sess.eval(targets=c, feed_dict={"b":3.0}) # returns 6.0
```
```python
close()
```
Closes the session and releases the scope that the session owns.
### Create a Local Session
```python
session(
devices=None
)
```
Creates a new session. One session owns one global scope, so creating
multiple sessions will create different scopes.
- *devices*: a single `string` or a list of `string` of device names,
the corresponding devices will be the computation devices for
`eval()`. If not specified, all available devices (e.g., all GPUs)
will be used. The user doesn't need to specify the CPU device since
it will be always used. Multiple sessions can use the same device.
#### Example
```Python
a = paddle.constant(1.0)
b = paddle.constant(2.0)
c = a + b
sess = paddle.session(devices=["gpu:0", "gpu:1", "fpga:0"])
sess.eval(c)
sess.close()
```
### Create a Remote Session
```python
create_cloud_job(
name,
num_trainer,
mem_per_trainer,
gpu_per_trainer,
cpu_per_trainer,
num_ps,
mem_per_ps,
cpu_per_ps,
)
```
Creates a Paddle Cloud job. Fails if the job name exists.
```python
get_cloud_job(
name
)
```
Gets a Paddle Cloud job.
```python
remote_session(
job
)
```
- *job*: the Paddle Cloud job.
#### Example
```Python
# Build the network: read (image, label) records and stack two FC layers.
reader = paddle.reader.recordio("/pfs/home/peter/mnist-train-*") # data stored on Paddle Cloud
image = reader.column(0)
label = reader.column(1)
fc1 = paddle.op.fc(image, size=256, act="sigmoid")
fc2 = paddle.op.fc(fc1, size=10, act="softmax")
cost = paddle.op.cross_entropy(fc2, label)
opt = paddle.optimizer.sgd(cost)

# Create the cloud job, then open a remote session bound to it.
job = paddle.create_cloud_job("test", 3, "1G", 1, 1, 2, "1G", 1)
sess = paddle.remote_session(job)
for i in range(1000):
  sess.eval(opt)
sess.close()
```
......@@ -7,11 +7,9 @@ PaddlePaddle每次发新的版本,遵循以下流程:
1. 从`develop`分支派生出新的分支,分支名为`release/版本号`。例如,`release/0.10.0`
1. 将新分支的版本打上tag,tag为`版本号rc.Patch号`。第一个tag为`0.10.0rc1`,第二个为`0.10.0rc2`,依次类推。
1. 对这个版本的提交,做如下几个操作:
* 使用Regression Test List作为检查列表,测试本次release的正确性。
* 如果失败,记录下所有失败的例子,在这个`release/版本号`分支中,修复所有bug后,Patch号加一,到第二步
* 修改`python/setup.py.in`中的版本信息,并将`istaged`字段设为`True`
* 编译这个版本的Docker发行镜像,发布到dockerhub。如果失败,修复Docker编译镜像问题,Patch号加一,返回第二步
* 编译这个版本的Ubuntu Deb包。如果失败,修复Ubuntu Deb包编译问题,Patch号加一,返回第二步。
* 使用Regression Test List作为检查列表,测试Docker镜像/ubuntu安装包的功能正确性
* 如果失败,记录下所有失败的例子,在这个`release/版本号`分支中,修复所有bug后,Patch号加一,返回第二步
* 编译这个版本的python wheel包,并发布到pypi。
* 由于pypi.python.org目前遵循[严格的命名规范PEP 513](https://www.python.org/dev/peps/pep-0513),在使用twine上传之前,需要重命名wheel包中platform相关的后缀,比如将`linux_x86_64`修改成`manylinux1_x86_64`
* pypi上的package名称为paddlepaddle和paddlepaddle_gpu,如果要上传GPU版本的包,需要修改build/python/setup.py中,name: "paddlepaddle_gpu"并重新打包wheel包:`python setup.py bdist_wheel`
......@@ -21,8 +19,8 @@ PaddlePaddle每次发新的版本,遵循以下流程:
pip install twine
twine upload dist/[package to upload]
```
* 编译这个版本的Docker发行镜像,发布到dockerhub。如果失败,修复Docker编译镜像问题,Patch号加一,返回第二步
1. 第三步完成后,将`release/版本号`分支合入master分支,并删除`release/版本号`分支。将master分支的合入commit打上tag,tag为`版本号`。同时再将`master`分支合入`develop`分支。最后删除`release/版本号`分支。
1. 编译master分支的Docker发行镜像,发布到dockerhub。编译ubuntu的deb包,发布到github release页面
1. 协同完成Release Note的书写
......@@ -31,6 +29,30 @@ PaddlePaddle每次发新的版本,遵循以下流程:
* `release/版本号`分支一旦建立,一般不允许再从`develop`分支合入`release/版本号`。这样保证`release/版本号`分支功能的封闭,方便测试人员测试PaddlePaddle的行为。
* 当`release/版本号`分支存在的时候,如果有bugfix的行为,需要将bugfix的分支同时merge到`master`, `develop`和`release/版本号`这三个分支。
## 发布wheel包到pypi
使用[PaddlePaddle CI](https://paddleci.ngrok.io/project.html?projectId=Manylinux1&tab=projectOverview)
完成自动化二进制编译,参考下图,选择需要发布的版本(通常包含一个CPU版本和一个GPU版本),点击"run"右侧的"..."按钮,可以
弹出下面的选择框,在第二个tab (Changes)里选择需要发布的分支,这里选择0.11.0,然后点击"Run Build"按钮。等待编译完成后
可以在此页面的"Artifacts"下拉框中找到生成的3个二进制文件,分别对应CAPI,`cp27m`和`cp27mu`的版本。然后按照上述的方法
使用`twine`工具上传即可。
<img src="ci_build_whl.png">
* 注:CI环境使用 https://github.com/PaddlePaddle/buildtools 这里的DockerImage作为编译环境以支持更多的Linux
发行版,如果需要手动编译,也可以使用这些镜像。这些镜像也可以从 https://hub.docker.com/r/paddlepaddle/paddle_manylinux_devel/tags/ 下载得到。
* pypi不支持覆盖上传,所以一个版本号的wheel包发布之后,不可以更改。下一个wheel包需要更新版本号才可以上传。
## 发布Docker镜像
上述PaddlePaddle CI编译wheel完成后会自动将Docker镜像push到DockerHub,所以,发布Docker镜像只需要对自动push的镜像打上
版本号对应的tag即可:
1. 进入 https://hub.docker.com/r/paddlepaddle/paddle/tags/ 查看latest tag的更新时间是否在上述编译wheel包完成之后。
1. 执行 `docker pull paddlepaddle/paddle:[latest tag]`,latest tag可以是latest或latest-gpu等。
1. 执行 `docker tag paddlepaddle/paddle:[latest tag] paddlepaddle/paddle:[version]`
1. 执行 `docker push paddlepaddle/paddle:[version]`
## PaddlePaddle 分支规范
PaddlePaddle开发过程使用[git-flow](http://nvie.com/posts/a-successful-git-branching-model/)分支规范,并适应github的特性做了一些区别。
......
......@@ -48,8 +48,8 @@ Fluid uses class [DeviceContext](https://github.com/PaddlePaddle/Paddle/blob/dev
```
/-> CPUDeviceContext --> MKLDeviceContext
DeviceContext ----> CUDADeviceContext --> CUDNNDeviceContext
/-> CPUDeviceContext
DeviceContext ----> CUDADeviceContext
\-> FPGADeviceContext
```
......@@ -79,16 +79,6 @@ private:
};
```
- CUDNNDeviceContext
```
class CUDNNDeviceContext : public CUDADeviceContext {
private:
cudnnHandle_t cudnn_handle_;
};
```
### Memory and Tensor
......
......@@ -15,7 +15,7 @@
获取PaddlePaddle的Docker镜像
------------------------------
执行下面的命令获取最新的PaddlePaddle Docker镜像
执行下面的命令获取最新的PaddlePaddle Docker镜像,版本为cpu_avx_mkl:
.. code-block:: bash
......@@ -27,7 +27,7 @@
docker pull docker.paddlepaddle.org/paddle
下载GPU版本的Docker镜像:
下载GPU版本(cuda8.0_cudnn5_avx_mkl)的Docker镜像:
.. code-block:: bash
......@@ -54,7 +54,7 @@
.. _docker_run:
在Docker中执行PaddlePaddle训练程序
------------------------------
----------------------------------
假设您已经在当前目录(比如在/home/work)编写了一个PaddlePaddle的程序 :code:`train.py` (可以参考
`PaddlePaddleBook <http://www.paddlepaddle.org/docs/develop/book/01.fit_a_line/index.cn.html>`_
......@@ -82,7 +82,7 @@
.. _docker_run_book:
使用Docker启动PaddlePaddle Book教程
------------------------------
-----------------------------------
使用Docker可以快速在本地启动一个包含了PaddlePaddle官方Book教程的Jupyter Notebook,可以通过网页浏览。
PaddlePaddle Book是为用户和开发者制作的一个交互式的Jupyter Notebook。
......
......@@ -16,7 +16,7 @@ After you've read above tutorials you may proceed the following steps.
Pull PaddlePaddle Docker Image
------------------------------
Run the following command to download the latest Docker images:
Run the following command to download the latest Docker images, the version is cpu_avx_mkl:
.. code-block:: bash
......@@ -28,7 +28,7 @@ For users in China, we provide a faster mirror:
docker pull docker.paddlepaddle.org/paddle
Download GPU version images:
Download GPU version (cuda8.0_cudnn5_avx_mkl) images:
.. code-block:: bash
......@@ -58,7 +58,7 @@ and run:
.. _docker_run:
Launch your training program in Docker
------------------------------
--------------------------------------
Assume that you have already written a PaddlePaddle program
named :code:`train.py` under directory :code:`/home/work` (refer to
......
......@@ -11,14 +11,14 @@ PaddlePaddle可以使用常用的Python包管理工具
------------------------------
执行下面的命令即可在当前机器上安装PaddlePaddle的运行时环境,并自动下载安装依赖软件。
执行下面的命令即可在当前机器上安装PaddlePaddle的运行时环境,并自动下载安装依赖软件,版本为cpu_avx_openblas
.. code-block:: bash
pip install paddlepaddle
如果需要安装支持GPU的版本,需要执行:
如果需要安装支持GPU的版本(cuda7.5_cudnn5_avx_openblas),需要执行:
.. code-block:: bash
......
......@@ -12,14 +12,14 @@ Install Using pip
------------------------------
Run the following command to install PaddlePaddle on the current
machine, it will also download requirements.
machine, it will also download requirements, the version is cpu_avx_openblas.
.. code-block:: bash
pip install paddlepaddle
If you wish to install GPU version, just run:
If you wish to install GPU version (cuda7.5_cudnn5_avx_openblas), just run:
.. code-block:: bash
......
......@@ -7,13 +7,13 @@
++++++++
PaddlePaddle支持使用pip快速安装,目前支持CentOS 6以上, Ubuntu 14.04以及MacOS 10.12,并安装有Python2.7。
执行下面的命令完成快速安装:
执行下面的命令完成快速安装,版本为cpu_avx_openblas
.. code-block:: bash
pip install paddlepaddle
如果需要安装支持GPU的版本,需要执行:
如果需要安装支持GPU的版本(cuda7.5_cudnn5_avx_openblas),需要执行:
.. code-block:: bash
......
......@@ -8,13 +8,13 @@ Quick Install
You can use pip to install PaddlePaddle with a single command, supports
CentOS 6 above, Ubuntu 14.04 above or MacOS 10.12, with Python 2.7 installed.
Simply run the following command to install:
Simply run the following command to install, the version is cpu_avx_openblas:
.. code-block:: bash
pip install paddlepaddle
If you need to install GPU version, run:
If you need to install GPU version (cuda7.5_cudnn5_avx_openblas), run:
.. code-block:: bash
......
......@@ -51,7 +51,7 @@ $ stdbuf -oL /usr/bin/nohup paddle pserver --port=7164 --ports_num=1 --ports_num
- port:**必选,默认7164**,pserver监听的起始端口,根据ports_num决定总端口个数,从起始端口监听多个端口用于通信
- ports_num:**必选,默认1**,监听的端口个数
- ports_num_for_sparse:**必选,默认1**,用于稀疏类型参数通信的端口个数
- ports_num_for_sparse:**必选,默认0**,用于稀疏类型参数通信的端口个数
- num_gradient_servers:**必选,默认1**,当前训练任务pserver总数
### 启动计算节点
......@@ -60,7 +60,7 @@ $ stdbuf -oL /usr/bin/nohup paddle pserver --port=7164 --ports_num=1 --ports_num
$ python train.py
```
trainer需要和pserver保持网络联通以完成训练。trainer启动需要传入端口、pserver地址等参数使trainer可以正确连接到pserver。这些参数可以通过环境变量(https://zh.wikipedia.org/wiki/环境变量 )或编写程序时`paddle.init()`中传入参数。如果同时使用`paddle.init()`参数和环境变量,将会优先使用`paddle.init()`中传入的参数。
trainer需要和pserver保持网络联通以完成训练。trainer启动需要传入端口、pserver地址等参数使trainer可以正确连接到pserver。这些参数可以通过[环境变量](https://zh.wikipedia.org/wiki/环境变量)或编写程序时`paddle.init()`中传入参数。如果同时使用`paddle.init()`参数和环境变量,将会优先使用`paddle.init()`中传入的参数。
使用环境变量:
......@@ -95,7 +95,7 @@ paddle.init(
- trainer_count:**必选,默认1**,当前训练任务trainer总个数
- port:**必选,默认7164**,连接到pserver的端口
- ports_num:**必选,默认1**,连接到pserver的端口个数
- ports_num_for_sparse:**必选,默认1**,和pserver之间用于稀疏类型参数通信的端口个数
- ports_num_for_sparse:**必选,默认0**,和pserver之间用于稀疏类型参数通信的端口个数
- num_gradient_servers:**必选,默认1**,当前训练任务pserver总数
- trainer_id:**必选,默认0**,每个trainer的唯一ID,从0开始的整数
- pservers:**必选,默认127.0.0.1**,当前训练任务启动的pserver的IP列表,多个IP使用“,”隔开
......
......@@ -52,7 +52,7 @@ Parameter Description
- port: **required, default 7164**, port which parameter server will listen on. If ports_num greater than 1, parameter server will listen on multiple ports for more network throughput.
- ports_num: **required, default 1**, total number of ports will listen on.
- ports_num_for_sparse: **required, default 1**, number of ports which serves sparse parameter update.
- ports_num_for_sparse: **required, default 0**, number of ports which serves sparse parameter update.
- num_gradient_servers: **required, default 1**, total number of gradient servers.
### Starting trainer
......@@ -98,7 +98,7 @@ Parameter Description
- trainer_count: **required, default 1**, total count of trainers in the training job.
- port: **required, default 7164**, port to connect to parameter server.
- ports_num: **required, default 1**, number of ports for communication.
- ports_num_for_sparse: **required, default 1**, number of ports for sparse type calculation.
- ports_num_for_sparse: **required, default 0**, number of ports for sparse type calculation.
- num_gradient_servers: **required, default 1**, total number of gradient server.
- trainer_id: **required, default 0**, ID for every trainer, start from 0.
- pservers: **required, default 127.0.0.1**, list of IPs of parameter servers, separated by ",".
......
# Android平台编译指南
用户可通过如下两种方式,交叉编译Android平台上适用的PaddlePaddle库:
- 基于Docker容器的编译方式
- 基于Linux交叉编译环境的编译方式
- [基于Docker容器的编译方式](#基于docker容器的编译方式)
- [基于Linux交叉编译环境的编译方式](#基于linux交叉编译环境的编译方式)
## 基于Docker容器的编译方式
Docker能在所有主要操作系统(包括Linux,Mac OS X和Windows)上运行,因此,使用基于Docker容器的编译方式,用户可在自己熟悉的开发平台上编译Android平台上适用的PaddlePaddle库。
......@@ -16,6 +17,12 @@ $ cd Paddle
$ docker build -t username/paddle-android:dev . -f Dockerfile.android
```
用户也可以使用PaddlePaddle提供的官方开发镜像:
```bash
$ docker pull paddlepaddle/paddle:latest-dev-android
```
### 编译PaddlePaddle C-API库
构建好开发镜像后,即可使用开发镜像来编译Android版PaddlePaddle C-API库。
Android的Docker开发镜像向用户提供两个可配置的参数:
......@@ -41,23 +48,25 @@ Android的Docker开发镜像向用户提供两个可配置的参数:
</tr>
<tr class="row-odd">
<td>ANDROID_API</td>
<td>>= 21</td>
<td>>= 16</td>
<td>21</td>
</tr>
</tbody>
</table>
- 编译`armeabi-v7a``Android API 21`的PaddlePaddle库
```bash
$ docker run -it --rm -v $PWD:/paddle -e "ANDROID_ABI=armeabi-v7a" -e "ANDROID_API=21" username/paddle-android:dev
```
- 编译`arm64-v8a``Android API 21`的PaddlePaddle库
```bash
$ docker run -it --rm -v $PWD:/paddle -e "ANDROID_ABI=arm64-v8a" -e "ANDROID_API=21" username/paddle-android:dev
```
执行上述`docker run`命令时,容器默认执行[paddle/scripts/docker/build_android.sh](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build_android.sh)脚本。该脚本中记录了交叉编译Android版PaddlePaddle库常用的CMake配置,并且会根据`ANDROID_ABI``ANDROID_API`自动构建独立工具链、进行编译和安装。由于arm64架构要求Android API不小于21。因此当`ANDROID_ABI=arm64-v8a``ANDROID_API<21`时,Docker容器中将默认使用`Android API 21`的编译工具链。用户可以参考下文**配置交叉编译参数**章节,根据个人的需求修改定制Docker容器所执行的脚本。编译安装结束之后,PaddlePaddle的C-API库将被安装到`$PWD/install_android`目录,所依赖的第三方库同时也被安装到`$PWD/install_android/third_party`目录。
执行上述`docker run`命令时,容器默认执行[paddle/scripts/docker/build_android.sh](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build_android.sh)脚本。该脚本中记录了交叉编译Android版PaddlePaddle库常用的CMake配置,并且会根据`ANDROID_ABI``ANDROID_API`自动构建独立工具链、进行编译和安装。由于arm64架构要求Android API不小于21。因此当`ANDROID_ABI=arm64-v8a``ANDROID_API<21`时,Docker容器中将默认使用`Android API 21`的编译工具链。用户可以参考下文[配置交叉编译参数](#配置交叉编译参数)章节,根据个人的需求修改定制Docker容器所执行的脚本。编译安装结束之后,PaddlePaddle的C-API库将被安装到`$PWD/install_android`目录,所依赖的第三方库同时也被安装到`$PWD/install_android/third_party`目录。
## 基于Linux交叉编译环境的编译方式
本文档将以Linux x86-64平台为例,介绍交叉编译Android平台上适用的PaddlePaddle库的方法和步骤。
......@@ -83,6 +92,7 @@ your/path/to/android-ndk-r14b-linux-x86_64/build/tools/make-standalone-toolchain
此命令将在`your/path/to/arm_standalone_toolchain`目录生成一套独立编译工具链,面向架构为32位ARM架构,支持的最小的Android API级别为21,支持编译器`arm-linux-androideabi-gcc (GCC) 4.9``clang 3.8`
- 构建`arm64-v8a``Android API 21`的独立工具链:
```bash
your/path/to/android-ndk-r14b-linux-x86_64/build/tools/make-standalone-toolchain.sh \
--arch=arm64 --platform=android-21 --install-dir=your/path/to/arm64_standalone_toolchain
......@@ -90,14 +100,12 @@ your/path/to/android-ndk-r14b-linux-x86_64/build/tools/make-standalone-toolchain
此命令将在`your/path/to/arm64_standalone_toolchain`目录生成一套独立编译工具链,面向架构为64位ARM64架构,支持的最小Android API级别为21,支持编译器`arm-linux-androideabi-gcc (GCC) 4.9``clang 3.8`
注意:**PaddlePaddle要求使用的编译工具链所支持的Android API级别不小于21**
### 配置交叉编译参数
CMake系统对交叉编译提供了支持[cmake-toolchains](https://cmake.org/cmake/help/v3.0/manual/cmake-toolchains.7.html#cross-compiling)。为了简化cmake配置,PaddlePaddle为交叉编译提供了工具链配置文档[cmake/cross_compiling/android.cmake](https://github.com/PaddlePaddle/Paddle/blob/develop/cmake/cross_compiling/android.cmake),以提供一些默认的编译器和编译参数相关配置。注意,从CMake 3.7版本开始,CMake官方对Android平台的交叉编译提供了通用的支持。PaddlePaddle若检测到用户使用的CMake版本不低于3.7时,将会将用户传进来的配置参数传递CMake系统,交由CMake系统本身来处理。有关参数配置的详细说明见[cmake-toolchains](https://cmake.org/cmake/help/v3.7/manual/cmake-toolchains.7.html#cross-compiling)
交叉编译Android版本的PaddlePaddle库时,有一些必须配置的参数:
- `CMAKE_SYSTEM_NAME`,CMake编译的目标平台,必须设置为`Android`。在设置`CMAKE_SYSTEM_NAME=Android`后,PaddlePaddle的CMake系统才认为是在交叉编译Android系统的版本,并自动编译宿主机版protoc可执行文件、目标机版protobuf库、以及Android所需`arm_soft_fp_abi`分支的目标机版OpenBLAS库。此外,还会强制设置一些PaddlePaddle参数的值(`WITH_GPU=OFF``WITH_AVX=OFF``WITH_PYTHON=OFF``WITH_RDMA=OFF`)。
- `CMAKE_SYSTEM_NAME`,CMake编译的目标平台,必须设置为`Android`。在设置`CMAKE_SYSTEM_NAME=Android`后,PaddlePaddle的CMake系统才认为是在交叉编译Android系统的版本,并自动编译PaddlePaddle所需的所有第三方库。此外,还会强制设置一些PaddlePaddle参数的值(`WITH_GPU=OFF``WITH_AVX=OFF``WITH_PYTHON=OFF``WITH_RDMA=OFF``WITH_MKL=OFF``WITH_GOLANG=OFF`)。
- `WITH_C_API`,必须设置为`ON`。在Android平台上只支持使用C-API来预测。
- `WITH_SWIG_PY`,必须设置为`OFF`。在Android平台上不支持通过swig调用来训练或者预测。
......@@ -119,7 +127,7 @@ Android平台可选配置参数:
其他配置参数:
- `USE_EIGEN_FOR_BLAS`,是否使用Eigen库进行矩阵计算。可设置`ON/OFF`,默认值为`OFF`
- `HOST_C/CXX_COMPILER`,宿主机的C/C++编译器。在编译宿主机版protoc可执行文件和目标机版OpenBLAS库时需要用到。默认设置成环境变量`CC`的值;若环境变量`CC`没有设置,则设置成`cc`编译器。
- `HOST_C/CXX_COMPILER`,宿主机的C/C++编译器。在编译宿主机版protoc可执行文件和目标机版OpenBLAS库时需要用到。默认设置成环境变量`CC/CXX`的值;若环境变量`CC/CXX`没有设置,则设置成`cc/c++`编译器。
常用的cmake配置如下:
......@@ -147,9 +155,10 @@ cmake -DCMAKE_SYSTEM_NAME=Android \
..
```
用户还可根据自己的需求设置其他编译参数。比如希望最小化生成的库的大小,可以设置`CMAKE_BUILD_TYPE``MinSizeRel`;若希望最快的执行速度,则可设置`CMAKE_BUILD_TYPE``Release`。亦可以通过手动设置`CMAKE_C/CXX_FLAGS_MINSIZEREL/RELEASE`来影响PaddlePaddle的编译过程。
用户还可根据自己的需求设置其他编译参数。比如希望最小化生成的库的大小,可以设置`CMAKE_BUILD_TYPE``MinSizeRel`;若希望最快的执行速度,则可设置`CMAKE_BUILD_TYPE``Release`。亦可以通过手动设置`CMAKE_C/CXX_FLAGS`来影响PaddlePaddle的编译过程。
**性能TIPS**,为了达到最快的计算速度,在CMake参数配置上,有以下建议:
- 设置`CMAKE_BUILD_TYPE``Release`
- 使用`clang`编译工具链
- `armeabi-v7a`时,设置`USE_EIGEN_FOR_BLAS=ON`,使用Eigen进行矩阵计算;`arm64-v8a`时,设置`USE_EIGEN_FOR_BLAS=OFF`,使用OpenBLAS进行矩阵计算
......
# Build PaddlePaddle for Android
There are two approaches to build PaddlePaddle for Android: using Docker and on Linux without Docker.
There are two approaches to build PaddlePaddle for Android:
- [Cross-Compiling Using Docker](#cross-compiling-using-docker)
- [Cross-Compiling on Linux](#cross-compiling-on-linux)
## Cross-Compiling Using Docker
......@@ -16,6 +19,12 @@ $ cd Paddle
$ docker build -t paddle:dev-android . -f Dockerfile.android
```
Users can directly use the published Docker image.
```bash
$ docker pull paddlepaddle/paddle:latest-dev-android
```
### Build the Inference Library
We can run the Docker image we just created to build the inference library of PaddlePaddle for Android using the command below:
......@@ -47,7 +56,7 @@ The Docker image accepts two arguments `ANDROID_ABI` and `ANDROID_API`:
</tr>
<tr class="row-odd">
<td>ANDROID_API</td>
<td>>= 21</td>
<td>>= 16</td>
<td>21</td>
</tr>
</tbody>
......@@ -93,15 +102,13 @@ Android NDK includes everything we need to build the [*standalone toolchain*](ht
The generated standalone toolchain will be in `your/path/to/arm64_standalone_toolchain`.
**Please be aware that the minimum level of Android API required by PaddlePaddle is 21.**
### Cross-Compiling Arguments
CMake supports [choosing the toolchain](https://cmake.org/cmake/help/v3.0/manual/cmake-toolchains.7.html#cross-compiling). PaddlePaddle provides [`android.cmake`](https://github.com/PaddlePaddle/Paddle/blob/develop/cmake/cross_compiling/android.cmake), which configures the Android cross-compiling toolchain for CMake. `android.cmake` is not required for CMake >= 3.7, which support Android cross-compiling. PaddlePaddle detects the CMake version, for those newer than 3.7, it uses [the official version](https://cmake.org/cmake/help/v3.7/manual/cmake-toolchains.7.html#cross-compiling).
Some other CMake arguments you need to know:
- `CMAKE_SYSTEM_NAME` must be `Android`. This tells PaddlePaddle's CMake system to cross-compile third-party dependencies. This also changes some other CMake arguments like `WITH_GPU=OFF`, `WITH_AVX=OFF`, `WITH_PYTHON=OFF`, and `WITH_RDMA=OFF`.
- `CMAKE_SYSTEM_NAME` must be `Android`. This tells PaddlePaddle's CMake system to cross-compile third-party dependencies. This also changes some other CMake arguments like `WITH_GPU=OFF`, `WITH_AVX=OFF`, `WITH_PYTHON=OFF`, `WITH_RDMA=OFF`, `WITH_MKL=OFF` and `WITH_GOLANG=OFF`.
- `WITH_C_API` must be `ON`, to build the C-based inference library for Android.
- `WITH_SWIG_PY` must be `OFF` because the Android platform doesn't support SWIG-based API.
......@@ -123,7 +130,7 @@ Some Android-specific arguments:
Other useful arguments:
- `USE_EIGEN_FOR_BLAS`: indicates if using Eigen. Could be `ON` or `OFF`, defaults to `OFF`.
- `HOST_C/CXX_COMPILER`: specifies the host compiler, which is used to build the host-specific protoc and target-specific OpenBLAS. It defaults to the value of the environment variable `CC`, or `cc`.
- `HOST_C/CXX_COMPILER`: specifies the host compiler, which is used to build the host-specific protoc and target-specific OpenBLAS. It defaults to the value of the environment variable `CC/CXX`, or `cc/c++`.
Some frequent configurations for your reference:
......@@ -158,6 +165,7 @@ There are some other arguments you might want to configure.
- `CMAKE_BUILD_TYPE=Release` optimizes the runtime performance.
Our own tip for performance optimization to use clang and Eigen or OpenBLAS:
- `CMAKE_BUILD_TYPE=Release`
- `ANDROID_TOOLCHAIN=clang`
- `USE_EIGEN_FOR_BLAS=ON` for `armeabi-v7a`, or `USE_EIGEN_FOR_BLAS=OFF` for `arm64-v8a`.
......
# PaddlePaddle Compiling Guide for iOS
# Build PaddlePaddle for iOS
This tutorial will walk you through cross compiling the PaddlePaddle library for iOS from the source in MacOS.
......@@ -98,7 +98,7 @@ You can set other compiling parameters for your own need. I.E. if you are trying
- set `CMAKE_BUILD_TYPE` with `Release`
- set `IOS_USE_VECLIB_FOR_BLAS` with `ON`
## Compile and install
## Build and install
After CMake, run the following commands. PaddlePaddle will download and compile the 3rd party dependencies, then compile and install the PaddlePaddle inference library.
......@@ -109,7 +109,7 @@ $ make install
Please Note: if you compiled PaddlePaddle in the source directory for other platforms, do remove `third_party` and `build` directory within the source with `rm -rf` to ensure that all the 3rd party libraries dependencies and PaddlePaddle is newly compiled with current CMake configuration.
`your/path/to/install` directory will have following directories after `compile` and `install`:
`your/path/to/install` directory will have following directories after `make install`:
- `include`, contains all the C-API header files.
- `lib`, contains PaddlePaddle C-API static library.
......
......@@ -24,6 +24,7 @@ else()
add_subdirectory(framework)
add_subdirectory(operators)
add_subdirectory(pybind)
add_subdirectory(inference)
endif()
if(WITH_SWIG_PY)
......
......@@ -168,3 +168,13 @@ paddle_error paddle_gradient_machine_get_layer_output(
out->args.push_back(layerOutput);
return kPD_NO_ERROR;
}
// Releases the layer outputs held by the wrapped gradient machine.
// Returns kPD_NULLPTR when the handle, or the machine it wraps, is null;
// otherwise calls releaseOutput() on the machine and returns kPD_NO_ERROR.
paddle_error paddle_gradient_machine_release_layer_output(
    paddle_gradient_machine machine) {
  auto wrapper = cast(machine);
  // Reject a null handle first, then a handle that wraps no machine.
  if (wrapper == nullptr) {
    return kPD_NULLPTR;
  }
  if (wrapper->machine == nullptr) {
    return kPD_NULLPTR;
  }
  wrapper->machine->releaseOutput();
  return kPD_NO_ERROR;
}
......@@ -113,6 +113,14 @@ paddle_gradient_machine_get_layer_output(paddle_gradient_machine machine,
const char* layerName,
paddle_arguments args);
/**
 * @brief Release the memory that holds the middle layers' outputs of the
 *        gradient machine.
 * @param [in] machine The gradient machine that has run an inference.
 * @return paddle_error
 */
PD_API paddle_error
paddle_gradient_machine_release_layer_output(paddle_gradient_machine machine);
#ifdef __cplusplus
}
#endif
......
......@@ -5,10 +5,18 @@ cc_library(ddim SRCS ddim.cc DEPS eigen3)
cc_test(ddim_test SRCS ddim_test.cc DEPS ddim)
nv_test(dim_test SRCS dim_test.cu DEPS ddim)
cc_library(tensor SRCS tensor.cc DEPS ddim place paddle_memory device_context)
if (WITH_GPU)
nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS ddim place paddle_memory device_context framework_proto)
else()
cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS ddim place paddle_memory device_context framework_proto)
endif ()
cc_test(tensor_test SRCS tensor_test.cc DEPS tensor)
cc_test(tensor_util_test SRCS tensor_util_test.cc DEPS tensor)
if (WITH_GPU)
nv_test(tensor_util_test SRCS tensor_util_test.cc tensor_util_test.cu DEPS tensor)
else()
cc_test(tensor_util_test SRCS tensor_util_test.cc DEPS tensor)
endif()
cc_test(eigen_test SRCS eigen_test.cc DEPS tensor)
......@@ -18,10 +26,15 @@ nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor)
cc_test(variable_test SRCS variable_test.cc)
cc_library(scope SRCS scope.cc DEPS glog)
cc_library(threadpool SRCS threadpool.cc)
cc_test(threadpool_test SRCS threadpool_test.cc DEPS threadpool)
cc_library(scope SRCS scope.cc DEPS glog threadpool)
cc_test(scope_test SRCS scope_test.cc DEPS scope)
cc_library(data_transform SRCS data_transform.cc DEPS tensor framework_proto)
cc_library(device_data_transform SRCS device_data_transform.cc DEPS tensor)
cc_library(data_transform SRCS data_transform.cc DEPS math_function tensor framework_proto selected_rows device_data_transform)
cc_test(data_transform_test SRCS data_transform_test.cc DEPS data_transform device_context)
cc_library(attribute SRCS attribute.cc DEPS framework_proto)
......@@ -30,7 +43,7 @@ device_context)
cc_library(op_proto_maker SRCS op_proto_maker.cc DEPS framework_proto attribute)
cc_test(op_proto_maker_test SRCS op_proto_maker_test.cc DEPS op_proto_maker)
cc_library(op_info SRCS op_info.cc DEPS attribute framework_proto)
cc_library(shape_inference SRCS shape_inference.cc DEPS ddim attribute)
cc_library(shape_inference SRCS shape_inference.cc DEPS ddim attribute device_context)
cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog
shape_inference data_transform)
cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry init)
......@@ -62,9 +75,10 @@ cc_test(var_type_inference_test SRCS var_type_inference_test.cc DEPS op_registry
cc_library(selected_rows SRCS selected_rows.cc DEPS tensor)
cc_test(selected_rows_test SRCS selected_rows_test.cc DEPS selected_rows)
cc_library(threadpool SRCS threadpool.cc)
cc_test(threadpool_test SRCS threadpool_test.cc DEPS threadpool)
cc_library(init SRCS init.cc DEPS gflags device_context place stringpiece)
cc_library(init SRCS init.cc DEPS gflags device_context place stringpiece operator)
cc_test(init_test SRCS init_test.cc DEPS init)
cc_test(op_kernel_type_test SRCS op_kernel_type_test.cc DEPS place device_context framework_proto)
cc_test(cow_ptr_tests SRCS details/cow_ptr_test.cc)
nv_test(device_data_transform_test SRCS device_data_transform_test.cu
DEPS operator op_registry init math_function)
......@@ -427,7 +427,8 @@ std::vector<std::unique_ptr<OpDesc>> MakeBlockBackward(
VLOG(5) << "Making backward " << (*it)->Type() << " op";
std::vector<std::unique_ptr<OpDesc>> op_grads;
if ((*it)->Type() == "recurrent" || (*it)->Type() == "while") {
if ((*it)->Type() == "recurrent" || (*it)->Type() == "while" ||
(*it)->Type() == "parallel_do") {
int step_block_idx = (*it)->GetBlockAttr("sub_block");
BlockDesc* backward_block = CreateStepBlock(program_desc, no_grad_vars,
grad_to_var, step_block_idx);
......
# Operator/expression 's Backward
## Motivation
In Neural Network, most models are solved by the backpropagation algorithm(known as **BP**) at present. Technically, BP calculates the gradient of the loss function, then propagates it back through the networks following the chain rule. Hence we need a module that chains the gradient operators/expressions together to construct the backward pass. Every forward network needs a backward network to construct the full computation graph. The operator/expression's backward pass will be generated with respect to the forward pass.
## Implementation
In this design doc, we exported only one API for generating the backward pass.
```c++
std::unique_ptr<OperatorBase> Backward(const OperatorBase& forwardOp,
const std::unordered_set<std::string>& no_grad_vars);
```
The implementation behind it can be divided into two parts, **Backward Operator Creating** and **Backward Network Building**.
### Backward Operator Registry
A backward network is built up with several backward operators. Backward operators take forward operators' inputs, outputs, and output gradients and then calculate its input gradients.
| | forward operator | backward operator
| ---------------------- | ---------------- |------------------------- |
| **Operator::inputs_** | Inputs | Inputs, Outputs, OutputGradients |
| **Operator::outputs_** | Outputs | InputGradients |
In most cases, there is a one-to-one relation between the forward and backward operators. These relations are recorded by a global hash map(`OpInfoMap`). To follow the philosophy of minimum core and to make operators pluggable, the registry mechanism is introduced.
For example, we have `mul_op`, and we can register its information and corresponding backward operator by the following macro:
```cpp
REGISTER_OP(mul, MulOp, MulOpMaker, mul_grad, MulOpGrad);
```
`mul` is the operator's type. `MulOp` and `MulOpMaker` are the operator class and the operator maker class respectively.
`mul_grad` is the type of backward operator, and `MulOpGrad` is its class name.
### Backward Operator Creating
Given a certain forward operator, we can get its corresponding backward operator by calling:
```cpp
OperatorBase* bwd_op = BuildGradOp(const OperatorBase* fwd_op);
```
The function `BuildGradOp` will sequentially execute following processes:
1. Get the `type_` of given forward operator, and then get the corresponding backward operator's type by looking up the `OpInfoMap`.
2. Build two maps named `inputs` and `outputs` to temporarily store the backward operator's inputs and outputs. Copy the forward operator's `inputs_` and `outputs_` to map `inputs`, except those that are not necessary for gradient computing.
3. Add the forward inputs' gradient variables into map `outputs`, and add the forward outputs' gradient variables into map `inputs`.
4. Build the backward operator with `inputs`, `outputs` and the forward operator's attributes.
### Backward Network Building
A backward network is a series of backward operators. The main idea of building a backward network is creating backward operators in the inverted sequence and appending them together one by one. There are some corner cases that need special processing.
1. Op
When the input forward network is an Op, return its gradient Operator immediately. If all of its outputs are in no gradient set, then return a special `NOP`.
2. NetOp
In our design, the network itself is also a kind of operator(**NetOp**). So the operators contained by a big network may be some small network. When the input forward network is a NetOp, it needs to call the sub NetOp/Operators backward function recursively. During the process, we need to collect the `OutputGradients` name according to the forward NetOp.
3. RnnOp
RnnOp is a nested stepnet operator. The backward module needs to recursively call `Backward` for every stepnet.
4. Sharing Variables
As illustrated in the figure 1 and figure 2, two operators share the same variable name **W@GRAD**, which will overwrite their shared input variable.
<p align="center">
<img src="./images/duplicate_op.png" width="50%" ><br/>
​ Figure 1. Sharing variables in operators.
</p>
​ Sharing variable between operators or same input variable used in multiple operators can lead to duplicate gradient variables. As illustrated in figure 2, we need to rename the gradient names recursively and add a generic add operator to prevent overwriting.
<p align="center">
<img src="images/duplicate_op2.png" width="40%" ><br/>
​ Figure 2. Replace sharing variable's gradient with `Add` operator.
</p>
​ Because the framework finds variables according to their names, we need to rename the output links. We add an integer suffix to represent its position in the clockwise direction.
5. Part of the Gradient is Zero.
In the whole graph, there are cases in which one operator's gradient is not needed, but its input's gradient is a dependency of another operator. In such a case we need to fill a gradient matrix of the same shape in that position. In our implementation, we insert a special `fillZeroLike` operator.
Following the rules above, we then collect the sub graph's `OutputGradients`/`InputGradients` as the NetOp's and return it.
......@@ -53,12 +53,12 @@ VarDesc *BlockDesc::FindVarRecursive(const std::string &name) const {
return it->second.get();
}
VarDesc *BlockDesc::FindRecursiveOrCreateVar(const std::string &name_bytes) {
VarDesc &BlockDesc::FindRecursiveOrCreateVar(const std::string &name_bytes) {
VarDesc *res = FindVarRecursive(name_bytes);
if (res == nullptr) {
res = Var(name_bytes);
}
return res;
return *res;
}
bool BlockDesc::HasVarRecursive(const std::string &name) const {
......
......@@ -57,7 +57,7 @@ class BlockDesc {
VarDesc *FindVarRecursive(const std::string &name_bytes) const;
VarDesc *FindRecursiveOrCreateVar(const std::string &name_bytes);
VarDesc &FindRecursiveOrCreateVar(const std::string &name_bytes);
bool HasVarRecursive(const std::string &var_name) const;
......
......@@ -11,8 +11,13 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <functional>
#include "paddle/framework/data_transform.h"
#include "paddle/framework/device_data_transform.h"
#include "paddle/framework/lod_tensor.h"
#include "paddle/framework/selected_rows.h"
#include "paddle/platform/device_context.h"
namespace paddle {
namespace framework {
......@@ -22,5 +27,165 @@ DataTransformFnMap& DataTransformFnMap::Instance() {
return data_transform_map;
}
// Produces the tensor that a kernel of `expected_kernel_type` should consume,
// given a variable whose current kernel type is `kernel_type_for_var`.
//
// Only a place mismatch is handled: when the two places differ the data is
// moved with DeviceTransform(). When the places already match, no tensor is
// produced and the enforce below fails.
//
// Returns the pointer obtained from DeviceTransform().
Tensor* DataTransform(const OpKernelType& expected_kernel_type,
                      const OpKernelType& kernel_type_for_var,
                      const Tensor& input_tensor) {
  const auto& dst_place = expected_kernel_type.place_;
  const bool needs_device_copy =
      !platform::is_same_place(kernel_type_for_var.place_, dst_place);

  // The local keeps the name `out` on purpose: the enforce macro receives the
  // expression itself and may use its spelling in the error text.
  Tensor* out =
      needs_device_copy ? DeviceTransform(input_tensor, dst_place) : nullptr;
  PADDLE_ENFORCE_NOT_NULL(out, "out should not be null");
  return out;
}
// Fills `out_var` with the same kind of payload that `in_var` holds, copying
// the metadata from `in_var` and sharing the data of `tensor`.
//
//  * LoDTensor:    LoD and layout are copied, data is shared with `tensor`.
//  * SelectedRows: height and rows are copied, the value tensor shares data
//                  with `tensor`.
//  * anything else raises via PADDLE_THROW.
void CopyVariableWithTensor(const Variable& in_var, const Tensor& tensor,
                            Variable& out_var) {
  if (in_var.IsType<LoDTensor>()) {
    const auto& source = in_var.Get<LoDTensor>();
    auto* target = out_var.GetMutable<LoDTensor>();
    target->set_lod(source.lod());
    target->set_layout(source.layout());
    target->ShareDataWith(tensor);
    return;
  }

  if (in_var.IsType<SelectedRows>()) {
    const auto& source = in_var.Get<SelectedRows>();
    auto* target = out_var.GetMutable<SelectedRows>();
    target->set_height(source.height());
    target->set_rows(source.rows());
    target->mutable_value()->ShareDataWith(tensor);
    return;
  }

  PADDLE_THROW("unknown var type");
}
// Kernel-type keys under which the transform functions of this file are
// registered (see the REGISTER_DATA_TRANSFORM_FN calls at the end of the file).

// Source and destination keys of the FP32 -> FP64 data-type transform.
auto KernelFP32 = OpKernelType(proto::DataType::FP32, platform::CPUPlace(),
DataLayout::kNHWC, LibraryType::kPlain);
auto KernelFP64 = OpKernelType(proto::DataType::FP64, platform::CPUPlace(),
DataLayout::kNHWC, LibraryType::kPlain);
// Keys of the NHWC <-> NCHW layout transforms.
// NOTE(review): KernelNHWC is built from exactly the same arguments as
// KernelFP64 above -- confirm that sharing one key value is intended.
auto KernelNHWC = OpKernelType(proto::DataType::FP64, platform::CPUPlace(),
DataLayout::kNHWC, LibraryType::kPlain);
auto KernelNCHW = OpKernelType(proto::DataType::FP64, platform::CPUPlace(),
DataLayout::kNCHW, LibraryType::kPlain);
// TODO(dzhwinter): Only for testing multiple op kernel.
// Dummy transform function for library_type
// should be removed.
// Keys of the plain <-> cuDNN library-type transforms handled by DummyTrans.
auto KernelPlain = OpKernelType(proto::DataType::FP32, platform::CUDAPlace(0),
DataLayout::kAnyLayout, LibraryType::kPlain);
auto KernelCUDNN = OpKernelType(proto::DataType::FP32, platform::CUDAPlace(0),
DataLayout::kAnyLayout, LibraryType::kCUDNN);
// Placeholder transform registered for the plain <-> cuDNN library types.
//
// No data is converted: after validating that `in` holds a Tensor and that
// both kernel types live on the same class of place, the input tensor is
// assigned to the output tensor.
//
// ctx          unused by this transform.
// kernel_pair  (source, destination) kernel types; only the places are read.
// in           variable holding the source Tensor.
// out          variable that receives the Tensor.
void DummyTrans(const platform::DeviceContext* ctx,
                const KernelTypePair& kernel_pair, const Variable& in,
                Variable* out) {
  PADDLE_ENFORCE(in.IsType<Tensor>(), "Only Support Tensor transform!.");
  // The message used to name TransDataType (copy-paste); it now names the
  // function that actually raised the error.
  PADDLE_ENFORCE(
      platform::places_are_same_class(kernel_pair.first.place_,
                                      kernel_pair.second.place_),
      "DummyTrans Only Support transform on same place!");
  auto src = in.Get<Tensor>();
  auto* dst = out->GetMutable<Tensor>();
  *dst = src;
}
// Converts the element type of the Tensor held by `in` and stores the result
// in the Tensor held by `out`.
//
// The source type is kernel_pair.first.data_type_ and the destination type is
// kernel_pair.second.data_type_. Both kernel types must be on the same class
// of place. Supported source types: FP32, FP64, INT32, INT64, BOOL; any other
// source type raises via PADDLE_THROW.
void TransDataType(const platform::DeviceContext* ctx,
                   const KernelTypePair& kernel_pair, const Variable& in,
                   Variable* out) {
  PADDLE_ENFORCE(in.IsType<Tensor>(), "Only Support Tensor transform!.");
  PADDLE_ENFORCE(
      platform::places_are_same_class(kernel_pair.first.place_,
                                      kernel_pair.second.place_),
      "TransDataType Only Support DataType transform on same place!");

  auto input = in.Get<Tensor>();
  auto* output = out->GetMutable<Tensor>();
  // The output has the same shape as the input; only the element type changes.
  output->Resize(input.dims());

  auto to_type = kernel_pair.second.data_type_;
  auto from_type = kernel_pair.first.data_type_;

  // The source type selects the CastDataType instantiation; VisitDataType then
  // dispatches on the destination type.
  if (from_type == proto::DataType::FP32) {
    framework::VisitDataType(to_type, CastDataType<float>(input, output, ctx));
  } else if (from_type == proto::DataType::FP64) {
    framework::VisitDataType(to_type,
                             CastDataType<double>(input, output, ctx));
  } else if (from_type == proto::DataType::INT32) {
    framework::VisitDataType(to_type, CastDataType<int>(input, output, ctx));
  } else if (from_type == proto::DataType::INT64) {
    framework::VisitDataType(to_type,
                             CastDataType<int64_t>(input, output, ctx));
  } else if (from_type == proto::DataType::BOOL) {
    framework::VisitDataType(to_type, CastDataType<bool>(input, output, ctx));
  } else {
    PADDLE_THROW("Not support type %d", from_type);
  }
}
// Permutes the dimensions of the 4-D Tensor held by `in` according to `axis`
// and stores the result in the Tensor held by `out`.
//
// axis         permutation; output dimension i is input dimension axis[i].
// ctx          device context handed to CastDataLayout.
// kernel_pair  (source, destination) kernel types; they must be on the same
//              class of place and have the same data type. The destination's
//              place and layout are applied to the output tensor.
void TransDataLayout(const std::vector<int>& axis,
                     const platform::DeviceContext* ctx,
                     const KernelTypePair& kernel_pair, const Variable& in,
                     Variable* out) {
  const OpKernelType& from_kernel = kernel_pair.first;
  const OpKernelType& to_kernel = kernel_pair.second;

  PADDLE_ENFORCE(in.IsType<Tensor>(), "Only support Tensor transform!.");
  PADDLE_ENFORCE(
      platform::places_are_same_class(from_kernel.place_, to_kernel.place_),
      "TransDataLayout only support DataLayout transform on same place!");
  PADDLE_ENFORCE(from_kernel.data_type_ == to_kernel.data_type_,
                 "TransDataLayout only support Datatype are same!");

  auto input = in.Get<Tensor>();
  auto* output = out->GetMutable<Tensor>();
  PADDLE_ENFORCE(arity(input.dims()) == 4, "Input Arity Only Suppport 4!");

  // Build the permuted shape.
  auto input_dims = input.dims();
  std::vector<int64_t> permuted_dims;
  permuted_dims.reserve(axis.size());
  for (int source_axis : axis) {
    permuted_dims.push_back(input_dims[source_axis]);
  }
  output->Resize(make_ddim(permuted_dims));

  // Allocate the destination, then move the data into the new layout.
  output->mutable_data(to_kernel.place_, input.type());
  framework::VisitDataType(from_kernel.data_type_,
                           CastDataLayout(ctx, axis, input, output));
  output->set_layout(to_kernel.data_layout_);
}
} // namespace framework
} // namespace paddle
// Registration of the transform functions defined above, keyed by
// (source kernel type, destination kernel type).
namespace f = paddle::framework;
namespace {
// Axis permutations passed to TransDataLayout: output dimension i is taken
// from input dimension axis[i].
std::vector<int> NHWC2NCHW = {0, 3, 1, 2};
std::vector<int> NCHW2NHWC = {0, 2, 3, 1};
}
// Data-type conversion.
REGISTER_DATA_TRANSFORM_FN(f::KernelFP32, f::KernelFP64, f::TransDataType);
// Library-type conversions use the placeholder DummyTrans in both directions.
REGISTER_DATA_TRANSFORM_FN(f::KernelPlain, f::KernelCUDNN, f::DummyTrans);
REGISTER_DATA_TRANSFORM_FN(f::KernelCUDNN, f::KernelPlain, f::DummyTrans);
// Layout conversions: std::bind fixes the leading `axis` argument so that the
// bound callable takes the remaining four TransDataLayout parameters.
REGISTER_DATA_TRANSFORM_FN(f::KernelNHWC, f::KernelNCHW,
std::bind(f::TransDataLayout, NHWC2NCHW,
std::placeholders::_1,
std::placeholders::_2,
std::placeholders::_3,
std::placeholders::_4));
REGISTER_DATA_TRANSFORM_FN(f::KernelNCHW, f::KernelNHWC,
std::bind(f::TransDataLayout, NCHW2NHWC,
std::placeholders::_1,
std::placeholders::_2,
std::placeholders::_3,
std::placeholders::_4));
......@@ -19,19 +19,23 @@ limitations under the License. */
#include <vector>
#include "paddle/framework/op_kernel_type.h"
#include "paddle/framework/selected_rows.h"
#include "paddle/framework/tensor.h"
#include "paddle/framework/variable.h"
#include "paddle/operators/math/math_function.h"
#include "paddle/platform/device_context.h"
#include "paddle/platform/macros.h"
#include "paddle/platform/transform.h"
namespace paddle {
namespace framework {
using DataTransformFN =
std::function<void(const std::vector<platform::DeviceContext*> ctx,
const Variable& in, Variable* out)>;
using KernelTypePair = std::pair<OpKernelType, OpKernelType>;
using DataTransformFn =
std::function<void(const platform::DeviceContext*, const KernelTypePair&,
const Variable&, Variable*)>;
struct KernelTypePairHash {
static void HashCombine(const OpKernelType& t, std::size_t* seed) {
OpKernelType::Hash kernel_type_hasher;
......@@ -46,8 +50,76 @@ struct KernelTypePairHash {
}
};
Tensor* DataTransform(const OpKernelType& expected_kernel_type,
const OpKernelType& kernel_type_for_var,
const Tensor& input_tensor);
void CopyVariableWithTensor(const Variable& in_var, const Tensor& tensor,
Variable& out_var);
// Element-wise conversion functor: maps one InType value to OutType with a
// static_cast.
template <typename InType, typename OutType>
struct CastDataTypeFunctor {
  HOSTDEVICE inline OutType operator()(InType in) const {
    const OutType converted = static_cast<OutType>(in);
    return converted;
  }
};
template <typename InType>
struct CastDataType {
CastDataType(const framework::Tensor& in, framework::Tensor* out,
const platform::DeviceContext* ctx)
: in_(in), out_(out), ctx_(ctx) {}
const framework::Tensor in_;
framework::Tensor* out_;
const platform::DeviceContext* ctx_;
template <typename OutType>
void operator()() {
auto place = ctx_->GetPlace();
auto* in_begin = in_.data<InType>();
auto numel = in_.numel();
auto* in_end = in_begin + numel;
auto* out_begin = out_->mutable_data<OutType>(place);
if (platform::is_cpu_place(place)) {
platform::Transform<platform::CPUDeviceContext> trans;
auto* context = static_cast<const platform::CPUDeviceContext*>(ctx_);
trans(*context, in_begin, in_end, out_begin,
CastDataTypeFunctor<InType, OutType>());
} else {
// TODO(dzhwinter): enhance Copy CPU<->GPU with different data type?
PADDLE_THROW("Unsupport CPU <-> GPU!");
}
}
};
struct CastDataLayout {
CastDataLayout(const platform::DeviceContext* ctx,
const std::vector<int>& axis, const framework::Tensor& in,
framework::Tensor* out)
: in_(in), out_(out), ctx_(ctx), axis_(axis) {}
const framework::Tensor in_;
framework::Tensor* out_;
const platform::DeviceContext* ctx_;
const std::vector<int> axis_;
template <typename T>
void operator()() {
auto place = ctx_->GetPlace();
if (platform::is_cpu_place(place)) {
operators::math::Transpose<platform::CPUDeviceContext, T, 4> trans4;
auto* context = static_cast<const platform::CPUDeviceContext*>(ctx_);
trans4(*context, in_, out_, axis_);
} else {
PADDLE_THROW("Unsupport CPU <-> GPU!");
}
}
};
using DataTransformMap =
std::unordered_map<KernelTypePair, DataTransformFN, KernelTypePairHash>;
std::unordered_map<KernelTypePair, DataTransformFn, KernelTypePairHash>;
class DataTransformFnMap {
public:
......@@ -58,25 +130,25 @@ class DataTransformFnMap {
}
void Insert(const OpKernelType& left, const OpKernelType& right,
const DataTransformFN& data_tranform_fn) {
const DataTransformFn& data_tranform_fn) {
Insert(std::make_pair(left, right), data_tranform_fn);
}
void Insert(const KernelTypePair& kernel_type_pair,
const DataTransformFN& data_tranform_fn) {
const DataTransformFn& data_tranform_fn) {
PADDLE_ENFORCE(!Has(kernel_type_pair),
"KernelTypePair %s has been registered", "");
map_.insert({kernel_type_pair, data_tranform_fn});
}
const DataTransformFN& Get(const KernelTypePair& key_pair) const {
const DataTransformFn& Get(const KernelTypePair& key_pair) const {
auto data_transformer = GetNullable(key_pair);
PADDLE_ENFORCE_NOT_NULL(data_transformer,
"DataTransformFN should not be NULL");
"DataTransformFn should not be NULL");
return *data_transformer;
}
const DataTransformFN* GetNullable(const KernelTypePair& key_pair) const {
const DataTransformFn* GetNullable(const KernelTypePair& key_pair) const {
auto it = map_.find(key_pair);
if (it == map_.end()) {
return nullptr;
......
......@@ -11,36 +11,67 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <array>
#include <vector>
#include "paddle/framework/data_transform.h"
#include <gtest/gtest.h>
#include "paddle/framework/data_transform.h"
#include "paddle/platform/device_context.h"
namespace paddle {
namespace framework {
using namespace platform;
/**
* @brief Cross-validation of transforms between different kernel types.
* A four-bit map represents each combination of kernel-type fields.
* If a field has more than two possible values, only two of them are used.
* For DataType, only FP32 (float) and FP64 (double) are tested.
* e.g. 0000 -> FP32, CPUPlace, kNHWC, kPlain
* 1111 -> FP64, GPUPlace, kNCHW, kMKLDNN
*/
std::array<proto::DataType, 2> kDataType = {
{proto::DataType::FP32, proto::DataType::FP64}};
std::array<Place, 2> kPlace = {{CPUPlace(), CUDAPlace(0)}};
std::array<DataLayout, 2> kDataLayout = {{
DataLayout::kNHWC, DataLayout::kNCHW,
}};
std::array<LibraryType, 2> kLibraryType = {{
LibraryType::kPlain, LibraryType::kMKLDNN,
}};
// Builds an OpKernelType from four selector bits, each choosing between the
// two entries of the corresponding table:
//   bits[0] -> kDataType, bits[1] -> kPlace,
//   bits[2] -> kDataLayout, bits[3] -> kLibraryType.
// `bits` must contain at least four elements.
//
// Taken by const reference so the vector is not copied on every call; callers
// passing a braced list such as {0, 0, 1, 0} are unaffected.
OpKernelType GenFromBit(const std::vector<bool>& bits) {
  return OpKernelType(kDataType[bits[0]], kPlace[bits[1]], kDataLayout[bits[2]],
                      kLibraryType[bits[3]]);
}
int test_value = 0;
OpKernelType kernel_type_1(proto::DataType::FP32, CPUPlace(), DataLayout::kNCHW,
LibraryType::kCUDNN);
OpKernelType kernel_type_2(proto::DataType::FP32, CUDAPlace(0),
DataLayout::kNCHW, LibraryType::kCUDNN);
OpKernelType kernel_type_3(proto::DataType::FP16, CUDAPlace(0),
DataLayout::kNCHW, LibraryType::kCUDNN);
auto kernel0 = GenFromBit({0, 0, 0, 0});
auto kernel1 = GenFromBit({0, 0, 0, 1});
auto kernel2 = GenFromBit({0, 0, 1, 0});
auto kernel3 = GenFromBit({0, 0, 1, 1});
void type1_to_type2(std::vector<platform::DeviceContext*> ctx,
const Variable& in, Variable* out) {
void TransDataType_t(const platform::DeviceContext* ctx,
const KernelTypePair& p, const Variable& in,
Variable* out) {
test_value++;
}
void type2_to_type3(std::vector<platform::DeviceContext*> ctx,
const Variable& in, Variable* out) {
void TransDataLayout_t(const platform::DeviceContext* ctx,
const KernelTypePair& p, const Variable& in,
Variable* out) {
test_value--;
}
void type1_to_type3(std::vector<platform::DeviceContext*> ctx,
const Variable& in, Variable* out) {
void TransLibraryType_t(const platform::DeviceContext* ctx,
const KernelTypePair& p, const Variable& in,
Variable* out) {
test_value += 2;
}
......@@ -49,30 +80,89 @@ void type1_to_type3(std::vector<platform::DeviceContext*> ctx,
namespace frw = paddle::framework;
REGISTER_DATA_TRANSFORM_FN(frw::kernel_type_1, frw::kernel_type_2,
frw::type1_to_type2);
REGISTER_DATA_TRANSFORM_FN(frw::kernel_type_2, frw::kernel_type_3,
frw::type2_to_type3);
REGISTER_DATA_TRANSFORM_FN(frw::kernel_type_1, frw::kernel_type_3,
frw::type1_to_type3);
REGISTER_DATA_TRANSFORM_FN(frw::kernel0, frw::kernel1, frw::TransDataType_t);
REGISTER_DATA_TRANSFORM_FN(frw::kernel1, frw::kernel2, frw::TransDataLayout_t);
REGISTER_DATA_TRANSFORM_FN(frw::kernel0, frw::kernel2, frw::TransLibraryType_t);
TEST(DataTransform, Register) {
using namespace paddle::framework;
using namespace paddle::platform;
auto& instance = DataTransformFnMap::Instance();
ASSERT_EQ(instance.Map().size(), 3UL);
std::vector<DeviceContext*> ctx;
paddle::framework::Variable in;
paddle::framework::Variable out;
instance.Get(std::make_pair(frw::kernel_type_1, frw::kernel_type_2))(ctx, in,
&out);
DeviceContext* ctx = new CPUDeviceContext();
auto pair0 = std::make_pair(frw::kernel0, frw::kernel1);
instance.Get(pair0)(ctx, pair0, in, &out);
ASSERT_EQ(test_value, 1);
instance.Get(std::make_pair(frw::kernel_type_2, frw::kernel_type_3))(ctx, in,
&out);
auto pair1 = std::make_pair(frw::kernel1, frw::kernel2);
instance.Get(pair1)(ctx, pair1, in, &out);
ASSERT_EQ(test_value, 0);
instance.Get(std::make_pair(frw::kernel_type_1, frw::kernel_type_3))(ctx, in,
&out);
auto pair3 = std::make_pair(frw::kernel0, frw::kernel2);
instance.Get(pair3)(ctx, pair3, in, &out);
ASSERT_EQ(test_value, 2);
}
// Round-trips a 4-D double tensor through the registered layout transforms:
// NHWC -> NCHW into `out`, then NCHW -> NHWC back into `in`, checking layout
// and dimensions after each step.
TEST(DataTransform, DataLayout) {
  using namespace paddle::framework;
  using namespace paddle::platform;

  auto& instance = DataTransformFnMap::Instance();
  Variable in;
  Variable out;
  Tensor* src = in.GetMutable<Tensor>();
  src->mutable_data<double>(make_ddim({2, 3, 1, 2}), CPUPlace());
  src->set_layout(DataLayout::kNHWC);

  // Stack-allocated so the context is destroyed when the test ends; the
  // previous `new CPUDeviceContext()` was never deleted.
  CPUDeviceContext cpu_ctx;
  DeviceContext* ctx = &cpu_ctx;

  {
    // FP64 / CPU / NHWC  ->  FP64 / CPU / NCHW
    auto kernel1 = GenFromBit({1, 0, 0, 0});
    auto kernel2 = GenFromBit({1, 0, 1, 0});
    auto pair0 = std::make_pair(kernel1, kernel2);
    instance.Get(pair0)(ctx, pair0, in, &out);
  }

  Tensor dst = out.Get<Tensor>();
  EXPECT_TRUE(dst.layout() == DataLayout::kNCHW);
  EXPECT_TRUE(dst.dims() == make_ddim({2, 2, 3, 1}));

  {
    // FP64 / CPU / NCHW  ->  FP64 / CPU / NHWC (back into `in`)
    auto kernel1 = GenFromBit({1, 0, 1, 0});
    auto kernel2 = GenFromBit({1, 0, 0, 0});
    auto pair0 = std::make_pair(kernel1, kernel2);
    instance.Get(pair0)(ctx, pair0, out, &in);
  }

  EXPECT_TRUE(src->layout() == DataLayout::kNHWC);
  EXPECT_TRUE(src->dims() == make_ddim({2, 3, 1, 2}));
}
// Converts a 2x3 float tensor to double through the registered FP32 -> FP64
// transform and checks that the converted buffer exists and holds the same
// values.
TEST(DataTransform, DataType) {
  using namespace paddle::framework;
  using namespace paddle::platform;

  auto& instance = DataTransformFnMap::Instance();

  // Stack-allocated so the context is destroyed when the test ends; the
  // previous `new CPUDeviceContext()` was never deleted.
  CPUDeviceContext cpu_ctx;
  DeviceContext* ctx = &cpu_ctx;

  Variable in;
  Variable out;
  Tensor* src = in.GetMutable<Tensor>();
  float* ptr = src->mutable_data<float>(make_ddim({2, 3}), CPUPlace());
  for (int i = 0; i < 6; ++i) {
    // Integer division on purpose: the values are 0, 0, 0, 1, 1, 1.
    ptr[i] = static_cast<float>(i / 3);
  }

  {
    // FP32 / CPU / NHWC  ->  FP64 / CPU / NHWC
    auto kernel1 = GenFromBit({0, 0, 0, 0});
    auto kernel2 = GenFromBit({1, 0, 0, 0});
    auto pair0 = std::make_pair(kernel1, kernel2);
    instance.Get(pair0)(ctx, pair0, in, &out);
  }

  Tensor dst = out.Get<Tensor>();
  const double* dst_ptr = dst.data<double>();
  EXPECT_TRUE(dst_ptr != nullptr);
  // Verify the conversion itself, not just that a buffer exists.
  for (int i = 0; i < 6; ++i) {
    EXPECT_EQ(dst_ptr[i], static_cast<double>(ptr[i]));
  }
}
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <memory>
#include <thread>
namespace paddle {
namespace framework {
namespace details {
// Default ownership flag for COWPtr: a plain bool with no synchronization.
// Change it to thread safe flags if needed.
class ThreadUnsafeOwnershipFlags {
 public:
  // `flag` is true when the holder owns its payload.
  ThreadUnsafeOwnershipFlags(bool flag) : flag_(flag) {}

  // Not copyable: ownership may only be transferred by moving.
  ThreadUnsafeOwnershipFlags(const ThreadUnsafeOwnershipFlags& other) = delete;
  ThreadUnsafeOwnershipFlags& operator=(
      const ThreadUnsafeOwnershipFlags& other) = delete;

  ThreadUnsafeOwnershipFlags(ThreadUnsafeOwnershipFlags&& other) = default;
  // Declared explicitly: with the copy operations and the move constructor
  // user-declared, no move assignment is generated implicitly, which left
  // COWPtr's defaulted move assignment defined as deleted.
  ThreadUnsafeOwnershipFlags& operator=(ThreadUnsafeOwnershipFlags&& other) =
      default;

  void SetOwnership(bool flag) { flag_ = flag; }

  // Invoke the callback if it is not owned, then mark it as owned.
  template <typename Callback>
  void AcquireOwnershipOnce(Callback acquire) {
    if (!flag_) {
      acquire();
      flag_ = true;
    }
  }

 private:
  bool flag_;
};

// Copy-On-Write pointer.
// It will hold a T* pointer, and only copy once when `MutableData` is invoked.
//
// The template parameter OwnershipFlags should have:
//   * a constructor takes a bool. True if own.
//   * SetOwnership(bool flag).
//   * AcquireOwnershipOnce(Callback). It will invoke the callback if it is not
//     owned.
//
// https://en.wikipedia.org/wiki/Copy-on-write
template <typename T, typename OwnershipFlags = ThreadUnsafeOwnershipFlags>
class COWPtr {
 public:
  // Ctor from raw pointer. The new COWPtr owns `ptr`.
  explicit COWPtr(T* ptr) : payload_(ptr), ownership_{true} {}

  // Move methods. Steal payload and ownership from the origin.
  //
  // The payload is moved, not copied, so the origin no longer refers to it.
  // Previously the origin kept sharing the payload while still flagged as
  // owner, so both objects could write to the same T. A moved-from COWPtr
  // holds no payload and must be re-assigned before it is used again.
  COWPtr(COWPtr&& other)
      : payload_(std::move(other.payload_)),
        ownership_{std::move(other.ownership_)} {}
  COWPtr& operator=(COWPtr&& origin) = default;

  // Copy methods. The copy shares the payload but does not own it.
  COWPtr(const COWPtr& other) : payload_(other.payload_), ownership_{false} {}
  COWPtr& operator=(const COWPtr& other) {
    // Self-assignment must not drop ownership of our own payload.
    if (this != &other) {
      payload_ = other.payload_;
      ownership_.SetOwnership(false);
    }
    return *this;
  }

  // Access read only data.
  const T& Data() const { return *payload_; }

  // Access mutable data. If the data is not owned, the data will be copied
  // before.
  T* MutableData() {
    ownership_.AcquireOwnershipOnce(
        [this] { payload_.reset(new T(*payload_)); });
    return payload_.get();
  }

 private:
  // Actual data pointer.
  std::shared_ptr<T> payload_;
  // Ownership flag.
  OwnershipFlags ownership_;
};
} // namespace details
} // namespace framework
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/framework/details/cow_ptr.h"
#include "gtest/gtest.h"
namespace paddle {
namespace framework {
namespace details {
// A copy shares the payload until it is written through MutableData(); the
// write must leave the original untouched.
TEST(COWPtr, all) {
  COWPtr<int> original(new int{0});
  ASSERT_EQ(original.Data(), 0);

  // Copying shares the same underlying int.
  COWPtr<int> alias(original);
  ASSERT_EQ(alias.Data(), 0);
  ASSERT_EQ(&alias.Data(), &original.Data());

  // Writing through the copy detaches it from the original.
  int* writable = alias.MutableData();
  *writable = 10;
  ASSERT_EQ(original.Data(), 0);
  ASSERT_EQ(alias.Data(), 10);
}
} // namespace details
} // namespace framework
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/framework/device_data_transform.h"
namespace paddle {
namespace framework {
// Picks the device context for a copy between `src_place` and `dst_place`.
//
// For GPU -> CPU the context of the source place is returned, for CPU -> GPU
// the context of the destination place, i.e. the GPU-side context in both
// cases. Any other combination raises via PADDLE_THROW.
static const platform::DeviceContext* GetDeviceContext(
    const platform::Place& src_place, const platform::Place& dst_place) {
  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();

  const bool gpu_to_cpu =
      platform::is_gpu_place(src_place) && platform::is_cpu_place(dst_place);
  const bool cpu_to_gpu =
      platform::is_cpu_place(src_place) && platform::is_gpu_place(dst_place);

  if (!gpu_to_cpu && !cpu_to_gpu) {
    PADDLE_THROW(
        "Currently, model parallelism is only supported between CPU and CUDA");
  }
  return gpu_to_cpu ? pool.Get(src_place) : pool.Get(dst_place);
}
// Copies `in` to `dst_place` and returns the copy as a newly allocated Tensor.
//
// The device context chosen by GetDeviceContext() is waited on before and
// after the copy. The returned Tensor is allocated with `new`; it is not
// released here.
//
// Throws whatever GetDeviceContext(), Wait() or Copy() throw; in that case the
// partially built Tensor is deleted before the exception propagates.
Tensor* DeviceTransform(const Tensor& in, const platform::Place& dst_place) {
  VLOG(3) << "DeviceTransform in, src_place " << in.place()
          << " dst_place: " << dst_place;

  // Resolve the context before allocating: this call throws for unsupported
  // place pairs, and previously the already-allocated Tensor leaked then.
  auto* dev_ctx = GetDeviceContext(in.place(), dst_place);

  Tensor* out = new Tensor();
  try {
    dev_ctx->Wait();
    Copy(in, dst_place, *dev_ctx, out);
    dev_ctx->Wait();
  } catch (...) {
    delete out;
    throw;
  }
  return out;
}
} // namespace framework
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/framework/lod_tensor.h"
#include "paddle/framework/tensor.h"
#include "paddle/framework/tensor_util.h"
#include "paddle/platform/device_context.h"
namespace paddle {
namespace framework {
// Copies `in` to `dst_place` and returns a pointer to the resulting Tensor.
// NOTE(review): the implementation allocates the result with `new`, so the
// caller presumably owns the returned pointer -- confirm at the call sites.
Tensor* DeviceTransform(const Tensor& in, const platform::Place& dst_place);
} // namespace framework
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "gtest/gtest.h"
#include "paddle/framework/init.h"
#include "paddle/framework/lod_tensor.h"
#include "paddle/framework/op_info.h"
#include "paddle/framework/op_registry.h"
#include "paddle/operators/elementwise_op_function.h"
#include "paddle/operators/math/math_function.h"
#include "paddle/platform/device_context.h"
namespace paddle {
namespace framework {
// Binary functor returning the sum of its two arguments.
template <typename T>
struct AddFunctor {
  inline HOSTDEVICE T operator()(T a, T b) const {
    const T sum = a + b;
    return sum;
  }
};
// Proto/checker maker for the test operator: declares one input ("input"),
// one output ("output") and a boolean attribute "use_gpu" defaulting to false.
class OpKernelTestProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
public:
OpKernelTestProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("input", "input1 of test op");
AddOutput("output", "output of test op");
// Read by TestOpWithKernel::GetExpectedKernelType to force the CUDA kernel.
AddAttr<bool>("use_gpu", "force to use gpu kernel").SetDefault(false);
AddComment("This is test op");
}
};
// Test operator whose expected kernel place depends on the "use_gpu"
// attribute.
class TestOpWithKernel : public OperatorWithKernel {
 public:
  using OperatorWithKernel::OperatorWithKernel;

 protected:
  // No shape inference is needed for this test operator.
  void InferShape(framework::InferShapeContext* ctx) const override {}

  // Returns an FP32 kernel type. With "use_gpu" set, the place is
  // CUDAPlace(0); otherwise it is the place of the "input" tensor.
  OpKernelType GetExpectedKernelType(
      const ExecutionContext& ctx) const override {
    if (!Attr<bool>("use_gpu")) {
      VLOG(3) << "use default kernel";
      return OpKernelType(proto::DataType::FP32,
                          ctx.Input<Tensor>("input")->place());
    }
    VLOG(3) << "force use gpu kernel";
    return OpKernelType(proto::DataType::FP32, platform::CUDAPlace(0));
  }
};
// Kernel of the test operator: writes input + input into "output".
//
// DeviceContext  device context type the kernel runs on.
// T              element type; also the element type of the OpKernel base
//                (it used to be hard-coded to float regardless of T).
template <typename DeviceContext, typename T>
class TestKernel : public OpKernel<T> {
 public:
  void Compute(const ExecutionContext& ctx) const override {
    std::cout << ctx.op().DebugString() << std::endl;

    const Tensor* input = ctx.Input<Tensor>("input");

    std::cout << "input place:" << input->place() << std::endl;
    auto* output = ctx.Output<framework::LoDTensor>("output");
    output->Resize(input->dims());
    output->mutable_data<T>(ctx.GetPlace());

    // Element-wise add of the input with itself, i.e. output = 2 * input.
    operators::TransformFunctor<AddFunctor<T>, T, DeviceContext> functor(
        input, input, output, ctx.template device_context<DeviceContext>(),
        AddFunctor<T>());
    functor.Run();
  }
};
} // namespace framework
} // namespace paddle
// Register the test operator (no gradient operator) under the type "test_op".
REGISTER_OP_WITHOUT_GRADIENT(
test_op, paddle::framework::TestOpWithKernel,
paddle::framework::OpKernelTestProtoAndCheckerMaker);
// Register its float kernel for CPU ...
REGISTER_OP_CPU_KERNEL(
test_op,
paddle::framework::TestKernel<paddle::platform::CPUDeviceContext, float>);
// ... and for CUDA.
REGISTER_OP_CUDA_KERNEL(
test_op,
paddle::framework::TestKernel<paddle::platform::CUDADeviceContext, float>);
// Fills an OpDesc::Var entry: sets its parameter name to `param_name` and
// appends every name in `arguments` to its argument list.
static void BuildVar(const std::string& param_name,
                     std::initializer_list<const char*> arguments,
                     paddle::framework::proto::OpDesc::Var* var) {
  var->set_parameter(param_name);

  auto* argument_list = var->mutable_arguments();
  for (const char* argument : arguments) {
    *argument_list->Add() = argument;
  }
}
// Runs "test_op" on CPU, then feeds its output to a second "test_op" that is
// forced onto the GPU. The first op doubles the input (output = input + input)
// and the second doubles it again, so the final values are 4 * i.
TEST(Operator, CPUtoGPU) {
using namespace paddle::framework;
using namespace paddle::platform;
InitDevices();
paddle::framework::Scope scope;
paddle::platform::CPUPlace cpu_place;
// create an op to run on CPU
paddle::framework::proto::OpDesc cpu_op_desc;
cpu_op_desc.set_type("test_op");
BuildVar("input", {"IN1"}, cpu_op_desc.add_inputs());
BuildVar("output", {"OUT1"}, cpu_op_desc.add_outputs());
auto cpu_op = paddle::framework::OpRegistry::CreateOp(cpu_op_desc);
// prepare input: a 2x3 float tensor on CPU holding 0, 1, ..., 5
auto* in_t = scope.Var("IN1")->GetMutable<LoDTensor>();
auto* src_ptr = in_t->mutable_data<float>({2, 3}, CPUPlace());
for (int i = 0; i < 2 * 3; ++i) {
src_ptr[i] = static_cast<float>(i);
}
// get output
auto* output = scope.Var("OUT1");
cpu_op->Run(scope, cpu_place);
// The CPU op computed input + input, so every element is doubled.
auto* output_ptr = output->Get<LoDTensor>().data<float>();
for (int i = 0; i < 2 * 3; ++i) {
ASSERT_EQ(output_ptr[i], static_cast<float>(i) * 2);
}
// create an op to run on GPU; its input is the CPU op's output (OUT1)
paddle::framework::proto::OpDesc gpu_op_desc;
gpu_op_desc.set_type("test_op");
BuildVar("input", {"OUT1"}, gpu_op_desc.add_inputs());
BuildVar("output", {"OUT2"}, gpu_op_desc.add_outputs());
// Set use_gpu = true so GetExpectedKernelType selects CUDAPlace(0).
auto attr = gpu_op_desc.mutable_attrs()->Add();
attr->set_name("use_gpu");
attr->set_type(paddle::framework::proto::AttrType::BOOLEAN);
attr->set_b(true);
auto gpu_op = paddle::framework::OpRegistry::CreateOp(gpu_op_desc);
paddle::platform::CUDAPlace cuda_place(0);
// get output
auto* output2 = scope.Var("OUT2");
gpu_op->Run(scope, cuda_place);
// auto* output2_ptr = output2->Get<LoDTensor>().data<float>();
// Copy the GPU op's output to a CPU tensor and wait for the copy to finish
// before reading it on the host.
DeviceContextPool& pool = DeviceContextPool::Instance();
auto dev_ctx = pool.Get(cuda_place);
paddle::framework::Tensor output_tensor;
Copy(output2->Get<LoDTensor>(), paddle::platform::CPUPlace(), *dev_ctx,
&output_tensor);
dev_ctx->Wait();
float* output2_ptr = output_tensor.data<float>();
// Doubled twice: once by the CPU op and once by the GPU op.
for (int i = 0; i < 2 * 3; ++i) {
ASSERT_EQ(output2_ptr[i], static_cast<float>(i) * 4);
}
}
......@@ -14,18 +14,18 @@ limitations under the License. */
#include "paddle/framework/executor.h"
#include <algorithm>
#include <iostream>
#include <memory>
#include <set>
#include <vector>
#include "gflags/gflags.h"
#include "paddle/framework/feed_fetch_type.h"
#include "paddle/framework/lod_rank_table.h"
#include "paddle/framework/lod_tensor.h"
#include "paddle/framework/lod_tensor_array.h"
#include "paddle/framework/op_registry.h"
#include "paddle/framework/scope.h"
#include "paddle/platform/place.h"
DEFINE_bool(check_nan_inf, false,
"Checking whether operator produce NAN/INF or not. It will be "
"extremely slow so please use this flag wisely.");
namespace paddle {
namespace framework {
......@@ -50,14 +50,30 @@ static void CreateTensor(Variable* var, proto::VarDesc::VarType var_type) {
var->GetMutable<LoDRankTable>();
} else if (var_type == proto::VarDesc::LOD_TENSOR_ARRAY) {
var->GetMutable<LoDTensorArray>();
} else if (var_type == proto::VarDesc::PLACE_LIST) {
var->GetMutable<platform::PlaceList>();
} else {
PADDLE_THROW(
"Variable type %d is not in "
"[LoDTensor, SelectedRows, FEED_MINIBATCH, FETCH_LIST, LOD_RANK_TABLE]",
"[LoDTensor, SelectedRows, FEED_MINIBATCH, FETCH_LIST, LOD_RANK_TABLE,"
" PLACE_LIST]",
var_type);
}
}
// Verifies that a floating-point tensor contains neither Inf nor NaN.
//
// Tensors with memory_size() == 0 and tensors whose element type is neither
// float nor double are skipped silently. Otherwise PADDLE_ENFORCE fails with a
// message naming `name` when the tensor holds an Inf or a NaN (Inf is checked
// first).
static void CheckTensorNANOrInf(const std::string& name,
                                const framework::Tensor& tensor) {
  if (tensor.memory_size() == 0) {
    return;
  }
  // Compare the type identities themselves instead of their hash codes:
  // hash_code() values of two distinct types are permitted to be equal, so a
  // hash comparison is not a reliable type test.
  const bool is_floating_point =
      tensor.type() == typeid(float) || tensor.type() == typeid(double);
  if (!is_floating_point) {
    return;
  }
  PADDLE_ENFORCE(!framework::HasInf(tensor), "Tensor %s has Inf", name);
  PADDLE_ENFORCE(!framework::HasNAN(tensor), "Tensor %s has NAN", name);
}
void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
bool create_local_scope, bool create_vars) {
// TODO(tonyyang-svail):
......@@ -99,10 +115,19 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
for (auto& op_desc : block.AllOps()) {
auto op = paddle::framework::OpRegistry::CreateOp(*op_desc);
VLOG(3) << op->DebugString();
VLOG(3) << op->DebugStringEx(local_scope);
op->Run(*local_scope, place_);
if (FLAGS_check_nan_inf) {
for (auto& vname : op->OutputVars(true)) {
auto* var = local_scope->FindVar(vname);
if (var == nullptr) continue;
if (var->IsType<framework::LoDTensor>()) {
CheckTensorNANOrInf(vname, var->Get<framework::LoDTensor>());
}
}
}
}
if (create_local_scope) {
if (create_vars && create_local_scope) {
scope->DeleteScope(local_scope);
}
}
......
......@@ -123,6 +123,7 @@ message VarDesc {
STEP_SCOPES = 5;
LOD_RANK_TABLE = 6;
LOD_TENSOR_ARRAY = 7;
PLACE_LIST = 8;
}
required string name = 1;
required VarType type = 2;
......
......@@ -87,7 +87,11 @@ class GradOpDescMakerBase {
auto onames = this->Output(name);
ret_val.reserve(onames.size());
std::transform(onames.begin(), onames.end(), std::back_inserter(ret_val),
GradVarName);
[this](const std::string& fwd_var_name) -> std::string {
auto g_name = GradVarName(fwd_var_name);
(*this->grad_to_var_)[g_name] = fwd_var_name;
return g_name;
});
return ret_val;
}
......
......@@ -15,6 +15,7 @@ limitations under the License. */
#include <string>
#include "paddle/framework/init.h"
#include "paddle/framework/operator.h"
#include "paddle/platform/device_context.h"
#include "paddle/platform/place.h"
#include "paddle/string/piece.h"
......@@ -24,7 +25,6 @@ namespace framework {
std::once_flag gflags_init_flag;
// TODO(qijun) move init gflags to init.cc
void InitGflags(std::vector<std::string> &argv) {
std::call_once(gflags_init_flag, [&]() {
int argc = argv.size();
......@@ -40,39 +40,28 @@ void InitGflags(std::vector<std::string> &argv) {
});
}
bool InitDevices(const std::vector<std::string> &devices) {
// device format
// CPU
// GPU:1
// TODO(dzhwinter) : add device format annotation for users.
void InitDevices() {
/* Init all available devices by default */
std::vector<platform::Place> places;
for (auto &device : devices) {
auto p = string::Piece(device);
if (string::HasPrefix(p, "CPU")) {
places.emplace_back(platform::CPUPlace());
} else if (string::HasPrefix(p, "GPU")) {
places.emplace_back(platform::CPUPlace());
#ifdef PADDLE_WITH_CUDA
auto pos = string::RFind(p, ':', string::Piece::npos);
auto number = device.substr(pos + 1);
places.emplace_back(platform::CUDAPlace(std::stoi(number)));
int count = platform::GetCUDADeviceCount();
for (int i = 0; i < count; ++i) {
places.emplace_back(platform::CUDAPlace(i));
}
#else
LOG(WARNING)
<< "'GPU' is not supported, Please re-compile with WITH_GPU option";
LOG(WARNING)
<< "'GPU' is not supported, Please re-compile with WITH_GPU option";
#endif
} else {
return false;
}
}
if (std::find_if(places.begin(), places.end(),
[&](const platform::Place &place) {
return platform::is_cpu_place(place);
}) == places.end()) {
places.emplace_back(platform::CPUPlace());
LOG(WARNING) << "Not specified CPU device, create CPU by Default.";
}
platform::DeviceContextPool::Create(places);
return true;
platform::DeviceContextPool::Init(places);
}
// Initializes Google logging under the program name `prog_name` and installs
// glog's failure signal handler.
void InitGLOG(const std::string &prog_name) {
  const char *program = prog_name.c_str();
  google::InitGoogleLogging(program);
  google::InstallFailureSignalHandler();
}
} // namespace framework
......
......@@ -22,7 +22,9 @@ namespace framework {
void InitGflags(std::vector<std::string> &argv);
bool InitDevices(const std::vector<std::string> &devices);
void InitGLOG(const std::string &prog_name);
void InitDevices();
} // namespace framework
} // namespace paddle
......@@ -14,18 +14,13 @@ limitations under the License. */
#include "gtest/gtest.h"
#include "paddle/framework/init.h"
#include "paddle/platform/device_context.h"
TEST(Init, InitDevices) {
TEST(InitDevices, CPU) {
using paddle::framework::InitDevices;
std::vector<std::string> ds1 = {"CPU"};
ASSERT_EQ(InitDevices(ds1), true);
using paddle::platform::DeviceContextPool;
#ifdef PADDLE_WITH_CUDA
std::vector<std::string> ds2 = {"CPU", "GPU:0", "GPU:1"};
ASSERT_EQ(InitDevices(ds2), true);
// test re-init
std::vector<std::string> ds3 = {"GPU:0", "GPU:1"};
ASSERT_EQ(InitDevices(ds3), true);
#endif
InitDevices();
DeviceContextPool& pool = DeviceContextPool::Instance();
ASSERT_GE(pool.size(), 1U);
}
......@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <cctype>
namespace paddle {
namespace framework {
......@@ -41,6 +42,9 @@ inline std::string LibraryTypeToString(const LibraryType& library_type) {
inline LibraryType StringToLibraryType(const char* ctype) {
std::string s(ctype);
for (size_t i = 0; i < s.size(); ++i) {
s[i] = toupper(s[i]);
}
if (s == std::string("PLAIN")) {
return LibraryType::kPlain;
} else if (s == std::string("MKLDNN")) {
......
......@@ -43,16 +43,30 @@ std::ostream &operator<<(std::ostream &os, const LoD &lod) {
return os;
}
LoD SliceLevels(const LoD &in, size_t level_begin, size_t level_end) {
LoD new_lod;
new_lod.reserve(level_end - level_begin);
for (size_t i = level_begin; i < level_end; i++) {
new_lod.emplace_back(in.at(i));
std::ostream &operator<<(std::ostream &os, const LoDTensor &t) {
PADDLE_ENFORCE(t.type().hash_code() == typeid(float).hash_code());
if (!platform::is_cpu_place(t.place())) {
LoDTensor tt;
framework::Copy(t, platform::CPUPlace(), &tt);
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Get(t.place());
dev_ctx.Wait();
os << tt;
return os;
}
os << "dim: " << t.dims() << "\n";
os << "lod: " << t.lod() << "\n";
// only print first ten elements
int64_t size = t.numel() < 10 ? t.numel() : 10;
for (int64_t i = 0; i < size; ++i) {
os << t.data<float>()[i] << " ";
}
// transform the lowest level to absolute offset.
LoD abs_offset_lod = ToAbsOffset(in);
new_lod.back() = abs_offset_lod[level_end - 1];
return new_lod;
return os;
}
LoD SliceInLevel(const LoD &in, size_t level, size_t elem_begin,
......@@ -115,43 +129,6 @@ bool operator==(const LoD &a, const LoD &b) {
return true;
}
// Returns the distance between the offsets stored at positions `idx` and
// `idx + 1` of LoD level `level`. Both `level` and `idx` are range-checked
// first.
size_t LoDTensor::NumElements(size_t level, size_t idx) const {
  PADDLE_ENFORCE_LT(level, NumLevels());
  PADDLE_ENFORCE_LT(idx, NumElements(level));
  const auto &offsets = lod_[level];
  const size_t first = offsets[idx];
  const size_t last = offsets[idx + 1];
  return last - first;
}
// Returns the distance between positions `idx` and `idx + 1` of level `level`
// after the whole LoD has been converted with ToAbsOffset. Both `level` and
// `idx` are range-checked first.
size_t LoDTensor::NumInstancesInElement(size_t level, size_t idx) const {
  PADDLE_ENFORCE_LT(level, NumLevels());
  PADDLE_ENFORCE_LT(idx, NumElements(level));
  const auto absolute = ToAbsOffset(lod());
  const auto &offsets = absolute[level];
  return offsets[idx + 1] - offsets[idx];
}
// Replaces this tensor's LoD with SliceLevels(lod_, level_begin, level_end).
void LoDTensor::ShrinkLevels(size_t level_begin, size_t level_end) {
  lod_ = framework::SliceLevels(lod_, level_begin, level_end);
}
// Restricts the tensor to elements [elem_begin, elem_end) of LoD level
// `level`: the LoD is replaced by SliceInLevel(...) and the data is re-shared
// from the matching row slice. Fails when the resulting slice is empty.
void LoDTensor::ShrinkInLevel(size_t level, size_t elem_begin,
                              size_t elem_end) {
  PADDLE_ENFORCE_LT(level, NumLevels());
  PADDLE_ENFORCE_LT(elem_begin, NumElements(level));
  PADDLE_ENFORCE_LT(elem_end, NumElements(level) + 1);

  // The row range is derived from the LoD as it is *before* slicing.
  const auto absolute = framework::ToAbsOffset(lod());
  const size_t begin = absolute[level][elem_begin];
  const size_t end = absolute[level][elem_end];

  lod_ = framework::SliceInLevel(lod_, level, elem_begin, elem_end);

  // slice the underlying tensor
  PADDLE_ENFORCE_LT(begin, end, "Cannot shrink, the result tensor is empty.");
  ShareDataWith(Slice(begin, end));
}
using LoDAndOffset = std::pair<LoD, std::pair<size_t, size_t>>;
LoDAndOffset GetSubLoDAndAbsoluteOffset(const LoD &lod, size_t start_idx,
size_t end_idx, size_t start_level) {
......@@ -177,6 +154,9 @@ void AppendLoD(LoD *lod, const LoD &lod_length) {
lod->empty() || lod->size() == lod_length.size(),
"The lod_length should has the same size with the appended lod.");
if (lod->empty()) {
for (size_t i = 0; i < lod_length.size(); ++i) {
lod->emplace_back(1, 0); // size = 1, value = 0;
}
*lod = LoD(lod_length.size(), std::vector<size_t>({0}));
}
for (size_t i = 0; i < lod->size(); ++i) {
......@@ -189,62 +169,16 @@ void AppendLoD(LoD *lod, const LoD &lod_length) {
void SerializeToStream(std::ostream &os, const LoDTensor &tensor,
const platform::DeviceContext &dev_ctx) {
// TODO(typhoonzero): serialize to ostream
{ // the 1st field, uint32_t version
{ // the 1st field, uint32_t version for LoDTensor
constexpr uint32_t version = 0;
os.write(reinterpret_cast<const char *>(&version), sizeof(version));
}
{ // the 2nd field, tensor description
// int32_t size
// void* protobuf message
proto::TensorDesc desc;
desc.set_data_type(framework::ToDataType(tensor.type()));
auto dims = framework::vectorize(tensor.dims());
auto *pb_dims = desc.mutable_dims();
pb_dims->Resize(static_cast<int>(dims.size()), 0);
std::copy(dims.begin(), dims.end(), pb_dims->begin());
int32_t size = desc.ByteSize();
os.write(reinterpret_cast<const char *>(&size), sizeof(size));
auto out = desc.SerializeAsString();
os.write(out.data(), size);
}
{ // the 3rd field, tensor data
uint64_t size = tensor.memory_size();
auto *data_ptr = tensor.data<void>();
PADDLE_ENFORCE(size < std::numeric_limits<std::streamsize>::max(),
"Index overflow when writing tensor");
if (platform::is_gpu_place(tensor.place())) {
#ifdef PADDLE_WITH_CUDA
constexpr size_t kBufSize = 1024 * 1024 * 64; // 64MB
std::unique_ptr<char[]> buf(new char[kBufSize]);
auto &gpu_dev_ctx =
static_cast<const platform::CUDADeviceContext &>(dev_ctx);
platform::CPUPlace cpu;
uintptr_t data = reinterpret_cast<uintptr_t>(data_ptr);
while (size != 0) {
size_t size_to_write = std::min(kBufSize, static_cast<size_t>(size));
memory::Copy(cpu, buf.get(),
boost::get<platform::CUDAPlace>(tensor.place()),
reinterpret_cast<const void *>(data), size_to_write,
gpu_dev_ctx.stream());
gpu_dev_ctx.Wait();
os.write(buf.get(), size_to_write);
data += size_to_write;
size -= size_to_write;
}
#else
PADDLE_THROW("Unexpected branch");
#endif
} else {
os.write(static_cast<const char *>(data_ptr),
static_cast<std::streamsize>(size));
}
}
{ // the 4th field, lod information
// uint64_t lod_level
// uint64_t lod_level_1 size in byte.
// int* lod_level_1 data
// ...
{
// the 2nd field, LoD information
// uint64_t lod_level
// uint64_t lod_level_1 size in byte.
// int* lod_level_1 data
// ...
auto lod = tensor.lod();
uint64_t size = lod.size();
os.write(reinterpret_cast<const char *>(&size), sizeof(size));
......@@ -256,49 +190,20 @@ void SerializeToStream(std::ostream &os, const LoDTensor &tensor,
static_cast<std::streamsize>(size));
}
}
// the 3rd field, Tensor
SerializeToStream(os, static_cast<Tensor>(tensor), dev_ctx);
}
void DeserializeFromStream(std::istream &is, LoDTensor *tensor) {
uint32_t version;
is.read(reinterpret_cast<char *>(&version), sizeof(version));
PADDLE_ENFORCE_EQ(version, 0U, "Only version 0 is supported");
proto::TensorDesc desc;
{ // int32_t size
// proto buffer
int32_t size;
is.read(reinterpret_cast<char *>(&size), sizeof(size));
std::unique_ptr<char[]> buf(new char[size]);
is.read(reinterpret_cast<char *>(buf.get()), size);
PADDLE_ENFORCE(desc.ParseFromArray(buf.get(), size),
"Cannot parse tensor desc");
void DeserializeFromStream(std::istream &is, LoDTensor *tensor,
const platform::DeviceContext &dev_ctx) {
{
// the 1st field, uint32_t version for LoDTensor
uint32_t version;
is.read(reinterpret_cast<char *>(&version), sizeof(version));
PADDLE_ENFORCE_EQ(version, 0U, "Only version 0 is supported");
}
{ // read tensor
std::vector<int64_t> dims;
dims.reserve(static_cast<size_t>(desc.dims().size()));
std::copy(desc.dims().begin(), desc.dims().end(), std::back_inserter(dims));
tensor->Resize(framework::make_ddim(dims));
void *buf;
platform::Place cpu = platform::CPUPlace();
switch (desc.data_type()) {
case proto::FP32:
buf = tensor->mutable_data<float>(cpu);
break;
case proto::FP64:
buf = tensor->mutable_data<double>(cpu);
break;
case proto::INT32:
buf = tensor->mutable_data<int>(cpu);
break;
case proto::INT64:
buf = tensor->mutable_data<int64_t>(cpu);
break;
default:
PADDLE_THROW("DataType %d not supported", desc.data_type());
}
is.read(static_cast<char *>(buf), tensor->memory_size());
}
{ // read lod
{
// the 2nd field, LoD information
uint64_t lod_level;
is.read(reinterpret_cast<char *>(&lod_level), sizeof(lod_level));
auto &lod = *tensor->mutable_lod();
......@@ -312,6 +217,59 @@ void DeserializeFromStream(std::istream &is, LoDTensor *tensor) {
lod[i] = tmp;
}
}
// the 3rd field, Tensor
DeserializeFromStream(is, static_cast<Tensor *>(tensor), dev_ctx);
}
// TODO(tonyyang-svail): make this function support LoD
// Splits this tensor along its first dimension into places.size() equal
// slices and copies the i-th slice to places[i].
//
// Requirements (enforced): the tensor has no LoD, `places` is not empty, and
// dims()[0] is divisible by places.size().
// Returns the copied slices in the same order as `places`.
std::vector<LoDTensor> LoDTensor::SplitLoDTensor(
    const std::vector<platform::Place> places) const {
  check_memory_size();
  PADDLE_ENFORCE(lod().empty(), "Disable parallel lod for now");
  // Reject an empty place list up front: the modulo and divisions below
  // would otherwise divide by zero.
  PADDLE_ENFORCE(!places.empty(), "Places should not be empty");
  PADDLE_ENFORCE(dims()[0] % places.size() == 0,
                 "Batch size should be divided by places size");

  std::vector<LoDTensor> lods;
  lods.reserve(places.size());
  for (size_t place_idx = 0; place_idx < places.size(); ++place_idx) {
    // Row range [begin, end) owned by this place.
    int begin = place_idx * dims()[0] / places.size();
    int end = (place_idx + 1) * dims()[0] / places.size();
    auto src = Slice(begin, end);

    auto &dst_place = places[place_idx];
    LoDTensor dst;
    framework::Copy(src, dst_place, &dst);
    lods.emplace_back(dst);
  }
  return lods;
}
// TODO(tonyyang-svail): make this function support LoD
// Concatenates `lod_tensors` along the first dimension into this tensor,
// allocated on `dst_place`.
//
// All inputs must have the same dims, element type and layout as the first
// one (enforced); the list must not be empty.
void LoDTensor::MergeLoDTensor(
    const std::vector<const LoDTensor *> &lod_tensors,
    platform::Place dst_place) {
  PADDLE_ENFORCE(!lod_tensors.empty());

  // The first tensor defines the shape, type and layout every other input
  // has to match.
  const LoDTensor *head = lod_tensors[0];
  framework::DDim new_dim = head->dims();
  std::type_index new_type = head->type();
  auto new_layout = head->layout();
  for (size_t i = 0; i < lod_tensors.size(); ++i) {
    const LoDTensor *lod = lod_tensors[i];
    PADDLE_ENFORCE(new_dim == lod->dims());
    PADDLE_ENFORCE(new_type == lod->type());
    PADDLE_ENFORCE(new_layout == lod->layout());
  }

  // Grow the leading dimension to hold all inputs, then allocate.
  new_dim[0] *= lod_tensors.size();
  Resize(new_dim);
  set_layout(new_layout);
  mutable_data(dst_place, new_type);

  // Copy each input into its own row range of the merged tensor.
  int offset = 0;
  for (const LoDTensor *piece : lod_tensors) {
    const int next = offset + piece->dims()[0];
    auto dst = Slice(offset, next);
    framework::Copy(*piece, dst_place, &dst);
    offset = next;
  }
}
} // namespace framework
......
......@@ -58,14 +58,7 @@ using Vector = thrust::host_vector<
using LoD = std::vector<Vector<size_t>>;
std::ostream& operator<<(std::ostream& os, const LoD& lod);
/*
* Slice levels from a LoD.
* NOTE the lowest level should always be the absolute offsets of the underlying
* tensor instances. So if higher layers are sliced without the lowest level,
* the lower level of the sliced LoD will be transformed to the absolute offset.
*/
LoD SliceLevels(const LoD& in, size_t level_begin, size_t level_end);
std::ostream& operator<<(std::ostream& os, const LoDTensor& t);
LoD SliceInLevel(const LoD& in, size_t level, size_t elem_begin,
size_t elem_end);
......@@ -115,34 +108,11 @@ class LoDTensor : public Tensor {
return (lod_)[level].size() - 1;
}
/*
* Number of lower-level elements.
* For example, a 2-level lod-tensor
*
* 0-th level | |
* 1-th level || |||
*
* NumElements(0, 0) get 2
* NumElements(0, 1) get 3
*/
size_t NumElements(size_t level, size_t idx) const;
std::vector<LoDTensor> SplitLoDTensor(
const std::vector<platform::Place> places) const;
/*
* Get the number of instances in the underlying tensor in the `idx`-th
* element.
*/
size_t NumInstancesInElement(size_t level, size_t idx) const;
/*
* Shrink levels[level_begin:level_end]
*/
void ShrinkLevels(size_t level_begin, size_t level_end);
/*
* Shrink elements of a level, [elem_begin: elem_end]
* @note: low performance in slice lod_.
*/
void ShrinkInLevel(size_t level, size_t elem_begin, size_t elem_end);
void MergeLoDTensor(const std::vector<const LoDTensor*>& lod_tensors,
platform::Place place);
private:
LoD lod_;
......@@ -177,8 +147,8 @@ LoDTensor LodExpand(const LoDTensor& source, const LoD& lod, size_t level,
for (size_t ins = 0; ins < num_instances; ins++) {
for (size_t elem = lod_level[ins]; elem < lod_level[ins + 1]; elem++) {
auto slice = tensor.Slice(elem, elem + 1);
CopyFrom(source.Slice(ins, ins + 1), platform::CPUPlace(),
platform::CPUDeviceContext(), &slice);
Copy(source.Slice(ins, ins + 1), platform::CPUPlace(),
platform::CPUDeviceContext(), &slice);
}
}
return tensor;
......@@ -208,7 +178,8 @@ void AppendLoD(LoD* lod, const LoD& lod_length);
*/
void SerializeToStream(std::ostream& os, const LoDTensor& tensor,
const platform::DeviceContext& dev_ctx);
void DeserializeFromStream(std::istream& is, LoDTensor* tensor);
void DeserializeFromStream(std::istream& is, LoDTensor* tensor,
const platform::DeviceContext& dev_ctx);
} // namespace framework
} // namespace paddle
......@@ -54,78 +54,6 @@ class LoDTensorTester : public ::testing::Test {
LoDTensor lod_tensor_;
};
// The fixture tensor is expected to report three LoD levels.
TEST_F(LoDTensorTester, NumLevels) { ASSERT_EQ(lod_tensor_.NumLevels(), 3UL); }
// Checks the per-level element counts of the fixture tensor: 2, 3 and 8 for
// levels 0, 1 and 2.
TEST_F(LoDTensorTester, NumElements) {
ASSERT_EQ(lod_tensor_.NumElements(0), 2UL);
ASSERT_EQ(lod_tensor_.NumElements(1), 3UL);
ASSERT_EQ(lod_tensor_.NumElements(2), 8UL);
}
// Checks the two-argument NumElements(level, idx) overload on the fixture
// tensor for a few (level, idx) pairs.
TEST_F(LoDTensorTester, NumElements2) {
ASSERT_EQ(lod_tensor_.NumElements(0, 0), 2UL);
ASSERT_EQ(lod_tensor_.NumElements(0, 1), 1UL);
ASSERT_EQ(lod_tensor_.NumElements(1, 1), 3UL);
}
// ShrinkLevels must keep only the requested levels and leave the data pointer
// of the tensor unchanged.
TEST_F(LoDTensorTester, ShrinkLevels) {
// keep a single level: [level, level + 1) for each of the three levels
for (size_t level = 0; level < 3UL; ++level) {
LoDTensor new_lod_tensor = lod_tensor_;
new_lod_tensor.ShrinkLevels(level, level + 1);
ASSERT_EQ(new_lod_tensor.NumLevels(), 1UL);
ASSERT_EQ(new_lod_tensor.data<float>(), lod_tensor_.data<float>());
}
// keep two consecutive levels: [level, level + 2)
for (size_t level = 0; level < 2UL; ++level) {
LoDTensor new_lod_tensor = lod_tensor_;
new_lod_tensor.ShrinkLevels(level, level + 2);
// the lowest level's last element should be the tensor's batch_size.
ASSERT_EQ(new_lod_tensor.lod().back().back(),
lod_tensor_.lod().back().back());
ASSERT_EQ(new_lod_tensor.NumLevels(), 2UL);
ASSERT_EQ(new_lod_tensor.data<float>(), lod_tensor_.data<float>());
}
}
// ShrinkInLevel must slice both the LoD and the underlying rows of the tensor.
TEST_F(LoDTensorTester, ShrinkInLevel) {
// keep element [0, 1) of level 0
size_t level = 0;
LoDTensor new_lod_tensor = lod_tensor_;
new_lod_tensor.ShrinkInLevel(level, 0, 1);
ASSERT_EQ(new_lod_tensor.NumLevels(), 3UL);
ASSERT_EQ(new_lod_tensor.NumElements(0), 1UL);
ASSERT_EQ(new_lod_tensor.NumElements(1), 2UL);
ASSERT_EQ(new_lod_tensor.NumElements(2), 5UL);
ASSERT_EQ(new_lod_tensor.dims()[0], 12);
// the kept data is expected to equal its own flat index
// (assumes the fixture fills the tensor with 0, 1, 2, ... and rows are 128
// wide — TODO confirm against the fixture's SetUp)
for (int i = 0; i < 12 * 128; i++) {
ASSERT_EQ(new_lod_tensor.data<float>()[i], i);
}
// keep element [1, 2) of level 1, starting again from the full fixture tensor
level = 1;
new_lod_tensor = lod_tensor_;
new_lod_tensor.ShrinkInLevel(level, 1, 2);
ASSERT_EQ(new_lod_tensor.NumLevels(), 2UL);
ASSERT_EQ(new_lod_tensor.NumElements(0), 1UL);
ASSERT_EQ(new_lod_tensor.NumElements(1), 3UL);
ASSERT_EQ(new_lod_tensor.dims()[0], 7);
// the slice is expected to hold the values that sat at flat indices
// [5 * 128, 12 * 128) of the original data
for (int i = 5 * 128; i < 12 * 128; i++) {
ASSERT_EQ(new_lod_tensor.data<float>()[i - 5 * 128], i);
}
// two tensors sharing the fixture's data, shrunk to different elements, must
// end up with different data pointers
LoDTensor t1;
t1.set_lod(lod_tensor_.lod());
t1.ShareDataWith(lod_tensor_);
LoDTensor t2;
t2.set_lod(lod_tensor_.lod());
t2.ShareDataWith(lod_tensor_);
t1.ShrinkInLevel(0, 1, 2);
t2.ShrinkInLevel(0, 0, 1);
EXPECT_NE(t1.data<float>(), t2.data<float>());
EXPECT_NE(t1.data<float>(), lod_tensor_.data<float>());
}
TEST(LodExpand, test) {
LoD lod{{0, 2}};
LoDTensor tensor;
......@@ -187,5 +115,21 @@ TEST(LoD, AppendLoD) {
EXPECT_EQ(origin, expected);
}
// ToAbsOffset on a three-level LoD: the last level is expected to come back
// unchanged, while the upper levels are expected to be rewritten as
// {0, 5} and {0, 2, 5}.
TEST(LoD, ToAbsOffset) {
// input LoD, written level by level
LoD relative_lod;
relative_lod.push_back(std::vector<size_t>({0, 2}));
relative_lod.push_back(std::vector<size_t>({0, 1, 3}));
relative_lod.push_back(std::vector<size_t>({0, 2, 4, 5}));
LoD abs_lod = paddle::framework::ToAbsOffset(relative_lod);
// expected result after conversion
LoD expected;
expected.push_back(std::vector<size_t>({0, 5}));
expected.push_back(std::vector<size_t>({0, 2, 5}));
expected.push_back(std::vector<size_t>({0, 2, 4, 5}));
EXPECT_EQ(abs_lod, expected);
}
} // namespace framework
} // namespace paddle
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册