diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5df83499d5dde29b205ee17fba81a63c9a643235..00996cb7ed5cc573c42b69be6db369c3654d6d1a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -20,8 +20,10 @@ set(PADDLE_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR})
include(system)
project(paddle CXX C Go)
-message(STATUS "CXX compiler: " ${CMAKE_CXX_COMPILER} ", version: " ${CMAKE_CXX_COMPILER_VERSION})
-message(STATUS "C compiler: " ${CMAKE_C_COMPILER} ", version: " ${CMAKE_C_COMPILER_VERSION})
+message(STATUS "CXX compiler: ${CMAKE_CXX_COMPILER}, version: "
+ "${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION}")
+message(STATUS "C compiler: ${CMAKE_C_COMPILER}, version: "
+ "${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}")
find_package(Sphinx)
if(NOT CMAKE_CROSSCOMPILING)
diff --git a/benchmark/IntelOptimizedPaddle.md b/benchmark/IntelOptimizedPaddle.md
index 084d3237d9cfe9ca4837f77cf5f70a2449cfcc03..8b7dc5b7db800896eb4de2054ab5e584aed93999 100644
--- a/benchmark/IntelOptimizedPaddle.md
+++ b/benchmark/IntelOptimizedPaddle.md
@@ -7,11 +7,11 @@ Machine:
System: CentOS release 6.3 (Final), Docker 1.12.1.
-PaddlePaddle: (TODO: will rerun after 0.11.0)
-- paddlepaddle/paddle:latest (for MKLML and MKL-DNN)
+PaddlePaddle:
+- paddlepaddle/paddle:0.11.0 (for MKLML and MKL-DNN)
- MKL-DNN tag v0.11
- MKLML 2018.0.1.20171007
-- paddlepaddle/paddle:latest-openblas (for OpenBLAS)
+- paddlepaddle/paddle:0.11.0-openblas (for OpenBLAS)
- OpenBLAS v0.2.20
On each machine, we will test and compare the performance of training on single node using MKL-DNN / MKLML / OpenBLAS respectively.
@@ -56,15 +56,15 @@ Input image size - 3 * 224 * 224, Time: images/second
-- Alexnet
+- AlexNet
| BatchSize | 64 | 128 | 256 |
|--------------|--------| ------ | -------|
-| OpenBLAS | 2.13 | 2.45 | 2.68 |
+| OpenBLAS | 45.62 | 72.79 | 107.22 |
| MKLML | 66.37 | 105.60 | 144.04 |
| MKL-DNN | 399.00 | 498.94 | 626.53 |
-chart TBD
+
#### Inference
Test on batch size 1, 2, 4, 8, 16 on Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz
@@ -72,36 +72,41 @@ Test on batch size 1, 2, 4, 8, 16 on Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz
| BatchSize | 1 | 2 | 4 | 8 | 16 |
|-----------|-------|-------|-------|-------|-------|
-| OpenBLAS | 1.07 | 1.08 | 1.06 | 0.88 | 0.65 |
+| OpenBLAS | 1.10 | 1.96 | 3.62 | 3.63 | 2.25 |
| MKLML | 5.58 | 9.80 | 15.15 | 21.21 | 28.67 |
| MKL-DNN | 75.07 | 88.64 | 82.58 | 92.29 | 96.75 |
+
+
- ResNet-50
| BatchSize | 1 | 2 | 4 | 8 | 16 |
|-----------|-------|--------|--------|--------|--------|
-| OpenBLAS | 3.35 | 3.19 | 3.09 | 2.55 | 1.96 |
+| OpenBLAS | 3.31 | 6.72 | 11.59 | 13.17 | 9.27 |
| MKLML | 6.33 | 12.02 | 22.88 | 40.53 | 63.09 |
| MKL-DNN | 107.83| 148.84 | 177.78 | 189.35 | 217.69 |
+
- GoogLeNet
| BatchSize | 1 | 2 | 4 | 8 | 16 |
|-----------|--------|--------|--------|--------|--------|
-| OpenBLAS | 12.04 | 11.31 | 10.00 | 9.07 | 4.34 |
+| OpenBLAS | 12.06 | 23.56 | 34.48 | 36.45 | 23.12 |
| MKLML | 22.74 | 41.56 | 81.22 | 133.47 | 210.53 |
| MKL-DNN | 175.10 | 272.92 | 450.70 | 512.00 | 600.94 |
-- Alexnet
+
+
+- AlexNet
| BatchSize | 1 | 2 | 4 | 8 | 16 |
|-----------|--------|--------|--------|--------|--------|
-| OpenBLAS | | | | | |
+| OpenBLAS | 3.53 | 6.23 | 15.04 | 26.06 | 31.62 |
| MKLML | 21.32 | 36.55 | 73.06 | 131.15 | 192.77 |
| MKL-DNN | 442.91 | 656.41 | 719.10 | 847.68 | 850.51 |
-chart TBD
+
### Laptop
TBD
diff --git a/benchmark/figs/alexnet-cpu-infer.png b/benchmark/figs/alexnet-cpu-infer.png
new file mode 100644
index 0000000000000000000000000000000000000000..6215ae4e4288f969a909c258ddd5b5f51e6abb3f
Binary files /dev/null and b/benchmark/figs/alexnet-cpu-infer.png differ
diff --git a/benchmark/figs/alexnet-cpu-train.png b/benchmark/figs/alexnet-cpu-train.png
new file mode 100644
index 0000000000000000000000000000000000000000..b3200bbc049a9d75857fb5692902d7b475aa8f68
Binary files /dev/null and b/benchmark/figs/alexnet-cpu-train.png differ
diff --git a/benchmark/figs/googlenet-cpu-infer.png b/benchmark/figs/googlenet-cpu-infer.png
new file mode 100644
index 0000000000000000000000000000000000000000..19478d433bae651f4506153ded11a96d5137b409
Binary files /dev/null and b/benchmark/figs/googlenet-cpu-infer.png differ
diff --git a/benchmark/figs/googlenet-cpu-train.png b/benchmark/figs/googlenet-cpu-train.png
index c3f67faf096fe9b45dd815f294b41679dc7c9e54..4e86e058d0654d02c898bf7f5fe73aa1c7614e20 100644
Binary files a/benchmark/figs/googlenet-cpu-train.png and b/benchmark/figs/googlenet-cpu-train.png differ
diff --git a/benchmark/figs/resnet-cpu-infer.png b/benchmark/figs/resnet-cpu-infer.png
new file mode 100644
index 0000000000000000000000000000000000000000..bc43d4b8d20c600d6f1046a5986a6c62adfa6b44
Binary files /dev/null and b/benchmark/figs/resnet-cpu-infer.png differ
diff --git a/benchmark/figs/resnet-cpu-train.png b/benchmark/figs/resnet-cpu-train.png
index b96ecd5ff940c0d000613b1ed1f11fb16796cf47..96746b1759fa17d25ac5f40ed3678e16086364ba 100644
Binary files a/benchmark/figs/resnet-cpu-train.png and b/benchmark/figs/resnet-cpu-train.png differ
diff --git a/benchmark/figs/vgg-cpu-infer.png b/benchmark/figs/vgg-cpu-infer.png
new file mode 100644
index 0000000000000000000000000000000000000000..3a51ec6c474f0e0f0c4384c8ccd1e08c4382230b
Binary files /dev/null and b/benchmark/figs/vgg-cpu-infer.png differ
diff --git a/benchmark/figs/vgg-cpu-train.png b/benchmark/figs/vgg-cpu-train.png
index f830ca6a87d10b72a5113636dd5686ab25a2e864..6d548cfd59f86f8166c011d71ebde4e4b33ef644 100644
Binary files a/benchmark/figs/vgg-cpu-train.png and b/benchmark/figs/vgg-cpu-train.png differ
diff --git a/benchmark/paddle/image/run_openblas_infer.sh b/benchmark/paddle/image/run_openblas_infer.sh
index 71a49231a5527ebee9f45d5f4650ce2a4f6a1c31..a9a7b8a66717c4be0543c3fe2db293fe199e3dc4 100755
--- a/benchmark/paddle/image/run_openblas_infer.sh
+++ b/benchmark/paddle/image/run_openblas_infer.sh
@@ -8,6 +8,7 @@ function clock_to_seconds() {
}
function infer() {
+ export OPENBLAS_MAIN_FREE=1
topology=$1
layer_num=$2
bs=$3
diff --git a/cmake/external/eigen.cmake b/cmake/external/eigen.cmake
index 96fc886a342cae38d5b804266d3af7bc909a4da2..c4712f19eb80b34ffbf713d2b13fc0c775312af1 100644
--- a/cmake/external/eigen.cmake
+++ b/cmake/external/eigen.cmake
@@ -19,7 +19,7 @@ ExternalProject_Add(
if (${CMAKE_VERSION} VERSION_LESS "3.3.0")
set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/eigen3_dummy.c)
- file(WRITE ${dummyfile} "const char * dummy_eigen3 = \"${dummyfile}\";")
+ file(WRITE ${dummyfile} "const char *dummy_eigen3 = \"${dummyfile}\";")
add_library(eigen3 STATIC ${dummyfile})
else()
add_library(eigen3 INTERFACE)
diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake
index 5d24caebdcc5a28823164d718fb1628be5c4179d..89fc34796a03ff3f3e5b022ae10b2646832b1ac7 100644
--- a/cmake/external/mkldnn.cmake
+++ b/cmake/external/mkldnn.cmake
@@ -63,9 +63,30 @@ ExternalProject_Add(
-DMKLROOT:PATH=${MKLML_ROOT}
)
-ADD_LIBRARY(mkldnn SHARED IMPORTED GLOBAL)
-SET_PROPERTY(TARGET mkldnn PROPERTY IMPORTED_LOCATION ${MKLDNN_LIB})
-ADD_DEPENDENCIES(mkldnn ${MKLDNN_PROJECT})
+ADD_LIBRARY(shared_mkldnn SHARED IMPORTED GLOBAL)
+SET_PROPERTY(TARGET shared_mkldnn PROPERTY IMPORTED_LOCATION ${MKLDNN_LIB})
+ADD_DEPENDENCIES(shared_mkldnn ${MKLDNN_PROJECT})
MESSAGE(STATUS "MKLDNN library: ${MKLDNN_LIB}")
add_definitions(-DPADDLE_WITH_MKLDNN)
-LIST(APPEND external_project_dependencies mkldnn)
+LIST(APPEND external_project_dependencies shared_mkldnn)
+
+# generate a static dummy target to track mkldnn dependencies
+# for cc_library(xxx SRCS xxx.c DEPS mkldnn)
+SET(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/mkldnn_dummy.c)
+FILE(WRITE ${dummyfile} "const char * dummy = \"${dummyfile}\";")
+ADD_LIBRARY(mkldnn STATIC ${dummyfile})
+TARGET_LINK_LIBRARIES(mkldnn ${MKLDNN_LIB} ${MKLML_LIB} ${MKLML_IOMP_LIB})
+ADD_DEPENDENCIES(mkldnn ${MKLDNN_PROJECT})
+
+# copy the real libmkldnn.so.0 to the install dir
+# so that it can be directly included in the wheel or capi
+SET(MKLDNN_SHARED_LIB ${MKLDNN_INSTALL_DIR}/libmkldnn.so.0)
+ADD_CUSTOM_COMMAND(OUTPUT ${MKLDNN_SHARED_LIB}
+ COMMAND cp ${MKLDNN_LIB} ${MKLDNN_SHARED_LIB}
+ DEPENDS mkldnn)
+ADD_CUSTOM_TARGET(mkldnn_shared_lib ALL DEPENDS ${MKLDNN_SHARED_LIB})
+
+IF(WITH_C_API)
+ INSTALL(FILES ${MKLDNN_SHARED_LIB} DESTINATION lib)
+ENDIF()
+
diff --git a/cmake/external/mklml.cmake b/cmake/external/mklml.cmake
index 20dbc32a738d982df2d3f035206279c82c8de264..15a07ea3daf5aa606235f20288a8306966334a1a 100644
--- a/cmake/external/mklml.cmake
+++ b/cmake/external/mklml.cmake
@@ -66,3 +66,7 @@ ADD_LIBRARY(mklml SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET mklml PROPERTY IMPORTED_LOCATION ${MKLML_LIB})
ADD_DEPENDENCIES(mklml ${MKLML_PROJECT})
LIST(APPEND external_project_dependencies mklml)
+
+IF(WITH_C_API)
+ INSTALL(FILES ${MKLML_LIB} ${MKLML_IOMP_LIB} DESTINATION lib)
+ENDIF()
diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake
index 97857a686b38d935b19f510ecdcb66bcca91fe03..0e79c0cc7992060cbe3b668ec927936183389eb6 100644
--- a/cmake/external/openblas.cmake
+++ b/cmake/external/openblas.cmake
@@ -30,23 +30,21 @@ IF(NOT ${CBLAS_FOUND})
CACHE FILEPATH "openblas library." FORCE)
SET(OPENBLAS_CC "${CMAKE_C_COMPILER} -Wno-unused-but-set-variable -Wno-unused-variable")
+ SET(OPENBLAS_COMMIT "v0.2.20")
IF(CMAKE_CROSSCOMPILING)
SET(OPTIONAL_ARGS HOSTCC=${HOST_C_COMPILER})
GET_FILENAME_COMPONENT(CROSS_SUFFIX ${CMAKE_C_COMPILER} DIRECTORY)
SET(CROSS_SUFFIX ${CROSS_SUFFIX}/)
IF(ANDROID)
- # arm_soft_fp_abi branch of OpenBLAS to support softfp
- # https://github.com/xianyi/OpenBLAS/tree/arm_soft_fp_abi
- SET(OPENBLAS_COMMIT "b5c96fcfcdc82945502a2303116a64d89985daf5")
IF(ANDROID_ABI MATCHES "^armeabi(-v7a)?$")
+ # use softfp
SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV7 ARM_SOFTFP_ABI=1 USE_THREAD=0)
ELSEIF(ANDROID_ABI STREQUAL "arm64-v8a")
SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV8 BINARY=64 USE_THREAD=0)
ENDIF()
ELSEIF(IOS)
IF(CMAKE_OSX_ARCHITECTURES MATCHES "arm64")
- SET(OPENBLAS_COMMIT "b5c96fcfcdc82945502a2303116a64d89985daf5")
SET(OPENBLAS_CC "${OPENBLAS_CC} ${CMAKE_C_FLAGS} -isysroot ${CMAKE_OSX_SYSROOT}")
SET(OPENBLAS_CC "${OPENBLAS_CC} -arch arm64")
SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV8 BINARY=64 USE_THREAD=0 CROSS_SUFFIX=${CROSS_SUFFIX})
@@ -56,14 +54,12 @@ IF(NOT ${CBLAS_FOUND})
ENDIF()
ELSEIF(RPI)
# use hardfp
- SET(OPENBLAS_COMMIT "v0.2.20")
SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV7 USE_THREAD=0)
ENDIF()
ELSE()
IF(APPLE)
SET(OPENBLAS_CC "${CMAKE_C_COMPILER} -isysroot ${CMAKE_OSX_SYSROOT}")
ENDIF()
- SET(OPENBLAS_COMMIT "v0.2.20")
SET(OPTIONAL_ARGS "")
IF(CMAKE_SYSTEM_PROCESSOR MATCHES "^x86(_64)?$")
SET(OPTIONAL_ARGS DYNAMIC_ARCH=1 NUM_THREADS=64)
@@ -113,7 +109,7 @@ INCLUDE_DIRECTORIES(${CBLAS_INC_DIR})
# FIXME(gangliao): generate cblas target to track all high performance
# linear algebra libraries for cc_library(xxx SRCS xxx.c DEPS cblas)
SET(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/cblas_dummy.c)
-FILE(WRITE ${dummyfile} "const char * dummy = \"${dummyfile}\";")
+FILE(WRITE ${dummyfile} "const char *dummy_cblas = \"${dummyfile}\";")
ADD_LIBRARY(cblas STATIC ${dummyfile})
TARGET_LINK_LIBRARIES(cblas ${CBLAS_LIBRARIES})
diff --git a/cmake/external/warpctc.cmake b/cmake/external/warpctc.cmake
index a8e1aca49c97df256b1269c286b0bce7732fa932..7cb4efa7bff7164464f1210a2b2188226c219ef6 100644
--- a/cmake/external/warpctc.cmake
+++ b/cmake/external/warpctc.cmake
@@ -63,7 +63,7 @@ ExternalProject_Add(
MESSAGE(STATUS "warp-ctc library: ${WARPCTC_LIBRARIES}")
INCLUDE_DIRECTORIES(${WARPCTC_INCLUDE_DIR})
-ADD_LIBRARY(warpctc STATIC IMPORTED GLOBAL)
+ADD_LIBRARY(warpctc SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET warpctc PROPERTY IMPORTED_LOCATION ${WARPCTC_LIBRARIES})
ADD_DEPENDENCIES(warpctc extern_warpctc)
diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index 66c8e3ad7ef7c80c1f388c25983425a0db5c0220..585db019d521b1699baadfae31ef95b5059c71b4 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -120,7 +120,7 @@ function(merge_static_libs TARGET_NAME)
DEPENDS ${libs})
# Generate dummy staic lib
- file(WRITE ${target_SRCS} "const char *dummy = \"${target_SRCS}\";")
+ file(WRITE ${target_SRCS} "const char *dummy_${TARGET_NAME} = \"${target_SRCS}\";")
add_library(${TARGET_NAME} STATIC ${target_SRCS})
target_link_libraries(${TARGET_NAME} ${libs_deps})
@@ -160,7 +160,7 @@ function(merge_static_libs TARGET_NAME)
DEPENDS ${libs} ${target_OBJS})
# Generate dummy staic lib
- file(WRITE ${target_SRCS} "const char *dummy = \"${target_SRCS}\";")
+ file(WRITE ${target_SRCS} "const char *dummy_${TARGET_NAME} = \"${target_SRCS}\";")
add_library(${TARGET_NAME} STATIC ${target_SRCS})
target_link_libraries(${TARGET_NAME} ${libs_deps})
@@ -324,7 +324,7 @@ function(go_library TARGET_NAME)
)
# Add dummy code to support `make target_name` under Terminal Command
- file(WRITE ${dummyfile} "const char * dummy = \"${dummyfile}\";")
+ file(WRITE ${dummyfile} "const char *dummy_${TARGET_NAME} = \"${dummyfile}\";")
if (go_library_SHARED OR go_library_shared)
add_library(${TARGET_NAME} SHARED ${dummyfile})
else()
diff --git a/doc/api/v2/fluid/layers.rst b/doc/api/v2/fluid/layers.rst
index 004ee2d8c85ce7661886179570e693d7d61bc6d8..a7c8670f66cc7f319e41155211ead2d89126117f 100644
--- a/doc/api/v2/fluid/layers.rst
+++ b/doc/api/v2/fluid/layers.rst
@@ -307,6 +307,12 @@ sequence_expand
:noindex:
+gru_unit
+--------
+.. autofunction:: paddle.v2.fluid.layers.gru_unit
+ :noindex:
+
+
lstm_unit
---------
.. autofunction:: paddle.v2.fluid.layers.lstm_unit
diff --git a/doc/design/ci_build_whl.png b/doc/design/ci_build_whl.png
new file mode 100644
index 0000000000000000000000000000000000000000..232762b82a9ae3e979a1f38a7beb715c87438f40
Binary files /dev/null and b/doc/design/ci_build_whl.png differ
diff --git a/doc/design/concurrent_programming.md b/doc/design/concurrent_programming.md
new file mode 100644
index 0000000000000000000000000000000000000000..afc65e831d58ff427663806e56294292ccbef85b
--- /dev/null
+++ b/doc/design/concurrent_programming.md
@@ -0,0 +1,163 @@
+# Design Doc: Concurrent Programming with Fluid
+
+With PaddlePaddle Fluid, users describe a program rather than a model. The program is a [`ProgramDesc`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto) protobuf message. TensorFlow/MxNet/Caffe2 applications generate protobuf messages too, but their protobuf messages represent the model, a graph of operators, rather than the program that trains/uses the model.
+
+Many know that when we program TensorFlow, we can specify the device on which each operator runs. This allows us to create a concurrent/parallel AI application. An interesting question is **how does a `ProgramDesc` represent a concurrent program?**
+
+The answer relies on the fact that a `ProgramDesc` is similar to an abstract syntax tree (AST) that describes a program. So users can write a concurrent program just as they would with any concurrent programming language, e.g., [Go](https://golang.org).
+
+## An Analogy
+
+The following table compares concepts in Fluid and Go
+
+| Go | Fluid |
+|----|-------|
+|user-defined functions | [layers](https://github.com/PaddlePaddle/Paddle/tree/develop/python/paddle/v2/fluid) |
+| control-flow and built-in functions | [intrinsics/operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators) |
+| goroutines, channels | [class ThreadPool](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/framework/thread_pool.h) |
+| runtime | [class Executor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/executor.h) |
+
+## An Example Concurrent Program
+
+To review all of the above concepts in an example, let us take a simple program and write its distributed version.
+
+Suppose that we want to parallelize a naive Fluid program (written in Go and calling Fluid's Go binding) that multiplies two tensors.
+
+```go
+import "fluid"
+
+func paddlepaddle() {
+ X = fluid.read(...)
+ W = fluid.Tensor(...)
+ Y = fluid.mult(X, W)
+}
+```
+
+Please be aware that Fluid's Go binding provides the default `main` function, which calls the `paddlepaddle` function, which, in this case, is defined in the above program and creates the following `ProgramDesc` message.
+
+```protobuf
+message ProgramDesc {
+ block[0] = Block {
+ vars = [X, W, Y],
+ ops = [
+ read(output = X)
+ assign(input = ..., output = W)
+ mult(input = {X, W}, output = Y)
+ ],
+ }
+}
+```
+
+Then, the default `main` function calls `fluid.run()`, which creates an instance of the [`class Executor`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/executor.h) and calls `Executor.Run(block[0])`, where `block[0]` is the first and only block defined in the above `ProgramDesc` message.
+
+The default `main` function is defined as follows:
+
+```go
+func main() {
+ paddlepaddle()
+ fluid.run()
+}
+```
+
+## The Concurrent Version
+
+By parallelizing the above program, we could support a very big tensor X by splitting it into small pieces {x_1, x_2, ...} and sending each piece to a worker process/node for parallel multiplication.
+
+In this case, we can write a transpiler that takes a `ProgramDesc` message that represents the above example program and outputs two `ProgramDesc` messages, one for running on the master process/node, and the other one for worker processes/nodes.
+
+### The Master Program
+
+The master program could look like the following:
+
+```protobuf
+message ProgramDesc {
+ block[0] = Block {
+ vars = [X, L, Y],
+ ops = [
+ read(output = X)
+ kube_get_workers_addrs(output = L)
+ Y = tensor_array(len(L))
+ parallel_for(input = X, output = Y,
+ attrs = {L, block_id(1)}) # referring to block 1
+ ]
+ }
+
+ block[1] = Block {
+ parent = 0,
+ vars = [x, y, index],
+ ops = [
+ slice(input = [X, index], output = x) # index is initialized by parallel_for
+ send(input = x, attrs = L[index])
+ recv(outputs = y, attrs = L[index])
+ assign(input = y, output = Y[index])
+ ]
+ }
+}
+```
+
+The equivalent Fluid program (calling the Go binding) is:
+
+```go
+func main() { //// block 0
+ X = fluid.read(...)
+ L = fluid.k8s.get_worker_addrs()
+ Y = fluid.tensor_array(len(L))
+ fluid.parallel_for(X, L,
+ func(index int) { //// block 1
+ x = X[index]
+ fluid.send(L[index], x)
+ y = fluid.recv(L[index])
+ Y[index] = y
+ })
+}
+```
+
+An explanation of the above program:
+
+- `fluid.k8s` is a package that provides access to the Kubernetes API.
+- `fluid.k8s.get_worker_addrs` returns the list of IPs and ports of all pods of the current job except for the current pod (the master pod).
+- `fluid.tensor_array` creates a [tensor array](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/lod_tensor_array.h). `fluid.parallel_for` creates a `ParallelFor` intrinsic, which, when executed,
+
+ 1. creates `len(L)` scopes, each for the concurrent running of the sub-block (block 1 in this case), and initializes a variable named "index" in the scope to an integer value in the range `[0, len(L)-1]`, and
+ 2. creates `len(L)` threads by calling into the `ThreadPool` singleton, each thread
+ 1. creates an Executor instance, and
+ 2. calls `Executor.Run(block)`, where `block` is block 1 as explained above.
+1. Please be aware that block 1 is a sub-block of block 0, so ops in block 1 could refer to variables defined in block 0.
+
+### The Worker Program
+
+The worker program looks like
+
+```go
+func main() {
+ W = Tensor(...)
+ x = fluid.listen_and_do(
+ fluid.k8s.self_addr(),
+ func(input Tensor) {
+ output = fluid.mult(input, W)
+ })
+}
+```
+
+where
+
+- `fluid.listen_and_do` creates a `ListenAndDo` intrinsic, which, when executed,
+ 1. listens on the current pod's IP address, as returned by `fluid.k8s.self_addr()`,
+ 2. once a connection is established,
+ 1. creates a scope of two parameters, "input" and "output",
+ 2. reads a [Fluid variable](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/variable.h) and saves it into "input",
+ 3. creates an Executor instance and calls `Executor.Run(block)`, where the block is generated by running the lambda specified as the second parameter of `fluid.listen_and_do`.
+
+## Summary
+
+From the above example, we see that:
+
+1. Fluid enables the imperative programming paradigm by:
+ 1. letting users describe a program, but not a model (a sequence of layers, or a graph of operators), and
+ 2. calling the `fluid.run` function that runs the program implicitly.
+1. The program is described as a `ProgramDesc` protobuf message.
+2. Function `Executor.Run` takes a block, instead of a `ProgramDesc`, as its parameter.
+3. `fluid.run` calls `Executor.Run` to run the first block in the `ProgramDesc` message.
+4. `Executor.Run`'s implementation is extremely simple -- it doesn't plan the execution nor create threads; instead, it runs on the current thread and executes intrinsics/operators' `Run` methods sequentially as they appear in the `Block.ops` array.
+5. Intrinsics/operators' `Run` method might create threads. For example, the `ListenAndDo` operator creates a thread to handle each incoming request.
+6. Threads are not necessarily OS threads; instead, they could be [green threads](https://en.wikipedia.org/wiki/Green_threads) managed by ThreadPool. Multiple green threads might run on the same OS thread. An example of green threads is Go's [goroutines](https://tour.golang.org/concurrency/1).
diff --git a/doc/design/refactor/distributed_architecture.md b/doc/design/dist_refactor/distributed_architecture.md
similarity index 52%
rename from doc/design/refactor/distributed_architecture.md
rename to doc/design/dist_refactor/distributed_architecture.md
index d9fe7d6bbb0eeb73fcdca3ee749a4f10bcdda682..3a741f95866fb6c301ca9097af7916281f2278cf 100644
--- a/doc/design/refactor/distributed_architecture.md
+++ b/doc/design/dist_refactor/distributed_architecture.md
@@ -52,8 +52,9 @@ The IR for PaddlePaddle after refactoring is called a `Block`, it specifies the
The user can not directly specify the parameter update rule for the parameter server in the Python module, since the parameter server does not use the same computation definition as the trainer. Instead, the update rule is baked inside the parameter server. The user can not specify the update rule explicitly.
-This could be fixed by making the parameter server run the same computation definition as the trainer (the user's Python module). For a detailed explanation, refer to this document -
-[Design Doc: Operation Graph Based Parameter Server](./parameter_server.md)
+This could be fixed by making the parameter server also run an IR, which can be different from the trainer side's.
+For a detailed explanation, refer to this document -
+[Design Doc: Parameter Server](./parameter_server.md)
## Distributed Training Architecture
@@ -61,68 +62,111 @@ The revamped distributed training architecture can address the above discussed l
-The major components in the architecture are: *PaddlePaddle Python*, *PaddlePaddle converter* and *PaddlePaddle runtime*.
+The major components are: *Python API*, *Distributed Transpiler* and *Remote Executor*.
-### PaddlePaddle Python
+### Python API
-PaddlePaddle Python is the Python library that user's Python code invokes, to read the data. build the neural network topology, start training, etc.
+The Python API is the Python library that the user's Python code invokes to read the data, build the neural network topology, and start training, etc.
```Python
-paddle.init()
-input = paddle.op.recordIO("/home/data/mnist.recordio") # file stored on the cluster
-img, label = input[0], input[1]
-hidden = paddle.layer.fc(input=img, size=200, act=paddle.activation.Tanh())
-prediction = paddle.layer.fc(input=img, size=10, act=paddle.activation.Softmax())
-cost = paddle.layer.classification_cost(input=prediction, label=label)
-optimizer = paddle.optimizer.SGD(cost, learning_rate=0.01)
-session = paddle.session.NewRemote(num_trainer=3, num_ps=2, GPU_per_trainer=1)
-for i in range(1000):
- _, cost_val = session.eval(targets=[cost, optimizer])
- print cost_val
+images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype='float32')
+label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+...
+predict = fluid.layers.fc(input=conv_pool_2, size=10, act="softmax")
+cost = fluid.layers.cross_entropy(input=predict, label=label)
+avg_cost = fluid.layers.mean(x=cost)
+optimizer = fluid.optimizer.Adam(learning_rate=0.01)
+optimizer.minimize(avg_cost)
+
+train_reader = paddle.batch(
+ paddle.reader.shuffle(
+ paddle.dataset.mnist.train(), buf_size=500),
+ batch_size=BATCH_SIZE)
+
+place = fluid.CPUPlace()
+exe = fluid.Executor(place)
+
+for pass_id in range(10):
+ for data in train_reader():
+ loss, acc = exe.run(trainer_prog,
+ feed=feeder.feed(data),
+ fetch_list=[avg_cost])
```
-The above code is what a typical Python trainer code is, the neural network topology is built using the helper functions such as `paddle.layer.fc`. Training is done by calling `session.eval` iteratively.
-
-#### session.eval
-
-As shown in the graph, `session.eval` sends the IR and the evaluation inputs or targets to the PaddlePaddle cluster for evaluation.
-The targets can be any variable in the computation graph. When the target is say, the `optimizer` variable, the neural network will be optimized once. When the target is the `cost` variable, `session.eval` returns the cost value. Based on what the target is, an appropriate action is taken.
-
-The Python `session` is a wrapper of the C++ `Session` class. For more information about `Session`, refer to this document - [Design Doc: Session](./session.md).
-
-### PaddlePaddle Converter
-
-The PaddlePaddle converter automatically converts the IR in the request (IR and evaluation inputs/targets) from PaddlePaddle Python to partitioned IRs and dispatches the new IRs and evaluation inputs/targets to different PaddlePaddle runtimes. Below are the steps that are followed :
-
-1. Add a `feed` OP that feeds the eval inputs, and a `fetch` OP that fetches the eval targets to the IR.
-
-2. Extract a new computation (sub)graph with the `feed` and `fetch` OPs as the boundary. The runtime does not need to run the OP that is not dependent on the `fetch` OP.
-
-3. Optimize the computation graph.
-
-4. Place the OPs in the graph onto different devices on different PaddlePaddle runtime according to a placement algorithm and the device constraints specified by the user.
-
-5. Partition the graph according to runtime boundaries and add `send` / `recv` OP pair on the runtime boundaries.
+The code above is a typical local training program. The "Training Program" is built using helper functions such as
+`fluid.layers.fc`. The training is done by calling `Executor.run`
+iteratively.
+
+For more details, see [Program](../program.md) for the implementation of the IR; `ProgramDesc` is its protobuf type.
+
+[Executor](../executor.md) simply runs the `ProgramDesc`. For local training you generally use
+`Executor` to run the program locally. For any kind of distributed training, you can use
+`RemoteExecutor` to specify the desired distributed training method with some optional arguments.
+
+### Distributed Transpiler
+
+The Distributed Transpiler automatically converts the IR (in protobuf format) to partitioned IRs. Then
+the Remote Executor dispatches the new IRs to Remote Executors across the cluster.
+Below are the steps that are followed:
+
+1. Users only need to change `Executor` to `RemoteExecutor` to turn a local program into a distributed program.
+1. `RemoteExecutor` calls `Distributed Transpiler` to "transpile" the user's program into several IRs representing a
+ distributed training program:
+ 1. Parse configurations from `RemoteExecutor`.
+ 1. Determine the type of distributed program, which can be DataParallelism, ModelParallelism or Streaming.
+ 1. Partition the `ProgramDesc` according to the type and add `send` / `recv` OP pairs on the boundaries. Taking the
+ DataParallelism type as an example, it removes the optimization operators, adds a `send` OP to the
+ "trainer" role, and then adds the optimization operators to the parameter server role together with the `recv` OP.
+1. Dispatch the partitioned graph to the different `RemoteExecutor`s in the cluster.
+1. The `RemoteExecutor` on each node runs the received `ProgramDesc` until the end.
+
+
+### RemoteExecutor
+
+As shown in the graph, `RemoteExecutor.run` sends the IR to the cluster for execution.
+You can also use the `fetch_list` parameter to interactively fetch variables back to the local machine for
+log printing.
+
+The Python `RemoteExecutor` is derived from the `Executor` class.
+
+```python
+exe = RemoteExecutor(
+ feed=feeder.feed(data),
+ fetch_list=[avg_cost],
+ job_desc=JobDesc(
+ jobname,
+ num_trainer,
+ num_pserver,
+ cpu_per_trainer,
+ gpu_per_trainer,
+ mem_per_trainer,
+ cpu_per_pserver,
+ mem_per_pserver
+ ))
+for data in train_reader():
+ loss, acc = exe.run(trainer_prog,
+ feed=feeder.feed(data),
+ fetch_list=[avg_cost])
+```
-6. Dispatch the partitioned graph to different PaddlePaddle runtimes.
+The `JobDesc` object describes the distributed job resource specification for running in a
+cluster environment.
-7. PaddlePaddle runtimes with the `fetch` OP reports evaluation results back to the converter, the converter reports the evaluation results back to the PaddlePaddle Python.
+
-The output IRs will be cached to optimize the conversion latency.
+`RemoteExecutor.run` sends the `ProgramDesc` and
+[TrainingJob](https://github.com/PaddlePaddle/cloud/blob/develop/doc/autoscale/README.md#training-job-resource)
+to a server in the cluster which executes `RemoteExecutor.listen`. This server is responsible
+for starting the final Kubernetes Jobs that run the different roles of the `ProgramDesc`.
-#### Placement Algorithm
+### Placement Algorithm
Our first implementation will only support "trainer-parameter server" placement: the parameters, initializers, and optimizers are all placed on the PaddlePaddle runtimes with the parameter server role. Everything else will be placed on the PaddlePaddle runtimes with the trainer role. This has the same functionality as the "trainer-parameter server" architecture of PaddlePaddle v0.10.0, but is more generic and flexible.
In the future, a more general placement algorithm should be implemented, which makes placements according to the input IR, and a model of device computation time and device communication time. Model parallelism requires the generic placement algorithm.
-### PaddlePaddle Runtime
-
-The PaddlePaddle runtime owns multiple devices (e.g., CPUs, GPUs) and runs the IR. The runtime does not need to do OP placement since it is already done by the converter.
-
-
### Local Training Architecture
The local training architecture will be the same as the distributed training architecture, the difference is that everything runs locally, and there is just one PaddlePaddle runtime:
@@ -132,9 +176,18 @@ The local training architecture will be the same as the distributed training arc
### Training Data
-In PaddlePaddle v0.10.0, training data is typically read with a [data reader](../reader/README.md) from Python. This approach is no longer efficient when training in a distributed fashion since the Python process no longer runs on the same node with the trainer processes. The Python reader will need to read from the distributed filesystem (assuming it has the required access) and send to the trainers, doubling the network traffic.
-
-When doing distributed training, the user can still use Python data reader: the training data are sent with `session.eval`. However this should be used for debugging purpose only. The users are encouraged to use the read data OPs.
+In PaddlePaddle v0.10.0, training data is typically read
+with a [data reader](../reader/README.md) from Python. This approach is
+no longer efficient when training in a distributed fashion, since the
+Python process no longer runs on the same node as the trainer processes.
+The Python reader would need to read from the distributed filesystem
+(assuming it has the required access) and send the data to the trainers,
+doubling the network traffic.
+
+When doing distributed training, the user can still use the Python data
+reader: the training data are sent with `Executor.run`. However, this should
+be used for debugging purposes only. The users are encouraged to use
+the read data OPs.
## References:
diff --git a/doc/design/refactor/multi_cpu.md b/doc/design/dist_refactor/multi_cpu.md
similarity index 100%
rename from doc/design/refactor/multi_cpu.md
rename to doc/design/dist_refactor/multi_cpu.md
diff --git a/doc/design/refactor/parameter_server.md b/doc/design/dist_refactor/parameter_server.md
similarity index 76%
rename from doc/design/refactor/parameter_server.md
rename to doc/design/dist_refactor/parameter_server.md
index fa3c5d7990213cf2b0d236e66e592dd2699da876..1094f06d461275a9ad4034d5e48b39856d967b71 100644
--- a/doc/design/refactor/parameter_server.md
+++ b/doc/design/dist_refactor/parameter_server.md
@@ -1,4 +1,4 @@
-# Design Doc: Operation Graph Based Parameter Server
+# Design Doc: Parameter Server
## Abstract
@@ -10,7 +10,7 @@ different purposes.
## Background
The previous implementations of the parameter server does not run a
-subgraph. parameter initialization, optimizer computation, network
+fluid sub-program. Parameter initialization, optimizer computation, network
communication and checkpointing are implemented twice on both the
trainer and the parameter server.
@@ -23,10 +23,10 @@ server becomes a natural extension.
## Design
-### Graph Converter
+### Distributed Transpiler
-The *graph converter* converts the user-defined operation (OP) graph
-into subgraphs to be scheduled on different nodes with the following
+The *Distributed Transpiler* converts the user-defined fluid program
+into sub-programs to be scheduled on different nodes with the following
steps:
1. OP placement: the OPs will be placed on different nodes according
@@ -34,7 +34,6 @@ steps:
time. Currently we will use a simple heuristic that puts parameter
varable on parameter server workers and everything else on trainer
workers.
-
1. Add communication OPs to enable the communication between nodes.
We will need these OPs: *Send*, *Recv*, *Enqueue*, *Dequeue*.
@@ -48,8 +47,8 @@ After converting:
-1. The parameter variable W and it's optimizer subgraph are placed on the parameter server.
-1. Operators are added to the subgraphs.
+1. The parameter variable W and its optimizer program are placed on the parameter server.
+1. Operators are added to the program.
- *Send* sends data to the connected *Recv* operator. The
scheduler on the receive node will only schedule *Recv* operator
to run when the *Send* operator has ran (the *Send* OP will mark
@@ -64,39 +63,30 @@ After converting:
### Benefits
- Model parallelism become easier to implement: it's an extension to
- the trainer - parameter server approach. we already have the
- communication OPs, but need to extend the graph converter's
- placement functionality.
-
+ the trainer - parameter server approach. We can have several "Transpilers"
+ to achieve different goals.
- User-defined optimizer is easier to add - user can now express it as
- a subgraph.
-
+ a sub-program.
- No more duplication logic inside the trainer and the parameter
server mentioned in the background section.
### Challenges
-- It might be hard for the graph converter to cut a general graph
- (without any hint for which subgraph is the optimizer). We may need
- to label which subgraph inside the OP graph is the optimizer.
-
- It's important to balance the parameter shards of on multiple
parameter server. If a single parameter is very big (some
word-embedding, fully connected, softmax layer), we need to
automatically partition the single parameter onto different
parameter servers when possible (only element-wise optimizer depends
on the parameter variable).
+- In the "Aync SGD" figure, the "W" variable on the parameter server
+ could be read and wrote concurrently. See
+ [here](https://github.com/PaddlePaddle/Paddle/pull/6394) for more
+ details about concurrent program in fluid.
### Discussion
-- In the "Aync SGD" figure, the "W" variable on the parameter server
- could be read and wrote concurrently, what is our locking strategy?
- E.g., each variable have a lock cpp method to be invoked by every
- OP, or, have a lock OP.
-
- Can the Enqueue OP be implemented under our current tensor design
(puts the input tensor into the queue tensor)?
-
- *Dequeue* OP will have variable numbers of output (depends on the
`min_count` attribute), does our current design support it? (similar
question for the *Add* OP)
diff --git a/doc/design/refactor/src/compiler.graffle b/doc/design/dist_refactor/src/compiler.graffle
similarity index 100%
rename from doc/design/refactor/src/compiler.graffle
rename to doc/design/dist_refactor/src/compiler.graffle
diff --git a/doc/design/refactor/src/compiler.png b/doc/design/dist_refactor/src/compiler.png
similarity index 100%
rename from doc/design/refactor/src/compiler.png
rename to doc/design/dist_refactor/src/compiler.png
diff --git a/doc/design/refactor/src/dist-graph.graffle b/doc/design/dist_refactor/src/dist-graph.graffle
similarity index 100%
rename from doc/design/refactor/src/dist-graph.graffle
rename to doc/design/dist_refactor/src/dist-graph.graffle
diff --git a/doc/design/refactor/src/dist-graph.png b/doc/design/dist_refactor/src/dist-graph.png
similarity index 100%
rename from doc/design/refactor/src/dist-graph.png
rename to doc/design/dist_refactor/src/dist-graph.png
diff --git a/doc/design/dist_refactor/src/distributed_architecture.graffle b/doc/design/dist_refactor/src/distributed_architecture.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..d1b60141342232e06227c2d430ebc60ec349a907
Binary files /dev/null and b/doc/design/dist_refactor/src/distributed_architecture.graffle differ
diff --git a/doc/design/dist_refactor/src/distributed_architecture.png b/doc/design/dist_refactor/src/distributed_architecture.png
new file mode 100644
index 0000000000000000000000000000000000000000..29c7b0c0783f97c6d33b1db1ed484d6a2b9dd356
Binary files /dev/null and b/doc/design/dist_refactor/src/distributed_architecture.png differ
diff --git a/doc/design/refactor/src/local-graph.graffle b/doc/design/dist_refactor/src/local-graph.graffle
similarity index 100%
rename from doc/design/refactor/src/local-graph.graffle
rename to doc/design/dist_refactor/src/local-graph.graffle
diff --git a/doc/design/refactor/src/local-graph.png b/doc/design/dist_refactor/src/local-graph.png
similarity index 100%
rename from doc/design/refactor/src/local-graph.png
rename to doc/design/dist_refactor/src/local-graph.png
diff --git a/doc/design/dist_refactor/src/local_architecture.graffle b/doc/design/dist_refactor/src/local_architecture.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..49fcc663ebe3824aa234e3a67aadf285cb417877
Binary files /dev/null and b/doc/design/dist_refactor/src/local_architecture.graffle differ
diff --git a/doc/design/dist_refactor/src/local_architecture.png b/doc/design/dist_refactor/src/local_architecture.png
new file mode 100644
index 0000000000000000000000000000000000000000..14adc9fd72b855bb9f74fbf2c84ac9ec0cf2b122
Binary files /dev/null and b/doc/design/dist_refactor/src/local_architecture.png differ
diff --git a/doc/design/refactor/src/multi-threads.graffle b/doc/design/dist_refactor/src/multi-threads.graffle
similarity index 100%
rename from doc/design/refactor/src/multi-threads.graffle
rename to doc/design/dist_refactor/src/multi-threads.graffle
diff --git a/doc/design/refactor/src/multi-threads/multi-threads@3x.png b/doc/design/dist_refactor/src/multi-threads/multi-threads@3x.png
similarity index 100%
rename from doc/design/refactor/src/multi-threads/multi-threads@3x.png
rename to doc/design/dist_refactor/src/multi-threads/multi-threads@3x.png
diff --git a/doc/design/refactor/src/multi-threads/single-thread@3x.png b/doc/design/dist_refactor/src/multi-threads/single-thread@3x.png
similarity index 100%
rename from doc/design/refactor/src/multi-threads/single-thread@3x.png
rename to doc/design/dist_refactor/src/multi-threads/single-thread@3x.png
diff --git a/doc/design/refactor/src/paddle-compile.graffle b/doc/design/dist_refactor/src/paddle-compile.graffle
similarity index 100%
rename from doc/design/refactor/src/paddle-compile.graffle
rename to doc/design/dist_refactor/src/paddle-compile.graffle
diff --git a/doc/design/refactor/src/paddle-compile.png b/doc/design/dist_refactor/src/paddle-compile.png
similarity index 100%
rename from doc/design/refactor/src/paddle-compile.png
rename to doc/design/dist_refactor/src/paddle-compile.png
diff --git a/doc/design/dist_refactor/src/remote_executor.graffle b/doc/design/dist_refactor/src/remote_executor.graffle
new file mode 100644
index 0000000000000000000000000000000000000000..ce2c18fee5687732053c48af9c8c290a994a8090
Binary files /dev/null and b/doc/design/dist_refactor/src/remote_executor.graffle differ
diff --git a/doc/design/dist_refactor/src/remote_executor.png b/doc/design/dist_refactor/src/remote_executor.png
new file mode 100644
index 0000000000000000000000000000000000000000..6be4b1841b99efdb59557975485d0387f422308c
Binary files /dev/null and b/doc/design/dist_refactor/src/remote_executor.png differ
diff --git a/doc/design/error_clip.md b/doc/design/error_clip.md
new file mode 100644
index 0000000000000000000000000000000000000000..8e845462cce2a29556bcb6010b08f00fbc3d99d7
--- /dev/null
+++ b/doc/design/error_clip.md
@@ -0,0 +1,87 @@
+# Error Clip
+
+## Overview
+
+Error clip is widely used in model training to prevent gradient explosion. It applies specific rules to adjust variables' gradients and prevent them from being too large. With it, values of a gradient will be checked before they are taken by the next `grad_op` and shrunk if necessary.
+
+## Usage
+
+Users are allowed to assign different error clip methods or attributes to different `Variable`s. Users can specify it as a parameter of `Variable`'s constructor:
+
+```python
+var = framework.Variable(..., error_clip=myErrorClip, ...)
+```
+
+The default value of `error_clip` is `None`, which means no error clip is employed. When it's not `None`, it should take an object of `BaseErrorClipAttr`'s derived class. So far, `BaseErrorClipAttr` has only one derived class: `ErrorClipByValue`, whose constructor is:
+
+```python
+ErrorClipByValue(max, min=None)
+```
+
+`max` and `min` represent the maximal and minimal clip thresholds respectively. In the backward pass, all values of `var`'s gradient greater than `max` or less than `min` will be clipped to `max` and `min` respectively. When `min` is None, the minimal threshold will automatically be set to `-max`.
+
+So we can enable the error clip with threshold `[-5.0, 5.0]` for variable `var` by:
+
+```python
+var = framework.Variable(..., error_clip=ErrorClipByValue(max=5.0), ...)
+```
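+
+For instance, a toy NumPy illustration of what the appended clip operation effectively does to a gradient under the `[-5.0, 5.0]` thresholds (an illustrative sketch, not the real `clip` op):
+
+```python
+import numpy as np
+
+grad = np.array([-7.3, 0.2, 6.1])
+clipped = np.clip(grad, -5.0, 5.0)  # -> array([-5. ,  0.2,  5. ])
+```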
+
+## Implementation
+
+The `BaseErrorClipAttr` and its derived class `ErrorClipByValue` are defined in *clip.py*.
+
+```python
+class BaseErrorClipAttr(object):
+ def append_clip_op(self, block, grad_name):
+ raise NotImplementedError()
+
+
+class ErrorClipByValue(BaseErrorClipAttr):
+ def __init__(self, max, min=None):
+ max = float(max)
+ if min is None:
+ min = -max
+ else:
+ min = float(min)
+ self.max = max
+ self.min = min
+
+ def append_clip_op(self, block, grad_name):
+ block.append_op(
+ type="clip",
+ inputs={"X": grad_name},
+ outputs={"Out": grad_name},
+ attrs={"min": self.min,
+ "max": self.max})
+```
+
+The `BaseErrorClipAttr` class has one main member function: `append_clip_op(self, block, grad_name)`.
+
+This function is used to create a `clip_op` and append it to the end of the given `block`. Since different error clip algorithms require different `clip_op`s, the function is defined as virtual in the base class. All derived classes must implement their own versions of this function.
+
+These `clip_op`s should be inserted after `grad_op`s whose output gradients need to be clipped. It is equivalent to appending some `clip_op`s to the end of the target block every time a new `grad_op` is added.
+
+```python
+for op_desc in grad_op_descs:
+ new_op_desc = target_block.desc.append_op()
+ new_op_desc.copy_from(op_desc)
+ callback(block=target_block, context=grad_to_var)
+```
+
+Here we employ a callback function to complete this kind of job. In the `_append_backward_ops_` function, each time a `grad_op` is added to the `target_block`, a callback function is invoked. The logic of `clip_op` appending can be implemented inside the callback function.
+
+The callback function for `clip_op` appending is defined in *clip.py*:
+
+```python
+def error_clip_callback(block, context):
+ # the context is a grad_to_var map
+ grad_to_var = context
+ op_desc = block.desc.op(block.desc.op_size() - 1)
+ for grad_n in filter(lambda n: grad_to_var.has_key(n),
+ op_desc.output_arg_names()):
+ fwd_var = block.var_recursive(grad_to_var[grad_n])
+ error_clip = getattr(fwd_var, "error_clip", None)
+ if error_clip is not None:
+ error_clip.append_clip_op(block, grad_n)
+```
+
+This function takes a `block` and a `context` (which is actually a grad\_to\_var map) as inputs. It checks each output of the last `OpDesc` in the `block`. Notice that the last `OpDesc` of the `block` must be a `grad_op` and its outputs must be some forward variables' gradients. If an output gradient's corresponding forward variable has an attribute of `error_clip`, `error_clip_callback` will call the `error_clip`'s `append_clip_op` function to append the required `clip_op` into the `block`.
diff --git a/doc/design/images/control_flow_graph.png b/doc/design/images/control_flow_graph.png
new file mode 100644
index 0000000000000000000000000000000000000000..3579998e58d07abc50bd3332128d4733a391cb3b
Binary files /dev/null and b/doc/design/images/control_flow_graph.png differ
diff --git a/doc/design/images/dataflow_equations.png b/doc/design/images/dataflow_equations.png
new file mode 100644
index 0000000000000000000000000000000000000000..c10f7f69f4007952e5b0394edaa04efa1cfbb658
Binary files /dev/null and b/doc/design/images/dataflow_equations.png differ
diff --git a/doc/design/images/deep_learning.png b/doc/design/images/deep_learning.png
new file mode 100644
index 0000000000000000000000000000000000000000..026becc4d94e01e407dacb2a5314a0e5723334ff
Binary files /dev/null and b/doc/design/images/deep_learning.png differ
diff --git a/doc/design/memory_optimization.md b/doc/design/memory_optimization.md
new file mode 100644
index 0000000000000000000000000000000000000000..00f514711a46bfd5af3bae51e0d9225ecc4c8998
--- /dev/null
+++ b/doc/design/memory_optimization.md
@@ -0,0 +1,217 @@
+# Memory Optimization
+
+
+## Problem
+
+In a lecture, Andrew Ng attributes the recent success of AI to a combination of these:
+
+- availability of Big Data
+- supercomputing power to process this Big Data over very large neural networks
+- modern algorithms
+
+The following graph shows the details:
+
+![](images/deep_learning.png)
+
+A larger model usually brings better performance. However, GPU memory is limited. For example, the memory size of a GTX TITAN X is only 12GB. To train complex and large models, we have to take care of memory usage. Besides, memory optimization is also necessary for both online and mobile inference.
+
+## Solution
+
+### Basic Strategy
+
+There are some basic strategies for memory optimization, including in-place operations and memory sharing.
+
+#### In-place Operation
+In a relu activation operator:
+
+$y = \max(x, 0)$
+
+If the variable x is not used by any other operator, we can perform an in-place operation. In other words, the memory blocks of variable y and variable x are the same. An in-place operation immediately saves 50% of the memory occupancy.
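+
+A toy NumPy illustration of the idea (an illustrative sketch, not a Paddle operator): the relu result is written into the buffer that already holds x, so no second buffer is allocated.
+
+```python
+import numpy as np
+
+x = np.array([-1.0, 2.0, -3.0, 4.0])
+np.maximum(x, 0.0, out=x)  # y reuses x's memory block: x is now [0., 2., 0., 4.]
+```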
+
+#### Memory Sharing
+
+Not all operators support in-place operations. Memory sharing is a more general strategy.
+
+The following is an example:
+
+```
+a = op1(b, c);
+d = op2(a)
+e = op3(d, f)
+```
+
+In this case, variable a is no longer used after op2, and op2 does not support in-place operation. After op2 finishes, we can put the memory of variable a into a memory pool. Then, variable e can reuse the memory of variable a from the pool.
+
+
+### Live Variable Analysis
+
+It's not enough to only have some basic strategies. The prerequisite of memory optimization is to know if a variable is still "live" after an operation.
+
+In our design, the neural network topology is defined as a program. Luckily, [live variable analysis](https://en.wikipedia.org/wiki/Live_variable_analysis) is a classic problem in compilers which can be used in many stages, such as register allocation.
+
+In compilers, the front end translates programs into an intermediate language with an unbounded number of temporaries. This program must run on a machine with a bounded number of registers. Two temporaries a and b can fit into the same register if a and b are never "in use" at the same time. Thus, many temporaries can fit in few registers; if they don't all fit, the excess temporaries can be kept in memory.
+
+Therefore, the compiler needs to analyze the intermediate-representation program to determine which temporaries are in use at the same time. We say a variable is "live" if it holds a value that may be needed in the future, so this analysis is called liveness analysis.
+
+We can learn these techniques from compilers. There are mainly two stages in live variable analysis:
+
+- construct a control flow graph
+- solve the dataflow equations
+
+
+#### Control Flow Graph
+To perform analyses on a program, it is often useful to make a control flow graph. A [control flow graph](https://en.wikipedia.org/wiki/Control_flow_graph) (CFG) in computer science is a representation, using graph notation, of all paths that might be traversed through a program during its execution. Each statement in the program is a node in the flow graph; if statement x can be followed by statement y, there is an edge from x to y.
+
+The following is the flow graph for a simple loop.
+
+![](images/control_flow_graph.png)
+
+#### Dataflow Analysis
+
+The liveness of a variable "flows" along the edges of the control flow graph; determining the live range of each variable is an example of a dataflow problem. [Dataflow analysis](https://en.wikipedia.org/wiki/Data-flow_analysis) is a technique for gathering information about the possible set of values calculated at various points in a computer program.
+
+A simple way to perform data-flow analysis of programs is to set up dataflow equations for each node of the control flow graph and solve them by repeatedly calculating the output from the input locally at each node until the whole system stabilizes.
+
+- Flow Graph Terminology
+
+A flow graph node has out-edges that lead to successor nodes, and in-edges that come from predecessor nodes. The set *pred[n]* is all the predecessors of node n, and *succ[n]* is the set of successors.
+In the control flow graph above, the out-edges of node 5 are 5 --> 6 and 5 --> 2, and *succ[5]* = {2, 6}. The in-edges of 2 are 5 --> 2 and 1 --> 2, and *pred[2]* = {1, 5}.
+
+- Uses and Defs
+
+An assignment to a variable or temporary defines that variable. An occurrence of a variable on the right-hand side of an assignment (or in other expressions) uses the variable. We can speak of the *def* of a variable as the set of graph nodes that define it, or the *def* of a graph node as the set of variables that it defines; and similarly for the *use* of a variable or graph node. In the control flow graph above, *def(3)* = {c}, *use(3)* = {b, c}.
+
+- Liveness
+
+A variable is *live* on an edge if there is a directed path from that edge to a *use* of the variable that does not go through any *def*. A variable is *live-in* at a node if it is live on any of the in-edges of that node; it is *live-out* at a node if it is live on any of the out-edges of the node.
+
+
+The calculation of liveness can be solved by iteration until a fixed point is reached. The following is the recursive formula:
+
+![](images/dataflow_equations.png)
+
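+A minimal sketch of the iterative solver for these equations (an illustration, not Paddle code), assuming `nodes` is a list of graph nodes and `uses`, `defs`, `succ` are dicts mapping each node to its *use*, *def* and *succ* sets:
+
+```python
+from collections import defaultdict
+
+def solve_liveness(nodes, uses, defs, succ):
+    live_in = defaultdict(set)
+    live_out = defaultdict(set)
+    changed = True
+    while changed:  # iterate until a fixed point is reached
+        changed = False
+        for n in reversed(nodes):  # reverse order speeds up this backward analysis
+            out = set()
+            for s in succ.get(n, ()):
+                out |= live_in[s]
+            in_ = uses.get(n, set()) | (out - defs.get(n, set()))
+            if in_ != live_in[n] or out != live_out[n]:
+                live_in[n], live_out[n] = in_, out
+                changed = True
+    return live_in, live_out
+```
+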
+### Memory optimization transpiler
+
+Finally, we take the basic strategies and liveness analysis techniques learned from compilers to implement our memory optimization transpiler.
+
+#### add in-place attribute
+
+In-place is a built-in attribute of an operator. Since we treat in-place and other operators differently, we have to add an in-place attribute for every operator.
+
+
+#### construct control flow graph
+
+The following is the ProgramDesc protobuf of the [machine translation](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/fluid/tests/book/test_machine_translation.py) example.
+
+- Block0:
+
+```
+lookup_table
+mul
+...
+while(sub-block idx 1)
+...
+array_to_lod_tensor
+cross_entropy
+...
+while_grad(sub-block idx 2)
+read_from_array
+array_to_lod_tensor
+...
+```
+
+- Block1
+
+```
+read_from_array
+read_from_array
+...
+write_to_array
+increment
+write_to_array
+less_than
+```
+
+- Block2
+
+```
+read_from_array
+increment
+...
+write_to_array
+write_to_array
+```
+
+We can traverse all the operators and variables in the ProgramDesc to build a control flow graph.
+
+```python
+class ControlFlowGraph(object):
+ def __init__(self, Program):
+ self._successors = defaultdict(set)
+ self._predecessors = defaultdict(set)
+ self._uses = defaultdict(set)
+ self._defs = defaultdict(set)
+ self._live_in = defaultdict(set)
+ self._live_out = defaultdict(set)
+ self._program = Program
+
+ def build(self):
+ pass
+
+ def dataflow_analysis(self):
+ pass
+
+ def memory_optimization(self):
+ pass
+
+ def get_program(self):
+ return self._program
+```
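+
+As a rough illustration of how such a graph could be filled in, here is a sketch that treats the op list as straight-line code (ignoring the while sub-blocks) and assumes hypothetical `input_names` / `output_names` attributes on each op:
+
+```python
+from collections import defaultdict
+
+def build_cfg(ops):
+    # uses/defs per op index, plus successor/predecessor edges
+    uses, defs = defaultdict(set), defaultdict(set)
+    successors, predecessors = defaultdict(set), defaultdict(set)
+    for i, op in enumerate(ops):
+        uses[i] |= set(op.input_names)    # hypothetical attribute
+        defs[i] |= set(op.output_names)   # hypothetical attribute
+        if i + 1 < len(ops):              # straight-line control flow only
+            successors[i].add(i + 1)
+            predecessors[i + 1].add(i)
+    return uses, defs, successors, predecessors
+```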
+
+#### make dataflow analysis
+
+We follow the approach from compilers and solve the dataflow equations to get the liveness of every variable. If the live-in of an operator node is different from its live-out, then we can apply memory sharing.
+
+For example:
+
+```
+a = op1(b, c);
+d = op2(a)
+e = op3(d, f)
+```
+
+The dataflow analysis result is:
+
+```
+live_in(op1) = {b, c, f}
+live_out(op1) = {a, f}
+
+live_in(op2) = {a, f}
+live_out(op2) = {d, f}
+
+live_in(op3) = {d, f}
+live_out(op3) = {}
+```
+
+After op1, we can release variable b and variable c; after op2, we can release variable a; after op3, we can release variable d and variable f.
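+
+A short reading of the table above: the variables whose memory can be returned to the pool after an op are those live before it but not after it, i.e. `live_in(op) - live_out(op)`.
+
+```python
+# Reproduce the conclusion above from the dataflow analysis result.
+live_in = {"op1": {"b", "c", "f"}, "op2": {"a", "f"}, "op3": {"d", "f"}}
+live_out = {"op1": {"a", "f"}, "op2": {"d", "f"}, "op3": set()}
+
+for op in ("op1", "op2", "op3"):
+    print(op, "releases", sorted(live_in[op] - live_out[op]))
+# op1 releases ['b', 'c'], op2 releases ['a'], op3 releases ['d', 'f']
+```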
+
+#### memory sharing policy
+
+A memory pool will be maintained during the memory optimization stage. Each operator node will be scanned to determine whether memory optimization can be applied. If an operator satisfies the requirement, the following policy will be applied to handle its input/output variables.
+
+```
+if op.support_inplace():
+ i --> pool
+ pool --> o
+else:
+ pool --> o
+ i --> pool
+```
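+
+A Python sketch of this policy; `MemoryPool`, `support_inplace`, `dead_inputs` and the `memory` attribute are illustrative assumptions rather than Paddle APIs:
+
+```python
+class MemoryPool(object):
+    """A toy free-block pool; None from get() means allocate a fresh block."""
+    def __init__(self):
+        self.blocks = []
+
+    def put(self, block):
+        self.blocks.append(block)
+
+    def get(self):
+        return self.blocks.pop() if self.blocks else None
+
+
+def share_memory(op, pool, dead_inputs):
+    if op.support_inplace():
+        # i --> pool, pool --> o: an output may directly reuse a dead input's block
+        for var in dead_inputs:
+            pool.put(var.memory)
+        for var in op.outputs:
+            var.memory = pool.get()
+    else:
+        # pool --> o, i --> pool: outputs must not alias inputs the op still reads
+        for var in op.outputs:
+            var.memory = pool.get()
+        for var in dead_inputs:
+            pool.put(var.memory)
+```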
+
+
+
+## Reference
+
+- [Lecture Notes From Artificial Intelligence Is The New Electricity By Andrew Ng](https://manavsehgal.com/lecture-notes-from-artificial-intelligence-is-the-new-electricity-by-andrew-ng-4712dcbf26e5)
+- Modern compiler implementation in ML, by Andrew W. Appel
+- [Optimizing Memory Consumption in Deep learning](https://mxnet.incubator.apache.org/architecture/note_memory.html)
diff --git a/doc/design/refactor/session.md b/doc/design/refactor/session.md
deleted file mode 100644
index 1d9a26683c14f54e3b5fe41675cd03b5620646b8..0000000000000000000000000000000000000000
--- a/doc/design/refactor/session.md
+++ /dev/null
@@ -1,180 +0,0 @@
-# Design Doc: Session
-
-## Abstract
-
-The *session* object encapsulates the environment in which the
-computation graph is executed.
-
-We will have the *local* session and *remote* session, they offer the
-same [interface](#interface). The local session encapsulates the local
-runtime environment and the remote session encapsulates the cluster
-runtime environment.
-
-The local runtime environment contains:
-
-1. computation devices (i.e., CPU, GPU) handles, and
-1. the [scope](../scope.md) which holds all variables.
-
-The remote runtime environment contains:
-
-1. computation devices (i.e., CPU and GPU on node 0, 1) in a cluster,
- and
-1. the distributed [scope](../scope.md) in a cluster which holds all
- variables.
-
-The user can create a remote session on Paddle Cloud and evaluate the
-computation graph with it. In this way, the user can control the
-remote computation resource in a cluster from his local computer.
-
-
-## Background
-
-The current design has an implicit global session in which
-`paddle.eval()` is executed. The pain point is:
-
-Since the user is not able to explicitly switch between runtime
-environments, the user cannot run a topology in two independent
-environments.
-
-For example, in reinforcement learning, the user may want to have a
-stale model for inference and a fresh model for training, and only
-replace the stale model with the fresh model periodically.
-
-Furthermore, we have no concept that encapsulates a remote environment
-that executes a computation graph.
-
-We need the session object to address above issues.
-
-
-## Session
-
-A session is an object that owns the runtime environment. All
-computations are executed through `session.eval()`.
-
-
-### Interface
-
-```python
-eval(
- targets,
- feed_dict=None,
-)
-```
-
-Evaluates the target Operations or Variables in `targets`.
-
-- *targets*: the evaluation targets. Can be a single Operation or
- Variable, or a list with the Operations or Variables as
- elements. The value returned by `eval()` has the same shape as the
- `target` argument.
-
- The PaddlePaddle program is represented by
- the [ProgramDesc](../design/program.md), `eval()` will infer the
- ProgramDesc from the given targets and run the PaddlePaddle
- program. Please
- see
- [this graph](./distributed_architecture.md#local-training-architecture) for
- the detailed illustration for the local session
- and
- [this graph](./distributed_architecture.md#distributed-training-architecture) for
- the detailed illustration for the remote session.
-
-- *feed_dict*: a dictionary that contains the tensors which override
- the edges of the computation graph.
-
- feed_dict not only can provide the input data, it can override any
- OP's input as well:
-
- ```python
- a = pd.constant(2.0, name="a")
- b = pd.variable(name="b")
- c = pd.mul(a,b)
- sess.eval(targets=c, feed_dict={"b":3.0}) # returns 6.0
- ```
-
-```python
-close()
-```
-
-Closes the session and releases the scope that the session owns.
-
-
-### Create a Local Session
-
-```python
-session(
- devices=None
-)
-```
-
-Creates a new session. One session owns one global scope, so creating
-multiple sessions will create different scopes.
-
-- *devices*: a single `string` or a list of `string` of device names,
- the corresponding devices will be the computation devices for
- `eval()`. If not specified, all available devices (e.g., all GPUs)
- will be used. The user doesn't need to specify the CPU device since
- it will be always used. Multiple sessions can use the same device.
-
-
-#### Example
-
-```Python
-a = paddle.constant(1.0)
-b = paddle.constant(2.0)
-c = a + b
-sess = paddle.session(devices=["gpu:0", "gpu:1", "fpga:0"])
-sess.eval(c)
-sess.close()
-```
-
-### Create a Remote Session
-
-```python
-create_cloud_job(
- name,
- num_trainer,
- mem_per_trainer,
- gpu_per_trainer,
- cpu_per_trainer,
- num_ps,
- mem_per_ps,
- cpu_per_ps,
-)
-```
-
-Creates a Paddle Cloud job. Fails if the job name exists.
-
-```python
-get_cloud_job(
- name
-)
-```
-
-Gets a Paddle Cloud job.
-
-```python
-remote_session(
- job
-)
-```
-
-- *job*: the Paddle Cloud job.
-
-#### Example
-
-```Python
-reader = paddle.reader.recordio("/pfs/home/peter/mnist-train-*") # data stored on Paddle Cloud
-image = reader.column(0)
-label = reader.column(1)
-fc1 = paddle.op.fc(image, size=256, act="sigmoid")
-fc2 = paddle.op.fc(fc1, size=10, act="softmax")
-cost = paddle.op.cross_entropy(fc2, label)
-opt = paddle.optimizer.sgd(cost)
-
-job = paddle.create_cloud_job("test", 3, "1G", 1, 1, 2, "1G", 1)
-sess = paddle.remote_ession(job)
-for i in range(1000):
- sess.eval(opt)
-sess.close()
-```
diff --git a/doc/design/refactor/src/distributed_architecture.graffle b/doc/design/refactor/src/distributed_architecture.graffle
deleted file mode 100644
index f8496e57326c38de7468eb452a7713291d57653c..0000000000000000000000000000000000000000
Binary files a/doc/design/refactor/src/distributed_architecture.graffle and /dev/null differ
diff --git a/doc/design/refactor/src/distributed_architecture.png b/doc/design/refactor/src/distributed_architecture.png
deleted file mode 100644
index 410c4510c6aab301dec95e6427fe80ac24e105fe..0000000000000000000000000000000000000000
Binary files a/doc/design/refactor/src/distributed_architecture.png and /dev/null differ
diff --git a/doc/design/refactor/src/local_architecture.graffle b/doc/design/refactor/src/local_architecture.graffle
deleted file mode 100644
index cc7783c45381f25ded0b898649322c81418ad317..0000000000000000000000000000000000000000
Binary files a/doc/design/refactor/src/local_architecture.graffle and /dev/null differ
diff --git a/doc/design/refactor/src/local_architecture.png b/doc/design/refactor/src/local_architecture.png
deleted file mode 100644
index 4b999538b7825c805292ee28b5e3256d5543bd09..0000000000000000000000000000000000000000
Binary files a/doc/design/refactor/src/local_architecture.png and /dev/null differ
diff --git a/doc/design/releasing_process.md b/doc/design/releasing_process.md
index 14c081ea84282e52a2e36475c3c0ea755122d154..b9787261092f1f27377886152cb1596d9ff54188 100644
--- a/doc/design/releasing_process.md
+++ b/doc/design/releasing_process.md
@@ -7,11 +7,9 @@ PaddlePaddle每次发新的版本,遵循以下流程:
1. 从`develop`分支派生出新的分支,分支名为`release/版本号`。例如,`release/0.10.0`
1. 将新分支的版本打上tag,tag为`版本号rc.Patch号`。第一个tag为`0.10.0rc1`,第二个为`0.10.0rc2`,依次类推。
1. 对这个版本的提交,做如下几个操作:
+ * 使用Regression Test List作为检查列表,测试本次release的正确性。
+ * 如果失败,记录下所有失败的例子,在这个`release/版本号`分支中,修复所有bug后,Patch号加一,到第二步
* 修改`python/setup.py.in`中的版本信息,并将`istaged`字段设为`True`。
- * 编译这个版本的Docker发行镜像,发布到dockerhub。如果失败,修复Docker编译镜像问题,Patch号加一,返回第二步
- * 编译这个版本的Ubuntu Deb包。如果失败,修复Ubuntu Deb包编译问题,Patch号加一,返回第二步。
- * 使用Regression Test List作为检查列表,测试Docker镜像/ubuntu安装包的功能正确性
- * 如果失败,记录下所有失败的例子,在这个`release/版本号`分支中,修复所有bug后,Patch号加一,返回第二步
* 编译这个版本的python wheel包,并发布到pypi。
* 由于pypi.python.org目前遵循[严格的命名规范PEP 513](https://www.python.org/dev/peps/pep-0513),在使用twine上传之前,需要重命名wheel包中platform相关的后缀,比如将`linux_x86_64`修改成`manylinux1_x86_64`。
* pypi上的package名称为paddlepaddle和paddlepaddle_gpu,如果要上传GPU版本的包,需要修改build/python/setup.py中,name: "paddlepaddle_gpu"并重新打包wheel包:`python setup.py bdist_wheel`。
@@ -21,8 +19,8 @@ PaddlePaddle每次发新的版本,遵循以下流程:
pip install twine
twine upload dist/[package to upload]
```
+ * 编译这个版本的Docker发行镜像,发布到dockerhub。如果失败,修复Docker编译镜像问题,Patch号加一,返回第二步
1. 第三步完成后,将`release/版本号`分支合入master分支,并删除`release/版本号`分支。将master分支的合入commit打上tag,tag为`版本号`。同时再将`master`分支合入`develop`分支。最后删除`release/版本号`分支。
-1. 编译master分支的Docker发行镜像,发布到dockerhub。编译ubuntu的deb包,发布到github release页面
1. 协同完成Release Note的书写
@@ -31,6 +29,30 @@ PaddlePaddle每次发新的版本,遵循以下流程:
* `release/版本号`分支一旦建立,一般不允许再从`develop`分支合入`release/版本号`。这样保证`release/版本号`分支功能的封闭,方便测试人员测试PaddlePaddle的行为。
* 在`release/版本号`分支存在的时候,如果有bugfix的行为,需要将bugfix的分支同时merge到`master`, `develop`和`release/版本号`这三个分支。
+## 发布wheel包到pypi
+
+使用[PaddlePaddle CI](https://paddleci.ngrok.io/project.html?projectId=Manylinux1&tab=projectOverview)
+完成自动化二进制编译,参考下图,选择需要发布的版本(通常包含一个CPU版本和一个GPU版本),点击"run"右侧的"..."按钮,可以
+弹出下面的选择框,在第二个tab (Changes)里选择需要发布的分支,这里选择0.11.0,然后点击"Run Build"按钮。等待编译完成后
+可以在此页面的"Artifacts"下拉框中找到生成的3个二进制文件,分别对应CAPI,`cp27m`和`cp27mu`的版本。然后按照上述的方法
+使用`twine`工具上传即可。
+
+
+
+* 注:CI环境使用 https://github.com/PaddlePaddle/buildtools 这里的DockerImage作为编译环境以支持更多的Linux
+  发行版,如果需要手动编译,也可以使用这些镜像。这些镜像也可以从 https://hub.docker.com/r/paddlepaddle/paddle_manylinux_devel/tags/ 下载得到。
+* pypi不支持覆盖上传,所以一个版本号的wheel包发布之后,不可以更改。下一个wheel包需要更新版本号才可以上传。
+
+## 发布Docker镜像
+
+上述PaddlePaddle CI编译wheel完成后会自动将Docker镜像push到DockerHub,所以,发布Docker镜像只需要对自动push的镜像打上
+版本号对应的tag即可:
+
+1. 进入 https://hub.docker.com/r/paddlepaddle/paddle/tags/ 查看latest tag的更新时间,确认其是在上述wheel包编译完成之后更新的。
+1. 执行 `docker pull paddlepaddle/paddle:[latest tag]`,latest tag可以是latest或latest-gpu等。
+1. 执行 `docker tag paddlepaddle/paddle:[latest tag] paddlepaddle/paddle:[version]`
+1. 执行 `docker push paddlepaddle/paddle:[version]`
+
## PaddlePaddle 分支规范
PaddlePaddle开发过程使用[git-flow](http://nvie.com/posts/a-successful-git-branching-model/)分支规范,并适应github的特性做了一些区别。
diff --git a/doc/design/support_new_device.md b/doc/design/support_new_device.md
index f54b2b3694cc2a8f1d892792fd4d39a0484dc750..4c5f10e2ecb9ec09b78926ca27552741d02d7cc9 100644
--- a/doc/design/support_new_device.md
+++ b/doc/design/support_new_device.md
@@ -48,8 +48,8 @@ Fluid uses class [DeviceContext](https://github.com/PaddlePaddle/Paddle/blob/dev
```
- /-> CPUDeviceContext --> MKLDeviceContext
-DeviceContext ----> CUDADeviceContext --> CUDNNDeviceContext
+ /-> CPUDeviceContext
+DeviceContext ----> CUDADeviceContext
\-> FPGADeviceContext
```
@@ -79,16 +79,6 @@ private:
};
```
-- CUDNNDeviceContext
-
-```
-class CUDNNDeviceContext : public CUDADeviceContext {
- private:
- cudnnHandle_t cudnn_handle_;
-};
-```
-
-
### Memory and Tensor
diff --git a/doc/howto/usage/cluster/cluster_train_cn.md b/doc/howto/usage/cluster/cluster_train_cn.md
index 659bae9c0ceaf2fb2df8446b9d406a822a9df0ea..c2fc86687d7106aac7c74d6dd16bc229353cb7c1 100644
--- a/doc/howto/usage/cluster/cluster_train_cn.md
+++ b/doc/howto/usage/cluster/cluster_train_cn.md
@@ -51,7 +51,7 @@ $ stdbuf -oL /usr/bin/nohup paddle pserver --port=7164 --ports_num=1 --ports_num
- port:**必选,默认7164**,pserver监听的起始端口,根据ports_num决定总端口个数,从起始端口监听多个端口用于通信
- ports_num:**必选,默认1**,监听的端口个数
-- ports_num_for_sparse:**必选,默认1**,用于稀疏类型参数通信的端口个数
+- ports_num_for_sparse:**必选,默认0**,用于稀疏类型参数通信的端口个数
- num_gradient_servers:**必选,默认1**,当前训练任务pserver总数
### 启动计算节点
@@ -60,7 +60,7 @@ $ stdbuf -oL /usr/bin/nohup paddle pserver --port=7164 --ports_num=1 --ports_num
$ python train.py
```
-trainer需要和pserver保持网络联通以完成训练。trainer启动需要传入端口、pserver地址等参数使trainer可以正确连接到pserver。这些参数可以通过环境变量(https://zh.wikipedia.org/wiki/环境变量 )或编写程序时`paddle.init()`中传入参数。如果同时使用`paddle.init()`参数和环境变量,将会优先使用`paddle.init()`中传入的参数。
+trainer需要和pserver保持网络联通以完成训练。trainer启动需要传入端口、pserver地址等参数使trainer可以正确连接到pserver。这些参数可以通过[环境变量](https://zh.wikipedia.org/wiki/环境变量)或编写程序时`paddle.init()`中传入参数。如果同时使用`paddle.init()`参数和环境变量,将会优先使用`paddle.init()`中传入的参数。
使用环境变量:
@@ -95,7 +95,7 @@ paddle.init(
- trainer_count:**必选,默认1**,当前训练任务trainer总个数
- port:**必选,默认7164**,连接到pserver的端口
- ports_num:**必选,默认1**,连接到pserver的端口个数
-- ports_num_for_sparse:**必选,默认1**,和pserver之间用于稀疏类型参数通信的端口个数
+- ports_num_for_sparse:**必选,默认0**,和pserver之间用于稀疏类型参数通信的端口个数
- num_gradient_servers:**必选,默认1**,当前训练任务pserver总数
- trainer_id:**必选,默认0**,每个trainer的唯一ID,从0开始的整数
- pservers:**必选,默认127.0.0.1**,当前训练任务启动的pserver的IP列表,多个IP使用“,”隔开
diff --git a/doc/howto/usage/cluster/cluster_train_en.md b/doc/howto/usage/cluster/cluster_train_en.md
index 915405ca5b446981515e301ca4b7ee065a82a9ff..28cd1fa7903e559e33a7fc2f00172fdfbe2fdc97 100644
--- a/doc/howto/usage/cluster/cluster_train_en.md
+++ b/doc/howto/usage/cluster/cluster_train_en.md
@@ -52,7 +52,7 @@ Parameter Description
- port: **required, default 7164**, port which parameter server will listen on. If ports_num greater than 1, parameter server will listen on multiple ports for more network throughput.
- ports_num: **required, default 1**, total number of ports will listen on.
-- ports_num_for_sparse: **required, default 1**, number of ports which serves sparse parameter update.
+- ports_num_for_sparse: **required, default 0**, number of ports which serve sparse parameter updates.
- num_gradient_servers: **required, default 1**, total number of gradient servers.
### Starting trainer
@@ -98,7 +98,7 @@ Parameter Description
- trainer_count: **required, default 1**, total count of trainers in the training job.
- port: **required, default 7164**, port to connect to parameter server.
- ports_num: **required, default 1**, number of ports for communication.
-- ports_num_for_sparse: **required, default 1**, number of ports for sparse type caculation.
+- ports_num_for_sparse: **required, default 0**, number of ports for sparse type calculation.
- num_gradient_servers: **required, default 1**, total number of gradient server.
- trainer_id: **required, default 0**, ID for every trainer, start from 0.
- pservers: **required, default 127.0.0.1**, list of IPs of parameter servers, separated by ",".
diff --git a/doc/mobile/cross_compiling_for_android_cn.md b/doc/mobile/cross_compiling_for_android_cn.md
index 424d7718c64438496cf0895397babd5408e1ca02..ae24ced770492743065e37654b494caf6b4c5bc0 100644
--- a/doc/mobile/cross_compiling_for_android_cn.md
+++ b/doc/mobile/cross_compiling_for_android_cn.md
@@ -1,8 +1,9 @@
# Android平台编译指南
用户可通过如下两种方式,交叉编译Android平台上适用的PaddlePaddle库:
-- 基于Docker容器的编译方式
-- 基于Linux交叉编译环境的编译方式
+
+- [基于Docker容器的编译方式](#基于docker容器的编译方式)
+- [基于Linux交叉编译环境的编译方式](#基于linux交叉编译环境的编译方式)
## 基于Docker容器的编译方式
Docker能在所有主要操作系统(包括Linux,Mac OS X和Windows)上运行,因此,使用基于Docker容器的编译方式,用户可在自己熟悉的开发平台上编译Android平台上适用的PaddlePaddle库。
@@ -16,6 +17,12 @@ $ cd Paddle
$ docker build -t username/paddle-android:dev . -f Dockerfile.android
```
+用户也可以使用PaddlePaddle提供的官方开发镜像:
+
+```bash
+$ docker pull paddlepaddle/paddle:latest-dev-android
+```
+
### 编译PaddlePaddle C-API库
构建好开发镜像后,即可使用开发镜像来编译Android版PaddlePaddle C-API库。
Android的Docker开发镜像向用户提供两个可配置的参数:
@@ -41,23 +48,25 @@ Android的Docker开发镜像向用户提供两个可配置的参数:
ANDROID_API |
- >= 21 |
+ >= 16 |
21 |
- 编译`armeabi-v7a`,`Android API 21`的PaddlePaddle库
+
```bash
$ docker run -it --rm -v $PWD:/paddle -e "ANDROID_ABI=armeabi-v7a" -e "ANDROID_API=21" username/paddle-android:dev
```
- 编译`arm64-v8a`,`Android API 21`的PaddlePaddle库
+
```bash
$ docker run -it --rm -v $PWD:/paddle -e "ANDROID_ABI=arm64-v8a" -e "ANDROID_API=21" username/paddle-android:dev
```
-执行上述`docker run`命令时,容器默认执行[paddle/scripts/docker/build_android.sh](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build_android.sh)脚本。该脚本中记录了交叉编译Android版PaddlePaddle库常用的CMake配置,并且会根据`ANDROID_ABI`和`ANDROID_API`自动构建独立工具链、进行编译和安装。由于arm64架构要求Android API不小于21。因此当`ANDROID_ABI=arm64-v8a`,`ANDROID_API<21`时,Docker容器中将默认使用`Android API 21`的编译工具链。用户可以参考下文**配置交叉编译参数**章节,根据个人的需求修改定制Docker容器所执行的脚本。编译安装结束之后,PaddlePaddle的C-API库将被安装到`$PWD/install_android`目录,所依赖的第三方库同时也被安装到`$PWD/install_android/third_party`目录。
+执行上述`docker run`命令时,容器默认执行[paddle/scripts/docker/build_android.sh](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build_android.sh)脚本。该脚本中记录了交叉编译Android版PaddlePaddle库常用的CMake配置,并且会根据`ANDROID_ABI`和`ANDROID_API`自动构建独立工具链、进行编译和安装。由于arm64架构要求Android API不小于21。因此当`ANDROID_ABI=arm64-v8a`,`ANDROID_API<21`时,Docker容器中将默认使用`Android API 21`的编译工具链。用户可以参考下文[配置交叉编译参数](#配置交叉编译参数)章节,根据个人的需求修改定制Docker容器所执行的脚本。编译安装结束之后,PaddlePaddle的C-API库将被安装到`$PWD/install_android`目录,所依赖的第三方库同时也被安装到`$PWD/install_android/third_party`目录。
## 基于Linux交叉编译环境的编译方式
本文档将以Linux x86-64平台为例,介绍交叉编译Android平台上适用的PaddlePaddle库的方法和步骤。
@@ -83,6 +92,7 @@ your/path/to/android-ndk-r14b-linux-x86_64/build/tools/make-standalone-toolchain
此命令将在`your/path/to/arm_standalone_toolchain`目录生成一套独立编译工具链,面向架构为32位ARM架构,支持的最小的Android API级别为21,支持编译器`arm-linux-androideabi-gcc (GCC) 4.9`和`clang 3.8`。
- 构建`arm64-v8a`、 `Android API 21`的独立工具链:
+
```bash
your/path/to/android-ndk-r14b-linux-x86_64/build/tools/make-standalone-toolchain.sh \
--arch=arm64 --platform=android-21 --install-dir=your/path/to/arm64_standalone_toolchain
@@ -90,14 +100,12 @@ your/path/to/android-ndk-r14b-linux-x86_64/build/tools/make-standalone-toolchain
此命令将在`your/path/to/arm64_standalone_toolchain`目录生成一套独立编译工具链,面向架构为64位ARM64架构,支持的最小Android API级别为21,支持编译器`arm-linux-androideabi-gcc (GCC) 4.9`和`clang 3.8`。
-注意:**PaddlePaddle要求使用的编译工具链所支持的Android API级别不小于21**。
-
### 配置交叉编译参数
CMake系统对交叉编译提供了支持[cmake-toolchains](https://cmake.org/cmake/help/v3.0/manual/cmake-toolchains.7.html#cross-compiling)。为了简化cmake配置,PaddlePaddle为交叉编译提供了工具链配置文档[cmake/cross_compiling/android.cmake](https://github.com/PaddlePaddle/Paddle/blob/develop/cmake/cross_compiling/android.cmake),以提供一些默认的编译器和编译参数相关配置。注意,从CMake 3.7版本开始,CMake官方对Android平台的交叉编译提供了通用的支持。PaddlePaddle若检测到用户使用的CMake版本不低于3.7时,将会将用户传进来的配置参数传递CMake系统,交由CMake系统本身来处理。有关参数配置的详细说明见[cmake-toolchains](https://cmake.org/cmake/help/v3.7/manual/cmake-toolchains.7.html#cross-compiling)。
交叉编译Android版本的PaddlePaddle库时,有一些必须配置的参数:
-- `CMAKE_SYSTEM_NAME`,CMake编译的目标平台,必须设置为`Android`。在设置`CMAKE_SYSTEM_NAME=Android`后,PaddlePaddle的CMake系统才认为是在交叉编译Android系统的版本,并自动编译宿主机版protoc可执行文件、目标机版protobuf库、以及Android所需`arm_soft_fp_abi`分支的目标机版OpenBLAS库。此外,还会强制设置一些PaddlePaddle参数的值(`WITH_GPU=OFF`、`WITH_AVX=OFF`、`WITH_PYTHON=OFF`、`WITH_RDMA=OFF`)。
+- `CMAKE_SYSTEM_NAME`,CMake编译的目标平台,必须设置为`Android`。在设置`CMAKE_SYSTEM_NAME=Android`后,PaddlePaddle的CMake系统才认为是在交叉编译Android系统的版本,并自动编译PaddlePaddle所需的所有第三方库。此外,还会强制设置一些PaddlePaddle参数的值(`WITH_GPU=OFF`、`WITH_AVX=OFF`、`WITH_PYTHON=OFF`、`WITH_RDMA=OFF`、`WITH_MKL=OFF`、`WITH_GOLANG=OFF`)。
- `WITH_C_API`,必须设置为`ON`。在Android平台上只支持使用C-API来预测。
- `WITH_SWIG_PY`,必须设置为`OFF`。在Android平台上不支持通过swig调用来训练或者预测。
@@ -119,7 +127,7 @@ Android平台可选配置参数:
其他配置参数:
- `USE_EIGEN_FOR_BLAS`,是否使用Eigen库进行矩阵计算。可设置`ON/OFF`,默认值为`OFF`。
-- `HOST_C/CXX_COMPILER`,宿主机的C/C++编译器。在编译宿主机版protoc可执行文件和目标机版OpenBLAS库时需要用到。默认设置成环境变量`CC`的值;若环境变量`CC`没有设置,则设置成`cc`编译器。
+- `HOST_C/CXX_COMPILER`,宿主机的C/C++编译器。在编译宿主机版protoc可执行文件和目标机版OpenBLAS库时需要用到。默认设置成环境变量`CC/CXX`的值;若环境变量`CC/CXX`没有设置,则设置成`cc/c++`编译器。
常用的cmake配置如下:
@@ -147,9 +155,10 @@ cmake -DCMAKE_SYSTEM_NAME=Android \
..
```
-用户还可根据自己的需求设置其他编译参数。比如希望最小化生成的库的大小,可以设置`CMAKE_BUILD_TYPE`为`MinSizeRel`;若希望最快的执行速度,则可设置`CMAKE_BUILD_TYPE`为`Release`。亦可以通过手动设置`CMAKE_C/CXX_FLAGS_MINSIZEREL/RELEASE`来影响PaddlePaddle的编译过程。
+用户还可根据自己的需求设置其他编译参数。比如希望最小化生成的库的大小,可以设置`CMAKE_BUILD_TYPE`为`MinSizeRel`;若希望最快的执行速度,则可设置`CMAKE_BUILD_TYPE`为`Release`。亦可以通过手动设置`CMAKE_C/CXX_FLAGS`来影响PaddlePaddle的编译过程。
**性能TIPS**,为了达到最快的计算速度,在CMake参数配置上,有以下建议:
+
- 设置`CMAKE_BUILD_TYPE`为`Release`
- 使用`clang`编译工具链
- `armeabi-v7a`时,设置`USE_EIGEN_BLAS=ON`,使用Eigen进行矩阵计算;`arm64-v8a`时,设置`USE_EIGEN_FOR_BLAS=OFF`,使用OpenBLAS进行矩阵计算
diff --git a/doc/mobile/cross_compiling_for_android_en.md b/doc/mobile/cross_compiling_for_android_en.md
index 26858581fc1d77a9391520ac0dfd80fbd98f508c..0cf50181df4116beda3aa6faf836eda92edf6066 100644
--- a/doc/mobile/cross_compiling_for_android_en.md
+++ b/doc/mobile/cross_compiling_for_android_en.md
@@ -1,6 +1,9 @@
# Build PaddlePaddle for Android
-There are two approaches to build PaddlePaddle for Android: using Docker and on Linux without Docker.
+There are two approaches to build PaddlePaddle for Android:
+
+- [Cross-Compiling Using Docker](#cross-compiling-using-docker)
+- [Cross-Compiling on Linux](#cross-compiling-on-linux)
## Cross-Compiling Using Docker
@@ -16,6 +19,12 @@ $ cd Paddle
$ docker build -t paddle:dev-android . -f Dockerfile.android
```
+Users can also directly use the official pre-built Docker image:
+
+```bash
+$ docker pull paddlepaddle/paddle:latest-dev-android
+```
+
### Build the Inference Library
We can run the Docker image we just created to build the inference library of PaddlePaddle for Android using the command below:
@@ -47,7 +56,7 @@ The Docker image accepts two arguments `ANDROID_ABI` and `ANDROID_API`:
ANDROID_API |
- >= 21 |
+ >= 16 |
21 |
@@ -93,15 +102,13 @@ Android NDK includes everything we need to build the [*standalone toolchain*](ht
The generated standalone toolchain will be in `your/path/to/arm64_standalone_toolchain`.
-**Please be aware that the minimum level of Android API required by PaddlePaddle is 21.**
-
### Cross-Compiling Arguments
CMake supports [choosing the toolchain](https://cmake.org/cmake/help/v3.0/manual/cmake-toolchains.7.html#cross-compiling). PaddlePaddle provides [`android.cmake`](https://github.com/PaddlePaddle/Paddle/blob/develop/cmake/cross_compiling/android.cmake), which configures the Android cross-compiling toolchain for CMake. `android.cmake` is not required for CMake >= 3.7, which support Android cross-compiling. PaddlePaddle detects the CMake version, for those newer than 3.7, it uses [the official version](https://cmake.org/cmake/help/v3.7/manual/cmake-toolchains.7.html#cross-compiling).
Some other CMake arguments you need to know:
-- `CMAKE_SYSTEM_NAME` must be `Android`. This tells PaddlePaddle's CMake system to cross-compile third-party dependencies. This also changes some other CMake arguments like `WITH_GPU=OFF`, `WITH_AVX=OFF`, `WITH_PYTHON=OFF`, and `WITH_RDMA=OFF`.
+- `CMAKE_SYSTEM_NAME` must be `Android`. This tells PaddlePaddle's CMake system to cross-compile third-party dependencies. This also changes some other CMake arguments like `WITH_GPU=OFF`, `WITH_AVX=OFF`, `WITH_PYTHON=OFF`, `WITH_RDMA=OFF`, `WITH_MKL=OFF` and `WITH_GOLANG=OFF`.
- `WITH_C_API` must be `ON`, to build the C-based inference library for Android.
- `WITH_SWIG_PY` must be `OFF` because the Android platform doesn't support SWIG-based API.
@@ -123,7 +130,7 @@ Some Android-specific arguments:
Other useful arguments:
- `USE_EIGEN_FOR_BLAS`: indicates if using Eigen. Could be `ON` or `OFF`, defaults to `OFF`.
-- `HOST_C/CXX_COMPILER`: specifies the host compiler, which is used to build the host-specific protoc and target-specific OpenBLAS. It defaults to the value of the environment variable `CC`, or `cc`.
+- `HOST_C/CXX_COMPILER`: specifies the host compiler, which is used to build the host-specific protoc and target-specific OpenBLAS. It defaults to the value of the environment variables `CC/CXX`, or `cc/c++`.
Some frequent configurations for your reference:
@@ -158,6 +165,7 @@ There are some other arguments you might want to configure.
- `CMAKE_BUILD_TYPE-Release` optimizes the runtime performance.
Our own tip for performance optimization to use clang and Eigen or OpenBLAS:
+
- `CMAKE_BUILD_TYPE=Release`
- `ANDROID_TOOLCHAIN=clang`
- `USE_EIGEN_BLAS=ON` for `armeabi-v7a`, or `USE_EIGEN_FOR_BLAS=OFF` for `arm64-v8a`.
diff --git a/doc/mobile/cross_compiling_for_ios_en.md b/doc/mobile/cross_compiling_for_ios_en.md
index aa390cd61f3fbd75e5a3b342f3559e76da35a918..19bfe86c511c7e43b462f94c8cabba420b3007f1 100644
--- a/doc/mobile/cross_compiling_for_ios_en.md
+++ b/doc/mobile/cross_compiling_for_ios_en.md
@@ -1,4 +1,4 @@
-# PaddlePaddle Compiling Guide for iOS
+# Build PaddlePaddle for iOS
This tutorial will walk you through cross compiling the PaddlePaddle library for iOS from the source in MacOS.
@@ -98,7 +98,7 @@ You can set other compiling parameters for your own need. I.E. if you are trying
- set `CMAKE_BUILD_TYPE` with `Release`
- set `IOS_USE_VECLIB_FOR_BLAS` with `ON`
-## Compile and install
+## Build and install
After CMake, run following commands, PaddlePaddle will download the compile 3rd party dependencies, compile and install PaddlePaddle inference library.
@@ -109,7 +109,7 @@ $ make install
Please Note: if you compiled PaddlePaddle in the source directory for other platforms, do remove `third_party` and `build` directory within the source with `rm -rf` to ensure that all the 3rd party libraries dependencies and PaddlePaddle is newly compiled with current CMake configuration.
-`your/path/to/install` directory will have following directories after `compile` and `install`:
+`your/path/to/install` directory will have the following directories after `make install`:
- `include`, contains all the C-API header files.
- `lib`, contains PaddlePaddle C-API static library.
diff --git a/paddle/CMakeLists.txt b/paddle/CMakeLists.txt
index 7d2becbdd772747d77890321fce6721d8d17fb30..4a98ede278fad85ff2beef3c8e7dd158912f693a 100644
--- a/paddle/CMakeLists.txt
+++ b/paddle/CMakeLists.txt
@@ -24,6 +24,7 @@ else()
add_subdirectory(framework)
add_subdirectory(operators)
add_subdirectory(pybind)
+ add_subdirectory(inference)
endif()
if(WITH_SWIG_PY)
diff --git a/paddle/capi/gradient_machine.cpp b/paddle/capi/gradient_machine.cpp
index 482b51e8a8430863c3e13df2298f6979d3959461..1f0e033c5b7ab713f1747d58f4fcdaad001f0715 100644
--- a/paddle/capi/gradient_machine.cpp
+++ b/paddle/capi/gradient_machine.cpp
@@ -168,3 +168,13 @@ paddle_error paddle_gradient_machine_get_layer_output(
out->args.push_back(layerOutput);
return kPD_NO_ERROR;
}
+
+paddle_error paddle_gradient_machine_release_layer_output(
+ paddle_gradient_machine machine) {
+ auto m = cast(machine);
+ if (m == nullptr || m->machine == nullptr) {
+ return kPD_NULLPTR;
+ }
+ m->machine->releaseOutput();
+ return kPD_NO_ERROR;
+}
diff --git a/paddle/capi/gradient_machine.h b/paddle/capi/gradient_machine.h
index 28eeb23e3bbdd4cc22a25c14170bf56c294f8cd7..7e37dea00b27b6cb955486b4210c1373decbcfa5 100644
--- a/paddle/capi/gradient_machine.h
+++ b/paddle/capi/gradient_machine.h
@@ -113,6 +113,14 @@ paddle_gradient_machine_get_layer_output(paddle_gradient_machine machine,
const char* layerName,
paddle_arguments args);
+/**
+ * @brief Release the middle layer's output memory of the gradient machine.
+ * @param [in] gradient machine that has run an inference
+ * @return paddle_error
+ */
+PD_API paddle_error
+paddle_gradient_machine_release_layer_output(paddle_gradient_machine machine);
+
#ifdef __cplusplus
}
#endif
diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt
index b4458eb9551724021636b628c5bf8c96f6e659aa..af4079875a50ffe6eb627492f834fb601bbee716 100644
--- a/paddle/framework/CMakeLists.txt
+++ b/paddle/framework/CMakeLists.txt
@@ -26,10 +26,15 @@ nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor)
cc_test(variable_test SRCS variable_test.cc)
-cc_library(scope SRCS scope.cc DEPS glog)
+cc_library(threadpool SRCS threadpool.cc)
+cc_test(threadpool_test SRCS threadpool_test.cc DEPS threadpool)
+
+cc_library(scope SRCS scope.cc DEPS glog threadpool)
cc_test(scope_test SRCS scope_test.cc DEPS scope)
-cc_library(data_transform SRCS data_transform.cc DEPS math_function tensor framework_proto)
+cc_library(device_data_transform SRCS device_data_transform.cc DEPS tensor)
+
+cc_library(data_transform SRCS data_transform.cc DEPS math_function tensor framework_proto selected_rows device_data_transform)
cc_test(data_transform_test SRCS data_transform_test.cc DEPS data_transform device_context)
cc_library(attribute SRCS attribute.cc DEPS framework_proto)
@@ -38,7 +43,7 @@ device_context)
cc_library(op_proto_maker SRCS op_proto_maker.cc DEPS framework_proto attribute)
cc_test(op_proto_maker_test SRCS op_proto_maker_test.cc DEPS op_proto_maker)
cc_library(op_info SRCS op_info.cc DEPS attribute framework_proto)
-cc_library(shape_inference SRCS shape_inference.cc DEPS ddim attribute)
+cc_library(shape_inference SRCS shape_inference.cc DEPS ddim attribute device_context)
cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog
shape_inference data_transform)
cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry init)
@@ -70,9 +75,10 @@ cc_test(var_type_inference_test SRCS var_type_inference_test.cc DEPS op_registry
cc_library(selected_rows SRCS selected_rows.cc DEPS tensor)
cc_test(selected_rows_test SRCS selected_rows_test.cc DEPS selected_rows)
-cc_library(threadpool SRCS threadpool.cc)
-cc_test(threadpool_test SRCS threadpool_test.cc DEPS threadpool)
-cc_library(init SRCS init.cc DEPS gflags device_context place stringpiece)
+cc_library(init SRCS init.cc DEPS gflags device_context place stringpiece operator)
cc_test(init_test SRCS init_test.cc DEPS init)
cc_test(op_kernel_type_test SRCS op_kernel_type_test.cc DEPS place device_context framework_proto)
+cc_test(cow_ptr_tests SRCS details/cow_ptr_test.cc)
+nv_test(device_data_transform_test SRCS device_data_transform_test.cu
+ DEPS operator op_registry init math_function)
diff --git a/paddle/framework/backward.cc b/paddle/framework/backward.cc
index eaf13ddcefcd8dc5a6b0438f765d8d325925aa30..85e693434af863bfc3bde29989dbbfc69678d3b7 100644
--- a/paddle/framework/backward.cc
+++ b/paddle/framework/backward.cc
@@ -427,7 +427,8 @@ std::vector> MakeBlockBackward(
VLOG(5) << "Making backward " << (*it)->Type() << " op";
std::vector> op_grads;
- if ((*it)->Type() == "recurrent" || (*it)->Type() == "while") {
+ if ((*it)->Type() == "recurrent" || (*it)->Type() == "while" ||
+ (*it)->Type() == "parallel_do") {
int step_block_idx = (*it)->GetBlockAttr("sub_block");
BlockDesc* backward_block = CreateStepBlock(program_desc, no_grad_vars,
grad_to_var, step_block_idx);
diff --git a/paddle/framework/block_desc.cc b/paddle/framework/block_desc.cc
index 0668b08ff7ab3c8ca4f1e989fc7af45a8ec5f63c..54498e175dacfa0a220e3d839f4feb02502b2c03 100644
--- a/paddle/framework/block_desc.cc
+++ b/paddle/framework/block_desc.cc
@@ -53,12 +53,12 @@ VarDesc *BlockDesc::FindVarRecursive(const std::string &name) const {
return it->second.get();
}
-VarDesc *BlockDesc::FindRecursiveOrCreateVar(const std::string &name_bytes) {
+VarDesc &BlockDesc::FindRecursiveOrCreateVar(const std::string &name_bytes) {
VarDesc *res = FindVarRecursive(name_bytes);
if (res == nullptr) {
res = Var(name_bytes);
}
- return res;
+ return *res;
}
bool BlockDesc::HasVarRecursive(const std::string &name) const {
diff --git a/paddle/framework/block_desc.h b/paddle/framework/block_desc.h
index 6c8c81b332d99e52db41018e117aa837be6745bc..4b609e4bcb67bb8dda5924a639e7a8165eda4353 100644
--- a/paddle/framework/block_desc.h
+++ b/paddle/framework/block_desc.h
@@ -57,7 +57,7 @@ class BlockDesc {
VarDesc *FindVarRecursive(const std::string &name_bytes) const;
- VarDesc *FindRecursiveOrCreateVar(const std::string &name_bytes);
+ VarDesc &FindRecursiveOrCreateVar(const std::string &name_bytes);
bool HasVarRecursive(const std::string &var_name) const;
diff --git a/paddle/framework/data_transform.cc b/paddle/framework/data_transform.cc
index 9d6a8424426a68ae66cf93b803c35e33e30226f2..fed958db1584c4fda5394d59a2ef8936045a9ce9 100644
--- a/paddle/framework/data_transform.cc
+++ b/paddle/framework/data_transform.cc
@@ -11,9 +11,12 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
+#include <functional>
#include "paddle/framework/data_transform.h"
+#include "paddle/framework/device_data_transform.h"
#include "paddle/framework/lod_tensor.h"
+#include "paddle/framework/selected_rows.h"
#include "paddle/platform/device_context.h"
namespace paddle {
@@ -24,6 +27,37 @@ DataTransformFnMap& DataTransformFnMap::Instance() {
return data_transform_map;
}
+Tensor* DataTransform(const OpKernelType& expected_kernel_type,
+ const OpKernelType& kernel_type_for_var,
+ const Tensor& input_tensor) {
+ Tensor* out = nullptr;
+ if (!platform::is_same_place(kernel_type_for_var.place_,
+ expected_kernel_type.place_)) {
+ out = DeviceTransform(input_tensor, expected_kernel_type.place_);
+ }
+ PADDLE_ENFORCE_NOT_NULL(out, "out should not be null");
+ return out;
+}
+
+void CopyVariableWithTensor(const Variable& in_var, const Tensor& tensor,
+ Variable& out_var) {
+  if (in_var.IsType<LoDTensor>()) {
+    auto& in_lod_tensor = in_var.Get<LoDTensor>();
+    auto* tran_lod_tensor = out_var.GetMutable<LoDTensor>();
+    tran_lod_tensor->set_lod(in_lod_tensor.lod());
+    tran_lod_tensor->set_layout(in_lod_tensor.layout());
+    tran_lod_tensor->ShareDataWith(tensor);
+  } else if (in_var.IsType<SelectedRows>()) {
+    auto& in_selected_rows = in_var.Get<SelectedRows>();
+    auto* trans_selected_rows = out_var.GetMutable<SelectedRows>();
+ trans_selected_rows->set_height(in_selected_rows.height());
+ trans_selected_rows->set_rows(in_selected_rows.rows());
+ trans_selected_rows->mutable_value()->ShareDataWith(tensor);
+ } else {
+ PADDLE_THROW("unknown var type");
+ }
+}
+
auto KernelFP32 = OpKernelType(proto::DataType::FP32, platform::CPUPlace(),
DataLayout::kNHWC, LibraryType::kPlain);
@@ -36,6 +70,28 @@ auto KernelNHWC = OpKernelType(proto::DataType::FP64, platform::CPUPlace(),
auto KernelNCHW = OpKernelType(proto::DataType::FP64, platform::CPUPlace(),
DataLayout::kNCHW, LibraryType::kPlain);
+// TODO(dzhwinter): Only for testing multiple op kernel.
+// Dummy transform function for library_type
+// should be removed.
+auto KernelPlain = OpKernelType(proto::DataType::FP32, platform::CUDAPlace(0),
+ DataLayout::kAnyLayout, LibraryType::kPlain);
+
+auto KernelCUDNN = OpKernelType(proto::DataType::FP32, platform::CUDAPlace(0),
+ DataLayout::kAnyLayout, LibraryType::kCUDNN);
+
+void DummyTrans(const platform::DeviceContext* ctx,
+ const KernelTypePair& kernel_pair, const Variable& in,
+ Variable* out) {
+  PADDLE_ENFORCE(in.IsType<Tensor>(), "Only Support Tensor transform!.");
+ PADDLE_ENFORCE(
+ platform::places_are_same_class(kernel_pair.first.place_,
+ kernel_pair.second.place_),
+ "TransDataType Only Support DataType transform on same place!");
+  auto src = in.Get<Tensor>();
+  auto* dst = out->GetMutable<Tensor>();
+ *dst = src;
+}
+
void TransDataType(const platform::DeviceContext* ctx,
const KernelTypePair& kernel_pair, const Variable& in,
Variable* out) {
@@ -74,35 +130,36 @@ void TransDataType(const platform::DeviceContext* ctx,
}
}
-void TransDataLayout(const platform::DeviceContext* ctx,
+void TransDataLayout(const std::vector& axis,
+ const platform::DeviceContext* ctx,
const KernelTypePair& kernel_pair, const Variable& in,
Variable* out) {
- PADDLE_ENFORCE(in.IsType(), "Only Support Tensor transform!.");
+ PADDLE_ENFORCE(in.IsType(), "Only support Tensor transform!.");
PADDLE_ENFORCE(
platform::places_are_same_class(kernel_pair.first.place_,
kernel_pair.second.place_),
- "TransDataType Only Support DataType transform on same place!");
+ "TransDataLayout only support DataLayout transform on same place!");
+ PADDLE_ENFORCE(kernel_pair.first.data_type_ == kernel_pair.second.data_type_,
+ "TransDataLayout only support Datatype are same!");
auto src = in.Get();
auto* dst = out->GetMutable();
PADDLE_ENFORCE(arity(src.dims()) == 4, "Input Arity Only Suppport 4!");
auto src_dim = src.dims();
- dst->Resize(src_dim);
- auto place = kernel_pair.second.place_;
- CopyFrom(src, place, *ctx, dst);
- const std::vector axis = {0, 2, 3, 1};
-
std::vector dst_dim;
+
dst_dim.resize(axis.size());
for (size_t i = 0; i < axis.size(); i++) {
dst_dim[i] = src_dim[axis[i]];
}
dst->Resize(make_ddim(dst_dim));
+ auto place = kernel_pair.second.place_;
+ dst->mutable_data(place, src.type());
auto src_type = kernel_pair.first.data_type_;
- framework::VisitDataType(src_type, CastDataLayout(src, dst, ctx, axis));
+ framework::VisitDataType(src_type, CastDataLayout(ctx, axis, src, dst));
dst->set_layout(kernel_pair.second.data_layout_);
}
@@ -111,5 +168,24 @@ void TransDataLayout(const platform::DeviceContext* ctx,
} // namespace paddle
namespace f = paddle::framework;
+
+namespace {
+std::vector NHWC2NCHW = {0, 3, 1, 2};
+std::vector NCHW2NHWC = {0, 2, 3, 1};
+}
+
REGISTER_DATA_TRANSFORM_FN(f::KernelFP32, f::KernelFP64, f::TransDataType);
-REGISTER_DATA_TRANSFORM_FN(f::KernelNHWC, f::KernelNCHW, f::TransDataLayout);
+REGISTER_DATA_TRANSFORM_FN(f::KernelPlain, f::KernelCUDNN, f::DummyTrans);
+REGISTER_DATA_TRANSFORM_FN(f::KernelCUDNN, f::KernelPlain, f::DummyTrans);
+REGISTER_DATA_TRANSFORM_FN(f::KernelNHWC, f::KernelNCHW,
+ std::bind(f::TransDataLayout, NHWC2NCHW,
+ std::placeholders::_1,
+ std::placeholders::_2,
+ std::placeholders::_3,
+ std::placeholders::_4));
+REGISTER_DATA_TRANSFORM_FN(f::KernelNCHW, f::KernelNHWC,
+ std::bind(f::TransDataLayout, NCHW2NHWC,
+ std::placeholders::_1,
+ std::placeholders::_2,
+ std::placeholders::_3,
+ std::placeholders::_4));
diff --git a/paddle/framework/data_transform.h b/paddle/framework/data_transform.h
index 9abb3c99bf30fcf9deab59dc7ee9c02e7c7c775b..e4e5c30a96a3c985ae2ecd494b723c8afeceb12f 100644
--- a/paddle/framework/data_transform.h
+++ b/paddle/framework/data_transform.h
@@ -19,6 +19,7 @@ limitations under the License. */
#include
#include "paddle/framework/op_kernel_type.h"
+#include "paddle/framework/selected_rows.h"
#include "paddle/framework/tensor.h"
#include "paddle/framework/variable.h"
#include "paddle/operators/math/math_function.h"
@@ -49,6 +50,13 @@ struct KernelTypePairHash {
}
};
+Tensor* DataTransform(const OpKernelType& expected_kernel_type,
+ const OpKernelType& kernel_type_for_var,
+ const Tensor& input_tensor);
+
+void CopyVariableWithTensor(const Variable& in_var, const Tensor& tensor,
+ Variable& out_var);
+
template
struct CastDataTypeFunctor {
HOSTDEVICE inline OutType operator()(InType in) const {
@@ -73,22 +81,23 @@ struct CastDataType {
auto numel = in_.numel();
auto* in_end = in_begin + numel;
auto* out_begin = out_->mutable_data(place);
+
if (platform::is_cpu_place(place)) {
platform::Transform trans;
auto* context = static_cast(ctx_);
trans(*context, in_begin, in_end, out_begin,
CastDataTypeFunctor());
} else {
- // TODO(dzhwinter): enhance CopyFrom CPU<->GPU with different data type?
+ // TODO(dzhwinter): enhance Copy CPU<->GPU with different data type?
PADDLE_THROW("Unsupport CPU <-> GPU!");
}
}
};
struct CastDataLayout {
- CastDataLayout(const framework::Tensor& in, framework::Tensor* out,
- const platform::DeviceContext* ctx,
- const std::vector& axis)
+ CastDataLayout(const platform::DeviceContext* ctx,
+ const std::vector& axis, const framework::Tensor& in,
+ framework::Tensor* out)
: in_(in), out_(out), ctx_(ctx), axis_(axis) {}
const framework::Tensor in_;
framework::Tensor* out_;
@@ -98,6 +107,7 @@ struct CastDataLayout {
template
void operator()() {
auto place = ctx_->GetPlace();
+
if (platform::is_cpu_place(place)) {
operators::math::Transpose trans4;
auto* context = static_cast(ctx_);
diff --git a/paddle/framework/data_transform_test.cc b/paddle/framework/data_transform_test.cc
index 8665b6248faa2d218230449c45a10f022f3fbf4f..edd305fd17ae202926b83fbec10089719baa2e16 100644
--- a/paddle/framework/data_transform_test.cc
+++ b/paddle/framework/data_transform_test.cc
@@ -106,7 +106,7 @@ TEST(DataTransform, Register) {
ASSERT_EQ(test_value, 2);
}
-TEST(DataTransform, Layout) {
+TEST(DataTransform, DataLayout) {
using namespace paddle::framework;
using namespace paddle::platform;
@@ -127,7 +127,19 @@ TEST(DataTransform, Layout) {
}
Tensor dst = out.Get();
- EXPECT_TRUE(dst.layout() != src->layout());
+
+ EXPECT_TRUE(dst.layout() == DataLayout::kNCHW);
+ EXPECT_TRUE(dst.dims() == make_ddim({2, 2, 3, 1}));
+
+ {
+ auto kernel1 = GenFromBit({1, 0, 1, 0});
+ auto kernel2 = GenFromBit({1, 0, 0, 0});
+ auto pair0 = std::make_pair(kernel1, kernel2);
+ instance.Get(pair0)(ctx, pair0, out, &in);
+ }
+
+ EXPECT_TRUE(src->layout() == DataLayout::kNHWC);
+ EXPECT_TRUE(src->dims() == make_ddim({2, 3, 1, 2}));
}
TEST(DataTransform, DataType) {
diff --git a/paddle/framework/details/cow_ptr.h b/paddle/framework/details/cow_ptr.h
new file mode 100644
index 0000000000000000000000000000000000000000..7e308ffb5a49876aa2c1833b3b7e2a2c7eb137aa
--- /dev/null
+++ b/paddle/framework/details/cow_ptr.h
@@ -0,0 +1,98 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+
+#pragma once
+#include
+#include
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+// Change it to thread safe flags if needed.
+class ThreadUnsafeOwnershipFlags {
+ public:
+ ThreadUnsafeOwnershipFlags(bool flag) : flag_(flag) {}
+
+ ThreadUnsafeOwnershipFlags(const ThreadUnsafeOwnershipFlags& other) = delete;
+ ThreadUnsafeOwnershipFlags& operator=(
+ const ThreadUnsafeOwnershipFlags& other) = delete;
+ ThreadUnsafeOwnershipFlags(ThreadUnsafeOwnershipFlags&& other) = default;
+
+ void SetOwnership(bool flag) { flag_ = flag; }
+
+ // Invoke the callback if it is not owned.
+ template
+ void AcquireOwnershipOnce(Callback acquire) {
+ if (!flag_) {
+ acquire();
+ flag_ = true;
+ }
+ }
+
+ private:
+ bool flag_;
+};
+
+// Copy-On-Write pointer.
+// It will hold a T* pointer, and only copy once when `MutableData` is invoked.
+//
+// The template parameter OwnershipFlags should have:
+// * a constructor takes a bool. True if own.
+// * SetOwnership(bool flag).
+// * AcquireOwnershipOnce(Callback). It will invoke the callback if it is not
+// owned.
+//
+// https://en.wikipedia.org/wiki/Copy-on-write
+template <typename T, typename OwnershipFlags = ThreadUnsafeOwnershipFlags>
+class COWPtr {
+ public:
+ // Ctor from raw pointer.
+ explicit COWPtr(T* ptr) : payload_(ptr), ownership_{true} {}
+
+ // Move methods. Steal ownership from origin
+ COWPtr(COWPtr&& other)
+ : payload_(other.payload_), ownership_{std::move(other.ownership_)} {}
+ COWPtr& operator=(COWPtr&& origin) = default;
+
+ // Copy methods. Not own payload
+ COWPtr(const COWPtr& other) : payload_(other.payload_), ownership_{false} {}
+ COWPtr& operator=(const COWPtr& other) {
+ payload_ = other.payload_;
+ ownership_.SetOwnership(false);
+ return *this;
+ }
+
+ // Access read only data.
+ const T& Data() const { return *payload_; }
+
+ // Access mutable data. If the data is not owned, the data will be copied
+ // before.
+ T* MutableData() {
+ ownership_.AcquireOwnershipOnce(
+ [this] { payload_.reset(new T(*payload_)); });
+ return payload_.get();
+ }
+
+ private:
+ // Actual data pointer.
+  std::shared_ptr<T> payload_;
+
+ // Ownership flag.
+ OwnershipFlags ownership_;
+};
+
+} // namespace details
+} // namespace framework
+} // namespace paddle
diff --git a/paddle/framework/details/cow_ptr_test.cc b/paddle/framework/details/cow_ptr_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..936954a2333e7e5d2a932abad641279db9ef7b9f
--- /dev/null
+++ b/paddle/framework/details/cow_ptr_test.cc
@@ -0,0 +1,35 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+
+#include "paddle/framework/details/cow_ptr.h"
+#include "gtest/gtest.h"
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+TEST(COWPtr, all) {
+  COWPtr<int> ptr(new int{0});
+  ASSERT_EQ(ptr.Data(), 0);
+  COWPtr<int> ptr2 = ptr;
+ ASSERT_EQ(ptr2.Data(), 0);
+ ASSERT_EQ(&ptr2.Data(), &ptr.Data());
+ *ptr2.MutableData() = 10;
+ ASSERT_EQ(ptr.Data(), 0);
+ ASSERT_EQ(ptr2.Data(), 10);
+}
+
+} // namespace details
+} // namespace framework
+} // namespace paddle
diff --git a/paddle/framework/device_data_transform.cc b/paddle/framework/device_data_transform.cc
new file mode 100644
index 0000000000000000000000000000000000000000..cd5104cc6f287315ed9d22aa2ec6414f7204d214
--- /dev/null
+++ b/paddle/framework/device_data_transform.cc
@@ -0,0 +1,46 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/framework/device_data_transform.h"
+
+namespace paddle {
+namespace framework {
+
+static const platform::DeviceContext* GetDeviceContext(
+ const platform::Place& src_place, const platform::Place& dst_place) {
+ platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+
+ if (platform::is_gpu_place(src_place) && platform::is_cpu_place(dst_place)) {
+ return pool.Get(src_place);
+ } else if (platform::is_cpu_place(src_place) &&
+ platform::is_gpu_place(dst_place)) {
+ return pool.Get(dst_place);
+ } else {
+ PADDLE_THROW(
+ "Currently, model parallelism is only supported between CPU and CUDA");
+ }
+}
+
+Tensor* DeviceTransform(const Tensor& in, const platform::Place& dst_place) {
+ VLOG(3) << "DeviceTransform in, src_place " << in.place()
+ << " dst_place: " << dst_place;
+ Tensor* out = new Tensor();
+ auto* dev_ctx = GetDeviceContext(in.place(), dst_place);
+ dev_ctx->Wait();
+ Copy(in, dst_place, *dev_ctx, out);
+ dev_ctx->Wait();
+ return out;
+}
+
+} // namespace framework
+} // namespace paddle
diff --git a/paddle/framework/device_data_transform.h b/paddle/framework/device_data_transform.h
new file mode 100644
index 0000000000000000000000000000000000000000..bebf0d1b320183f46ab226dc6493ba09a365fc35
--- /dev/null
+++ b/paddle/framework/device_data_transform.h
@@ -0,0 +1,27 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/lod_tensor.h"
+#include "paddle/framework/tensor.h"
+#include "paddle/framework/tensor_util.h"
+#include "paddle/platform/device_context.h"
+
+namespace paddle {
+namespace framework {
+
+Tensor* DeviceTransform(const Tensor& in, const platform::Place& dst_place);
+
+} // namespace framework
+} // namespace paddle
diff --git a/paddle/framework/device_data_transform_test.cu b/paddle/framework/device_data_transform_test.cu
new file mode 100644
index 0000000000000000000000000000000000000000..9fb26f09c7ed6aff3bfc98cf3f829e50adbf48bf
--- /dev/null
+++ b/paddle/framework/device_data_transform_test.cu
@@ -0,0 +1,168 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "gtest/gtest.h"
+
+#include "paddle/framework/init.h"
+#include "paddle/framework/lod_tensor.h"
+#include "paddle/framework/op_info.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/operators/elementwise_op_function.h"
+#include "paddle/operators/math/math_function.h"
+#include "paddle/platform/device_context.h"
+
+namespace paddle {
+namespace framework {
+
+template
+struct AddFunctor {
+ inline HOSTDEVICE T operator()(T a, T b) const { return a + b; }
+};
+
+class OpKernelTestProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
+ public:
+ OpKernelTestProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker)
+ : OpProtoAndCheckerMaker(proto, op_checker) {
+ AddInput("input", "input1 of test op");
+ AddOutput("output", "output of test op");
+ AddAttr("use_gpu", "force to use gpu kernel").SetDefault(false);
+ AddComment("This is test op");
+ }
+};
+
+class TestOpWithKernel : public OperatorWithKernel {
+ public:
+ using OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+ void InferShape(framework::InferShapeContext* ctx) const override {}
+ OpKernelType GetExpectedKernelType(
+ const ExecutionContext& ctx) const override {
+ if (Attr("use_gpu")) {
+ VLOG(3) << "force use gpu kernel";
+ return OpKernelType(proto::DataType::FP32, platform::CUDAPlace(0));
+ } else {
+ VLOG(3) << "use default kernel";
+ return OpKernelType(proto::DataType::FP32,
+ ctx.Input("input")->place());
+ }
+ }
+};
+
+template
+class TestKernel : public OpKernel {
+ public:
+ void Compute(const ExecutionContext& ctx) const {
+ std::cout << ctx.op().DebugString() << std::endl;
+
+ const Tensor* input = ctx.Input("input");
+
+ std::cout << "input place:" << input->place() << std::endl;
+ auto* output = ctx.Output("output");
+ output->Resize(input->dims());
+ output->mutable_data(ctx.GetPlace());
+
+ operators::TransformFunctor, T, DeviceContext> functor(
+ input, input, output, ctx.template device_context(),
+ AddFunctor());
+ functor.Run();
+ }
+};
+
+} // namespace framework
+} // namespace paddle
+
+REGISTER_OP_WITHOUT_GRADIENT(
+ test_op, paddle::framework::TestOpWithKernel,
+ paddle::framework::OpKernelTestProtoAndCheckerMaker);
+REGISTER_OP_CPU_KERNEL(
+ test_op,
+ paddle::framework::TestKernel);
+REGISTER_OP_CUDA_KERNEL(
+ test_op,
+ paddle::framework::TestKernel);
+
+static void BuildVar(const std::string& param_name,
+ std::initializer_list arguments,
+ paddle::framework::proto::OpDesc::Var* var) {
+ var->set_parameter(param_name);
+ for (auto& arg_name : arguments) {
+ *var->mutable_arguments()->Add() = arg_name;
+ }
+}
+
+TEST(Operator, CPUtoGPU) {
+ using namespace paddle::framework;
+ using namespace paddle::platform;
+
+ ASSERT_EQ(InitDevices({"CPU", "GPU:0"}), true);
+
+ paddle::framework::Scope scope;
+ paddle::platform::CPUPlace cpu_place;
+
+ // create an op to run on CPU
+ paddle::framework::proto::OpDesc cpu_op_desc;
+ cpu_op_desc.set_type("test_op");
+ BuildVar("input", {"IN1"}, cpu_op_desc.add_inputs());
+ BuildVar("output", {"OUT1"}, cpu_op_desc.add_outputs());
+
+ auto cpu_op = paddle::framework::OpRegistry::CreateOp(cpu_op_desc);
+ // prepare input
+ auto* in_t = scope.Var("IN1")->GetMutable();
+ auto* src_ptr = in_t->mutable_data({2, 3}, CPUPlace());
+ for (int i = 0; i < 2 * 3; ++i) {
+ src_ptr[i] = static_cast(i);
+ }
+
+ // get output
+ auto* output = scope.Var("OUT1");
+ cpu_op->Run(scope, cpu_place);
+
+ auto* output_ptr = output->Get().data();
+ for (int i = 0; i < 2 * 3; ++i) {
+ ASSERT_EQ(output_ptr[i], static_cast(i) * 2);
+ }
+
+ // create an op to run on GPU
+ paddle::framework::proto::OpDesc gpu_op_desc;
+ gpu_op_desc.set_type("test_op");
+ BuildVar("input", {"OUT1"}, gpu_op_desc.add_inputs());
+ BuildVar("output", {"OUT2"}, gpu_op_desc.add_outputs());
+
+ auto attr = gpu_op_desc.mutable_attrs()->Add();
+ attr->set_name("use_gpu");
+ attr->set_type(paddle::framework::proto::AttrType::BOOLEAN);
+ attr->set_b(true);
+
+ auto gpu_op = paddle::framework::OpRegistry::CreateOp(gpu_op_desc);
+
+ paddle::platform::CUDAPlace cuda_place(0);
+ // get output
+ auto* output2 = scope.Var("OUT2");
+ gpu_op->Run(scope, cuda_place);
+
+ // auto* output2_ptr = output2->Get().data();
+ DeviceContextPool& pool = DeviceContextPool::Instance();
+ auto dev_ctx = pool.Get(cuda_place);
+
+ paddle::framework::Tensor output_tensor;
+ Copy(output2->Get(), paddle::platform::CPUPlace(), *dev_ctx,
+ &output_tensor);
+
+ dev_ctx->Wait();
+ float* output2_ptr = output_tensor.data();
+ for (int i = 0; i < 2 * 3; ++i) {
+ ASSERT_EQ(output2_ptr[i], static_cast(i) * 4);
+ }
+}
diff --git a/paddle/framework/executor.cc b/paddle/framework/executor.cc
index bf1f0471ccbfccf13cb6f74c8088da7acd68ec0b..c0418c9266e257bd7567861543e557f354451b17 100644
--- a/paddle/framework/executor.cc
+++ b/paddle/framework/executor.cc
@@ -21,6 +21,7 @@ limitations under the License. */
#include "paddle/framework/lod_rank_table.h"
#include "paddle/framework/lod_tensor_array.h"
#include "paddle/framework/op_registry.h"
+#include "paddle/platform/place.h"
DEFINE_bool(check_nan_inf, false,
"Checking whether operator produce NAN/INF or not. It will be "
@@ -49,10 +50,13 @@ static void CreateTensor(Variable* var, proto::VarDesc::VarType var_type) {
var->GetMutable();
} else if (var_type == proto::VarDesc::LOD_TENSOR_ARRAY) {
var->GetMutable();
+ } else if (var_type == proto::VarDesc::PLACE_LIST) {
+ var->GetMutable();
} else {
PADDLE_THROW(
"Variable type %d is not in "
- "[LoDTensor, SelectedRows, FEED_MINIBATCH, FETCH_LIST, LOD_RANK_TABLE]",
+ "[LoDTensor, SelectedRows, FEED_MINIBATCH, FETCH_LIST, LOD_RANK_TABLE,"
+ " PLACE_LIST]",
var_type);
}
}
@@ -111,7 +115,7 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
for (auto& op_desc : block.AllOps()) {
auto op = paddle::framework::OpRegistry::CreateOp(*op_desc);
- VLOG(3) << op->DebugString();
+ VLOG(3) << op->DebugStringEx(local_scope);
op->Run(*local_scope, place_);
if (FLAGS_check_nan_inf) {
for (auto& vname : op->OutputVars(true)) {
diff --git a/paddle/framework/framework.proto b/paddle/framework/framework.proto
index 4f2746e4b86ee5fe095897ff6ef9d3f6473e8a14..ea69b87e2ac7dc587333b623c310182bb39eb452 100644
--- a/paddle/framework/framework.proto
+++ b/paddle/framework/framework.proto
@@ -123,6 +123,7 @@ message VarDesc {
STEP_SCOPES = 5;
LOD_RANK_TABLE = 6;
LOD_TENSOR_ARRAY = 7;
+ PLACE_LIST = 8;
}
required string name = 1;
required VarType type = 2;
diff --git a/paddle/framework/init.cc b/paddle/framework/init.cc
index 682cff168d4d31e0565fc987604f97a671566fbd..e7087e063cbe8839716e3648d55cd25cc778f06f 100644
--- a/paddle/framework/init.cc
+++ b/paddle/framework/init.cc
@@ -15,6 +15,7 @@ limitations under the License. */
#include
#include "paddle/framework/init.h"
+#include "paddle/framework/operator.h"
#include "paddle/platform/device_context.h"
#include "paddle/platform/place.h"
#include "paddle/string/piece.h"
@@ -24,7 +25,6 @@ namespace framework {
std::once_flag gflags_init_flag;
-// TODO(qijun) move init gflags to init.cc
void InitGflags(std::vector &argv) {
std::call_once(gflags_init_flag, [&]() {
int argc = argv.size();
@@ -72,8 +72,14 @@ bool InitDevices(const std::vector &devices) {
LOG(WARNING) << "Not specified CPU device, create CPU by Default.";
}
platform::DeviceContextPool::Init(places);
+ // framework::UseALL();
return true;
}
+void InitGLOG(const std::string &prog_name) {
+ google::InitGoogleLogging(prog_name.c_str());
+ google::InstallFailureSignalHandler();
+}
+
} // namespace framework
} // namespace paddle
diff --git a/paddle/framework/init.h b/paddle/framework/init.h
index 33907f9eb00fb3469b53dcf8151557cc7a2d3791..9c84a03ded52632047841f95badbcf44bc9f48d1 100644
--- a/paddle/framework/init.h
+++ b/paddle/framework/init.h
@@ -22,6 +22,8 @@ namespace framework {
void InitGflags(std::vector &argv);
+void InitGLOG(const std::string &prog_name);
+
bool InitDevices(const std::vector &devices);
} // namespace framework
diff --git a/paddle/framework/library_type.h b/paddle/framework/library_type.h
index 7707799cae8c4edc304cd81725270a85f01fd28d..1e3084835439b0d55de72a669b93acbaef7ed6b9 100644
--- a/paddle/framework/library_type.h
+++ b/paddle/framework/library_type.h
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
+#include <cctype>
namespace paddle {
namespace framework {
@@ -41,6 +42,9 @@ inline std::string LibraryTypeToString(const LibraryType& library_type) {
inline LibraryType StringToLibraryType(const char* ctype) {
std::string s(ctype);
+ for (size_t i = 0; i < s.size(); ++i) {
+ s[i] = toupper(s[i]);
+ }
if (s == std::string("PLAIN")) {
return LibraryType::kPlain;
} else if (s == std::string("MKLDNN")) {
diff --git a/paddle/framework/lod_tensor.cc b/paddle/framework/lod_tensor.cc
index 7b6dc09bdb5535488c8c4dbc71c9cd6a7998bd0b..506fde440533e83f093f26484f925416b89c75a0 100644
--- a/paddle/framework/lod_tensor.cc
+++ b/paddle/framework/lod_tensor.cc
@@ -43,16 +43,20 @@ std::ostream &operator<<(std::ostream &os, const LoD &lod) {
return os;
}
-LoD SliceLevels(const LoD &in, size_t level_begin, size_t level_end) {
- LoD new_lod;
- new_lod.reserve(level_end - level_begin);
- for (size_t i = level_begin; i < level_end; i++) {
- new_lod.emplace_back(in.at(i));
- }
- // transform the lowest level to absolute offset.
- LoD abs_offset_lod = ToAbsOffset(in);
- new_lod.back() = abs_offset_lod[level_end - 1];
- return new_lod;
+std::ostream &operator<<(std::ostream &os, const LoDTensor &t) {
+ PADDLE_ENFORCE(platform::is_cpu_place(t.place()));
+ PADDLE_ENFORCE(t.type().hash_code() == typeid(float).hash_code());
+
+ os << "dim: " << t.dims() << "\n";
+ os << "lod: " << t.lod() << "\n";
+
+ // only print first ten elements
+ int64_t size = t.numel() < 10 ? t.numel() : 10;
+ for (int64_t i = 0; i < size; ++i) {
+    os << t.data<float>()[i] << " ";
+ }
+
+ return os;
}
LoD SliceInLevel(const LoD &in, size_t level, size_t elem_begin,
@@ -115,43 +119,6 @@ bool operator==(const LoD &a, const LoD &b) {
return true;
}
-size_t LoDTensor::NumElements(size_t level, size_t idx) const {
- PADDLE_ENFORCE_LT(level, NumLevels());
- PADDLE_ENFORCE_LT(idx, NumElements(level));
- return lod_[level][idx + 1] - lod_[level][idx];
-}
-
-size_t LoDTensor::NumInstancesInElement(size_t level, size_t idx) const {
- PADDLE_ENFORCE_LT(level, NumLevels());
- PADDLE_ENFORCE_LT(idx, NumElements(level));
- auto abs_lod = ToAbsOffset(lod());
- size_t begin = abs_lod[level][idx];
- size_t end = abs_lod[level][idx + 1];
- return end - begin;
-}
-
-void LoDTensor::ShrinkLevels(size_t level_begin, size_t level_end) {
- auto new_lod = framework::SliceLevels(lod_, level_begin, level_end);
- lod_ = new_lod;
-}
-
-void LoDTensor::ShrinkInLevel(size_t level, size_t elem_begin,
- size_t elem_end) {
- PADDLE_ENFORCE_LT(level, NumLevels());
- PADDLE_ENFORCE_LT(elem_begin, NumElements(level));
- PADDLE_ENFORCE_LT(elem_end, NumElements(level) + 1);
-
- auto abs_lod = framework::ToAbsOffset(lod());
- auto new_lod = framework::SliceInLevel(lod_, level, elem_begin, elem_end);
- lod_ = new_lod;
-
- // slice the underlying tensor
- size_t begin = abs_lod[level][elem_begin];
- size_t end = abs_lod[level][elem_end];
- PADDLE_ENFORCE_LT(begin, end, "Cannot shrink, the result tensor is empty.");
- ShareDataWith(Slice(begin, end));
-}
-
using LoDAndOffset = std::pair<LoD, std::pair<size_t, size_t>>;
LoDAndOffset GetSubLoDAndAbsoluteOffset(const LoD &lod, size_t start_idx,
size_t end_idx, size_t start_level) {
@@ -177,6 +144,9 @@ void AppendLoD(LoD *lod, const LoD &lod_length) {
lod->empty() || lod->size() == lod_length.size(),
"The lod_length should has the same size with the appended lod.");
if (lod->empty()) {
+ for (size_t i = 0; i < lod_length.size(); ++i) {
+ lod->emplace_back(1, 0); // size = 1, value = 0;
+ }
*lod = LoD(lod_length.size(), std::vector<size_t>({0}));
}
for (size_t i = 0; i < lod->size(); ++i) {
@@ -214,9 +184,10 @@ void SerializeToStream(std::ostream &os, const LoDTensor &tensor,
SerializeToStream(os, static_cast<Tensor>(tensor), dev_ctx);
}
-void DeserializeFromStream(std::istream &is, LoDTensor *tensor) {
+void DeserializeFromStream(std::istream &is, LoDTensor *tensor,
+ const platform::DeviceContext &dev_ctx) {
{
- // the 1st field, unit32_t version for SelectedRows
+ // the 1st field, uint32_t version for LoDTensor
uint32_t version;
is.read(reinterpret_cast<char *>(&version), sizeof(version));
PADDLE_ENFORCE_EQ(version, 0U, "Only version 0 is supported");
@@ -237,7 +208,71 @@ void DeserializeFromStream(std::istream &is, LoDTensor *tensor) {
}
}
// the 3rd field, Tensor
- DeserializeFromStream(is, static_cast(tensor));
+ DeserializeFromStream(is, static_cast<Tensor *>(tensor), dev_ctx);
+}
+
+std::vector<LoDTensor> LoDTensor::SplitLoDTensor(
+    const std::vector<platform::Place> places) const {
+ check_memory_size();
+ // PADDLE_ENFORCE(lod().empty() || (lod().size() == 1 && lod()[0].empty())
+ // , "Disable parallel lod for now");
+ PADDLE_ENFORCE(lod().empty(), "Disable parallel lod for now");
+ PADDLE_ENFORCE(dims()[0] % places.size() == 0,
+ "Batch size should be divided by places size");
+
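+ // Evenly partition the batch (dim 0) across the given places; each shard is copied into its own tensor (only CPU-to-CPU copy is implemented here).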
+ std::vector<LoDTensor> lods;
+ for (size_t place_idx = 0; place_idx < places.size(); ++place_idx) {
+ size_t begin = place_idx * dims()[0] / places.size();
+ size_t end = (place_idx + 1) * dims()[0] / places.size();
+ auto src = Slice(static_cast<int>(begin), static_cast<int>(end));
+
+ LoDTensor dst;
+ dst.Resize(src.dims());
+ auto &dst_place = places[place_idx];
+ auto dst_ptr = dst.mutable_data(dst_place, src.type());
+
+ // TODO(tonyyang-svail):
+ // change the following to framework::Copy
+ auto src_place = src.place();
+ auto src_ptr = src.data<void>();
+ auto size = src.numel() * SizeOfType(src.type());
+ if (platform::is_cpu_place(src_place) &&
+ platform::is_cpu_place(dst_place)) {
+ memory::Copy(boost::get<platform::CPUPlace>(dst_place), dst_ptr,
+              boost::get<platform::CPUPlace>(src_place), src_ptr, size);
+ } else {
+ PADDLE_THROW("Not Implemented");
+ }
+
+ lods.emplace_back(dst);
+ }
+
+ return lods;
+}
+
+void LoDTensor::MergeLoDTensor(
+    const std::vector<const LoDTensor *> &lod_tensors, platform::Place place) {
+ PADDLE_ENFORCE(platform::is_cpu_place(place));
+ PADDLE_ENFORCE(!lod_tensors.empty());
+
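+ // All inputs must share dims, type and CPU placement; they are concatenated along dim 0 into this tensor.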
+ framework::DDim new_dim = lod_tensors[0]->dims();
+ std::type_index new_type = lod_tensors[0]->type();
+ for (auto *lod : lod_tensors) {
+ PADDLE_ENFORCE(new_dim == lod->dims());
+ PADDLE_ENFORCE(new_type == lod->type());
+ PADDLE_ENFORCE(platform::is_cpu_place(lod->place()));
+ }
+ new_dim[0] *= lod_tensors.size();
+ Resize(new_dim);
+
+ auto *dst_ptr = reinterpret_cast<uint8_t *>(mutable_data(place, new_type));
+ for (auto *src : lod_tensors) {
+ auto size = src->numel() * SizeOfType(src->type());
+ memory::Copy(boost::get<platform::CPUPlace>(place), dst_ptr,
+              boost::get<platform::CPUPlace>(src->place()),
+              src->data<void>(), size);
+ dst_ptr += size;
+ }
}
} // namespace framework
diff --git a/paddle/framework/lod_tensor.h b/paddle/framework/lod_tensor.h
index 147db3ab0877662d9e47ae7ee6df05638b5fcbd1..37753f5f4ddea4755ad6211007c367de00aad754 100644
--- a/paddle/framework/lod_tensor.h
+++ b/paddle/framework/lod_tensor.h
@@ -58,14 +58,7 @@ using Vector = thrust::host_vector<
using LoD = std::vector<Vector<size_t>>;
std::ostream& operator<<(std::ostream& os, const LoD& lod);
-
-/*
- * Slice levels from a LoD.
- * NOTE the lowest level should always be the absolute offsets of the underlying
- * tensor instances. So if higher layers are sliced without the lowest level,
- * the lower level of the sliced LoD will be transformed to the absolute offset.
- */
-LoD SliceLevels(const LoD& in, size_t level_begin, size_t level_end);
+std::ostream& operator<<(std::ostream& os, const LoDTensor& t);
LoD SliceInLevel(const LoD& in, size_t level, size_t elem_begin,
size_t elem_end);
@@ -115,34 +108,11 @@ class LoDTensor : public Tensor {
return (lod_)[level].size() - 1;
}
- /*
- * Number of lower-level elements.
- * For example, a 2-level lod-tensor
- *
- * 0-th level | |
- * 1-th level || |||
- *
- * NumElements(0, 0) get 2
- * NumElements(0, 1) get 3
- */
- size_t NumElements(size_t level, size_t idx) const;
+ std::vector<LoDTensor> SplitLoDTensor(
+     const std::vector<platform::Place> places) const;
- /*
- * Get the number of instances in the underlying tensor in the `idx`-th
- * element.
- */
- size_t NumInstancesInElement(size_t level, size_t idx) const;
-
- /*
- * Shrink levels[level_begin:level_end]
- */
- void ShrinkLevels(size_t level_begin, size_t level_end);
-
- /*
- * Shrink elements of a level, [elem_begin: elem_end]
- * @note: low performance in slice lod_.
- */
- void ShrinkInLevel(size_t level, size_t elem_begin, size_t elem_end);
+ void MergeLoDTensor(const std::vector<const LoDTensor*>& lod_tensors,
+                     platform::Place place);
private:
LoD lod_;
@@ -177,8 +147,8 @@ LoDTensor LodExpand(const LoDTensor& source, const LoD& lod, size_t level,
for (size_t ins = 0; ins < num_instances; ins++) {
for (size_t elem = lod_level[ins]; elem < lod_level[ins + 1]; elem++) {
auto slice = tensor.Slice(elem, elem + 1);
- CopyFrom(source.Slice(ins, ins + 1), platform::CPUPlace(),
- platform::CPUDeviceContext(), &slice);
+ Copy(source.Slice(ins, ins + 1), platform::CPUPlace(),
+ platform::CPUDeviceContext(), &slice);
}
}
return tensor;
@@ -208,7 +178,8 @@ void AppendLoD(LoD* lod, const LoD& lod_length);
*/
void SerializeToStream(std::ostream& os, const LoDTensor& tensor,
const platform::DeviceContext& dev_ctx);
-void DeserializeFromStream(std::istream& is, LoDTensor* tensor);
+void DeserializeFromStream(std::istream& is, LoDTensor* tensor,
+ const platform::DeviceContext& dev_ctx);
} // namespace framework
} // namespace paddle
diff --git a/paddle/framework/lod_tensor_test.cc b/paddle/framework/lod_tensor_test.cc
index 0747c8db531d6ae443d76591b945cce0c9bbea2b..52b87f48e5340ce4e265e2e77577f58daae039d2 100644
--- a/paddle/framework/lod_tensor_test.cc
+++ b/paddle/framework/lod_tensor_test.cc
@@ -54,92 +54,6 @@ class LoDTensorTester : public ::testing::Test {
LoDTensor lod_tensor_;
};
-TEST_F(LoDTensorTester, NumLevels) { ASSERT_EQ(lod_tensor_.NumLevels(), 3UL); }
-
-TEST_F(LoDTensorTester, NumElements) {
- ASSERT_EQ(lod_tensor_.NumElements(0), 2UL);
- ASSERT_EQ(lod_tensor_.NumElements(1), 3UL);
- ASSERT_EQ(lod_tensor_.NumElements(2), 8UL);
-}
-
-TEST_F(LoDTensorTester, NumElements2) {
- ASSERT_EQ(lod_tensor_.NumElements(0, 0), 2UL);
- ASSERT_EQ(lod_tensor_.NumElements(0, 1), 1UL);
- ASSERT_EQ(lod_tensor_.NumElements(1, 1), 3UL);
-}
-
-TEST_F(LoDTensorTester, ShrinkLevels) {
- // slice 1 level
- for (size_t level = 0; level < 3UL; ++level) {
- LoDTensor new_lod_tensor = lod_tensor_;
- new_lod_tensor.ShrinkLevels(level, level + 1);
- ASSERT_EQ(new_lod_tensor.NumLevels(), 1UL);
- ASSERT_EQ(new_lod_tensor.data<float>(), lod_tensor_.data<float>());
- }
- // shrink 2 level
- for (size_t level = 0; level < 2UL; ++level) {
- LoDTensor new_lod_tensor = lod_tensor_;
- new_lod_tensor.ShrinkLevels(level, level + 2);
- // the lowest level's last element should be the tensor's batch_size.
- ASSERT_EQ(new_lod_tensor.lod().back().back(),
- lod_tensor_.lod().back().back());
- ASSERT_EQ(new_lod_tensor.NumLevels(), 2UL);
- ASSERT_EQ(new_lod_tensor.data<float>(), lod_tensor_.data<float>());
- }
-}
-
-TEST_F(LoDTensorTester, ShrinkInLevel) {
- size_t level = 0;
- LoDTensor new_lod_tensor = lod_tensor_;
- new_lod_tensor.ShrinkInLevel(level, 0, 1);
- ASSERT_EQ(new_lod_tensor.NumLevels(), 3UL);
- ASSERT_EQ(new_lod_tensor.NumElements(0), 1UL);
- ASSERT_EQ(new_lod_tensor.NumElements(1), 2UL);
- ASSERT_EQ(new_lod_tensor.NumElements(2), 5UL);
- ASSERT_EQ(new_lod_tensor.dims()[0], 12);
- for (int i = 0; i < 12 * 128; i++) {
- ASSERT_EQ(new_lod_tensor.data<float>()[i], i);
- }
-
- level = 1;
- new_lod_tensor = lod_tensor_;
- new_lod_tensor.ShrinkInLevel(level, 1, 2);
- ASSERT_EQ(new_lod_tensor.NumLevels(), 2UL);
- ASSERT_EQ(new_lod_tensor.NumElements(0), 1UL);
- ASSERT_EQ(new_lod_tensor.NumElements(1), 3UL);
- ASSERT_EQ(new_lod_tensor.dims()[0], 7);
- for (int i = 5 * 128; i < 12 * 128; i++) {
- ASSERT_EQ(new_lod_tensor.data<float>()[i - 5 * 128], i);
- }
-
- LoDTensor t1;
- t1.set_lod(lod_tensor_.lod());
- t1.ShareDataWith(lod_tensor_);
-
- LoDTensor t2;
- t2.set_lod(lod_tensor_.lod());
- t2.ShareDataWith(lod_tensor_);
-
- t1.ShrinkInLevel(0, 1, 2);
- t2.ShrinkInLevel(0, 0, 1);
- EXPECT_NE(t1.data<float>(), t2.data<float>());
- EXPECT_NE(t1.data<float>(), lod_tensor_.data<float>());
-}
-
-TEST_F(LoDTensorTester, SerializeAndDeserialize) {
- LoDTensor dst_tensor;
- platform::CPUDeviceContext cpu_ctx((platform::CPUPlace()));
- std::ostringstream oss;
- SerializeToStream(oss, lod_tensor_, cpu_ctx);
- std::istringstream iss(oss.str());
- DeserializeFromStream(iss, &dst_tensor);
- float* dst_ptr = dst_tensor.mutable_data<float>(platform::CPUPlace());
- for (int i = 0; i < kLodTensorSize; ++i) {
- EXPECT_EQ(dst_ptr[i], i);
- }
- EXPECT_EQ(dst_tensor.lod(), lod_tensor_.lod());
-}
-
TEST(LodExpand, test) {
LoD lod{{0, 2}};
LoDTensor tensor;
diff --git a/paddle/framework/op_desc.cc b/paddle/framework/op_desc.cc
index 3e58e6442edfe006c8aed238f67b9524783601ee..1c0372bb16c04e155a68a0411939e4887322107a 100644
--- a/paddle/framework/op_desc.cc
+++ b/paddle/framework/op_desc.cc
@@ -64,8 +64,9 @@ class CompileTimeInferShapeContext : public InferShapeContext {
PADDLE_ENFORCE_EQ(in_var->GetType(), proto::VarDesc::LOD_TENSOR,
"The %d-th output of Output(%s) must be LoDTensor.", j,
out);
- out_var->SetLoDLevel(in_var->GetLodLevel());
+ out_var->SetLoDLevel(in_var->GetLoDLevel());
}
+
bool IsRuntime() const override;
protected:
@@ -383,7 +384,7 @@ void OpDesc::InferVarType(BlockDesc *block) const {
for (auto &out_pair : this->outputs_) {
for (auto &out_var_name : out_pair.second) {
block->FindRecursiveOrCreateVar(out_var_name)
- ->SetType(proto::VarDesc::LOD_TENSOR);
+ .SetType(proto::VarDesc::LOD_TENSOR);
}
}
}
diff --git a/paddle/framework/op_kernel_type.h b/paddle/framework/op_kernel_type.h
index b06002096fb109da806809f7b908d9768cf095ba..053897784c1c4350deadf39e2a009220d38f65f9 100644
--- a/paddle/framework/op_kernel_type.h
+++ b/paddle/framework/op_kernel_type.h
@@ -26,13 +26,12 @@ namespace framework {
struct OpKernelType {
struct Hash {
size_t operator()(const OpKernelType& key) const {
- int place = key.place_.which() + (1 << LEFT_SHIFT);
-      int data_type =
-          static_cast<int>(key.data_type_) + (1 << (LEFT_SHIFT + 1));
-      int data_layout =
-          static_cast<int>(key.data_layout_) + (1 << (LEFT_SHIFT + 2));
-      int library_type =
-          static_cast<int>(key.library_type_) + (1 << (LEFT_SHIFT + 3));
+ int place = key.place_.which();
+      int data_type = static_cast<int>(key.data_type_) << LEFT_SHIFT;
+      int data_layout = static_cast<int>(key.data_layout_) << (LEFT_SHIFT * 2);
+      int library_type = static_cast<int>(key.library_type_)
+                         << (LEFT_SHIFT * 3);
+
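+      // place stays in the low bits while each enum is shifted into its own bit range, so the sum gives one integer per key to feed std::hash.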
std::hash<int> hasher;
return hasher(place + data_type + data_layout + library_type);
}
diff --git a/paddle/framework/op_registry.h b/paddle/framework/op_registry.h
index bdaa25918155caca4b64b0ed60aa3f6be03eb12f..d75c0233e8e0134ddf4edc50c07490a234b65cd0 100644
--- a/paddle/framework/op_registry.h
+++ b/paddle/framework/op_registry.h
@@ -37,8 +37,8 @@ class Registrar {
public:
// In our design, various kinds of classes, e.g., operators and kernels,
// have their corresponding registry and registrar. The action of
- // registration is in the constructor of a global registrar variable, which,
- // however, are not used in the code that calls package framework, and would
+ // registration is in the constructor of a global registrar variable, which
+ // is not used in the code that calls package framework, and would
// be removed from the generated binary file by the linker. To avoid such
// removal, we add Touch to all registrar classes and make USE_OP macros to
// call this method. So, as long as the callee code calls USE_OP, the global
diff --git a/paddle/framework/op_registry_test.cc b/paddle/framework/op_registry_test.cc
index cef530c6e639f6e2188869fa57d114ec6b885aa8..66f07b6757fe1fe613e61ac66057be43ef5aced7 100644
--- a/paddle/framework/op_registry_test.cc
+++ b/paddle/framework/op_registry_test.cc
@@ -12,13 +12,16 @@
See the License for the specific language governing permissions and
limitations under the License. */
-#include "paddle/framework/op_registry.h"
+#include <glog/logging.h>
#include <gtest/gtest.h>
+#include "paddle/framework/op_registry.h"
+
namespace pd = paddle::framework;
namespace paddle {
namespace framework {
+
class CosineOp : public OperatorBase {
public:
using OperatorBase::OperatorBase;
@@ -215,7 +218,7 @@ class OpWithKernelTest : public OperatorWithKernel {
protected:
void InferShape(InferShapeContext* ctx) const override {}
- framework::OpKernelType GetActualKernelType(
+ framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(proto::DataType::FP32, ctx.device_context());
}
@@ -252,7 +255,6 @@ TEST(OperatorRegistrar, CPU) {
op->Run(scope, cpu_place);
}
-#ifdef PADDLE_WITH_CUDA
TEST(OperatorRegistrar, CUDA) {
paddle::framework::proto::OpDesc op_desc;
paddle::platform::CUDAPlace cuda_place(0);
@@ -263,4 +265,127 @@ TEST(OperatorRegistrar, CUDA) {
op->Run(scope, cuda_place);
}
-#endif
+
+static int op_test_value = 0;
+
+using paddle::platform::DeviceContext;
+using paddle::platform::CPUDeviceContext;
+using paddle::platform::CUDADeviceContext;
+
+namespace paddle {
+namespace framework {
+
+class OpWithMultiKernelTest : public OperatorWithKernel {
+ public:
+ using OperatorWithKernel::OperatorWithKernel;
+
+ protected:
+ void InferShape(InferShapeContext* ctx) const override {}
+
+ framework::OpKernelType GetExpectedKernelType(
+ const framework::ExecutionContext& ctx) const override {
+ return framework::OpKernelType(
+ proto::DataType::FP32, platform::CUDAPlace(0), DataLayout::kAnyLayout,
+ framework::LibraryType::kCUDNN);
+ }
+};
+
+template <typename DeviceContext, typename T>
+class OpMultiKernelTest : public paddle::framework::OpKernel<T> {
+ public:
+ void Compute(const paddle::framework::ExecutionContext& ctx) const;
+};
+
+template <typename T>
+class OpMultiKernelTest<CPUDeviceContext, T>
+    : public paddle::framework::OpKernel<T> {
+ public:
+ void Compute(const paddle::framework::ExecutionContext& ctx) const {
+ ++op_test_value;
+ }
+};
+
+template <typename T>
+class OpMultiKernelTest<CUDADeviceContext, T>
+    : public paddle::framework::OpKernel<T> {
+ public:
+ void Compute(const paddle::framework::ExecutionContext& ctx) const {
+ --op_test_value;
+ }
+};
+
+template <typename DeviceContext, typename T>
+class OpMultiKernelTest2 : public paddle::framework::OpKernel<T> {
+ public:
+ void Compute(const paddle::framework::ExecutionContext& ctx) const;
+};
+
+template <typename T>
+class OpMultiKernelTest2<CPUDeviceContext, T>
+    : public paddle::framework::OpKernel<T> {
+ public:
+ void Compute(const paddle::framework::ExecutionContext& ctx) const {
+ op_test_value += 10;
+ }
+};
+
+template <typename T>
+class OpMultiKernelTest2<CUDADeviceContext, T>
+    : public paddle::framework::OpKernel<T> {
+ public:
+ void Compute(const paddle::framework::ExecutionContext& ctx) const {
+ op_test_value -= 10;
+ }
+};
+
+} // namespace framework
+} // namespace paddle
+
+REGISTER_OP_WITHOUT_GRADIENT(op_with_multi_kernel,
+ paddle::framework::OpWithMultiKernelTest,
+ paddle::framework::OpKernelTestMaker);
+REGISTER_OP_KERNEL(
+ op_with_multi_kernel, CPU, paddle::platform::CPUPlace,
+ paddle::framework::OpMultiKernelTest<CPUDeviceContext, float>);
+REGISTER_OP_KERNEL(
+ op_with_multi_kernel, MKLDNN, paddle::platform::CPUPlace,
+ paddle::framework::OpMultiKernelTest2<CPUDeviceContext, float>);
+REGISTER_OP_KERNEL(
+ op_with_multi_kernel, CUDA, paddle::platform::CUDAPlace,
+ paddle::framework::OpMultiKernelTest<CUDADeviceContext, float>);
+REGISTER_OP_KERNEL(
+ op_with_multi_kernel, CUDNN, paddle::platform::CUDAPlace,
+ paddle::framework::OpMultiKernelTest2<CUDADeviceContext, float>);
+
+TEST(OperatorRegistrar, OpWithMultiKernel) {
+ paddle::framework::proto::OpDesc op_desc;
+ paddle::platform::CUDAPlace cuda_place(0);
+ paddle::platform::CPUPlace cpu_place;
+ paddle::framework::Scope scope;
+
+ op_desc.set_type("op_with_multi_kernel");
+ auto op = paddle::framework::OpRegistry::CreateOp(op_desc);
+
+ // TODO(qiao) add priority back
+ // use all available kernels
+ paddle::framework::UseALL();
+ op->Run(scope, cuda_place);
+ EXPECT_EQ(op_test_value, -10);
+
+ // remove cuda kernels
+ paddle::framework::UseCPU();
+ op->Run(scope, cpu_place);
+
+ EXPECT_EQ(op_test_value, -9);
+
+ // add cuda kernels
+ paddle::framework::UseCUDA();
+ op->Run(scope, cuda_place);
+
+ EXPECT_EQ(op_test_value, -10);
+
+ // use cudnn kernel
+ paddle::framework::UseCUDNN();
+ op->Run(scope, cuda_place);
+ EXPECT_EQ(op_test_value, -20);
+}
diff --git a/paddle/framework/operator.cc b/paddle/framework/operator.cc
index fc7091f1c89f8b3f998f6d1b68f032b76bad2197..35ebe48ba682f135b7f85edb3b2999db7c29e51a 100644
--- a/paddle/framework/operator.cc
+++ b/paddle/framework/operator.cc
@@ -11,13 +11,13 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
+#include
#include
-#include
#include "paddle/framework/data_transform.h"
+#include "paddle/framework/device_data_transform.h"
#include "paddle/framework/executor.h"
-#include "paddle/framework/lod_tensor_array.h"
#include "paddle/framework/operator.h"
#include "paddle/framework/shape_inference.h"
#include "paddle/framework/var_type.h"
@@ -25,6 +25,66 @@ limitations under the License. */
namespace paddle {
namespace framework {
+std::vector<std::tuple<platform::Place, LibraryType>> kKernelPriority;
+
+void UseCPU() {
+ kKernelPriority.clear();
+ /*Plain CPU*/
+ auto pair0 = std::make_tuple(platform::CPUPlace(), LibraryType::kPlain);
+ kKernelPriority.insert(kKernelPriority.begin(), pair0);
+}
+
+void UseMKLDNN() {
+ UseCPU();
+#if PADDLE_WITH_MKLML
+ {
+ /*MKLDNN Kernel*/
+ auto pair0 = std::make_tuple(platform::CPUPlace(), LibraryType::kMKLDNN);
+ kKernelPriority.insert(kKernelPriority.begin(), pair0);
+ }
+#endif
+}
+
+void UseCUDA() {
+ UseMKLDNN();
+#if PADDLE_WITH_CUDA
+ /*Plain GPU*/
+ auto pair0 = std::make_tuple(platform::CUDAPlace(0), LibraryType::kPlain);
+ kKernelPriority.insert(kKernelPriority.begin(), pair0);
+#endif
+}
+
+void UseCUDNN() {
+ UseCUDA();
+#if PADDLE_WITH_CUDA
+ if (platform::dynload::HasCUDNN()) {
+ /*CUDNN Kernel*/
+ auto pair0 = std::make_tuple(platform::CUDAPlace(0), LibraryType::kCUDNN);
+ kKernelPriority.insert(kKernelPriority.begin(), pair0);
+ }
+#endif
+}
+
+void UseALL() {
+ UseCPU();
+ UseMKLDNN();
+ UseCUDA();
+ UseCUDNN();
+}
+
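+// Debug helper: returns the shape of a scope variable, or {-1} when it is missing or not a LoDTensor/SelectedRows.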
+static DDim GetDims(const Scope& scope, const std::string& name) {
+ Variable* var = scope.FindVar(name);
+ if (var == nullptr) {
+ return DDim({-1});
+ } else if (var->IsType<LoDTensor>()) {
+   return var->Get<LoDTensor>().dims();
+ } else if (var->IsType<SelectedRows>()) {
+   return var->Get<SelectedRows>().GetCompleteDims();
+ } else {
+ return DDim({-1});
+ }
+}
+
std::string OperatorBase::Input(const std::string& name) const {
auto& ins = Inputs(name);
PADDLE_ENFORCE_LE(ins.size(), 1UL,
@@ -57,7 +117,7 @@ const std::vector& OperatorBase::Outputs(
return it->second;
}
-std::string OperatorBase::DebugString() const {
+std::string OperatorBase::DebugStringEx(const Scope* scope) const {
std::stringstream ss;
ss << "Op(" << type_ << "), inputs:{";
for (auto it = inputs_.begin(); it != inputs_.end();) {
@@ -65,6 +125,9 @@ std::string OperatorBase::DebugString() const {
ss << input.first << "[";
for (size_t i = 0; i < input.second.size(); ++i) {
ss << input.second[i];
+ if (scope) {
+ ss << "(" << GetDims(*scope, input.second[i]) << ")";
+ }
if (i != input.second.size() - 1) {
ss << ", ";
}
@@ -81,6 +144,9 @@ std::string OperatorBase::DebugString() const {
ss << output.first << "[";
for (size_t i = 0; i < output.second.size(); ++i) {
ss << output.second[i];
+ if (scope) {
+ ss << "(" << GetDims(*scope, output.second[i]) << ")";
+ }
if (i != output.second.size() - 1) {
ss << ", ";
}
@@ -178,6 +244,10 @@ void OperatorBase::GenerateTemporaryNames() {
}
}
+static bool VarIsTensor(const Variable* var) {
+ return var->IsType<LoDTensor>() || var->IsType<SelectedRows>();
+}
+
static const Tensor* GetTensorFromVar(const Variable* var) {
const Tensor* t = nullptr;
if (var->IsType<LoDTensor>()) {
@@ -185,7 +255,8 @@ static const Tensor* GetTensorFromVar(const Variable* var) {
} else if (var->IsType<SelectedRows>()) {
  t = &(var->Get<SelectedRows>().value());
} else {
- PADDLE_THROW("Variable type must be LoDTensor/SelectedRows.");
+ PADDLE_THROW("Variable type_id %s, expect LoDTensor/SelectedRows.",
+ var->Type().name());
}
return t;
}
@@ -197,7 +268,8 @@ static Tensor* GetMutableTensorFromVar(Variable* var) {
} else if (var->IsType<SelectedRows>()) {
  t = var->GetMutable<SelectedRows>()->mutable_value();
} else {
- PADDLE_THROW("Variable type must be LoDTensor/SelectedRows.");
+ PADDLE_THROW("Variable type_id %s, expect LoDTensor/SelectedRows.",
+ var->Type().name());
}
return t;
}
@@ -347,6 +419,25 @@ class RuntimeInferShapeContext : public InferShapeContext {
auto in_tensor = in_var->Get<LoDTensor>();
auto* out_tensor = out_var->GetMutable<LoDTensor>();
out_tensor->set_lod(in_tensor.lod());
+
+ // TODO(dzhwinter) : reuse ShareLoD in most operators.
+ // Need to call ShareLayout explicitly in sequence related ops.
+ // Shall we have a better method to share info between in/out Tensor?
+ out_tensor->set_layout(in_tensor.layout());
+ }
+
+ void ShareLayout(const std::string& in, const std::string& out, size_t i = 0,
+ size_t j = 0) const {
+ PADDLE_ENFORCE_LT(i, Inputs(in).size());
+ PADDLE_ENFORCE_LT(j, Outputs(out).size());
+ Variable* in_var = scope_.FindVar(Inputs(in)[i]);
+ Variable* out_var = scope_.FindVar(Outputs(out)[j]);
+ if (!in_var->IsType<LoDTensor>()) return;
+ PADDLE_ENFORCE(out_var->IsType<LoDTensor>(),
+                "The %d-th output of Output(%s) must be LoDTensor.", j, out);
+ auto in_tensor = in_var->Get<LoDTensor>();
+ auto* out_tensor = out_var->GetMutable<LoDTensor>();
+ out_tensor->set_layout(in_tensor.layout());
}
bool IsRuntime() const override { return true; }
@@ -359,7 +450,8 @@ class RuntimeInferShapeContext : public InferShapeContext {
} else if (var->IsType<SelectedRows>()) {
  return var->Get<SelectedRows>().GetCompleteDims();
} else {
- PADDLE_THROW("Variable type must be LoDTensor/SelectedRows.");
+ PADDLE_THROW("Variable %s type_id %s, expect LoDTensor/SelectedRows.",
+ name, var->Type().name());
}
}
@@ -370,7 +462,8 @@ class RuntimeInferShapeContext : public InferShapeContext {
} else if (var->IsType<SelectedRows>()) {
  var->GetMutable<SelectedRows>()->set_height(dim[0]);
} else {
- PADDLE_THROW("Variable type must be LoDTensor/SelectedRows.");
+ PADDLE_THROW("Variable %s type_id %s, expect LoDTensor/SelectedRows.",
+ name, var->Type().name());
}
}
@@ -384,24 +477,6 @@ class RuntimeInferShapeContext : public InferShapeContext {
const Scope& scope_;
};
-const platform::DeviceContext* GetDeviceContext(
- framework::KernelTypePair& kernel_pair) {
- auto& actual_kernel_key = kernel_pair.first;
- auto& expected_kernel_key = kernel_pair.second;
- platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
-
- if (platform::is_gpu_place(actual_kernel_key.place_) &&
- platform::is_cpu_place(expected_kernel_key.place_)) {
- return pool.Get(actual_kernel_key.place_);
- } else if (platform::is_cpu_place(actual_kernel_key.place_) &&
- platform::is_gpu_place(expected_kernel_key.place_)) {
- return pool.Get(expected_kernel_key.place_);
- } else {
- PADDLE_THROW(
- "Currently, model parallelism is only supported between CPU and CUDA");
- }
-}
-
void OperatorWithKernel::Run(const Scope& scope,
const platform::Place& place) const {
RuntimeInferShapeContext infer_shape_ctx(*this, scope);
@@ -417,71 +492,59 @@ void OperatorWithKernel::Run(const Scope& scope,
"There are no kernels which are registered in the %s operator.", type_);
}
- // check if op[type] have kernel for kernel_key
- OpKernelMap& kernels = kernels_iter->second;
-
ExecutionContext ctx(*this, scope, *dev_ctx);
- auto actual_kernel_key = GetActualKernelType(ctx);
- auto expected_kernel_key = GetExpectedKernelType(actual_kernel_key);
- auto kernel_iter = kernels.find(expected_kernel_key);
-
- if (kernel_iter == kernels.end()) {
- PADDLE_THROW("The operator %s does not support %s", type_,
- expected_kernel_key);
- }
+ auto expected_kernel_key = this->GetExpectedKernelType(ctx);
- if (actual_kernel_key == expected_kernel_key) {
- PADDLE_ENFORCE_EQ(actual_kernel_key.place_, expected_kernel_key.place_,
- "Currently, model parallelism is only supported between "
- "CPU and other devices. For example, multi-GPU model "
- "parallelism will failed.");
- } else {
- auto kernel_pair = std::make_pair(actual_kernel_key, expected_kernel_key);
- const DataTransformFn* trans_fun =
- DataTransformFnMap::Instance().GetNullable(kernel_pair);
- if (trans_fun) {
- auto input_vars = this->InputVars();
- // TODO(qijun) filter the input vars that do not need to be transformed
-
- // filter vars that has been transformed
- std::vector<std::string> need_trans;
- for (auto var_name : input_vars) {
- auto var_name_trans =
- var_name + framework::KernelTypeToString(expected_kernel_key);
- if (!scope.FindVar(var_name_trans)) {
- const_cast(scope).Var(var_name_trans);
- need_trans.push_back(var_name);
- }
- }
+ OpKernelMap& kernels = kernels_iter->second;
- if (!need_trans.empty()) {
- auto trans_dev_ctx = GetDeviceContext(kernel_pair);
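+ // Walk the global priority list and switch to the first candidate that matches the expected key or has a registered kernel.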
+ for (auto& candidate : kKernelPriority) {
+ auto candidate_key =
+ OpKernelType(expected_kernel_key.data_type_, std::get<0>(candidate),
+ expected_kernel_key.data_layout_, std::get<1>(candidate));
- // Wait for transform starting
- dev_ctx->Wait();
+ if ((candidate_key == expected_kernel_key) ||
+ (kernels.count(candidate_key))) {
+ expected_kernel_key = candidate_key;
+ break;
+ }
+ }
- for (auto var_name : need_trans) {
- (*trans_fun)(trans_dev_ctx, kernel_pair, *(scope.FindVar(var_name)),
- scope.FindVar(var_name + framework::KernelTypeToString(
- expected_kernel_key)));
+ VLOG(3) << "expected_kernel_key:" << expected_kernel_key;
+
+ Scope& new_scope = scope.NewScope();
+
+ for (auto& var_name_item : this->Inputs()) {
+ for (auto& var_name : var_name_item.second) {
+ auto* var = scope.FindVar(var_name);
+ if (var && VarIsTensor(var)) {
+ auto* tensor_in = GetTensorFromVar(var);
+ if (tensor_in->IsInitialized()) {
+ auto kernel_type_for_var = this->GetKernelTypeForVar(
+ var_name_item.first, *tensor_in, expected_kernel_key);
+ if (kernel_type_for_var != expected_kernel_key) {
+ auto out_var_names = OutputVars(true);
+ if (std::find(out_var_names.begin(), out_var_names.end(),
+ var_name) != out_var_names.end()) {
+ PADDLE_THROW(
+ "var %s is both input and output, "
+ "does not support transform",
+ var_name);
+ }
+ VLOG(3) << "need to do transform for var " << var_name;
+ auto* trans_var = new_scope.Var(var_name);
+ auto* out = DataTransform(expected_kernel_key, kernel_type_for_var,
+ *tensor_in);
+ CopyVariableWithTensor(*var, *out, *trans_var);
+ }
}
- // Wait for data transform finishing
- trans_dev_ctx->Wait();
}
}
}
- kernel_iter->second->Compute(ctx);
-}
-
-OpKernelType OperatorWithKernel::GetActualKernelType(
- const ExecutionContext& ctx) const {
- return OpKernelType(IndicateDataType(ctx), ctx.GetPlace());
-}
+ auto kernel_iter = kernels.find(expected_kernel_key);
-OpKernelType OperatorWithKernel::GetExpectedKernelType(
- const OpKernelType& actual_kernel_type) const {
- return actual_kernel_type;
+ kernel_iter->second->Compute(ExecutionContext(
+ *this, new_scope, *pool.Get(expected_kernel_key.place_)));
}
proto::DataType OperatorWithKernel::IndicateDataType(
@@ -513,5 +576,16 @@ proto::DataType OperatorWithKernel::IndicateDataType(
return static_cast<proto::DataType>(data_type);
}
+OpKernelType OperatorWithKernel::GetExpectedKernelType(
+ const ExecutionContext& ctx) const {
+ return OpKernelType(IndicateDataType(ctx), ctx.GetPlace());
+}
+
+OpKernelType OperatorWithKernel::GetKernelTypeForVar(
+ const std::string& var_name, const Tensor& tensor,
+ const OpKernelType& expected_kernel_type) const {
+ return OpKernelType(expected_kernel_type.data_type_, tensor.place());
+}
+
} // namespace framework
} // namespace paddle
diff --git a/paddle/framework/operator.h b/paddle/framework/operator.h
index d0a9b643d565d6651fd7ec0b515f088362852ba3..d5feb598649c97a9517b7c2b1764fd54ff9f8693 100644
--- a/paddle/framework/operator.h
+++ b/paddle/framework/operator.h
@@ -17,6 +17,7 @@ limitations under the License. */
#include <algorithm>
#include <atomic>
#include <string>
+#include <tuple>
#include <unordered_map>
#include <vector>
@@ -52,10 +53,33 @@ constexpr char kGradVarSuffix[] = "@GRAD";
/// Variables with this suffix are supposed to be filled up with zeros.
constexpr char kZeroVarSuffix[] = "@ZERO";
-// define some kernel hint
-const std::string kUseCPU = "use_cpu";
-const std::string kUseCUDNN = "use_cudnn";
-const std::string kUseMKLDNN = "use_mkldnn";
+// define some kernel priority
+extern std::vector<std::tuple<platform::Place, LibraryType>> kKernelPriority;
+
+/**
+ * @brief Use cpu kernel only
+ */
+void UseCPU();
+
+/**
+ * @brief Prefer MKLDNN kernel over plain CPU kernel
+ */
+void UseMKLDNN();
+
+/**
+ * @brief Prefer CUDA kernel over plain CPU kernel
+ */
+void UseCUDA();
+
+/**
+ * @brief Prefer cudnn kernel over plain CUDA kernel
+ */
+void UseCUDNN();
+
+/**
+ * @brief Use all available kernels
+ */
+void UseALL();
inline std::string GradVarName(const std::string& var_name) {
return var_name + kGradVarSuffix;
@@ -84,7 +108,10 @@ class OperatorBase {
return boost::get<T>(attrs_.at(name));
}
- virtual std::string DebugString() const;
+ /// if scope is not null, also show dimensions of arguments
+ virtual std::string DebugStringEx(const Scope* scope) const;
+
+ std::string DebugString() const { return DebugStringEx(nullptr); }
/// Net will call this function to Run an op.
virtual void Run(const Scope& scope, const platform::Place& place) const = 0;
@@ -381,9 +408,10 @@ class OperatorWithKernel : public OperatorBase {
}
protected:
- virtual OpKernelType GetActualKernelType(const ExecutionContext& ctx) const;
- virtual OpKernelType GetExpectedKernelType(
- const OpKernelType& actual_kernel_type) const;
+ virtual OpKernelType GetExpectedKernelType(const ExecutionContext& ctx) const;
+ virtual OpKernelType GetKernelTypeForVar(
+ const std::string& var_name, const Tensor& tensor,
+ const OpKernelType& expected_kernel_type) const;
private:
// indicate kernel DataType by input data. By default all input data must be
diff --git a/paddle/framework/operator_test.cc b/paddle/framework/operator_test.cc
index 4d38a7ada91af834aa1a19b49e36d606ebe786ba..d002f3f238862a53ad7286570e2d0bbd2334c584 100644
--- a/paddle/framework/operator_test.cc
+++ b/paddle/framework/operator_test.cc
@@ -114,7 +114,8 @@ class OpWithKernelTest : public OperatorWithKernel {
protected:
void InferShape(framework::InferShapeContext* ctx) const override {}
- OpKernelType GetActualKernelType(const ExecutionContext& ctx) const override {
+ OpKernelType GetExpectedKernelType(
+ const ExecutionContext& ctx) const override {
return OpKernelType(proto::DataType::FP32, ctx.GetPlace());
}
};
diff --git a/paddle/framework/scope.cc b/paddle/framework/scope.cc
index 0c01d605bcd95f5796fba1e5a3351a2640b2898a..2bd0ac8f5a9eb6439a4196dd9c61e13797c1a8e3 100644
--- a/paddle/framework/scope.cc
+++ b/paddle/framework/scope.cc
@@ -17,6 +17,7 @@ limitations under the License. */
#include <memory>  // for unique_ptr
#include <mutex>   // for call_once
#include "glog/logging.h"
+#include "paddle/framework/threadpool.h"
#include "paddle/string/printf.h"
namespace paddle {
@@ -87,7 +88,8 @@ void Scope::DeleteScope(Scope* scope) {
auto it = std::find(this->kids_.begin(), this->kids_.end(), scope);
PADDLE_ENFORCE(it != this->kids_.end(), "Cannot find %p as kid scope", scope);
this->kids_.erase(it);
- delete scope;
+ // Make delete async.
+ Async([scope] { delete scope; });
}
void Scope::Rename(const std::string& origin_name,
@@ -107,6 +109,7 @@ std::string Scope::Rename(const std::string& origin_name) const {
Rename(origin_name, var_name);
return var_name;
}
+
Variable* Scope::FindVarLocally(const std::string& name) const {
auto it = vars_.find(name);
if (it != vars_.end()) return it->second;
diff --git a/paddle/framework/scope.h b/paddle/framework/scope.h
index 10143326dfa201894c777b3e5e226d5ca5015eda..a1da81cc7977d2f31b99c41fb3db3ec03188f954 100644
--- a/paddle/framework/scope.h
+++ b/paddle/framework/scope.h
@@ -75,9 +75,9 @@ class Scope {
// Rename variable to a new name and return the new name
std::string Rename(const std::string& origin_name) const;
- private:
Variable* FindVarLocally(const std::string& name) const;
+ private:
// Call Scope::NewScope for a sub-scope.
explicit Scope(Scope const* parent) : parent_(parent) {}
diff --git a/paddle/framework/selected_rows.cc b/paddle/framework/selected_rows.cc
index 82adfa7123a3cf40d929021602c45fe7d2e34ffa..3b3e60177a495cc99f38ee8b82af41c4c76b8652 100644
--- a/paddle/framework/selected_rows.cc
+++ b/paddle/framework/selected_rows.cc
@@ -37,8 +37,8 @@ void SerializeToStream(std::ostream& os, const SelectedRows& selected_rows,
SerializeToStream(os, selected_rows.value(), dev_ctx);
}
-void DeserializeFromStream(std::istream& is, SelectedRows* selected_rows) {
- auto tensor = *selected_rows->mutable_value();
+void DeserializeFromStream(std::istream& is, SelectedRows* selected_rows,
+ const platform::DeviceContext& dev_ctx) {
{
// the 1st field, uint32_t version for SelectedRows
uint32_t version;
@@ -62,7 +62,7 @@ void DeserializeFromStream(std::istream& is, SelectedRows* selected_rows) {
selected_rows->set_height(height);
}
// the 4th field, tensor which contains the data
- DeserializeFromStream(is, &tensor);
+ DeserializeFromStream(is, selected_rows->mutable_value(), dev_ctx);
}
} // namespace framework
diff --git a/paddle/framework/selected_rows.h b/paddle/framework/selected_rows.h
index 699e392688e9889f050592172f8bfc45f855d0b1..30d3dfc1e89f073a8180ceacf77619b36f7079a9 100644
--- a/paddle/framework/selected_rows.h
+++ b/paddle/framework/selected_rows.h
@@ -66,7 +66,8 @@ class SelectedRows {
*/
void SerializeToStream(std::ostream& os, const SelectedRows& selected_rows,
const platform::DeviceContext& dev_ctx);
-void DeserializeFromStream(std::istream& is, SelectedRows* selected_rows);
+void DeserializeFromStream(std::istream& is, SelectedRows* selected_rows,
+ const platform::DeviceContext& dev_ctx);
} // namespace framework
} // namespace paddle
diff --git a/paddle/framework/selected_rows_test.cc b/paddle/framework/selected_rows_test.cc
index 75487c4010391aa9e519d73058184fa936dabb84..8ff3fb6a97199a2798ab29c56957a0f77fa26628 100644
--- a/paddle/framework/selected_rows_test.cc
+++ b/paddle/framework/selected_rows_test.cc
@@ -51,10 +51,12 @@ TEST_F(SelectedRowsTester, SerializeAndDeseralize) {
SerializeToStream(oss, *selected_rows_, cpu_ctx);
std::istringstream iss(oss.str());
- DeserializeFromStream(iss, &dst_tensor);
+ DeserializeFromStream(iss, &dst_tensor, cpu_ctx);
ASSERT_EQ(selected_rows_->rows(), dst_tensor.rows());
ASSERT_EQ(selected_rows_->height(), dst_tensor.height());
+ ASSERT_EQ(selected_rows_->value().dims(), dst_tensor.value().dims());
+ ASSERT_EQ(selected_rows_->GetCompleteDims(), dst_tensor.GetCompleteDims());
}
} // namespace framework
diff --git a/paddle/framework/tensor.h b/paddle/framework/tensor.h
index 341a6949beeb2dfa64b23d2079bd8f48750a94f8..4aaa29d794c95592832a1fe990e2dce274eba9d5 100644
--- a/paddle/framework/tensor.h
+++ b/paddle/framework/tensor.h
@@ -55,6 +55,10 @@ class Tensor {
template <typename T>
inline const T* data() const;
+ inline bool IsInitialized() const;
+
+ inline void switch_place(platform::Place new_place);
+
/**
* @brief Return a pointer to mutable memory block.
* @note If not exist, then allocation.
@@ -200,6 +204,15 @@ class Tensor {
size_t offset_;
};
+inline void Tensor::switch_place(platform::Place new_place) {
+ if (holder_->place() == new_place) {
+ return;
+ }
+
+ // TODO(tonyyang-svail): do memcpy here.
+ PADDLE_THROW("Not Implemented");
+}
+
} // namespace framework
} // namespace paddle
diff --git a/paddle/framework/tensor_impl.h b/paddle/framework/tensor_impl.h
index 6c6f298edc187a87677089e54c4c9046821282df..1340c5e48520ccdd537e694abf452fd79129df99 100644
--- a/paddle/framework/tensor_impl.h
+++ b/paddle/framework/tensor_impl.h
@@ -84,6 +84,8 @@ inline const T* Tensor::data() const {
reinterpret_cast<uintptr_t>(holder_->ptr()) + offset_);
}
+inline bool Tensor::IsInitialized() const { return holder_ != nullptr; }
+
template <typename T>
inline T* Tensor::data() {
check_memory_size();
diff --git a/paddle/framework/tensor_util.cc b/paddle/framework/tensor_util.cc
index 7efc649d0bcda67c663d148e83bcbb6789b0f371..a5b83eaa07ad25d39996f5644d6a7f3ed35ff7b2 100644
--- a/paddle/framework/tensor_util.cc
+++ b/paddle/framework/tensor_util.cc
@@ -69,7 +69,7 @@ struct AnyVisitor : public boost::static_visitor {
tmp.mutable_data<bool>(cpu);
auto gpuctx = platform::DeviceContextPool::Instance().Get(gpu);
gpuctx->Wait();
- CopyFrom(out, cpu, *gpuctx, &tmp);
+ Copy(out, cpu, *gpuctx, &tmp);
gpuctx->Wait();
return GetResult(tmp, cpu);
}
diff --git a/paddle/framework/tensor_util.h b/paddle/framework/tensor_util.h
index 6a21f8db1e3966fd23eee0da2346b2d61f9321fb..7c56ccf17f94e29d06f529629c47f61b93d2bd22 100644
--- a/paddle/framework/tensor_util.h
+++ b/paddle/framework/tensor_util.h
@@ -29,11 +29,11 @@ namespace framework {
* @param[in] dst_place The dst place.
* @param[in] ctx The device context contains device resources.
*
- * @note CopyFrom supports CPU <-> GPU, GPU <-> GPU.
+ * @note Copy supports CPU <-> GPU, GPU <-> GPU.
*/
-inline void CopyFrom(const Tensor& src, const platform::Place& dst_place,
- const platform::DeviceContext& ctx, Tensor* dst) {
+inline void Copy(const Tensor& src, const platform::Place& dst_place,
+ const platform::DeviceContext& ctx, Tensor* dst) {
src.check_memory_size();
dst->Resize(src.dims());
@@ -88,10 +88,10 @@ inline void CopyFrom(const Tensor& src, const platform::Place& dst_place,
}
/**
- * @brief CopyFrom support CPU <-> CPU
+ * @brief Copy supports CPU <-> CPU
*/
-inline void CopyFrom(const Tensor& src, const platform::Place& dst_place,
- Tensor* dst) {
+inline void Copy(const Tensor& src, const platform::Place& dst_place,
+ Tensor* dst) {
src.check_memory_size();
dst->Resize(src.dims());
dst->set_layout(src.layout());
@@ -270,7 +270,23 @@ inline void SerializeToStream(std::ostream& os, const Tensor& tensor,
}
}
-inline void DeserializeFromStream(std::istream& is, Tensor* tensor) {
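+// Visitor used with VisitDataType: allocates the tensor buffer with the element type selected at runtime.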
+struct DeserializedDataFunctor {
+ DeserializedDataFunctor(void** buf, Tensor* tensor,
+ const platform::Place& place)
+ : buf_(buf), tensor_(tensor), place_(place) {}
+
+ template <typename T>
+ void operator()() {
+   *buf_ = tensor_->mutable_data<T>(place_);
+ }
+
+ void** buf_;
+ Tensor* tensor_;
+ platform::Place place_;
+};
+
+inline void DeserializeFromStream(std::istream& is, Tensor* tensor,
+ const platform::DeviceContext& dev_ctx) {
uint32_t version;
is.read(reinterpret_cast<char*>(&version), sizeof(version));
PADDLE_ENFORCE_EQ(version, 0U, "Only version 0 is supported");
@@ -289,27 +305,28 @@ inline void DeserializeFromStream(std::istream& is, Tensor* tensor) {
dims.reserve(static_cast<size_t>(desc.dims().size()));
std::copy(desc.dims().begin(), desc.dims().end(), std::back_inserter(dims));
tensor->Resize(framework::make_ddim(dims));
-
void* buf;
- platform::Place cpu = platform::CPUPlace();
- // TODO(Yancey1989): use VisiterDataType instead of DataType switch
- switch (desc.data_type()) {
-    case proto::FP32:
-      buf = tensor->mutable_data<float>(cpu);
-      break;
-    case proto::FP64:
-      buf = tensor->mutable_data<double>(cpu);
-      break;
-    case proto::INT32:
-      buf = tensor->mutable_data<int>(cpu);
-      break;
-    case proto::INT64:
-      buf = tensor->mutable_data<int64_t>(cpu);
-      break;
- default:
- PADDLE_THROW("DataType %d not supported", desc.data_type());
+ auto ctx = platform::CPUDeviceContext();
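+ // For GPU targets, deserialize into a temporary CPU tensor first and then copy it to the device.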
+ if (platform::is_gpu_place(dev_ctx.GetPlace())) {
+#ifdef PADDLE_WITH_CUDA
+ Tensor cpu_tensor;
+ cpu_tensor.Resize(framework::make_ddim(dims));
+ framework::VisitDataType(
+ desc.data_type(),
+ DeserializedDataFunctor(&buf, &cpu_tensor, ctx.GetPlace()));
+ is.read(static_cast<char*>(buf), cpu_tensor.memory_size());
+ auto cpu_place = new platform::CPUPlace();
+ framework::Copy(cpu_tensor, *cpu_place, dev_ctx, tensor);
+ delete cpu_place;
+#else
+ PADDLE_THROW("Unexpected branch");
+#endif
+ } else {
+ framework::VisitDataType(
+ desc.data_type(),
+ DeserializedDataFunctor(&buf, tensor, ctx.GetPlace()));
+ is.read(static_cast<char*>(buf), tensor->memory_size());
}
- is.read(static_cast<char*>(buf), tensor->memory_size());
}
}
diff --git a/paddle/framework/tensor_util_test.cc b/paddle/framework/tensor_util_test.cc
index 0dc5166fcabf77b48b8681ab1f050e2bc88f44ab..3636125f2052200238ff82d4f708b62224322cdf 100644
--- a/paddle/framework/tensor_util_test.cc
+++ b/paddle/framework/tensor_util_test.cc
@@ -19,7 +19,7 @@
namespace paddle {
namespace framework {
-TEST(CopyFrom, Tensor) {
+TEST(Copy, Tensor) {
Tensor src_tensor;
Tensor dst_tensor;
platform::CPUDeviceContext cpu_ctx((platform::CPUPlace()));
@@ -32,7 +32,7 @@ TEST(CopyFrom, Tensor) {
src_tensor.set_layout(DataLayout::kAnyLayout);
auto cpu_place = new platform::CPUPlace();
- CopyFrom(src_tensor, *cpu_place, &dst_tensor);
+ Copy(src_tensor, *cpu_place, &dst_tensor);
const int* dst_ptr = dst_tensor.data<int>();
ASSERT_NE(src_ptr, dst_ptr);
@@ -43,7 +43,7 @@ TEST(CopyFrom, Tensor) {
EXPECT_TRUE(dst_tensor.layout() == src_tensor.layout());
Tensor slice_tensor = src_tensor.Slice(1, 2);
- CopyFrom(slice_tensor, *cpu_place, &dst_tensor);
+ Copy(slice_tensor, *cpu_place, &dst_tensor);
const int* slice_ptr = slice_tensor.data<int>();
dst_ptr = dst_tensor.data<int>();
ASSERT_NE(dst_ptr, slice_ptr);
@@ -67,11 +67,11 @@ TEST(CopyFrom, Tensor) {
// CPU Tensor to GPU Tensor
auto gpu_place = new platform::CUDAPlace(0);
platform::CUDADeviceContext gpu_ctx(*gpu_place);
- CopyFrom(src_tensor, *gpu_place, gpu_ctx, &gpu_tensor);
+ Copy(src_tensor, *gpu_place, gpu_ctx, &gpu_tensor);
// GPU Tensor to CPU Tensor
auto cpu_place = new platform::CPUPlace();
- CopyFrom(gpu_tensor, *cpu_place, gpu_ctx, &dst_tensor);
+ Copy(gpu_tensor, *cpu_place, gpu_ctx, &dst_tensor);
// Sync before Compare Tensors
gpu_ctx.Wait();
@@ -84,10 +84,10 @@ TEST(CopyFrom, Tensor) {
Tensor slice_tensor = src_tensor.Slice(1, 2);
// CPU Slice Tensor to GPU Tensor
- CopyFrom(slice_tensor, *gpu_place, gpu_ctx, &gpu_tensor);
+ Copy(slice_tensor, *gpu_place, gpu_ctx, &gpu_tensor);
// GPU Tensor to CPU Tensor
- CopyFrom(gpu_tensor, *cpu_place, gpu_ctx, &dst_tensor);
+ Copy(gpu_tensor, *cpu_place, gpu_ctx, &dst_tensor);
// Sync before Compare Slice Tensors
gpu_ctx.Wait();
@@ -155,7 +155,7 @@ TEST(CopyFromVector, Tensor) {
CUDADeviceContext gpu_ctx(*gpu_place);
CopyFromVector(src_vec, gpu_ctx, &gpu_tensor);
// Copy from GPU to CPU tensor for comparison
- CopyFrom(gpu_tensor, *cpu_place, gpu_ctx, &dst_tensor);
+ Copy(gpu_tensor, *cpu_place, gpu_ctx, &dst_tensor);
// Sync before Compare Tensors
gpu_ctx.Wait();
@@ -175,7 +175,7 @@ TEST(CopyFromVector, Tensor) {
CopyFromVector(src_vec, cpu_ctx, &cpu_tensor);
gpu_tensor.Resize(make_ddim({2, 2}));
CopyFromVector(src_vec, gpu_ctx, &gpu_tensor);
- CopyFrom(gpu_tensor, *cpu_place, gpu_ctx, &dst_tensor);
+ Copy(gpu_tensor, *cpu_place, gpu_ctx, &dst_tensor);
// Sync before Compare Tensors
gpu_ctx.Wait();
@@ -270,11 +270,12 @@ TEST(Tensor, SerializeAndDeserialize) {
SerializeToStream(oss, src_tensor, cpu_ctx);
std::istringstream iss(oss.str());
- DeserializeFromStream(iss, &dst_tensor);
+ DeserializeFromStream(iss, &dst_tensor, cpu_ctx);
int* dst_ptr = dst_tensor.mutable_data<int>(platform::CPUPlace());
for (int i = 0; i < 5; ++i) {
ASSERT_EQ(dst_ptr[i], array[i]);
}
+ ASSERT_EQ(dst_tensor.dims(), src_tensor.dims());
delete place;
}
#ifdef PADDLE_WITH_CUDA
@@ -286,19 +287,18 @@ TEST(Tensor, SerializeAndDeserialize) {
auto gpu_place = new platform::CUDAPlace();
platform::CUDADeviceContext gpu_ctx(*gpu_place);
- CopyFrom(src_tensor, *gpu_place, gpu_ctx, &gpu_tensor);
+ Copy(src_tensor, *gpu_place, gpu_ctx, &gpu_tensor);
std::ostringstream oss;
SerializeToStream(oss, gpu_tensor, gpu_ctx);
std::istringstream iss(oss.str());
- DeserializeFromStream(iss, &dst_tensor);
+ DeserializeFromStream(iss, &dst_tensor, gpu_ctx);
int* dst_ptr = dst_tensor.mutable_data<int>(platform::CPUPlace());
for (int i = 0; i < 6; ++i) {
ASSERT_EQ(dst_ptr[i], array[i]);
}
-
delete gpu_place;
}
#endif
diff --git a/paddle/framework/threadpool.h b/paddle/framework/threadpool.h
index bcd8190755083ec30687675602a1c95a9c15c69e..3ac345851c38557f82698786dd3bc8e1202a4256 100644
--- a/paddle/framework/threadpool.h
+++ b/paddle/framework/threadpool.h
@@ -29,7 +29,6 @@ namespace framework {
class ThreadPool {
public:
typedef std::packaged_task<void()> Task;
-  typedef std::function<void()> Fun;
/**
* @brief Get a instance of threadpool, the thread number will
@@ -67,7 +66,8 @@ class ThreadPool {
* @return std::future, we could wait for the task finished by
* f.wait().
*/
-  std::future<void> Run(const Fun& fn) {
+  template <typename Callback>
+  std::future<void> Run(Callback fn) {
std::unique_lock<std::mutex> lock(mutex_);
Task task(std::bind(fn));
std::future<void> f = task.get_future();
@@ -159,5 +159,13 @@ class ThreadPool {
std::condition_variable completed_;
};
+// Run a function asynchronously.
+// NOTE: The function must return void. If the function needs to return a value,
+// you can use lambda to capture a value pointer.
+template <typename Callback>
+std::future<void> Async(Callback callback) {
+ return ThreadPool::GetInstance()->Run(callback);
+}
+
} // namespace framework
} // namespace paddle
diff --git a/paddle/framework/var_desc.cc b/paddle/framework/var_desc.cc
index 7d002b9ea0b597730685ee03b021c4982f787f49..aeab18d7214f8d9dd79bc3d2e0322490445b3b49 100644
--- a/paddle/framework/var_desc.cc
+++ b/paddle/framework/var_desc.cc
@@ -52,7 +52,7 @@ void VarDesc::SetLoDLevel(int32_t lod_level) {
}
}
-int32_t VarDesc::GetLodLevel() const {
+int32_t VarDesc::GetLoDLevel() const {
switch (desc_.type()) {
case proto::VarDesc::LOD_TENSOR:
return desc_.lod_tensor().lod_level();
diff --git a/paddle/framework/var_desc.h b/paddle/framework/var_desc.h
index 4fd2abe7fb215c3ac454de3e30754685111eb570..fc482c467404a6b9dfed64c43871d91d3d10c766 100644
--- a/paddle/framework/var_desc.h
+++ b/paddle/framework/var_desc.h
@@ -76,7 +76,7 @@ class VarDesc {
void SetLoDLevel(int32_t lod_level);
- int32_t GetLodLevel() const;
+ int32_t GetLoDLevel() const;
proto::VarDesc::VarType GetType() const;
diff --git a/paddle/framework/var_type.h b/paddle/framework/var_type.h
index 0e6ea8dc69fe9b7fdaa1163b8d63295624cd3abc..5b7a08a08732a6ccbc206f6a4f0aa4788ce4a219 100644
--- a/paddle/framework/var_type.h
+++ b/paddle/framework/var_type.h
@@ -17,6 +17,8 @@ limitations under the License. */
#include "paddle/framework/lod_rank_table.h"
#include "paddle/framework/lod_tensor.h"
#include "paddle/framework/lod_tensor_array.h"
+#include "paddle/framework/selected_rows.h"
+#include "paddle/framework/variable.h"
namespace paddle {
namespace framework {
@@ -35,7 +37,7 @@ inline proto::VarDesc::VarType ToVarType(std::type_index type) {
}
template <typename Visitor>
-inline void VisitVarType(const Variable& var, Visitor visitor) {
+inline void VisitVarType(const framework::Variable& var, Visitor visitor) {
switch (ToVarType(var.Type())) {
case proto::VarDesc_VarType_LOD_TENSOR:
visitor(var.Get<LoDTensor>());
diff --git a/paddle/framework/variable.h b/paddle/framework/variable.h
index e5a94759f9230ab4ce9d2cc24849a2debb8a5e2f..36b76fb196cfd4c7b3697dcf0cda9a23ff53deb3 100644
--- a/paddle/framework/variable.h
+++ b/paddle/framework/variable.h
@@ -32,6 +32,8 @@ class Variable {
return *static_cast<const T*>(holder_->Ptr());
}
+ bool IsInitialized() const { return holder_ != nullptr; }
+
template <typename T>
T* GetMutable() {
if (!IsType<T>()) {
diff --git a/paddle/gserver/gradientmachines/GradientMachine.h b/paddle/gserver/gradientmachines/GradientMachine.h
index ebfe0573cfdbfb2ef54a29b038e8b85356cc6c27..4ab54a5022a3d30215c7557bca2e69c4d011fc5f 100644
--- a/paddle/gserver/gradientmachines/GradientMachine.h
+++ b/paddle/gserver/gradientmachines/GradientMachine.h
@@ -233,6 +233,13 @@ public:
(void)numProcessed;
}
+ /**
+ * @brief Release the middle layer's output memory.
+ *
+ * @note This function is used for memory optimization in inference.
+ */
+ virtual void releaseOutput() {}
+
protected:
virtual void onLoadParameter() {}
diff --git a/paddle/gserver/gradientmachines/NeuralNetwork.cpp b/paddle/gserver/gradientmachines/NeuralNetwork.cpp
index 68bf37d59db65ddc8096e2db3391be25c37b57e6..1f2aa61b6f86eebf7b002f1e48aa56d1d14d4820 100644
--- a/paddle/gserver/gradientmachines/NeuralNetwork.cpp
+++ b/paddle/gserver/gradientmachines/NeuralNetwork.cpp
@@ -187,6 +187,31 @@ void NeuralNetwork::init(const ModelConfig& config,
CHECK(it != layerMap_.end());
outputLayers_.push_back(it->second);
}
+
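+ // Record every layer that is neither a data layer nor an output layer, so releaseOutput() can free its output during inference.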
+ for (const auto& layer : layers_) {
+ const auto& name = layer->getName();
+ bool isMiddleLayer = true;
+
+ // if data layer
+ for (const auto& dataLayer : dataLayers_) {
+ if (name == dataLayer->getName()) {
+ isMiddleLayer = false;
+ break;
+ }
+ }
+
+ // if output layer
+ for (const auto& dataLayer : outputLayers_) {
+ if (name == dataLayer->getName()) {
+ isMiddleLayer = false;
+ break;
+ }
+ }
+
+ if (isMiddleLayer) {
+ middleLayers_.push_back(layer);
+ }
+ }
}
void NeuralNetwork::connect(LayerPtr agentLayer,
@@ -327,6 +352,13 @@ void NeuralNetwork::onPassEnd() {
}
}
+void NeuralNetwork::releaseOutput() {
+ for (auto& layer : middleLayers_) {
+ Argument& arg = layer->getOutput();
+ arg.value.reset();
+ }
+}
+
#ifndef PADDLE_MOBILE_INFERENCE
class CombinedEvaluator : public Evaluator {
diff --git a/paddle/gserver/gradientmachines/NeuralNetwork.h b/paddle/gserver/gradientmachines/NeuralNetwork.h
index 6888380290074318fe7f94d168b2931e776dda47..968e198cf6608c47089d40fc98c6691f9e5bda5c 100644
--- a/paddle/gserver/gradientmachines/NeuralNetwork.h
+++ b/paddle/gserver/gradientmachines/NeuralNetwork.h
@@ -137,6 +137,13 @@ public:
/// some finish work, like convert the weight format of MKLDNNLayers
void finish();
+ /**
+ * @brief Release the middle layer's output memory.
+ *
+ * @note This function is used for memory optimization in inference.
+ */
+ void releaseOutput();
+
protected:
/**
* The constructor of NeuralNetwork.
@@ -158,6 +165,7 @@ protected:
std::vector<DataLayerPtr> dataLayers_;
std::vector<LayerPtr> outputLayers_;
+ std::vector<LayerPtr> middleLayers_;
static std::map dllInitMap;
diff --git a/paddle/gserver/tests/test_LayerGrad.cpp b/paddle/gserver/tests/test_LayerGrad.cpp
index a2f07937b8834e3f3fa7a6bf2ae10f29a8d84f29..ba83667ebc9a89c37f77a7f71e6df90b54723cc0 100644
--- a/paddle/gserver/tests/test_LayerGrad.cpp
+++ b/paddle/gserver/tests/test_LayerGrad.cpp
@@ -1472,7 +1472,8 @@ TEST(Layer, RecurrentLayer) {
for (auto reversed : {false, true}) {
config.layerConfig.set_reversed(reversed);
config.testState = !reversed;
- testLayerGrad(config, "recurrent", 50, /* trans= */ false, useGpu);
+ testLayerGrad(
+ config, "recurrent", 50, /* trans= */ false, useGpu, false, 1.0);
}
}
}
@@ -1494,7 +1495,8 @@ TEST(Layer, LstmLayer) {
for (auto reversed : {false, true}) {
config.layerConfig.set_reversed(reversed);
config.testState = !reversed;
- testLayerGrad(config, "lstmemory", 100, /* trans= */ false, useGpu);
+ testLayerGrad(
+ config, "lstmemory", 100, /* trans= */ false, useGpu, false, 0.02);
}
}
for (auto useGpu : {true}) {
diff --git a/paddle/inference/CMakeLists.txt b/paddle/inference/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..8437b2b21942ead544dab8636db1b355b7cf7bd5
--- /dev/null
+++ b/paddle/inference/CMakeLists.txt
@@ -0,0 +1,47 @@
+set(FLUID_CORE_MODULES
+ backward proto_desc paddle_memory executor prune init ${GLOB_OP_LIB})
+
+cc_library(paddle_fluid_api
+ SRCS inference.cc
+ DEPS ${FLUID_CORE_MODULES})
+
+# Merge all modules into a single static library
+cc_library(paddle_fluid DEPS paddle_fluid_api ${FLUID_CORE_MODULES})
+
+# ptools
+# just for testing, we may need to change the storing format for inference_model
+# and remove the dependency on pickle.
+# download from http://www.picklingtools.com/
+# build in the C++ sub-directory, using command
+# make -f Makefile.Linux libptools.so
+set(PTOOLS_LIB)
+set(PTOOLS_ROOT $ENV{PTOOLS_ROOT} CACHE PATH "Folder contains PicklingTools")
+find_path(PTOOLS_INC_DIR chooseser.h PATHS ${PTOOLS_ROOT}/C++)
+find_library(PTOOLS_SHARED_LIB NAMES ptools PATHS ${PTOOLS_ROOT}/C++)
+if(PTOOLS_INC_DIR AND PTOOLS_SHARED_LIB)
+ add_definitions(-DPADDLE_USE_PTOOLS)
+ set(PTOOLS_LIB ptools)
+ message(STATUS "Found PicklingTools: ${PTOOLS_SHARED_LIB}")
+ add_library(${PTOOLS_LIB} SHARED IMPORTED GLOBAL)
+ set_property(TARGET ${PTOOLS_LIB} PROPERTY IMPORTED_LOCATION ${PTOOLS_SHARED_LIB})
+ include_directories(${PTOOLS_ROOT}/C++)
+ include_directories(${PTOOLS_ROOT}/C++/opencontainers_1_8_5/include)
+ add_definitions(-DOC_NEW_STYLE_INCLUDES) # used in ptools
+endif()
+
+add_executable(example example.cc)
+if(APPLE)
+ set(OPTIONAL_LINK_FLAGS)
+ if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang" OR "${CMAKE_CXX_COMPILER_ID}" STREQUAL "AppleClang")
+ set(OPTIONAL_LINK_FLAGS "-undefined dynamic_lookup")
+ endif()
+ target_link_libraries(example
+ -Wl,-force_load paddle_fluid
+ ${OPTIONAL_LINK_FLAGS}
+ ${PTOOLS_LIB})
+else()
+ target_link_libraries(example
+ -Wl,--start-group -Wl,--whole-archive paddle_fluid
+ -Wl,--no-whole-archive -Wl,--end-group
+ ${PTOOLS_LIB})
+endif()
diff --git a/paddle/inference/example.cc b/paddle/inference/example.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9711b20e6fb4099a2cc497029468ebd1fd0b3456
--- /dev/null
+++ b/paddle/inference/example.cc
@@ -0,0 +1,79 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <time.h>
+#include <iostream>
+#include "gflags/gflags.h"
+#include "paddle/inference/inference.h"
+
+DEFINE_string(dirname, "", "Directory of the inference model.");
+DEFINE_string(feed_var_names, "", "Names of feeding variables");
+DEFINE_string(fetch_var_names, "", "Names of fetching variables");
+
+int main(int argc, char** argv) {
+ google::ParseCommandLineFlags(&argc, &argv, true);
+ if (FLAGS_dirname.empty() || FLAGS_feed_var_names.empty() ||
+ FLAGS_fetch_var_names.empty()) {
+ // Example:
+ // ./example --dirname=recognize_digits_mlp.inference.model
+ // --feed_var_names="x"
+ // --fetch_var_names="fc_2.tmp_2"
+ std::cout << "Usage: ./example --dirname=path/to/your/model "
+ "--feed_var_names=x --fetch_var_names=y"
+ << std::endl;
+ exit(1);
+ }
+
+ std::cout << "FLAGS_dirname: " << FLAGS_dirname << std::endl;
+ std::cout << "FLAGS_feed_var_names: " << FLAGS_feed_var_names << std::endl;
+ std::cout << "FLAGS_fetch_var_names: " << FLAGS_fetch_var_names << std::endl;
+
+ std::string dirname = FLAGS_dirname;
+ std::vector<std::string> feed_var_names = {FLAGS_feed_var_names};
+ std::vector<std::string> fetch_var_names = {FLAGS_fetch_var_names};
+
+ paddle::InferenceEngine* engine = new paddle::InferenceEngine();
+ engine->LoadInferenceModel(dirname, feed_var_names, fetch_var_names);
+
+ paddle::framework::LoDTensor input;
+ srand(time(0));
+ float* input_ptr =
+     input.mutable_data<float>({1, 784}, paddle::platform::CPUPlace());
+ for (int i = 0; i < 784; ++i) {
+   input_ptr[i] = rand() / (static_cast<float>(RAND_MAX));
+ }
+
+ std::vector<paddle::framework::LoDTensor> feeds;
+ feeds.push_back(input);
+ std::vector<paddle::framework::LoDTensor> fetchs;
+ engine->Execute(feeds, fetchs);
+
+ for (size_t i = 0; i < fetchs.size(); ++i) {
+ auto dims_i = fetchs[i].dims();
+ std::cout << "dims_i:";
+ for (int j = 0; j < dims_i.size(); ++j) {
+ std::cout << " " << dims_i[j];
+ }
+ std::cout << std::endl;
+ std::cout << "result:";
+   float* output_ptr = fetchs[i].data<float>();
+ for (int j = 0; j < paddle::framework::product(dims_i); ++j) {
+ std::cout << " " << output_ptr[j];
+ }
+ std::cout << std::endl;
+ }
+
+ delete engine;
+ return 0;
+}
diff --git a/paddle/inference/inference.cc b/paddle/inference/inference.cc
new file mode 100644
index 0000000000000000000000000000000000000000..49e39358e81bbee64a618be88ee0fca6aa438b93
--- /dev/null
+++ b/paddle/inference/inference.cc
@@ -0,0 +1,195 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "inference.h"
+#include <fstream>
+#include "paddle/framework/executor.h"
+#include "paddle/framework/feed_fetch_method.h"
+#include "paddle/framework/init.h"
+#include "paddle/framework/scope.h"
+
+#ifdef PADDLE_USE_PTOOLS
+#include "chooseser.h"
+#endif
+
+namespace paddle {
+
+void InferenceEngine::LoadInferenceModel(
+ const std::string& dirname,
+ const std::vector<std::string>& feed_var_names,
+ const std::vector<std::string>& fetch_var_names) {
+#ifdef PADDLE_USE_PTOOLS
+ std::string model_filename = dirname + "/__model__";
+ LOG(INFO) << "Using PicklingTools, loading model from " << model_filename;
+ Val v;
+ LoadValFromFile(model_filename.c_str(), v, SERIALIZE_P0);
+ std::string program_desc_str = v["program_desc_str"];
+ LOG(INFO) << "program_desc_str's size: " << program_desc_str.size();
+// PicklingTools cannot parse the vector of strings correctly.
+#else
+ std::string model_filename = dirname + "/__model__.dat";
+ LOG(INFO) << "loading model from " << model_filename;
+ std::ifstream inputfs(model_filename, std::ios::in | std::ios::binary);
+ std::string program_desc_str;
+ inputfs.seekg(0, std::ios::end);
+ program_desc_str.resize(inputfs.tellg());
+ inputfs.seekg(0, std::ios::beg);
+ LOG(INFO) << "program_desc_str's size: " << program_desc_str.size();
+ inputfs.read(&program_desc_str[0], program_desc_str.size());
+ inputfs.close();
+#endif
+ program_ = new framework::ProgramDesc(program_desc_str);
+ GenerateLoadProgram(dirname);
+
+ if (feed_var_names.empty() || fetch_var_names.empty()) {
+ LOG(FATAL) << "Please specify the feed_var_names and fetch_var_names.";
+ }
+ feed_var_names_ = feed_var_names;
+ fetch_var_names_ = fetch_var_names;
+ PrependFeedOp();
+ AppendFetchOp();
+}
+
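+// A variable is treated as a parameter only if it is persistable and is
+// consumed as an input by at least one op in the inference program.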
+bool InferenceEngine::IsParameter(const framework::VarDesc* var) {
+ if (var->Persistable()) {
+ // There are many unreachable variables in the program
+ for (size_t i = 0; i < program_->Size(); ++i) {
+ const framework::BlockDesc& block = program_->Block(i);
+ for (auto* op : block.AllOps()) {
+ for (auto input_argument_name : op->InputArgumentNames()) {
+ if (input_argument_name == var->Name()) {
+ return true;
+ }
+ }
+ }
+ }
+ }
+ return false;
+}
+
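+// Builds a separate "load program" containing one load op per parameter, so
+// that running it once restores all weights from files under `dirname`.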
+void InferenceEngine::GenerateLoadProgram(const std::string& dirname) {
+ framework::BlockDesc* global_block = program_->MutableBlock(0);
+
+ load_program_ = new framework::ProgramDesc();
+ framework::BlockDesc* load_block = load_program_->MutableBlock(0);
+ for (auto* var : global_block->AllVars()) {
+ if (IsParameter(var)) {
+ LOG(INFO) << "parameter's name: " << var->Name();
+
+ framework::VarDesc* new_var = load_block->Var(var->Name());
+ new_var->SetShape(var->Shape());
+ new_var->SetDataType(var->GetDataType());
+ new_var->SetType(var->GetType());
+ new_var->SetLoDLevel(var->GetLoDLevel());
+ new_var->SetPersistable(true);
+
+ // append_op
+ framework::OpDesc* op = load_block->AppendOp();
+ op->SetType("load");
+ op->SetOutput("Out", {new_var->Name()});
+ op->SetAttr("file_path", {dirname + "/" + new_var->Name()});
+ op->CheckAttrs();
+ }
+ }
+}
+
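+// Prepends one feed op per input: column i of the global "feed" variable is
+// copied into the variable named feed_var_names_[i].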
+void InferenceEngine::PrependFeedOp() {
+ if (!program_) {
+ LOG(FATAL) << "Please initialize the program_ first.";
+ }
+
+ framework::BlockDesc* global_block = program_->MutableBlock(0);
+
+ // create_var
+ framework::VarDesc* feed_var = global_block->Var("feed");
+ feed_var->SetType(framework::proto::VarDesc::FEED_MINIBATCH);
+ feed_var->SetPersistable(true);
+
+ // prepend feed_op
+ for (size_t i = 0; i < feed_var_names_.size(); ++i) {
+ std::string var_name = feed_var_names_[i];
+ LOG(INFO) << "feed var's name: " << var_name;
+
+ // prepend_op
+ framework::OpDesc* op = global_block->PrependOp();
+ op->SetType("feed");
+ op->SetInput("X", {"feed"});
+ op->SetOutput("Out", {var_name});
+ op->SetAttr("col", {static_cast(i)});
+ op->CheckAttrs();
+ }
+}
+
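+// Appends one fetch op per output: the variable named fetch_var_names_[i] is
+// copied into column i of the global "fetch" variable.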
+void InferenceEngine::AppendFetchOp() {
+ if (!program_) {
+ LOG(FATAL) << "Please initialize the program_ first.";
+ }
+
+ framework::BlockDesc* global_block = program_->MutableBlock(0);
+
+ // create_var
+ framework::VarDesc* fetch_var = global_block->Var("fetch");
+ fetch_var->SetType(framework::proto::VarDesc::FETCH_LIST);
+ fetch_var->SetPersistable(true);
+
+ // append fetch_op
+ for (size_t i = 0; i < fetch_var_names_.size(); ++i) {
+ std::string var_name = fetch_var_names_[i];
+ LOG(INFO) << "fetch var's name: " << var_name;
+
+ // append_op
+ framework::OpDesc* op = global_block->AppendOp();
+ op->SetType("fetch");
+ op->SetInput("X", {var_name});
+ op->SetOutput("Out", {"fetch"});
+ op->SetAttr("col", {static_cast(i)});
+ op->CheckAttrs();
+ }
+}
+
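+// Runs the load program once to initialize parameters, feeds the inputs,
+// runs the inference program, and collects the fetched outputs.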
+void InferenceEngine::Execute(const std::vector<framework::LoDTensor>& feeds,
+ std::vector<framework::LoDTensor>& fetchs) {
+ if (!program_ || !load_program_) {
+ LOG(FATAL) << "Please initialize the program_ and load_program_ first.";
+ }
+
+ if (feeds.size() < feed_var_names_.size()) {
+ LOG(FATAL) << "Please feed " << feed_var_names_.size() << " input Tensors.";
+ }
+
+ auto* place = new platform::CPUPlace();
+ framework::InitDevices({"CPU"});
+ framework::Executor* executor = new framework::Executor(*place);
+ framework::Scope* scope = new framework::Scope();
+
+ executor->Run(*load_program_, scope, 0, true, true);
+
+ // set_feed_variable
+ for (size_t i = 0; i < feed_var_names_.size(); ++i) {
+ framework::SetFeedVariable(scope, feeds[i], "feed", i);
+ }
+
+ executor->Run(*program_, scope, 0, true, true);
+
+ // get_fetch_variable
+ fetchs.resize(fetch_var_names_.size());
+ for (size_t i = 0; i < fetch_var_names_.size(); ++i) {
+ fetchs[i] = framework::GetFetchVariable(*scope, "fetch", i);
+ }
+
+ delete place;
+ delete scope;
+ delete executor;
+}
+} // namespace paddle
diff --git a/paddle/inference/inference.h b/paddle/inference/inference.h
new file mode 100644
index 0000000000000000000000000000000000000000..a3f3ef4b440036a0b27353cc092eed1bbf96eeb3
--- /dev/null
+++ b/paddle/inference/inference.h
@@ -0,0 +1,50 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/block_desc.h"
+#include "paddle/framework/lod_tensor.h"
+#include "paddle/framework/program_desc.h"
+
+namespace paddle {
+
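+// Minimal usage sketch (see paddle/inference/example.cc):
+//   paddle::InferenceEngine* engine = new paddle::InferenceEngine();
+//   engine->LoadInferenceModel(dirname, {"x"}, {"fc_2.tmp_2"});
+//   engine->Execute(feeds, fetchs);
+//   delete engine;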
+class InferenceEngine {
+public:
+ InferenceEngine() : program_(nullptr), load_program_(nullptr) {}
+ ~InferenceEngine() {
+ delete program_;
+ delete load_program_;
+ }
+
+ void LoadInferenceModel(const std::string& dirname,
+ const std::vector<std::string>& feed_var_names,
+ const std::vector<std::string>& fetch_var_names);
+ void Execute(const std::vector<framework::LoDTensor>& feeds,
+ std::vector<framework::LoDTensor>& fetchs);
+
+private:
+ bool IsParameter(const framework::VarDesc* var);
+ void GenerateLoadProgram(const std::string& dirname);
+ void PrependFeedOp();
+ void AppendFetchOp();
+
+private:
+ framework::ProgramDesc* program_;
+ framework::ProgramDesc* load_program_;
+ std::vector<std::string> feed_var_names_;
+ std::vector<std::string> fetch_var_names_;
+};
+
+} // namespace paddle
diff --git a/paddle/memory/memory.cc b/paddle/memory/memory.cc
index c4bb6baee7ebf2941cee5915ca2723c298689261..1a73a94567e45b81a0b148965a834f03c7407ffe 100644
--- a/paddle/memory/memory.cc
+++ b/paddle/memory/memory.cc
@@ -114,5 +114,21 @@ void Free(platform::CUDAPlace place, void* p) {
#endif
+size_t Usage::operator()(const platform::CPUPlace& cpu) const {
+ return Used(cpu);
+}
+
+size_t Usage::operator()(const platform::CUDAPlace& gpu) const {
+#ifdef PADDLE_WITH_CUDA
+ return Used(gpu);
+#else
+ PADDLE_THROW("'CUDAPlace' is not supported in CPU only device.");
+#endif
+}
+
+size_t memory_usage(const platform::Place& p) {
+ return boost::apply_visitor(Usage(), p);
+}
+
} // namespace memory
} // namespace paddle
diff --git a/paddle/memory/memory.h b/paddle/memory/memory.h
index 11bbb881874ec50e1132547336fc6fb6b42bcc4f..7012b6d331d0c4631a3d120fbaf3db7c97298ac7 100644
--- a/paddle/memory/memory.h
+++ b/paddle/memory/memory.h
@@ -54,6 +54,13 @@ void Free(Place place, void* ptr);
template <typename Place>
size_t Used(Place place);
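+// Visitor used by memory_usage() to dispatch a type-erased platform::Place to
+// the matching Used() overload above.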
+struct Usage : public boost::static_visitor<size_t> {
+ size_t operator()(const platform::CPUPlace& cpu) const;
+ size_t operator()(const platform::CUDAPlace& gpu) const;
+};
+
+size_t memory_usage(const platform::Place& p);
+
/**
* \brief Free memory block in one place.
*
diff --git a/paddle/memory/memory_test.cc b/paddle/memory/memory_test.cc
index f476bf71264da59a5c546968f4689145e1d8801b..b3f699f9b7eff54c06ff69023db082380c83467a 100644
--- a/paddle/memory/memory_test.cc
+++ b/paddle/memory/memory_test.cc
@@ -44,6 +44,9 @@ TEST(BuddyAllocator, CPUAllocation) {
EXPECT_NE(p, nullptr);
+ paddle::platform::Place place = cpu;
+ EXPECT_EQ(paddle::memory::Used(cpu), paddle::memory::memory_usage(place));
+
paddle::memory::Free(cpu, p);
}
@@ -99,6 +102,9 @@ TEST(BuddyAllocator, GPUAllocation) {
EXPECT_NE(p, nullptr);
+ paddle::platform::Place place = gpu;
+ EXPECT_EQ(paddle::memory::Used(gpu), paddle::memory::memory_usage(place));
+
paddle::memory::Free(gpu, p);
}
diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt
index 0b22b85c382f419b660b4eb67daa21e079d75e0e..055162fbd6deee8f49830425821413f4fef1169f 100644
--- a/paddle/operators/CMakeLists.txt
+++ b/paddle/operators/CMakeLists.txt
@@ -61,106 +61,28 @@ function(op_library TARGET)
${op_common_deps})
endif()
- # net_op doesn't need pybind
- if ("${TARGET}" STREQUAL "net_op")
- set(pybind_flag 1)
- endif()
-
- if ("${TARGET}" STREQUAL "compare_op")
- set(pybind_flag 1)
- file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(equal);\n")
- endif()
-
- # conv_op contains several operators
- if ("${TARGET}" STREQUAL "conv_op")
- set(pybind_flag 1)
- # It's enough to just adding one operator to pybind
- file(APPEND ${pybind_file} "USE_OP(conv2d);\n")
- endif()
-
- # conv_cudnn_op contains several operators
- if ("${TARGET}" STREQUAL "conv_cudnn_op")
- set(pybind_flag 1)
- # It's enough to just adding one operator to pybind
- file(APPEND ${pybind_file} "USE_OP(conv2d_cudnn);\n")
- endif()
-
- # pool_op contains several operators
- if ("${TARGET}" STREQUAL "pool_op")
- set(pybind_flag 1)
- # It's enough to just adding one operator to pybind
- file(APPEND ${pybind_file} "USE_OP(pool2d);\n")
- endif()
-
- # pool_cudnn_op contains several operators
- if ("${TARGET}" STREQUAL "pool_cudnn_op")
- set(pybind_flag 1)
- # It's enough to just adding one operator to pybind
- file(APPEND ${pybind_file} "USE_OP(pool2d_cudnn);\n")
- endif()
-
- if ("${TARGET}" STREQUAL "logical_op")
- set(pybind_flag 1)
- file(APPEND ${pybind_file} "USE_OP(logical_and);\n")
- endif()
-
- # pool_with_index_op contains several operators
- if ("${TARGET}" STREQUAL "pool_with_index_op")
- set(pybind_flag 1)
- # It's enough to just adding one operator to pybind
- file(APPEND ${pybind_file} "USE_OP(max_pool2d_with_index);\n")
- endif()
-
- # conv_transpose_op contains several operators
- if ("${TARGET}" STREQUAL "conv_transpose_op")
- set(pybind_flag 1)
- # It's enough to just adding one operator to pybind
- file(APPEND ${pybind_file} "USE_OP(conv2d_transpose);\n")
- endif()
-
- # conv_transpose_cudnn_op contains two operators
- if ("${TARGET}" STREQUAL "conv_transpose_cudnn_op")
- set(pybind_flag 1)
- # It's enough to just adding one operator to pybind
- file(APPEND ${pybind_file} "USE_OP(conv2d_transpose_cudnn);\n")
- endif()
-
- # save_restore_op contains several operators
- if ("${TARGET}" STREQUAL "save_restore_op")
- set(pybind_flag 1)
- # It's enough to just adding one operator to pybind
- file(APPEND ${pybind_file} "USE_NO_KERNEL_OP(save);\n")
- endif()
-
- # activation_op contains several operators
- if ("${TARGET}" STREQUAL "activation_op")
- set(pybind_flag 1)
- # It's enough to just adding one operator to pybind
- file(APPEND ${pybind_file} "USE_OP(sigmoid);\n")
- endif()
-
- # nccl_op contains several operators
- if ("${TARGET}" STREQUAL "nccl_op")
- set(pybind_flag 1)
- # It's enough to just adding one operator to pybind
- file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(ncclAllReduce);\n")
- endif()
-
- # reduce_op contains several operators
- if ("${TARGET}" STREQUAL "reduce_op")
- set(pybind_flag 1)
- # It's enough to just adding one operator to pybind
- file(APPEND ${pybind_file} "USE_OP(reduce_sum);\n")
- endif()
+ # Define operators that don't need pybind here.
+ foreach(manual_pybind_op "net_op" "compare_op" "logical_op" "nccl_op" "tensor_array_read_write_op")
+ if ("${TARGET}" STREQUAL "${manual_pybind_op}")
+ set(pybind_flag 1)
+ endif()
+ endforeach()
- if ("${TARGET}" STREQUAL "tensor_array_read_write_op")
- set(pybind_flag 1)
- file(APPEND ${pybind_file} "USE_NO_KERNEL_OP(read_from_array);\nUSE_NO_KERNEL_OP(write_to_array);\n")
+ # For the registration semantics of USE_OP, please refer to paddle/framework/op_registry.h.
+ # Note that it is enough to add just one operator from a *_op.cc file to pybind.
+ # For the detailed pybind information, please see the generated paddle/pybind/pybind.h.
+ file(READ ${TARGET}.cc TARGET_CONTENT)
+ string(REGEX MATCH "REGISTER_OP\\(.*REGISTER_OP\\(" multi_register "${TARGET_CONTENT}")
+ string(REGEX MATCH "REGISTER_OP\\([a-z0-9_]*," one_register "${multi_register}")
+ if (one_register STREQUAL "")
+ string(REPLACE "_op" "" TARGET "${TARGET}")
+ else ()
+ string(REPLACE "REGISTER_OP(" "" TARGET "${one_register}")
+ string(REPLACE "," "" TARGET "${TARGET}")
endif()
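+ # For instance, conv_op.cc registers more than one operator; its first
+ # "REGISTER_OP(conv2d," match leaves TARGET as "conv2d", matching the manual
+ # USE_OP(conv2d) rule removed above. Files with a single REGISTER_OP simply
+ # have their "_op" suffix stripped.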
# pybind USE_NO_KERNEL_OP
# HACK: if REGISTER_OP_CPU_KERNEL presents the operator must have kernel
- file(READ ${TARGET}.cc TARGET_CONTENT)
string(REGEX MATCH "REGISTER_OP_CPU_KERNEL" regex_result "${TARGET_CONTENT}")
string(REPLACE "_op" "" TARGET "${TARGET}")
if (${pybind_flag} EQUAL 0 AND regex_result STREQUAL "")
@@ -171,7 +93,6 @@ function(op_library TARGET)
# pybind USE_CPU_ONLY_OP
list(LENGTH cu_srcs cu_srcs_len)
list(LENGTH cu_cc_srcs cu_cc_srcs_len)
-
if (${pybind_flag} EQUAL 0 AND ${cu_srcs_len} EQUAL 0 AND ${cu_cc_srcs_len} EQUAL 0)
file(APPEND ${pybind_file} "USE_CPU_ONLY_OP(${TARGET});\n")
set(pybind_flag 1)
@@ -188,6 +109,7 @@ add_subdirectory(nccl)
if(WITH_GPU)
op_library(nccl_op DEPS nccl_common)
+ file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(ncclAllReduce);\n")
else()
set(DEPS_OPS ${DEPS_OPS} nccl_op)
endif()
@@ -229,8 +151,10 @@ op_library(lstm_op DEPS sequence2batch lstm_compute)
op_library(conv_transpose_op DEPS vol2col)
op_library(gru_op DEPS sequence2batch gru_compute)
op_library(recurrent_op DEPS executor)
+op_library(warpctc_op DEPS dynload_warpctc sequence_padding math_function)
op_library(cos_sim_op DEPS cos_sim_functor)
op_library(hierarchical_sigmoid_op DEPS matrix_bit_code)
+op_library(parallel_do_op DEPS executor)
# FIXME(typhoonzero): save/load depends lodtensor serialization functions
op_library(save_op DEPS lod_tensor)
op_library(load_op DEPS lod_tensor)
@@ -239,6 +163,8 @@ list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS})
foreach(src ${GENERAL_OPS})
op_library(${src})
endforeach()
+file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(logical_and);\nUSE_NO_KERNEL_OP(read_from_array);\n")
+
set(GLOB_OP_LIB ${OP_LIBRARY} CACHE INTERNAL "Global OP library")
diff --git a/paddle/operators/accuracy_op.cc b/paddle/operators/accuracy_op.cc
index d7baa6e90538484b400f32587a052d394a8d10d5..8e8a3c7dd3036317fac29b709d7a29e18f017503 100644
--- a/paddle/operators/accuracy_op.cc
+++ b/paddle/operators/accuracy_op.cc
@@ -53,7 +53,7 @@ class AccuracyOp : public framework::OperatorWithKernel {
}
protected:
- framework::OpKernelType GetActualKernelType(
+ framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override {
return framework::OpKernelType(
framework::ToDataType(ctx.Input<Tensor>("Out")->type()),
diff --git a/paddle/operators/activation_op.h b/paddle/operators/activation_op.h
index 0885f7c570b9b52dc51597347295734fd689da8d..88c3d1c597a853abdee7753a5110be4a1726e905 100644
--- a/paddle/operators/activation_op.h
+++ b/paddle/operators/activation_op.h
@@ -15,6 +15,7 @@ limitations under the License. */
#pragma once
#include "paddle/framework/eigen.h"
#include "paddle/framework/op_registry.h"
+#include "paddle/operators/detail/safe_ref.h"
namespace paddle {
namespace operators {
@@ -26,12 +27,16 @@ class ActivationKernel
using T = typename Functor::ELEMENT_TYPE;
void Compute(const framework::ExecutionContext& context) const override {
- auto* X = context.Input<framework::Tensor>("X");
- auto* Out = context.Output<framework::Tensor>("Out");
- Out->mutable_data<T>(context.GetPlace());
-
- auto x = framework::EigenVector<T>::Flatten(*X);
- auto out = framework::EigenVector<T>::Flatten(*Out);
+ auto& X = detail::Ref(context.Input("X"),
+ "Cannot get input tensor X, variable name = %s",
+ context.op().Input("X"));
+
+ auto& Out = detail::Ref(context.Output