Commit 9f2f76aa authored by Yancey1989

Merge branch 'develop' of github.com:PaddlePaddle/Paddle into hsigmoid_op

@@ -20,8 +20,10 @@ set(PADDLE_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR})
 include(system)

 project(paddle CXX C Go)
-message(STATUS "CXX compiler: " ${CMAKE_CXX_COMPILER} ", version: " ${CMAKE_CXX_COMPILER_VERSION})
-message(STATUS "C compiler: " ${CMAKE_C_COMPILER} ", version: " ${CMAKE_C_COMPILER_VERSION})
+message(STATUS "CXX compiler: ${CMAKE_CXX_COMPILER}, version: "
+               "${CMAKE_CXX_COMPILER_ID} ${CMAKE_CXX_COMPILER_VERSION}")
+message(STATUS "C compiler: ${CMAKE_C_COMPILER}, version: "
+               "${CMAKE_C_COMPILER_ID} ${CMAKE_C_COMPILER_VERSION}")

 find_package(Sphinx)
 if(NOT CMAKE_CROSSCOMPILING)
......
@@ -7,11 +7,11 @@ Machine:

 System: CentOS release 6.3 (Final), Docker 1.12.1.

-PaddlePaddle: (TODO: will rerun after 0.11.0)
-- paddlepaddle/paddle:latest (for MKLML and MKL-DNN)
+PaddlePaddle:
+- paddlepaddle/paddle:0.11.0 (for MKLML and MKL-DNN)
   - MKL-DNN tag v0.11
   - MKLML 2018.0.1.20171007
-- paddlepaddle/paddle:latest-openblas (for OpenBLAS)
+- paddlepaddle/paddle:0.11.0-openblas (for OpenBLAS)
   - OpenBLAS v0.2.20

 On each machine, we will test and compare the performance of training on single node using MKL-DNN / MKLML / OpenBLAS respectively.
@@ -56,15 +56,15 @@ Input image size - 3 * 224 * 224, Time: images/second
 <img src="figs/googlenet-cpu-train.png" width="500">

-- Alexnet
+- AlexNet

 | BatchSize | 64     | 128    | 256    |
 |-----------|--------|--------|--------|
-| OpenBLAS  | 2.13   | 2.45   | 2.68   |
+| OpenBLAS  | 45.62  | 72.79  | 107.22 |
 | MKLML     | 66.37  | 105.60 | 144.04 |
 | MKL-DNN   | 399.00 | 498.94 | 626.53 |

-chart TBD
+<img src="figs/alexnet-cpu-train.png" width="500">

 #### Inference
 Test on batch size 1, 2, 4, 8, 16 on Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz
@@ -72,36 +72,41 @@ Test on batch size 1, 2, 4, 8, 16 on Intel(R) Xeon(R) Gold 6148 CPU @ 2.40GHz
 | BatchSize | 1     | 2     | 4     | 8     | 16    |
 |-----------|-------|-------|-------|-------|-------|
-| OpenBLAS  | 1.07  | 1.08  | 1.06  | 0.88  | 0.65  |
+| OpenBLAS  | 1.10  | 1.96  | 3.62  | 3.63  | 2.25  |
 | MKLML     | 5.58  | 9.80  | 15.15 | 21.21 | 28.67 |
 | MKL-DNN   | 75.07 | 88.64 | 82.58 | 92.29 | 96.75 |

+<img src="figs/vgg-cpu-infer.png" width="500">
+
 - ResNet-50

 | BatchSize | 1      | 2      | 4      | 8      | 16     |
 |-----------|--------|--------|--------|--------|--------|
-| OpenBLAS  | 3.35   | 3.19   | 3.09   | 2.55   | 1.96   |
+| OpenBLAS  | 3.31   | 6.72   | 11.59  | 13.17  | 9.27   |
 | MKLML     | 6.33   | 12.02  | 22.88  | 40.53  | 63.09  |
 | MKL-DNN   | 107.83 | 148.84 | 177.78 | 189.35 | 217.69 |

+<img src="figs/resnet-cpu-infer.png" width="500">
+
 - GoogLeNet

 | BatchSize | 1      | 2      | 4      | 8      | 16     |
 |-----------|--------|--------|--------|--------|--------|
-| OpenBLAS  | 12.04  | 11.31  | 10.00  | 9.07   | 4.34   |
+| OpenBLAS  | 12.06  | 23.56  | 34.48  | 36.45  | 23.12  |
 | MKLML     | 22.74  | 41.56  | 81.22  | 133.47 | 210.53 |
 | MKL-DNN   | 175.10 | 272.92 | 450.70 | 512.00 | 600.94 |

-- Alexnet
+<img src="figs/googlenet-cpu-infer.png" width="500">
+
+- AlexNet

 | BatchSize | 1      | 2      | 4      | 8      | 16     |
 |-----------|--------|--------|--------|--------|--------|
-| OpenBLAS  |        |        |        |        |        |
+| OpenBLAS  | 3.53   | 6.23   | 15.04  | 26.06  | 31.62  |
 | MKLML     | 21.32  | 36.55  | 73.06  | 131.15 | 192.77 |
 | MKL-DNN   | 442.91 | 656.41 | 719.10 | 847.68 | 850.51 |

-chart TBD
+<img src="figs/alexnet-cpu-infer.png" width="500">

 ### Laptop
 TBD
benchmark/figs/resnet-cpu-train.png: binary image changed (19.8 KB → 17.6 KB)

benchmark/figs/vgg-cpu-train.png: binary image changed (17.9 KB → 16.7 KB)
@@ -8,6 +8,7 @@ function clock_to_seconds() {
 }

 function infer() {
+  export OPENBLAS_MAIN_FREE=1
   topology=$1
   layer_num=$2
   bs=$3
......
@@ -19,7 +19,7 @@ ExternalProject_Add(

 if (${CMAKE_VERSION} VERSION_LESS "3.3.0")
     set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/eigen3_dummy.c)
-    file(WRITE ${dummyfile} "const char * dummy_eigen3 = \"${dummyfile}\";")
+    file(WRITE ${dummyfile} "const char *dummy_eigen3 = \"${dummyfile}\";")
     add_library(eigen3 STATIC ${dummyfile})
 else()
     add_library(eigen3 INTERFACE)
......
@@ -63,9 +63,30 @@ ExternalProject_Add(
     -DMKLROOT:PATH=${MKLML_ROOT}
 )

-ADD_LIBRARY(mkldnn SHARED IMPORTED GLOBAL)
-SET_PROPERTY(TARGET mkldnn PROPERTY IMPORTED_LOCATION ${MKLDNN_LIB})
-ADD_DEPENDENCIES(mkldnn ${MKLDNN_PROJECT})
+ADD_LIBRARY(shared_mkldnn SHARED IMPORTED GLOBAL)
+SET_PROPERTY(TARGET shared_mkldnn PROPERTY IMPORTED_LOCATION ${MKLDNN_LIB})
+ADD_DEPENDENCIES(shared_mkldnn ${MKLDNN_PROJECT})
 MESSAGE(STATUS "MKLDNN library: ${MKLDNN_LIB}")
 add_definitions(-DPADDLE_WITH_MKLDNN)
-LIST(APPEND external_project_dependencies mkldnn)
+LIST(APPEND external_project_dependencies shared_mkldnn)
+
+# generate a static dummy target to track mkldnn dependencies
+# for cc_library(xxx SRCS xxx.c DEPS mkldnn)
+SET(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/mkldnn_dummy.c)
+FILE(WRITE ${dummyfile} "const char * dummy = \"${dummyfile}\";")
+ADD_LIBRARY(mkldnn STATIC ${dummyfile})
+TARGET_LINK_LIBRARIES(mkldnn ${MKLDNN_LIB} ${MKLML_LIB} ${MKLML_IOMP_LIB})
+ADD_DEPENDENCIES(mkldnn ${MKLDNN_PROJECT})
+
+# copy the real so.0 lib to install dir
+# it can be directly contained in wheel or capi
+SET(MKLDNN_SHARED_LIB ${MKLDNN_INSTALL_DIR}/libmkldnn.so.0)
+ADD_CUSTOM_COMMAND(OUTPUT ${MKLDNN_SHARED_LIB}
+  COMMAND cp ${MKLDNN_LIB} ${MKLDNN_SHARED_LIB}
+  DEPENDS mkldnn)
+ADD_CUSTOM_TARGET(mkldnn_shared_lib ALL DEPENDS ${MKLDNN_SHARED_LIB})
+
+IF(WITH_C_API)
+  INSTALL(FILES ${MKLDNN_SHARED_LIB} DESTINATION lib)
+ENDIF()
@@ -66,3 +66,7 @@ ADD_LIBRARY(mklml SHARED IMPORTED GLOBAL)
 SET_PROPERTY(TARGET mklml PROPERTY IMPORTED_LOCATION ${MKLML_LIB})
 ADD_DEPENDENCIES(mklml ${MKLML_PROJECT})
 LIST(APPEND external_project_dependencies mklml)
+
+IF(WITH_C_API)
+  INSTALL(FILES ${MKLML_LIB} ${MKLML_IOMP_LIB} DESTINATION lib)
+ENDIF()
@@ -30,23 +30,21 @@ IF(NOT ${CBLAS_FOUND})
         CACHE FILEPATH "openblas library." FORCE)

     SET(OPENBLAS_CC "${CMAKE_C_COMPILER} -Wno-unused-but-set-variable -Wno-unused-variable")
+    SET(OPENBLAS_COMMIT "v0.2.20")

     IF(CMAKE_CROSSCOMPILING)
         SET(OPTIONAL_ARGS HOSTCC=${HOST_C_COMPILER})
         GET_FILENAME_COMPONENT(CROSS_SUFFIX ${CMAKE_C_COMPILER} DIRECTORY)
         SET(CROSS_SUFFIX ${CROSS_SUFFIX}/)
         IF(ANDROID)
-            # arm_soft_fp_abi branch of OpenBLAS to support softfp
-            # https://github.com/xianyi/OpenBLAS/tree/arm_soft_fp_abi
-            SET(OPENBLAS_COMMIT "b5c96fcfcdc82945502a2303116a64d89985daf5")
             IF(ANDROID_ABI MATCHES "^armeabi(-v7a)?$")
+                # use softfp
                 SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV7 ARM_SOFTFP_ABI=1 USE_THREAD=0)
             ELSEIF(ANDROID_ABI STREQUAL "arm64-v8a")
                 SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV8 BINARY=64 USE_THREAD=0)
             ENDIF()
         ELSEIF(IOS)
             IF(CMAKE_OSX_ARCHITECTURES MATCHES "arm64")
-                SET(OPENBLAS_COMMIT "b5c96fcfcdc82945502a2303116a64d89985daf5")
                 SET(OPENBLAS_CC "${OPENBLAS_CC} ${CMAKE_C_FLAGS} -isysroot ${CMAKE_OSX_SYSROOT}")
                 SET(OPENBLAS_CC "${OPENBLAS_CC} -arch arm64")
                 SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV8 BINARY=64 USE_THREAD=0 CROSS_SUFFIX=${CROSS_SUFFIX})
@@ -56,14 +54,12 @@ IF(NOT ${CBLAS_FOUND})
             ENDIF()
         ELSEIF(RPI)
             # use hardfp
-            SET(OPENBLAS_COMMIT "v0.2.20")
             SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} TARGET=ARMV7 USE_THREAD=0)
         ENDIF()
     ELSE()
         IF(APPLE)
             SET(OPENBLAS_CC "${CMAKE_C_COMPILER} -isysroot ${CMAKE_OSX_SYSROOT}")
         ENDIF()
-        SET(OPENBLAS_COMMIT "v0.2.20")
         SET(OPTIONAL_ARGS "")
         IF(CMAKE_SYSTEM_PROCESSOR MATCHES "^x86(_64)?$")
             SET(OPTIONAL_ARGS DYNAMIC_ARCH=1 NUM_THREADS=64)
@@ -113,7 +109,7 @@ INCLUDE_DIRECTORIES(${CBLAS_INC_DIR})
 # FIXME(gangliao): generate cblas target to track all high performance
 # linear algebra libraries for cc_library(xxx SRCS xxx.c DEPS cblas)
 SET(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/cblas_dummy.c)
-FILE(WRITE ${dummyfile} "const char * dummy = \"${dummyfile}\";")
+FILE(WRITE ${dummyfile} "const char *dummy_cblas = \"${dummyfile}\";")
 ADD_LIBRARY(cblas STATIC ${dummyfile})
 TARGET_LINK_LIBRARIES(cblas ${CBLAS_LIBRARIES})
......
@@ -63,7 +63,7 @@ ExternalProject_Add(
 MESSAGE(STATUS "warp-ctc library: ${WARPCTC_LIBRARIES}")
 INCLUDE_DIRECTORIES(${WARPCTC_INCLUDE_DIR})

-ADD_LIBRARY(warpctc STATIC IMPORTED GLOBAL)
+ADD_LIBRARY(warpctc SHARED IMPORTED GLOBAL)
 SET_PROPERTY(TARGET warpctc PROPERTY IMPORTED_LOCATION ${WARPCTC_LIBRARIES})
 ADD_DEPENDENCIES(warpctc extern_warpctc)
......
@@ -120,7 +120,7 @@ function(merge_static_libs TARGET_NAME)
       DEPENDS ${libs})

     # Generate dummy static lib
-    file(WRITE ${target_SRCS} "const char *dummy = \"${target_SRCS}\";")
+    file(WRITE ${target_SRCS} "const char *dummy_${TARGET_NAME} = \"${target_SRCS}\";")
     add_library(${TARGET_NAME} STATIC ${target_SRCS})
     target_link_libraries(${TARGET_NAME} ${libs_deps})
@@ -160,7 +160,7 @@ function(merge_static_libs TARGET_NAME)
       DEPENDS ${libs} ${target_OBJS})

     # Generate dummy static lib
-    file(WRITE ${target_SRCS} "const char *dummy = \"${target_SRCS}\";")
+    file(WRITE ${target_SRCS} "const char *dummy_${TARGET_NAME} = \"${target_SRCS}\";")
     add_library(${TARGET_NAME} STATIC ${target_SRCS})
     target_link_libraries(${TARGET_NAME} ${libs_deps})
@@ -324,7 +324,7 @@ function(go_library TARGET_NAME)
   )

   # Add dummy code to support `make target_name` under Terminal Command
-  file(WRITE ${dummyfile} "const char * dummy = \"${dummyfile}\";")
+  file(WRITE ${dummyfile} "const char *dummy_${TARGET_NAME} = \"${dummyfile}\";")
   if (go_library_SHARED OR go_library_shared)
     add_library(${TARGET_NAME} SHARED ${dummyfile})
   else()
......
@@ -307,6 +307,12 @@ sequence_expand
     :noindex:

+gru_unit
+--------
+.. autofunction:: paddle.v2.fluid.layers.gru_unit
+    :noindex:
+
+
 lstm_unit
 ---------
 .. autofunction:: paddle.v2.fluid.layers.lstm_unit
......
# Design Doc: Concurrent Programming with Fluid
With PaddlePaddle Fluid, users describe a program rather than a model. The program is a [`ProgramDesc`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto) protobuf message. TensorFlow/MXNet/Caffe2 applications generate protobuf messages too, but their protobuf messages represent the model, a graph of operators, rather than the program that trains/uses the model.

Many know that when we program TensorFlow, we can specify the device on which each operator runs. This allows us to create a concurrent/parallel AI application. An interesting question is **how does a `ProgramDesc` represent a concurrent program?**

The answer relies on the fact that a `ProgramDesc` is similar to an abstract syntax tree (AST) that describes a program. So users can write a concurrent program just as they would in any concurrent programming language, e.g., [Go](https://golang.org).
## An Analogy
The following table compares concepts in Fluid and Go:
| Go | Fluid |
|----|-------|
| user-defined functions | [layers](https://github.com/PaddlePaddle/Paddle/tree/develop/python/paddle/v2/fluid) |
| control-flow and built-in functions | [intrinsics/operators](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators) |
| goroutines, channels | [class ThreadPool](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/framework/thread_pool.h) |
| runtime | [class Executor](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/executor.h) |
## An Example Concurrent Program
To review all the above concepts in an example, let us take a simple program and write its distributed version.
Suppose that we want to parallelize a naive Fluid program (written in Go and calling Fluid's Go binding) that multiplies two tensors.
```go
import "fluid"
func paddlepaddle() {
    X = fluid.read(...)
    W = fluid.Tensor(...)
    Y = fluid.mult(X, W)
}
```
Please be aware that Fluid's Go binding provides the default `main` function, which calls the `paddlepaddle` function, which, in this case, is defined in the above program and creates the following `ProgramDesc` message.
```protobuf
message ProgramDesc {
  block[0] = Block {
    vars = [X, W, Y],
    ops = [
      read(output = X)
      assign(input = ..., output = W)
      mult(input = {X, W}, output = Y)
    ],
  }
}
```
Then, the default `main` function calls `fluid.run()`, which creates an instance of the [`class Executor`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/executor.h) and calls `Executor.Run(block[0])`, where `block[0]` is the first and only block defined in the above `ProgramDesc` message.
The default `main` function is defined as follows:
```go
func main() {
    paddlepaddle()
    fluid.run()
}
```
## The Concurrent Version
By parallelizing the above program, we could support a very big tensor X by splitting it into small pieces {x_1, x_2, ...} and sending each piece to a worker process/node for parallel multiplication.

In this case, we can write a transpiler that takes a `ProgramDesc` message that represents the above example program and outputs two `ProgramDesc` messages, one for running on the master process/node, and the other one for the worker processes/nodes.
### The Master Program
The master program could look like the following:
```protobuf
message ProgramDesc {
  block[0] = Block {
    vars = [X, L, Y],
    ops = [
      read(output = X)
      kube_get_workers_addrs(output = L)
      Y = tensor_array(len(L))
      parallel_for(input = X, output = Y,
                   attrs = {L, block_id(1)})  # referring to block 1
    ]
  }

  block[1] = Block {
    parent = 0,
    vars = [x, y, index],
    ops = [
      slice(input = [X, index], output = x)  # index is initialized by parallel_for
      send(input = x, attrs = L[index])
      recv(outputs = y, attrs = L[index])
      assign(input = y, output = Y[index])
    ]
  }
}
```
The equivalent Fluid program (calling the Go binding) is:
```go
func main() { //// block 0
    X = fluid.read(...)
    L = fluid.k8s.get_worker_addrs()
    Y = fluid.tensor_array(len(L))
    fluid.parallel_for(X, L,
        func(index int) { //// block 1
            x = X[index]
            fluid.send(L[index], x)
            y = fluid.recv(L[index])
            Y[index] = y
        })
}
```
An explanation of the above program:
- `fluid.k8s` is a package that provides access to the Kubernetes API.
- `fluid.k8s.get_worker_addrs` returns the list of IPs and ports of all pods of the current job except for the current one (the master pod).
- `fluid.tensor_array` creates a [tensor array](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/lod_tensor_array.h). `fluid.parallel_for` creates a `ParallelFor` intrinsic, which, when executed,
  1. creates `len(L)` scopes, each for the concurrent running of the sub-block (block 1 in this case), and initializes a variable named "index" in the scope to an integer value in the range `[0, len(L)-1]`, and
  2. creates `len(L)` threads by calling into the `ThreadPool` singleton, where each thread
     1. creates an Executor instance, and
     2. calls `Executor.Run(block)`, where `block` is block 1 as explained above.
  3. Please be aware that block 1 is a sub-block of block 0, so ops in block 1 could refer to variables defined in block 0. A Python sketch of these semantics follows this list.
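The following is a minimal Python sketch of these `ParallelFor` semantics. `Scope`, `Executor`, and the callable standing in for block 1 are illustrative stand-ins, not Paddle's real C++ classes:

```python
import threading

class Scope(dict):
    """A toy scope: a name-to-variable mapping; block 1 reads "index" from it."""

class Executor:
    def run(self, block, scope):
        block(scope)  # the real Executor runs every op in `block` against `scope`

def parallel_for(num_workers, sub_block):
    threads = []
    for i in range(num_workers):
        scope = Scope(index=i)  # step 1: one scope per worker, with "index" set
        t = threading.Thread(target=Executor().run, args=(sub_block, scope))
        threads.append(t)
        t.start()               # step 2: each worker runs the sub-block on its own thread
    for t in threads:
        t.join()                # ParallelFor waits for all workers to finish

# Block 1 would slice X[index], send/recv, and assign to Y[index]; here we just print.
parallel_for(4, lambda scope: print("worker", scope["index"]))
```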
### The Worker Program
The worker program looks like
```go
func main() {
    W = Tensor(...)
    x = fluid.listen_and_do(
        fluid.k8s.self_addr(),
        func(input Tensor) {
            output = fluid.mult(input, W)
        })
}
```
where
- `fluid.listen_and_do` creates a `ListenAndDo` intrinsic, which, when executed,
  1. listens on the current pod's IP address, as returned by `fluid.k8s.self_addr()`,
  2. once a connection is established,
     1. creates a scope of two parameters, "input" and "output",
     2. reads a [Fluid variable](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/variable.h) and saves it into "input",
     3. creates an Executor instance and calls `Executor.Run(block)`, where the block is generated by running the lambda specified as the second parameter of `fluid.listen_and_do`. A Python sketch of this loop follows this list.
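A similarly illustrative Python sketch of the `ListenAndDo` loop; socket framing, serialization, and the real block execution are elided, and `handler` stands in for the block generated from the lambda:

```python
import socket

def listen_and_do(addr, handler):
    srv = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    srv.bind(addr)
    srv.listen(5)
    while True:
        conn, _ = srv.accept()           # steps 1-2: listen and accept a connection
        with conn:
            data = conn.recv(1 << 20)    # read the serialized "input" variable
            conn.sendall(handler(data))  # run the lambda's block, send "output" back

# Example (an echo worker): listen_and_do(("0.0.0.0", 8080), lambda x: x)
```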
## Summary
From the above example, we see that:
1. Fluid enables the imperative programming paradigm by:
   1. letting users describe a program, but not a model (a sequence of layers, or a graph of operators), and
   2. calling the `fluid.run` function that runs the program implicitly.
1. The program is described as a `ProgramDesc` protobuf message.
2. Function `Executor.Run` takes a block, instead of a `ProgramDesc`, as its parameter.
3. `fluid.run` calls `Executor.Run` to run the first block in the `ProgramDesc` message.
4. `Executor.Run`'s implementation is extremely simple -- it doesn't plan the execution nor create threads; instead, it runs on the current thread and executes intrinsics/operators' `Run` methods sequentially as they appear in the `Block.ops` array.
5. Intrinsics/operators' `Run` method might create threads. For example, the `ListenAndDo` operator creates a thread to handle each incoming request.
6. Threads are not necessarily OS threads; instead, they could be [green threads](https://en.wikipedia.org/wiki/Green_threads) managed by ThreadPool. Multiple green threads might run on the same OS thread. An example of green threads is Go's [goroutines](https://tour.golang.org/concurrency/1).
@@ -52,8 +52,9 @@ The IR for PaddlePaddle after refactoring is called a `Block`, it specifies the
 The user can not directly specify the parameter update rule for the parameter server in the Python module, since the parameter server does not use the same computation definition as the trainer. Instead, the update rule is baked inside the parameter server. The user can not specify the update rule explicitly.

-This could be fixed by making the parameter server run the same computation definition as the trainer (the user's Python module). For a detailed explanation, refer to this document -
-[Design Doc: Operation Graph Based Parameter Server](./parameter_server.md)
+This could be fixed by making the parameter server also run an IR, which can be different from the trainer side.
+For a detailed explanation, refer to this document -
+[Design Doc: Parameter Server](./parameter_server.md)

 ## Distributed Training Architecture
@@ -61,68 +62,111 @@ The revamped distributed training architecture can address the above discussed l
 <img src="src/distributed_architecture.png"/>

-The major components in the architecture are: *PaddlePaddle Python*, *PaddlePaddle converter* and *PaddlePaddle runtime*.
+The major components are: *Python API*, *Distribute Transpiler* and *Remote Executor*.

-### PaddlePaddle Python
+### Python API

-PaddlePaddle Python is the Python library that user's Python code invokes, to read the data, build the neural network topology, start training, etc.
+Python API is the Python library that the user's Python code invokes to read the data, build the neural network topology, start training, etc.

 ```Python
-paddle.init()
-input = paddle.op.recordIO("/home/data/mnist.recordio") # file stored on the cluster
-img, label = input[0], input[1]
-hidden = paddle.layer.fc(input=img, size=200, act=paddle.activation.Tanh())
-prediction = paddle.layer.fc(input=img, size=10, act=paddle.activation.Softmax())
-cost = paddle.layer.classification_cost(input=prediction, label=label)
-optimizer = paddle.optimizer.SGD(cost, learning_rate=0.01)
-session = paddle.session.NewRemote(num_trainer=3, num_ps=2, GPU_per_trainer=1)
-for i in range(1000):
-    _, cost_val = session.eval(targets=[cost, optimizer])
-    print cost_val
+images = fluid.layers.data(name='pixel', shape=[1, 28, 28], dtype='float32')
+label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+...
+predict = fluid.layers.fc(input=conv_pool_2, size=10, act="softmax")
+cost = fluid.layers.cross_entropy(input=predict, label=label)
+avg_cost = fluid.layers.mean(x=cost)
+optimizer = fluid.optimizer.Adam(learning_rate=0.01)
+optimizer.minimize(avg_cost)
+
+train_reader = paddle.batch(
+    paddle.reader.shuffle(
+        paddle.dataset.mnist.train(), buf_size=500),
+    batch_size=BATCH_SIZE)
+
+place = fluid.CPUPlace()
+exe = fluid.Executor(place)
+
+for pass_id in range(10):
+    for data in train_reader():
+        loss, acc = exe.run(trainer_prog,
+                            feed=feeder.feed(data),
+                            fetch_list=[avg_cost])
 ```

-The above code is what a typical Python trainer code is, the neural network topology is built using the helper functions such as `paddle.layer.fc`. Training is done by calling `session.eval` iteratively.
-
-#### session.eval
-
-As shown in the graph, `session.eval` sends the IR and the evaluation inputs or targets to the PaddlePaddle cluster for evaluation.
-The targets can be any variable in the computation graph. When the target is say, the `optimizer` variable, the neural network will be optimized once. When the target is the `cost` variable, `session.eval` returns the cost value. Based on what the target is, an appropriate action is taken.
-
-The Python `session` is a wrapper of the C++ `Session` class. For more information about `Session`, refer to this document - [Design Doc: Session](./session.md).
+The code above is a typical local training program; the "Training Program" is built using helper functions such as
+`fluid.layers.fc`. The training is done by calling `Executor.run`
+iteratively.
+
+For more details, the implementation of the IR is [Program](../program.md), and `ProgramDesc` is the protobuf type.
+
+[Executor](../executor.md) simply runs the `ProgramDesc`. For local training you generally use
+`Executor` to run the program locally. For any kind of distributed training, you can use
+`RemoteExecutor` to specify the desired distributed training method with some optional arguments.

-### PaddlePaddle Converter
+### Distributed Transpiler

-The PaddlePaddle converter automatically converts the IR in the request (IR and evaluation inputs/targets) from PaddlePaddle Python to partitioned IRs and dispatches the new IRs and evaluation inputs/targets to different PaddlePaddle runtimes. Below are the steps that are followed :
-
-1. Add a `feed` OP that feeds the eval inputs, and a `fetch` OP that fetches the eval targets to the IR.
-
-2. Extract a new computation (sub)graph with the `feed` and `fetch` OPs as the boundary. The runtime does not need to run the OP that is not dependent on the `fetch` OP.
-
-3. Optimize the computation graph.
-
-4. Place the OPs in the graph onto different devices on different PaddlePaddle runtime according to a placement algorithm and the device constraints specified by the user.
-
-5. Partition the graph according to runtime boundaries and add `send` / `recv` OP pair on the runtime boundaries.
-
-6. Dispatch the partitioned graph to different PaddlePaddle runtimes.
-
-7. PaddlePaddle runtimes with the `fetch` OP reports evaluation results back to the converter, the converter reports the evaluation results back to the PaddlePaddle Python.
-
-The output IRs will be cached to optimize the conversion latency.
+The Distributed Transpiler automatically converts the IR (in protobuf format) to partitioned IRs. Then
+the Remote Executor dispatches the new IRs to Remote Executors across the cluster.
+Below are the steps that are followed:
+
+1. Users only need to change `Executor` to `RemoteExecutor` to turn a local program into a distributed program.
+1. `RemoteExecutor` calls the `Distributed Transpiler` to "transpile" the user's program into several IRs representing a
+   distributed training program:
+   1. Parse configurations from `RemoteExecutor`.
+   1. Determine the type of distributed program; it can be DataParallelism, ModelParallelism or Streaming.
+   1. Partition the `ProgramDesc` according to the type and add `send` / `recv` OP pairs on the boundaries. Take
+      the DataParallelism type for example: it removes the optimization operators, adds a `send` OP to the
+      "trainer" role, and adds the optimization operators to the parameter server role behind the `recv` OP.
+1. Dispatch the partitioned graph to the different `RemoteExecutor`s in the cluster.
+1. The `RemoteExecutor` on each node runs the received `ProgramDesc` until the end.
+
+### RemoteExecutor
+
+As shown in the graph, `RemoteExecutor.run` sends the IR to the cluster for execution.
+You can also use the parameter `fetch_list` to interactively fetch variables back to the local node for
+log printing.
+
+The Python `RemoteExecutor` is derived from the `Executor` class.
+
+```python
+exe = RemoteExecutor(
+    feed=feeder.feed(data),
+    fetch_list=[avg_cost],
+    job_desc=JobDesc(
+        jobname,
+        num_trainer,
+        num_pserver,
+        cpu_per_trainer,
+        gpu_per_trainer,
+        mem_per_trainer,
+        cpu_per_pserver,
+        mem_per_pserver
+    ))
+for data in train_reader():
+    loss, acc = exe.run(trainer_prog,
+                        feed=feeder.feed(data),
+                        fetch_list=[avg_cost])
+```
+
+The `JobDesc` object describes the distributed job resource specification to run on
+the cluster environment.
+
+<img src="src/remote_executor.png"/>
+
+`RemoteExecutor.run` sends the `ProgramDesc` and
+[TrainingJob](https://github.com/PaddlePaddle/cloud/blob/develop/doc/autoscale/README.md#training-job-resource)
+to a server in the cluster which executes `RemoteExecutor.listen`. This server is responsible
+for starting the final Kubernetes Jobs to run the different roles of the `ProgramDesc`.

-#### Placement Algorithm
+### Placement Algorithm

 Our first implementation will only support "trainer-parameter server" placement: the parameters, initializers, and optimizers are all placed on the PaddlePaddle runtimes with the parameter server role. Everything else will be placed on the PaddlePaddle runtimes with the trainer role. This has the same functionality as the "trainer-parameter server" architecture of PaddlePaddle v0.10.0, but is more generic and flexible.

 In the future, a more general placement algorithm should be implemented, which makes placements according to the input IR, and a model of device computation time and device communication time. Model parallelism requires the generic placement algorithm.

-### PaddlePaddle Runtime
-
-The PaddlePaddle runtime owns multiple devices (e.g., CPUs, GPUs) and runs the IR. The runtime does not need to do OP placement since it is already done by the converter.
-
 ### Local Training Architecture

 The local training architecture will be the same as the distributed training architecture, the difference is that everything runs locally, and there is just one PaddlePaddle runtime:

@@ -132,9 +176,18 @@ The local training architecture will be the same as the distributed training arc
 ### Training Data

-In PaddlePaddle v0.10.0, training data is typically read with a [data reader](../reader/README.md) from Python. This approach is no longer efficient when training in a distributed fashion since the Python process no longer runs on the same node with the trainer processes. The Python reader will need to read from the distributed filesystem (assuming it has the required access) and send to the trainers, doubling the network traffic.
-
-When doing distributed training, the user can still use Python data reader: the training data are sent with `session.eval`. However this should be used for debugging purpose only. The users are encouraged to use the read data OPs.
+In PaddlePaddle v0.10.0, training data is typically read
+with a [data reader](../reader/README.md) from Python. This approach is
+no longer efficient when training in a distributed fashion since the Python
+process no longer runs on the same node as the trainer processes;
+the Python reader will need to read from the distributed filesystem
+(assuming it has the access) and send the data to the trainers, doubling the
+network traffic.
+
+When doing distributed training, the user can still use the Python data
+reader: the training data are sent with `Executor.run`. However, this should
+be used for debugging purposes only. The users are encouraged to use
+the read data OPs.

 ## References:
......
-# Design Doc: Operation Graph Based Parameter Server
+# Design Doc: Parameter Server

 ## Abstract

@@ -10,7 +10,7 @@ different purposes.
 ## Background

 The previous implementations of the parameter server do not run a
-subgraph. Parameter initialization, optimizer computation, network
+fluid sub-program. Parameter initialization, optimizer computation, network
 communication and checkpointing are implemented twice on both the
 trainer and the parameter server.

@@ -23,10 +23,10 @@ server becomes a natural extension.
 ## Design

-### Graph Converter
+### Distributed Transpiler

-The *graph converter* converts the user-defined operation (OP) graph
-into subgraphs to be scheduled on different nodes with the following
+The *Distributed Transpiler* converts the user-defined fluid program
+into sub-programs to be scheduled on different nodes with the following
 steps:

 1. OP placement: the OPs will be placed on different nodes according
@@ -34,7 +34,6 @@ steps:
    time. Currently we will use a simple heuristic that puts parameter
    variables on parameter server workers and everything else on trainer
    workers.
-
 1. Add communication OPs to enable the communication between nodes.
    We will need these OPs: *Send*, *Recv*, *Enqueue*, *Dequeue*.

@@ -48,8 +47,8 @@ After converting:
 <img src="src/dist-graph.png" width="700"/>

-1. The parameter variable W and its optimizer subgraph are placed on the parameter server.
-1. Operators are added to the subgraphs.
+1. The parameter variable W and its optimizer program are placed on the parameter server.
+1. Operators are added to the program.
 - *Send* sends data to the connected *Recv* operator. The
   scheduler on the receive node will only schedule the *Recv* operator
   to run when the *Send* operator has run (the *Send* OP will mark

@@ -64,39 +63,30 @@ After converting:
 ### Benefits

 - Model parallelism becomes easier to implement: it's an extension to
-  the trainer - parameter server approach. We already have the
-  communication OPs, but need to extend the graph converter's
-  placement functionality.
+  the trainer - parameter server approach. We can have several "Transpilers"
+  to achieve different goals.
 - User-defined optimizer is easier to add - the user can now express it as
-  a subgraph.
+  a sub-program.
 - No more duplicated logic inside the trainer and the parameter
   server mentioned in the background section.

 ### Challenges

-- It might be hard for the graph converter to cut a general graph
-  (without any hint for which subgraph is the optimizer). We may need
-  to label which subgraph inside the OP graph is the optimizer.
 - It's important to balance the parameter shards on multiple
   parameter servers. If a single parameter is very big (some
   word-embedding, fully connected, softmax layer), we need to
   automatically partition the single parameter onto different
   parameter servers when possible (only element-wise optimizer depends
   on the parameter variable).
+- In the "Async SGD" figure, the "W" variable on the parameter server
+  could be read and written concurrently. See
+  [here](https://github.com/PaddlePaddle/Paddle/pull/6394) for more
+  details about concurrent programs in fluid.

 ### Discussion

-- In the "Async SGD" figure, the "W" variable on the parameter server
-  could be read and written concurrently, what is our locking strategy?
-  E.g., each variable have a lock cpp method to be invoked by every
-  OP, or, have a lock OP.
 - Can the Enqueue OP be implemented under our current tensor design
   (puts the input tensor into the queue tensor)?
 - *Dequeue* OP will have variable numbers of output (depends on the
   `min_count` attribute), does our current design support it? (similar
   question for the *Add* OP)
......
# Error Clip
## Overview
Error clip is widely used in model training to prevent gradients from exploding. It applies specific rules to adjust variables' gradients and prevent them from becoming too large. With it, the values of a gradient will be checked before they are taken by the next `grad_op`, and shrunk if necessary.
## Usage
Users are allowed to assign different error clip methods or attributes to different `Variable`s. Users can specify it as a parameter of the `Variable` constructor:
```python
var = framework.Variable(..., error_clip=myErrorClip, ...)
```
The default value of `error_clip` is `None`, which means no error clip is employed. When it's not `None`, it should be an instance of one of `BaseErrorClipAttr`'s derived classes. So far, `BaseErrorClipAttr` has only one derived class: `ErrorClipByValue`, whose constructor is:
```python
ErrorClipByValue(max, min=None)
```
`max` and `min` represent the maximal and minimal clip thresholds respectively. In the backward pass, all values of `var`'s gradient greater than `max` or less than `min` will be clipped to `max` and `min` respectively. When `min` is None, the minimal threshold will be assigned `-max` automatically.
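Equivalently, writing $g$ for a gradient value, the clipped value is $\hat{g} = \min(\max(g, \text{min}), \text{max})$.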
So we can enable the error clip with threshold `[-5.0, 5.0]` for variable `var` by:
```python
var = framework.Variable(..., error_clip=ErrorClipByValue(max=5.0), ...)
```
## Implementation
The `BaseErrorClipAttr` and its derived class `ErrorClipByValue` are defined in *clip.py*.
```python
class BaseErrorClipAttr(object):
    def append_clip_op(self, block, grad_name):
        raise NotImplementedError()


class ErrorClipByValue(BaseErrorClipAttr):
    def __init__(self, max, min=None):
        max = float(max)
        if min is None:
            min = -max
        else:
            min = float(min)
        self.max = max
        self.min = min

    def append_clip_op(self, block, grad_name):
        block.append_op(
            type="clip",
            inputs={"X": grad_name},
            outputs={"Out": grad_name},
            attrs={"min": self.min,
                   "max": self.max})
```
`BaseErrorClipAttr` has one main member function: `append_clip_op(self, block, grad_name)`.

This function is used to create a `clip_op` and append it to the end of the given `block`. Because different error clip algorithms require different `clip_op`s, the function is defined as virtual in the base class, and all derived classes must implement their own versions of it.

These `clip_op`s should be inserted after the `grad_op`s whose output gradients need to be clipped. That is equivalent to appending some `clip_op`s to the end of the target block every time a new `grad_op` is added.
```python
for op_desc in grad_op_descs:
    new_op_desc = target_block.desc.append_op()
    new_op_desc.copy_from(op_desc)
    callback(block=target_block, context=grad_to_var)
```
Here we employ a callback function to complete this kind of job. In the `_append_backward_ops_` function, each time a `grad_op` is added to the `target_block`, a callback function is invoked. The logic for appending `clip_op`s can be implemented inside the callback function.
The callback function for `clip_op` appending is defined in *clip.py*:
```python
def error_clip_callback(block, context):
    # the context is a grad_to_var map
    grad_to_var = context
    op_desc = block.desc.op(block.desc.op_size() - 1)
    for grad_n in filter(lambda n: grad_to_var.has_key(n),
                         op_desc.output_arg_names()):
        fwd_var = block.var_recursive(grad_to_var[grad_n])
        error_clip = getattr(fwd_var, "error_clip", None)
        if error_clip is not None:
            error_clip.append_clip_op(block, grad_n)
```
This function takes a `block` and a `context` (which is actually a grad\_to\_var map) as inputs. It checks each output of the last `OpDesc` in the `block`. Notice that the last `OpDesc` of the `block` must be a `grad_op`, and its outputs must be some forward variables' gradients. If an output gradient's corresponding forward variable has an `error_clip` attribute, `error_clip_callback` will call that `error_clip`'s `append_clip_op` function to append the required `clip_op` to the `block`.
# Memory Optimization
## Problem
In a lecture from Andrew Ng, he attributes the recent success of AI to a combination of these:
- availability of Big Data
- supercomputing power to process this Big Data over very large neural networks
- modern algorithms
The following graph shows the details:
![](images/deep_learning.png)
A larger model usually brings better performance. However, GPU memory is limited; for example, the memory size of a GTX TITAN X is only 12GB. To train complex and large models, we have to take care of memory usage. Besides, memory optimization is also necessary for both online and mobile inference.
## Solution
### Basic Strategy
There are some basic strategies for memory optimization, including in-place operation and memory sharing.
#### In-place Operation
In a relu activation operator:
$y = \max(x, 0)$
If the variable x is not used by any other operator, we can perform the operation in place. In other words, the memory block of variable y and the memory block of variable x will be the same. In-place operation saves 50% of the memory occupancy immediately.
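As a hedged illustration in NumPy terms (not Paddle's actual kernel API), the two versions differ only in where the result is written:

```python
import numpy as np

x = np.random.randn(1024).astype("float32")

y = np.maximum(x, 0)     # out-of-place: allocates a second 1024-float buffer for y
np.maximum(x, 0, out=x)  # in-place: the result reuses x's buffer, halving memory use
```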
#### Memory Sharing
Not all operators support in-place operations. Memory sharing is a more general strategy.
Following is an example:
```
a = op1(b, c);
d = op2(a)
e = op3(d, f)
```
In this case, variable a is no longer used, and op2 does not support in-place operation. After op2 finishes, we can put the memory of variable a into a memory pool. Then variable e can share the memory of variable a from the pool.
### Live Variable Analysis
It's not enough to only have some basic strategies. The prerequisite of memory optimization is knowing whether a variable is still "live" after an operation.
In our design, the neural network topology is defined as a program. Luckily, [live variable analysis](https://en.wikipedia.org/wiki/Live_variable_analysis) is a classic problem in compilers which can be used in many stages, such as register allocation.
In compilers, the front end of the compiler translates programs into an intermediate language with an unbounded number of temporaries. This program must run on a machine with a bounded number of registers. Two temporaries a and b can fit into the same register if a and b are never "in use" at the same time. Thus, many temporaries can fit in few registers; if they don't all fit, the excess temporaries can be kept in memory.
Therefore, the compiler needs to analyze the intermediate-representation program to determine which temporaries are in use at the same time. We say a variable is "live" if it holds a value that may be needed in the future, so this analysis is called liveness analysis.
We can learn these techniques from compilers. There are mainly two stages in live variable analysis:
- construct a control flow graph
- solve the dataflow equations
#### Control Flow Graph
To perform analyses on a program, it is often useful to make a control flow graph. A [control flow graph](https://en.wikipedia.org/wiki/Control_flow_graph) (CFG) in computer science is a representation, using graph notation, of all paths that might be traversed through a program during its execution. Each statement in the program is a node in the flow graph; if statement x can be followed by statement y, there is an edge from x to y.
Following is the flow graph for a simple loop.
![](images/control_flow_graph.png)
#### Dataflow Analysis
Liveness of a variable "flows" around the edges of the control flow graph; determining the live range of each variable is an example of a dataflow problem. [Dataflow analysis](https://en.wikipedia.org/wiki/Data-flow_analysis) is a technique for gathering information about the possible set of values calculated at various points in a computer program.
A simple way to perform data-flow analysis of programs is to set up dataflow equations for each node of the control flow graph and solve them by repeatedly calculating the output from the input locally at each node until the whole system stabilizes.
- Flow Graph Terminology
A flow graph node has out-edges that lead to successor nodes, and in-edges that come from predecessor nodes. The set *pred[n]* contains all the predecessors of node n, and *succ[n]* is the set of successors.

In the former control flow graph, the out-edges of node 5 are 5 --> 6 and 5 --> 2, so *succ[5]* = {2, 6}. The in-edges of 2 are 5 --> 2 and 1 --> 2, so *pred[2]* = {1, 5}.
- Uses and Defs
An assignment to a variable or temporary defines that variable. An occurrence of a variable on the right-hand side of an assignment (or in other expressions) uses the variable. We can speak of the *def* of a variable as the set of graph nodes that define it, or the *def* of a graph node as the set of variables that it defines, and similarly for the *use* of a variable or graph node. In the former control flow graph, *def(3)* = {c}, *use(3)* = {b, c}.
- Liveness
A variable is *live* on an edge if there is a directed path from that edge to a *use* of the variable that does not go through any *def*. A variable is *live-in* at a node if it is live on any of the in-edges of that node; it is *live-out* at a node if it is live on any of the out-edges of the node.
The calculation of liveness can be solved by iteration until a fixed point is reached. Following is the recursive formula:
![](images/dataflow_equations.png)
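For reference, these are the standard liveness equations that the iteration solves, stated in terms of the *use*, *def*, and *succ* sets defined above:

$$in[n] = use[n] \cup \big(out[n] \setminus def[n]\big), \qquad out[n] = \bigcup_{s \in succ[n]} in[s]$$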
### Memory optimization transpiler
At last, we take the basic strategies and the liveness analysis techniques learned from compilers to implement our memory optimization transpiler.
#### add in-place attribute
In-place is a built-in attribute of an operator. Since we treat in-place and other operators differently, we have to add an in-place attribute for every operator.
#### construct control flow graph
Following is the ProgramDesc protobuf of the [machine translation](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/fluid/tests/book/test_machine_translation.py) example.
- Block0:
```
lookup_table
mul
...
while(sub-block idx 1)
...
array_to_lod_tensor
cross_entropy
...
while_grad(sub-block idx 2)
read_from_array
array_to_lod_tensor
...
```
- Block1
```
read_from_array
read_from_array
...
write_to_array
increment
write_to_array
less_than
```
- Block2
```
read_from_array
increment
...
write_to_array
write_to_array
```
We can traverse all the operators and variables in the ProgramDesc to build a control flow graph.
```python
from collections import defaultdict

class ControlFlowGraph(object):
    def __init__(self, Program):
        self._successors = defaultdict(set)
        self._predecessors = defaultdict(set)
        self._uses = defaultdict(set)
        self._defs = defaultdict(set)
        self._live_in = defaultdict(set)
        self._live_out = defaultdict(set)
        self._program = Program

    def build(self):
        pass

    def dataflow_analysis(self):
        pass

    def memory_optimization(self):
        pass

    def get_program(self):
        return self._program
```
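As an illustration, below is a minimal sketch of the fixed-point iteration that `dataflow_analysis` could perform. It is not the actual transpiler code; the `nodes`, `successors`, `uses`, and `defs` inputs are assumed to have been extracted from the ProgramDesc by `build`:

```python
from collections import defaultdict

def liveness(nodes, successors, uses, defs):
    # Iterate the dataflow equations until a fixed point is reached:
    #   live_in[n]  = uses[n] | (live_out[n] - defs[n])
    #   live_out[n] = union of live_in[s] for every successor s of n
    live_in, live_out = defaultdict(set), defaultdict(set)
    changed = True
    while changed:
        changed = False
        for n in reversed(nodes):  # reverse program order converges faster
            new_out = set().union(*[live_in[s] for s in successors.get(n, ())])
            new_in = uses.get(n, set()) | (new_out - defs.get(n, set()))
            if (new_in, new_out) != (live_in[n], live_out[n]):
                live_in[n], live_out[n] = new_in, new_out
                changed = True
    return live_in, live_out

# The op1/op2/op3 example from the next subsection reproduces the sets listed there:
succ = {0: [1], 1: [2], 2: []}
uses = {0: {"b", "c"}, 1: {"a"}, 2: {"d", "f"}}
defs = {0: {"a"}, 1: {"d"}, 2: {"e"}}
live_in, live_out = liveness([0, 1, 2], succ, uses, defs)
```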
#### make dataflow analysis
We follow the guidance from compilers and try to solve the dataflow equations to get the liveness of every variable. If the live-in of an operator node is different from its live-out, then we can apply memory sharing.
For example:
```
a = op1(b, c);
d = op2(a)
e = op3(d, f)
```
The dataflow analysis result is:
```
live_in(op1) = {b, c, f}
live_out(op1) = {a, f}
live_in(op2) = {a, f}
live_out(op2) = {d, f}
live_in(op3) = {d, f}
live_out(op3) = {}
```
After op1, we can reclaim the memory of variables b and c; after op2, we can reclaim the memory of variable a; after op3, we can reclaim the memory of variables d and f.
#### memory sharing policy
A memory pool will be maintained during the memory optimization stage. Each operator node will be scanned to determine whether memory optimization can be applied. If an operator satisfies the requirement, the following policy will be taken to handle its input/output variables.
```
if op.support_inplace():
    i --> pool
    pool --> o
else:
    pool --> o
    i --> pool
```
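A hedged Python sketch of this policy, using a first-fit free list as the pool; size matching, dtypes, and the in-place attribute check are simplified stand-ins:

```python
class MemoryPool:
    def __init__(self):
        self._free = []  # (size, buffer) entries released by dead variables

    def release(self, size, buf):  # "i --> pool"
        self._free.append((size, buf))

    def acquire(self, size):  # "pool --> o"
        for k, (sz, buf) in enumerate(self._free):
            if sz >= size:  # first fit that is large enough
                return self._free.pop(k)[1]
        return bytearray(size)  # pool miss: fall back to a fresh allocation

pool = MemoryPool()
a = bytearray(1024)     # variable a's buffer
pool.release(1024, a)   # a is dead after op2, so its buffer enters the pool
e = pool.acquire(1024)  # variable e reuses a's buffer (e is a -> True)
```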
## Reference
- [Lecture Notes From Artificial Intelligence Is The New Electricity By Andrew Ng](https://manavsehgal.com/lecture-notes-from-artificial-intelligence-is-the-new-electricity-by-andrew-ng-4712dcbf26e5)
- Modern compiler implementation in ML, by Andrew W. Appel
- [Optimizing Memory Consumption in Deep learning](https://mxnet.incubator.apache.org/architecture/note_memory.html)
# Design Doc: Session
## Abstract
The *session* object encapsulates the environment in which the
computation graph is executed.
We will have a *local* session and a *remote* session; they offer the
same [interface](#interface). The local session encapsulates the local
runtime environment and the remote session encapsulates the cluster
runtime environment.
The local runtime environment contains:
1. computation devices (i.e., CPU, GPU) handles, and
1. the [scope](../scope.md) which holds all variables.
The remote runtime environment contains:
1. computation devices (i.e., CPU and GPU on node 0, 1) in a cluster,
and
1. the distributed [scope](../scope.md) in a cluster which holds all
variables.
The user can create a remote session on Paddle Cloud and evaluate the
computation graph with it. In this way, the user can control the
remote computation resources in a cluster from their local computer.
## Background
The current design has an implicit global session in which
`paddle.eval()` is executed. The pain point is:
Since the user is not able to explicitly switch between runtime
environments, the user cannot run a topology in two independent
environments.
For example, in reinforcement learning, the user may want to have a
stale model for inference and a fresh model for training, and only
replace the stale model with the fresh model periodically.
Furthermore, we have no concept that encapsulates a remote environment
that executes a computation graph.
We need the session object to address the above issues.
## Session
A session is an object that owns the runtime environment. All
computations are executed through `session.eval()`.
### Interface
```python
eval(
    targets,
    feed_dict=None,
)
```
Evaluates the target Operations or Variables in `targets`.
- *targets*: the evaluation targets. Can be a single Operation or
  Variable, or a list with the Operations or Variables as
  elements. The value returned by `eval()` has the same shape as the
  `targets` argument.
  The PaddlePaddle program is represented by
  the [ProgramDesc](../design/program.md); `eval()` will infer the
  ProgramDesc from the given targets and run the PaddlePaddle
  program. Please see
  [this graph](./distributed_architecture.md#local-training-architecture) for
  a detailed illustration of the local session and
  [this graph](./distributed_architecture.md#distributed-training-architecture) for
  a detailed illustration of the remote session.
- *feed_dict*: a dictionary that contains the tensors which override
  the edges of the computation graph.

  feed_dict can not only provide the input data, it can also override any
  OP's input:
```python
a = pd.constant(2.0, name="a")
b = pd.variable(name="b")
c = pd.mul(a,b)
sess.eval(targets=c, feed_dict={"b":3.0}) # returns 6.0
```
```python
close()
```
Closes the session and releases the scope that the session owns.
### Create a Local Session
```python
session(
    devices=None
)
```
Creates a new session. One session owns one global scope, so creating
multiple sessions will create different scopes.
- *devices*: a single `string` or a list of `string` of device names,
  the corresponding devices will be the computation devices for
  `eval()`. If not specified, all available devices (e.g., all GPUs)
  will be used. The user doesn't need to specify the CPU device since
  it will always be used. Multiple sessions can use the same device.
#### Example
```Python
a = paddle.constant(1.0)
b = paddle.constant(2.0)
c = a + b
sess = paddle.session(devices=["gpu:0", "gpu:1", "fpga:0"])
sess.eval(c)
sess.close()
```
### Create a Remote Session
```python
create_cloud_job(
    name,
    num_trainer,
    mem_per_trainer,
    gpu_per_trainer,
    cpu_per_trainer,
    num_ps,
    mem_per_ps,
    cpu_per_ps,
)
```
Creates a Paddle Cloud job. Fails if the job name exists.
```python
get_cloud_job(
    name
)
```
Gets a Paddle Cloud job.
```python
remote_session(
    job
)
```
- *job*: the Paddle Cloud job.
#### Example
```Python
reader = paddle.reader.recordio("/pfs/home/peter/mnist-train-*") # data stored on Paddle Cloud
image = reader.column(0)
label = reader.column(1)
fc1 = paddle.op.fc(image, size=256, act="sigmoid")
fc2 = paddle.op.fc(fc1, size=10, act="softmax")
cost = paddle.op.cross_entropy(fc2, label)
opt = paddle.optimizer.sgd(cost)
job = paddle.create_cloud_job("test", 3, "1G", 1, 1, 2, "1G", 1)
sess = paddle.remote_session(job)
for i in range(1000):
    sess.eval(opt)
sess.close()
```
...@@ -7,11 +7,9 @@ Each new PaddlePaddle release follows the process below:
1. Fork a new branch from `develop`, named `release/<version>`, e.g. `release/0.10.0`.
1. Tag the new branch as `<version>rc.<patch number>`; the first tag is `0.10.0rc1`, the second `0.10.0rc2`, and so on.
1. For this release commit, do the following:
    * Use the Regression Test List as a checklist to verify the correctness of the release.
    * If a test fails, record all failing cases, fix all bugs on the `release/<version>` branch, bump the patch number, and go back to step 2.
    * Update the version information in `python/setup.py.in` and set the `istaged` field to `True`.
    * Build the Docker release images for this version and publish them to DockerHub. If that fails, fix the image build, bump the patch number, and return to step 2.
    * Build the Ubuntu deb package for this version. If that fails, fix the deb build, bump the patch number, and return to step 2.
    * Use the Regression Test List as a checklist to test the functional correctness of the Docker images and Ubuntu packages.
    * If a test fails, record all failing cases, fix all bugs on the `release/<version>` branch, bump the patch number, and return to step 2.
    * Build the Python wheel packages for this version and publish them to PyPI.
    * Since pypi.python.org currently enforces the strict naming convention [PEP 513](https://www.python.org/dev/peps/pep-0513), the platform-related suffix of a wheel must be renamed before uploading with twine, e.g. `linux_x86_64` becomes `manylinux1_x86_64`.
    * The package names on PyPI are paddlepaddle and paddlepaddle_gpu. To upload a GPU package, change the name in build/python/setup.py to "paddlepaddle_gpu" and rebuild the wheel: `python setup.py bdist_wheel`.
...@@ -21,8 +19,8 @@ Each new PaddlePaddle release follows the process below:
    pip install twine
    twine upload dist/[package to upload]
    ```
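As a sketch of the renaming step required by PEP 513 above (the file name is purely illustrative):
```bash
mv paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl \
   paddlepaddle-0.11.0-cp27-cp27mu-manylinux1_x86_64.whl
twine upload paddlepaddle-0.11.0-cp27-cp27mu-manylinux1_x86_64.whl
```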
    * Build the Docker release images for this version and publish them to DockerHub. If that fails, fix the image build, bump the patch number, and return to step 2.
1. After step 3 is complete, merge the `release/<version>` branch into master, tag the merge commit on master with `<version>`, and merge `master` back into `develop`. Finally, delete the `release/<version>` branch.
1. Build the Docker release images from the master branch and publish them to DockerHub; build the Ubuntu deb packages and publish them on the GitHub release page.
1. Collaborate on writing the Release Notes.
...@@ -31,6 +29,30 @@ Each new PaddlePaddle release follows the process below:
* Once a `release/<version>` branch is created, merging from `develop` into it is generally no longer allowed. This keeps the `release/<version>` branch feature-frozen and makes it easier for testers to validate PaddlePaddle's behavior.
* While a `release/<version>` branch exists, any bugfix branch must be merged into all three branches: `master`, `develop`, and `release/<version>`.
## Publishing wheel packages to PyPI
Use [PaddlePaddle CI](https://paddleci.ngrok.io/project.html?projectId=Manylinux1&tab=projectOverview)
to build the binaries automatically. As shown in the figure below, select the builds to release (usually one CPU build and one GPU build), and click
the "..." button to the right of "run" to open the dialog. On the second tab (Changes), select the branch to release, here 0.11.0, then click "Run Build".
After the build finishes, the three generated binaries can be found in the "Artifacts" drop-down on that page, corresponding to the C-API, `cp27m`, and
`cp27mu` builds. Then upload them with the `twine` tool as described above.
<img src="ci_build_whl.png">
* Note: the CI environment uses the Docker images from https://github.com/PaddlePaddle/buildtools as the build environment in order to support more Linux
distributions. These images can also be used for manual builds, and can be downloaded from https://hub.docker.com/r/paddlepaddle/paddle_manylinux_devel/tags/ .
* PyPI does not allow re-uploading, so once a wheel for a given version number is published it cannot be changed; the next wheel must carry a new version number before it can be uploaded.
## Publishing Docker images
After the PaddlePaddle CI above finishes building the wheels, it automatically pushes the Docker images to DockerHub, so publishing the Docker images
only requires tagging the automatically pushed images with the corresponding version number:
1. Go to https://hub.docker.com/r/paddlepaddle/paddle/tags/ and check that the latest tag was updated after the wheel build above completed.
1. Run `docker pull paddlepaddle/paddle:[latest tag]`, where the latest tag can be latest, latest-gpu, etc.
1. Run `docker tag paddlepaddle/paddle:[latest tag] paddlepaddle/paddle:[version]`
1. Run `docker push paddlepaddle/paddle:[version]`
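For example, with illustrative values for the tags:
```bash
docker pull paddlepaddle/paddle:latest
docker tag paddlepaddle/paddle:latest paddlepaddle/paddle:0.11.0
docker push paddlepaddle/paddle:0.11.0
```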
## PaddlePaddle branching conventions
PaddlePaddle development uses the [git-flow](http://nvie.com/posts/a-successful-git-branching-model/) branching model, with some adaptations for GitHub's features.
......
...@@ -48,8 +48,8 @@ Fluid uses class [DeviceContext](https://github.com/PaddlePaddle/Paddle/blob/dev ...@@ -48,8 +48,8 @@ Fluid uses class [DeviceContext](https://github.com/PaddlePaddle/Paddle/blob/dev
```
                /-> CPUDeviceContext
DeviceContext ----> CUDADeviceContext
                \-> FPGADeviceContext
```
...@@ -79,16 +79,6 @@ private:
};
```
- CUDNNDeviceContext
```
class CUDNNDeviceContext : public CUDADeviceContext {
private:
cudnnHandle_t cudnn_handle_;
};
```
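For orientation, here is a minimal sketch of how a caller obtains one of these contexts through the pool (the `DeviceContextPool` API follows the framework code later in this change):
```cpp
// Look up the DeviceContext registered for a place.
paddle::platform::DeviceContextPool& pool =
    paddle::platform::DeviceContextPool::Instance();
auto* dev_ctx = pool.Get(paddle::platform::CPUPlace());
dev_ctx->Wait();  // block until pending work on this device finishes
```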
### Memory and Tensor ### Memory and Tensor
......
...@@ -51,7 +51,7 @@ $ stdbuf -oL /usr/bin/nohup paddle pserver --port=7164 --ports_num=1 --ports_num
- port: **required, default 7164**, the starting port the pserver listens on; together with ports_num it determines the total number of ports used for communication, starting from this port
- ports_num: **required, default 1**, the number of ports to listen on
- ports_num_for_sparse: **required, default 0**, the number of ports used for communicating sparse-type parameters
- num_gradient_servers: **required, default 1**, the total number of pservers in the current training job
### Starting the trainer nodes
...@@ -60,7 +60,7 @@ $ stdbuf -oL /usr/bin/nohup paddle pserver --port=7164 --ports_num=1 --ports_num
$ python train.py
```
The trainer must keep network connectivity with the pserver to complete training. When started, the trainer has to be given the port, the pserver addresses, and other parameters so that it can connect to the pserver correctly. These parameters can be passed through [environment variables](https://zh.wikipedia.org/wiki/环境变量) or through the arguments of `paddle.init()` in the program. If both `paddle.init()` arguments and environment variables are used, the arguments passed to `paddle.init()` take precedence.
Using environment variables:
...@@ -95,7 +95,7 @@ paddle.init(
- trainer_count: **required, default 1**, the total number of trainers in the current training job
- port: **required, default 7164**, the port used to connect to the pserver
- ports_num: **required, default 1**, the number of ports used to connect to the pserver
- ports_num_for_sparse: **required, default 0**, the number of ports used for communicating sparse-type parameters with the pserver
- num_gradient_servers: **required, default 1**, the total number of pservers in the current training job
- trainer_id: **required, default 0**, the unique ID of each trainer, an integer starting from 0
- pservers: **required, default 127.0.0.1**, the list of IPs of the pservers started for this job, separated by ","
......
...@@ -52,7 +52,7 @@ Parameter Description
- port: **required, default 7164**, the port the parameter server listens on. If ports_num is greater than 1, the parameter server listens on multiple ports for higher network throughput.
- ports_num: **required, default 1**, the total number of ports to listen on.
- ports_num_for_sparse: **required, default 0**, the number of ports which serve sparse parameter updates.
- num_gradient_servers: **required, default 1**, the total number of gradient servers.
### Starting trainer
...@@ -98,7 +98,7 @@ Parameter Description
- trainer_count: **required, default 1**, the total count of trainers in the training job.
- port: **required, default 7164**, the port used to connect to the parameter server.
- ports_num: **required, default 1**, the number of ports used for communication.
- ports_num_for_sparse: **required, default 0**, the number of ports used for sparse parameter communication.
- num_gradient_servers: **required, default 1**, the total number of gradient servers.
- trainer_id: **required, default 0**, an ID for every trainer, starting from 0.
- pservers: **required, default 127.0.0.1**, the list of IPs of parameter servers, separated by ",".
......
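As a sketch of a trainer-side `paddle.init()` call using the parameters above (the import path and the values shown are only illustrative):
```python
import paddle.v2 as paddle  # assumption: the v2 API described in this document

paddle.init(
    trainer_count=1,
    port=7164,
    ports_num=1,
    ports_num_for_sparse=0,
    num_gradient_servers=1,
    trainer_id=0,
    pservers="127.0.0.1")
```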
# Android Platform Build Guide
Users can cross-compile a PaddlePaddle library suitable for the Android platform in either of the following two ways:
- [Building with a Docker container](#building-with-a-docker-container)
- [Building with a Linux cross-compilation environment](#building-with-a-linux-cross-compilation-environment)
## Building with a Docker container
Docker runs on all major operating systems (including Linux, Mac OS X, and Windows), so with the Docker-based approach users can build an Android-ready PaddlePaddle library on whatever development platform they are already familiar with.
...@@ -16,6 +17,12 @@ $ cd Paddle
$ docker build -t username/paddle-android:dev . -f Dockerfile.android
```
Users can also use the official development image provided by PaddlePaddle:
```bash
$ docker pull paddlepaddle/paddle:latest-dev-android
```
### Building the PaddlePaddle C-API library
Once the development image is built, it can be used to compile the Android version of the PaddlePaddle C-API library.
The Android Docker development image exposes two configurable arguments to the user:
...@@ -41,23 +48,25 @@ The Android Docker development image exposes two configurable arguments to the user:
</tr>
<tr class="row-odd">
<td>ANDROID_API</td>
<td>>= 16</td>
<td>21</td>
</tr>
</tbody>
</table>
- Build the PaddlePaddle library for `armeabi-v7a` and `Android API 21`:
```bash
$ docker run -it --rm -v $PWD:/paddle -e "ANDROID_ABI=armeabi-v7a" -e "ANDROID_API=21" username/paddle-android:dev
```
- Build the PaddlePaddle library for `arm64-v8a` and `Android API 21`:
```bash
$ docker run -it --rm -v $PWD:/paddle -e "ANDROID_ABI=arm64-v8a" -e "ANDROID_API=21" username/paddle-android:dev
```
When the above `docker run` command is executed, the container by default runs the [paddle/scripts/docker/build_android.sh](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/scripts/docker/build_android.sh) script. The script records the CMake configuration commonly used for cross-compiling the Android version of the PaddlePaddle library, automatically builds a standalone toolchain according to `ANDROID_ABI` and `ANDROID_API`, and then compiles and installs. Since the arm64 architecture requires an Android API of at least 21, when `ANDROID_ABI=arm64-v8a` and `ANDROID_API<21` the container defaults to the `Android API 21` toolchain. Users can consult the [Configuring cross-compilation parameters](#configuring-cross-compilation-parameters) section below to customize the script the container runs. After the build and installation finish, the PaddlePaddle C-API library is installed to `$PWD/install_android`, and the third-party dependencies it relies on are installed to `$PWD/install_android/third_party`.
## Building with a Linux cross-compilation environment
This document uses the Linux x86-64 platform as an example to describe how to cross-compile a PaddlePaddle library for the Android platform.
...@@ -83,6 +92,7 @@ your/path/to/android-ndk-r14b-linux-x86_64/build/tools/make-standalone-toolchain
This command generates a standalone build toolchain in `your/path/to/arm_standalone_toolchain`, targeting the 32-bit ARM architecture, supporting a minimum Android API level of 21, and providing the compilers `arm-linux-androideabi-gcc (GCC) 4.9` and `clang 3.8`.
- Build a standalone toolchain for `arm64-v8a` and `Android API 21`:
```bash
your/path/to/android-ndk-r14b-linux-x86_64/build/tools/make-standalone-toolchain.sh \
        --arch=arm64 --platform=android-21 --install-dir=your/path/to/arm64_standalone_toolchain
...@@ -90,14 +100,12 @@ your/path/to/android-ndk-r14b-linux-x86_64/build/tools/make-standalone-toolchain
This command generates a standalone build toolchain in `your/path/to/arm64_standalone_toolchain`, targeting the 64-bit ARM64 architecture, supporting a minimum Android API level of 21, and providing the compilers `arm-linux-androideabi-gcc (GCC) 4.9` and `clang 3.8`.
Note: **PaddlePaddle requires that the build toolchain used supports an Android API level of at least 21.**
### Configuring cross-compilation parameters
CMake supports cross-compilation ([cmake-toolchains](https://cmake.org/cmake/help/v3.0/manual/cmake-toolchains.7.html#cross-compiling)). To simplify the cmake configuration, PaddlePaddle provides a toolchain file for cross-compilation, [cmake/cross_compiling/android.cmake](https://github.com/PaddlePaddle/Paddle/blob/develop/cmake/cross_compiling/android.cmake), which supplies default compiler and build-flag settings. Note that, starting with version 3.7, CMake itself offers generic support for Android cross-compilation; when PaddlePaddle detects a CMake version of at least 3.7, it passes the user's configuration parameters straight through to CMake. See [cmake-toolchains](https://cmake.org/cmake/help/v3.7/manual/cmake-toolchains.7.html#cross-compiling) for a detailed description of those parameters.
When cross-compiling the Android version of PaddlePaddle, some parameters must be configured:
- `CMAKE_SYSTEM_NAME`, the CMake target platform, must be set to `Android`. Only after `CMAKE_SYSTEM_NAME=Android` is set does PaddlePaddle's CMake system treat the build as a cross-compilation for Android and automatically build all the third-party libraries PaddlePaddle needs. It also forces the values of several PaddlePaddle options (`WITH_GPU=OFF`, `WITH_AVX=OFF`, `WITH_PYTHON=OFF`, `WITH_RDMA=OFF`, `WITH_MKL=OFF`, `WITH_GOLANG=OFF`).
- `WITH_C_API` must be set to `ON`, since only C-API-based inference is supported on the Android platform.
- `WITH_SWIG_PY` must be set to `OFF`, since training or inference through SWIG bindings is not supported on the Android platform.
...@@ -119,7 +127,7 @@ Optional Android parameters:
Other configuration parameters:
- `USE_EIGEN_FOR_BLAS`, whether to use the Eigen library for matrix computation. Can be set to `ON/OFF`; the default is `OFF`.
- `HOST_C/CXX_COMPILER`, the host C/C++ compilers, used when building the host-side protoc binary and the target-side OpenBLAS library. They default to the values of the environment variables `CC/CXX`; if `CC/CXX` are unset, the `cc/c++` compilers are used.
A commonly used cmake configuration is as follows:
...@@ -147,9 +155,10 @@ cmake -DCMAKE_SYSTEM_NAME=Android \
        ..
```
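The configuration block above is elided in this diff; a minimal sketch of such an invocation, assuming the `armeabi-v7a` standalone toolchain built earlier (the `ANDROID_STANDALONE_TOOLCHAIN` and `ANDROID_ABI` variable names are taken from `android.cmake` and should be checked against your version):
```bash
cmake -DCMAKE_SYSTEM_NAME=Android \
      -DANDROID_STANDALONE_TOOLCHAIN=your/path/to/arm_standalone_toolchain \
      -DANDROID_ABI=armeabi-v7a \
      -DUSE_EIGEN_FOR_BLAS=ON \
      -DWITH_C_API=ON \
      -DWITH_SWIG_PY=OFF \
      ..
```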
Users can also set other build parameters according to their own needs. For example, to minimize the size of the generated library, set `CMAKE_BUILD_TYPE` to `MinSizeRel`; for the fastest execution speed, set `CMAKE_BUILD_TYPE` to `Release`. You can also influence PaddlePaddle's build by manually setting `CMAKE_C/CXX_FLAGS`.
**Performance tips**: to achieve the fastest computation speed, the following CMake settings are recommended:
- Set `CMAKE_BUILD_TYPE` to `Release`
- Use the `clang` toolchain
- For `armeabi-v7a`, set `USE_EIGEN_FOR_BLAS=ON` to use Eigen for matrix computation; for `arm64-v8a`, set `USE_EIGEN_FOR_BLAS=OFF` to use OpenBLAS
......
# Build PaddlePaddle for Android # Build PaddlePaddle for Android
There are two approaches to build PaddlePaddle for Android:
- [Cross-Compiling Using Docker](#cross-compiling-using-docker)
- [Cross-Compiling on Linux](#cross-compiling-on-linux)
## Cross-Compiling Using Docker
...@@ -16,6 +19,12 @@ $ cd Paddle ...@@ -16,6 +19,12 @@ $ cd Paddle
$ docker build -t paddle:dev-android . -f Dockerfile.android
```
Users can directly use the published Docker image.
```bash
$ docker pull paddlepaddle/paddle:latest-dev-android
```
### Build the Inference Library
We can run the Docker image we just created to build the inference library of PaddlePaddle for Android using the command below:
...@@ -47,7 +56,7 @@ The Docker image accepts two arguments `ANDROID_ABI` and `ANDROID_API`: ...@@ -47,7 +56,7 @@ The Docker image accepts two arguments `ANDROID_ABI` and `ANDROID_API`:
</tr>
<tr class="row-odd">
<td>ANDROID_API</td>
<td>>= 16</td>
<td>21</td>
</tr>
</tbody>
...@@ -93,15 +102,13 @@ Android NDK includes everything we need to build the [*standalone toolchain*](ht ...@@ -93,15 +102,13 @@ Android NDK includes everything we need to build the [*standalone toolchain*](ht
The generated standalone toolchain will be in `your/path/to/arm64_standalone_toolchain`.
**Please be aware that the minimum level of Android API required by PaddlePaddle is 21.**
### Cross-Compiling Arguments
CMake supports [choosing the toolchain](https://cmake.org/cmake/help/v3.0/manual/cmake-toolchains.7.html#cross-compiling). PaddlePaddle provides [`android.cmake`](https://github.com/PaddlePaddle/Paddle/blob/develop/cmake/cross_compiling/android.cmake), which configures the Android cross-compiling toolchain for CMake. `android.cmake` is not required for CMake >= 3.7, which supports Android cross-compiling natively. PaddlePaddle detects the CMake version; for versions newer than 3.7, it uses [the official support](https://cmake.org/cmake/help/v3.7/manual/cmake-toolchains.7.html#cross-compiling).
Some other CMake arguments you need to know:
- `CMAKE_SYSTEM_NAME` must be `Android`. This tells PaddlePaddle's CMake system to cross-compile third-party dependencies. This also changes some other CMake arguments like `WITH_GPU=OFF`, `WITH_AVX=OFF`, `WITH_PYTHON=OFF`, `WITH_RDMA=OFF`, `WITH_MKL=OFF` and `WITH_GOLANG=OFF`.
- `WITH_C_API` must be `ON`, to build the C-based inference library for Android.
- `WITH_SWIG_PY` must be `OFF` because the Android platform doesn't support SWIG-based API.
...@@ -123,7 +130,7 @@ Some Android-specific arguments: ...@@ -123,7 +130,7 @@ Some Android-specific arguments:
Other useful arguments:
- `USE_EIGEN_FOR_BLAS`: indicates whether to use Eigen for matrix computation. Could be `ON` or `OFF`, defaults to `OFF`.
- `HOST_C/CXX_COMPILER`: specifies the host compilers, which are used to build the host-specific protoc and the target-specific OpenBLAS. They default to the values of the environment variables `CC/CXX`, or `cc/c++`.
Some frequent configurations for your reference:
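The configuration block itself is elided in this diff; a minimal sketch for `arm64-v8a`, under the same assumptions as the Chinese guide above (`ANDROID_STANDALONE_TOOLCHAIN` comes from `android.cmake`):
```bash
cmake -DCMAKE_SYSTEM_NAME=Android \
      -DANDROID_STANDALONE_TOOLCHAIN=your/path/to/arm64_standalone_toolchain \
      -DANDROID_ABI=arm64-v8a \
      -DUSE_EIGEN_FOR_BLAS=OFF \
      -DWITH_C_API=ON \
      -DWITH_SWIG_PY=OFF \
      ..
```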
...@@ -158,6 +165,7 @@ There are some other arguments you might want to configure. ...@@ -158,6 +165,7 @@ There are some other arguments you might want to configure.
- `CMAKE_BUILD_TYPE=Release` optimizes the runtime performance.
Our own tips for performance optimization are to use clang and Eigen or OpenBLAS:
- `CMAKE_BUILD_TYPE=Release`
- `ANDROID_TOOLCHAIN=clang`
- `USE_EIGEN_FOR_BLAS=ON` for `armeabi-v7a`, or `USE_EIGEN_FOR_BLAS=OFF` for `arm64-v8a`.
......
# Build PaddlePaddle for iOS
This tutorial will walk you through cross-compiling the PaddlePaddle library for iOS from source on macOS.
...@@ -98,7 +98,7 @@ You can set other compiling parameters for your own needs, e.g. if you are trying
- set `CMAKE_BUILD_TYPE` with `Release`
- set `IOS_USE_VECLIB_FOR_BLAS` with `ON`
## Build and install
After CMake finishes, run the following commands; PaddlePaddle will download and compile the third-party dependencies, then compile and install the PaddlePaddle inference library.
...@@ -109,7 +109,7 @@ $ make install ...@@ -109,7 +109,7 @@ $ make install
Please note: if you have compiled PaddlePaddle in the source directory for other platforms before, remove the `third_party` and `build` directories within the source tree with `rm -rf` to ensure that all third-party dependencies and PaddlePaddle itself are freshly compiled with the current CMake configuration.
`your/path/to/install` will contain the following directories after `make install`:
- `include`, contains all the C-API header files.
- `lib`, contains the PaddlePaddle C-API static library.
......
...@@ -24,6 +24,7 @@ else() ...@@ -24,6 +24,7 @@ else()
add_subdirectory(framework) add_subdirectory(framework)
add_subdirectory(operators) add_subdirectory(operators)
add_subdirectory(pybind) add_subdirectory(pybind)
add_subdirectory(inference)
endif() endif()
if(WITH_SWIG_PY) if(WITH_SWIG_PY)
......
...@@ -168,3 +168,13 @@ paddle_error paddle_gradient_machine_get_layer_output( ...@@ -168,3 +168,13 @@ paddle_error paddle_gradient_machine_get_layer_output(
out->args.push_back(layerOutput); out->args.push_back(layerOutput);
return kPD_NO_ERROR; return kPD_NO_ERROR;
} }
paddle_error paddle_gradient_machine_release_layer_output(
paddle_gradient_machine machine) {
auto m = cast(machine);
if (m == nullptr || m->machine == nullptr) {
return kPD_NULLPTR;
}
m->machine->releaseOutput();
return kPD_NO_ERROR;
}
...@@ -113,6 +113,14 @@ paddle_gradient_machine_get_layer_output(paddle_gradient_machine machine, ...@@ -113,6 +113,14 @@ paddle_gradient_machine_get_layer_output(paddle_gradient_machine machine,
const char* layerName, const char* layerName,
paddle_arguments args); paddle_arguments args);
/**
* @brief Release the middle layer's output memory of the gradient machine.
 * @param [in] machine the gradient machine that has run an inference
* @return paddle_error
*/
PD_API paddle_error
paddle_gradient_machine_release_layer_output(paddle_gradient_machine machine);
#ifdef __cplusplus #ifdef __cplusplus
} }
#endif #endif
......
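A minimal usage sketch of the new call (error handling elided; `machine` is assumed to be a gradient machine that has already run an inference):
```c
paddle_error err = paddle_gradient_machine_release_layer_output(machine);
if (err != kPD_NO_ERROR) {
  /* the machine handle was NULL or not initialized */
}
```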
...@@ -26,10 +26,15 @@ nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor) ...@@ -26,10 +26,15 @@ nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor)
cc_test(variable_test SRCS variable_test.cc) cc_test(variable_test SRCS variable_test.cc)
cc_library(scope SRCS scope.cc DEPS glog) cc_library(threadpool SRCS threadpool.cc)
cc_test(threadpool_test SRCS threadpool_test.cc DEPS threadpool)
cc_library(scope SRCS scope.cc DEPS glog threadpool)
cc_test(scope_test SRCS scope_test.cc DEPS scope) cc_test(scope_test SRCS scope_test.cc DEPS scope)
cc_library(data_transform SRCS data_transform.cc DEPS math_function tensor framework_proto) cc_library(device_data_transform SRCS device_data_transform.cc DEPS tensor)
cc_library(data_transform SRCS data_transform.cc DEPS math_function tensor framework_proto selected_rows device_data_transform)
cc_test(data_transform_test SRCS data_transform_test.cc DEPS data_transform device_context) cc_test(data_transform_test SRCS data_transform_test.cc DEPS data_transform device_context)
cc_library(attribute SRCS attribute.cc DEPS framework_proto) cc_library(attribute SRCS attribute.cc DEPS framework_proto)
...@@ -38,7 +43,7 @@ device_context) ...@@ -38,7 +43,7 @@ device_context)
cc_library(op_proto_maker SRCS op_proto_maker.cc DEPS framework_proto attribute) cc_library(op_proto_maker SRCS op_proto_maker.cc DEPS framework_proto attribute)
cc_test(op_proto_maker_test SRCS op_proto_maker_test.cc DEPS op_proto_maker) cc_test(op_proto_maker_test SRCS op_proto_maker_test.cc DEPS op_proto_maker)
cc_library(op_info SRCS op_info.cc DEPS attribute framework_proto) cc_library(op_info SRCS op_info.cc DEPS attribute framework_proto)
cc_library(shape_inference SRCS shape_inference.cc DEPS ddim attribute) cc_library(shape_inference SRCS shape_inference.cc DEPS ddim attribute device_context)
cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog
shape_inference data_transform) shape_inference data_transform)
cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry init) cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry init)
...@@ -70,9 +75,10 @@ cc_test(var_type_inference_test SRCS var_type_inference_test.cc DEPS op_registry ...@@ -70,9 +75,10 @@ cc_test(var_type_inference_test SRCS var_type_inference_test.cc DEPS op_registry
cc_library(selected_rows SRCS selected_rows.cc DEPS tensor) cc_library(selected_rows SRCS selected_rows.cc DEPS tensor)
cc_test(selected_rows_test SRCS selected_rows_test.cc DEPS selected_rows) cc_test(selected_rows_test SRCS selected_rows_test.cc DEPS selected_rows)
cc_library(threadpool SRCS threadpool.cc) cc_library(init SRCS init.cc DEPS gflags device_context place stringpiece operator)
cc_test(threadpool_test SRCS threadpool_test.cc DEPS threadpool)
cc_library(init SRCS init.cc DEPS gflags device_context place stringpiece)
cc_test(init_test SRCS init_test.cc DEPS init) cc_test(init_test SRCS init_test.cc DEPS init)
cc_test(op_kernel_type_test SRCS op_kernel_type_test.cc DEPS place device_context framework_proto) cc_test(op_kernel_type_test SRCS op_kernel_type_test.cc DEPS place device_context framework_proto)
cc_test(cow_ptr_tests SRCS details/cow_ptr_test.cc)
nv_test(device_data_transform_test SRCS device_data_transform_test.cu
DEPS operator op_registry init math_function)
...@@ -427,7 +427,8 @@ std::vector<std::unique_ptr<OpDesc>> MakeBlockBackward( ...@@ -427,7 +427,8 @@ std::vector<std::unique_ptr<OpDesc>> MakeBlockBackward(
VLOG(5) << "Making backward " << (*it)->Type() << " op"; VLOG(5) << "Making backward " << (*it)->Type() << " op";
std::vector<std::unique_ptr<OpDesc>> op_grads; std::vector<std::unique_ptr<OpDesc>> op_grads;
if ((*it)->Type() == "recurrent" || (*it)->Type() == "while") { if ((*it)->Type() == "recurrent" || (*it)->Type() == "while" ||
(*it)->Type() == "parallel_do") {
int step_block_idx = (*it)->GetBlockAttr("sub_block"); int step_block_idx = (*it)->GetBlockAttr("sub_block");
BlockDesc* backward_block = CreateStepBlock(program_desc, no_grad_vars, BlockDesc* backward_block = CreateStepBlock(program_desc, no_grad_vars,
grad_to_var, step_block_idx); grad_to_var, step_block_idx);
......
...@@ -53,12 +53,12 @@ VarDesc *BlockDesc::FindVarRecursive(const std::string &name) const { ...@@ -53,12 +53,12 @@ VarDesc *BlockDesc::FindVarRecursive(const std::string &name) const {
return it->second.get(); return it->second.get();
} }
VarDesc *BlockDesc::FindRecursiveOrCreateVar(const std::string &name_bytes) { VarDesc &BlockDesc::FindRecursiveOrCreateVar(const std::string &name_bytes) {
VarDesc *res = FindVarRecursive(name_bytes); VarDesc *res = FindVarRecursive(name_bytes);
if (res == nullptr) { if (res == nullptr) {
res = Var(name_bytes); res = Var(name_bytes);
} }
return res; return *res;
} }
bool BlockDesc::HasVarRecursive(const std::string &name) const { bool BlockDesc::HasVarRecursive(const std::string &name) const {
......
...@@ -57,7 +57,7 @@ class BlockDesc { ...@@ -57,7 +57,7 @@ class BlockDesc {
VarDesc *FindVarRecursive(const std::string &name_bytes) const; VarDesc *FindVarRecursive(const std::string &name_bytes) const;
VarDesc *FindRecursiveOrCreateVar(const std::string &name_bytes); VarDesc &FindRecursiveOrCreateVar(const std::string &name_bytes);
bool HasVarRecursive(const std::string &var_name) const; bool HasVarRecursive(const std::string &var_name) const;
......
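A short sketch of what the new reference-returning signature enables (the `SetType` call is an assumption about the surrounding `VarDesc` API):
```cpp
// No null check needed: the variable is created if it does not exist.
VarDesc &var = block->FindRecursiveOrCreateVar("hidden_grad");
var.SetType(proto::VarDesc::LOD_TENSOR);
```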
...@@ -11,9 +11,12 @@ distributed under the License is distributed on an "AS IS" BASIS, ...@@ -11,9 +11,12 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <functional>
#include "paddle/framework/data_transform.h" #include "paddle/framework/data_transform.h"
#include "paddle/framework/device_data_transform.h"
#include "paddle/framework/lod_tensor.h" #include "paddle/framework/lod_tensor.h"
#include "paddle/framework/selected_rows.h"
#include "paddle/platform/device_context.h" #include "paddle/platform/device_context.h"
namespace paddle { namespace paddle {
...@@ -24,6 +27,37 @@ DataTransformFnMap& DataTransformFnMap::Instance() { ...@@ -24,6 +27,37 @@ DataTransformFnMap& DataTransformFnMap::Instance() {
return data_transform_map; return data_transform_map;
} }
Tensor* DataTransform(const OpKernelType& expected_kernel_type,
const OpKernelType& kernel_type_for_var,
const Tensor& input_tensor) {
Tensor* out = nullptr;
if (!platform::is_same_place(kernel_type_for_var.place_,
expected_kernel_type.place_)) {
out = DeviceTransform(input_tensor, expected_kernel_type.place_);
}
PADDLE_ENFORCE_NOT_NULL(out, "out should not be null");
return out;
}
void CopyVariableWithTensor(const Variable& in_var, const Tensor& tensor,
Variable& out_var) {
if (in_var.IsType<LoDTensor>()) {
auto& in_lod_tensor = in_var.Get<LoDTensor>();
auto* tran_lod_tensor = out_var.GetMutable<LoDTensor>();
tran_lod_tensor->set_lod(in_lod_tensor.lod());
tran_lod_tensor->set_layout(in_lod_tensor.layout());
tran_lod_tensor->ShareDataWith(tensor);
} else if (in_var.IsType<SelectedRows>()) {
auto& in_selected_rows = in_var.Get<SelectedRows>();
auto* trans_selected_rows = out_var.GetMutable<SelectedRows>();
trans_selected_rows->set_height(in_selected_rows.height());
trans_selected_rows->set_rows(in_selected_rows.rows());
trans_selected_rows->mutable_value()->ShareDataWith(tensor);
} else {
PADDLE_THROW("unknown var type");
}
}
auto KernelFP32 = OpKernelType(proto::DataType::FP32, platform::CPUPlace(), auto KernelFP32 = OpKernelType(proto::DataType::FP32, platform::CPUPlace(),
DataLayout::kNHWC, LibraryType::kPlain); DataLayout::kNHWC, LibraryType::kPlain);
...@@ -36,6 +70,28 @@ auto KernelNHWC = OpKernelType(proto::DataType::FP64, platform::CPUPlace(), ...@@ -36,6 +70,28 @@ auto KernelNHWC = OpKernelType(proto::DataType::FP64, platform::CPUPlace(),
auto KernelNCHW = OpKernelType(proto::DataType::FP64, platform::CPUPlace(), auto KernelNCHW = OpKernelType(proto::DataType::FP64, platform::CPUPlace(),
DataLayout::kNCHW, LibraryType::kPlain); DataLayout::kNCHW, LibraryType::kPlain);
// TODO(dzhwinter): Only for testing multiple op kernel.
// Dummy transform function for library_type
// should be removed.
auto KernelPlain = OpKernelType(proto::DataType::FP32, platform::CUDAPlace(0),
DataLayout::kAnyLayout, LibraryType::kPlain);
auto KernelCUDNN = OpKernelType(proto::DataType::FP32, platform::CUDAPlace(0),
DataLayout::kAnyLayout, LibraryType::kCUDNN);
void DummyTrans(const platform::DeviceContext* ctx,
const KernelTypePair& kernel_pair, const Variable& in,
Variable* out) {
PADDLE_ENFORCE(in.IsType<Tensor>(), "Only Support Tensor transform!.");
PADDLE_ENFORCE(
platform::places_are_same_class(kernel_pair.first.place_,
kernel_pair.second.place_),
"TransDataType Only Support DataType transform on same place!");
auto src = in.Get<Tensor>();
auto* dst = out->GetMutable<Tensor>();
*dst = src;
}
void TransDataType(const platform::DeviceContext* ctx, void TransDataType(const platform::DeviceContext* ctx,
const KernelTypePair& kernel_pair, const Variable& in, const KernelTypePair& kernel_pair, const Variable& in,
Variable* out) { Variable* out) {
...@@ -74,35 +130,36 @@ void TransDataType(const platform::DeviceContext* ctx, ...@@ -74,35 +130,36 @@ void TransDataType(const platform::DeviceContext* ctx,
} }
} }
void TransDataLayout(const platform::DeviceContext* ctx, void TransDataLayout(const std::vector<int>& axis,
const platform::DeviceContext* ctx,
const KernelTypePair& kernel_pair, const Variable& in, const KernelTypePair& kernel_pair, const Variable& in,
Variable* out) { Variable* out) {
PADDLE_ENFORCE(in.IsType<Tensor>(), "Only Support Tensor transform!."); PADDLE_ENFORCE(in.IsType<Tensor>(), "Only support Tensor transform!.");
PADDLE_ENFORCE( PADDLE_ENFORCE(
platform::places_are_same_class(kernel_pair.first.place_, platform::places_are_same_class(kernel_pair.first.place_,
kernel_pair.second.place_), kernel_pair.second.place_),
"TransDataType Only Support DataType transform on same place!"); "TransDataLayout only support DataLayout transform on same place!");
PADDLE_ENFORCE(kernel_pair.first.data_type_ == kernel_pair.second.data_type_,
"TransDataLayout only support Datatype are same!");
auto src = in.Get<Tensor>(); auto src = in.Get<Tensor>();
auto* dst = out->GetMutable<Tensor>(); auto* dst = out->GetMutable<Tensor>();
PADDLE_ENFORCE(arity(src.dims()) == 4, "Input Arity Only Support 4!");
auto src_dim = src.dims(); auto src_dim = src.dims();
dst->Resize(src_dim);
auto place = kernel_pair.second.place_;
CopyFrom(src, place, *ctx, dst);
const std::vector<int> axis = {0, 2, 3, 1};
std::vector<int64_t> dst_dim; std::vector<int64_t> dst_dim;
dst_dim.resize(axis.size()); dst_dim.resize(axis.size());
for (size_t i = 0; i < axis.size(); i++) { for (size_t i = 0; i < axis.size(); i++) {
dst_dim[i] = src_dim[axis[i]]; dst_dim[i] = src_dim[axis[i]];
} }
dst->Resize(make_ddim(dst_dim)); dst->Resize(make_ddim(dst_dim));
auto place = kernel_pair.second.place_;
dst->mutable_data(place, src.type());
auto src_type = kernel_pair.first.data_type_; auto src_type = kernel_pair.first.data_type_;
framework::VisitDataType(src_type, CastDataLayout(src, dst, ctx, axis)); framework::VisitDataType(src_type, CastDataLayout(ctx, axis, src, dst));
dst->set_layout(kernel_pair.second.data_layout_); dst->set_layout(kernel_pair.second.data_layout_);
} }
...@@ -111,5 +168,24 @@ void TransDataLayout(const platform::DeviceContext* ctx, ...@@ -111,5 +168,24 @@ void TransDataLayout(const platform::DeviceContext* ctx,
} // namespace paddle } // namespace paddle
namespace f = paddle::framework; namespace f = paddle::framework;
namespace {
std::vector<int> NHWC2NCHW = {0, 3, 1, 2};
std::vector<int> NCHW2NHWC = {0, 2, 3, 1};
}
REGISTER_DATA_TRANSFORM_FN(f::KernelFP32, f::KernelFP64, f::TransDataType); REGISTER_DATA_TRANSFORM_FN(f::KernelFP32, f::KernelFP64, f::TransDataType);
REGISTER_DATA_TRANSFORM_FN(f::KernelNHWC, f::KernelNCHW, f::TransDataLayout); REGISTER_DATA_TRANSFORM_FN(f::KernelPlain, f::KernelCUDNN, f::DummyTrans);
REGISTER_DATA_TRANSFORM_FN(f::KernelCUDNN, f::KernelPlain, f::DummyTrans);
REGISTER_DATA_TRANSFORM_FN(f::KernelNHWC, f::KernelNCHW,
std::bind(f::TransDataLayout, NHWC2NCHW,
std::placeholders::_1,
std::placeholders::_2,
std::placeholders::_3,
std::placeholders::_4));
REGISTER_DATA_TRANSFORM_FN(f::KernelNCHW, f::KernelNHWC,
std::bind(f::TransDataLayout, NCHW2NHWC,
std::placeholders::_1,
std::placeholders::_2,
std::placeholders::_3,
std::placeholders::_4));
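To make the registrations above concrete, the bound axis vector permutes the dimensions of a 4-D tensor; this is consistent with the layout test later in this change, where an NHWC shape `{2, 3, 1, 2}` becomes NCHW `{2, 2, 3, 1}`:
```cpp
// Sketch of the permutation TransDataLayout applies to the shape.
std::vector<int64_t> src_dim = {2, 3, 1, 2};   // NHWC
std::vector<int> axis = {0, 3, 1, 2};          // NHWC2NCHW
std::vector<int64_t> dst_dim(axis.size());
for (size_t i = 0; i < axis.size(); ++i) {
  dst_dim[i] = src_dim[axis[i]];               // {2, 2, 3, 1}, i.e. NCHW
}
```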
...@@ -19,6 +19,7 @@ limitations under the License. */ ...@@ -19,6 +19,7 @@ limitations under the License. */
#include <vector> #include <vector>
#include "paddle/framework/op_kernel_type.h" #include "paddle/framework/op_kernel_type.h"
#include "paddle/framework/selected_rows.h"
#include "paddle/framework/tensor.h" #include "paddle/framework/tensor.h"
#include "paddle/framework/variable.h" #include "paddle/framework/variable.h"
#include "paddle/operators/math/math_function.h" #include "paddle/operators/math/math_function.h"
...@@ -49,6 +50,13 @@ struct KernelTypePairHash { ...@@ -49,6 +50,13 @@ struct KernelTypePairHash {
} }
}; };
Tensor* DataTransform(const OpKernelType& expected_kernel_type,
const OpKernelType& kernel_type_for_var,
const Tensor& input_tensor);
void CopyVariableWithTensor(const Variable& in_var, const Tensor& tensor,
Variable& out_var);
template <typename InType, typename OutType> template <typename InType, typename OutType>
struct CastDataTypeFunctor { struct CastDataTypeFunctor {
HOSTDEVICE inline OutType operator()(InType in) const { HOSTDEVICE inline OutType operator()(InType in) const {
...@@ -73,22 +81,23 @@ struct CastDataType { ...@@ -73,22 +81,23 @@ struct CastDataType {
auto numel = in_.numel(); auto numel = in_.numel();
auto* in_end = in_begin + numel; auto* in_end = in_begin + numel;
auto* out_begin = out_->mutable_data<OutType>(place); auto* out_begin = out_->mutable_data<OutType>(place);
if (platform::is_cpu_place(place)) { if (platform::is_cpu_place(place)) {
platform::Transform<platform::CPUDeviceContext> trans; platform::Transform<platform::CPUDeviceContext> trans;
auto* context = static_cast<const platform::CPUDeviceContext*>(ctx_); auto* context = static_cast<const platform::CPUDeviceContext*>(ctx_);
trans(*context, in_begin, in_end, out_begin, trans(*context, in_begin, in_end, out_begin,
CastDataTypeFunctor<InType, OutType>()); CastDataTypeFunctor<InType, OutType>());
} else { } else {
// TODO(dzhwinter): enhance CopyFrom CPU<->GPU with different data type? // TODO(dzhwinter): enhance Copy CPU<->GPU with different data type?
PADDLE_THROW("Unsupport CPU <-> GPU!"); PADDLE_THROW("Unsupport CPU <-> GPU!");
} }
} }
}; };
struct CastDataLayout { struct CastDataLayout {
CastDataLayout(const framework::Tensor& in, framework::Tensor* out, CastDataLayout(const platform::DeviceContext* ctx,
const platform::DeviceContext* ctx, const std::vector<int>& axis, const framework::Tensor& in,
const std::vector<int>& axis) framework::Tensor* out)
: in_(in), out_(out), ctx_(ctx), axis_(axis) {} : in_(in), out_(out), ctx_(ctx), axis_(axis) {}
const framework::Tensor in_; const framework::Tensor in_;
framework::Tensor* out_; framework::Tensor* out_;
...@@ -98,6 +107,7 @@ struct CastDataLayout { ...@@ -98,6 +107,7 @@ struct CastDataLayout {
template <typename T> template <typename T>
void operator()() { void operator()() {
auto place = ctx_->GetPlace(); auto place = ctx_->GetPlace();
if (platform::is_cpu_place(place)) { if (platform::is_cpu_place(place)) {
operators::math::Transpose<platform::CPUDeviceContext, T, 4> trans4; operators::math::Transpose<platform::CPUDeviceContext, T, 4> trans4;
auto* context = static_cast<const platform::CPUDeviceContext*>(ctx_); auto* context = static_cast<const platform::CPUDeviceContext*>(ctx_);
......
...@@ -106,7 +106,7 @@ TEST(DataTransform, Register) { ...@@ -106,7 +106,7 @@ TEST(DataTransform, Register) {
ASSERT_EQ(test_value, 2); ASSERT_EQ(test_value, 2);
} }
TEST(DataTransform, Layout) { TEST(DataTransform, DataLayout) {
using namespace paddle::framework; using namespace paddle::framework;
using namespace paddle::platform; using namespace paddle::platform;
...@@ -127,7 +127,19 @@ TEST(DataTransform, Layout) { ...@@ -127,7 +127,19 @@ TEST(DataTransform, Layout) {
} }
Tensor dst = out.Get<Tensor>(); Tensor dst = out.Get<Tensor>();
EXPECT_TRUE(dst.layout() != src->layout());
EXPECT_TRUE(dst.layout() == DataLayout::kNCHW);
EXPECT_TRUE(dst.dims() == make_ddim({2, 2, 3, 1}));
{
auto kernel1 = GenFromBit({1, 0, 1, 0});
auto kernel2 = GenFromBit({1, 0, 0, 0});
auto pair0 = std::make_pair(kernel1, kernel2);
instance.Get(pair0)(ctx, pair0, out, &in);
}
EXPECT_TRUE(src->layout() == DataLayout::kNHWC);
EXPECT_TRUE(src->dims() == make_ddim({2, 3, 1, 2}));
} }
TEST(DataTransform, DataType) { TEST(DataTransform, DataType) {
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <memory>
#include <thread>
namespace paddle {
namespace framework {
namespace details {
// Change it to thread safe flags if needed.
class ThreadUnsafeOwnershipFlags {
public:
ThreadUnsafeOwnershipFlags(bool flag) : flag_(flag) {}
ThreadUnsafeOwnershipFlags(const ThreadUnsafeOwnershipFlags& other) = delete;
ThreadUnsafeOwnershipFlags& operator=(
const ThreadUnsafeOwnershipFlags& other) = delete;
ThreadUnsafeOwnershipFlags(ThreadUnsafeOwnershipFlags&& other) = default;
void SetOwnership(bool flag) { flag_ = flag; }
// Invoke the callback if it is not owned.
template <typename Callback>
void AcquireOwnershipOnce(Callback acquire) {
if (!flag_) {
acquire();
flag_ = true;
}
}
private:
bool flag_;
};
// Copy-On-Write pointer.
// It will hold a T* pointer, and only copy once when `MutableData` is invoked.
//
// The template parameter OwnershipFlags should have:
// * a constructor takes a bool. True if own.
// * SetOwnership(bool flag).
// * AcquireOwnershipOnce(Callback). It will invoke the callback if it is not
// owned.
//
// https://en.wikipedia.org/wiki/Copy-on-write
template <typename T, typename OwnershipFlags = ThreadUnsafeOwnershipFlags>
class COWPtr {
public:
// Ctor from raw pointer.
explicit COWPtr(T* ptr) : payload_(ptr), ownership_{true} {}
// Move methods. Steal ownership from origin
COWPtr(COWPtr&& other)
: payload_(other.payload_), ownership_{std::move(other.ownership_)} {}
COWPtr& operator=(COWPtr&& origin) = default;
// Copy methods. Not own payload
COWPtr(const COWPtr& other) : payload_(other.payload_), ownership_{false} {}
COWPtr& operator=(const COWPtr& other) {
payload_ = other.payload_;
ownership_.SetOwnership(false);
return *this;
}
// Access read only data.
const T& Data() const { return *payload_; }
// Access mutable data. If the data is not owned, the data will be copied
// before.
T* MutableData() {
ownership_.AcquireOwnershipOnce(
[this] { payload_.reset(new T(*payload_)); });
return payload_.get();
}
private:
// Actual data pointer.
std::shared_ptr<T> payload_;
// Ownership flag.
OwnershipFlags ownership_;
};
} // namespace details
} // namespace framework
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/framework/details/cow_ptr.h"
#include "gtest/gtest.h"
namespace paddle {
namespace framework {
namespace details {
TEST(COWPtr, all) {
COWPtr<int> ptr(new int{0});
ASSERT_EQ(ptr.Data(), 0);
COWPtr<int> ptr2 = ptr;
ASSERT_EQ(ptr2.Data(), 0);
ASSERT_EQ(&ptr2.Data(), &ptr.Data());
*ptr2.MutableData() = 10;
ASSERT_EQ(ptr.Data(), 0);
ASSERT_EQ(ptr2.Data(), 10);
}
} // namespace details
} // namespace framework
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/framework/device_data_transform.h"
namespace paddle {
namespace framework {
static const platform::DeviceContext* GetDeviceContext(
const platform::Place& src_place, const platform::Place& dst_place) {
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
if (platform::is_gpu_place(src_place) && platform::is_cpu_place(dst_place)) {
return pool.Get(src_place);
} else if (platform::is_cpu_place(src_place) &&
platform::is_gpu_place(dst_place)) {
return pool.Get(dst_place);
} else {
PADDLE_THROW(
"Currently, model parallelism is only supported between CPU and CUDA");
}
}
Tensor* DeviceTransform(const Tensor& in, const platform::Place& dst_place) {
VLOG(3) << "DeviceTransform in, src_place " << in.place()
<< " dst_place: " << dst_place;
Tensor* out = new Tensor();
auto* dev_ctx = GetDeviceContext(in.place(), dst_place);
dev_ctx->Wait();
Copy(in, dst_place, *dev_ctx, out);
dev_ctx->Wait();
return out;
}
} // namespace framework
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/framework/lod_tensor.h"
#include "paddle/framework/tensor.h"
#include "paddle/framework/tensor_util.h"
#include "paddle/platform/device_context.h"
namespace paddle {
namespace framework {
Tensor* DeviceTransform(const Tensor& in, const platform::Place& dst_place);
} // namespace framework
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "gtest/gtest.h"
#include "paddle/framework/init.h"
#include "paddle/framework/lod_tensor.h"
#include "paddle/framework/op_info.h"
#include "paddle/framework/op_registry.h"
#include "paddle/operators/elementwise_op_function.h"
#include "paddle/operators/math/math_function.h"
#include "paddle/platform/device_context.h"
namespace paddle {
namespace framework {
template <typename T>
struct AddFunctor {
inline HOSTDEVICE T operator()(T a, T b) const { return a + b; }
};
class OpKernelTestProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
public:
OpKernelTestProtoAndCheckerMaker(OpProto* proto, OpAttrChecker* op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("input", "input1 of test op");
AddOutput("output", "output of test op");
AddAttr<bool>("use_gpu", "force to use gpu kernel").SetDefault(false);
AddComment("This is test op");
}
};
class TestOpWithKernel : public OperatorWithKernel {
public:
using OperatorWithKernel::OperatorWithKernel;
protected:
void InferShape(framework::InferShapeContext* ctx) const override {}
OpKernelType GetExpectedKernelType(
const ExecutionContext& ctx) const override {
if (Attr<bool>("use_gpu")) {
VLOG(3) << "force use gpu kernel";
return OpKernelType(proto::DataType::FP32, platform::CUDAPlace(0));
} else {
VLOG(3) << "use default kernel";
return OpKernelType(proto::DataType::FP32,
ctx.Input<Tensor>("input")->place());
}
}
};
template <typename DeviceContext, typename T>
class TestKernel : public OpKernel<float> {
public:
void Compute(const ExecutionContext& ctx) const {
std::cout << ctx.op().DebugString() << std::endl;
const Tensor* input = ctx.Input<Tensor>("input");
std::cout << "input place:" << input->place() << std::endl;
auto* output = ctx.Output<framework::LoDTensor>("output");
output->Resize(input->dims());
output->mutable_data<T>(ctx.GetPlace());
operators::TransformFunctor<AddFunctor<T>, T, DeviceContext> functor(
input, input, output, ctx.template device_context<DeviceContext>(),
AddFunctor<T>());
functor.Run();
}
};
} // namespace framework
} // namespace paddle
REGISTER_OP_WITHOUT_GRADIENT(
test_op, paddle::framework::TestOpWithKernel,
paddle::framework::OpKernelTestProtoAndCheckerMaker);
REGISTER_OP_CPU_KERNEL(
test_op,
paddle::framework::TestKernel<paddle::platform::CPUDeviceContext, float>);
REGISTER_OP_CUDA_KERNEL(
test_op,
paddle::framework::TestKernel<paddle::platform::CUDADeviceContext, float>);
static void BuildVar(const std::string& param_name,
std::initializer_list<const char*> arguments,
paddle::framework::proto::OpDesc::Var* var) {
var->set_parameter(param_name);
for (auto& arg_name : arguments) {
*var->mutable_arguments()->Add() = arg_name;
}
}
TEST(Operator, CPUtoGPU) {
using namespace paddle::framework;
using namespace paddle::platform;
ASSERT_EQ(InitDevices({"CPU", "GPU:0"}), true);
paddle::framework::Scope scope;
paddle::platform::CPUPlace cpu_place;
// create an op to run on CPU
paddle::framework::proto::OpDesc cpu_op_desc;
cpu_op_desc.set_type("test_op");
BuildVar("input", {"IN1"}, cpu_op_desc.add_inputs());
BuildVar("output", {"OUT1"}, cpu_op_desc.add_outputs());
auto cpu_op = paddle::framework::OpRegistry::CreateOp(cpu_op_desc);
// prepare input
auto* in_t = scope.Var("IN1")->GetMutable<LoDTensor>();
auto* src_ptr = in_t->mutable_data<float>({2, 3}, CPUPlace());
for (int i = 0; i < 2 * 3; ++i) {
src_ptr[i] = static_cast<float>(i);
}
// get output
auto* output = scope.Var("OUT1");
cpu_op->Run(scope, cpu_place);
auto* output_ptr = output->Get<LoDTensor>().data<float>();
for (int i = 0; i < 2 * 3; ++i) {
ASSERT_EQ(output_ptr[i], static_cast<float>(i) * 2);
}
// create an op to run on GPU
paddle::framework::proto::OpDesc gpu_op_desc;
gpu_op_desc.set_type("test_op");
BuildVar("input", {"OUT1"}, gpu_op_desc.add_inputs());
BuildVar("output", {"OUT2"}, gpu_op_desc.add_outputs());
auto attr = gpu_op_desc.mutable_attrs()->Add();
attr->set_name("use_gpu");
attr->set_type(paddle::framework::proto::AttrType::BOOLEAN);
attr->set_b(true);
auto gpu_op = paddle::framework::OpRegistry::CreateOp(gpu_op_desc);
paddle::platform::CUDAPlace cuda_place(0);
// get output
auto* output2 = scope.Var("OUT2");
gpu_op->Run(scope, cuda_place);
// auto* output2_ptr = output2->Get<LoDTensor>().data<float>();
DeviceContextPool& pool = DeviceContextPool::Instance();
auto dev_ctx = pool.Get(cuda_place);
paddle::framework::Tensor output_tensor;
Copy(output2->Get<LoDTensor>(), paddle::platform::CPUPlace(), *dev_ctx,
&output_tensor);
dev_ctx->Wait();
float* output2_ptr = output_tensor.data<float>();
for (int i = 0; i < 2 * 3; ++i) {
ASSERT_EQ(output2_ptr[i], static_cast<float>(i) * 4);
}
}
...@@ -21,6 +21,7 @@ limitations under the License. */ ...@@ -21,6 +21,7 @@ limitations under the License. */
#include "paddle/framework/lod_rank_table.h" #include "paddle/framework/lod_rank_table.h"
#include "paddle/framework/lod_tensor_array.h" #include "paddle/framework/lod_tensor_array.h"
#include "paddle/framework/op_registry.h" #include "paddle/framework/op_registry.h"
#include "paddle/platform/place.h"
DEFINE_bool(check_nan_inf, false, DEFINE_bool(check_nan_inf, false,
"Checking whether operator produce NAN/INF or not. It will be " "Checking whether operator produce NAN/INF or not. It will be "
...@@ -49,10 +50,13 @@ static void CreateTensor(Variable* var, proto::VarDesc::VarType var_type) { ...@@ -49,10 +50,13 @@ static void CreateTensor(Variable* var, proto::VarDesc::VarType var_type) {
var->GetMutable<LoDRankTable>(); var->GetMutable<LoDRankTable>();
} else if (var_type == proto::VarDesc::LOD_TENSOR_ARRAY) { } else if (var_type == proto::VarDesc::LOD_TENSOR_ARRAY) {
var->GetMutable<LoDTensorArray>(); var->GetMutable<LoDTensorArray>();
} else if (var_type == proto::VarDesc::PLACE_LIST) {
var->GetMutable<platform::PlaceList>();
} else { } else {
PADDLE_THROW( PADDLE_THROW(
"Variable type %d is not in " "Variable type %d is not in "
"[LoDTensor, SelectedRows, FEED_MINIBATCH, FETCH_LIST, LOD_RANK_TABLE]", "[LoDTensor, SelectedRows, FEED_MINIBATCH, FETCH_LIST, LOD_RANK_TABLE,"
" PLACE_LIST]",
var_type); var_type);
} }
} }
...@@ -111,7 +115,7 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id, ...@@ -111,7 +115,7 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
for (auto& op_desc : block.AllOps()) { for (auto& op_desc : block.AllOps()) {
auto op = paddle::framework::OpRegistry::CreateOp(*op_desc); auto op = paddle::framework::OpRegistry::CreateOp(*op_desc);
VLOG(3) << op->DebugString(); VLOG(3) << op->DebugStringEx(local_scope);
op->Run(*local_scope, place_); op->Run(*local_scope, place_);
if (FLAGS_check_nan_inf) { if (FLAGS_check_nan_inf) {
for (auto& vname : op->OutputVars(true)) { for (auto& vname : op->OutputVars(true)) {
......
...@@ -123,6 +123,7 @@ message VarDesc { ...@@ -123,6 +123,7 @@ message VarDesc {
STEP_SCOPES = 5; STEP_SCOPES = 5;
LOD_RANK_TABLE = 6; LOD_RANK_TABLE = 6;
LOD_TENSOR_ARRAY = 7; LOD_TENSOR_ARRAY = 7;
PLACE_LIST = 8;
} }
required string name = 1; required string name = 1;
required VarType type = 2; required VarType type = 2;
......
...@@ -15,6 +15,7 @@ limitations under the License. */ ...@@ -15,6 +15,7 @@ limitations under the License. */
#include <string> #include <string>
#include "paddle/framework/init.h" #include "paddle/framework/init.h"
#include "paddle/framework/operator.h"
#include "paddle/platform/device_context.h" #include "paddle/platform/device_context.h"
#include "paddle/platform/place.h" #include "paddle/platform/place.h"
#include "paddle/string/piece.h" #include "paddle/string/piece.h"
...@@ -24,7 +25,6 @@ namespace framework { ...@@ -24,7 +25,6 @@ namespace framework {
std::once_flag gflags_init_flag; std::once_flag gflags_init_flag;
// TODO(qijun) move init gflags to init.cc
void InitGflags(std::vector<std::string> &argv) { void InitGflags(std::vector<std::string> &argv) {
std::call_once(gflags_init_flag, [&]() { std::call_once(gflags_init_flag, [&]() {
int argc = argv.size(); int argc = argv.size();
...@@ -72,8 +72,14 @@ bool InitDevices(const std::vector<std::string> &devices) { ...@@ -72,8 +72,14 @@ bool InitDevices(const std::vector<std::string> &devices) {
LOG(WARNING) << "Not specified CPU device, create CPU by Default."; LOG(WARNING) << "Not specified CPU device, create CPU by Default.";
} }
platform::DeviceContextPool::Init(places); platform::DeviceContextPool::Init(places);
// framework::UseALL();
return true; return true;
} }
void InitGLOG(const std::string &prog_name) {
google::InitGoogleLogging(prog_name.c_str());
google::InstallFailureSignalHandler();
}
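
Taken together, these helpers give a conventional startup sequence: gflags first, then glog (which now also installs the failure signal handler), then the device pool. A sketch, where the `"CPU"` device string is an assumption based on the warning branch above:

```cpp
// Startup sketch; the exact device-string format is an assumption here.
int main(int argc, char** argv) {
  std::vector<std::string> args(argv, argv + argc);
  paddle::framework::InitGflags(args);
  paddle::framework::InitGLOG(argv[0]);
  paddle::framework::InitDevices({"CPU"});
  return 0;
}
```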
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
...@@ -22,6 +22,8 @@ namespace framework { ...@@ -22,6 +22,8 @@ namespace framework {
void InitGflags(std::vector<std::string> &argv); void InitGflags(std::vector<std::string> &argv);
void InitGLOG(const std::string &prog_name);
bool InitDevices(const std::vector<std::string> &devices); bool InitDevices(const std::vector<std::string> &devices);
} // namespace framework } // namespace framework
......
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#include <cctype>
namespace paddle { namespace paddle {
namespace framework { namespace framework {
...@@ -41,6 +42,9 @@ inline std::string LibraryTypeToString(const LibraryType& library_type) { ...@@ -41,6 +42,9 @@ inline std::string LibraryTypeToString(const LibraryType& library_type) {
inline LibraryType StringToLibraryType(const char* ctype) { inline LibraryType StringToLibraryType(const char* ctype) {
std::string s(ctype); std::string s(ctype);
for (size_t i = 0; i < s.size(); ++i) {
s[i] = toupper(s[i]);
}
if (s == std::string("PLAIN")) { if (s == std::string("PLAIN")) {
return LibraryType::kPlain; return LibraryType::kPlain;
} else if (s == std::string("MKLDNN")) { } else if (s == std::string("MKLDNN")) {
......
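
The upper-casing loop makes `StringToLibraryType` case-insensitive, so spellings like `"mkldnn"` or `"Plain"` now resolve to the right enum. A standalone illustration of the idiom, independent of the framework (the extra `unsigned char` cast avoids undefined behavior for negative `char` values, a pitfall the minimal loop above tolerates):

```cpp
#include <cctype>
#include <iostream>
#include <string>

// Upper-case the input once, then compare against canonical names.
std::string ToUpper(std::string s) {
  for (size_t i = 0; i < s.size(); ++i) {
    s[i] = static_cast<char>(std::toupper(static_cast<unsigned char>(s[i])));
  }
  return s;
}

int main() {
  std::cout << (ToUpper("mkldnn") == "MKLDNN") << "\n";  // prints 1
  std::cout << (ToUpper("Plain") == "PLAIN") << "\n";    // prints 1
}
```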
...@@ -43,16 +43,20 @@ std::ostream &operator<<(std::ostream &os, const LoD &lod) { ...@@ -43,16 +43,20 @@ std::ostream &operator<<(std::ostream &os, const LoD &lod) {
return os; return os;
} }
-LoD SliceLevels(const LoD &in, size_t level_begin, size_t level_end) {
-  LoD new_lod;
-  new_lod.reserve(level_end - level_begin);
-  for (size_t i = level_begin; i < level_end; i++) {
-    new_lod.emplace_back(in.at(i));
-  }
-  // transform the lowest level to absolute offset.
-  LoD abs_offset_lod = ToAbsOffset(in);
-  new_lod.back() = abs_offset_lod[level_end - 1];
-  return new_lod;
-}
+std::ostream &operator<<(std::ostream &os, const LoDTensor &t) {
+  PADDLE_ENFORCE(platform::is_cpu_place(t.place()));
+  PADDLE_ENFORCE(t.type().hash_code() == typeid(float).hash_code());
+
+  os << "dim: " << t.dims() << "\n";
+  os << "lod: " << t.lod() << "\n";
+
+  // only print the first ten elements
+  int64_t size = t.numel() < 10 ? t.numel() : 10;
+  for (int64_t i = 0; i < size; ++i) {
+    os << t.data<float>()[i] << " ";
+  }
+
+  return os;
+}
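
A usage fragment for the new `operator<<`; per the enforces above it only accepts CPU-resident `float` tensors, and it prints the dims, the LoD, and at most the first ten elements:

```cpp
// Sketch: print a small CPU float LoDTensor with the new operator<<.
paddle::framework::LoDTensor t;
t.Resize(paddle::framework::make_ddim({2, 3}));
float* p = t.mutable_data<float>(paddle::platform::CPUPlace());
for (int i = 0; i < 6; ++i) p[i] = static_cast<float>(i);
std::cout << t;  // dims, lod, then "0 1 2 3 4 5 "
```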
LoD SliceInLevel(const LoD &in, size_t level, size_t elem_begin, LoD SliceInLevel(const LoD &in, size_t level, size_t elem_begin,
...@@ -115,43 +119,6 @@ bool operator==(const LoD &a, const LoD &b) { ...@@ -115,43 +119,6 @@ bool operator==(const LoD &a, const LoD &b) {
return true; return true;
} }
size_t LoDTensor::NumElements(size_t level, size_t idx) const {
PADDLE_ENFORCE_LT(level, NumLevels());
PADDLE_ENFORCE_LT(idx, NumElements(level));
return lod_[level][idx + 1] - lod_[level][idx];
}
size_t LoDTensor::NumInstancesInElement(size_t level, size_t idx) const {
PADDLE_ENFORCE_LT(level, NumLevels());
PADDLE_ENFORCE_LT(idx, NumElements(level));
auto abs_lod = ToAbsOffset(lod());
size_t begin = abs_lod[level][idx];
size_t end = abs_lod[level][idx + 1];
return end - begin;
}
void LoDTensor::ShrinkLevels(size_t level_begin, size_t level_end) {
auto new_lod = framework::SliceLevels(lod_, level_begin, level_end);
lod_ = new_lod;
}
void LoDTensor::ShrinkInLevel(size_t level, size_t elem_begin,
size_t elem_end) {
PADDLE_ENFORCE_LT(level, NumLevels());
PADDLE_ENFORCE_LT(elem_begin, NumElements(level));
PADDLE_ENFORCE_LT(elem_end, NumElements(level) + 1);
auto abs_lod = framework::ToAbsOffset(lod());
auto new_lod = framework::SliceInLevel(lod_, level, elem_begin, elem_end);
lod_ = new_lod;
// slice the underlying tensor
size_t begin = abs_lod[level][elem_begin];
size_t end = abs_lod[level][elem_end];
PADDLE_ENFORCE_LT(begin, end, "Cannot shrink, the result tensor is empty.");
ShareDataWith(Slice(begin, end));
}
using LoDAndOffset = std::pair<LoD, std::pair<size_t, size_t>>; using LoDAndOffset = std::pair<LoD, std::pair<size_t, size_t>>;
LoDAndOffset GetSubLoDAndAbsoluteOffset(const LoD &lod, size_t start_idx, LoDAndOffset GetSubLoDAndAbsoluteOffset(const LoD &lod, size_t start_idx,
size_t end_idx, size_t start_level) { size_t end_idx, size_t start_level) {
...@@ -177,6 +144,9 @@ void AppendLoD(LoD *lod, const LoD &lod_length) { ...@@ -177,6 +144,9 @@ void AppendLoD(LoD *lod, const LoD &lod_length) {
lod->empty() || lod->size() == lod_length.size(), lod->empty() || lod->size() == lod_length.size(),
"The lod_length should has the same size with the appended lod."); "The lod_length should has the same size with the appended lod.");
if (lod->empty()) { if (lod->empty()) {
for (size_t i = 0; i < lod_length.size(); ++i) {
lod->emplace_back(1, 0); // size = 1, value = 0;
}
*lod = LoD(lod_length.size(), std::vector<size_t>({0})); *lod = LoD(lod_length.size(), std::vector<size_t>({0}));
} }
for (size_t i = 0; i < lod->size(); ++i) { for (size_t i = 0; i < lod->size(); ++i) {
...@@ -214,9 +184,10 @@ void SerializeToStream(std::ostream &os, const LoDTensor &tensor, ...@@ -214,9 +184,10 @@ void SerializeToStream(std::ostream &os, const LoDTensor &tensor,
SerializeToStream(os, static_cast<Tensor>(tensor), dev_ctx); SerializeToStream(os, static_cast<Tensor>(tensor), dev_ctx);
} }
-void DeserializeFromStream(std::istream &is, LoDTensor *tensor) {
+void DeserializeFromStream(std::istream &is, LoDTensor *tensor,
+                           const platform::DeviceContext &dev_ctx) {
   {
-    // the 1st field, uint32_t version for SelectedRows
+    // the 1st field, uint32_t version for LoDTensor
     uint32_t version;
     is.read(reinterpret_cast<char *>(&version), sizeof(version));
     PADDLE_ENFORCE_EQ(version, 0U, "Only version 0 is supported");
@@ -237,7 +208,71 @@ void DeserializeFromStream(std::istream &is, LoDTensor *tensor) {
   }
   }
   // the 3rd field, Tensor
-  DeserializeFromStream(is, static_cast<Tensor *>(tensor));
+  DeserializeFromStream(is, static_cast<Tensor *>(tensor), dev_ctx);
}
std::vector<LoDTensor> LoDTensor::SplitLoDTensor(
const std::vector<platform::Place> places) const {
check_memory_size();
// PADDLE_ENFORCE(lod().empty() || (lod().size() == 1 && lod()[0].empty())
// , "Disable parallel lod for now");
PADDLE_ENFORCE(lod().empty(), "Disable parallel lod for now");
PADDLE_ENFORCE(dims()[0] % places.size() == 0,
"Batch size should be divided by places size");
std::vector<LoDTensor> lods;
for (size_t place_idx = 0; place_idx < places.size(); ++place_idx) {
size_t begin = place_idx * dims()[0] / places.size();
size_t end = (place_idx + 1) * dims()[0] / places.size();
auto src = Slice(static_cast<int>(begin), static_cast<int>(end));
LoDTensor dst;
dst.Resize(src.dims());
auto &dst_place = places[place_idx];
auto dst_ptr = dst.mutable_data(dst_place, src.type());
// TODO(tonyyang-svail):
// change the following to framework::Copy
auto src_place = src.place();
auto src_ptr = src.data<void>();
auto size = src.numel() * SizeOfType(src.type());
if (platform::is_cpu_place(src_place) &&
platform::is_cpu_place(dst_place)) {
memory::Copy(boost::get<platform::CPUPlace>(dst_place), dst_ptr,
boost::get<platform::CPUPlace>(src_place), src_ptr, size);
} else {
PADDLE_THROW("Not Implemented");
}
lods.emplace_back(dst);
}
return lods;
}
void LoDTensor::MergeLoDTensor(
const std::vector<const LoDTensor *> &lod_tensors, platform::Place place) {
PADDLE_ENFORCE(platform::is_cpu_place(place));
PADDLE_ENFORCE(!lod_tensors.empty());
framework::DDim new_dim = lod_tensors[0]->dims();
std::type_index new_type = lod_tensors[0]->type();
for (auto *lod : lod_tensors) {
PADDLE_ENFORCE(new_dim == lod->dims());
PADDLE_ENFORCE(new_type == lod->type());
PADDLE_ENFORCE(platform::is_cpu_place(lod->place()));
}
new_dim[0] *= lod_tensors.size();
Resize(new_dim);
auto *dst_ptr = reinterpret_cast<uint8_t *>(mutable_data(place, new_type));
for (auto *src : lod_tensors) {
auto size = src->numel() * SizeOfType(src->type());
memory::Copy(boost::get<platform::CPUPlace>(place), dst_ptr,
boost::get<platform::CPUPlace>(src->place()),
src->data<void>(), size);
dst_ptr += size;
}
} }
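
A round-trip sketch for the two new methods, under their current restrictions (CPU places, empty LoD, batch dimension divisible by the number of places):

```cpp
namespace f = paddle::framework;
namespace p = paddle::platform;

f::LoDTensor batch;
batch.Resize(f::make_ddim({4, 3}));
batch.mutable_data<float>(p::CPUPlace());

// Two CPU places -> two slices of 4 / 2 = 2 rows each.
std::vector<p::Place> places = {p::CPUPlace(), p::CPUPlace()};
std::vector<f::LoDTensor> pieces = batch.SplitLoDTensor(places);

// Merging multiplies dims()[0] by the piece count: back to {4, 3}.
f::LoDTensor merged;
merged.MergeLoDTensor({&pieces[0], &pieces[1]}, p::CPUPlace());
```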
} // namespace framework } // namespace framework
......
...@@ -58,14 +58,7 @@ using Vector = thrust::host_vector< ...@@ -58,14 +58,7 @@ using Vector = thrust::host_vector<
using LoD = std::vector<Vector<size_t>>; using LoD = std::vector<Vector<size_t>>;
std::ostream& operator<<(std::ostream& os, const LoD& lod); std::ostream& operator<<(std::ostream& os, const LoD& lod);
std::ostream& operator<<(std::ostream& os, const LoDTensor& t);
/*
* Slice levels from a LoD.
* NOTE the lowest level should always be the absolute offsets of the underlying
* tensor instances. So if higher layers are sliced without the lowest level,
* the lower level of the sliced LoD will be transformed to the absolute offset.
*/
LoD SliceLevels(const LoD& in, size_t level_begin, size_t level_end);
LoD SliceInLevel(const LoD& in, size_t level, size_t elem_begin, LoD SliceInLevel(const LoD& in, size_t level, size_t elem_begin,
size_t elem_end); size_t elem_end);
...@@ -115,34 +108,11 @@ class LoDTensor : public Tensor { ...@@ -115,34 +108,11 @@ class LoDTensor : public Tensor {
return (lod_)[level].size() - 1; return (lod_)[level].size() - 1;
} }
-  /*
-   * Number of lower-level elements.
-   * For example, a 2-level lod-tensor
-   *
-   * 0-th level   |   |
-   * 1-th level  ||  |||
-   *
-   * NumElements(0, 0) get 2
-   * NumElements(0, 1) get 3
-   */
-  size_t NumElements(size_t level, size_t idx) const;
-
-  /*
-   * Get the number of instances in the underlying tensor in the `idx`-th
-   * element.
-   */
-  size_t NumInstancesInElement(size_t level, size_t idx) const;
-
-  /*
-   * Shrink levels[level_begin:level_end]
-   */
-  void ShrinkLevels(size_t level_begin, size_t level_end);
-
-  /*
-   * Shrink elements of a level, [elem_begin: elem_end]
-   * @note: low performance in slice lod_.
-   */
-  void ShrinkInLevel(size_t level, size_t elem_begin, size_t elem_end);
+  std::vector<LoDTensor> SplitLoDTensor(
+      const std::vector<platform::Place> places) const;
+
+  void MergeLoDTensor(const std::vector<const LoDTensor*>& lod_tensors,
+                      platform::Place place);
private: private:
LoD lod_; LoD lod_;
...@@ -177,8 +147,8 @@ LoDTensor LodExpand(const LoDTensor& source, const LoD& lod, size_t level, ...@@ -177,8 +147,8 @@ LoDTensor LodExpand(const LoDTensor& source, const LoD& lod, size_t level,
for (size_t ins = 0; ins < num_instances; ins++) { for (size_t ins = 0; ins < num_instances; ins++) {
for (size_t elem = lod_level[ins]; elem < lod_level[ins + 1]; elem++) { for (size_t elem = lod_level[ins]; elem < lod_level[ins + 1]; elem++) {
auto slice = tensor.Slice(elem, elem + 1); auto slice = tensor.Slice(elem, elem + 1);
CopyFrom(source.Slice(ins, ins + 1), platform::CPUPlace(), Copy(source.Slice(ins, ins + 1), platform::CPUPlace(),
platform::CPUDeviceContext(), &slice); platform::CPUDeviceContext(), &slice);
} }
} }
return tensor; return tensor;
...@@ -208,7 +178,8 @@ void AppendLoD(LoD* lod, const LoD& lod_length); ...@@ -208,7 +178,8 @@ void AppendLoD(LoD* lod, const LoD& lod_length);
*/ */
void SerializeToStream(std::ostream& os, const LoDTensor& tensor, void SerializeToStream(std::ostream& os, const LoDTensor& tensor,
const platform::DeviceContext& dev_ctx); const platform::DeviceContext& dev_ctx);
-void DeserializeFromStream(std::istream& is, LoDTensor* tensor);
+void DeserializeFromStream(std::istream& is, LoDTensor* tensor,
+                           const platform::DeviceContext& dev_ctx);
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
...@@ -54,92 +54,6 @@ class LoDTensorTester : public ::testing::Test { ...@@ -54,92 +54,6 @@ class LoDTensorTester : public ::testing::Test {
LoDTensor lod_tensor_; LoDTensor lod_tensor_;
}; };
TEST_F(LoDTensorTester, NumLevels) { ASSERT_EQ(lod_tensor_.NumLevels(), 3UL); }
TEST_F(LoDTensorTester, NumElements) {
ASSERT_EQ(lod_tensor_.NumElements(0), 2UL);
ASSERT_EQ(lod_tensor_.NumElements(1), 3UL);
ASSERT_EQ(lod_tensor_.NumElements(2), 8UL);
}
TEST_F(LoDTensorTester, NumElements2) {
ASSERT_EQ(lod_tensor_.NumElements(0, 0), 2UL);
ASSERT_EQ(lod_tensor_.NumElements(0, 1), 1UL);
ASSERT_EQ(lod_tensor_.NumElements(1, 1), 3UL);
}
TEST_F(LoDTensorTester, ShrinkLevels) {
// slice 1 level
for (size_t level = 0; level < 3UL; ++level) {
LoDTensor new_lod_tensor = lod_tensor_;
new_lod_tensor.ShrinkLevels(level, level + 1);
ASSERT_EQ(new_lod_tensor.NumLevels(), 1UL);
ASSERT_EQ(new_lod_tensor.data<float>(), lod_tensor_.data<float>());
}
// shrink 2 level
for (size_t level = 0; level < 2UL; ++level) {
LoDTensor new_lod_tensor = lod_tensor_;
new_lod_tensor.ShrinkLevels(level, level + 2);
// the lowest level's last element should be the tensor's batch_size.
ASSERT_EQ(new_lod_tensor.lod().back().back(),
lod_tensor_.lod().back().back());
ASSERT_EQ(new_lod_tensor.NumLevels(), 2UL);
ASSERT_EQ(new_lod_tensor.data<float>(), lod_tensor_.data<float>());
}
}
TEST_F(LoDTensorTester, ShrinkInLevel) {
size_t level = 0;
LoDTensor new_lod_tensor = lod_tensor_;
new_lod_tensor.ShrinkInLevel(level, 0, 1);
ASSERT_EQ(new_lod_tensor.NumLevels(), 3UL);
ASSERT_EQ(new_lod_tensor.NumElements(0), 1UL);
ASSERT_EQ(new_lod_tensor.NumElements(1), 2UL);
ASSERT_EQ(new_lod_tensor.NumElements(2), 5UL);
ASSERT_EQ(new_lod_tensor.dims()[0], 12);
for (int i = 0; i < 12 * 128; i++) {
ASSERT_EQ(new_lod_tensor.data<float>()[i], i);
}
level = 1;
new_lod_tensor = lod_tensor_;
new_lod_tensor.ShrinkInLevel(level, 1, 2);
ASSERT_EQ(new_lod_tensor.NumLevels(), 2UL);
ASSERT_EQ(new_lod_tensor.NumElements(0), 1UL);
ASSERT_EQ(new_lod_tensor.NumElements(1), 3UL);
ASSERT_EQ(new_lod_tensor.dims()[0], 7);
for (int i = 5 * 128; i < 12 * 128; i++) {
ASSERT_EQ(new_lod_tensor.data<float>()[i - 5 * 128], i);
}
LoDTensor t1;
t1.set_lod(lod_tensor_.lod());
t1.ShareDataWith(lod_tensor_);
LoDTensor t2;
t2.set_lod(lod_tensor_.lod());
t2.ShareDataWith(lod_tensor_);
t1.ShrinkInLevel(0, 1, 2);
t2.ShrinkInLevel(0, 0, 1);
EXPECT_NE(t1.data<float>(), t2.data<float>());
EXPECT_NE(t1.data<float>(), lod_tensor_.data<float>());
}
TEST_F(LoDTensorTester, SerializeAndDeserialize) {
LoDTensor dst_tensor;
platform::CPUDeviceContext cpu_ctx((platform::CPUPlace()));
std::ostringstream oss;
SerializeToStream(oss, lod_tensor_, cpu_ctx);
std::istringstream iss(oss.str());
DeserializeFromStream(iss, &dst_tensor);
float* dst_ptr = dst_tensor.mutable_data<float>(platform::CPUPlace());
for (int i = 0; i < kLodTensorSize; ++i) {
EXPECT_EQ(dst_ptr[i], i);
}
EXPECT_EQ(dst_tensor.lod(), lod_tensor_.lod());
}
TEST(LodExpand, test) { TEST(LodExpand, test) {
LoD lod{{0, 2}}; LoD lod{{0, 2}};
LoDTensor tensor; LoDTensor tensor;
......
...@@ -64,8 +64,9 @@ class CompileTimeInferShapeContext : public InferShapeContext { ...@@ -64,8 +64,9 @@ class CompileTimeInferShapeContext : public InferShapeContext {
PADDLE_ENFORCE_EQ(in_var->GetType(), proto::VarDesc::LOD_TENSOR, PADDLE_ENFORCE_EQ(in_var->GetType(), proto::VarDesc::LOD_TENSOR,
"The %d-th output of Output(%s) must be LoDTensor.", j, "The %d-th output of Output(%s) must be LoDTensor.", j,
out); out);
out_var->SetLoDLevel(in_var->GetLodLevel()); out_var->SetLoDLevel(in_var->GetLoDLevel());
} }
bool IsRuntime() const override; bool IsRuntime() const override;
protected: protected:
...@@ -383,7 +384,7 @@ void OpDesc::InferVarType(BlockDesc *block) const { ...@@ -383,7 +384,7 @@ void OpDesc::InferVarType(BlockDesc *block) const {
for (auto &out_pair : this->outputs_) { for (auto &out_pair : this->outputs_) {
for (auto &out_var_name : out_pair.second) { for (auto &out_var_name : out_pair.second) {
block->FindRecursiveOrCreateVar(out_var_name) block->FindRecursiveOrCreateVar(out_var_name)
->SetType(proto::VarDesc::LOD_TENSOR); .SetType(proto::VarDesc::LOD_TENSOR);
} }
} }
} }
......
...@@ -26,13 +26,12 @@ namespace framework { ...@@ -26,13 +26,12 @@ namespace framework {
struct OpKernelType { struct OpKernelType {
struct Hash { struct Hash {
size_t operator()(const OpKernelType& key) const { size_t operator()(const OpKernelType& key) const {
-      int place = key.place_.which() + (1 << LEFT_SHIFT);
-      int data_type =
-          static_cast<int>(key.data_type_) + (1 << (LEFT_SHIFT + 1));
-      int data_layout =
-          static_cast<int>(key.data_layout_) + (1 << (LEFT_SHIFT + 2));
-      int library_type =
-          static_cast<int>(key.library_type_) + (1 << (LEFT_SHIFT + 3));
+      int place = key.place_.which();
+      int data_type = static_cast<int>(key.data_type_) << LEFT_SHIFT;
+      int data_layout = static_cast<int>(key.data_layout_) << (LEFT_SHIFT * 2);
+      int library_type = static_cast<int>(key.library_type_)
+                         << (LEFT_SHIFT * 3);
std::hash<int> hasher; std::hash<int> hasher;
return hasher(place + data_type + data_layout + library_type); return hasher(place + data_type + data_layout + library_type);
} }
......
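
The rewritten hash packs the four components into disjoint bit fields of one `int` before hashing, instead of adding overlapping offsets. A standalone illustration; the field width here is an assumption, the real `LEFT_SHIFT` just has to keep the ranges disjoint:

```cpp
#include <functional>
#include <iostream>

constexpr int kLeftShift = 4;  // assumed field width for illustration

size_t HashKernelKey(int place, int data_type, int data_layout,
                     int library_type) {
  // Each component occupies its own bit range, so distinct keys cannot
  // collide by addition the way the old "+ (1 << ...)" scheme could.
  int packed = place + (data_type << kLeftShift) +
               (data_layout << (kLeftShift * 2)) +
               (library_type << (kLeftShift * 3));
  return std::hash<int>()(packed);
}

int main() {
  std::cout << (HashKernelKey(0, 1, 0, 0) != HashKernelKey(1, 0, 0, 0))
            << "\n";  // 1: fields no longer alias each other
}
```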
...@@ -37,8 +37,8 @@ class Registrar { ...@@ -37,8 +37,8 @@ class Registrar {
public: public:
// In our design, various kinds of classes, e.g., operators and kernels, // In our design, various kinds of classes, e.g., operators and kernels,
// have their corresponding registry and registrar. The action of // have their corresponding registry and registrar. The action of
  // registration is in the constructor of a global registrar variable, which
  // is not used in the code that calls the framework package, and would
// be removed from the generated binary file by the linker. To avoid such // be removed from the generated binary file by the linker. To avoid such
// removal, we add Touch to all registrar classes and make USE_OP macros to // removal, we add Touch to all registrar classes and make USE_OP macros to
// call this method. So, as long as the callee code calls USE_OP, the global // call this method. So, as long as the callee code calls USE_OP, the global
......
...@@ -12,13 +12,16 @@ ...@@ -12,13 +12,16 @@
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/framework/op_registry.h" #include <glog/logging.h>
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include "paddle/framework/op_registry.h"
namespace pd = paddle::framework; namespace pd = paddle::framework;
namespace paddle { namespace paddle {
namespace framework { namespace framework {
class CosineOp : public OperatorBase { class CosineOp : public OperatorBase {
public: public:
using OperatorBase::OperatorBase; using OperatorBase::OperatorBase;
...@@ -215,7 +218,7 @@ class OpWithKernelTest : public OperatorWithKernel { ...@@ -215,7 +218,7 @@ class OpWithKernelTest : public OperatorWithKernel {
protected: protected:
void InferShape(InferShapeContext* ctx) const override {} void InferShape(InferShapeContext* ctx) const override {}
framework::OpKernelType GetActualKernelType( framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override { const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(proto::DataType::FP32, ctx.device_context()); return framework::OpKernelType(proto::DataType::FP32, ctx.device_context());
} }
...@@ -252,7 +255,6 @@ TEST(OperatorRegistrar, CPU) { ...@@ -252,7 +255,6 @@ TEST(OperatorRegistrar, CPU) {
op->Run(scope, cpu_place); op->Run(scope, cpu_place);
} }
#ifdef PADDLE_WITH_CUDA
TEST(OperatorRegistrar, CUDA) { TEST(OperatorRegistrar, CUDA) {
paddle::framework::proto::OpDesc op_desc; paddle::framework::proto::OpDesc op_desc;
paddle::platform::CUDAPlace cuda_place(0); paddle::platform::CUDAPlace cuda_place(0);
...@@ -263,4 +265,127 @@ TEST(OperatorRegistrar, CUDA) { ...@@ -263,4 +265,127 @@ TEST(OperatorRegistrar, CUDA) {
op->Run(scope, cuda_place); op->Run(scope, cuda_place);
} }
#endif
static int op_test_value = 0;
using paddle::platform::DeviceContext;
using paddle::platform::CPUDeviceContext;
using paddle::platform::CUDADeviceContext;
namespace paddle {
namespace framework {
class OpWithMultiKernelTest : public OperatorWithKernel {
public:
using OperatorWithKernel::OperatorWithKernel;
protected:
void InferShape(InferShapeContext* ctx) const override {}
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(
proto::DataType::FP32, platform::CUDAPlace(0), DataLayout::kAnyLayout,
framework::LibraryType::kCUDNN);
}
};
template <typename DeviceContext, typename T>
class OpMultiKernelTest : public paddle::framework::OpKernel<T> {
public:
void Compute(const paddle::framework::ExecutionContext& ctx) const;
};
template <typename T>
class OpMultiKernelTest<CPUDeviceContext, T>
: public paddle::framework::OpKernel<T> {
public:
void Compute(const paddle::framework::ExecutionContext& ctx) const {
++op_test_value;
}
};
template <typename T>
class OpMultiKernelTest<CUDADeviceContext, T>
: public paddle::framework::OpKernel<T> {
public:
void Compute(const paddle::framework::ExecutionContext& ctx) const {
--op_test_value;
}
};
template <typename DeviceContext, typename T>
class OpMultiKernelTest2 : public paddle::framework::OpKernel<T> {
public:
void Compute(const paddle::framework::ExecutionContext& ctx) const;
};
template <typename T>
class OpMultiKernelTest2<CPUDeviceContext, T>
: public paddle::framework::OpKernel<T> {
public:
void Compute(const paddle::framework::ExecutionContext& ctx) const {
op_test_value += 10;
}
};
template <typename T>
class OpMultiKernelTest2<CUDADeviceContext, T>
: public paddle::framework::OpKernel<T> {
public:
void Compute(const paddle::framework::ExecutionContext& ctx) const {
op_test_value -= 10;
}
};
} // namespace framework
} // namespace paddle
REGISTER_OP_WITHOUT_GRADIENT(op_with_multi_kernel,
paddle::framework::OpWithMultiKernelTest,
paddle::framework::OpKernelTestMaker);
REGISTER_OP_KERNEL(
op_with_multi_kernel, CPU, paddle::platform::CPUPlace,
paddle::framework::OpMultiKernelTest<CPUDeviceContext, float>);
REGISTER_OP_KERNEL(
op_with_multi_kernel, MKLDNN, paddle::platform::CPUPlace,
paddle::framework::OpMultiKernelTest2<CPUDeviceContext, float>);
REGISTER_OP_KERNEL(
op_with_multi_kernel, CUDA, paddle::platform::CUDAPlace,
paddle::framework::OpMultiKernelTest<CUDADeviceContext, float>);
REGISTER_OP_KERNEL(
op_with_multi_kernel, CUDNN, paddle::platform::CUDAPlace,
paddle::framework::OpMultiKernelTest2<CUDADeviceContext, float>);
TEST(OperatorRegistrar, OpWithMultiKernel) {
paddle::framework::proto::OpDesc op_desc;
paddle::platform::CUDAPlace cuda_place(0);
paddle::platform::CPUPlace cpu_place;
paddle::framework::Scope scope;
op_desc.set_type("op_with_multi_kernel");
auto op = paddle::framework::OpRegistry::CreateOp(op_desc);
// TODO(qiao) add priority back
// use all available kernels
paddle::framework::UseALL();
op->Run(scope, cuda_place);
EXPECT_EQ(op_test_value, -10);
// remove cuda kernels
paddle::framework::UseCPU();
op->Run(scope, cpu_place);
EXPECT_EQ(op_test_value, -9);
// add cuda kernels
paddle::framework::UseCUDA();
op->Run(scope, cuda_place);
EXPECT_EQ(op_test_value, -10);
// use cudnn kernel
paddle::framework::UseCUDNN();
op->Run(scope, cuda_place);
EXPECT_EQ(op_test_value, -20);
}
...@@ -11,13 +11,13 @@ distributed under the License is distributed on an "AS IS" BASIS, ...@@ -11,13 +11,13 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <glog/logging.h>
#include <algorithm> #include <algorithm>
#include <atomic>
#include "paddle/framework/data_transform.h" #include "paddle/framework/data_transform.h"
#include "paddle/framework/device_data_transform.h"
#include "paddle/framework/executor.h" #include "paddle/framework/executor.h"
#include "paddle/framework/lod_tensor_array.h"
#include "paddle/framework/operator.h" #include "paddle/framework/operator.h"
#include "paddle/framework/shape_inference.h" #include "paddle/framework/shape_inference.h"
#include "paddle/framework/var_type.h" #include "paddle/framework/var_type.h"
...@@ -25,6 +25,66 @@ limitations under the License. */ ...@@ -25,6 +25,66 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace framework { namespace framework {
std::vector<std::tuple<platform::Place, LibraryType>> kKernelPriority;
void UseCPU() {
kKernelPriority.clear();
/*Plain CPU*/
auto pair0 = std::make_tuple(platform::CPUPlace(), LibraryType::kPlain);
kKernelPriority.insert(kKernelPriority.begin(), pair0);
}
void UseMKLDNN() {
UseCPU();
#if PADDLE_WITH_MKLML
{
/*MKLDNN Kernel*/
auto pair0 = std::make_tuple(platform::CPUPlace(), LibraryType::kMKLDNN);
kKernelPriority.insert(kKernelPriority.begin(), pair0);
}
#endif
}
void UseCUDA() {
UseMKLDNN();
#if PADDLE_WITH_CUDA
/*Plain GPU*/
auto pair0 = std::make_tuple(platform::CUDAPlace(0), LibraryType::kPlain);
kKernelPriority.insert(kKernelPriority.begin(), pair0);
#endif
}
void UseCUDNN() {
UseCUDA();
#if PADDLE_WITH_CUDA
if (platform::dynload::HasCUDNN()) {
/*CUDNN Kernel*/
auto pair0 = std::make_tuple(platform::CUDAPlace(0), LibraryType::kCUDNN);
kKernelPriority.insert(kKernelPriority.begin(), pair0);
}
#endif
}
void UseALL() {
UseCPU();
UseMKLDNN();
UseCUDA();
UseCUDNN();
}
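
Each `Use*` call prepends to `kKernelPriority`, so the front of the vector is the most preferred `(place, library)` pair and each helper inherits everything the previous one set up. A fragment sketch of inspecting the list:

```cpp
// Sketch: after UseCUDNN(), the priority is roughly
// cudnn > plain CUDA > MKLDNN > plain CPU (subject to build flags).
paddle::framework::UseCUDNN();
for (auto& candidate : paddle::framework::kKernelPriority) {
  std::cout << paddle::framework::LibraryTypeToString(std::get<1>(candidate))
            << "\n";
}
```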
static DDim GetDims(const Scope& scope, const std::string& name) {
Variable* var = scope.FindVar(name);
if (var == nullptr) {
return DDim({-1});
} else if (var->IsType<LoDTensor>()) {
return var->Get<LoDTensor>().dims();
} else if (var->IsType<SelectedRows>()) {
return var->Get<SelectedRows>().GetCompleteDims();
} else {
return DDim({-1});
}
}
std::string OperatorBase::Input(const std::string& name) const { std::string OperatorBase::Input(const std::string& name) const {
auto& ins = Inputs(name); auto& ins = Inputs(name);
PADDLE_ENFORCE_LE(ins.size(), 1UL, PADDLE_ENFORCE_LE(ins.size(), 1UL,
...@@ -57,7 +117,7 @@ const std::vector<std::string>& OperatorBase::Outputs( ...@@ -57,7 +117,7 @@ const std::vector<std::string>& OperatorBase::Outputs(
return it->second; return it->second;
} }
std::string OperatorBase::DebugString() const { std::string OperatorBase::DebugStringEx(const Scope* scope) const {
std::stringstream ss; std::stringstream ss;
ss << "Op(" << type_ << "), inputs:{"; ss << "Op(" << type_ << "), inputs:{";
for (auto it = inputs_.begin(); it != inputs_.end();) { for (auto it = inputs_.begin(); it != inputs_.end();) {
...@@ -65,6 +125,9 @@ std::string OperatorBase::DebugString() const { ...@@ -65,6 +125,9 @@ std::string OperatorBase::DebugString() const {
ss << input.first << "["; ss << input.first << "[";
for (size_t i = 0; i < input.second.size(); ++i) { for (size_t i = 0; i < input.second.size(); ++i) {
ss << input.second[i]; ss << input.second[i];
if (scope) {
ss << "(" << GetDims(*scope, input.second[i]) << ")";
}
if (i != input.second.size() - 1) { if (i != input.second.size() - 1) {
ss << ", "; ss << ", ";
} }
...@@ -81,6 +144,9 @@ std::string OperatorBase::DebugString() const { ...@@ -81,6 +144,9 @@ std::string OperatorBase::DebugString() const {
ss << output.first << "["; ss << output.first << "[";
for (size_t i = 0; i < output.second.size(); ++i) { for (size_t i = 0; i < output.second.size(); ++i) {
ss << output.second[i]; ss << output.second[i];
if (scope) {
ss << "(" << GetDims(*scope, output.second[i]) << ")";
}
if (i != output.second.size() - 1) { if (i != output.second.size() - 1) {
ss << ", "; ss << ", ";
} }
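
With a non-null scope, `DebugStringEx` now annotates every argument with its dimensions via `GetDims`. Illustrative only; the exact rendering depends on `DDim`'s stream operator:

```cpp
// Hypothetical output for a mul-like op (names and shapes illustrative):
//   Op(mul), inputs:{X[x(10, 784)], Y[w(784, 100)]}, outputs:{Out[out(10, 100)]}
VLOG(3) << op->DebugStringEx(local_scope);  // dims included
VLOG(3) << op->DebugString();               // same text without dims
```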
...@@ -178,6 +244,10 @@ void OperatorBase::GenerateTemporaryNames() { ...@@ -178,6 +244,10 @@ void OperatorBase::GenerateTemporaryNames() {
} }
} }
static bool VarIsTensor(const Variable* var) {
return var->IsType<LoDTensor>() || var->IsType<SelectedRows>();
}
static const Tensor* GetTensorFromVar(const Variable* var) { static const Tensor* GetTensorFromVar(const Variable* var) {
const Tensor* t = nullptr; const Tensor* t = nullptr;
if (var->IsType<LoDTensor>()) { if (var->IsType<LoDTensor>()) {
...@@ -185,7 +255,8 @@ static const Tensor* GetTensorFromVar(const Variable* var) { ...@@ -185,7 +255,8 @@ static const Tensor* GetTensorFromVar(const Variable* var) {
} else if (var->IsType<SelectedRows>()) { } else if (var->IsType<SelectedRows>()) {
t = &(var->Get<SelectedRows>().value()); t = &(var->Get<SelectedRows>().value());
} else { } else {
PADDLE_THROW("Variable type must be LoDTensor/SelectedRows."); PADDLE_THROW("Variable type_id %s, expect LoDTensor/SelectedRows.",
var->Type().name());
} }
return t; return t;
} }
...@@ -197,7 +268,8 @@ static Tensor* GetMutableTensorFromVar(Variable* var) { ...@@ -197,7 +268,8 @@ static Tensor* GetMutableTensorFromVar(Variable* var) {
} else if (var->IsType<SelectedRows>()) { } else if (var->IsType<SelectedRows>()) {
t = var->GetMutable<SelectedRows>()->mutable_value(); t = var->GetMutable<SelectedRows>()->mutable_value();
} else { } else {
PADDLE_THROW("Variable type must be LoDTensor/SelectedRows."); PADDLE_THROW("Variable type_id %s, expect LoDTensor/SelectedRows.",
var->Type().name());
} }
return t; return t;
} }
...@@ -347,6 +419,25 @@ class RuntimeInferShapeContext : public InferShapeContext { ...@@ -347,6 +419,25 @@ class RuntimeInferShapeContext : public InferShapeContext {
auto in_tensor = in_var->Get<LoDTensor>(); auto in_tensor = in_var->Get<LoDTensor>();
auto* out_tensor = out_var->GetMutable<LoDTensor>(); auto* out_tensor = out_var->GetMutable<LoDTensor>();
out_tensor->set_lod(in_tensor.lod()); out_tensor->set_lod(in_tensor.lod());
// TODO(dzhwinter) : reuse ShareLoD in most operators.
    // Need to call ShareLayout explicitly in sequence-related ops.
    // Shall we have a better method to share info between in/out Tensor?
out_tensor->set_layout(in_tensor.layout());
}
void ShareLayout(const std::string& in, const std::string& out, size_t i = 0,
size_t j = 0) const {
PADDLE_ENFORCE_LT(i, Inputs(in).size());
PADDLE_ENFORCE_LT(j, Outputs(out).size());
Variable* in_var = scope_.FindVar(Inputs(in)[i]);
Variable* out_var = scope_.FindVar(Outputs(out)[j]);
if (!in_var->IsType<LoDTensor>()) return;
PADDLE_ENFORCE(out_var->IsType<LoDTensor>(),
"The %d-th output of Output(%s) must be LoDTensor.", j, out);
auto in_tensor = in_var->Get<LoDTensor>();
auto* out_tensor = out_var->GetMutable<LoDTensor>();
out_tensor->set_layout(in_tensor.layout());
} }
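
For an op's `InferShape`, forwarding LoD now also forwards layout; sequence-related ops that cannot use `ShareLoD` are expected to call `ShareLayout` themselves. A hypothetical fragment, inside an op class, with illustrative argument names:

```cpp
// Hypothetical InferShape body; "X" and "Out" are illustrative names.
void InferShape(paddle::framework::InferShapeContext* ctx) const override {
  ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
  // Since this commit, ShareLoD also copies the input layout to the output;
  // ops that skip it would call ShareLayout("X", "Out") at runtime instead.
  ctx->ShareLoD("X", /*out=*/"Out");
}
```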
bool IsRuntime() const override { return true; } bool IsRuntime() const override { return true; }
...@@ -359,7 +450,8 @@ class RuntimeInferShapeContext : public InferShapeContext { ...@@ -359,7 +450,8 @@ class RuntimeInferShapeContext : public InferShapeContext {
} else if (var->IsType<SelectedRows>()) { } else if (var->IsType<SelectedRows>()) {
return var->Get<SelectedRows>().GetCompleteDims(); return var->Get<SelectedRows>().GetCompleteDims();
} else { } else {
PADDLE_THROW("Variable type must be LoDTensor/SelectedRows."); PADDLE_THROW("Variable %s type_id %s, expect LoDTensor/SelectedRows.",
name, var->Type().name());
} }
} }
...@@ -370,7 +462,8 @@ class RuntimeInferShapeContext : public InferShapeContext { ...@@ -370,7 +462,8 @@ class RuntimeInferShapeContext : public InferShapeContext {
} else if (var->IsType<SelectedRows>()) { } else if (var->IsType<SelectedRows>()) {
var->GetMutable<SelectedRows>()->set_height(dim[0]); var->GetMutable<SelectedRows>()->set_height(dim[0]);
} else { } else {
PADDLE_THROW("Variable type must be LoDTensor/SelectedRows."); PADDLE_THROW("Variable %s type_id %s, expect LoDTensor/SelectedRows.",
name, var->Type().name());
} }
} }
...@@ -384,24 +477,6 @@ class RuntimeInferShapeContext : public InferShapeContext { ...@@ -384,24 +477,6 @@ class RuntimeInferShapeContext : public InferShapeContext {
const Scope& scope_; const Scope& scope_;
}; };
const platform::DeviceContext* GetDeviceContext(
framework::KernelTypePair& kernel_pair) {
auto& actual_kernel_key = kernel_pair.first;
auto& expected_kernel_key = kernel_pair.second;
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
if (platform::is_gpu_place(actual_kernel_key.place_) &&
platform::is_cpu_place(expected_kernel_key.place_)) {
return pool.Get(actual_kernel_key.place_);
} else if (platform::is_cpu_place(actual_kernel_key.place_) &&
platform::is_gpu_place(expected_kernel_key.place_)) {
return pool.Get(expected_kernel_key.place_);
} else {
PADDLE_THROW(
"Currently, model parallelism is only supported between CPU and CUDA");
}
}
void OperatorWithKernel::Run(const Scope& scope, void OperatorWithKernel::Run(const Scope& scope,
const platform::Place& place) const { const platform::Place& place) const {
RuntimeInferShapeContext infer_shape_ctx(*this, scope); RuntimeInferShapeContext infer_shape_ctx(*this, scope);
...@@ -417,71 +492,59 @@ void OperatorWithKernel::Run(const Scope& scope, ...@@ -417,71 +492,59 @@ void OperatorWithKernel::Run(const Scope& scope,
"There are no kernels which are registered in the %s operator.", type_); "There are no kernels which are registered in the %s operator.", type_);
} }
-  // check if op[type] have kernel for kernel_key
-  OpKernelMap& kernels = kernels_iter->second;
   ExecutionContext ctx(*this, scope, *dev_ctx);
-  auto actual_kernel_key = GetActualKernelType(ctx);
-  auto expected_kernel_key = GetExpectedKernelType(actual_kernel_key);
-  auto kernel_iter = kernels.find(expected_kernel_key);
-
-  if (kernel_iter == kernels.end()) {
-    PADDLE_THROW("The operator %s does not support %s", type_,
-                 expected_kernel_key);
-  }
-
-  if (actual_kernel_key == expected_kernel_key) {
-    PADDLE_ENFORCE_EQ(actual_kernel_key.place_, expected_kernel_key.place_,
-                      "Currently, model parallelism is only supported between "
-                      "CPU and other devices. For example, multi-GPU model "
-                      "parallelism will failed.");
-  } else {
-    auto kernel_pair = std::make_pair(actual_kernel_key, expected_kernel_key);
-    const DataTransformFn* trans_fun =
-        DataTransformFnMap::Instance().GetNullable(kernel_pair);
-    if (trans_fun) {
-      auto input_vars = this->InputVars();
-      // TODO(qijun) filter the input vars that do not need to be transformed
-      // filter vars that has been transformed
-      std::vector<std::string> need_trans;
-      for (auto var_name : input_vars) {
-        auto var_name_trans =
-            var_name + framework::KernelTypeToString(expected_kernel_key);
-        if (!scope.FindVar(var_name_trans)) {
-          const_cast<Scope&>(scope).Var(var_name_trans);
-          need_trans.push_back(var_name);
-        }
-      }
-
-      if (!need_trans.empty()) {
-        auto trans_dev_ctx = GetDeviceContext(kernel_pair);
-
-        // Wait for transform starting
-        dev_ctx->Wait();
-
-        for (auto var_name : need_trans) {
-          (*trans_fun)(trans_dev_ctx, kernel_pair, *(scope.FindVar(var_name)),
-                       scope.FindVar(var_name + framework::KernelTypeToString(
-                                                    expected_kernel_key)));
-        }
-        // Wait for data transform finishing
-        trans_dev_ctx->Wait();
-      }
-    }
-  }
-
-  kernel_iter->second->Compute(ctx);
-}
-
-OpKernelType OperatorWithKernel::GetActualKernelType(
-    const ExecutionContext& ctx) const {
-  return OpKernelType(IndicateDataType(ctx), ctx.GetPlace());
-}
-
-OpKernelType OperatorWithKernel::GetExpectedKernelType(
-    const OpKernelType& actual_kernel_type) const {
-  return actual_kernel_type;
-}
+  auto expected_kernel_key = this->GetExpectedKernelType(ctx);
+
+  OpKernelMap& kernels = kernels_iter->second;
+
+  for (auto& candidate : kKernelPriority) {
+    auto candidate_key =
+        OpKernelType(expected_kernel_key.data_type_, std::get<0>(candidate),
+                     expected_kernel_key.data_layout_, std::get<1>(candidate));
+
+    if ((candidate_key == expected_kernel_key) ||
+        (kernels.count(candidate_key))) {
+      expected_kernel_key = candidate_key;
+      break;
+    }
+  }
+
+  VLOG(3) << "expected_kernel_key:" << expected_kernel_key;
+
+  Scope& new_scope = scope.NewScope();
+
+  for (auto& var_name_item : this->Inputs()) {
+    for (auto& var_name : var_name_item.second) {
+      auto* var = scope.FindVar(var_name);
+      if (var && VarIsTensor(var)) {
+        auto* tensor_in = GetTensorFromVar(var);
+        if (tensor_in->IsInitialized()) {
+          auto kernel_type_for_var = this->GetKernelTypeForVar(
+              var_name_item.first, *tensor_in, expected_kernel_key);
+          if (kernel_type_for_var != expected_kernel_key) {
+            auto out_var_names = OutputVars(true);
+            if (std::find(out_var_names.begin(), out_var_names.end(),
+                          var_name) != out_var_names.end()) {
+              PADDLE_THROW(
+                  "var %s is both input and output, "
+                  "does not support transform",
+                  var_name);
+            }
+            VLOG(3) << "need to do transform for var " << var_name;
+            auto* trans_var = new_scope.Var(var_name);
+            auto* out = DataTransform(expected_kernel_key, kernel_type_for_var,
+                                      *tensor_in);
+            CopyVariableWithTensor(*var, *out, *trans_var);
+          }
+        }
+      }
+    }
+  }
+
+  auto kernel_iter = kernels.find(expected_kernel_key);
+
+  kernel_iter->second->Compute(ExecutionContext(
+      *this, new_scope, *pool.Get(expected_kernel_key.place_)));
+}
proto::DataType OperatorWithKernel::IndicateDataType( proto::DataType OperatorWithKernel::IndicateDataType(
...@@ -513,5 +576,16 @@ proto::DataType OperatorWithKernel::IndicateDataType( ...@@ -513,5 +576,16 @@ proto::DataType OperatorWithKernel::IndicateDataType(
return static_cast<proto::DataType>(data_type); return static_cast<proto::DataType>(data_type);
} }
OpKernelType OperatorWithKernel::GetExpectedKernelType(
const ExecutionContext& ctx) const {
return OpKernelType(IndicateDataType(ctx), ctx.GetPlace());
}
OpKernelType OperatorWithKernel::GetKernelTypeForVar(
const std::string& var_name, const Tensor& tensor,
const OpKernelType& expected_kernel_type) const {
return OpKernelType(expected_kernel_type.data_type_, tensor.place());
}
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
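
The default `GetKernelTypeForVar` reports the tensor's current place, which is what triggers a `DataTransform` in `Run` whenever that differs from the expected key. An op can opt its inputs out of transformation by reporting the expected key instead; a hypothetical sketch (InferShape and kernel registration omitted):

```cpp
// Hypothetical override: claim every input already matches the expected
// kernel key, so Run() schedules no transform for this op's variables.
class NoTransformOpExample : public paddle::framework::OperatorWithKernel {
 public:
  using OperatorWithKernel::OperatorWithKernel;

 protected:
  paddle::framework::OpKernelType GetKernelTypeForVar(
      const std::string& var_name, const paddle::framework::Tensor& tensor,
      const paddle::framework::OpKernelType& expected_kernel_type)
      const override {
    return expected_kernel_type;
  }
};
```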
...@@ -17,6 +17,7 @@ limitations under the License. */ ...@@ -17,6 +17,7 @@ limitations under the License. */
#include <algorithm> #include <algorithm>
#include <atomic> #include <atomic>
#include <string> #include <string>
#include <tuple>
#include <unordered_map> #include <unordered_map>
#include <vector> #include <vector>
...@@ -52,10 +53,33 @@ constexpr char kGradVarSuffix[] = "@GRAD"; ...@@ -52,10 +53,33 @@ constexpr char kGradVarSuffix[] = "@GRAD";
/// Variables with this suffix are supposed to be filled up with zeros. /// Variables with this suffix are supposed to be filled up with zeros.
constexpr char kZeroVarSuffix[] = "@ZERO"; constexpr char kZeroVarSuffix[] = "@ZERO";
-// define some kernel hint
-const std::string kUseCPU = "use_cpu";
-const std::string kUseCUDNN = "use_cudnn";
-const std::string kUseMKLDNN = "use_mkldnn";
+// define some kernel priority
+extern std::vector<std::tuple<platform::Place, LibraryType>> kKernelPriority;
+
+/**
+ * @brief Use CPU kernels only
+ */
+void UseCPU();
+
+/**
+ * @brief Prefer MKLDNN kernels over plain CPU kernels
+ */
+void UseMKLDNN();
+
+/**
+ * @brief Prefer CUDA kernels over plain CPU kernels
+ */
+void UseCUDA();
+
+/**
+ * @brief Prefer cudnn kernels over plain CUDA kernels
+ */
+void UseCUDNN();
+
+/**
+ * @brief Use all available kernels
+ */
+void UseALL();
inline std::string GradVarName(const std::string& var_name) { inline std::string GradVarName(const std::string& var_name) {
return var_name + kGradVarSuffix; return var_name + kGradVarSuffix;
...@@ -84,7 +108,10 @@ class OperatorBase { ...@@ -84,7 +108,10 @@ class OperatorBase {
return boost::get<T>(attrs_.at(name)); return boost::get<T>(attrs_.at(name));
} }
-  virtual std::string DebugString() const;
+  /// if scope is not null, also show dimensions of arguments
+  virtual std::string DebugStringEx(const Scope* scope) const;
+  std::string DebugString() const { return DebugStringEx(nullptr); }
/// Net will call this function to Run an op. /// Net will call this function to Run an op.
virtual void Run(const Scope& scope, const platform::Place& place) const = 0; virtual void Run(const Scope& scope, const platform::Place& place) const = 0;
...@@ -381,9 +408,10 @@ class OperatorWithKernel : public OperatorBase { ...@@ -381,9 +408,10 @@ class OperatorWithKernel : public OperatorBase {
} }
protected: protected:
-  virtual OpKernelType GetActualKernelType(const ExecutionContext& ctx) const;
-  virtual OpKernelType GetExpectedKernelType(
-      const OpKernelType& actual_kernel_type) const;
+  virtual OpKernelType GetExpectedKernelType(const ExecutionContext& ctx) const;
+  virtual OpKernelType GetKernelTypeForVar(
+      const std::string& var_name, const Tensor& tensor,
+      const OpKernelType& expected_kernel_type) const;
private: private:
  // indicate kernel DataType by input data. By default all input data must be
......
...@@ -114,7 +114,8 @@ class OpWithKernelTest : public OperatorWithKernel { ...@@ -114,7 +114,8 @@ class OpWithKernelTest : public OperatorWithKernel {
protected: protected:
void InferShape(framework::InferShapeContext* ctx) const override {} void InferShape(framework::InferShapeContext* ctx) const override {}
-  OpKernelType GetActualKernelType(const ExecutionContext& ctx) const override {
+  OpKernelType GetExpectedKernelType(
+      const ExecutionContext& ctx) const override {
return OpKernelType(proto::DataType::FP32, ctx.GetPlace()); return OpKernelType(proto::DataType::FP32, ctx.GetPlace());
} }
}; };
......
...@@ -17,6 +17,7 @@ limitations under the License. */ ...@@ -17,6 +17,7 @@ limitations under the License. */
#include <memory> // for unique_ptr #include <memory> // for unique_ptr
#include <mutex> // for call_once #include <mutex> // for call_once
#include "glog/logging.h" #include "glog/logging.h"
#include "paddle/framework/threadpool.h"
#include "paddle/string/printf.h" #include "paddle/string/printf.h"
namespace paddle { namespace paddle {
...@@ -87,7 +88,8 @@ void Scope::DeleteScope(Scope* scope) { ...@@ -87,7 +88,8 @@ void Scope::DeleteScope(Scope* scope) {
auto it = std::find(this->kids_.begin(), this->kids_.end(), scope); auto it = std::find(this->kids_.begin(), this->kids_.end(), scope);
PADDLE_ENFORCE(it != this->kids_.end(), "Cannot find %p as kid scope", scope); PADDLE_ENFORCE(it != this->kids_.end(), "Cannot find %p as kid scope", scope);
this->kids_.erase(it); this->kids_.erase(it);
-  delete scope;
+  // Make delete async.
+  Async([scope] { delete scope; });
} }
void Scope::Rename(const std::string& origin_name, void Scope::Rename(const std::string& origin_name,
...@@ -107,6 +109,7 @@ std::string Scope::Rename(const std::string& origin_name) const { ...@@ -107,6 +109,7 @@ std::string Scope::Rename(const std::string& origin_name) const {
Rename(origin_name, var_name); Rename(origin_name, var_name);
return var_name; return var_name;
} }
Variable* Scope::FindVarLocally(const std::string& name) const { Variable* Scope::FindVarLocally(const std::string& name) const {
auto it = vars_.find(name); auto it = vars_.find(name);
if (it != vars_.end()) return it->second; if (it != vars_.end()) return it->second;
......
...@@ -75,9 +75,9 @@ class Scope { ...@@ -75,9 +75,9 @@ class Scope {
// Rename variable to a new name and return the new name // Rename variable to a new name and return the new name
std::string Rename(const std::string& origin_name) const; std::string Rename(const std::string& origin_name) const;
private:
Variable* FindVarLocally(const std::string& name) const; Variable* FindVarLocally(const std::string& name) const;
private:
// Call Scope::NewScope for a sub-scope. // Call Scope::NewScope for a sub-scope.
explicit Scope(Scope const* parent) : parent_(parent) {} explicit Scope(Scope const* parent) : parent_(parent) {}
......
...@@ -37,8 +37,8 @@ void SerializeToStream(std::ostream& os, const SelectedRows& selected_rows, ...@@ -37,8 +37,8 @@ void SerializeToStream(std::ostream& os, const SelectedRows& selected_rows,
SerializeToStream(os, selected_rows.value(), dev_ctx); SerializeToStream(os, selected_rows.value(), dev_ctx);
} }
-void DeserializeFromStream(std::istream& is, SelectedRows* selected_rows) {
-  auto tensor = *selected_rows->mutable_value();
+void DeserializeFromStream(std::istream& is, SelectedRows* selected_rows,
+                           const platform::DeviceContext& dev_ctx) {
{ {
// the 1st field, unit32_t version for SelectedRows // the 1st field, unit32_t version for SelectedRows
uint32_t version; uint32_t version;
...@@ -62,7 +62,7 @@ void DeserializeFromStream(std::istream& is, SelectedRows* selected_rows) { ...@@ -62,7 +62,7 @@ void DeserializeFromStream(std::istream& is, SelectedRows* selected_rows) {
selected_rows->set_height(height); selected_rows->set_height(height);
} }
  // the 4th field, the tensor which contains the data
-  DeserializeFromStream(is, &tensor);
+  DeserializeFromStream(is, selected_rows->mutable_value(), dev_ctx);
} }
} // namespace framework } // namespace framework
......
...@@ -66,7 +66,8 @@ class SelectedRows { ...@@ -66,7 +66,8 @@ class SelectedRows {
*/ */
void SerializeToStream(std::ostream& os, const SelectedRows& selected_rows, void SerializeToStream(std::ostream& os, const SelectedRows& selected_rows,
const platform::DeviceContext& dev_ctx); const platform::DeviceContext& dev_ctx);
-void DeserializeFromStream(std::istream& is, SelectedRows* selected_rows);
+void DeserializeFromStream(std::istream& is, SelectedRows* selected_rows,
+                           const platform::DeviceContext& dev_ctx);
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
...@@ -51,10 +51,12 @@ TEST_F(SelectedRowsTester, SerializeAndDeseralize) { ...@@ -51,10 +51,12 @@ TEST_F(SelectedRowsTester, SerializeAndDeseralize) {
SerializeToStream(oss, *selected_rows_, cpu_ctx); SerializeToStream(oss, *selected_rows_, cpu_ctx);
std::istringstream iss(oss.str()); std::istringstream iss(oss.str());
DeserializeFromStream(iss, &dst_tensor); DeserializeFromStream(iss, &dst_tensor, cpu_ctx);
ASSERT_EQ(selected_rows_->rows(), dst_tensor.rows()); ASSERT_EQ(selected_rows_->rows(), dst_tensor.rows());
ASSERT_EQ(selected_rows_->height(), dst_tensor.height()); ASSERT_EQ(selected_rows_->height(), dst_tensor.height());
ASSERT_EQ(selected_rows_->value().dims(), dst_tensor.value().dims());
ASSERT_EQ(selected_rows_->GetCompleteDims(), dst_tensor.GetCompleteDims());
} }
} // namespace framework } // namespace framework
......
...@@ -55,6 +55,10 @@ class Tensor { ...@@ -55,6 +55,10 @@ class Tensor {
template <typename T> template <typename T>
inline const T* data() const; inline const T* data() const;
inline bool IsInitialized() const;
inline void switch_place(platform::Place new_place);
/** /**
* @brief Return a pointer to mutable memory block. * @brief Return a pointer to mutable memory block.
* @note If not exist, then allocation. * @note If not exist, then allocation.
...@@ -200,6 +204,15 @@ class Tensor { ...@@ -200,6 +204,15 @@ class Tensor {
size_t offset_; size_t offset_;
}; };
inline void Tensor::switch_place(platform::Place new_place) {
if (holder_->place() == new_place) {
return;
}
// TODO(tonyyang-svail): do memcpy here.
PADDLE_THROW("Not Implemented");
}
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
......
...@@ -84,6 +84,8 @@ inline const T* Tensor::data() const { ...@@ -84,6 +84,8 @@ inline const T* Tensor::data() const {
reinterpret_cast<uintptr_t>(holder_->ptr()) + offset_); reinterpret_cast<uintptr_t>(holder_->ptr()) + offset_);
} }
inline bool Tensor::IsInitialized() const { return holder_ != nullptr; }
template <typename T> template <typename T>
inline T* Tensor::data() { inline T* Tensor::data() {
check_memory_size(); check_memory_size();
......
...@@ -69,7 +69,7 @@ struct AnyVisitor : public boost::static_visitor<bool> { ...@@ -69,7 +69,7 @@ struct AnyVisitor : public boost::static_visitor<bool> {
tmp.mutable_data<bool>(cpu); tmp.mutable_data<bool>(cpu);
auto gpuctx = platform::DeviceContextPool::Instance().Get(gpu); auto gpuctx = platform::DeviceContextPool::Instance().Get(gpu);
gpuctx->Wait(); gpuctx->Wait();
CopyFrom(out, cpu, *gpuctx, &tmp); Copy(out, cpu, *gpuctx, &tmp);
gpuctx->Wait(); gpuctx->Wait();
return GetResult(tmp, cpu); return GetResult(tmp, cpu);
} }
......
...@@ -29,11 +29,11 @@ namespace framework { ...@@ -29,11 +29,11 @@ namespace framework {
* @param[in] dst_place The dst place. * @param[in] dst_place The dst place.
* @param[in] ctx The device context contains device resources. * @param[in] ctx The device context contains device resources.
* *
* @note CopyFrom supports CPU <-> GPU, GPU <-> GPU. * @note Copy supports CPU <-> GPU, GPU <-> GPU.
*/ */
-inline void CopyFrom(const Tensor& src, const platform::Place& dst_place,
-                     const platform::DeviceContext& ctx, Tensor* dst) {
+inline void Copy(const Tensor& src, const platform::Place& dst_place,
+                 const platform::DeviceContext& ctx, Tensor* dst) {
src.check_memory_size(); src.check_memory_size();
dst->Resize(src.dims()); dst->Resize(src.dims());
...@@ -88,10 +88,10 @@ inline void CopyFrom(const Tensor& src, const platform::Place& dst_place, ...@@ -88,10 +88,10 @@ inline void CopyFrom(const Tensor& src, const platform::Place& dst_place,
} }
/** /**
* @brief CopyFrom support CPU <-> CPU * @brief Copy supports CPU <-> CPU
*/ */
-inline void CopyFrom(const Tensor& src, const platform::Place& dst_place,
-                     Tensor* dst) {
+inline void Copy(const Tensor& src, const platform::Place& dst_place,
+                 Tensor* dst) {
src.check_memory_size(); src.check_memory_size();
dst->Resize(src.dims()); dst->Resize(src.dims());
dst->set_layout(src.layout()); dst->set_layout(src.layout());
...@@ -270,7 +270,23 @@ inline void SerializeToStream(std::ostream& os, const Tensor& tensor, ...@@ -270,7 +270,23 @@ inline void SerializeToStream(std::ostream& os, const Tensor& tensor,
} }
} }
-inline void DeserializeFromStream(std::istream& is, Tensor* tensor) {
+struct DeserializedDataFunctor {
+  DeserializedDataFunctor(void** buf, Tensor* tensor,
+                          const platform::Place& place)
+      : buf_(buf), tensor_(tensor), place_(place) {}
+
+  template <typename T>
+  void operator()() {
+    *buf_ = tensor_->mutable_data<T>(place_);
+  }
+
+  void** buf_;
+  Tensor* tensor_;
+  platform::Place place_;
+};
+
+inline void DeserializeFromStream(std::istream& is, Tensor* tensor,
+                                  const platform::DeviceContext& dev_ctx) {
uint32_t version; uint32_t version;
is.read(reinterpret_cast<char*>(&version), sizeof(version)); is.read(reinterpret_cast<char*>(&version), sizeof(version));
PADDLE_ENFORCE_EQ(version, 0U, "Only version 0 is supported"); PADDLE_ENFORCE_EQ(version, 0U, "Only version 0 is supported");
...@@ -289,27 +305,28 @@ inline void DeserializeFromStream(std::istream& is, Tensor* tensor) { ...@@ -289,27 +305,28 @@ inline void DeserializeFromStream(std::istream& is, Tensor* tensor) {
dims.reserve(static_cast<size_t>(desc.dims().size())); dims.reserve(static_cast<size_t>(desc.dims().size()));
std::copy(desc.dims().begin(), desc.dims().end(), std::back_inserter(dims)); std::copy(desc.dims().begin(), desc.dims().end(), std::back_inserter(dims));
tensor->Resize(framework::make_ddim(dims)); tensor->Resize(framework::make_ddim(dims));
    void* buf;
-    platform::Place cpu = platform::CPUPlace();
-    // TODO(Yancey1989): use VisitDataType instead of a DataType switch
-    switch (desc.data_type()) {
-      case proto::FP32:
-        buf = tensor->mutable_data<float>(cpu);
-        break;
-      case proto::FP64:
-        buf = tensor->mutable_data<double>(cpu);
-        break;
-      case proto::INT32:
-        buf = tensor->mutable_data<int>(cpu);
-        break;
-      case proto::INT64:
-        buf = tensor->mutable_data<int64_t>(cpu);
-        break;
-      default:
-        PADDLE_THROW("DataType %d not supported", desc.data_type());
-    }
-    is.read(static_cast<char*>(buf), tensor->memory_size());
+    auto ctx = platform::CPUDeviceContext();
+    if (platform::is_gpu_place(dev_ctx.GetPlace())) {
+#ifdef PADDLE_WITH_CUDA
+      Tensor cpu_tensor;
+      cpu_tensor.Resize(framework::make_ddim(dims));
+      framework::VisitDataType(
+          desc.data_type(),
+          DeserializedDataFunctor(&buf, &cpu_tensor, ctx.GetPlace()));
+      is.read(static_cast<char*>(buf), cpu_tensor.memory_size());
+      auto cpu_place = new platform::CPUPlace();
+      framework::Copy(cpu_tensor, *cpu_place, dev_ctx, tensor);
+      delete cpu_place;
+#else
+      PADDLE_THROW("Unexpected branch");
+#endif
+    } else {
+      framework::VisitDataType(
+          desc.data_type(),
+          DeserializedDataFunctor(&buf, tensor, ctx.GetPlace()));
+      is.read(static_cast<char*>(buf), tensor->memory_size());
+    }
} }
} }
......
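
The `DeserializedDataFunctor` plus `VisitDataType` pair replaces the old `switch`: the runtime type tag picks `T`, the functor allocates a `T`-typed buffer, and the raw pointer comes back through `void**`. A standalone, runnable reduction of the idiom:

```cpp
#include <iostream>
#include <vector>

// The visitor allocates for whichever T the runtime tag selects.
struct AllocFunctor {
  AllocFunctor(void** buf, std::vector<char>* storage, size_t count)
      : buf_(buf), storage_(storage), count_(count) {}

  template <typename T>
  void operator()() {
    storage_->resize(count_ * sizeof(T));
    *buf_ = storage_->data();
  }

  void** buf_;
  std::vector<char>* storage_;
  size_t count_;
};

// Toy stand-in for framework::VisitDataType: tag -> templated call.
template <typename Visitor>
void VisitType(int tag, Visitor visitor) {
  if (tag == 0) {
    visitor.template operator()<float>();
  } else {
    visitor.template operator()<double>();
  }
}

int main() {
  void* buf = nullptr;
  std::vector<char> storage;
  VisitType(0, AllocFunctor(&buf, &storage, 4));  // allocates 4 floats
  std::cout << storage.size() << "\n";            // prints 16
}
```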
...@@ -19,7 +19,7 @@ ...@@ -19,7 +19,7 @@
namespace paddle { namespace paddle {
namespace framework { namespace framework {
-TEST(CopyFrom, Tensor) {
+TEST(Copy, Tensor) {
   Tensor src_tensor;
   Tensor dst_tensor;
   platform::CPUDeviceContext cpu_ctx((platform::CPUPlace()));
@@ -32,7 +32,7 @@ TEST(CopyFrom, Tensor) {
   src_tensor.set_layout(DataLayout::kAnyLayout);
   auto cpu_place = new platform::CPUPlace();
-  CopyFrom(src_tensor, *cpu_place, &dst_tensor);
+  Copy(src_tensor, *cpu_place, &dst_tensor);
   const int* dst_ptr = dst_tensor.data<int>();
   ASSERT_NE(src_ptr, dst_ptr);
@@ -43,7 +43,7 @@ TEST(CopyFrom, Tensor) {
   EXPECT_TRUE(dst_tensor.layout() == src_tensor.layout());
   Tensor slice_tensor = src_tensor.Slice(1, 2);
-  CopyFrom(slice_tensor, *cpu_place, &dst_tensor);
+  Copy(slice_tensor, *cpu_place, &dst_tensor);
   const int* slice_ptr = slice_tensor.data<int>();
   dst_ptr = dst_tensor.data<int>();
   ASSERT_NE(dst_ptr, slice_ptr);
@@ -67,11 +67,11 @@ TEST(CopyFrom, Tensor) {
   // CPU Tensor to GPU Tensor
   auto gpu_place = new platform::CUDAPlace(0);
   platform::CUDADeviceContext gpu_ctx(*gpu_place);
-  CopyFrom(src_tensor, *gpu_place, gpu_ctx, &gpu_tensor);
+  Copy(src_tensor, *gpu_place, gpu_ctx, &gpu_tensor);
   // GPU Tensor to CPU Tensor
   auto cpu_place = new platform::CPUPlace();
-  CopyFrom(gpu_tensor, *cpu_place, gpu_ctx, &dst_tensor);
+  Copy(gpu_tensor, *cpu_place, gpu_ctx, &dst_tensor);
   // Sync before Compare Tensors
   gpu_ctx.Wait();
@@ -84,10 +84,10 @@ TEST(CopyFrom, Tensor) {
   Tensor slice_tensor = src_tensor.Slice(1, 2);
   // CPU Slice Tensor to GPU Tensor
-  CopyFrom(slice_tensor, *gpu_place, gpu_ctx, &gpu_tensor);
+  Copy(slice_tensor, *gpu_place, gpu_ctx, &gpu_tensor);
   // GPU Tensor to CPU Tensor
-  CopyFrom(gpu_tensor, *cpu_place, gpu_ctx, &dst_tensor);
+  Copy(gpu_tensor, *cpu_place, gpu_ctx, &dst_tensor);
   // Sync before Compare Slice Tensors
   gpu_ctx.Wait();
@@ -155,7 +155,7 @@ TEST(CopyFromVector, Tensor) {
   CUDADeviceContext gpu_ctx(*gpu_place);
   CopyFromVector<int>(src_vec, gpu_ctx, &gpu_tensor);
   // Copy from GPU to CPU tensor for comparison
-  CopyFrom(gpu_tensor, *cpu_place, gpu_ctx, &dst_tensor);
+  Copy(gpu_tensor, *cpu_place, gpu_ctx, &dst_tensor);
   // Sync before Compare Tensors
   gpu_ctx.Wait();
@@ -175,7 +175,7 @@ TEST(CopyFromVector, Tensor) {
   CopyFromVector<int>(src_vec, cpu_ctx, &cpu_tensor);
   gpu_tensor.Resize(make_ddim({2, 2}));
   CopyFromVector<int>(src_vec, gpu_ctx, &gpu_tensor);
-  CopyFrom(gpu_tensor, *cpu_place, gpu_ctx, &dst_tensor);
+  Copy(gpu_tensor, *cpu_place, gpu_ctx, &dst_tensor);
   // Sync before Compare Tensors
   gpu_ctx.Wait();
@@ -270,11 +270,12 @@ TEST(Tensor, SerializeAndDeserialize) {
   SerializeToStream(oss, src_tensor, cpu_ctx);
   std::istringstream iss(oss.str());
-  DeserializeFromStream(iss, &dst_tensor);
+  DeserializeFromStream(iss, &dst_tensor, cpu_ctx);
   int* dst_ptr = dst_tensor.mutable_data<int>(platform::CPUPlace());
   for (int i = 0; i < 5; ++i) {
     ASSERT_EQ(dst_ptr[i], array[i]);
   }
+  ASSERT_EQ(dst_tensor.dims(), src_tensor.dims());
   delete place;
 }
 #ifdef PADDLE_WITH_CUDA
@@ -286,19 +287,18 @@ TEST(Tensor, SerializeAndDeserialize) {
   auto gpu_place = new platform::CUDAPlace();
   platform::CUDADeviceContext gpu_ctx(*gpu_place);
-  CopyFrom(src_tensor, *gpu_place, gpu_ctx, &gpu_tensor);
+  Copy(src_tensor, *gpu_place, gpu_ctx, &gpu_tensor);
   std::ostringstream oss;
   SerializeToStream(oss, gpu_tensor, gpu_ctx);
   std::istringstream iss(oss.str());
-  DeserializeFromStream(iss, &dst_tensor);
+  DeserializeFromStream(iss, &dst_tensor, gpu_ctx);
   int* dst_ptr = dst_tensor.mutable_data<int>(platform::CPUPlace());
   for (int i = 0; i < 6; ++i) {
     ASSERT_EQ(dst_ptr[i], array[i]);
   }
   delete gpu_place;
 }
 #endif
...
@@ -29,7 +29,6 @@ namespace framework {
 class ThreadPool {
  public:
   typedef std::packaged_task<void()> Task;
-  typedef std::function<void()> Fun;
   /**
    * @brief Get an instance of the threadpool; the thread number will
@@ -67,7 +66,8 @@ class ThreadPool {
    * @return std::future<void>, we could wait for the task finished by
    * f.wait().
    */
-  std::future<void> Run(const Fun& fn) {
+  template <typename Callback>
+  std::future<void> Run(Callback fn) {
     std::unique_lock<std::mutex> lock(mutex_);
     Task task(std::bind(fn));
     std::future<void> f = task.get_future();
@@ -159,5 +159,13 @@ class ThreadPool {
   std::condition_variable completed_;
 };
+
+// Run a function asynchronously.
+// NOTE: The function must return void. If the function needs to return a
+// value, you can use a lambda to capture a value pointer.
+template <typename Callback>
+std::future<void> Async(Callback callback) {
+  return ThreadPool::GetInstance()->Run(callback);
+}
 }  // namespace framework
 }  // namespace paddle
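The new `Async` helper pairs naturally with the capture-a-pointer pattern the NOTE describes. A minimal sketch, assuming only the `paddle::framework::Async` added by this patch (`AsyncSum` itself is hypothetical):

```cpp
#include <future>
#include "paddle/framework/threadpool.h"

// Hypothetical helper: Async() only yields std::future<void>, so the result
// travels back through a pointer captured by the lambda.
int AsyncSum(int a, int b) {
  int result = 0;
  std::future<void> done =
      paddle::framework::Async([&result, a, b]() { result = a + b; });
  done.wait();  // block until the task has run on the pool
  return result;
}
```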
@@ -52,7 +52,7 @@ void VarDesc::SetLoDLevel(int32_t lod_level) {
   }
 }
-int32_t VarDesc::GetLodLevel() const {
+int32_t VarDesc::GetLoDLevel() const {
   switch (desc_.type()) {
     case proto::VarDesc::LOD_TENSOR:
       return desc_.lod_tensor().lod_level();
...
@@ -76,7 +76,7 @@ class VarDesc {
   void SetLoDLevel(int32_t lod_level);
-  int32_t GetLodLevel() const;
+  int32_t GetLoDLevel() const;
   proto::VarDesc::VarType GetType() const;
...
@@ -17,6 +17,8 @@ limitations under the License. */
 #include "paddle/framework/lod_rank_table.h"
 #include "paddle/framework/lod_tensor.h"
 #include "paddle/framework/lod_tensor_array.h"
+#include "paddle/framework/selected_rows.h"
+#include "paddle/framework/variable.h"
 namespace paddle {
 namespace framework {
@@ -35,7 +37,7 @@ inline proto::VarDesc::VarType ToVarType(std::type_index type) {
 }
 template <typename Visitor>
-inline void VisitVarType(const Variable& var, Visitor visitor) {
+inline void VisitVarType(const framework::Variable& var, Visitor visitor) {
   switch (ToVarType(var.Type())) {
     case proto::VarDesc_VarType_LOD_TENSOR:
       visitor(var.Get<framework::LoDTensor>());
...
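`VisitVarType` dispatches on the variable's runtime type and calls the matching `operator()` overload of the visitor. A caller-side sketch; the `SizeVisitor` name and its logging are illustrative, not part of the patch:

```cpp
#include <glog/logging.h>
#include "paddle/framework/var_type.h"

// Illustrative visitor: one operator() per var type the switch can dispatch
// to, plus a templated catch-all for the remaining cases.
struct SizeVisitor {
  void operator()(const paddle::framework::LoDTensor& t) {
    LOG(INFO) << "LoDTensor, numel = " << t.numel();
  }
  void operator()(const paddle::framework::SelectedRows& rows) {
    LOG(INFO) << "SelectedRows, height = " << rows.height();
  }
  template <typename T>
  void operator()(const T&) {  // LoDTensorArray, LoDRankTable, ...
    LOG(INFO) << "other variable type";
  }
};

// Usage: paddle::framework::VisitVarType(var, SizeVisitor());
```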
@@ -32,6 +32,8 @@ class Variable {
     return *static_cast<const T*>(holder_->Ptr());
   }
+  bool IsInitialized() const { return holder_ != nullptr; }
+
   template <typename T>
   T* GetMutable() {
     if (!IsType<T>()) {
...
@@ -233,6 +233,13 @@ public:
     (void)numProcessed;
   }
+  /**
+   * @brief Release the middle layers' output memory.
+   *
+   * @note This function is used for memory optimization in inference.
+   */
+  virtual void releaseOutput() {}
+
 protected:
   virtual void onLoadParameter() {}
...
@@ -187,6 +187,31 @@ void NeuralNetwork::init(const ModelConfig& config,
     CHECK(it != layerMap_.end());
     outputLayers_.push_back(it->second);
   }
+
+  for (const auto& layer : layers_) {
+    const auto& name = layer->getName();
+    bool isMiddleLayer = true;
+
+    // exclude data layers
+    for (const auto& dataLayer : dataLayers_) {
+      if (name == dataLayer->getName()) {
+        isMiddleLayer = false;
+        break;
+      }
+    }
+
+    // exclude output layers
+    for (const auto& outputLayer : outputLayers_) {
+      if (name == outputLayer->getName()) {
+        isMiddleLayer = false;
+        break;
+      }
+    }
+
+    if (isMiddleLayer) {
+      middleLayers_.push_back(layer);
+    }
+  }
 }
 void NeuralNetwork::connect(LayerPtr agentLayer,
@@ -327,6 +352,13 @@ void NeuralNetwork::onPassEnd() {
   }
 }
+
+void NeuralNetwork::releaseOutput() {
+  for (auto& layer : middleLayers_) {
+    Argument& arg = layer->getOutput();
+    arg.value.reset();
+  }
+}
 #ifndef PADDLE_MOBILE_INFERENCE
 class CombinedEvaluator : public Evaluator {
...
@@ -137,6 +137,13 @@ public:
   /// some finish work, like convert the weight format of MKLDNNLayers
   void finish();
+  /**
+   * @brief Release the middle layers' output memory.
+   *
+   * @note This function is used for memory optimization in inference.
+   */
+  void releaseOutput();
+
 protected:
   /**
    * The constructor of NeuralNetwork.
@@ -158,6 +165,7 @@ protected:
   std::vector<DataLayerPtr> dataLayers_;
   std::vector<LayerPtr> outputLayers_;
+  std::vector<LayerPtr> middleLayers_;
   static std::map<std::string, bool> dllInitMap;
...
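How this is meant to be used in an inference loop, as a hedged sketch: the `forward` signature is assumed from GradientMachine's existing API, and the network construction and `consume` step are placeholders; only `releaseOutput()` is the API added by this patch.

```cpp
#include "paddle/gserver/gradientmachines/NeuralNetwork.h"

void inferOnce(paddle::NeuralNetwork* net,
               const std::vector<paddle::Argument>& inArgs) {
  std::vector<paddle::Argument> outArgs;
  net->forward(inArgs, &outArgs, PASS_TEST);
  // ... consume outArgs first (output layers' memory is untouched) ...
  net->releaseOutput();  // then drop the middle layers' output matrices
}
```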
@@ -1472,7 +1472,8 @@ TEST(Layer, RecurrentLayer) {
   for (auto reversed : {false, true}) {
     config.layerConfig.set_reversed(reversed);
     config.testState = !reversed;
-    testLayerGrad(config, "recurrent", 50, /* trans= */ false, useGpu);
+    testLayerGrad(
+        config, "recurrent", 50, /* trans= */ false, useGpu, false, 1.0);
   }
 }
 }
@@ -1494,7 +1495,8 @@ TEST(Layer, LstmLayer) {
   for (auto reversed : {false, true}) {
     config.layerConfig.set_reversed(reversed);
     config.testState = !reversed;
-    testLayerGrad(config, "lstmemory", 100, /* trans= */ false, useGpu);
+    testLayerGrad(
+        config, "lstmemory", 100, /* trans= */ false, useGpu, false, 0.02);
   }
 }
 for (auto useGpu : {true}) {
...
set(FLUID_CORE_MODULES
backward proto_desc paddle_memory executor prune init ${GLOB_OP_LIB})
cc_library(paddle_fluid_api
SRCS inference.cc
DEPS ${FLUID_CORE_MODULES})
# Merge all modules into a single static library
cc_library(paddle_fluid DEPS paddle_fluid_api ${FLUID_CORE_MODULES})
# ptools
# just for testing; we may need to change the storage format of inference_model
# and remove the dependency on pickle.
# download from http://www.picklingtools.com/
# build in the C++ sub-directory, using command
# make -f Makefile.Linux libptools.so
set(PTOOLS_LIB)
set(PTOOLS_ROOT $ENV{PTOOLS_ROOT} CACHE PATH "Folder contains PicklingTools")
find_path(PTOOLS_INC_DIR chooseser.h PATHS ${PTOOLS_ROOT}/C++)
find_library(PTOOLS_SHARED_LIB NAMES ptools PATHS ${PTOOLS_ROOT}/C++)
if(PTOOLS_INC_DIR AND PTOOLS_SHARED_LIB)
add_definitions(-DPADDLE_USE_PTOOLS)
set(PTOOLS_LIB ptools)
message(STATUS "Found PicklingTools: ${PTOOLS_SHARED_LIB}")
add_library(${PTOOLS_LIB} SHARED IMPORTED GLOBAL)
set_property(TARGET ${PTOOLS_LIB} PROPERTY IMPORTED_LOCATION ${PTOOLS_SHARED_LIB})
include_directories(${PTOOLS_ROOT}/C++)
include_directories(${PTOOLS_ROOT}/C++/opencontainers_1_8_5/include)
add_definitions(-DOC_NEW_STYLE_INCLUDES) # used in ptools
endif()
add_executable(example example.cc)
if(APPLE)
set(OPTIONAL_LINK_FLAGS)
if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang" OR "${CMAKE_CXX_COMPILER_ID}" STREQUAL "AppleClang")
set(OPTIONAL_LINK_FLAGS "-undefined dynamic_lookup")
endif()
target_link_libraries(example
-Wl,-force_load paddle_fluid
${OPTIONAL_LINK_FLAGS}
${PTOOLS_LIB})
else()
target_link_libraries(example
-Wl,--start-group -Wl,--whole-archive paddle_fluid
-Wl,--no-whole-archive -Wl,--end-group
${PTOOLS_LIB})
endif()
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <time.h>
#include <iostream>
#include "gflags/gflags.h"
#include "paddle/inference/inference.h"
DEFINE_string(dirname, "", "Directory of the inference model.");
DEFINE_string(feed_var_names, "", "Names of feeding variables");
DEFINE_string(fetch_var_names, "", "Names of fetching variables");
int main(int argc, char** argv) {
google::ParseCommandLineFlags(&argc, &argv, true);
if (FLAGS_dirname.empty() || FLAGS_feed_var_names.empty() ||
FLAGS_fetch_var_names.empty()) {
// Example:
// ./example --dirname=recognize_digits_mlp.inference.model
// --feed_var_names="x"
// --fetch_var_names="fc_2.tmp_2"
std::cout << "Usage: ./example --dirname=path/to/your/model "
"--feed_var_names=x --fetch_var_names=y"
<< std::endl;
exit(1);
}
std::cout << "FLAGS_dirname: " << FLAGS_dirname << std::endl;
std::cout << "FLAGS_feed_var_names: " << FLAGS_feed_var_names << std::endl;
std::cout << "FLAGS_fetch_var_names: " << FLAGS_fetch_var_names << std::endl;
std::string dirname = FLAGS_dirname;
std::vector<std::string> feed_var_names = {FLAGS_feed_var_names};
std::vector<std::string> fetch_var_names = {FLAGS_fetch_var_names};
paddle::InferenceEngine* engine = new paddle::InferenceEngine();
engine->LoadInferenceModel(dirname, feed_var_names, fetch_var_names);
paddle::framework::LoDTensor input;
srand(time(0));
float* input_ptr =
input.mutable_data<float>({1, 784}, paddle::platform::CPUPlace());
for (int i = 0; i < 784; ++i) {
input_ptr[i] = rand() / (static_cast<float>(RAND_MAX));
}
std::vector<paddle::framework::LoDTensor> feeds;
feeds.push_back(input);
std::vector<paddle::framework::LoDTensor> fetchs;
engine->Execute(feeds, fetchs);
for (size_t i = 0; i < fetchs.size(); ++i) {
auto dims_i = fetchs[i].dims();
std::cout << "dims_i:";
for (int j = 0; j < dims_i.size(); ++j) {
std::cout << " " << dims_i[j];
}
std::cout << std::endl;
std::cout << "result:";
float* output_ptr = fetchs[i].data<float>();
for (int j = 0; j < paddle::framework::product(dims_i); ++j) {
std::cout << " " << output_ptr[j];
}
std::cout << std::endl;
}
delete engine;
return 0;
}
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "inference.h"
#include <fstream>
#include "paddle/framework/executor.h"
#include "paddle/framework/feed_fetch_method.h"
#include "paddle/framework/init.h"
#include "paddle/framework/scope.h"
#ifdef PADDLE_USE_PTOOLS
#include "chooseser.h"
#endif
namespace paddle {
void InferenceEngine::LoadInferenceModel(
const std::string& dirname,
const std::vector<std::string>& feed_var_names,
const std::vector<std::string>& fetch_var_names) {
#ifdef PADDLE_USE_PTOOLS
std::string model_filename = dirname + "/__model__";
LOG(INFO) << "Using PicklingTools, loading model from " << model_filename;
Val v;
LoadValFromFile(model_filename.c_str(), v, SERIALIZE_P0);
std::string program_desc_str = v["program_desc_str"];
LOG(INFO) << "program_desc_str's size: " << program_desc_str.size();
// PicklingTools cannot parse the vector of strings correctly.
#else
std::string model_filename = dirname + "/__model__.dat";
LOG(INFO) << "loading model from " << model_filename;
std::ifstream inputfs(model_filename, std::ios::in | std::ios::binary);
std::string program_desc_str;
inputfs.seekg(0, std::ios::end);
program_desc_str.resize(inputfs.tellg());
inputfs.seekg(0, std::ios::beg);
LOG(INFO) << "program_desc_str's size: " << program_desc_str.size();
inputfs.read(&program_desc_str[0], program_desc_str.size());
inputfs.close();
#endif
program_ = new framework::ProgramDesc(program_desc_str);
GenerateLoadProgram(dirname);
if (feed_var_names.empty() || fetch_var_names.empty()) {
LOG(FATAL) << "Please specify the feed_var_names and fetch_var_names.";
}
feed_var_names_ = feed_var_names;
fetch_var_names_ = fetch_var_names;
PrependFeedOp();
AppendFetchOp();
}
bool InferenceEngine::IsParameter(const framework::VarDesc* var) {
if (var->Persistable()) {
// There can be many unreachable variables in the program; a persistable
// variable counts as a parameter only if some op takes it as an input.
for (size_t i = 0; i < program_->Size(); ++i) {
const framework::BlockDesc& block = program_->Block(i);
for (auto* op : block.AllOps()) {
for (auto input_argument_name : op->InputArgumentNames()) {
if (input_argument_name == var->Name()) {
return true;
}
}
}
}
}
return false;
}
void InferenceEngine::GenerateLoadProgram(const std::string& dirname) {
framework::BlockDesc* global_block = program_->MutableBlock(0);
load_program_ = new framework::ProgramDesc();
framework::BlockDesc* load_block = load_program_->MutableBlock(0);
for (auto* var : global_block->AllVars()) {
if (IsParameter(var)) {
LOG(INFO) << "parameter's name: " << var->Name();
framework::VarDesc* new_var = load_block->Var(var->Name());
new_var->SetShape(var->Shape());
new_var->SetDataType(var->GetDataType());
new_var->SetType(var->GetType());
new_var->SetLoDLevel(var->GetLoDLevel());
new_var->SetPersistable(true);
// append_op
framework::OpDesc* op = load_block->AppendOp();
op->SetType("load");
op->SetOutput("Out", {new_var->Name()});
op->SetAttr("file_path", {dirname + "/" + new_var->Name()});
op->CheckAttrs();
}
}
}
void InferenceEngine::PrependFeedOp() {
if (!program_) {
LOG(FATAL) << "Please initialize the program_ first.";
}
framework::BlockDesc* global_block = program_->MutableBlock(0);
// create_var
framework::VarDesc* feed_var = global_block->Var("feed");
feed_var->SetType(framework::proto::VarDesc::FEED_MINIBATCH);
feed_var->SetPersistable(true);
// prepend feed_op
for (size_t i = 0; i < feed_var_names_.size(); ++i) {
std::string var_name = feed_var_names_[i];
LOG(INFO) << "feed var's name: " << var_name;
// prepend_op
framework::OpDesc* op = global_block->PrependOp();
op->SetType("feed");
op->SetInput("X", {"feed"});
op->SetOutput("Out", {var_name});
op->SetAttr("col", {static_cast<int>(i)});
op->CheckAttrs();
}
}
void InferenceEngine::AppendFetchOp() {
if (!program_) {
LOG(FATAL) << "Please initialize the program_ first.";
}
framework::BlockDesc* global_block = program_->MutableBlock(0);
// create_var
framework::VarDesc* fetch_var = global_block->Var("fetch");
fetch_var->SetType(framework::proto::VarDesc::FETCH_LIST);
fetch_var->SetPersistable(true);
// append fetch_op
for (size_t i = 0; i < fetch_var_names_.size(); ++i) {
std::string var_name = fetch_var_names_[i];
LOG(INFO) << "fetch var's name: " << var_name;
// append_op
framework::OpDesc* op = global_block->AppendOp();
op->SetType("fetch");
op->SetInput("X", {var_name});
op->SetOutput("Out", {"fetch"});
op->SetAttr("col", {static_cast<int>(i)});
op->CheckAttrs();
}
}
void InferenceEngine::Execute(const std::vector<framework::LoDTensor>& feeds,
std::vector<framework::LoDTensor>& fetchs) {
if (!program_ || !load_program_) {
LOG(FATAL) << "Please initialize the program_ and load_program_ first.";
}
if (feeds.size() < feed_var_names_.size()) {
LOG(FATAL) << "Please feed " << feed_var_names_.size() << " input Tensors.";
}
auto* place = new platform::CPUPlace();
framework::InitDevices({"CPU"});
framework::Executor* executor = new framework::Executor(*place);
framework::Scope* scope = new framework::Scope();
executor->Run(*load_program_, scope, 0, true, true);
// set_feed_variable
for (size_t i = 0; i < feed_var_names_.size(); ++i) {
framework::SetFeedVariable(scope, feeds[i], "feed", i);
}
executor->Run(*program_, scope, 0, true, true);
// get_fetch_variable
fetchs.resize(fetch_var_names_.size());
for (size_t i = 0; i < fetch_var_names_.size(); ++i) {
fetchs[i] = framework::GetFetchVariable(*scope, "fetch", i);
}
delete place;
delete scope;
delete executor;
}
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/framework/block_desc.h"
#include "paddle/framework/lod_tensor.h"
#include "paddle/framework/program_desc.h"
namespace paddle {
class InferenceEngine {
public:
InferenceEngine() : program_(nullptr), load_program_(nullptr) {}
~InferenceEngine() {
delete program_;
delete load_program_;
}
void LoadInferenceModel(const std::string& dirname,
const std::vector<std::string>& feed_var_names,
const std::vector<std::string>& fetch_var_names);
void Execute(const std::vector<framework::LoDTensor>& feeds,
std::vector<framework::LoDTensor>& fetchs);
private:
bool IsParameter(const framework::VarDesc* var);
void GenerateLoadProgram(const std::string& dirname);
void PrependFeedOp();
void AppendFetchOp();
private:
framework::ProgramDesc* program_;
framework::ProgramDesc* load_program_;
std::vector<std::string> feed_var_names_;
std::vector<std::string> fetch_var_names_;
};
} // namespace paddle
@@ -114,5 +114,21 @@ void Free<platform::CUDAPlace>(platform::CUDAPlace place, void* p) {
 #endif
+
+size_t Usage::operator()(const platform::CPUPlace& cpu) const {
+  return Used(cpu);
+}
+
+size_t Usage::operator()(const platform::CUDAPlace& gpu) const {
+#ifdef PADDLE_WITH_CUDA
+  return Used(gpu);
+#else
+  PADDLE_THROW("'CUDAPlace' is not supported in CPU only device.");
+#endif
+}
+
+size_t memory_usage(const platform::Place& p) {
+  return boost::apply_visitor(Usage(), p);
+}
 }  // namespace memory
 }  // namespace paddle
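`memory_usage` wraps the `Usage` visitor so callers can query through the type-erased `platform::Place` instead of naming a concrete `CPUPlace`/`CUDAPlace`. A minimal sketch of the call site (the function name is illustrative):

```cpp
#include "paddle/memory/memory.h"

size_t current_cpu_usage() {
  paddle::platform::Place place = paddle::platform::CPUPlace();
  // Equivalent to paddle::memory::Used(paddle::platform::CPUPlace()).
  return paddle::memory::memory_usage(place);
}
```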
@@ -54,6 +54,13 @@ void Free(Place place, void* ptr);
 template <typename Place>
 size_t Used(Place place);
+
+struct Usage : public boost::static_visitor<size_t> {
+  size_t operator()(const platform::CPUPlace& cpu) const;
+  size_t operator()(const platform::CUDAPlace& gpu) const;
+};
+
+size_t memory_usage(const platform::Place& p);
+
 /**
  * \brief Free memory block in one place.
  *
...
@@ -44,6 +44,9 @@ TEST(BuddyAllocator, CPUAllocation) {
   EXPECT_NE(p, nullptr);
+  paddle::platform::Place place = cpu;
+  EXPECT_EQ(paddle::memory::Used(cpu), paddle::memory::memory_usage(place));
+
   paddle::memory::Free(cpu, p);
 }
@@ -99,6 +102,9 @@ TEST(BuddyAllocator, GPUAllocation) {
   EXPECT_NE(p, nullptr);
+  paddle::platform::Place place = gpu;
+  EXPECT_EQ(paddle::memory::Used(gpu), paddle::memory::memory_usage(place));
+
   paddle::memory::Free(gpu, p);
 }
...
@@ -61,106 +61,28 @@ function(op_library TARGET)
       ${op_common_deps})
   endif()
-  # net_op doesn't need pybind
-  if ("${TARGET}" STREQUAL "net_op")
-    set(pybind_flag 1)
-  endif()
-  if ("${TARGET}" STREQUAL "compare_op")
-    set(pybind_flag 1)
-    file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(equal);\n")
-  endif()
-  # conv_op contains several operators
-  if ("${TARGET}" STREQUAL "conv_op")
-    set(pybind_flag 1)
-    # It's enough to just adding one operator to pybind
-    file(APPEND ${pybind_file} "USE_OP(conv2d);\n")
-  endif()
-  # conv_cudnn_op contains several operators
-  if ("${TARGET}" STREQUAL "conv_cudnn_op")
-    set(pybind_flag 1)
-    # It's enough to just adding one operator to pybind
-    file(APPEND ${pybind_file} "USE_OP(conv2d_cudnn);\n")
-  endif()
-  # pool_op contains several operators
-  if ("${TARGET}" STREQUAL "pool_op")
-    set(pybind_flag 1)
-    # It's enough to just adding one operator to pybind
-    file(APPEND ${pybind_file} "USE_OP(pool2d);\n")
-  endif()
-  # pool_cudnn_op contains several operators
-  if ("${TARGET}" STREQUAL "pool_cudnn_op")
-    set(pybind_flag 1)
-    # It's enough to just adding one operator to pybind
-    file(APPEND ${pybind_file} "USE_OP(pool2d_cudnn);\n")
-  endif()
-  if ("${TARGET}" STREQUAL "logical_op")
-    set(pybind_flag 1)
-    file(APPEND ${pybind_file} "USE_OP(logical_and);\n")
-  endif()
-  # pool_with_index_op contains several operators
-  if ("${TARGET}" STREQUAL "pool_with_index_op")
-    set(pybind_flag 1)
-    # It's enough to just adding one operator to pybind
-    file(APPEND ${pybind_file} "USE_OP(max_pool2d_with_index);\n")
-  endif()
-  # conv_transpose_op contains several operators
-  if ("${TARGET}" STREQUAL "conv_transpose_op")
-    set(pybind_flag 1)
-    # It's enough to just adding one operator to pybind
-    file(APPEND ${pybind_file} "USE_OP(conv2d_transpose);\n")
-  endif()
-  # conv_transpose_cudnn_op contains two operators
-  if ("${TARGET}" STREQUAL "conv_transpose_cudnn_op")
-    set(pybind_flag 1)
-    # It's enough to just adding one operator to pybind
-    file(APPEND ${pybind_file} "USE_OP(conv2d_transpose_cudnn);\n")
-  endif()
-  # save_restore_op contains several operators
-  if ("${TARGET}" STREQUAL "save_restore_op")
-    set(pybind_flag 1)
-    # It's enough to just adding one operator to pybind
-    file(APPEND ${pybind_file} "USE_NO_KERNEL_OP(save);\n")
-  endif()
-  # activation_op contains several operators
-  if ("${TARGET}" STREQUAL "activation_op")
-    set(pybind_flag 1)
-    # It's enough to just adding one operator to pybind
-    file(APPEND ${pybind_file} "USE_OP(sigmoid);\n")
-  endif()
-  # nccl_op contains several operators
-  if ("${TARGET}" STREQUAL "nccl_op")
-    set(pybind_flag 1)
-    # It's enough to just adding one operator to pybind
-    file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(ncclAllReduce);\n")
-  endif()
-  # reduce_op contains several operators
-  if ("${TARGET}" STREQUAL "reduce_op")
-    set(pybind_flag 1)
-    # It's enough to just adding one operator to pybind
-    file(APPEND ${pybind_file} "USE_OP(reduce_sum);\n")
-  endif()
-  if ("${TARGET}" STREQUAL "tensor_array_read_write_op")
-    set(pybind_flag 1)
-    file(APPEND ${pybind_file} "USE_NO_KERNEL_OP(read_from_array);\nUSE_NO_KERNEL_OP(write_to_array);\n")
+  # Define operators that don't need pybind here.
+  foreach(manual_pybind_op "net_op" "compare_op" "logical_op" "nccl_op" "tensor_array_read_write_op")
+    if ("${TARGET}" STREQUAL "${manual_pybind_op}")
+      set(pybind_flag 1)
+    endif()
+  endforeach()
+  # For the registration of USE_OP, please refer to paddle/framework/op_registry.h.
+  # Note that it's enough to just add one operator to pybind in a *_op.cc file.
+  # For detailed pybind information, please see the generated paddle/pybind/pybind.h.
+  file(READ ${TARGET}.cc TARGET_CONTENT)
+  string(REGEX MATCH "REGISTER_OP\\(.*REGISTER_OP\\(" multi_register "${TARGET_CONTENT}")
+  string(REGEX MATCH "REGISTER_OP\\([a-z0-9_]*," one_register "${multi_register}")
+  if (one_register STREQUAL "")
+    string(REPLACE "_op" "" TARGET "${TARGET}")
+  else ()
+    string(REPLACE "REGISTER_OP(" "" TARGET "${one_register}")
+    string(REPLACE "," "" TARGET "${TARGET}")
   endif()
   # pybind USE_NO_KERNEL_OP
   # HACK: if REGISTER_OP_CPU_KERNEL presents the operator must have kernel
-  file(READ ${TARGET}.cc TARGET_CONTENT)
   string(REGEX MATCH "REGISTER_OP_CPU_KERNEL" regex_result "${TARGET_CONTENT}")
   string(REPLACE "_op" "" TARGET "${TARGET}")
   if (${pybind_flag} EQUAL 0 AND regex_result STREQUAL "")
@@ -171,7 +93,6 @@ function(op_library TARGET)
   # pybind USE_CPU_ONLY_OP
   list(LENGTH cu_srcs cu_srcs_len)
   list(LENGTH cu_cc_srcs cu_cc_srcs_len)
   if (${pybind_flag} EQUAL 0 AND ${cu_srcs_len} EQUAL 0 AND ${cu_cc_srcs_len} EQUAL 0)
     file(APPEND ${pybind_file} "USE_CPU_ONLY_OP(${TARGET});\n")
     set(pybind_flag 1)
@@ -188,6 +109,7 @@ add_subdirectory(nccl)
 if(WITH_GPU)
   op_library(nccl_op DEPS nccl_common)
+  file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(ncclAllReduce);\n")
 else()
   set(DEPS_OPS ${DEPS_OPS} nccl_op)
 endif()
@@ -229,8 +151,10 @@ op_library(lstm_op DEPS sequence2batch lstm_compute)
 op_library(conv_transpose_op DEPS vol2col)
 op_library(gru_op DEPS sequence2batch gru_compute)
 op_library(recurrent_op DEPS executor)
+op_library(warpctc_op DEPS dynload_warpctc sequence_padding math_function)
 op_library(cos_sim_op DEPS cos_sim_functor)
 op_library(hierarchical_sigmoid_op DEPS matrix_bit_code)
+op_library(parallel_do_op DEPS executor)
 # FIXME(typhoonzero): save/load depends lodtensor serialization functions
 op_library(save_op DEPS lod_tensor)
 op_library(load_op DEPS lod_tensor)
@@ -239,6 +163,8 @@ list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS})
 foreach(src ${GENERAL_OPS})
   op_library(${src})
 endforeach()
+
+file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(logical_and);\nUSE_NO_KERNEL_OP(read_from_array);\n")
 set(GLOB_OP_LIB ${OP_LIBRARY} CACHE INTERNAL "Global OP library")
...
@@ -53,7 +53,7 @@ class AccuracyOp : public framework::OperatorWithKernel {
   }
  protected:
-  framework::OpKernelType GetActualKernelType(
+  framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
     return framework::OpKernelType(
         framework::ToDataType(ctx.Input<Tensor>("Out")->type()),
...
@@ -15,6 +15,7 @@ limitations under the License. */
 #pragma once
 #include "paddle/framework/eigen.h"
 #include "paddle/framework/op_registry.h"
+#include "paddle/operators/detail/safe_ref.h"
 namespace paddle {
 namespace operators {
@@ -26,12 +27,16 @@ class ActivationKernel
   using T = typename Functor::ELEMENT_TYPE;
   void Compute(const framework::ExecutionContext& context) const override {
-    auto* X = context.Input<framework::Tensor>("X");
-    auto* Out = context.Output<framework::Tensor>("Out");
-    Out->mutable_data<T>(context.GetPlace());
-    auto x = framework::EigenVector<T>::Flatten(*X);
-    auto out = framework::EigenVector<T>::Flatten(*Out);
+    auto& X = detail::Ref(context.Input<framework::Tensor>("X"),
+                          "Cannot get input tensor X, variable name = %s",
+                          context.op().Input("X"));
+    auto& Out = detail::Ref(context.Output<framework::Tensor>("Out"),
+                            "Cannot get output tensor Out, variable name = %s",
+                            context.op().Output("Out"));
+    Out.mutable_data<T>(context.GetPlace());
+    auto x = framework::EigenVector<T>::Flatten(X);
+    auto out = framework::EigenVector<T>::Flatten(Out);
     auto* place =
         context.template device_context<DeviceContext>().eigen_device();
     Functor functor;
...
@@ -47,8 +47,7 @@ class AdagradOpKernel : public framework::OpKernel<T> {
           *ctx.Input<framework::Tensor>("Grad"));
       auto moment = framework::EigenVector<T>::Flatten(
           *ctx.Input<framework::Tensor>("Moment"));
-      auto lr = framework::EigenVector<T>::Flatten(
-          *ctx.Input<framework::Tensor>("LearningRate"));
+      auto* learning_rate = ctx.Input<framework::Tensor>("LearningRate");
       auto param_out = framework::EigenVector<T>::Flatten(*param_out_tensor);
       auto moment_out = framework::EigenVector<T>::Flatten(*moment_out_tensor);
@@ -56,8 +55,16 @@ class AdagradOpKernel : public framework::OpKernel<T> {
       moment_out.device(*place) = moment + grad * grad;
       Eigen::DSizes<int, 1> m_dsize(moment_out_tensor->numel());
-      param_out.device(*place) =
-          param - lr.broadcast(m_dsize) * grad / (moment_out.sqrt() + epsilon);
+      if (platform::is_cpu_place(ctx.GetPlace())) {
+        auto* lr = learning_rate->data<T>();
+        param_out.device(*place) =
+            param - lr[0] * grad / (moment_out.sqrt() + epsilon);
+      } else {
+        auto lr = framework::EigenVector<T>::Flatten(*learning_rate);
+        param_out.device(*place) =
+            param -
+            lr.broadcast(m_dsize) * grad / (moment_out.sqrt() + epsilon);
+      }
     } else if (grad_var->IsType<framework::SelectedRows>()) {
       auto* param_tensor = ctx.Input<framework::Tensor>("Param");
       PADDLE_ENFORCE_EQ(param_tensor, param_out_tensor);
...
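The CPU branch can read `lr[0]` directly because `LearningRate` is a one-element tensor: broadcasting it over the gradient and scaling by the scalar are numerically identical, and the scalar form skips materializing the broadcast. A standalone sketch of that equivalence with plain Eigen (not the Paddle kernel itself):

```cpp
#include <cassert>
#include <Eigen/Dense>

int main() {
  Eigen::ArrayXf grad(4);
  grad << 1.f, 2.f, 3.f, 4.f;
  Eigen::ArrayXf lr(1);  // one-element "LearningRate"
  lr << 0.1f;
  // Broadcast form (conceptually what the GPU branch does)...
  Eigen::ArrayXf broadcast = lr.replicate(grad.size(), 1) * grad;
  // ...matches the scalar form the CPU branch now uses.
  Eigen::ArrayXf scalar = lr(0) * grad;
  assert(broadcast.isApprox(scalar));
  return 0;
}
```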
@@ -42,7 +42,7 @@ class ArrayOp : public framework::OperatorBase {
     if (platform::is_gpu_place(i_tensor.place())) {
       // FIXME: Avoid copy from GPU to CPU
       framework::Tensor t;
-      framework::CopyFrom(i_tensor, platform::CPUPlace(), dev_ctx, &t);
+      framework::Copy(i_tensor, platform::CPUPlace(), dev_ctx, &t);
       dev_ctx.Wait();
       offset = static_cast<size_t>(*t.data<int64_t>());
     } else {
...
@@ -110,8 +110,8 @@ class ArrayToLoDTensorOp : public framework::OperatorBase {
         platform::DeviceContextPool::Instance();
     auto &dev_ctx = *pool.Get(place);
-    framework::CopyFrom(x[x_idx].Slice(start_offset, end_offset), place,
-                        dev_ctx, &slice);
+    framework::Copy(x[x_idx].Slice(start_offset, end_offset), place,
+                    dev_ctx, &slice);
     out_offset += len;
   }
 }
...
@@ -45,7 +45,7 @@ class AssignFunctor {
     out_rows.set_height(rows.height());
     auto &t = rows.value();
     auto *m = out_rows.mutable_value();
-    framework::CopyFrom(t, t.place(), dev_ctx_, m);
+    framework::Copy(t, t.place(), dev_ctx_, m);
   }
   template <typename T>
@@ -57,7 +57,7 @@ class AssignFunctor {
   void copy_tensor(const framework::LoDTensor &lod_tensor,
                    framework::LoDTensor *out) const {
     auto &out_tensor = *out;
-    CopyFrom(lod_tensor, lod_tensor.place(), dev_ctx_, &out_tensor);
+    Copy(lod_tensor, lod_tensor.place(), dev_ctx_, &out_tensor);
     out_tensor.set_lod(lod_tensor.lod());
   }
...
@@ -39,7 +39,7 @@ class AucOp : public framework::OperatorWithKernel {
   }
  protected:
-  framework::OpKernelType GetActualKernelType(
+  framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
     return framework::OpKernelType(
         framework::ToDataType(ctx.Input<Tensor>("Out")->type()),
...
@@ -64,7 +64,7 @@ class BatchNormOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE(x_dims.size() >= 2 && x_dims.size() <= 5,
                    "Input X must have 2 to 5 dimensions.");
-    const int C =
+    const int64_t C =
         (data_layout == DataLayout::kNCHW ? x_dims[1]
                                           : x_dims[x_dims.size() - 1]);
@@ -78,6 +78,7 @@ class BatchNormOp : public framework::OperatorWithKernel {
     ctx->SetOutputDim("VarianceOut", {C});
     ctx->SetOutputDim("SavedMean", {C});
     ctx->SetOutputDim("SavedVariance", {C});
+    ctx->ShareLoD("X", "Y");
   }
 };
@@ -305,7 +306,7 @@ class BatchNormGradOp : public framework::OperatorWithKernel {
   }
  protected:
-  framework::OpKernelType GetActualKernelType(
+  framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
     const auto *var = ctx.InputVar(framework::GradVarName("Y"));
     if (var == nullptr) {
...
@@ -55,10 +55,10 @@ class ChunkEvalOp : public framework::OperatorWithKernel {
   }
  protected:
-  framework::OpKernelType GetActualKernelType(
+  framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
     return framework::OpKernelType(framework::proto::DataType::FP32,
-                                   ctx.device_context());
+                                   platform::CPUPlace());
   }
 };
...
@@ -145,6 +145,7 @@ class ChunkEvalKernel : public framework::OpKernel<T> {
         context.Attr<std::vector<int>>("excluded_chunk_types").end());
     auto* inference = context.Input<LoDTensor>("Inference");
+    auto place = inference->place();
     auto* label = context.Input<LoDTensor>("Label");
     auto* precision = context.Output<Tensor>("Precision");
     auto* recall = context.Output<Tensor>("Recall");
@@ -155,15 +156,15 @@ class ChunkEvalKernel : public framework::OpKernel<T> {
     const int64_t* inference_data = inference->data<int64_t>();
     const int64_t* label_data = label->data<int64_t>();
-    T* precision_data = precision->mutable_data<T>(context.GetPlace());
-    T* racall_data = recall->mutable_data<T>(context.GetPlace());
-    T* f1_data = f1->mutable_data<T>(context.GetPlace());
+    T* precision_data = precision->mutable_data<T>(place);
+    T* racall_data = recall->mutable_data<T>(place);
+    T* f1_data = f1->mutable_data<T>(place);
     int64_t* num_infer_chunks_data =
-        num_infer_chunks->mutable_data<int64_t>(context.GetPlace());
+        num_infer_chunks->mutable_data<int64_t>(place);
     int64_t* num_label_chunks_data =
-        num_label_chunks->mutable_data<int64_t>(context.GetPlace());
+        num_label_chunks->mutable_data<int64_t>(place);
     int64_t* num_correct_chunks_data =
-        num_correct_chunks->mutable_data<int64_t>(context.GetPlace());
+        num_correct_chunks->mutable_data<int64_t>(place);
     *num_infer_chunks_data = 0;
     *num_label_chunks_data = 0;
     *num_correct_chunks_data = 0;
...
@@ -66,9 +66,9 @@ class CompareOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
  protected:
-  framework::OpKernelType GetActualKernelType(
+  framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
-    framework::OpKernelType kt = OperatorWithKernel::GetActualKernelType(ctx);
+    framework::OpKernelType kt = OperatorWithKernel::GetExpectedKernelType(ctx);
     // CompareOp kernel's device type is decided by input tensor place
     kt.place_ = ctx.Input<framework::LoDTensor>("X")->place();
     return kt;
...
@@ -315,10 +315,7 @@ class CudnnConvGradOpKernel : public framework::OpKernel<T> {
 }  // namespace operators
 }  // namespace paddle
-REGISTER_OP_KERNEL(conv2d, CUDNN, paddle::platform::CUDAPlace,
-                   paddle::operators::CudnnConvOpKernel<float>,
-                   paddle::operators::CudnnConvOpKernel<double>);
+// TODO(dzhwinter): the registration below should be removed
 REGISTER_OP_CUDA_KERNEL(conv2d_cudnn,
                         paddle::operators::CudnnConvOpKernel<float>,
                         paddle::operators::CudnnConvOpKernel<double>);
...
@@ -44,14 +44,12 @@ void ConvOp::InferShape(framework::InferShapeContext* ctx) const {
       paddings.size(), strides.size(),
       "Conv paddings dimension and Conv strides dimension should be the same.");
-  int input_channels = in_dims[1];
-  PADDLE_ENFORCE_EQ(input_channels, filter_dims[1] * groups,
+  PADDLE_ENFORCE_EQ(in_dims[1], filter_dims[1] * groups,
                     "The number of input channels should be equal to filter "
                     "channels * groups.");
-  int output_channels = filter_dims[0];
   PADDLE_ENFORCE_EQ(
-      output_channels % groups, 0,
+      filter_dims[0] % groups, 0,
       "The number of output channels should be divided by groups.");
   std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
@@ -66,6 +64,7 @@ void ConvOp::InferShape(framework::InferShapeContext* ctx) const {
         dilations[i], paddings[i], strides[i]));
   }
   ctx->SetOutputDim("Output", framework::make_ddim(output_shape));
+  ctx->ShareLoD("Input", "Output");
 }
 Conv2DOpMaker::Conv2DOpMaker(OpProto* proto, OpAttrChecker* op_checker)
@@ -231,7 +230,6 @@ void ConvOpGrad::InferShape(framework::InferShapeContext* ctx) const {
 namespace ops = paddle::operators;
 REGISTER_OP(conv2d, ops::ConvOp, ops::Conv2DOpMaker, conv2d_grad,
             ops::ConvOpGrad);
-namespace ops = paddle::operators;
 REGISTER_OP(conv3d, ops::ConvOp, ops::Conv3DOpMaker, conv3d_grad,
             ops::ConvOpGrad);
...
@@ -120,17 +120,11 @@ class CRFDecodingOp : public framework::OperatorWithKernel {
   }
  protected:
-  framework::OpKernelType GetActualKernelType(
+  framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
     return framework::OpKernelType(
         framework::ToDataType(ctx.Input<LoDTensor>("Emission")->type()),
-        ctx.device_context());
-  }
-  framework::OpKernelType GetExpectedKernelType(
-      const framework::OpKernelType& actual_kernel_type) const override {
-    return framework::OpKernelType(actual_kernel_type.data_type_,
-                                   platform::CPUPlace());
+        platform::CPUPlace());
   }
 };
 }  // namespace operators
...
@@ -28,9 +28,6 @@ template <typename DeviceContext, typename T>
 class CRFDecodingOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()),
-                   "The crf_decoding operator can only run on CPU.");
     auto* emission_weights = ctx.Input<LoDTensor>("Emission");
     auto* transition_weights = ctx.Input<Tensor>("Transition");
     auto* label = ctx.Input<LoDTensor>("Label");
...
@@ -51,7 +51,7 @@ class CrossEntropyOp : public framework::OperatorWithKernel {
  protected:
   // Explicitly set that the data type of computation kernel of cross_entropy
   // is determined by its input "X".
-  framework::OpKernelType GetActualKernelType(
+  framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
     return framework::OpKernelType(
         framework::ToDataType(ctx.Input<Tensor>("X")->type()),
@@ -101,7 +101,7 @@ class CrossEntropyGradientOp : public framework::OperatorWithKernel {
  protected:
   // Explicitly set that the data type of computation kernel of cross_entropy
   // is determined by its input "X".
-  framework::OpKernelType GetActualKernelType(
+  framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
     return framework::OpKernelType(
         framework::ToDataType(ctx.Input<Tensor>("X")->type()),
...
@@ -21,14 +21,9 @@ namespace detail {
 Status SendRecvServerImpl::SendVariable(ServerContext *context,
                                         const VariableMessage *in_var,
                                         VoidMessage *out_var) {
-  // TODO(typhoonzero): support different variable types.
-  std::istringstream iss(in_var->serialized());
-  framework::LoDTensor t;
-  framework::DeserializeFromStream(iss, &t);
-  TensorWithName tensor_with_name =
-      std::make_pair(in_var->varname(), std::move(t));
-  var_recv_queue_.Push(std::move(tensor_with_name));
+  MessageWithName msg_with_name =
+      std::make_pair(in_var->varname(), std::move(*in_var));
+  var_recv_queue_.Push(std::move(msg_with_name));
   return Status::OK;
 }
@@ -37,14 +32,8 @@ Status SendRecvServerImpl::GetVariable(ServerContext *context,
                                        VariableMessage *out_var) {
   std::string get_var_name = in_var->varname();
   auto *var = scope_->FindVar(get_var_name);
-  auto tensor = var->Get<framework::LoDTensor>();
-  std::ostringstream oss;
-  framework::SerializeToStream(oss, tensor, platform::CPUDeviceContext());
-  std::string *varname = out_var->mutable_varname();
-  *varname = get_var_name;
-  std::string *serialized = out_var->mutable_serialized();
-  *serialized = oss.str();
+  SerializeToMessage(get_var_name, var, platform::CPUDeviceContext(), out_var);
   return Status::OK;
 }
...
@@ -27,14 +27,8 @@ bool RPCClient::SendVariable(const framework::Scope& scope,
   auto ctx = platform::CPUDeviceContext();
   auto* var = scope.FindVar(inname);
   PADDLE_ENFORCE(var);
-  // TODO(typhoonzero): support SelectedRows
-  PADDLE_ENFORCE(var->IsType<framework::LoDTensor>(),
-                 "Only support LoDTensor, %s has wrong type", inname);
-  const framework::LoDTensor& tensor = var->Get<framework::LoDTensor>();
-  std::ostringstream oss;
-  framework::SerializeToStream(oss, tensor, ctx);
-  msg.set_varname(inname);
-  msg.set_serialized(oss.str());
+  SerializeToMessage(inname, var, ctx, &msg);
   Status status = stub_->SendVariable(&context, msg, &out_msg);
   if (!status.ok()) {
     LOG(ERROR) << "gRPC error: " << status.error_message();
@@ -50,19 +44,15 @@ bool RPCClient::GetVariable(const framework::Scope& scope,
   call_msg.set_varname(outname);
   auto ctx = platform::CPUDeviceContext();
   Status status = stub_->GetVariable(&context, call_msg, &ret_msg);
+  auto* outvar = scope.FindVar(outname);
   if (!status.ok()) {
     LOG(ERROR) << "gRPC error: " << status.error_message();
     return false;
   }
   std::istringstream iss(ret_msg.serialized());
+  DeserializeFromMessage(ret_msg, ctx, outvar);
-  framework::LoDTensor ret_tensor;
-  framework::DeserializeFromStream(iss, &ret_tensor);
-  auto* outvar = scope.FindVar(outname);
-  framework::LoDTensor* out_tensor = outvar->GetMutable<framework::LoDTensor>();
-  // FIXME(typhoonzero): do not copy.
-  framework::CopyFrom(ret_tensor, ctx.GetPlace(), ctx, out_tensor);
   return true;
 }
...
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. Licensed under
+the Apache License, Version 2.0 (the "License"); you may not use this file
+except in compliance with the License.
 You may obtain a copy of the License at
 http://www.apache.org/licenses/LICENSE-2.0
@@ -13,7 +12,6 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 syntax = "proto3";
 package sendrecv;
 service SendRecvService {
@@ -29,12 +27,18 @@ service SendRecvService {
 // VariableMessage is serialized paddle variable message.
 // It can be:
-// Tensor
 // LoDTensor
 // SelectedRows
+enum VarType {
+  LOD_TENSOR = 0;
+  SELECTED_ROWS = 1;
+}
 message VariableMessage {
   string varname = 1;
-  bytes serialized = 2;
+  // TODO(Yancey1989): reference framework::proto::VarDesc::VarType
+  VarType type = 2;
+  bytes serialized = 3;
 }
 message VoidMessage {}
...@@ -14,10 +14,10 @@ limitations under the License. */ ...@@ -14,10 +14,10 @@ limitations under the License. */
#pragma once #pragma once
#include "paddle/framework/data_type.h"
#include "paddle/framework/lod_tensor.h" #include "paddle/framework/lod_tensor.h"
#include "paddle/framework/scope.h" #include "paddle/framework/scope.h"
#include "paddle/framework/selected_rows.h" #include "paddle/framework/selected_rows.h"
#include "paddle/framework/var_type.h"
#include "paddle/operators/detail/simple_block_queue.h" #include "paddle/operators/detail/simple_block_queue.h"
#include "paddle/operators/detail/send_recv.grpc.pb.h" #include "paddle/operators/detail/send_recv.grpc.pb.h"
...@@ -44,7 +44,7 @@ namespace paddle { ...@@ -44,7 +44,7 @@ namespace paddle {
namespace operators { namespace operators {
namespace detail { namespace detail {
typedef std::pair<std::string, framework::LoDTensor> TensorWithName; typedef std::pair<std::string, sendrecv::VariableMessage> MessageWithName;
class SendRecvServerImpl final : public SendRecvService::Service { class SendRecvServerImpl final : public SendRecvService::Service {
public: public:
...@@ -60,13 +60,13 @@ class SendRecvServerImpl final : public SendRecvService::Service { ...@@ -60,13 +60,13 @@ class SendRecvServerImpl final : public SendRecvService::Service {
void Done(); void Done();
void SetScope(framework::Scope *scope) { scope_ = scope; }; void SetScope(framework::Scope *scope) { scope_ = scope; };
const TensorWithName Get() { return this->var_recv_queue_.Pop(); } const MessageWithName Get() { return this->var_recv_queue_.Pop(); }
void Push(const TensorWithName &msg) { this->var_recv_queue_.Push(msg); } void Push(const MessageWithName &msg) { this->var_recv_queue_.Push(msg); }
private: private:
// received variable from RPC, operators fetch variable from this queue. // received variable from RPC, operators fetch variable from this queue.
SimpleBlockQueue<TensorWithName> var_recv_queue_; SimpleBlockQueue<MessageWithName> var_recv_queue_;
framework::Scope *scope_; framework::Scope *scope_;
// condition of the sub program // condition of the sub program
std::mutex mutex_; std::mutex mutex_;
...@@ -89,6 +89,53 @@ class RPCClient { ...@@ -89,6 +89,53 @@ class RPCClient {
std::unique_ptr<SendRecvService::Stub> stub_; std::unique_ptr<SendRecvService::Stub> stub_;
}; };
inline void SerializeToMessage(const std::string &name,
const framework::Variable *var,
const platform::DeviceContext &ctx,
VariableMessage *msg) {
msg->set_varname(name);
std::ostringstream oss;
switch (framework::ToVarType(var->Type())) {
case framework::proto::VarDesc_VarType_LOD_TENSOR:
msg->set_type(sendrecv::VarType::LOD_TENSOR);
framework::SerializeToStream(oss, var->Get<framework::LoDTensor>(), ctx);
break;
case framework::proto::VarDesc_VarType_SELECTED_ROWS:
msg->set_type(sendrecv::VarType::SELECTED_ROWS);
framework::SerializeToStream(oss, var->Get<framework::SelectedRows>(),
ctx);
break;
default: {
PADDLE_THROW("Serialize does not support type: %s",
typeid(var->Type()).name());
break;
}
}
msg->set_serialized(oss.str());
}
inline void DeserializeFromMessage(const VariableMessage &msg,
const platform::DeviceContext &ctx,
framework::Variable *var) {
using namespace paddle::framework::proto;
std::istringstream iss(msg.serialized());
switch (msg.type()) {
case sendrecv::VarType::LOD_TENSOR:
DeserializeFromStream(iss, var->GetMutable<framework::LoDTensor>(), ctx);
break;
case sendrecv::VarType::SELECTED_ROWS: {
DeserializeFromStream(iss, var->GetMutable<framework::SelectedRows>(),
ctx);
break;
}
default: {
PADDLE_THROW("Deserialize does not support type: %s",
typeid(var->Type()).name());
break;
}
}
}
} // namespace detail } // namespace detail
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
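[Editor's note] Taken together, the two inline helpers above give a symmetric round trip between a framework::Variable and a wire message. A hedged usage sketch, assuming a CPU device context and a scope that already holds a LoDTensor variable named "x":

// Sketch only; error handling omitted.
platform::CPUPlace cpu;
platform::CPUDeviceContext ctx(cpu);
framework::Variable* in = scope.FindVar("x");
sendrecv::VariableMessage msg;
detail::SerializeToMessage("x", in, ctx, &msg);  // Variable -> message

framework::Variable out;
detail::DeserializeFromMessage(msg, ctx, &out);  // message -> Variable
auto& recovered = out.Get<framework::LoDTensor>();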
...@@ -98,16 +98,16 @@ class DetectionOutputKernel : public framework::OpKernel<T> { ...@@ -98,16 +98,16 @@ class DetectionOutputKernel : public framework::OpKernel<T> {
T* conf_data = conf_tensor.data<T>(); T* conf_data = conf_tensor.data<T>();
if (platform::is_gpu_place(context.GetPlace())) { if (platform::is_gpu_place(context.GetPlace())) {
loc_cpu.mutable_data<T>(loc_tensor.dims(), platform::CPUPlace()); loc_cpu.mutable_data<T>(loc_tensor.dims(), platform::CPUPlace());
framework::CopyFrom(loc_tensor, platform::CPUPlace(), framework::Copy(loc_tensor, platform::CPUPlace(),
context.device_context(), &loc_cpu); context.device_context(), &loc_cpu);
loc_data = loc_cpu.data<T>(); loc_data = loc_cpu.data<T>();
conf_cpu.mutable_data<T>(conf_tensor.dims(), platform::CPUPlace()); conf_cpu.mutable_data<T>(conf_tensor.dims(), platform::CPUPlace());
framework::CopyFrom(conf_tensor, platform::CPUPlace(), framework::Copy(conf_tensor, platform::CPUPlace(),
context.device_context(), &conf_cpu); context.device_context(), &conf_cpu);
conf_data = conf_cpu.data<T>(); conf_data = conf_cpu.data<T>();
priorbox_cpu.mutable_data<T>(in_priorbox->dims(), platform::CPUPlace()); priorbox_cpu.mutable_data<T>(in_priorbox->dims(), platform::CPUPlace());
framework::CopyFrom(*in_priorbox, platform::CPUPlace(), framework::Copy(*in_priorbox, platform::CPUPlace(),
context.device_context(), &priorbox_cpu); context.device_context(), &priorbox_cpu);
priorbox_data = priorbox_cpu.data<T>(); priorbox_data = priorbox_cpu.data<T>();
} }
// get decode bboxes // get decode bboxes
...@@ -158,8 +158,8 @@ class DetectionOutputKernel : public framework::OpKernel<T> { ...@@ -158,8 +158,8 @@ class DetectionOutputKernel : public framework::OpKernel<T> {
batch_size, all_indices, all_decoded_bboxes, batch_size, all_indices, all_decoded_bboxes,
out_data); out_data);
if (platform::is_gpu_place(context.GetPlace())) { if (platform::is_gpu_place(context.GetPlace())) {
framework::CopyFrom(out_cpu, platform::CUDAPlace(), framework::Copy(out_cpu, platform::CUDAPlace(), context.device_context(),
context.device_context(), out); out);
} }
} }
}; };
......
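[Editor's note] The CopyFrom -> Copy rename recurs through the rest of this commit; the argument order is unchanged. As a reminder of the contract assumed above (a sketch, not the authoritative declaration):

// framework::Copy(src, dst_place, dev_ctx, dst)
//   copies src into *dst, allocating on dst_place; when either side
//   lives on a GPU the copy is issued on dev_ctx's stream, so callers
//   that read the result immediately must dev_ctx.Wait() first.
framework::Copy(src_tensor, platform::CPUPlace(), dev_ctx, &dst_tensor);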
...@@ -126,8 +126,7 @@ class ExpandGradKernel : public framework::OpKernel<T> { ...@@ -126,8 +126,7 @@ class ExpandGradKernel : public framework::OpKernel<T> {
auto* in0 = context.Input<Tensor>(framework::GradVarName("Out")); auto* in0 = context.Input<Tensor>(framework::GradVarName("Out"));
auto* out0 = context.Output<Tensor>(framework::GradVarName("X")); auto* out0 = context.Output<Tensor>(framework::GradVarName("X"));
out0->mutable_data<T>(context.GetPlace()); out0->mutable_data<T>(context.GetPlace());
framework::CopyFrom(*in0, context.GetPlace(), context.device_context(), framework::Copy(*in0, context.GetPlace(), context.device_context(), out0);
out0);
} else { } else {
switch (dims) { switch (dims) {
REP_EXPAND_GRAD_TEMPLATE(72) REP_EXPAND_GRAD_TEMPLATE(72)
......
...@@ -52,7 +52,7 @@ class FeedOp : public framework::OperatorBase { ...@@ -52,7 +52,7 @@ class FeedOp : public framework::OperatorBase {
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Get(place); auto &dev_ctx = *pool.Get(place);
framework::CopyFrom(feed_item, place, dev_ctx, out_item); framework::Copy(feed_item, place, dev_ctx, out_item);
out_item->set_lod(feed_item.lod()); out_item->set_lod(feed_item.lod());
} }
}; };
......
...@@ -53,9 +53,9 @@ class FetchOp : public framework::OperatorBase { ...@@ -53,9 +53,9 @@ class FetchOp : public framework::OperatorBase {
// FIXME(yuyang18): Should we assume the fetch operator always generate // FIXME(yuyang18): Should we assume the fetch operator always generate
// CPU outputs? // CPU outputs?
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Get(place); auto &dev_ctx = *pool.Get(src_item.place());
CopyFrom(src_item, platform::CPUPlace(), dev_ctx, &dst_item); Copy(src_item, platform::CPUPlace(), dev_ctx, &dst_item);
dev_ctx.Wait(); dev_ctx.Wait();
dst_item.set_lod(src_item.lod()); dst_item.set_lod(src_item.lod());
......
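[Editor's note] Besides the rename, FetchOp now asks the pool for the context of src_item.place() instead of the executor's place, so Wait() synchronizes the stream that actually owns the source tensor. A hedged sketch of why that matters for a GPU-resident fetch:

// If src_item lives on CUDAPlace(0), this context wraps that device's
// stream; Wait() then guarantees the device-to-host copy finished
// before the fetched tensor is handed back to the caller.
auto &dev_ctx = *platform::DeviceContextPool::Instance().Get(src_item.place());
Copy(src_item, platform::CPUPlace(), dev_ctx, &dst_item);
dev_ctx.Wait();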
...@@ -49,7 +49,7 @@ class FillConstantBatchSizeLikeOp : public framework::OperatorWithKernel { ...@@ -49,7 +49,7 @@ class FillConstantBatchSizeLikeOp : public framework::OperatorWithKernel {
} }
protected: protected:
framework::OpKernelType GetActualKernelType( framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override { const framework::ExecutionContext &ctx) const override {
return framework::OpKernelType( return framework::OpKernelType(
static_cast<framework::proto::DataType>(ctx.Attr<int>("dtype")), static_cast<framework::proto::DataType>(ctx.Attr<int>("dtype")),
......
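[Editor's note] GetActualKernelType is renamed to GetExpectedKernelType across the operators below. A minimal sketch of the overridden hook for a hypothetical operator whose kernel data type follows its input "X" (MyOp and its input name are illustrative, not from this commit):

class MyOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

 protected:
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override {
    // Kernel data type is taken from input "X"; place follows the context.
    return framework::OpKernelType(
        framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()),
        ctx.device_context());
  }
};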
...@@ -72,7 +72,7 @@ class FillOp : public framework::OperatorBase { ...@@ -72,7 +72,7 @@ class FillOp : public framework::OperatorBase {
platform::DeviceContextPool &pool = platform::DeviceContextPool &pool =
platform::DeviceContextPool::Instance(); platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Get(place); auto &dev_ctx = *pool.Get(place);
framework::CopyFrom(tensor, place, dev_ctx, &out); framework::Copy(tensor, place, dev_ctx, &out);
} }
} }
}; };
......
...@@ -40,7 +40,7 @@ class GatherOp : public framework::OperatorWithKernel { ...@@ -40,7 +40,7 @@ class GatherOp : public framework::OperatorWithKernel {
} }
protected: protected:
framework::OpKernelType GetActualKernelType( framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override { const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType( return framework::OpKernelType(
framework::ToDataType(ctx.Input<Tensor>("X")->type()), framework::ToDataType(ctx.Input<Tensor>("X")->type()),
...@@ -57,7 +57,7 @@ class GatherGradOp : public framework::OperatorWithKernel { ...@@ -57,7 +57,7 @@ class GatherGradOp : public framework::OperatorWithKernel {
} }
protected: protected:
framework::OpKernelType GetActualKernelType( framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override { const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType( return framework::OpKernelType(
framework::ToDataType(ctx.Input<Tensor>("X")->type()), framework::ToDataType(ctx.Input<Tensor>("X")->type()),
......
...@@ -60,7 +60,7 @@ class GaussianRandomOp : public framework::OperatorWithKernel { ...@@ -60,7 +60,7 @@ class GaussianRandomOp : public framework::OperatorWithKernel {
} }
protected: protected:
framework::OpKernelType GetActualKernelType( framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override { const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType( return framework::OpKernelType(
static_cast<framework::proto::DataType>(ctx.Attr<int>("dtype")), static_cast<framework::proto::DataType>(ctx.Attr<int>("dtype")),
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <thread>
#include "paddle/framework/op_registry.h"
#include "paddle/operators/detail/safe_ref.h"
#include "paddle/platform/place.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/platform/gpu_info.h"
#endif
namespace paddle {
namespace operators {
static size_t CUDADevCount() {
#ifdef PADDLE_WITH_CUDA
return platform::GetCUDADeviceCount();
#else
return 0UL;
#endif
}
class GetPlacesOp : public framework::OperatorBase {
public:
GetPlacesOp(const std::string &type, const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs)
: OperatorBase(type, inputs, outputs, attrs) {}
void Run(const framework::Scope &scope,
const platform::Place &place) const override {
std::string device_type = Attr<std::string>("device_type");
auto device_count = static_cast<size_t>(Attr<int>("device_count"));
if (device_count == 0) {
if (device_type == "CUDA") {
device_count = CUDADevCount();
} else if (device_type == "CPU") {
device_count = std::thread::hardware_concurrency();
}
}
    PADDLE_ENFORCE_NE(device_count, 0, "Cannot determine %s device count",
                      device_type);
auto out_var_name = Output("Out");
auto &places =
*(detail::Ref(scope.FindVar(out_var_name),
"Output variable %s cannot be found", out_var_name)
.GetMutable<platform::PlaceList>());
places.reserve(device_count);
if (device_type == "CUDA") {
PADDLE_ENFORCE_LE(device_count, CUDADevCount(),
"Only %d CUDA devices found, cannot set to %d",
CUDADevCount(), device_count);
for (size_t i = 0; i < device_count; ++i) {
places.emplace_back(platform::CUDAPlace(i));
}
} else if (device_type == "CPU") {
for (size_t i = 0; i < device_count; ++i) {
places.emplace_back(platform::CPUPlace());
}
}
}
};
class GetPlacesOpProtoMaker : public framework::OpProtoAndCheckerMaker {
public:
GetPlacesOpProtoMaker(OpProto *proto, OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddOutput("Out", "vector of Place");
AddAttr<int>("device_count", "device count").SetDefault(1);
AddAttr<std::string>("device_type",
R"(device type must be in ["CPU", "CUDA"])")
.InEnum({"CPU", "CUDA"});
AddComment(R"DOC(
Returns a list of places based on the "device_type" and "device_count"
attributes. The list will be used for parallel execution.
)DOC");
}
};
class GetPlacesInferVarType : public framework::VarTypeInference {
public:
void operator()(const framework::OpDesc &op_desc,
framework::BlockDesc *block) const override {
for (auto &o_name : op_desc.Output("Out")) {
block->FindRecursiveOrCreateVar(o_name).SetType(
framework::proto::VarDesc::PLACE_LIST);
}
}
};
class GetPlacesInferShape : public framework::InferShapeBase {
public:
void operator()(framework::InferShapeContext *context) const override {
// Do nothing
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(get_places, ops::GetPlacesOp, ops::GetPlacesOpProtoMaker,
ops::GetPlacesInferVarType, ops::GetPlacesInferShape);
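[Editor's note] get_places_op.cc is new in this commit. A hedged sketch of driving it straight through the op registry; the variable name and attribute values are made up for illustration:

// Fills variable "places" with up to 4 CPU places.
framework::Scope scope;
scope.Var("places");
framework::AttributeMap attrs;
attrs["device_count"] = 4;
attrs["device_type"] = std::string("CPU");
auto op = framework::OpRegistry::CreateOp(
    "get_places", {}, {{"Out", {"places"}}}, attrs);
op->Run(scope, platform::CPUPlace());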
...@@ -183,7 +183,7 @@ class LinearChainCRFOp : public framework::OperatorWithKernel { ...@@ -183,7 +183,7 @@ class LinearChainCRFOp : public framework::OperatorWithKernel {
protected: protected:
// Explicitly set that the data type of computation kernel of linear_chain_crf // Explicitly set that the data type of computation kernel of linear_chain_crf
// is determined by its input "Emission". // is determined by its input "Emission".
framework::OpKernelType GetActualKernelType( framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override { const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType( return framework::OpKernelType(
framework::ToDataType(ctx.Input<LoDTensor>("Emission")->type()), framework::ToDataType(ctx.Input<LoDTensor>("Emission")->type()),
...@@ -242,7 +242,7 @@ class LinearChainCRFGradOp : public framework::OperatorWithKernel { ...@@ -242,7 +242,7 @@ class LinearChainCRFGradOp : public framework::OperatorWithKernel {
protected: protected:
// Explicitly set that the data type of output of the linear_chain_crf_grad // Explicitly set that the data type of output of the linear_chain_crf_grad
// operator is determined by its input: gradients of LogLikelihood. // operator is determined by its input: gradients of LogLikelihood.
framework::OpKernelType GetActualKernelType( framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override { const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType( return framework::OpKernelType(
framework::ToDataType( framework::ToDataType(
......
...@@ -196,7 +196,7 @@ class LinearChainCRFOpKernel : public framework::OpKernel<T> { ...@@ -196,7 +196,7 @@ class LinearChainCRFOpKernel : public framework::OpKernel<T> {
auto copyLoDTensor = [](const platform::DeviceContext& ctx, auto copyLoDTensor = [](const platform::DeviceContext& ctx,
const LoDTensor& src, LoDTensor* dst) { const LoDTensor& src, LoDTensor* dst) {
dst->mutable_data<T>(src.dims(), platform::CPUPlace()); dst->mutable_data<T>(src.dims(), platform::CPUPlace());
framework::CopyFrom(src, platform::CPUPlace(), ctx, dst); framework::Copy(src, platform::CPUPlace(), ctx, dst);
}; };
copyLoDTensor(ctx, emission_weights_src, emission_weights_dst); copyLoDTensor(ctx, emission_weights_src, emission_weights_dst);
...@@ -204,8 +204,8 @@ class LinearChainCRFOpKernel : public framework::OpKernel<T> { ...@@ -204,8 +204,8 @@ class LinearChainCRFOpKernel : public framework::OpKernel<T> {
transition_weights_dst->mutable_data<T>(transition_weights_src.dims(), transition_weights_dst->mutable_data<T>(transition_weights_src.dims(),
platform::CPUPlace()); platform::CPUPlace());
framework::CopyFrom(transition_weights_src, platform::CPUPlace(), ctx, framework::Copy(transition_weights_src, platform::CPUPlace(), ctx,
transition_weights_dst); transition_weights_dst);
} }
void CopyOutputsToGpuMemory(const platform::DeviceContext& ctx, void CopyOutputsToGpuMemory(const platform::DeviceContext& ctx,
...@@ -220,7 +220,7 @@ class LinearChainCRFOpKernel : public framework::OpKernel<T> { ...@@ -220,7 +220,7 @@ class LinearChainCRFOpKernel : public framework::OpKernel<T> {
auto copyTensor = [](const platform::DeviceContext& ctx, const Tensor& src, auto copyTensor = [](const platform::DeviceContext& ctx, const Tensor& src,
Tensor* dst) { Tensor* dst) {
dst->mutable_data<T>(platform::CUDAPlace()); dst->mutable_data<T>(platform::CUDAPlace());
framework::CopyFrom(src, platform::CUDAPlace(), ctx, dst); framework::Copy(src, platform::CUDAPlace(), ctx, dst);
}; };
copyTensor(ctx, emission_exps_src, emission_exps_dst); copyTensor(ctx, emission_exps_src, emission_exps_dst);
copyTensor(ctx, transition_exps_src, transition_exps_dst); copyTensor(ctx, transition_exps_src, transition_exps_dst);
...@@ -410,12 +410,12 @@ class LinearChainCRFGradOpKernel : public framework::OpKernel<T> { ...@@ -410,12 +410,12 @@ class LinearChainCRFGradOpKernel : public framework::OpKernel<T> {
// Copy the inputs from GPU memory to CPU memory when this operators runs on // Copy the inputs from GPU memory to CPU memory when this operators runs on
// GPU device. // GPU device.
label_dst->mutable_data<T>(label_src.dims(), platform::CPUPlace()); label_dst->mutable_data<T>(label_src.dims(), platform::CPUPlace());
framework::CopyFrom(label_src, platform::CPUPlace(), ctx, label_dst); framework::Copy(label_src, platform::CPUPlace(), ctx, label_dst);
auto copyTensor = [](const platform::DeviceContext& ctx, const Tensor& src, auto copyTensor = [](const platform::DeviceContext& ctx, const Tensor& src,
Tensor* dst) { Tensor* dst) {
dst->mutable_data<T>(src.dims(), platform::CPUPlace()); dst->mutable_data<T>(src.dims(), platform::CPUPlace());
framework::CopyFrom(src, platform::CPUPlace(), ctx, dst); framework::Copy(src, platform::CPUPlace(), ctx, dst);
}; };
copyTensor(ctx, emission_exps_src, emission_exps_dst); copyTensor(ctx, emission_exps_src, emission_exps_dst);
copyTensor(ctx, transition_exps_src, transition_exps_dst); copyTensor(ctx, transition_exps_src, transition_exps_dst);
...@@ -434,7 +434,7 @@ class LinearChainCRFGradOpKernel : public framework::OpKernel<T> { ...@@ -434,7 +434,7 @@ class LinearChainCRFGradOpKernel : public framework::OpKernel<T> {
Tensor* dst) { Tensor* dst) {
if (src && dst) { if (src && dst) {
dst->mutable_data<T>(platform::CUDAPlace()); dst->mutable_data<T>(platform::CUDAPlace());
framework::CopyFrom(*src, platform::CUDAPlace(), ctx, dst); framework::Copy(*src, platform::CUDAPlace(), ctx, dst);
} }
}; };
copyTensor(ctx, emission_grad_src, emission_grad_dst); copyTensor(ctx, emission_grad_src, emission_grad_dst);
......
...@@ -38,10 +38,10 @@ class LoadOp : public framework::OperatorBase { ...@@ -38,10 +38,10 @@ class LoadOp : public framework::OperatorBase {
out_var_name); out_var_name);
auto *tensor = out_var->GetMutable<framework::LoDTensor>(); auto *tensor = out_var->GetMutable<framework::LoDTensor>();
DeserializeFromStream(fin, tensor);
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Get(place); auto &dev_ctx = *pool.Get(place);
DeserializeFromStream(fin, tensor, dev_ctx);
if (platform::is_gpu_place(place)) { if (platform::is_gpu_place(place)) {
// copy CPU to GPU // copy CPU to GPU
...@@ -53,7 +53,7 @@ class LoadOp : public framework::OperatorBase { ...@@ -53,7 +53,7 @@ class LoadOp : public framework::OperatorBase {
out_var->Clear(); out_var->Clear();
tensor = out_var->GetMutable<framework::LoDTensor>(); tensor = out_var->GetMutable<framework::LoDTensor>();
tensor->set_lod(cpu_tensor.lod()); tensor->set_lod(cpu_tensor.lod());
CopyFrom(cpu_tensor, place, dev_ctx, tensor); Copy(cpu_tensor, place, dev_ctx, tensor);
} }
} }
}; };
......
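[Editor's note] DeserializeFromStream now takes the device context so it can place the tensor's buffer correctly as it reads. A hedged sketch of the updated call, mirroring the load path above (the file name is illustrative):

std::ifstream fin("fc_0.w_0", std::ios::binary);
framework::LoDTensor tensor;
auto &dev_ctx = *platform::DeviceContextPool::Instance().Get(place);
framework::DeserializeFromStream(fin, &tensor, dev_ctx);  // new 3-arg form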
...@@ -66,7 +66,7 @@ class LoDRankTableInferVarType : public framework::VarTypeInference { ...@@ -66,7 +66,7 @@ class LoDRankTableInferVarType : public framework::VarTypeInference {
void operator()(const framework::OpDesc &op_desc, void operator()(const framework::OpDesc &op_desc,
framework::BlockDesc *block) const override { framework::BlockDesc *block) const override {
for (auto &o : op_desc.Output("Out")) { for (auto &o : op_desc.Output("Out")) {
block->FindRecursiveOrCreateVar(o)->SetType( block->FindRecursiveOrCreateVar(o).SetType(
framework::proto::VarDesc::LOD_RANK_TABLE); framework::proto::VarDesc::LOD_RANK_TABLE);
} }
} }
......
...@@ -38,7 +38,7 @@ class LoDResetOp : public framework::OperatorWithKernel { ...@@ -38,7 +38,7 @@ class LoDResetOp : public framework::OperatorWithKernel {
} }
protected: protected:
framework::OpKernelType GetActualKernelType( framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override { const framework::ExecutionContext &ctx) const override {
return framework::OpKernelType( return framework::OpKernelType(
framework::ToDataType(ctx.Input<framework::LoDTensor>("X")->type()), framework::ToDataType(ctx.Input<framework::LoDTensor>("X")->type()),
...@@ -97,7 +97,7 @@ class LoDResetGradOp : public framework::OperatorWithKernel { ...@@ -97,7 +97,7 @@ class LoDResetGradOp : public framework::OperatorWithKernel {
} }
protected: protected:
framework::OpKernelType GetActualKernelType( framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override { const framework::ExecutionContext &ctx) const override {
return framework::OpKernelType( return framework::OpKernelType(
framework::ToDataType(ctx.Input<framework::LoDTensor>("X")->type()), framework::ToDataType(ctx.Input<framework::LoDTensor>("X")->type()),
......
...@@ -33,8 +33,8 @@ class LoDResetKernel : public framework::OpKernel<T> { ...@@ -33,8 +33,8 @@ class LoDResetKernel : public framework::OpKernel<T> {
auto* lod = lod_t->data<int>(); auto* lod = lod_t->data<int>();
if (platform::is_gpu_place(ctx.GetPlace())) { if (platform::is_gpu_place(ctx.GetPlace())) {
framework::Tensor lod_cpu; framework::Tensor lod_cpu;
framework::CopyFrom(*lod_t, platform::CPUPlace(), ctx.device_context(), framework::Copy(*lod_t, platform::CPUPlace(), ctx.device_context(),
&lod_cpu); &lod_cpu);
lod = lod_cpu.data<int>(); lod = lod_cpu.data<int>();
} }
level0 = std::vector<int>(lod, lod + lod_t->numel()); level0 = std::vector<int>(lod, lod + lod_t->numel());
......
...@@ -92,9 +92,9 @@ class LoDTensorToArrayOp : public framework::OperatorBase { ...@@ -92,9 +92,9 @@ class LoDTensorToArrayOp : public framework::OperatorBase {
platform::DeviceContextPool::Instance(); platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Get(place); auto &dev_ctx = *pool.Get(place);
framework::CopyFrom(x.Slice(static_cast<int>(each_range.begin), framework::Copy(x.Slice(static_cast<int>(each_range.begin),
static_cast<int>(each_range.end)), static_cast<int>(each_range.end)),
x.place(), dev_ctx, &slice); x.place(), dev_ctx, &slice);
offset += len; offset += len;
} }
} }
......
...@@ -99,9 +99,9 @@ class LogicalOp : public framework::OperatorWithKernel { ...@@ -99,9 +99,9 @@ class LogicalOp : public framework::OperatorWithKernel {
using framework::OperatorWithKernel::OperatorWithKernel; using framework::OperatorWithKernel::OperatorWithKernel;
protected: protected:
framework::OpKernelType GetActualKernelType( framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override { const framework::ExecutionContext &ctx) const override {
framework::OpKernelType kt = OperatorWithKernel::GetActualKernelType(ctx); framework::OpKernelType kt = OperatorWithKernel::GetExpectedKernelType(ctx);
// LogicalOp kernel's device type is decided by input tensor place // LogicalOp kernel's device type is decided by input tensor place
kt.place_ = ctx.Input<framework::LoDTensor>("X")->place(); kt.place_ = ctx.Input<framework::LoDTensor>("X")->place();
return kt; return kt;
......
...@@ -41,7 +41,7 @@ class LookupTableOp : public framework::OperatorWithKernel { ...@@ -41,7 +41,7 @@ class LookupTableOp : public framework::OperatorWithKernel {
} }
protected: protected:
framework::OpKernelType GetActualKernelType( framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override { const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType( return framework::OpKernelType(
framework::ToDataType(ctx.Input<LoDTensor>("W")->type()), framework::ToDataType(ctx.Input<LoDTensor>("W")->type()),
...@@ -98,7 +98,7 @@ class LookupTableOpGrad : public framework::OperatorWithKernel { ...@@ -98,7 +98,7 @@ class LookupTableOpGrad : public framework::OperatorWithKernel {
} }
protected: protected:
framework::OpKernelType GetActualKernelType( framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override { const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType( return framework::OpKernelType(
framework::ToDataType(ctx.Input<LoDTensor>("W")->type()), framework::ToDataType(ctx.Input<LoDTensor>("W")->type()),
......
...@@ -92,7 +92,7 @@ class LSTMOp : public framework::OperatorWithKernel { ...@@ -92,7 +92,7 @@ class LSTMOp : public framework::OperatorWithKernel {
} }
protected: protected:
framework::OpKernelType GetActualKernelType( framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override { const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType( return framework::OpKernelType(
framework::ToDataType(ctx.Input<framework::LoDTensor>("Input")->type()), framework::ToDataType(ctx.Input<framework::LoDTensor>("Input")->type()),
...@@ -260,7 +260,7 @@ class LSTMGradOp : public framework::OperatorWithKernel { ...@@ -260,7 +260,7 @@ class LSTMGradOp : public framework::OperatorWithKernel {
} }
protected: protected:
framework::OpKernelType GetActualKernelType( framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override { const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType( return framework::OpKernelType(
framework::ToDataType(ctx.Input<framework::LoDTensor>("Input")->type()), framework::ToDataType(ctx.Input<framework::LoDTensor>("Input")->type()),
......
...@@ -12,6 +12,7 @@ if(WITH_GPU) ...@@ -12,6 +12,7 @@ if(WITH_GPU)
nv_library(vol2col SRCS vol2col.cc vol2col.cu DEPS device_context tensor) nv_library(vol2col SRCS vol2col.cc vol2col.cu DEPS device_context tensor)
nv_library(context_project SRCS context_project.cc context_project.cu DEPS device_context math_function) nv_library(context_project SRCS context_project.cc context_project.cu DEPS device_context math_function)
nv_library(sequence2batch SRCS sequence2batch.cc sequence2batch.cu DEPS device_context tensor) nv_library(sequence2batch SRCS sequence2batch.cc sequence2batch.cu DEPS device_context tensor)
nv_library(sequence_padding SRCS sequence_padding.cc sequence_padding.cu DEPS lod_tensor device_context)
nv_library(lstm_compute SRCS lstm_compute.cc lstm_compute.cu DEPS device_context activation_functions) nv_library(lstm_compute SRCS lstm_compute.cc lstm_compute.cu DEPS device_context activation_functions)
nv_library(maxouting SRCS maxouting.cc maxouting.cu DEPS device_context) nv_library(maxouting SRCS maxouting.cc maxouting.cu DEPS device_context)
nv_library(unpooling SRCS unpooling.cc unpooling.cu DEPS device_context) nv_library(unpooling SRCS unpooling.cc unpooling.cu DEPS device_context)
...@@ -27,6 +28,7 @@ else() ...@@ -27,6 +28,7 @@ else()
cc_library(vol2col SRCS vol2col.cc DEPS device_context tensor) cc_library(vol2col SRCS vol2col.cc DEPS device_context tensor)
cc_library(context_project SRCS context_project.cc DEPS device_context math_function) cc_library(context_project SRCS context_project.cc DEPS device_context math_function)
cc_library(sequence2batch SRCS sequence2batch.cc DEPS device_context tensor) cc_library(sequence2batch SRCS sequence2batch.cc DEPS device_context tensor)
cc_library(sequence_padding SRCS sequence_padding.cc DEPS lod_tensor device_context)
cc_library(lstm_compute SRCS lstm_compute.cc DEPS device_context activation_functions) cc_library(lstm_compute SRCS lstm_compute.cc DEPS device_context activation_functions)
cc_library(matrix_bit_code SRCS matrix_bit_code.cc DEPS device_context) cc_library(matrix_bit_code SRCS matrix_bit_code.cc DEPS device_context)
cc_library(maxouting SRCS maxouting.cc DEPS device_context) cc_library(maxouting SRCS maxouting.cc DEPS device_context)
...@@ -39,3 +41,4 @@ cc_test(math_function_test SRCS math_function_test.cc DEPS math_function tensor) ...@@ -39,3 +41,4 @@ cc_test(math_function_test SRCS math_function_test.cc DEPS math_function tensor)
cc_test(selected_rows_functor_test SRCS selected_rows_functor_test.cc DEPS selected_rows_functor) cc_test(selected_rows_functor_test SRCS selected_rows_functor_test.cc DEPS selected_rows_functor)
cc_test(im2col_test SRCS im2col_test.cc DEPS math_function tensor) cc_test(im2col_test SRCS im2col_test.cc DEPS math_function tensor)
cc_test(vol2col_test SRCS vol2col_test.cc DEPS vol2col tensor) cc_test(vol2col_test SRCS vol2col_test.cc DEPS vol2col tensor)
cc_test(sequence_padding_test SRCS sequence_padding_test.cc DEPS sequence_padding)
...@@ -149,7 +149,7 @@ class ContextProjectFunctor { ...@@ -149,7 +149,7 @@ class ContextProjectFunctor {
Tensor out_t_sub = out_t.Slice(k * context_length, Tensor out_t_sub = out_t.Slice(k * context_length,
k * context_length + padding_size); k * context_length + padding_size);
Tensor w_sub = padding_data.Slice(k, k + padding_size); Tensor w_sub = padding_data.Slice(k, k + padding_size);
framework::CopyFrom(w_sub, context.GetPlace(), context, &out_t_sub); framework::Copy(w_sub, context.GetPlace(), context, &out_t_sub);
} }
} }
if (down_pad > 0) { // add down pad if (down_pad > 0) { // add down pad
...@@ -179,7 +179,7 @@ class ContextProjectFunctor { ...@@ -179,7 +179,7 @@ class ContextProjectFunctor {
(down_pad_begin_row + t) * context_length); (down_pad_begin_row + t) * context_length);
Tensor w_sub = padding_data.Slice( Tensor w_sub = padding_data.Slice(
up_pad + padding_idx, up_pad + padding_idx + padding_size); up_pad + padding_idx, up_pad + padding_idx + padding_size);
framework::CopyFrom(w_sub, context.GetPlace(), context, &out_t_sub); framework::Copy(w_sub, context.GetPlace(), context, &out_t_sub);
} }
} }
out_t.Resize({sequence_height, context_length * sequence_width}); out_t.Resize({sequence_height, context_length * sequence_width});
......
...@@ -14,7 +14,6 @@ limitations under the License. */ ...@@ -14,7 +14,6 @@ limitations under the License. */
#include "paddle/operators/math/im2col.h" #include "paddle/operators/math/im2col.h"
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include <iostream>
template <typename DeviceContext, typename Place> template <typename DeviceContext, typename Place>
void testIm2col() { void testIm2col() {
...@@ -63,7 +62,7 @@ void testIm2col() { ...@@ -63,7 +62,7 @@ void testIm2col() {
if (paddle::platform::is_cpu_place(*place)) { if (paddle::platform::is_cpu_place(*place)) {
input = input_tmp; input = input_tmp;
} else { } else {
CopyFrom(input_tmp, *place, *context, &input); Copy(input_tmp, *place, *context, &input);
} }
output_cfo.mutable_data<float>( output_cfo.mutable_data<float>(
{1, filter_size, filter_size, output_height, output_width}, *place); {1, filter_size, filter_size, output_height, output_width}, *place);
...@@ -88,7 +87,7 @@ void testIm2col() { ...@@ -88,7 +87,7 @@ void testIm2col() {
if (paddle::platform::is_cpu_place(*place)) { if (paddle::platform::is_cpu_place(*place)) {
out_cfo_ptr = output_cfo.data<float>(); out_cfo_ptr = output_cfo.data<float>();
} else { } else {
CopyFrom(output_cfo, paddle::platform::CPUPlace(), *context, &output_tmp); Copy(output_cfo, paddle::platform::CPUPlace(), *context, &output_tmp);
out_cfo_ptr = output_tmp.data<float>(); out_cfo_ptr = output_tmp.data<float>();
} }
for (int i = 0; i < 6; ++i) { for (int i = 0; i < 6; ++i) {
...@@ -99,9 +98,10 @@ void testIm2col() { ...@@ -99,9 +98,10 @@ void testIm2col() {
if (paddle::platform::is_cpu_place(*place)) { if (paddle::platform::is_cpu_place(*place)) {
out_ocf_ptr = output_ocf.data<float>(); out_ocf_ptr = output_ocf.data<float>();
} else { } else {
CopyFrom(output_ocf, paddle::platform::CPUPlace(), *context, &output_tmp); Copy(output_ocf, paddle::platform::CPUPlace(), *context, &output_tmp);
out_ocf_ptr = output_tmp.data<float>(); out_ocf_ptr = output_tmp.data<float>();
} }
for (int i = 0; i < 6; ++i) { for (int i = 0; i < 6; ++i) {
EXPECT_EQ(out_ocf_ptr[i], out_ocf_data[i]); EXPECT_EQ(out_ocf_ptr[i], out_ocf_data[i]);
} }
...@@ -119,7 +119,7 @@ void testIm2col() { ...@@ -119,7 +119,7 @@ void testIm2col() {
if (paddle::platform::is_cpu_place(*place)) { if (paddle::platform::is_cpu_place(*place)) {
input = input_tmp; input = input_tmp;
} else { } else {
CopyFrom(input_tmp, *place, *context, &input); Copy(input_tmp, *place, *context, &input);
} }
col2im(*context, output_cfo, dilation, stride, padding, &input); col2im(*context, output_cfo, dilation, stride, padding, &input);
...@@ -128,7 +128,7 @@ void testIm2col() { ...@@ -128,7 +128,7 @@ void testIm2col() {
if (paddle::platform::is_cpu_place(*place)) { if (paddle::platform::is_cpu_place(*place)) {
in_ptr = input.data<float>(); in_ptr = input.data<float>();
} else { } else {
CopyFrom(input, paddle::platform::CPUPlace(), *context, &input_tmp); Copy(input, paddle::platform::CPUPlace(), *context, &input_tmp);
in_ptr = input_tmp.data<float>(); in_ptr = input_tmp.data<float>();
} }
for (int i = 0; i < 6; ++i) { for (int i = 0; i < 6; ++i) {
...@@ -140,7 +140,7 @@ void testIm2col() { ...@@ -140,7 +140,7 @@ void testIm2col() {
if (paddle::platform::is_cpu_place(*place)) { if (paddle::platform::is_cpu_place(*place)) {
input = input_tmp; input = input_tmp;
} else { } else {
CopyFrom(input_tmp, *place, *context, &input); Copy(input_tmp, *place, *context, &input);
} }
col2im_ocf(*context, output_ocf, dilation, stride, padding, &input); col2im_ocf(*context, output_ocf, dilation, stride, padding, &input);
...@@ -148,12 +148,15 @@ void testIm2col() { ...@@ -148,12 +148,15 @@ void testIm2col() {
if (paddle::platform::is_cpu_place(*place)) { if (paddle::platform::is_cpu_place(*place)) {
in_ptr = input.data<float>(); in_ptr = input.data<float>();
} else { } else {
CopyFrom(input, paddle::platform::CPUPlace(), *context, &input_tmp); Copy(input, paddle::platform::CPUPlace(), *context, &input_tmp);
in_ptr = input_tmp.data<float>(); in_ptr = input_tmp.data<float>();
} }
for (int i = 0; i < 6; ++i) { for (int i = 0; i < 6; ++i) {
EXPECT_EQ(in_ptr[i], col2im_data[i]); EXPECT_EQ(in_ptr[i], col2im_data[i]);
} }
delete place;
delete context;
} }
TEST(math, im2col) { TEST(math, im2col) {
......
...@@ -16,15 +16,15 @@ TEST(math_function, notrans_mul_trans) { ...@@ -16,15 +16,15 @@ TEST(math_function, notrans_mul_trans) {
auto* gpu_place = new paddle::platform::CUDAPlace(0); auto* gpu_place = new paddle::platform::CUDAPlace(0);
paddle::platform::CUDADeviceContext context(*gpu_place); paddle::platform::CUDADeviceContext context(*gpu_place);
paddle::framework::CopyFrom(input1, *gpu_place, context, &input1_gpu); paddle::framework::Copy(input1, *gpu_place, context, &input1_gpu);
paddle::framework::CopyFrom(input1, *gpu_place, context, &input2_gpu); paddle::framework::Copy(input1, *gpu_place, context, &input2_gpu);
out_gpu.mutable_data<float>({2, 2}, *gpu_place); out_gpu.mutable_data<float>({2, 2}, *gpu_place);
paddle::operators::math::matmul<paddle::platform::CUDADeviceContext, float>( paddle::operators::math::matmul<paddle::platform::CUDADeviceContext, float>(
context, input1_gpu, false, input2_gpu, true, 1, &out_gpu, 0); context, input1_gpu, false, input2_gpu, true, 1, &out_gpu, 0);
paddle::framework::CopyFrom(out_gpu, *cpu_place, context, &out); paddle::framework::Copy(out_gpu, *cpu_place, context, &out);
float* out_ptr = out.data<float>(); float* out_ptr = out.data<float>();
context.Wait(); context.Wait();
...@@ -50,15 +50,15 @@ TEST(math_function, trans_mul_notrans) { ...@@ -50,15 +50,15 @@ TEST(math_function, trans_mul_notrans) {
auto* gpu_place = new paddle::platform::CUDAPlace(0); auto* gpu_place = new paddle::platform::CUDAPlace(0);
paddle::platform::CUDADeviceContext context(*gpu_place); paddle::platform::CUDADeviceContext context(*gpu_place);
paddle::framework::CopyFrom(input1, *gpu_place, context, &input1_gpu); paddle::framework::Copy(input1, *gpu_place, context, &input1_gpu);
paddle::framework::CopyFrom(input1, *gpu_place, context, &input2_gpu); paddle::framework::Copy(input1, *gpu_place, context, &input2_gpu);
out_gpu.mutable_data<float>({3, 3}, *gpu_place); out_gpu.mutable_data<float>({3, 3}, *gpu_place);
paddle::operators::math::matmul<paddle::platform::CUDADeviceContext, float>( paddle::operators::math::matmul<paddle::platform::CUDADeviceContext, float>(
context, input1_gpu, true, input2_gpu, false, 1, &out_gpu, 0); context, input1_gpu, true, input2_gpu, false, 1, &out_gpu, 0);
paddle::framework::CopyFrom(out_gpu, *cpu_place, context, &out); paddle::framework::Copy(out_gpu, *cpu_place, context, &out);
float* out_ptr = out.data<float>(); float* out_ptr = out.data<float>();
context.Wait(); context.Wait();
...@@ -99,9 +99,9 @@ TEST(math_function, gemm_notrans_cublas) { ...@@ -99,9 +99,9 @@ TEST(math_function, gemm_notrans_cublas) {
auto* gpu_place = new paddle::platform::CUDAPlace(0); auto* gpu_place = new paddle::platform::CUDAPlace(0);
paddle::platform::CUDADeviceContext context(*gpu_place); paddle::platform::CUDADeviceContext context(*gpu_place);
paddle::framework::CopyFrom(input1, *gpu_place, context, &input1_gpu); paddle::framework::Copy(input1, *gpu_place, context, &input1_gpu);
paddle::framework::CopyFrom(input2, *gpu_place, context, &input2_gpu); paddle::framework::Copy(input2, *gpu_place, context, &input2_gpu);
paddle::framework::CopyFrom(input3, *gpu_place, context, &input3_gpu); paddle::framework::Copy(input3, *gpu_place, context, &input3_gpu);
float* a = input1_gpu.data<float>(); float* a = input1_gpu.data<float>();
float* b = input2_gpu.data<float>(); float* b = input2_gpu.data<float>();
float* c = input3_gpu.mutable_data<float>(*gpu_place); float* c = input3_gpu.mutable_data<float>(*gpu_place);
...@@ -109,7 +109,7 @@ TEST(math_function, gemm_notrans_cublas) { ...@@ -109,7 +109,7 @@ TEST(math_function, gemm_notrans_cublas) {
paddle::operators::math::gemm<paddle::platform::CUDADeviceContext, float>( paddle::operators::math::gemm<paddle::platform::CUDADeviceContext, float>(
context, false, false, m, n, k, 1, a, 3, b + 1, 4, 1, c + 1, 4); context, false, false, m, n, k, 1, a, 3, b + 1, 4, 1, c + 1, 4);
paddle::framework::CopyFrom(input3_gpu, *cpu_place, context, &input3); paddle::framework::Copy(input3_gpu, *cpu_place, context, &input3);
// numpy code: // numpy code:
// a = np.arange(6).reshape(2, 3) // a = np.arange(6).reshape(2, 3)
...@@ -154,9 +154,9 @@ TEST(math_function, gemm_trans_cublas) { ...@@ -154,9 +154,9 @@ TEST(math_function, gemm_trans_cublas) {
auto* gpu_place = new paddle::platform::CUDAPlace(0); auto* gpu_place = new paddle::platform::CUDAPlace(0);
paddle::platform::CUDADeviceContext context(*gpu_place); paddle::platform::CUDADeviceContext context(*gpu_place);
paddle::framework::CopyFrom(input1, *gpu_place, context, &input1_gpu); paddle::framework::Copy(input1, *gpu_place, context, &input1_gpu);
paddle::framework::CopyFrom(input2, *gpu_place, context, &input2_gpu); paddle::framework::Copy(input2, *gpu_place, context, &input2_gpu);
paddle::framework::CopyFrom(input3, *gpu_place, context, &input3_gpu); paddle::framework::Copy(input3, *gpu_place, context, &input3_gpu);
float* a = input1_gpu.data<float>(); float* a = input1_gpu.data<float>();
float* b = input2_gpu.data<float>(); float* b = input2_gpu.data<float>();
float* c = input3_gpu.mutable_data<float>(*gpu_place); float* c = input3_gpu.mutable_data<float>(*gpu_place);
...@@ -164,7 +164,7 @@ TEST(math_function, gemm_trans_cublas) { ...@@ -164,7 +164,7 @@ TEST(math_function, gemm_trans_cublas) {
paddle::operators::math::gemm<paddle::platform::CUDADeviceContext, float>( paddle::operators::math::gemm<paddle::platform::CUDADeviceContext, float>(
context, false, true, m, n, k, 1, a, 3, b + 3, 3, 1, c + 1, 4); context, false, true, m, n, k, 1, a, 3, b + 3, 3, 1, c + 1, 4);
paddle::framework::CopyFrom(input3_gpu, *cpu_place, context, &input3); paddle::framework::Copy(input3_gpu, *cpu_place, context, &input3);
context.Wait(); context.Wait();
EXPECT_EQ(input3_ptr[0], 0); EXPECT_EQ(input3_ptr[0], 0);
...@@ -205,15 +205,15 @@ void GemvTest(int m, int n, bool trans) { ...@@ -205,15 +205,15 @@ void GemvTest(int m, int n, bool trans) {
} }
paddle::platform::CUDADeviceContext context(*gpu_place); paddle::platform::CUDADeviceContext context(*gpu_place);
paddle::framework::CopyFrom(mat_a, *gpu_place, context, &g_mat_a); paddle::framework::Copy(mat_a, *gpu_place, context, &g_mat_a);
paddle::framework::CopyFrom(vec_b, *gpu_place, context, &g_vec_b); paddle::framework::Copy(vec_b, *gpu_place, context, &g_vec_b);
paddle::operators::math::gemv<paddle::platform::CUDADeviceContext, T>( paddle::operators::math::gemv<paddle::platform::CUDADeviceContext, T>(
context, trans, static_cast<int>(m), static_cast<int>(n), 1., g_data_a, context, trans, static_cast<int>(m), static_cast<int>(n), 1., g_data_a,
g_data_b, 0., g_data_c); g_data_b, 0., g_data_c);
paddle::framework::CopyFrom(g_vec_c, paddle::platform::CPUPlace(), context, paddle::framework::Copy(g_vec_c, paddle::platform::CPUPlace(), context,
&vec_c); &vec_c);
if (!trans) { if (!trans) {
for (int i = 0; i < m; ++i) { for (int i = 0; i < m; ++i) {
......
...@@ -67,7 +67,7 @@ TEST(selected_rows_functor, gpu_add) { ...@@ -67,7 +67,7 @@ TEST(selected_rows_functor, gpu_add) {
EXPECT_EQ(out_rows[6], 9); EXPECT_EQ(out_rows[6], 9);
Tensor out_cpu; Tensor out_cpu;
CopyFrom(*out_value, cpu_place, ctx, &out_cpu); Copy(*out_value, cpu_place, ctx, &out_cpu);
ctx.Wait(); ctx.Wait();
auto* out_cpu_data = out_cpu.data<float>(); auto* out_cpu_data = out_cpu.data<float>();
...@@ -94,7 +94,7 @@ TEST(selected_rows_functor, gpu_add) { ...@@ -94,7 +94,7 @@ TEST(selected_rows_functor, gpu_add) {
add_tensor_functor(ctx, *output, *tensor1, tensor2.get()); add_tensor_functor(ctx, *output, *tensor1, tensor2.get());
Tensor tensor2_cpu; Tensor tensor2_cpu;
CopyFrom(*tensor2, cpu_place, ctx, &tensor2_cpu); Copy(*tensor2, cpu_place, ctx, &tensor2_cpu);
ctx.Wait(); ctx.Wait();
auto* tensor2_cpu_data = tensor2_cpu.data<float>(); auto* tensor2_cpu_data = tensor2_cpu.data<float>();
...@@ -167,7 +167,7 @@ TEST(selected_rows_functor, gpu_add_to) { ...@@ -167,7 +167,7 @@ TEST(selected_rows_functor, gpu_add_to) {
EXPECT_EQ(out_rows[6], 9); EXPECT_EQ(out_rows[6], 9);
Tensor out_cpu; Tensor out_cpu;
CopyFrom(*out_value, cpu_place, ctx, &out_cpu); Copy(*out_value, cpu_place, ctx, &out_cpu);
ctx.Wait(); ctx.Wait();
auto* out_cpu_data = out_cpu.data<float>(); auto* out_cpu_data = out_cpu.data<float>();
...@@ -191,7 +191,7 @@ TEST(selected_rows_functor, gpu_add_to) { ...@@ -191,7 +191,7 @@ TEST(selected_rows_functor, gpu_add_to) {
add_to_tensor_functor(ctx, *output, tensor1.get()); add_to_tensor_functor(ctx, *output, tensor1.get());
Tensor tensor1_cpu; Tensor tensor1_cpu;
CopyFrom(*tensor1, cpu_place, ctx, &tensor1_cpu); Copy(*tensor1, cpu_place, ctx, &tensor1_cpu);
ctx.Wait(); ctx.Wait();
auto* tensor1_cpu_data = tensor1_cpu.data<float>(); auto* tensor1_cpu_data = tensor1_cpu.data<float>();
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/math/sequence_padding.h"
namespace paddle {
namespace operators {
namespace math {
template <typename T>
class PaddingLoDTensorFunctor<platform::CPUDeviceContext, T> {
public:
void operator()(const platform::CPUDeviceContext& context,
const framework::LoDTensor& seq, framework::Tensor& padding,
bool norm_by_times) {
auto lod = seq.lod();
PADDLE_ENFORCE_GT(lod.size(), 0UL,
"The LoD of LoDTensor seq should not be null.");
const size_t level = 0;
framework::LoD abs_offset_lod = framework::ToAbsOffset(lod);
auto seq_dims = seq.dims();
PADDLE_ENFORCE_EQ(seq_dims[0], abs_offset_lod[level].back(),
"The first dimension of LoDTensor seq should be "
"equal to the sum of all sequences's length.");
auto padding_dims = padding.dims();
PADDLE_ENFORCE_EQ(padding_dims.size(), 3UL,
"The input padding should be a 3-D Tensor of shape "
"[max_sequence_length, num_sequences, sequence_width].");
const size_t max_sequence_length = MaximumSequenceLength(lod, level);
PADDLE_ENFORCE_EQ(padding_dims[0], max_sequence_length,
"The first dimension of Tensor padding should be the "
"maximum length of all sequences in LoDTensor seq.");
const size_t num_sequences = abs_offset_lod[level].size() - 1;
PADDLE_ENFORCE_EQ(padding_dims[1], num_sequences,
"The second dimension of Tensor padding should be the "
"number of sequences in LoDTensor seq.");
const size_t sequence_width = seq.numel() / seq_dims[0];
PADDLE_ENFORCE_EQ(padding_dims[2], sequence_width,
"The third dimension of Tensor padding should be the "
"width of sequence in LoDTensor seq.");
const T* seq_data = seq.data<T>();
T* padding_data = padding.data<T>();
for (size_t i = 0; i < max_sequence_length; ++i) {
for (size_t j = 0; j < num_sequences; ++j) {
size_t start_pos = abs_offset_lod[level][j];
size_t sequence_length = abs_offset_lod[level][j + 1] - start_pos;
if (i < sequence_length) {
          // i < sequence_length, so sequence_length > 0 and the
          // division below is safe.
T scale =
norm_by_times ? (1.0f / static_cast<T>(sequence_length)) : 1.0f;
for (size_t k = 0; k < sequence_width; ++k) {
padding_data[(i * num_sequences + j) * sequence_width + k] =
seq_data[(start_pos + i) * sequence_width + k] * scale;
}
} else {
memset(padding_data + (i * num_sequences + j) * sequence_width, 0,
sequence_width * sizeof(T));
}
}
}
}
};
template <typename T>
class UnpaddingLoDTensorFunctor<platform::CPUDeviceContext, T> {
public:
void operator()(const platform::CPUDeviceContext& context,
framework::LoDTensor& seq, const framework::Tensor& padding,
bool norm_by_times) {
auto lod = seq.lod();
PADDLE_ENFORCE_GT(lod.size(), 0UL,
"The LoD of LoDTensor seq should not be null.");
const size_t level = 0;
framework::LoD abs_offset_lod = framework::ToAbsOffset(lod);
auto seq_dims = seq.dims();
PADDLE_ENFORCE_EQ(seq_dims[0], abs_offset_lod[level].back(),
"The first dimension of LoDTensor seq should be "
"equal to the sum of all sequences's length.");
auto padding_dims = padding.dims();
PADDLE_ENFORCE_EQ(padding_dims.size(), 3UL,
"The input padding should be a 3-D Tensor of shape "
"[max_sequnece_length, num_sequences, sequence_width].");
const size_t max_sequence_length = MaximumSequenceLength(lod, level);
PADDLE_ENFORCE_EQ(padding_dims[0], max_sequence_length,
"The first dimension of Tensor padding should be "
"the maximum length of all sequences in LoDTensor seq.");
const size_t num_sequences = abs_offset_lod[level].size() - 1;
PADDLE_ENFORCE_EQ(padding_dims[1], num_sequences,
"The second dimension of Tensor padding should be "
"the number of sequences in LoDTensor seq.");
const size_t sequence_width = seq.numel() / seq_dims[0];
PADDLE_ENFORCE_EQ(padding_dims[2], sequence_width,
"The third dimension of Tensor padding should be the "
"width of sequence in LoDTensor seq.");
const T* padding_data = padding.data<T>();
T* seq_data = seq.data<T>();
for (size_t i = 0; i < num_sequences; ++i) {
size_t start_pos = abs_offset_lod[level][i];
size_t sequence_length = abs_offset_lod[level][i + 1] - start_pos;
for (size_t j = 0; j < sequence_length; ++j) {
        // j < sequence_length, so sequence_length > 0 and the
        // division below is safe.
T scale =
norm_by_times ? (1.0f / static_cast<T>(sequence_length)) : 1.0f;
for (size_t k = 0; k < sequence_width; ++k) {
seq_data[(start_pos + j) * sequence_width + k] =
padding_data[(j * num_sequences + i) * sequence_width + k] *
scale;
}
}
}
}
};
template class PaddingLoDTensorFunctor<platform::CPUDeviceContext, float>;
template class UnpaddingLoDTensorFunctor<platform::CPUDeviceContext, float>;
} // namespace math
} // namespace operators
} // namespace paddle
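[Editor's note] The CPU functor above lays padding out time-major: element k of time step i of sequence j lands at flat offset (i * num_sequences + j) * sequence_width + k. A tiny worked check under an assumed LoD of {0, 3, 4} (two sequences, lengths 3 and 1, width 1):

// seq (flattened):  [a0, a1, a2, b0]
// padding, shape [max_len = 3, num_seq = 2, width = 1]:
//   step 0: a0 b0
//   step 1: a1  0
//   step 2: a2  0
// e.g. a1 is written at (1 * 2 + 0) * 1 + 0 = flat index 2.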
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/math/sequence_padding.h"
namespace paddle {
namespace operators {
namespace math {
template <typename T, bool NormByTimes, bool Padding>
__global__ void SequencePaddingKernel(T* padding, T* sequence,
const size_t* sequence_start_positions,
const size_t sequence_width,
const size_t max_sequence_length,
const size_t num_sequences) {
size_t padding_idx = blockIdx.y;
size_t start_pos = sequence_start_positions[padding_idx];
size_t sequence_length =
sequence_start_positions[padding_idx + 1] - start_pos;
size_t sequence_idx = blockIdx.x * blockDim.y + threadIdx.y;
size_t padding_base_idx =
(sequence_idx * num_sequences + padding_idx) * sequence_width;
size_t sequence_base_idx = (start_pos + sequence_idx) * sequence_width;
if (sequence_idx < sequence_length) {
T scale = NormByTimes ? (1.0f / static_cast<T>(sequence_length)) : 1.0f;
if (Padding) {
/* sequence -> padding */
for (size_t i = threadIdx.x; i < sequence_width; i += blockDim.x) {
padding[padding_base_idx + i] = scale * sequence[sequence_base_idx + i];
}
} else {
/* padding -> sequence */
for (size_t i = threadIdx.x; i < sequence_width; i += blockDim.x) {
sequence[sequence_base_idx + i] = scale * padding[padding_base_idx + i];
}
}
} else if (sequence_idx < max_sequence_length) {
if (Padding) {
/* sequence -> padding */
for (size_t i = threadIdx.x; i < sequence_width; i += blockDim.x) {
padding[padding_base_idx + i] = 0;
}
}
}
}
template <typename T>
class PaddingLoDTensorFunctor<platform::CUDADeviceContext, T> {
public:
void operator()(const platform::CUDADeviceContext& context,
const framework::LoDTensor& seq, framework::Tensor& padding,
bool norm_by_times) {
auto lod = seq.lod();
PADDLE_ENFORCE_GT(lod.size(), 0UL,
"The lod of LoDTensor seq should not be null.");
const size_t level = 0;
framework::LoD abs_offset_lod = framework::ToAbsOffset(lod);
auto seq_dims = seq.dims();
PADDLE_ENFORCE_EQ(seq_dims[0], abs_offset_lod[level].back(),
"The first dimension of LoDTensor seq should be "
"equal to the sum of all sequences's length.");
auto padding_dims = padding.dims();
PADDLE_ENFORCE_EQ(padding_dims.size(), 3UL,
"The input padding should be a 3-D Tensor of shape "
"[max_sequence_length, num_sequences, sequence_width].");
size_t max_sequence_length = MaximumSequenceLength(lod, level);
PADDLE_ENFORCE_EQ(padding_dims[0], max_sequence_length,
"The first dimension of Tensor padding should be the "
"maximum length of all sequences in LoDTensor seq.");
const size_t num_sequences = abs_offset_lod[level].size() - 1;
PADDLE_ENFORCE_EQ(padding_dims[1], num_sequences,
"The second dimension of Tensor padding should be the "
"number of sequences in LoDTensor seq.");
const size_t sequence_width = seq.numel() / seq_dims[0];
PADDLE_ENFORCE_EQ(padding_dims[2], sequence_width,
"The third dimension of Tensor padding should be the "
"width of sequence in LoDTensor seq.");
if (!norm_by_times && num_sequences == 1UL) {
Copy(seq, context.GetPlace(), context, &padding);
padding.Resize(padding_dims);
return;
}
const size_t kBlockSize = 512;
/* At least use 32 threads to copy sequence_width elements,
* and at least 8 elements for each thread.
*/
size_t block_dim_x =
std::min(((((sequence_width + 7) >> 3) + 31) >> 5) << 5, kBlockSize);
size_t block_dim_y = kBlockSize / block_dim_x;
dim3 threads(block_dim_x, block_dim_y);
size_t grid_dim_x = (max_sequence_length + block_dim_y - 1) / block_dim_y;
size_t grid_dim_y = num_sequences;
dim3 grid(grid_dim_x, grid_dim_y);
const T* seq_data = seq.data<T>();
T* padding_data = padding.data<T>();
if (norm_by_times) {
SequencePaddingKernel<T, 1, 1><<<grid, threads, 0, context.stream()>>>(
padding_data, const_cast<T*>(seq_data), abs_offset_lod[level].data(),
sequence_width, max_sequence_length, num_sequences);
} else {
SequencePaddingKernel<T, 0, 1><<<grid, threads, 0, context.stream()>>>(
padding_data, const_cast<T*>(seq_data), abs_offset_lod[level].data(),
sequence_width, max_sequence_length, num_sequences);
}
}
};
template <typename T>
class UnpaddingLoDTensorFunctor<platform::CUDADeviceContext, T> {
public:
void operator()(const platform::CUDADeviceContext& context,
framework::LoDTensor& seq, const framework::Tensor& padding,
bool norm_by_times) {
auto lod = seq.lod();
PADDLE_ENFORCE_GT(lod.size(), 0UL,
"The lod of LoDTensor seq should not be null.");
const size_t level = 0;
framework::LoD abs_offset_lod = framework::ToAbsOffset(lod);
auto seq_dims = seq.dims();
PADDLE_ENFORCE_EQ(seq_dims[0], abs_offset_lod[level].back(),
"The first dimension of LoDTensor seq should be "
"equal to the sum of all sequences's length.");
auto padding_dims = padding.dims();
PADDLE_ENFORCE_EQ(padding_dims.size(), 3UL,
"The input padding should be a 3-D Tensor of shape "
"[max_sequnece_length, num_sequences, sequence_width].");
size_t max_sequence_length = MaximumSequenceLength(lod, level);
PADDLE_ENFORCE_EQ(padding_dims[0], max_sequence_length,
"The first dimension of Tensor padding should be "
"the maximum length of all sequences in LoDTensor seq.");
const size_t num_sequences = abs_offset_lod[level].size() - 1;
PADDLE_ENFORCE_EQ(padding_dims[1], num_sequences,
"The second dimension of Tensor padding should be "
"the number of sequences in LoDTensor seq.");
const size_t sequence_width = seq.numel() / seq_dims[0];
PADDLE_ENFORCE_EQ(padding_dims[2], sequence_width,
"The third dimension of Tensor padding should be the "
"width of sequence in LoDTensor seq.");
if (!norm_by_times && num_sequences == 1UL) {
Copy(padding, context.GetPlace(), context, &seq);
seq.Resize(seq_dims);
return;
}
const size_t kBlockSize = 512;
/* At least use 32 threads to copy sequence_width elements,
* and at least 8 elements for each thread.
*/
size_t block_dim_x =
std::min(((((sequence_width + 7) >> 3) + 31) >> 5) << 5, kBlockSize);
size_t block_dim_y = kBlockSize / block_dim_x;
dim3 threads(block_dim_x, block_dim_y);
size_t grid_dim_x = (max_sequence_length + block_dim_y - 1) / block_dim_y;
size_t grid_dim_y = num_sequences;
dim3 grid(grid_dim_x, grid_dim_y);
const T* padding_data = padding.data<T>();
T* seq_data = seq.data<T>();
if (norm_by_times) {
SequencePaddingKernel<T, 1, 0><<<grid, threads, 0, context.stream()>>>(
const_cast<T*>(padding_data), seq_data, abs_offset_lod[level].data(),
sequence_width, max_sequence_length, num_sequences);
} else {
SequencePaddingKernel<T, 0, 0><<<grid, threads, 0, context.stream()>>>(
const_cast<T*>(padding_data), seq_data, abs_offset_lod[level].data(),
sequence_width, max_sequence_length, num_sequences);
}
}
};
template class PaddingLoDTensorFunctor<platform::CUDADeviceContext, float>;
template class UnpaddingLoDTensorFunctor<platform::CUDADeviceContext, float>;
} // namespace math
} // namespace operators
} // namespace paddle
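[Editor's note] The launch configuration in both CUDA functors rounds the x block dimension up to a warp multiple while budgeting at least 8 elements per thread, then spends the rest of the 512-thread block on time steps in y. Worked through for an assumed sequence_width of 100: ceil(100 / 8) = 13 threads are needed at minimum, which rounds up to 32, so threads = (32, 16); each block then covers 16 time steps of one sequence, and grid = (ceil(max_sequence_length / 16), num_sequences).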
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/framework/lod_tensor.h"
#include "paddle/platform/device_context.h"
namespace paddle {
namespace operators {
namespace math {
inline static size_t MaximumSequenceLength(const framework::LoD& lod,
const size_t level) {
const size_t num_sequences = lod[level].size() - 1;
size_t max_sequence_length = 0;
framework::LoD abs_offset_lod = framework::ToAbsOffset(lod);
for (size_t i = 0; i < num_sequences; ++i) {
max_sequence_length =
std::max(max_sequence_length,
abs_offset_lod[level][i + 1] - abs_offset_lod[level][i]);
}
return max_sequence_length;
}
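[Editor's note] For instance, under the LoD used in the doc comment below, the absolute offsets {0, 4, 6, 9, 10} yield sequence lengths {4, 2, 3, 1}, so MaximumSequenceLength returns 4.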
/*
* \brief Padding/Unpadding LoDTensor to/from normal Tensor of the shape
* [max_sequence_length, num_sequences, sequence_width].
*
* Padding sequence:
* padding[i] = seq[lod[level][i]]
* Unpadding sequence:
* seq[lod[level][i]] = padding[i]
*
* All sequences will be padded to the same length and stored in a transposed
* shape.
* Example:
* seq (s0, s0, s0, s0; s1, s1; s2, s2, s2; s3)
* padding (s0, s1, s2, s3; s0, s1, s2, 0; s0, 0, s2, 0; s0, 0, 0, 0)
*
* \param context device context of this functor.
* \param seq LoDTensor which is stored in sequence format, the shape
* is [total_sequence_length, sequence_width] where
* total_sequence_length is the sum of all sequences'
* length.
* \param padding Tensor which is padded to the same length, the shape is
* [max_sequence_length, num_sequences, sequence_width].
 * \param norm_by_times whether to scale each element by 1 / sequence_length.
*
* \note transposition is also done in this functor.
*/
template <typename DeviceContext, typename T>
class PaddingLoDTensorFunctor {
public:
void operator()(const DeviceContext& context, const framework::LoDTensor& seq,
framework::Tensor& padding, bool norm_by_times);
};
template <typename DeviceContext, typename T>
class UnpaddingLoDTensorFunctor {
public:
void operator()(const DeviceContext& context, framework::LoDTensor& seq,
const framework::Tensor& padding, bool norm_by_times);
};
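// A minimal CPU-side usage sketch of the two functors above (mirroring the
// unit test that follows); the lod values and the width of 8 are
// illustrative only:
//
//   framework::LoDTensor seq;
//   framework::LoD lod;
//   lod.push_back(std::vector<size_t>{0, 2, 5});  // lengths 2 and 3
//   seq.set_lod(lod);
//   seq.mutable_data<float>(framework::make_ddim({5, 8}),
//                           platform::CPUPlace());
//   framework::Tensor padding;  // [max_len = 3, num_seqs = 2, width = 8]
//   padding.mutable_data<float>(framework::make_ddim({3, 2, 8}),
//                               platform::CPUPlace());
//   platform::CPUPlace place;
//   platform::CPUDeviceContext ctx(place);
//   PaddingLoDTensorFunctor<platform::CPUDeviceContext, float>()(
//       ctx, seq, padding, false);  // norm_by_times = false
//   UnpaddingLoDTensorFunctor<platform::CPUDeviceContext, float>()(
//       ctx, seq, padding, false);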
} // namespace math
} // namespace operators
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/math/sequence_padding.h"
#include <gtest/gtest.h>
template <typename DeviceContext, typename Place, typename T>
void TestSequencePadding(const paddle::framework::LoD& lod,
const size_t sequence_width) {
paddle::framework::LoDTensor cpu_seq;
paddle::framework::LoDTensor cpu_seq_back;
paddle::framework::LoDTensor seq;
paddle::framework::LoDTensor seq_back;
paddle::framework::Tensor padding;
const size_t level = lod.size() - 1;
auto seq_dims =
paddle::framework::make_ddim({static_cast<int64_t>(lod[level].back()),
static_cast<int64_t>(sequence_width)});
cpu_seq.set_lod(lod);
cpu_seq.mutable_data<T>(seq_dims, paddle::platform::CPUPlace());
for (int64_t i = 0; i < cpu_seq.numel(); ++i) {
cpu_seq.data<T>()[i] = static_cast<T>(i);
}
auto* place = new Place();
DeviceContext* context = new DeviceContext(*place);
if (paddle::platform::is_cpu_place(*place)) {
seq = cpu_seq;
} else {
Copy(cpu_seq, *place, *context, &seq);
seq.set_lod(lod);
}
const size_t max_sequence_length =
paddle::operators::math::MaximumSequenceLength(lod, level);
const size_t num_sequences = lod[level].size() - 1;
auto padding_dims =
paddle::framework::make_ddim({static_cast<int64_t>(max_sequence_length),
static_cast<int64_t>(num_sequences),
static_cast<int64_t>(sequence_width)});
padding.mutable_data<T>(padding_dims, *place);
paddle::operators::math::PaddingLoDTensorFunctor<DeviceContext, T>()(
*context, seq, padding, false);
seq_back.set_lod(lod);
seq_back.mutable_data<T>(seq_dims, *place);
paddle::operators::math::UnpaddingLoDTensorFunctor<DeviceContext, T>()(
*context, seq_back, padding, false);
if (paddle::platform::is_cpu_place(*place)) {
cpu_seq_back = seq_back;
} else {
Copy(seq_back, paddle::platform::CPUPlace(), *context, &cpu_seq_back);
cpu_seq_back.set_lod(lod);
}
EXPECT_EQ(cpu_seq.numel(), cpu_seq_back.numel());
EXPECT_EQ(cpu_seq.dims(), cpu_seq_back.dims());
for (int64_t i = 0; i < cpu_seq.numel(); ++i) {
EXPECT_EQ(cpu_seq.data<T>()[i], cpu_seq_back.data<T>()[i]);
}
delete place;
delete context;
}
TEST(SequencePadding, CPU) {
paddle::framework::LoD lod1;
lod1.push_back(std::vector<size_t>{0, 10});
TestSequencePadding<paddle::platform::CPUDeviceContext,
paddle::platform::CPUPlace, float>(lod1, 16);
paddle::framework::LoD lod2;
lod2.push_back(std::vector<size_t>{0, 2, 7, 10});
TestSequencePadding<paddle::platform::CPUDeviceContext,
paddle::platform::CPUPlace, float>(lod2, 128);
}
#ifdef PADDLE_WITH_CUDA
TEST(SequencePadding, CUDA) {
paddle::framework::LoD lod1;
lod1.push_back(std::vector<size_t>{0, 10});
TestSequencePadding<paddle::platform::CUDADeviceContext,
paddle::platform::CUDAPlace, float>(lod1, 16);
paddle::framework::LoD lod2;
lod2.push_back(std::vector<size_t>{0, 2, 7, 10});
TestSequencePadding<paddle::platform::CUDADeviceContext,
paddle::platform::CUDAPlace, float>(lod2, 128);
}
#endif
@@ -71,7 +71,7 @@ void testVol2col() {
   if (paddle::platform::is_cpu_place(*place)) {
     input = input_tmp;
   } else {
-    CopyFrom(input_tmp, *place, *context, &input);
+    Copy(input_tmp, *place, *context, &input);
   }
   output.mutable_data<float>({1, filter_size, filter_size, filter_size,
                               output_depth, output_height, output_width},
@@ -85,7 +85,7 @@ void testVol2col() {
   if (paddle::platform::is_cpu_place(*place)) {
     out_cfo_ptr = output.data<float>();
   } else {
-    CopyFrom(output, paddle::platform::CPUPlace(), *context, &output_tmp);
+    Copy(output, paddle::platform::CPUPlace(), *context, &output_tmp);
     out_cfo_ptr = output_tmp.data<float>();
   }
@@ -99,7 +99,7 @@ void testVol2col() {
   if (paddle::platform::is_cpu_place(*place)) {
     input = input_tmp;
   } else {
-    CopyFrom(input_tmp, *place, *context, &input);
+    Copy(input_tmp, *place, *context, &input);
   }
   paddle::operators::math::Col2VolFunctor<DeviceContext, float> col2vol;
@@ -109,7 +109,7 @@ void testVol2col() {
   if (paddle::platform::is_cpu_place(*place)) {
     in_ptr = input.data<float>();
   } else {
-    CopyFrom(input, paddle::platform::CPUPlace(), *context, &input_tmp);
+    Copy(input, paddle::platform::CPUPlace(), *context, &input_tmp);
     in_ptr = input_tmp.data<float>();
   }
@@ -49,7 +49,7 @@ class MergeLoDTensorOp : public framework::OperatorBase {
       cpu_mask->ShareDataWith(mask);
     } else if (platform::is_gpu_place(mask.place())) {
 #ifdef PADDLE_WITH_CUDA
-      framework::CopyFrom(mask, platform::CPUPlace(), dev_ctx, cpu_mask.get());
+      framework::Copy(mask, platform::CPUPlace(), dev_ctx, cpu_mask.get());
 #else
       PADDLE_THROW("Not supported GPU, Please compile WITH_GPU option");
 #endif
@@ -104,8 +104,8 @@ class MergeLoDTensorOp : public framework::OperatorBase {
         continue;
       }
       auto slice = out->Slice(out_offset, out_offset + len);
-      framework::CopyFrom(input->Slice(start_offset, end_offset), place,
-                          dev_ctx, &slice);
+      framework::Copy(input->Slice(start_offset, end_offset), place, dev_ctx,
+                      &slice);
       out_offset += len;
       (*in_idx) += 1;
     }
@@ -51,7 +51,7 @@ class MultiplexOp : public framework::OperatorWithKernel {
   }

  protected:
-  framework::OpKernelType GetActualKernelType(
+  framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
     return framework::OpKernelType(
         framework::ToDataType(ctx.MultiInput<Tensor>("X")[0]->type()),
@@ -102,7 +102,7 @@ class MultiplexGradOp : public framework::OperatorWithKernel {
   }

  protected:
-  framework::OpKernelType GetActualKernelType(
+  framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
     return framework::OpKernelType(
         framework::ToDataType(ctx.MultiInput<Tensor>("X")[0]->type()),
@@ -33,7 +33,7 @@ class MultiplexGPUKernel : public framework::OpKernel<T> {
     auto cols = ins[0]->numel() / rows;
     // copy index to cpu
     Tensor index_t_cpu;
-    CopyFrom(*ids, platform::CPUPlace(), ctx.device_context(), &index_t_cpu);
+    Copy(*ids, platform::CPUPlace(), ctx.device_context(), &index_t_cpu);
     auto* index = index_t_cpu.data<int32_t>();
     auto stream = ctx.cuda_device_context().stream();
     platform::CUDAPlace place = boost::get<platform::CUDAPlace>(ctx.GetPlace());
@@ -69,7 +69,7 @@ class MultiplexGradGPUKernel : public framework::OpKernel<T> {
     auto cols = ins[0]->numel() / rows;
     // copy index to cpu
     Tensor index_t_cpu;
-    CopyFrom(*ids, platform::CPUPlace(), ctx.device_context(), &index_t_cpu);
+    Copy(*ids, platform::CPUPlace(), ctx.device_context(), &index_t_cpu);
     auto* index = index_t_cpu.data<int32_t>();
     auto stream = ctx.cuda_device_context().stream();
@@ -63,7 +63,7 @@ class NCEOp : public framework::OperatorWithKernel {
   }

  protected:
-  framework::OpKernelType GetActualKernelType(
+  framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
     return framework::OpKernelType(
         framework::ToDataType(ctx.Input<Tensor>("Input")->type()),
@@ -166,7 +166,7 @@ class NCEOpGrad : public framework::OperatorWithKernel {
   }

  protected:
-  framework::OpKernelType GetActualKernelType(
+  framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
     return framework::OpKernelType(
         framework::ToDataType(ctx.Input<Tensor>("Input")->type()),
@@ -56,11 +56,11 @@ void NetOp::CompleteAddOp(bool calc) {
   std::copy(output_set.begin(), output_set.end(), std::back_inserter(outputs));
 }

-std::string NetOp::DebugString() const {
+std::string NetOp::DebugStringEx(const framework::Scope* scope) const {
   std::ostringstream os;
-  os << OperatorBase::DebugString() << std::endl;
+  os << OperatorBase::DebugStringEx(scope) << std::endl;
   for (auto& op : ops_) {
-    std::istringstream is(op->DebugString());
+    std::istringstream is(op->DebugStringEx(scope));
     for (std::string line; std::getline(is, line);) {
       os << " " << line << std::endl;
     }
@@ -106,7 +106,8 @@ class NetOp : public framework::OperatorBase {

   void CompleteAddOp(bool calculate = true);

-  std::string DebugString() const override;
+  std::string DebugStringEx(
+      const framework::Scope* scope = nullptr) const override;

   bool IsNetOp() const override;
   std::vector<std::string> OutputVars(bool has_intermediate) const override;
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <vector>
#include "paddle/framework/executor.h"
#include "paddle/framework/op_registry.h"
#include "paddle/framework/threadpool.h"
namespace paddle {
namespace operators {
static constexpr char kInputs[] = "inputs";
static constexpr char kParameters[] = "parameters";
static constexpr char kPlaces[] = "places";
static constexpr char kOutputs[] = "outputs";
static constexpr char kParallelScopes[] = "parallel_scopes";
static constexpr char kParallelBlock[] = "sub_block";
// using ParallelScopeVar = std::vector<framework::Scope *>;
using LoDTensor = framework::LoDTensor;
using OperatorBase = framework::OperatorBase;
void SplitTensorAndMoveTensorToScopes(
const framework::Scope &scope,
const std::vector<framework::Scope *> &sub_scopes,
const std::vector<platform::Place> &places,
const std::vector<std::string> &names) {
for (auto &argu : names) {
auto *var = scope.FindVar(argu);
const auto &tensor = var->Get<LoDTensor>();
auto lod_tensors = tensor.SplitLoDTensor(places);
for (auto &lod : lod_tensors) {
VLOG(3) << lod.dims();
}
for (size_t i = 0; i < sub_scopes.size(); ++i) {
*sub_scopes[i]->Var(argu)->GetMutable<LoDTensor>() = lod_tensors[i];
}
}
}
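// For example (assuming SplitLoDTensor slices the batch dimension evenly
// across places when the input has no LoD): a [10, 10] input with two
// CPU places lands in the two sub-scopes as two [5, 10] tensors.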
class ParallelDoOp : public framework::OperatorBase {
public:
ParallelDoOp(const std::string &type,
const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs)
: OperatorBase(type, inputs, outputs, attrs) {}
void Run(const framework::Scope &scope,
const platform::Place &place) const override {
// get device context from pool
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Get(place);
auto *block = Attr<framework::BlockDesc *>(kParallelBlock);
auto *program = block->Program();
// TODO(tonyyang-svail): get places from input
std::vector<platform::Place> places;
places.emplace_back(platform::CPUPlace());
places.emplace_back(platform::CPUPlace());
auto &sub_scopes = *scope.FindVar(Output(kParallelScopes))
->GetMutable<std::vector<framework::Scope *>>();
for (size_t place_idx = 0; place_idx < places.size(); ++place_idx) {
sub_scopes.push_back(&scope.NewScope());
}
SplitTensorAndMoveTensorToScopes(scope, sub_scopes, places,
Inputs(kInputs));
std::vector<std::future<void>> workers;
workers.reserve(places.size());
for (size_t place_idx = 0; place_idx < places.size(); ++place_idx) {
VLOG(3) << "Run " << place_idx;
auto &place = places[place_idx];
auto *cur_scope = sub_scopes[place_idx];
// copy parameter
// some version of boost lacks != for boost::variant
if (!(dev_ctx.GetPlace() == place)) {
PADDLE_THROW("Not Implemented");
}
workers.emplace_back(framework::Async([program, cur_scope, place, block] {
framework::Executor executor(place);
executor.Run(*program, cur_scope, block->ID(),
false /*create_local_scope*/);
}));
}
for (auto &worker : workers) {
worker.wait();
}
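// Each worker above runs the sub-block on its own place via
// framework::Async; the wait() loop joins all of them before the
// per-place outputs are merged back into the parent scope below.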
// merge output
for (auto &o_name : Outputs(kOutputs)) {
std::vector<const framework::LoDTensor *> lod_tensors;
lod_tensors.reserve(sub_scopes.size());
for (auto *sub_scope : sub_scopes) {
lod_tensors.emplace_back(&sub_scope->FindVar(o_name)->Get<LoDTensor>());
}
auto *lod_tensor_to_be_merged =
scope.FindVar(o_name)->GetMutable<LoDTensor>();
lod_tensor_to_be_merged->MergeLoDTensor(lod_tensors, dev_ctx.GetPlace());
}
}
};
class ParallelDoOpProtoMaker : public framework::OpProtoAndCheckerMaker {
public:
ParallelDoOpProtoMaker(OpProto *proto, framework::OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput(kInputs, "").AsDuplicable();
AddInput(kParameters, "").AsDuplicable();
AddInput(kPlaces, "");
AddOutput(kOutputs, "").AsDuplicable();
AddOutput(kParallelScopes, "");
AddAttr<framework::BlockDesc *>(kParallelBlock, "");
AddComment(R"DOC(
ParallelDo Operator.
)DOC");
}
};
class ParallelDoGradOp : public OperatorBase {
public:
ParallelDoGradOp(const std::string &type,
const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs)
: OperatorBase(type, inputs, outputs, attrs) {}
void Run(const framework::Scope &scope,
const platform::Place &place) const override {
// // get device context from pool
// platform::DeviceContextPool &pool =
// platform::DeviceContextPool::Instance();
// auto &dev_ctx = *pool.Get(place);
auto *block = Attr<framework::BlockDesc *>(kParallelBlock);
auto *program = block->Program();
auto &sub_scopes = scope.FindVar(Input(kParallelScopes))
->Get<std::vector<framework::Scope *>>();
// TODO(tonyyang-svail): get places from input
std::vector<platform::Place> places;
places.emplace_back(platform::CPUPlace());
places.emplace_back(platform::CPUPlace());
// feed output@grad
SplitTensorAndMoveTensorToScopes(scope, sub_scopes, places,
Inputs(framework::GradVarName(kOutputs)));
for (auto &s : Inputs(framework::GradVarName(kOutputs))) {
VLOG(3) << s;
VLOG(3) << scope.FindVar(s)->Get<LoDTensor>();
for (auto *sub_scope : sub_scopes) {
VLOG(3) << sub_scope->FindVar(s)->Get<LoDTensor>();
}
}
// exe run
std::vector<std::future<void>> workers;
for (size_t place_idx = 0; place_idx < places.size(); ++place_idx) {
VLOG(3) << "Run " << place_idx;
auto &place = places[place_idx];
auto *cur_scope = sub_scopes[place_idx];
// execute
workers.emplace_back(framework::Async([program, cur_scope, place, block] {
framework::Executor executor(place);
executor.Run(*program, cur_scope, block->ID(),
false /*create_local_scope*/);
}));
}
for (auto &worker : workers) {
worker.wait();
}
// merge grad
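// Each parameter gradient is accumulated onto sub-scope 0: every other
// place's gradient is copied into a temporary "<param>@BUF" variable
// there, added in with a dynamically created "sum" op, and the result
// is finally copied back to the parent scope's gradient variable.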
for (auto &s : Outputs(framework::GradVarName(kParameters))) {
VLOG(3) << s;
auto &t = sub_scopes[0]->FindVar(s)->Get<LoDTensor>();
VLOG(3) << t;
std::string s_buf = s + "@BUF";
auto *t_buf = sub_scopes[0]->Var(s_buf)->GetMutable<LoDTensor>();
for (size_t place_idx = 1; place_idx < places.size(); ++place_idx) {
auto &tt = sub_scopes[place_idx]->FindVar(s)->Get<LoDTensor>();
VLOG(3) << place_idx;
VLOG(3) << tt;
framework::Copy(tt, places[0], t_buf);
auto sum_op = framework::OpRegistry::CreateOp(
"sum", {{"X", {s, s_buf}}}, {{"Out", {s}}},
framework::AttributeMap{});
sum_op->Run(*sub_scopes[0], place);
}
VLOG(3) << t;
framework::Copy(t, place, scope.FindVar(s)->GetMutable<LoDTensor>());
}
}
};
class ParallelDoGradOpDescMaker : public framework::SingleGradOpDescMaker {
public:
using framework::SingleGradOpDescMaker::SingleGradOpDescMaker;
protected:
virtual std::unique_ptr<framework::OpDesc> Apply() const {
auto *grad = new framework::OpDesc();
grad->SetType("parallel_do_grad");
for (auto &input_param : this->InputNames()) {
VLOG(3) << input_param;
grad->SetInput(input_param, this->Input(input_param));
grad->SetOutput(framework::GradVarName(input_param),
this->InputGrad(input_param, false));
}
for (auto &output_param : this->OutputNames()) {
if (output_param == kParallelScopes) {
grad->SetInput(output_param, this->Output(output_param));
grad->SetInput(framework::GradVarName(output_param),
this->Output(output_param));
} else {
grad->SetInput(output_param, this->Output(output_param));
grad->SetInput(framework::GradVarName(output_param),
this->OutputGrad(output_param));
}
}
grad->SetAttrMap(this->Attrs());
grad->SetBlockAttr(kParallelBlock, *grad_block_[0]);
return std::unique_ptr<framework::OpDesc>(grad);
}
};
class ParallelDoGradOpShapeInference : public framework::InferShapeBase {
public:
void operator()(framework::InferShapeContext *ctx) const override {
std::vector<std::string> input{kParameters, kInputs};
std::vector<std::string> output{kOutputs};
for (auto &s : input) {
PADDLE_ENFORCE(ctx->HasInputs(s));
PADDLE_ENFORCE(ctx->HasOutputs(framework::GradVarName(s)),
"Cannot find the gradient variable %s",
framework::GradVarName(s));
}
for (auto &s : output) {
PADDLE_ENFORCE(ctx->HasInputs(s));
}
for (auto &s : input) {
ctx->SetOutputsDim(framework::GradVarName(s), ctx->GetInputsDim(s));
}
if (ctx->HasInputs(kParameters)) {
PADDLE_ENFORCE(ctx->HasOutputs(framework::GradVarName(kParameters)));
ctx->SetOutputsDim(framework::GradVarName(kParameters),
ctx->GetInputsDim(kParameters));
}
}
};
} // namespace operators
} // namespace paddle
REGISTER_OPERATOR(parallel_do, paddle::operators::ParallelDoOp,
paddle::operators::ParallelDoOpProtoMaker,
paddle::operators::ParallelDoGradOpDescMaker);
REGISTER_OPERATOR(parallel_do_grad, paddle::operators::ParallelDoGradOp,
paddle::operators::ParallelDoGradOpShapeInference);
@@ -58,6 +58,7 @@ void PoolOp::InferShape(framework::InferShapeContext *ctx) const {
         OutputSizePool(in_x_dims[i + 2], ksize[i], paddings[i], strides[i]));
   }
   ctx->SetOutputDim("Out", framework::make_ddim(output_shape));
+  ctx->ShareLoD("X", "Out");
 }

 void PoolOpGrad::InferShape(framework::InferShapeContext *ctx) const {
@@ -69,7 +69,7 @@ class MaxPoolWithIndexOp : public framework::OperatorWithKernel {
   }

  protected:
-  framework::OpKernelType GetActualKernelType(
+  framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
     return framework::OpKernelType(
         framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()),
@@ -90,7 +90,7 @@ class MaxPoolWithIndexOpGrad : public framework::OperatorWithKernel {
   }

  protected:
-  framework::OpKernelType GetActualKernelType(
+  framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
     return framework::OpKernelType(
         framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()),
@@ -85,7 +85,7 @@ class PositiveNegativePairOp : public framework::OperatorWithKernel {
   }

  protected:
-  framework::OpKernelType GetActualKernelType(
+  framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
     return framework::OpKernelType(
         framework::ToDataType(ctx.Input<Tensor>("Score")->type()),
@@ -80,7 +80,7 @@ class PrecisionRecallOp : public framework::OperatorWithKernel {
   }

  protected:
-  framework::OpKernelType GetActualKernelType(
+  framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
     return framework::OpKernelType(
         framework::ToDataType(ctx.Input<Tensor>("MaxProbs")->type()),
@@ -290,7 +290,7 @@ class RecurrentOp : public RecurrentBase {
           auto dst_out = dst_tensor->Slice(seq_offset, seq_offset + 1);
           // Explicit copy output since the local RNN scope can be destroyed
           // early.
-          framework::CopyFrom(src_tensor, place, dev_ctx, &dst_out);
+          framework::Copy(src_tensor, place, dev_ctx, &dst_out);
         });

     scopes.Next();
@@ -376,7 +376,7 @@ class RecurrentGradOp : public RecurrentBase {
         auto *cur_grad_var = cur_scope.Var(cur_grad);
         auto cur_grad_tensor =
             cur_grad_var->GetMutable<framework::LoDTensor>();
-        framework::CopyFrom(ex_tensor, place, dev_ctx, cur_grad_tensor);
+        framework::Copy(ex_tensor, place, dev_ctx, cur_grad_tensor);
       }
     }
@@ -450,7 +450,7 @@ class RecurrentGradOp : public RecurrentBase {
           }
           auto dst = outside->Slice(seq_offset, seq_offset + 1);
-          framework::CopyFrom(inside, place, dev_ctx, &dst);
+          framework::Copy(inside, place, dev_ctx, &dst);
         });
     VLOG(5) << "Link outside gradient finished ";
@@ -463,7 +463,7 @@ class RecurrentGradOp : public RecurrentBase {
                            framework::LoDTensor *outside) {
           outside->Resize(inside.dims());
           outside->mutable_data(place, inside.type());
-          framework::CopyFrom(inside, place, dev_ctx, outside);
+          framework::Copy(inside, place, dev_ctx, outside);
         });
     VLOG(5) << "Link initialize state gradient finished ";
   }
@@ -19,7 +19,6 @@ limitations under the License. */
 #include <unistd.h>

-#include "paddle/framework/data_type.h"
 #include "paddle/framework/executor.h"
 #include "paddle/framework/framework.pb.h"
 #include "paddle/framework/lod_tensor.h"
@@ -33,6 +32,20 @@ limitations under the License. */
 namespace paddle {
 namespace operators {

+static void CreateTensorFromMessageType(framework::Variable *var,
+                                        sendrecv::VarType var_type) {
+  if (var_type == sendrecv::VarType::LOD_TENSOR) {
+    var->GetMutable<framework::LoDTensor>();
+  } else if (var_type == sendrecv::VarType::SELECTED_ROWS) {
+    var->GetMutable<framework::SelectedRows>();
+  } else {
+    PADDLE_THROW(
+        "VariableMessage type %d is not in "
+        "[LoDTensor, SelectedRows]",
+        var_type);
+  }
+}
+
 void RunServer(Server **rpc_server,
                std::shared_ptr<detail::SendRecvServerImpl> service,
                const std::string &server_address) {
@@ -60,7 +73,7 @@ class RecvOp : public framework::OperatorBase {
   }

   void Stop() override {
-    detail::TensorWithName term_msg;
+    detail::MessageWithName term_msg;
     term_msg.first = LISTEN_TERMINATE_MESSAGE;
     rpc_service_->Push(term_msg);
     rpc_server_->Shutdown();
@@ -94,7 +107,7 @@ class RecvOp : public framework::OperatorBase {
     // the gradient arrives, just add suffix 0~n then average the gradient.
     for (size_t i = 0; i < param_count * trainer_count; ++i) {
       // blocking get one var from client.
-      const detail::TensorWithName &v = rpc_service_->Get();
+      const detail::MessageWithName &v = rpc_service_->Get();
       auto grad_var_name = v.first;
       if (grad_var_name == LISTEN_TERMINATE_MESSAGE) {
         exit_flag = true;
@@ -111,9 +124,11 @@ class RecvOp : public framework::OperatorBase {
                 << " updating param: " << param_var_name;
       auto *merged_grad = recv_scope.FindVar(grad_var_name);
       if (merged_grad == nullptr) {
-        // create output of merged var.
-        auto merged_var = recv_scope.Var(grad_var_name);
-        merged_var->GetMutable<framework::LoDTensor>();
+        auto *ptr = recv_scope.Var(grad_var_name);
+        CreateTensorFromMessageType(ptr, v.second.type());
+        VLOG(3) << "Create Variable " << grad_var_name
+                << " on recv scope, which pointer is " << ptr << " type is "
+                << v.second.type();
       }

       if (trainer_count > 1) {
@@ -121,11 +136,10 @@ class RecvOp : public framework::OperatorBase {
       }

       auto *var = recv_scope.Var(grad_var_name);
-      auto *tensor = var->GetMutable<framework::LoDTensor>();
-      // FIXME(typhoonzero): do not copy
-      platform::DeviceContextPool &pool = platform::DeviceContextPool::Get();
-      auto &dev_ctx = *pool.Borrow(dev_place);
-      framework::CopyFrom(v.second, dev_place, dev_ctx, tensor);
+      platform::DeviceContextPool &pool =
+          platform::DeviceContextPool::Instance();
+      auto &dev_ctx = *pool.Get(dev_place);
+      detail::DeserializeFromMessage(v.second, dev_ctx, var);
     }

     if (exit_flag) {
       break;
@@ -77,6 +77,7 @@ class ReduceGradOp : public framework::OperatorWithKernel {
     auto x_grad_name = framework::GradVarName("X");
     if (ctx->HasOutput(x_grad_name)) {
       ctx->SetOutputDim(x_grad_name, x_dims);
+      ctx->ShareLoD("X", /*->*/ x_grad_name);
     }
   }
 };
@@ -88,20 +88,33 @@ class ReorderLoDTensorByRankTableBase : public framework::OperatorBase {
   std::vector<AbsoluteRankTableItem> GetAbsoluteOffsetAndLengthByLoDRankTable(
       const framework::LoDTensor &x) const {
     std::vector<AbsoluteRankTableItem> absolute_table;
-    size_t level = 0;
-    size_t size = x.lod()[level].size();
-    for (size_t i = 0; i < size - 1; ++i) {
-      auto lod_offset =
-          framework::GetSubLoDAndAbsoluteOffset(x.lod(), i, i + 1, level);
-      auto &offset = lod_offset.second;
-      absolute_table.emplace_back();
-      absolute_table.back().length = offset.second - offset.first;
-      absolute_table.back().offset = offset.first;
-      absolute_table.back().lod = lod_offset.first;
+
+    if (x.lod().empty()) {
+      // For Tensor without lod, such as the output of sequence_pool_op
+      size_t size = x.dims()[0];
+      absolute_table.reserve(size);
+      for (size_t i = 0; i < size; ++i) {
+        absolute_table.emplace_back();
+        absolute_table.back().length = 1;
+        absolute_table.back().offset = i;
+      }
+    } else {
+      size_t level = 0;
+      size_t size = x.lod()[level].size();
+
+      for (size_t i = 0; i < size - 1; ++i) {
+        auto lod_offset =
+            framework::GetSubLoDAndAbsoluteOffset(x.lod(), i, i + 1, level);
+
+        auto &offset = lod_offset.second;
+
+        absolute_table.emplace_back();
+        absolute_table.back().length = offset.second - offset.first;
+        absolute_table.back().offset = offset.first;
+        absolute_table.back().lod = lod_offset.first;
+      }
     }
+
     return absolute_table;
   }
@@ -133,7 +146,7 @@ class ReorderLoDTensorByRankTableBase : public framework::OperatorBase {
     platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
     auto &dev_ctx = *pool.Get(place);

-    framework::CopyFrom(x_sliced, out_sliced.place(), dev_ctx, &out_sliced);
+    framework::Copy(x_sliced, out_sliced.place(), dev_ctx, &out_sliced);
     out_offset += len;
     return out_offset;
   }
@@ -28,7 +28,7 @@ class ReshapeKernel : public framework::OpKernel<T> {
     auto* in = ctx.Input<framework::Tensor>("X");
     auto out_dims = out->dims();
     out->mutable_data<T>(ctx.GetPlace());
-    framework::CopyFrom(*in, ctx.GetPlace(), ctx.device_context(), out);
+    framework::Copy(*in, ctx.GetPlace(), ctx.device_context(), out);
     out->Resize(out_dims);
   }
 };
@@ -42,7 +42,7 @@ class ReshapeGradKernel : public framework::OpKernel<T> {
     d_x->mutable_data<T>(ctx.GetPlace());
     auto in_dims = d_x->dims();

-    framework::CopyFrom(*d_out, ctx.GetPlace(), ctx.device_context(), d_x);
+    framework::Copy(*d_out, ctx.GetPlace(), ctx.device_context(), d_x);
     d_x->Resize(in_dims);
   }
 };
@@ -68,7 +68,7 @@ class ROIPoolOp : public framework::OperatorWithKernel {
   }

  protected:
-  framework::OpKernelType GetActualKernelType(
+  framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
     return framework::OpKernelType(
         framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()),
@@ -89,7 +89,7 @@ class ROIPoolGradOp : public framework::OperatorWithKernel {
   }

  protected:
-  framework::OpKernelType GetActualKernelType(
+  framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
     return framework::OpKernelType(
         framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()),
@@ -49,7 +49,7 @@ class ScatterOp : public framework::OperatorWithKernel {
   }

  protected:
-  framework::OpKernelType GetActualKernelType(
+  framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
     return framework::OpKernelType(
         framework::ToDataType(ctx.Input<Tensor>("Ref")->type()),
@@ -68,7 +68,7 @@ class ScatterGradOp : public framework::OperatorWithKernel {
   }

  protected:
-  framework::OpKernelType GetActualKernelType(
+  framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
     return framework::OpKernelType(
         framework::ToDataType(ctx.Input<Tensor>("Ref")->type()),
@@ -20,22 +20,27 @@ limitations under the License. */
 #include "paddle/framework/op_registry.h"
 #include "paddle/framework/operator.h"
 #include "paddle/framework/program_desc.h"
+#include "paddle/operators/math/math_function.h"
+#include "paddle/operators/math/selected_rows_functor.h"
 #include "paddle/string/printf.h"

 USE_NO_KERNEL_OP(send);
 USE_NO_KERNEL_OP(recv);
 USE_OP(sum);

+namespace f = paddle::framework;
+namespace p = paddle::platform;
+namespace m = paddle::operators::math;
+
 // global for simplicity.
-std::unique_ptr<paddle::framework::OperatorBase> recv_op;
+std::unique_ptr<f::OperatorBase> recv_op;

-void InitTensorsInScope(paddle::framework::Scope &scope,
-                        paddle::platform::CPUPlace &place) {
-  paddle::platform::CPUDeviceContext ctx(place);
+void InitTensorsInScope(f::Scope &scope, p::CPUPlace &place) {
+  p::CPUDeviceContext ctx(place);
   for (int i = 0; i < 2; ++i) {
     auto var_name = paddle::string::Sprintf("x%d", i);
     auto var = scope.Var(var_name);
-    auto tensor = var->GetMutable<paddle::framework::LoDTensor>();
+    auto tensor = var->GetMutable<f::LoDTensor>();
     tensor->Resize({10, 10});
     float *expect = tensor->mutable_data<float>(place);
     for (int64_t i = 0; i < tensor->numel(); ++i) {
@@ -44,21 +49,53 @@ void InitTensorsInScope(f::Scope &scope, p::CPUPlace &place) {
   }

   auto out_var = scope.Var("Out");
-  auto out_tensor = out_var->GetMutable<paddle::framework::LoDTensor>();
+  auto out_tensor = out_var->GetMutable<f::LoDTensor>();
   out_tensor->Resize({10, 10});
   out_tensor->mutable_data<float>(place);  // allocate
 }

-void AddOp(const std::string &type,
-           const paddle::framework::VariableNameMap &inputs,
-           const paddle::framework::VariableNameMap &outputs,
-           paddle::framework::AttributeMap attrs,
-           paddle::framework::BlockDesc *block) {
+void InitSelectedRowsInScope(f::Scope &scope, p::CPUPlace &place) {
+  p::CPUDeviceContext ctx(place);
+  int64_t height = 10;
+  int64_t row_numel = 10;
+  m::SetConstant<p::CPUDeviceContext, float> set_one;
+  // init x0
+  std::vector<int64_t> rows0{0, 4, 7};
+  auto x0_var = scope.Var("x0");
+  auto x0 = x0_var->GetMutable<f::SelectedRows>();
+  x0->set_rows(rows0);
+  x0->set_height(height);
+  auto x0_value = x0->mutable_value();
+  x0_value->mutable_data<float>(
+      f::make_ddim({static_cast<int64_t>(rows0.size()), row_numel}), place);
+  set_one(ctx, x0_value, 1.0);
+
+  // init x1
+  std::vector<int64_t> rows1{2, 9};
+  auto x1_var = scope.Var("x1");
+  auto x1 = x1_var->GetMutable<f::SelectedRows>();
+  x1->set_rows(rows1);
+  x1->set_height(height);
+  auto x1_value = x1->mutable_value();
+  x1_value->mutable_data<float>(
+      f::make_ddim({static_cast<int64_t>(rows1.size()), row_numel}), place);
+  set_one(ctx, x1_value, 1.0);
+
+  auto out_var = scope.Var("Out");
+  auto out = out_var->GetMutable<f::SelectedRows>();
+  auto out_value = out->mutable_value();
+  out->set_height(height);
+  out_value->mutable_data<float>(f::make_ddim({5, 10}), place);
+}
+
+void AddOp(const std::string &type, const f::VariableNameMap &inputs,
+           const f::VariableNameMap &outputs, f::AttributeMap attrs,
+           f::BlockDesc *block) {
   // insert output
   for (auto kv : outputs) {
     for (auto v : kv.second) {
       auto var = block->Var(v);
-      var->SetDataType(paddle::framework::proto::DataType::FP32);
+      var->SetDataType(f::proto::DataType::FP32);
     }
   }
@@ -74,58 +111,99 @@ void AddOp(const std::string &type,
   op->SetAttrMap(attrs);
 }

-void StartServerNet() {
-  paddle::framework::Scope scope;
-  paddle::platform::CPUPlace place;
-  InitTensorsInScope(scope, place);
+void StartServerNet(bool is_sparse) {
+  f::Scope scope;
+  p::CPUPlace place;
+  if (is_sparse) {
+    InitSelectedRowsInScope(scope, place);
+  } else {
+    InitTensorsInScope(scope, place);
+  }

   // sub program run in recv_op, for simple test we use sum
-  paddle::framework::ProgramDesc program;
-  paddle::framework::BlockDesc *block = program.MutableBlock(0);
+  f::ProgramDesc program;
+  f::BlockDesc *block = program.MutableBlock(0);
   // X for server side tensors, RX for received tensors, must be of same shape.
-  AddOp("sum", {{"X", {"x0", "x1"}}}, {{"Out", {"x0"}}}, {}, block);
+  AddOp("sum", {{"X", {"x0", "x1"}}}, {{"Out", {"Out"}}}, {}, block);

-  paddle::framework::AttributeMap attrs;
+  f::AttributeMap attrs;
   attrs.insert({"endpoint", std::string("127.0.0.1:6174")});
-  attrs.insert({"ParamList", std::vector<std::string>({"x0"})});
+  attrs.insert({"ParamList", std::vector<std::string>({"Out"})});
   attrs.insert({"GradList", std::vector<std::string>({"x1"})});
   std::string program_proto;
   PADDLE_ENFORCE(program.Proto()->SerializeToString(&program_proto));

   attrs.insert({"OptimizeProgram", program_proto});
-  recv_op = paddle::framework::OpRegistry::CreateOp("recv", {{"RX", {"x1"}}},
-                                                    {}, attrs);
+  recv_op = f::OpRegistry::CreateOp("recv", {{"RX", {"x1"}}}, {}, attrs);
   recv_op->Run(scope, place);
 }

-TEST(SendRecvOp, CPU) {
-  std::thread server_thread(StartServerNet);
-  sleep(5);  // wait server to start
+TEST(SendRecvOp, CPUDense) {
+  std::thread server_thread(StartServerNet, false);
+  sleep(3);  // wait server to start
   // local net
-  paddle::framework::Scope scope;
-  paddle::platform::CPUPlace place;
+  f::Scope scope;
+  p::CPUPlace place;
   InitTensorsInScope(scope, place);

-  paddle::framework::AttributeMap attrs;
+  f::AttributeMap attrs;
   attrs.insert({"endpoints", std::vector<std::string>({"127.0.0.1:6174"})});
   attrs.insert({"epmap", std::vector<std::string>({"127.0.0.1:6174"})});

-  auto send_op = paddle::framework::OpRegistry::CreateOp(
-      "send", {{"X", {"x1"}}}, {{"Out", {"x0"}}}, attrs);
+  auto send_op = f::OpRegistry::CreateOp("send", {{"X", {"x1"}}},
+                                         {{"Out", {"Out"}}}, attrs);
   send_op->Run(scope, place);

   auto in_var = scope.Var("x1");
-  auto tensor = in_var->GetMutable<paddle::framework::LoDTensor>();
+  auto tensor = in_var->GetMutable<f::LoDTensor>();
   float *expected = tensor->data<float>();
-  auto out_var = scope.Var("x0");
-  auto target = out_var->GetMutable<paddle::framework::LoDTensor>();
+  auto out_var = scope.Var("Out");
+  auto target = out_var->GetMutable<f::LoDTensor>();
   // x1 * 2 == x0
   EXPECT_NE(target->memory_size(), size_t(0));
   float *actual = target->data<float>();
   for (int64_t i = 0; i < target->numel(); ++i) {
     EXPECT_EQ(expected[i] * 2, actual[i]);
   }
+  recv_op->Stop();
+  server_thread.join();
+  recv_op.reset(nullptr);
+}
+
+TEST(SendRecvOp, CPUSparse) {
+  std::thread server_thread(StartServerNet, true);
+  sleep(3);  // wait server to start
+  // local net
+  f::Scope scope;
+  p::CPUPlace place;
+  p::CPUDeviceContext ctx(place);
+  InitSelectedRowsInScope(scope, place);
+  f::AttributeMap attrs;
+  attrs.insert({"endpoints", std::vector<std::string>({"127.0.0.1:6174"})});
+  attrs.insert({"epmap", std::vector<std::string>({"127.0.0.1:6174"})});
+  auto send_op = f::OpRegistry::CreateOp("send", {{"X", {"x1"}}},
+                                         {{"Out", {"Out"}}}, attrs);
+  send_op->Run(scope, place);
+
+  auto x0 = scope.Var("x0")->GetMutable<f::SelectedRows>();
+  auto x1 = scope.Var("x1")->GetMutable<f::SelectedRows>();
+  auto out = scope.Var("Out")->GetMutable<f::SelectedRows>();
+  auto actual = out->mutable_value();
+
+  std::unique_ptr<f::SelectedRows> expect{new f::SelectedRows()};
+  auto expect_value = expect->mutable_value();
+  expect_value->mutable_data<float>(f::make_ddim({5, 10}), place);
+
+  m::SelectedRowsAdd<p::CPUDeviceContext, float> add_functor;
+  add_functor(ctx, *x0, *x1, expect.get());
+
+  EXPECT_EQ(actual->numel(), expect_value->numel());
+  EXPECT_EQ(out->rows().size(), x0->rows().size() + x1->rows().size());
+
+  for (int64_t i = 0; i < expect_value->numel(); ++i) {
+    EXPECT_EQ(expect_value->mutable_data<float>(place)[i],
+              actual->mutable_data<float>(place)[i]);
+  }
+
   recv_op->Stop();
   server_thread.join();
-  // recv_op.reset();
+  recv_op.reset();
 }
@@ -107,7 +107,7 @@ class SequencePoolGradOp : public framework::OperatorWithKernel {
   }

  protected:
-  framework::OpKernelType GetActualKernelType(
+  framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
     return framework::OpKernelType(
         framework::ToDataType(ctx.Input<Tensor>("X")->type()),
@@ -48,7 +48,7 @@ class SequenceSliceOp : public framework::OperatorWithKernel {
   }

  protected:
-  framework::OpKernelType GetActualKernelType(
+  framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
     return framework::OpKernelType(
         framework::ToDataType(ctx.Input<framework::LoDTensor>("X")->type()),
@@ -69,7 +69,7 @@ class SequenceSliceGradOp : public framework::OperatorWithKernel {
   }

  protected:
-  framework::OpKernelType GetActualKernelType(
+  framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
     return framework::OpKernelType(
         framework::ToDataType(ctx.Input<framework::LoDTensor>("X")->type()),
@@ -66,13 +66,13 @@ class SequenceSliceOpKernel : public framework::OpKernel<T> {
     if (platform::is_gpu_place(ctx.GetPlace())) {
       offset_cpu.mutable_data<T>(offset->dims(), platform::CPUPlace());
-      framework::CopyFrom(*offset, platform::CPUPlace(), ctx.device_context(),
-                          &offset_cpu);
+      framework::Copy(*offset, platform::CPUPlace(), ctx.device_context(),
+                      &offset_cpu);
       offset_data = offset_cpu.data<int64_t>();

       length_cpu.mutable_data<T>(length->dims(), platform::CPUPlace());
-      framework::CopyFrom(*length, platform::CPUPlace(), ctx.device_context(),
-                          &length_cpu);
+      framework::Copy(*length, platform::CPUPlace(), ctx.device_context(),
+                      &length_cpu);
       length_data = length_cpu.data<int64_t>();
     }
@@ -127,13 +127,13 @@ class SequenceSliceGradOpKernel : public framework::OpKernel<T> {
     if (platform::is_gpu_place(ctx.GetPlace())) {
       offset_cpu.mutable_data<T>(offset->dims(), platform::CPUPlace());
-      framework::CopyFrom(*offset, platform::CPUPlace(), ctx.device_context(),
-                          &offset_cpu);
+      framework::Copy(*offset, platform::CPUPlace(), ctx.device_context(),
+                      &offset_cpu);
       offset_data = offset_cpu.data<int64_t>();

       length_cpu.mutable_data<T>(length->dims(), platform::CPUPlace());
-      framework::CopyFrom(*length, platform::CPUPlace(), ctx.device_context(),
-                          &length_cpu);
+      framework::Copy(*length, platform::CPUPlace(), ctx.device_context(),
+                      &length_cpu);
       length_data = length_cpu.data<int64_t>();
     }
@@ -115,7 +115,7 @@ class ShrinkRNNMemoryGradOp : public ArrayOp {
       auto &dout_tensor = dout_var->Get<framework::LoDTensor>();
       auto height = dout_tensor.dims()[0];
       auto slice = dx_tensor.Slice(0, static_cast<int>(height));
-      framework::CopyFrom(dout_tensor, dout_tensor.place(), dev_ctx, &slice);
+      framework::Copy(dout_tensor, dout_tensor.place(), dev_ctx, &slice);
       if (dx_tensor.dims()[0] > height) {
         auto rest_tensor = dx_tensor.Slice(
             static_cast<int>(height), static_cast<int>(dx_tensor.dims()[0]));
@@ -118,7 +118,7 @@ class SoftmaxWithCrossEntropyOp : public framework::OperatorWithKernel {
   }

  protected:
-  framework::OpKernelType GetActualKernelType(
+  framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
     return framework::OpKernelType(
         framework::ToDataType(ctx.Input<Tensor>("Logits")->type()),
@@ -159,7 +159,7 @@ class SoftmaxWithCrossEntropyOpGrad : public framework::OperatorWithKernel {
   }

  protected:
-  framework::OpKernelType GetActualKernelType(
+  framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
     return framework::OpKernelType(
         framework::ToDataType(
@@ -53,7 +53,7 @@ class SplitLoDTensorOp : public framework::OperatorBase {
       cpu_mask->ShareDataWith(mask);
     } else if (platform::is_gpu_place(mask.place())) {
 #ifdef PADDLE_WITH_CUDA
-      framework::CopyFrom(mask, platform::CPUPlace(), dev_ctx, cpu_mask.get());
+      framework::Copy(mask, platform::CPUPlace(), dev_ctx, cpu_mask.get());
 #else
       PADDLE_THROW("Not supported GPU, Please compile WITH_GPU option");
 #endif
@@ -111,9 +111,9 @@ class SplitLoDTensorOp : public framework::OperatorBase {
         // out[offset: offset+len] = x[each_range.begin: each_range.end]
         auto slice = out->Slice(static_cast<int>(offset),
                                 static_cast<int>(offset + len));
-        framework::CopyFrom(x.Slice(static_cast<int>(each_range.begin),
-                                    static_cast<int>(each_range.end)),
-                            x.place(), dev_ctx, &slice);
+        framework::Copy(x.Slice(static_cast<int>(each_range.begin),
+                                static_cast<int>(each_range.end)),
+                        x.place(), dev_ctx, &slice);
         offset += len;
       }
     }
@@ -53,7 +53,7 @@ class SumOp : public framework::OperatorWithKernel {
   }

  protected:
-  framework::OpKernelType GetActualKernelType(
+  framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
     auto x_vars = ctx.MultiInputVar("X");
     if (x_vars[0]->IsType<framework::LoDTensor>()) {
@@ -122,17 +122,17 @@ class SumOpVarTypeInference : public framework::VarTypeInference {
     for (auto& name : op_desc.Input("X")) {
       VLOG(10) << name << " "
-               << block->FindRecursiveOrCreateVar(name)->GetType();
+               << block->FindRecursiveOrCreateVar(name).GetType();
     }

     bool any_input_is_lod_tensor = std::any_of(
         inputs.begin(), inputs.end(), [block](const std::string& name) {
-          return block->FindRecursiveOrCreateVar(name)->GetType() ==
+          return block->FindRecursiveOrCreateVar(name).GetType() ==
                  framework::proto::VarDesc::LOD_TENSOR;
         });

     auto is_tensor_array = [block](const std::string& name) {
-      return detail::Ref(block->FindRecursiveOrCreateVar(name)).GetType() ==
+      return block->FindRecursiveOrCreateVar(name).GetType() ==
              framework::proto::VarDesc::LOD_TENSOR_ARRAY;
     };
@@ -146,8 +146,7 @@ class SumOpVarTypeInference : public framework::VarTypeInference {
       std::ostringstream os;
       for (auto& each : inputs) {
         os << " " << each << " type is "
-           << detail::Ref(block->FindRecursiveOrCreateVar(each)).GetType()
-           << "\n";
+           << block->FindRecursiveOrCreateVar(each).GetType() << "\n";
       }
       PADDLE_ENFORCE(all_inputs_are_tensor_array,
                      "Not all inputs are tensor array:\n%s", os.str());
@@ -158,7 +157,7 @@ class SumOpVarTypeInference : public framework::VarTypeInference {
     }

     auto out_var_name = op_desc.Output("Out").front();
-    auto& out_var = detail::Ref(block->FindRecursiveOrCreateVar(out_var_name));
+    auto& out_var = block->FindRecursiveOrCreateVar(out_var_name);
     out_var.SetType(var_type);
     auto& in_var = detail::Ref(block->FindVarRecursive(inputs.front()));
     out_var.SetDataType(in_var.GetDataType());
@@ -70,6 +70,7 @@ class SumKernel : public framework::OpKernel<T> {
     } else if (out_var->IsType<framework::SelectedRows>()) {
       PADDLE_ENFORCE(!in_place, "SelectedRows not support inplace sum now");
       auto *out = context.Output<SelectedRows>("Out");
+      out->mutable_rows()->clear();
       auto *out_value = out->mutable_value();

       // Runtime InferShape
@@ -107,8 +108,8 @@ class SumKernel : public framework::OpKernel<T> {
           out_array.resize(i + 1);
         }
         if (out_array[i].numel() == 0) {
-          framework::CopyFrom(in_array[i], in_array[i].place(),
-                              context.device_context(), &out_array[i]);
+          framework::Copy(in_array[i], in_array[i].place(),
+                          context.device_context(), &out_array[i]);
           out_array[i].set_lod(in_array[i].lod());
         } else {
           PADDLE_ENFORCE(out_array[i].lod() == in_array[i].lod());
...@@ -44,7 +44,7 @@ class WriteToArrayOp : public ArrayOp { ...@@ -44,7 +44,7 @@ class WriteToArrayOp : public ArrayOp {
platform::DeviceContextPool::Instance(); platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Get(place); auto &dev_ctx = *pool.Get(place);
CopyFrom(x_tensor, place, dev_ctx, out_tensor); Copy(x_tensor, place, dev_ctx, out_tensor);
out_tensor->set_lod(x_tensor.lod()); out_tensor->set_lod(x_tensor.lod());
} else { } else {
VLOG(10) << "WARNING: The input tensor 'x_tensor' holds no memory, so " VLOG(10) << "WARNING: The input tensor 'x_tensor' holds no memory, so "
...@@ -106,8 +106,7 @@ class WriteToArrayInferVarType : public framework::VarTypeInference { ...@@ -106,8 +106,7 @@ class WriteToArrayInferVarType : public framework::VarTypeInference {
auto x_name = op_desc.Input("X")[0]; auto x_name = op_desc.Input("X")[0];
auto out_name = op_desc.Output("Out")[0]; auto out_name = op_desc.Output("Out")[0];
VLOG(10) << "Set Variable " << out_name << " as LOD_TENSOR_ARRAY"; VLOG(10) << "Set Variable " << out_name << " as LOD_TENSOR_ARRAY";
auto &out = detail::Ref(block->FindRecursiveOrCreateVar(out_name), auto &out = block->FindRecursiveOrCreateVar(out_name);
"Cannot found %s", out_name);
out.SetType(framework::proto::VarDesc::LOD_TENSOR_ARRAY); out.SetType(framework::proto::VarDesc::LOD_TENSOR_ARRAY);
auto *x = block->FindVarRecursive(x_name); auto *x = block->FindVarRecursive(x_name);
if (x != nullptr) { if (x != nullptr) {
...@@ -136,7 +135,7 @@ class ReadFromArrayOp : public ArrayOp { ...@@ -136,7 +135,7 @@ class ReadFromArrayOp : public ArrayOp {
platform::DeviceContextPool &pool = platform::DeviceContextPool &pool =
platform::DeviceContextPool::Instance(); platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Get(place); auto &dev_ctx = *pool.Get(place);
framework::CopyFrom(x_array[offset], place, dev_ctx, out_tensor); framework::Copy(x_array[offset], place, dev_ctx, out_tensor);
out_tensor->set_lod(x_array[offset].lod()); out_tensor->set_lod(x_array[offset].lod());
} else { } else {
VLOG(10) << "offset " << offset << " >= " << x_array.size(); VLOG(10) << "offset " << offset << " >= " << x_array.size();
......
...@@ -63,7 +63,7 @@ class UniformRandomOp : public framework::OperatorWithKernel { ...@@ -63,7 +63,7 @@ class UniformRandomOp : public framework::OperatorWithKernel {
} }
protected: protected:
framework::OpKernelType GetActualKernelType( framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override { const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType( return framework::OpKernelType(
static_cast<framework::proto::DataType>(ctx.Attr<int>("dtype")), static_cast<framework::proto::DataType>(ctx.Attr<int>("dtype")),
......
...@@ -71,7 +71,7 @@ int OutputSize(int input_size, int ksize, int padding, int stride) { ...@@ -71,7 +71,7 @@ int OutputSize(int input_size, int ksize, int padding, int stride) {
class UnpoolOp : public framework::OperatorWithKernel { class UnpoolOp : public framework::OperatorWithKernel {
protected: protected:
framework::OpKernelType GetActualKernelType( framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override { const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType( return framework::OpKernelType(
framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()), framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()),
...@@ -110,7 +110,7 @@ class UnpoolOp : public framework::OperatorWithKernel { ...@@ -110,7 +110,7 @@ class UnpoolOp : public framework::OperatorWithKernel {
class UnpoolOpGrad : public framework::OperatorWithKernel { class UnpoolOpGrad : public framework::OperatorWithKernel {
protected: protected:
framework::OpKernelType GetActualKernelType( framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override { const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType( return framework::OpKernelType(
framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()), framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()),
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/warpctc_op.h"
namespace paddle {
namespace operators {
class WarpCTCOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("Logits"),
"Input(Logits) of WarpCTCOp should not be null.");
PADDLE_ENFORCE(ctx->HasInput("Label"),
"Input(Label) of WarpCTCOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("WarpCTCGrad"),
"Output(WarpCTCGrad) of WarpCTCOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Loss"),
"Output(Loss) of WarpCTCOp should not be null.");
auto logits_dims = ctx->GetInputDim("Logits");
int sequence_width =
static_cast<int>(framework::product(logits_dims) / logits_dims[0]);
int blank = ctx->Attrs().Get<int>("blank");
PADDLE_ENFORCE((blank >= 0) && (blank < sequence_width),
"The value of Attr(blank) should be in interval [0, %d).",
sequence_width);
// TODO(liuyiqun): it is tricky to set the wrong dimension here.
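    // (The true shape is [num_sequences, 1], but num_sequences can only be
    // read from the LoD at run time; the kernel recomputes the loss
    // dimensions there.)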
ctx->SetOutputDim("Loss", {logits_dims[0], 1});
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(
framework::ToDataType(ctx.Input<Tensor>("Logits")->type()),
ctx.device_context());
}
};
class WarpCTCOpMaker : public framework::OpProtoAndCheckerMaker {
public:
WarpCTCOpMaker(OpProto* proto, OpAttrChecker* op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("Logits",
"(LodTensor, default: LoDTensor<float>), the unscaled "
"probabilities of variable-length sequences, which is a 2-D "
"Tensor with LoD information. It's shape is "
"[Lp, num_classes + 1], where Lp is the sum of all input "
"sequences' length and num_classes is the true number of classes "
"(not including the blank label).");
AddInput("Label",
"(LodTensor, default: LoDTensor<int>), the ground truth "
"of variable-length sequence, which is a 2-D Tensor with LoD "
"information. It is of the shape [Lg, 1], where Lg is th sum of "
"all labels' length.");
AddOutput("WarpCTCGrad",
"(Tensor, default: Tensor<float>), a temporary "
"output Tensor to store the gradients of warp-ctc, which is "
"computed with loss together in one call. It is a 3-D Tensor of "
"the shape [max_sequence_length, batch_size, num_classes + 1].")
.AsIntermediate();
AddOutput("Loss",
"(Tensor, default: Tensor<float>), the Connectionist "
"Temporal Classification (CTC) loss, which is a 2-D Tensor of "
"the shape [batch_size, 1]");
AddAttr<int>("blank",
"(int, default: 0), the blank label of Connectionist "
"Temporal Classification (CTC) loss, which is in the "
"half-opened interval [0, num_classes + 1).")
.SetDefault(0);
AddAttr<bool>("norm_by_times",
"(bool, default: false), whether to "
"normalize the gradients by the number of time-step, "
"which is also the sequence's length.")
.SetDefault(false);
AddComment(R"DOC(
An operator integrating the open-source
[warp-ctc](https://github.com/baidu-research/warp-ctc) library, which is used in
[Deep Speech 2: End-to-End Speech Recognition in English and Mandarin](
https://arxiv.org/pdf/1512.02595v1.pdf),
to compute Connectionist Temporal Classification (CTC) loss.
It can be aliased as softmax with CTC, since a native softmax activation is
integrated into the warp-ctc library to normalize the values in each row of
the input tensor.
More details of the CTC loss can be found by referring to
[Connectionist Temporal Classification: Labelling Unsegmented Sequence Data with
Recurrent Neural Networks](
http://machinelearning.wustl.edu/mlpapers/paper_files/icml2006_GravesFGS06.pdf).
)DOC");
}
};
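As a reading aid for the shape contract documented above, here is a small standalone sketch (not part of the commit; the toy lengths are invented) that works through the dimensions for a batch of two sequences:
#include <algorithm>
#include <cassert>
#include <vector>

int main() {
  // Two sequences: logit lengths {2, 3}, label lengths {1, 2}, num_classes = 4.
  std::vector<int> logit_lengths = {2, 3};
  std::vector<int> label_lengths = {1, 2};
  const int num_classes = 4;
  const int batch_size = static_cast<int>(logit_lengths.size());

  int Lp = 0, Lg = 0, max_len = 0;
  for (int l : logit_lengths) { Lp += l; max_len = std::max(max_len, l); }
  for (int l : label_lengths) Lg += l;

  assert(Lp == 5 && Lg == 3 && max_len == 3 && batch_size == 2);
  // Logits:      [Lp, num_classes + 1]                            -> [5, 5]
  // Label:       [Lg, 1]                                          -> [3, 1]
  // WarpCTCGrad: [max_sequence_length, batch_size, num_classes+1] -> [3, 2, 5]
  // Loss:        [batch_size, 1]                                  -> [2, 1]
  (void)num_classes;
  return 0;
}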
class WarpCTCGradOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("WarpCTCGrad"),
"Input(WarpCTCGrad) of WarpCTCGradOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("Logits")),
"Output(Logits@GRAD) of WarpCTCGradOp should not be null.");
ctx->SetOutputDim(framework::GradVarName("Logits"),
ctx->GetInputDim("Logits"));
ctx->ShareLoD("Logits", /*->*/ framework::GradVarName("Logits"));
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(
framework::ToDataType(ctx.Input<Tensor>("Logits")->type()),
ctx.device_context());
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP(warpctc, ops::WarpCTCOp, ops::WarpCTCOpMaker, warpctc_grad,
ops::WarpCTCGradOp);
REGISTER_OP_CPU_KERNEL(
warpctc, ops::WarpCTCKernel<paddle::platform::CPUDeviceContext, float>);
REGISTER_OP_CPU_KERNEL(
warpctc_grad,
ops::WarpCTCGradKernel<paddle::platform::CPUDeviceContext, float>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/operators/warpctc_op.h"
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
warpctc, ops::WarpCTCKernel<paddle::platform::CUDADeviceContext, float>);
REGISTER_OP_CUDA_KERNEL(
warpctc_grad,
ops::WarpCTCGradKernel<paddle::platform::CUDADeviceContext, float>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/framework/op_registry.h"
#include "paddle/operators/math/math_function.h"
#include "paddle/operators/math/sequence_padding.h"
#include "paddle/platform/dynload/warpctc.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
using LoDTensor = framework::LoDTensor;
template <typename DeviceContext>
class WarpCTCFunctor {
public:
/*
* \brief Compute the connectionist temporal classification loss,
* and optionally compute the gradient with respect to the inputs.
*
   * If gradient is nullptr, it computes only the CTC loss;
   * otherwise it computes both the CTC loss and the gradient.
*
* \param ctx execution context of this functor
* \param input batch matrix of input probabilities, in
* max_sequence_length x num_sequences x
   *                          sequence_width (row-major) format.
* \param gradient batch matrix of gradient, with the same shape as
* input.
* \param cpu_labels labels always in CPU memory.
* \param cpu_label_lengths length of all labels in CPU memory.
* \param cpu_input_lengths length of all sequences in CPU memory.
* \param sequence_width number of possible output symbols.
   * \param num_sequences number of sequences.
* \param blank blank label used in ctc loss function.
   * \param cpu_loss cost of each sequence in CPU memory.
*/
void operator()(const framework::ExecutionContext& ctx, const float* input,
float* gradient, const int* cpu_labels,
const int* cpu_label_lengths, const int* cpu_input_lengths,
const size_t sequence_width, const size_t num_sequences,
const size_t blank, float* cpu_loss) {
// Init warp-ctc options
init(ctx, blank);
// Compute the required workspace size.
    // There are no memory allocation operations within warp-ctc.
size_t workspace_bytes = 0;
ctcStatus_t status = platform::dynload::get_workspace_size(
cpu_label_lengths, cpu_input_lengths, static_cast<int>(sequence_width),
static_cast<int>(num_sequences), options_, &workspace_bytes);
PADDLE_ENFORCE_EQ(CTC_STATUS_SUCCESS, status,
"warp-ctc [version %d] Error in get_workspace_size: ",
warpctc_version_,
platform::dynload::ctcGetStatusString(status));
PADDLE_ENFORCE_GT(workspace_bytes, 0UL,
"Bytes of workspace got by warp-ctc function, "
"get_workspace_size(), should be larger than 0.");
Tensor workspace;
size_t workspace_elements = workspace_bytes / sizeof(float) + 1UL;
float* workspace_data = workspace.mutable_data<float>(
framework::make_ddim({static_cast<int64_t>(workspace_elements)}),
ctx.GetPlace());
math::SetConstant<DeviceContext, float>()(
ctx.template device_context<DeviceContext>(), &workspace,
static_cast<float>(0));
// compute loss and gradient
status = platform::dynload::compute_ctc_loss(
input, gradient, cpu_labels, cpu_label_lengths, cpu_input_lengths,
static_cast<int>(sequence_width), static_cast<int>(num_sequences),
cpu_loss, workspace_data, options_);
PADDLE_ENFORCE_EQ(CTC_STATUS_SUCCESS, status,
"warp-ctc [version %d] Error in compute_ctc_loss: ",
warpctc_version_,
platform::dynload::ctcGetStatusString(status));
}
protected:
void init(const framework::ExecutionContext& ctx, const size_t blank) {
warpctc_version_ = platform::dynload::get_warpctc_version();
if (platform::is_gpu_place(ctx.GetPlace())) {
#ifdef PADDLE_WITH_CUDA
options_.loc = CTC_GPU;
options_.stream = reinterpret_cast<const platform::CUDADeviceContext&>(
ctx.device_context())
.stream();
#else
PADDLE_THROW("[warpctc init] GPU is not enabled.");
#endif
} else {
options_.loc = CTC_CPU;
options_.num_threads = 1;
}
options_.blank_label = blank;
}
private:
int warpctc_version_;
ctcOptions options_;
};
template <typename DeviceContext, typename T>
class WarpCTCKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* logits = ctx.Input<LoDTensor>("Logits");
auto* label = ctx.Input<LoDTensor>("Label");
auto* warpctc_grad = ctx.Output<Tensor>("WarpCTCGrad");
auto* loss = ctx.Output<Tensor>("Loss");
const size_t level = 0;
auto logits_lod = framework::ToAbsOffset(logits->lod());
auto logits_dims = logits->dims();
PADDLE_ENFORCE_EQ(logits_dims[0],
static_cast<int64_t>(logits_lod[level].back()),
"The first dimension of Input(Logits) should be equal to "
"the sum of all sequences' lengths.");
auto label_lod = framework::ToAbsOffset(label->lod());
auto label_dims = label->dims();
PADDLE_ENFORCE_EQ(
label_dims[0], label->numel(),
"The width of each timestep in Input(Label) should be 1.");
const size_t num_sequences = logits_lod[level].size() - 1;
PADDLE_ENFORCE_EQ(num_sequences, label_lod[level].size() - 1,
"The number of sequences of Input(Logits) should be "
"equal to that of Input(Label).");
const size_t sequence_width = logits->numel() / logits_dims[0];
auto loss_dims =
framework::make_ddim({static_cast<int64_t>(num_sequences), 1});
    // warp-ctc needs sequence data stored in transposed padding format
Tensor warpctc_logits;
const size_t max_sequence_length =
math::MaximumSequenceLength(logits_lod, level);
auto warpctc_logits_dims =
framework::make_ddim({static_cast<int64_t>(max_sequence_length),
static_cast<int64_t>(num_sequences),
static_cast<int64_t>(sequence_width)});
warpctc_logits.mutable_data<T>(warpctc_logits_dims, ctx.GetPlace());
math::PaddingLoDTensorFunctor<DeviceContext, T>()(
ctx.template device_context<DeviceContext>(), *logits, warpctc_logits,
false);
const T* warpctc_logits_data = warpctc_logits.data<T>();
std::vector<int> warpctc_label_lengths(num_sequences);
std::vector<int> warpctc_logits_lengths(num_sequences);
for (size_t i = 0; i < num_sequences; ++i) {
warpctc_label_lengths[i] = label_lod[level][i + 1] - label_lod[level][i];
warpctc_logits_lengths[i] =
logits_lod[level][i + 1] - logits_lod[level][i];
}
    // warp-ctc computes loss and gradient in one call; the gradient data is
    // also stored in batch format
T* warpctc_grad_data =
warpctc_grad->mutable_data<T>(warpctc_logits.dims(), ctx.GetPlace());
// warpctc accesses labels in CPU memory
Tensor warpctc_label;
Copy(*label, platform::CPUPlace(), ctx.device_context(), &warpctc_label);
const int* warpctc_label_data = warpctc_label.data<int>();
// warpctc stores loss in CPU memory
Tensor warpctc_loss;
T* warpctc_loss_data =
warpctc_loss.mutable_data<T>(loss_dims, platform::CPUPlace());
const size_t blank = static_cast<size_t>(ctx.Attr<int>("blank"));
WarpCTCFunctor<DeviceContext>()(
ctx, warpctc_logits_data, warpctc_grad_data, warpctc_label_data,
warpctc_label_lengths.data(), warpctc_logits_lengths.data(),
sequence_width, num_sequences, blank, warpctc_loss_data);
// Copy the loss back
Copy(warpctc_loss, ctx.GetPlace(), ctx.device_context(), loss);
}
};
template <typename DeviceContext, typename T>
class WarpCTCGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* warpctc_grad = ctx.Input<Tensor>("WarpCTCGrad");
auto* logits_grad = ctx.Output<LoDTensor>(framework::GradVarName("Logits"));
bool norm_by_times = ctx.Attr<bool>("norm_by_times");
math::UnpaddingLoDTensorFunctor<DeviceContext, T>()(
ctx.template device_context<DeviceContext>(), *logits_grad,
*warpctc_grad, norm_by_times);
}
};
} // namespace operators
} // namespace paddle
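The "transposed padding format" mentioned in the kernel above is easiest to see on toy data. The following standalone sketch (assumptions: a single LoD level with offsets {0, 2, 5}, invented width) copies a flat [Lp, width] buffer into the [max_sequence_length, num_sequences, width] layout that warp-ctc consumes, zero-padding the shorter sequence:
#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
  // LoD offsets {0, 2, 5}: sequence 0 spans rows [0, 2), sequence 1 rows [2, 5).
  std::vector<size_t> lod = {0, 2, 5};
  const size_t num_seq = lod.size() - 1;  // 2
  const size_t width = 3;                 // sequence_width
  size_t max_len = 0;
  for (size_t i = 0; i < num_seq; ++i)
    max_len = std::max(max_len, lod[i + 1] - lod[i]);  // 3

  std::vector<float> flat(lod.back() * width, 1.f);      // [Lp, width]
  std::vector<float> padded(max_len * num_seq * width);  // zero-initialized

  for (size_t i = 0; i < num_seq; ++i) {
    const size_t len = lod[i + 1] - lod[i];
    for (size_t t = 0; t < len; ++t)
      for (size_t w = 0; w < width; ++w)
        // time-major: step t of every sequence is stored contiguously
        padded[(t * num_seq + i) * width + w] = flat[(lod[i] + t) * width + w];
  }
  std::printf("%zu elements, last step of seq 0 is padding: %.0f\n",
              padded.size(), padded[(2 * num_seq + 0) * width]);  // 18, 0
  return 0;
}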
...@@ -25,12 +25,12 @@ namespace operators { ...@@ -25,12 +25,12 @@ namespace operators {
using StepScopeVar = std::vector<framework::Scope *>; using StepScopeVar = std::vector<framework::Scope *>;
using LoDTensor = framework::LoDTensor; using LoDTensor = framework::LoDTensor;
constexpr char kStepBlock[] = "sub_block"; static constexpr char kStepBlock[] = "sub_block";
constexpr char kCondition[] = "Condition"; static constexpr char kCondition[] = "Condition";
constexpr char kStepScopes[] = "StepScopes"; static constexpr char kStepScopes[] = "StepScopes";
constexpr char kParameters[] = "X"; static constexpr char kX[] = "X";
constexpr char kParamGrads[] = "X@GRAD"; static constexpr char kXGRAD[] = "X@GRAD";
constexpr char kOutputs[] = "Out"; static constexpr char kOutputs[] = "Out";
class WhileOp : public framework::OperatorBase { class WhileOp : public framework::OperatorBase {
public: public:
...@@ -67,7 +67,7 @@ class WhileOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -67,7 +67,7 @@ class WhileOpMaker : public framework::OpProtoAndCheckerMaker {
public: public:
WhileOpMaker(OpProto *proto, OpAttrChecker *op_checker) WhileOpMaker(OpProto *proto, OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) { : OpProtoAndCheckerMaker(proto, op_checker) {
AddInput(kParameters, AddInput(kX,
"A set of variables, which are required by operators inside the " "A set of variables, which are required by operators inside the "
"block of While Op.") "block of While Op.")
.AsDuplicable(); .AsDuplicable();
...@@ -158,8 +158,8 @@ class WhileGradOp : public framework::OperatorBase { ...@@ -158,8 +158,8 @@ class WhileGradOp : public framework::OperatorBase {
executor.Run(*program, *cur_scope_iter, block->ID(), false); executor.Run(*program, *cur_scope_iter, block->ID(), false);
auto &pg_names = Outputs(kParamGrads); auto &pg_names = Outputs(kXGRAD);
auto &p_names = Inputs(kParameters); auto &p_names = Inputs(kX);
PADDLE_ENFORCE_EQ(pg_names.size(), p_names.size()); PADDLE_ENFORCE_EQ(pg_names.size(), p_names.size());
for (size_t param_id = 0; param_id < pg_names.size(); ++param_id) { for (size_t param_id = 0; param_id < pg_names.size(); ++param_id) {
if (pg_names[param_id] == framework::kEmptyVarName) { if (pg_names[param_id] == framework::kEmptyVarName) {
...@@ -213,11 +213,11 @@ class WhileGradOpDescMaker : public framework::SingleGradOpDescMaker { ...@@ -213,11 +213,11 @@ class WhileGradOpDescMaker : public framework::SingleGradOpDescMaker {
std::unique_ptr<framework::OpDesc> Apply() const override { std::unique_ptr<framework::OpDesc> Apply() const override {
auto *grad = new framework::OpDesc(); auto *grad = new framework::OpDesc();
grad->SetType("while_grad"); grad->SetType("while_grad");
grad->SetInput(kParameters, Input(kParameters)); grad->SetInput(kX, Input(kX));
// Not all IGs will be generated by inner gradient operators of while op. // Not all IGs will be generated by inner gradient operators of while op.
// Ignore IGs that are not generated by the inside block. // Ignore IGs that are not generated by the inside block.
auto igs = InputGrad(kParameters, /*do not drop empty gradient*/ false); auto igs = InputGrad(kX, /*do not drop empty gradient*/ false);
std::unordered_set<std::string> all_outs; std::unordered_set<std::string> all_outs;
for (size_t i = 0; i < grad_block_[0]->OpSize(); ++i) { for (size_t i = 0; i < grad_block_[0]->OpSize(); ++i) {
for (auto &oname : grad_block_[0]->Op(i)->OutputArgumentNames()) { for (auto &oname : grad_block_[0]->Op(i)->OutputArgumentNames()) {
...@@ -231,7 +231,7 @@ class WhileGradOpDescMaker : public framework::SingleGradOpDescMaker { ...@@ -231,7 +231,7 @@ class WhileGradOpDescMaker : public framework::SingleGradOpDescMaker {
} }
} }
grad->SetOutput(framework::GradVarName(kParameters), igs); grad->SetOutput(framework::GradVarName(kX), igs);
grad->SetInput(kOutputs, Output(kOutputs)); grad->SetInput(kOutputs, Output(kOutputs));
...@@ -240,7 +240,7 @@ class WhileGradOpDescMaker : public framework::SingleGradOpDescMaker { ...@@ -240,7 +240,7 @@ class WhileGradOpDescMaker : public framework::SingleGradOpDescMaker {
std::unordered_set<std::string> block_ins; std::unordered_set<std::string> block_ins;
auto *fwd_block = this->grad_block_[0]->ParentBlock(); auto *fwd_block = this->grad_block_[0]->ParentBlock();
{ {
for (auto &p : Input(kParameters)) { for (auto &p : Input(kX)) {
block_ins.insert(p); block_ins.insert(p);
} }
for (auto &o : Output(kOutputs)) { for (auto &o : Output(kOutputs)) {
...@@ -288,8 +288,8 @@ class WhileGradOpVarTypeInference : public framework::VarTypeInference { ...@@ -288,8 +288,8 @@ class WhileGradOpVarTypeInference : public framework::VarTypeInference {
public: public:
void operator()(const framework::OpDesc &op_desc, void operator()(const framework::OpDesc &op_desc,
framework::BlockDesc *block) const override { framework::BlockDesc *block) const override {
auto p_names = op_desc.Input(kParameters); auto p_names = op_desc.Input(kX);
auto pg_names = op_desc.Output(framework::GradVarName(kParameters)); auto pg_names = op_desc.Output(framework::GradVarName(kX));
for (size_t i = 0; i < p_names.size(); ++i) { for (size_t i = 0; i < p_names.size(); ++i) {
auto &p_var = detail::Ref(block->FindVarRecursive(p_names[i])); auto &p_var = detail::Ref(block->FindVarRecursive(p_names[i]));
...@@ -307,21 +307,21 @@ class WhileGradOpVarTypeInference : public framework::VarTypeInference { ...@@ -307,21 +307,21 @@ class WhileGradOpVarTypeInference : public framework::VarTypeInference {
class WhileGradOpShapeInference : public framework::InferShapeBase { class WhileGradOpShapeInference : public framework::InferShapeBase {
public: public:
void operator()(framework::InferShapeContext *ctx) const override { void operator()(framework::InferShapeContext *ctx) const override {
ctx->HasInputs(kParameters); ctx->HasInputs(kX);
ctx->HasOutputs(framework::GradVarName(kParameters)); ctx->HasOutputs(framework::GradVarName(kX));
ctx->HasInputs(kOutputs); ctx->HasInputs(kOutputs);
ctx->HasInputs(framework::GradVarName(kOutputs)); ctx->HasInputs(framework::GradVarName(kOutputs));
auto p_names = ctx->Inputs(kParameters); auto p_names = ctx->Inputs(kX);
auto pg_names = ctx->Outputs(kParamGrads); auto pg_names = ctx->Outputs(kXGRAD);
auto var_types = ctx->GetInputsVarType(kParameters); auto var_types = ctx->GetInputsVarType(kX);
std::vector<std::string> names_to_set; std::vector<std::string> names_to_set;
std::vector<framework::DDim> dims_to_set; std::vector<framework::DDim> dims_to_set;
for (size_t i = 0; i < p_names.size(); ++i) { for (size_t i = 0; i < p_names.size(); ++i) {
if (pg_names[i] == framework::kEmptyVarName) { if (pg_names[i] == framework::kEmptyVarName) {
continue; continue;
} }
auto dims = ctx->GetInputsElementDim(kParameters, i); auto dims = ctx->GetInputsElementDim(kX, i);
if (var_types[i] == framework::proto::VarDesc::LOD_TENSOR) { if (var_types[i] == framework::proto::VarDesc::LOD_TENSOR) {
names_to_set.push_back(pg_names[i]); names_to_set.push_back(pg_names[i]);
dims_to_set.push_back(dims); dims_to_set.push_back(dims);
......
...@@ -21,10 +21,16 @@ ELSE() ...@@ -21,10 +21,16 @@ ELSE()
set(GPU_CTX_DEPS) set(GPU_CTX_DEPS)
ENDIF() ENDIF()
IF(WITH_MKLDNN)
set(MKLDNN_CTX_DEPS mkldnn)
ELSE()
set(MKLDNN_CTX_DEPS)
ENDIF()
# memcpy depends on device_context, here add deps individually for # memcpy depends on device_context, here add deps individually for
# avoiding cyclic dependencies # avoiding cyclic dependencies
cc_library(device_context SRCS device_context.cc DEPS memory buddy_allocator cc_library(device_context SRCS device_context.cc DEPS memory buddy_allocator
system_allocator memory_block meta_data meta_cache place eigen3 ${GPU_CTX_DEPS}) system_allocator memory_block meta_data meta_cache place eigen3 ${GPU_CTX_DEPS} ${MKLDNN_CTX_DEPS})
nv_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_info) nv_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_info)
nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda) nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda)
......
...@@ -127,15 +127,21 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : place_(place) { ...@@ -127,15 +127,21 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : place_(place) {
eigen_device_.reset(new Eigen::GpuDevice(eigen_stream_.get())); eigen_device_.reset(new Eigen::GpuDevice(eigen_stream_.get()));
PADDLE_ENFORCE(dynload::cublasCreate(&cublas_handle_)); PADDLE_ENFORCE(dynload::cublasCreate(&cublas_handle_));
PADDLE_ENFORCE(dynload::cublasSetStream(cublas_handle_, stream_)); PADDLE_ENFORCE(dynload::cublasSetStream(cublas_handle_, stream_));
PADDLE_ENFORCE(dynload::cudnnCreate(&cudnn_handle_)); if (dynload::HasCUDNN()) {
PADDLE_ENFORCE(dynload::cudnnSetStream(cudnn_handle_, stream_)); PADDLE_ENFORCE(dynload::cudnnCreate(&cudnn_handle_));
PADDLE_ENFORCE(dynload::cudnnSetStream(cudnn_handle_, stream_));
} else {
cudnn_handle_ = nullptr;
}
} }
CUDADeviceContext::~CUDADeviceContext() { CUDADeviceContext::~CUDADeviceContext() {
SetDeviceId(place_.device); SetDeviceId(place_.device);
Wait(); Wait();
PADDLE_ENFORCE(dynload::cublasDestroy(cublas_handle_)); PADDLE_ENFORCE(dynload::cublasDestroy(cublas_handle_));
PADDLE_ENFORCE(dynload::cudnnDestroy(cudnn_handle_)); if (cudnn_handle_ != nullptr) {
PADDLE_ENFORCE(dynload::cudnnDestroy(cudnn_handle_));
}
eigen_stream_.reset(); eigen_stream_.reset();
eigen_device_.reset(); eigen_device_.reset();
PADDLE_ENFORCE(cudaStreamDestroy(stream_)); PADDLE_ENFORCE(cudaStreamDestroy(stream_));
...@@ -160,19 +166,69 @@ cudnnHandle_t CUDADeviceContext::cudnn_handle() const { return cudnn_handle_; } ...@@ -160,19 +166,69 @@ cudnnHandle_t CUDADeviceContext::cudnn_handle() const { return cudnn_handle_; }
cudaStream_t CUDADeviceContext::stream() const { return stream_; } cudaStream_t CUDADeviceContext::stream() const { return stream_; }
CUDNNDeviceContext::CUDNNDeviceContext(CUDAPlace place) #endif
: CUDADeviceContext(place) {
PADDLE_ENFORCE(dynload::cudnnCreate(&cudnn_handle_)); #ifdef PADDLE_WITH_MKLDNN
PADDLE_ENFORCE(dynload::cudnnSetStream(cudnn_handle_, stream())); MKLDNNDeviceContext::MKLDNNDeviceContext(CPUPlace place)
: CPUDeviceContext(place), ready_(false) {
stream_.reset(new mkldnn::stream(mkldnn::stream::kind::eager));
engine_.reset(new mkldnn::engine(mkldnn::engine::cpu, 0));
} }
CUDNNDeviceContext::~CUDNNDeviceContext() { template <typename T>
SetDeviceId(boost::get<CUDAPlace>(GetPlace()).device); void MKLDNNDeviceContext::AddElement(const std::string& op_key,
Wait(); const T& value) {
PADDLE_ENFORCE(dynload::cudnnDestroy(cudnn_handle_)); if (GetElement<T>(op_key)) {
return;
}
GetElementPool<T>().emplace(op_key, std::move(value));
}
template <typename T>
const T& MKLDNNDeviceContext::GetElement(const std::string& op_key) const {
auto it = GetElementPool<T>().find(op_key);
return it == GetElementPool<T>().end() ? nullptr : it->second;
}
template <>
const std::unordered_map<const std::string, const MKLDNNMemoryPtr,
std::hash<std::string>>&
MKLDNNDeviceContext::GetElementPool<MKLDNNMemoryPtr>() const {
return memory_pool_;
} }
cudnnHandle_t CUDNNDeviceContext::cudnn_handle() const { return cudnn_handle_; } template <>
const std::unordered_map<const std::string, const MKLDNNPrimitivePtr,
std::hash<std::string>>&
MKLDNNDeviceContext::GetElementPool<MKLDNNPrimitivePtr>() const {
return primitive_pool_;
}
template <>
const std::unordered_map<const std::string, const MKLDNNPrimitiveDescPtr,
std::hash<std::string>>&
MKLDNNDeviceContext::GetElementPool<MKLDNNPrimitiveDescPtr>() const {
return primitive_desc_pool_;
}
void MKLDNNDeviceContext::Execute(bool block) {
if (pipeline_.empty()) {
return;
}
ResetStream();
stream_->submit(pipeline_).wait(block);
ready_ = false;
pipeline_.clear();
}
void MKLDNNDeviceContext::ResetStream() {
if (ready_) {
return;
}
  // TODO(TJ): change me when mkldnn has a specific method to reset this state
stream_.reset(new mkldnn::stream(mkldnn::stream::kind::eager));
ready_ = true;
}
#endif #endif
......
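A minimal sketch of the call pattern this new context is designed for (the Submit and Execute signatures are taken from the header diffed next; the primitive p is assumed to have been built elsewhere, e.g. via the pools above):
#include "paddle/platform/device_context.h"

// Queue primitives on the context, then run the whole pipeline at once.
void RunPipeline(paddle::platform::MKLDNNDeviceContext* ctx,
                 const paddle::platform::MKLDNNPrimitivePtr& p) {
  ctx->Submit(p);                // push into pipeline_
  ctx->Execute(/*block=*/true);  // submit to the stream and wait
}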
...@@ -21,6 +21,10 @@ limitations under the License. */ ...@@ -21,6 +21,10 @@ limitations under the License. */
#define EIGEN_USE_GPU #define EIGEN_USE_GPU
#endif #endif
#ifdef PADDLE_WITH_MKLDNN
#include "paddle/platform/mkldnn_helper.h"
#endif
#include "paddle/platform/enforce.h" #include "paddle/platform/enforce.h"
#include "paddle/platform/place.h" #include "paddle/platform/place.h"
#include "unsupported/Eigen/CXX11/Tensor" #include "unsupported/Eigen/CXX11/Tensor"
...@@ -103,18 +107,54 @@ struct DefaultDeviceContextType<platform::CUDAPlace> { ...@@ -103,18 +107,54 @@ struct DefaultDeviceContextType<platform::CUDAPlace> {
using TYPE = CUDADeviceContext; using TYPE = CUDADeviceContext;
}; };
class CUDNNDeviceContext : public CUDADeviceContext { #endif
#ifdef PADDLE_WITH_MKLDNN
class MKLDNNDeviceContext : public CPUDeviceContext {
public: public:
explicit CUDNNDeviceContext(CUDAPlace place); explicit MKLDNNDeviceContext(CPUPlace place);
virtual ~CUDNNDeviceContext();
/*! \brief Return cudnn handle in the device context. */ /* \brief Add new element: memory, primitive or primitive desc */
cudnnHandle_t cudnn_handle() const; template <typename T>
void AddElement(const std::string& op_key, const T& value);
/* \brief Get existed element: memory, primitive or primitive desc */
template <typename T>
const T& GetElement(const std::string& op_key) const;
/* \brief Get element pool: memory, primitive or primitive desc pool */
template <typename T>
const std::unordered_map<const std::string, const T, std::hash<std::string>>&
GetElementPool() const;
/* \brief Get the active engine */
const MKLDNNEngine& engine() const { return *engine_; }
/* \brief Submit primitive to pipeline */
void Submit(const MKLDNNPrimitivePtr& p) { pipeline_.push_back(*p); }
/*! \brief Execute all submitted primitives in pipeline */
void Execute(bool block = true);
protected:
  /*! \brief Reset the stream to prepare for the next execution */
void ResetStream();
private: private:
cudnnHandle_t cudnn_handle_; std::unordered_map<const std::string, const MKLDNNMemoryPtr,
std::hash<std::string>>
memory_pool_;
std::unordered_map<const std::string, const MKLDNNPrimitivePtr,
std::hash<std::string>>
primitive_pool_;
std::unordered_map<const std::string, const MKLDNNPrimitiveDescPtr,
std::hash<std::string>>
primitive_desc_pool_;
std::vector<MKLDNNPrimitive> pipeline_;
MKLDNNStreamPtr stream_;
MKLDNNEnginePtr engine_;
bool ready_;
}; };
#endif #endif
/*! \brief device context pool singleton */ /*! \brief device context pool singleton */
...@@ -151,7 +191,7 @@ class DeviceContextPool { ...@@ -151,7 +191,7 @@ class DeviceContextPool {
struct Hash { struct Hash {
std::hash<int> hash_; std::hash<int> hash_;
size_t operator()(const platform::Place& place) const { size_t operator()(const platform::Place& place) const {
int pre_hash = place.which() + (1 << LEFT_SHIFT); int pre_hash = place.which() << LEFT_SHIFT;
if (platform::is_gpu_place(place)) { if (platform::is_gpu_place(place)) {
pre_hash += boost::get<platform::CUDAPlace>(place).GetDeviceId(); pre_hash += boost::get<platform::CUDAPlace>(place).GetDeviceId();
} }
......
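The one-character change in Hash above fixes a real collision: with the old which() + (1 << LEFT_SHIFT) form, a device id could cancel out the difference between two variant tags. A standalone sketch with toy constants (the real LEFT_SHIFT value is not shown in this diff):
#include <cassert>

int main() {
  const int kShift = 6;          // toy stand-in for LEFT_SHIFT
  const int kGPU = 0, kCPU = 1;  // toy variant indices, i.e. place.which()

  // Old scheme: which + (1 << kShift) + device_id.
  // CUDAPlace(1) hashes the same as CPUPlace:
  assert(kGPU + (1 << kShift) + 1 == kCPU + (1 << kShift));

  // New scheme: (which << kShift) + device_id keeps the tag in the high bits,
  // so places stay distinct as long as device_id < (1 << kShift).
  assert((kGPU << kShift) + 1 != (kCPU << kShift));
  return 0;
}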
...@@ -49,21 +49,6 @@ TEST(Device, CUDADeviceContext) { ...@@ -49,21 +49,6 @@ TEST(Device, CUDADeviceContext) {
} }
} }
TEST(Device, CUDNNDeviceContext) {
using paddle::platform::CUDNNDeviceContext;
using paddle::platform::CUDAPlace;
if (paddle::platform::dynload::HasCUDNN()) {
int count = paddle::platform::GetCUDADeviceCount();
for (int i = 0; i < count; ++i) {
CUDNNDeviceContext* device_context = new CUDNNDeviceContext(CUDAPlace(i));
cudnnHandle_t cudnn_handle = device_context->cudnn_handle();
ASSERT_NE(nullptr, cudnn_handle);
ASSERT_NE(nullptr, device_context->stream());
delete device_context;
}
}
}
TEST(Device, DeviceContextPool) { TEST(Device, DeviceContextPool) {
using paddle::platform::DeviceContextPool; using paddle::platform::DeviceContextPool;
using paddle::platform::CUDADeviceContext; using paddle::platform::CUDADeviceContext;
......
cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags enforce) cc_library(dynamic_loader SRCS dynamic_loader.cc DEPS glog gflags enforce)
nv_library(dynload_cuda SRCS cublas.cc cudnn.cc curand.cc nccl.cc nv_library(dynload_cuda SRCS cublas.cc cudnn.cc curand.cc nccl.cc
DEPS dynamic_loader nccl) DEPS dynamic_loader nccl)
cc_library(dynload_warpctc SRCS warpctc.cc DEPS dynamic_loader warpctc)
...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <paddle/platform/dynload/cublas.h> #include "paddle/platform/dynload/cublas.h"
namespace paddle { namespace paddle {
namespace platform { namespace platform {
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/platform/dynload/warpctc.h"
namespace paddle {
namespace platform {
namespace dynload {
std::once_flag warpctc_dso_flag;
void* warpctc_dso_handle = nullptr;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
WARPCTC_ROUTINE_EACH(DEFINE_WRAP);
} // namespace dynload
} // namespace platform
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <dlfcn.h>
#include <mutex>
#include "ctc.h"
#include "paddle/platform/dynload/dynamic_loader.h"
namespace paddle {
namespace platform {
namespace dynload {
extern std::once_flag warpctc_dso_flag;
extern void* warpctc_dso_handle;
/**
* The following macro definition can generate structs
 * (for each function) to dynamically load warpctc routines
* via operator overloading.
*/
#define DYNAMIC_LOAD_WARPCTC_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
auto operator()(Args... args) -> decltype(__name(args...)) { \
using warpctcFunc = decltype(__name(args...)) (*)(Args...); \
std::call_once(warpctc_dso_flag, \
paddle::platform::dynload::GetWarpCTCDsoHandle, \
&warpctc_dso_handle); \
void* p_##_name = dlsym(warpctc_dso_handle, #__name); \
return reinterpret_cast<warpctcFunc>(p_##_name)(args...); \
} \
}; \
extern DynLoad__##__name __name
#define DECLARE_DYNAMIC_LOAD_WARPCTC_WRAP(__name) \
DYNAMIC_LOAD_WARPCTC_WRAP(__name)
#define WARPCTC_ROUTINE_EACH(__macro) \
__macro(get_warpctc_version); \
__macro(ctcGetStatusString); \
__macro(compute_ctc_loss); \
__macro(get_workspace_size)
WARPCTC_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_WARPCTC_WRAP);
#undef DYNAMIC_LOAD_WARPCTC_WRAP
} // namespace dynload
} // namespace platform
} // namespace paddle
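For readers unfamiliar with this dynamic-loading idiom, DYNAMIC_LOAD_WARPCTC_WRAP(get_warpctc_version) expands to roughly the following struct (expansion reconstructed by hand; note that p_##_name pastes to the literal token p__name, which compiles because both occurrences agree):
struct DynLoad__get_warpctc_version {
  template <typename... Args>
  auto operator()(Args... args) -> decltype(get_warpctc_version(args...)) {
    using warpctcFunc = decltype(get_warpctc_version(args...)) (*)(Args...);
    // Open the warp-ctc shared library exactly once per process.
    std::call_once(warpctc_dso_flag,
                   paddle::platform::dynload::GetWarpCTCDsoHandle,
                   &warpctc_dso_handle);
    // Resolve the symbol and forward the call through a typed pointer.
    void* p__name = dlsym(warpctc_dso_handle, "get_warpctc_version");
    return reinterpret_cast<warpctcFunc>(p__name)(args...);
  }
};
extern DynLoad__get_warpctc_version get_warpctc_version;
The DEFINE_WRAP counterpart in warpctc.cc then instantiates one such object per routine, so call sites can simply write platform::dynload::compute_ctc_loss(...) and the symbol is resolved lazily on first use.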
/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <mkldnn.hpp>
namespace paddle {
namespace platform {
using MKLDNNStream = mkldnn::stream;
using MKLDNNEngine = mkldnn::engine;
using MKLDNNMemory = mkldnn::memory;
using MKLDNNPrimitive = mkldnn::primitive;
using MKLDNNPrimitiveDesc = mkldnn::handle<mkldnn_primitive_desc_t>;
typedef std::unique_ptr<MKLDNNStream> MKLDNNStreamPtr;
typedef std::unique_ptr<MKLDNNEngine> MKLDNNEnginePtr;
typedef std::unique_ptr<MKLDNNMemory> MKLDNNMemoryPtr;
typedef std::unique_ptr<MKLDNNPrimitive> MKLDNNPrimitivePtr;
typedef std::unique_ptr<MKLDNNPrimitiveDesc> MKLDNNPrimitiveDescPtr;
} // namespace platform
} // namespace paddle
...@@ -51,6 +51,18 @@ bool places_are_same_class(const Place &p1, const Place &p2) { ...@@ -51,6 +51,18 @@ bool places_are_same_class(const Place &p1, const Place &p2) {
return p1.which() == p2.which(); return p1.which() == p2.which();
} }
bool is_same_place(const Place &p1, const Place &p2) {
if (places_are_same_class(p1, p2)) {
if (is_cpu_place(p1)) {
return true;
} else {
return boost::get<CUDAPlace>(p1) == boost::get<CUDAPlace>(p2);
}
} else {
return false;
}
}
std::ostream &operator<<(std::ostream &os, const Place &p) { std::ostream &operator<<(std::ostream &os, const Place &p) {
detail::PlacePrinter printer(os); detail::PlacePrinter printer(os);
boost::apply_visitor(printer, p); boost::apply_visitor(printer, p);
......
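A quick contrast between the two predicates (a sketch; assumes the CUDAPlace(int) constructor used elsewhere in this commit):
#include <cassert>
#include "paddle/platform/place.h"

int main() {
  paddle::platform::CUDAPlace gpu0(0), gpu1(1);
  // Same variant class, but different devices:
  assert(paddle::platform::places_are_same_class(gpu0, gpu1));
  assert(!paddle::platform::is_same_place(gpu0, gpu1));
  return 0;
}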
...@@ -52,6 +52,8 @@ struct IsCUDAPlace : public boost::static_visitor<bool> { ...@@ -52,6 +52,8 @@ struct IsCUDAPlace : public boost::static_visitor<bool> {
typedef boost::variant<CUDAPlace, CPUPlace> Place; typedef boost::variant<CUDAPlace, CPUPlace> Place;
using PlaceList = std::vector<Place>;
void set_place(const Place &); void set_place(const Place &);
const Place &get_place(); const Place &get_place();
...@@ -61,6 +63,7 @@ const CPUPlace default_cpu(); ...@@ -61,6 +63,7 @@ const CPUPlace default_cpu();
bool is_gpu_place(const Place &); bool is_gpu_place(const Place &);
bool is_cpu_place(const Place &); bool is_cpu_place(const Place &);
bool places_are_same_class(const Place &, const Place &); bool places_are_same_class(const Place &, const Place &);
bool is_same_place(const Place &, const Place &);
std::ostream &operator<<(std::ostream &, const Place &); std::ostream &operator<<(std::ostream &, const Place &);
......
...@@ -13,12 +13,17 @@ See the License for the specific language governing permissions and ...@@ -13,12 +13,17 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/platform/profiler.h" #include "paddle/platform/profiler.h"
#include <iomanip>
#include <map>
#include "glog/logging.h"
namespace paddle { namespace paddle {
namespace platform { namespace platform {
// The profiler state, the initial value is ProfilerState::kDisabled // The profiler state, the initial value is ProfilerState::kDisabled
static ProfilerState g_state = ProfilerState::kDisabled; static ProfilerState g_state = ProfilerState::kDisabled;
// To record which timer the profiler uses, CUDA or CPU.
static std::string g_profiler_place = "";
// The thread local event list can only be accessed by the specific thread // The thread local event list can only be accessed by the specific thread
// The thread index of each thread // The thread index of each thread
static thread_local int32_t g_thread_id; static thread_local int32_t g_thread_id;
...@@ -43,10 +48,7 @@ inline uint64_t GetTimeInNsec() { ...@@ -43,10 +48,7 @@ inline uint64_t GetTimeInNsec() {
Event::Event(EventKind kind, std::string name, uint32_t thread_id, Event::Event(EventKind kind, std::string name, uint32_t thread_id,
DeviceContext* dev_ctx) DeviceContext* dev_ctx)
: kind_(kind), : kind_(kind), name_(name), thread_id_(thread_id), has_cuda_(false) {
name_(std::move(name)),
thread_id_(thread_id),
has_cuda_(false) {
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
auto* cuda_dev_ctx = static_cast<const CUDADeviceContext*>(dev_ctx); auto* cuda_dev_ctx = static_cast<const CUDADeviceContext*>(dev_ctx);
if (cuda_dev_ctx) { if (cuda_dev_ctx) {
...@@ -72,11 +74,11 @@ std::string Event::kind() const { ...@@ -72,11 +74,11 @@ std::string Event::kind() const {
PADDLE_THROW("Unknown EventKind."); PADDLE_THROW("Unknown EventKind.");
} }
double Event::CpuElapsedUs(const Event& e) const { double Event::CpuElapsedMs(const Event& e) const {
return (e.cpu_ns_ - cpu_ns_) / (1000.0); return (e.cpu_ns_ - cpu_ns_) / (1000000.0);
} }
double Event::CudaElapsedUs(const Event& e) const { double Event::CudaElapsedMs(const Event& e) const {
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
PADDLE_ENFORCE(e.has_cuda() && has_cuda()); PADDLE_ENFORCE(e.has_cuda() && has_cuda());
PADDLE_ENFORCE(e.device() == device()); PADDLE_ENFORCE(e.device() == device());
...@@ -84,7 +86,7 @@ double Event::CudaElapsedUs(const Event& e) const { ...@@ -84,7 +86,7 @@ double Event::CudaElapsedUs(const Event& e) const {
PADDLE_ENFORCE(cudaEventSynchronize(e.event())); PADDLE_ENFORCE(cudaEventSynchronize(e.event()));
float ms; float ms;
PADDLE_ENFORCE(cudaEventElapsedTime(&ms, event_, e.event())); PADDLE_ENFORCE(cudaEventElapsedTime(&ms, event_, e.event()));
return ms * 1000.0; return ms;
#else #else
PADDLE_THROW("CUDA is not enabled"); PADDLE_THROW("CUDA is not enabled");
#endif #endif
...@@ -113,21 +115,27 @@ inline EventList& GetEventList() { ...@@ -113,21 +115,27 @@ inline EventList& GetEventList() {
} }
void Mark(const std::string& name, DeviceContext* dev_ctx) { void Mark(const std::string& name, DeviceContext* dev_ctx) {
GetEventList().Record(EventKind::kMark, std::move(name), g_thread_id, GetEventList().Record(EventKind::kMark, name, g_thread_id, dev_ctx);
dev_ctx); }
void PushEvent(const std::string& name, DeviceContext* dev_ctx) {
GetEventList().Record(EventKind::kPushRange, name, g_thread_id, dev_ctx);
}
void PopEvent(const std::string& name, DeviceContext* dev_ctx) {
GetEventList().Record(EventKind::kPopRange, name, g_thread_id, dev_ctx);
} }
RecordEvent::RecordEvent(const std::string& name, DeviceContext* dev_ctx) { RecordEvent::RecordEvent(const std::string& name, DeviceContext* dev_ctx) {
if (g_state == ProfilerState::kDisabled) return; if (g_state == ProfilerState::kDisabled) return;
dev_ctx_ = dev_ctx; dev_ctx_ = dev_ctx;
GetEventList().Record(EventKind::kPushRange, std::move(name), g_thread_id, name_ = name;
dev_ctx_); PushEvent(name_, dev_ctx_);
} }
RecordEvent::~RecordEvent() { RecordEvent::~RecordEvent() {
if (g_state == ProfilerState::kDisabled) return; if (g_state == ProfilerState::kDisabled) return;
GetEventList().Record(EventKind::kPopRange, std::string(), g_thread_id, PopEvent(name_, dev_ctx_);
dev_ctx_);
} }
void EnableProfiler(ProfilerState state) { void EnableProfiler(ProfilerState state) {
...@@ -138,6 +146,7 @@ void EnableProfiler(ProfilerState state) { ...@@ -138,6 +146,7 @@ void EnableProfiler(ProfilerState state) {
"The profiling state should be disabled when calling ", "The profiling state should be disabled when calling ",
"EnableProfiler."); "EnableProfiler.");
g_state = state; g_state = state;
g_profiler_place = (g_state == ProfilerState::kCUDA) ? "CUDA" : "CPU";
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
if (g_state == ProfilerState::kCUDA) { if (g_state == ProfilerState::kCUDA) {
// Generate some dummy events first to reduce the startup overhead. // Generate some dummy events first to reduce the startup overhead.
...@@ -169,5 +178,152 @@ std::vector<std::vector<Event>> DisableProfiler() { ...@@ -169,5 +178,152 @@ std::vector<std::vector<Event>> DisableProfiler() {
return result; return result;
} }
void ParseEvents(std::vector<std::vector<Event>>& events,
EventSortingKey sorted_by) {
if (g_profiler_place == "") return;
std::string sorted_domain;
std::function<bool(const EventItem&, const EventItem&)> sorted_func;
switch (sorted_by) {
case EventSortingKey::kCalls:
sorted_domain = "number of calls";
sorted_func = [](const EventItem& a, const EventItem& b) {
return a.calls > b.calls;
};
break;
case EventSortingKey::kTotal:
sorted_domain = "total time";
sorted_func = [](const EventItem& a, const EventItem& b) {
return a.total_time > b.total_time;
};
break;
case EventSortingKey::kMin:
sorted_domain = "minimum time";
sorted_func = [](const EventItem& a, const EventItem& b) {
return a.min_time > b.min_time;
};
break;
case EventSortingKey::kMax:
sorted_domain = "maximum time";
sorted_func = [](const EventItem& a, const EventItem& b) {
return a.max_time > b.max_time;
};
break;
case EventSortingKey::kAve:
sorted_domain = "average time";
sorted_func = [](const EventItem& a, const EventItem& b) {
return a.ave_time > b.ave_time;
};
break;
default:
sorted_domain = "event end time";
}
std::vector<std::vector<EventItem>> events_table;
size_t max_name_width = 0;
for (size_t i = 0; i < events.size(); i++) {
std::list<Event> pushed_events;
std::vector<EventItem> event_items;
std::unordered_map<std::string, int> event_idx;
for (size_t j = 0; j < events[i].size(); j++) {
if (events[i][j].kind() == "push") {
pushed_events.push_back(events[i][j]);
} else if (events[i][j].kind() == "pop") {
std::list<Event>::reverse_iterator rit = pushed_events.rbegin();
while (rit != pushed_events.rend() &&
rit->name() != events[i][j].name()) {
++rit;
}
if (rit != pushed_events.rend()) {
double event_time = (g_profiler_place == "CUDA")
? rit->CudaElapsedMs(events[i][j])
: rit->CpuElapsedMs(events[i][j]);
std::string event_name =
"thread" + std::to_string(rit->thread_id()) + "::" + rit->name();
max_name_width = std::max(max_name_width, event_name.size());
if (event_idx.find(event_name) == event_idx.end()) {
event_idx[event_name] = event_items.size();
EventItem event_item = {event_name, 1, event_time,
event_time, event_time, event_time};
event_items.push_back(event_item);
} else {
int index = event_idx[event_name];
event_items[index].calls += 1;
// total time
event_items[index].total_time += event_time;
// min time
event_items[index].min_time =
std::min(event_time, event_items[index].min_time);
// max time
event_items[index].max_time =
std::max(event_time, event_items[index].max_time);
}
// remove the push marker from the list
pushed_events.erase((++rit).base());
} else {
LOG(WARNING) << "Cannot find the push marker of event \'"
<< events[i][j].name()
<< "\', which will be ignored in profiling report.";
}
}
}
// average time
for (auto& item : event_items) {
item.ave_time = item.total_time / item.calls;
}
// sort
if (sorted_by != EventSortingKey::kDefault) {
std::sort(event_items.begin(), event_items.end(), sorted_func);
}
events_table.push_back(event_items);
// log warning if there are events with `push` but without `pop`
std::list<Event>::reverse_iterator rit = pushed_events.rbegin();
while (rit != pushed_events.rend()) {
LOG(WARNING) << "Cannot find the pop marker of event \'" << rit->name()
<< "\', which will be ignored in profiling report.";
++rit;
}
}
// Print report
PrintProfilingReport(events_table, sorted_domain, max_name_width + 4, 12);
}
void PrintProfilingReport(std::vector<std::vector<EventItem>>& events_table,
std::string& sorted_domain, const size_t name_width,
const size_t data_width) {
// Output header information
std::cout << "\n------------------------->"
<< " Profiling Report "
<< "<-------------------------\n\n";
std::cout << "Place: " << g_profiler_place << std::endl;
std::cout << "Time unit: ms" << std::endl;
std::cout << "Sorted by " << sorted_domain
<< " in descending order in the same thread\n\n";
// Output events table
std::cout.setf(std::ios::left);
std::cout << std::setw(name_width) << "Event" << std::setw(data_width)
<< "Calls" << std::setw(data_width) << "Total"
<< std::setw(data_width) << "Min." << std::setw(data_width)
<< "Max." << std::setw(data_width) << "Ave." << std::endl;
for (size_t i = 0; i < events_table.size(); ++i) {
for (size_t j = 0; j < events_table[i].size(); ++j) {
EventItem& event_item = events_table[i][j];
std::cout << std::setw(name_width) << event_item.name
<< std::setw(data_width) << event_item.calls
<< std::setw(data_width) << event_item.total_time
<< std::setw(data_width) << event_item.min_time
<< std::setw(data_width) << event_item.max_time
<< std::setw(data_width) << event_item.ave_time << std::endl;
}
}
std::cout << std::endl;
}
} // namespace platform } // namespace platform
} // namespace paddle } // namespace paddle
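With the formatting above, a run profiled on CPU and sorted by total time would print something like this (numbers invented for illustration):

-------------------------> Profiling Report <-------------------------

Place: CPU
Time unit: ms
Sorted by total time in descending order in the same thread

Event                Calls       Total       Min.        Max.        Ave.
thread0::op_4        3           1.62        0.51        0.58        0.54
thread0::op_2        3           0.83        0.26        0.31        0.28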
...@@ -33,6 +33,7 @@ class Event { ...@@ -33,6 +33,7 @@ class Event {
std::string kind() const; std::string kind() const;
std::string name() const { return name_; } std::string name() const { return name_; }
uint32_t thread_id() const { return thread_id_; }
bool has_cuda() const { return has_cuda_; } bool has_cuda() const { return has_cuda_; }
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
...@@ -40,8 +41,8 @@ class Event { ...@@ -40,8 +41,8 @@ class Event {
int device() const { return device_; } int device() const { return device_; }
#endif #endif
double CpuElapsedUs(const Event& e) const; double CpuElapsedMs(const Event& e) const;
double CudaElapsedUs(const Event& e) const; double CudaElapsedMs(const Event& e) const;
private: private:
EventKind kind_; EventKind kind_;
...@@ -94,6 +95,10 @@ enum ProfilerState { ...@@ -94,6 +95,10 @@ enum ProfilerState {
void Mark(const std::string& name, DeviceContext* dev_ctx); void Mark(const std::string& name, DeviceContext* dev_ctx);
void PushEvent(const std::string& name, DeviceContext* dev_ctx);
void PopEvent(const std::string& name, DeviceContext* dev_ctx);
struct RecordEvent { struct RecordEvent {
explicit RecordEvent(const std::string& name, DeviceContext* dev_ctx); explicit RecordEvent(const std::string& name, DeviceContext* dev_ctx);
...@@ -101,6 +106,8 @@ struct RecordEvent { ...@@ -101,6 +106,8 @@ struct RecordEvent {
// The device context is used by Event to get the current cuda stream. // The device context is used by Event to get the current cuda stream.
DeviceContext* dev_ctx_; DeviceContext* dev_ctx_;
// Event name
std::string name_;
}; };
// Enable the profiling function. // Enable the profiling function.
...@@ -110,5 +117,26 @@ void EnableProfiler(ProfilerState state); ...@@ -110,5 +117,26 @@ void EnableProfiler(ProfilerState state);
// event_lists, event_lists[i][j] represents the j-th Event of i-th thread. // event_lists, event_lists[i][j] represents the j-th Event of i-th thread.
std::vector<std::vector<Event>> DisableProfiler(); std::vector<std::vector<Event>> DisableProfiler();
// The information of each event given in the profiling report
struct EventItem {
std::string name;
int calls;
double total_time;
double min_time;
double max_time;
double ave_time;
};
// Candidate keys to sort the profiling report
enum EventSortingKey { kDefault, kCalls, kTotal, kMin, kMax, kAve };
// Parse the event list and output the profiling report
void ParseEvents(std::vector<std::vector<Event>>&,
EventSortingKey sorted_by = EventSortingKey::kDefault);
// Print results
void PrintProfilingReport(std::vector<std::vector<EventItem>>& events_table,
std::string& sorted_domain, const size_t name_width,
const size_t data_width);
} // namespace platform } // namespace platform
} // namespace paddle } // namespace paddle
...@@ -26,7 +26,7 @@ TEST(Event, CpuElapsedTime) { ...@@ -26,7 +26,7 @@ TEST(Event, CpuElapsedTime) {
counter++; counter++;
} }
Event stop_event(EventKind::kPopRange, "test", 0, nullptr); Event stop_event(EventKind::kPopRange, "test", 0, nullptr);
EXPECT_GT(start_event.CpuElapsedUs(stop_event), 0); EXPECT_GT(start_event.CpuElapsedMs(stop_event), 0);
} }
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
...@@ -45,7 +45,7 @@ TEST(Event, CudaElapsedTime) { ...@@ -45,7 +45,7 @@ TEST(Event, CudaElapsedTime) {
counter++; counter++;
} }
Event stop_event(EventKind::kPopRange, "test", 0, dev_ctx); Event stop_event(EventKind::kPopRange, "test", 0, dev_ctx);
EXPECT_GT(start_event.CudaElapsedUs(stop_event), 0); EXPECT_GT(start_event.CudaElapsedMs(stop_event), 0);
} }
#endif #endif
...@@ -55,6 +55,7 @@ TEST(RecordEvent, RecordEvent) { ...@@ -55,6 +55,7 @@ TEST(RecordEvent, RecordEvent) {
using paddle::platform::EventKind; using paddle::platform::EventKind;
using paddle::platform::RecordEvent; using paddle::platform::RecordEvent;
using paddle::platform::ProfilerState; using paddle::platform::ProfilerState;
using paddle::platform::EventSortingKey;
ProfilerState state = ProfilerState::kCPU; ProfilerState state = ProfilerState::kCPU;
DeviceContext* dev_ctx = nullptr; DeviceContext* dev_ctx = nullptr;
...@@ -67,13 +68,45 @@ TEST(RecordEvent, RecordEvent) { ...@@ -67,13 +68,45 @@ TEST(RecordEvent, RecordEvent) {
#endif #endif
EnableProfiler(state); EnableProfiler(state);
/* Usage 1:
* PushEvent(evt_name, dev_ctx);
* ...
* code to be analyzed
* ...
* PopEvent(evt_name, dev_ctx);
*/
for (int loop = 0; loop < 3; ++loop) {
for (int i = 1; i < 5; ++i) {
std::string name = "op_" + std::to_string(i);
PushEvent(name, dev_ctx);
int counter = 1;
while (counter != i * 1000) counter++;
PopEvent(name, dev_ctx);
}
}
/* Usage 2:
* {
* RecordEvent record_event(name, dev_ctx);
* ...
* code to be analyzed
* ...
* }
*/
for (int i = 1; i < 5; ++i) { for (int i = 1; i < 5; ++i) {
std::string name = "op_" + std::to_string(i); std::string name = "evs_op_" + std::to_string(i);
RecordEvent record_event(name, dev_ctx); RecordEvent record_event(name, dev_ctx);
int counter = 1; int counter = 1;
while (counter != i * 1000) counter++; while (counter != i * 1000) counter++;
} }
// Bad Usage:
PushEvent("event_without_pop", dev_ctx);
PopEvent("event_without_push", dev_ctx);
std::vector<std::vector<Event>> events = paddle::platform::DisableProfiler(); std::vector<std::vector<Event>> events = paddle::platform::DisableProfiler();
// Will remove parsing-related code from test later
ParseEvents(events, EventSortingKey::kTotal);
int cuda_startup_count = 0; int cuda_startup_count = 0;
int start_profiler_count = 0; int start_profiler_count = 0;
int stop_profiler_count = 0; int stop_profiler_count = 0;
...@@ -85,9 +118,9 @@ TEST(RecordEvent, RecordEvent) { ...@@ -85,9 +118,9 @@ TEST(RecordEvent, RecordEvent) {
if (events[i][j].name() == "push") { if (events[i][j].name() == "push") {
EXPECT_EQ(events[i][j + 1].name(), "pop"); EXPECT_EQ(events[i][j + 1].name(), "pop");
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
EXPECT_GT(events[i][j].CudaElapsedUs(events[i][j + 1]), 0); EXPECT_GT(events[i][j].CudaElapsedMs(events[i][j + 1]), 0);
#else #else
EXPECT_GT(events[i][j].CpuElapsedUs(events[i][j + 1]), 0); EXPECT_GT(events[i][j].CpuElapsedMs(events[i][j + 1]), 0);
#endif #endif
} }
} }
......
...@@ -23,11 +23,6 @@ void BindConstValue(pybind11::module& m) { ...@@ -23,11 +23,6 @@ void BindConstValue(pybind11::module& m) {
m.def("kTempVarName", [] { return framework::kTempVarName; }); m.def("kTempVarName", [] { return framework::kTempVarName; });
m.def("kGradVarSuffix", [] { return framework::kGradVarSuffix; }); m.def("kGradVarSuffix", [] { return framework::kGradVarSuffix; });
m.def("kZeroVarSuffix", [] { return framework::kZeroVarSuffix; }); m.def("kZeroVarSuffix", [] { return framework::kZeroVarSuffix; });
// for kernel_hint key
m.def("kUseCPU", [] { return framework::kUseCPU; });
m.def("kUseCUDNN", [] { return framework::kUseCUDNN; });
m.def("kUseMKLDNN", [] { return framework::kUseMKLDNN; });
} }
} // namespace pybind } // namespace pybind
......
...@@ -216,7 +216,7 @@ void BindVarDsec(py::module &m) { ...@@ -216,7 +216,7 @@ void BindVarDsec(py::module &m) {
.def("set_dtype", &VarDesc::SetDataType) .def("set_dtype", &VarDesc::SetDataType)
.def("shape", &VarDesc::Shape, py::return_value_policy::reference) .def("shape", &VarDesc::Shape, py::return_value_policy::reference)
.def("dtype", &VarDesc::GetDataType, py::return_value_policy::reference) .def("dtype", &VarDesc::GetDataType, py::return_value_policy::reference)
.def("lod_level", &VarDesc::GetLodLevel) .def("lod_level", &VarDesc::GetLoDLevel)
.def("set_lod_level", &VarDesc::SetLoDLevel) .def("set_lod_level", &VarDesc::SetLoDLevel)
.def("type", &VarDesc::GetType) .def("type", &VarDesc::GetType)
.def("set_type", &VarDesc::SetType) .def("set_type", &VarDesc::SetType)
...@@ -231,7 +231,8 @@ void BindVarDsec(py::module &m) { ...@@ -231,7 +231,8 @@ void BindVarDsec(py::module &m) {
.value("FETCH_LIST", proto::VarDesc::FETCH_LIST) .value("FETCH_LIST", proto::VarDesc::FETCH_LIST)
.value("STEP_SCOPES", proto::VarDesc::STEP_SCOPES) .value("STEP_SCOPES", proto::VarDesc::STEP_SCOPES)
.value("LOD_RANK_TABLE", proto::VarDesc::LOD_RANK_TABLE) .value("LOD_RANK_TABLE", proto::VarDesc::LOD_RANK_TABLE)
.value("LOD_TENSOR_ARRAY", proto::VarDesc::LOD_TENSOR_ARRAY); .value("LOD_TENSOR_ARRAY", proto::VarDesc::LOD_TENSOR_ARRAY)
.value("PLACE_LIST", proto::VarDesc::PLACE_LIST);
} }
void BindOpDesc(py::module &m) { void BindOpDesc(py::module &m) {
......
...@@ -427,8 +427,15 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -427,8 +427,15 @@ All parameter, weight, gradient are variables in Paddle.
m.def("unique_integer", UniqueIntegerGenerator); m.def("unique_integer", UniqueIntegerGenerator);
m.def("init_gflags", framework::InitGflags); m.def("init_gflags", framework::InitGflags);
m.def("init_glog", framework::InitGLOG);
m.def("init_devices", &framework::InitDevices); m.def("init_devices", &framework::InitDevices);
m.def("use_cpu", framework::UseCPU);
m.def("use_mkldnn", framework::UseMKLDNN);
m.def("use_cuda", framework::UseCUDA);
m.def("use_cudnn", framework::UseCUDNN);
m.def("use_all", framework::UseALL);
m.def("is_compile_gpu", IsCompileGPU); m.def("is_compile_gpu", IsCompileGPU);
m.def("set_feed_variable", framework::SetFeedVariable); m.def("set_feed_variable", framework::SetFeedVariable);
m.def("get_fetch_variable", framework::GetFetchVariable); m.def("get_fetch_variable", framework::GetFetchVariable);
......
...@@ -193,6 +193,16 @@ EOF ...@@ -193,6 +193,16 @@ EOF
EOF EOF
} }
function gen_capi_package() {
if [[ ${WITH_C_API} == "ON" ]]; then
install_prefix="/paddle/build/capi_output"
rm -rf $install_prefix
make DESTDIR="$install_prefix" install
cd $install_prefix/usr/local
ls | egrep -v "^Found.*item$" | xargs tar -cf /paddle/build/paddle.tgz
fi
}
set -xe set -xe
cmake_gen ${PYTHON_ABI:-""} cmake_gen ${PYTHON_ABI:-""}
...@@ -200,6 +210,11 @@ run_build ...@@ -200,6 +210,11 @@ run_build
run_test run_test
gen_docs gen_docs
gen_dockerfile gen_dockerfile
gen_capi_package
printf "If you need to install PaddlePaddle in develop docker image,"
printf "please make install or pip install build/python/dist/*.whl.\n" if [[ ${WITH_C_API:-OFF} == "ON" ]]; then
printf "PaddlePaddle C-API libraries was generated on build/paddle.tgz\n"
else
printf "If you need to install PaddlePaddle in develop docker image,"
printf "please make install or pip install build/python/dist/*.whl.\n"
fi
...@@ -29,6 +29,7 @@ DECLARE_bool(with_gpu); ...@@ -29,6 +29,7 @@ DECLARE_bool(with_gpu);
DECLARE_bool(parallel_nn); DECLARE_bool(parallel_nn);
DECLARE_string(config_args); DECLARE_string(config_args);
DECLARE_bool(use_mkldnn); DECLARE_bool(use_mkldnn);
DECLARE_bool(use_mkl_packed);
const char *kConfigParserModuleName = "paddle.trainer.config_parser"; const char *kConfigParserModuleName = "paddle.trainer.config_parser";
const char *kConfigParserFuncName = "parse_config_and_serialize"; const char *kConfigParserFuncName = "parse_config_and_serialize";
...@@ -46,6 +47,7 @@ TrainerConfigHelper::TrainerConfigHelper(const std::string &configFilePath) ...@@ -46,6 +47,7 @@ TrainerConfigHelper::TrainerConfigHelper(const std::string &configFilePath)
<< ",with_cost=" << FLAGS_with_cost << ",use_gpu=" << FLAGS_use_gpu << ",with_cost=" << FLAGS_with_cost << ",use_gpu=" << FLAGS_use_gpu
<< ",parallel_nn=" << FLAGS_parallel_nn << ",parallel_nn=" << FLAGS_parallel_nn
<< ",use_mkldnn=" << FLAGS_use_mkldnn << ",use_mkldnn=" << FLAGS_use_mkldnn
<< ",use_mkl_packed=" << FLAGS_use_mkl_packed
<< ",cudnn_version=" << hl_get_cudnn_lib_version(); << ",cudnn_version=" << hl_get_cudnn_lib_version();
if (!FLAGS_config_args.empty()) { if (!FLAGS_config_args.empty()) {
configArgs << "," << FLAGS_config_args; configArgs << "," << FLAGS_config_args;
......
...@@ -27,6 +27,13 @@ DEFINE_bool(use_mkldnn, false, "Default still keep use CPU training"); ...@@ -27,6 +27,13 @@ DEFINE_bool(use_mkldnn, false, "Default still keep use CPU training");
DEFINE_bool(use_mkldnn, false, "Only support CPU training"); DEFINE_bool(use_mkldnn, false, "Only support CPU training");
#endif #endif
#ifdef PADDLE_WITH_MKLML
// TODO(TJ): change to true when fully confirmed
DEFINE_bool(use_mkl_packed, false, "Whether to use MKL Packed Optimization");
#else
DEFINE_bool(use_mkl_packed, false, "Not to use MKL Packed Optimization");
#endif
DEFINE_bool(parallel_nn, DEFINE_bool(parallel_nn,
false, false,
"Whether to use multi-threads to calculate one neural network." "Whether to use multi-threads to calculate one neural network."
......
...@@ -41,3 +41,4 @@ DECLARE_string(predict_file); ...@@ -41,3 +41,4 @@ DECLARE_string(predict_file);
DECLARE_bool(prev_batch_state); DECLARE_bool(prev_batch_state);
DECLARE_string(init_model_path); DECLARE_string(init_model_path);
DECLARE_bool(use_mkldnn); DECLARE_bool(use_mkldnn);
DECLARE_bool(use_mkl_packed);
...@@ -29,8 +29,8 @@ if(WITH_MKLML) ...@@ -29,8 +29,8 @@ if(WITH_MKLML)
endif() endif()
if(WITH_MKLDNN) if(WITH_MKLDNN)
list(APPEND MKL_SHARED_LIBS "${MKLDNN_LIB}" "${MKLDNN_LIB}.0") list(APPEND MKL_SHARED_LIBS "${MKLDNN_SHARED_LIB}")
list(APPEND MKL_DEPENDS mkldnn) list(APPEND MKL_DEPENDS mkldnn mkldnn_shared_lib)
endif() endif()
if(WITH_GPU) if(WITH_GPU)
......
...@@ -3622,8 +3622,13 @@ class ConcatenateLayer2(LayerBase): ...@@ -3622,8 +3622,13 @@ class ConcatenateLayer2(LayerBase):
@config_layer('recurrent') @config_layer('recurrent')
class RecurrentLayer(LayerBase): class RecurrentLayer(LayerBase):
layer_type = 'recurrent'
def __init__(self, name, inputs, reversed=False, bias=True, **xargs): def __init__(self, name, inputs, reversed=False, bias=True, **xargs):
super(RecurrentLayer, self).__init__(name, 'recurrent', 0, inputs, use_mkl_packed = bool(
int(g_command_config_args.get("use_mkl_packed", 0)))
self.layer_type = 'mkl_packed_recurrent' if use_mkl_packed else 'recurrent'
super(RecurrentLayer, self).__init__(name, self.layer_type, 0, inputs,
**xargs) **xargs)
config_assert(len(self.inputs) == 1, 'RecurrentLayer must have 1 input') config_assert(len(self.inputs) == 1, 'RecurrentLayer must have 1 input')
input_layer = self.get_input_layer(0) input_layer = self.get_input_layer(0)
......
...@@ -58,12 +58,12 @@ def is_compatible_with(x, Type): ...@@ -58,12 +58,12 @@ def is_compatible_with(x, Type):
class HookAttribute(object): class HookAttribute(object):
""" """
Hook Attribute object. As a member of ParameterAttribute class, the hook is an auxiliary operation that occurs Hook Attribute object. As a member of ParameterAttribute class, the hook is an auxiliary operation that occurs
during training process of a layer with parameters, such as img_conv layer, fc layer. during training process of a layer with parameters, such as img_conv layer, fc layer.
:param type: Hook type, currently supported types: :param type: Hook type, currently supported types:
'pruning' : the user specifies a sparsity_ratio before training starts, and the 'pruning' : the user specifies a sparsity_ratio before training starts, and the
network will prune the parameters based on the sparsity_ratio. network will prune the parameters based on the sparsity_ratio.
e.g. the definition of a Hook object can be hk = HookAttribute('pruning', 0.6) e.g. the definition of a Hook object can be hk = HookAttribute('pruning', 0.6)
The specific usage can be paddle.layer.img_conv(input=img, filter_size=3, The specific usage can be paddle.layer.img_conv(input=img, filter_size=3,
num_channels=3, num_filters=64, num_channels=3, num_filters=64,
...@@ -71,10 +71,10 @@ class HookAttribute(object): ...@@ -71,10 +71,10 @@ class HookAttribute(object):
The pruning details can be found https://arxiv.org/pdf/1506.02626.pdf The pruning details can be found https://arxiv.org/pdf/1506.02626.pdf
:type type: string :type type: string
:param sparsity_ratio: Must be specified if hook type is 'pruning', :param sparsity_ratio: Must be specified if hook type is 'pruning',
it represents the ratio of the parameter's elements to be set to zero. it represents the ratio of the parameter's elements to be set to zero.
:type sparsity_ratio: float or None :type sparsity_ratio: float or None
""" """
def __init__(self, type, sparsity_ratio=None): def __init__(self, type, sparsity_ratio=None):
...@@ -130,10 +130,12 @@ class ParameterAttribute(object): ...@@ -130,10 +130,12 @@ class ParameterAttribute(object):
:param sparse_update: Enable sparse update for this parameter. It will :param sparse_update: Enable sparse update for this parameter. It will
enable both local and remote sparse update. enable both local and remote sparse update.
:type sparse_update: bool :type sparse_update: bool
:param update_hooks: A HookAttribute object.
:type update_hooks: HookAttribute
:param initializer: If not None, it should be a callable object which accepts :param initializer: If not None, it should be a callable object which accepts
a parameter name and returns numpy array for the initial a parameter name and returns numpy array for the initial
value of the parameter value of the parameter
:param initializer: callable object :type initializer: callable object
""" """
def __init__(self, def __init__(self,
......
...@@ -2542,15 +2542,21 @@ def img_conv_layer(input, ...@@ -2542,15 +2542,21 @@ def img_conv_layer(input,
what-are-deconvolutional-layers/>`_ . what-are-deconvolutional-layers/>`_ .
The num_channel means input image's channel number. It may be 1 or 3 when The num_channel means input image's channel number. It may be 1 or 3 when
input is raw pixels of image(mono or RGB), or it may be the previous layer's input is raw pixels of image(mono or RGB), or it may be the previous layer's
num_filters * num_group. num_filters.
There are several groups of filters in PaddlePaddle implementation. There are several groups of filters in PaddlePaddle implementation.
If the groups attribute is greater than 1, for example groups=2,
the input will be split into 2 parts along the channel axis, and
the filters will also be split into 2 parts. The first half of the filters
is only connected to the first half of the input channels, while the second
half of the filters is only connected to the second half of the input. After
the convolution is computed for each part of the input,
the output is obtained by concatenating the two results.
For the details of grouped convolution, please refer to:
`ImageNet Classification with Deep Convolutional Neural Networks
<http://www.cs.toronto.edu/~kriz/imagenet_classification_with_deep_convolutional.pdf>`_
The example usage is: The example usage is:
.. code-block:: python .. code-block:: python
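To make the grouped-convolution arithmetic above concrete, here is a small numeric sketch (the sizes are illustrative only, not taken from this patch):

    from __future__ import print_function

    # Grouped convolution bookkeeping, following the description above.
    num_channels, num_filters, groups, k = 256, 32, 2, 3
    in_per_group = num_channels // groups       # 128 input channels per group
    filters_per_group = num_filters // groups   # 16 filters per group
    # Each filter convolves only its own group's channels, so one kernel
    # holds in_per_group * k * k weights.
    n_weights = num_filters * in_per_group * k * k
    print(in_per_group, filters_per_group, n_weights)  # -> 128 16 36864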
...@@ -2575,7 +2581,8 @@ def img_conv_layer(input, ...@@ -2575,7 +2581,8 @@ def img_conv_layer(input,
:param filter_size_y: The dimension of the filter kernel on the y axis. If the parameter :param filter_size_y: The dimension of the filter kernel on the y axis. If the parameter
is not set, it will be set automatically according to filter_size. is not set, it will be set automatically according to filter_size.
:type filter_size_y: int :type filter_size_y: int
:param num_filters: Each filter group's number of filter :param num_filters: The number of filters. It is the same as the number of output channels.
:type num_filters: int
:param act: Activation type. ReluActivation is the default activation. :param act: Activation type. ReluActivation is the default activation.
:type act: BaseActivation :type act: BaseActivation
:param groups: The group number. 1 is the default group number. :param groups: The group number. 1 is the default group number.
...@@ -7177,7 +7184,7 @@ def img_conv3d_layer(input, ...@@ -7177,7 +7184,7 @@ def img_conv3d_layer(input,
:param filter_size: The dimensions of the filter kernel along three axes. If the parameter :param filter_size: The dimensions of the filter kernel along three axes. If the parameter
is set to one integer, the three dimensions will be same. is set to one integer, the three dimensions will be same.
:type filter_size: int | tuple | list :type filter_size: int | tuple | list
:param num_filters: The number of filters in each group. :param num_filters: The number of filters. It is the same as the number of output channels.
:type num_filters: int :type num_filters: int
:param act: Activation type. ReluActivation is the default activation. :param act: Activation type. ReluActivation is the default activation.
:type act: BaseActivation :type act: BaseActivation
......
...@@ -135,6 +135,8 @@ def init(**kwargs): ...@@ -135,6 +135,8 @@ def init(**kwargs):
cp.g_command_config_args['use_gpu'] = kwargs['use_gpu'] cp.g_command_config_args['use_gpu'] = kwargs['use_gpu']
if 'use_mkldnn' in kwargs: if 'use_mkldnn' in kwargs:
cp.g_command_config_args['use_mkldnn'] = kwargs['use_mkldnn'] cp.g_command_config_args['use_mkldnn'] = kwargs['use_mkldnn']
if 'use_mkl_packed' in kwargs:
cp.g_command_config_args['use_mkl_packed'] = kwargs['use_mkl_packed']
assert 'parallel_nn' not in kwargs, ("currently 'parallel_nn' is not " assert 'parallel_nn' not in kwargs, ("currently 'parallel_nn' is not "
"supported in v2 APIs.") "supported in v2 APIs.")
......
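A minimal sketch of switching the new flag on from the v2 API (the other init arguments are illustrative; only use_mkl_packed comes from this change):

    import paddle.v2 as paddle

    # use_mkl_packed is forwarded to the config parser exactly like
    # use_mkldnn above; both are CPU-only options.
    paddle.init(use_gpu=False, trainer_count=1, use_mkl_packed=True)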
from __future__ import print_function
# import all class inside framework into fluid module # import all class inside framework into fluid module
import framework import framework
from framework import * from framework import *
...@@ -27,7 +28,7 @@ __all__ = framework.__all__ + executor.__all__ + [ ...@@ -27,7 +28,7 @@ __all__ = framework.__all__ + executor.__all__ + [
] ]
def __read_gflags_from_env__(): def __bootstrap__():
""" """
Enable reading gflags from environment variables. Enable reading gflags from environment variables.
...@@ -36,11 +37,30 @@ def __read_gflags_from_env__(): ...@@ -36,11 +37,30 @@ def __read_gflags_from_env__():
""" """
import sys import sys
import core import core
import os
try:
num_threads = int(os.getenv('OMP_NUM_THREADS', '1'))
except ValueError:
num_threads = 1
if num_threads > 1:
print(
'WARNING: OMP_NUM_THREADS set to {0}, not 1. The computation '
'speed will not be optimized if you use data parallel. It will '
'fail if this PaddlePaddle binary is compiled with OpenBlas since'
' OpenBlas does not support multi-threading.'.format(num_threads),
file=sys.stderr)
print('PLEASE USE OMP_NUM_THREADS WISELY.', file=sys.stderr)
os.environ['OMP_NUM_THREADS'] = str(num_threads)
read_env_flags = ['use_pinned_memory', 'check_nan_inf'] read_env_flags = ['use_pinned_memory', 'check_nan_inf']
if core.is_compile_gpu(): if core.is_compile_gpu():
read_env_flags.append('fraction_of_gpu_memory_to_use') read_env_flags.append('fraction_of_gpu_memory_to_use')
core.init_gflags([sys.argv[0]] + core.init_gflags([sys.argv[0]] +
["--tryfromenv=" + ",".join(read_env_flags)]) ["--tryfromenv=" + ",".join(read_env_flags)])
core.init_glog(sys.argv[0])
if core.is_compile_gpu(): if core.is_compile_gpu():
core.init_devices(["CPU", "GPU:0"]) core.init_devices(["CPU", "GPU:0"])
...@@ -48,4 +68,4 @@ def __read_gflags_from_env__(): ...@@ -48,4 +68,4 @@ def __read_gflags_from_env__():
core.init_devices(["CPU"]) core.init_devices(["CPU"])
__read_gflags_from_env__() __bootstrap__()
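Because __bootstrap__ now forwards selected gflags through --tryfromenv, they can be supplied via environment variables before fluid is imported. A sketch (the chosen values are illustrative):

    import os

    # Any flag listed in read_env_flags above can be set this way;
    # OMP_NUM_THREADS=1 also avoids the warning printed by __bootstrap__.
    os.environ['OMP_NUM_THREADS'] = '1'
    os.environ['FLAGS_use_pinned_memory'] = 'true'

    import paddle.v2.fluid as fluid  # __bootstrap__() runs at import time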
...@@ -7,7 +7,7 @@ __all__ = ['append_backward'] ...@@ -7,7 +7,7 @@ __all__ = ['append_backward']
def _rename_arg_(op_descs, old_name, new_name, begin_idx=None, end_idx=None): def _rename_arg_(op_descs, old_name, new_name, begin_idx=None, end_idx=None):
""" """
Traverse all ops in op_descs[begin_idx : end_idx], Traverse all ops in op_descs[begin_idx : end_idx],
if any op has inputs/outputs named "old_name", rename it as 'new_name' if any op has inputs/outputs named "old_name", rename it as 'new_name'
""" """
if begin_idx is None: if begin_idx is None:
...@@ -162,7 +162,7 @@ def _remove_no_grad_branch_(op_descs, no_grad_set): ...@@ -162,7 +162,7 @@ def _remove_no_grad_branch_(op_descs, no_grad_set):
if core.grad_var_suffix() in arg and arg in no_grad_set: if core.grad_var_suffix() in arg and arg in no_grad_set:
to_insert.append((_create_op_desc_("fill_zeros_like", { to_insert.append((_create_op_desc_("fill_zeros_like", {
"X": [_strip_grad_suffix_(arg)] "X": [_strip_grad_suffix_(arg)]
}, {"Y": [arg]}, {}), idx)) }, {"Out": [arg]}, {}), idx))
map(lambda p: op_descs.insert(p[1], p[0]), reversed(to_insert)) map(lambda p: op_descs.insert(p[1], p[0]), reversed(to_insert))
...@@ -182,13 +182,23 @@ def _append_backward_ops_(target, ...@@ -182,13 +182,23 @@ def _append_backward_ops_(target,
target(Variable): the target variable of forward pass target(Variable): the target variable of forward pass
block(Block): the block where forward ops are block(Block): the block where forward ops are
target_block(Block): the block which is going to hold new generated grad ops target_block(Block): the block which is going to hold new generated grad ops
no_grad_dict(dict): no_grad_dict(dict):
key(int) block index key(int) block index
val(set) a set of variable names. These variables have no gradient val(set) a set of variable names. These variables have no gradient
grad_to_var(dict)(output argument): grad_to_var(dict)(output argument):
key(str): grad variable name key(str): grad variable name
val(str): corresponding forward variable name val(str): corresponding forward variable name
callback(callable object): a callable object used to decorate newly generated grad ops
""" """
if callback is None:
def empty_callback(block, context):
pass
callback = empty_callback
elif not hasattr(callback, '__call__'):
raise ValueError("'callback' must be a callable object.")
# grad_op_descs holds created grad_op, and will be appended to target_block # grad_op_descs holds created grad_op, and will be appended to target_block
grad_op_descs = [] grad_op_descs = []
program = block.program program = block.program
...@@ -205,6 +215,7 @@ def _append_backward_ops_(target, ...@@ -205,6 +215,7 @@ def _append_backward_ops_(target,
# Getting op's corresponding grad_op # Getting op's corresponding grad_op
grad_op_desc, op_grad_to_var = core.get_grad_op_desc( grad_op_desc, op_grad_to_var = core.get_grad_op_desc(
op.desc, no_grad_dict[block.idx], grad_sub_block_list) op.desc, no_grad_dict[block.idx], grad_sub_block_list)
grad_op_descs.extend(grad_op_desc) grad_op_descs.extend(grad_op_desc)
grad_to_var.update(op_grad_to_var) grad_to_var.update(op_grad_to_var)
...@@ -225,6 +236,7 @@ def _append_backward_ops_(target, ...@@ -225,6 +236,7 @@ def _append_backward_ops_(target,
for op_desc in grad_op_descs: for op_desc in grad_op_descs:
new_op_desc = target_block.desc.append_op() new_op_desc = target_block.desc.append_op()
new_op_desc.copy_from(op_desc) new_op_desc.copy_from(op_desc)
callback(block=target_block, context=grad_to_var)
def _append_backward_vars_(block, start_op_idx, grad_to_var, grad_info_map): def _append_backward_vars_(block, start_op_idx, grad_to_var, grad_info_map):
...@@ -267,7 +279,7 @@ def _append_backward_vars_(block, start_op_idx, grad_to_var, grad_info_map): ...@@ -267,7 +279,7 @@ def _append_backward_vars_(block, start_op_idx, grad_to_var, grad_info_map):
_infer_var_data_type_(arg, block) _infer_var_data_type_(arg, block)
def append_backward(loss, parameter_list=None, no_grad_set=None): def append_backward(loss, parameter_list=None, no_grad_set=None, callback=None):
""" """
Append backward part to main_program Append backward part to main_program
...@@ -275,8 +287,8 @@ def append_backward(loss, parameter_list=None, no_grad_set=None): ...@@ -275,8 +287,8 @@ def append_backward(loss, parameter_list=None, no_grad_set=None):
loss(Variable): The variable generated by cost function. loss(Variable): The variable generated by cost function.
parameter_list(list): Parameters that need to be updated by optimizer. parameter_list(list): Parameters that need to be updated by optimizer.
If None, it means all parameters need to be updated. If None, it means all parameters need to be updated.
no_grad_set(set): Variables that have no gradients in Block 0. no_grad_set(set): Variables that have no gradients in Block 0.
If None, the set will be generated inside the function and If None, the set will be generated inside the function and
contains all variables with `step_gradient=True` from all blocks. contains all variables with `step_gradient=True` from all blocks.
Return: Return:
...@@ -311,7 +323,7 @@ def append_backward(loss, parameter_list=None, no_grad_set=None): ...@@ -311,7 +323,7 @@ def append_backward(loss, parameter_list=None, no_grad_set=None):
grad_to_var = dict() grad_to_var = dict()
_append_backward_ops_(loss, root_block, root_block, no_grad_dict, _append_backward_ops_(loss, root_block, root_block, no_grad_dict,
grad_to_var) grad_to_var, callback)
_append_backward_vars_(root_block, fwd_op_num, grad_to_var, grad_info_map) _append_backward_vars_(root_block, fwd_op_num, grad_to_var, grad_info_map)
program.current_block_idx = current_block_idx program.current_block_idx = current_block_idx
......
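The new callback argument is invoked once for every grad op appended to the target block, together with the grad_to_var map; error_clip_callback below is its first real user. A minimal hypothetical callback, just for illustration:

    def log_grad_ops(block, context):
        # `context` is the grad_to_var map; the op that was just appended is
        # the last one in the block's desc, as in error_clip_callback below.
        op_desc = block.desc.op(block.desc.op_size() - 1)
        print("%s -> %s" % (op_desc.type(), op_desc.output_arg_names()))

    # hypothetical usage:
    # append_backward(loss=avg_cost, callback=log_grad_ops)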
import functools import functools
import layers import layers
from . import core
__all__ = ['GradientClipByValue', 'append_gradient_clip_ops'] __all__ = [
'GradientClipByValue', 'append_gradient_clip_ops', 'error_clip_callback'
]
class BaseErrorClipAttr(object):
def append_clip_op(self, block, grad_name):
raise NotImplementedError()
class ErrorClipByValue(BaseErrorClipAttr):
def __init__(self, max, min=None):
max = float(max)
if min is None:
min = -max
else:
min = float(min)
self.max = max
self.min = min
def append_clip_op(self, block, grad_name):
block.append_op(
type="clip",
inputs={"X": grad_name},
outputs={"Out": grad_name},
attrs={"min": self.min,
"max": self.max})
def error_clip_callback(block, context):
# the context is a grad_to_var map
grad_to_var = context
op_desc = block.desc.op(block.desc.op_size() - 1)
for grad_n in filter(lambda n: grad_to_var.has_key(n),
op_desc.output_arg_names()):
fwd_var = block.var_recursive(grad_to_var[grad_n])
error_clip = getattr(fwd_var, "error_clip", None)
if error_clip is not None:
error_clip.append_clip_op(block, grad_n)
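Putting the pieces together, a sketch of how a model could opt in to error clipping; the network itself is hypothetical, and only ErrorClipByValue, the error_clip attribute, and error_clip_callback come from this change:

    import paddle.v2.fluid as fluid
    from paddle.v2.fluid.backward import append_backward
    from paddle.v2.fluid.clip import ErrorClipByValue, error_clip_callback

    x = fluid.layers.data(name='x', shape=[13], dtype='float32')
    y = fluid.layers.data(name='y', shape=[1], dtype='float32')
    hidden = fluid.layers.fc(input=x, size=64, act='relu')
    # Mark the forward variable; error_clip_callback finds it through
    # grad_to_var and appends a clip op on the matching gradient.
    hidden.error_clip = ErrorClipByValue(max=1.0)
    y_pred = fluid.layers.fc(input=hidden, size=1)
    cost = fluid.layers.square_error_cost(input=y_pred, label=y)
    avg_cost = fluid.layers.mean(x=cost)
    append_backward(loss=avg_cost, callback=error_clip_callback)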
class BaseGradientClipAttr(object): class BaseGradientClipAttr(object):
......
...@@ -65,13 +65,6 @@ class Executor(object): ...@@ -65,13 +65,6 @@ class Executor(object):
p.set_place(each) p.set_place(each)
act_places.append(p) act_places.append(p)
# TODO(dzhwinter) : consider that our fluid tests all written in
# CUDAPlace(gpu_id), this will be changed in the future
if core.is_compile_gpu():
core.init_devices(["CPU", "GPU:0"])
else:
core.init_devices(["CPU"])
# TODO(dzhwinter) : only use the first place # TODO(dzhwinter) : only use the first place
self.executor = core.Executor(act_places[0]) self.executor = core.Executor(act_places[0])
self.places = places self.places = places
......
...@@ -17,10 +17,6 @@ TEMP_VAR_NAME = core.kTempVarName() ...@@ -17,10 +17,6 @@ TEMP_VAR_NAME = core.kTempVarName()
GRAD_VAR_SUFFIX = core.kGradVarSuffix() GRAD_VAR_SUFFIX = core.kGradVarSuffix()
ZERO_VAR_SUFFIX = core.kZeroVarSuffix() ZERO_VAR_SUFFIX = core.kZeroVarSuffix()
USE_CPU = core.kUseCPU()
USE_CUDNN = core.kUseMKLDNN()
USE_MKLDNN = core.kUseMKLDNN()
def grad_var_name(var_name): def grad_var_name(var_name):
""" """
...@@ -147,9 +143,11 @@ class Variable(object): ...@@ -147,9 +143,11 @@ class Variable(object):
dtype=None, dtype=None,
lod_level=None, lod_level=None,
persistable=None, persistable=None,
error_clip=None,
stop_gradient=False, stop_gradient=False,
**kwargs): **kwargs):
self.block = block self.block = block
self.error_clip = error_clip
if name is None: if name is None:
name = Variable._unique_var_name_() name = Variable._unique_var_name_()
...@@ -452,7 +450,7 @@ class Operator(object): ...@@ -452,7 +450,7 @@ class Operator(object):
no_kernel_op_set = { no_kernel_op_set = {
'feed', 'fetch', 'save', 'load', 'recurrent', 'feed', 'fetch', 'save', 'load', 'recurrent',
'rnn_memory_helper_grad', 'conditional_block', 'while', 'send', 'rnn_memory_helper_grad', 'conditional_block', 'while', 'send',
'recv' 'recv', 'parallel_do'
} }
if type not in no_kernel_op_set: if type not in no_kernel_op_set:
self.desc.infer_var_type(self.block.desc) self.desc.infer_var_type(self.block.desc)
...@@ -626,6 +624,17 @@ class Block(object): ...@@ -626,6 +624,17 @@ class Block(object):
raise ValueError("var %s not in this block" % name) raise ValueError("var %s not in this block" % name)
return v return v
def var_recursive(self, name):
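# Look up `name` in this block first; otherwise walk up through the
# parent blocks until the root block (idx 0) is reached.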
if self.has_var(name):
return self.var(name)
else:
if self.idx == 0:
raise ValueError("var %s is not in block(%d) nor its parents." %
name, self.idx)
else:
parent_block = self.program.block(self.parent_idx)
return parent_block.var_recursive(name)
def all_parameters(self): def all_parameters(self):
return list(self.iter_parameters()) return list(self.iter_parameters())
...@@ -744,6 +753,7 @@ class Block(object): ...@@ -744,6 +753,7 @@ class Block(object):
optimize_attr=p.optimize_attr, optimize_attr=p.optimize_attr,
regularizer=p.regularizer, regularizer=p.regularizer,
clip_attr=p.clip_attr, clip_attr=p.clip_attr,
error_clip=p.error_clip,
name=v.name) name=v.name)
self.vars[new_p.name] = new_p self.vars[new_p.name] = new_p
......
...@@ -36,7 +36,7 @@ def save_vars(executor, dirname, main_program=None, vars=None, predicate=None): ...@@ -36,7 +36,7 @@ def save_vars(executor, dirname, main_program=None, vars=None, predicate=None):
:param executor: executor that save variable :param executor: executor that save variable
:param dirname: directory path :param dirname: directory path
:param main_program: program. If vars is None, then filter all variables in this :param main_program: program. If vars is None, then filter all variables in this
program which fit `predicate`. Default g_program. program which fit `predicate`. Default default_main_program.
:param predicate: The Predicate describes a callable that returns a variable :param predicate: The Predicate describes a callable that returns a variable
as a bool. If it returns true, the variables will be saved. as a bool. If it returns true, the variables will be saved.
:param vars: variables need to be saved. If specify vars, program & predicate :param vars: variables need to be saved. If specify vars, program & predicate
...@@ -212,6 +212,11 @@ def save_inference_model(dirname, ...@@ -212,6 +212,11 @@ def save_inference_model(dirname,
"fetch_var_names": fetch_var_names "fetch_var_names": fetch_var_names
}, f, -1) }, f, -1)
# Save only programDesc of inference_program in binary format
# in another file: __model__.dat
with open(model_file_name + ".dat", "wb") as fp:
fp.write(inference_program.desc.serialize_to_string())
save_params(executor, dirname, main_program) save_params(executor, dirname, main_program)
......
...@@ -120,11 +120,12 @@ class LayerHelper(object): ...@@ -120,11 +120,12 @@ class LayerHelper(object):
raise ValueError("no Parameter name %s found" % name) raise ValueError("no Parameter name %s found" % name)
return param return param
def create_tmp_variable(self, dtype): def create_tmp_variable(self, dtype, stop_gradient=False):
return self.main_program.current_block().create_var( return self.main_program.current_block().create_var(
name=unique_name(".".join([self.name, 'tmp'])), name=unique_name(".".join([self.name, 'tmp'])),
dtype=dtype, dtype=dtype,
persistable=False) persistable=False,
stop_gradient=stop_gradient)
def create_variable(self, *args, **kwargs): def create_variable(self, *args, **kwargs):
return self.main_program.current_block().create_var(*args, **kwargs) return self.main_program.current_block().create_var(*args, **kwargs)
......
...@@ -8,6 +8,8 @@ import tensor ...@@ -8,6 +8,8 @@ import tensor
from tensor import * from tensor import *
import control_flow import control_flow
from control_flow import * from control_flow import *
import device
from device import *
__all__ = [] __all__ = []
__all__ += nn.__all__ __all__ += nn.__all__
...@@ -15,3 +17,4 @@ __all__ += io.__all__ ...@@ -15,3 +17,4 @@ __all__ += io.__all__
__all__ += tensor.__all__ __all__ += tensor.__all__
__all__ += control_flow.__all__ __all__ += control_flow.__all__
__all__ += ops.__all__ __all__ += ops.__all__
__all__ += device.__all__
...@@ -6,12 +6,13 @@ import contextlib ...@@ -6,12 +6,13 @@ import contextlib
from ..registry import autodoc from ..registry import autodoc
__all__ = [ __all__ = [
'split_lod_tensor', 'merge_lod_tensor', 'BlockGuard', 'StaticRNNGuard', 'split_lod_tensor', 'merge_lod_tensor', 'BlockGuard',
'StaticRNNMemoryLink', 'WhileGuard', 'While', 'lod_rank_table', 'BlockGuardWithCompletion', 'StaticRNNMemoryLink', 'WhileGuard', 'While',
'max_sequence_len', 'topk', 'lod_tensor_to_array', 'array_to_lod_tensor', 'lod_rank_table', 'max_sequence_len', 'topk', 'lod_tensor_to_array',
'increment', 'array_write', 'create_array', 'less_than', 'array_read', 'array_to_lod_tensor', 'increment', 'array_write', 'create_array',
'shrink_memory', 'array_length', 'IfElse', 'DynamicRNN', 'ConditionalBlock', 'less_than', 'array_read', 'shrink_memory', 'array_length', 'IfElse',
'StaticRNN', 'reorder_lod_tensor_by_rank' 'DynamicRNN', 'ConditionalBlock', 'StaticRNN', 'reorder_lod_tensor_by_rank',
'ParallelDo'
] ]
...@@ -132,29 +133,129 @@ class BlockGuard(object): ...@@ -132,29 +133,129 @@ class BlockGuard(object):
return True return True
class StaticRNNGuard(BlockGuard): class ParallelDo(object):
""" """
StaticRNNGuard class. ParallelDo class.
StaticRNNGuard class is used to create a StaticRNN block in a program. ParallelDo class is used to create a ParallelDo.
"""
def __init__(self, places, name=None):
self.helper = LayerHelper("parallel_do", name=name)
self.inputs = []
self.places = places
self.outputs = []
self.status = StaticRNN.BEFORE_RNN_BLOCK
def do(self):
return BlockGuardWithCompletion(self)
def parent_block(self):
prog = self.helper.main_program
parent_idx = prog.current_block().parent_idx
assert parent_idx >= 0
parent_block = prog.block(parent_idx)
return parent_block
def __call__(self, *args, **kwargs):
if self.status != StaticRNN.AFTER_RNN_BLOCK:
raise ValueError("RNN output can only be retrieved after rnn block")
if len(self.outputs) == 0:
raise ValueError("RNN has no output")
elif len(self.outputs) == 1:
return self.outputs[0]
else:
return self.outputs
def read_input(self, var):
self.inputs.append(var)
return var
def write_output(self, var):
self.outputs.append(var)
def get_parameters(self):
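# Inputs of ops inside the block that are neither produced by an op in
# the block nor fed through read_input() must be parameters captured
# from the parent block.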
main_program = self.helper.main_program
current_block = main_program.current_block()
parent_block = self.parent_block()
local_inputs = set()
for op in current_block.ops:
for oname in op.output_names:
for out_var_name in op.output(oname):
local_inputs.add(out_var_name)
for var in self.inputs:
local_inputs.add(var.name)
params = list()
for op in current_block.ops:
for iname in op.input_names:
for in_var_name in op.input(iname):
if in_var_name not in local_inputs:
params.append(in_var_name)
return [parent_block.var(name) for name in params]
def complete_op(self):
main_program = self.helper.main_program
current_block = main_program.current_block()
parent_block = self.parent_block()
step_scope = parent_block.create_var(
type=core.VarDesc.VarType.STEP_SCOPES)
self.outputs = [
parent_block.create_var(
name=o.name,
shape=o.shape,
dtype=o.dtype,
lod_level=o.lod_level,
persistable=o.persistable,
stop_gradient=o.stop_gradient) for o in self.outputs
]
inputs = [parent_block.var(i.name) for i in self.inputs]
outputs = [parent_block.var(o.name) for o in self.outputs]
parent_block.append_op(
type='parallel_do',
inputs={
'inputs': inputs,
'parameters': self.get_parameters(),
'places': self.places
},
outputs={'outputs': outputs,
'parallel_scopes': [step_scope]},
attrs={'sub_block': current_block})
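A minimal end-to-end sketch of the new ParallelDo, using the get_places layer added later in this change (layer sizes and names are made up):

    import paddle.v2.fluid as fluid

    images = fluid.layers.data(name='pixel', shape=[784], dtype='float32')
    places = fluid.layers.get_places(device_count=4, device_type="CPU")
    pd = fluid.layers.ParallelDo(places)
    with pd.do():
        x = pd.read_input(images)    # per-place slice of the input
        hidden = fluid.layers.fc(input=x, size=200, act='tanh')
        pd.write_output(hidden)      # merged across places on exit
    hidden = pd()                    # retrieve the merged output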
class BlockGuardWithCompletion(BlockGuard):
"""
BlockGuardWithCompletion class.
BlockGuardWithCompletion class is used to create an op with a block in a program.
""" """
def __init__(self, rnn): def __init__(self, rnn):
if not isinstance(rnn, StaticRNN): if not (isinstance(rnn, StaticRNN) or isinstance(rnn, ParallelDo)):
raise TypeError("StaticRNNGuard takes a StaticRNN") raise TypeError(
super(StaticRNNGuard, self).__init__(rnn.helper.main_program) "BlockGuardWithCompletion takes a StaticRNN or ParallelDo")
super(BlockGuardWithCompletion, self).__init__(rnn.helper.main_program)
self.rnn = rnn self.rnn = rnn
def __enter__(self): def __enter__(self):
self.rnn.status = StaticRNN.IN_RNN_BLOCK self.rnn.status = StaticRNN.IN_RNN_BLOCK
return super(StaticRNNGuard, self).__enter__() return super(BlockGuardWithCompletion, self).__enter__()
def __exit__(self, exc_type, exc_val, exc_tb): def __exit__(self, exc_type, exc_val, exc_tb):
if exc_type is not None: if exc_type is not None:
return False return False
self.rnn.status = StaticRNN.AFTER_RNN_BLOCK self.rnn.status = StaticRNN.AFTER_RNN_BLOCK
self.rnn.complete_rnn_op() self.rnn.complete_op()
return super(StaticRNNGuard, self).__exit__(exc_type, exc_val, exc_tb) return super(BlockGuardWithCompletion, self).__exit__(exc_type, exc_val,
exc_tb)
class StaticRNNMemoryLink(object): class StaticRNNMemoryLink(object):
...@@ -200,7 +301,7 @@ class StaticRNN(object): ...@@ -200,7 +301,7 @@ class StaticRNN(object):
self.seq_len = None self.seq_len = None
def step(self): def step(self):
return StaticRNNGuard(self) return BlockGuardWithCompletion(self)
def _assert_in_rnn_block_(self, method): def _assert_in_rnn_block_(self, method):
if self.status != StaticRNN.IN_RNN_BLOCK: if self.status != StaticRNN.IN_RNN_BLOCK:
...@@ -316,7 +417,7 @@ class StaticRNN(object): ...@@ -316,7 +417,7 @@ class StaticRNN(object):
else: else:
return self.outputs return self.outputs
def complete_rnn_op(self): def complete_op(self):
main_program = self.helper.main_program main_program = self.helper.main_program
rnn_block = main_program.current_block() rnn_block = main_program.current_block()
parent_block = self.parent_block() parent_block = self.parent_block()
...@@ -464,7 +565,7 @@ def lod_rank_table(x, level=0): ...@@ -464,7 +565,7 @@ def lod_rank_table(x, level=0):
"""LoD Rank Table Operator. Given an input variable **x** and a level number """LoD Rank Table Operator. Given an input variable **x** and a level number
of LoD, this layer creates a LoDRankTable object. A LoDRankTable object of LoD, this layer creates a LoDRankTable object. A LoDRankTable object
contains a list of bi-element tuples. Each tuple consists of an index and contains a list of bi-element tuples. Each tuple consists of an index and
a length, both of which are int type. Reffering to specified level of LoD, a length, both of which are int type. Referring to the specified level of LoD,
the index is the sequence index number and the length represents the the index is the sequence index number and the length represents the
sequence length. Please note that the list is ranked in descending order by sequence length. Please note that the list is ranked in descending order by
the length. The following is an example: the length. The following is an example:
...@@ -897,7 +998,7 @@ class ConditionalBlock(object): ...@@ -897,7 +998,7 @@ class ConditionalBlock(object):
out_list = [ out_list = [
parent_block.var(var_name) for var_name in parent_block.vars parent_block.var(var_name) for var_name in parent_block.vars
if var_name not in intermediate if var_name in intermediate
] ]
step_scope = parent_block.create_var( step_scope = parent_block.create_var(
......
"""
All util layers.
"""
from ..layer_helper import LayerHelper
from ..framework import unique_name
__all__ = ['get_places']
def get_places(device_count=0, device_type="CPU"):
helper = LayerHelper('get_places', **locals())
out_places = helper.create_variable(name=unique_name(helper.name + ".out"))
helper.append_op(
type='get_places',
outputs={"Out": [out_places]},
attrs={
"device_type": device_type,
'device_count': device_count,
})
return out_places
...@@ -64,14 +64,14 @@ def fc(input, ...@@ -64,14 +64,14 @@ def fc(input,
is flattened: the first `num_flatten_dims` is flattened: the first `num_flatten_dims`
dimensions will be flatten to form the first dimensions will be flatten to form the first
dimension of the final matrix (height of the dimension of the final matrix (height of the
matrix), and the rest `rank(X) - num_col_dims` matrix), and the rest `rank(X) - num_flatten_dims`
dimensions are flattened to form the second dimensions are flattened to form the second
dimension of the final matrix (width of the matrix). dimension of the final matrix (width of the matrix).
For example, suppose `X` is a 6-dimensional tensor For example, suppose `X` is a 6-dimensional tensor
with a shape [2, 3, 4, 5, 6], and with a shape [2, 3, 4, 5, 6], and
`x_num_col_dims` = 3. Then, the flattened matrix `num_flatten_dims` = 3. Then, the flattened matrix
will have a shape [2 x 3 x 4, 5 x 6] = [24, 30]. will have a shape [2 x 3 x 4, 5 x 6] = [24, 30].
By default, `x_num_col_dims` is set to 1. By default, `num_flatten_dims` is set to 1.
param_attr(ParamAttr|list): The parameter attribute for learnable param_attr(ParamAttr|list): The parameter attribute for learnable
parameters/weights of the fully connected parameters/weights of the fully connected
layer. layer.
...@@ -151,7 +151,7 @@ def embedding(input, size, is_sparse=False, param_attr=None, dtype='float32'): ...@@ -151,7 +151,7 @@ def embedding(input, size, is_sparse=False, param_attr=None, dtype='float32'):
Args: Args:
input(Variable): Input to the function input(Variable): Input to the function
size(tuple|list|None): Shape of the look up table parameter size(tuple|list|None): Shape of the look up table parameter
is_sparse(bool): Boolean flag specifying whether the input is sparse is_sparse(bool): Boolean flag specifying whether the input is sparse
param_attr(ParamAttr): Parameters for this layer param_attr(ParamAttr): Parameters for this layer
dtype(np.dtype|core.DataType|str): The type of data : float32, float_16, int etc dtype(np.dtype|core.DataType|str): The type of data : float32, float_16, int etc
...@@ -236,21 +236,50 @@ def gru_unit(input, ...@@ -236,21 +236,50 @@ def gru_unit(input,
activation='tanh', activation='tanh',
gate_activation='sigmoid'): gate_activation='sigmoid'):
""" """
GRU unit layer. The equation of a gru step is:

.. math::

    u_t & = actGate(xu_{t} + W_u h_{t-1} + b_u)

    r_t & = actGate(xr_{t} + W_r h_{t-1} + b_r)

    m_t & = actNode(xm_t + W_c dot(r_t, h_{t-1}) + b_m)

    h_t & = dot((1-u_t), m_t) + dot(u_t, h_{t-1})
The inputs of gru unit include :math:`z_t` and :math:`h_{t-1}`. In terms
of the equation above, the :math:`z_t` is split into 3 parts -
:math:`xu_t`, :math:`xr_t` and :math:`xm_t`. This means that in order to
implement a full GRU unit operator for an input, a fully
connected layer has to be applied, such that :math:`z_t = W_{fc}x_t`.
The terms :math:`u_t` and :math:`r_t` represent the update and reset gates
of the GRU cell. Unlike LSTM, GRU has one fewer gate. However, there is
an intermediate candidate hidden output, which is denoted by :math:`m_t`.
This layer has three outputs :math:`h_t`, :math:`dot(r_t, h_{t-1})`
and concatenation of :math:`u_t`, :math:`r_t` and :math:`m_t`.
Args:
input (Variable): The fc transformed input value of current step.
hidden (Variable): The hidden value of the gru unit from the previous step.
size (integer): The input dimension value.
weight (ParamAttr): The weight parameters for gru unit. Default: None
bias (ParamAttr): The bias parameters for gru unit. Default: None
activation (string): The activation type for cell (actNode). Default: 'tanh'
gate_activation (string): The activation type for gates (actGate). Default: 'sigmoid'
Returns:
tuple: The hidden value, reset-hidden value and gate values.
Examples:
.. code-block:: python

    # assuming we have x_t_data and prev_hidden of size=10
    x_t = fluid.layers.fc(input=x_t_data, size=30)
    hidden_val, r_h_val, gate_val = fluid.layers.gru_unit(input=x_t,
                                                          hidden=prev_hidden)
""" """
activation_dict = dict( activation_dict = dict(
identity=0, identity=0,
...@@ -366,9 +395,9 @@ def cross_entropy(input, label, **kwargs): ...@@ -366,9 +395,9 @@ def cross_entropy(input, label, **kwargs):
1) One-hot cross-entropy: 1) One-hot cross-entropy:
`soft_label = False`, `Label[i, 0]` indicates the class index for sample i: `soft_label = False`, `Label[i, 0]` indicates the class index for sample i:
.. math:: .. math::
Y[i] = -\log(X[i, Label[i]]) Y[i] = -\log(X[i, Label[i]])
2) Soft-label cross-entropy: 2) Soft-label cross-entropy:
...@@ -386,15 +415,15 @@ def cross_entropy(input, label, **kwargs): ...@@ -386,15 +415,15 @@ def cross_entropy(input, label, **kwargs):
As a special case of 2), when each row of 'label' has only one As a special case of 2), when each row of 'label' has only one
non-zero element which is equal to 1, soft-label cross-entropy degenerates non-zero element which is equal to 1, soft-label cross-entropy degenerates
to a one-hot cross-entropy with one-hot label representation. to a one-hot cross-entropy with one-hot label representation.
Args: Args:
input (Variable|list): a 2-D tensor with shape [N x D], where N is the input (Variable|list): a 2-D tensor with shape [N x D], where N is the
batch size and D is the number of classes. This input is a probability batch size and D is the number of classes. This input is a probability
computed by the previous operator, which is almost always the result computed by the previous operator, which is almost always the result
of a softmax operator. of a softmax operator.
label (Variable|list): the ground truth which is a 2-D tensor. When label (Variable|list): the ground truth which is a 2-D tensor. When
`soft_label` is set to `False`, `label` is a tensor<int64> with shape `soft_label` is set to `False`, `label` is a tensor<int64> with shape
[N x 1]. When `soft_label` is set to `True`, `label` is a [N x 1]. When `soft_label` is set to `True`, `label` is a
tensor<float/double> with shape [N x D]. tensor<float/double> with shape [N x D].
soft_label (bool, via `**kwargs`): a flag indicating whether to interpret soft_label (bool, via `**kwargs`): a flag indicating whether to interpret
the given labels as soft labels, default `False`. the given labels as soft labels, default `False`.
...@@ -403,7 +432,7 @@ def cross_entropy(input, label, **kwargs): ...@@ -403,7 +432,7 @@ def cross_entropy(input, label, **kwargs):
A 2-D tensor with shape [N x 1], the cross entropy loss. A 2-D tensor with shape [N x 1], the cross entropy loss.
Raises: Raises:
`ValueError`: 1) the 1st dimension of `input` and `label` are not equal; 2) when \ `ValueError`: 1) the 1st dimension of `input` and `label` are not equal; 2) when \
`soft_label == True`, and the 2nd dimension of `input` and `label` are not \ `soft_label == True`, and the 2nd dimension of `input` and `label` are not \
equal; 3) when `soft_label == False`, and the 2nd dimension of `label` is not 1. equal; 3) when `soft_label == False`, and the 2nd dimension of `label` is not 1.
...@@ -727,9 +756,9 @@ def conv2d(input, ...@@ -727,9 +756,9 @@ def conv2d(input,
def sequence_pool(input, pool_type, **kwargs): def sequence_pool(input, pool_type, **kwargs):
""" """
This function add the operator for sequence pooling. This function add the operator for sequence pooling.
It pools features of all time-steps of each instance, and is applied It pools features of all time-steps of each instance, and is applied
on top of the input using pool_type mentioned in the parameters. on top of the input using pool_type mentioned in the parameters.
It supports four pool_type: It supports four pool_type:
...@@ -758,7 +787,7 @@ def sequence_pool(input, pool_type, **kwargs): ...@@ -758,7 +787,7 @@ def sequence_pool(input, pool_type, **kwargs):
Args: Args:
input(variable): The input variable which is a LoDTensor. input(variable): The input variable which is a LoDTensor.
pool_type (string): The pooling type of sequence_pool. pool_type (string): The pooling type of sequence_pool.
It supports average, sum, sqrt and max. It supports average, sum, sqrt and max.
Returns: Returns:
...@@ -768,7 +797,7 @@ def sequence_pool(input, pool_type, **kwargs): ...@@ -768,7 +797,7 @@ def sequence_pool(input, pool_type, **kwargs):
.. code-block:: python .. code-block:: python
x = fluid.layers.data(name='x', shape=[7, 1], x = fluid.layers.data(name='x', shape=[7, 1],
dtype='float32', lod_level=1) dtype='float32', lod_level=1)
avg_x = fluid.layers.sequence_pool(input=x, pool_type='average') avg_x = fluid.layers.sequence_pool(input=x, pool_type='average')
sum_x = fluid.layers.sequence_pool(input=x, pool_type='sum') sum_x = fluid.layers.sequence_pool(input=x, pool_type='sum')
...@@ -787,6 +816,11 @@ def sequence_pool(input, pool_type, **kwargs): ...@@ -787,6 +816,11 @@ def sequence_pool(input, pool_type, **kwargs):
"MaxIndex": max_index}, "MaxIndex": max_index},
attrs={"pooltype": pool_type.upper()}) attrs={"pooltype": pool_type.upper()})
# when pool_type is max, variable max_index is initialized,
# so we stop the gradient explicitly here
if pool_type == 'max':
max_index.stop_gradient = True
return pool_out return pool_out
...@@ -816,7 +850,7 @@ def sequence_first_step(input, **kwargs): ...@@ -816,7 +850,7 @@ def sequence_first_step(input, **kwargs):
.. code-block:: python .. code-block:: python
x = fluid.layers.data(name='x', shape=[7, 1], x = fluid.layers.data(name='x', shape=[7, 1],
dtype='float32', lod_level=1) dtype='float32', lod_level=1)
x_first_step = fluid.layers.sequence_first_step(input=x) x_first_step = fluid.layers.sequence_first_step(input=x)
""" """
...@@ -849,7 +883,7 @@ def sequence_last_step(input, **kwargs): ...@@ -849,7 +883,7 @@ def sequence_last_step(input, **kwargs):
.. code-block:: python .. code-block:: python
x = fluid.layers.data(name='x', shape=[7, 1], x = fluid.layers.data(name='x', shape=[7, 1],
dtype='float32', lod_level=1) dtype='float32', lod_level=1)
x_last_step = fluid.layers.sequence_last_step(input=x) x_last_step = fluid.layers.sequence_last_step(input=x)
""" """
...@@ -937,11 +971,17 @@ def batch_norm(input, ...@@ -937,11 +971,17 @@ def batch_norm(input,
attr=helper.param_attr, shape=param_shape, dtype=dtype, is_bias=True) attr=helper.param_attr, shape=param_shape, dtype=dtype, is_bias=True)
mean = helper.create_global_variable( mean = helper.create_global_variable(
dtype=input.dtype, shape=param_shape, persistable=True) dtype=input.dtype,
shape=param_shape,
persistable=True,
stop_gradient=True)
helper.set_variable_initializer(var=mean, initializer=Constant(0.0)) helper.set_variable_initializer(var=mean, initializer=Constant(0.0))
variance = helper.create_global_variable( variance = helper.create_global_variable(
dtype=input.dtype, shape=param_shape, persistable=True) dtype=input.dtype,
shape=param_shape,
persistable=True,
stop_gradient=True)
helper.set_variable_initializer(var=variance, initializer=Constant(1.0)) helper.set_variable_initializer(var=variance, initializer=Constant(1.0))
# create output # create output
...@@ -949,8 +989,8 @@ def batch_norm(input, ...@@ -949,8 +989,8 @@ def batch_norm(input,
mean_out = mean mean_out = mean
# variance and variance out share the same memory # variance and variance out share the same memory
variance_out = variance variance_out = variance
saved_mean = helper.create_tmp_variable(dtype) saved_mean = helper.create_tmp_variable(dtype=dtype, stop_gradient=True)
saved_variance = helper.create_tmp_variable(dtype) saved_variance = helper.create_tmp_variable(dtype=dtype, stop_gradient=True)
batch_norm_out = helper.create_tmp_variable(dtype) batch_norm_out = helper.create_tmp_variable(dtype)
...@@ -1168,25 +1208,26 @@ def lstm_unit(x_t, ...@@ -1168,25 +1208,26 @@ def lstm_unit(x_t,
.. math::

    i_t & = \sigma(W_{x_i}x_{t} + W_{h_i}h_{t-1} + b_i)

    f_t & = \sigma(W_{x_f}x_{t} + W_{h_f}h_{t-1} + b_f)

    c_t & = f_t c_{t-1} + i_t tanh(W_{x_c}x_t + W_{h_c}h_{t-1} + b_c)

    o_t & = \sigma(W_{x_o}x_{t} + W_{h_o}h_{t-1} + b_o)

    h_t & = o_t tanh(c_t)

The inputs of lstm unit include :math:`x_t`, :math:`h_{t-1}` and
:math:`c_{t-1}`. The 2nd dimensions of :math:`h_{t-1}` and :math:`c_{t-1}`
should be the same. The implementation separates the linear transformation
and the non-linear transformation apart. Here, we take :math:`i_t` as an
example. The linear transformation is applied by calling a `fc` layer and
the equation is:

.. math::

    L_{i_t} = W_{x_i}x_{t} + W_{h_i}h_{t-1} + b_i
The non-linear transformation is applied by calling `lstm_unit_op` and the The non-linear transformation is applied by calling `lstm_unit_op` and the
equation is: equation is:
...@@ -1198,9 +1239,12 @@ def lstm_unit(x_t, ...@@ -1198,9 +1239,12 @@ def lstm_unit(x_t,
This layer has two outputs including :math:`h_t` and :math:`o_t`. This layer has two outputs including :math:`h_t` and :math:`o_t`.
Args: Args:
x_t (Variable): The input value of current step, a 2-D tensor with shape
    M x N, M for batch size and N for input size.
hidden_t_prev (Variable): The hidden value of lstm unit, a 2-D tensor
    with shape M x S, M for batch size and S for size of lstm unit.
cell_t_prev (Variable): The cell value of lstm unit, a 2-D tensor with
    shape M x S, M for batch size and S for size of lstm unit.
forget_bias (float): The forget bias of lstm unit. forget_bias (float): The forget bias of lstm unit.
param_attr (ParamAttr): The attributes of parameter weights, used to set param_attr (ParamAttr): The attributes of parameter weights, used to set
initializer, name etc. initializer, name etc.
...@@ -1213,14 +1257,15 @@ def lstm_unit(x_t, ...@@ -1213,14 +1257,15 @@ def lstm_unit(x_t,
Raises: Raises:
ValueError: The ranks of **x_t**, **hidden_t_prev** and **cell_t_prev**\ ValueError: The ranks of **x_t**, **hidden_t_prev** and **cell_t_prev**\
not be 2 or the 1st dimensions of **x_t**, **hidden_t_prev** \ not be 2 or the 1st dimensions of **x_t**, **hidden_t_prev** \
and **cell_t_prev** not be the same. and **cell_t_prev** not be the same or the 2nd dimensions of \
**hidden_t_prev** and **cell_t_prev** not be the same.
Examples: Examples:
.. code-block:: python .. code-block:: python
x_t = fluid.layers.fc(input=x_t_data, size=10) x_t = fluid.layers.fc(input=x_t_data, size=10)
prev_hidden = fluid.layers.fc(input=prev_hidden_data, size=20) prev_hidden = fluid.layers.fc(input=prev_hidden_data, size=30)
prev_cell = fluid.layers.fc(input=prev_cell_data, size=30) prev_cell = fluid.layers.fc(input=prev_cell_data, size=30)
hidden_value, cell_value = fluid.layers.lstm_unit(x_t=x_t, hidden_value, cell_value = fluid.layers.lstm_unit(x_t=x_t,
hidden_t_prev=prev_hidden, hidden_t_prev=prev_hidden,
...@@ -1239,7 +1284,11 @@ def lstm_unit(x_t, ...@@ -1239,7 +1284,11 @@ def lstm_unit(x_t,
if x_t.shape[0] != hidden_t_prev.shape[0] or x_t.shape[ if x_t.shape[0] != hidden_t_prev.shape[0] or x_t.shape[
0] != cell_t_prev.shape[0]: 0] != cell_t_prev.shape[0]:
raise ValueError("The 1s dimension of x_t, hidden_t_prev and " raise ValueError("The 1st dimensions of x_t, hidden_t_prev and "
"cell_t_prev must be the same.")
if hidden_t_prev.shape[1] != cell_t_prev.shape[1]:
raise ValueError("The 2nd dimensions of hidden_t_prev and "
"cell_t_prev must be the same.") "cell_t_prev must be the same.")
if bias_attr is None: if bias_attr is None:
...@@ -1268,17 +1317,17 @@ def lstm_unit(x_t, ...@@ -1268,17 +1317,17 @@ def lstm_unit(x_t,
def reduce_sum(input, dim=None, keep_dim=False): def reduce_sum(input, dim=None, keep_dim=False):
""" """
Computes the sum of tensor elements over the given dimension. Computes the sum of tensor elements over the given dimension.
Args: Args:
input (Variable): The input variable which is a Tensor or LoDTensor. input (Variable): The input variable which is a Tensor or LoDTensor.
dim (int|None): The dimension along which the sum is performed. If dim (int|None): The dimension along which the sum is performed. If
:attr:`None`, sum all elements of :attr:`input` and return a :attr:`None`, sum all elements of :attr:`input` and return a
Tensor variable with a single element, otherwise must be in the Tensor variable with a single element, otherwise must be in the
range :math:`[-rank(input), rank(input))`. If :math:`dim < 0`, range :math:`[-rank(input), rank(input))`. If :math:`dim < 0`,
the dimension to reduce is :math:`rank + dim`. the dimension to reduce is :math:`rank + dim`.
keep_dim (bool): Whether to reserve the reduced dimension in the keep_dim (bool): Whether to reserve the reduced dimension in the
output Tensor. The result tensor will have one fewer dimension output Tensor. The result tensor will have one fewer dimension
than the :attr:`input` unless :attr:`keep_dim` is true. than the :attr:`input` unless :attr:`keep_dim` is true.
Returns: Returns:
@@ -1312,17 +1361,17 @@ def reduce_sum(input, dim=None, keep_dim=False):

def reduce_mean(input, dim=None, keep_dim=False):
    """
    Computes the mean of tensor elements over the given dimension.

    Args:
        input (Variable): The input variable which is a Tensor or LoDTensor.
        dim (int|None): The dimension along which the mean is computed. If
            :attr:`None`, compute the mean over all elements of :attr:`input`
            and return a Tensor variable with a single element, otherwise
            must be in the range :math:`[-rank(input), rank(input))`. If
            :math:`dim < 0`, the dimension to reduce is :math:`rank + dim`.
        keep_dim (bool): Whether to reserve the reduced dimension in the
            output Tensor. The result tensor will have one fewer dimension
            than the :attr:`input` unless :attr:`keep_dim` is true.

    Returns:
@@ -1356,22 +1405,22 @@ def reduce_mean(input, dim=None, keep_dim=False):

def reduce_max(input, dim=None, keep_dim=False):
    """
    Computes the maximum of tensor elements over the given dimension.

    Args:
        input (Variable): The input variable which is a Tensor or LoDTensor.
        dim (int|None): The dimension along which the maximum is computed.
            If :attr:`None`, compute the maximum over all elements of
            :attr:`input` and return a Tensor variable with a single element,
            otherwise must be in the range :math:`[-rank(input), rank(input))`.
            If :math:`dim < 0`, the dimension to reduce is :math:`rank + dim`.
        keep_dim (bool): Whether to reserve the reduced dimension in the
            output Tensor. The result tensor will have one fewer dimension
            than the :attr:`input` unless :attr:`keep_dim` is true.

    Returns:
        Variable: The reduced Tensor variable.

    Examples:
        .. code-block:: python
@@ -1400,22 +1449,22 @@ def reduce_max(input, dim=None, keep_dim=False):

def reduce_min(input, dim=None, keep_dim=False):
    """
    Computes the minimum of tensor elements over the given dimension.

    Args:
        input (Variable): The input variable which is a Tensor or LoDTensor.
        dim (int|None): The dimension along which the minimum is computed.
            If :attr:`None`, compute the minimum over all elements of
            :attr:`input` and return a Tensor variable with a single element,
            otherwise must be in the range :math:`[-rank(input), rank(input))`.
            If :math:`dim < 0`, the dimension to reduce is :math:`rank + dim`.
        keep_dim (bool): Whether to reserve the reduced dimension in the
            output Tensor. The result tensor will have one fewer dimension
            than the :attr:`input` unless :attr:`keep_dim` is true.

    Returns:
        Variable: The reduced Tensor variable.

    Examples:
        .. code-block:: python
...
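reduce_mean, reduce_max and reduce_min share reduce_sum's signature, so one
sketch covers all three (shapes invented for the example):

    .. code-block:: python

        import paddle.v2.fluid as fluid

        x = fluid.layers.data(name='x', shape=[2, 4], dtype='float32',
                              append_batch_size=False)
        avg = fluid.layers.reduce_mean(x, dim=1)                # shape [2]
        mx = fluid.layers.reduce_max(x, dim=-1, keep_dim=True)  # shape [2, 1]
        mn = fluid.layers.reduce_min(x)                         # single element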
from ..registry import register_layer

-__all__ = [
-    'mean', 'mul', 'dropout', 'reshape', 'sigmoid', 'scale', 'transpose',
-    'sigmoid_cross_entropy_with_logits', 'elementwise_add', 'elementwise_div',
-    'elementwise_sub', 'elementwise_mul', 'clip', 'abs', 'sequence_softmax'
-]
+__activations__ = [
+    'abs', 'tanh', 'sigmoid', 'relu', 'sqrt', 'ceil', 'floor', 'log', 'round'
+]
+
+__all__ = [
+    'mean',
+    'mul',
+    'dropout',
+    'reshape',
+    'scale',
+    'transpose',
+    'sigmoid_cross_entropy_with_logits',
+    'elementwise_add',
+    'elementwise_div',
+    'elementwise_sub',
+    'elementwise_mul',
+    'clip',
+    'sequence_softmax',
+] + __activations__

for _OP in set(__all__):
    globals()[_OP] = register_layer(_OP)
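Once this loop has run, every name in __all__ is a generated layer function; a
short sketch of what that enables (the keyword x= is assumed from how other
registered ops, e.g. sequence_softmax, are called elsewhere in this commit):

    .. code-block:: python

        import paddle.v2.fluid as fluid

        data = fluid.layers.data(name='d', shape=[8], dtype='float32')
        # 'sigmoid' and 'abs' come in via __activations__; nobody writes them
        # by hand, register_layer generates them from the op registry.
        y = fluid.layers.sigmoid(x=data)
        z = fluid.layers.abs(x=y)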
@@ -6,7 +6,7 @@ from framework import unique_name, program_guard
from initializer import Constant
from layer_helper import LayerHelper
from regularizer import append_regularization_ops
-from clip import append_gradient_clip_ops
+from clip import append_gradient_clip_ops, error_clip_callback

__all__ = ['SGD', 'Momentum', 'Adagrad', 'Adam', 'Adamax', 'DecayedAdagrad']
@@ -197,7 +197,8 @@ class Optimizer(object):
        This method combines interface `append_backward()` and
        `create_optimization_pass()` into one.
        """
-        params_grads = append_backward(loss, parameter_list, no_grad_set)
+        params_grads = append_backward(loss, parameter_list, no_grad_set,
+                                       error_clip_callback)

        params_grads = append_gradient_clip_ops(params_grads)
...
@@ -4,6 +4,7 @@ import numpy as np
import paddle.v2 as paddle
import paddle.v2.dataset.conll05 as conll05
import paddle.v2.fluid as fluid
+import time

word_dict, verb_dict, label_dict = conll05.get_dict()
word_dict_len = len(word_dict)
@@ -160,7 +161,8 @@ def main():
        paddle.reader.shuffle(
            paddle.dataset.conll05.test(), buf_size=8192),
        batch_size=BATCH_SIZE)
-    place = fluid.CPUPlace()
+    #place = fluid.CPUPlace()
+    place = fluid.CUDAPlace(0)
    feeder = fluid.DataFeeder(
        feed_list=[
            word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, predicate, mark, target
@@ -174,6 +176,7 @@ def main():
    embedding_param.set(
        load_parameter(conll05.get_embedding(), word_dict_len, word_dim), place)

+    start_time = time.time()
    batch_id = 0
    for pass_id in xrange(PASS_NUM):
        chunk_evaluator.reset(exe)
@@ -191,6 +194,9 @@ def main():
                          f1_score) + " pass_precision:" + str(
                              pass_precision) + " pass_recall:" + str(pass_recall)
                      + " pass_f1_score:" + str(pass_f1_score))
+                if batch_id != 0:
+                    print("second per batch: " + str((time.time() - start_time)
+                                                     / batch_id))

                # exit early for CI
                exit(0)
...
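The "second per batch" figure printed above is the running average since
start_time; converting it into throughput is a one-liner (a hypothetical
helper, not part of this diff):

    .. code-block:: python

        def samples_per_second(seconds_per_batch, batch_size):
            """Invert the printed seconds-per-batch into samples/second."""
            return batch_size / seconds_per_batch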
@@ -14,6 +14,7 @@ hidden1 = fluid.layers.fc(input=image,
                          param_attr=fluid.ParamAttr(
                              regularizer=regularizer,
                              clip=fluid.clip.ClipByValue(10)))

hidden2 = fluid.layers.fc(input=hidden1,
                          size=64,
                          act='relu',
@@ -73,5 +74,9 @@ for pass_id in range(PASS_NUM):
                  + " test_acc=" + str(test_pass_acc))

        if test_pass_acc > 0.7:
+            fluid.io.save_inference_model(
+                "./recognize_digits_mlp.inference.model/", ["x"], [predict],
+                exe)
            exit(0)
exit(1)
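The saved directory can be read back with fluid.io's matching load call; a
minimal sketch (the directory name comes from the diff above, the feed tensor
is hypothetical):

    .. code-block:: python

        import paddle.v2.fluid as fluid

        place = fluid.CPUPlace()
        exe = fluid.Executor(place)
        [infer_prog, feed_names, fetch_targets] = fluid.io.load_inference_model(
            "./recognize_digits_mlp.inference.model/", exe)
        results = exe.run(infer_prog,
                          feed={feed_names[0]: x_tensor},  # hypothetical input
                          fetch_list=fetch_targets)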
from __future__ import print_function
import numpy as np
import paddle.v2 as paddle
import paddle.v2.fluid as fluid
import os

PASS_NUM = 100
EMBED_SIZE = 32
HIDDEN_SIZE = 256
N = 5
BATCH_SIZE = 32
IS_SPARSE = True
TRAINERS = 2

word_dict = paddle.dataset.imikolov.build_dict()
dict_size = len(word_dict)

first_word = fluid.layers.data(name='firstw', shape=[1], dtype='int64')
second_word = fluid.layers.data(name='secondw', shape=[1], dtype='int64')
third_word = fluid.layers.data(name='thirdw', shape=[1], dtype='int64')
forth_word = fluid.layers.data(name='forthw', shape=[1], dtype='int64')
next_word = fluid.layers.data(name='nextw', shape=[1], dtype='int64')

embed_first = fluid.layers.embedding(
    input=first_word,
    size=[dict_size, EMBED_SIZE],
    dtype='float32',
    is_sparse=IS_SPARSE,
    param_attr='shared_w')
embed_second = fluid.layers.embedding(
    input=second_word,
    size=[dict_size, EMBED_SIZE],
    dtype='float32',
    is_sparse=IS_SPARSE,
    param_attr='shared_w')
embed_third = fluid.layers.embedding(
    input=third_word,
    size=[dict_size, EMBED_SIZE],
    dtype='float32',
    is_sparse=IS_SPARSE,
    param_attr='shared_w')
embed_forth = fluid.layers.embedding(
    input=forth_word,
    size=[dict_size, EMBED_SIZE],
    dtype='float32',
    is_sparse=IS_SPARSE,
    param_attr='shared_w')

concat_embed = fluid.layers.concat(
    input=[embed_first, embed_second, embed_third, embed_forth], axis=1)
hidden1 = fluid.layers.fc(input=concat_embed, size=HIDDEN_SIZE, act='sigmoid')
predict_word = fluid.layers.fc(input=hidden1, size=dict_size, act='softmax')
cost = fluid.layers.cross_entropy(input=predict_word, label=next_word)
avg_cost = fluid.layers.mean(x=cost)
sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
optimize_ops, params_grads = sgd_optimizer.minimize(avg_cost)
train_reader = paddle.batch(
    paddle.dataset.imikolov.train(word_dict, N), BATCH_SIZE)

place = fluid.CPUPlace()
exe = fluid.Executor(place)

t = fluid.DistributeTranspiler()
# all parameter server endpoints list for splitting parameters
pserver_endpoints = os.getenv("PSERVERS")
# server endpoint for current node
current_endpoint = os.getenv("SERVER_ENDPOINT")
# run as trainer or parameter server
training_role = os.getenv("TRAINING_ROLE",
                          "TRAINER")  # get the training role: trainer/pserver
t.transpile(
    optimize_ops, params_grads, pservers=pserver_endpoints, trainers=TRAINERS)

if training_role == "PSERVER":
    if not current_endpoint:
        print("need env SERVER_ENDPOINT")
        exit(1)
    pserver_prog = t.get_pserver_program(current_endpoint, optimize_ops)
    exe.run(fluid.default_startup_program())
    exe.run(pserver_prog)
elif training_role == "TRAINER":
    feeder = fluid.DataFeeder(
        feed_list=[first_word, second_word, third_word, forth_word, next_word],
        place=place)
    exe.run(fluid.default_startup_program())

    for pass_id in range(PASS_NUM):
        for data in train_reader():
            avg_cost_np = exe.run(fluid.default_main_program(),
                                  feed=feeder.feed(data),
                                  fetch_list=[avg_cost])
            print("avg_cost_np", avg_cost_np)
            if avg_cost_np[0] < 5.0:
                exit(0)  # if avg cost less than 5.0, we think our code is good.
else:
    print("environment var TRAINING_ROLE should be TRAINER or PSERVER")
    exit(1)
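To exercise both roles on one machine, a launcher can set the environment
variables this script reads; everything below (file name, port) is made up
for illustration:

    .. code-block:: python

        import os
        import subprocess

        base = dict(os.environ, PSERVERS="127.0.0.1:6174")
        # one parameter server ...
        subprocess.Popen(
            ["python", "word2vec_dist.py"],  # hypothetical script file name
            env=dict(base, TRAINING_ROLE="PSERVER",
                     SERVER_ENDPOINT="127.0.0.1:6174"))
        # ... and TRAINERS (= 2) trainer processes
        for _ in range(2):
            subprocess.Popen(
                ["python", "word2vec_dist.py"],
                env=dict(base, TRAINING_ROLE="TRAINER"))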
import unittest
import numpy as np
+import paddle.v2.fluid.core as core
from op_test import OpTest
@@ -47,6 +49,7 @@ def conv2d_forward_naive(input, filter, group, conv_param):

class TestConv2dOp(OpTest):
    def setUp(self):
+        core.use_cuda()
        self.init_op_type()
        self.init_group()
        self.init_dilation()
@@ -167,26 +170,31 @@ class TestWithDilation(TestConv2dOp):

#----------------Conv2dCudnn----------------


class TestCudnn(TestConv2dOp):
    def init_op_type(self):
+        core.use_cudnn()
        self.op_type = "conv2d_cudnn"


class TestCudnnWithPad(TestWithPad):
    def init_op_type(self):
+        core.use_cudnn()
        self.op_type = "conv2d_cudnn"


class TestCudnnWithStride(TestWithStride):
    def init_op_type(self):
+        core.use_cudnn()
        self.op_type = "conv2d_cudnn"


class TestCudnnWithGroup(TestWithGroup):
    def init_op_type(self):
+        core.use_cudnn()
        self.op_type = "conv2d_cudnn"


class TestCudnnWith1x1(TestWith1x1):
    def init_op_type(self):
+        core.use_cudnn()
        self.op_type = "conv2d_cudnn"
...
import paddle.v2.fluid as fluid
import decorators
import unittest


class TestGetPlaces(unittest.TestCase):
    @decorators.prog_scope()
    def test_get_places(self):
        places = fluid.layers.get_places()
        cpu = fluid.CPUPlace()
        exe = fluid.Executor(cpu)
        exe.run(fluid.default_main_program())
        self.assertEqual(places.type, fluid.core.VarDesc.VarType.PLACE_LIST)


if __name__ == '__main__':
    unittest.main()
@@ -177,8 +177,8 @@ class TestBook(unittest.TestCase):
                name='x_t_data', shape=[10, 10], dtype='float32')
            x_t = layers.fc(input=x_t_data, size=10)
            prev_hidden_data = layers.data(
-               name='prev_hidden_data', shape=[10, 20], dtype='float32')
-           prev_hidden = layers.fc(input=prev_hidden_data, size=20)
+               name='prev_hidden_data', shape=[10, 30], dtype='float32')
+           prev_hidden = layers.fc(input=prev_hidden_data, size=30)
            prev_cell_data = layers.data(
                name='prev_cell', shape=[10, 30], dtype='float32')
            prev_cell = layers.fc(input=prev_cell_data, size=30)
@@ -196,6 +196,13 @@ class TestBook(unittest.TestCase):
            self.assertIsNotNone(layers.sequence_softmax(x=seq))
        print(str(program))

+    def test_get_places(self):
+        program = Program()
+        with program_guard(program):
+            x = layers.get_places(device_count=4)
+            self.assertIsNotNone(x)
+        print(str(program))
+

if __name__ == '__main__':
    unittest.main()
import unittest
import paddle.v2.fluid.layers as layers
import paddle.v2.fluid as fluid
from paddle.v2.fluid.framework import Program
from paddle.v2.fluid.executor import Executor
from paddle.v2.fluid.backward import append_backward
import numpy as np
import paddle.v2.fluid.core as core


class ParallelOpTest(unittest.TestCase):
    def setUp(self):
        x = layers.data(
            shape=[-1, 30, 40],
            dtype='float32',
            name='x',
            append_batch_size=False,
            stop_gradient=False)

        places = fluid.default_main_program().global_block().create_var()
        pd = layers.ParallelDo(places=places)

        with pd.do():
            data = pd.read_input(x)
            hidden = layers.fc(input=data, size=7)
            pd.write_output(hidden)
        data = pd()

        loss = layers.mean(x=data)
        sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
        sgd_optimizer.minimize(loss)

        exe = fluid.Executor(fluid.CPUPlace())
        exe.run(fluid.default_startup_program())
        exe.run(fluid.default_main_program(),
                feed={
                    x.name: np.random.uniform(0.1, 0.6,
                                              (20, 30, 40)).astype("float32")
                })

    def test_forward(self):
        pass


if __name__ == '__main__':
    unittest.main()
import unittest
import paddle.v2.fluid as fluid
+import paddle.v2.fluid.core as core
import numpy


class TestReorderLoDTensor(unittest.TestCase):
-    def test_reorder(self):
-        dat = fluid.layers.data(name='input', shape=[1], lod_level=2)
+    num_seq = 5
+    # [name, dim, lod_level] pair indicating data info of source and target
+    data_desc = (['input', 9, 0], ['ref', 5, 1])
+
+    @classmethod
+    def setUpClass(cls):
+        cls.set_program()
+
+    @classmethod
+    def set_program(cls):
+        dat = fluid.layers.data(
+            name=cls.data_desc[0][0], shape=[cls.data_desc[0][1]])
        dat.stop_gradient = False
-        rank_dat = fluid.layers.data(name='ref', shape=[1], lod_level=1)
+        rank_dat = fluid.layers.data(
+            name=cls.data_desc[1][0], shape=[cls.data_desc[1][1]])
        table = fluid.layers.lod_rank_table(rank_dat)
        new_dat = fluid.layers.reorder_lod_tensor_by_rank(
            x=dat, rank_table=table)
-        loss = fluid.layers.mean(x=new_dat)
+        loss = fluid.layers.reduce_sum(new_dat)
        fluid.backward.append_backward(loss=loss)
+        cls.fetch_list = [new_dat, cls.data_desc[0][0] + '@GRAD']
+
+    def run_program(self):
+        outputs = []
+        input_grads = []
+        places = [core.CPUPlace()]
+        if core.is_compile_gpu():
+            places.append(core.CUDAPlace(0))
+        for place in places:
+            self.set_inputs(place)
+            exe = fluid.Executor(place)
+            output, input_grad = exe.run(fluid.default_main_program(),
+                                         feed=self.inputs,
+                                         fetch_list=self.fetch_list,
+                                         return_numpy=False)
+            outputs.append(output)
+            input_grads.append(input_grad)
+        self.actual_outputs = outputs
+        self.actual_grads = input_grads
+
+    def set_data(self):
+        self.data = {}
+        for desc in self.data_desc:
+            data_name = desc[0]
+            data_dim = desc[1]
+            data_lod_level = desc[2]
+            data_lod = []
+            for i in range(data_lod_level):
+                lod_level_i = numpy.random.randint(
+                    low=1,
+                    high=5,
+                    size=self.num_seq if i == 0 else lod_level_i[-1])
+                lod_level_i = [0] + numpy.cumsum(lod_level_i).tolist()
+                data_lod.append(lod_level_i)
+            data_value = numpy.random.random(size=[
+                data_lod[-1][-1] if data_lod else self.num_seq, data_dim
+            ]).astype('float32')
+            self.data[data_name] = (data_value, data_lod)
+
+    def set_inputs(self, place):
+        self.inputs = {}
+        for desc in self.data_desc:
+            tensor = fluid.Tensor()
+            tensor.set(self.data[desc[0]][0], place)
+            if self.data[desc[0]][1]:
+                tensor.set_lod(self.data[desc[0]][1])
+            self.inputs[desc[0]] = tensor
+
+    def reorder(self):
+        level = 0
+        # compute the rank_table according to ref_lod
+        ref_lod = self.data[self.data_desc[1][0]][1][level]
+        rank_table = []  # list of (index, length)
+        for i in range(len(ref_lod) - 1):
+            rank_table.append((i, ref_lod[i + 1] - ref_lod[i]))
+        rank_table = sorted(rank_table, lambda x, y: y[1] - x[1])
+
+        # compute the input sequence info according to input_lod
+        input_value, input_lod = self.data[self.data_desc[0][0]]
+
+        input_table = []  # list of (offset, length, sub_lod)
+        if input_lod:
+            for i in range(len(input_lod[level]) - 1):
+                start_idx = i
+                end_idx = i + 1
+                sub_lod = []
+                for lod_level_i in input_lod[level:]:
+                    sub_lod_i = []
+                    for idx in range(start_idx, end_idx):
+                        sub_lod_i.append(lod_level_i[idx + 1] - lod_level_i[
+                            idx])
+                    sub_lod.append(sub_lod_i)
+                    start_idx = lod_level_i[start_idx]
+                    end_idx = lod_level_i[end_idx]
+                input_table.append((start_idx, end_idx - start_idx, sub_lod))
+        else:
+            input_table = [(i, 1, []) for i in range(len(rank_table))]
+
+        # reorder by rank_table
+        output_value = numpy.zeros_like(input_value)
+        output_lod = []
+        offset = 0
+        for index, length in rank_table:
+            input_seq_start = input_table[index][0]
+            input_seq_len = input_table[index][1]
+            input_seq_end = input_seq_start + input_seq_len
+            output_value[offset:offset + input_seq_len] = input_value[
+                input_seq_start:input_seq_end]
+            offset += input_seq_len
+            input_seq_sub_lod = input_table[index][2]
+            if len(output_lod) == 0:
+                output_lod = [[0] for i in input_seq_sub_lod]
+            for i, sub_lod_i in enumerate(input_seq_sub_lod):
+                for idx_sub in sub_lod_i:
+                    output_lod[i].append(output_lod[i][-1] + idx_sub)
+        return output_value, output_lod
+
+    def test_reorder_lod_tensor(self):
+        self.data_desc[0][-1] = 2  # input is lod_tensor
+        self.set_data()
+        self.run_program()
+        # check output
+        expect_output, expect_output_lod = self.reorder()
+        for actual_output in self.actual_outputs:
+            self.assertTrue(
+                numpy.allclose(
+                    numpy.array(actual_output), expect_output, atol=0.001))
+            self.assertEqual(expect_output_lod, actual_output.lod())
+        # check gradient
+        expect_grad = numpy.ones_like(self.data[self.data_desc[0][0]][0])
+        expect_grad_lod = self.data[self.data_desc[0][0]][1]
+        for actual_grad in self.actual_grads:
+            self.assertTrue(
+                numpy.allclose(
+                    numpy.array(actual_grad), expect_grad, atol=0.001))
+            self.assertEqual(expect_grad_lod, actual_grad.lod())
+
+    def test_reorder_tensor(self):
+        self.data_desc[0][-1] = 0  # input is tensor
+        self.set_data()
+        self.run_program()
+        # check output
+        expect_output, expect_output_lod = self.reorder()
+        for actual_output in self.actual_outputs:
+            self.assertTrue(
+                numpy.allclose(
+                    numpy.array(actual_output), expect_output, atol=0.001))
+            self.assertEqual(expect_output_lod, actual_output.lod())
+        # check gradient
+        expect_grad = numpy.ones_like(self.data[self.data_desc[0][0]][0])
+        expect_grad_lod = self.data[self.data_desc[0][0]][1]
+        for actual_grad in self.actual_grads:
+            self.assertTrue(
+                numpy.allclose(
+                    numpy.array(actual_grad), expect_grad, atol=0.001))
+            self.assertEqual(expect_grad_lod, actual_grad.lod())
+
+        global outputs_from_tensor_implicit_lod
+        outputs_from_tensor_implicit_lod = self.actual_outputs
-        cpu = fluid.CPUPlace()
-        exe = fluid.Executor(cpu)
-        exe.run(fluid.default_startup_program())
-
-        ref = fluid.Tensor()
-        ref_lod = [0, 3, 4, 7, 8, 14]
-        ref.set_lod([ref_lod])
-
-        ref.set(numpy.random.random(size=[14, 1]).astype('float32'), cpu)
-        input = fluid.Tensor()
-        lod_level_0 = numpy.random.randint(low=1, high=5, size=5)
-        lod_level_0 = [0] + numpy.cumsum(lod_level_0).tolist()
-        lod_level_1 = numpy.random.randint(low=1, high=5, size=lod_level_0[-1])
-        lod_level_1 = [0] + numpy.cumsum(lod_level_1).tolist()
-
-        input.set_lod([lod_level_0, lod_level_1])
-        input.set(
-            numpy.random.random(size=[lod_level_1[-1], 1]).astype('float32'),
-            cpu)
-
-        ig = exe.run(fluid.default_main_program(),
-                     feed={'input': input,
-                           'ref': ref},
-                     fetch_list=['input@GRAD'],
-                     return_numpy=False)[0]
-        self.assertAlmostEqual(numpy.array(ig).sum(), 1.0, delta=0.001)
-        self.assertEqual(input.lod(), ig.lod())
+
+        # compare outputs between LodTensors with explicit and implicit lod
+        # use the same data but set the input lod explicitly
+        input_lod = [[
+            i for i in range(len(self.data[self.data_desc[0][0]][0]) + 1)
+        ]]
+        self.inputs[self.data_desc[0][0]].set_lod(input_lod)
+        # preserve the output of LodTensor with implicit lod to compare
+        expect_output = [
+            numpy.array(actual_output) for actual_output in self.actual_outputs
+        ]
+        self.run_program()
+        for actual_output in self.actual_outputs:
+            self.assertTrue(
+                numpy.allclose(
+                    numpy.array(actual_output), expect_output, atol=0.001))


if __name__ == '__main__':
...
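A concrete pass through the rank-table logic in reorder() above (lod values
invented for the sketch): sequences are ordered by descending length, ties
keeping their original order.

    .. code-block:: python

        ref_lod = [0, 3, 4, 7]  # three sequences of lengths 3, 1, 3
        rank_table = [(i, ref_lod[i + 1] - ref_lod[i])
                      for i in range(len(ref_lod) - 1)]  # [(0, 3), (1, 1), (2, 3)]
        rank_table.sort(key=lambda t: -t[1])             # stable descending sort
        print(rank_table)                                # [(0, 3), (2, 3), (1, 1)]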
import unittest
import numpy as np
from op_test import OpTest
+from test_softmax_op import stable_softmax


-def stable_softmax(x):
-    """Compute the softmax of vector x in a numerically stable way."""
-    shiftx = x - np.max(x).clip(-64.)
-    exps = np.exp(shiftx)
-    return exps / np.sum(exps)


class TestSequenceSoftmaxOp(OpTest):
...
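The imported stable_softmax shifts by the maximum before exponentiating; a
quick numeric check of why that matters (values invented):

    .. code-block:: python

        import numpy as np

        x = np.array([1000., 1001., 1002.])
        naive = np.exp(x) / np.sum(np.exp(x))  # overflows to nan
        shifted = np.exp(x - x.max()) / np.sum(np.exp(x - x.max()))
        print(shifted)  # [0.09003057 0.24472847 0.66524096]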
import sys
import unittest
import numpy as np
from op_test import OpTest
from test_softmax_op import stable_softmax


class CTCForward(object):
    def __init__(self, softmax, softmax_lod, labels, labels_lod, blank,
                 norm_by_times):
        self.softmax = softmax
        self.softmax_lod = softmax_lod
        assert labels.shape[1] == 1
        self.labels = labels
        self.labels_lod = labels_lod
        self.blank = blank
        self.norm_by_times = norm_by_times

        self.level = 0
        self.num_classes = softmax.shape[1]
        self.batch_size = len(softmax_lod[self.level]) - 1
        assert self.batch_size == len(labels_lod[self.level]) - 1

        self.loss = np.zeros([self.batch_size, 1], dtype="float32")
        self.gradient = np.zeros(self.softmax.shape, dtype="float32")

        # float64
        self.EXP_MAX = sys.float_info.max
        self.EXP_MIN = sys.float_info.min
        self.LOG_ZERO = np.log(self.EXP_MIN)
        self.LOG_INFINITY = np.log(self.EXP_MAX)

    def safe_exp(self, x):
        if x <= self.LOG_ZERO:
            return 0.0
        if x >= self.LOG_INFINITY:
            return self.EXP_MAX
        return np.exp(x)

    def safe_log(self, x):
        if x <= self.EXP_MIN:
            return self.LOG_ZERO
        return np.log(x)

    # x = lna and y = lnb are in log scale, ln(a / b) = lna - lnb
    def log_div(self, x, y):
        res = x - y
        if res <= self.LOG_ZERO:
            return self.LOG_ZERO
        if res >= self.LOG_INFINITY:
            return self.LOG_INFINITY
        return res

    # x = lna and y = lnb are in log scale, ln(a * b) = lna + lnb
    def log_mul(self, x, y):
        res = x + y
        if res <= self.LOG_ZERO:
            return self.LOG_ZERO
        if res >= self.LOG_INFINITY:
            return self.LOG_INFINITY
        return res

    # x = lna and y = lnb are in log scale,
    # ln(a + b) = lna + ln(1 + exp(lnb - lna)), where b > a
    def log_add(self, x, y):
        if x < y:
            t = y
            y = x
            x = t
        return x + self.safe_log(1 + self.safe_exp(y - x))

    def segment_range(self, time, total_times, total_segments):
        start = max(0, total_segments - (2 * (total_times - time)))
        end = min(total_segments, 2 * (time + 1))
        return start, end

    def forward_a_sequence(self, softmax_a_sequence, labels_a_sequence):
        total_times = softmax_a_sequence.shape[0]
        total_segments = labels_a_sequence.shape[0] * 2 + 1

        required_times = labels_a_sequence.shape[0]
        old_label = -1
        for i in range(labels_a_sequence.shape[0]):
            # two contiguous labels with the same value
            if labels_a_sequence[i, 0] == old_label:
                required_times = required_times + 1
            old_label = labels_a_sequence[i, 0]

        if total_times < required_times:
            return 0

        # calculate the forward and backward variables,
        # reference Chapter 7.3 of "Alex Graves, Supervised Sequence
        # Labelling with Recurrent Neural Networks"
        log_acts = np.zeros([total_times, self.num_classes], dtype="float32")
        for i in range(total_times):
            for j in range(self.num_classes):
                log_acts[i, j] = self.safe_log(softmax_a_sequence[i, j])

        # calculate the forward variables
        forward_vars = np.zeros([total_times, total_segments], dtype="float32")
        for i in range(total_times):
            for j in range(total_segments):
                forward_vars[i, j] = self.LOG_ZERO

        for i in range(total_times):
            # dp initialization at t0
            if i == 0:
                forward_vars[i, 0] = log_acts[0, self.blank]
                if total_segments > 1:
                    forward_vars[i, 1] = log_acts[0, labels_a_sequence[i, 0]]
                continue

            # dp from t1
            start, end = self.segment_range(i, total_times, total_segments)
            for k in range(end - start):
                j = k + start
                if j & 1 == 1:
                    label_idx = j / 2
                    label_val = labels_a_sequence[label_idx, 0]
                    fv = self.log_add(forward_vars[i - 1, j],
                                      forward_vars[i - 1, j - 1])
                    if j > 1 and label_val != labels_a_sequence[label_idx - 1,
                                                                0]:
                        fv = self.log_add(fv, forward_vars[i - 1, j - 2])
                    fv = self.log_mul(fv, log_acts[i, label_val])
                else:
                    fv = forward_vars[i - 1, j]
                    if j > 0:
                        fv = self.log_add(fv, forward_vars[i - 1, j - 1])
                    fv = self.log_mul(fv, log_acts[i, self.blank])
                forward_vars[i, j] = fv

        # sum the last two values as log_prob
        log_prob = forward_vars[total_times - 1, total_segments - 1]
        if total_segments > 1:
            log_prob = self.log_add(
                log_prob, forward_vars[total_times - 1, total_segments - 2])

        return -log_prob

    def forward(self):
        for i in range(self.batch_size):
            softmax_start_i = self.softmax_lod[self.level][i]
            softmax_end_i = self.softmax_lod[self.level][i + 1]
            labels_start_i = self.labels_lod[self.level][i]
            labels_end_i = self.labels_lod[self.level][i + 1]

            softmax_a_sequence = self.softmax[softmax_start_i:softmax_end_i, :]
            labels_a_sequence = self.labels[labels_start_i:labels_end_i, :]
            self.loss[i] = self.forward_a_sequence(softmax_a_sequence,
                                                   labels_a_sequence)
        return self.loss


class TestWarpCTCOp(OpTest):
    def setUp(self):
        self.op_type = "warpctc"

        batch_size = 4
        num_classes = 8
        logits_lod = [[0, 4, 5, 8, 11]]
        logits = np.random.uniform(0.1, 1.0,
                                   [11, num_classes]).astype("float32")
        softmax = np.apply_along_axis(stable_softmax, 1, logits)
        labels_lod = [[0, 3, 4, 8, 12]]
        # labels should not be blank
        labels = np.random.randint(0, num_classes - 1, [12, 1], dtype="int32")

        blank = num_classes - 1
        norm_by_times = False

        ctc = CTCForward(softmax, logits_lod, labels, labels_lod, blank,
                         norm_by_times)
        loss = ctc.forward()

        max_sequence_length = 0
        for i in range(batch_size):
            max_sequence_length = max(max_sequence_length,
                                      logits_lod[0][i + 1] - logits_lod[0][i])
        gradient = np.zeros(
            [max_sequence_length, batch_size, num_classes], dtype="float32")

        self.inputs = {
            "Logits": (logits, logits_lod),
            "Label": (labels, labels_lod)
        }
        self.outputs = {"Loss": loss}
        self.attrs = {"blank": blank, "norm_by_times": norm_by_times}

    def test_check_output(self):
        self.check_output()

    # def test_check_grad(self):
    #     self.outputs["WarpCTCGrad"] = None
    #     self.check_grad(["Logits"], "Loss", max_relative_error=0.01)


if __name__ == "__main__":
    unittest.main()
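For reference, the dynamic programme in forward_a_sequence is the standard CTC
forward recursion; with :math:`\alpha_t(s)` the forward variable, :math:`l'`
the label sequence with blanks interleaved, :math:`b` the blank, and
:math:`y^t_k` the softmax output, it computes (in log space):

    .. math::

        \alpha_t(s) =
        \begin{cases}
        \big(\alpha_{t-1}(s) + \alpha_{t-1}(s-1)\big)\, y^t_{l'_s}
            & \text{if } l'_s = b \text{ or } l'_s = l'_{s-2} \\
        \big(\alpha_{t-1}(s) + \alpha_{t-1}(s-1) + \alpha_{t-1}(s-2)\big)\, y^t_{l'_s}
            & \text{otherwise}
        \end{cases}

The sequence loss is then :math:`-\log\big(\alpha_T(|l'|) + \alpha_T(|l'|-1)\big)`,
which matches the "sum the last two values" step above.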