提交 b3a11fdf 编写于 作者: F fengjiayi

Merge branch 'rm_reader_HasNext' into dev_double_buffer_for_cpp_reader

...@@ -56,7 +56,7 @@ script: ...@@ -56,7 +56,7 @@ script:
export DEPLOY_DOCS_SH=https://raw.githubusercontent.com/PaddlePaddle/PaddlePaddle.org/master/scripts/deploy/deploy_docs.sh export DEPLOY_DOCS_SH=https://raw.githubusercontent.com/PaddlePaddle/PaddlePaddle.org/master/scripts/deploy/deploy_docs.sh
export DOCS_DIR=`pwd` export DOCS_DIR=`pwd`
cd .. cd ..
curl $DEPLOY_DOCS_SH | bash -s $CONTENT_DEC_PASSWD $TRAVIS_BRANCH $DOCS_DIR $DOCS_DIR/build/doc/v2 curl $DEPLOY_DOCS_SH | bash -s $CONTENT_DEC_PASSWD $TRAVIS_BRANCH $DOCS_DIR $DOCS_DIR/build/doc/
notifications: notifications:
email: email:
on_success: change on_success: change
......
...@@ -77,7 +77,8 @@ IF(NOT ${CBLAS_FOUND}) ...@@ -77,7 +77,8 @@ IF(NOT ${CBLAS_FOUND})
INSTALL_DIR ${CBLAS_INSTALL_DIR} INSTALL_DIR ${CBLAS_INSTALL_DIR}
BUILD_IN_SOURCE 1 BUILD_IN_SOURCE 1
BUILD_COMMAND ${CMAKE_MAKE_PROGRAM} ${COMMON_ARGS} ${OPTIONAL_ARGS} BUILD_COMMAND ${CMAKE_MAKE_PROGRAM} ${COMMON_ARGS} ${OPTIONAL_ARGS}
INSTALL_COMMAND ${CMAKE_MAKE_PROGRAM} install NO_SHARED=1 NO_LAPACK=1 PREFIX=<INSTALL_DIR> INSTALL_COMMAND ${CMAKE_MAKE_PROGRAM} install NO_SHARED=1 NO_LAPACK=1 PREFIX=<INSTALL_DIR>
&& rm -r ${CBLAS_INSTALL_DIR}/lib/cmake ${CBLAS_INSTALL_DIR}/lib/pkgconfig
UPDATE_COMMAND "" UPDATE_COMMAND ""
CONFIGURE_COMMAND "" CONFIGURE_COMMAND ""
) )
...@@ -100,11 +101,6 @@ IF(NOT ${CBLAS_FOUND}) ...@@ -100,11 +101,6 @@ IF(NOT ${CBLAS_FOUND})
\"${CBLAS_INSTALL_DIR}/lib -> ${CMAKE_INSTALL_PREFIX}/${TMP_INSTALL_DIR}\" \"${CBLAS_INSTALL_DIR}/lib -> ${CMAKE_INSTALL_PREFIX}/${TMP_INSTALL_DIR}\"
)" )"
) )
INSTALL(CODE "execute_process(
COMMAND rm -r ${CMAKE_INSTALL_PREFIX}/${TMP_INSTALL_DIR}/cmake
${CMAKE_INSTALL_PREFIX}/${TMP_INSTALL_DIR}/pkgconfig
)"
)
ENDIF() ENDIF()
ENDIF(NOT ${CBLAS_FOUND}) ENDIF(NOT ${CBLAS_FOUND})
......
...@@ -186,7 +186,9 @@ function(cc_library TARGET_NAME) ...@@ -186,7 +186,9 @@ function(cc_library TARGET_NAME)
add_library(${TARGET_NAME} SHARED ${cc_library_SRCS}) add_library(${TARGET_NAME} SHARED ${cc_library_SRCS})
else() else()
add_library(${TARGET_NAME} STATIC ${cc_library_SRCS}) add_library(${TARGET_NAME} STATIC ${cc_library_SRCS})
find_fluid_modules(${TARGET_NAME})
endif() endif()
if(cc_library_DEPS) if(cc_library_DEPS)
# Don't need link libwarpctc.so # Don't need link libwarpctc.so
if("${cc_library_DEPS};" MATCHES "warpctc;") if("${cc_library_DEPS};" MATCHES "warpctc;")
...@@ -263,7 +265,8 @@ function(nv_library TARGET_NAME) ...@@ -263,7 +265,8 @@ function(nv_library TARGET_NAME)
if (nv_library_SHARED OR nv_library_shared) # build *.so if (nv_library_SHARED OR nv_library_shared) # build *.so
cuda_add_library(${TARGET_NAME} SHARED ${nv_library_SRCS}) cuda_add_library(${TARGET_NAME} SHARED ${nv_library_SRCS})
else() else()
cuda_add_library(${TARGET_NAME} STATIC ${nv_library_SRCS}) cuda_add_library(${TARGET_NAME} STATIC ${nv_library_SRCS})
find_fluid_modules(${TARGET_NAME})
endif() endif()
if (nv_library_DEPS) if (nv_library_DEPS)
add_dependencies(${TARGET_NAME} ${nv_library_DEPS}) add_dependencies(${TARGET_NAME} ${nv_library_DEPS})
......
set_property(GLOBAL PROPERTY FLUID_MODULES "")
# find all fluid modules is used for paddle fluid static library
function(find_fluid_modules TARGET_NAME)
get_filename_component(__target_path ${TARGET_NAME} ABSOLUTE)
string(FIND "${__target_path}" "fluid" pos)
if(pos GREATER 1)
get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES)
set(fluid_modules ${fluid_modules} ${TARGET_NAME})
set_property(GLOBAL PROPERTY FLUID_MODULES "${fluid_modules}")
endif()
endfunction(find_fluid_modules)
# make package for paddle fluid shared and static library # make package for paddle fluid shared and static library
function(copy TARGET) function(copy TARGET)
set(options "") set(options "")
set(oneValueArgs "") set(oneValueArgs "")
set(multiValueArgs SRCS DSTS DEPS) set(multiValueArgs SRCS DSTS DEPS)
cmake_parse_arguments(copy_lib "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) cmake_parse_arguments(copy_lib "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
set(inference_lib_dist_dep ${TARGET} ${inference_lib_dist_dep} PARENT_SCOPE)
list(LENGTH copy_lib_SRCS copy_lib_SRCS_len) list(LENGTH copy_lib_SRCS copy_lib_SRCS_len)
list(LENGTH copy_lib_DSTS copy_lib_DSTS_len) list(LENGTH copy_lib_DSTS copy_lib_DSTS_len)
...@@ -42,13 +55,21 @@ copy(glog_lib ...@@ -42,13 +55,21 @@ copy(glog_lib
DSTS ${dst_dir} ${dst_dir}/lib DSTS ${dst_dir} ${dst_dir}/lib
) )
IF(NOT PROTOBUF_FOUND) if(NOT PROTOBUF_FOUND)
set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/protobuf") set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/protobuf")
copy(protobuf_lib copy(protobuf_lib
SRCS ${PROTOBUF_INCLUDE_DIR} ${PROTOBUF_LITE_LIBRARY} SRCS ${PROTOBUF_INCLUDE_DIR} ${PROTOBUF_LIBRARY}
DSTS ${dst_dir} ${dst_dir}/lib DSTS ${dst_dir} ${dst_dir}/lib
) )
ENDIF(NOT PROTOBUF_FOUND) endif()
if(NOT CBLAS_FOUND)
set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/openblas")
copy(openblas_lib
SRCS ${CBLAS_INSTALL_DIR}/lib ${CBLAS_INSTALL_DIR}/include
DSTS ${dst_dir} ${dst_dir}
)
endif()
# paddle fluid module # paddle fluid module
set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid") set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid")
...@@ -66,8 +87,8 @@ copy(memory_lib ...@@ -66,8 +87,8 @@ copy(memory_lib
) )
set(module "inference") set(module "inference")
copy(inference_lib DEPENDS paddle_fluid_shared copy(inference_lib DEPS paddle_fluid_shared paddle_fluid
SRCS ${src_dir}/${module}/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_fluid.so SRCS ${src_dir}/${module}/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_fluid.*
DSTS ${dst_dir}/${module} ${dst_dir}/${module} DSTS ${dst_dir}/${module} ${dst_dir}/${module}
) )
...@@ -83,6 +104,4 @@ copy(string_lib ...@@ -83,6 +104,4 @@ copy(string_lib
DSTS ${dst_dir}/${module} ${dst_dir}/${module}/tinyformat DSTS ${dst_dir}/${module} ${dst_dir}/${module}/tinyformat
) )
add_custom_target(inference_lib_dist DEPENDS add_custom_target(inference_lib_dist DEPENDS ${inference_lib_dist_dep})
inference_lib framework_lib memory_lib platform_lib string_lib
gflags_lib glog_lib protobuf_lib eigen3_lib)
...@@ -20,9 +20,8 @@ class ReaderBase { ...@@ -20,9 +20,8 @@ class ReaderBase {
PADDLE_ENFORCE(!shapes_.empty()); PADDLE_ENFORCE(!shapes_.empty());
} }
// Read the next batch of data. (A 'batch' can be only one instance) // Read the next batch of data. (A 'batch' can be only one instance)
// If the next batch doesn't exist, the '*out' will be an empty std::vector.
virtual void ReadNext(std::vector<LoDTensor>* out) = 0; virtual void ReadNext(std::vector<LoDTensor>* out) = 0;
// Show whether the next bacth exists.
virtual bool HasNext() const = 0;
// Reinitialize the reader and read the file from the begin. // Reinitialize the reader and read the file from the begin.
virtual void ReInit() = 0; virtual void ReInit() = 0;
......
## how to use timeline tool to do profile
1. Add `with profiler.profiler(...)` to the main training loop. After run, the code will generate a profile record file `/tmp/profile`. **Warning**: Please do not run too many batches when use profiler to record timeline information, for the profile record will grow with the batch number.
```python
with profiler.profiler('All', 'total', '/tmp/profile') as prof:
for pass_id in range(pass_num):
for batch_id, data in enumerate(train_reader()):
exe.run(fluid.default_main_program(),
feed=feeder.feed(data),
fetch_list=[],
use_program_cache=True)
...
```
1. Run `python paddle/tools/timeline.py` to process `/tmp/profile`, it will generate another
file `/tmp/timeline` by default. You can change the path by cmd parameter, please take a look at
[timeline.py](https://github.com/PaddlePaddle/Paddle/blob/develop/tools/timeline.py) for details.
1. Open chrome and visit <chrome://tracing/>, use `load` button to load the generated `timeline` file.
![chrome tracing](./tracing.jpeg)
1. The resulting timeline should be like:
![chrome timeline](./timeline.jpeg)
...@@ -2,17 +2,17 @@ ...@@ -2,17 +2,17 @@
Examples: https://github.com/PaddlePaddle/Paddle/tree/develop/python/paddle/fluid/tests/book Examples: https://github.com/PaddlePaddle/Paddle/tree/develop/python/paddle/fluid/tests/book
Core: https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/framework Core: https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/framework
Operator: https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators Operator: https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/operators
Memory: https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/memory Memory: https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/memory
Platform: https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/platform Platform: https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/platform
# Compile Time # Compile Time
The following **defines** the NN. The definition goes into this [protocol buffer](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto). The following **defines** the NN. The definition goes into this [protocol buffer](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/framework.proto).
```python ```python
x = fluid.layers.data(name='x', shape=[13], dtype='float32') x = fluid.layers.data(name='x', shape=[13], dtype='float32')
...@@ -29,10 +29,10 @@ sgd_optimizer.minimize(avg_cost) ...@@ -29,10 +29,10 @@ sgd_optimizer.minimize(avg_cost)
- Variables: `x`, `y`, `y_predict`, `cost` and `avg_cost`. [Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/framework.py#) - Variables: `x`, `y`, `y_predict`, `cost` and `avg_cost`. [Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/framework.py#)
- Layers: `fluid.layers.data`, `fluid.layers.fc` and `fluid.layers.mean` are layers. [Python](https://github.com/PaddlePaddle/Paddle/tree/develop/python/paddle/fluid/layers) - Layers: `fluid.layers.data`, `fluid.layers.fc` and `fluid.layers.mean` are layers. [Python](https://github.com/PaddlePaddle/Paddle/tree/develop/python/paddle/fluid/layers)
- Every Layer has one or more operators and variables/parameters - Every Layer has one or more operators and variables/parameters
- All the operators are defined at [`paddle/operators/`](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators). Other worth-looking files: - All the operators are defined at [`paddle/fluid/operators/`](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/operators). Other worth-looking files:
- Base class: [`paddle/framework/operator.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/operator.h) - Base class: [`paddle/fluid/framework/operator.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/operator.h)
- Operator Registration: [`paddle/framework/op_registry.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/op_registry.h) - Operator Registration: [`paddle/fluid/framework/op_registry.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/op_registry.h)
- Operator Lookup: [`paddle/framework/op_info.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/op_info.h) - Operator Lookup: [`paddle/fluid/framework/op_info.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/op_info.h)
- Optimizer: `fluid.optimizer.SGD`. It does the following - Optimizer: `fluid.optimizer.SGD`. It does the following
- Add backward operators. [[Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/backward.py)] - Add backward operators. [[Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/backward.py)]
- Add optimizer operators. [[Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/optimizer.py)] - Add optimizer operators. [[Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/optimizer.py)]
...@@ -55,13 +55,13 @@ exe.run(fluid.default_main_program(), ...@@ -55,13 +55,13 @@ exe.run(fluid.default_main_program(),
fetch_list=[avg_cost]) fetch_list=[avg_cost])
``` ```
- Place: `place`. one of CPU, GPU or FPGA. [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/place.h) - Place: `place`. one of CPU, GPU or FPGA. [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/platform/place.h)
- The device handle are at [paddle/platform/device_context.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/device_context.h) - The device handle are at [paddle/fluid/platform/device_context.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/platform/device_context.h)
- Executor: `fluid.Executor(place)`. [[Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/executor.py), [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/executor.cc)] - Executor: `fluid.Executor(place)`. [[Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/executor.py), [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/executor.cc)]
- Feeds the data: `feed=feeder.feed(data)` - Feeds the data: `feed=feeder.feed(data)`
- Evaluates all the operators - Evaluates all the operators
- Fetches the result: `fetch_list=[avg_cost]` - Fetches the result: `fetch_list=[avg_cost]`
- Other worth looking files: - Other worth looking files:
- Scope: [paddle/framework/scope.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/scope.h). Where all the variables live - Scope: [paddle/fluid/framework/scope.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/scope.h). Where all the variables live
- Variable: [paddle/framework/variable.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/variable.h). Where all the data (most likely tensors) live - Variable: [paddle/fluid/framework/variable.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/variable.h). Where all the data (most likely tensors) live
- Tensor: [paddle/framework/tensor.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/tensor.h). Where we allocate memory through [`paddle/memory/`](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/memory) - Tensor: [paddle/fluid/framework/tensor.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/tensor.h). Where we allocate memory through [`paddle/fluid/memory/`](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/memory)
...@@ -39,7 +39,7 @@ PaddlePaddle可以使用常用的Python包管理工具 ...@@ -39,7 +39,7 @@ PaddlePaddle可以使用常用的Python包管理工具
"cpu_avx_mkl", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddle.tgz>`_" "cpu_avx_mkl", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddle.tgz>`_"
"cpu_avx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "暂无" "cpu_avx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "暂无"
"cpu_noavx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "暂无" "cpu_noavx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddle.tgz>`_"
"cuda7.5_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz>`_" "cuda7.5_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
"cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz>`_" "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
"cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddle.tgz>`_" "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
......
...@@ -42,7 +42,7 @@ If the links below shows up the login form, just click "Log in as guest" to star ...@@ -42,7 +42,7 @@ If the links below shows up the login form, just click "Log in as guest" to star
"cpu_avx_mkl", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddle.tgz>`_" "cpu_avx_mkl", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddle.tgz>`_"
"cpu_avx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "Not Available" "cpu_avx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "Not Available"
"cpu_noavx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "Not Available" "cpu_noavx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddle.tgz>`_"
"cuda7.5_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz>`_" "cuda7.5_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
"cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz>`_" "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
"cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddle.tgz>`_" "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
......
进阶使用 进阶使用
======== ========
PaddlePaddle支持用户灵活地设置各种命令行参数,以实现对模型训练或预测流程的控制。使用方式请参考:
.. toctree:: .. toctree::
:maxdepth: 1 :maxdepth: 1
cmd_parameter/index_cn.rst cmd_parameter/index_cn.rst
PaddlePaddle支持在fabric集群、MPI集群、kubernetes集群上分布式训练任务,具体环境配置和使用说明请参考:
.. toctree::
:maxdepth: 1
cluster/index_cn.rst cluster/index_cn.rst
PaddlePaddle提供了用于预测的C-API,关于C-API的使用,我们提供了如下指南:
.. toctree::
:maxdepth: 1
capi/index_cn.rst capi/index_cn.rst
PaddlePaddle支持多种灵活和高效的循环神经网络,具体配置使用方式请参考:
.. toctree::
:maxdepth: 1
rnn/index_cn.rst rnn/index_cn.rst
关于如何使用内置的定时工具、nvprof 或 nvvp 来运行性能分析和调优,请参考:
.. toctree::
:maxdepth: 1
optimization/gpu_profiling_cn.rst optimization/gpu_profiling_cn.rst
RNN模型 RNN模型
=========== ===========
循环神经网络(RNN)是对序列数据建模的重要工具。PaddlePaddle提供了灵活的接口以支持复杂循环神经网络的构建。
这里将分为以下四个部分详细介绍如何使用PaddlePaddle搭建循环神经网络。
第一部分由浅入深的展示了使用PaddlePaddle搭建循环神经网络的全貌:首先以简单的循环神经网络(vanilla RNN)为例,
说明如何封装配置循环神经网络组件;然后更进一步的通过序列到序列(sequence to sequence)模型,逐步讲解如何构建完整而复杂的循环神经网络模型。
.. toctree:: .. toctree::
:maxdepth: 1 :maxdepth: 1
rnn_config_cn.rst rnn_config_cn.rst
Recurrent Group是PaddlePaddle中实现复杂循环神经网络的关键,第二部分阐述了PaddlePaddle中Recurrent Group的相关概念和原理,
对Recurrent Group接口进行了详细说明。另外,对双层RNN(对应的输入为双层序列)及Recurrent Group在其中的使用进行了介绍。
.. toctree::
:maxdepth: 1
recurrent_group_cn.md recurrent_group_cn.md
第三部分对双层序列进行了解释说明,列出了PaddlePaddle中支持双层序列作为输入的Layer,并对其使用进行了逐一介绍。
.. toctree::
:maxdepth: 1
hierarchical_layer_cn.rst hierarchical_layer_cn.rst
第四部分以PaddlePaddle的双层RNN单元测试中的网络配置为示例,辅以效果相同的单层RNN网络配置作为对比,讲解了多种情况下双层RNN的使用。
.. toctree::
:maxdepth: 1
hrnn_rnn_api_compare_cn.rst hrnn_rnn_api_compare_cn.rst
...@@ -53,6 +53,7 @@ struct CastDataType { ...@@ -53,6 +53,7 @@ struct CastDataType {
auto* context = static_cast<const platform::CUDADeviceContext*>(ctx_); auto* context = static_cast<const platform::CUDADeviceContext*>(ctx_);
trans(*context, in_begin, in_end, out_begin, trans(*context, in_begin, in_end, out_begin,
CastDataTypeFunctor<InType, OutType>()); CastDataTypeFunctor<InType, OutType>());
context->Wait();
#endif #endif
} else { } else {
PADDLE_THROW("Unsupported place!"); PADDLE_THROW("Unsupported place!");
......
...@@ -50,13 +50,13 @@ TEST(DataTypeTransform, CPUTransform) { ...@@ -50,13 +50,13 @@ TEST(DataTypeTransform, CPUTransform) {
TransDataType(kernel_fp32, kernel_fp64, in, &out); TransDataType(kernel_fp32, kernel_fp64, in, &out);
double* out_data_double = out.data<double>(); double* out_data_double = out.data<double>();
for (int i = 0; i < data_number; ++i) { for (int i = 0; i < data_number; ++i) {
ASSERT_EQ(out_data_double[i], static_cast<double>(i / 3)); EXPECT_EQ(out_data_double[i], static_cast<double>(i / 3));
} }
TransDataType(kernel_fp32, kernel_int32, in, &out); TransDataType(kernel_fp32, kernel_int32, in, &out);
int* out_data_int = out.data<int>(); int* out_data_int = out.data<int>();
for (int i = 0; i < data_number; ++i) { for (int i = 0; i < data_number; ++i) {
ASSERT_EQ(out_data_int[i], static_cast<int>(i / 3)); EXPECT_EQ(out_data_int[i], static_cast<int>(i / 3));
} }
} }
...@@ -76,31 +76,31 @@ TEST(DataTypeTransform, CPUTransform) { ...@@ -76,31 +76,31 @@ TEST(DataTypeTransform, CPUTransform) {
TransDataType(kernel_fp16, kernel_fp32, in, &out); TransDataType(kernel_fp16, kernel_fp32, in, &out);
float* out_data_float = out.data<float>(); float* out_data_float = out.data<float>();
for (int i = 0; i < data_number; ++i) { for (int i = 0; i < data_number; ++i) {
ASSERT_EQ(out_data_float[i], static_cast<float>(ptr[i])); EXPECT_EQ(out_data_float[i], static_cast<float>(ptr[i]));
} }
TransDataType(kernel_fp16, kernel_fp64, in, &out); TransDataType(kernel_fp16, kernel_fp64, in, &out);
double* out_data_double = out.data<double>(); double* out_data_double = out.data<double>();
for (int i = 0; i < data_number; ++i) { for (int i = 0; i < data_number; ++i) {
ASSERT_EQ(out_data_double[i], static_cast<double>(ptr[i])); EXPECT_EQ(out_data_double[i], static_cast<double>(ptr[i]));
} }
TransDataType(kernel_fp16, kernel_int32, in, &out); TransDataType(kernel_fp16, kernel_int32, in, &out);
int* out_data_int = out.data<int>(); int* out_data_int = out.data<int>();
for (int i = 0; i < data_number; ++i) { for (int i = 0; i < data_number; ++i) {
ASSERT_EQ(out_data_int[i], static_cast<int>(ptr[i])); EXPECT_EQ(out_data_int[i], static_cast<int>(ptr[i]));
} }
TransDataType(kernel_fp16, kernel_int64, in, &out); TransDataType(kernel_fp16, kernel_int64, in, &out);
int64_t* out_data_int64 = out.data<int64_t>(); int64_t* out_data_int64 = out.data<int64_t>();
for (int i = 0; i < data_number; ++i) { for (int i = 0; i < data_number; ++i) {
ASSERT_EQ(out_data_int64[i], static_cast<int64_t>(ptr[i])); EXPECT_EQ(out_data_int64[i], static_cast<int64_t>(ptr[i]));
} }
TransDataType(kernel_fp16, kernel_bool, in, &out); TransDataType(kernel_fp16, kernel_bool, in, &out);
bool* out_data_bool = out.data<bool>(); bool* out_data_bool = out.data<bool>();
for (int i = 0; i < data_number; ++i) { for (int i = 0; i < data_number; ++i) {
ASSERT_EQ(out_data_bool[i], static_cast<bool>(ptr[i])); EXPECT_EQ(out_data_bool[i], static_cast<bool>(ptr[i]));
} }
// transform float to float16 // transform float to float16
...@@ -112,7 +112,7 @@ TEST(DataTypeTransform, CPUTransform) { ...@@ -112,7 +112,7 @@ TEST(DataTypeTransform, CPUTransform) {
TransDataType(kernel_fp32, kernel_fp16, in, &out); TransDataType(kernel_fp32, kernel_fp16, in, &out);
ptr = out.data<float16>(); ptr = out.data<float16>();
for (int i = 0; i < data_number; ++i) { for (int i = 0; i < data_number; ++i) {
ASSERT_EQ(ptr[i].x, static_cast<float16>(in_data_float[i]).x); EXPECT_EQ(ptr[i].x, static_cast<float16>(in_data_float[i]).x);
} }
// transform double to float16 // transform double to float16
...@@ -124,7 +124,7 @@ TEST(DataTypeTransform, CPUTransform) { ...@@ -124,7 +124,7 @@ TEST(DataTypeTransform, CPUTransform) {
TransDataType(kernel_fp64, kernel_fp16, in, &out); TransDataType(kernel_fp64, kernel_fp16, in, &out);
ptr = out.data<float16>(); ptr = out.data<float16>();
for (int i = 0; i < data_number; ++i) { for (int i = 0; i < data_number; ++i) {
ASSERT_EQ(ptr[i].x, static_cast<float16>(in_data_double[i]).x); EXPECT_EQ(ptr[i].x, static_cast<float16>(in_data_double[i]).x);
} }
// transform int to float16 // transform int to float16
...@@ -136,7 +136,7 @@ TEST(DataTypeTransform, CPUTransform) { ...@@ -136,7 +136,7 @@ TEST(DataTypeTransform, CPUTransform) {
TransDataType(kernel_int32, kernel_fp16, in, &out); TransDataType(kernel_int32, kernel_fp16, in, &out);
ptr = out.data<float16>(); ptr = out.data<float16>();
for (int i = 0; i < data_number; ++i) { for (int i = 0; i < data_number; ++i) {
ASSERT_EQ(ptr[i].x, static_cast<float16>(in_data_int[i]).x); EXPECT_EQ(ptr[i].x, static_cast<float16>(in_data_int[i]).x);
} }
// transform int64 to float16 // transform int64 to float16
...@@ -148,7 +148,7 @@ TEST(DataTypeTransform, CPUTransform) { ...@@ -148,7 +148,7 @@ TEST(DataTypeTransform, CPUTransform) {
TransDataType(kernel_int64, kernel_fp16, in, &out); TransDataType(kernel_int64, kernel_fp16, in, &out);
ptr = out.data<float16>(); ptr = out.data<float16>();
for (int i = 0; i < data_number; ++i) { for (int i = 0; i < data_number; ++i) {
ASSERT_EQ(ptr[i].x, static_cast<float16>(in_data_int64[i]).x); EXPECT_EQ(ptr[i].x, static_cast<float16>(in_data_int64[i]).x);
} }
// transform bool to float16 // transform bool to float16
...@@ -160,7 +160,7 @@ TEST(DataTypeTransform, CPUTransform) { ...@@ -160,7 +160,7 @@ TEST(DataTypeTransform, CPUTransform) {
TransDataType(kernel_bool, kernel_fp16, in, &out); TransDataType(kernel_bool, kernel_fp16, in, &out);
ptr = out.data<float16>(); ptr = out.data<float16>();
for (int i = 0; i < data_number; ++i) { for (int i = 0; i < data_number; ++i) {
ASSERT_EQ(ptr[i].x, static_cast<float16>(in_data_bool[i]).x); EXPECT_EQ(ptr[i].x, static_cast<float16>(in_data_bool[i]).x);
} }
} }
} }
...@@ -49,15 +49,16 @@ TEST(DataTypeTransform, GPUTransform) { ...@@ -49,15 +49,16 @@ TEST(DataTypeTransform, GPUTransform) {
float arr[6] = {0, 1, 2, 3, 4, 5}; float arr[6] = {0, 1, 2, 3, 4, 5};
int data_number = sizeof(arr) / sizeof(arr[0]); int data_number = sizeof(arr) / sizeof(arr[0]);
memcpy(in_ptr, arr, sizeof(arr)); memcpy(in_ptr, arr, sizeof(arr));
TensorCopy(in, gpu_place, context, &in_gpu);
TensorCopy(in, gpu_place, context, &in_gpu);
context.Wait();
TransDataType(kernel_fp32, kernel_fp64, in_gpu, &out_gpu); TransDataType(kernel_fp32, kernel_fp64, in_gpu, &out_gpu);
TensorCopy(out_gpu, cpu_place, context, &out); TensorCopy(out_gpu, cpu_place, context, &out);
context.Wait(); context.Wait();
double* out_data_double = out.data<double>(); double* out_data_double = out.data<double>();
for (int i = 0; i < data_number; ++i) { for (int i = 0; i < data_number; ++i) {
ASSERT_EQ(out_data_double[i], static_cast<double>(arr[i])); EXPECT_EQ(out_data_double[i], static_cast<double>(arr[i]));
} }
TransDataType(kernel_fp32, kernel_int32, in_gpu, &out_gpu); TransDataType(kernel_fp32, kernel_int32, in_gpu, &out_gpu);
...@@ -66,7 +67,7 @@ TEST(DataTypeTransform, GPUTransform) { ...@@ -66,7 +67,7 @@ TEST(DataTypeTransform, GPUTransform) {
int* out_data_int = out.data<int>(); int* out_data_int = out.data<int>();
for (int i = 0; i < data_number; ++i) { for (int i = 0; i < data_number; ++i) {
ASSERT_EQ(out_data_int[i], static_cast<int>(arr[i])); EXPECT_EQ(out_data_int[i], static_cast<int>(arr[i]));
} }
} }
...@@ -83,6 +84,7 @@ TEST(DataTypeTransform, GPUTransform) { ...@@ -83,6 +84,7 @@ TEST(DataTypeTransform, GPUTransform) {
int data_number = sizeof(arr) / sizeof(arr[0]); int data_number = sizeof(arr) / sizeof(arr[0]);
memcpy(ptr, arr, sizeof(arr)); memcpy(ptr, arr, sizeof(arr));
TensorCopy(in, gpu_place, context, &in_gpu); TensorCopy(in, gpu_place, context, &in_gpu);
context.Wait();
// transform from float16 to other data types // transform from float16 to other data types
TransDataType(kernel_fp16, kernel_fp32, in_gpu, &out_gpu); TransDataType(kernel_fp16, kernel_fp32, in_gpu, &out_gpu);
...@@ -91,7 +93,7 @@ TEST(DataTypeTransform, GPUTransform) { ...@@ -91,7 +93,7 @@ TEST(DataTypeTransform, GPUTransform) {
float* out_data_float = out.data<float>(); float* out_data_float = out.data<float>();
for (int i = 0; i < data_number; ++i) { for (int i = 0; i < data_number; ++i) {
ASSERT_EQ(out_data_float[i], static_cast<float>(ptr[i])); EXPECT_EQ(out_data_float[i], static_cast<float>(ptr[i]));
} }
TransDataType(kernel_fp16, kernel_fp64, in_gpu, &out_gpu); TransDataType(kernel_fp16, kernel_fp64, in_gpu, &out_gpu);
...@@ -100,7 +102,7 @@ TEST(DataTypeTransform, GPUTransform) { ...@@ -100,7 +102,7 @@ TEST(DataTypeTransform, GPUTransform) {
double* out_data_double = out.data<double>(); double* out_data_double = out.data<double>();
for (int i = 0; i < data_number; ++i) { for (int i = 0; i < data_number; ++i) {
ASSERT_EQ(out_data_double[i], static_cast<double>(ptr[i])); EXPECT_EQ(out_data_double[i], static_cast<double>(ptr[i]));
} }
TransDataType(kernel_fp16, kernel_int32, in_gpu, &out_gpu); TransDataType(kernel_fp16, kernel_int32, in_gpu, &out_gpu);
...@@ -109,7 +111,7 @@ TEST(DataTypeTransform, GPUTransform) { ...@@ -109,7 +111,7 @@ TEST(DataTypeTransform, GPUTransform) {
int* out_data_int = out.data<int>(); int* out_data_int = out.data<int>();
for (int i = 0; i < data_number; ++i) { for (int i = 0; i < data_number; ++i) {
ASSERT_EQ(out_data_int[i], static_cast<int>(ptr[i])); EXPECT_EQ(out_data_int[i], static_cast<int>(ptr[i]));
} }
TransDataType(kernel_fp16, kernel_int64, in_gpu, &out_gpu); TransDataType(kernel_fp16, kernel_int64, in_gpu, &out_gpu);
...@@ -118,7 +120,7 @@ TEST(DataTypeTransform, GPUTransform) { ...@@ -118,7 +120,7 @@ TEST(DataTypeTransform, GPUTransform) {
int64_t* out_data_int64 = out.data<int64_t>(); int64_t* out_data_int64 = out.data<int64_t>();
for (int i = 0; i < data_number; ++i) { for (int i = 0; i < data_number; ++i) {
ASSERT_EQ(out_data_int64[i], static_cast<int64_t>(ptr[i])); EXPECT_EQ(out_data_int64[i], static_cast<int64_t>(ptr[i]));
} }
TransDataType(kernel_fp16, kernel_bool, in_gpu, &out_gpu); TransDataType(kernel_fp16, kernel_bool, in_gpu, &out_gpu);
...@@ -127,7 +129,7 @@ TEST(DataTypeTransform, GPUTransform) { ...@@ -127,7 +129,7 @@ TEST(DataTypeTransform, GPUTransform) {
bool* out_data_bool = out.data<bool>(); bool* out_data_bool = out.data<bool>();
for (int i = 0; i < data_number; ++i) { for (int i = 0; i < data_number; ++i) {
ASSERT_EQ(out_data_bool[i], static_cast<bool>(ptr[i])); EXPECT_EQ(out_data_bool[i], static_cast<bool>(ptr[i]));
} }
// transform float to float16 // transform float to float16
...@@ -137,13 +139,14 @@ TEST(DataTypeTransform, GPUTransform) { ...@@ -137,13 +139,14 @@ TEST(DataTypeTransform, GPUTransform) {
} }
TensorCopy(in, gpu_place, context, &in_gpu); TensorCopy(in, gpu_place, context, &in_gpu);
context.Wait();
TransDataType(kernel_fp32, kernel_fp16, in_gpu, &out_gpu); TransDataType(kernel_fp32, kernel_fp16, in_gpu, &out_gpu);
TensorCopy(out_gpu, cpu_place, context, &out); TensorCopy(out_gpu, cpu_place, context, &out);
context.Wait(); context.Wait();
ptr = out.data<float16>(); ptr = out.data<float16>();
for (int i = 0; i < data_number; ++i) { for (int i = 0; i < data_number; ++i) {
ASSERT_EQ(ptr[i].x, static_cast<float16>(in_data_float[i]).x); EXPECT_EQ(ptr[i].x, static_cast<float16>(in_data_float[i]).x);
} }
// transform double to float16 // transform double to float16
...@@ -154,13 +157,14 @@ TEST(DataTypeTransform, GPUTransform) { ...@@ -154,13 +157,14 @@ TEST(DataTypeTransform, GPUTransform) {
} }
TensorCopy(in, gpu_place, context, &in_gpu); TensorCopy(in, gpu_place, context, &in_gpu);
context.Wait();
TransDataType(kernel_fp64, kernel_fp16, in_gpu, &out_gpu); TransDataType(kernel_fp64, kernel_fp16, in_gpu, &out_gpu);
TensorCopy(out_gpu, cpu_place, context, &out); TensorCopy(out_gpu, cpu_place, context, &out);
context.Wait(); context.Wait();
ptr = out.data<float16>(); ptr = out.data<float16>();
for (int i = 0; i < data_number; ++i) { for (int i = 0; i < data_number; ++i) {
ASSERT_EQ(ptr[i].x, static_cast<float16>(in_data_double[i]).x); EXPECT_EQ(ptr[i].x, static_cast<float16>(in_data_double[i]).x);
} }
// transform int to float16 // transform int to float16
...@@ -170,13 +174,14 @@ TEST(DataTypeTransform, GPUTransform) { ...@@ -170,13 +174,14 @@ TEST(DataTypeTransform, GPUTransform) {
} }
TensorCopy(in, gpu_place, context, &in_gpu); TensorCopy(in, gpu_place, context, &in_gpu);
context.Wait();
TransDataType(kernel_int32, kernel_fp16, in_gpu, &out_gpu); TransDataType(kernel_int32, kernel_fp16, in_gpu, &out_gpu);
TensorCopy(out_gpu, cpu_place, context, &out); TensorCopy(out_gpu, cpu_place, context, &out);
context.Wait(); context.Wait();
ptr = out.data<float16>(); ptr = out.data<float16>();
for (int i = 0; i < data_number; ++i) { for (int i = 0; i < data_number; ++i) {
ASSERT_EQ(ptr[i].x, static_cast<float16>(in_data_int[i]).x); EXPECT_EQ(ptr[i].x, static_cast<float16>(in_data_int[i]).x);
} }
// transform int64 to float16 // transform int64 to float16
...@@ -187,13 +192,14 @@ TEST(DataTypeTransform, GPUTransform) { ...@@ -187,13 +192,14 @@ TEST(DataTypeTransform, GPUTransform) {
} }
TensorCopy(in, gpu_place, context, &in_gpu); TensorCopy(in, gpu_place, context, &in_gpu);
context.Wait();
TransDataType(kernel_int64, kernel_fp16, in_gpu, &out_gpu); TransDataType(kernel_int64, kernel_fp16, in_gpu, &out_gpu);
TensorCopy(out_gpu, cpu_place, context, &out); TensorCopy(out_gpu, cpu_place, context, &out);
context.Wait(); context.Wait();
ptr = out.data<float16>(); ptr = out.data<float16>();
for (int i = 0; i < data_number; ++i) { for (int i = 0; i < data_number; ++i) {
ASSERT_EQ(ptr[i].x, static_cast<float16>(in_data_int64[i]).x); EXPECT_EQ(ptr[i].x, static_cast<float16>(in_data_int64[i]).x);
} }
// transform bool to float16 // transform bool to float16
...@@ -203,13 +209,14 @@ TEST(DataTypeTransform, GPUTransform) { ...@@ -203,13 +209,14 @@ TEST(DataTypeTransform, GPUTransform) {
} }
TensorCopy(in, gpu_place, context, &in_gpu); TensorCopy(in, gpu_place, context, &in_gpu);
context.Wait();
TransDataType(kernel_bool, kernel_fp16, in_gpu, &out_gpu); TransDataType(kernel_bool, kernel_fp16, in_gpu, &out_gpu);
TensorCopy(out_gpu, cpu_place, context, &out); TensorCopy(out_gpu, cpu_place, context, &out);
context.Wait(); context.Wait();
ptr = out.data<float16>(); ptr = out.data<float16>();
for (int i = 0; i < data_number; ++i) { for (int i = 0; i < data_number; ++i) {
ASSERT_EQ(ptr[i].x, static_cast<float16>(in_data_bool[i]).x); EXPECT_EQ(ptr[i].x, static_cast<float16>(in_data_bool[i]).x);
} }
} }
} }
...@@ -125,8 +125,9 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id, ...@@ -125,8 +125,9 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
for (auto& op_desc : block.AllOps()) { for (auto& op_desc : block.AllOps()) {
auto op = paddle::framework::OpRegistry::CreateOp(*op_desc); auto op = paddle::framework::OpRegistry::CreateOp(*op_desc);
VLOG(3) << place_ << " " << op->DebugStringEx(local_scope); VLOG(4) << place_ << " " << op->DebugStringEx(local_scope);
op->Run(*local_scope, place_); op->Run(*local_scope, place_);
VLOG(3) << place_ << " " << op->DebugStringEx(local_scope);
if (FLAGS_benchmark) { if (FLAGS_benchmark) {
VLOG(2) << "Memory used after operator " + op->Type() + " running: " VLOG(2) << "Memory used after operator " + op->Type() + " running: "
......
...@@ -26,7 +26,6 @@ class ReaderBase { ...@@ -26,7 +26,6 @@ class ReaderBase {
PADDLE_ENFORCE(!shapes_.empty()); PADDLE_ENFORCE(!shapes_.empty());
} }
virtual void ReadNext(std::vector<LoDTensor>* out) = 0; virtual void ReadNext(std::vector<LoDTensor>* out) = 0;
virtual bool HasNext() const = 0;
virtual void ReInit() = 0; virtual void ReInit() = 0;
...@@ -52,8 +51,6 @@ class DecoratedReader : public ReaderBase { ...@@ -52,8 +51,6 @@ class DecoratedReader : public ReaderBase {
PADDLE_ENFORCE_NOT_NULL(reader_); PADDLE_ENFORCE_NOT_NULL(reader_);
} }
bool HasNext() const override { return reader_->HasNext(); }
void ReInit() override { reader_->ReInit(); } void ReInit() override { reader_->ReInit(); }
protected: protected:
...@@ -69,7 +66,6 @@ class ReaderHolder { ...@@ -69,7 +66,6 @@ class ReaderHolder {
ReaderBase* Get() const { return reader_.get(); } ReaderBase* Get() const { return reader_.get(); }
void ReadNext(std::vector<LoDTensor>* out) { reader_->ReadNext(out); } void ReadNext(std::vector<LoDTensor>* out) { reader_->ReadNext(out); }
bool HasNext() const { return reader_->HasNext(); }
void ReInit() { reader_->ReInit(); } void ReInit() { reader_->ReInit(); }
DDim shape(size_t idx) const { return reader_->shape(idx); } DDim shape(size_t idx) const { return reader_->shape(idx); }
......
...@@ -187,7 +187,6 @@ bool TensorContainsInf(const framework::Tensor& tensor) { ...@@ -187,7 +187,6 @@ bool TensorContainsInf(const framework::Tensor& tensor) {
void TensorToStream(std::ostream& os, const Tensor& tensor, void TensorToStream(std::ostream& os, const Tensor& tensor,
const platform::DeviceContext& dev_ctx) { const platform::DeviceContext& dev_ctx) {
// TODO(typhoonzero): serialize to ostream
{ // the 1st field, uint32_t version { // the 1st field, uint32_t version
constexpr uint32_t version = 0; constexpr uint32_t version = 0;
os.write(reinterpret_cast<const char*>(&version), sizeof(version)); os.write(reinterpret_cast<const char*>(&version), sizeof(version));
......
...@@ -67,10 +67,10 @@ class ThreadPool { ...@@ -67,10 +67,10 @@ class ThreadPool {
} catch (platform::EnforceNotMet ex) { } catch (platform::EnforceNotMet ex) {
return std::unique_ptr<platform::EnforceNotMet>( return std::unique_ptr<platform::EnforceNotMet>(
new platform::EnforceNotMet(ex)); new platform::EnforceNotMet(ex));
} catch (...) { } catch (const std::exception& e) {
LOG(FATAL) LOG(FATAL) << "Unexpected exception is catched in thread pool. All "
<< "Unexpected exception is catched in thread pool. All " "throwable exception in Fluid should be an EnforceNotMet."
"throwable exception in Fluid should be an EnforceNotMet."; << e.what();
} }
return nullptr; return nullptr;
}); });
......
...@@ -5,7 +5,8 @@ cc_library(paddle_fluid_api ...@@ -5,7 +5,8 @@ cc_library(paddle_fluid_api
DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB}) DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB})
# Create static library # Create static library
cc_library(paddle_fluid DEPS paddle_fluid_api ${FLUID_CORE_MODULES} ${GLOB_OP_LIB}) get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES)
cc_library(paddle_fluid DEPS ${fluid_modules})
# Create shared library # Create shared library
cc_library(paddle_fluid_shared SHARED cc_library(paddle_fluid_shared SHARED
......
...@@ -22,14 +22,14 @@ namespace paddle { ...@@ -22,14 +22,14 @@ namespace paddle {
namespace inference { namespace inference {
void ReadBinaryFile(const std::string& filename, std::string& contents) { void ReadBinaryFile(const std::string& filename, std::string& contents) {
VLOG(3) << "loading model from " << filename; std::ifstream fin(filename, std::ios::in | std::ios::binary);
std::ifstream inputfs(filename, std::ios::in | std::ios::binary); PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot open file %s", filename);
inputfs.seekg(0, std::ios::end); fin.seekg(0, std::ios::end);
contents.clear(); contents.clear();
contents.resize(inputfs.tellg()); contents.resize(fin.tellg());
inputfs.seekg(0, std::ios::beg); fin.seekg(0, std::ios::beg);
inputfs.read(&contents[0], contents.size()); fin.read(&contents[0], contents.size());
inputfs.close(); fin.close();
} }
bool IsPersistable(const framework::VarDesc* var) { bool IsPersistable(const framework::VarDesc* var) {
...@@ -97,6 +97,7 @@ std::unique_ptr<framework::ProgramDesc> Load(framework::Executor& executor, ...@@ -97,6 +97,7 @@ std::unique_ptr<framework::ProgramDesc> Load(framework::Executor& executor,
const std::string& dirname) { const std::string& dirname) {
std::string model_filename = dirname + "/__model__"; std::string model_filename = dirname + "/__model__";
std::string program_desc_str; std::string program_desc_str;
VLOG(3) << "loading model from " << model_filename;
ReadBinaryFile(model_filename, program_desc_str); ReadBinaryFile(model_filename, program_desc_str);
std::unique_ptr<framework::ProgramDesc> main_program( std::unique_ptr<framework::ProgramDesc> main_program(
......
...@@ -17,10 +17,13 @@ limitations under the License. */ ...@@ -17,10 +17,13 @@ limitations under the License. */
#include "paddle/fluid/inference/tests/test_helper.h" #include "paddle/fluid/inference/tests/test_helper.h"
DEFINE_string(dirname, "", "Directory of the inference model."); DEFINE_string(dirname, "", "Directory of the inference model.");
DEFINE_int32(batch_size, 1, "Batch size of input data");
DEFINE_int32(repeat, 1, "Running the inference program repeat times");
TEST(inference, image_classification) { TEST(inference, image_classification) {
if (FLAGS_dirname.empty()) { if (FLAGS_dirname.empty() || FLAGS_batch_size < 1 || FLAGS_repeat < 1) {
LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model"; LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model "
"--batch_size=1 --repeat=1";
} }
LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl; LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl;
...@@ -29,13 +32,11 @@ TEST(inference, image_classification) { ...@@ -29,13 +32,11 @@ TEST(inference, image_classification) {
// 0. Call `paddle::framework::InitDevices()` initialize all the devices // 0. Call `paddle::framework::InitDevices()` initialize all the devices
// In unittests, this is done in paddle/testing/paddle_gtest_main.cc // In unittests, this is done in paddle/testing/paddle_gtest_main.cc
int64_t batch_size = 1;
paddle::framework::LoDTensor input; paddle::framework::LoDTensor input;
// Use normilized image pixels as input data, // Use normilized image pixels as input data,
// which should be in the range [0.0, 1.0]. // which should be in the range [0.0, 1.0].
SetupTensor<float>(input, SetupTensor<float>(input,
{batch_size, 3, 32, 32}, {FLAGS_batch_size, 3, 32, 32},
static_cast<float>(0), static_cast<float>(0),
static_cast<float>(1)); static_cast<float>(1));
std::vector<paddle::framework::LoDTensor*> cpu_feeds; std::vector<paddle::framework::LoDTensor*> cpu_feeds;
...@@ -46,7 +47,9 @@ TEST(inference, image_classification) { ...@@ -46,7 +47,9 @@ TEST(inference, image_classification) {
cpu_fetchs1.push_back(&output1); cpu_fetchs1.push_back(&output1);
// Run inference on CPU // Run inference on CPU
TestInference<paddle::platform::CPUPlace>(dirname, cpu_feeds, cpu_fetchs1); LOG(INFO) << "--- CPU Runs: ---";
TestInference<paddle::platform::CPUPlace>(
dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat);
LOG(INFO) << output1.dims(); LOG(INFO) << output1.dims();
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
...@@ -55,7 +58,9 @@ TEST(inference, image_classification) { ...@@ -55,7 +58,9 @@ TEST(inference, image_classification) {
cpu_fetchs2.push_back(&output2); cpu_fetchs2.push_back(&output2);
// Run inference on CUDA GPU // Run inference on CUDA GPU
TestInference<paddle::platform::CUDAPlace>(dirname, cpu_feeds, cpu_fetchs2); LOG(INFO) << "--- GPU Runs: ---";
TestInference<paddle::platform::CUDAPlace>(
dirname, cpu_feeds, cpu_fetchs2, FLAGS_repeat);
LOG(INFO) << output2.dims(); LOG(INFO) << output2.dims();
CheckError<float>(output1, output2); CheckError<float>(output1, output2);
......
...@@ -17,10 +17,13 @@ limitations under the License. */ ...@@ -17,10 +17,13 @@ limitations under the License. */
#include "paddle/fluid/inference/tests/test_helper.h" #include "paddle/fluid/inference/tests/test_helper.h"
DEFINE_string(dirname, "", "Directory of the inference model."); DEFINE_string(dirname, "", "Directory of the inference model.");
DEFINE_int32(batch_size, 1, "Batch size of input data");
DEFINE_int32(repeat, 1, "Running the inference program repeat times");
TEST(inference, recognize_digits) { TEST(inference, recognize_digits) {
if (FLAGS_dirname.empty()) { if (FLAGS_dirname.empty() || FLAGS_batch_size < 1 || FLAGS_repeat < 1) {
LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model"; LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model "
"--batch_size=1 --repeat=1";
} }
LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl; LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl;
...@@ -29,77 +32,39 @@ TEST(inference, recognize_digits) { ...@@ -29,77 +32,39 @@ TEST(inference, recognize_digits) {
// 0. Call `paddle::framework::InitDevices()` initialize all the devices // 0. Call `paddle::framework::InitDevices()` initialize all the devices
// In unittests, this is done in paddle/testing/paddle_gtest_main.cc // In unittests, this is done in paddle/testing/paddle_gtest_main.cc
int64_t batch_size = 1;
paddle::framework::LoDTensor input; paddle::framework::LoDTensor input;
// Use normilized image pixels as input data, // Use normilized image pixels as input data,
// which should be in the range [-1.0, 1.0]. // which should be in the range [-1.0, 1.0].
SetupTensor<float>(input, SetupTensor<float>(input,
{batch_size, 1, 28, 28}, {FLAGS_batch_size, 1, 28, 28},
static_cast<float>(-1), static_cast<float>(-1),
static_cast<float>(1)); static_cast<float>(1));
std::vector<paddle::framework::LoDTensor*> cpu_feeds; std::vector<paddle::framework::LoDTensor*> cpu_feeds;
cpu_feeds.push_back(&input); cpu_feeds.push_back(&input);
paddle::framework::LoDTensor output1; for (auto is_combined : {false, true}) {
std::vector<paddle::framework::LoDTensor*> cpu_fetchs1; paddle::framework::LoDTensor output1;
cpu_fetchs1.push_back(&output1); std::vector<paddle::framework::LoDTensor*> cpu_fetchs1;
cpu_fetchs1.push_back(&output1);
// Run inference on CPU // Run inference on CPU
TestInference<paddle::platform::CPUPlace>(dirname, cpu_feeds, cpu_fetchs1); LOG(INFO) << "--- CPU Runs: is_combined=" << is_combined << " ---";
LOG(INFO) << output1.dims(); TestInference<paddle::platform::CPUPlace>(
dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat, is_combined);
LOG(INFO) << output1.dims();
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
paddle::framework::LoDTensor output2; paddle::framework::LoDTensor output2;
std::vector<paddle::framework::LoDTensor*> cpu_fetchs2; std::vector<paddle::framework::LoDTensor*> cpu_fetchs2;
cpu_fetchs2.push_back(&output2); cpu_fetchs2.push_back(&output2);
// Run inference on CUDA GPU // Run inference on CUDA GPU
TestInference<paddle::platform::CUDAPlace>(dirname, cpu_feeds, cpu_fetchs2); LOG(INFO) << "--- GPU Runs: is_combined=" << is_combined << " ---";
LOG(INFO) << output2.dims(); TestInference<paddle::platform::CUDAPlace>(
dirname, cpu_feeds, cpu_fetchs2, FLAGS_repeat, is_combined);
LOG(INFO) << output2.dims();
CheckError<float>(output1, output2); CheckError<float>(output1, output2);
#endif #endif
}
TEST(inference, recognize_digits_combine) {
if (FLAGS_dirname.empty()) {
LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model";
} }
LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl;
std::string dirname = FLAGS_dirname;
// 0. Call `paddle::framework::InitDevices()` initialize all the devices
// In unittests, this is done in paddle/testing/paddle_gtest_main.cc
paddle::framework::LoDTensor input;
// Use normilized image pixels as input data,
// which should be in the range [-1.0, 1.0].
SetupTensor<float>(
input, {1, 1, 28, 28}, static_cast<float>(-1), static_cast<float>(1));
std::vector<paddle::framework::LoDTensor*> cpu_feeds;
cpu_feeds.push_back(&input);
paddle::framework::LoDTensor output1;
std::vector<paddle::framework::LoDTensor*> cpu_fetchs1;
cpu_fetchs1.push_back(&output1);
// Run inference on CPU
TestInference<paddle::platform::CPUPlace, true>(
dirname, cpu_feeds, cpu_fetchs1);
LOG(INFO) << output1.dims();
#ifdef PADDLE_WITH_CUDA
paddle::framework::LoDTensor output2;
std::vector<paddle::framework::LoDTensor*> cpu_fetchs2;
cpu_fetchs2.push_back(&output2);
// Run inference on CUDA GPU
TestInference<paddle::platform::CUDAPlace, true>(
dirname, cpu_feeds, cpu_fetchs2);
LOG(INFO) << output2.dims();
CheckError<float>(output1, output2);
#endif
} }
...@@ -15,6 +15,7 @@ limitations under the License. */ ...@@ -15,6 +15,7 @@ limitations under the License. */
#include <time.h> #include <time.h>
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/inference/io.h" #include "paddle/fluid/inference/io.h"
#include "paddle/fluid/platform/profiler.h"
template <typename T> template <typename T>
void SetupTensor(paddle::framework::LoDTensor& input, void SetupTensor(paddle::framework::LoDTensor& input,
...@@ -87,31 +88,60 @@ void CheckError(paddle::framework::LoDTensor& output1, ...@@ -87,31 +88,60 @@ void CheckError(paddle::framework::LoDTensor& output1,
EXPECT_EQ(count, 0U) << "There are " << count << " different elements."; EXPECT_EQ(count, 0U) << "There are " << count << " different elements.";
} }
template <typename Place, bool IsCombined = false> template <typename Place>
void TestInference(const std::string& dirname, void TestInference(const std::string& dirname,
const std::vector<paddle::framework::LoDTensor*>& cpu_feeds, const std::vector<paddle::framework::LoDTensor*>& cpu_feeds,
std::vector<paddle::framework::LoDTensor*>& cpu_fetchs) { std::vector<paddle::framework::LoDTensor*>& cpu_fetchs,
const int repeat = 1,
const bool is_combined = false) {
// 1. Define place, executor, scope // 1. Define place, executor, scope
auto place = Place(); auto place = Place();
auto executor = paddle::framework::Executor(place); auto executor = paddle::framework::Executor(place);
auto* scope = new paddle::framework::Scope(); auto* scope = new paddle::framework::Scope();
// Profile the performance
paddle::platform::ProfilerState state;
if (paddle::platform::is_cpu_place(place)) {
state = paddle::platform::ProfilerState::kCPU;
} else {
#ifdef PADDLE_WITH_CUDA
state = paddle::platform::ProfilerState::kCUDA;
// The default device_id of paddle::platform::CUDAPlace is 0.
// Users can get the device_id using:
// int device_id = place.GetDeviceId();
paddle::platform::SetDeviceId(0);
#else
PADDLE_THROW("'CUDAPlace' is not supported in CPU only device.");
#endif
}
// Enable the profiler
paddle::platform::EnableProfiler(state);
// 2. Initialize the inference_program and load parameters // 2. Initialize the inference_program and load parameters
std::unique_ptr<paddle::framework::ProgramDesc> inference_program; std::unique_ptr<paddle::framework::ProgramDesc> inference_program;
if (IsCombined) { {
// All parameters are saved in a single file. paddle::platform::RecordEvent record_event(
// Hard-coding the file names of program and parameters in unittest. "init_program",
// The file names should be consistent with that used in Python API paddle::platform::DeviceContextPool::Instance().Get(place));
// `fluid.io.save_inference_model`.
std::string prog_filename = "__model_combined__"; if (is_combined) {
std::string param_filename = "__params_combined__"; // All parameters are saved in a single file.
inference_program = paddle::inference::Load(executor, // Hard-coding the file names of program and parameters in unittest.
*scope, // The file names should be consistent with that used in Python API
dirname + "/" + prog_filename, // `fluid.io.save_inference_model`.
dirname + "/" + param_filename); std::string prog_filename = "__model_combined__";
} else { std::string param_filename = "__params_combined__";
// Parameters are saved in separate files sited in the specified `dirname`. inference_program =
inference_program = paddle::inference::Load(executor, *scope, dirname); paddle::inference::Load(executor,
*scope,
dirname + "/" + prog_filename,
dirname + "/" + param_filename);
} else {
// Parameters are saved in separate files sited in the specified
// `dirname`.
inference_program = paddle::inference::Load(executor, *scope, dirname);
}
} }
// 3. Get the feed_target_names and fetch_target_names // 3. Get the feed_target_names and fetch_target_names
...@@ -134,7 +164,21 @@ void TestInference(const std::string& dirname, ...@@ -134,7 +164,21 @@ void TestInference(const std::string& dirname,
} }
// 6. Run the inference program // 6. Run the inference program
executor.Run(*inference_program, scope, feed_targets, fetch_targets); {
// Run repeat times to profile the performance
for (int i = 0; i < repeat; ++i) {
paddle::platform::RecordEvent record_event(
"run_inference",
paddle::platform::DeviceContextPool::Instance().Get(place));
executor.Run(*inference_program, scope, feed_targets, fetch_targets);
}
}
// Disable the profiler and print the timing information
paddle::platform::DisableProfiler(paddle::platform::EventSortingKey::kDefault,
"profiler.txt");
paddle::platform::ResetProfiler();
delete scope; delete scope;
} }
file(GLOB GENERAL_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*_op.cc") file(GLOB GENERAL_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*_op.cc")
string(REPLACE "_mkldnn" "" GENERAL_OPS "${GENERAL_OPS}")
string(REPLACE ".cc" "" GENERAL_OPS "${GENERAL_OPS}") string(REPLACE ".cc" "" GENERAL_OPS "${GENERAL_OPS}")
list(REMOVE_DUPLICATES GENERAL_OPS)
set(DEPS_OPS "") set(DEPS_OPS "")
set(pybind_file ${PADDLE_SOURCE_DIR}/paddle/fluid/pybind/pybind.h) set(pybind_file ${PADDLE_SOURCE_DIR}/paddle/fluid/pybind/pybind.h)
file(WRITE ${pybind_file} "// Generated by the paddle/operator/CMakeLists.txt. DO NOT EDIT!\n\n") file(WRITE ${pybind_file} "// Generated by the paddle/operator/CMakeLists.txt. DO NOT EDIT!\n\n")
...@@ -13,6 +15,8 @@ function(op_library TARGET) ...@@ -13,6 +15,8 @@ function(op_library TARGET)
set(cu_cc_srcs) set(cu_cc_srcs)
set(cudnn_cu_cc_srcs) set(cudnn_cu_cc_srcs)
set(CUDNN_FILE) set(CUDNN_FILE)
set(mkldnn_cc_srcs)
set(MKLDNN_FILE)
set(op_common_deps operator op_registry math_function) set(op_common_deps operator op_registry math_function)
set(options "") set(options "")
set(oneValueArgs "") set(oneValueArgs "")
...@@ -36,12 +40,20 @@ function(op_library TARGET) ...@@ -36,12 +40,20 @@ function(op_library TARGET)
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${CUDNN_FILE}.cu.cc) if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${CUDNN_FILE}.cu.cc)
list(APPEND cudnn_cu_cc_srcs ${CUDNN_FILE}.cu.cc) list(APPEND cudnn_cu_cc_srcs ${CUDNN_FILE}.cu.cc)
endif() endif()
if(WITH_MKLDNN)
string(REPLACE "_op" "_mkldnn_op" MKLDNN_FILE "${TARGET}")
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${MKLDNN_FILE}.cc)
list(APPEND mkldnn_cc_srcs ${MKLDNN_FILE}.cc)
endif()
endif()
else() else()
foreach(src ${op_library_SRCS}) foreach(src ${op_library_SRCS})
if (${src} MATCHES ".*\\.cu$") if (${src} MATCHES ".*\\.cu$")
list(APPEND cu_srcs ${src}) list(APPEND cu_srcs ${src})
elseif(${src} MATCHES ".*_cudnn_op.cu.cc$") elseif(${src} MATCHES ".*_cudnn_op.cu.cc$")
list(APPEND cudnn_cu_cc_srcs ${src}) list(APPEND cudnn_cu_cc_srcs ${src})
elseif(WITH_MKLDNN AND ${src} MATCHES ".*_mkldnn_op.cc$")
list(APPEND mkldnn_cc_srcs ${src})
elseif(${src} MATCHES ".*\\.cu.cc$") elseif(${src} MATCHES ".*\\.cu.cc$")
list(APPEND cu_cc_srcs ${src}) list(APPEND cu_cc_srcs ${src})
elseif(${src} MATCHES ".*\\.cc$") elseif(${src} MATCHES ".*\\.cc$")
...@@ -62,11 +74,11 @@ function(op_library TARGET) ...@@ -62,11 +74,11 @@ function(op_library TARGET)
set(DEPS_OPS ${TARGET} ${DEPS_OPS} PARENT_SCOPE) set(DEPS_OPS ${TARGET} ${DEPS_OPS} PARENT_SCOPE)
endif() endif()
if (WITH_GPU) if (WITH_GPU)
nv_library(${TARGET} SRCS ${cc_srcs} ${cu_cc_srcs} ${cudnn_cu_cc_srcs} ${cu_srcs} DEPS ${op_library_DEPS} nv_library(${TARGET} SRCS ${cc_srcs} ${cu_cc_srcs} ${cudnn_cu_cc_srcs} ${mkldnn_cc_srcs} ${cu_srcs} DEPS ${op_library_DEPS}
${op_common_deps}) ${op_common_deps})
else() else()
cc_library(${TARGET} SRCS ${cc_srcs} DEPS ${op_library_DEPS} cc_library(${TARGET} SRCS ${cc_srcs} ${mkldnn_cc_srcs} DEPS ${op_library_DEPS}
${op_common_deps}) ${op_common_deps})
endif() endif()
# Define operators that don't need pybind here. # Define operators that don't need pybind here.
...@@ -101,7 +113,8 @@ function(op_library TARGET) ...@@ -101,7 +113,8 @@ function(op_library TARGET)
# pybind USE_CPU_ONLY_OP # pybind USE_CPU_ONLY_OP
list(LENGTH cu_srcs cu_srcs_len) list(LENGTH cu_srcs cu_srcs_len)
list(LENGTH cu_cc_srcs cu_cc_srcs_len) list(LENGTH cu_cc_srcs cu_cc_srcs_len)
if (${pybind_flag} EQUAL 0 AND ${cu_srcs_len} EQUAL 0 AND ${cu_cc_srcs_len} EQUAL 0) list(LENGTH mkldnn_cc_srcs mkldnn_cc_srcs_len)
if (${pybind_flag} EQUAL 0 AND ${mkldnn_cc_srcs_len} EQUAL 0 AND ${cu_srcs_len} EQUAL 0 AND ${cu_cc_srcs_len} EQUAL 0)
file(APPEND ${pybind_file} "USE_CPU_ONLY_OP(${TARGET});\n") file(APPEND ${pybind_file} "USE_CPU_ONLY_OP(${TARGET});\n")
set(pybind_flag 1) set(pybind_flag 1)
endif() endif()
...@@ -112,6 +125,11 @@ function(op_library TARGET) ...@@ -112,6 +125,11 @@ function(op_library TARGET)
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, CUDNN);\n") file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, CUDNN);\n")
endif() endif()
# pybind USE_OP_DEVICE_KERNEL for MKLDNN
if (WITH_MKLDNN AND ${mkldnn_cc_srcs_len} GREATER 0)
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, MKLDNN);\n")
endif()
# pybind USE_OP # pybind USE_OP
if (${pybind_flag} EQUAL 0) if (${pybind_flag} EQUAL 0)
file(APPEND ${pybind_file} "USE_OP(${TARGET});\n") file(APPEND ${pybind_file} "USE_OP(${TARGET});\n")
...@@ -172,17 +190,18 @@ op_library(cos_sim_op DEPS cos_sim_functor) ...@@ -172,17 +190,18 @@ op_library(cos_sim_op DEPS cos_sim_functor)
op_library(parallel_do_op DEPS executor) op_library(parallel_do_op DEPS executor)
if (WITH_GPU) if (WITH_GPU)
op_library(conv_op DEPS vol2col depthwise_conv) op_library(conv_op DEPS vol2col depthwise_conv im2col)
else() else()
op_library(conv_op DEPS vol2col) op_library(conv_op DEPS vol2col im2col)
endif() endif()
op_library(conv_transpose_op DEPS vol2col) op_library(conv_transpose_op DEPS vol2col im2col)
# FIXME(typhoonzero): save/load depends lodtensor serialization functions # FIXME(typhoonzero): save/load depends lodtensor serialization functions
op_library(save_op DEPS lod_tensor) op_library(save_op DEPS lod_tensor)
op_library(load_op DEPS lod_tensor) op_library(load_op DEPS lod_tensor)
op_library(save_combine_op DEPS lod_tensor) op_library(save_combine_op DEPS lod_tensor)
op_library(load_combine_op DEPS lod_tensor) op_library(load_combine_op DEPS lod_tensor)
op_library(concat_op DEPS concat)
list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS}) list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS})
foreach(src ${GENERAL_OPS}) foreach(src ${GENERAL_OPS})
......
...@@ -100,7 +100,8 @@ class ConcatOpGrad : public framework::OperatorWithKernel { ...@@ -100,7 +100,8 @@ class ConcatOpGrad : public framework::OperatorWithKernel {
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OP_EX(concat, ops::ConcatOp, ops::ConcatOpMaker, concat_grad, REGISTER_OP_EX(concat, ops::ConcatOp, ops::ConcatOpMaker, concat_grad,
ops::ConcatOpGrad, false) ops::ConcatOpGrad, false)
REGISTER_OP_CPU_KERNEL(concat, REGISTER_OP_CPU_KERNEL(
ops::ConcatKernel<paddle::platform::CPUPlace, float>) concat, ops::ConcatKernel<paddle::platform::CPUDeviceContext, float>)
REGISTER_OP_CPU_KERNEL(concat_grad, REGISTER_OP_CPU_KERNEL(
ops::ConcatGradKernel<paddle::platform::CPUPlace, float>) concat_grad,
ops::ConcatGradKernel<paddle::platform::CPUDeviceContext, float>)
...@@ -17,6 +17,7 @@ limitations under the License. */ ...@@ -17,6 +17,7 @@ limitations under the License. */
#include <utility> #include <utility>
#include <vector> #include <vector>
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/concat.h"
#include "paddle/fluid/operators/strided_memcpy.h" #include "paddle/fluid/operators/strided_memcpy.h"
namespace paddle { namespace paddle {
...@@ -27,54 +28,30 @@ class ConcatKernel : public framework::OpKernel<T> { ...@@ -27,54 +28,30 @@ class ConcatKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
auto ins = ctx.MultiInput<framework::Tensor>("X"); auto ins = ctx.MultiInput<framework::Tensor>("X");
auto* out = ctx.Output<framework::Tensor>("Out"); framework::Tensor* out = ctx.Output<framework::Tensor>("Out");
int64_t axis = static_cast<int64_t>(ctx.Attr<int>("axis")); int64_t axis = static_cast<int64_t>(ctx.Attr<int>("axis"));
auto place = ctx.GetPlace(); auto place = ctx.GetPlace();
out->mutable_data<T>(place); out->mutable_data<T>(place);
auto out_stride = framework::stride_numel(out->dims()); // Sometimes direct copies will be faster, this maybe need deeply analysis.
if (axis == 0 && ins.size() < 10) {
size_t output_offset = 0; size_t output_offset = 0;
// If axis >=1, copy to out immediately need to call many times
// of cuda memcpy. Copy the input to cpu and do the stride copy,
// then copy to gpu output.
if (platform::is_gpu_place(place) && axis >= 1) {
platform::CPUPlace copy_place;
auto& cpu_ctx = *platform::DeviceContextPool::Instance().Get(copy_place);
framework::Tensor cpu_out;
cpu_out.Resize(out->dims());
cpu_out.mutable_data<T>(copy_place);
auto& dev_ctx = ctx.device_context();
std::vector<std::unique_ptr<framework::Tensor>> cpu_ins;
for (auto* in : ins) {
std::unique_ptr<framework::Tensor> cpu_in(new framework::Tensor);
framework::TensorCopy(*in, copy_place, dev_ctx, cpu_in.get());
cpu_ins.emplace_back(std::move(cpu_in));
}
// TODO(dzhwinter): overlap copy and compute stream
// https://devblogs.nvidia.com/how-overlap-data-transfers-cuda-cc/
dev_ctx.Wait();
for (auto& in : cpu_ins) {
auto& cpu_in = *in.get();
auto in_stride = framework::stride_numel(cpu_in.dims());
StridedNumelCopyWithAxis<T>(
cpu_ctx, axis, cpu_out.data<T>() + output_offset, out_stride,
cpu_in.data<T>(), in_stride, in_stride[axis]);
output_offset += in_stride[axis];
}
framework::TensorCopy(cpu_out, place, dev_ctx, out);
} else {
for (auto* in : ins) { for (auto* in : ins) {
auto in_stride = framework::stride_numel(in->dims()); auto in_stride = framework::stride_numel(in->dims());
auto out_stride = framework::stride_numel(out->dims());
StridedNumelCopyWithAxis<T>(ctx.device_context(), axis, StridedNumelCopyWithAxis<T>(ctx.device_context(), axis,
out->data<T>() + output_offset, out_stride, out->data<T>() + output_offset, out_stride,
in->data<T>(), in_stride, in_stride[axis]); in->data<T>(), in_stride, in_stride[axis]);
output_offset += in_stride[axis]; output_offset += in_stride[axis];
} }
} else {
std::vector<framework::Tensor> inputs(ins.size());
for (size_t j = 0; j < ins.size(); ++j) {
inputs[j] = *ins[j];
}
auto& dev_ctx = ctx.template device_context<DeviceContext>();
paddle::operators::math::ConcatFunctor<DeviceContext, T> concat_functor;
concat_functor(dev_ctx, inputs, static_cast<int>(axis), out);
} }
} }
}; };
...@@ -86,16 +63,31 @@ class ConcatGradKernel : public framework::OpKernel<T> { ...@@ -86,16 +63,31 @@ class ConcatGradKernel : public framework::OpKernel<T> {
auto* in = ctx.Input<framework::Tensor>(framework::GradVarName("Out")); auto* in = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
auto outs = ctx.MultiOutput<framework::Tensor>(framework::GradVarName("X")); auto outs = ctx.MultiOutput<framework::Tensor>(framework::GradVarName("X"));
int64_t axis = static_cast<int64_t>(ctx.Attr<int>("axis")); int64_t axis = static_cast<int64_t>(ctx.Attr<int>("axis"));
size_t input_offset = 0;
auto in_stride = framework::stride_numel(in->dims());
for (auto& out : outs) { // Sometimes direct copies will be faster, this maybe need deeply analysis.
out->mutable_data<T>(ctx.GetPlace()); if (axis == 0 && outs.size() < 10) {
auto out_stride = framework::stride_numel(out->dims()); size_t input_offset = 0;
StridedNumelCopyWithAxis<T>(ctx.device_context(), axis, out->data<T>(), auto in_stride = framework::stride_numel(in->dims());
out_stride, in->data<T>() + input_offset,
in_stride, out_stride[axis]); for (auto& out : outs) {
input_offset += out_stride[axis]; out->mutable_data<T>(ctx.GetPlace());
auto out_stride = framework::stride_numel(out->dims());
StridedNumelCopyWithAxis<T>(ctx.device_context(), axis, out->data<T>(),
out_stride, in->data<T>() + input_offset,
in_stride, out_stride[axis]);
input_offset += out_stride[axis];
}
} else {
std::vector<framework::Tensor> outputs(outs.size());
for (size_t j = 0; j < outs.size(); ++j) {
outs[j]->mutable_data<T>(ctx.GetPlace());
outputs[j] = *outs[j];
}
auto& dev_ctx = ctx.template device_context<DeviceContext>();
paddle::operators::math::ConcatGradFunctor<DeviceContext, T>
concat_grad_functor;
concat_grad_functor(dev_ctx, *in, static_cast<int>(axis), outputs);
} }
} }
}; };
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "mkldnn.hpp"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/operators/conv_op.h"
#include "paddle/fluid/platform/mkldnn_helper.h"
namespace paddle {
namespace operators {
using paddle::framework::Tensor;
using paddle::platform::MKLDNNDeviceContext;
using paddle::platform::MKLDNNMemDesc;
using mkldnn::memory; // Note: paddle has also "memory" namespace
using mkldnn::primitive;
using mkldnn::convolution_forward;
using mkldnn::convolution_backward_weights;
using mkldnn::convolution_backward_data;
using mkldnn::convolution_direct;
using mkldnn::prop_kind;
using mkldnn::padding_kind;
using mkldnn::stream;
namespace {
std::unique_ptr<mkldnn::convolution_forward::primitive_desc>
ConvFwdPrimitiveDesc(const memory::desc& src, const memory::desc& weights,
const memory::desc& dst, const std::vector<int>& strides,
const std::vector<int>& paddings,
const mkldnn::engine& engine);
convolution_backward_weights::primitive_desc ConvBwdWeightsPrimitiveDesc(
const memory::desc& src, const memory::desc& diff_weights,
const memory::desc& diff_dst, const std::vector<int>& strides,
const std::vector<int>& paddings,
const convolution_forward::primitive_desc& conv_pd,
const mkldnn::engine& engine);
convolution_backward_data::primitive_desc ConvBwdDataPrimitiveDesc(
const memory::desc& diff_src, const memory::desc& weights,
const memory::desc& diff_dst, const std::vector<int>& strides,
const std::vector<int>& paddings,
const convolution_forward::primitive_desc& conv_pd,
const mkldnn::engine& engine);
} // anonymous namespace
template <typename T>
class ConvOpMkldnnKernel : public paddle::framework::OpKernel<T> {
public:
void Compute(const paddle::framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
"It must use CPUPlace.");
auto& dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
const auto& mkldnn_engine = dev_ctx.GetEngine();
auto* input = ctx.Input<Tensor>("Input");
auto* filter = ctx.Input<Tensor>("Filter");
auto* output = ctx.Output<Tensor>("Output");
// Get an unique name from "argument" name of "Output" variable
// This name will be used as key when saving info into device context
const std::string key = ctx.op().Output("Output");
const std::string key_conv_pd = key + "@conv_pd";
std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
int groups = ctx.Attr<int>("groups");
// TODO(pzelazko-intel) add support for group convolution and dilation
PADDLE_ENFORCE(groups == 1, "group convolution is not implemented yet");
PADDLE_ENFORCE(
dilations.size() == 2 && dilations[0] == 1 && dilations[1] == 1,
"dilation in convolution is not implemented yet");
const T* input_data = input->data<T>();
const T* filter_data = filter->data<T>();
// allocate memory for output
T* output_data = output->mutable_data<T>(ctx.GetPlace());
PADDLE_ENFORCE(input->dims().size() == 4,
"Input must be with 4 dimensions, i.e. NCHW");
PADDLE_ENFORCE(filter->dims().size() == 4,
"Filter must be with 4 dimensions, i.e. OIHW");
std::vector<int> src_tz = paddle::framework::vectorize2int(input->dims());
std::vector<int> weights_tz =
paddle::framework::vectorize2int(filter->dims());
std::vector<int> dst_tz = paddle::framework::vectorize2int(output->dims());
// TODO(pzelazko-intel): support more formats
// memory descriptors for convolution src/weight/dst
auto conv_src_md =
MKLDNNMemDesc(src_tz, memory::data_type::f32, memory::format::nchw);
auto conv_weights_md =
MKLDNNMemDesc(weights_tz, memory::data_type::f32, memory::format::oihw);
auto conv_dst_md =
MKLDNNMemDesc(dst_tz, memory::data_type::f32, memory::format::nchw);
// create memory primitives
auto conv_src_memory =
memory({conv_src_md, mkldnn_engine}, (void*)input_data);
auto conv_weights_memory =
memory({conv_weights_md, mkldnn_engine}, (void*)filter_data);
auto conv_dst_memory = memory({conv_dst_md, mkldnn_engine}, output_data);
std::unique_ptr<convolution_forward::primitive_desc> conv_pd =
ConvFwdPrimitiveDesc(conv_src_md, conv_weights_md, conv_dst_md, strides,
paddings, mkldnn_engine);
// save p_conv_pd into dev_ctx to be referred in backward path
auto p_conv_pd = conv_pd.get();
std::shared_ptr<void> conv_pd_value = std::move(conv_pd);
dev_ctx.SetBlob(key_conv_pd, conv_pd_value);
// create convolution op primitive
auto conv_prim = convolution_forward(*p_conv_pd, conv_src_memory,
conv_weights_memory, conv_dst_memory);
// push op to stream and wait MKLDNN until it's executed
std::vector<primitive> pipeline{conv_prim};
stream(stream::kind::eager).submit(pipeline).wait();
}
};
template <typename T>
class ConvGradOpMkldnnKernel : public paddle::framework::OpKernel<T> {
public:
void Compute(const paddle::framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
"It must use CPUPlace.");
auto& dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
const auto& mkldnn_engine = dev_ctx.GetEngine();
const Tensor* input = ctx.Input<Tensor>("Input");
const Tensor* filter = ctx.Input<Tensor>("Filter");
const Tensor* output = ctx.Input<Tensor>("Output");
const Tensor* output_grad =
ctx.Input<Tensor>(framework::GradVarName("Output"));
Tensor* input_grad = ctx.Output<Tensor>(framework::GradVarName("Input"));
Tensor* filter_grad = ctx.Output<Tensor>(framework::GradVarName("Filter"));
if (!input_grad && !filter_grad) return;
// Get an unique name from "argument" name of "Output" variable
// This name will be used as key when saving info into device context
const std::string key = ctx.op().Input("Output");
const std::string key_conv_pd = key + "@conv_pd";
std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
const T* input_data = input->data<T>();
const T* filter_data = filter->data<T>();
const T* output_grad_data = output_grad->data<T>();
T* input_grad_data = nullptr;
T* filter_grad_data = nullptr;
// allocate memory for gradient of input/filter
if (input_grad) {
input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
}
if (filter_grad) {
filter_grad_data = filter_grad->mutable_data<T>(ctx.GetPlace());
}
std::vector<int> src_tz = paddle::framework::vectorize2int(input->dims());
std::vector<int> weights_tz =
paddle::framework::vectorize2int(filter->dims());
std::vector<int> dst_tz = paddle::framework::vectorize2int(output->dims());
// TODO(pzelazko-intel): support more formats
auto conv_src_md =
MKLDNNMemDesc(src_tz, memory::data_type::f32, memory::format::nchw);
auto conv_diff_src_md =
MKLDNNMemDesc(src_tz, memory::data_type::f32, memory::format::nchw);
auto conv_weights_md =
MKLDNNMemDesc(weights_tz, memory::data_type::f32, memory::format::oihw);
auto conv_diff_weights_md =
MKLDNNMemDesc(weights_tz, memory::data_type::f32, memory::format::oihw);
auto conv_diff_dst_md =
MKLDNNMemDesc(dst_tz, memory::data_type::f32, memory::format::nchw);
// create memory
auto conv_diff_dst_memory =
memory({conv_diff_weights_md, mkldnn_engine}, (void*)output_grad_data);
// Retrieve conv_pd from device context
std::shared_ptr<void> conv_pd;
convolution_forward::primitive_desc* p_conv_pd;
conv_pd = dev_ctx.GetBlob(key_conv_pd);
PADDLE_ENFORCE(conv_pd != nullptr,
"Fail to find conv_pd in device context");
p_conv_pd =
static_cast<convolution_forward::primitive_desc*>(conv_pd.get());
// create backward conv primitive for weights
if (filter_grad) {
// create primitive descriptor
convolution_backward_weights::primitive_desc conv_bwd_weights_pd =
ConvBwdWeightsPrimitiveDesc(conv_src_md, conv_diff_weights_md,
conv_diff_dst_md, strides, paddings,
*p_conv_pd, mkldnn_engine);
// create memory
auto conv_diff_weights_memory = memory(
{conv_diff_weights_md, mkldnn_engine}, (void*)filter_grad_data);
auto conv_src_memory =
memory({conv_src_md, mkldnn_engine}, (void*)input_data);
// create backward conv primitive for weights
auto conv_bwd_weights_prim = convolution_backward_weights(
conv_bwd_weights_pd, conv_src_memory, conv_diff_dst_memory,
conv_diff_weights_memory);
// push primitive and execute it
std::vector<primitive> pipeline{conv_bwd_weights_prim};
stream(stream::kind::eager).submit(pipeline).wait();
}
if (input_grad) {
// create primitive descriptor
convolution_backward_data::primitive_desc conv_bwd_data_pd =
ConvBwdDataPrimitiveDesc(conv_diff_src_md, conv_weights_md,
conv_diff_dst_md, strides, paddings,
*p_conv_pd, mkldnn_engine);
// create memory
auto conv_diff_src_memory =
memory({conv_diff_src_md, mkldnn_engine}, (void*)input_grad_data);
auto conv_weights_memory =
memory({conv_weights_md, mkldnn_engine}, (void*)filter_data);
// create backward conv primitive for data
auto conv_bwd_data_prim =
convolution_backward_data(conv_bwd_data_pd, conv_diff_dst_memory,
conv_weights_memory, conv_diff_src_memory);
// push primitive and execute it
std::vector<primitive> pipeline{conv_bwd_data_prim};
stream(stream::kind::eager).submit(pipeline).wait();
}
} // Compute()
};
namespace {
std::unique_ptr<convolution_forward::primitive_desc> ConvFwdPrimitiveDesc(
const memory::desc& src, const memory::desc& weights,
const memory::desc& dst, const std::vector<int>& strides,
const std::vector<int>& paddings, const mkldnn::engine& engine) {
mkldnn::memory::dims stride_dims = {strides[0], strides[1]};
mkldnn::memory::dims padding_dims = {paddings[0], paddings[1]};
auto conv_desc = mkldnn::convolution_forward::desc(
mkldnn::prop_kind::forward, mkldnn::convolution_direct, src, weights, dst,
stride_dims, padding_dims, padding_dims, mkldnn::padding_kind::zero);
auto p_conv_pd = new convolution_forward::primitive_desc(conv_desc, engine);
return std::unique_ptr<mkldnn::convolution_forward::primitive_desc>(
p_conv_pd);
}
convolution_backward_weights::primitive_desc ConvBwdWeightsPrimitiveDesc(
const memory::desc& src, const memory::desc& diff_weights,
const memory::desc& diff_dst, const std::vector<int>& strides,
const std::vector<int>& paddings,
const convolution_forward::primitive_desc& conv_pd,
const mkldnn::engine& engine) {
auto conv_bwd_weights_desc = convolution_backward_weights::desc(
convolution_direct, src, diff_weights, diff_dst, strides, paddings,
paddings, padding_kind::zero);
return convolution_backward_weights::primitive_desc(conv_bwd_weights_desc,
engine, conv_pd);
}
convolution_backward_data::primitive_desc ConvBwdDataPrimitiveDesc(
const memory::desc& diff_src, const memory::desc& weights,
const memory::desc& diff_dst, const std::vector<int>& strides,
const std::vector<int>& paddings,
const convolution_forward::primitive_desc& conv_pd,
const mkldnn::engine& engine) {
auto conv_bwd_data_desc = convolution_backward_data::desc(
convolution_direct, diff_src, weights, diff_dst, strides, paddings,
paddings, padding_kind::zero);
return convolution_backward_data::primitive_desc(conv_bwd_data_desc, engine,
conv_pd);
}
} // anonymous namespace
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_KERNEL(conv2d, MKLDNN, ::paddle::platform::CPUPlace,
ops::ConvOpMkldnnKernel<float>);
REGISTER_OP_KERNEL(conv2d_grad, MKLDNN, ::paddle::platform::CPUPlace,
ops::ConvGradOpMkldnnKernel<float>);
...@@ -13,6 +13,12 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,12 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/conv_op.h" #include "paddle/fluid/operators/conv_op.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/cudnn_helper.h"
#endif
#ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h"
#endif
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -64,22 +70,21 @@ void ConvOp::InferShape(framework::InferShapeContext* ctx) const { ...@@ -64,22 +70,21 @@ void ConvOp::InferShape(framework::InferShapeContext* ctx) const {
framework::OpKernelType ConvOp::GetExpectedKernelType( framework::OpKernelType ConvOp::GetExpectedKernelType(
const framework::ExecutionContext& ctx) const { const framework::ExecutionContext& ctx) const {
bool use_cudnn = ctx.Attr<bool>("use_cudnn"); framework::LibraryType library_{framework::LibraryType::kPlain};
use_cudnn &= platform::is_gpu_place(ctx.GetPlace());
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
if (platform::is_gpu_place(ctx.GetPlace())) { if (platform::CanCUDNNBeUsed(ctx)) {
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>(); library_ = framework::LibraryType::kCUDNN;
use_cudnn &= dev_ctx.cudnn_handle() != nullptr;
} }
#endif #endif
framework::LibraryType library_; #ifdef PADDLE_WITH_MKLDNN
if (use_cudnn) { if (library_ == framework::LibraryType::kPlain &&
library_ = framework::LibraryType::kCUDNN; platform::CanMKLDNNBeUsed(ctx)) {
} else { library_ = framework::LibraryType::kMKLDNN;
library_ = framework::LibraryType::kPlain;
} }
#endif
std::string data_format = ctx.Attr<std::string>("data_format"); std::string data_format = ctx.Attr<std::string>("data_format");
// TODO(pzelazko-intel): enable MKLDNN layout when it's ready
framework::DataLayout layout_ = framework::StringToDataLayout(data_format); framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
return framework::OpKernelType( return framework::OpKernelType(
framework::ToDataType(ctx.Input<Tensor>("Input")->type()), ctx.GetPlace(), framework::ToDataType(ctx.Input<Tensor>("Input")->type()), ctx.GetPlace(),
...@@ -131,6 +136,9 @@ Conv2DOpMaker::Conv2DOpMaker(OpProto* proto, OpAttrChecker* op_checker) ...@@ -131,6 +136,9 @@ Conv2DOpMaker::Conv2DOpMaker(OpProto* proto, OpAttrChecker* op_checker)
"use_cudnn", "use_cudnn",
"(bool, default false) Only used in cudnn kernel, need install cudnn") "(bool, default false) Only used in cudnn kernel, need install cudnn")
.SetDefault(false); .SetDefault(false);
AddAttr<bool>("use_mkldnn",
"(bool, default false) Only used in mkldnn kernel")
.SetDefault(false);
AddAttr<std::string>( AddAttr<std::string>(
"data_format", "data_format",
"(string, default NCHW) Only used in " "(string, default NCHW) Only used in "
...@@ -224,6 +232,9 @@ Conv3DOpMaker::Conv3DOpMaker(OpProto* proto, OpAttrChecker* op_checker) ...@@ -224,6 +232,9 @@ Conv3DOpMaker::Conv3DOpMaker(OpProto* proto, OpAttrChecker* op_checker)
"use_cudnn", "use_cudnn",
"(bool, default false) Only used in cudnn kernel, need install cudnn") "(bool, default false) Only used in cudnn kernel, need install cudnn")
.SetDefault(false); .SetDefault(false);
AddAttr<bool>("use_mkldnn",
"(bool, default false) Only used in mkldnn kernel")
.SetDefault(false);
AddAttr<std::string>( AddAttr<std::string>(
"data_format", "data_format",
"(string, default NCHW) Only used in " "(string, default NCHW) Only used in "
...@@ -284,23 +295,21 @@ void ConvOpGrad::InferShape(framework::InferShapeContext* ctx) const { ...@@ -284,23 +295,21 @@ void ConvOpGrad::InferShape(framework::InferShapeContext* ctx) const {
framework::OpKernelType ConvOpGrad::GetExpectedKernelType( framework::OpKernelType ConvOpGrad::GetExpectedKernelType(
const framework::ExecutionContext& ctx) const { const framework::ExecutionContext& ctx) const {
bool use_cudnn = ctx.Attr<bool>("use_cudnn"); framework::LibraryType library_{framework::LibraryType::kPlain};
use_cudnn &= platform::is_gpu_place(ctx.GetPlace());
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
if (platform::is_gpu_place(ctx.GetPlace())) { if (platform::CanCUDNNBeUsed(ctx)) {
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>(); library_ = framework::LibraryType::kCUDNN;
use_cudnn &= dev_ctx.cudnn_handle() != nullptr;
} }
#endif #endif
#ifdef PADDLE_WITH_MKLDNN
framework::LibraryType library_; if (library_ == framework::LibraryType::kPlain &&
if (use_cudnn) { platform::CanMKLDNNBeUsed(ctx)) {
library_ = framework::LibraryType::kCUDNN; library_ = framework::LibraryType::kMKLDNN;
} else {
library_ = framework::LibraryType::kPlain;
} }
#endif
std::string data_format = ctx.Attr<std::string>("data_format"); std::string data_format = ctx.Attr<std::string>("data_format");
// TODO(pzelazko-intel): enable MKLDNN layout when it's ready
framework::DataLayout layout_ = framework::StringToDataLayout(data_format); framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
return framework::OpKernelType( return framework::OpKernelType(
framework::ToDataType(ctx.Input<Tensor>("Input")->type()), ctx.GetPlace(), framework::ToDataType(ctx.Input<Tensor>("Input")->type()), ctx.GetPlace(),
......
if(WITH_DISTRIBUTE) if(WITH_DISTRIBUTE)
grpc_library(sendrecvop_grpc SRCS sendrecvop_utils.cc grpc_client.cc grpc_server.cc PROTO send_recv.proto DEPS lod_tensor selected_rows) grpc_library(sendrecvop_grpc SRCS bytebuffer_stream.cc sendrecvop_utils.cc grpc_client.cc grpc_server.cc PROTO send_recv.proto DEPS lod_tensor selected_rows)
set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
set_source_files_properties(test_serde.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
cc_test(serde_test SRCS test_serde.cc DEPS grpc++_unsecure grpc_unsecure gpr cares zlib protobuf sendrecvop_grpc)
endif() endif()
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
// NOTE: This file was originally created by tensorflow
// (https://github.com/tensorflow/tensorflow/) we borrow this
// file and did some modifications so that we can send gRPC
// requests without too much copying of the tensor data.
#include "bytebuffer_stream.h"
namespace paddle {
namespace operators {
namespace detail {
GrpcByteBufferSource::GrpcByteBufferSource() {}
bool GrpcByteBufferSource::Init(const grpc::ByteBuffer& src) {
cur_ = -1;
left_ = 0;
ptr_ = nullptr;
byte_count_ = 0;
bool ok = src.Dump(&slices_).ok();
if (!ok) {
slices_.clear();
}
return ok;
}
bool GrpcByteBufferSource::Next(const void** data, int* size) {
// Use loop instead of if in case buffer contained empty slices.
while (left_ == 0) {
// Advance to next slice.
cur_++;
if (cur_ >= slices_.size()) {
return false;
}
const ::grpc::Slice& s = slices_[cur_];
left_ = s.size();
ptr_ = reinterpret_cast<const char*>(s.begin());
}
*data = ptr_;
*size = left_;
byte_count_ += left_;
ptr_ += left_;
left_ = 0;
return true;
}
void GrpcByteBufferSource::BackUp(int count) {
ptr_ -= count;
left_ += count;
byte_count_ -= count;
}
bool GrpcByteBufferSource::Skip(int count) {
const void* data;
int size;
while (Next(&data, &size)) {
if (size >= count) {
BackUp(size - count);
return true;
}
// size < count;
count -= size;
}
// error or we have too large count;
return false;
}
google::protobuf::int64 GrpcByteBufferSource::ByteCount() const {
return byte_count_;
}
} // namespace detail
} // namespace operators
} // namespace paddle
\ No newline at end of file
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
// NOTE: This file was originally created by tensorflow
// (https://github.com/tensorflow/tensorflow/) we borrow this
// file and did some modifications so that we can send gRPC
// requests without too much copying of the tensor data.
#pragma once
#include <grpc++/grpc++.h>
#include "google/protobuf/io/coded_stream.h"
#include "google/protobuf/io/zero_copy_stream.h"
namespace paddle {
namespace operators {
namespace detail {
// A ZeroCopyInputStream that reads from a grpc::ByteBuffer.
class GrpcByteBufferSource
: public ::google::protobuf::io::ZeroCopyInputStream {
public:
GrpcByteBufferSource();
bool Init(const ::grpc::ByteBuffer& src); // Can be called multiple times.
bool Next(const void** data, int* size) override;
void BackUp(int count) override;
bool Skip(int count) override;
::google::protobuf::int64 ByteCount() const override;
private:
std::vector<::grpc::Slice> slices_;
size_t cur_; // Current slice index.
int left_; // Number of bytes in slices_[cur_] left to yield.
const char* ptr_; // Address of next byte in slices_[cur_] to yield.
::google::protobuf::int64 byte_count_;
};
} // namespace detail
} // namespace operators
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
// NOTE: This file was originally created by tensorflow
// (https://github.com/tensorflow/tensorflow/) we borrow this
// file and did some modifications so that we can send gRPC
// requests without too much copying of the tensor data.
#pragma once
#include <grpc++/grpc++.h>
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace operators {
namespace detail {
char* EncodeVarint32(char* dst, uint32_t v) {
// Operate on characters as unsigneds
unsigned char* ptr = reinterpret_cast<unsigned char*>(dst);
static const int B = 128;
if (v < (1 << 7)) {
*(ptr++) = v;
} else if (v < (1 << 14)) {
*(ptr++) = v | B;
*(ptr++) = v >> 7;
} else if (v < (1 << 21)) {
*(ptr++) = v | B;
*(ptr++) = (v >> 7) | B;
*(ptr++) = v >> 14;
} else if (v < (1 << 28)) {
*(ptr++) = v | B;
*(ptr++) = (v >> 7) | B;
*(ptr++) = (v >> 14) | B;
*(ptr++) = v >> 21;
} else {
*(ptr++) = v | B;
*(ptr++) = (v >> 7) | B;
*(ptr++) = (v >> 14) | B;
*(ptr++) = (v >> 21) | B;
*(ptr++) = v >> 28;
}
return reinterpret_cast<char*>(ptr);
}
char* EncodeVarint64(char* dst, uint64_t v) {
static const int B = 128;
unsigned char* ptr = reinterpret_cast<unsigned char*>(dst);
while (v >= B) {
*(ptr++) = (v & (B - 1)) | B;
v >>= 7;
}
*(ptr++) = static_cast<unsigned char>(v);
return reinterpret_cast<char*>(ptr);
}
int VarintLength(uint64_t v) {
int len = 1;
while (v >= 128) {
v >>= 7;
len++;
}
return len;
}
class ProtoEncodeHelper {
public:
ProtoEncodeHelper(char* buf, int max_size)
: base_(buf), p_(buf), limit_(base_ + max_size) {}
~ProtoEncodeHelper() {
// Make sure callers didn't do operations that went over max_size promised
PADDLE_ENFORCE_LE(p_, limit_);
}
const char* data() const { return base_; }
size_t size() const { return p_ - base_; }
void WriteUint64(int tag, uint64_t v) {
Encode32(combine(tag, WIRETYPE_VARINT));
Encode64(v);
}
void WriteBool(int tag, bool v) {
Encode32(combine(tag, WIRETYPE_VARINT));
EncodeBool(v);
}
void WriteString(int tag, const std::string& v) {
Encode32(combine(tag, WIRETYPE_LENGTH_DELIMITED));
Encode32(v.size());
EncodeBytes(v.data(), v.size());
}
void WriteVarlengthBeginning(int tag, uint32_t len) {
Encode32(combine(tag, WIRETYPE_LENGTH_DELIMITED));
Encode32(len);
}
void WriteRawBytes(const std::string& v) { EncodeBytes(v.data(), v.size()); }
private:
// Note: this module's behavior must match the protocol buffer wire encoding
// format.
enum {
WIRETYPE_VARINT = 0,
WIRETYPE_LENGTH_DELIMITED = 2,
};
static uint32_t combine(uint32_t tag, uint32_t type) {
return ((tag << 3) | type);
}
inline void Encode32(uint32_t v) {
if (v < 128) {
// Fast path for single-byte values. Many of the calls will use a
// constant value for v, so the comparison will get optimized away
// when Encode32 is inlined into the caller.
*p_ = v;
p_++;
} else {
p_ = EncodeVarint32(p_, v);
}
}
void Encode64(uint64_t v) { p_ = EncodeVarint64(p_, v); }
void EncodeBool(bool v) {
*p_ = (v ? 1 : 0); // Equal to varint32 encoding of 0 or 1
p_++;
}
void EncodeBytes(const char* bytes, int N) {
memcpy(p_, bytes, N);
p_ += N;
}
char* base_;
char* p_;
char* limit_; // Just for CHECKs
};
} // detail
} // operators
} // paddle
...@@ -33,10 +33,34 @@ enum VarType { ...@@ -33,10 +33,34 @@ enum VarType {
} }
message VariableMessage { message VariableMessage {
enum Type {
// Pod Types
BOOL = 0;
INT16 = 1;
INT32 = 2;
INT64 = 3;
FP16 = 4;
FP32 = 5;
FP64 = 6;
}
message LodData { repeated int64 lod_data = 1; }
string varname = 1; string varname = 1;
// TODO(Yancey1989): reference framework::proto::VarDesc::VarType // TODO(Yancey1989): reference framework::proto::VarDesc::VarType
VarType type = 2; VarType type = 2;
bytes serialized = 3; // bool persistable is not needed for sending.
// tensor info:
Type data_type = 3;
repeated int64 dims = 4;
// lod details:
int64 lod_level = 5;
repeated LodData lod = 6;
// tensor data
bytes serialized = 7;
// selected_rows data
bytes rows = 8;
} }
message VoidMessage {} message VoidMessage {}
...@@ -13,6 +13,11 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,11 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/detail/sendrecvop_utils.h" #include "paddle/fluid/operators/detail/sendrecvop_utils.h"
#include "google/protobuf/io/coded_stream.h"
#include "google/protobuf/io/zero_copy_stream.h"
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/operators/detail/bytebuffer_stream.h"
#include "paddle/fluid/operators/detail/proto_encoder_helper.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -63,6 +68,242 @@ void DeserializeFromMessage(const sendrecv::VariableMessage& msg, ...@@ -63,6 +68,242 @@ void DeserializeFromMessage(const sendrecv::VariableMessage& msg,
} }
} }
void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
const platform::DeviceContext& ctx,
::grpc::ByteBuffer* msg) {
using VarMsg = sendrecv::VariableMessage;
sendrecv::VariableMessage request;
std::string header;
request.AppendToString(&header);
// When using GPU, need to free the copied CPU buffer
// when the ByteBuffer destroies
// TODO(typhoonzero): add unref here, if we have dependent
// parallelism execution, need to know when to free the tensor.
DestroyCallback destroy_callback = [](void* backing) {};
void* buf = malloc(1024);
void* payload;
size_t payload_size;
ProtoEncodeHelper e((char*)buf, 1024);
e.WriteString(VarMsg::kVarnameFieldNumber, name);
if (var->IsType<framework::LoDTensor>()) {
e.WriteUint64(VarMsg::kTypeFieldNumber, 0);
} else if (var->IsType<framework::SelectedRows>()) {
e.WriteUint64(VarMsg::kTypeFieldNumber, 1);
}
switch (framework::ToVarType(var->Type())) {
case framework::proto::VarType_Type_LOD_TENSOR: {
auto tensor = var->Get<framework::LoDTensor>();
e.WriteUint64(VarMsg::kDataTypeFieldNumber,
framework::ToDataType(tensor.type()));
for (auto& dim : framework::vectorize(tensor.dims())) {
e.WriteUint64(VarMsg::kDimsFieldNumber, dim);
}
auto lod = tensor.lod(); // std::vector<Vector<size_t>>
if (lod.size() > 0) {
e.WriteUint64(VarMsg::kLodLevelFieldNumber, lod.size());
for (auto& each : lod) {
e.WriteVarlengthBeginning(VarMsg::kLodFieldNumber,
2 + // tag + varintlength of submessage
1 + // kLodDataFieldNumber
each.size());
// auto copied from GPU
for (auto& d : each) {
e.WriteUint64(VarMsg::LodData::kLodDataFieldNumber, d);
}
}
}
if (platform::is_gpu_place(ctx.GetPlace())) {
#ifdef PADDLE_WITH_CUDA
PADDLE_ENFORCE(platform::is_gpu_place(tensor.place()));
platform::CPUPlace cpu;
auto& gpu_dev_ctx =
static_cast<const platform::CUDADeviceContext&>(ctx);
auto copy_size = tensor.memory_size();
payload = memory::Alloc(cpu, copy_size);
memory::Copy(cpu, payload,
boost::get<platform::CUDAPlace>(tensor.place()),
reinterpret_cast<const void*>(tensor.data<void>()),
copy_size, gpu_dev_ctx.stream());
destroy_callback = [](void* backing) {
std::cout << "destroy payload" << std::endl;
platform::CPUPlace cpu;
memory::Free(cpu, backing);
};
#endif
} else {
payload = tensor.data<void>();
}
payload_size = tensor.memory_size();
std::string tmp(reinterpret_cast<char*>(payload), payload_size);
for (int i = 0; i < tmp.size(); ++i) {
printf("%02X ", tmp.data()[i]);
}
printf("\n");
e.WriteVarlengthBeginning(VarMsg::kSerializedFieldNumber, payload_size);
} break;
case framework::proto::VarType_Type_SELECTED_ROWS: {
// TODO(typhoonzero): selectedrows implement should not use unique_ptr
auto* slr = var->GetMutable<framework::SelectedRows>();
e.WriteUint64(VarMsg::kDataTypeFieldNumber,
framework::ToDataType(slr->value().type()));
for (auto& dim : framework::vectorize(slr->value().dims())) {
e.WriteUint64(VarMsg::kDimsFieldNumber, dim);
}
e.WriteUint64(VarMsg::kLodLevelFieldNumber, 0);
auto* tensor = slr->mutable_value();
if (platform::is_gpu_place(ctx.GetPlace())) {
#ifdef PADDLE_WITH_CUDA
platform::CPUPlace cpu;
auto& gpu_dev_ctx =
static_cast<const platform::CUDADeviceContext&>(ctx);
auto copy_size = tensor->memory_size();
payload = memory::Alloc(cpu, copy_size);
memory::Copy(cpu, payload,
boost::get<platform::CUDAPlace>(tensor->place()),
reinterpret_cast<const void*>(tensor->data<void>()),
copy_size, gpu_dev_ctx.stream());
ctx.Wait();
float* ttt = reinterpret_cast<float*>(payload);
for (int i = 0; i < copy_size / 4; i++) {
std::cout << "copied to cpu: " << ttt[i] << std::endl;
}
destroy_callback = [](void* backing) {
std::cout << "destroy..." << std::endl;
// platform::CPUPlace cpu;
// memory::Free(cpu, backing);
};
#endif
} else {
payload = slr->mutable_value()->data<void>();
}
payload_size = tensor->memory_size();
e.WriteVarlengthBeginning(VarMsg::kSerializedFieldNumber, payload_size);
} break;
default:
PADDLE_THROW("Serialize does not support type: %s",
typeid(var->Type()).name());
break;
}
// steal reference of tensor data
::grpc::Slice slices[4]; // metadata, tensor, rows meta, rows
int num_slices = 2; // only SelectedRows have rows buffer
slices[0] = ::grpc::Slice(e.size());
memcpy(const_cast<uint8_t*>(slices[0].begin()), e.data(), e.size());
slices[1] = ::grpc::Slice(
grpc_slice_new_with_user_data(payload, payload_size, destroy_callback,
static_cast<char*>(payload)),
::grpc::Slice::STEAL_REF);
if (framework::ToVarType(var->Type()) ==
framework::proto::VarType_Type_SELECTED_ROWS) {
auto* slr = var->GetMutable<framework::SelectedRows>();
ProtoEncodeHelper e2((char*)buf, 128);
// NOTE: rows is of type int64_t
size_t rows_memory_size =
slr->rows().capacity() * framework::SizeOfType(typeid(int64_t));
e2.WriteVarlengthBeginning(VarMsg::kRowsFieldNumber, rows_memory_size);
slices[2] = ::grpc::Slice(e2.size());
memcpy(const_cast<uint8_t*>(slices[2].begin()), e2.data(), e2.size());
slices[3] = ::grpc::Slice(
grpc_slice_new_with_user_data(
const_cast<void*>(
reinterpret_cast<const void*>(slr->rows().data())),
rows_memory_size,
[](void* backing) {
// TODO(typhoonzero): add unref here, same as above.
},
const_cast<char*>(
reinterpret_cast<const char*>(slr->rows().data()))),
::grpc::Slice::STEAL_REF);
num_slices = 4;
}
::grpc::ByteBuffer tmp(&slices[0], num_slices);
msg->Swap(&tmp);
}
void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg,
const platform::DeviceContext& ctx,
framework::Variable* var) {
sendrecv::VariableMessage meta;
GrpcByteBufferSource source;
source.Init(msg);
::google::protobuf::io::CodedInputStream input(&source);
// do zerocopy parsing
PADDLE_ENFORCE(meta.ParseFromCodedStream(&input));
PADDLE_ENFORCE(input.ConsumedEntireMessage());
// dims is needed by both tensor and selectedrows
std::vector<int> vecdims;
for (auto& d : meta.dims()) {
vecdims.push_back(d);
}
framework::DDim dims = framework::make_ddim(vecdims);
if (meta.type() == sendrecv::LOD_TENSOR) {
auto* tensor = var->GetMutable<framework::LoDTensor>();
tensor->Resize(dims);
void* tensor_data = tensor->mutable_data(
ctx.GetPlace(),
paddle::operators::detail::ToTypeIndex(meta.data_type()));
framework::LoD lod;
for (int i = 0; i < meta.lod_level(); ++i) {
framework::Vector<size_t> v;
for (int j = 0; j < meta.lod(i).lod_data_size(); ++j) {
v.push_back(meta.lod(i).lod_data(j));
}
lod.push_back(v);
}
tensor->set_lod(lod);
// How to avoid copying and use the message buffer directly?
// Maybe need to find a way to release all memory except tensor content.
if (platform::is_gpu_place(ctx.GetPlace())) {
#ifdef PADDLE_WITH_CUDA
platform::CPUPlace cpu;
auto& gpu_dev_ctx = static_cast<const platform::CUDADeviceContext&>(ctx);
memory::Copy(boost::get<platform::CUDAPlace>(tensor->place()),
tensor_data, cpu,
reinterpret_cast<const void*>(meta.serialized().data()),
meta.serialized().size(), gpu_dev_ctx.stream());
#endif
} else {
memcpy(tensor_data,
reinterpret_cast<const void*>(meta.serialized().data()),
meta.serialized().size());
}
} else if (meta.type() == sendrecv::SELECTED_ROWS) {
auto* slr = var->GetMutable<framework::SelectedRows>();
auto* tensor = slr->mutable_value();
int64_t* rows_data = slr->mutable_rows()->data();
tensor->Resize(dims);
void* tensor_data = tensor->mutable_data(
ctx.GetPlace(),
paddle::operators::detail::ToTypeIndex(meta.data_type()));
if (platform::is_gpu_place(ctx.GetPlace())) {
#ifdef PADDLE_WITH_CUDA
platform::CPUPlace cpu;
auto& gpu_dev_ctx = static_cast<const platform::CUDADeviceContext&>(ctx);
memory::Copy(boost::get<platform::CUDAPlace>(tensor->place()),
tensor_data, cpu,
reinterpret_cast<const void*>(meta.serialized().data()),
meta.serialized().size(), gpu_dev_ctx.stream());
#endif
} else {
memcpy(tensor_data,
reinterpret_cast<const void*>(meta.serialized().data()),
meta.serialized().size());
}
// copy rows CPU data, GPU data will be copied lazly
memcpy(rows_data, reinterpret_cast<const void*>(meta.rows().data()),
meta.rows().size());
}
}
} // namespace detail } // namespace detail
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
\ No newline at end of file
...@@ -33,6 +33,14 @@ namespace detail { ...@@ -33,6 +33,14 @@ namespace detail {
#define LISTEN_TERMINATE_MESSAGE "TERMINATE@RECV" #define LISTEN_TERMINATE_MESSAGE "TERMINATE@RECV"
#define BATCH_BARRIER_MESSAGE "BATCH_BARRIER@RECV" #define BATCH_BARRIER_MESSAGE "BATCH_BARRIER@RECV"
typedef void (*DestroyCallback)(void*);
inline int64_t GetTimestamp() {
return std::chrono::duration_cast<std::chrono::milliseconds>(
std::chrono::system_clock::now().time_since_epoch())
.count();
}
void SerializeToMessage(const std::string& name, const framework::Variable* var, void SerializeToMessage(const std::string& name, const framework::Variable* var,
const platform::DeviceContext& ctx, const platform::DeviceContext& ctx,
sendrecv::VariableMessage* msg); sendrecv::VariableMessage* msg);
...@@ -40,6 +48,32 @@ void SerializeToMessage(const std::string& name, const framework::Variable* var, ...@@ -40,6 +48,32 @@ void SerializeToMessage(const std::string& name, const framework::Variable* var,
void DeserializeFromMessage(const sendrecv::VariableMessage& msg, void DeserializeFromMessage(const sendrecv::VariableMessage& msg,
const platform::DeviceContext& ctx, const platform::DeviceContext& ctx,
framework::Variable* var); framework::Variable* var);
void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
const platform::DeviceContext& ctx,
::grpc::ByteBuffer* msg);
void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg,
const platform::DeviceContext& ctx,
framework::Variable* var);
inline std::type_index ToTypeIndex(sendrecv::VariableMessage::Type type) {
switch (type) {
case sendrecv::VariableMessage::FP32:
return typeid(float); // NOLINT
case sendrecv::VariableMessage::FP64:
return typeid(double); // NOLINT
case sendrecv::VariableMessage::INT32:
return typeid(int); // NOLINT
case sendrecv::VariableMessage::INT64:
return typeid(int64_t); // NOLINT
case sendrecv::VariableMessage::BOOL:
return typeid(bool); // NOLINT
default:
PADDLE_THROW("Not support type %d", type);
}
}
} // namespace detail } // namespace detail
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <unistd.h>
#include <string>
#include <thread>
#include "gtest/gtest.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/operators/detail/sendrecvop_utils.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/string/printf.h"
namespace framework = paddle::framework;
namespace platform = paddle::platform;
namespace operators = paddle::operators;
namespace math = paddle::operators::math;
namespace memory = paddle::memory;
void RunSerdeTestTensor(platform::Place place) {
// serialize var to ByteBuffer
framework::Variable var;
auto* tensor = var.GetMutable<framework::LoDTensor>();
tensor->Resize(framework::make_ddim({4, 8, 4, 2}));
framework::LoD lod;
lod.push_back(framework::Vector<size_t>({1, 3, 8}));
tensor->set_lod(lod);
int tensor_numel = 4 * 8 * 4 * 2;
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
auto& ctx = *pool.Get(place);
float* orig_tensor_data = tensor->mutable_data<float>(place);
math::set_constant(ctx, tensor, 31.9);
::grpc::ByteBuffer msg;
operators::detail::SerializeToByteBuffer("myvar", &var, ctx, &msg);
EXPECT_GT(msg.Length(), 0);
// deserialize
std::vector<::grpc::Slice> slices;
(void)msg.Dump(&slices);
std::string tmp;
for (const auto& s : slices) {
tmp.append(reinterpret_cast<const char*>(s.begin()), s.size());
}
sendrecv::VariableMessage varmsg;
EXPECT_TRUE(varmsg.ParseFromString(tmp));
EXPECT_EQ(varmsg.varname(), "myvar");
EXPECT_EQ(varmsg.type(), 0);
EXPECT_EQ(varmsg.dims()[0], 4);
EXPECT_EQ(varmsg.dims()[1], 8);
EXPECT_EQ(varmsg.dims()[2], 4);
EXPECT_EQ(varmsg.dims()[3], 2);
EXPECT_EQ(varmsg.lod_level(), 1);
EXPECT_EQ(varmsg.lod(0).lod_data(0), 1);
EXPECT_EQ(varmsg.lod(0).lod_data(1), 3);
EXPECT_EQ(varmsg.lod(0).lod_data(2), 8);
const float* tensor_data =
reinterpret_cast<const float*>(varmsg.serialized().data());
for (int i = 0; i < varmsg.serialized().size(); ++i) {
printf("%02X ", varmsg.serialized().data()[i]);
}
printf("\n");
for (int i = 0; i < tensor_numel; ++i) {
std::cout << "#####tensor data: " << tensor_data[i] << std::endl;
EXPECT_EQ(tensor_data[i], orig_tensor_data[i]);
std::cout << "test end 1 " << std::endl;
}
std::cout << "tensor data end " << std::endl;
// deserialize zero-copy
framework::Variable var2;
operators::detail::DeserializeFromByteBuffer(msg, ctx, &var2);
auto tensor2 = var2.Get<framework::LoDTensor>();
float* tensor_data2 = nullptr;
framework::Tensor tmp_tensor;
if (platform::is_gpu_place(ctx.GetPlace())) {
platform::CPUPlace cpu;
framework::TensorCopy(tensor2, cpu, &tmp_tensor);
tensor_data2 = tmp_tensor.data<float>();
} else {
tensor_data2 = const_cast<float*>(tensor2.data<float>());
}
EXPECT_EQ(varmsg.lod_level(), 1);
EXPECT_EQ(varmsg.lod(0).lod_data(0), 1);
EXPECT_EQ(varmsg.lod(0).lod_data(1), 3);
EXPECT_EQ(varmsg.lod(0).lod_data(2), 8);
for (int i = 0; i < tensor_numel; ++i)
EXPECT_EQ(tensor_data2[i], orig_tensor_data[i]);
}
void RunSerdeTestSelectedRows(platform::Place place) {
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
auto& ctx = *pool.Get(place);
// serialize var to ByteBuffer
framework::Variable var;
auto* slr = var.GetMutable<framework::SelectedRows>();
auto* tensor = slr->mutable_value();
auto* rows = slr->mutable_rows();
tensor->Resize(framework::make_ddim({2, 10}));
int tensor_numel = 2 * 10;
float* orig_tensor_data = tensor->mutable_data<float>(place);
math::set_constant(ctx, tensor, 32.7);
rows->push_back(3);
rows->push_back(10);
::grpc::ByteBuffer msg;
operators::detail::SerializeToByteBuffer("myvar", &var, ctx, &msg);
EXPECT_GT(msg.Length(), 0);
// deserialize
std::vector<::grpc::Slice> slices;
(void)msg.Dump(&slices);
std::string tmp;
for (const auto& s : slices) {
tmp.append(reinterpret_cast<const char*>(s.begin()), s.size());
}
sendrecv::VariableMessage varmsg;
EXPECT_TRUE(varmsg.ParseFromString(tmp));
EXPECT_EQ(varmsg.varname(), "myvar");
EXPECT_EQ(varmsg.type(), 1);
const float* tensor_data =
reinterpret_cast<const float*>(varmsg.serialized().data());
const int64_t* rows_data =
reinterpret_cast<const int64_t*>(varmsg.rows().data());
for (int i = 0; i < tensor_numel; ++i) {
EXPECT_EQ(tensor_data[i], orig_tensor_data[i]);
}
EXPECT_EQ(rows_data[0], 3);
EXPECT_EQ(rows_data[1], 10);
// deserialize zero-copy
framework::Variable var2;
operators::detail::DeserializeFromByteBuffer(msg, ctx, &var2);
auto* slr2 = var2.GetMutable<framework::SelectedRows>();
auto* tensor2 = slr2->mutable_value();
auto* rows2 = slr2->mutable_rows();
float* tensor_data2 = nullptr;
framework::Tensor tmp_tensor;
if (platform::is_gpu_place(ctx.GetPlace())) {
platform::CPUPlace cpu;
framework::TensorCopy(*tensor2, cpu, &tmp_tensor);
tensor_data2 = tmp_tensor.data<float>();
} else {
tensor_data2 = const_cast<float*>(tensor2->data<float>());
}
const int64_t* rows_data2 = rows2->data();
for (int i = 0; i < tensor_numel; ++i) {
EXPECT_EQ(tensor_data2[i], orig_tensor_data[i]);
}
EXPECT_EQ(rows_data2[0], 3);
EXPECT_EQ(rows_data2[1], 10);
}
// TEST(SelectedRows, CPU) {
// platform::CPUPlace place;
// RunSerdeTestSelectedRows(place);
// }
// TEST(SelectedRows, GPU) {
// platform::CUDAPlace place;
// RunSerdeTestSelectedRows(place);
// }
TEST(Tensor, CPU) {
platform::CPUPlace place;
RunSerdeTestTensor(place);
}
TEST(Tensor, GPU) {
platform::CUDAPlace place;
RunSerdeTestTensor(place);
}
\ No newline at end of file
...@@ -71,7 +71,7 @@ class DetectionMAPOp : public framework::OperatorWithKernel { ...@@ -71,7 +71,7 @@ class DetectionMAPOp : public framework::OperatorWithKernel {
return framework::OpKernelType( return framework::OpKernelType(
framework::ToDataType( framework::ToDataType(
ctx.Input<framework::Tensor>("DetectRes")->type()), ctx.Input<framework::Tensor>("DetectRes")->type()),
ctx.device_context()); platform::CPUPlace());
} }
}; };
......
...@@ -144,6 +144,15 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> { ...@@ -144,6 +144,15 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
} }
} }
inline void ClipBBox(const Box& bbox, Box* clipped_bbox) const {
T one = static_cast<T>(1.0);
T zero = static_cast<T>(0.0);
clipped_bbox->xmin = std::max(std::min(bbox.xmin, one), zero);
clipped_bbox->ymin = std::max(std::min(bbox.ymin, one), zero);
clipped_bbox->xmax = std::max(std::min(bbox.xmax, one), zero);
clipped_bbox->ymax = std::max(std::min(bbox.ymax, one), zero);
}
void GetBoxes(const framework::LoDTensor& input_label, void GetBoxes(const framework::LoDTensor& input_label,
const framework::LoDTensor& input_detect, const framework::LoDTensor& input_detect,
std::vector<std::map<int, std::vector<Box>>>& gt_boxes, std::vector<std::map<int, std::vector<Box>>>& gt_boxes,
...@@ -360,7 +369,9 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> { ...@@ -360,7 +369,9 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
size_t max_idx = 0; size_t max_idx = 0;
auto score = pred_boxes[i].first; auto score = pred_boxes[i].first;
for (size_t j = 0; j < matched_bboxes.size(); ++j) { for (size_t j = 0; j < matched_bboxes.size(); ++j) {
T overlap = JaccardOverlap(pred_boxes[i].second, matched_bboxes[j]); Box& pred_box = pred_boxes[i].second;
ClipBBox(pred_box, &pred_box);
T overlap = JaccardOverlap(pred_box, matched_bboxes[j]);
if (overlap > max_overlap) { if (overlap > max_overlap) {
max_overlap = overlap; max_overlap = overlap;
max_idx = j; max_idx = j;
......
...@@ -41,77 +41,14 @@ class ElementwiseDivKernel : public framework::OpKernel<T> { ...@@ -41,77 +41,14 @@ class ElementwiseDivKernel : public framework::OpKernel<T> {
}; };
template <typename T> template <typename T>
struct ElementwiseDivGradFunctor { struct DivGradDX {
template <typename Device, typename X, typename Y, typename Z, typename dX, HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout / y; }
typename dY, typename dZ>
void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz) {
auto y_e = framework::EigenVector<T>::Flatten(*y);
auto z_e = framework::EigenVector<T>::Flatten(*z);
auto dz_e = framework::EigenVector<T>::Flatten(*dz);
if (dx) {
auto dx_e = framework::EigenVector<T>::Flatten(*dx);
dx_e.device(d) = dz_e / y_e;
}
if (dy) {
auto dy_e = framework::EigenVector<T>::Flatten(*dy);
dy_e.device(d) = -1.0 * dz_e * z_e / y_e;
}
}
};
template <typename T>
struct ElementwiseDivBroadCastGradFunctor {
template <typename Device, typename X, typename Y, typename Z, typename dX,
typename dY, typename dZ, typename Pre, typename N>
void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n) {
auto x_e = framework::EigenVector<T>::Flatten(*x);
auto y_e = framework::EigenVector<T>::Flatten(*y);
auto dz_e = framework::EigenVector<T>::Flatten(*dz);
auto y_e_bcast = y_e.reshape(Eigen::DSizes<int, 2>(1, n))
.broadcast(Eigen::DSizes<int, 2>(pre, 1))
.reshape(Eigen::DSizes<int, 1>(x_e.size()));
if (dx) {
auto dx_e = framework::EigenVector<T>::Flatten(*dx);
dx_e.device(d) = dz_e / y_e_bcast;
}
if (dy) {
auto dy_e = framework::EigenVector<T>::Flatten(*dy);
dy_e.device(d) = (-1.0 * (x_e * dz_e) / (y_e_bcast * y_e_bcast))
.reshape(Eigen::DSizes<int, 2>(pre, n))
.sum(Eigen::array<int, 1>{{0}});
}
}
}; };
template <typename T> template <typename T>
struct ElementwiseDivBroadCast2GradFunctor { struct DivGradDY {
template <typename Device, typename X, typename Y, typename Z, typename dX, HOSTDEVICE T operator()(T x, T y, T out, T dout) const {
typename dY, typename dZ, typename Pre, typename N, typename Post> return -dout * x / (y * y);
void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n,
Post post) {
auto x_e = framework::EigenVector<T>::Flatten(*x);
auto y_e = framework::EigenVector<T>::Flatten(*y);
auto dz_e = framework::EigenVector<T>::Flatten(*dz);
auto y_e_bcast = y_e.reshape(Eigen::DSizes<int, 3>(1, n, 1))
.broadcast(Eigen::DSizes<int, 3>(pre, 1, post))
.reshape(Eigen::DSizes<int, 1>(x_e.size()));
if (dx) {
auto dx_e = framework::EigenVector<T>::Flatten(*dx);
dx_e.device(d) = dz_e / y_e_bcast;
}
if (dy) {
auto dy_e = framework::EigenVector<T>::Flatten(*dy);
dy_e.device(d) = (-1.0 * (x_e * dz_e) / (y_e_bcast * y_e_bcast))
.reshape(Eigen::DSizes<int, 3>(pre, n, post))
.sum(Eigen::array<int, 2>{{0, 2}});
}
} }
}; };
...@@ -128,10 +65,8 @@ class ElementwiseDivGradKernel : public framework::OpKernel<T> { ...@@ -128,10 +65,8 @@ class ElementwiseDivGradKernel : public framework::OpKernel<T> {
auto* dx = ctx.Output<Tensor>(framework::GradVarName("X")); auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y")); auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
int axis = ctx.Attr<int>("axis"); int axis = ctx.Attr<int>("axis");
ElementwiseGradCompute<DeviceContext, T, ElementwiseDivGradFunctor<T>, ElemwiseGradCompute<DeviceContext, T, DivGradDX<T>, DivGradDY<T>>(
ElementwiseDivBroadCastGradFunctor<T>, ctx, *x, *y, *out, *dout, axis, dx, dy, DivGradDX<T>(), DivGradDY<T>());
ElementwiseDivBroadCast2GradFunctor<T>>(
ctx, x, y, out, dout, axis, dx, dy);
} }
}; };
......
...@@ -41,76 +41,16 @@ class ElementwiseMaxKernel : public framework::OpKernel<T> { ...@@ -41,76 +41,16 @@ class ElementwiseMaxKernel : public framework::OpKernel<T> {
}; };
template <typename T> template <typename T>
struct ElementwiseMaxGradFunctor { struct MaxGradDx {
template <typename Device, typename X, typename Y, typename Z, typename dX, HOSTDEVICE T operator()(T x, T y, T out, T dout) const {
typename dY, typename dZ> return dout * (x > y);
void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz) {
auto x_e = framework::EigenVector<T>::Flatten(*x);
auto y_e = framework::EigenVector<T>::Flatten(*y);
auto dz_e = framework::EigenVector<T>::Flatten(*dz);
if (dx) {
auto dx_e = framework::EigenVector<T>::Flatten(*dx);
dx_e.device(d) = (x_e > y_e).template cast<T>() * dz_e;
}
if (dy) {
auto dy_e = framework::EigenVector<T>::Flatten(*dy);
dy_e.device(d) = (x_e <= y_e).template cast<T>() * dz_e;
}
} }
}; };
template <typename T> template <typename T>
struct ElementwiseMaxBroadCastGradFunctor { struct MaxGradDy {
template <typename Device, typename X, typename Y, typename Z, typename dX, HOSTDEVICE T operator()(T x, T y, T out, T dout) const {
typename dY, typename dZ, typename Pre, typename N> return dout * (x <= y);
void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n) {
auto x_e = framework::EigenVector<T>::Flatten(*x);
auto y_e = framework::EigenVector<T>::Flatten(*y);
auto dz_e = framework::EigenVector<T>::Flatten(*dz);
auto y_e_bcast = y_e.reshape(Eigen::DSizes<int, 2>(1, n))
.broadcast(Eigen::DSizes<int, 2>(pre, 1))
.reshape(Eigen::DSizes<int, 1>(x_e.size()));
if (dx) {
auto dx_e = framework::EigenVector<T>::Flatten(*dx);
dx_e.device(d) = (x_e > y_e_bcast).template cast<T>() * dz_e;
}
if (dy) {
auto dy_e = framework::EigenVector<T>::Flatten(*dy);
dy_e.device(d) = ((x_e <= y_e_bcast).template cast<T>() * dz_e)
.reshape(Eigen::DSizes<int, 2>(pre, n))
.sum(Eigen::array<int, 1>{{0}});
}
}
};
template <typename T>
struct ElementwiseMaxBroadCast2GradFunctor {
template <typename Device, typename X, typename Y, typename Z, typename dX,
typename dY, typename dZ, typename Pre, typename N, typename Post>
void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n,
Post post) {
auto x_e = framework::EigenVector<T>::Flatten(*x);
auto y_e = framework::EigenVector<T>::Flatten(*y);
auto dz_e = framework::EigenVector<T>::Flatten(*dz);
auto y_e_bcast = y_e.reshape(Eigen::DSizes<int, 3>(1, n, 1))
.broadcast(Eigen::DSizes<int, 3>(pre, 1, post))
.reshape(Eigen::DSizes<int, 1>(x_e.size()));
if (dx) {
auto dx_e = framework::EigenVector<T>::Flatten(*dx);
dx_e.device(d) = (x_e > y_e_bcast).template cast<T>() * dz_e;
}
if (dy) {
auto dy_e = framework::EigenVector<T>::Flatten(*dy);
dy_e.device(d) = ((x_e <= y_e_bcast).template cast<T>() * dz_e)
.reshape(Eigen::DSizes<int, 3>(pre, n, post))
.sum(Eigen::array<int, 2>{{0, 2}});
}
} }
}; };
...@@ -127,12 +67,9 @@ class ElementwiseMaxGradKernel : public framework::OpKernel<T> { ...@@ -127,12 +67,9 @@ class ElementwiseMaxGradKernel : public framework::OpKernel<T> {
auto* dx = ctx.Output<Tensor>(framework::GradVarName("X")); auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y")); auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
int axis = ctx.Attr<int>("axis"); int axis = ctx.Attr<int>("axis");
ElementwiseGradCompute<DeviceContext, T, ElementwiseMaxGradFunctor<T>, ElemwiseGradCompute<DeviceContext, T, MaxGradDx<T>, MaxGradDy<T>>(
ElementwiseMaxBroadCastGradFunctor<T>, ctx, *x, *y, *out, *dout, axis, dx, dy, MaxGradDx<T>(), MaxGradDy<T>());
ElementwiseMaxBroadCast2GradFunctor<T>>(
ctx, x, y, out, dout, axis, dx, dy);
} }
}; };
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -41,76 +41,16 @@ class ElementwiseMinKernel : public framework::OpKernel<T> { ...@@ -41,76 +41,16 @@ class ElementwiseMinKernel : public framework::OpKernel<T> {
}; };
template <typename T> template <typename T>
struct ElementwiseMinGradFunctor { struct MinGradDx {
template <typename Device, typename X, typename Y, typename Z, typename dX, HOSTDEVICE T operator()(T x, T y, T out, T dout) const {
typename dY, typename dZ> return dout * (x < y);
void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz) {
auto x_e = framework::EigenVector<T>::Flatten(*x);
auto y_e = framework::EigenVector<T>::Flatten(*y);
auto dz_e = framework::EigenVector<T>::Flatten(*dz);
if (dx) {
auto dx_e = framework::EigenVector<T>::Flatten(*dx);
dx_e.device(d) = (x_e < y_e).template cast<T>() * dz_e;
}
if (dy) {
auto dy_e = framework::EigenVector<T>::Flatten(*dy);
dy_e.device(d) = (x_e >= y_e).template cast<T>() * dz_e;
}
} }
}; };
template <typename T> template <typename T>
struct ElementwiseMinBroadCastGradFunctor { struct MinGradDy {
template <typename Device, typename X, typename Y, typename Z, typename dX, HOSTDEVICE T operator()(T x, T y, T out, T dout) const {
typename dY, typename dZ, typename Pre, typename N> return dout * (x >= y);
void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n) {
auto x_e = framework::EigenVector<T>::Flatten(*x);
auto y_e = framework::EigenVector<T>::Flatten(*y);
auto dz_e = framework::EigenVector<T>::Flatten(*dz);
auto y_e_bcast = y_e.reshape(Eigen::DSizes<int, 2>(1, n))
.broadcast(Eigen::DSizes<int, 2>(pre, 1))
.reshape(Eigen::DSizes<int, 1>(x_e.size()));
if (dx) {
auto dx_e = framework::EigenVector<T>::Flatten(*dx);
dx_e.device(d) = (x_e < y_e_bcast).template cast<T>() * dz_e;
}
if (dy) {
auto dy_e = framework::EigenVector<T>::Flatten(*dy);
dy_e.device(d) = ((x_e >= y_e_bcast).template cast<T>() * dz_e)
.reshape(Eigen::DSizes<int, 2>(pre, n))
.sum(Eigen::array<int, 1>{{0}});
}
}
};
template <typename T>
struct ElementwiseMinBroadCast2GradFunctor {
template <typename Device, typename X, typename Y, typename Z, typename dX,
typename dY, typename dZ, typename Pre, typename N, typename Post>
void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n,
Post post) {
auto x_e = framework::EigenVector<T>::Flatten(*x);
auto y_e = framework::EigenVector<T>::Flatten(*y);
auto dz_e = framework::EigenVector<T>::Flatten(*dz);
auto y_e_bcast = y_e.reshape(Eigen::DSizes<int, 3>(1, n, 1))
.broadcast(Eigen::DSizes<int, 3>(pre, 1, post))
.reshape(Eigen::DSizes<int, 1>(x_e.size()));
if (dx) {
auto dx_e = framework::EigenVector<T>::Flatten(*dx);
dx_e.device(d) = (x_e < y_e_bcast).template cast<T>() * dz_e;
}
if (dy) {
auto dy_e = framework::EigenVector<T>::Flatten(*dy);
dy_e.device(d) = ((x_e >= y_e_bcast).template cast<T>() * dz_e)
.reshape(Eigen::DSizes<int, 3>(pre, n, post))
.sum(Eigen::array<int, 2>{{0, 2}});
}
} }
}; };
...@@ -127,12 +67,9 @@ class ElementwiseMinGradKernel : public framework::OpKernel<T> { ...@@ -127,12 +67,9 @@ class ElementwiseMinGradKernel : public framework::OpKernel<T> {
auto* dx = ctx.Output<Tensor>(framework::GradVarName("X")); auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y")); auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
int axis = ctx.Attr<int>("axis"); int axis = ctx.Attr<int>("axis");
ElementwiseGradCompute<DeviceContext, T, ElementwiseMinGradFunctor<T>, ElemwiseGradCompute<DeviceContext, T, MinGradDx<T>, MinGradDy<T>>(
ElementwiseMinBroadCastGradFunctor<T>, ctx, *x, *y, *out, *dout, axis, dx, dy, MinGradDx<T>(), MinGradDy<T>());
ElementwiseMinBroadCast2GradFunctor<T>>(
ctx, x, y, out, dout, axis, dx, dy);
} }
}; };
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -40,14 +40,15 @@ class ElementwiseMulKernel : public framework::OpKernel<T> { ...@@ -40,14 +40,15 @@ class ElementwiseMulKernel : public framework::OpKernel<T> {
}; };
template <typename T> template <typename T>
struct IdentityGrad_DX { struct MulGradDX {
HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout * y; } HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout * y; }
}; };
template <typename T> template <typename T>
struct IdentityGrad_DY { struct MulGradDY {
HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout * x; } HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout * x; }
}; };
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
class ElementwiseMulGradKernel : public framework::OpKernel<T> { class ElementwiseMulGradKernel : public framework::OpKernel<T> {
public: public:
...@@ -61,10 +62,8 @@ class ElementwiseMulGradKernel : public framework::OpKernel<T> { ...@@ -61,10 +62,8 @@ class ElementwiseMulGradKernel : public framework::OpKernel<T> {
auto* dx = ctx.Output<Tensor>(framework::GradVarName("X")); auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y")); auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
int axis = ctx.Attr<int>("axis"); int axis = ctx.Attr<int>("axis");
ElemwiseGradCompute<DeviceContext, T, IdentityGrad_DX<T>, ElemwiseGradCompute<DeviceContext, T, MulGradDX<T>, MulGradDY<T>>(
IdentityGrad_DY<T>>(ctx, *x, *y, *out, *dout, axis, dx, ctx, *x, *y, *out, *dout, axis, dx, dy, MulGradDX<T>(), MulGradDY<T>());
dy, IdentityGrad_DX<T>(),
IdentityGrad_DY<T>());
} }
}; };
} // namespace operators } // namespace operators
......
...@@ -40,61 +40,13 @@ class ElementwiseSubKernel : public framework::OpKernel<T> { ...@@ -40,61 +40,13 @@ class ElementwiseSubKernel : public framework::OpKernel<T> {
}; };
template <typename T> template <typename T>
struct ElementwiseSubGradFunctor { struct SubGradDX {
template <typename Device, typename X, typename Y, typename Z, typename dX, HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout; }
typename dY, typename dZ>
void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz) {
auto dz_e = framework::EigenVector<T>::Flatten(*dz);
if (dx) {
auto dx_e = framework::EigenVector<T>::Flatten(*dx);
dx_e.device(d) = dz_e;
}
if (dy) {
auto dy_e = framework::EigenVector<T>::Flatten(*dy);
dy_e.device(d) = (-1.0) * dz_e;
}
}
}; };
template <typename T> template <typename T>
struct ElementwiseSubBroadCastGradFunctor { struct SubGradDY {
template <typename Device, typename X, typename Y, typename Z, typename dX, HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return -dout; }
typename dY, typename dZ, typename Pre, typename N>
void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n) {
auto dz_e = framework::EigenVector<T>::Flatten(*dz);
if (dx) {
auto dx_e = framework::EigenVector<T>::Flatten(*dx);
dx_e.device(d) = dz_e;
}
if (dy) {
auto dy_e = framework::EigenVector<T>::Flatten(*dy);
dy_e.device(d) = (-1.0) *
dz_e.reshape(Eigen::DSizes<int, 2>(pre, n))
.sum(Eigen::array<int, 1>{{0}});
}
}
};
template <typename T>
struct ElementwiseSubBroadCast2GradFunctor {
template <typename Device, typename X, typename Y, typename Z, typename dX,
typename dY, typename dZ, typename Pre, typename N, typename Post>
void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n,
Post post) {
auto dz_e = framework::EigenVector<T>::Flatten(*dz);
if (dx) {
auto dx_e = framework::EigenVector<T>::Flatten(*dx);
dx_e.device(d) = dz_e;
}
if (dy) {
auto dy_e = framework::EigenVector<T>::Flatten(*dy);
dy_e.device(d) = (-1.0) *
dz_e.reshape(Eigen::DSizes<int, 3>(pre, n, post))
.sum(Eigen::array<int, 2>{{0, 2}});
}
}
}; };
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
...@@ -110,12 +62,9 @@ class ElementwiseSubGradKernel : public framework::OpKernel<T> { ...@@ -110,12 +62,9 @@ class ElementwiseSubGradKernel : public framework::OpKernel<T> {
auto* dx = ctx.Output<Tensor>(framework::GradVarName("X")); auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y")); auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
int axis = ctx.Attr<int>("axis"); int axis = ctx.Attr<int>("axis");
ElementwiseGradCompute<DeviceContext, T, ElementwiseSubGradFunctor<T>, ElemwiseGradCompute<DeviceContext, T, SubGradDX<T>, SubGradDY<T>>(
ElementwiseSubBroadCastGradFunctor<T>, ctx, *x, *y, *out, *dout, axis, dx, dy, SubGradDX<T>(), SubGradDY<T>());
ElementwiseSubBroadCast2GradFunctor<T>>(
ctx, x, y, out, dout, axis, dx, dy);
} }
}; };
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
add_subdirectory(detail) add_subdirectory(detail)
if(WITH_GPU) function(math_library TARGET)
nv_library(math_function SRCS math_function.cc math_function.cu im2col.cc im2col.cu DEPS cblas device_context framework_proto) # math_library is a function to create math library.
nv_test(math_function_gpu_test SRCS math_function_test.cu DEPS math_function tensor) # The interface is the same as cc_library.
nv_library(selected_rows_functor SRCS selected_rows_functor.cc selected_rows_functor.cu DEPS selected_rows math_function) # But it handle split GPU/CPU code and link some common library.
nv_test(selected_rows_functor_gpu_test SRCS selected_rows_functor_test.cu DEPS selected_rows_functor) set(cc_srcs)
nv_library(softmax SRCS softmax.cc softmax.cu DEPS device_context) set(cu_srcs)
nv_library(cross_entropy SRCS cross_entropy.cc cross_entropy.cu DEPS device_context) set(math_common_deps device_context framework_proto)
nv_library(pooling SRCS pooling.cc pooling.cu DEPS device_context) set(multiValueArgs DEPS)
nv_library(depthwise_conv SRCS depthwise_conv.cu DEPS device_context) cmake_parse_arguments(math_library "${options}" "${oneValueArgs}"
nv_library(sequence_pooling SRCS sequence_pooling.cc sequence_pooling.cu DEPS device_context math_function) "${multiValueArgs}" ${ARGN})
nv_library(vol2col SRCS vol2col.cc vol2col.cu DEPS device_context tensor)
nv_library(context_project SRCS context_project.cc context_project.cu DEPS device_context math_function) if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cc)
nv_library(sequence2batch SRCS sequence2batch.cc sequence2batch.cu DEPS device_context tensor math_function) list(APPEND cc_srcs ${TARGET}.cc)
nv_library(sequence_padding SRCS sequence_padding.cc sequence_padding.cu DEPS lod_tensor device_context) endif()
nv_library(sequence_scale SRCS sequence_scale.cc sequence_scale.cu DEPS lod_tensor device_context) if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu)
nv_library(lstm_compute SRCS lstm_compute.cc lstm_compute.cu DEPS device_context activation_functions) list(APPEND cu_srcs ${TARGET}.cu)
nv_library(maxouting SRCS maxouting.cc maxouting.cu DEPS device_context) endif()
nv_library(unpooling SRCS unpooling.cc unpooling.cu DEPS device_context)
nv_library(gru_compute SRCS gru_compute.cc gru_compute.cu DEPS device_context activation_functions math_function) list(LENGTH cc_srcs cc_srcs_len)
nv_library(cos_sim_functor SRCS cos_sim_functor.cc cos_sim_functor.cu DEPS device_context) if (WITH_GPU)
else() nv_library(${TARGET} SRCS ${cc_srcs} ${cu_srcs} DEPS ${math_library_DEPS} ${math_common_deps})
cc_library(math_function SRCS math_function.cc im2col.cc DEPS cblas device_context framework_proto) elseif(${cc_srcs_len} GREATER 0)
cc_library(selected_rows_functor SRCS selected_rows_functor.cc DEPS selected_rows math_function) cc_library(${TARGET} SRCS ${cc_srcs} DEPS ${math_library_DEPS} ${math_common_deps})
cc_library(softmax SRCS softmax.cc DEPS device_context) endif()
cc_library(cross_entropy SRCS cross_entropy.cc DEPS device_context) endfunction()
cc_library(pooling SRCS pooling.cc DEPS device_context)
cc_library(sequence_pooling SRCS sequence_pooling.cc DEPS device_context math_function)
cc_library(vol2col SRCS vol2col.cc DEPS device_context tensor)
cc_library(context_project SRCS context_project.cc DEPS device_context math_function)
cc_library(sequence2batch SRCS sequence2batch.cc DEPS device_context tensor math_function)
cc_library(sequence_padding SRCS sequence_padding.cc DEPS lod_tensor device_context)
cc_library(sequence_scale SRCS sequence_scale.cc DEPS lod_tensor device_context)
cc_library(lstm_compute SRCS lstm_compute.cc DEPS device_context activation_functions)
cc_library(maxouting SRCS maxouting.cc DEPS device_context)
cc_library(unpooling SRCS unpooling.cc DEPS device_context)
cc_library(gru_compute SRCS gru_compute.cc DEPS device_context activation_functions math_function)
cc_library(cos_sim_functor SRCS cos_sim_functor.cc DEPS device_context)
endif()
cc_test(math_function_test SRCS math_function_test.cc DEPS math_function tensor) # please add new math_library in alphabetical order
math_library(concat)
math_library(context_project DEPS im2col math_function)
math_library(cross_entropy)
math_library(cos_sim_functor)
math_library(depthwise_conv)
math_library(gru_compute DEPS activation_functions math_function)
math_library(im2col)
math_library(lstm_compute DEPS activation_functions)
math_library(math_function DEPS cblas)
math_library(maxouting)
math_library(pooling)
math_library(selected_rows_functor DEPS selected_rows)
math_library(sequence2batch)
math_library(sequence_padding)
math_library(sequence_pooling DEPS math_function)
math_library(sequence_scale)
math_library(softmax)
math_library(unpooling)
math_library(vol2col)
cc_test(math_function_test SRCS math_function_test.cc)
cc_test(selected_rows_functor_test SRCS selected_rows_functor_test.cc DEPS selected_rows_functor) cc_test(selected_rows_functor_test SRCS selected_rows_functor_test.cc DEPS selected_rows_functor)
cc_test(im2col_test SRCS im2col_test.cc DEPS math_function tensor) cc_test(im2col_test SRCS im2col_test.cc DEPS im2col)
cc_test(vol2col_test SRCS vol2col_test.cc DEPS vol2col tensor) cc_test(vol2col_test SRCS vol2col_test.cc DEPS vol2col)
cc_test(sequence_padding_test SRCS sequence_padding_test.cc DEPS sequence_padding) cc_test(sequence_padding_test SRCS sequence_padding_test.cc DEPS sequence_padding)
if(WITH_GPU)
nv_test(math_function_gpu_test SRCS math_function_test.cu)
nv_test(selected_rows_functor_gpu_test SRCS selected_rows_functor_test.cu DEPS selected_rows_functor)
endif()
cc_test(concat_test SRCS concat_test.cc DEPS concat)
/* Copyright (c) 2018 paddlepaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/math/concat.h"
namespace paddle {
namespace operators {
namespace math {
/*
* All tensors' dimension should be the same and the values of
* each dimension are the same, except the axis dimension.
*/
template <typename T>
class ConcatFunctor<platform::CPUDeviceContext, T> {
public:
void operator()(const platform::CPUDeviceContext& context,
const std::vector<framework::Tensor>& input, const int axis,
framework::Tensor* output) {
// TODO(zcd): Add input data validity checking
int num = input.size();
int rows = 1;
auto dim_0 = input[0].dims();
for (int i = 0; i < axis; ++i) {
rows *= dim_0[i];
}
int out_rows = rows, out_cols = 0;
std::vector<int64_t> input_cols(input.size());
for (int i = 0; i < num; ++i) {
int t_cols = input[i].numel() / rows;
out_cols += t_cols;
input_cols[i] = t_cols;
}
auto& cpu_place = boost::get<platform::CPUPlace>(context.GetPlace());
// computation
for (int k = 0; k < out_rows; ++k) {
T* dst_ptr = output->data<T>() + k * out_cols;
int col_idx = 0;
for (int j = 0; j < num; ++j) {
int col_len = input_cols[j];
const T* src_prt = input[j].data<T>() + k * col_len;
memory::Copy(cpu_place, dst_ptr + col_idx, cpu_place, src_prt,
sizeof(T) * col_len);
col_idx += col_len;
}
}
}
};
/*
* All tensors' dimension should be the same and the values of
* each dimension are the same, except the axis dimension.
*/
template <typename T>
class ConcatGradFunctor<platform::CPUDeviceContext, T> {
public:
void operator()(const platform::CPUDeviceContext& context,
const framework::Tensor& input, const int axis,
std::vector<framework::Tensor>& outputs) {
// TODO(zcd): Add input data validity checking
int num = outputs.size();
int input_rows = 1;
auto dim_0 = outputs[0].dims();
for (int i = 0; i < axis; ++i) {
input_rows *= dim_0[i];
}
int input_cols = 0;
std::vector<int64_t> output_cols(outputs.size());
for (int i = 0; i < num; ++i) {
int t_cols = outputs[i].numel() / input_rows;
input_cols += t_cols;
output_cols[i] = t_cols;
}
auto& cpu_place = boost::get<platform::CPUPlace>(context.GetPlace());
// computation
for (int k = 0; k < input_rows; ++k) {
const T* src_ptr = input.data<T>() + k * input_cols;
int col_idx = 0;
for (int j = 0; j < num; ++j) {
int col_len = output_cols[j];
T* dst_ptr = outputs[j].data<T>() + k * col_len;
memory::Copy(cpu_place, dst_ptr, cpu_place, src_ptr + col_idx,
sizeof(T) * col_len);
col_idx += col_len;
}
}
}
};
template class ConcatFunctor<platform::CPUDeviceContext, int>;
template class ConcatFunctor<platform::CPUDeviceContext, int64_t>;
template class ConcatFunctor<platform::CPUDeviceContext, float>;
template class ConcatFunctor<platform::CPUDeviceContext, double>;
template class ConcatGradFunctor<platform::CPUDeviceContext, int>;
template class ConcatGradFunctor<platform::CPUDeviceContext, int64_t>;
template class ConcatGradFunctor<platform::CPUDeviceContext, float>;
template class ConcatGradFunctor<platform::CPUDeviceContext, double>;
} // namespace math
} // namespace operators
} // namespace paddle
/* Copyright (c) 2018 paddlepaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/mixed_vector.h"
#include "paddle/fluid/operators/math/concat.h"
#include "paddle/fluid/platform/cuda_helper.h"
namespace paddle {
namespace operators {
namespace math {
template <typename T>
__device__ T upper_bound(const T* first, T count, T val) {
const T* orig = first;
const T* it = nullptr;
T step = 0;
while (count > 0) {
it = first;
step = count / 2;
it += step;
if (!(val < *it)) {
first = ++it;
count -= step + 1;
} else {
count = step;
}
}
return first - orig;
}
template <typename T>
__global__ void KernelConcat(T** inputs, const int* input_cols, int col_size,
const int output_rows, const int output_cols,
T* output) {
int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
int segment = upper_bound<int>(input_cols, col_size, tid_x) - 1;
int curr_offset = input_cols[segment];
int curr_segment = segment;
for (; tid_x < output_cols; tid_x += blockDim.x * gridDim.x) {
T curr_col_offset;
while ((curr_col_offset = input_cols[curr_segment + 1]) <= tid_x) {
curr_offset = curr_col_offset;
++curr_segment;
}
int local_col = tid_x - curr_offset;
int segment_width = curr_col_offset - curr_offset;
T* input_ptr = inputs[curr_segment];
int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
for (; tid_y < output_rows; tid_y += blockDim.y * gridDim.y)
output[tid_y * output_cols + tid_x] =
input_ptr[tid_y * segment_width + local_col];
}
}
template <typename T>
__global__ void KernelConcat(T** inputs, const int input_col,
const int output_rows, const int output_cols,
T* output) {
int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
double inv_input_col = 1.0 / input_col;
for (; tid_x < output_cols; tid_x += blockDim.x * gridDim.x) {
int split = tid_x * inv_input_col;
int in_offset = tid_x - split * input_col;
T* input_ptr = inputs[split];
int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
for (; tid_y < output_rows; tid_y += blockDim.y * gridDim.y) {
output[tid_y * output_cols + tid_x] =
input_ptr[tid_y * input_col + in_offset];
}
}
}
template <typename T>
__global__ void KernelConcatGrad(const T* input, const int input_row,
const int input_col, const int* output_cols,
int col_size, T** outputs) {
int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
int segment = upper_bound<int>(output_cols, col_size, tid_x) - 1;
int curr_offset = output_cols[segment];
int curr_segment = segment;
for (; tid_x < input_col; tid_x += blockDim.x * gridDim.x) {
T curr_col_offset;
while ((curr_col_offset = output_cols[curr_segment + 1]) <= tid_x) {
curr_offset = curr_col_offset;
++curr_segment;
}
int local_col = tid_x - curr_offset;
int segment_width = curr_col_offset - curr_offset;
T* output_ptr = outputs[curr_segment];
int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
for (; tid_y < input_row; tid_y += blockDim.y * gridDim.y)
output_ptr[tid_y * segment_width + local_col] =
input[tid_y * input_col + tid_x];
}
}
template <typename T>
__global__ void KernelConcatGrad(const T* input, const int input_row,
const int input_col, const int output_cols,
T** outputs) {
int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
double inv_input_col = 1.0 / input_col;
for (; tid_x < input_col; tid_x += blockDim.x * gridDim.x) {
int split = tid_x * inv_input_col;
int in_offset = tid_x - split * input_col;
T* output_ptr = outputs[split];
int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
for (; tid_y < input_row; tid_y += blockDim.y * gridDim.y)
output_ptr[tid_y * output_cols + in_offset] =
input[tid_y * input_col + tid_x];
}
}
/*
* All tensors' dimension should be the same and the values of
* each dimension are the same, except the axis dimension.
*/
template <typename T>
class ConcatFunctor<platform::CUDADeviceContext, T> {
public:
void operator()(const platform::CUDADeviceContext& context,
const std::vector<framework::Tensor>& input, const int axis,
framework::Tensor* output) {
// TODO(zcd): Add input data validity checking
int num = input.size();
int rows = 1;
auto dim_0 = input[0].dims();
for (int i = 0; i < axis; ++i) {
rows *= dim_0[i];
}
int cols = input[0].numel() / rows;
int out_rows = rows, out_cols = 0;
framework::Vector<int16_t> inputs_data(num * sizeof(T*) / 2);
framework::Vector<int> inputs_cols(num + 1);
inputs_cols[0] = 0;
T** inputs_ptr = reinterpret_cast<T**>(inputs_data.data());
bool sameShape = true;
for (int i = 0; i < num; ++i) {
int t_cols = input[i].numel() / rows;
if (sameShape) {
if (t_cols != cols) sameShape = false;
}
out_cols += t_cols;
inputs_cols[i + 1] = out_cols;
inputs_ptr[i] = const_cast<T*>(input[i].data<T>());
}
T** ins_gpu =
reinterpret_cast<T**>(inputs_data.CUDAMutableData(context.GetPlace()));
const int* ins_col_gpu = inputs_cols.CUDAData(context.GetPlace());
// computation
// set the thread block and grid according to CurrentDeviceId
const int kThreadsPerBlock = 1024;
int block_cols = kThreadsPerBlock;
if (out_cols < kThreadsPerBlock) { // block_cols is aligned by 32.
block_cols = ((out_cols + 31) >> 5) << 5;
}
int block_rows = kThreadsPerBlock / block_cols;
dim3 block_size = dim3(block_cols, block_rows, 1);
int max_threads = context.GetMaxPhysicalThreadCount();
int max_blocks = std::max(max_threads / kThreadsPerBlock, 1);
int grid_cols =
std::min((out_cols + block_cols - 1) / block_cols, max_blocks);
int grid_rows =
std::min(max_blocks / grid_cols, std::max(out_rows / block_rows, 1));
dim3 grid_size = dim3(grid_cols, grid_rows, 1);
if (sameShape) {
KernelConcat<<<grid_size, block_size, 0, context.stream()>>>(
ins_gpu, cols, out_rows, out_cols, output->data<T>());
} else {
KernelConcat<<<grid_size, block_size, 0, context.stream()>>>(
ins_gpu, ins_col_gpu, static_cast<int>(inputs_cols.size()), out_rows,
out_cols, output->data<T>());
}
}
};
/*
* All tensors' dimension should be the same and the values of
* each dimension are the same, except the axis dimension.
*/
template <typename T>
class ConcatGradFunctor<platform::CUDADeviceContext, T> {
public:
void operator()(const platform::CUDADeviceContext& context,
const framework::Tensor& input, const int axis,
std::vector<framework::Tensor>& outputs) {
// TODO(zcd): Add input data validity checking
int num = outputs.size();
int input_row = 1;
auto dim_0 = outputs[0].dims();
for (int i = 0; i < axis; ++i) {
input_row *= dim_0[i];
}
int output_col_0 = outputs[0].numel() / input_row;
int input_col = 0;
bool sameShape = true;
framework::Vector<int16_t> outputs_data(num * sizeof(T*) / 2);
framework::Vector<int> outputs_cols(num + 1);
outputs_cols[0] = 0;
T** outputs_ptr = reinterpret_cast<T**>(outputs_data.data());
for (int i = 0; i < num; ++i) {
int t_col = outputs[i].numel() / input_row;
if (sameShape) {
if (t_col != output_col_0) sameShape = false;
}
input_col += t_col;
outputs_cols[i + 1] = input_col;
outputs_ptr[i] = outputs[i].data<T>();
}
T** outs_gpu =
reinterpret_cast<T**>(outputs_data.CUDAMutableData(context.GetPlace()));
const int* outs_col_gpu = outputs_cols.CUDAData(context.GetPlace());
// computation
const int kThreadsPerBlock = 1024;
int block_cols = kThreadsPerBlock;
if (input_col < kThreadsPerBlock) { // block_cols is aligned by 32.
block_cols = ((input_col + 31) >> 5) << 5;
}
int block_rows = kThreadsPerBlock / block_cols;
dim3 block_size = dim3(block_cols, block_rows, 1);
int max_threads = context.GetMaxPhysicalThreadCount();
int max_blocks = std::max(max_threads / kThreadsPerBlock, 1);
int grid_cols =
std::min((input_col + block_cols - 1) / block_cols, max_blocks);
int grid_rows =
std::min(max_blocks / grid_cols, std::max(input_row / block_rows, 1));
dim3 grid_size = dim3(grid_cols, grid_rows, 1);
if (sameShape) {
KernelConcatGrad<<<grid_size, block_size, 0, context.stream()>>>(
input.data<T>(), input_row, input_col, output_col_0, outs_gpu);
} else {
KernelConcatGrad<<<grid_size, block_size, 0, context.stream()>>>(
input.data<T>(), input_row, input_col, outs_col_gpu,
static_cast<int>(outputs_cols.size()), outs_gpu);
}
}
};
template class ConcatFunctor<platform::CUDADeviceContext, int>;
template class ConcatFunctor<platform::CUDADeviceContext, int64_t>;
template class ConcatFunctor<platform::CUDADeviceContext, float>;
template class ConcatFunctor<platform::CUDADeviceContext, double>;
template class ConcatGradFunctor<platform::CUDADeviceContext, int>;
template class ConcatGradFunctor<platform::CUDADeviceContext, int64_t>;
template class ConcatGradFunctor<platform::CUDADeviceContext, float>;
template class ConcatGradFunctor<platform::CUDADeviceContext, double>;
} // namespace math
} // namespace operators
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/framework/tensor.h"
namespace paddle {
namespace operators {
namespace math {
/*
* \brief Concatenate the input tensors along the dimension axis.
* TODO(zcd): maybe it needs to be more detailed.
* Examples:
* Input[0] = [[1,2],[3,4]]
* Input[1] = [[5,6]]
* axis = 0
*
* Output = [[1,2],
* [3,4],
* [5,6]]
*/
template <typename DeviceContext, typename T>
class ConcatFunctor {
public:
void operator()(const DeviceContext& context,
const std::vector<framework::Tensor>& input, const int axis,
framework::Tensor* output);
};
/*
* \brief Split the input tensors along the dimension axis into outputs.
* TODO(zcd): maybe it needs to be more detailed.
* Examples:
* Input = [[1,2],
* [3,4],
* [5,6]]
* axis = 0
*
* Output[0] = [[1,2],[3,4]]
* Output[1] = [[5,6]]
*/
template <typename DeviceContext, typename T>
class ConcatGradFunctor {
public:
void operator()(const DeviceContext& context, const framework::Tensor& input,
const int axis, std::vector<framework::Tensor>& outputs);
};
} // namespace math
} // namespace operators
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/math/concat.h"
#include <gtest/gtest.h>
#include <vector>
#include "paddle/fluid/framework/tensor_util.h"
using namespace paddle::framework;
using namespace paddle::platform;
template <typename DeviceContext, typename Place>
void testConcat() {
Tensor input_a_cpu;
Tensor input_b_cpu;
Tensor out_cpu;
Tensor input_a;
Tensor input_b;
Tensor out;
DeviceContext* context = new DeviceContext(Place());
// DeviceContext context(Place());
/**
* cast1:
* inputs:
* t_a.shape: [2, 3, 4]
* t_b.shape: [3, 3, 4]
* output:
* out.shape: [5, 3, 4]
*/
auto dim_a = make_ddim({2, 3, 4});
auto dim_b = make_ddim({3, 3, 4});
auto dim_out = make_ddim({5, 3, 4});
input_a.mutable_data<int>(dim_a, Place());
input_b.mutable_data<int>(dim_b, Place());
out.mutable_data<int>(dim_out, Place());
if (paddle::platform::is_gpu_place(Place())) {
input_a_cpu.mutable_data<int>(dim_a, CPUPlace());
input_b_cpu.mutable_data<int>(dim_b, CPUPlace());
out_cpu.mutable_data<int>(dim_out, CPUPlace());
}
int* a_ptr;
int* b_ptr;
if (paddle::platform::is_gpu_place(Place())) {
a_ptr = input_a_cpu.data<int>();
b_ptr = input_b_cpu.data<int>();
} else {
a_ptr = input_a.data<int>();
b_ptr = input_b.data<int>();
}
for (int i = 0; i < 2 * 3 * 4; ++i) {
a_ptr[i] = i;
}
for (int i = 0; i < 3 * 3 * 4; ++i) {
b_ptr[i] = i;
}
if (paddle::platform::is_gpu_place(Place())) {
TensorCopy(input_a_cpu, Place(), *context, &input_a);
TensorCopy(input_b_cpu, Place(), *context, &input_b);
}
std::vector<Tensor> input;
input.push_back(input_a);
input.push_back(input_b);
paddle::operators::math::ConcatFunctor<DeviceContext, int> concat_functor;
concat_functor(*context, input, 0, &out);
// check the dim of input_a, input_b
PADDLE_ENFORCE_EQ(input_a.dims(), dim_a);
PADDLE_ENFORCE_EQ(input_b.dims(), dim_b);
int* out_ptr;
if (paddle::platform::is_gpu_place(Place())) {
TensorCopy(out, CPUPlace(), *context, &out_cpu);
out_ptr = out_cpu.data<int>();
} else {
out_ptr = out.data<int>();
}
int cols = 2 * 3 * 4;
int idx_a = 0, idx_b = 0;
for (int j = 0; j < 5 * 3 * 4; ++j) {
if (j >= cols) {
PADDLE_ENFORCE_EQ(out_ptr[j], b_ptr[idx_b]);
++idx_b;
} else {
PADDLE_ENFORCE_EQ(out_ptr[j], a_ptr[idx_a]);
++idx_a;
}
}
//
/**
* cast2:
* inputs:
* t_a.shape: [2, 3, 4]
* t_b.shape: [2, 4, 4]
* output:
* out.shape: [2, 7, 4]
*/
dim_a = make_ddim({2, 3, 4});
dim_b = make_ddim({2, 4, 4});
dim_out = make_ddim({2, 7, 4});
input_a.Resize(dim_a);
input_b.Resize(dim_b);
out.Resize(dim_out);
if (paddle::platform::is_gpu_place(Place())) {
input_a_cpu.Resize(dim_a);
input_b_cpu.Resize(dim_b);
out_cpu.Resize(dim_out);
}
if (paddle::platform::is_gpu_place(Place())) {
a_ptr = input_a_cpu.data<int>();
b_ptr = input_b_cpu.data<int>();
} else {
a_ptr = input_a.data<int>();
b_ptr = input_b.data<int>();
}
for (int i = 0; i < 2 * 3 * 4; ++i) {
a_ptr[i] = i;
}
for (int i = 0; i < 2 * 4 * 4; ++i) {
b_ptr[i] = i;
}
if (paddle::platform::is_gpu_place(Place())) {
TensorCopy(input_a_cpu, Place(), *context, &input_a);
TensorCopy(input_b_cpu, Place(), *context, &input_b);
}
input.clear();
input.push_back(input_a);
input.push_back(input_b);
concat_functor(*context, input, 1, &out);
// check the dim of input_a, input_b
PADDLE_ENFORCE_EQ(input_a.dims(), dim_a);
PADDLE_ENFORCE_EQ(input_b.dims(), dim_b);
if (paddle::platform::is_gpu_place(Place())) {
TensorCopy(out, CPUPlace(), *context, &out_cpu);
out_ptr = out_cpu.data<int>();
} else {
out_ptr = out.data<int>();
}
cols = 3 * 4;
idx_a = 0, idx_b = 0;
for (int i = 0; i < 2; ++i) {
for (int j = 0; j < 28; ++j) {
if (j >= cols) {
PADDLE_ENFORCE_EQ(out_ptr[i * 28 + j], b_ptr[idx_b]);
++idx_b;
} else {
PADDLE_ENFORCE_EQ(out_ptr[i * 28 + j], a_ptr[idx_a]);
++idx_a;
}
}
}
/**
* cast3:
* inputs:
* t_a.shape: [2, 3, 5]
* t_b.shape: [2, 3, 4]
* output:
* out.shape: [2, 3, 9]
*/
dim_a = make_ddim({2, 3, 4});
dim_b = make_ddim({2, 3, 5});
dim_out = make_ddim({2, 3, 9});
input_a.Resize(dim_a);
input_b.Resize(dim_b);
out.Resize(dim_out);
if (paddle::platform::is_gpu_place(Place())) {
input_a_cpu.Resize(dim_a);
input_b_cpu.Resize(dim_b);
out_cpu.Resize(dim_out);
}
if (paddle::platform::is_gpu_place(Place())) {
a_ptr = input_a_cpu.data<int>();
b_ptr = input_b_cpu.data<int>();
} else {
a_ptr = input_a.data<int>();
b_ptr = input_b.data<int>();
}
for (int i = 0; i < 2 * 3 * 4; ++i) {
a_ptr[i] = i;
}
for (int i = 0; i < 2 * 3 * 5; ++i) {
b_ptr[i] = i;
}
if (paddle::platform::is_gpu_place(Place())) {
TensorCopy(input_a_cpu, Place(), *context, &input_a);
TensorCopy(input_b_cpu, Place(), *context, &input_b);
}
input.clear();
input.push_back(input_a);
input.push_back(input_b);
concat_functor(*context, input, 2, &out);
// check the dim of input_a, input_b
PADDLE_ENFORCE_EQ(input_a.dims(), dim_a);
PADDLE_ENFORCE_EQ(input_b.dims(), dim_b);
if (paddle::platform::is_gpu_place(Place())) {
TensorCopy(out, CPUPlace(), *context, &out_cpu);
out_ptr = out_cpu.data<int>();
} else {
out_ptr = out.data<int>();
}
// check the data
cols = 4;
idx_a = 0, idx_b = 0;
for (int i = 0; i < 6; ++i) {
for (int j = 0; j < 9; ++j) {
if (j >= cols) {
PADDLE_ENFORCE_EQ(out_ptr[i * 9 + j], b_ptr[idx_b]);
++idx_b;
} else {
PADDLE_ENFORCE_EQ(out_ptr[i * 9 + j], a_ptr[idx_a]);
++idx_a;
}
}
}
/**
* cast4:
* inputs:
* axis = 1
* t_a.shape: [2, 3, 4]
* t_b.shape: [2, 3, 4]
* output:
* out.shape: [2, 6, 4]
*/
dim_a = make_ddim({2, 3, 4});
dim_b = make_ddim({2, 3, 4});
dim_out = make_ddim({2, 6, 4});
input_a.Resize(dim_a);
input_b.Resize(dim_b);
out.Resize(dim_out);
if (paddle::platform::is_gpu_place(Place())) {
input_a_cpu.Resize(dim_a);
input_b_cpu.Resize(dim_b);
out_cpu.Resize(dim_out);
}
if (paddle::platform::is_gpu_place(Place())) {
a_ptr = input_a_cpu.data<int>();
b_ptr = input_b_cpu.data<int>();
} else {
a_ptr = input_a.data<int>();
b_ptr = input_b.data<int>();
}
for (int i = 0; i < 2 * 3 * 4; ++i) {
a_ptr[i] = i;
}
for (int i = 0; i < 2 * 3 * 4; ++i) {
b_ptr[i] = i;
}
if (paddle::platform::is_gpu_place(Place())) {
TensorCopy(input_a_cpu, Place(), *context, &input_a);
TensorCopy(input_b_cpu, Place(), *context, &input_b);
}
input.clear();
input.push_back(input_a);
input.push_back(input_b);
concat_functor(*context, input, 1, &out);
// check the dim of input_a, input_b
PADDLE_ENFORCE_EQ(input_a.dims(), dim_a);
PADDLE_ENFORCE_EQ(input_b.dims(), dim_b);
if (paddle::platform::is_gpu_place(Place())) {
TensorCopy(out, CPUPlace(), *context, &out_cpu);
out_ptr = out_cpu.data<int>();
} else {
out_ptr = out.data<int>();
}
// check the data
cols = 12;
idx_a = 0, idx_b = 0;
for (int i = 0; i < 2; ++i) {
for (int j = 0; j < 24; ++j) {
if (j >= cols) {
PADDLE_ENFORCE_EQ(out_ptr[i * 24 + j], b_ptr[idx_b]);
++idx_b;
} else {
PADDLE_ENFORCE_EQ(out_ptr[i * 24 + j], a_ptr[idx_a]);
++idx_a;
}
}
}
}
TEST(math, concat) {
testConcat<paddle::platform::CPUDeviceContext, paddle::platform::CPUPlace>();
#ifdef PADDLE_WITH_CUDA
testConcat<paddle::platform::CUDADeviceContext,
paddle::platform::CUDAPlace>();
#endif
}
...@@ -15,11 +15,23 @@ limitations under the License. */ ...@@ -15,11 +15,23 @@ limitations under the License. */
#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/operators/math/math_function_impl.h" #include "paddle/fluid/operators/math/math_function_impl.h"
#include "paddle/fluid/platform/float16.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace math { namespace math {
using float16 = paddle::platform::float16;
template <>
void gemm<platform::CPUDeviceContext, float16>(
const platform::CPUDeviceContext& context, const CBLAS_TRANSPOSE transA,
const CBLAS_TRANSPOSE transB, const int M, const int N, const int K,
const float16 alpha, const float16* A, const float16* B, const float16 beta,
float16* C) {
PADDLE_THROW("float16 GEMM not supported on CPU");
}
template <> template <>
void gemm<platform::CPUDeviceContext, float>( void gemm<platform::CPUDeviceContext, float>(
const platform::CPUDeviceContext& context, const CBLAS_TRANSPOSE transA, const platform::CPUDeviceContext& context, const CBLAS_TRANSPOSE transA,
...@@ -46,6 +58,15 @@ void gemm<platform::CPUDeviceContext, double>( ...@@ -46,6 +58,15 @@ void gemm<platform::CPUDeviceContext, double>(
beta, C, ldc); beta, C, ldc);
} }
template <>
void gemm<platform::CPUDeviceContext, float16>(
const platform::CPUDeviceContext& context, const bool transA,
const bool transB, const int M, const int N, const int K,
const float16 alpha, const float16* A, const int lda, const float16* B,
const int ldb, const float16 beta, float16* C, const int ldc) {
PADDLE_THROW("float16 GEMM not supported on CPU");
}
template <> template <>
void gemm<platform::CPUDeviceContext, float>( void gemm<platform::CPUDeviceContext, float>(
const platform::CPUDeviceContext& context, const bool transA, const platform::CPUDeviceContext& context, const bool transA,
...@@ -68,6 +89,15 @@ void gemm<platform::CPUDeviceContext, double>( ...@@ -68,6 +89,15 @@ void gemm<platform::CPUDeviceContext, double>(
lda, B, ldb, beta, C, ldc); lda, B, ldb, beta, C, ldc);
} }
template <>
void matmul<platform::CPUDeviceContext, float16>(
const platform::CPUDeviceContext& context,
const framework::Tensor& matrix_a, bool trans_a,
const framework::Tensor& matrix_b, bool trans_b, float16 alpha,
framework::Tensor* matrix_out, float16 beta) {
PADDLE_THROW("float16 matmul not supported on CPU");
}
template <> template <>
void matmul<platform::CPUDeviceContext, float>( void matmul<platform::CPUDeviceContext, float>(
const platform::CPUDeviceContext& context, const platform::CPUDeviceContext& context,
...@@ -126,6 +156,15 @@ void matmul<platform::CPUDeviceContext, double>( ...@@ -126,6 +156,15 @@ void matmul<platform::CPUDeviceContext, double>(
matrix_b.data<double>(), beta, matrix_out->data<double>()); matrix_b.data<double>(), beta, matrix_out->data<double>());
} }
template <>
void batched_gemm<platform::CPUDeviceContext, float16>(
const platform::CPUDeviceContext& context, const CBLAS_TRANSPOSE transA,
const CBLAS_TRANSPOSE transB, const int M, const int N, const int K,
const float16 alpha, const float16* A, const float16* B, const float16 beta,
float16* C, const int batchCount, const int strideA, const int strideB) {
PADDLE_THROW("float16 batched_gemm not supported on CPU");
}
#ifdef PADDLE_WITH_MKLML #ifdef PADDLE_WITH_MKLML
// Use cblas_{s,d}gemm_batched if available: Run with 1 group of size batchSize. // Use cblas_{s,d}gemm_batched if available: Run with 1 group of size batchSize.
template <> template <>
......
...@@ -16,11 +16,40 @@ limitations under the License. */ ...@@ -16,11 +16,40 @@ limitations under the License. */
#include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/math_function_impl.h" #include "paddle/fluid/operators/math/math_function_impl.h"
#include "paddle/fluid/platform/float16.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace math { namespace math {
using float16 = paddle::platform::float16;
template <>
void gemm<platform::CUDADeviceContext, float16>(
const platform::CUDADeviceContext& context, const CBLAS_TRANSPOSE transA,
const CBLAS_TRANSPOSE transB, const int M, const int N, const int K,
const float16 alpha, const float16* A, const float16* B, const float16 beta,
float16* C) {
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
int lda = (transA == CblasNoTrans) ? K : M;
int ldb = (transB == CblasNoTrans) ? N : K;
cublasOperation_t cuTransA =
(transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
cublasOperation_t cuTransB =
(transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
const half h_alpha = static_cast<const half>(alpha);
const half h_beta = static_cast<const half>(beta);
const half* h_A = reinterpret_cast<const half*>(A);
const half* h_B = reinterpret_cast<const half*>(B);
half* h_C = reinterpret_cast<half*>(C);
PADDLE_ENFORCE(platform::dynload::cublasHgemm(
context.cublas_handle(), cuTransB, cuTransA, N, M, K, &h_alpha, h_B, ldb,
h_A, lda, &h_beta, h_C, N));
}
template <> template <>
void gemm<platform::CUDADeviceContext, float>( void gemm<platform::CUDADeviceContext, float>(
const platform::CUDADeviceContext& context, const CBLAS_TRANSPOSE transA, const platform::CUDADeviceContext& context, const CBLAS_TRANSPOSE transA,
...@@ -60,6 +89,28 @@ void gemm<platform::CUDADeviceContext, double>( ...@@ -60,6 +89,28 @@ void gemm<platform::CUDADeviceContext, double>(
lda, &beta, C, N)); lda, &beta, C, N));
} }
template <>
void gemm<platform::CUDADeviceContext, float16>(
const platform::CUDADeviceContext& context, const bool transA,
const bool transB, const int M, const int N, const int K,
const float16 alpha, const float16* A, const int lda, const float16* B,
const int ldb, const float16 beta, float16* C, const int ldc) {
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
cublasOperation_t cuTransA = transA == false ? CUBLAS_OP_N : CUBLAS_OP_T;
cublasOperation_t cuTransB = transB == false ? CUBLAS_OP_N : CUBLAS_OP_T;
const half h_alpha = static_cast<const half>(alpha);
const half h_beta = static_cast<const half>(beta);
const half* h_A = reinterpret_cast<const half*>(A);
const half* h_B = reinterpret_cast<const half*>(B);
half* h_C = reinterpret_cast<half*>(C);
PADDLE_ENFORCE(platform::dynload::cublasHgemm(
context.cublas_handle(), cuTransB, cuTransA, N, M, K, &h_alpha, h_B, ldb,
h_A, lda, &h_beta, h_C, ldc));
}
template <> template <>
void gemm<platform::CUDADeviceContext, float>( void gemm<platform::CUDADeviceContext, float>(
const platform::CUDADeviceContext& context, const bool transA, const platform::CUDADeviceContext& context, const bool transA,
...@@ -90,6 +141,35 @@ void gemm<platform::CUDADeviceContext, double>( ...@@ -90,6 +141,35 @@ void gemm<platform::CUDADeviceContext, double>(
lda, &beta, C, ldc)); lda, &beta, C, ldc));
} }
template <>
void matmul<platform::CUDADeviceContext, float16>(
const platform::CUDADeviceContext& context,
const framework::Tensor& matrix_a, bool trans_a,
const framework::Tensor& matrix_b, bool trans_b, float16 alpha,
framework::Tensor* matrix_out, float16 beta) {
auto dim_a = matrix_a.dims();
auto dim_b = matrix_b.dims();
auto dim_out = matrix_out->dims();
PADDLE_ENFORCE(dim_a.size() == 2 && dim_b.size() == 2 && dim_out.size() == 2,
"The input and output of matmul be matrix");
PADDLE_ENFORCE(platform::is_gpu_place(matrix_a.place()) &&
platform::is_gpu_place(matrix_b.place()) &&
platform::is_gpu_place(matrix_out->place()),
"Matrix must all be in CUDAPlace");
int M = dim_out[0];
int N = dim_out[1];
int K = (trans_a == false) ? dim_a[1] : dim_a[0];
CBLAS_TRANSPOSE transA = (trans_a == false) ? CblasNoTrans : CblasTrans;
CBLAS_TRANSPOSE transB = (trans_b == false) ? CblasNoTrans : CblasTrans;
gemm<platform::CUDADeviceContext, float16>(
context, transA, transB, M, N, K, alpha, matrix_a.data<float16>(),
matrix_b.data<float16>(), beta, matrix_out->data<float16>());
}
template <> template <>
void matmul<platform::CUDADeviceContext, float>( void matmul<platform::CUDADeviceContext, float>(
const platform::CUDADeviceContext& context, const platform::CUDADeviceContext& context,
...@@ -148,6 +228,34 @@ void matmul<platform::CUDADeviceContext, double>( ...@@ -148,6 +228,34 @@ void matmul<platform::CUDADeviceContext, double>(
matrix_b.data<double>(), beta, matrix_out->data<double>()); matrix_b.data<double>(), beta, matrix_out->data<double>());
} }
template <>
void batched_gemm<platform::CUDADeviceContext, float16>(
const platform::CUDADeviceContext& context, const CBLAS_TRANSPOSE transA,
const CBLAS_TRANSPOSE transB, const int M, const int N, const int K,
const float16 alpha, const float16* A, const float16* B, const float16 beta,
float16* C, const int batchCount, const int strideA, const int strideB) {
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
int lda = (transA == CblasNoTrans) ? K : M;
int ldb = (transB == CblasNoTrans) ? N : K;
int ldc = N;
cublasOperation_t cuTransA =
(transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
cublasOperation_t cuTransB =
(transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
const int strideC = M * N;
const half h_alpha = static_cast<const half>(alpha);
const half h_beta = static_cast<const half>(beta);
const half* h_A = reinterpret_cast<const half*>(A);
const half* h_B = reinterpret_cast<const half*>(B);
half* h_C = reinterpret_cast<half*>(C);
PADDLE_ENFORCE(platform::dynload::cublasHgemmStridedBatched(
context.cublas_handle(), cuTransB, cuTransA, N, M, K, &h_alpha, h_B, ldb,
strideB, h_A, lda, strideA, &h_beta, h_C, ldc, strideC, batchCount));
}
template <> template <>
void batched_gemm<platform::CUDADeviceContext, float>( void batched_gemm<platform::CUDADeviceContext, float>(
const platform::CUDADeviceContext& context, const CBLAS_TRANSPOSE transA, const platform::CUDADeviceContext& context, const CBLAS_TRANSPOSE transA,
......
...@@ -14,30 +14,41 @@ ...@@ -14,30 +14,41 @@
#include "gtest/gtest.h" #include "gtest/gtest.h"
#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function.h"
TEST(math_function, notrans_mul_trans) { void fill_fp16_data(paddle::platform::float16* in_ptr, size_t size,
paddle::framework::Tensor input1; const std::vector<float>& data) {
paddle::framework::Tensor input1_gpu; PADDLE_ENFORCE_EQ(size, data.size());
paddle::framework::Tensor input2_gpu; for (size_t i = 0; i < data.size(); ++i) {
paddle::framework::Tensor out_gpu; in_ptr[i] = paddle::platform::float16(data[i]);
paddle::framework::Tensor out; }
}
auto* cpu_place = new paddle::platform::CPUPlace();
float* input1_ptr = input1.mutable_data<float>({2, 3}, *cpu_place); TEST(math_function, notrans_mul_trans_fp32) {
using namespace paddle::framework;
using namespace paddle::platform;
Tensor input1;
Tensor input1_gpu;
Tensor input2_gpu;
Tensor out_gpu;
Tensor out;
CPUPlace cpu_place;
CUDAPlace gpu_place(0);
CUDADeviceContext context(gpu_place);
float* input1_ptr = input1.mutable_data<float>({2, 3}, cpu_place);
float arr[6] = {0, 1, 2, 3, 4, 5}; float arr[6] = {0, 1, 2, 3, 4, 5};
memcpy(input1_ptr, arr, 6 * sizeof(float)); memcpy(input1_ptr, arr, 6 * sizeof(float));
auto* gpu_place = new paddle::platform::CUDAPlace(0); TensorCopy(input1, gpu_place, context, &input1_gpu);
paddle::platform::CUDADeviceContext context(*gpu_place); TensorCopy(input1, gpu_place, context, &input2_gpu);
paddle::framework::TensorCopy(input1, *gpu_place, context, &input1_gpu);
paddle::framework::TensorCopy(input1, *gpu_place, context, &input2_gpu);
out_gpu.mutable_data<float>({2, 2}, *gpu_place); out_gpu.mutable_data<float>({2, 2}, gpu_place);
paddle::operators::math::matmul<paddle::platform::CUDADeviceContext, float>( paddle::operators::math::matmul<CUDADeviceContext, float>(
context, input1_gpu, false, input2_gpu, true, 1, &out_gpu, 0); context, input1_gpu, false, input2_gpu, true, 1, &out_gpu, 0);
paddle::framework::TensorCopy(out_gpu, *cpu_place, context, &out); TensorCopy(out_gpu, cpu_place, context, &out);
float* out_ptr = out.data<float>(); float* out_ptr = out.data<float>();
context.Wait(); context.Wait();
...@@ -45,33 +56,71 @@ TEST(math_function, notrans_mul_trans) { ...@@ -45,33 +56,71 @@ TEST(math_function, notrans_mul_trans) {
EXPECT_EQ(out_ptr[1], 14); EXPECT_EQ(out_ptr[1], 14);
EXPECT_EQ(out_ptr[2], 14); EXPECT_EQ(out_ptr[2], 14);
EXPECT_EQ(out_ptr[3], 50); EXPECT_EQ(out_ptr[3], 50);
delete gpu_place;
} }
TEST(math_function, trans_mul_notrans) { TEST(math_function, notrans_mul_trans_fp16) {
paddle::framework::Tensor input1; using namespace paddle::framework;
paddle::framework::Tensor input1_gpu; using namespace paddle::platform;
paddle::framework::Tensor input2_gpu;
paddle::framework::Tensor out_gpu; Tensor input1;
paddle::framework::Tensor out; Tensor input1_gpu;
Tensor input2_gpu;
Tensor out_gpu;
Tensor out;
CPUPlace cpu_place;
CUDAPlace gpu_place(0);
CUDADeviceContext context(gpu_place);
float16* input1_ptr = input1.mutable_data<float16>({2, 3}, cpu_place);
fill_fp16_data(input1_ptr, input1.numel(), {0, 1, 2, 3, 4, 5});
TensorCopy(input1, gpu_place, context, &input1_gpu);
TensorCopy(input1, gpu_place, context, &input2_gpu);
out_gpu.mutable_data<float16>({2, 2}, gpu_place);
paddle::operators::math::matmul<CUDADeviceContext, float16>(
context, input1_gpu, false, input2_gpu, true, float16(1), &out_gpu,
float16(0));
TensorCopy(out_gpu, cpu_place, context, &out);
float16* out_ptr = out.data<float16>();
context.Wait();
EXPECT_EQ(static_cast<float>(out_ptr[0]), 5);
EXPECT_EQ(static_cast<float>(out_ptr[1]), 14);
EXPECT_EQ(static_cast<float>(out_ptr[2]), 14);
EXPECT_EQ(static_cast<float>(out_ptr[3]), 50);
}
TEST(math_function, trans_mul_notrans_fp32) {
using namespace paddle::framework;
using namespace paddle::platform;
Tensor input1;
Tensor input1_gpu;
Tensor input2_gpu;
Tensor out_gpu;
Tensor out;
CPUPlace cpu_place;
CUDAPlace gpu_place(0);
CUDADeviceContext context(gpu_place);
auto* cpu_place = new paddle::platform::CPUPlace(); float* input1_ptr = input1.mutable_data<float>({2, 3}, cpu_place);
float* input1_ptr = input1.mutable_data<float>({2, 3}, *cpu_place);
float arr[6] = {0, 1, 2, 3, 4, 5}; float arr[6] = {0, 1, 2, 3, 4, 5};
memcpy(input1_ptr, arr, 6 * sizeof(float)); memcpy(input1_ptr, arr, 6 * sizeof(float));
auto* gpu_place = new paddle::platform::CUDAPlace(0); TensorCopy(input1, gpu_place, context, &input1_gpu);
paddle::platform::CUDADeviceContext context(*gpu_place); TensorCopy(input1, gpu_place, context, &input2_gpu);
paddle::framework::TensorCopy(input1, *gpu_place, context, &input1_gpu); out_gpu.mutable_data<float>({3, 3}, gpu_place);
paddle::framework::TensorCopy(input1, *gpu_place, context, &input2_gpu);
out_gpu.mutable_data<float>({3, 3}, *gpu_place);
paddle::operators::math::matmul<paddle::platform::CUDADeviceContext, float>( paddle::operators::math::matmul<paddle::platform::CUDADeviceContext, float>(
context, input1_gpu, true, input2_gpu, false, 1, &out_gpu, 0); context, input1_gpu, true, input2_gpu, false, 1, &out_gpu, 0);
paddle::framework::TensorCopy(out_gpu, *cpu_place, context, &out); TensorCopy(out_gpu, cpu_place, context, &out);
float* out_ptr = out.data<float>(); float* out_ptr = out.data<float>();
context.Wait(); context.Wait();
...@@ -84,45 +133,88 @@ TEST(math_function, trans_mul_notrans) { ...@@ -84,45 +133,88 @@ TEST(math_function, trans_mul_notrans) {
EXPECT_EQ(out_ptr[6], 15); EXPECT_EQ(out_ptr[6], 15);
EXPECT_EQ(out_ptr[7], 22); EXPECT_EQ(out_ptr[7], 22);
EXPECT_EQ(out_ptr[8], 29); EXPECT_EQ(out_ptr[8], 29);
delete gpu_place;
} }
TEST(math_function, gemm_notrans_cublas) { TEST(math_function, trans_mul_notrans_fp16) {
paddle::framework::Tensor input1; using namespace paddle::framework;
paddle::framework::Tensor input2; using namespace paddle::platform;
paddle::framework::Tensor input3;
paddle::framework::Tensor input1_gpu; Tensor input1;
paddle::framework::Tensor input2_gpu; Tensor input1_gpu;
paddle::framework::Tensor input3_gpu; Tensor input2_gpu;
Tensor out_gpu;
Tensor out;
CPUPlace cpu_place;
CUDAPlace gpu_place(0);
CUDADeviceContext context(gpu_place);
float16* input1_ptr = input1.mutable_data<float16>({2, 3}, cpu_place);
fill_fp16_data(input1_ptr, input1.numel(), {0, 1, 2, 3, 4, 5});
TensorCopy(input1, gpu_place, context, &input1_gpu);
TensorCopy(input1, gpu_place, context, &input2_gpu);
out_gpu.mutable_data<float16>({3, 3}, gpu_place);
paddle::operators::math::matmul<paddle::platform::CUDADeviceContext, float16>(
context, input1_gpu, true, input2_gpu, false, float16(1), &out_gpu,
float16(0));
TensorCopy(out_gpu, cpu_place, context, &out);
float16* out_ptr = out.data<float16>();
context.Wait();
EXPECT_EQ(static_cast<float>(out_ptr[0]), 9);
EXPECT_EQ(static_cast<float>(out_ptr[1]), 12);
EXPECT_EQ(static_cast<float>(out_ptr[2]), 15);
EXPECT_EQ(static_cast<float>(out_ptr[3]), 12);
EXPECT_EQ(static_cast<float>(out_ptr[4]), 17);
EXPECT_EQ(static_cast<float>(out_ptr[5]), 22);
EXPECT_EQ(static_cast<float>(out_ptr[6]), 15);
EXPECT_EQ(static_cast<float>(out_ptr[7]), 22);
EXPECT_EQ(static_cast<float>(out_ptr[8]), 29);
}
TEST(math_function, gemm_notrans_cublas_fp32) {
using namespace paddle::framework;
using namespace paddle::platform;
Tensor input1;
Tensor input2;
Tensor input3;
Tensor input1_gpu;
Tensor input2_gpu;
Tensor input3_gpu;
CPUPlace cpu_place;
CUDAPlace gpu_place(0);
CUDADeviceContext context(gpu_place);
int m = 2; int m = 2;
int n = 3; int n = 3;
int k = 3; int k = 3;
auto* cpu_place = new paddle::platform::CPUPlace(); float* input1_ptr = input1.mutable_data<float>({2, 3}, cpu_place);
float* input1_ptr = input1.mutable_data<float>({2, 3}, *cpu_place);
float arr1[6] = {0, 1, 2, 3, 4, 5}; float arr1[6] = {0, 1, 2, 3, 4, 5};
memcpy(input1_ptr, arr1, 6 * sizeof(float)); memcpy(input1_ptr, arr1, 6 * sizeof(float));
float* input2_ptr = input2.mutable_data<float>({3, 4}, *cpu_place); float* input2_ptr = input2.mutable_data<float>({3, 4}, cpu_place);
float arr2[12] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}; float arr2[12] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
memcpy(input2_ptr, arr2, 12 * sizeof(float)); memcpy(input2_ptr, arr2, 12 * sizeof(float));
float* input3_ptr = input3.mutable_data<float>({2, 4}, *cpu_place); float* input3_ptr = input3.mutable_data<float>({2, 4}, cpu_place);
float arr3[8] = {0, 1, 2, 3, 4, 5, 6, 7}; float arr3[8] = {0, 1, 2, 3, 4, 5, 6, 7};
memcpy(input3_ptr, arr3, 8 * sizeof(float)); memcpy(input3_ptr, arr3, 8 * sizeof(float));
auto* gpu_place = new paddle::platform::CUDAPlace(0); TensorCopy(input1, gpu_place, context, &input1_gpu);
paddle::platform::CUDADeviceContext context(*gpu_place); TensorCopy(input2, gpu_place, context, &input2_gpu);
TensorCopy(input3, gpu_place, context, &input3_gpu);
paddle::framework::TensorCopy(input1, *gpu_place, context, &input1_gpu);
paddle::framework::TensorCopy(input2, *gpu_place, context, &input2_gpu);
paddle::framework::TensorCopy(input3, *gpu_place, context, &input3_gpu);
float* a = input1_gpu.data<float>(); float* a = input1_gpu.data<float>();
float* b = input2_gpu.data<float>(); float* b = input2_gpu.data<float>();
float* c = input3_gpu.mutable_data<float>(*gpu_place); float* c = input3_gpu.mutable_data<float>(gpu_place);
paddle::operators::math::gemm<paddle::platform::CUDADeviceContext, float>( paddle::operators::math::gemm<paddle::platform::CUDADeviceContext, float>(
context, false, false, m, n, k, 1, a, 3, b + 1, 4, 1, c + 1, 4); context, false, false, m, n, k, 1, a, 3, b + 1, 4, 1, c + 1, 4);
paddle::framework::TensorCopy(input3_gpu, *cpu_place, context, &input3); TensorCopy(input3_gpu, cpu_place, context, &input3);
// numpy code: // numpy code:
// a = np.arange(6).reshape(2, 3) // a = np.arange(6).reshape(2, 3)
...@@ -139,47 +231,105 @@ TEST(math_function, gemm_notrans_cublas) { ...@@ -139,47 +231,105 @@ TEST(math_function, gemm_notrans_cublas) {
EXPECT_EQ(input3_ptr[5], 73); EXPECT_EQ(input3_ptr[5], 73);
EXPECT_EQ(input3_ptr[6], 86); EXPECT_EQ(input3_ptr[6], 86);
EXPECT_EQ(input3_ptr[7], 99); EXPECT_EQ(input3_ptr[7], 99);
delete gpu_place;
} }
TEST(math_function, gemm_trans_cublas) { TEST(math_function, gemm_notrans_cublas_fp16) {
paddle::framework::Tensor input1; using namespace paddle::framework;
paddle::framework::Tensor input2; using namespace paddle::platform;
paddle::framework::Tensor input3;
paddle::framework::Tensor input1_gpu; Tensor input1;
paddle::framework::Tensor input2_gpu; Tensor input2;
paddle::framework::Tensor input3_gpu; Tensor input3;
Tensor input1_gpu;
Tensor input2_gpu;
Tensor input3_gpu;
CPUPlace cpu_place;
CUDAPlace gpu_place(0);
CUDADeviceContext context(gpu_place);
int m = 2;
int n = 3;
int k = 3;
float16* input1_ptr = input1.mutable_data<float16>({2, 3}, cpu_place);
fill_fp16_data(input1_ptr, input1.numel(), {0, 1, 2, 3, 4, 5});
float16* input2_ptr = input2.mutable_data<float16>({3, 4}, cpu_place);
fill_fp16_data(input2_ptr, input2.numel(),
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11});
float16* input3_ptr = input3.mutable_data<float16>({2, 4}, cpu_place);
fill_fp16_data(input3_ptr, input3.numel(), {0, 1, 2, 3, 4, 5, 6, 7});
TensorCopy(input1, gpu_place, context, &input1_gpu);
TensorCopy(input2, gpu_place, context, &input2_gpu);
TensorCopy(input3, gpu_place, context, &input3_gpu);
float16* a = input1_gpu.data<float16>();
float16* b = input2_gpu.data<float16>();
float16* c = input3_gpu.mutable_data<float16>(gpu_place);
paddle::operators::math::gemm<paddle::platform::CUDADeviceContext, float16>(
context, false, false, m, n, k, float16(1), a, 3, b + 1, 4, float16(1),
c + 1, 4);
TensorCopy(input3_gpu, cpu_place, context, &input3);
// numpy code:
// a = np.arange(6).reshape(2, 3)
// b = np.arange(12).reshape(3, 4)[:, 1:]
// c = np.arange(8).reshape(2, 4)[:, 1:]
// out = np.arange(8).reshape(2, 4)
// out[:, 1:] = np.dot(a, b) + c
context.Wait();
EXPECT_EQ(static_cast<float>(input3_ptr[0]), 0);
EXPECT_EQ(static_cast<float>(input3_ptr[1]), 24);
EXPECT_EQ(static_cast<float>(input3_ptr[2]), 28);
EXPECT_EQ(static_cast<float>(input3_ptr[3]), 32);
EXPECT_EQ(static_cast<float>(input3_ptr[4]), 4);
EXPECT_EQ(static_cast<float>(input3_ptr[5]), 73);
EXPECT_EQ(static_cast<float>(input3_ptr[6]), 86);
EXPECT_EQ(static_cast<float>(input3_ptr[7]), 99);
}
TEST(math_function, gemm_trans_cublas_fp32) {
using namespace paddle::framework;
using namespace paddle::platform;
Tensor input1;
Tensor input2;
Tensor input3;
Tensor input1_gpu;
Tensor input2_gpu;
Tensor input3_gpu;
CPUPlace cpu_place;
CUDAPlace gpu_place(0);
CUDADeviceContext context(gpu_place);
int m = 2; int m = 2;
int n = 3; int n = 3;
int k = 3; int k = 3;
auto* cpu_place = new paddle::platform::CPUPlace(); float* input1_ptr = input1.mutable_data<float>({2, 3}, cpu_place);
float* input1_ptr = input1.mutable_data<float>({2, 3}, *cpu_place);
float arr1[6] = {0, 1, 2, 3, 4, 5}; float arr1[6] = {0, 1, 2, 3, 4, 5};
memcpy(input1_ptr, arr1, 6 * sizeof(float)); memcpy(input1_ptr, arr1, 6 * sizeof(float));
float* input2_ptr = input2.mutable_data<float>({4, 3}, *cpu_place); float* input2_ptr = input2.mutable_data<float>({4, 3}, cpu_place);
float arr2[12] = {0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11}; float arr2[12] = {0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11};
memcpy(input2_ptr, arr2, 12 * sizeof(float)); memcpy(input2_ptr, arr2, 12 * sizeof(float));
float* input3_ptr = input3.mutable_data<float>({2, 4}, *cpu_place); float* input3_ptr = input3.mutable_data<float>({2, 4}, cpu_place);
float arr3[8] = {0, 1, 2, 3, 4, 5, 6, 7}; float arr3[8] = {0, 1, 2, 3, 4, 5, 6, 7};
memcpy(input3_ptr, arr3, 8 * sizeof(float)); memcpy(input3_ptr, arr3, 8 * sizeof(float));
auto* gpu_place = new paddle::platform::CUDAPlace(0); TensorCopy(input1, gpu_place, context, &input1_gpu);
paddle::platform::CUDADeviceContext context(*gpu_place); TensorCopy(input2, gpu_place, context, &input2_gpu);
TensorCopy(input3, gpu_place, context, &input3_gpu);
paddle::framework::TensorCopy(input1, *gpu_place, context, &input1_gpu);
paddle::framework::TensorCopy(input2, *gpu_place, context, &input2_gpu);
paddle::framework::TensorCopy(input3, *gpu_place, context, &input3_gpu);
float* a = input1_gpu.data<float>(); float* a = input1_gpu.data<float>();
float* b = input2_gpu.data<float>(); float* b = input2_gpu.data<float>();
float* c = input3_gpu.mutable_data<float>(*gpu_place); float* c = input3_gpu.mutable_data<float>(gpu_place);
paddle::operators::math::gemm<paddle::platform::CUDADeviceContext, float>( paddle::operators::math::gemm<paddle::platform::CUDADeviceContext, float>(
context, false, true, m, n, k, 1, a, 3, b + 3, 3, 1, c + 1, 4); context, false, true, m, n, k, 1, a, 3, b + 3, 3, 1, c + 1, 4);
paddle::framework::TensorCopy(input3_gpu, *cpu_place, context, &input3); TensorCopy(input3_gpu, cpu_place, context, &input3);
context.Wait();
context.Wait();
EXPECT_EQ(input3_ptr[0], 0); EXPECT_EQ(input3_ptr[0], 0);
EXPECT_EQ(input3_ptr[1], 24); EXPECT_EQ(input3_ptr[1], 24);
EXPECT_EQ(input3_ptr[2], 28); EXPECT_EQ(input3_ptr[2], 28);
...@@ -188,27 +338,81 @@ TEST(math_function, gemm_trans_cublas) { ...@@ -188,27 +338,81 @@ TEST(math_function, gemm_trans_cublas) {
EXPECT_EQ(input3_ptr[5], 73); EXPECT_EQ(input3_ptr[5], 73);
EXPECT_EQ(input3_ptr[6], 86); EXPECT_EQ(input3_ptr[6], 86);
EXPECT_EQ(input3_ptr[7], 99); EXPECT_EQ(input3_ptr[7], 99);
delete gpu_place; }
TEST(math_function, gemm_trans_cublas_fp16) {
using namespace paddle::framework;
using namespace paddle::platform;
Tensor input1;
Tensor input2;
Tensor input3;
Tensor input1_gpu;
Tensor input2_gpu;
Tensor input3_gpu;
CPUPlace cpu_place;
CUDAPlace gpu_place(0);
CUDADeviceContext context(gpu_place);
int m = 2;
int n = 3;
int k = 3;
float16* input1_ptr = input1.mutable_data<float16>({2, 3}, cpu_place);
fill_fp16_data(input1_ptr, input1.numel(), {0, 1, 2, 3, 4, 5});
float16* input2_ptr = input2.mutable_data<float16>({4, 3}, cpu_place);
fill_fp16_data(input2_ptr, input2.numel(),
{0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11});
float16* input3_ptr = input3.mutable_data<float16>({2, 4}, cpu_place);
fill_fp16_data(input3_ptr, input3.numel(), {0, 1, 2, 3, 4, 5, 6, 7});
TensorCopy(input1, gpu_place, context, &input1_gpu);
TensorCopy(input2, gpu_place, context, &input2_gpu);
TensorCopy(input3, gpu_place, context, &input3_gpu);
float16* a = input1_gpu.data<float16>();
float16* b = input2_gpu.data<float16>();
float16* c = input3_gpu.mutable_data<float16>(gpu_place);
paddle::operators::math::gemm<paddle::platform::CUDADeviceContext, float16>(
context, false, true, m, n, k, float16(1), a, 3, b + 3, 3, float16(1),
c + 1, 4);
TensorCopy(input3_gpu, cpu_place, context, &input3);
context.Wait();
EXPECT_EQ(static_cast<float>(input3_ptr[0]), 0);
EXPECT_EQ(static_cast<float>(input3_ptr[1]), 24);
EXPECT_EQ(static_cast<float>(input3_ptr[2]), 28);
EXPECT_EQ(static_cast<float>(input3_ptr[3]), 32);
EXPECT_EQ(static_cast<float>(input3_ptr[4]), 4);
EXPECT_EQ(static_cast<float>(input3_ptr[5]), 73);
EXPECT_EQ(static_cast<float>(input3_ptr[6]), 86);
EXPECT_EQ(static_cast<float>(input3_ptr[7]), 99);
} }
template <typename T> template <typename T>
void GemvTest(int m, int n, bool trans) { void GemvTest(int m, int n, bool trans) {
paddle::framework::Tensor mat_a; using namespace paddle::framework;
paddle::framework::Tensor vec_b; using namespace paddle::platform;
paddle::framework::Tensor vec_c;
auto* cpu_place = new paddle::platform::CPUPlace(); Tensor mat_a;
Tensor vec_b;
T* data_a = mat_a.mutable_data<T>({m, n}, *cpu_place); Tensor vec_c;
T* data_b = vec_b.mutable_data<T>({trans ? m : n}, *cpu_place);
T* data_c = vec_c.mutable_data<T>({trans ? n : m}, *cpu_place); CPUPlace cpu_place;
CUDAPlace gpu_place(0);
auto* gpu_place = new paddle::platform::CUDAPlace(0); CUDADeviceContext context(gpu_place);
paddle::framework::Tensor g_mat_a;
paddle::framework::Tensor g_vec_b; T* data_a = mat_a.mutable_data<T>({m, n}, cpu_place);
paddle::framework::Tensor g_vec_c; T* data_b = vec_b.mutable_data<T>({trans ? m : n}, cpu_place);
T* g_data_a = g_mat_a.mutable_data<T>(mat_a.dims(), *gpu_place); T* data_c = vec_c.mutable_data<T>({trans ? n : m}, cpu_place);
T* g_data_b = g_vec_b.mutable_data<T>(vec_b.dims(), *gpu_place);
T* g_data_c = g_vec_c.mutable_data<T>(vec_c.dims(), *gpu_place); Tensor g_mat_a;
Tensor g_vec_b;
Tensor g_vec_c;
T* g_data_a = g_mat_a.mutable_data<T>(mat_a.dims(), gpu_place);
T* g_data_b = g_vec_b.mutable_data<T>(vec_b.dims(), gpu_place);
T* g_data_c = g_vec_c.mutable_data<T>(vec_c.dims(), gpu_place);
for (int i = 0; i < mat_a.numel(); ++i) { for (int i = 0; i < mat_a.numel(); ++i) {
data_a[i] = static_cast<T>(i); data_a[i] = static_cast<T>(i);
...@@ -217,16 +421,14 @@ void GemvTest(int m, int n, bool trans) { ...@@ -217,16 +421,14 @@ void GemvTest(int m, int n, bool trans) {
data_b[i] = static_cast<T>(i); data_b[i] = static_cast<T>(i);
} }
paddle::platform::CUDADeviceContext context(*gpu_place); TensorCopy(mat_a, gpu_place, context, &g_mat_a);
paddle::framework::TensorCopy(mat_a, *gpu_place, context, &g_mat_a); TensorCopy(vec_b, gpu_place, context, &g_vec_b);
paddle::framework::TensorCopy(vec_b, *gpu_place, context, &g_vec_b);
paddle::operators::math::gemv<paddle::platform::CUDADeviceContext, T>( paddle::operators::math::gemv<CUDADeviceContext, T>(
context, trans, static_cast<int>(m), static_cast<int>(n), 1., g_data_a, context, trans, static_cast<int>(m), static_cast<int>(n), 1., g_data_a,
g_data_b, 0., g_data_c); g_data_b, 0., g_data_c);
paddle::framework::TensorCopy(g_vec_c, paddle::platform::CPUPlace(), context, TensorCopy(g_vec_c, cpu_place, context, &vec_c);
&vec_c);
if (!trans) { if (!trans) {
for (int i = 0; i < m; ++i) { for (int i = 0; i < m; ++i) {
......
...@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and ...@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/math/sequence2batch.h" #include "paddle/fluid/operators/math/sequence2batch.h"
#include "paddle/fluid/operators/math/math_function.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -60,15 +60,16 @@ class ReadOp : public framework::OperatorBase { ...@@ -60,15 +60,16 @@ class ReadOp : public framework::OperatorBase {
const platform::Place& dev_place) const override { const platform::Place& dev_place) const override {
framework::ReaderHolder* reader = framework::ReaderHolder* reader =
scope.FindVar(Input("Reader"))->GetMutable<framework::ReaderHolder>(); scope.FindVar(Input("Reader"))->GetMutable<framework::ReaderHolder>();
if (!reader->HasNext()) { std::vector<std::string> out_arg_names = Outputs("Out");
std::vector<framework::LoDTensor> ins;
reader->ReadNext(&ins);
if (ins.empty()) {
reader->ReInit(); reader->ReInit();
reader->ReadNext(&ins);
PADDLE_ENFORCE( PADDLE_ENFORCE(
reader->HasNext(), !ins.empty(),
"Reader can not read the next data even it has been re-initialized."); "Reader can not read the next data even it has been re-initialized.");
} }
std::vector<std::string> out_arg_names = Outputs("Out");
std::vector<framework::LoDTensor> ins;
reader->ReadNext(&ins);
PADDLE_ENFORCE_EQ(ins.size(), out_arg_names.size()); PADDLE_ENFORCE_EQ(ins.size(), out_arg_names.size());
for (size_t i = 0; i < ins.size(); ++i) { for (size_t i = 0; i < ins.size(); ++i) {
auto* out = auto* out =
......
...@@ -68,10 +68,10 @@ void BatchReader::ReadNext(std::vector<framework::LoDTensor>* out) { ...@@ -68,10 +68,10 @@ void BatchReader::ReadNext(std::vector<framework::LoDTensor>* out) {
buffer_.clear(); buffer_.clear();
buffer_.reserve(batch_size_); buffer_.reserve(batch_size_);
for (int i = 0; i < batch_size_; ++i) { for (int i = 0; i < batch_size_; ++i) {
if (reader_->HasNext()) { buffer_.push_back(std::vector<framework::LoDTensor>());
buffer_.push_back(std::vector<framework::LoDTensor>()); reader_->ReadNext(&buffer_.back());
reader_->ReadNext(&buffer_.back()); if (buffer_.back().empty()) {
} else { buffer_.pop_back();
break; break;
} }
} }
......
...@@ -50,8 +50,6 @@ class RandomDataGenerator : public framework::FileReader { ...@@ -50,8 +50,6 @@ class RandomDataGenerator : public framework::FileReader {
} }
} }
bool HasNext() const override { return true; }
void ReInit() override { return; } void ReInit() override { return; }
private: private:
......
...@@ -39,10 +39,10 @@ void ShuffleReader::ReadNext(std::vector<framework::LoDTensor>* out) { ...@@ -39,10 +39,10 @@ void ShuffleReader::ReadNext(std::vector<framework::LoDTensor>* out) {
buffer_.clear(); buffer_.clear();
buffer_.reserve(buffer_size_); buffer_.reserve(buffer_size_);
for (int i = 0; i < buffer_size_; ++i) { for (int i = 0; i < buffer_size_; ++i) {
if (reader_->HasNext()) { buffer_.push_back(std::vector<framework::LoDTensor>());
buffer_.push_back(std::vector<framework::LoDTensor>()); reader_->ReadNext(&buffer_.back());
reader_->ReadNext(&buffer_.back()); if (buffer_.back().empty()) {
} else { buffer_.pop_back();
break; break;
} }
} }
......
...@@ -84,6 +84,9 @@ class ReshapeOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -84,6 +84,9 @@ class ReshapeOpMaker : public framework::OpProtoAndCheckerMaker {
AddAttr<std::vector<int>>("shape", AddAttr<std::vector<int>>("shape",
"(vector<int>) " "(vector<int>) "
"Target shape of reshape operator."); "Target shape of reshape operator.");
AddAttr<bool>("inplace",
"Change the source tensor's shape without copy memory.")
.SetDefault(true);
AddComment(R"DOC( AddComment(R"DOC(
Reshape Operator. Reshape Operator.
......
...@@ -26,10 +26,16 @@ class ReshapeKernel : public framework::OpKernel<T> { ...@@ -26,10 +26,16 @@ class ReshapeKernel : public framework::OpKernel<T> {
void Compute(const framework::ExecutionContext& ctx) const { void Compute(const framework::ExecutionContext& ctx) const {
auto* out = ctx.Output<framework::Tensor>("Out"); auto* out = ctx.Output<framework::Tensor>("Out");
auto* in = ctx.Input<framework::Tensor>("X"); auto* in = ctx.Input<framework::Tensor>("X");
bool inplace = ctx.Attr<bool>("inplace");
auto out_dims = out->dims(); auto out_dims = out->dims();
out->mutable_data<T>(ctx.GetPlace()); if (!inplace) {
framework::TensorCopy(*in, ctx.GetPlace(), ctx.device_context(), out); out->mutable_data<T>(ctx.GetPlace());
out->Resize(out_dims); framework::TensorCopy(*in, ctx.GetPlace(), ctx.device_context(), out);
out->Resize(out_dims);
} else {
out->ShareDataWith(*in);
out->Resize(out_dims);
}
} }
}; };
...@@ -40,10 +46,16 @@ class ReshapeGradKernel : public framework::OpKernel<T> { ...@@ -40,10 +46,16 @@ class ReshapeGradKernel : public framework::OpKernel<T> {
auto* d_out = ctx.Input<framework::Tensor>(framework::GradVarName("Out")); auto* d_out = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
auto* d_x = ctx.Output<framework::Tensor>(framework::GradVarName("X")); auto* d_x = ctx.Output<framework::Tensor>(framework::GradVarName("X"));
d_x->mutable_data<T>(ctx.GetPlace()); d_x->mutable_data<T>(ctx.GetPlace());
bool inplace = ctx.Attr<bool>("inplace");
auto in_dims = d_x->dims(); auto in_dims = d_x->dims();
framework::TensorCopy(*d_out, ctx.GetPlace(), ctx.device_context(), d_x); if (!inplace) {
d_x->Resize(in_dims); framework::TensorCopy(*d_out, ctx.GetPlace(), ctx.device_context(), d_x);
d_x->Resize(in_dims);
} else {
d_x->ShareDataWith(*d_out);
d_x->Resize(in_dims);
}
} }
}; };
} // namespace operators } // namespace operators
......
...@@ -15,6 +15,8 @@ limitations under the License. */ ...@@ -15,6 +15,8 @@ limitations under the License. */
#pragma once #pragma once
#include <vector> #include <vector>
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/dynload/cudnn.h" #include "paddle/fluid/platform/dynload/cudnn.h"
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/macros.h" #include "paddle/fluid/platform/macros.h"
...@@ -282,5 +284,17 @@ class ScopedPoolingDescriptor { ...@@ -282,5 +284,17 @@ class ScopedPoolingDescriptor {
DISABLE_COPY_AND_ASSIGN(ScopedPoolingDescriptor); DISABLE_COPY_AND_ASSIGN(ScopedPoolingDescriptor);
}; };
inline bool CanCUDNNBeUsed(const framework::ExecutionContext& ctx) {
bool use_cudnn = ctx.Attr<bool>("use_cudnn");
use_cudnn &= paddle::platform::is_gpu_place(ctx.GetPlace());
#ifdef PADDLE_WITH_CUDA
if (use_cudnn) {
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
use_cudnn &= dev_ctx.cudnn_handle() != nullptr;
}
#endif
return use_cudnn;
}
} // namespace platform } // namespace platform
} // namespace paddle } // namespace paddle
...@@ -33,9 +33,15 @@ DeviceContextPool::DeviceContextPool( ...@@ -33,9 +33,15 @@ DeviceContextPool::DeviceContextPool(
PADDLE_ENFORCE_GT(places.size(), 0); PADDLE_ENFORCE_GT(places.size(), 0);
for (size_t i = 0; i < places.size(); i++) { for (size_t i = 0; i < places.size(); i++) {
if (platform::is_cpu_place(places[i])) { if (platform::is_cpu_place(places[i])) {
#ifdef PADDLE_WITH_MKLDNN
device_contexts_.emplace(places[i],
new platform::MKLDNNDeviceContext(
boost::get<platform::CPUPlace>(places[i])));
#else
device_contexts_.emplace(places[i], device_contexts_.emplace(places[i],
new platform::CPUDeviceContext( new platform::CPUDeviceContext(
boost::get<platform::CPUPlace>(places[i]))); boost::get<platform::CPUPlace>(places[i])));
#endif
} else if (platform::is_gpu_place(places[i])) { } else if (platform::is_gpu_place(places[i])) {
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
device_contexts_.emplace(places[i], device_contexts_.emplace(places[i],
...@@ -121,6 +127,8 @@ class EigenCudaStreamDevice : public Eigen::StreamInterface { ...@@ -121,6 +127,8 @@ class EigenCudaStreamDevice : public Eigen::StreamInterface {
CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : place_(place) { CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : place_(place) {
SetDeviceId(place_.device); SetDeviceId(place_.device);
multi_process = GetCUDAMultiProcessors(place_.device);
max_threads_per_mp = GetCUDAMaxThreadsPerMultiProcessor(place_.device);
PADDLE_ENFORCE(cudaStreamCreate(&stream_)); PADDLE_ENFORCE(cudaStreamCreate(&stream_));
eigen_stream_.reset(new EigenCudaStreamDevice()); eigen_stream_.reset(new EigenCudaStreamDevice());
eigen_stream_->Reinitialize(&stream_, place); eigen_stream_->Reinitialize(&stream_, place);
...@@ -154,6 +162,10 @@ void CUDADeviceContext::Wait() const { ...@@ -154,6 +162,10 @@ void CUDADeviceContext::Wait() const {
PADDLE_ENFORCE(cudaGetLastError()); PADDLE_ENFORCE(cudaGetLastError());
} }
int CUDADeviceContext::GetMaxPhysicalThreadCount() const {
return multi_process * max_threads_per_mp;
}
Eigen::GpuDevice* CUDADeviceContext::eigen_device() const { Eigen::GpuDevice* CUDADeviceContext::eigen_device() const {
return eigen_device_.get(); return eigen_device_.get();
} }
...@@ -170,64 +182,38 @@ cudaStream_t CUDADeviceContext::stream() const { return stream_; } ...@@ -170,64 +182,38 @@ cudaStream_t CUDADeviceContext::stream() const { return stream_; }
#ifdef PADDLE_WITH_MKLDNN #ifdef PADDLE_WITH_MKLDNN
MKLDNNDeviceContext::MKLDNNDeviceContext(CPUPlace place) MKLDNNDeviceContext::MKLDNNDeviceContext(CPUPlace place)
: CPUDeviceContext(place), ready_(false) { : CPUDeviceContext(place), engine_(mkldnn::engine::cpu, 0), p_blobs_() {
stream_.reset(new mkldnn::stream(mkldnn::stream::kind::eager)); p_blobs_.reset(new std::unordered_map<std::string, std::shared_ptr<void>>());
engine_.reset(new mkldnn::engine(mkldnn::engine::cpu, 0));
} }
template <typename T> void MKLDNNDeviceContext::SetBlob(const std::string& name,
void MKLDNNDeviceContext::AddElement(const std::string& op_key, std::shared_ptr<void> data) const {
const T& value) { std::unordered_map<std::string, std::shared_ptr<void>>* p;
if (GetElement<T>(op_key)) { p = p_blobs_.get();
return;
}
GetElementPool<T>().emplace(op_key, std::move(value));
}
template <typename T> auto it = p->find(name);
const T& MKLDNNDeviceContext::GetElement(const std::string& op_key) const {
auto it = GetElementPool<T>().find(op_key);
return it == GetElementPool<T>().end() ? nullptr : it->second;
}
template <> if (it == p->end()) {
const std::unordered_map<const std::string, const MKLDNNMemoryPtr, (*p)[name] = data; // create new blob
std::hash<std::string>>& } else {
MKLDNNDeviceContext::GetElementPool<MKLDNNMemoryPtr>() const { it->second = data; // set data to existing blob
return memory_pool_; }
}
template <> return;
const std::unordered_map<const std::string, const MKLDNNPrimitivePtr,
std::hash<std::string>>&
MKLDNNDeviceContext::GetElementPool<MKLDNNPrimitivePtr>() const {
return primitive_pool_;
} }
template <> std::shared_ptr<void> MKLDNNDeviceContext::GetBlob(
const std::unordered_map<const std::string, const MKLDNNPrimitiveDescPtr, const std::string& name) const {
std::hash<std::string>>& std::unordered_map<std::string, std::shared_ptr<void>>* p;
MKLDNNDeviceContext::GetElementPool<MKLDNNPrimitiveDescPtr>() const { p = p_blobs_.get();
return primitive_desc_pool_;
}
void MKLDNNDeviceContext::Execute(bool block) { auto it = p->find(name);
if (pipeline_.empty()) {
return;
}
ResetStream();
stream_->submit(pipeline_).wait(block);
ready_ = false;
pipeline_.clear();
}
void MKLDNNDeviceContext::ResetStream() { if (it != p->end()) {
if (ready_) { return it->second;
return;
} }
// TODO(TJ): change me when mkldnn have specific method to reset this state
stream_.reset(new mkldnn::stream(mkldnn::stream::kind::eager)); return nullptr;
ready_ = true;
} }
#endif #endif
......
...@@ -22,7 +22,7 @@ limitations under the License. */ ...@@ -22,7 +22,7 @@ limitations under the License. */
#endif #endif
#ifdef PADDLE_WITH_MKLDNN #ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h" #include <mkldnn.hpp>
#endif #endif
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
...@@ -79,6 +79,9 @@ class CUDADeviceContext : public DeviceContext { ...@@ -79,6 +79,9 @@ class CUDADeviceContext : public DeviceContext {
/*! \brief Return place in the device context. */ /*! \brief Return place in the device context. */
Place GetPlace() const override; Place GetPlace() const override;
/*! \brief Return the max physical thread count in the device context */
int GetMaxPhysicalThreadCount() const;
/*! \brief Return eigen device in the device context. */ /*! \brief Return eigen device in the device context. */
Eigen::GpuDevice* eigen_device() const; Eigen::GpuDevice* eigen_device() const;
...@@ -100,6 +103,9 @@ class CUDADeviceContext : public DeviceContext { ...@@ -100,6 +103,9 @@ class CUDADeviceContext : public DeviceContext {
cudaStream_t stream_; cudaStream_t stream_;
cudnnHandle_t cudnn_handle_; cudnnHandle_t cudnn_handle_;
cublasHandle_t cublas_handle_; cublasHandle_t cublas_handle_;
int multi_process;
int max_threads_per_mp;
}; };
template <> template <>
...@@ -114,46 +120,19 @@ class MKLDNNDeviceContext : public CPUDeviceContext { ...@@ -114,46 +120,19 @@ class MKLDNNDeviceContext : public CPUDeviceContext {
public: public:
explicit MKLDNNDeviceContext(CPUPlace place); explicit MKLDNNDeviceContext(CPUPlace place);
/* \brief Add new element: memory, primitive or primitive desc */
template <typename T>
void AddElement(const std::string& op_key, const T& value);
/* \brief Get existed element: memory, primitive or primitive desc */
template <typename T>
const T& GetElement(const std::string& op_key) const;
/* \brief Get element pool: memory, primitive or primitive desc pool */
template <typename T>
const std::unordered_map<const std::string, const T, std::hash<std::string>>&
GetElementPool() const;
/* \brief Get the active engine */ /* \brief Get the active engine */
const MKLDNNEngine& engine() const { return *engine_; } const mkldnn::engine& GetEngine() const { return engine_; }
/* \brief Submit primitive to pipeline */
void Submit(const MKLDNNPrimitivePtr& p) { pipeline_.push_back(*p); }
/*! \brief Execute all submitted primitives in pipeline */ // Set data to blob (i.e. name/data pair). Create blob if not existing
void Execute(bool block = true); void SetBlob(const std::string& name, std::shared_ptr<void> data) const;
protected: // Find a saved blob. Return nullptr if not found
/*! \brief Reset the stream to prepare next exectue */ std::shared_ptr<void> GetBlob(const std::string& name) const;
void ResetStream();
private: private:
std::unordered_map<const std::string, const MKLDNNMemoryPtr, mkldnn::engine engine_;
std::hash<std::string>> std::shared_ptr<std::unordered_map<std::string, std::shared_ptr<void>>>
memory_pool_; p_blobs_;
std::unordered_map<const std::string, const MKLDNNPrimitivePtr,
std::hash<std::string>>
primitive_pool_;
std::unordered_map<const std::string, const MKLDNNPrimitiveDescPtr,
std::hash<std::string>>
primitive_desc_pool_;
std::vector<MKLDNNPrimitive> pipeline_;
MKLDNNStreamPtr stream_;
MKLDNNEnginePtr engine_;
bool ready_;
}; };
#endif #endif
......
...@@ -199,20 +199,29 @@ class DeviceTracerImpl : public DeviceTracer { ...@@ -199,20 +199,29 @@ class DeviceTracerImpl : public DeviceTracer {
return; return;
} }
std::lock_guard<std::mutex> l(trace_mu_); std::lock_guard<std::mutex> l(trace_mu_);
cpu_records_.push_back( cpu_records_.push_back(CPURecord{anno, start_ns, end_ns, 0});
CPURecord{anno, start_ns, end_ns,
std::hash<std::thread::id>{}(std::this_thread::get_id())});
} }
void AddMemRecords(const std::string &name, uint64_t start_ns, void AddMemRecords(const std::string &name, uint64_t start_ns,
uint64_t end_ns, uint32_t device_id, uint32_t stream_id, uint64_t end_ns, uint32_t device_id, uint32_t stream_id,
uint32_t correlation_id, uint64_t bytes) { uint32_t correlation_id, uint64_t bytes) {
// 0 means timestamp information could not be collected for the kernel.
if (start_ns == 0 || end_ns == 0) {
VLOG(3) << name << " cannot be traced";
return;
}
std::lock_guard<std::mutex> l(trace_mu_);
mem_records_.push_back(MemRecord{name, start_ns, end_ns, device_id, mem_records_.push_back(MemRecord{name, start_ns, end_ns, device_id,
stream_id, correlation_id, bytes}); stream_id, correlation_id, bytes});
} }
void AddKernelRecords(uint64_t start, uint64_t end, uint32_t device_id, void AddKernelRecords(uint64_t start, uint64_t end, uint32_t device_id,
uint32_t stream_id, uint32_t correlation_id) { uint32_t stream_id, uint32_t correlation_id) {
// 0 means timestamp information could not be collected for the kernel.
if (start == 0 || end == 0) {
VLOG(3) << correlation_id << " cannot be traced";
return;
}
std::lock_guard<std::mutex> l(trace_mu_); std::lock_guard<std::mutex> l(trace_mu_);
kernel_records_.push_back( kernel_records_.push_back(
KernelRecord{start, end, device_id, stream_id, correlation_id}); KernelRecord{start, end, device_id, stream_id, correlation_id});
...@@ -285,10 +294,10 @@ class DeviceTracerImpl : public DeviceTracer { ...@@ -285,10 +294,10 @@ class DeviceTracerImpl : public DeviceTracer {
event->set_device_id(r.device_id); event->set_device_id(r.device_id);
event->mutable_memcopy()->set_bytes(r.bytes); event->mutable_memcopy()->set_bytes(r.bytes);
} }
std::string profile_str;
google::protobuf::TextFormat::PrintToString(profile_pb, &profile_str);
std::ofstream profile_f; std::ofstream profile_f;
profile_f.open(profile_path, std::ios::out | std::ios::trunc); profile_f.open(profile_path, std::ios::out | std::ios::trunc);
std::string profile_str;
profile_pb.SerializeToString(&profile_str);
profile_f << profile_str; profile_f << profile_str;
profile_f.close(); profile_f.close();
return profile_pb; return profile_pb;
......
...@@ -68,6 +68,8 @@ extern void *cublas_dso_handle; ...@@ -68,6 +68,8 @@ extern void *cublas_dso_handle;
__macro(cublasDgemv_v2); \ __macro(cublasDgemv_v2); \
__macro(cublasSgemm_v2); \ __macro(cublasSgemm_v2); \
__macro(cublasDgemm_v2); \ __macro(cublasDgemm_v2); \
__macro(cublasHgemm); \
__macro(cublasSgemmEx); \
__macro(cublasSgeam_v2); \ __macro(cublasSgeam_v2); \
__macro(cublasDgeam_v2); \ __macro(cublasDgeam_v2); \
__macro(cublasCreate_v2); \ __macro(cublasCreate_v2); \
...@@ -83,6 +85,7 @@ extern void *cublas_dso_handle; ...@@ -83,6 +85,7 @@ extern void *cublas_dso_handle;
__macro(cublasDgemmStridedBatched); \ __macro(cublasDgemmStridedBatched); \
__macro(cublasCgemmStridedBatched); \ __macro(cublasCgemmStridedBatched); \
__macro(cublasZgemmStridedBatched); \ __macro(cublasZgemmStridedBatched); \
__macro(cublasHgemmStridedBatched); \
__macro(cublasSgetrfBatched); \ __macro(cublasSgetrfBatched); \
__macro(cublasSgetriBatched); \ __macro(cublasSgetriBatched); \
__macro(cublasDgetrfBatched); \ __macro(cublasDgetrfBatched); \
......
...@@ -33,6 +33,26 @@ int GetCUDADeviceCount() { ...@@ -33,6 +33,26 @@ int GetCUDADeviceCount() {
return count; return count;
} }
int GetCUDAMultiProcessors(int id) {
PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count");
int count;
PADDLE_ENFORCE(
cudaDeviceGetAttribute(&count, cudaDevAttrMultiProcessorCount, id),
"cudaDeviceGetAttribute failed in "
"paddle::platform::GetCUDAMultiProcessors");
return count;
}
int GetCUDAMaxThreadsPerMultiProcessor(int id) {
PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count");
int count;
PADDLE_ENFORCE(cudaDeviceGetAttribute(
&count, cudaDevAttrMaxThreadsPerMultiProcessor, id),
"cudaDeviceGetAttribute failed in "
"paddle::platform::GetCUDAMaxThreadsPerMultiProcessor");
return count;
}
int GetCurrentDeviceId() { int GetCurrentDeviceId() {
int device_id; int device_id;
PADDLE_ENFORCE( PADDLE_ENFORCE(
......
...@@ -30,6 +30,12 @@ const std::string kEnvFractionGpuMemoryToUse = ...@@ -30,6 +30,12 @@ const std::string kEnvFractionGpuMemoryToUse =
//! Get the total number of GPU devices in system. //! Get the total number of GPU devices in system.
int GetCUDADeviceCount(); int GetCUDADeviceCount();
//! Get the MultiProcessors of the ith GPU.
int GetCUDAMultiProcessors(int i);
//! Get the MaxThreads of each MultiProcessor of the ith GPU.
int GetCUDAMaxThreadsPerMultiProcessor(int i);
//! Get the current GPU device id in system. //! Get the current GPU device id in system.
int GetCurrentDeviceId(); int GetCurrentDeviceId();
......
...@@ -16,12 +16,15 @@ limitations under the License. */ ...@@ -16,12 +16,15 @@ limitations under the License. */
#include <mkldnn.hpp> #include <mkldnn.hpp>
#include "paddle/fluid/framework/operator.h"
namespace paddle { namespace paddle {
namespace platform { namespace platform {
using MKLDNNStream = mkldnn::stream; using MKLDNNStream = mkldnn::stream;
using MKLDNNEngine = mkldnn::engine; using MKLDNNEngine = mkldnn::engine;
using MKLDNNMemory = mkldnn::memory; using MKLDNNMemory = mkldnn::memory;
using MKLDNNMemoryDescriptor = mkldnn::memory::desc;
using MKLDNNPrimitive = mkldnn::primitive; using MKLDNNPrimitive = mkldnn::primitive;
using MKLDNNPrimitiveDesc = mkldnn::handle<mkldnn_primitive_desc_t>; using MKLDNNPrimitiveDesc = mkldnn::handle<mkldnn_primitive_desc_t>;
...@@ -31,5 +34,17 @@ typedef std::unique_ptr<MKLDNNMemory> MKLDNNMemoryPtr; ...@@ -31,5 +34,17 @@ typedef std::unique_ptr<MKLDNNMemory> MKLDNNMemoryPtr;
typedef std::unique_ptr<MKLDNNPrimitive> MKLDNNPrimitivePtr; typedef std::unique_ptr<MKLDNNPrimitive> MKLDNNPrimitivePtr;
typedef std::unique_ptr<MKLDNNPrimitiveDesc> MKLDNNPrimitiveDescPtr; typedef std::unique_ptr<MKLDNNPrimitiveDesc> MKLDNNPrimitiveDescPtr;
inline mkldnn::memory::desc MKLDNNMemDesc(const std::vector<int>& dims,
mkldnn::memory::data_type data_type,
mkldnn::memory::format format) {
mkldnn::memory::dims tz = dims;
return mkldnn::memory::desc({tz}, data_type, format);
}
inline bool CanMKLDNNBeUsed(const framework::ExecutionContext& ctx) {
bool use_mkldnn = ctx.Attr<bool>("use_mkldnn");
return use_mkldnn && platform::is_cpu_place(ctx.GetPlace());
}
} // namespace platform } // namespace platform
} // namespace paddle } // namespace paddle
...@@ -178,7 +178,7 @@ void EnableProfiler(ProfilerState state) { ...@@ -178,7 +178,7 @@ void EnableProfiler(ProfilerState state) {
} }
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
if (g_state == ProfilerState::kCUDA) { if (g_state == ProfilerState::kCUDA) {
// Generate some dummy evenets first to reduce the startup overhead. // Generate some dummy events first to reduce the startup overhead.
for (int i = 0; i < 5; i++) { for (int i = 0; i < 5; i++) {
ForEachDevice([](int d) { ForEachDevice([](int d) {
DeviceContext* dev_ctx = new CUDADeviceContext(CUDAPlace(d)); DeviceContext* dev_ctx = new CUDADeviceContext(CUDAPlace(d));
......
...@@ -15,7 +15,7 @@ limitations under the License. */ ...@@ -15,7 +15,7 @@ limitations under the License. */
syntax = "proto2"; syntax = "proto2";
package paddle.platform.proto; package paddle.platform.proto;
message MemCopy { optional uint64 bytes = 3; } message MemCopy { optional uint64 bytes = 1; }
message Event { message Event {
optional string name = 1; optional string name = 1;
......
...@@ -75,6 +75,7 @@ TEST(RecordEvent, RecordEvent) { ...@@ -75,6 +75,7 @@ TEST(RecordEvent, RecordEvent) {
* ... * ...
* PopEvent(evt_name, dev_ctx); * PopEvent(evt_name, dev_ctx);
*/ */
LOG(INFO) << "Usage 1: PushEvent & PopEvent";
for (int loop = 0; loop < 3; ++loop) { for (int loop = 0; loop < 3; ++loop) {
for (int i = 1; i < 5; ++i) { for (int i = 1; i < 5; ++i) {
std::string name = "op_" + std::to_string(i); std::string name = "op_" + std::to_string(i);
...@@ -93,6 +94,7 @@ TEST(RecordEvent, RecordEvent) { ...@@ -93,6 +94,7 @@ TEST(RecordEvent, RecordEvent) {
* ... * ...
* } * }
*/ */
LOG(INFO) << "Usage 2: RecordEvent";
for (int i = 1; i < 5; ++i) { for (int i = 1; i < 5; ++i) {
std::string name = "evs_op_" + std::to_string(i); std::string name = "evs_op_" + std::to_string(i);
RecordEvent record_event(name, dev_ctx); RecordEvent record_event(name, dev_ctx);
...@@ -100,6 +102,34 @@ TEST(RecordEvent, RecordEvent) { ...@@ -100,6 +102,34 @@ TEST(RecordEvent, RecordEvent) {
while (counter != i * 1000) counter++; while (counter != i * 1000) counter++;
} }
/* Usage 3
* {
* RecordEvent record_event(name1, dev_ctx);
* ...
* code to be analyzed
* ...
* {
* RecordEvent nested_record_event(name2, dev_ctx);
* ...
* code to be analyzed
* ...
* }
* }
*/
LOG(INFO) << "Usage 3: nested RecordEvent";
for (int i = 1; i < 5; ++i) {
std::string name = "ano_evs_op_" + std::to_string(i);
RecordEvent record_event(name, dev_ctx);
int counter = 1;
while (counter != i * 100) counter++;
{
std::string nested_name = "nested_ano_evs_op_" + std::to_string(i);
RecordEvent nested_record_event(nested_name, dev_ctx);
int nested_counter = 1;
while (nested_counter != i * 100) nested_counter++;
}
}
// Bad Usage: // Bad Usage:
PushEvent("event_without_pop", dev_ctx); PushEvent("event_without_pop", dev_ctx);
PopEvent("event_without_push", dev_ctx); PopEvent("event_without_push", dev_ctx);
......
...@@ -213,7 +213,7 @@ function gen_fluid_inference_lib() { ...@@ -213,7 +213,7 @@ function gen_fluid_inference_lib() {
if [ ${WITH_C_API:-OFF} == "OFF" ] ; then if [ ${WITH_C_API:-OFF} == "OFF" ] ; then
cat <<EOF cat <<EOF
======================================== ========================================
Building fluid inference library ... Deploying fluid inference library ...
======================================== ========================================
EOF EOF
make inference_lib_dist make inference_lib_dist
......
...@@ -1111,6 +1111,7 @@ def conv2d(input, ...@@ -1111,6 +1111,7 @@ def conv2d(input,
param_attr=None, param_attr=None,
bias_attr=None, bias_attr=None,
use_cudnn=True, use_cudnn=True,
use_mkldnn=False,
act=None): act=None):
""" """
**Convlution2D Layer** **Convlution2D Layer**
...@@ -1252,7 +1253,8 @@ def conv2d(input, ...@@ -1252,7 +1253,8 @@ def conv2d(input,
'strides': stride, 'strides': stride,
'paddings': padding, 'paddings': padding,
'groups': groups, 'groups': groups,
'use_cudnn': use_cudnn 'use_cudnn': use_cudnn,
'use_mkldnn': use_mkldnn
}) })
pre_act = helper.append_bias_op(pre_bias, dim_start=1, dim_end=2) pre_act = helper.append_bias_op(pre_bias, dim_start=1, dim_end=2)
......
...@@ -31,6 +31,8 @@ dtype_to_size = { ...@@ -31,6 +31,8 @@ dtype_to_size = {
sub_block_ops = ["while", "while_grad", "parallel_do", "parallel_do_grad"] sub_block_ops = ["while", "while_grad", "parallel_do", "parallel_do_grad"]
PRINT_LOG = False
class ControlFlowGraph(object): class ControlFlowGraph(object):
def __init__(self, Program, ops, forward_num, skip_opt): def __init__(self, Program, ops, forward_num, skip_opt):
...@@ -171,12 +173,14 @@ class ControlFlowGraph(object): ...@@ -171,12 +173,14 @@ class ControlFlowGraph(object):
# TODO(qijun): actually, we should compare dtype_to_size[x_dtype] # TODO(qijun): actually, we should compare dtype_to_size[x_dtype]
# and dtype_to_size[cache_dtype] # and dtype_to_size[cache_dtype]
if x_dtype == cache_dtype: if x_dtype == cache_dtype:
print(("Hit Cache !!!! cache pool index " if PRINT_LOG:
"is %d, var name is %s, " print(
"cached var name is %s, " ("Hit Cache !!!! cache pool index "
"var shape is %s ") % "is %d, var name is %s, "
(index, x, cache_var, "cached var name is %s, "
str(cache_shape))) "var shape is %s ") %
(index, x, cache_var,
str(cache_shape)))
self.pool.pop(index) self.pool.pop(index)
if x == cache_var: if x == cache_var:
break break
...@@ -277,7 +281,9 @@ def _get_cfgs(input_program): ...@@ -277,7 +281,9 @@ def _get_cfgs(input_program):
return cfgs return cfgs
def memory_optimize(input_program): def memory_optimize(input_program, print_log=False):
global PRINT_LOG
PRINT_LOG = print_log
cfgs = _get_cfgs(input_program) cfgs = _get_cfgs(input_program)
for cfg in cfgs: for cfg in cfgs:
cfg.memory_optimize() cfg.memory_optimize()
...@@ -29,14 +29,16 @@ def simple_img_conv_pool(input, ...@@ -29,14 +29,16 @@ def simple_img_conv_pool(input,
act, act,
param_attr=None, param_attr=None,
pool_type='max', pool_type='max',
use_cudnn=True): use_cudnn=True,
use_mkldnn=False):
conv_out = layers.conv2d( conv_out = layers.conv2d(
input=input, input=input,
num_filters=num_filters, num_filters=num_filters,
filter_size=filter_size, filter_size=filter_size,
param_attr=param_attr, param_attr=param_attr,
act=act, act=act,
use_cudnn=use_cudnn) use_cudnn=use_cudnn,
use_mkldnn=use_mkldnn)
pool_out = layers.pool2d( pool_out = layers.pool2d(
input=conv_out, input=conv_out,
...@@ -58,7 +60,8 @@ def img_conv_group(input, ...@@ -58,7 +60,8 @@ def img_conv_group(input,
conv_batchnorm_drop_rate=0.0, conv_batchnorm_drop_rate=0.0,
pool_stride=1, pool_stride=1,
pool_type=None, pool_type=None,
use_cudnn=True): use_cudnn=True,
use_mkldnn=False):
""" """
Image Convolution Group, Used for vgg net. Image Convolution Group, Used for vgg net.
""" """
...@@ -90,7 +93,8 @@ def img_conv_group(input, ...@@ -90,7 +93,8 @@ def img_conv_group(input,
padding=conv_padding[i], padding=conv_padding[i],
param_attr=param_attr[i], param_attr=param_attr[i],
act=local_conv_act, act=local_conv_act,
use_cudnn=use_cudnn) use_cudnn=use_cudnn,
use_mkldnn=use_mkldnn)
if conv_with_batchnorm[i]: if conv_with_batchnorm[i]:
tmp = layers.batch_norm(input=tmp, act=conv_act) tmp = layers.batch_norm(input=tmp, act=conv_act)
......
...@@ -49,7 +49,7 @@ avg_cost = fluid.layers.mean(x=cost) ...@@ -49,7 +49,7 @@ avg_cost = fluid.layers.mean(x=cost)
sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.01) sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.01)
sgd_optimizer.minimize(avg_cost) sgd_optimizer.minimize(avg_cost)
fluid.memory_optimize(fluid.default_main_program()) fluid.memory_optimize(fluid.default_main_program(), print_log=True)
BATCH_SIZE = 200 BATCH_SIZE = 200
......
此差异已折叠。
此差异已折叠。
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册