提交 a8cdd97e 编写于 作者: C caoying03

Merge branch 'develop' into enhance_reshape

......@@ -56,7 +56,7 @@ script:
export DEPLOY_DOCS_SH=https://raw.githubusercontent.com/PaddlePaddle/PaddlePaddle.org/master/scripts/deploy/deploy_docs.sh
export DOCS_DIR=`pwd`
cd ..
curl $DEPLOY_DOCS_SH | bash -s $CONTENT_DEC_PASSWD $TRAVIS_BRANCH $DOCS_DIR $DOCS_DIR/build/doc/v2
curl $DEPLOY_DOCS_SH | bash -s $CONTENT_DEC_PASSWD $TRAVIS_BRANCH $DOCS_DIR $DOCS_DIR/build/doc/
notifications:
email:
on_success: change
......
......@@ -144,6 +144,8 @@ include(external/eigen) # download eigen3
include(external/pybind11) # download pybind11
include(external/cares)
include(external/grpc)
include(external/snappy) # download snappy
include(external/snappystream)
include(cudnn) # set cudnn libraries, must before configure
include(cupti)
......
......@@ -138,13 +138,14 @@ def main():
avg_cost = fluid.layers.mean(x=cost)
# Evaluator
accuracy = fluid.evaluator.Accuracy(input=predict, label=label)
batch_size = fluid.layers.create_tensor(dtype='int64')
batch_acc = fluid.layers.accuracy(
input=predict, label=label, total=batch_size)
# inference program
inference_program = fluid.default_main_program().clone()
with fluid.program_guard(inference_program):
test_target = accuracy.metrics + accuracy.states
inference_program = fluid.io.get_inference_program(test_target)
inference_program = fluid.io.get_inference_program(batch_acc)
# Optimization
optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate)
......@@ -157,27 +158,30 @@ def main():
# test
def test(exe):
accuracy.reset(exe)
test_pass_acc = fluid.average.WeightedAverage()
for batch_id, data in enumerate(test_reader()):
img_data = np.array(map(lambda x: x[0].reshape(data_shape),
data)).astype("float32")
y_data = np.array(map(lambda x: x[1], data)).astype("int64")
y_data = y_data.reshape([-1, 1])
exe.run(inference_program,
outs = exe.run(inference_program,
feed={"pixel": img_data,
"label": y_data})
"label": y_data},
fetch_list=[batch_acc, batch_size])
test_pass_acc.add(value=np.array(outs[0]), weight=np.array(outs[1]))
return accuracy.eval(exe)
return test_pass_acc.eval()
def train_loop(exe, trainer_prog):
iters = 0
ts = time.time()
train_pass_acc = fluid.average.WeightedAverage()
for pass_id in range(args.num_passes):
# train
start_time = time.time()
num_samples = 0
accuracy.reset(exe)
train_pass_acc.reset()
with profiler.profiler("CPU", 'total') as prof:
for batch_id, data in enumerate(train_reader()):
ts = time.time()
......@@ -187,13 +191,14 @@ def main():
y_data = np.array(map(lambda x: x[1], data)).astype("int64")
y_data = y_data.reshape([-1, 1])
loss, acc = exe.run(
loss, acc, b_size = exe.run(
trainer_prog,
feed={"pixel": img_data,
"label": y_data},
fetch_list=[avg_cost] + accuracy.metrics)
fetch_list=[avg_cost, batch_acc, batch_size])
iters += 1
num_samples += len(data)
train_pass_acc.add(value=acc, weight=b_size)
print(
"Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, Speed = %.2f img/s"
% (pass_id, iters, loss, acc,
......@@ -201,7 +206,7 @@ def main():
) # The accuracy is the accumulation of batches, but not the current batch.
pass_elapsed = time.time() - start_time
pass_train_acc = accuracy.eval(exe)
pass_train_acc = train_pass_acc.eval()
pass_test_acc = test(exe)
print(
"Pass = %d, Training performance = %f imgs/s, Train accuracy = %f, Test accuracy = %f\n"
......
......@@ -78,6 +78,7 @@ IF(NOT ${CBLAS_FOUND})
BUILD_IN_SOURCE 1
BUILD_COMMAND ${CMAKE_MAKE_PROGRAM} ${COMMON_ARGS} ${OPTIONAL_ARGS}
INSTALL_COMMAND ${CMAKE_MAKE_PROGRAM} install NO_SHARED=1 NO_LAPACK=1 PREFIX=<INSTALL_DIR>
&& rm -r ${CBLAS_INSTALL_DIR}/lib/cmake ${CBLAS_INSTALL_DIR}/lib/pkgconfig
UPDATE_COMMAND ""
CONFIGURE_COMMAND ""
)
......@@ -100,11 +101,6 @@ IF(NOT ${CBLAS_FOUND})
\"${CBLAS_INSTALL_DIR}/lib -> ${CMAKE_INSTALL_PREFIX}/${TMP_INSTALL_DIR}\"
)"
)
INSTALL(CODE "execute_process(
COMMAND rm -r ${CMAKE_INSTALL_PREFIX}/${TMP_INSTALL_DIR}/cmake
${CMAKE_INSTALL_PREFIX}/${TMP_INSTALL_DIR}/pkgconfig
)"
)
ENDIF()
ENDIF(NOT ${CBLAS_FOUND})
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
IF(MOBILE_INFERENCE)
return()
ENDIF()
include (ExternalProject)
# NOTE: snappy is needed when linking with recordio
SET(SNAPPY_SOURCES_DIR ${THIRD_PARTY_PATH}/snappy)
SET(SNAPPY_INSTALL_DIR ${THIRD_PARTY_PATH}/install/snappy)
SET(SNAPPY_INCLUDE_DIR "${SNAPPY_INSTALL_DIR}/include/" CACHE PATH "snappy include directory." FORCE)
ExternalProject_Add(
extern_snappy
GIT_REPOSITORY "https://github.com/google/snappy"
GIT_TAG "1.1.7"
PREFIX ${SNAPPY_SOURCES_DIR}
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
-DCMAKE_INSTALL_PREFIX=${SNAPPY_INSTALL_DIR}
-DCMAKE_INSTALL_LIBDIR=${SNAPPY_INSTALL_DIR}/lib
-DCMAKE_POSITION_INDEPENDENT_CODE=ON
-DBUILD_TESTING=OFF
-DSNAPPY_BUILD_TESTS:BOOL=OFF
-DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
${EXTERNAL_OPTIONAL_ARGS}
CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${SNAPPY_INSTALL_DIR}
-DCMAKE_INSTALL_LIBDIR:PATH=${SNAPPY_INSTALL_DIR}/lib
-DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON
-DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
BUILD_COMMAND make -j8
INSTALL_COMMAND make install
)
add_library(snappy STATIC IMPORTED GLOBAL)
set_property(TARGET snappy PROPERTY IMPORTED_LOCATION
"${SNAPPY_INSTALL_DIR}/lib/libsnappy.a")
include_directories(${SNAPPY_INCLUDE_DIR})
add_dependencies(snappy extern_snappy)
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
IF(MOBILE_INFERENCE)
return()
ENDIF()
include (ExternalProject)
# NOTE: snappy is needed when linking with recordio
SET(SNAPPYSTREAM_SOURCES_DIR ${THIRD_PARTY_PATH}/snappy_stream)
SET(SNAPPYSTREAM_INSTALL_DIR ${THIRD_PARTY_PATH}/install/snappy_stream)
SET(SNAPPYSTREAM_INCLUDE_DIR "${SNAPPYSTREAM_INSTALL_DIR}/include/" CACHE PATH "snappy stream include directory." FORCE)
ExternalProject_Add(
extern_snappystream
GIT_REPOSITORY "https://github.com/hoxnox/snappystream.git"
GIT_TAG "0.2.8"
PREFIX ${SNAPPYSTREAM_SOURCES_DIR}
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
-DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
-DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS}
-DCMAKE_C_FLAGS=${CMAKE_C_FLAGS}
-DCMAKE_INSTALL_PREFIX=${SNAPPY_INSTALL_DIR}
-DCMAKE_INSTALL_LIBDIR=${SNAPPY_INSTALL_DIR}/lib
-DCMAKE_POSITION_INDEPENDENT_CODE=ON
-DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
-DSNAPPY_ROOT=${SNAPPY_INSTALL_DIR}
${EXTERNAL_OPTIONAL_ARGS}
CMAKE_CACHE_ARGS
-DCMAKE_INSTALL_PREFIX:PATH=${SNAPPYSTREAM_INSTALL_DIR}
-DCMAKE_INSTALL_LIBDIR:PATH=${SNAPPYSTREAM_INSTALL_DIR}/lib
-DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}
BUILD_COMMAND make -j8
INSTALL_COMMAND make install
DEPENDS snappy
)
add_library(snappystream STATIC IMPORTED GLOBAL)
set_property(TARGET snappystream PROPERTY IMPORTED_LOCATION
"${SNAPPYSTREAM_INSTALL_DIR}/lib/libsnappystream.a")
include_directories(${SNAPPYSTREAM_INCLUDE_DIR})
add_dependencies(snappystream extern_snappystream)
......@@ -186,7 +186,9 @@ function(cc_library TARGET_NAME)
add_library(${TARGET_NAME} SHARED ${cc_library_SRCS})
else()
add_library(${TARGET_NAME} STATIC ${cc_library_SRCS})
find_fluid_modules(${TARGET_NAME})
endif()
if(cc_library_DEPS)
# Don't need link libwarpctc.so
if("${cc_library_DEPS};" MATCHES "warpctc;")
......@@ -264,6 +266,7 @@ function(nv_library TARGET_NAME)
cuda_add_library(${TARGET_NAME} SHARED ${nv_library_SRCS})
else()
cuda_add_library(${TARGET_NAME} STATIC ${nv_library_SRCS})
find_fluid_modules(${TARGET_NAME})
endif()
if (nv_library_DEPS)
add_dependencies(${TARGET_NAME} ${nv_library_DEPS})
......
set_property(GLOBAL PROPERTY FLUID_MODULES "")
# find all fluid modules is used for paddle fluid static library
function(find_fluid_modules TARGET_NAME)
get_filename_component(__target_path ${TARGET_NAME} ABSOLUTE)
string(FIND "${__target_path}" "fluid" pos)
if(pos GREATER 1)
get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES)
set(fluid_modules ${fluid_modules} ${TARGET_NAME})
set_property(GLOBAL PROPERTY FLUID_MODULES "${fluid_modules}")
endif()
endfunction(find_fluid_modules)
# make package for paddle fluid shared and static library
function(copy TARGET)
set(options "")
set(oneValueArgs "")
set(multiValueArgs SRCS DSTS DEPS)
cmake_parse_arguments(copy_lib "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
set(inference_lib_dist_dep ${TARGET} ${inference_lib_dist_dep} PARENT_SCOPE)
list(LENGTH copy_lib_SRCS copy_lib_SRCS_len)
list(LENGTH copy_lib_DSTS copy_lib_DSTS_len)
......@@ -42,13 +55,21 @@ copy(glog_lib
DSTS ${dst_dir} ${dst_dir}/lib
)
IF(NOT PROTOBUF_FOUND)
if(NOT PROTOBUF_FOUND)
set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/protobuf")
copy(protobuf_lib
SRCS ${PROTOBUF_INCLUDE_DIR} ${PROTOBUF_LITE_LIBRARY}
SRCS ${PROTOBUF_INCLUDE_DIR} ${PROTOBUF_LIBRARY}
DSTS ${dst_dir} ${dst_dir}/lib
)
ENDIF(NOT PROTOBUF_FOUND)
endif()
if(NOT CBLAS_FOUND)
set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/openblas")
copy(openblas_lib
SRCS ${CBLAS_INSTALL_DIR}/lib ${CBLAS_INSTALL_DIR}/include
DSTS ${dst_dir} ${dst_dir}
)
endif()
# paddle fluid module
set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid")
......@@ -66,8 +87,8 @@ copy(memory_lib
)
set(module "inference")
copy(inference_lib DEPENDS paddle_fluid_shared
SRCS ${src_dir}/${module}/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_fluid.so
copy(inference_lib DEPS paddle_fluid_shared paddle_fluid
SRCS ${src_dir}/${module}/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_fluid.*
DSTS ${dst_dir}/${module} ${dst_dir}/${module}
)
......@@ -83,6 +104,4 @@ copy(string_lib
DSTS ${dst_dir}/${module} ${dst_dir}/${module}/tinyformat
)
add_custom_target(inference_lib_dist DEPENDS
inference_lib framework_lib memory_lib platform_lib string_lib
gflags_lib glog_lib protobuf_lib eigen3_lib)
add_custom_target(inference_lib_dist DEPENDS ${inference_lib_dist_dep})
......@@ -20,9 +20,8 @@ class ReaderBase {
PADDLE_ENFORCE(!shapes_.empty());
}
// Read the next batch of data. (A 'batch' can be only one instance)
// If the next batch doesn't exist, the '*out' will be an empty std::vector.
virtual void ReadNext(std::vector<LoDTensor>* out) = 0;
// Show whether the next bacth exists.
virtual bool HasNext() const = 0;
// Reinitialize the reader and read the file from the begin.
virtual void ReInit() = 0;
......
......@@ -107,7 +107,7 @@ void Compute(const framework::ExecutionContext& context) const override {
### paddle::framework::Tensor到EigenTensor的转换
如上一小节所示,在具体的计算中,我们需要先把输入Tensor和输出Tensor转换为Eigen支持的格式。我们在[eigen.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/eigen.h)中提供了一些全局函数用来实现paddle::framework::Tensor到EigenTensor/EigenMatrix/EigenVector/EigenScalar的转换。
如上一小节所示,在具体的计算中,我们需要先把输入Tensor和输出Tensor转换为Eigen支持的格式。我们在[eigen.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/eigen.h)中提供了一些全局函数用来实现paddle::framework::Tensor到EigenTensor/EigenMatrix/EigenVector/EigenScalar的转换。
以EigenTensor为例,做一个介绍
......@@ -125,7 +125,7 @@ From是EigenTensor模板提供的一个接口,可以实现从paddle::framework
在Eigen中,不同rank的Tensor是不同类型,Vector是rank为1的Tensor。需要额外注意的是,EigenVector<T>::From方法是把paddle中的一维Tensor转为Eigen的一维Tensor,在这里用EigenVector来表示;而EigenVector<T>::Flatten方法是把paddle中的一个Tensor进行reshape操作,压扁成为Eigen的一维Tensor,类型仍然为EigenVector。
更多的转换方法请参考eigen_test.cc中的[单元测试](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/eigen_test.cc)
更多的转换方法请参考eigen_test.cc中的[单元测试](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/eigen_test.cc)
......
......@@ -107,7 +107,7 @@ void Compute(const framework::ExecutionContext& context) const override {
### paddle::framework::Tensor到EigenTensor的转换
As shown above, in actual computation, we need to transform the input and output `Tensor`s into formats Eigen supports. We show some functions in [eigen.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/eigen.h) to implement the transformation from `paddle::framework::Tensor`to `EigenTensor/EigenMatrix/EigenVector/EigenScalar`.
As shown above, in actual computation, we need to transform the input and output `Tensor`s into formats Eigen supports. We show some functions in [eigen.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/eigen.h) to implement the transformation from `paddle::framework::Tensor`to `EigenTensor/EigenMatrix/EigenVector/EigenScalar`.
Using EigenTensor as an example:
......@@ -125,7 +125,7 @@ EigenTensor<float, 3>::Type et = EigenTensor<float, 3>::From(t);
In Eigen, tensors with different ranks are different types, with `Vector` bring a rank-1 instance. Note that `EigenVector<T>::From` uses a transformation from an 1-dimensional Paddle tensor to a 1-dimensional Eigen tensor while `EigenVector<T>::Flatten` reshapes a paddle tensor and flattens it into a 1-dimensional Eigen tensor. Both resulting tensors are still typed EigenVector.
For more transformations, see the [unit tests](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/eigen_test.cc) in the `eigen_test.cc` file.
For more transformations, see the [unit tests](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/eigen_test.cc) in the `eigen_test.cc` file.
......
## how to use timeline tool to do profile
1. Add `with profiler.profiler(...)` to the main training loop. After run, the code will generate a profile record file `/tmp/profile`. **Warning**: Please do not run too many batches when use profiler to record timeline information, for the profile record will grow with the batch number.
```python
with profiler.profiler('All', 'total', '/tmp/profile') as prof:
for pass_id in range(pass_num):
for batch_id, data in enumerate(train_reader()):
exe.run(fluid.default_main_program(),
feed=feeder.feed(data),
fetch_list=[],
use_program_cache=True)
...
```
1. Run `python paddle/tools/timeline.py` to process `/tmp/profile`, it will generate another
file `/tmp/timeline` by default. You can change the path by cmd parameter, please take a look at
[timeline.py](https://github.com/PaddlePaddle/Paddle/blob/develop/tools/timeline.py) for details.
1. Open chrome and visit <chrome://tracing/>, use `load` button to load the generated `timeline` file.
![chrome tracing](./tracing.jpeg)
1. The resulting timeline should be like:
![chrome timeline](./timeline.jpeg)
......@@ -2,17 +2,17 @@
Examples: https://github.com/PaddlePaddle/Paddle/tree/develop/python/paddle/fluid/tests/book
Core: https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/framework
Core: https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/framework
Operator: https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators
Operator: https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/operators
Memory: https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/memory
Memory: https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/memory
Platform: https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/platform
Platform: https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/platform
# Compile Time
The following **defines** the NN. The definition goes into this [protocol buffer](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto).
The following **defines** the NN. The definition goes into this [protocol buffer](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/framework.proto).
```python
x = fluid.layers.data(name='x', shape=[13], dtype='float32')
......@@ -29,10 +29,10 @@ sgd_optimizer.minimize(avg_cost)
- Variables: `x`, `y`, `y_predict`, `cost` and `avg_cost`. [Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/framework.py#)
- Layers: `fluid.layers.data`, `fluid.layers.fc` and `fluid.layers.mean` are layers. [Python](https://github.com/PaddlePaddle/Paddle/tree/develop/python/paddle/fluid/layers)
- Every Layer has one or more operators and variables/parameters
- All the operators are defined at [`paddle/operators/`](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators). Other worth-looking files:
- Base class: [`paddle/framework/operator.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/operator.h)
- Operator Registration: [`paddle/framework/op_registry.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/op_registry.h)
- Operator Lookup: [`paddle/framework/op_info.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/op_info.h)
- All the operators are defined at [`paddle/fluid/operators/`](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/operators). Other worth-looking files:
- Base class: [`paddle/fluid/framework/operator.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/operator.h)
- Operator Registration: [`paddle/fluid/framework/op_registry.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/op_registry.h)
- Operator Lookup: [`paddle/fluid/framework/op_info.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/op_info.h)
- Optimizer: `fluid.optimizer.SGD`. It does the following
- Add backward operators. [[Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/backward.py)]
- Add optimizer operators. [[Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/optimizer.py)]
......@@ -55,13 +55,13 @@ exe.run(fluid.default_main_program(),
fetch_list=[avg_cost])
```
- Place: `place`. one of CPU, GPU or FPGA. [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/place.h)
- The device handle are at [paddle/platform/device_context.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/device_context.h)
- Executor: `fluid.Executor(place)`. [[Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/executor.py), [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/executor.cc)]
- Place: `place`. one of CPU, GPU or FPGA. [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/platform/place.h)
- The device handle are at [paddle/fluid/platform/device_context.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/platform/device_context.h)
- Executor: `fluid.Executor(place)`. [[Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/executor.py), [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/executor.cc)]
- Feeds the data: `feed=feeder.feed(data)`
- Evaluates all the operators
- Fetches the result: `fetch_list=[avg_cost]`
- Other worth looking files:
- Scope: [paddle/framework/scope.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/scope.h). Where all the variables live
- Variable: [paddle/framework/variable.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/variable.h). Where all the data (most likely tensors) live
- Tensor: [paddle/framework/tensor.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/tensor.h). Where we allocate memory through [`paddle/memory/`](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/memory)
- Scope: [paddle/fluid/framework/scope.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/scope.h). Where all the variables live
- Variable: [paddle/fluid/framework/variable.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/variable.h). Where all the data (most likely tensors) live
- Tensor: [paddle/fluid/framework/tensor.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/tensor.h). Where we allocate memory through [`paddle/fluid/memory/`](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/memory)
......@@ -34,15 +34,15 @@ PaddlePaddle可以使用常用的Python包管理工具
:align: center
.. csv-table:: 各个版本最新的whl包
:header: "版本说明", "cp27-cp27mu", "cp27-cp27m", "C-API"
:widths: 1, 3, 3, 3
"cpu_avx_mkl", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddle.tgz>`_"
"cpu_avx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "暂无"
"cpu_noavx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "暂无"
"cuda7.5_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
"cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
"cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
:header: "版本说明", "cp27-cp27mu", "cp27-cp27m"
:widths: 1, 3, 3
"cpu_avx_mkl", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_"
"cpu_avx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_"
"cpu_noavx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_"
"cuda7.5_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_"
"cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_"
"cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_"
.. _pip_dependency:
......
......@@ -37,15 +37,15 @@ If the links below shows up the login form, just click "Log in as guest" to star
:align: center
.. csv-table:: whl package of each version
:header: "version", "cp27-cp27mu", "cp27-cp27m", "C-API"
:widths: 1, 3, 3, 3
"cpu_avx_mkl", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddle.tgz>`_"
"cpu_avx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "Not Available"
"cpu_noavx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "Not Available"
"cuda7.5_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
"cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
"cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_", "`paddle.tgz <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddle.tgz>`_"
:header: "version", "cp27-cp27mu", "cp27-cp27m"
:widths: 1, 3, 3
"cpu_avx_mkl", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_"
"cpu_avx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_"
"cpu_noavx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl>`_"
"cuda7.5_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_"
"cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_"
"cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl>`_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl <https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl>`_"
.. _pip_dependency:
......
新手入门
============
如果需要快速了解PaddlePaddle的使用,可以参考以下指南。
.. toctree::
:maxdepth: 1
quickstart_cn.rst
在使用PaddlePaddle构建应用时,需要了解一些基本概念。
这里以一个线性回归为例子,详细介绍了PaddlePaddle的使用流程,包括数据格式,模型配置与训练等。
.. toctree::
:maxdepth: 1
concepts/use_concepts_cn.rst
## 安装与编译C-API预测库
### 概述
使用 C-API 进行预测依赖于将 PaddlePaddle 核心代码编译成链接库,只需在编译时需配制下面这些编译选项:
必须配置选项:
- `WITH_C_API`,必须配置为`ON`
推荐配置选项:
- `WITH_PYTHON`,推荐配置为`OFF`
- `WITH_SWIG_PY`,推荐配置为`OFF`
- `WITH_GOLANG`,推荐设置为`OFF`
可选配置选项:
- `WITH_GPU`,可配置为`ON/OFF`
- `WITH_MKL`,可配置为`ON/OFF`
对推荐配置中的选项建议按照设置,以避免链接不必要的库。其它可选编译选项按需进行设定。
## 安装、编译与链接C-API预测库
### 直接下载安装
从CI系统中下载最新的C-API开发包进行安装,用户可以从下面的表格中找到需要的版本:
<table>
<thead>
<tr>
<th>版本说明</th>
<th>C-API</th>
</tr>
</thead>
<tbody>
<tr>
<td>cpu_avx_mkl</td>
<td><a href="https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuAvxCp27cp27mu/.lastSuccessful/paddle.tgz" rel="nofollow">paddle.tgz</a></td>
</tr>
<tr>
<td>cpu_avx_openblas</td>
<td>暂无</td>
</tr>
<tr>
<td>cpu_noavx_openblas</td>
<td><a href="https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_CpuNoavxOpenblas/.lastSuccessful/paddle.tgz" rel="nofollow">paddle.tgz</a></td>
</tr>
<tr>
<td>cuda7.5_cudnn5_avx_mkl</td>
<td><a href="https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda75cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz" rel="nofollow">paddle.tgz</a></td>
</tr>
<tr>
<td>cuda8.0_cudnn5_avx_mkl</td>
<td><a href="https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda80cudnn5cp27cp27mu/.lastSuccessful/paddle.tgz" rel="nofollow">paddle.tgz</a></td>
</tr>
<tr>
<td>cuda8.0_cudnn7_avx_mkl</td>
<td><a href="https://guest:@paddleci.ngrok.io/repository/download/Manylinux1_Cuda8cudnn7cp27cp27mu/.lastSuccessful/paddle.tgz" rel="nofollow">paddle.tgz</a></td>
</tr></tbody></table>
### 从源码编译
用户也可以从 PaddlePaddle 核心代码编译C-API链接库,只需在编译时配制下面这些编译选项:
<table>
<thead>
<tr>
<th>选项</th>
<th></th>
</tr>
</thead>
<tbody>
<tr>
<td>WITH_C_API</td>
<td>ON</td>
</tr>
<tr>
<td>WITH_PYTHON</td>
<td>OFF(推荐)</td>
</tr>
<tr>
<td>WITH_SWIG_PY</td>
<td>OFF(推荐)</td>
</tr>
<tr>
<td>WITH_GOLANG</td>
<td>OFF(推荐)</td>
</tr>
<tr>
<td>WITH_GPU</td>
<td>ON/OFF</td>
</tr>
<tr>
<td>WITH_MKL</td>
<td>ON/OFF</td>
</tr></tbody></table>
建议按照推荐值设置,以避免链接不必要的库。其它可选编译选项按需进行设定。
下面的代码片段从github拉取最新代码,配制编译选项(需要将PADDLE_ROOT替换为PaddlePaddle预测库的安装路径):
......@@ -100,22 +158,18 @@ cmake -DCMAKE_INSTALL_PREFIX=$PADDLE_ROOT \
目前提供三种链接方式:
1. 链接`libpaddle_capi_shared.so` 动态库
- 使用 PaddlePaddle C-API 开发预测程序链接`libpaddle_capi_shared.so`时,需注意:
1. 链接`libpaddle_capi_shared.so` 动态库(这种方式最为简便,链接相对容易,**在无特殊需求情况下,推荐使用此方式**),需注意:
1. 如果编译时指定编译CPU版本,且使用`OpenBLAS`数学库,在使用C-API开发预测程序时,只需要链接`libpaddle_capi_shared.so`这一个库。
1. 如果是用编译时指定CPU版本,且使用`MKL`数学库,由于`MKL`库有自己独立的动态库文件,在使用PaddlePaddle C-API开发预测程序时,需要自己链接MKL链接库。
1. 如果编译时指定编译GPU版本,CUDA相关库会在预测程序运行时动态装载,需要将CUDA相关的库设置到`LD_LIBRARY_PATH`环境变量中。
- 这种方式最为简便,链接相对容易,**在无特殊需求情况下,推荐使用此方式**
2. 链接静态库 `libpaddle_capi_whole.a`
- 使用PaddlePaddle C-API 开发预测程序链接`libpaddle_capi_whole.a`时,需注意:
2. 链接静态库 `libpaddle_capi_whole.a`,需注意:
1. 需要指定`-Wl,--whole-archive`链接选项。
1. 需要显式地链接 `gflags``glog``libz``protobuf` 等第三方库,可在`PADDLE_ROOT/third_party`下找到。
1. 如果在编译 C-API 时使用OpenBLAS数学库,需要显示地链接`libopenblas.a`
1. 如果在编译 C-API 是使用MKL数学库,需要显示地链接MKL的动态库。
3. 链接静态库 `libpaddle_capi_layers.a``libpaddle_capi_engine.a`
- 使用PaddlePaddle C-API 开发预测程序链接`libpaddle_capi_whole.a`时,需注意:
3. 链接静态库 `libpaddle_capi_layers.a``libpaddle_capi_engine.a`,需注意:
1. 这种链接方式主要用于移动端预测。
1. 为了减少生成链接库的大小把`libpaddle_capi_whole.a`拆成以上两个静态链接库。
1. 需指定`-Wl,--whole-archive -lpaddle_capi_layers` 和 `-Wl,--no-whole-archive -lpaddle_capi_engine` 进行链接。
......
进阶使用
========
PaddlePaddle支持用户灵活地设置各种命令行参数,以实现对模型训练或预测流程的控制。使用方式请参考:
.. toctree::
:maxdepth: 1
cmd_parameter/index_cn.rst
PaddlePaddle支持在fabric集群、MPI集群、kubernetes集群上分布式训练任务,具体环境配置和使用说明请参考:
.. toctree::
:maxdepth: 1
cluster/index_cn.rst
PaddlePaddle提供了用于预测的C-API,关于C-API的使用,我们提供了如下指南:
.. toctree::
:maxdepth: 1
capi/index_cn.rst
PaddlePaddle支持多种灵活和高效的循环神经网络,具体配置使用方式请参考:
.. toctree::
:maxdepth: 1
rnn/index_cn.rst
关于如何使用内置的定时工具、nvprof 或 nvvp 来运行性能分析和调优,请参考:
.. toctree::
:maxdepth: 1
optimization/gpu_profiling_cn.rst
......@@ -22,7 +22,7 @@
pooling
========
pooling 的使用示例如下,详细见 :ref:`api_v2.layer_pooling` 配置API
pooling 的使用示例如下。
.. code-block:: bash
......@@ -47,7 +47,7 @@ pooling 的使用示例如下,详细见 :ref:`api_v2.layer_pooling` 配置API
last_seq 和 first_seq
=====================
last_seq 的使用示例如下( :ref:`api_v2.layer_first_seq` 类似),详细见 :ref:`api_v2.layer_last_seq` 配置API
last_seq 的使用示例如下(first_seq 类似)
.. code-block:: bash
......@@ -68,7 +68,7 @@ last_seq 的使用示例如下( :ref:`api_v2.layer_first_seq` 类似),详
expand
======
expand 的使用示例如下,详细见 :ref:`api_v2.layer_expand` 配置API
expand 的使用示例如下。
.. code-block:: bash
......
......@@ -4,7 +4,7 @@
单双层RNN API对比介绍
#####################
本文以PaddlePaddle的双层RNN单元测试为示例,用多对效果完全相同的、分别使用单双层RNN作为网络配置的模型,来讲解如何使用双层RNN。本文中所有的例子,都只是介绍双层RNN的API接口,并不是使用双层RNN解决实际的问题。如果想要了解双层RNN在具体问题中的使用,请参考\ :ref:`algo_hrnn_demo`\ 。本文中示例所使用的单元测试文件是\ `test_RecurrentGradientMachine.cpp <https://github.com/reyoung/Paddle/blob/develop/paddle/gserver/tests/test_RecurrentGradientMachine.cpp>`_\ 。
本文以PaddlePaddle的双层RNN单元测试为示例,用多对效果完全相同的、分别使用单双层RNN作为网络配置的模型,来讲解如何使用双层RNN。本文中所有的例子,都只是介绍双层RNN的API接口,并不是使用双层RNN解决实际的问题。如果想要了解双层RNN在具体问题中的使用,请参考\ :ref:`algo_hrnn_demo`\ 。本文中示例所使用的单元测试文件是\ `test_RecurrentGradientMachine.cpp <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/gserver/tests/test_RecurrentGradientMachine.cpp>`_\ 。
示例1:双层RNN,子序列间无Memory
================================
......@@ -166,11 +166,6 @@
在上面代码中,单层和双层序列的使用和示例2中的示例类似,区别是同时处理了两个输入。而对于双层序列,两个输入的子序列长度也并不相同。但是,我们使用了\ :code:`targetInlink`\ 参数设置了外层\ :code:`recurrent_group`\ 的输出格式。所以外层输出的序列形状,和\ :code:`emb2`\ 的序列形状一致。
示例4:beam_search的生成
========================
TBD
词汇表
======
......
RNN模型
===========
循环神经网络(RNN)是对序列数据建模的重要工具。PaddlePaddle提供了灵活的接口以支持复杂循环神经网络的构建。
这里将分为以下四个部分详细介绍如何使用PaddlePaddle搭建循环神经网络。
第一部分由浅入深的展示了使用PaddlePaddle搭建循环神经网络的全貌:首先以简单的循环神经网络(vanilla RNN)为例,
说明如何封装配置循环神经网络组件;然后更进一步的通过序列到序列(sequence to sequence)模型,逐步讲解如何构建完整而复杂的循环神经网络模型。
.. toctree::
:maxdepth: 1
rnn_config_cn.rst
Recurrent Group是PaddlePaddle中实现复杂循环神经网络的关键,第二部分阐述了PaddlePaddle中Recurrent Group的相关概念和原理,
对Recurrent Group接口进行了详细说明。另外,对双层RNN(对应的输入为双层序列)及Recurrent Group在其中的使用进行了介绍。
.. toctree::
:maxdepth: 1
recurrent_group_cn.md
第三部分对双层序列进行了解释说明,列出了PaddlePaddle中支持双层序列作为输入的Layer,并对其使用进行了逐一介绍。
.. toctree::
:maxdepth: 1
hierarchical_layer_cn.rst
第四部分以PaddlePaddle的双层RNN单元测试中的网络配置为示例,辅以效果相同的单层RNN网络配置作为对比,讲解了多种情况下双层RNN的使用。
.. toctree::
:maxdepth: 1
hrnn_rnn_api_compare_cn.rst
......@@ -5,3 +5,4 @@ add_subdirectory(operators)
add_subdirectory(pybind)
add_subdirectory(inference)
add_subdirectory(string)
add_subdirectory(recordio)
......@@ -5,14 +5,14 @@ cc_library(ddim SRCS ddim.cc DEPS eigen3 boost)
cc_test(ddim_test SRCS ddim_test.cc DEPS ddim)
nv_test(dim_test SRCS dim_test.cu DEPS ddim)
if (WITH_GPU)
if(WITH_GPU)
nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS ddim place paddle_memory device_context framework_proto)
else()
cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS ddim place paddle_memory device_context framework_proto)
endif ()
endif()
cc_test(tensor_test SRCS tensor_test.cc DEPS tensor)
if (WITH_GPU)
if(WITH_GPU)
nv_test(tensor_util_test SRCS tensor_util_test.cc tensor_util_test.cu DEPS tensor)
else()
cc_test(tensor_util_test SRCS tensor_util_test.cc DEPS tensor)
......@@ -39,8 +39,13 @@ cc_library(data_device_transform SRCS data_device_transform.cc DEPS tensor)
nv_test(data_device_transform_test SRCS data_device_transform_test.cu
DEPS operator op_registry init math_function)
cc_library(data_type_transform SRCS data_type_transform.cc DEPS tensor)
cc_test(data_type_transform_test SRCS data_type_transform_test.cc DEPS data_type_transform)
if(WITH_GPU)
nv_library(data_type_transform SRCS data_type_transform.cu DEPS tensor)
nv_test(data_type_transform_test SRCS data_type_transform_test.cc data_type_transform_test.cu DEPS data_type_transform)
else()
cc_library(data_type_transform SRCS data_type_transform.cc DEPS tensor)
cc_test(data_type_transform_test SRCS data_type_transform_test.cc DEPS data_type_transform)
endif()
cc_library(data_layout_transform SRCS data_layout_transform.cc DEPS tensor math_function)
cc_test(data_layout_transform_test SRCS data_layout_transform_test.cc DEPS data_layout_transform)
......
......@@ -28,24 +28,19 @@ class Channel {
virtual bool Send(T*) = 0;
virtual bool Receive(T*) = 0;
virtual size_t Cap() = 0;
virtual void Lock() = 0;
virtual void Unlock() = 0;
virtual void Close() = 0;
virtual ~Channel() {}
};
// Forward declaration of channel implementations.
namespace details {
template <typename T>
class Buffered;
template <typename T>
class UnBuffered;
} // namespace details
class ChannelImpl;
template <typename T>
Channel<T>* MakeChannel(size_t buffer_size) {
if (buffer_size > 0) {
return new details::Buffered<T>(buffer_size);
}
return new details::UnBuffered<T>();
return new ChannelImpl<T>(buffer_size);
}
template <typename T>
......@@ -89,6 +84,19 @@ class ChannelHolder {
if (IsInitialized()) holder_->Close();
}
size_t Cap() {
if (IsInitialized()) return holder_->Cap();
return -1;
}
void Lock() {
if (IsInitialized()) holder_->Lock();
}
void Unlock() {
if (IsInitialized()) holder_->Unlock();
}
inline bool IsInitialized() const { return holder_ != nullptr; }
inline const std::type_index Type() {
......@@ -106,6 +114,9 @@ class ChannelHolder {
virtual const std::type_index Type() const = 0;
virtual void* Ptr() const = 0;
virtual void Close() = 0;
virtual void Lock() = 0;
virtual void Unlock() = 0;
virtual size_t Cap() = 0;
};
template <typename T>
......@@ -115,11 +126,28 @@ class ChannelHolder {
}
virtual const std::type_index Type() const { return type_; }
virtual void* Ptr() const { return static_cast<void*>(channel_.get()); }
virtual void Close() {
if (channel_) channel_->Close();
}
virtual size_t Cap() {
if (channel_)
return channel_->Cap();
else
return -1;
}
virtual void Lock() {
if (channel_) channel_->Lock();
}
virtual void Unlock() {
if (channel_) channel_->Unlock();
}
std::unique_ptr<Channel<T>> channel_;
const std::type_index type_;
};
......@@ -131,5 +159,4 @@ class ChannelHolder {
} // namespace framework
} // namespace paddle
#include "paddle/fluid/framework/details/buffered_channel.h"
#include "paddle/fluid/framework/details/unbuffered_channel.h"
#include "paddle/fluid/framework/channel_impl.h"
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <stddef.h> // for size_t
#include <atomic>
#include <condition_variable>
#include <deque>
#include "paddle/fluid/framework/channel.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace framework {
template <typename T>
class ChannelImpl : public paddle::framework::Channel<T> {
friend Channel<T> *paddle::framework::MakeChannel<T>(size_t);
friend void paddle::framework::CloseChannel<T>(Channel<T> *);
public:
virtual bool Send(T *);
virtual bool Receive(T *);
virtual size_t Cap() { return cap_; }
virtual void Lock();
virtual void Unlock();
virtual void Close();
ChannelImpl(size_t);
virtual ~ChannelImpl();
private:
struct QueueMessage {
T *data;
std::condition_variable_any cond;
bool chan_closed = false;
bool completed = false;
QueueMessage(T *item) : data(item) {}
void Wait(std::unique_lock<std::recursive_mutex> &lock) {
cond.wait(lock, [this]() { return completed; });
}
void Notify() {
completed = true;
cond.notify_all();
}
};
bool send_return(bool value) {
send_ctr--;
destructor_cond_.notify_all();
return value;
}
bool recv_return(bool value) {
recv_ctr--;
destructor_cond_.notify_all();
return value;
}
size_t cap_;
std::recursive_mutex mu_;
bool closed_;
std::deque<T> buf_;
std::deque<std::shared_ptr<QueueMessage>> recvq;
std::deque<std::shared_ptr<QueueMessage>> sendq;
std::atomic<unsigned> send_ctr{0};
std::atomic<unsigned> recv_ctr{0};
std::condition_variable_any destructor_cond_;
};
template <typename T>
ChannelImpl<T>::ChannelImpl(size_t capacity)
: cap_(capacity), closed_(false), send_ctr(0), recv_ctr(0) {
PADDLE_ENFORCE_GE(capacity, 0);
}
template <typename T>
bool ChannelImpl<T>::Send(T *item) {
send_ctr++;
std::unique_lock<std::recursive_mutex> lock{mu_};
// If channel is closed, do nothing
if (closed_) {
lock.unlock();
// TODO(abhinavarora) Should panic on closed channel
return send_return(false);
}
// If there is a receiver, directly pass the value we want
// to send to the receiver, bypassing the channel buffer if any
if (!recvq.empty()) {
std::shared_ptr<QueueMessage> m = recvq.front();
recvq.pop_front();
// Do the data transfer
*(m->data) = std::move(*item);
// Wake up the blocked process and unlock
m->Notify();
lock.unlock();
return send_return(true);
}
// Unbuffered channel will always bypass this
// If buffered channel has space in buffer,
// write the element to the buffer.
if (buf_.size() < cap_) {
// Copy to buffer
buf_.push_back(std::move(*item));
// Release lock and return true
lock.unlock();
return send_return(true);
}
// Block on channel, because some receiver will complete
// the operation for us
auto m = std::make_shared<QueueMessage>(item);
sendq.push_back(m);
m->Wait(lock);
// TODO(abhinavarora) Should panic on closed channel
return send_return(!m->chan_closed);
}
template <typename T>
bool ChannelImpl<T>::Receive(T *item) {
recv_ctr++;
std::unique_lock<std::recursive_mutex> lock{mu_};
// If channel is closed and buffer is empty or
// channel is unbuffered
if (closed_ && buf_.empty()) {
lock.unlock();
return recv_return(false);
}
// If there is a sender, directly receive the value we want
// from the sender, bypassing the channel buffer if any
if (!sendq.empty()) {
std::shared_ptr<QueueMessage> m = sendq.front();
sendq.pop_front();
// Do the data transfer
*item = std::move(*(m->data));
// Wake up the blocked process and unlock
m->Notify();
lock.unlock();
return recv_return(true);
}
// If this is a buffered channel and there are items in buffer
if (buf_.size() > 0) {
// Directly read from buffer
*item = std::move(buf_.front());
buf_.pop_front();
// Release lock and return true
lock.unlock();
return recv_return(true);
}
// No sender available, block on this channel
// Some receiver will complete the option for us
auto m = std::make_shared<QueueMessage>(item);
recvq.push_back(m);
m->Wait(lock);
return recv_return(!m->chan_closed);
}
template <typename T>
void ChannelImpl<T>::Lock() {
mu_.lock();
}
template <typename T>
void ChannelImpl<T>::Unlock() {
mu_.unlock();
}
template <typename T>
void ChannelImpl<T>::Close() {
std::unique_lock<std::recursive_mutex> lock{mu_};
if (closed_) {
// TODO(abhinavarora): closing an already closed channel should panic
lock.unlock();
return;
}
closed_ = true;
// Empty the readers
while (!recvq.empty()) {
std::shared_ptr<QueueMessage> m = recvq.front();
recvq.pop_front();
m->chan_closed = true;
m->Notify();
}
// Empty the senders
while (!sendq.empty()) {
std::shared_ptr<QueueMessage> m = sendq.front();
sendq.pop_front();
m->chan_closed = true;
m->Notify();
}
}
template <typename T>
ChannelImpl<T>::~ChannelImpl() {
Close();
// The destructor must wait for all readers and writers to complete their task
// The channel has been closed, so we will not accept new readers and writers
std::unique_lock<std::recursive_mutex> lock{mu_};
destructor_cond_.wait(lock,
[this]() { return send_ctr == 0 && recv_ctr == 0; });
}
} // namespace framework
} // namespace paddle
......@@ -23,8 +23,19 @@ using paddle::framework::Channel;
using paddle::framework::ChannelHolder;
using paddle::framework::MakeChannel;
using paddle::framework::CloseChannel;
using paddle::framework::details::Buffered;
using paddle::framework::details::UnBuffered;
TEST(Channel, ChannelCapacityTest) {
const size_t buffer_size = 10;
auto ch = MakeChannel<size_t>(buffer_size);
EXPECT_EQ(ch->Cap(), buffer_size);
CloseChannel(ch);
delete ch;
ch = MakeChannel<size_t>(0);
EXPECT_EQ(ch->Cap(), 0U);
CloseChannel(ch);
delete ch;
}
void RecevingOrderEqualToSendingOrder(Channel<int> *ch) {
unsigned sum_send = 0;
......@@ -35,38 +46,17 @@ void RecevingOrderEqualToSendingOrder(Channel<int> *ch) {
}
});
for (int i = 0; i < 5; i++) {
int recv;
int recv = 999;
EXPECT_EQ(ch->Receive(&recv), true);
EXPECT_EQ(recv, i);
}
std::this_thread::sleep_for(std::chrono::milliseconds(200));
CloseChannel(ch);
t.join();
EXPECT_EQ(sum_send, 10U);
delete ch;
}
TEST(Channel, MakeAndClose) {
using paddle::framework::details::Buffered;
using paddle::framework::details::UnBuffered;
{
// MakeChannel should return a buffered channel is buffer_size > 0.
auto ch = MakeChannel<int>(10);
EXPECT_NE(dynamic_cast<Buffered<int> *>(ch), nullptr);
EXPECT_EQ(dynamic_cast<UnBuffered<int> *>(ch), nullptr);
CloseChannel(ch);
delete ch;
}
{
// MakeChannel should return an un-buffered channel is buffer_size = 0.
auto ch = MakeChannel<int>(0);
EXPECT_EQ(dynamic_cast<Buffered<int> *>(ch), nullptr);
EXPECT_NE(dynamic_cast<UnBuffered<int> *>(ch), nullptr);
CloseChannel(ch);
delete ch;
}
}
TEST(Channel, SufficientBufferSizeDoesntBlock) {
const size_t buffer_size = 10;
auto ch = MakeChannel<size_t>(buffer_size);
......@@ -166,7 +156,6 @@ TEST(Channel, ReceiveFromBufferedChannelReturnResidualValuesTest) {
TEST(Channel, ConcurrentSendNonConcurrentReceiveWithSufficientBufferSize) {
const size_t buffer_size = 10;
auto ch = MakeChannel<size_t>(buffer_size);
size_t sum = 0;
std::thread t([&]() {
// Try to write more than buffer size.
for (size_t i = 0; i < 2 * buffer_size; ++i) {
......@@ -174,12 +163,9 @@ TEST(Channel, ConcurrentSendNonConcurrentReceiveWithSufficientBufferSize) {
EXPECT_EQ(ch->Send(&i), true); // should block after 10 iterations
else
EXPECT_EQ(ch->Send(&i), false);
sum += i;
}
});
std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait 0.1 sec
EXPECT_EQ(sum, 45U);
std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait 0.2 sec
CloseChannel(ch);
t.join();
delete ch;
......@@ -211,7 +197,7 @@ void ChannelCloseUnblocksReceiversTest(Channel<int> *ch) {
},
&thread_ended[i]);
}
std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait 0.1 sec
std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait 0.2 sec
// Verify that all the threads are blocked
for (size_t i = 0; i < num_threads; i++) {
......@@ -222,7 +208,7 @@ void ChannelCloseUnblocksReceiversTest(Channel<int> *ch) {
// This should unblock all receivers
CloseChannel(ch);
std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait 0.1 sec
std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait 0.2 sec
// Verify that all threads got unblocked
for (size_t i = 0; i < num_threads; i++) {
......@@ -232,10 +218,7 @@ void ChannelCloseUnblocksReceiversTest(Channel<int> *ch) {
for (size_t i = 0; i < num_threads; i++) t[i].join();
}
void ChannelCloseUnblocksSendersTest(Channel<int> *ch) {
using paddle::framework::details::Buffered;
using paddle::framework::details::UnBuffered;
void ChannelCloseUnblocksSendersTest(Channel<int> *ch, bool isBuffered) {
size_t num_threads = 5;
std::thread t[num_threads];
bool thread_ended[num_threads];
......@@ -253,9 +236,9 @@ void ChannelCloseUnblocksSendersTest(Channel<int> *ch) {
},
&thread_ended[i], &send_success[i]);
}
std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait
std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait
if (dynamic_cast<Buffered<int> *>(ch)) {
if (isBuffered) {
// If ch is Buffered, atleast 4 threads must be blocked.
int ct = 0;
for (size_t i = 0; i < num_threads; i++) {
......@@ -272,14 +255,14 @@ void ChannelCloseUnblocksSendersTest(Channel<int> *ch) {
// This should unblock all senders
CloseChannel(ch);
std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait
std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait
// Verify that all threads got unblocked
for (size_t i = 0; i < num_threads; i++) {
EXPECT_EQ(thread_ended[i], true);
}
if (dynamic_cast<Buffered<int> *>(ch)) {
if (isBuffered) {
// Verify that only 1 send was successful
int ct = 0;
for (size_t i = 0; i < num_threads; i++) {
......@@ -304,7 +287,7 @@ TEST(Channel, BufferedChannelCloseUnblocksReceiversTest) {
// any senders waiting for channel to have write space
TEST(Channel, BufferedChannelCloseUnblocksSendersTest) {
auto ch = MakeChannel<int>(1);
ChannelCloseUnblocksSendersTest(ch);
ChannelCloseUnblocksSendersTest(ch, true);
delete ch;
}
......@@ -320,7 +303,7 @@ TEST(Channel, UnbufferedChannelCloseUnblocksReceiversTest) {
// unblocks any senders waiting for senders
TEST(Channel, UnbufferedChannelCloseUnblocksSendersTest) {
auto ch = MakeChannel<int>(0);
ChannelCloseUnblocksReceiversTest(ch);
ChannelCloseUnblocksSendersTest(ch, false);
delete ch;
}
......@@ -342,7 +325,7 @@ TEST(Channel, UnbufferedLessReceiveMoreSendTest) {
ch->Receive(&recv);
EXPECT_EQ(recv, i);
}
std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait 0.5 sec
std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait 0.2 sec
EXPECT_EQ(sum_send, 3U);
CloseChannel(ch);
......@@ -368,7 +351,7 @@ TEST(Channel, UnbufferedMoreReceiveLessSendTest) {
ch->Send(&i);
sum_send += i;
}
std::this_thread::sleep_for(std::chrono::milliseconds(500)); // wait 0.5 sec
std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait 0.2 sec
EXPECT_EQ(sum_send, 10U);
EXPECT_EQ(sum_receive, 10U);
// send three more elements
......@@ -386,7 +369,7 @@ TEST(Channel, UnbufferedMoreReceiveLessSendTest) {
// This tests that destroying a channel unblocks
// any senders waiting for channel to have write space
void ChannelDestroyUnblockSenders(Channel<int> *ch) {
void ChannelDestroyUnblockSenders(Channel<int> *ch, bool isBuffered) {
size_t num_threads = 5;
std::thread t[num_threads];
bool thread_ended[num_threads];
......@@ -405,11 +388,9 @@ void ChannelDestroyUnblockSenders(Channel<int> *ch) {
&thread_ended[i], &send_success[i]);
}
std::this_thread::sleep_for(std::chrono::milliseconds(500)); // wait 0.5 sec
bool is_buffered_channel = false;
if (dynamic_cast<Buffered<int> *>(ch)) is_buffered_channel = true;
std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait 0.2 sec
if (is_buffered_channel) {
if (isBuffered) {
// If channel is buffered, verify that atleast 4 threads are blocked
int ct = 0;
for (size_t i = 0; i < num_threads; i++) {
......@@ -432,13 +413,13 @@ void ChannelDestroyUnblockSenders(Channel<int> *ch) {
EXPECT_EQ(thread_ended[i], true);
}
// Count number of successfuld sends
// Count number of successful sends
int ct = 0;
for (size_t i = 0; i < num_threads; i++) {
if (send_success[i]) ct++;
}
if (is_buffered_channel) {
if (isBuffered) {
// Only 1 send must be successful
EXPECT_EQ(ct, 1);
} else {
......@@ -495,7 +476,7 @@ TEST(Channel, BufferedChannelDestroyUnblocksReceiversTest) {
TEST(Channel, BufferedChannelDestroyUnblocksSendersTest) {
size_t buffer_size = 1;
auto ch = MakeChannel<int>(buffer_size);
ChannelDestroyUnblockSenders(ch);
ChannelDestroyUnblockSenders(ch, true);
}
// This tests that destroying an unbuffered channel also unblocks
......@@ -507,7 +488,20 @@ TEST(Channel, UnbufferedChannelDestroyUnblocksReceiversTest) {
TEST(Channel, UnbufferedChannelDestroyUnblocksSendersTest) {
auto ch = MakeChannel<int>(0);
ChannelDestroyUnblockSenders(ch);
ChannelDestroyUnblockSenders(ch, false);
}
TEST(ChannelHolder, ChannelHolderCapacityTest) {
const size_t buffer_size = 10;
ChannelHolder *ch = new ChannelHolder();
ch->Reset<int>(buffer_size);
EXPECT_EQ(ch->Cap(), buffer_size);
delete ch;
ch = new ChannelHolder();
ch->Reset<int>(0);
EXPECT_EQ(ch->Cap(), 0U);
delete ch;
}
void ChannelHolderSendReceive(ChannelHolder *ch) {
......@@ -641,7 +635,7 @@ void ChannelHolderCloseUnblocksReceiversTest(ChannelHolder *ch) {
},
&thread_ended[i]);
}
std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait 0.1 sec
std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait 0.2 sec
// Verify that all the threads are blocked
for (size_t i = 0; i < num_threads; i++) {
......@@ -652,7 +646,7 @@ void ChannelHolderCloseUnblocksReceiversTest(ChannelHolder *ch) {
// This should unblock all receivers
ch->close();
std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait 0.1 sec
std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait 0.2 sec
// Verify that all threads got unblocked
for (size_t i = 0; i < num_threads; i++) {
......@@ -663,9 +657,6 @@ void ChannelHolderCloseUnblocksReceiversTest(ChannelHolder *ch) {
}
void ChannelHolderCloseUnblocksSendersTest(ChannelHolder *ch, bool isBuffered) {
using paddle::framework::details::Buffered;
using paddle::framework::details::UnBuffered;
size_t num_threads = 5;
std::thread t[num_threads];
bool thread_ended[num_threads];
......@@ -683,7 +674,7 @@ void ChannelHolderCloseUnblocksSendersTest(ChannelHolder *ch, bool isBuffered) {
},
&thread_ended[i], &send_success[i]);
}
std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait
std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait
if (isBuffered) {
// If ch is Buffered, atleast 4 threads must be blocked.
......@@ -702,7 +693,7 @@ void ChannelHolderCloseUnblocksSendersTest(ChannelHolder *ch, bool isBuffered) {
// This should unblock all senders
ch->close();
std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait
std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait
// Verify that all threads got unblocked
for (size_t i = 0; i < num_threads; i++) {
......@@ -775,7 +766,7 @@ void ChannelHolderDestroyUnblockSenders(ChannelHolder *ch, bool isBuffered) {
&thread_ended[i], &send_success[i]);
}
std::this_thread::sleep_for(std::chrono::milliseconds(500)); // wait 0.5 sec
std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait 0.2 sec
if (isBuffered) {
// If channel is buffered, verify that atleast 4 threads are blocked
int ct = 0;
......@@ -836,7 +827,7 @@ void ChannelHolderDestroyUnblockReceivers(ChannelHolder *ch) {
},
&thread_ended[i]);
}
std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait
std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait
// Verify that all threads are blocked
for (size_t i = 0; i < num_threads; i++) {
......
......@@ -42,6 +42,7 @@ void DataTransform(const OpKernelType& expected_kernel_type,
PassTensorData(&out, &in);
}
// do data type transform
if (expected_kernel_type.data_type_ != kernel_type_for_var.data_type_) {
TransDataType(kernel_type_for_var, expected_kernel_type, in, &out);
transformed = true;
......
......@@ -16,13 +16,16 @@ limitations under the License. */
#include <typeindex>
#include "paddle/fluid/framework/framework.pb.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/float16.h"
namespace paddle {
namespace framework {
inline proto::VarType::Type ToDataType(std::type_index type) {
using namespace paddle::framework::proto;
if (typeid(float).hash_code() == type.hash_code()) {
if (typeid(platform::float16).hash_code() == type.hash_code()) {
return proto::VarType::FP16;
} else if (typeid(float).hash_code() == type.hash_code()) {
return proto::VarType::FP32;
} else if (typeid(double).hash_code() == type.hash_code()) {
return proto::VarType::FP64;
......@@ -40,6 +43,8 @@ inline proto::VarType::Type ToDataType(std::type_index type) {
inline std::type_index ToTypeIndex(proto::VarType::Type type) {
using namespace paddle::framework::proto;
switch (type) {
case proto::VarType::FP16:
return typeid(platform::float16);
case proto::VarType::FP32:
return typeid(float);
case proto::VarType::FP64:
......@@ -59,6 +64,9 @@ template <typename Visitor>
inline void VisitDataType(proto::VarType::Type type, Visitor visitor) {
using namespace paddle::framework::proto;
switch (type) {
case proto::VarType::FP16:
visitor.template operator()<platform::float16>();
break;
case proto::VarType::FP32:
visitor.template operator()<float>();
break;
......
......@@ -47,9 +47,16 @@ struct CastDataType {
auto* context = static_cast<const platform::CPUDeviceContext*>(ctx_);
trans(*context, in_begin, in_end, out_begin,
CastDataTypeFunctor<InType, OutType>());
#ifdef __NVCC__
} else if (platform::is_gpu_place(in_.place())) {
platform::Transform<platform::CUDADeviceContext> trans;
auto* context = static_cast<const platform::CUDADeviceContext*>(ctx_);
trans(*context, in_begin, in_end, out_begin,
CastDataTypeFunctor<InType, OutType>());
context->Wait();
#endif
} else {
// TODO(dzhwinter): enhance Copy CPU<->GPU with different data type?
PADDLE_THROW("Unsupport CPU <-> GPU!");
PADDLE_THROW("Unsupported place!");
}
}
};
......@@ -65,6 +72,10 @@ void TransDataType(const OpKernelType& kernel_type_for_var,
auto ctx = pool.Get(in.place());
switch (src_type) {
case proto::VarType::FP16:
framework::VisitDataType(dst_type,
CastDataType<platform::float16>(in, out, ctx));
break;
case proto::VarType::FP32:
framework::VisitDataType(dst_type, CastDataType<float>(in, out, ctx));
break;
......
data_type_transform.cc
\ No newline at end of file
......@@ -22,6 +22,21 @@ TEST(DataTypeTransform, CPUTransform) {
auto place = CPUPlace();
auto kernel_fp16 = OpKernelType(proto::VarType::FP16, place,
DataLayout::kAnyLayout, LibraryType::kPlain);
auto kernel_fp32 = OpKernelType(proto::VarType::FP32, place,
DataLayout::kAnyLayout, LibraryType::kPlain);
auto kernel_fp64 = OpKernelType(proto::VarType::FP64, place,
DataLayout::kAnyLayout, LibraryType::kPlain);
auto kernel_int32 = OpKernelType(proto::VarType::INT32, place,
DataLayout::kAnyLayout, LibraryType::kPlain);
auto kernel_int64 = OpKernelType(proto::VarType::INT64, place,
DataLayout::kAnyLayout, LibraryType::kPlain);
auto kernel_bool = OpKernelType(proto::VarType::BOOL, place,
DataLayout::kAnyLayout, LibraryType::kPlain);
// data type transform from float32
{
Tensor in;
Tensor out;
......@@ -32,22 +47,120 @@ TEST(DataTypeTransform, CPUTransform) {
ptr[i] = i / 3;
}
auto kernel_fp32 = OpKernelType(proto::VarType::FP32, place,
DataLayout::kAnyLayout, LibraryType::kPlain);
auto kernel_fp64 = OpKernelType(proto::VarType::FP64, place,
DataLayout::kAnyLayout, LibraryType::kPlain);
auto kernel_int32 = OpKernelType(proto::VarType::INT32, place,
DataLayout::kAnyLayout, LibraryType::kPlain);
TransDataType(kernel_fp32, kernel_fp64, in, &out);
double* out_data_double = out.data<double>();
for (int i = 0; i < data_number; ++i) {
ASSERT_EQ(out_data_double[i], static_cast<double>(i / 3));
EXPECT_EQ(out_data_double[i], static_cast<double>(i / 3));
}
TransDataType(kernel_fp32, kernel_int32, in, &out);
int* out_data_int = out.data<int>();
for (int i = 0; i < data_number; ++i) {
ASSERT_EQ(out_data_int[i], static_cast<int>(i / 3));
EXPECT_EQ(out_data_int[i], static_cast<int>(i / 3));
}
}
// data type transform from/to float16
{
Tensor in;
Tensor out;
float16* ptr = in.mutable_data<float16>(make_ddim({2, 3}), place);
int data_number = 2 * 3;
for (int i = 0; i < data_number; ++i) {
ptr[i] = i;
}
// transform from float16 to other data types
TransDataType(kernel_fp16, kernel_fp32, in, &out);
float* out_data_float = out.data<float>();
for (int i = 0; i < data_number; ++i) {
EXPECT_EQ(out_data_float[i], static_cast<float>(ptr[i]));
}
TransDataType(kernel_fp16, kernel_fp64, in, &out);
double* out_data_double = out.data<double>();
for (int i = 0; i < data_number; ++i) {
EXPECT_EQ(out_data_double[i], static_cast<double>(ptr[i]));
}
TransDataType(kernel_fp16, kernel_int32, in, &out);
int* out_data_int = out.data<int>();
for (int i = 0; i < data_number; ++i) {
EXPECT_EQ(out_data_int[i], static_cast<int>(ptr[i]));
}
TransDataType(kernel_fp16, kernel_int64, in, &out);
int64_t* out_data_int64 = out.data<int64_t>();
for (int i = 0; i < data_number; ++i) {
EXPECT_EQ(out_data_int64[i], static_cast<int64_t>(ptr[i]));
}
TransDataType(kernel_fp16, kernel_bool, in, &out);
bool* out_data_bool = out.data<bool>();
for (int i = 0; i < data_number; ++i) {
EXPECT_EQ(out_data_bool[i], static_cast<bool>(ptr[i]));
}
// transform float to float16
float* in_data_float = in.mutable_data<float>(make_ddim({2, 3}), place);
for (int i = 0; i < data_number; ++i) {
in_data_float[i] = i;
}
TransDataType(kernel_fp32, kernel_fp16, in, &out);
ptr = out.data<float16>();
for (int i = 0; i < data_number; ++i) {
EXPECT_EQ(ptr[i].x, static_cast<float16>(in_data_float[i]).x);
}
// transform double to float16
double* in_data_double = in.mutable_data<double>(make_ddim({2, 3}), place);
for (int i = 0; i < data_number; ++i) {
in_data_double[i] = i;
}
TransDataType(kernel_fp64, kernel_fp16, in, &out);
ptr = out.data<float16>();
for (int i = 0; i < data_number; ++i) {
EXPECT_EQ(ptr[i].x, static_cast<float16>(in_data_double[i]).x);
}
// transform int to float16
int* in_data_int = in.mutable_data<int>(make_ddim({2, 3}), place);
for (int i = 0; i < data_number; ++i) {
in_data_int[i] = i;
}
TransDataType(kernel_int32, kernel_fp16, in, &out);
ptr = out.data<float16>();
for (int i = 0; i < data_number; ++i) {
EXPECT_EQ(ptr[i].x, static_cast<float16>(in_data_int[i]).x);
}
// transform int64 to float16
int64_t* in_data_int64 = in.mutable_data<int64_t>(make_ddim({2, 3}), place);
for (int i = 0; i < data_number; ++i) {
in_data_int64[i] = i;
}
TransDataType(kernel_int64, kernel_fp16, in, &out);
ptr = out.data<float16>();
for (int i = 0; i < data_number; ++i) {
EXPECT_EQ(ptr[i].x, static_cast<float16>(in_data_int64[i]).x);
}
// transform bool to float16
bool* in_data_bool = in.mutable_data<bool>(make_ddim({2, 3}), place);
for (int i = 0; i < data_number; ++i) {
in_data_bool[i] = i;
}
TransDataType(kernel_bool, kernel_fp16, in, &out);
ptr = out.data<float16>();
for (int i = 0; i < data_number; ++i) {
EXPECT_EQ(ptr[i].x, static_cast<float16>(in_data_bool[i]).x);
}
}
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/data_type_transform.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "gtest/gtest.h"
TEST(DataTypeTransform, GPUTransform) {
using namespace paddle::framework;
using namespace paddle::platform;
auto cpu_place = CPUPlace();
auto gpu_place = CUDAPlace(0);
CUDADeviceContext context(gpu_place);
auto kernel_fp16 = OpKernelType(proto::VarType::FP16, gpu_place,
DataLayout::kAnyLayout, LibraryType::kPlain);
auto kernel_fp32 = OpKernelType(proto::VarType::FP32, gpu_place,
DataLayout::kAnyLayout, LibraryType::kPlain);
auto kernel_fp64 = OpKernelType(proto::VarType::FP64, gpu_place,
DataLayout::kAnyLayout, LibraryType::kPlain);
auto kernel_int32 = OpKernelType(proto::VarType::INT32, gpu_place,
DataLayout::kAnyLayout, LibraryType::kPlain);
auto kernel_int64 = OpKernelType(proto::VarType::INT64, gpu_place,
DataLayout::kAnyLayout, LibraryType::kPlain);
auto kernel_bool = OpKernelType(proto::VarType::BOOL, gpu_place,
DataLayout::kAnyLayout, LibraryType::kPlain);
// data type transform from float32
{
Tensor in;
Tensor in_gpu;
Tensor out_gpu;
Tensor out;
float* in_ptr = in.mutable_data<float>(make_ddim({2, 3}), cpu_place);
float arr[6] = {0, 1, 2, 3, 4, 5};
int data_number = sizeof(arr) / sizeof(arr[0]);
memcpy(in_ptr, arr, sizeof(arr));
TensorCopy(in, gpu_place, context, &in_gpu);
context.Wait();
TransDataType(kernel_fp32, kernel_fp64, in_gpu, &out_gpu);
TensorCopy(out_gpu, cpu_place, context, &out);
context.Wait();
double* out_data_double = out.data<double>();
for (int i = 0; i < data_number; ++i) {
EXPECT_EQ(out_data_double[i], static_cast<double>(arr[i]));
}
TransDataType(kernel_fp32, kernel_int32, in_gpu, &out_gpu);
TensorCopy(out_gpu, cpu_place, context, &out);
context.Wait();
int* out_data_int = out.data<int>();
for (int i = 0; i < data_number; ++i) {
EXPECT_EQ(out_data_int[i], static_cast<int>(arr[i]));
}
}
// data type transform from/to float16
{
Tensor in;
Tensor in_gpu;
Tensor out_gpu;
Tensor out;
float16* ptr = in.mutable_data<float16>(make_ddim({2, 3}), cpu_place);
float16 arr[6] = {float16(0), float16(1), float16(2),
float16(3), float16(4), float16(5)};
int data_number = sizeof(arr) / sizeof(arr[0]);
memcpy(ptr, arr, sizeof(arr));
TensorCopy(in, gpu_place, context, &in_gpu);
context.Wait();
// transform from float16 to other data types
TransDataType(kernel_fp16, kernel_fp32, in_gpu, &out_gpu);
TensorCopy(out_gpu, cpu_place, context, &out);
context.Wait();
float* out_data_float = out.data<float>();
for (int i = 0; i < data_number; ++i) {
EXPECT_EQ(out_data_float[i], static_cast<float>(ptr[i]));
}
TransDataType(kernel_fp16, kernel_fp64, in_gpu, &out_gpu);
TensorCopy(out_gpu, cpu_place, context, &out);
context.Wait();
double* out_data_double = out.data<double>();
for (int i = 0; i < data_number; ++i) {
EXPECT_EQ(out_data_double[i], static_cast<double>(ptr[i]));
}
TransDataType(kernel_fp16, kernel_int32, in_gpu, &out_gpu);
TensorCopy(out_gpu, cpu_place, context, &out);
context.Wait();
int* out_data_int = out.data<int>();
for (int i = 0; i < data_number; ++i) {
EXPECT_EQ(out_data_int[i], static_cast<int>(ptr[i]));
}
TransDataType(kernel_fp16, kernel_int64, in_gpu, &out_gpu);
TensorCopy(out_gpu, cpu_place, context, &out);
context.Wait();
int64_t* out_data_int64 = out.data<int64_t>();
for (int i = 0; i < data_number; ++i) {
EXPECT_EQ(out_data_int64[i], static_cast<int64_t>(ptr[i]));
}
TransDataType(kernel_fp16, kernel_bool, in_gpu, &out_gpu);
TensorCopy(out_gpu, cpu_place, context, &out);
context.Wait();
bool* out_data_bool = out.data<bool>();
for (int i = 0; i < data_number; ++i) {
EXPECT_EQ(out_data_bool[i], static_cast<bool>(ptr[i]));
}
// transform float to float16
float* in_data_float = in.mutable_data<float>(make_ddim({2, 3}), cpu_place);
for (int i = 0; i < data_number; ++i) {
in_data_float[i] = i;
}
TensorCopy(in, gpu_place, context, &in_gpu);
context.Wait();
TransDataType(kernel_fp32, kernel_fp16, in_gpu, &out_gpu);
TensorCopy(out_gpu, cpu_place, context, &out);
context.Wait();
ptr = out.data<float16>();
for (int i = 0; i < data_number; ++i) {
EXPECT_EQ(ptr[i].x, static_cast<float16>(in_data_float[i]).x);
}
// transform double to float16
double* in_data_double =
in.mutable_data<double>(make_ddim({2, 3}), cpu_place);
for (int i = 0; i < data_number; ++i) {
in_data_double[i] = i;
}
TensorCopy(in, gpu_place, context, &in_gpu);
context.Wait();
TransDataType(kernel_fp64, kernel_fp16, in_gpu, &out_gpu);
TensorCopy(out_gpu, cpu_place, context, &out);
context.Wait();
ptr = out.data<float16>();
for (int i = 0; i < data_number; ++i) {
EXPECT_EQ(ptr[i].x, static_cast<float16>(in_data_double[i]).x);
}
// transform int to float16
int* in_data_int = in.mutable_data<int>(make_ddim({2, 3}), cpu_place);
for (int i = 0; i < data_number; ++i) {
in_data_int[i] = i;
}
TensorCopy(in, gpu_place, context, &in_gpu);
context.Wait();
TransDataType(kernel_int32, kernel_fp16, in_gpu, &out_gpu);
TensorCopy(out_gpu, cpu_place, context, &out);
context.Wait();
ptr = out.data<float16>();
for (int i = 0; i < data_number; ++i) {
EXPECT_EQ(ptr[i].x, static_cast<float16>(in_data_int[i]).x);
}
// transform int64 to float16
int64_t* in_data_int64 =
in.mutable_data<int64_t>(make_ddim({2, 3}), cpu_place);
for (int i = 0; i < data_number; ++i) {
in_data_int64[i] = i;
}
TensorCopy(in, gpu_place, context, &in_gpu);
context.Wait();
TransDataType(kernel_int64, kernel_fp16, in_gpu, &out_gpu);
TensorCopy(out_gpu, cpu_place, context, &out);
context.Wait();
ptr = out.data<float16>();
for (int i = 0; i < data_number; ++i) {
EXPECT_EQ(ptr[i].x, static_cast<float16>(in_data_int64[i]).x);
}
// transform bool to float16
bool* in_data_bool = in.mutable_data<bool>(make_ddim({2, 3}), cpu_place);
for (int i = 0; i < data_number; ++i) {
in_data_bool[i] = i;
}
TensorCopy(in, gpu_place, context, &in_gpu);
context.Wait();
TransDataType(kernel_bool, kernel_fp16, in_gpu, &out_gpu);
TensorCopy(out_gpu, cpu_place, context, &out);
context.Wait();
ptr = out.data<float16>();
for (int i = 0; i < data_number; ++i) {
EXPECT_EQ(ptr[i].x, static_cast<float16>(in_data_bool[i]).x);
}
}
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <atomic>
#include <condition_variable>
#include <deque>
#include <mutex>
#include "paddle/fluid/framework/channel.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace framework {
namespace details {
// Four of the properties of Buffered Channel:
// - A send to a full channel blocks temporarily until a receive from the
// channel or the channel is closed.
// - A receive from an empty channel blocks temporarily until a send to the
// channel or the channel is closed.
// - A send to a closed channel returns false immediately.
// - A receive from a closed channel returns false immediately.
template <typename T>
class Buffered : public paddle::framework::Channel<T> {
friend Channel<T>* paddle::framework::MakeChannel<T>(size_t);
friend void paddle::framework::CloseChannel<T>(Channel<T>*);
public:
virtual bool Send(T*);
virtual bool Receive(T*);
virtual size_t Cap() { return cap_; }
virtual void Close();
virtual ~Buffered();
private:
size_t cap_;
std::mutex mu_;
std::condition_variable empty_cond_var_;
std::condition_variable full_cond_var_;
std::condition_variable destructor_cond_var_;
std::deque<T> channel_;
std::atomic<bool> closed_{false};
std::atomic<unsigned> send_ctr{0};
std::atomic<unsigned> recv_ctr{0};
Buffered(size_t cap) : cap_(cap), closed_(false) {
PADDLE_ENFORCE_GT(cap, 0);
}
void NotifyAllParticipants(std::unique_lock<std::mutex>*);
};
template <typename T>
bool Buffered<T>::Send(T* item) {
bool ret = false;
if (closed_) {
return ret;
}
send_ctr++;
std::unique_lock<std::mutex> lock(mu_);
full_cond_var_.wait(lock,
[this]() { return channel_.size() < cap_ || closed_; });
if (!closed_) {
channel_.push_back(std::move(*item));
lock.unlock();
empty_cond_var_.notify_one();
ret = true;
}
send_ctr--;
destructor_cond_var_.notify_one();
return ret;
}
template <typename T>
bool Buffered<T>::Receive(T* item) {
bool ret = false;
// Once the channel has been closed and all data has been consumed,
// just return false. Don't even try acquiring the mutex.
if (closed_ && channel_.empty()) {
return false;
}
recv_ctr++;
std::unique_lock<std::mutex> lock(mu_);
empty_cond_var_.wait(lock, [this]() { return !channel_.empty() || closed_; });
if (!channel_.empty()) {
*item = std::move(channel_.front());
channel_.pop_front();
full_cond_var_.notify_one();
ret = true;
}
recv_ctr--;
destructor_cond_var_.notify_one();
return ret;
}
template <typename T>
void Buffered<T>::Close() {
if (closed_) {
return;
}
std::unique_lock<std::mutex> lock(mu_);
closed_ = true;
NotifyAllParticipants(&lock);
}
template <typename T>
Buffered<T>::~Buffered() {
std::unique_lock<std::mutex> lock(mu_);
closed_ = true;
channel_.clear();
NotifyAllParticipants(&lock);
// The destructor must wait for all readers and writers to complete their task
// The channel has been closed, so we will not accept new readers and writers
lock.lock();
destructor_cond_var_.wait(
lock, [this]() { return send_ctr == 0 && recv_ctr == 0; });
}
template <typename T>
void Buffered<T>::NotifyAllParticipants(std::unique_lock<std::mutex>* lock) {
lock->unlock();
full_cond_var_.notify_all();
empty_cond_var_.notify_all();
}
} // namespace details
} // namespace framework
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <atomic>
#include <condition_variable>
#include <mutex>
#include "paddle/fluid/framework/channel.h"
namespace paddle {
namespace framework {
namespace details {
// Four of the properties of UnBuffered Channel:
// - A send to a channel blocks temporarily until a receive from the
// channel or the channel is closed.
// - A receive from a channel blocks temporarily until a send to the
// channel or the channel is closed.
// - A send to a closed channel returns false immediately.
// - A receive from a closed channel returns false immediately.
template <typename T>
class UnBuffered : public paddle::framework::Channel<T> {
friend Channel<T>* paddle::framework::MakeChannel<T>(size_t);
friend void paddle::framework::CloseChannel<T>(Channel<T>*);
public:
virtual bool Send(T*);
virtual bool Receive(T*);
virtual size_t Cap() { return 0; }
virtual void Close();
virtual ~UnBuffered();
private:
std::mutex mu_ch_;
// Mutex for readers and writers who are waiting for other reader
// and writer to complete execution
std::recursive_mutex mu_read_, mu_write_;
// reader_found_ is set true when a reader is ready to accept data
// writer_found_ is set true when a writer is ready to send data
// A transaction occurs only when both are true
std::atomic<bool> reader_found_{false}, writer_found_{false};
std::condition_variable cv_channel_;
std::condition_variable_any cv_reader_, cv_writer_, cv_destructor_;
T* item{nullptr};
std::atomic<bool> closed_{false};
std::atomic<unsigned> send_ctr{0};
std::atomic<unsigned> recv_ctr{0};
UnBuffered() : closed_(false) {}
void NotifyAllParticipants(std::unique_lock<std::mutex>*);
};
// This function implements the concept of how data should
// be sent from a writer to a reader.
template <typename T>
bool UnBuffered<T>::Send(T* data) {
bool ret = false;
if (closed_) {
return ret;
}
send_ctr++;
// Prevent other writers from entering
std::unique_lock<std::recursive_mutex> writer_lock(mu_write_);
writer_found_ = true;
std::unique_lock<std::recursive_mutex> cv_lock(mu_write_);
// If writer comes first, it should wait till a reader arrives
cv_writer_.wait(cv_lock,
[this]() { return reader_found_ == true || closed_; });
cv_reader_.notify_one();
if (!closed_) {
std::unique_lock<std::mutex> channel_lock(mu_ch_);
item = data;
channel_lock.unlock();
cv_channel_.notify_one();
channel_lock.lock();
cv_channel_.wait(channel_lock,
[this]() { return item == nullptr || closed_; });
ret = true;
}
writer_found_ = false;
send_ctr--;
cv_destructor_.notify_one();
return ret;
}
// This function implements the concept of how
// data that was sent by a writer is read from a reader.
template <typename T>
bool UnBuffered<T>::Receive(T* data) {
bool ret = false;
// If channel is closed, we don't even want any reader to enter.
// Unlike a buffered channel, an unbuffered channel does not allow
// readers to read after closing because there is no buffer to be consumed.
if (closed_) return ret;
recv_ctr++;
// Prevent other readers from entering
std::unique_lock<std::recursive_mutex> read_lock{mu_read_};
reader_found_ = true;
std::unique_lock<std::recursive_mutex> cv_lock{mu_read_};
// If reader comes first, it should wait till a writer arrives
cv_reader_.wait(cv_lock,
[this]() { return writer_found_ == true || closed_; });
cv_writer_.notify_one();
if (!closed_) {
std::unique_lock<std::mutex> lock_ch{mu_ch_};
// Reader should wait for the writer to first write its data
cv_channel_.wait(lock_ch, [this]() { return item != nullptr || closed_; });
if (!closed_) {
*data = std::move(*item);
item = nullptr;
lock_ch.unlock();
ret = true;
}
cv_channel_.notify_one();
}
reader_found_ = false;
recv_ctr--;
cv_destructor_.notify_one();
return ret;
}
// This function implements the sequence of events
// that take place once the channel is closed.
template <typename T>
void UnBuffered<T>::Close() {
if (closed_) {
return;
}
std::unique_lock<std::mutex> lock(mu_ch_);
item = nullptr;
closed_ = true;
NotifyAllParticipants(&lock);
}
// This function implements the sequence of events
// that are executed once the object of an UnBuffered
// channel is destroyed.
template <typename T>
UnBuffered<T>::~UnBuffered() {
std::unique_lock<std::mutex> lock(mu_ch_);
item = nullptr;
closed_ = true;
NotifyAllParticipants(&lock);
lock.lock();
cv_destructor_.wait(lock,
[this]() { return send_ctr == 0 && recv_ctr == 0; });
}
// This function notifies all the readers, writers and
// the channel condition variables.
template <typename T>
void UnBuffered<T>::NotifyAllParticipants(std::unique_lock<std::mutex>* lock) {
lock->unlock();
cv_writer_.notify_all();
cv_channel_.notify_all();
cv_reader_.notify_all();
}
} // namespace details
} // namespace framework
} // namespace paddle
......@@ -125,8 +125,9 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id,
for (auto& op_desc : block.AllOps()) {
auto op = paddle::framework::OpRegistry::CreateOp(*op_desc);
VLOG(3) << place_ << " " << op->DebugStringEx(local_scope);
VLOG(4) << place_ << " " << op->DebugStringEx(local_scope);
op->Run(*local_scope, place_);
VLOG(3) << place_ << " " << op->DebugStringEx(local_scope);
if (FLAGS_benchmark) {
VLOG(2) << "Memory used after operator " + op->Type() + " running: "
......
......@@ -25,92 +25,5 @@ DDim ReaderBase::shape(size_t idx) const {
return shapes_[idx];
}
void ShuffleReader::ReadNext(std::vector<LoDTensor>* out) {
if (iteration_pos_ >= buffer_.size()) {
// Reload buffer with new data
buffer_.clear();
buffer_.reserve(buffer_size_);
for (int i = 0; i < buffer_size_; ++i) {
if (reader_->HasNext()) {
buffer_.push_back(std::vector<LoDTensor>());
reader_->ReadNext(&buffer_.back());
} else {
break;
}
}
// TODO(fengjiayi): 'std::random_shuffle' can be very slow. It needs to be
// optimize.
std::random_shuffle(buffer_.begin(), buffer_.end());
iteration_pos_ = 0;
}
out->clear();
if (!buffer_.empty()) {
std::swap(*out, buffer_[iteration_pos_++]);
}
// if buffer_ is empty, the 'out' will return as an empty vector.
}
void BatchReader::ReadNext(std::vector<LoDTensor>* out) {
buffer_.clear();
buffer_.reserve(batch_size_);
for (int i = 0; i < batch_size_; ++i) {
if (reader_->HasNext()) {
buffer_.push_back(std::vector<LoDTensor>());
reader_->ReadNext(&buffer_.back());
} else {
break;
}
}
// Concat instances
out->clear();
if (buffer_.empty()) {
// if buffer_ is empty, the 'out' will return as an empty vector.
return;
}
int out_num = buffer_[0].size();
out->reserve(out_num);
for (int j = 0; j < out_num; ++j) {
// Merge shape and check date type
std::type_index batch_type = buffer_[0][j].type();
DDim batch_shape = buffer_[0][j].dims();
for (size_t i = 1; i < buffer_.size(); ++i) {
std::type_index ins_type = buffer_[i][j].type();
DDim ins_shape = buffer_[i][j].dims();
PADDLE_ENFORCE_EQ(batch_type, ins_type);
PADDLE_ENFORCE_EQ(slice_ddim(batch_shape, 1, batch_shape.size()),
slice_ddim(ins_shape, 1, ins_shape.size()));
PADDLE_ENFORCE_GT(ins_shape[0], 0);
batch_shape[0] += ins_shape[0];
}
LoDTensor out_tensor;
out_tensor.Resize(batch_shape);
out_tensor.mutable_data(platform::CPUPlace(), batch_type);
int64_t dst_offset = 0;
// Merge lod and data
LoD batch_lod;
for (size_t i = 0; i < buffer_.size(); ++i) {
DDim ins_shape = buffer_[i][j].dims();
LoD ins_lod = buffer_[i][j].lod();
if (i == 0) {
batch_lod = ins_lod;
} else {
PADDLE_ENFORCE_EQ(batch_lod.size(), ins_lod.size());
for (size_t level_idx = 0; level_idx < batch_lod.size(); ++level_idx) {
auto& lod_level = batch_lod[level_idx];
for (size_t k = 1; k < ins_lod[level_idx].size(); ++k) {
lod_level.push_back(ins_lod[level_idx][k] + lod_level.back());
}
}
}
Tensor dst = out_tensor.Slice(dst_offset, dst_offset + ins_shape[0]);
TensorCopy(buffer_[i][j], platform::CPUPlace(), &dst);
dst_offset += ins_shape[0];
}
out_tensor.set_lod(batch_lod);
out->push_back(out_tensor);
}
}
} // namespace framework
} // namespace paddle
......@@ -26,7 +26,6 @@ class ReaderBase {
PADDLE_ENFORCE(!shapes_.empty());
}
virtual void ReadNext(std::vector<LoDTensor>* out) = 0;
virtual bool HasNext() const = 0;
virtual void ReInit() = 0;
......@@ -52,91 +51,14 @@ class DecoratedReader : public ReaderBase {
PADDLE_ENFORCE_NOT_NULL(reader_);
}
bool HasNext() const override { return reader_->HasNext(); }
void ReInit() override { reader_->ReInit(); }
protected:
ReaderBase* reader_;
};
// file readers
template <typename T>
class RandomDataGenerator : public FileReader {
public:
RandomDataGenerator(const std::vector<DDim>& shapes, float min, float max)
: FileReader(shapes), min_(min), max_(max) {
PADDLE_ENFORCE_LE(
min, max, "'min' shouldn't be greater than 'max'.(%f vs %f)", min, max);
unsigned int seed = std::random_device()();
engine_.seed(seed);
dist_ = std::uniform_real_distribution<float>(min_, max_);
}
void ReadNext(std::vector<LoDTensor>* out) override {
out->clear();
out->reserve(shapes_.size());
for (const DDim& shape : shapes_) {
PADDLE_ENFORCE_GE(
shape.size(), 2,
"The rank of reader's output data should be 2 at least.(Now it's %d)",
shape.size());
LoDTensor out_tensor;
out_tensor.Resize(shape);
T* data = out_tensor.mutable_data<T>(platform::CPUPlace());
int64_t numel = product(shape);
for (int64_t i = 0; i < numel; ++i) {
data[i] = dist_(engine_);
}
out->push_back(out_tensor);
}
}
bool HasNext() const override { return true; }
void ReInit() override { return; }
private:
float min_;
float max_;
std::minstd_rand engine_;
std::uniform_real_distribution<float> dist_;
};
// decorated readers
class ShuffleReader : public DecoratedReader {
public:
ShuffleReader(ReaderBase* reader, int buffer_size)
: DecoratedReader(reader), buffer_size_(buffer_size), iteration_pos_(0) {
buffer_.reserve(buffer_size);
}
void ReadNext(std::vector<LoDTensor>* out) override;
private:
int buffer_size_;
std::vector<std::vector<LoDTensor>> buffer_;
size_t iteration_pos_;
};
class BatchReader : public DecoratedReader {
public:
BatchReader(ReaderBase* reader, int batch_size)
: DecoratedReader(reader), batch_size_(batch_size) {
buffer_.reserve(batch_size_);
}
void ReadNext(std::vector<LoDTensor>* out) override;
private:
int batch_size_;
std::vector<std::vector<LoDTensor>> buffer_;
};
// The ReaderHolder is used as readers' unified wrapper,
// making it easier to access different type readers in Variables.
// The ReaderHolder is used as reader' unified wrapper,
// making it easier to access different type reader in Variables.
class ReaderHolder {
public:
void Reset(ReaderBase* reader) { reader_.reset(reader); }
......@@ -144,7 +66,6 @@ class ReaderHolder {
ReaderBase* Get() const { return reader_.get(); }
void ReadNext(std::vector<LoDTensor>* out) { reader_->ReadNext(out); }
bool HasNext() const { return reader_->HasNext(); }
void ReInit() { reader_->ReInit(); }
DDim shape(size_t idx) const { return reader_->shape(idx); }
......
......@@ -187,7 +187,6 @@ bool TensorContainsInf(const framework::Tensor& tensor) {
void TensorToStream(std::ostream& os, const Tensor& tensor,
const platform::DeviceContext& dev_ctx) {
// TODO(typhoonzero): serialize to ostream
{ // the 1st field, uint32_t version
constexpr uint32_t version = 0;
os.write(reinterpret_cast<const char*>(&version), sizeof(version));
......
......@@ -235,6 +235,7 @@ TEST(TensorToVector, Tensor) {
TEST(TensorContainsNAN, CPU) {
using namespace paddle::framework;
using namespace paddle::platform;
{
Tensor src;
float* buf = src.mutable_data<float>({3}, CPUPlace());
buf[0] = 0.0;
......@@ -243,11 +244,24 @@ TEST(TensorContainsNAN, CPU) {
ASSERT_TRUE(TensorContainsNAN(src));
buf[1] = 0.0;
ASSERT_FALSE(TensorContainsNAN(src));
}
{
Tensor src;
float16* buf = src.mutable_data<float16>({3}, CPUPlace());
buf[0] = 0.0;
buf[1].x = 0x7fff;
buf[2] = 0.0;
ASSERT_TRUE(TensorContainsNAN(src));
buf[1] = 0.0;
ASSERT_FALSE(TensorContainsNAN(src));
}
}
TEST(TensorContainsInf, CPU) {
using namespace paddle::framework;
using namespace paddle::platform;
{
Tensor src;
double* buf = src.mutable_data<double>({3}, CPUPlace());
buf[0] = 1.0;
......@@ -256,6 +270,18 @@ TEST(TensorContainsInf, CPU) {
ASSERT_TRUE(TensorContainsInf(src));
buf[1] = 1.0;
ASSERT_FALSE(TensorContainsInf(src));
}
{
Tensor src;
float16* buf = src.mutable_data<float16>({3}, CPUPlace());
buf[0] = 1.0;
buf[1].x = 0x7c00;
buf[2] = 0.0;
ASSERT_TRUE(TensorContainsInf(src));
buf[1] = 1.0;
ASSERT_FALSE(TensorContainsInf(src));
}
}
TEST(Tensor, FromAndToStream) {
......
......@@ -25,32 +25,65 @@ static __global__ void FillNAN(float* buf) {
buf[1] = 0.1;
buf[2] = NAN;
}
static __global__ void FillInf(float* buf) {
buf[0] = 0.0;
buf[1] = INFINITY;
buf[2] = 0.5;
}
static __global__ void FillNAN(platform::float16* buf) {
buf[0] = 0.0;
buf[1] = 0.1;
buf[2].x = 0x7fff;
}
static __global__ void FillInf(platform::float16* buf) {
buf[0] = 0.0;
buf[1].x = 0x7c00;
buf[2] = 0.5;
}
TEST(TensorContainsNAN, GPU) {
Tensor tensor;
platform::CUDAPlace gpu(0);
auto& pool = platform::DeviceContextPool::Instance();
using namespace paddle::platform;
CUDAPlace gpu(0);
auto& pool = DeviceContextPool::Instance();
auto* cuda_ctx = pool.GetByPlace(gpu);
{
Tensor tensor;
float* buf = tensor.mutable_data<float>({3}, gpu);
FillNAN<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
cuda_ctx->Wait();
ASSERT_TRUE(TensorContainsNAN(tensor));
}
{
Tensor tensor;
float16* buf = tensor.mutable_data<float16>({3}, gpu);
FillNAN<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
cuda_ctx->Wait();
ASSERT_TRUE(TensorContainsNAN(tensor));
}
}
TEST(TensorContainsInf, GPU) {
Tensor tensor;
platform::CUDAPlace gpu(0);
auto& pool = platform::DeviceContextPool::Instance();
using namespace paddle::platform;
CUDAPlace gpu(0);
auto& pool = DeviceContextPool::Instance();
auto* cuda_ctx = pool.GetByPlace(gpu);
{
Tensor tensor;
float* buf = tensor.mutable_data<float>({3}, gpu);
FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
cuda_ctx->Wait();
ASSERT_TRUE(TensorContainsInf(tensor));
}
{
Tensor tensor;
float16* buf = tensor.mutable_data<float16>({3}, gpu);
FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
cuda_ctx->Wait();
ASSERT_TRUE(TensorContainsInf(tensor));
}
}
} // namespace framework
......
......@@ -67,10 +67,10 @@ class ThreadPool {
} catch (platform::EnforceNotMet ex) {
return std::unique_ptr<platform::EnforceNotMet>(
new platform::EnforceNotMet(ex));
} catch (...) {
LOG(FATAL)
<< "Unexpected exception is catched in thread pool. All "
"throwable exception in Fluid should be an EnforceNotMet.";
} catch (const std::exception& e) {
LOG(FATAL) << "Unexpected exception is catched in thread pool. All "
"throwable exception in Fluid should be an EnforceNotMet."
<< e.what();
}
return nullptr;
});
......
......@@ -5,7 +5,8 @@ cc_library(paddle_fluid_api
DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB})
# Create static library
cc_library(paddle_fluid DEPS paddle_fluid_api ${FLUID_CORE_MODULES} ${GLOB_OP_LIB})
get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES)
cc_library(paddle_fluid DEPS ${fluid_modules})
# Create shared library
cc_library(paddle_fluid_shared SHARED
......
......@@ -22,14 +22,14 @@ namespace paddle {
namespace inference {
void ReadBinaryFile(const std::string& filename, std::string& contents) {
VLOG(3) << "loading model from " << filename;
std::ifstream inputfs(filename, std::ios::in | std::ios::binary);
inputfs.seekg(0, std::ios::end);
std::ifstream fin(filename, std::ios::in | std::ios::binary);
PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot open file %s", filename);
fin.seekg(0, std::ios::end);
contents.clear();
contents.resize(inputfs.tellg());
inputfs.seekg(0, std::ios::beg);
inputfs.read(&contents[0], contents.size());
inputfs.close();
contents.resize(fin.tellg());
fin.seekg(0, std::ios::beg);
fin.read(&contents[0], contents.size());
fin.close();
}
bool IsPersistable(const framework::VarDesc* var) {
......@@ -97,6 +97,7 @@ std::unique_ptr<framework::ProgramDesc> Load(framework::Executor& executor,
const std::string& dirname) {
std::string model_filename = dirname + "/__model__";
std::string program_desc_str;
VLOG(3) << "loading model from " << model_filename;
ReadBinaryFile(model_filename, program_desc_str);
std::unique_ptr<framework::ProgramDesc> main_program(
......
......@@ -17,10 +17,13 @@ limitations under the License. */
#include "paddle/fluid/inference/tests/test_helper.h"
DEFINE_string(dirname, "", "Directory of the inference model.");
DEFINE_int32(batch_size, 1, "Batch size of input data");
DEFINE_int32(repeat, 1, "Running the inference program repeat times");
TEST(inference, image_classification) {
if (FLAGS_dirname.empty()) {
LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model";
if (FLAGS_dirname.empty() || FLAGS_batch_size < 1 || FLAGS_repeat < 1) {
LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model "
"--batch_size=1 --repeat=1";
}
LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl;
......@@ -29,13 +32,11 @@ TEST(inference, image_classification) {
// 0. Call `paddle::framework::InitDevices()` initialize all the devices
// In unittests, this is done in paddle/testing/paddle_gtest_main.cc
int64_t batch_size = 1;
paddle::framework::LoDTensor input;
// Use normilized image pixels as input data,
// which should be in the range [0.0, 1.0].
SetupTensor<float>(input,
{batch_size, 3, 32, 32},
{FLAGS_batch_size, 3, 32, 32},
static_cast<float>(0),
static_cast<float>(1));
std::vector<paddle::framework::LoDTensor*> cpu_feeds;
......@@ -46,7 +47,9 @@ TEST(inference, image_classification) {
cpu_fetchs1.push_back(&output1);
// Run inference on CPU
TestInference<paddle::platform::CPUPlace>(dirname, cpu_feeds, cpu_fetchs1);
LOG(INFO) << "--- CPU Runs: ---";
TestInference<paddle::platform::CPUPlace>(
dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat);
LOG(INFO) << output1.dims();
#ifdef PADDLE_WITH_CUDA
......@@ -55,7 +58,9 @@ TEST(inference, image_classification) {
cpu_fetchs2.push_back(&output2);
// Run inference on CUDA GPU
TestInference<paddle::platform::CUDAPlace>(dirname, cpu_feeds, cpu_fetchs2);
LOG(INFO) << "--- GPU Runs: ---";
TestInference<paddle::platform::CUDAPlace>(
dirname, cpu_feeds, cpu_fetchs2, FLAGS_repeat);
LOG(INFO) << output2.dims();
CheckError<float>(output1, output2);
......
......@@ -17,10 +17,13 @@ limitations under the License. */
#include "paddle/fluid/inference/tests/test_helper.h"
DEFINE_string(dirname, "", "Directory of the inference model.");
DEFINE_int32(batch_size, 1, "Batch size of input data");
DEFINE_int32(repeat, 1, "Running the inference program repeat times");
TEST(inference, recognize_digits) {
if (FLAGS_dirname.empty()) {
LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model";
if (FLAGS_dirname.empty() || FLAGS_batch_size < 1 || FLAGS_repeat < 1) {
LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model "
"--batch_size=1 --repeat=1";
}
LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl;
......@@ -29,24 +32,25 @@ TEST(inference, recognize_digits) {
// 0. Call `paddle::framework::InitDevices()` initialize all the devices
// In unittests, this is done in paddle/testing/paddle_gtest_main.cc
int64_t batch_size = 1;
paddle::framework::LoDTensor input;
// Use normilized image pixels as input data,
// which should be in the range [-1.0, 1.0].
SetupTensor<float>(input,
{batch_size, 1, 28, 28},
{FLAGS_batch_size, 1, 28, 28},
static_cast<float>(-1),
static_cast<float>(1));
std::vector<paddle::framework::LoDTensor*> cpu_feeds;
cpu_feeds.push_back(&input);
for (auto is_combined : {false, true}) {
paddle::framework::LoDTensor output1;
std::vector<paddle::framework::LoDTensor*> cpu_fetchs1;
cpu_fetchs1.push_back(&output1);
// Run inference on CPU
TestInference<paddle::platform::CPUPlace>(dirname, cpu_feeds, cpu_fetchs1);
LOG(INFO) << "--- CPU Runs: is_combined=" << is_combined << " ---";
TestInference<paddle::platform::CPUPlace>(
dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat, is_combined);
LOG(INFO) << output1.dims();
#ifdef PADDLE_WITH_CUDA
......@@ -55,51 +59,12 @@ TEST(inference, recognize_digits) {
cpu_fetchs2.push_back(&output2);
// Run inference on CUDA GPU
TestInference<paddle::platform::CUDAPlace>(dirname, cpu_feeds, cpu_fetchs2);
LOG(INFO) << "--- GPU Runs: is_combined=" << is_combined << " ---";
TestInference<paddle::platform::CUDAPlace>(
dirname, cpu_feeds, cpu_fetchs2, FLAGS_repeat, is_combined);
LOG(INFO) << output2.dims();
CheckError<float>(output1, output2);
#endif
}
TEST(inference, recognize_digits_combine) {
if (FLAGS_dirname.empty()) {
LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model";
}
LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl;
std::string dirname = FLAGS_dirname;
// 0. Call `paddle::framework::InitDevices()` initialize all the devices
// In unittests, this is done in paddle/testing/paddle_gtest_main.cc
paddle::framework::LoDTensor input;
// Use normilized image pixels as input data,
// which should be in the range [-1.0, 1.0].
SetupTensor<float>(
input, {1, 1, 28, 28}, static_cast<float>(-1), static_cast<float>(1));
std::vector<paddle::framework::LoDTensor*> cpu_feeds;
cpu_feeds.push_back(&input);
paddle::framework::LoDTensor output1;
std::vector<paddle::framework::LoDTensor*> cpu_fetchs1;
cpu_fetchs1.push_back(&output1);
// Run inference on CPU
TestInference<paddle::platform::CPUPlace, true>(
dirname, cpu_feeds, cpu_fetchs1);
LOG(INFO) << output1.dims();
#ifdef PADDLE_WITH_CUDA
paddle::framework::LoDTensor output2;
std::vector<paddle::framework::LoDTensor*> cpu_fetchs2;
cpu_fetchs2.push_back(&output2);
// Run inference on CUDA GPU
TestInference<paddle::platform::CUDAPlace, true>(
dirname, cpu_feeds, cpu_fetchs2);
LOG(INFO) << output2.dims();
CheckError<float>(output1, output2);
#endif
}
......@@ -15,6 +15,7 @@ limitations under the License. */
#include <time.h>
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/inference/io.h"
#include "paddle/fluid/platform/profiler.h"
template <typename T>
void SetupTensor(paddle::framework::LoDTensor& input,
......@@ -87,32 +88,61 @@ void CheckError(paddle::framework::LoDTensor& output1,
EXPECT_EQ(count, 0U) << "There are " << count << " different elements.";
}
template <typename Place, bool IsCombined = false>
template <typename Place>
void TestInference(const std::string& dirname,
const std::vector<paddle::framework::LoDTensor*>& cpu_feeds,
std::vector<paddle::framework::LoDTensor*>& cpu_fetchs) {
std::vector<paddle::framework::LoDTensor*>& cpu_fetchs,
const int repeat = 1,
const bool is_combined = false) {
// 1. Define place, executor, scope
auto place = Place();
auto executor = paddle::framework::Executor(place);
auto* scope = new paddle::framework::Scope();
// Profile the performance
paddle::platform::ProfilerState state;
if (paddle::platform::is_cpu_place(place)) {
state = paddle::platform::ProfilerState::kCPU;
} else {
#ifdef PADDLE_WITH_CUDA
state = paddle::platform::ProfilerState::kCUDA;
// The default device_id of paddle::platform::CUDAPlace is 0.
// Users can get the device_id using:
// int device_id = place.GetDeviceId();
paddle::platform::SetDeviceId(0);
#else
PADDLE_THROW("'CUDAPlace' is not supported in CPU only device.");
#endif
}
// Enable the profiler
paddle::platform::EnableProfiler(state);
// 2. Initialize the inference_program and load parameters
std::unique_ptr<paddle::framework::ProgramDesc> inference_program;
if (IsCombined) {
{
paddle::platform::RecordEvent record_event(
"init_program",
paddle::platform::DeviceContextPool::Instance().Get(place));
if (is_combined) {
// All parameters are saved in a single file.
// Hard-coding the file names of program and parameters in unittest.
// The file names should be consistent with that used in Python API
// `fluid.io.save_inference_model`.
std::string prog_filename = "__model_combined__";
std::string param_filename = "__params_combined__";
inference_program = paddle::inference::Load(executor,
inference_program =
paddle::inference::Load(executor,
*scope,
dirname + "/" + prog_filename,
dirname + "/" + param_filename);
} else {
// Parameters are saved in separate files sited in the specified `dirname`.
// Parameters are saved in separate files sited in the specified
// `dirname`.
inference_program = paddle::inference::Load(executor, *scope, dirname);
}
}
// 3. Get the feed_target_names and fetch_target_names
const std::vector<std::string>& feed_target_names =
......@@ -134,7 +164,21 @@ void TestInference(const std::string& dirname,
}
// 6. Run the inference program
{
// Run repeat times to profile the performance
for (int i = 0; i < repeat; ++i) {
paddle::platform::RecordEvent record_event(
"run_inference",
paddle::platform::DeviceContextPool::Instance().Get(place));
executor.Run(*inference_program, scope, feed_targets, fetch_targets);
}
}
// Disable the profiler and print the timing information
paddle::platform::DisableProfiler(paddle::platform::EventSortingKey::kDefault,
"profiler.txt");
paddle::platform::ResetProfiler();
delete scope;
}
file(GLOB GENERAL_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*_op.cc")
string(REPLACE "_mkldnn" "" GENERAL_OPS "${GENERAL_OPS}")
string(REPLACE ".cc" "" GENERAL_OPS "${GENERAL_OPS}")
list(REMOVE_DUPLICATES GENERAL_OPS)
set(DEPS_OPS "")
set(pybind_file ${PADDLE_SOURCE_DIR}/paddle/fluid/pybind/pybind.h)
file(WRITE ${pybind_file} "// Generated by the paddle/operator/CMakeLists.txt. DO NOT EDIT!\n\n")
......@@ -13,6 +15,8 @@ function(op_library TARGET)
set(cu_cc_srcs)
set(cudnn_cu_cc_srcs)
set(CUDNN_FILE)
set(mkldnn_cc_srcs)
set(MKLDNN_FILE)
set(op_common_deps operator op_registry math_function)
set(options "")
set(oneValueArgs "")
......@@ -36,12 +40,20 @@ function(op_library TARGET)
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${CUDNN_FILE}.cu.cc)
list(APPEND cudnn_cu_cc_srcs ${CUDNN_FILE}.cu.cc)
endif()
if(WITH_MKLDNN)
string(REPLACE "_op" "_mkldnn_op" MKLDNN_FILE "${TARGET}")
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${MKLDNN_FILE}.cc)
list(APPEND mkldnn_cc_srcs ${MKLDNN_FILE}.cc)
endif()
endif()
else()
foreach(src ${op_library_SRCS})
if (${src} MATCHES ".*\\.cu$")
list(APPEND cu_srcs ${src})
elseif(${src} MATCHES ".*_cudnn_op.cu.cc$")
list(APPEND cudnn_cu_cc_srcs ${src})
elseif(WITH_MKLDNN AND ${src} MATCHES ".*_mkldnn_op.cc$")
list(APPEND mkldnn_cc_srcs ${src})
elseif(${src} MATCHES ".*\\.cu.cc$")
list(APPEND cu_cc_srcs ${src})
elseif(${src} MATCHES ".*\\.cc$")
......@@ -62,15 +74,15 @@ function(op_library TARGET)
set(DEPS_OPS ${TARGET} ${DEPS_OPS} PARENT_SCOPE)
endif()
if (WITH_GPU)
nv_library(${TARGET} SRCS ${cc_srcs} ${cu_cc_srcs} ${cudnn_cu_cc_srcs} ${cu_srcs} DEPS ${op_library_DEPS}
nv_library(${TARGET} SRCS ${cc_srcs} ${cu_cc_srcs} ${cudnn_cu_cc_srcs} ${mkldnn_cc_srcs} ${cu_srcs} DEPS ${op_library_DEPS}
${op_common_deps})
else()
cc_library(${TARGET} SRCS ${cc_srcs} DEPS ${op_library_DEPS}
cc_library(${TARGET} SRCS ${cc_srcs} ${mkldnn_cc_srcs} DEPS ${op_library_DEPS}
${op_common_deps})
endif()
# Define operators that don't need pybind here.
foreach(manual_pybind_op "net_op" "compare_op" "logical_op" "nccl_op" "tensor_array_read_write_op" "create_reader_op")
foreach(manual_pybind_op "net_op" "compare_op" "logical_op" "nccl_op" "tensor_array_read_write_op")
if ("${TARGET}" STREQUAL "${manual_pybind_op}")
set(pybind_flag 1)
endif()
......@@ -101,7 +113,8 @@ function(op_library TARGET)
# pybind USE_CPU_ONLY_OP
list(LENGTH cu_srcs cu_srcs_len)
list(LENGTH cu_cc_srcs cu_cc_srcs_len)
if (${pybind_flag} EQUAL 0 AND ${cu_srcs_len} EQUAL 0 AND ${cu_cc_srcs_len} EQUAL 0)
list(LENGTH mkldnn_cc_srcs mkldnn_cc_srcs_len)
if (${pybind_flag} EQUAL 0 AND ${mkldnn_cc_srcs_len} EQUAL 0 AND ${cu_srcs_len} EQUAL 0 AND ${cu_cc_srcs_len} EQUAL 0)
file(APPEND ${pybind_file} "USE_CPU_ONLY_OP(${TARGET});\n")
set(pybind_flag 1)
endif()
......@@ -112,6 +125,11 @@ function(op_library TARGET)
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, CUDNN);\n")
endif()
# pybind USE_OP_DEVICE_KERNEL for MKLDNN
if (WITH_MKLDNN AND ${mkldnn_cc_srcs_len} GREATER 0)
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, MKLDNN);\n")
endif()
# pybind USE_OP
if (${pybind_flag} EQUAL 0)
file(APPEND ${pybind_file} "USE_OP(${TARGET});\n")
......@@ -128,8 +146,8 @@ else()
set(DEPS_OPS ${DEPS_OPS} nccl_op)
endif()
add_subdirectory(detail)
if(WITH_DISTRIBUTE)
add_subdirectory(detail)
set(DISTRIBUTE_DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf)
set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
op_library(send_op DEPS ${DISTRIBUTE_DEPS})
......@@ -170,26 +188,31 @@ op_library(recurrent_op DEPS executor)
op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale)
op_library(cos_sim_op DEPS cos_sim_functor)
op_library(parallel_do_op DEPS executor)
op_library(create_reader_op DEPS reader)
if (WITH_GPU)
op_library(conv_op DEPS vol2col depthwise_conv)
op_library(conv_op DEPS vol2col depthwise_conv im2col)
else()
op_library(conv_op DEPS vol2col)
op_library(conv_op DEPS vol2col im2col)
endif()
op_library(conv_transpose_op DEPS vol2col)
op_library(conv_transpose_op DEPS vol2col im2col)
# FIXME(typhoonzero): save/load depends lodtensor serialization functions
op_library(save_op DEPS lod_tensor)
op_library(load_op DEPS lod_tensor)
op_library(save_combine_op DEPS lod_tensor)
op_library(load_combine_op DEPS lod_tensor)
op_library(concat_op DEPS concat)
list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS})
foreach(src ${GENERAL_OPS})
op_library(${src})
endforeach()
file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(logical_and);\nUSE_NO_KERNEL_OP(read_from_array);\nUSE_NO_KERNEL_OP(create_random_data_generator);\n")
file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(logical_and);\nUSE_NO_KERNEL_OP(read_from_array);\n")
add_subdirectory(reader)
foreach(src ${READER_LIBRARY})
set(OP_LIBRARY ${src} ${OP_LIBRARY})
endforeach()
set(GLOB_OP_LIB ${OP_LIBRARY} CACHE INTERNAL "Global OP library")
......
......@@ -63,13 +63,27 @@ class CastOpGradMaker : public framework::SingleGradOpDescMaker {
}
};
class CastOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override {
framework::OpKernelType kt = OperatorWithKernel::GetExpectedKernelType(ctx);
// CastOp kernel's device type is decided by input tensor place
kt.place_ = ctx.Input<framework::LoDTensor>("X")->place();
return kt;
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
using CPU = paddle::platform::CPUDeviceContext;
REGISTER_OP_WITH_KERNEL(cast, ops::CastOpGradMaker, ops::CastOpInferShape,
ops::CastOpProtoMaker);
REGISTER_OPERATOR(cast, ops::CastOp, ops::CastOpGradMaker,
ops::CastOpInferShape, ops::CastOpProtoMaker);
REGISTER_OP_CPU_KERNEL(cast, ops::CastOpKernel<CPU, float>,
ops::CastOpKernel<CPU, double>,
ops::CastOpKernel<CPU, int>,
......
......@@ -100,7 +100,8 @@ class ConcatOpGrad : public framework::OperatorWithKernel {
namespace ops = paddle::operators;
REGISTER_OP_EX(concat, ops::ConcatOp, ops::ConcatOpMaker, concat_grad,
ops::ConcatOpGrad, false)
REGISTER_OP_CPU_KERNEL(concat,
ops::ConcatKernel<paddle::platform::CPUPlace, float>)
REGISTER_OP_CPU_KERNEL(concat_grad,
ops::ConcatGradKernel<paddle::platform::CPUPlace, float>)
REGISTER_OP_CPU_KERNEL(
concat, ops::ConcatKernel<paddle::platform::CPUDeviceContext, float>)
REGISTER_OP_CPU_KERNEL(
concat_grad,
ops::ConcatGradKernel<paddle::platform::CPUDeviceContext, float>)
......@@ -17,6 +17,7 @@ limitations under the License. */
#include <utility>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/concat.h"
#include "paddle/fluid/operators/strided_memcpy.h"
namespace paddle {
......@@ -27,54 +28,30 @@ class ConcatKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto ins = ctx.MultiInput<framework::Tensor>("X");
auto* out = ctx.Output<framework::Tensor>("Out");
framework::Tensor* out = ctx.Output<framework::Tensor>("Out");
int64_t axis = static_cast<int64_t>(ctx.Attr<int>("axis"));
auto place = ctx.GetPlace();
out->mutable_data<T>(place);
auto out_stride = framework::stride_numel(out->dims());
// Sometimes direct copies will be faster, this maybe need deeply analysis.
if (axis == 0 && ins.size() < 10) {
size_t output_offset = 0;
// If axis >=1, copy to out immediately need to call many times
// of cuda memcpy. Copy the input to cpu and do the stride copy,
// then copy to gpu output.
if (platform::is_gpu_place(place) && axis >= 1) {
platform::CPUPlace copy_place;
auto& cpu_ctx = *platform::DeviceContextPool::Instance().Get(copy_place);
framework::Tensor cpu_out;
cpu_out.Resize(out->dims());
cpu_out.mutable_data<T>(copy_place);
auto& dev_ctx = ctx.device_context();
std::vector<std::unique_ptr<framework::Tensor>> cpu_ins;
for (auto* in : ins) {
std::unique_ptr<framework::Tensor> cpu_in(new framework::Tensor);
framework::TensorCopy(*in, copy_place, dev_ctx, cpu_in.get());
cpu_ins.emplace_back(std::move(cpu_in));
}
// TODO(dzhwinter): overlap copy and compute stream
// https://devblogs.nvidia.com/how-overlap-data-transfers-cuda-cc/
dev_ctx.Wait();
for (auto& in : cpu_ins) {
auto& cpu_in = *in.get();
auto in_stride = framework::stride_numel(cpu_in.dims());
StridedNumelCopyWithAxis<T>(
cpu_ctx, axis, cpu_out.data<T>() + output_offset, out_stride,
cpu_in.data<T>(), in_stride, in_stride[axis]);
output_offset += in_stride[axis];
}
framework::TensorCopy(cpu_out, place, dev_ctx, out);
} else {
for (auto* in : ins) {
auto in_stride = framework::stride_numel(in->dims());
auto out_stride = framework::stride_numel(out->dims());
StridedNumelCopyWithAxis<T>(ctx.device_context(), axis,
out->data<T>() + output_offset, out_stride,
in->data<T>(), in_stride, in_stride[axis]);
output_offset += in_stride[axis];
}
} else {
std::vector<framework::Tensor> inputs(ins.size());
for (size_t j = 0; j < ins.size(); ++j) {
inputs[j] = *ins[j];
}
auto& dev_ctx = ctx.template device_context<DeviceContext>();
paddle::operators::math::ConcatFunctor<DeviceContext, T> concat_functor;
concat_functor(dev_ctx, inputs, static_cast<int>(axis), out);
}
}
};
......@@ -86,6 +63,9 @@ class ConcatGradKernel : public framework::OpKernel<T> {
auto* in = ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
auto outs = ctx.MultiOutput<framework::Tensor>(framework::GradVarName("X"));
int64_t axis = static_cast<int64_t>(ctx.Attr<int>("axis"));
// Sometimes direct copies will be faster, this maybe need deeply analysis.
if (axis == 0 && outs.size() < 10) {
size_t input_offset = 0;
auto in_stride = framework::stride_numel(in->dims());
......@@ -97,6 +77,18 @@ class ConcatGradKernel : public framework::OpKernel<T> {
in_stride, out_stride[axis]);
input_offset += out_stride[axis];
}
} else {
std::vector<framework::Tensor> outputs(outs.size());
for (size_t j = 0; j < outs.size(); ++j) {
outs[j]->mutable_data<T>(ctx.GetPlace());
outputs[j] = *outs[j];
}
auto& dev_ctx = ctx.template device_context<DeviceContext>();
paddle::operators::math::ConcatGradFunctor<DeviceContext, T>
concat_grad_functor;
concat_grad_functor(dev_ctx, *in, static_cast<int>(axis), outputs);
}
}
};
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/conv_op.h"
#include "paddle/fluid/platform/mkldnn_helper.h"
namespace paddle {
namespace operators {
template <typename T>
class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
public:
void Compute(const paddle::framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
"It must use CPUPlace.");
auto& dev_ctx =
ctx.template device_context<paddle::platform::MKLDNNDeviceContext>();
const auto& mkldnn_engine = dev_ctx.GetEngine();
auto* input = ctx.Input<Tensor>("Input");
auto* filter = ctx.Input<Tensor>("Filter");
auto* output = ctx.Output<Tensor>("Output");
// Get an unique name from "argument" name of "Output" variable
// This name will be used as key when saving info into device context
const std::string key = ctx.op().Output("Output");
const std::string key_conv_pd = key + "@conv_pd";
std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
int groups = ctx.Attr<int>("groups");
// TODO(pzelazko-intel) add support for group convolution and dilation
PADDLE_ENFORCE(groups == 1, "group convolution is not implemented yet");
PADDLE_ENFORCE(
dilations.size() == 2 && dilations[0] == 1 && dilations[1] == 1,
"dilation in convolution is not implemented yet");
const T* input_data = input->data<T>();
const T* filter_data = filter->data<T>();
T* output_data = output->mutable_data<T>(ctx.GetPlace());
PADDLE_ENFORCE(input->dims().size() == 4,
"Input must be with 4 dimensions, i.e. NCHW");
PADDLE_ENFORCE(filter->dims().size() == 4,
"Filter must be with 4 dimensions, i.e. OIHW");
std::vector<int> src_tz = paddle::framework::vectorize2int(input->dims());
std::vector<int> weights_tz =
paddle::framework::vectorize2int(filter->dims());
std::vector<int> dst_tz = paddle::framework::vectorize2int(output->dims());
// TODO(pzelazko-intel): support more formats
auto src_md = platform::MKLDNNMemDesc(
src_tz, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw);
auto weights_md =
platform::MKLDNNMemDesc(weights_tz, mkldnn::memory::data_type::f32,
mkldnn::memory::format::oihw);
auto dst_md = platform::MKLDNNMemDesc(
dst_tz, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw);
auto src_memory =
mkldnn::memory({src_md, mkldnn_engine}, (void*)input_data);
auto weights_memory =
mkldnn::memory({weights_md, mkldnn_engine}, (void*)filter_data);
auto dst_memory = mkldnn::memory({dst_md, mkldnn_engine}, output_data);
std::shared_ptr<mkldnn::convolution_forward::primitive_desc> conv_pd =
ConvFwdPrimitiveDesc(src_md, weights_md, dst_md, strides, paddings,
mkldnn_engine);
// save conv_pd into global device context to be referred in backward path
dev_ctx.SetBlob(key_conv_pd, conv_pd);
// create convolution op primitive
auto conv_prim = mkldnn::convolution_forward(*conv_pd, src_memory,
weights_memory, dst_memory);
// push primitive to stream and wait until it's executed
std::vector<mkldnn::primitive> pipeline{conv_prim};
mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
}
private:
std::unique_ptr<mkldnn::convolution_forward::primitive_desc>
ConvFwdPrimitiveDesc(const mkldnn::memory::desc& src,
const mkldnn::memory::desc& weights,
const mkldnn::memory::desc& dst,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const mkldnn::engine& engine) const {
mkldnn::memory::dims stride_dims = {strides[0], strides[1]};
mkldnn::memory::dims padding_dims = {paddings[0], paddings[1]};
auto conv_desc = mkldnn::convolution_forward::desc(
mkldnn::prop_kind::forward, mkldnn::convolution_direct, src, weights,
dst, stride_dims, padding_dims, padding_dims,
mkldnn::padding_kind::zero);
auto p_conv_pd =
new mkldnn::convolution_forward::primitive_desc(conv_desc, engine);
return std::unique_ptr<mkldnn::convolution_forward::primitive_desc>(
p_conv_pd);
}
};
template <typename T>
class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
public:
void Compute(const paddle::framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
"It must use CPUPlace.");
auto& dev_ctx =
ctx.template device_context<platform::MKLDNNDeviceContext>();
const auto& mkldnn_engine = dev_ctx.GetEngine();
const Tensor* input = ctx.Input<Tensor>("Input");
const Tensor* filter = ctx.Input<Tensor>("Filter");
const Tensor* output = ctx.Input<Tensor>("Output");
const Tensor* output_grad =
ctx.Input<Tensor>(framework::GradVarName("Output"));
Tensor* input_grad = ctx.Output<Tensor>(framework::GradVarName("Input"));
Tensor* filter_grad = ctx.Output<Tensor>(framework::GradVarName("Filter"));
if (!input_grad && !filter_grad) return;
// Get an unique name from "argument" name of "Output" variable
// This name will be used as key when saving info into device context
const std::string key = ctx.op().Input("Output");
const std::string key_conv_pd = key + "@conv_pd";
std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
const T* input_data = input->data<T>();
const T* filter_data = filter->data<T>();
const T* output_grad_data = output_grad->data<T>();
T* input_grad_data = nullptr;
T* filter_grad_data = nullptr;
if (input_grad) {
input_grad_data = input_grad->mutable_data<T>(ctx.GetPlace());
}
if (filter_grad) {
filter_grad_data = filter_grad->mutable_data<T>(ctx.GetPlace());
}
std::vector<int> src_tz = paddle::framework::vectorize2int(input->dims());
std::vector<int> weights_tz =
paddle::framework::vectorize2int(filter->dims());
std::vector<int> dst_tz = paddle::framework::vectorize2int(output->dims());
// TODO(pzelazko-intel): support more formats
auto src_md = platform::MKLDNNMemDesc(
src_tz, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw);
auto diff_src_md = platform::MKLDNNMemDesc(
src_tz, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw);
auto weights_md =
platform::MKLDNNMemDesc(weights_tz, mkldnn::memory::data_type::f32,
mkldnn::memory::format::oihw);
auto diff_weights_md =
platform::MKLDNNMemDesc(weights_tz, mkldnn::memory::data_type::f32,
mkldnn::memory::format::oihw);
auto diff_dst_md = platform::MKLDNNMemDesc(
dst_tz, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw);
// create memory
auto diff_dst_memory = mkldnn::memory({diff_weights_md, mkldnn_engine},
(void*)output_grad_data);
// Retrieve conv_pd from device context
auto conv_pd =
std::static_pointer_cast<mkldnn::convolution_forward::primitive_desc>(
dev_ctx.GetBlob(key_conv_pd));
PADDLE_ENFORCE(conv_pd != nullptr,
"Fail to find conv_pd in device context");
// create backward conv primitive for weights
if (filter_grad) {
// create primitive descriptor
mkldnn::convolution_backward_weights::primitive_desc conv_bwd_weights_pd =
ConvBwdWeightsPrimitiveDesc(src_md, diff_weights_md, diff_dst_md,
strides, paddings, *conv_pd,
mkldnn_engine);
// create memory
auto diff_weights_memory = mkldnn::memory(
{diff_weights_md, mkldnn_engine}, (void*)filter_grad_data);
auto src_memory =
mkldnn::memory({src_md, mkldnn_engine}, (void*)input_data);
// create backward conv primitive for weights
auto conv_bwd_weights_prim = mkldnn::convolution_backward_weights(
conv_bwd_weights_pd, src_memory, diff_dst_memory,
diff_weights_memory);
// push primitive and execute it
std::vector<mkldnn::primitive> pipeline{conv_bwd_weights_prim};
mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
}
if (input_grad) {
// create primitive descriptor
mkldnn::convolution_backward_data::primitive_desc conv_bwd_data_pd =
ConvBwdDataPrimitiveDesc(diff_src_md, weights_md, diff_dst_md,
strides, paddings, *conv_pd, mkldnn_engine);
// create memory
auto diff_src_memory =
mkldnn::memory({diff_src_md, mkldnn_engine}, (void*)input_grad_data);
auto weights_memory =
mkldnn::memory({weights_md, mkldnn_engine}, (void*)filter_data);
// create backward conv primitive for data
auto conv_bwd_data_prim = mkldnn::convolution_backward_data(
conv_bwd_data_pd, diff_dst_memory, weights_memory, diff_src_memory);
// push primitive to stream and wait until it's executed
std::vector<mkldnn::primitive> pipeline{conv_bwd_data_prim};
mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
}
} // Compute()
private:
mkldnn::convolution_backward_weights::primitive_desc
ConvBwdWeightsPrimitiveDesc(
const mkldnn::memory::desc& src, const mkldnn::memory::desc& diff_weights,
const mkldnn::memory::desc& diff_dst, const std::vector<int>& strides,
const std::vector<int>& paddings,
const mkldnn::convolution_forward::primitive_desc& conv_pd,
const mkldnn::engine& engine) const {
auto conv_bwd_weights_desc = mkldnn::convolution_backward_weights::desc(
mkldnn::convolution_direct, src, diff_weights, diff_dst, strides,
paddings, paddings, mkldnn::padding_kind::zero);
return mkldnn::convolution_backward_weights::primitive_desc(
conv_bwd_weights_desc, engine, conv_pd);
}
mkldnn::convolution_backward_data::primitive_desc ConvBwdDataPrimitiveDesc(
const mkldnn::memory::desc& diff_src, const mkldnn::memory::desc& weights,
const mkldnn::memory::desc& diff_dst, const std::vector<int>& strides,
const std::vector<int>& paddings,
const mkldnn::convolution_forward::primitive_desc& conv_pd,
const mkldnn::engine& engine) const {
auto conv_bwd_data_desc = mkldnn::convolution_backward_data::desc(
mkldnn::convolution_direct, diff_src, weights, diff_dst, strides,
paddings, paddings, mkldnn::padding_kind::zero);
return mkldnn::convolution_backward_data::primitive_desc(conv_bwd_data_desc,
engine, conv_pd);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_KERNEL(conv2d, MKLDNN, ::paddle::platform::CPUPlace,
ops::ConvMKLDNNOpKernel<float>);
REGISTER_OP_KERNEL(conv2d_grad, MKLDNN, ::paddle::platform::CPUPlace,
ops::ConvMKLDNNGradOpKernel<float>);
......@@ -13,6 +13,12 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/conv_op.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/cudnn_helper.h"
#endif
#ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h"
#endif
namespace paddle {
namespace operators {
......@@ -64,22 +70,21 @@ void ConvOp::InferShape(framework::InferShapeContext* ctx) const {
framework::OpKernelType ConvOp::GetExpectedKernelType(
const framework::ExecutionContext& ctx) const {
bool use_cudnn = ctx.Attr<bool>("use_cudnn");
use_cudnn &= platform::is_gpu_place(ctx.GetPlace());
framework::LibraryType library_{framework::LibraryType::kPlain};
#ifdef PADDLE_WITH_CUDA
if (platform::is_gpu_place(ctx.GetPlace())) {
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
use_cudnn &= dev_ctx.cudnn_handle() != nullptr;
if (platform::CanCUDNNBeUsed(ctx)) {
library_ = framework::LibraryType::kCUDNN;
}
#endif
framework::LibraryType library_;
if (use_cudnn) {
library_ = framework::LibraryType::kCUDNN;
} else {
library_ = framework::LibraryType::kPlain;
#ifdef PADDLE_WITH_MKLDNN
if (library_ == framework::LibraryType::kPlain &&
platform::CanMKLDNNBeUsed(ctx)) {
library_ = framework::LibraryType::kMKLDNN;
}
#endif
std::string data_format = ctx.Attr<std::string>("data_format");
// TODO(pzelazko-intel): enable MKLDNN layout when it's ready
framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
return framework::OpKernelType(
framework::ToDataType(ctx.Input<Tensor>("Input")->type()), ctx.GetPlace(),
......@@ -131,6 +136,9 @@ Conv2DOpMaker::Conv2DOpMaker(OpProto* proto, OpAttrChecker* op_checker)
"use_cudnn",
"(bool, default false) Only used in cudnn kernel, need install cudnn")
.SetDefault(false);
AddAttr<bool>("use_mkldnn",
"(bool, default false) Only used in mkldnn kernel")
.SetDefault(false);
AddAttr<std::string>(
"data_format",
"(string, default NCHW) Only used in "
......@@ -224,6 +232,9 @@ Conv3DOpMaker::Conv3DOpMaker(OpProto* proto, OpAttrChecker* op_checker)
"use_cudnn",
"(bool, default false) Only used in cudnn kernel, need install cudnn")
.SetDefault(false);
AddAttr<bool>("use_mkldnn",
"(bool, default false) Only used in mkldnn kernel")
.SetDefault(false);
AddAttr<std::string>(
"data_format",
"(string, default NCHW) Only used in "
......@@ -284,23 +295,21 @@ void ConvOpGrad::InferShape(framework::InferShapeContext* ctx) const {
framework::OpKernelType ConvOpGrad::GetExpectedKernelType(
const framework::ExecutionContext& ctx) const {
bool use_cudnn = ctx.Attr<bool>("use_cudnn");
use_cudnn &= platform::is_gpu_place(ctx.GetPlace());
framework::LibraryType library_{framework::LibraryType::kPlain};
#ifdef PADDLE_WITH_CUDA
if (platform::is_gpu_place(ctx.GetPlace())) {
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
use_cudnn &= dev_ctx.cudnn_handle() != nullptr;
if (platform::CanCUDNNBeUsed(ctx)) {
library_ = framework::LibraryType::kCUDNN;
}
#endif
framework::LibraryType library_;
if (use_cudnn) {
library_ = framework::LibraryType::kCUDNN;
} else {
library_ = framework::LibraryType::kPlain;
#ifdef PADDLE_WITH_MKLDNN
if (library_ == framework::LibraryType::kPlain &&
platform::CanMKLDNNBeUsed(ctx)) {
library_ = framework::LibraryType::kMKLDNN;
}
#endif
std::string data_format = ctx.Attr<std::string>("data_format");
// TODO(pzelazko-intel): enable MKLDNN layout when it's ready
framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
return framework::OpKernelType(
framework::ToDataType(ctx.Input<Tensor>("Input")->type()), ctx.GetPlace(),
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/reader.h"
namespace paddle {
namespace operators {
static std::vector<framework::DDim> RestoreShapes(
const std::vector<int>& shape_concat, const std::vector<int>& ranks) {
std::vector<framework::DDim> res;
int offset = 0;
for (int len : ranks) {
auto start_it = shape_concat.begin() + offset;
auto end_it = start_it + len;
res.push_back(framework::make_ddim(std::vector<int>(start_it, end_it)));
offset += len;
}
return res;
}
// general infershape for file readers
class CreateFileReaderInferShape : public framework::InferShapeBase {
public:
void operator()(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasOutput("Out"),
"The output file reader should not be null.");
const auto shape_concat =
ctx->Attrs().Get<std::vector<int>>("shape_concat");
const auto ranks = ctx->Attrs().Get<std::vector<int>>("ranks");
std::vector<framework::DDim> shapes = RestoreShapes(shape_concat, ranks);
ctx->SetReaderDims("Out", shapes);
if (ctx->IsRuntime()) {
const auto lod_levels = ctx->Attrs().Get<std::vector<int>>("lod_levels");
PADDLE_ENFORCE_EQ(
lod_levels.size(), shapes.size(),
"The number of 'lod_levels'(%d) doesn't match the number "
"of 'shapes'(%d).",
lod_levels.size(), shapes.size());
framework::VarDesc* reader =
boost::get<framework::VarDesc*>(ctx->GetOutputVarPtrs("Out")[0]);
reader->SetLoDLevels(lod_levels);
}
}
};
// general infershape for decorated readers
class CreateDecoratedReaderInferShape : public framework::InferShapeBase {
public:
void operator()(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("UnderlyingReader"),
"Input(UnderlyingReader) should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Out"),
"The output decorated reader should not be null.");
ctx->SetReaderDims("Out", ctx->GetReaderDims("UnderlyingReader"));
if (ctx->IsRuntime()) {
framework::VarDesc* in_reader = boost::get<framework::VarDesc*>(
ctx->GetInputVarPtrs("UnderlyingReader")[0]);
framework::VarDesc* out_reader =
boost::get<framework::VarDesc*>(ctx->GetOutputVarPtrs("Out")[0]);
out_reader->SetLoDLevels(in_reader->GetLoDLevels());
}
}
};
// general var type inference for file readers
class CreateFileReaderInferVarType : public framework::VarTypeInference {
public:
void operator()(const framework::OpDesc& op_desc,
framework::BlockDesc* block) const override {
std::string reader_name = op_desc.Output("Out")[0];
framework::VarDesc* reader = block->FindVarRecursive(reader_name);
reader->SetType(framework::proto::VarType::READER);
}
};
// general var type inference for decorated readers
class CreateDecoratedReaderInferVarType : public framework::VarTypeInference {
public:
void operator()(const framework::OpDesc& op_desc,
framework::BlockDesc* block) const override {
std::string in_reader_name = op_desc.Input("UnderlyingReader")[0];
framework::VarDesc* in_reader = block->FindVarRecursive(in_reader_name);
std::string out_reader_name = op_desc.Output("Out")[0];
framework::VarDesc* out_reader = block->FindVarRecursive(out_reader_name);
out_reader->SetType(framework::proto::VarType::READER);
out_reader->SetDataTypes(in_reader->GetDataTypes());
}
};
template <typename T>
class CreateRandomDataGeneratorOp : public framework::OperatorBase {
public:
using framework::OperatorBase::OperatorBase;
private:
void RunImpl(const framework::Scope& scope,
const platform::Place& dev_place) const override {
const auto& shape_concat = Attr<std::vector<int>>("shape_concat");
const auto& ranks = Attr<std::vector<int>>("ranks");
PADDLE_ENFORCE(!shape_concat.empty() && !ranks.empty());
PADDLE_ENFORCE_EQ(std::accumulate(ranks.begin(), ranks.end(), 0),
int(shape_concat.size()),
"The accumulate of all ranks should be equal to the "
"shape concat's length.");
std::vector<framework::DDim> shapes = RestoreShapes(shape_concat, ranks);
auto* out = scope.FindVar(Output("Out"))
->template GetMutable<framework::ReaderHolder>();
out->Reset(new framework::RandomDataGenerator<T>(shapes, Attr<float>("min"),
Attr<float>("max")));
}
};
class CreateRandomDataGeneratorOpMaker
: public framework::OpProtoAndCheckerMaker {
public:
CreateRandomDataGeneratorOpMaker(OpProto* op_proto, OpAttrChecker* op_checker)
: OpProtoAndCheckerMaker(op_proto, op_checker) {
AddOutput("Out", "(ReaderHolder) The created random reader.");
AddAttr<std::vector<int>>("shape_concat",
"The concat of all data's shapes.");
AddAttr<std::vector<int>>(
"ranks",
"The ranks of each data."
"e.g."
"shape_concat = [2,3,4,5,6]"
"ranks = [3,2]"
"It means the reader will generate two data each time,"
"whose shapes are [2,3,4] and [5,6] respectively.");
AddAttr<std::vector<int>>("lod_levels", "The LoD levels of each data.");
AddAttr<float>("min", "The lower bound of reader's uniform distribution.");
AddAttr<float>("max", "The upper bound of reader's uniform distribution.");
AddComment(R"DOC(
CreateRandomDataGenerator Operator
This Op creates a random reader.
The reader generates random data instead of really reading from files.
Generated data follow an uniform distribution between 'min' and 'max'.
)DOC");
}
};
class CreateShuffleReaderOp : public framework::OperatorBase {
public:
using framework::OperatorBase::OperatorBase;
private:
void RunImpl(const framework::Scope& scope,
const platform::Place& dev_place) const override {
const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader"))
->Get<framework::ReaderHolder>();
auto* out = scope.FindVar(Output("Out"))
->template GetMutable<framework::ReaderHolder>();
out->Reset(new framework::ShuffleReader(underlying_reader.Get(),
Attr<int>("buffer_size")));
}
};
class CreateShuffleReaderOpMaker : public framework::OpProtoAndCheckerMaker {
public:
CreateShuffleReaderOpMaker(OpProto* op_proto, OpAttrChecker* op_checker)
: OpProtoAndCheckerMaker(op_proto, op_checker) {
AddInput(
"UnderlyingReader",
"(ReaderHolder) The underlying reader for creating a shuffle reader.");
AddOutput("Out", "(ReaderHolder) The created shuffle reader.");
AddAttr<int>("buffer_size", "The shuffle buffer size.").GreaterThan(0);
AddComment(R"DOC(
CreateShuffleReader Operator
A shuffle reader takes another reader as its 'underlying reader'
and yields the underlying reader's outputs in a shuffled order.
)DOC");
}
};
class CreateBatchReaderOp : public framework::OperatorBase {
public:
using framework::OperatorBase::OperatorBase;
private:
void RunImpl(const framework::Scope& scope,
const platform::Place& dev_place) const override {
const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader"))
->Get<framework::ReaderHolder>();
auto* out = scope.FindVar(Output("Out"))
->template GetMutable<framework::ReaderHolder>();
out->Reset(new framework::BatchReader(underlying_reader.Get(),
Attr<int>("batch_size")));
}
};
class CreateBatchReaderOpMaker : public framework::OpProtoAndCheckerMaker {
public:
CreateBatchReaderOpMaker(OpProto* op_proto, OpAttrChecker* op_checker)
: OpProtoAndCheckerMaker(op_proto, op_checker) {
AddInput(
"UnderlyingReader",
"(ReaderHolder) The underlying reader for creating a batch reader.");
AddOutput("Out", "(ReaderHolder) The created batch reader.");
AddAttr<int>("batch_size",
"How many instances the batch reader yields each time.")
.GreaterThan(0);
AddComment(R"DOC(
CreateBatchReader Operator
A batch reader takes another reader as its 'underlying reader',
gathers the underlying reader's outputs and then yields them in batches.
)DOC");
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(create_random_data_generator,
ops::CreateRandomDataGeneratorOp<float>,
ops::CreateFileReaderInferShape,
ops::CreateRandomDataGeneratorOpMaker,
paddle::framework::EmptyGradOpMaker,
ops::CreateFileReaderInferVarType);
REGISTER_OPERATOR(create_shuffle_reader, ops::CreateShuffleReaderOp,
ops::CreateDecoratedReaderInferShape,
ops::CreateShuffleReaderOpMaker,
paddle::framework::EmptyGradOpMaker,
ops::CreateDecoratedReaderInferVarType);
REGISTER_OPERATOR(create_batch_reader, ops::CreateBatchReaderOp,
ops::CreateDecoratedReaderInferShape,
ops::CreateBatchReaderOpMaker,
paddle::framework::EmptyGradOpMaker,
ops::CreateDecoratedReaderInferVarType);
grpc_library(sendrecvop_grpc SRCS sendrecvop_utils.cc grpc_client.cc grpc_server.cc PROTO send_recv.proto DEPS lod_tensor selected_rows)
if(WITH_DISTRIBUTE)
grpc_library(sendrecvop_grpc SRCS bytebuffer_stream.cc sendrecvop_utils.cc grpc_client.cc grpc_server.cc PROTO send_recv.proto DEPS lod_tensor selected_rows)
set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
set_source_files_properties(test_serde.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
cc_test(serde_test SRCS test_serde.cc DEPS grpc++_unsecure grpc_unsecure gpr cares zlib protobuf sendrecvop_grpc)
endif()
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
// NOTE: This file was originally created by tensorflow
// (https://github.com/tensorflow/tensorflow/) we borrow this
// file and did some modifications so that we can send gRPC
// requests without too much copying of the tensor data.
#include "bytebuffer_stream.h"
namespace paddle {
namespace operators {
namespace detail {
GrpcByteBufferSource::GrpcByteBufferSource() {}
bool GrpcByteBufferSource::Init(const grpc::ByteBuffer& src) {
cur_ = -1;
left_ = 0;
ptr_ = nullptr;
byte_count_ = 0;
bool ok = src.Dump(&slices_).ok();
if (!ok) {
slices_.clear();
}
return ok;
}
bool GrpcByteBufferSource::Next(const void** data, int* size) {
// Use loop instead of if in case buffer contained empty slices.
while (left_ == 0) {
// Advance to next slice.
cur_++;
if (cur_ >= slices_.size()) {
return false;
}
const ::grpc::Slice& s = slices_[cur_];
left_ = s.size();
ptr_ = reinterpret_cast<const char*>(s.begin());
}
*data = ptr_;
*size = left_;
byte_count_ += left_;
ptr_ += left_;
left_ = 0;
return true;
}
void GrpcByteBufferSource::BackUp(int count) {
ptr_ -= count;
left_ += count;
byte_count_ -= count;
}
bool GrpcByteBufferSource::Skip(int count) {
const void* data;
int size;
while (Next(&data, &size)) {
if (size >= count) {
BackUp(size - count);
return true;
}
// size < count;
count -= size;
}
// error or we have too large count;
return false;
}
google::protobuf::int64 GrpcByteBufferSource::ByteCount() const {
return byte_count_;
}
} // namespace detail
} // namespace operators
} // namespace paddle
\ No newline at end of file
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
// NOTE: This file was originally created by tensorflow
// (https://github.com/tensorflow/tensorflow/) we borrow this
// file and did some modifications so that we can send gRPC
// requests without too much copying of the tensor data.
#pragma once
#include <grpc++/grpc++.h>
#include "google/protobuf/io/coded_stream.h"
#include "google/protobuf/io/zero_copy_stream.h"
namespace paddle {
namespace operators {
namespace detail {
// A ZeroCopyInputStream that reads from a grpc::ByteBuffer.
class GrpcByteBufferSource
: public ::google::protobuf::io::ZeroCopyInputStream {
public:
GrpcByteBufferSource();
bool Init(const ::grpc::ByteBuffer& src); // Can be called multiple times.
bool Next(const void** data, int* size) override;
void BackUp(int count) override;
bool Skip(int count) override;
::google::protobuf::int64 ByteCount() const override;
private:
std::vector<::grpc::Slice> slices_;
size_t cur_; // Current slice index.
int left_; // Number of bytes in slices_[cur_] left to yield.
const char* ptr_; // Address of next byte in slices_[cur_] to yield.
::google::protobuf::int64 byte_count_;
};
} // namespace detail
} // namespace operators
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
// NOTE: This file was originally created by tensorflow
// (https://github.com/tensorflow/tensorflow/) we borrow this
// file and did some modifications so that we can send gRPC
// requests without too much copying of the tensor data.
#pragma once
#include <grpc++/grpc++.h>
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace operators {
namespace detail {
char* EncodeVarint32(char* dst, uint32_t v) {
// Operate on characters as unsigneds
unsigned char* ptr = reinterpret_cast<unsigned char*>(dst);
static const int B = 128;
if (v < (1 << 7)) {
*(ptr++) = v;
} else if (v < (1 << 14)) {
*(ptr++) = v | B;
*(ptr++) = v >> 7;
} else if (v < (1 << 21)) {
*(ptr++) = v | B;
*(ptr++) = (v >> 7) | B;
*(ptr++) = v >> 14;
} else if (v < (1 << 28)) {
*(ptr++) = v | B;
*(ptr++) = (v >> 7) | B;
*(ptr++) = (v >> 14) | B;
*(ptr++) = v >> 21;
} else {
*(ptr++) = v | B;
*(ptr++) = (v >> 7) | B;
*(ptr++) = (v >> 14) | B;
*(ptr++) = (v >> 21) | B;
*(ptr++) = v >> 28;
}
return reinterpret_cast<char*>(ptr);
}
char* EncodeVarint64(char* dst, uint64_t v) {
static const int B = 128;
unsigned char* ptr = reinterpret_cast<unsigned char*>(dst);
while (v >= B) {
*(ptr++) = (v & (B - 1)) | B;
v >>= 7;
}
*(ptr++) = static_cast<unsigned char>(v);
return reinterpret_cast<char*>(ptr);
}
int VarintLength(uint64_t v) {
int len = 1;
while (v >= 128) {
v >>= 7;
len++;
}
return len;
}
class ProtoEncodeHelper {
public:
ProtoEncodeHelper(char* buf, int max_size)
: base_(buf), p_(buf), limit_(base_ + max_size) {}
~ProtoEncodeHelper() {
// Make sure callers didn't do operations that went over max_size promised
PADDLE_ENFORCE_LE(p_, limit_);
}
const char* data() const { return base_; }
size_t size() const { return p_ - base_; }
void WriteUint64(int tag, uint64_t v) {
Encode32(combine(tag, WIRETYPE_VARINT));
Encode64(v);
}
void WriteBool(int tag, bool v) {
Encode32(combine(tag, WIRETYPE_VARINT));
EncodeBool(v);
}
void WriteString(int tag, const std::string& v) {
Encode32(combine(tag, WIRETYPE_LENGTH_DELIMITED));
Encode32(v.size());
EncodeBytes(v.data(), v.size());
}
void WriteVarlengthBeginning(int tag, uint32_t len) {
Encode32(combine(tag, WIRETYPE_LENGTH_DELIMITED));
Encode32(len);
}
void WriteRawBytes(const std::string& v) { EncodeBytes(v.data(), v.size()); }
private:
// Note: this module's behavior must match the protocol buffer wire encoding
// format.
enum {
WIRETYPE_VARINT = 0,
WIRETYPE_LENGTH_DELIMITED = 2,
};
static uint32_t combine(uint32_t tag, uint32_t type) {
return ((tag << 3) | type);
}
inline void Encode32(uint32_t v) {
if (v < 128) {
// Fast path for single-byte values. Many of the calls will use a
// constant value for v, so the comparison will get optimized away
// when Encode32 is inlined into the caller.
*p_ = v;
p_++;
} else {
p_ = EncodeVarint32(p_, v);
}
}
void Encode64(uint64_t v) { p_ = EncodeVarint64(p_, v); }
void EncodeBool(bool v) {
*p_ = (v ? 1 : 0); // Equal to varint32 encoding of 0 or 1
p_++;
}
void EncodeBytes(const char* bytes, int N) {
memcpy(p_, bytes, N);
p_ += N;
}
char* base_;
char* p_;
char* limit_; // Just for CHECKs
};
} // detail
} // operators
} // paddle
......@@ -33,10 +33,34 @@ enum VarType {
}
message VariableMessage {
enum Type {
// Pod Types
BOOL = 0;
INT16 = 1;
INT32 = 2;
INT64 = 3;
FP16 = 4;
FP32 = 5;
FP64 = 6;
}
message LodData { repeated int64 lod_data = 1; }
string varname = 1;
// TODO(Yancey1989): reference framework::proto::VarDesc::VarType
VarType type = 2;
bytes serialized = 3;
// bool persistable is not needed for sending.
// tensor info:
Type data_type = 3;
repeated int64 dims = 4;
// lod details:
int64 lod_level = 5;
repeated LodData lod = 6;
// tensor data
bytes serialized = 7;
// selected_rows data
bytes rows = 8;
}
message VoidMessage {}
......@@ -13,6 +13,11 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/detail/sendrecvop_utils.h"
#include "google/protobuf/io/coded_stream.h"
#include "google/protobuf/io/zero_copy_stream.h"
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/operators/detail/bytebuffer_stream.h"
#include "paddle/fluid/operators/detail/proto_encoder_helper.h"
namespace paddle {
namespace operators {
......@@ -63,6 +68,233 @@ void DeserializeFromMessage(const sendrecv::VariableMessage& msg,
}
}
void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
const platform::DeviceContext& ctx,
::grpc::ByteBuffer* msg) {
using VarMsg = sendrecv::VariableMessage;
sendrecv::VariableMessage request;
std::string header;
request.AppendToString(&header);
// When using GPU, need to free the copied CPU buffer
// when the ByteBuffer destroies
// TODO(typhoonzero): add unref here, if we have dependent
// parallelism execution, need to know when to free the tensor.
DestroyCallback destroy_callback = [](void* backing) {};
void* buf = malloc(1024);
void* payload;
size_t payload_size;
ProtoEncodeHelper e((char*)buf, 1024);
e.WriteString(VarMsg::kVarnameFieldNumber, name);
if (var->IsType<framework::LoDTensor>()) {
e.WriteUint64(VarMsg::kTypeFieldNumber, 0);
} else if (var->IsType<framework::SelectedRows>()) {
e.WriteUint64(VarMsg::kTypeFieldNumber, 1);
}
switch (framework::ToVarType(var->Type())) {
case framework::proto::VarType_Type_LOD_TENSOR: {
auto tensor = var->Get<framework::LoDTensor>();
e.WriteUint64(VarMsg::kDataTypeFieldNumber,
framework::ToDataType(tensor.type()));
for (auto& dim : framework::vectorize(tensor.dims())) {
e.WriteUint64(VarMsg::kDimsFieldNumber, dim);
}
auto lod = tensor.lod(); // std::vector<Vector<size_t>>
if (lod.size() > 0) {
e.WriteUint64(VarMsg::kLodLevelFieldNumber, lod.size());
for (auto& each : lod) {
e.WriteVarlengthBeginning(VarMsg::kLodFieldNumber,
2 + // tag + varintlength of submessage
1 + // kLodDataFieldNumber
each.size());
// auto copied from GPU
for (auto& d : each) {
e.WriteUint64(VarMsg::LodData::kLodDataFieldNumber, d);
}
}
}
if (platform::is_gpu_place(ctx.GetPlace())) {
#ifdef PADDLE_WITH_CUDA
PADDLE_ENFORCE(platform::is_gpu_place(tensor.place()));
platform::CPUPlace cpu;
auto& gpu_dev_ctx =
static_cast<const platform::CUDADeviceContext&>(ctx);
auto copy_size = tensor.memory_size();
payload = memory::Alloc(cpu, copy_size);
memory::Copy(cpu, payload,
boost::get<platform::CUDAPlace>(tensor.place()),
reinterpret_cast<const void*>(tensor.data<void>()),
copy_size, gpu_dev_ctx.stream());
ctx.Wait();
destroy_callback = [](void* backing) {
platform::CPUPlace cpu;
memory::Free(cpu, backing);
};
#endif
} else {
payload = tensor.data<void>();
}
payload_size = tensor.memory_size();
e.WriteVarlengthBeginning(VarMsg::kSerializedFieldNumber, payload_size);
} break;
case framework::proto::VarType_Type_SELECTED_ROWS: {
// TODO(typhoonzero): selectedrows implement should not use unique_ptr
auto* slr = var->GetMutable<framework::SelectedRows>();
e.WriteUint64(VarMsg::kDataTypeFieldNumber,
framework::ToDataType(slr->value().type()));
for (auto& dim : framework::vectorize(slr->value().dims())) {
e.WriteUint64(VarMsg::kDimsFieldNumber, dim);
}
e.WriteUint64(VarMsg::kLodLevelFieldNumber, 0);
auto* tensor = slr->mutable_value();
if (platform::is_gpu_place(ctx.GetPlace())) {
#ifdef PADDLE_WITH_CUDA
platform::CPUPlace cpu;
auto& gpu_dev_ctx =
static_cast<const platform::CUDADeviceContext&>(ctx);
auto copy_size = tensor->memory_size();
payload = memory::Alloc(cpu, copy_size);
memory::Copy(cpu, payload,
boost::get<platform::CUDAPlace>(tensor->place()),
reinterpret_cast<const void*>(tensor->data<void>()),
copy_size, gpu_dev_ctx.stream());
ctx.Wait();
destroy_callback = [](void* backing) {
platform::CPUPlace cpu;
memory::Free(cpu, backing);
};
#endif
} else {
payload = slr->mutable_value()->data<void>();
}
payload_size = tensor->memory_size();
e.WriteVarlengthBeginning(VarMsg::kSerializedFieldNumber, payload_size);
} break;
default:
PADDLE_THROW("Serialize does not support type: %s",
typeid(var->Type()).name());
break;
}
// steal reference of tensor data
::grpc::Slice slices[4]; // metadata, tensor, rows meta, rows
int num_slices = 2; // only SelectedRows have rows buffer
slices[0] = ::grpc::Slice(e.size());
memcpy(const_cast<uint8_t*>(slices[0].begin()), e.data(), e.size());
slices[1] = ::grpc::Slice(
grpc_slice_new_with_user_data(payload, payload_size, destroy_callback,
static_cast<char*>(payload)),
::grpc::Slice::STEAL_REF);
if (framework::ToVarType(var->Type()) ==
framework::proto::VarType_Type_SELECTED_ROWS) {
auto* slr = var->GetMutable<framework::SelectedRows>();
ProtoEncodeHelper e2((char*)buf, 128);
// NOTE: rows is of type int64_t
size_t rows_memory_size =
slr->rows().capacity() * framework::SizeOfType(typeid(int64_t));
e2.WriteVarlengthBeginning(VarMsg::kRowsFieldNumber, rows_memory_size);
slices[2] = ::grpc::Slice(e2.size());
memcpy(const_cast<uint8_t*>(slices[2].begin()), e2.data(), e2.size());
slices[3] = ::grpc::Slice(
grpc_slice_new_with_user_data(
const_cast<void*>(
reinterpret_cast<const void*>(slr->rows().data())),
rows_memory_size,
[](void* backing) {
// TODO(typhoonzero): add unref here, same as above.
},
const_cast<char*>(
reinterpret_cast<const char*>(slr->rows().data()))),
::grpc::Slice::STEAL_REF);
num_slices = 4;
}
::grpc::ByteBuffer tmp(&slices[0], num_slices);
msg->Swap(&tmp);
}
void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg,
const platform::DeviceContext& ctx,
framework::Variable* var) {
sendrecv::VariableMessage meta;
GrpcByteBufferSource source;
source.Init(msg);
::google::protobuf::io::CodedInputStream input(&source);
// do zerocopy parsing
PADDLE_ENFORCE(meta.ParseFromCodedStream(&input));
PADDLE_ENFORCE(input.ConsumedEntireMessage());
// dims is needed by both tensor and selectedrows
std::vector<int> vecdims;
for (auto& d : meta.dims()) {
vecdims.push_back(d);
}
framework::DDim dims = framework::make_ddim(vecdims);
if (meta.type() == sendrecv::LOD_TENSOR) {
auto* tensor = var->GetMutable<framework::LoDTensor>();
tensor->Resize(dims);
void* tensor_data = tensor->mutable_data(
ctx.GetPlace(),
paddle::operators::detail::ToTypeIndex(meta.data_type()));
framework::LoD lod;
for (int i = 0; i < meta.lod_level(); ++i) {
framework::Vector<size_t> v;
for (int j = 0; j < meta.lod(i).lod_data_size(); ++j) {
v.push_back(meta.lod(i).lod_data(j));
}
lod.push_back(v);
}
tensor->set_lod(lod);
// How to avoid copying and use the message buffer directly?
// Maybe need to find a way to release all memory except tensor content.
if (platform::is_gpu_place(ctx.GetPlace())) {
#ifdef PADDLE_WITH_CUDA
platform::CPUPlace cpu;
auto& gpu_dev_ctx = static_cast<const platform::CUDADeviceContext&>(ctx);
memory::Copy(boost::get<platform::CUDAPlace>(tensor->place()),
tensor_data, cpu,
reinterpret_cast<const void*>(meta.serialized().data()),
meta.serialized().size(), gpu_dev_ctx.stream());
ctx.Wait();
#endif
} else {
memcpy(tensor_data,
reinterpret_cast<const void*>(meta.serialized().data()),
meta.serialized().size());
}
} else if (meta.type() == sendrecv::SELECTED_ROWS) {
auto* slr = var->GetMutable<framework::SelectedRows>();
auto* tensor = slr->mutable_value();
int64_t* rows_data = slr->mutable_rows()->data();
tensor->Resize(dims);
void* tensor_data = tensor->mutable_data(
ctx.GetPlace(),
paddle::operators::detail::ToTypeIndex(meta.data_type()));
if (platform::is_gpu_place(ctx.GetPlace())) {
#ifdef PADDLE_WITH_CUDA
platform::CPUPlace cpu;
auto& gpu_dev_ctx = static_cast<const platform::CUDADeviceContext&>(ctx);
memory::Copy(boost::get<platform::CUDAPlace>(tensor->place()),
tensor_data, cpu,
reinterpret_cast<const void*>(meta.serialized().data()),
meta.serialized().size(), gpu_dev_ctx.stream());
ctx.Wait();
#endif
} else {
memcpy(tensor_data,
reinterpret_cast<const void*>(meta.serialized().data()),
meta.serialized().size());
}
// copy rows CPU data, GPU data will be copied lazly
memcpy(rows_data, reinterpret_cast<const void*>(meta.rows().data()),
meta.rows().size());
}
}
} // namespace detail
} // namespace operators
} // namespace paddle
\ No newline at end of file
......@@ -33,6 +33,8 @@ namespace detail {
#define LISTEN_TERMINATE_MESSAGE "TERMINATE@RECV"
#define BATCH_BARRIER_MESSAGE "BATCH_BARRIER@RECV"
typedef void (*DestroyCallback)(void*);
void SerializeToMessage(const std::string& name, const framework::Variable* var,
const platform::DeviceContext& ctx,
sendrecv::VariableMessage* msg);
......@@ -40,6 +42,32 @@ void SerializeToMessage(const std::string& name, const framework::Variable* var,
void DeserializeFromMessage(const sendrecv::VariableMessage& msg,
const platform::DeviceContext& ctx,
framework::Variable* var);
void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
const platform::DeviceContext& ctx,
::grpc::ByteBuffer* msg);
void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg,
const platform::DeviceContext& ctx,
framework::Variable* var);
inline std::type_index ToTypeIndex(sendrecv::VariableMessage::Type type) {
switch (type) {
case sendrecv::VariableMessage::FP32:
return typeid(float); // NOLINT
case sendrecv::VariableMessage::FP64:
return typeid(double); // NOLINT
case sendrecv::VariableMessage::INT32:
return typeid(int); // NOLINT
case sendrecv::VariableMessage::INT64:
return typeid(int64_t); // NOLINT
case sendrecv::VariableMessage::BOOL:
return typeid(bool); // NOLINT
default:
PADDLE_THROW("Not support type %d", type);
}
}
} // namespace detail
} // namespace operators
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <unistd.h>
#include <string>
#include <thread>
#include "gtest/gtest.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/framework/variable.h"
#include "paddle/fluid/operators/detail/sendrecvop_utils.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/fluid/string/printf.h"
namespace framework = paddle::framework;
namespace platform = paddle::platform;
namespace operators = paddle::operators;
namespace math = paddle::operators::math;
namespace memory = paddle::memory;
void RunSerdeTestTensor(platform::Place place) {
// serialize var to ByteBuffer
framework::Variable var;
auto* tensor = var.GetMutable<framework::LoDTensor>();
tensor->Resize(framework::make_ddim({4, 8, 4, 2}));
framework::LoD lod;
lod.push_back(framework::Vector<size_t>({1, 3, 8}));
tensor->set_lod(lod);
int tensor_numel = 4 * 8 * 4 * 2;
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
auto& ctx = *pool.Get(place);
tensor->mutable_data<float>(place);
math::set_constant(ctx, tensor, 31.9);
::grpc::ByteBuffer msg;
operators::detail::SerializeToByteBuffer("myvar", &var, ctx, &msg);
EXPECT_GT(msg.Length(), 0);
// deserialize
std::vector<::grpc::Slice> slices;
(void)msg.Dump(&slices);
std::string tmp;
for (const auto& s : slices) {
tmp.append(reinterpret_cast<const char*>(s.begin()), s.size());
}
sendrecv::VariableMessage varmsg;
EXPECT_TRUE(varmsg.ParseFromString(tmp));
EXPECT_EQ(varmsg.varname(), "myvar");
EXPECT_EQ(varmsg.type(), 0);
EXPECT_EQ(varmsg.dims()[0], 4);
EXPECT_EQ(varmsg.dims()[1], 8);
EXPECT_EQ(varmsg.dims()[2], 4);
EXPECT_EQ(varmsg.dims()[3], 2);
EXPECT_EQ(varmsg.lod_level(), 1);
EXPECT_EQ(varmsg.lod(0).lod_data(0), 1);
EXPECT_EQ(varmsg.lod(0).lod_data(1), 3);
EXPECT_EQ(varmsg.lod(0).lod_data(2), 8);
const float* tensor_data =
reinterpret_cast<const float*>(varmsg.serialized().data());
for (int i = 0; i < tensor_numel; ++i) {
EXPECT_FLOAT_EQ(tensor_data[i], 31.9);
}
// deserialize zero-copy
framework::Variable var2;
operators::detail::DeserializeFromByteBuffer(msg, ctx, &var2);
auto tensor2 = var2.Get<framework::LoDTensor>();
float* tensor_data2 = nullptr;
framework::Tensor tmp_tensor;
if (platform::is_gpu_place(ctx.GetPlace())) {
platform::CPUPlace cpu;
framework::TensorCopy(tensor2, cpu, &tmp_tensor);
tensor_data2 = tmp_tensor.data<float>();
} else {
tensor_data2 = const_cast<float*>(tensor2.data<float>());
}
EXPECT_EQ(varmsg.lod_level(), 1);
EXPECT_EQ(varmsg.lod(0).lod_data(0), 1);
EXPECT_EQ(varmsg.lod(0).lod_data(1), 3);
EXPECT_EQ(varmsg.lod(0).lod_data(2), 8);
for (int i = 0; i < tensor_numel; ++i) EXPECT_FLOAT_EQ(tensor_data2[i], 31.9);
}
void RunSerdeTestSelectedRows(platform::Place place) {
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
auto& ctx = *pool.Get(place);
// serialize var to ByteBuffer
framework::Variable var;
auto* slr = var.GetMutable<framework::SelectedRows>();
auto* tensor = slr->mutable_value();
auto* rows = slr->mutable_rows();
tensor->Resize(framework::make_ddim({2, 10}));
tensor->mutable_data<float>(place);
int tensor_numel = 2 * 10;
math::set_constant(ctx, tensor, 32.7);
rows->push_back(3);
rows->push_back(10);
::grpc::ByteBuffer msg;
operators::detail::SerializeToByteBuffer("myvar", &var, ctx, &msg);
EXPECT_GT(msg.Length(), 0);
// deserialize
std::vector<::grpc::Slice> slices;
(void)msg.Dump(&slices);
std::string tmp;
for (const auto& s : slices) {
tmp.append(reinterpret_cast<const char*>(s.begin()), s.size());
}
sendrecv::VariableMessage varmsg;
EXPECT_TRUE(varmsg.ParseFromString(tmp));
EXPECT_EQ(varmsg.varname(), "myvar");
EXPECT_EQ(varmsg.type(), 1);
const float* tensor_data =
reinterpret_cast<const float*>(varmsg.serialized().data());
const int64_t* rows_data =
reinterpret_cast<const int64_t*>(varmsg.rows().data());
for (int i = 0; i < tensor_numel; ++i) {
EXPECT_FLOAT_EQ(tensor_data[i], 32.7);
}
EXPECT_EQ(rows_data[0], 3);
EXPECT_EQ(rows_data[1], 10);
// deserialize zero-copy
framework::Variable var2;
operators::detail::DeserializeFromByteBuffer(msg, ctx, &var2);
auto* slr2 = var2.GetMutable<framework::SelectedRows>();
auto* tensor2 = slr2->mutable_value();
auto* rows2 = slr2->mutable_rows();
float* tensor_data2 = nullptr;
framework::Tensor tmp_tensor;
if (platform::is_gpu_place(ctx.GetPlace())) {
platform::CPUPlace cpu;
framework::TensorCopy(*tensor2, cpu, &tmp_tensor);
tensor_data2 = tmp_tensor.data<float>();
} else {
tensor_data2 = const_cast<float*>(tensor2->data<float>());
}
const int64_t* rows_data2 = rows2->data();
for (int i = 0; i < tensor_numel; ++i) {
EXPECT_FLOAT_EQ(tensor_data2[i], 32.7);
}
EXPECT_EQ(rows_data2[0], 3);
EXPECT_EQ(rows_data2[1], 10);
}
TEST(SelectedRows, CPU) {
platform::CPUPlace place;
RunSerdeTestSelectedRows(place);
}
TEST(SelectedRows, GPU) {
platform::CUDAPlace place;
RunSerdeTestSelectedRows(place);
}
TEST(Tensor, CPU) {
platform::CPUPlace place;
RunSerdeTestTensor(place);
}
TEST(Tensor, GPU) {
platform::CUDAPlace place;
RunSerdeTestTensor(place);
}
\ No newline at end of file
......@@ -71,7 +71,7 @@ class DetectionMAPOp : public framework::OperatorWithKernel {
return framework::OpKernelType(
framework::ToDataType(
ctx.Input<framework::Tensor>("DetectRes")->type()),
ctx.device_context());
platform::CPUPlace());
}
};
......@@ -142,7 +142,15 @@ class DetectionMAPOpMaker : public framework::OpProtoAndCheckerMaker {
AddOutput("MAP",
"(Tensor) A tensor with shape [1], store the mAP evaluate "
"result of the detection.");
AddAttr<int>("class_num",
"(int) "
"The class number.");
AddAttr<int>(
"background_label",
"(int, defalut: 0) "
"The index of background label, the background label will be ignored. "
"If set to -1, then all categories will be considered.")
.SetDefault(0);
AddAttr<float>(
"overlap_threshold",
"(float) "
......
......@@ -69,6 +69,7 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
float overlap_threshold = ctx.Attr<float>("overlap_threshold");
float evaluate_difficult = ctx.Attr<bool>("evaluate_difficult");
auto ap_type = GetAPType(ctx.Attr<std::string>("ap_type"));
int class_num = ctx.Attr<int>("class_num");
auto label_lod = in_label->lod();
auto detect_lod = in_detect->lod();
......@@ -95,17 +96,19 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
if (in_pos_count != nullptr && state) {
GetInputPos(*in_pos_count, *in_true_pos, *in_false_pos, label_pos_count,
true_pos, false_pos);
true_pos, false_pos, class_num);
}
CalcTrueAndFalsePositive(gt_boxes, detect_boxes, evaluate_difficult,
overlap_threshold, label_pos_count, true_pos,
false_pos);
T map = CalcMAP(ap_type, label_pos_count, true_pos, false_pos);
int background_label = ctx.Attr<int>("background_label");
T map = CalcMAP(ap_type, label_pos_count, true_pos, false_pos,
background_label);
GetOutputPos(ctx, label_pos_count, true_pos, false_pos, *out_pos_count,
*out_true_pos, *out_false_pos);
*out_true_pos, *out_false_pos, class_num);
T* map_data = out_map->mutable_data<T>(ctx.GetPlace());
map_data[0] = map;
......@@ -141,6 +144,15 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
}
}
inline void ClipBBox(const Box& bbox, Box* clipped_bbox) const {
T one = static_cast<T>(1.0);
T zero = static_cast<T>(0.0);
clipped_bbox->xmin = std::max(std::min(bbox.xmin, one), zero);
clipped_bbox->ymin = std::max(std::min(bbox.ymin, one), zero);
clipped_bbox->xmax = std::max(std::min(bbox.xmax, one), zero);
clipped_bbox->ymax = std::max(std::min(bbox.ymax, one), zero);
}
void GetBoxes(const framework::LoDTensor& input_label,
const framework::LoDTensor& input_detect,
std::vector<std::map<int, std::vector<Box>>>& gt_boxes,
......@@ -190,24 +202,20 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
const std::map<int, std::vector<std::pair<T, int>>>& false_pos,
framework::Tensor& output_pos_count,
framework::LoDTensor& output_true_pos,
framework::LoDTensor& output_false_pos) const {
int max_class_id = 0;
framework::LoDTensor& output_false_pos, const int class_num) const {
int true_pos_count = 0;
int false_pos_count = 0;
for (auto it = label_pos_count.begin(); it != label_pos_count.end(); ++it) {
int label = it->first;
if (label > max_class_id) max_class_id = label;
int label_num_pos = it->second;
if (label_num_pos == 0 || true_pos.find(label) == true_pos.end())
continue;
auto label_true_pos = true_pos.find(label)->second;
auto label_false_pos = false_pos.find(label)->second;
true_pos_count += label_true_pos.size();
false_pos_count += label_false_pos.size();
for (auto it = true_pos.begin(); it != true_pos.end(); ++it) {
auto tp = it->second;
true_pos_count += tp.size();
}
for (auto it = false_pos.begin(); it != false_pos.end(); ++it) {
auto fp = it->second;
false_pos_count += fp.size();
}
int* pos_count_data = output_pos_count.mutable_data<int>(
framework::make_ddim({max_class_id + 1, 1}), ctx.GetPlace());
framework::make_ddim({class_num, 1}), ctx.GetPlace());
T* true_pos_data = output_true_pos.mutable_data<T>(
framework::make_ddim({true_pos_count, 2}), ctx.GetPlace());
......@@ -217,7 +225,7 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
false_pos_count = 0;
std::vector<size_t> true_pos_starts = {0};
std::vector<size_t> false_pos_starts = {0};
for (int i = 0; i <= max_class_id; ++i) {
for (int i = 0; i < class_num; ++i) {
auto it_count = label_pos_count.find(i);
pos_count_data[i] = 0;
if (it_count != label_pos_count.end()) {
......@@ -258,17 +266,16 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
return;
}
void GetInputPos(
const framework::Tensor& input_pos_count,
void GetInputPos(const framework::Tensor& input_pos_count,
const framework::LoDTensor& input_true_pos,
const framework::LoDTensor& input_false_pos,
std::map<int, int>& label_pos_count,
std::map<int, std::vector<std::pair<T, int>>>& true_pos,
std::map<int, std::vector<std::pair<T, int>>>& false_pos) const {
std::map<int, std::vector<std::pair<T, int>>>& false_pos,
const int class_num) const {
constexpr T kEPS = static_cast<T>(1e-6);
int class_number = input_pos_count.dims()[0];
const int* pos_count_data = input_pos_count.data<int>();
for (int i = 0; i < class_number; ++i) {
for (int i = 0; i < class_num; ++i) {
label_pos_count[i] = pos_count_data[i];
}
......@@ -362,7 +369,9 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
size_t max_idx = 0;
auto score = pred_boxes[i].first;
for (size_t j = 0; j < matched_bboxes.size(); ++j) {
T overlap = JaccardOverlap(pred_boxes[i].second, matched_bboxes[j]);
Box& pred_box = pred_boxes[i].second;
ClipBBox(pred_box, &pred_box);
T overlap = JaccardOverlap(pred_box, matched_bboxes[j]);
if (overlap > max_overlap) {
max_overlap = overlap;
max_idx = j;
......@@ -391,17 +400,19 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
}
}
T CalcMAP(
APType ap_type, const std::map<int, int>& label_pos_count,
T CalcMAP(APType ap_type, const std::map<int, int>& label_pos_count,
const std::map<int, std::vector<std::pair<T, int>>>& true_pos,
const std::map<int, std::vector<std::pair<T, int>>>& false_pos) const {
const std::map<int, std::vector<std::pair<T, int>>>& false_pos,
const int background_label) const {
T mAP = 0.0;
int count = 0;
for (auto it = label_pos_count.begin(); it != label_pos_count.end(); ++it) {
int label = it->first;
int label_num_pos = it->second;
if (label_num_pos == 0 || true_pos.find(label) == true_pos.end())
if (label_num_pos == background_label ||
true_pos.find(label) == true_pos.end()) {
continue;
}
auto label_true_pos = true_pos.find(label)->second;
auto label_false_pos = false_pos.find(label)->second;
// Compute average precision.
......@@ -450,7 +461,7 @@ class DetectionMAPOpKernel : public framework::OpKernel<T> {
}
}
if (count != 0) mAP /= count;
return mAP * 100;
return mAP;
}
}; // namespace operators
......
......@@ -41,77 +41,14 @@ class ElementwiseDivKernel : public framework::OpKernel<T> {
};
template <typename T>
struct ElementwiseDivGradFunctor {
template <typename Device, typename X, typename Y, typename Z, typename dX,
typename dY, typename dZ>
void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz) {
auto y_e = framework::EigenVector<T>::Flatten(*y);
auto z_e = framework::EigenVector<T>::Flatten(*z);
auto dz_e = framework::EigenVector<T>::Flatten(*dz);
if (dx) {
auto dx_e = framework::EigenVector<T>::Flatten(*dx);
dx_e.device(d) = dz_e / y_e;
}
if (dy) {
auto dy_e = framework::EigenVector<T>::Flatten(*dy);
dy_e.device(d) = -1.0 * dz_e * z_e / y_e;
}
}
};
template <typename T>
struct ElementwiseDivBroadCastGradFunctor {
template <typename Device, typename X, typename Y, typename Z, typename dX,
typename dY, typename dZ, typename Pre, typename N>
void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n) {
auto x_e = framework::EigenVector<T>::Flatten(*x);
auto y_e = framework::EigenVector<T>::Flatten(*y);
auto dz_e = framework::EigenVector<T>::Flatten(*dz);
auto y_e_bcast = y_e.reshape(Eigen::DSizes<int, 2>(1, n))
.broadcast(Eigen::DSizes<int, 2>(pre, 1))
.reshape(Eigen::DSizes<int, 1>(x_e.size()));
if (dx) {
auto dx_e = framework::EigenVector<T>::Flatten(*dx);
dx_e.device(d) = dz_e / y_e_bcast;
}
if (dy) {
auto dy_e = framework::EigenVector<T>::Flatten(*dy);
dy_e.device(d) = (-1.0 * (x_e * dz_e) / (y_e_bcast * y_e_bcast))
.reshape(Eigen::DSizes<int, 2>(pre, n))
.sum(Eigen::array<int, 1>{{0}});
}
}
struct DivGradDX {
HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout / y; }
};
template <typename T>
struct ElementwiseDivBroadCast2GradFunctor {
template <typename Device, typename X, typename Y, typename Z, typename dX,
typename dY, typename dZ, typename Pre, typename N, typename Post>
void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n,
Post post) {
auto x_e = framework::EigenVector<T>::Flatten(*x);
auto y_e = framework::EigenVector<T>::Flatten(*y);
auto dz_e = framework::EigenVector<T>::Flatten(*dz);
auto y_e_bcast = y_e.reshape(Eigen::DSizes<int, 3>(1, n, 1))
.broadcast(Eigen::DSizes<int, 3>(pre, 1, post))
.reshape(Eigen::DSizes<int, 1>(x_e.size()));
if (dx) {
auto dx_e = framework::EigenVector<T>::Flatten(*dx);
dx_e.device(d) = dz_e / y_e_bcast;
}
if (dy) {
auto dy_e = framework::EigenVector<T>::Flatten(*dy);
dy_e.device(d) = (-1.0 * (x_e * dz_e) / (y_e_bcast * y_e_bcast))
.reshape(Eigen::DSizes<int, 3>(pre, n, post))
.sum(Eigen::array<int, 2>{{0, 2}});
}
struct DivGradDY {
HOSTDEVICE T operator()(T x, T y, T out, T dout) const {
return -dout * x / (y * y);
}
};
......@@ -128,10 +65,8 @@ class ElementwiseDivGradKernel : public framework::OpKernel<T> {
auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
int axis = ctx.Attr<int>("axis");
ElementwiseGradCompute<DeviceContext, T, ElementwiseDivGradFunctor<T>,
ElementwiseDivBroadCastGradFunctor<T>,
ElementwiseDivBroadCast2GradFunctor<T>>(
ctx, x, y, out, dout, axis, dx, dy);
ElemwiseGradCompute<DeviceContext, T, DivGradDX<T>, DivGradDY<T>>(
ctx, *x, *y, *out, *dout, axis, dx, dy, DivGradDX<T>(), DivGradDY<T>());
}
};
......
......@@ -41,76 +41,16 @@ class ElementwiseMaxKernel : public framework::OpKernel<T> {
};
template <typename T>
struct ElementwiseMaxGradFunctor {
template <typename Device, typename X, typename Y, typename Z, typename dX,
typename dY, typename dZ>
void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz) {
auto x_e = framework::EigenVector<T>::Flatten(*x);
auto y_e = framework::EigenVector<T>::Flatten(*y);
auto dz_e = framework::EigenVector<T>::Flatten(*dz);
if (dx) {
auto dx_e = framework::EigenVector<T>::Flatten(*dx);
dx_e.device(d) = (x_e > y_e).template cast<T>() * dz_e;
}
if (dy) {
auto dy_e = framework::EigenVector<T>::Flatten(*dy);
dy_e.device(d) = (x_e <= y_e).template cast<T>() * dz_e;
}
struct MaxGradDx {
HOSTDEVICE T operator()(T x, T y, T out, T dout) const {
return dout * (x > y);
}
};
template <typename T>
struct ElementwiseMaxBroadCastGradFunctor {
template <typename Device, typename X, typename Y, typename Z, typename dX,
typename dY, typename dZ, typename Pre, typename N>
void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n) {
auto x_e = framework::EigenVector<T>::Flatten(*x);
auto y_e = framework::EigenVector<T>::Flatten(*y);
auto dz_e = framework::EigenVector<T>::Flatten(*dz);
auto y_e_bcast = y_e.reshape(Eigen::DSizes<int, 2>(1, n))
.broadcast(Eigen::DSizes<int, 2>(pre, 1))
.reshape(Eigen::DSizes<int, 1>(x_e.size()));
if (dx) {
auto dx_e = framework::EigenVector<T>::Flatten(*dx);
dx_e.device(d) = (x_e > y_e_bcast).template cast<T>() * dz_e;
}
if (dy) {
auto dy_e = framework::EigenVector<T>::Flatten(*dy);
dy_e.device(d) = ((x_e <= y_e_bcast).template cast<T>() * dz_e)
.reshape(Eigen::DSizes<int, 2>(pre, n))
.sum(Eigen::array<int, 1>{{0}});
}
}
};
template <typename T>
struct ElementwiseMaxBroadCast2GradFunctor {
template <typename Device, typename X, typename Y, typename Z, typename dX,
typename dY, typename dZ, typename Pre, typename N, typename Post>
void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n,
Post post) {
auto x_e = framework::EigenVector<T>::Flatten(*x);
auto y_e = framework::EigenVector<T>::Flatten(*y);
auto dz_e = framework::EigenVector<T>::Flatten(*dz);
auto y_e_bcast = y_e.reshape(Eigen::DSizes<int, 3>(1, n, 1))
.broadcast(Eigen::DSizes<int, 3>(pre, 1, post))
.reshape(Eigen::DSizes<int, 1>(x_e.size()));
if (dx) {
auto dx_e = framework::EigenVector<T>::Flatten(*dx);
dx_e.device(d) = (x_e > y_e_bcast).template cast<T>() * dz_e;
}
if (dy) {
auto dy_e = framework::EigenVector<T>::Flatten(*dy);
dy_e.device(d) = ((x_e <= y_e_bcast).template cast<T>() * dz_e)
.reshape(Eigen::DSizes<int, 3>(pre, n, post))
.sum(Eigen::array<int, 2>{{0, 2}});
}
struct MaxGradDy {
HOSTDEVICE T operator()(T x, T y, T out, T dout) const {
return dout * (x <= y);
}
};
......@@ -127,12 +67,9 @@ class ElementwiseMaxGradKernel : public framework::OpKernel<T> {
auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
int axis = ctx.Attr<int>("axis");
ElementwiseGradCompute<DeviceContext, T, ElementwiseMaxGradFunctor<T>,
ElementwiseMaxBroadCastGradFunctor<T>,
ElementwiseMaxBroadCast2GradFunctor<T>>(
ctx, x, y, out, dout, axis, dx, dy);
ElemwiseGradCompute<DeviceContext, T, MaxGradDx<T>, MaxGradDy<T>>(
ctx, *x, *y, *out, *dout, axis, dx, dy, MaxGradDx<T>(), MaxGradDy<T>());
}
};
} // namespace operators
} // namespace paddle
......@@ -41,76 +41,16 @@ class ElementwiseMinKernel : public framework::OpKernel<T> {
};
template <typename T>
struct ElementwiseMinGradFunctor {
template <typename Device, typename X, typename Y, typename Z, typename dX,
typename dY, typename dZ>
void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz) {
auto x_e = framework::EigenVector<T>::Flatten(*x);
auto y_e = framework::EigenVector<T>::Flatten(*y);
auto dz_e = framework::EigenVector<T>::Flatten(*dz);
if (dx) {
auto dx_e = framework::EigenVector<T>::Flatten(*dx);
dx_e.device(d) = (x_e < y_e).template cast<T>() * dz_e;
}
if (dy) {
auto dy_e = framework::EigenVector<T>::Flatten(*dy);
dy_e.device(d) = (x_e >= y_e).template cast<T>() * dz_e;
}
struct MinGradDx {
HOSTDEVICE T operator()(T x, T y, T out, T dout) const {
return dout * (x < y);
}
};
template <typename T>
struct ElementwiseMinBroadCastGradFunctor {
template <typename Device, typename X, typename Y, typename Z, typename dX,
typename dY, typename dZ, typename Pre, typename N>
void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n) {
auto x_e = framework::EigenVector<T>::Flatten(*x);
auto y_e = framework::EigenVector<T>::Flatten(*y);
auto dz_e = framework::EigenVector<T>::Flatten(*dz);
auto y_e_bcast = y_e.reshape(Eigen::DSizes<int, 2>(1, n))
.broadcast(Eigen::DSizes<int, 2>(pre, 1))
.reshape(Eigen::DSizes<int, 1>(x_e.size()));
if (dx) {
auto dx_e = framework::EigenVector<T>::Flatten(*dx);
dx_e.device(d) = (x_e < y_e_bcast).template cast<T>() * dz_e;
}
if (dy) {
auto dy_e = framework::EigenVector<T>::Flatten(*dy);
dy_e.device(d) = ((x_e >= y_e_bcast).template cast<T>() * dz_e)
.reshape(Eigen::DSizes<int, 2>(pre, n))
.sum(Eigen::array<int, 1>{{0}});
}
}
};
template <typename T>
struct ElementwiseMinBroadCast2GradFunctor {
template <typename Device, typename X, typename Y, typename Z, typename dX,
typename dY, typename dZ, typename Pre, typename N, typename Post>
void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n,
Post post) {
auto x_e = framework::EigenVector<T>::Flatten(*x);
auto y_e = framework::EigenVector<T>::Flatten(*y);
auto dz_e = framework::EigenVector<T>::Flatten(*dz);
auto y_e_bcast = y_e.reshape(Eigen::DSizes<int, 3>(1, n, 1))
.broadcast(Eigen::DSizes<int, 3>(pre, 1, post))
.reshape(Eigen::DSizes<int, 1>(x_e.size()));
if (dx) {
auto dx_e = framework::EigenVector<T>::Flatten(*dx);
dx_e.device(d) = (x_e < y_e_bcast).template cast<T>() * dz_e;
}
if (dy) {
auto dy_e = framework::EigenVector<T>::Flatten(*dy);
dy_e.device(d) = ((x_e >= y_e_bcast).template cast<T>() * dz_e)
.reshape(Eigen::DSizes<int, 3>(pre, n, post))
.sum(Eigen::array<int, 2>{{0, 2}});
}
struct MinGradDy {
HOSTDEVICE T operator()(T x, T y, T out, T dout) const {
return dout * (x >= y);
}
};
......@@ -127,12 +67,9 @@ class ElementwiseMinGradKernel : public framework::OpKernel<T> {
auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
int axis = ctx.Attr<int>("axis");
ElementwiseGradCompute<DeviceContext, T, ElementwiseMinGradFunctor<T>,
ElementwiseMinBroadCastGradFunctor<T>,
ElementwiseMinBroadCast2GradFunctor<T>>(
ctx, x, y, out, dout, axis, dx, dy);
ElemwiseGradCompute<DeviceContext, T, MinGradDx<T>, MinGradDy<T>>(
ctx, *x, *y, *out, *dout, axis, dx, dy, MinGradDx<T>(), MinGradDy<T>());
}
};
} // namespace operators
} // namespace paddle
......@@ -40,78 +40,13 @@ class ElementwiseMulKernel : public framework::OpKernel<T> {
};
template <typename T>
struct ElementwiseMulGradFunctor {
template <typename Device, typename X, typename Y, typename Z, typename dX,
typename dY, typename dZ>
void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz) {
auto x_e = framework::EigenVector<T>::Flatten(*x);
auto y_e = framework::EigenVector<T>::Flatten(*y);
auto dz_e = framework::EigenVector<T>::Flatten(*dz);
if (dx) {
auto dx_e = framework::EigenVector<T>::Flatten(*dx);
dx_e.device(d) = dz_e * y_e;
}
if (dy) {
auto dy_e = framework::EigenVector<T>::Flatten(*dy);
dy_e.device(d) = x_e * dz_e;
}
}
};
template <typename T>
struct ElementwiseMulBroadCastGradFunctor {
template <typename Device, typename X, typename Y, typename Z, typename dX,
typename dY, typename dZ, typename Pre, typename N>
void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n) {
auto x_e = framework::EigenVector<T>::Flatten(*x);
auto y_e = framework::EigenVector<T>::Flatten(*y);
auto dz_e = framework::EigenVector<T>::Flatten(*dz);
auto y_e_bcast = y_e.reshape(Eigen::DSizes<int, 2>(1, n))
.broadcast(Eigen::DSizes<int, 2>(pre, 1))
.reshape(Eigen::DSizes<int, 1>(x_e.size()));
if (dx) {
auto dx_e = framework::EigenVector<T>::Flatten(*dx);
dx_e.device(d) = dz_e * y_e_bcast;
}
if (dy) {
auto dy_e = framework::EigenVector<T>::Flatten(*dy);
dy_e.device(d) = (x_e * dz_e)
.reshape(Eigen::DSizes<int, 2>(pre, n))
.sum(Eigen::array<int, 1>{{0}});
}
}
struct MulGradDX {
HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout * y; }
};
template <typename T>
struct ElementwiseMulBroadCast2GradFunctor {
template <typename Device, typename X, typename Y, typename Z, typename dX,
typename dY, typename dZ, typename Pre, typename N, typename Post>
void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n,
Post post) {
auto x_e = framework::EigenVector<T>::Flatten(*x);
auto y_e = framework::EigenVector<T>::Flatten(*y);
auto dz_e = framework::EigenVector<T>::Flatten(*dz);
auto y_e_bcast = y_e.reshape(Eigen::DSizes<int, 3>(1, n, 1))
.broadcast(Eigen::DSizes<int, 3>(pre, 1, post))
.reshape(Eigen::DSizes<int, 1>(x_e.size()));
if (dx) {
auto dx_e = framework::EigenVector<T>::Flatten(*dx);
dx_e.device(d) = dz_e * y_e_bcast;
}
if (dy) {
auto dy_e = framework::EigenVector<T>::Flatten(*dy);
dy_e.device(d) = (x_e * dz_e)
.reshape(Eigen::DSizes<int, 3>(pre, n, post))
.sum(Eigen::array<int, 2>{{0, 2}});
}
}
struct MulGradDY {
HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout * x; }
};
template <typename DeviceContext, typename T>
......@@ -127,12 +62,9 @@ class ElementwiseMulGradKernel : public framework::OpKernel<T> {
auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
int axis = ctx.Attr<int>("axis");
ElementwiseGradCompute<DeviceContext, T, ElementwiseMulGradFunctor<T>,
ElementwiseMulBroadCastGradFunctor<T>,
ElementwiseMulBroadCast2GradFunctor<T>>(
ctx, x, y, out, dout, axis, dx, dy);
ElemwiseGradCompute<DeviceContext, T, MulGradDX<T>, MulGradDY<T>>(
ctx, *x, *y, *out, *dout, axis, dx, dy, MulGradDX<T>(), MulGradDY<T>());
}
};
} // namespace operators
} // namespace paddle
......@@ -301,7 +301,7 @@ struct ElemwiseGradNoBroadcast {
dx_[i] = dx_op_(x_[i], y_[i], out_[i], dout_[i]);
}
if (dy_ != nullptr) {
dy_[i] = dx_op_(x_[i], y_[i], out_[i], dout_[i]);
dy_[i] = dy_op_(x_[i], y_[i], out_[i], dout_[i]);
}
}
......
......@@ -40,61 +40,13 @@ class ElementwiseSubKernel : public framework::OpKernel<T> {
};
template <typename T>
struct ElementwiseSubGradFunctor {
template <typename Device, typename X, typename Y, typename Z, typename dX,
typename dY, typename dZ>
void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz) {
auto dz_e = framework::EigenVector<T>::Flatten(*dz);
if (dx) {
auto dx_e = framework::EigenVector<T>::Flatten(*dx);
dx_e.device(d) = dz_e;
}
if (dy) {
auto dy_e = framework::EigenVector<T>::Flatten(*dy);
dy_e.device(d) = (-1.0) * dz_e;
}
}
struct SubGradDX {
HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout; }
};
template <typename T>
struct ElementwiseSubBroadCastGradFunctor {
template <typename Device, typename X, typename Y, typename Z, typename dX,
typename dY, typename dZ, typename Pre, typename N>
void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n) {
auto dz_e = framework::EigenVector<T>::Flatten(*dz);
if (dx) {
auto dx_e = framework::EigenVector<T>::Flatten(*dx);
dx_e.device(d) = dz_e;
}
if (dy) {
auto dy_e = framework::EigenVector<T>::Flatten(*dy);
dy_e.device(d) = (-1.0) *
dz_e.reshape(Eigen::DSizes<int, 2>(pre, n))
.sum(Eigen::array<int, 1>{{0}});
}
}
};
template <typename T>
struct ElementwiseSubBroadCast2GradFunctor {
template <typename Device, typename X, typename Y, typename Z, typename dX,
typename dY, typename dZ, typename Pre, typename N, typename Post>
void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n,
Post post) {
auto dz_e = framework::EigenVector<T>::Flatten(*dz);
if (dx) {
auto dx_e = framework::EigenVector<T>::Flatten(*dx);
dx_e.device(d) = dz_e;
}
if (dy) {
auto dy_e = framework::EigenVector<T>::Flatten(*dy);
dy_e.device(d) = (-1.0) *
dz_e.reshape(Eigen::DSizes<int, 3>(pre, n, post))
.sum(Eigen::array<int, 2>{{0, 2}});
}
}
struct SubGradDY {
HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return -dout; }
};
template <typename DeviceContext, typename T>
......@@ -110,12 +62,9 @@ class ElementwiseSubGradKernel : public framework::OpKernel<T> {
auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
int axis = ctx.Attr<int>("axis");
ElementwiseGradCompute<DeviceContext, T, ElementwiseSubGradFunctor<T>,
ElementwiseSubBroadCastGradFunctor<T>,
ElementwiseSubBroadCast2GradFunctor<T>>(
ctx, x, y, out, dout, axis, dx, dy);
ElemwiseGradCompute<DeviceContext, T, SubGradDX<T>, SubGradDY<T>>(
ctx, *x, *y, *out, *dout, axis, dx, dy, SubGradDX<T>(), SubGradDY<T>());
}
};
} // namespace operators
} // namespace paddle
add_subdirectory(detail)
if(WITH_GPU)
nv_library(math_function SRCS math_function.cc math_function.cu im2col.cc im2col.cu DEPS cblas device_context framework_proto)
nv_test(math_function_gpu_test SRCS math_function_test.cu DEPS math_function tensor)
nv_library(selected_rows_functor SRCS selected_rows_functor.cc selected_rows_functor.cu DEPS selected_rows math_function)
nv_test(selected_rows_functor_gpu_test SRCS selected_rows_functor_test.cu DEPS selected_rows_functor)
nv_library(softmax SRCS softmax.cc softmax.cu DEPS device_context)
nv_library(cross_entropy SRCS cross_entropy.cc cross_entropy.cu DEPS device_context)
nv_library(pooling SRCS pooling.cc pooling.cu DEPS device_context)
nv_library(depthwise_conv SRCS depthwise_conv.cu DEPS device_context)
nv_library(sequence_pooling SRCS sequence_pooling.cc sequence_pooling.cu DEPS device_context math_function)
nv_library(vol2col SRCS vol2col.cc vol2col.cu DEPS device_context tensor)
nv_library(context_project SRCS context_project.cc context_project.cu DEPS device_context math_function)
nv_library(sequence2batch SRCS sequence2batch.cc sequence2batch.cu DEPS device_context tensor math_function)
nv_library(sequence_padding SRCS sequence_padding.cc sequence_padding.cu DEPS lod_tensor device_context)
nv_library(sequence_scale SRCS sequence_scale.cc sequence_scale.cu DEPS lod_tensor device_context)
nv_library(lstm_compute SRCS lstm_compute.cc lstm_compute.cu DEPS device_context activation_functions)
nv_library(maxouting SRCS maxouting.cc maxouting.cu DEPS device_context)
nv_library(unpooling SRCS unpooling.cc unpooling.cu DEPS device_context)
nv_library(gru_compute SRCS gru_compute.cc gru_compute.cu DEPS device_context activation_functions math_function)
nv_library(cos_sim_functor SRCS cos_sim_functor.cc cos_sim_functor.cu DEPS device_context)
else()
cc_library(math_function SRCS math_function.cc im2col.cc DEPS cblas device_context framework_proto)
cc_library(selected_rows_functor SRCS selected_rows_functor.cc DEPS selected_rows math_function)
cc_library(softmax SRCS softmax.cc DEPS device_context)
cc_library(cross_entropy SRCS cross_entropy.cc DEPS device_context)
cc_library(pooling SRCS pooling.cc DEPS device_context)
cc_library(sequence_pooling SRCS sequence_pooling.cc DEPS device_context math_function)
cc_library(vol2col SRCS vol2col.cc DEPS device_context tensor)
cc_library(context_project SRCS context_project.cc DEPS device_context math_function)
cc_library(sequence2batch SRCS sequence2batch.cc DEPS device_context tensor math_function)
cc_library(sequence_padding SRCS sequence_padding.cc DEPS lod_tensor device_context)
cc_library(sequence_scale SRCS sequence_scale.cc DEPS lod_tensor device_context)
cc_library(lstm_compute SRCS lstm_compute.cc DEPS device_context activation_functions)
cc_library(maxouting SRCS maxouting.cc DEPS device_context)
cc_library(unpooling SRCS unpooling.cc DEPS device_context)
cc_library(gru_compute SRCS gru_compute.cc DEPS device_context activation_functions math_function)
cc_library(cos_sim_functor SRCS cos_sim_functor.cc DEPS device_context)
endif()
function(math_library TARGET)
# math_library is a function to create math library.
# The interface is the same as cc_library.
# But it handle split GPU/CPU code and link some common library.
set(cc_srcs)
set(cu_srcs)
set(math_common_deps device_context framework_proto)
set(multiValueArgs DEPS)
cmake_parse_arguments(math_library "${options}" "${oneValueArgs}"
"${multiValueArgs}" ${ARGN})
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cc)
list(APPEND cc_srcs ${TARGET}.cc)
endif()
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu)
list(APPEND cu_srcs ${TARGET}.cu)
endif()
list(LENGTH cc_srcs cc_srcs_len)
if (WITH_GPU)
nv_library(${TARGET} SRCS ${cc_srcs} ${cu_srcs} DEPS ${math_library_DEPS} ${math_common_deps})
elseif(${cc_srcs_len} GREATER 0)
cc_library(${TARGET} SRCS ${cc_srcs} DEPS ${math_library_DEPS} ${math_common_deps})
endif()
endfunction()
cc_test(math_function_test SRCS math_function_test.cc DEPS math_function tensor)
# please add new math_library in alphabetical order
math_library(concat)
math_library(context_project DEPS im2col math_function)
math_library(cross_entropy)
math_library(cos_sim_functor)
math_library(depthwise_conv)
math_library(gru_compute DEPS activation_functions math_function)
math_library(im2col)
math_library(lstm_compute DEPS activation_functions)
math_library(math_function DEPS cblas)
math_library(maxouting)
math_library(pooling)
math_library(selected_rows_functor DEPS selected_rows)
math_library(sequence2batch)
math_library(sequence_padding)
math_library(sequence_pooling DEPS math_function)
math_library(sequence_scale)
math_library(softmax)
math_library(unpooling)
math_library(vol2col)
cc_test(math_function_test SRCS math_function_test.cc)
cc_test(selected_rows_functor_test SRCS selected_rows_functor_test.cc DEPS selected_rows_functor)
cc_test(im2col_test SRCS im2col_test.cc DEPS math_function tensor)
cc_test(vol2col_test SRCS vol2col_test.cc DEPS vol2col tensor)
cc_test(im2col_test SRCS im2col_test.cc DEPS im2col)
cc_test(vol2col_test SRCS vol2col_test.cc DEPS vol2col)
cc_test(sequence_padding_test SRCS sequence_padding_test.cc DEPS sequence_padding)
if(WITH_GPU)
nv_test(math_function_gpu_test SRCS math_function_test.cu)
nv_test(selected_rows_functor_gpu_test SRCS selected_rows_functor_test.cu DEPS selected_rows_functor)
endif()
cc_test(concat_test SRCS concat_test.cc DEPS concat)
/* Copyright (c) 2018 paddlepaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/math/concat.h"
namespace paddle {
namespace operators {
namespace math {
/*
* All tensors' dimension should be the same and the values of
* each dimension are the same, except the axis dimension.
*/
template <typename T>
class ConcatFunctor<platform::CPUDeviceContext, T> {
public:
void operator()(const platform::CPUDeviceContext& context,
const std::vector<framework::Tensor>& input, const int axis,
framework::Tensor* output) {
// TODO(zcd): Add input data validity checking
int num = input.size();
int rows = 1;
auto dim_0 = input[0].dims();
for (int i = 0; i < axis; ++i) {
rows *= dim_0[i];
}
int out_rows = rows, out_cols = 0;
std::vector<int64_t> input_cols(input.size());
for (int i = 0; i < num; ++i) {
int t_cols = input[i].numel() / rows;
out_cols += t_cols;
input_cols[i] = t_cols;
}
auto& cpu_place = boost::get<platform::CPUPlace>(context.GetPlace());
// computation
for (int k = 0; k < out_rows; ++k) {
T* dst_ptr = output->data<T>() + k * out_cols;
int col_idx = 0;
for (int j = 0; j < num; ++j) {
int col_len = input_cols[j];
const T* src_prt = input[j].data<T>() + k * col_len;
memory::Copy(cpu_place, dst_ptr + col_idx, cpu_place, src_prt,
sizeof(T) * col_len);
col_idx += col_len;
}
}
}
};
/*
* All tensors' dimension should be the same and the values of
* each dimension are the same, except the axis dimension.
*/
template <typename T>
class ConcatGradFunctor<platform::CPUDeviceContext, T> {
public:
void operator()(const platform::CPUDeviceContext& context,
const framework::Tensor& input, const int axis,
std::vector<framework::Tensor>& outputs) {
// TODO(zcd): Add input data validity checking
int num = outputs.size();
int input_rows = 1;
auto dim_0 = outputs[0].dims();
for (int i = 0; i < axis; ++i) {
input_rows *= dim_0[i];
}
int input_cols = 0;
std::vector<int64_t> output_cols(outputs.size());
for (int i = 0; i < num; ++i) {
int t_cols = outputs[i].numel() / input_rows;
input_cols += t_cols;
output_cols[i] = t_cols;
}
auto& cpu_place = boost::get<platform::CPUPlace>(context.GetPlace());
// computation
for (int k = 0; k < input_rows; ++k) {
const T* src_ptr = input.data<T>() + k * input_cols;
int col_idx = 0;
for (int j = 0; j < num; ++j) {
int col_len = output_cols[j];
T* dst_ptr = outputs[j].data<T>() + k * col_len;
memory::Copy(cpu_place, dst_ptr, cpu_place, src_ptr + col_idx,
sizeof(T) * col_len);
col_idx += col_len;
}
}
}
};
template class ConcatFunctor<platform::CPUDeviceContext, int>;
template class ConcatFunctor<platform::CPUDeviceContext, int64_t>;
template class ConcatFunctor<platform::CPUDeviceContext, float>;
template class ConcatFunctor<platform::CPUDeviceContext, double>;
template class ConcatGradFunctor<platform::CPUDeviceContext, int>;
template class ConcatGradFunctor<platform::CPUDeviceContext, int64_t>;
template class ConcatGradFunctor<platform::CPUDeviceContext, float>;
template class ConcatGradFunctor<platform::CPUDeviceContext, double>;
} // namespace math
} // namespace operators
} // namespace paddle
/* Copyright (c) 2018 paddlepaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/mixed_vector.h"
#include "paddle/fluid/operators/math/concat.h"
#include "paddle/fluid/platform/cuda_helper.h"
namespace paddle {
namespace operators {
namespace math {
template <typename T>
__device__ T upper_bound(const T* first, T count, T val) {
const T* orig = first;
const T* it = nullptr;
T step = 0;
while (count > 0) {
it = first;
step = count / 2;
it += step;
if (!(val < *it)) {
first = ++it;
count -= step + 1;
} else {
count = step;
}
}
return first - orig;
}
template <typename T>
__global__ void KernelConcat(T** inputs, const int* input_cols, int col_size,
const int output_rows, const int output_cols,
T* output) {
int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
int segment = upper_bound<int>(input_cols, col_size, tid_x) - 1;
int curr_offset = input_cols[segment];
int curr_segment = segment;
for (; tid_x < output_cols; tid_x += blockDim.x * gridDim.x) {
T curr_col_offset;
while ((curr_col_offset = input_cols[curr_segment + 1]) <= tid_x) {
curr_offset = curr_col_offset;
++curr_segment;
}
int local_col = tid_x - curr_offset;
int segment_width = curr_col_offset - curr_offset;
T* input_ptr = inputs[curr_segment];
int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
for (; tid_y < output_rows; tid_y += blockDim.y * gridDim.y)
output[tid_y * output_cols + tid_x] =
input_ptr[tid_y * segment_width + local_col];
}
}
template <typename T>
__global__ void KernelConcat(T** inputs, const int input_col,
const int output_rows, const int output_cols,
T* output) {
int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
double inv_input_col = 1.0 / input_col;
for (; tid_x < output_cols; tid_x += blockDim.x * gridDim.x) {
int split = tid_x * inv_input_col;
int in_offset = tid_x - split * input_col;
T* input_ptr = inputs[split];
int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
for (; tid_y < output_rows; tid_y += blockDim.y * gridDim.y) {
output[tid_y * output_cols + tid_x] =
input_ptr[tid_y * input_col + in_offset];
}
}
}
template <typename T>
__global__ void KernelConcatGrad(const T* input, const int input_row,
const int input_col, const int* output_cols,
int col_size, T** outputs) {
int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
int segment = upper_bound<int>(output_cols, col_size, tid_x) - 1;
int curr_offset = output_cols[segment];
int curr_segment = segment;
for (; tid_x < input_col; tid_x += blockDim.x * gridDim.x) {
T curr_col_offset;
while ((curr_col_offset = output_cols[curr_segment + 1]) <= tid_x) {
curr_offset = curr_col_offset;
++curr_segment;
}
int local_col = tid_x - curr_offset;
int segment_width = curr_col_offset - curr_offset;
T* output_ptr = outputs[curr_segment];
int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
for (; tid_y < input_row; tid_y += blockDim.y * gridDim.y)
output_ptr[tid_y * segment_width + local_col] =
input[tid_y * input_col + tid_x];
}
}
template <typename T>
__global__ void KernelConcatGrad(const T* input, const int input_row,
const int input_col, const int output_cols,
T** outputs) {
int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
double inv_input_col = 1.0 / input_col;
for (; tid_x < input_col; tid_x += blockDim.x * gridDim.x) {
int split = tid_x * inv_input_col;
int in_offset = tid_x - split * input_col;
T* output_ptr = outputs[split];
int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
for (; tid_y < input_row; tid_y += blockDim.y * gridDim.y)
output_ptr[tid_y * output_cols + in_offset] =
input[tid_y * input_col + tid_x];
}
}
/*
* All tensors' dimension should be the same and the values of
* each dimension are the same, except the axis dimension.
*/
template <typename T>
class ConcatFunctor<platform::CUDADeviceContext, T> {
public:
void operator()(const platform::CUDADeviceContext& context,
const std::vector<framework::Tensor>& input, const int axis,
framework::Tensor* output) {
// TODO(zcd): Add input data validity checking
int num = input.size();
int rows = 1;
auto dim_0 = input[0].dims();
for (int i = 0; i < axis; ++i) {
rows *= dim_0[i];
}
int cols = input[0].numel() / rows;
int out_rows = rows, out_cols = 0;
framework::Vector<int16_t> inputs_data(num * sizeof(T*) / 2);
framework::Vector<int> inputs_cols(num + 1);
inputs_cols[0] = 0;
T** inputs_ptr = reinterpret_cast<T**>(inputs_data.data());
bool sameShape = true;
for (int i = 0; i < num; ++i) {
int t_cols = input[i].numel() / rows;
if (sameShape) {
if (t_cols != cols) sameShape = false;
}
out_cols += t_cols;
inputs_cols[i + 1] = out_cols;
inputs_ptr[i] = const_cast<T*>(input[i].data<T>());
}
T** ins_gpu =
reinterpret_cast<T**>(inputs_data.CUDAMutableData(context.GetPlace()));
const int* ins_col_gpu = inputs_cols.CUDAData(context.GetPlace());
// computation
// set the thread block and grid according to CurrentDeviceId
const int kThreadsPerBlock = 1024;
int block_cols = kThreadsPerBlock;
if (out_cols < kThreadsPerBlock) { // block_cols is aligned by 32.
block_cols = ((out_cols + 31) >> 5) << 5;
}
int block_rows = kThreadsPerBlock / block_cols;
dim3 block_size = dim3(block_cols, block_rows, 1);
int max_threads = context.GetMaxPhysicalThreadCount();
int max_blocks = std::max(max_threads / kThreadsPerBlock, 1);
int grid_cols =
std::min((out_cols + block_cols - 1) / block_cols, max_blocks);
int grid_rows =
std::min(max_blocks / grid_cols, std::max(out_rows / block_rows, 1));
dim3 grid_size = dim3(grid_cols, grid_rows, 1);
if (sameShape) {
KernelConcat<<<grid_size, block_size, 0, context.stream()>>>(
ins_gpu, cols, out_rows, out_cols, output->data<T>());
} else {
KernelConcat<<<grid_size, block_size, 0, context.stream()>>>(
ins_gpu, ins_col_gpu, static_cast<int>(inputs_cols.size()), out_rows,
out_cols, output->data<T>());
}
}
};
/*
* All tensors' dimension should be the same and the values of
* each dimension are the same, except the axis dimension.
*/
template <typename T>
class ConcatGradFunctor<platform::CUDADeviceContext, T> {
public:
void operator()(const platform::CUDADeviceContext& context,
const framework::Tensor& input, const int axis,
std::vector<framework::Tensor>& outputs) {
// TODO(zcd): Add input data validity checking
int num = outputs.size();
int input_row = 1;
auto dim_0 = outputs[0].dims();
for (int i = 0; i < axis; ++i) {
input_row *= dim_0[i];
}
int output_col_0 = outputs[0].numel() / input_row;
int input_col = 0;
bool sameShape = true;
framework::Vector<int16_t> outputs_data(num * sizeof(T*) / 2);
framework::Vector<int> outputs_cols(num + 1);
outputs_cols[0] = 0;
T** outputs_ptr = reinterpret_cast<T**>(outputs_data.data());
for (int i = 0; i < num; ++i) {
int t_col = outputs[i].numel() / input_row;
if (sameShape) {
if (t_col != output_col_0) sameShape = false;
}
input_col += t_col;
outputs_cols[i + 1] = input_col;
outputs_ptr[i] = outputs[i].data<T>();
}
T** outs_gpu =
reinterpret_cast<T**>(outputs_data.CUDAMutableData(context.GetPlace()));
const int* outs_col_gpu = outputs_cols.CUDAData(context.GetPlace());
// computation
const int kThreadsPerBlock = 1024;
int block_cols = kThreadsPerBlock;
if (input_col < kThreadsPerBlock) { // block_cols is aligned by 32.
block_cols = ((input_col + 31) >> 5) << 5;
}
int block_rows = kThreadsPerBlock / block_cols;
dim3 block_size = dim3(block_cols, block_rows, 1);
int max_threads = context.GetMaxPhysicalThreadCount();
int max_blocks = std::max(max_threads / kThreadsPerBlock, 1);
int grid_cols =
std::min((input_col + block_cols - 1) / block_cols, max_blocks);
int grid_rows =
std::min(max_blocks / grid_cols, std::max(input_row / block_rows, 1));
dim3 grid_size = dim3(grid_cols, grid_rows, 1);
if (sameShape) {
KernelConcatGrad<<<grid_size, block_size, 0, context.stream()>>>(
input.data<T>(), input_row, input_col, output_col_0, outs_gpu);
} else {
KernelConcatGrad<<<grid_size, block_size, 0, context.stream()>>>(
input.data<T>(), input_row, input_col, outs_col_gpu,
static_cast<int>(outputs_cols.size()), outs_gpu);
}
}
};
template class ConcatFunctor<platform::CUDADeviceContext, int>;
template class ConcatFunctor<platform::CUDADeviceContext, int64_t>;
template class ConcatFunctor<platform::CUDADeviceContext, float>;
template class ConcatFunctor<platform::CUDADeviceContext, double>;
template class ConcatGradFunctor<platform::CUDADeviceContext, int>;
template class ConcatGradFunctor<platform::CUDADeviceContext, int64_t>;
template class ConcatGradFunctor<platform::CUDADeviceContext, float>;
template class ConcatGradFunctor<platform::CUDADeviceContext, double>;
} // namespace math
} // namespace operators
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/framework/tensor.h"
namespace paddle {
namespace operators {
namespace math {
/*
* \brief Concatenate the input tensors along the dimension axis.
* TODO(zcd): maybe it needs to be more detailed.
* Examples:
* Input[0] = [[1,2],[3,4]]
* Input[1] = [[5,6]]
* axis = 0
*
* Output = [[1,2],
* [3,4],
* [5,6]]
*/
template <typename DeviceContext, typename T>
class ConcatFunctor {
public:
void operator()(const DeviceContext& context,
const std::vector<framework::Tensor>& input, const int axis,
framework::Tensor* output);
};
/*
* \brief Split the input tensors along the dimension axis into outputs.
* TODO(zcd): maybe it needs to be more detailed.
* Examples:
* Input = [[1,2],
* [3,4],
* [5,6]]
* axis = 0
*
* Output[0] = [[1,2],[3,4]]
* Output[1] = [[5,6]]
*/
template <typename DeviceContext, typename T>
class ConcatGradFunctor {
public:
void operator()(const DeviceContext& context, const framework::Tensor& input,
const int axis, std::vector<framework::Tensor>& outputs);
};
} // namespace math
} // namespace operators
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/math/concat.h"
#include <gtest/gtest.h>
#include <vector>
#include "paddle/fluid/framework/tensor_util.h"
using namespace paddle::framework;
using namespace paddle::platform;
template <typename DeviceContext, typename Place>
void testConcat() {
Tensor input_a_cpu;
Tensor input_b_cpu;
Tensor out_cpu;
Tensor input_a;
Tensor input_b;
Tensor out;
DeviceContext* context = new DeviceContext(Place());
// DeviceContext context(Place());
/**
* cast1:
* inputs:
* t_a.shape: [2, 3, 4]
* t_b.shape: [3, 3, 4]
* output:
* out.shape: [5, 3, 4]
*/
auto dim_a = make_ddim({2, 3, 4});
auto dim_b = make_ddim({3, 3, 4});
auto dim_out = make_ddim({5, 3, 4});
input_a.mutable_data<int>(dim_a, Place());
input_b.mutable_data<int>(dim_b, Place());
out.mutable_data<int>(dim_out, Place());
if (paddle::platform::is_gpu_place(Place())) {
input_a_cpu.mutable_data<int>(dim_a, CPUPlace());
input_b_cpu.mutable_data<int>(dim_b, CPUPlace());
out_cpu.mutable_data<int>(dim_out, CPUPlace());
}
int* a_ptr;
int* b_ptr;
if (paddle::platform::is_gpu_place(Place())) {
a_ptr = input_a_cpu.data<int>();
b_ptr = input_b_cpu.data<int>();
} else {
a_ptr = input_a.data<int>();
b_ptr = input_b.data<int>();
}
for (int i = 0; i < 2 * 3 * 4; ++i) {
a_ptr[i] = i;
}
for (int i = 0; i < 3 * 3 * 4; ++i) {
b_ptr[i] = i;
}
if (paddle::platform::is_gpu_place(Place())) {
TensorCopy(input_a_cpu, Place(), *context, &input_a);
TensorCopy(input_b_cpu, Place(), *context, &input_b);
}
std::vector<Tensor> input;
input.push_back(input_a);
input.push_back(input_b);
paddle::operators::math::ConcatFunctor<DeviceContext, int> concat_functor;
concat_functor(*context, input, 0, &out);
// check the dim of input_a, input_b
PADDLE_ENFORCE_EQ(input_a.dims(), dim_a);
PADDLE_ENFORCE_EQ(input_b.dims(), dim_b);
int* out_ptr;
if (paddle::platform::is_gpu_place(Place())) {
TensorCopy(out, CPUPlace(), *context, &out_cpu);
out_ptr = out_cpu.data<int>();
} else {
out_ptr = out.data<int>();
}
int cols = 2 * 3 * 4;
int idx_a = 0, idx_b = 0;
for (int j = 0; j < 5 * 3 * 4; ++j) {
if (j >= cols) {
PADDLE_ENFORCE_EQ(out_ptr[j], b_ptr[idx_b]);
++idx_b;
} else {
PADDLE_ENFORCE_EQ(out_ptr[j], a_ptr[idx_a]);
++idx_a;
}
}
//
/**
* cast2:
* inputs:
* t_a.shape: [2, 3, 4]
* t_b.shape: [2, 4, 4]
* output:
* out.shape: [2, 7, 4]
*/
dim_a = make_ddim({2, 3, 4});
dim_b = make_ddim({2, 4, 4});
dim_out = make_ddim({2, 7, 4});
input_a.Resize(dim_a);
input_b.Resize(dim_b);
out.Resize(dim_out);
if (paddle::platform::is_gpu_place(Place())) {
input_a_cpu.Resize(dim_a);
input_b_cpu.Resize(dim_b);
out_cpu.Resize(dim_out);
}
if (paddle::platform::is_gpu_place(Place())) {
a_ptr = input_a_cpu.data<int>();
b_ptr = input_b_cpu.data<int>();
} else {
a_ptr = input_a.data<int>();
b_ptr = input_b.data<int>();
}
for (int i = 0; i < 2 * 3 * 4; ++i) {
a_ptr[i] = i;
}
for (int i = 0; i < 2 * 4 * 4; ++i) {
b_ptr[i] = i;
}
if (paddle::platform::is_gpu_place(Place())) {
TensorCopy(input_a_cpu, Place(), *context, &input_a);
TensorCopy(input_b_cpu, Place(), *context, &input_b);
}
input.clear();
input.push_back(input_a);
input.push_back(input_b);
concat_functor(*context, input, 1, &out);
// check the dim of input_a, input_b
PADDLE_ENFORCE_EQ(input_a.dims(), dim_a);
PADDLE_ENFORCE_EQ(input_b.dims(), dim_b);
if (paddle::platform::is_gpu_place(Place())) {
TensorCopy(out, CPUPlace(), *context, &out_cpu);
out_ptr = out_cpu.data<int>();
} else {
out_ptr = out.data<int>();
}
cols = 3 * 4;
idx_a = 0, idx_b = 0;
for (int i = 0; i < 2; ++i) {
for (int j = 0; j < 28; ++j) {
if (j >= cols) {
PADDLE_ENFORCE_EQ(out_ptr[i * 28 + j], b_ptr[idx_b]);
++idx_b;
} else {
PADDLE_ENFORCE_EQ(out_ptr[i * 28 + j], a_ptr[idx_a]);
++idx_a;
}
}
}
/**
* cast3:
* inputs:
* t_a.shape: [2, 3, 5]
* t_b.shape: [2, 3, 4]
* output:
* out.shape: [2, 3, 9]
*/
dim_a = make_ddim({2, 3, 4});
dim_b = make_ddim({2, 3, 5});
dim_out = make_ddim({2, 3, 9});
input_a.Resize(dim_a);
input_b.Resize(dim_b);
out.Resize(dim_out);
if (paddle::platform::is_gpu_place(Place())) {
input_a_cpu.Resize(dim_a);
input_b_cpu.Resize(dim_b);
out_cpu.Resize(dim_out);
}
if (paddle::platform::is_gpu_place(Place())) {
a_ptr = input_a_cpu.data<int>();
b_ptr = input_b_cpu.data<int>();
} else {
a_ptr = input_a.data<int>();
b_ptr = input_b.data<int>();
}
for (int i = 0; i < 2 * 3 * 4; ++i) {
a_ptr[i] = i;
}
for (int i = 0; i < 2 * 3 * 5; ++i) {
b_ptr[i] = i;
}
if (paddle::platform::is_gpu_place(Place())) {
TensorCopy(input_a_cpu, Place(), *context, &input_a);
TensorCopy(input_b_cpu, Place(), *context, &input_b);
}
input.clear();
input.push_back(input_a);
input.push_back(input_b);
concat_functor(*context, input, 2, &out);
// check the dim of input_a, input_b
PADDLE_ENFORCE_EQ(input_a.dims(), dim_a);
PADDLE_ENFORCE_EQ(input_b.dims(), dim_b);
if (paddle::platform::is_gpu_place(Place())) {
TensorCopy(out, CPUPlace(), *context, &out_cpu);
out_ptr = out_cpu.data<int>();
} else {
out_ptr = out.data<int>();
}
// check the data
cols = 4;
idx_a = 0, idx_b = 0;
for (int i = 0; i < 6; ++i) {
for (int j = 0; j < 9; ++j) {
if (j >= cols) {
PADDLE_ENFORCE_EQ(out_ptr[i * 9 + j], b_ptr[idx_b]);
++idx_b;
} else {
PADDLE_ENFORCE_EQ(out_ptr[i * 9 + j], a_ptr[idx_a]);
++idx_a;
}
}
}
/**
* cast4:
* inputs:
* axis = 1
* t_a.shape: [2, 3, 4]
* t_b.shape: [2, 3, 4]
* output:
* out.shape: [2, 6, 4]
*/
dim_a = make_ddim({2, 3, 4});
dim_b = make_ddim({2, 3, 4});
dim_out = make_ddim({2, 6, 4});
input_a.Resize(dim_a);
input_b.Resize(dim_b);
out.Resize(dim_out);
if (paddle::platform::is_gpu_place(Place())) {
input_a_cpu.Resize(dim_a);
input_b_cpu.Resize(dim_b);
out_cpu.Resize(dim_out);
}
if (paddle::platform::is_gpu_place(Place())) {
a_ptr = input_a_cpu.data<int>();
b_ptr = input_b_cpu.data<int>();
} else {
a_ptr = input_a.data<int>();
b_ptr = input_b.data<int>();
}
for (int i = 0; i < 2 * 3 * 4; ++i) {
a_ptr[i] = i;
}
for (int i = 0; i < 2 * 3 * 4; ++i) {
b_ptr[i] = i;
}
if (paddle::platform::is_gpu_place(Place())) {
TensorCopy(input_a_cpu, Place(), *context, &input_a);
TensorCopy(input_b_cpu, Place(), *context, &input_b);
}
input.clear();
input.push_back(input_a);
input.push_back(input_b);
concat_functor(*context, input, 1, &out);
// check the dim of input_a, input_b
PADDLE_ENFORCE_EQ(input_a.dims(), dim_a);
PADDLE_ENFORCE_EQ(input_b.dims(), dim_b);
if (paddle::platform::is_gpu_place(Place())) {
TensorCopy(out, CPUPlace(), *context, &out_cpu);
out_ptr = out_cpu.data<int>();
} else {
out_ptr = out.data<int>();
}
// check the data
cols = 12;
idx_a = 0, idx_b = 0;
for (int i = 0; i < 2; ++i) {
for (int j = 0; j < 24; ++j) {
if (j >= cols) {
PADDLE_ENFORCE_EQ(out_ptr[i * 24 + j], b_ptr[idx_b]);
++idx_b;
} else {
PADDLE_ENFORCE_EQ(out_ptr[i * 24 + j], a_ptr[idx_a]);
++idx_a;
}
}
}
}
TEST(math, concat) {
testConcat<paddle::platform::CPUDeviceContext, paddle::platform::CPUPlace>();
#ifdef PADDLE_WITH_CUDA
testConcat<paddle::platform::CUDADeviceContext,
paddle::platform::CUDAPlace>();
#endif
}
......@@ -15,11 +15,23 @@ limitations under the License. */
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/operators/math/math_function_impl.h"
#include "paddle/fluid/platform/float16.h"
namespace paddle {
namespace operators {
namespace math {
using float16 = paddle::platform::float16;
template <>
void gemm<platform::CPUDeviceContext, float16>(
const platform::CPUDeviceContext& context, const CBLAS_TRANSPOSE transA,
const CBLAS_TRANSPOSE transB, const int M, const int N, const int K,
const float16 alpha, const float16* A, const float16* B, const float16 beta,
float16* C) {
PADDLE_THROW("float16 GEMM not supported on CPU");
}
template <>
void gemm<platform::CPUDeviceContext, float>(
const platform::CPUDeviceContext& context, const CBLAS_TRANSPOSE transA,
......@@ -46,6 +58,15 @@ void gemm<platform::CPUDeviceContext, double>(
beta, C, ldc);
}
template <>
void gemm<platform::CPUDeviceContext, float16>(
const platform::CPUDeviceContext& context, const bool transA,
const bool transB, const int M, const int N, const int K,
const float16 alpha, const float16* A, const int lda, const float16* B,
const int ldb, const float16 beta, float16* C, const int ldc) {
PADDLE_THROW("float16 GEMM not supported on CPU");
}
template <>
void gemm<platform::CPUDeviceContext, float>(
const platform::CPUDeviceContext& context, const bool transA,
......@@ -68,6 +89,15 @@ void gemm<platform::CPUDeviceContext, double>(
lda, B, ldb, beta, C, ldc);
}
template <>
void matmul<platform::CPUDeviceContext, float16>(
const platform::CPUDeviceContext& context,
const framework::Tensor& matrix_a, bool trans_a,
const framework::Tensor& matrix_b, bool trans_b, float16 alpha,
framework::Tensor* matrix_out, float16 beta) {
PADDLE_THROW("float16 matmul not supported on CPU");
}
template <>
void matmul<platform::CPUDeviceContext, float>(
const platform::CPUDeviceContext& context,
......@@ -126,6 +156,15 @@ void matmul<platform::CPUDeviceContext, double>(
matrix_b.data<double>(), beta, matrix_out->data<double>());
}
template <>
void batched_gemm<platform::CPUDeviceContext, float16>(
const platform::CPUDeviceContext& context, const CBLAS_TRANSPOSE transA,
const CBLAS_TRANSPOSE transB, const int M, const int N, const int K,
const float16 alpha, const float16* A, const float16* B, const float16 beta,
float16* C, const int batchCount, const int strideA, const int strideB) {
PADDLE_THROW("float16 batched_gemm not supported on CPU");
}
#ifdef PADDLE_WITH_MKLML
// Use cblas_{s,d}gemm_batched if available: Run with 1 group of size batchSize.
template <>
......@@ -246,6 +285,8 @@ template struct SetConstant<platform::CPUDeviceContext, int64_t>;
template struct SetConstant<platform::CPUDeviceContext, bool>;
#define DEFINE_CPU_TRANS(RANK) \
template struct Transpose<platform::CPUDeviceContext, platform::float16, \
RANK>; \
template struct Transpose<platform::CPUDeviceContext, float, RANK>; \
template struct Transpose<platform::CPUDeviceContext, double, RANK>; \
template struct Transpose<platform::CPUDeviceContext, int, RANK>; \
......
......@@ -16,11 +16,40 @@ limitations under the License. */
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/math_function_impl.h"
#include "paddle/fluid/platform/float16.h"
namespace paddle {
namespace operators {
namespace math {
using float16 = paddle::platform::float16;
template <>
void gemm<platform::CUDADeviceContext, float16>(
const platform::CUDADeviceContext& context, const CBLAS_TRANSPOSE transA,
const CBLAS_TRANSPOSE transB, const int M, const int N, const int K,
const float16 alpha, const float16* A, const float16* B, const float16 beta,
float16* C) {
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
int lda = (transA == CblasNoTrans) ? K : M;
int ldb = (transB == CblasNoTrans) ? N : K;
cublasOperation_t cuTransA =
(transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
cublasOperation_t cuTransB =
(transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
const half h_alpha = static_cast<const half>(alpha);
const half h_beta = static_cast<const half>(beta);
const half* h_A = reinterpret_cast<const half*>(A);
const half* h_B = reinterpret_cast<const half*>(B);
half* h_C = reinterpret_cast<half*>(C);
PADDLE_ENFORCE(platform::dynload::cublasHgemm(
context.cublas_handle(), cuTransB, cuTransA, N, M, K, &h_alpha, h_B, ldb,
h_A, lda, &h_beta, h_C, N));
}
template <>
void gemm<platform::CUDADeviceContext, float>(
const platform::CUDADeviceContext& context, const CBLAS_TRANSPOSE transA,
......@@ -60,6 +89,28 @@ void gemm<platform::CUDADeviceContext, double>(
lda, &beta, C, N));
}
template <>
void gemm<platform::CUDADeviceContext, float16>(
const platform::CUDADeviceContext& context, const bool transA,
const bool transB, const int M, const int N, const int K,
const float16 alpha, const float16* A, const int lda, const float16* B,
const int ldb, const float16 beta, float16* C, const int ldc) {
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
cublasOperation_t cuTransA = transA == false ? CUBLAS_OP_N : CUBLAS_OP_T;
cublasOperation_t cuTransB = transB == false ? CUBLAS_OP_N : CUBLAS_OP_T;
const half h_alpha = static_cast<const half>(alpha);
const half h_beta = static_cast<const half>(beta);
const half* h_A = reinterpret_cast<const half*>(A);
const half* h_B = reinterpret_cast<const half*>(B);
half* h_C = reinterpret_cast<half*>(C);
PADDLE_ENFORCE(platform::dynload::cublasHgemm(
context.cublas_handle(), cuTransB, cuTransA, N, M, K, &h_alpha, h_B, ldb,
h_A, lda, &h_beta, h_C, ldc));
}
template <>
void gemm<platform::CUDADeviceContext, float>(
const platform::CUDADeviceContext& context, const bool transA,
......@@ -90,6 +141,35 @@ void gemm<platform::CUDADeviceContext, double>(
lda, &beta, C, ldc));
}
template <>
void matmul<platform::CUDADeviceContext, float16>(
const platform::CUDADeviceContext& context,
const framework::Tensor& matrix_a, bool trans_a,
const framework::Tensor& matrix_b, bool trans_b, float16 alpha,
framework::Tensor* matrix_out, float16 beta) {
auto dim_a = matrix_a.dims();
auto dim_b = matrix_b.dims();
auto dim_out = matrix_out->dims();
PADDLE_ENFORCE(dim_a.size() == 2 && dim_b.size() == 2 && dim_out.size() == 2,
"The input and output of matmul be matrix");
PADDLE_ENFORCE(platform::is_gpu_place(matrix_a.place()) &&
platform::is_gpu_place(matrix_b.place()) &&
platform::is_gpu_place(matrix_out->place()),
"Matrix must all be in CUDAPlace");
int M = dim_out[0];
int N = dim_out[1];
int K = (trans_a == false) ? dim_a[1] : dim_a[0];
CBLAS_TRANSPOSE transA = (trans_a == false) ? CblasNoTrans : CblasTrans;
CBLAS_TRANSPOSE transB = (trans_b == false) ? CblasNoTrans : CblasTrans;
gemm<platform::CUDADeviceContext, float16>(
context, transA, transB, M, N, K, alpha, matrix_a.data<float16>(),
matrix_b.data<float16>(), beta, matrix_out->data<float16>());
}
template <>
void matmul<platform::CUDADeviceContext, float>(
const platform::CUDADeviceContext& context,
......@@ -148,6 +228,34 @@ void matmul<platform::CUDADeviceContext, double>(
matrix_b.data<double>(), beta, matrix_out->data<double>());
}
template <>
void batched_gemm<platform::CUDADeviceContext, float16>(
const platform::CUDADeviceContext& context, const CBLAS_TRANSPOSE transA,
const CBLAS_TRANSPOSE transB, const int M, const int N, const int K,
const float16 alpha, const float16* A, const float16* B, const float16 beta,
float16* C, const int batchCount, const int strideA, const int strideB) {
// Note that cublas follows fortran order, so the order is different from
// the cblas convention.
int lda = (transA == CblasNoTrans) ? K : M;
int ldb = (transB == CblasNoTrans) ? N : K;
int ldc = N;
cublasOperation_t cuTransA =
(transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
cublasOperation_t cuTransB =
(transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
const int strideC = M * N;
const half h_alpha = static_cast<const half>(alpha);
const half h_beta = static_cast<const half>(beta);
const half* h_A = reinterpret_cast<const half*>(A);
const half* h_B = reinterpret_cast<const half*>(B);
half* h_C = reinterpret_cast<half*>(C);
PADDLE_ENFORCE(platform::dynload::cublasHgemmStridedBatched(
context.cublas_handle(), cuTransB, cuTransA, N, M, K, &h_alpha, h_B, ldb,
strideB, h_A, lda, strideA, &h_beta, h_C, ldc, strideC, batchCount));
}
template <>
void batched_gemm<platform::CUDADeviceContext, float>(
const platform::CUDADeviceContext& context, const CBLAS_TRANSPOSE transA,
......
......@@ -14,30 +14,41 @@
#include "gtest/gtest.h"
#include "paddle/fluid/operators/math/math_function.h"
TEST(math_function, notrans_mul_trans) {
paddle::framework::Tensor input1;
paddle::framework::Tensor input1_gpu;
paddle::framework::Tensor input2_gpu;
paddle::framework::Tensor out_gpu;
paddle::framework::Tensor out;
auto* cpu_place = new paddle::platform::CPUPlace();
float* input1_ptr = input1.mutable_data<float>({2, 3}, *cpu_place);
void fill_fp16_data(paddle::platform::float16* in_ptr, size_t size,
const std::vector<float>& data) {
PADDLE_ENFORCE_EQ(size, data.size());
for (size_t i = 0; i < data.size(); ++i) {
in_ptr[i] = paddle::platform::float16(data[i]);
}
}
TEST(math_function, notrans_mul_trans_fp32) {
using namespace paddle::framework;
using namespace paddle::platform;
Tensor input1;
Tensor input1_gpu;
Tensor input2_gpu;
Tensor out_gpu;
Tensor out;
CPUPlace cpu_place;
CUDAPlace gpu_place(0);
CUDADeviceContext context(gpu_place);
float* input1_ptr = input1.mutable_data<float>({2, 3}, cpu_place);
float arr[6] = {0, 1, 2, 3, 4, 5};
memcpy(input1_ptr, arr, 6 * sizeof(float));
auto* gpu_place = new paddle::platform::CUDAPlace(0);
paddle::platform::CUDADeviceContext context(*gpu_place);
paddle::framework::TensorCopy(input1, *gpu_place, context, &input1_gpu);
paddle::framework::TensorCopy(input1, *gpu_place, context, &input2_gpu);
TensorCopy(input1, gpu_place, context, &input1_gpu);
TensorCopy(input1, gpu_place, context, &input2_gpu);
out_gpu.mutable_data<float>({2, 2}, *gpu_place);
out_gpu.mutable_data<float>({2, 2}, gpu_place);
paddle::operators::math::matmul<paddle::platform::CUDADeviceContext, float>(
paddle::operators::math::matmul<CUDADeviceContext, float>(
context, input1_gpu, false, input2_gpu, true, 1, &out_gpu, 0);
paddle::framework::TensorCopy(out_gpu, *cpu_place, context, &out);
TensorCopy(out_gpu, cpu_place, context, &out);
float* out_ptr = out.data<float>();
context.Wait();
......@@ -45,33 +56,71 @@ TEST(math_function, notrans_mul_trans) {
EXPECT_EQ(out_ptr[1], 14);
EXPECT_EQ(out_ptr[2], 14);
EXPECT_EQ(out_ptr[3], 50);
delete gpu_place;
}
TEST(math_function, trans_mul_notrans) {
paddle::framework::Tensor input1;
paddle::framework::Tensor input1_gpu;
paddle::framework::Tensor input2_gpu;
paddle::framework::Tensor out_gpu;
paddle::framework::Tensor out;
TEST(math_function, notrans_mul_trans_fp16) {
using namespace paddle::framework;
using namespace paddle::platform;
Tensor input1;
Tensor input1_gpu;
Tensor input2_gpu;
Tensor out_gpu;
Tensor out;
CPUPlace cpu_place;
CUDAPlace gpu_place(0);
CUDADeviceContext context(gpu_place);
float16* input1_ptr = input1.mutable_data<float16>({2, 3}, cpu_place);
fill_fp16_data(input1_ptr, input1.numel(), {0, 1, 2, 3, 4, 5});
TensorCopy(input1, gpu_place, context, &input1_gpu);
TensorCopy(input1, gpu_place, context, &input2_gpu);
out_gpu.mutable_data<float16>({2, 2}, gpu_place);
paddle::operators::math::matmul<CUDADeviceContext, float16>(
context, input1_gpu, false, input2_gpu, true, float16(1), &out_gpu,
float16(0));
TensorCopy(out_gpu, cpu_place, context, &out);
float16* out_ptr = out.data<float16>();
context.Wait();
EXPECT_EQ(static_cast<float>(out_ptr[0]), 5);
EXPECT_EQ(static_cast<float>(out_ptr[1]), 14);
EXPECT_EQ(static_cast<float>(out_ptr[2]), 14);
EXPECT_EQ(static_cast<float>(out_ptr[3]), 50);
}
TEST(math_function, trans_mul_notrans_fp32) {
using namespace paddle::framework;
using namespace paddle::platform;
Tensor input1;
Tensor input1_gpu;
Tensor input2_gpu;
Tensor out_gpu;
Tensor out;
CPUPlace cpu_place;
CUDAPlace gpu_place(0);
CUDADeviceContext context(gpu_place);
auto* cpu_place = new paddle::platform::CPUPlace();
float* input1_ptr = input1.mutable_data<float>({2, 3}, *cpu_place);
float* input1_ptr = input1.mutable_data<float>({2, 3}, cpu_place);
float arr[6] = {0, 1, 2, 3, 4, 5};
memcpy(input1_ptr, arr, 6 * sizeof(float));
auto* gpu_place = new paddle::platform::CUDAPlace(0);
paddle::platform::CUDADeviceContext context(*gpu_place);
TensorCopy(input1, gpu_place, context, &input1_gpu);
TensorCopy(input1, gpu_place, context, &input2_gpu);
paddle::framework::TensorCopy(input1, *gpu_place, context, &input1_gpu);
paddle::framework::TensorCopy(input1, *gpu_place, context, &input2_gpu);
out_gpu.mutable_data<float>({3, 3}, *gpu_place);
out_gpu.mutable_data<float>({3, 3}, gpu_place);
paddle::operators::math::matmul<paddle::platform::CUDADeviceContext, float>(
context, input1_gpu, true, input2_gpu, false, 1, &out_gpu, 0);
paddle::framework::TensorCopy(out_gpu, *cpu_place, context, &out);
TensorCopy(out_gpu, cpu_place, context, &out);
float* out_ptr = out.data<float>();
context.Wait();
......@@ -84,45 +133,88 @@ TEST(math_function, trans_mul_notrans) {
EXPECT_EQ(out_ptr[6], 15);
EXPECT_EQ(out_ptr[7], 22);
EXPECT_EQ(out_ptr[8], 29);
delete gpu_place;
}
TEST(math_function, gemm_notrans_cublas) {
paddle::framework::Tensor input1;
paddle::framework::Tensor input2;
paddle::framework::Tensor input3;
paddle::framework::Tensor input1_gpu;
paddle::framework::Tensor input2_gpu;
paddle::framework::Tensor input3_gpu;
TEST(math_function, trans_mul_notrans_fp16) {
using namespace paddle::framework;
using namespace paddle::platform;
Tensor input1;
Tensor input1_gpu;
Tensor input2_gpu;
Tensor out_gpu;
Tensor out;
CPUPlace cpu_place;
CUDAPlace gpu_place(0);
CUDADeviceContext context(gpu_place);
float16* input1_ptr = input1.mutable_data<float16>({2, 3}, cpu_place);
fill_fp16_data(input1_ptr, input1.numel(), {0, 1, 2, 3, 4, 5});
TensorCopy(input1, gpu_place, context, &input1_gpu);
TensorCopy(input1, gpu_place, context, &input2_gpu);
out_gpu.mutable_data<float16>({3, 3}, gpu_place);
paddle::operators::math::matmul<paddle::platform::CUDADeviceContext, float16>(
context, input1_gpu, true, input2_gpu, false, float16(1), &out_gpu,
float16(0));
TensorCopy(out_gpu, cpu_place, context, &out);
float16* out_ptr = out.data<float16>();
context.Wait();
EXPECT_EQ(static_cast<float>(out_ptr[0]), 9);
EXPECT_EQ(static_cast<float>(out_ptr[1]), 12);
EXPECT_EQ(static_cast<float>(out_ptr[2]), 15);
EXPECT_EQ(static_cast<float>(out_ptr[3]), 12);
EXPECT_EQ(static_cast<float>(out_ptr[4]), 17);
EXPECT_EQ(static_cast<float>(out_ptr[5]), 22);
EXPECT_EQ(static_cast<float>(out_ptr[6]), 15);
EXPECT_EQ(static_cast<float>(out_ptr[7]), 22);
EXPECT_EQ(static_cast<float>(out_ptr[8]), 29);
}
TEST(math_function, gemm_notrans_cublas_fp32) {
using namespace paddle::framework;
using namespace paddle::platform;
Tensor input1;
Tensor input2;
Tensor input3;
Tensor input1_gpu;
Tensor input2_gpu;
Tensor input3_gpu;
CPUPlace cpu_place;
CUDAPlace gpu_place(0);
CUDADeviceContext context(gpu_place);
int m = 2;
int n = 3;
int k = 3;
auto* cpu_place = new paddle::platform::CPUPlace();
float* input1_ptr = input1.mutable_data<float>({2, 3}, *cpu_place);
float* input1_ptr = input1.mutable_data<float>({2, 3}, cpu_place);
float arr1[6] = {0, 1, 2, 3, 4, 5};
memcpy(input1_ptr, arr1, 6 * sizeof(float));
float* input2_ptr = input2.mutable_data<float>({3, 4}, *cpu_place);
float* input2_ptr = input2.mutable_data<float>({3, 4}, cpu_place);
float arr2[12] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
memcpy(input2_ptr, arr2, 12 * sizeof(float));
float* input3_ptr = input3.mutable_data<float>({2, 4}, *cpu_place);
float* input3_ptr = input3.mutable_data<float>({2, 4}, cpu_place);
float arr3[8] = {0, 1, 2, 3, 4, 5, 6, 7};
memcpy(input3_ptr, arr3, 8 * sizeof(float));
auto* gpu_place = new paddle::platform::CUDAPlace(0);
paddle::platform::CUDADeviceContext context(*gpu_place);
paddle::framework::TensorCopy(input1, *gpu_place, context, &input1_gpu);
paddle::framework::TensorCopy(input2, *gpu_place, context, &input2_gpu);
paddle::framework::TensorCopy(input3, *gpu_place, context, &input3_gpu);
TensorCopy(input1, gpu_place, context, &input1_gpu);
TensorCopy(input2, gpu_place, context, &input2_gpu);
TensorCopy(input3, gpu_place, context, &input3_gpu);
float* a = input1_gpu.data<float>();
float* b = input2_gpu.data<float>();
float* c = input3_gpu.mutable_data<float>(*gpu_place);
float* c = input3_gpu.mutable_data<float>(gpu_place);
paddle::operators::math::gemm<paddle::platform::CUDADeviceContext, float>(
context, false, false, m, n, k, 1, a, 3, b + 1, 4, 1, c + 1, 4);
paddle::framework::TensorCopy(input3_gpu, *cpu_place, context, &input3);
TensorCopy(input3_gpu, cpu_place, context, &input3);
// numpy code:
// a = np.arange(6).reshape(2, 3)
......@@ -139,47 +231,105 @@ TEST(math_function, gemm_notrans_cublas) {
EXPECT_EQ(input3_ptr[5], 73);
EXPECT_EQ(input3_ptr[6], 86);
EXPECT_EQ(input3_ptr[7], 99);
delete gpu_place;
}
TEST(math_function, gemm_trans_cublas) {
paddle::framework::Tensor input1;
paddle::framework::Tensor input2;
paddle::framework::Tensor input3;
paddle::framework::Tensor input1_gpu;
paddle::framework::Tensor input2_gpu;
paddle::framework::Tensor input3_gpu;
TEST(math_function, gemm_notrans_cublas_fp16) {
using namespace paddle::framework;
using namespace paddle::platform;
Tensor input1;
Tensor input2;
Tensor input3;
Tensor input1_gpu;
Tensor input2_gpu;
Tensor input3_gpu;
CPUPlace cpu_place;
CUDAPlace gpu_place(0);
CUDADeviceContext context(gpu_place);
int m = 2;
int n = 3;
int k = 3;
float16* input1_ptr = input1.mutable_data<float16>({2, 3}, cpu_place);
fill_fp16_data(input1_ptr, input1.numel(), {0, 1, 2, 3, 4, 5});
float16* input2_ptr = input2.mutable_data<float16>({3, 4}, cpu_place);
fill_fp16_data(input2_ptr, input2.numel(),
{0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11});
float16* input3_ptr = input3.mutable_data<float16>({2, 4}, cpu_place);
fill_fp16_data(input3_ptr, input3.numel(), {0, 1, 2, 3, 4, 5, 6, 7});
TensorCopy(input1, gpu_place, context, &input1_gpu);
TensorCopy(input2, gpu_place, context, &input2_gpu);
TensorCopy(input3, gpu_place, context, &input3_gpu);
float16* a = input1_gpu.data<float16>();
float16* b = input2_gpu.data<float16>();
float16* c = input3_gpu.mutable_data<float16>(gpu_place);
paddle::operators::math::gemm<paddle::platform::CUDADeviceContext, float16>(
context, false, false, m, n, k, float16(1), a, 3, b + 1, 4, float16(1),
c + 1, 4);
TensorCopy(input3_gpu, cpu_place, context, &input3);
// numpy code:
// a = np.arange(6).reshape(2, 3)
// b = np.arange(12).reshape(3, 4)[:, 1:]
// c = np.arange(8).reshape(2, 4)[:, 1:]
// out = np.arange(8).reshape(2, 4)
// out[:, 1:] = np.dot(a, b) + c
context.Wait();
EXPECT_EQ(static_cast<float>(input3_ptr[0]), 0);
EXPECT_EQ(static_cast<float>(input3_ptr[1]), 24);
EXPECT_EQ(static_cast<float>(input3_ptr[2]), 28);
EXPECT_EQ(static_cast<float>(input3_ptr[3]), 32);
EXPECT_EQ(static_cast<float>(input3_ptr[4]), 4);
EXPECT_EQ(static_cast<float>(input3_ptr[5]), 73);
EXPECT_EQ(static_cast<float>(input3_ptr[6]), 86);
EXPECT_EQ(static_cast<float>(input3_ptr[7]), 99);
}
TEST(math_function, gemm_trans_cublas_fp32) {
using namespace paddle::framework;
using namespace paddle::platform;
Tensor input1;
Tensor input2;
Tensor input3;
Tensor input1_gpu;
Tensor input2_gpu;
Tensor input3_gpu;
CPUPlace cpu_place;
CUDAPlace gpu_place(0);
CUDADeviceContext context(gpu_place);
int m = 2;
int n = 3;
int k = 3;
auto* cpu_place = new paddle::platform::CPUPlace();
float* input1_ptr = input1.mutable_data<float>({2, 3}, *cpu_place);
float* input1_ptr = input1.mutable_data<float>({2, 3}, cpu_place);
float arr1[6] = {0, 1, 2, 3, 4, 5};
memcpy(input1_ptr, arr1, 6 * sizeof(float));
float* input2_ptr = input2.mutable_data<float>({4, 3}, *cpu_place);
float* input2_ptr = input2.mutable_data<float>({4, 3}, cpu_place);
float arr2[12] = {0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11};
memcpy(input2_ptr, arr2, 12 * sizeof(float));
float* input3_ptr = input3.mutable_data<float>({2, 4}, *cpu_place);
float* input3_ptr = input3.mutable_data<float>({2, 4}, cpu_place);
float arr3[8] = {0, 1, 2, 3, 4, 5, 6, 7};
memcpy(input3_ptr, arr3, 8 * sizeof(float));
auto* gpu_place = new paddle::platform::CUDAPlace(0);
paddle::platform::CUDADeviceContext context(*gpu_place);
paddle::framework::TensorCopy(input1, *gpu_place, context, &input1_gpu);
paddle::framework::TensorCopy(input2, *gpu_place, context, &input2_gpu);
paddle::framework::TensorCopy(input3, *gpu_place, context, &input3_gpu);
TensorCopy(input1, gpu_place, context, &input1_gpu);
TensorCopy(input2, gpu_place, context, &input2_gpu);
TensorCopy(input3, gpu_place, context, &input3_gpu);
float* a = input1_gpu.data<float>();
float* b = input2_gpu.data<float>();
float* c = input3_gpu.mutable_data<float>(*gpu_place);
float* c = input3_gpu.mutable_data<float>(gpu_place);
paddle::operators::math::gemm<paddle::platform::CUDADeviceContext, float>(
context, false, true, m, n, k, 1, a, 3, b + 3, 3, 1, c + 1, 4);
paddle::framework::TensorCopy(input3_gpu, *cpu_place, context, &input3);
context.Wait();
TensorCopy(input3_gpu, cpu_place, context, &input3);
context.Wait();
EXPECT_EQ(input3_ptr[0], 0);
EXPECT_EQ(input3_ptr[1], 24);
EXPECT_EQ(input3_ptr[2], 28);
......@@ -188,27 +338,81 @@ TEST(math_function, gemm_trans_cublas) {
EXPECT_EQ(input3_ptr[5], 73);
EXPECT_EQ(input3_ptr[6], 86);
EXPECT_EQ(input3_ptr[7], 99);
delete gpu_place;
}
TEST(math_function, gemm_trans_cublas_fp16) {
using namespace paddle::framework;
using namespace paddle::platform;
Tensor input1;
Tensor input2;
Tensor input3;
Tensor input1_gpu;
Tensor input2_gpu;
Tensor input3_gpu;
CPUPlace cpu_place;
CUDAPlace gpu_place(0);
CUDADeviceContext context(gpu_place);
int m = 2;
int n = 3;
int k = 3;
float16* input1_ptr = input1.mutable_data<float16>({2, 3}, cpu_place);
fill_fp16_data(input1_ptr, input1.numel(), {0, 1, 2, 3, 4, 5});
float16* input2_ptr = input2.mutable_data<float16>({4, 3}, cpu_place);
fill_fp16_data(input2_ptr, input2.numel(),
{0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11});
float16* input3_ptr = input3.mutable_data<float16>({2, 4}, cpu_place);
fill_fp16_data(input3_ptr, input3.numel(), {0, 1, 2, 3, 4, 5, 6, 7});
TensorCopy(input1, gpu_place, context, &input1_gpu);
TensorCopy(input2, gpu_place, context, &input2_gpu);
TensorCopy(input3, gpu_place, context, &input3_gpu);
float16* a = input1_gpu.data<float16>();
float16* b = input2_gpu.data<float16>();
float16* c = input3_gpu.mutable_data<float16>(gpu_place);
paddle::operators::math::gemm<paddle::platform::CUDADeviceContext, float16>(
context, false, true, m, n, k, float16(1), a, 3, b + 3, 3, float16(1),
c + 1, 4);
TensorCopy(input3_gpu, cpu_place, context, &input3);
context.Wait();
EXPECT_EQ(static_cast<float>(input3_ptr[0]), 0);
EXPECT_EQ(static_cast<float>(input3_ptr[1]), 24);
EXPECT_EQ(static_cast<float>(input3_ptr[2]), 28);
EXPECT_EQ(static_cast<float>(input3_ptr[3]), 32);
EXPECT_EQ(static_cast<float>(input3_ptr[4]), 4);
EXPECT_EQ(static_cast<float>(input3_ptr[5]), 73);
EXPECT_EQ(static_cast<float>(input3_ptr[6]), 86);
EXPECT_EQ(static_cast<float>(input3_ptr[7]), 99);
}
template <typename T>
void GemvTest(int m, int n, bool trans) {
paddle::framework::Tensor mat_a;
paddle::framework::Tensor vec_b;
paddle::framework::Tensor vec_c;
auto* cpu_place = new paddle::platform::CPUPlace();
T* data_a = mat_a.mutable_data<T>({m, n}, *cpu_place);
T* data_b = vec_b.mutable_data<T>({trans ? m : n}, *cpu_place);
T* data_c = vec_c.mutable_data<T>({trans ? n : m}, *cpu_place);
auto* gpu_place = new paddle::platform::CUDAPlace(0);
paddle::framework::Tensor g_mat_a;
paddle::framework::Tensor g_vec_b;
paddle::framework::Tensor g_vec_c;
T* g_data_a = g_mat_a.mutable_data<T>(mat_a.dims(), *gpu_place);
T* g_data_b = g_vec_b.mutable_data<T>(vec_b.dims(), *gpu_place);
T* g_data_c = g_vec_c.mutable_data<T>(vec_c.dims(), *gpu_place);
using namespace paddle::framework;
using namespace paddle::platform;
Tensor mat_a;
Tensor vec_b;
Tensor vec_c;
CPUPlace cpu_place;
CUDAPlace gpu_place(0);
CUDADeviceContext context(gpu_place);
T* data_a = mat_a.mutable_data<T>({m, n}, cpu_place);
T* data_b = vec_b.mutable_data<T>({trans ? m : n}, cpu_place);
T* data_c = vec_c.mutable_data<T>({trans ? n : m}, cpu_place);
Tensor g_mat_a;
Tensor g_vec_b;
Tensor g_vec_c;
T* g_data_a = g_mat_a.mutable_data<T>(mat_a.dims(), gpu_place);
T* g_data_b = g_vec_b.mutable_data<T>(vec_b.dims(), gpu_place);
T* g_data_c = g_vec_c.mutable_data<T>(vec_c.dims(), gpu_place);
for (int i = 0; i < mat_a.numel(); ++i) {
data_a[i] = static_cast<T>(i);
......@@ -217,16 +421,14 @@ void GemvTest(int m, int n, bool trans) {
data_b[i] = static_cast<T>(i);
}
paddle::platform::CUDADeviceContext context(*gpu_place);
paddle::framework::TensorCopy(mat_a, *gpu_place, context, &g_mat_a);
paddle::framework::TensorCopy(vec_b, *gpu_place, context, &g_vec_b);
TensorCopy(mat_a, gpu_place, context, &g_mat_a);
TensorCopy(vec_b, gpu_place, context, &g_vec_b);
paddle::operators::math::gemv<paddle::platform::CUDADeviceContext, T>(
paddle::operators::math::gemv<CUDADeviceContext, T>(
context, trans, static_cast<int>(m), static_cast<int>(n), 1., g_data_a,
g_data_b, 0., g_data_c);
paddle::framework::TensorCopy(g_vec_c, paddle::platform::CPUPlace(), context,
&vec_c);
TensorCopy(g_vec_c, cpu_place, context, &vec_c);
if (!trans) {
for (int i = 0; i < m; ++i) {
......
......@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/math/sequence2batch.h"
#include "paddle/fluid/operators/math/math_function.h"
namespace paddle {
namespace operators {
......
......@@ -324,7 +324,7 @@ class MultiClassNMSOpMaker : public framework::OpProtoAndCheckerMaker {
" Please note, M is equal to the 1st dimension of BBoxes. ");
AddAttr<int>(
"background_label",
"(int64_t, defalut: 0) "
"(int, defalut: 0) "
"The index of background label, the background label will be ignored. "
"If set to -1, then all categories will be considered.")
.SetDefault(0);
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/pool_op.h"
#include "paddle/fluid/platform/mkldnn_helper.h"
namespace paddle {
namespace operators {
template <typename T>
class PoolMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
public:
void Compute(const paddle::framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
"It must use CPUPlace.");
auto& dev_ctx =
ctx.template device_context<platform::MKLDNNDeviceContext>();
const auto& mkldnn_engine = dev_ctx.GetEngine();
const Tensor* input = ctx.Input<Tensor>("X");
Tensor* output = ctx.Output<Tensor>("Out");
// Get an unique name from "argument" name of "Out" variable
// This name will be used as key when saving info into device context
const std::string key = ctx.op().Output("Out");
const std::string key_pool_pd = key + "@pool_pd";
const std::string key_pool_workspace_memory =
key + "@pool_workspace_memory";
std::string pooling_type = ctx.Attr<std::string>("pooling_type");
std::vector<int> ksize = ctx.Attr<std::vector<int>>("ksize");
std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
if (ctx.Attr<bool>("global_pooling")) {
for (size_t i = 0; i < ksize.size(); ++i) {
paddings[i] = 0;
ksize[i] = static_cast<int>(input->dims()[i + 2]);
}
}
// Only 2D pooling is supported now
PADDLE_ENFORCE(ksize.size() == 2, "ksize must be 2D, i.e. 2D pooling");
PADDLE_ENFORCE(pooling_type == "max" || pooling_type == "avg",
"pooling_type must be 'max' or 'avg'");
PADDLE_ENFORCE(input->dims().size() == 4,
"Input dim must be with 4, i.e. NCHW");
const T* input_data = input->data<T>();
T* output_data = output->mutable_data<T>(ctx.GetPlace());
std::vector<int> src_tz = paddle::framework::vectorize2int(input->dims());
std::vector<int> dst_tz = paddle::framework::vectorize2int(output->dims());
// TODO(pzelazko-intel): support more formats
auto src_md = platform::MKLDNNMemDesc(src_tz, mkldnn::memory::f32,
mkldnn::memory::format::nchw);
auto dst_md = platform::MKLDNNMemDesc(dst_tz, mkldnn::memory::f32,
mkldnn::memory::format::nchw);
std::shared_ptr<mkldnn::pooling_forward::primitive_desc> pool_pd =
CreatePrimitiveDesc(src_md, dst_md, strides, paddings, ksize,
pooling_type, mkldnn_engine);
// save pool_pd into global device context to be referred in backward path
dev_ctx.SetBlob(key_pool_pd, pool_pd);
std::shared_ptr<mkldnn::memory> workspace_memory =
CreateWorkspaceMemory(pool_pd, pooling_type, mkldnn_engine);
// save pool_workspace_memory to be referred in backward path
dev_ctx.SetBlob(key_pool_workspace_memory, workspace_memory);
auto src_memory =
mkldnn::memory({src_md, mkldnn_engine}, (void*)input_data);
auto dst_memory =
mkldnn::memory({dst_md, mkldnn_engine}, (void*)output_data);
auto pool_prim = mkldnn::pooling_forward(*pool_pd, src_memory, dst_memory,
*workspace_memory);
// push primitive to stream and wait until it's executed
std::vector<mkldnn::primitive> pipeline{pool_prim};
mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
}
private:
std::unique_ptr<mkldnn::pooling_forward::primitive_desc> CreatePrimitiveDesc(
const mkldnn::memory::desc& src, const mkldnn::memory::desc& dst,
const std::vector<int>& stride, const std::vector<int>& padding,
const std::vector<int>& kernel, const std::string& pooling_type,
const mkldnn::engine& engine) const {
auto pool_desc = mkldnn::pooling_forward::desc(
mkldnn::prop_kind::forward,
pooling_type == "max" ? mkldnn::algorithm::pooling_max
: mkldnn::algorithm::pooling_avg,
src, dst, stride, kernel, padding, padding, mkldnn::padding_kind::zero);
auto p_pool_pd =
new mkldnn::pooling_forward::primitive_desc(pool_desc, engine);
return std::unique_ptr<mkldnn::pooling_forward::primitive_desc>(p_pool_pd);
}
std::unique_ptr<mkldnn::memory> CreateWorkspaceMemory(
std::shared_ptr<mkldnn::pooling_forward::primitive_desc> pool_pd,
const std::string& pooling_type, const mkldnn::engine& engine) const {
mkldnn::memory::primitive_desc workspace_md =
pooling_type == "max"
? pool_pd->workspace_primitive_desc()
: mkldnn::memory::primitive_desc(
{{}, mkldnn::memory::f32, mkldnn::memory::format::nchw},
engine);
auto p_workspace_memory = new mkldnn::memory(workspace_md);
return std::unique_ptr<mkldnn::memory>(p_workspace_memory);
}
};
template <typename T>
class PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
public:
void Compute(const paddle::framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
"It must use CPUPlace.");
const Tensor* in_x = ctx.Input<Tensor>("X");
const Tensor* out_grad = ctx.Input<Tensor>(framework::GradVarName("Out"));
Tensor* in_x_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
// Get an unique name from "argument" name of "Out" variable
// This name will be used as key when referring info from device context
const std::string key = ctx.op().Input("Out");
const std::string key_pool_pd = key + "@pool_pd";
const std::string key_pool_workspace_memory =
key + "@pool_workspace_memory";
std::string pooling_type = ctx.Attr<std::string>("pooling_type");
std::vector<int> ksize = ctx.Attr<std::vector<int>>("ksize");
std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
if (ctx.Attr<bool>("global_pooling")) {
for (size_t i = 0; i < ksize.size(); ++i) {
paddings[i] = 0;
ksize[i] = static_cast<int>(in_x->dims()[i + 2]);
}
}
auto& dev_ctx =
ctx.template device_context<platform::MKLDNNDeviceContext>();
const mkldnn::engine& mkldnn_engine = dev_ctx.GetEngine();
const T* out_grad_data = out_grad->data<T>();
T* in_x_grad_data = in_x_grad->mutable_data<T>(ctx.GetPlace());
std::vector<int> diff_src_tz =
paddle::framework::vectorize2int(in_x_grad->dims());
std::vector<int> diff_dst_tz =
paddle::framework::vectorize2int(out_grad->dims());
auto diff_src_md = platform::MKLDNNMemDesc(diff_src_tz, mkldnn::memory::f32,
mkldnn::memory::format::nchw);
auto diff_dst_md = platform::MKLDNNMemDesc(diff_dst_tz, mkldnn::memory::f32,
mkldnn::memory::format::nchw);
// Retrieve pool_pd/pool_workspace_memory from device context
auto pool_pd =
std::static_pointer_cast<mkldnn::pooling_forward::primitive_desc>(
dev_ctx.GetBlob(key_pool_pd));
PADDLE_ENFORCE(pool_pd != nullptr,
"Fail to find pool_pd in device context");
auto workspace_memory = std::static_pointer_cast<mkldnn::memory>(
dev_ctx.GetBlob(key_pool_workspace_memory));
PADDLE_ENFORCE(workspace_memory != nullptr,
"Fail to find workspace_memory in device context");
auto pool_bwd_desc = mkldnn::pooling_backward::desc(
pooling_type == "max" ? mkldnn::algorithm::pooling_max
: mkldnn::algorithm::pooling_avg,
diff_src_md, diff_dst_md, strides, ksize, paddings, paddings,
mkldnn::padding_kind::zero);
auto pool_bwd_pd = mkldnn::pooling_backward::primitive_desc(
pool_bwd_desc, mkldnn_engine, *pool_pd);
auto diff_src_memory =
mkldnn::memory({diff_src_md, mkldnn_engine}, (void*)in_x_grad_data);
auto diff_dst_memory =
mkldnn::memory({diff_dst_md, mkldnn_engine}, (void*)out_grad_data);
auto bwd_prim = mkldnn::pooling_backward(
pool_bwd_pd, diff_dst_memory, *workspace_memory, diff_src_memory);
// push primitive to stream and wait until it's executed
std::vector<mkldnn::primitive> pipeline{bwd_prim};
mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
} // Compute()
};
} // namespace operators
} // namespace paddle
REGISTER_OP_KERNEL(pool2d, MKLDNN, ::paddle::platform::CPUPlace,
paddle::operators::PoolMKLDNNOpKernel<float>);
REGISTER_OP_KERNEL(pool2d_grad, MKLDNN, ::paddle::platform::CPUPlace,
paddle::operators::PoolMKLDNNGradOpKernel<float>);
......@@ -13,12 +13,25 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/pool_op.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/cudnn_helper.h"
#endif
#ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h"
#endif
namespace paddle {
namespace operators {
int PoolOutputSize(int input_size, int filter_size, int padding, int stride) {
int output_size = (input_size - filter_size + 2 * padding) / stride + 1;
int PoolOutputSize(int input_size, int filter_size, int padding, int stride,
bool ceil_mode) {
int output_size;
if (!ceil_mode) {
output_size = (input_size - filter_size + 2 * padding) / stride + 1;
} else {
output_size =
(input_size - filter_size + 2 * padding + stride - 1) / stride + 1;
}
PADDLE_ENFORCE(output_size > 0,
"Due to the settings of padding(%d), filter_size(%d) and "
"stride(%d), the output size is less than 0, please check "
......@@ -38,6 +51,7 @@ void PoolOp::InferShape(framework::InferShapeContext *ctx) const {
std::vector<int> ksize = ctx->Attrs().Get<std::vector<int>>("ksize");
std::vector<int> strides = ctx->Attrs().Get<std::vector<int>>("strides");
std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
bool ceil_mode = ctx->Attrs().Get<bool>("ceil_mode");
PADDLE_ENFORCE(in_x_dims.size() == 4 || in_x_dims.size() == 5,
"Pooling intput should be 4-D or 5-D tensor.");
......@@ -59,8 +73,8 @@ void PoolOp::InferShape(framework::InferShapeContext *ctx) const {
std::vector<int64_t> output_shape({in_x_dims[0], in_x_dims[1]});
for (size_t i = 0; i < ksize.size(); ++i) {
output_shape.push_back(
PoolOutputSize(in_x_dims[i + 2], ksize[i], paddings[i], strides[i]));
output_shape.push_back(PoolOutputSize(in_x_dims[i + 2], ksize[i],
paddings[i], strides[i], ceil_mode));
}
ctx->SetOutputDim("Out", framework::make_ddim(output_shape));
ctx->ShareLoD("X", "Out");
......@@ -68,20 +82,18 @@ void PoolOp::InferShape(framework::InferShapeContext *ctx) const {
framework::OpKernelType PoolOp::GetExpectedKernelType(
const framework::ExecutionContext &ctx) const {
bool use_cudnn = ctx.Attr<bool>("use_cudnn");
use_cudnn &= platform::is_gpu_place(ctx.GetPlace());
framework::LibraryType library_{framework::LibraryType::kPlain};
#ifdef PADDLE_WITH_CUDA
if (platform::is_gpu_place(ctx.GetPlace())) {
auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
use_cudnn &= dev_ctx.cudnn_handle() != nullptr;
if (platform::CanCUDNNBeUsed(ctx)) {
library_ = framework::LibraryType::kCUDNN;
}
#endif
framework::LibraryType library_;
if (use_cudnn) {
library_ = framework::LibraryType::kCUDNN;
} else {
library_ = framework::LibraryType::kPlain;
#ifdef PADDLE_WITH_MKLDNN
if (library_ == framework::LibraryType::kPlain &&
platform::CanMKLDNNBeUsed(ctx)) {
library_ = framework::LibraryType::kMKLDNN;
}
#endif
std::string data_format = ctx.Attr<std::string>("data_format");
framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
......@@ -99,20 +111,18 @@ void PoolOpGrad::InferShape(framework::InferShapeContext *ctx) const {
framework::OpKernelType PoolOpGrad::GetExpectedKernelType(
const framework::ExecutionContext &ctx) const {
bool use_cudnn = ctx.Attr<bool>("use_cudnn");
use_cudnn &= platform::is_gpu_place(ctx.GetPlace());
framework::LibraryType library_{framework::LibraryType::kPlain};
#ifdef PADDLE_WITH_CUDA
if (platform::is_gpu_place(ctx.GetPlace())) {
auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
use_cudnn &= dev_ctx.cudnn_handle() != nullptr;
if (platform::CanCUDNNBeUsed(ctx)) {
library_ = framework::LibraryType::kCUDNN;
}
#endif
framework::LibraryType library_;
if (use_cudnn) {
library_ = framework::LibraryType::kCUDNN;
} else {
library_ = framework::LibraryType::kPlain;
#ifdef PADDLE_WITH_MKLDNN
if (library_ == framework::LibraryType::kPlain &&
platform::CanMKLDNNBeUsed(ctx)) {
library_ = framework::LibraryType::kMKLDNN;
}
#endif
std::string data_format = ctx.Attr<std::string>("data_format");
framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
......@@ -167,6 +177,15 @@ Pool2dOpMaker::Pool2dOpMaker(OpProto *proto, OpAttrChecker *op_checker)
"use_cudnn",
"(bool, default false) Only used in cudnn kernel, need install cudnn")
.SetDefault(false);
AddAttr<bool>(
"ceil_mode",
"(bool, default false) Wether to use the ceil function to calculate "
"output height and width. False is the default. If it is set to False, "
"the floor function will be used.")
.SetDefault(false);
AddAttr<bool>("use_mkldnn",
"(bool, default false) Only used in mkldnn kernel")
.SetDefault(false);
AddAttr<std::string>(
"data_format",
"(string, default NCHW) Only used in "
......@@ -192,11 +211,16 @@ Example:
X shape: $(N, C, H_{in}, W_{in})$
Output:
Out shape: $(N, C, H_{out}, W_{out})$
Where
For ceil_mode = false:
$$
H_{out} = \frac{(H_{in} - ksize[0] + 2 * paddings[0])}{strides[0]} + 1 \\
W_{out} = \frac{(W_{in} - ksize[1] + 2 * paddings[1])}{strides[1]} + 1
$$
For ceil_mode = true:
$$
H_{out} = \frac{(H_{in} - ksize[0] + 2 * paddings[0] + strides[0] - 1)}{strides[0]} + 1 \\
W_{out} = \frac{(W_{in} - ksize[1] + 2 * paddings[1] + strides[1] - 1)}{strides[1]} + 1
$$
)DOC");
}
......@@ -251,6 +275,15 @@ Pool3dOpMaker::Pool3dOpMaker(OpProto *proto, OpAttrChecker *op_checker)
"use_cudnn",
"(bool, default false) Only used in cudnn kernel, need install cudnn")
.SetDefault(false);
AddAttr<bool>(
"ceil_mode",
"(bool, default false) Wether to use the ceil function to calculate "
"output height and width. False is the default. If it is set to False, "
"the floor function will be used.")
.SetDefault(false);
AddAttr<bool>("use_mkldnn",
"(bool, default false) Only used in mkldnn kernel")
.SetDefault(false);
AddAttr<std::string>(
"data_format",
"(string, default NCHW) Only used in "
......@@ -276,12 +309,18 @@ Example:
X shape: $(N, C, D_{in}, H_{in}, W_{in})$
Output:
Out shape: $(N, C, D_{out}, H_{out}, W_{out})$
Where
For ceil_mode = false:
$$
D_{out} = \frac{(D_{in} - ksize[0] + 2 * paddings[0])}{strides[0]} + 1 \\
H_{out} = \frac{(H_{in} - ksize[1] + 2 * paddings[1])}{strides[1]} + 1 \\
W_{out} = \frac{(W_{in} - ksize[2] + 2 * paddings[2])}{strides[2]} + 1
$$
For ceil_mode = true:
$$
D_{out} = \frac{(D_{in} - ksize[0] + 2 * paddings[0] + strides[0] -1)}{strides[0]} + 1 \\
H_{out} = \frac{(H_{in} - ksize[1] + 2 * paddings[1] + strides[1] -1)}{strides[1]} + 1 \\
W_{out} = \frac{(W_{in} - ksize[2] + 2 * paddings[2] + strides[2] -1)}{strides[2]} + 1
$$
)DOC");
}
......
......@@ -60,15 +60,16 @@ class ReadOp : public framework::OperatorBase {
const platform::Place& dev_place) const override {
framework::ReaderHolder* reader =
scope.FindVar(Input("Reader"))->GetMutable<framework::ReaderHolder>();
if (!reader->HasNext()) {
std::vector<std::string> out_arg_names = Outputs("Out");
std::vector<framework::LoDTensor> ins;
reader->ReadNext(&ins);
if (ins.empty()) {
reader->ReInit();
reader->ReadNext(&ins);
PADDLE_ENFORCE(
reader->HasNext(),
!ins.empty(),
"Reader can not read the next data even it has been re-initialized.");
}
std::vector<std::string> out_arg_names = Outputs("Out");
std::vector<framework::LoDTensor> ins;
reader->ReadNext(&ins);
PADDLE_ENFORCE_EQ(ins.size(), out_arg_names.size());
for (size_t i = 0; i < ins.size(); ++i) {
auto* out =
......
cc_library(reader_op_registry SRCS reader_op_registry.cc DEPS operator op_registry reader)
op_library(create_random_data_generator_op SRCS create_random_data_generator_op.cc DEPS reader_op_registry)
op_library(create_shuffle_reader_op SRCS create_shuffle_reader_op.cc DEPS reader_op_registry)
op_library(create_batch_reader_op SRCS create_batch_reader_op.cc DEPS reader_op_registry)
op_library(create_double_buffer_reader_op SRCS create_double_buffer_reader_op.cc DEPS reader_op_registry)
set(READER_LIBRARY create_random_data_generator_op create_shuffle_reader_op create_batch_reader_op create_double_buffer_reader_op PARENT_SCOPE)
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/reader/reader_op_registry.h"
namespace paddle {
namespace operators {
namespace reader {
class BatchReader : public framework::DecoratedReader {
public:
BatchReader(ReaderBase* reader, int batch_size)
: DecoratedReader(reader), batch_size_(batch_size) {
buffer_.reserve(batch_size_);
}
void ReadNext(std::vector<framework::LoDTensor>* out) override;
private:
int batch_size_;
std::vector<std::vector<framework::LoDTensor>> buffer_;
};
class CreateBatchReaderOp : public framework::OperatorBase {
public:
using framework::OperatorBase::OperatorBase;
private:
void RunImpl(const framework::Scope& scope,
const platform::Place& dev_place) const override {
const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader"))
->Get<framework::ReaderHolder>();
auto* out = scope.FindVar(Output("Out"))
->template GetMutable<framework::ReaderHolder>();
out->Reset(
new BatchReader(underlying_reader.Get(), Attr<int>("batch_size")));
}
};
class CreateBatchReaderOpMaker : public DecoratedReaderMakerBase {
public:
CreateBatchReaderOpMaker(OpProto* op_proto, OpAttrChecker* op_checker)
: DecoratedReaderMakerBase(op_proto, op_checker) {
AddAttr<int>("batch_size",
"How many instances the batch reader yields each time.")
.GreaterThan(0);
AddComment(R"DOC(
CreateBatchReader Operator
A batch reader takes another reader as its 'underlying reader',
gathers the underlying reader's outputs and then yields them in batches.
)DOC");
}
};
void BatchReader::ReadNext(std::vector<framework::LoDTensor>* out) {
buffer_.clear();
buffer_.reserve(batch_size_);
for (int i = 0; i < batch_size_; ++i) {
buffer_.push_back(std::vector<framework::LoDTensor>());
reader_->ReadNext(&buffer_.back());
if (buffer_.back().empty()) {
buffer_.pop_back();
break;
}
}
// Concat instances
out->clear();
if (buffer_.empty()) {
// if buffer_ is empty, the 'out' will return as an empty vector.
return;
}
int out_num = buffer_[0].size();
out->reserve(out_num);
for (int j = 0; j < out_num; ++j) {
// Merge shape and check date type
std::type_index batch_type = buffer_[0][j].type();
framework::DDim batch_shape = buffer_[0][j].dims();
for (size_t i = 1; i < buffer_.size(); ++i) {
std::type_index ins_type = buffer_[i][j].type();
framework::DDim ins_shape = buffer_[i][j].dims();
PADDLE_ENFORCE_EQ(batch_type, ins_type);
PADDLE_ENFORCE_EQ(slice_ddim(batch_shape, 1, batch_shape.size()),
slice_ddim(ins_shape, 1, ins_shape.size()));
PADDLE_ENFORCE_GT(ins_shape[0], 0);
batch_shape[0] += ins_shape[0];
}
framework::LoDTensor out_tensor;
out_tensor.Resize(batch_shape);
out_tensor.mutable_data(platform::CPUPlace(), batch_type);
int64_t dst_offset = 0;
// Merge lod and data
framework::LoD batch_lod;
for (size_t i = 0; i < buffer_.size(); ++i) {
framework::DDim ins_shape = buffer_[i][j].dims();
framework::LoD ins_lod = buffer_[i][j].lod();
if (i == 0) {
batch_lod = ins_lod;
} else {
PADDLE_ENFORCE_EQ(batch_lod.size(), ins_lod.size());
for (size_t level_idx = 0; level_idx < batch_lod.size(); ++level_idx) {
auto& lod_level = batch_lod[level_idx];
for (size_t k = 1; k < ins_lod[level_idx].size(); ++k) {
lod_level.push_back(ins_lod[level_idx][k] + lod_level.back());
}
}
}
auto dst = out_tensor.Slice(dst_offset, dst_offset + ins_shape[0]);
TensorCopy(buffer_[i][j], platform::CPUPlace(), &dst);
dst_offset += ins_shape[0];
}
out_tensor.set_lod(batch_lod);
out->push_back(out_tensor);
}
}
} // namespace reader
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators::reader;
REGISTER_DECORATED_READER_OPERATOR(create_batch_reader,
ops::CreateBatchReaderOp,
ops::CreateBatchReaderOpMaker);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <thread>
#include "paddle/fluid/framework/channel.h"
#include "paddle/fluid/operators/reader/reader_op_registry.h"
namespace paddle {
namespace operators {
namespace reader {
static constexpr size_t kDoubleBufferSize = 2;
class DoubleBufferReader : public framework::DecoratedReader {
public:
explicit DoubleBufferReader(ReaderBase* reader)
: DecoratedReader(reader),
buffer_(framework::MakeChannel<std::vector<framework::LoDTensor>>(
kDoubleBufferSize)) {
std::thread prefetch(&DoubleBufferReader::PrefetchThreadFunc, this);
prefetch.detach();
}
void ReadNext(std::vector<framework::LoDTensor>* out) override;
void ReInit() override;
~DoubleBufferReader() { buffer_->Close(); }
private:
void PrefetchThreadFunc();
framework::Channel<std::vector<framework::LoDTensor>>* buffer_;
};
class CreateDoubleBufferReaderOp : public framework::OperatorBase {
public:
using framework::OperatorBase::OperatorBase;
private:
void RunImpl(const framework::Scope& scope,
const platform::Place& dev_place) const override {
const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader"))
->Get<framework::ReaderHolder>();
auto* out = scope.FindVar(Output("Out"))
->template GetMutable<framework::ReaderHolder>();
out->Reset(new DoubleBufferReader(underlying_reader.Get()));
}
};
class CreateDoubleBufferReaderOpMaker : public DecoratedReaderMakerBase {
public:
CreateDoubleBufferReaderOpMaker(OpProto* op_proto, OpAttrChecker* op_checker)
: DecoratedReaderMakerBase(op_proto, op_checker) {
AddComment(R"DOC(
CreateDoubleBufferReader Operator
A double buffer reader takes another reader as its 'underlying reader'.
It launches another thread to execute the 'underlying reader' asynchronously,
which prevents reading process from blocking subsequent training.
)DOC");
}
};
void DoubleBufferReader::ReadNext(std::vector<framework::LoDTensor>* out) {
out->clear();
buffer_->Receive(out);
}
void DoubleBufferReader::ReInit() {
reader_->ReInit();
buffer_->Close();
// The existing prefetch thread will terminate for the buffer_ is closed.
buffer_ = framework::MakeChannel<std::vector<framework::LoDTensor>>(
kDoubleBufferSize);
std::thread prefetch(&DoubleBufferReader::PrefetchThreadFunc, this);
prefetch.detach();
}
void DoubleBufferReader::PrefetchThreadFunc() {
VLOG(5) << "A new prefetch thread starts.";
while (true) {
std::vector<framework::LoDTensor> batch;
reader_->ReadNext(&batch);
if (batch.empty()) {
// EOF
buffer_->Close();
VLOG(5) << "Reached the end of the file. The prefetch thread terminates.";
break;
}
if (!buffer_->Send(&batch)) {
VLOG(5) << "WARNING: The double buffer channel has been closed. The "
"prefetch thread terminates.";
break;
}
}
}
} // namespace reader
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators::reader;
REGISTER_DECORATED_READER_OPERATOR(create_double_buffer_reader,
ops::CreateDoubleBufferReaderOp,
ops::CreateDoubleBufferReaderOpMaker);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/reader/reader_op_registry.h"
namespace paddle {
namespace operators {
namespace reader {
template <typename T>
class RandomDataGenerator : public framework::FileReader {
public:
RandomDataGenerator(const std::vector<framework::DDim>& shapes, float min,
float max)
: FileReader(shapes), min_(min), max_(max) {
PADDLE_ENFORCE_LE(
min, max, "'min' shouldn't be greater than 'max'.(%f vs %f)", min, max);
unsigned int seed = std::random_device()();
engine_.seed(seed);
dist_ = std::uniform_real_distribution<float>(min_, max_);
}
void ReadNext(std::vector<framework::LoDTensor>* out) override {
out->clear();
out->reserve(shapes_.size());
for (const framework::DDim& shape : shapes_) {
PADDLE_ENFORCE_GE(
shape.size(), 2,
"The rank of reader's output data should be 2 at least.(Now it's %d)",
shape.size());
framework::LoDTensor out_tensor;
out_tensor.Resize(shape);
T* data = out_tensor.mutable_data<T>(platform::CPUPlace());
int64_t numel = framework::product(shape);
for (int64_t i = 0; i < numel; ++i) {
data[i] = dist_(engine_);
}
out->push_back(out_tensor);
}
}
void ReInit() override { return; }
private:
float min_;
float max_;
std::minstd_rand engine_;
std::uniform_real_distribution<float> dist_;
};
template <typename T>
class CreateRandomDataGeneratorOp : public framework::OperatorBase {
public:
using framework::OperatorBase::OperatorBase;
private:
void RunImpl(const framework::Scope& scope,
const platform::Place& dev_place) const override {
const auto& shape_concat = Attr<std::vector<int>>("shape_concat");
const auto& ranks = Attr<std::vector<int>>("ranks");
PADDLE_ENFORCE(!shape_concat.empty() && !ranks.empty());
PADDLE_ENFORCE_EQ(std::accumulate(ranks.begin(), ranks.end(), 0),
int(shape_concat.size()),
"The accumulate of all ranks should be equal to the "
"shape concat's length.");
std::vector<framework::DDim> shapes = RestoreShapes(shape_concat, ranks);
auto* out = scope.FindVar(Output("Out"))
->template GetMutable<framework::ReaderHolder>();
out->Reset(new RandomDataGenerator<T>(shapes, Attr<float>("min"),
Attr<float>("max")));
}
};
class CreateRandomDataGeneratorOpMaker : public FileReaderMakerBase {
public:
CreateRandomDataGeneratorOpMaker(OpProto* op_proto, OpAttrChecker* op_checker)
: FileReaderMakerBase(op_proto, op_checker) {
AddAttr<float>("min", "The lower bound of reader's uniform distribution.");
AddAttr<float>("max", "The upper bound of reader's uniform distribution.");
AddComment(R"DOC(
CreateRandomDataGenerator Operator
This Op creates a random reader.
The reader generates random data instead of really reading from files.
Generated data follow an uniform distribution between 'min' and 'max'.
)DOC");
}
};
} // namespace reader
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators::reader;
REGISTER_FILE_READER_OPERATOR(create_random_data_generator,
ops::CreateRandomDataGeneratorOp<float>,
ops::CreateRandomDataGeneratorOpMaker);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/reader/reader_op_registry.h"
namespace paddle {
namespace operators {
namespace reader {
class ShuffleReader : public framework::DecoratedReader {
public:
ShuffleReader(ReaderBase* reader, int buffer_size)
: DecoratedReader(reader), buffer_size_(buffer_size), iteration_pos_(0) {
buffer_.reserve(buffer_size);
}
void ReadNext(std::vector<framework::LoDTensor>* out) override;
private:
int buffer_size_;
std::vector<std::vector<framework::LoDTensor>> buffer_;
size_t iteration_pos_;
};
void ShuffleReader::ReadNext(std::vector<framework::LoDTensor>* out) {
if (iteration_pos_ >= buffer_.size()) {
// Reload buffer with new data
buffer_.clear();
buffer_.reserve(buffer_size_);
for (int i = 0; i < buffer_size_; ++i) {
buffer_.push_back(std::vector<framework::LoDTensor>());
reader_->ReadNext(&buffer_.back());
if (buffer_.back().empty()) {
buffer_.pop_back();
break;
}
}
// TODO(fengjiayi): 'std::random_shuffle' can be very slow. It needs to be
// optimize.
std::random_shuffle(buffer_.begin(), buffer_.end());
iteration_pos_ = 0;
}
out->clear();
if (!buffer_.empty()) {
std::swap(*out, buffer_[iteration_pos_++]);
}
// if buffer_ is empty, the 'out' will return as an empty vector.
}
class CreateShuffleReaderOp : public framework::OperatorBase {
public:
using framework::OperatorBase::OperatorBase;
private:
void RunImpl(const framework::Scope& scope,
const platform::Place& dev_place) const override {
const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader"))
->Get<framework::ReaderHolder>();
auto* out = scope.FindVar(Output("Out"))
->template GetMutable<framework::ReaderHolder>();
out->Reset(
new ShuffleReader(underlying_reader.Get(), Attr<int>("buffer_size")));
}
};
class CreateShuffleReaderOpMaker : public DecoratedReaderMakerBase {
public:
CreateShuffleReaderOpMaker(OpProto* op_proto, OpAttrChecker* op_checker)
: DecoratedReaderMakerBase(op_proto, op_checker) {
AddAttr<int>("buffer_size", "The shuffle buffer size.").GreaterThan(0);
AddComment(R"DOC(
CreateShuffleReader Operator
A shuffle reader takes another reader as its 'underlying reader'
and yields the underlying reader's outputs in a shuffled order.
)DOC");
}
};
} // namespace reader
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators::reader;
REGISTER_DECORATED_READER_OPERATOR(create_shuffle_reader,
ops::CreateShuffleReaderOp,
ops::CreateShuffleReaderOpMaker);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "reader_op_registry.h"
namespace paddle {
namespace operators {
namespace reader {
std::vector<framework::DDim> RestoreShapes(const std::vector<int>& shape_concat,
const std::vector<int>& ranks) {
std::vector<framework::DDim> res;
int offset = 0;
for (int len : ranks) {
auto start_it = shape_concat.begin() + offset;
auto end_it = start_it + len;
res.push_back(framework::make_ddim(std::vector<int>(start_it, end_it)));
offset += len;
}
return res;
}
FileReaderMakerBase::FileReaderMakerBase(
framework::OpProtoAndCheckerMaker::OpProto* op_proto,
framework::OpAttrChecker* op_checker)
: OpProtoAndCheckerMaker(op_proto, op_checker) {
AddOutput("Out", "(ReaderHolder) The created random reader.");
AddAttr<std::vector<int>>("shape_concat", "The concat of all data's shapes.");
AddAttr<std::vector<int>>(
"ranks",
"The ranks of each data."
"e.g."
"shape_concat = [2,3,4,5,6]"
"ranks = [3,2]"
"It means the reader will generate two data each time,"
"whose shapes are [2,3,4] and [5,6] respectively.");
AddAttr<std::vector<int>>("lod_levels", "The LoD levels of each data.");
}
void FileReaderInferShape::operator()(framework::InferShapeContext* ctx) const {
PADDLE_ENFORCE(ctx->HasOutput("Out"),
"The output file reader should not be null.");
const auto shape_concat = ctx->Attrs().Get<std::vector<int>>("shape_concat");
const auto ranks = ctx->Attrs().Get<std::vector<int>>("ranks");
std::vector<framework::DDim> shapes = RestoreShapes(shape_concat, ranks);
ctx->SetReaderDims("Out", shapes);
if (ctx->IsRuntime()) {
const auto lod_levels = ctx->Attrs().Get<std::vector<int>>("lod_levels");
PADDLE_ENFORCE_EQ(lod_levels.size(), shapes.size(),
"The number of 'lod_levels'(%d) doesn't match the number "
"of 'shapes'(%d).",
lod_levels.size(), shapes.size());
framework::VarDesc* reader =
boost::get<framework::VarDesc*>(ctx->GetOutputVarPtrs("Out")[0]);
reader->SetLoDLevels(lod_levels);
}
}
void FileReaderInferVarType::operator()(const framework::OpDesc& op_desc,
framework::BlockDesc* block) const {
std::string reader_name = op_desc.Output("Out")[0];
framework::VarDesc* reader = block->FindVarRecursive(reader_name);
reader->SetType(framework::proto::VarType::READER);
}
void DecoratedReaderInferShape::operator()(
framework::InferShapeContext* ctx) const {
PADDLE_ENFORCE(ctx->HasInput("UnderlyingReader"),
"Input(UnderlyingReader) should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Out"),
"The output decorated reader should not be null.");
ctx->SetReaderDims("Out", ctx->GetReaderDims("UnderlyingReader"));
if (ctx->IsRuntime()) {
framework::VarDesc* in_reader = boost::get<framework::VarDesc*>(
ctx->GetInputVarPtrs("UnderlyingReader")[0]);
framework::VarDesc* out_reader =
boost::get<framework::VarDesc*>(ctx->GetOutputVarPtrs("Out")[0]);
out_reader->SetLoDLevels(in_reader->GetLoDLevels());
}
}
void DecoratedReaderInferVarType::operator()(
const framework::OpDesc& op_desc, framework::BlockDesc* block) const {
std::string in_reader_name = op_desc.Input("UnderlyingReader")[0];
framework::VarDesc* in_reader = block->FindVarRecursive(in_reader_name);
std::string out_reader_name = op_desc.Output("Out")[0];
framework::VarDesc* out_reader = block->FindVarRecursive(out_reader_name);
out_reader->SetType(framework::proto::VarType::READER);
out_reader->SetDataTypes(in_reader->GetDataTypes());
}
DecoratedReaderMakerBase::DecoratedReaderMakerBase(
framework::OpProtoAndCheckerMaker::OpProto* op_proto,
framework::OpAttrChecker* op_checker)
: OpProtoAndCheckerMaker(op_proto, op_checker) {
AddInput("UnderlyingReader",
"(ReaderHolder) The underlying reader for creating a batch reader.");
AddOutput("Out", "(ReaderHolder) The created batch reader.");
}
} // namespace reader
} // namespace operators
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/reader.h"
namespace paddle {
namespace operators {
namespace reader {
extern std::vector<framework::DDim> RestoreShapes(
const std::vector<int>& shape_concat, const std::vector<int>& ranks);
class FileReaderMakerBase : public framework::OpProtoAndCheckerMaker {
public:
FileReaderMakerBase(OpProto* op_proto, OpAttrChecker* op_checker);
};
class FileReaderInferShape : public framework::InferShapeBase {
public:
void operator()(framework::InferShapeContext* ctx) const override;
};
class FileReaderInferVarType : public framework::VarTypeInference {
public:
void operator()(const framework::OpDesc& op_desc,
framework::BlockDesc* block) const override;
};
// general infershape for decorated reader
class DecoratedReaderInferShape : public framework::InferShapeBase {
public:
void operator()(framework::InferShapeContext* ctx) const override;
};
// general var type inference for decorated reader
class DecoratedReaderInferVarType : public framework::VarTypeInference {
public:
void operator()(const framework::OpDesc& op_desc,
framework::BlockDesc* block) const override;
};
class DecoratedReaderMakerBase : public framework::OpProtoAndCheckerMaker {
public:
DecoratedReaderMakerBase(OpProto* op_proto, OpAttrChecker* op_checker);
};
} // namespace reader
} // namespace operators
} // namespace paddle
#define REGISTER_FILE_READER_OPERATOR(op_name, ...) \
REGISTER_OPERATOR(op_name, __VA_ARGS__, \
paddle::operators::reader::FileReaderInferShape, \
paddle::framework::EmptyGradOpMaker, \
paddle::operators::reader::FileReaderInferVarType)
#define REGISTER_DECORATED_READER_OPERATOR(op_name, ...) \
REGISTER_OPERATOR(op_name, __VA_ARGS__, \
paddle::operators::reader::DecoratedReaderInferShape, \
paddle::framework::EmptyGradOpMaker, \
paddle::operators::reader::DecoratedReaderInferVarType)
......@@ -118,6 +118,9 @@ class ReshapeOpMaker : public framework::OpProtoAndCheckerMaker {
AddAttr<std::vector<int>>(
"shape", "(std::vector<int>) Target shape of reshape operator.")
.SetDefault(std::vector<int>());
AddAttr<bool>("inplace",
"Change the source tensor's shape without copy memory.")
.SetDefault(true);
AddComment(R"DOC(
Reshape Operator.
......
......@@ -38,9 +38,15 @@ class ReshapeKernel : public framework::OpKernel<T> {
out_dims = out->dims();
}
bool inplace = ctx.Attr<bool>("inplace");
if (!inplace) {
out->mutable_data<T>(ctx.GetPlace());
framework::TensorCopy(*in, ctx.GetPlace(), ctx.device_context(), out);
out->Resize(out_dims);
} else {
out->ShareDataWith(*in);
out->Resize(out_dims);
}
}
private:
......
......@@ -24,7 +24,7 @@ limitations under the License. */
namespace paddle {
namespace operators {
static bool IsVariableInitialized(const framework::Scope& scope,
static bool NeedSend(const framework::Scope& scope,
const std::string& varname) {
auto* var = scope.FindVar(varname);
PADDLE_ENFORCE_NOT_NULL(var, "Can not find variable '%s' in the send side.",
......@@ -32,7 +32,7 @@ static bool IsVariableInitialized(const framework::Scope& scope,
if (var->IsType<framework::LoDTensor>()) {
return var->Get<framework::LoDTensor>().IsInitialized();
} else if (var->IsType<framework::SelectedRows>()) {
return var->Get<framework::SelectedRows>().value().IsInitialized();
return var->Get<framework::SelectedRows>().rows().size() > 0UL;
} else {
PADDLE_THROW(
"Variable type in send side should be in "
......@@ -67,7 +67,7 @@ class SendOp : public framework::OperatorBase {
detail::RPCClient* rpc_client = client_var->GetMutable<detail::RPCClient>();
for (size_t i = 0; i < ins.size(); i++) {
if (IsVariableInitialized(scope, ins[i])) {
if (NeedSend(scope, ins[i])) {
VLOG(3) << "sending " << ins[i] << " to " << epmap[i];
rpc_client->AsyncSendVariable(epmap[i], ctx, scope, ins[i]);
} else {
......
......@@ -39,6 +39,14 @@ class SGDOp : public framework::OperatorWithKernel {
// and run time.
ctx->SetOutputDim("ParamOut", param_dim);
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(
framework::ToDataType(ctx.Input<framework::LoDTensor>("Param")->type()),
ctx.GetPlace());
}
};
class SGDOpMaker : public framework::OpProtoAndCheckerMaker {
......
......@@ -47,6 +47,12 @@ class SGDOpKernel : public framework::OpKernel<T> {
PADDLE_ENFORCE_EQ(param, param_out);
auto* grad = ctx.Input<framework::SelectedRows>("Grad");
// for distributed training, a sparse var may be empty,
// just skip updating.
if (grad->rows().size() == 0) {
return;
}
auto in_height = grad->height();
auto out_dims = param_out->dims();
PADDLE_ENFORCE_EQ(in_height, out_dims[0]);
......@@ -60,13 +66,15 @@ class SGDOpKernel : public framework::OpKernel<T> {
auto* in_data = in_value.data<T>();
auto* out_data = param_out->data<T>();
auto* lr = learning_rate->data<T>();
for (size_t i = 0; i < in_rows.size(); i++) {
PADDLE_ENFORCE(in_rows[i] < in_height,
"Input rows index should less than height");
for (int64_t j = 0; j < in_row_numel; j++) {
out_data[in_rows[i] * in_row_numel + j] -=
lr[0] * in_data[i * in_row_numel + j];
}
}
} else {
PADDLE_THROW("Unsupported Variable Type of Grad");
}
......
......@@ -21,15 +21,24 @@ limitations under the License. */
namespace paddle {
namespace operators {
static int FindOutIdx(int row, const std::vector<int>& height_sections) {
int offset = 0;
for (size_t i = 0; i < height_sections.size(); ++i) {
if (row >= offset && row < (offset + height_sections[i])) {
return i;
static int FindOutIdx(int row, const std::vector<int>& abs_sections) {
for (size_t i = 1; i < abs_sections.size(); ++i) {
if (row < abs_sections[i]) {
return i - 1;
}
offset += height_sections[i];
}
return -1;
return abs_sections.size() - 1;
}
static std::vector<int> ToAbsoluteSection(
const std::vector<int>& height_sections) {
std::vector<int> abs_sections;
abs_sections.resize(height_sections.size());
abs_sections[0] = 0;
for (size_t i = 1; i < height_sections.size(); ++i) {
abs_sections[i] = height_sections[i - 1] + abs_sections[i - 1];
}
return abs_sections;
}
template <typename DeviceContext, typename T>
......@@ -40,16 +49,23 @@ class SplitSelectedRowsOpKernel : public framework::OpKernel<T> {
auto outs = ctx.MultiOutput<framework::SelectedRows>("Out");
auto height_sections = ctx.Attr<std::vector<int>>("height_sections");
auto abs_sections = ToAbsoluteSection(height_sections);
auto x_rows = x->rows();
std::vector<std::vector<int>> outs_rows_idx;
std::vector<std::vector<int>> outs_dense_idx;
outs_rows_idx.resize(outs.size());
outs_dense_idx.resize(outs.size());
auto row_numel = x->value().numel() / x->value().dims()[0];
auto src = x->value().data<T>();
// split rows index into output sparse vars
for (size_t i = 0; i < x_rows.size(); ++i) {
int out_idx = FindOutIdx(x_rows[i], height_sections);
outs_rows_idx[out_idx].push_back(i);
int out_idx = FindOutIdx(x_rows[i], abs_sections);
outs_rows_idx[out_idx].push_back(x_rows[i]);
outs_dense_idx[out_idx].push_back(i);
}
auto place = ctx.GetPlace();
......@@ -61,19 +77,20 @@ class SplitSelectedRowsOpKernel : public framework::OpKernel<T> {
dims[0] = rows_idx.size();
outs[i]->mutable_value()->mutable_data<T>(dims, x->place());
for (auto idx : rows_idx) {
outs[i]->mutable_rows()->push_back(x_rows[idx]);
outs[i]->mutable_rows()->push_back(idx - abs_sections[i]);
}
auto dst = outs[i]->mutable_value()->mutable_data<T>(ctx.GetPlace());
for (size_t j = 0; j < rows_idx.size(); j++) {
if (platform::is_cpu_place(place)) {
memory::Copy(platform::CPUPlace(), dst + j * row_numel,
platform::CPUPlace(), src + rows_idx[j] * row_numel,
sizeof(T) * row_numel);
memory::Copy(
platform::CPUPlace(), dst + j * row_numel, platform::CPUPlace(),
src + outs_dense_idx[i][j] * row_numel, sizeof(T) * row_numel);
} else {
#ifdef PADDLE_WITH_CUDA
auto stream = ctx.cuda_device_context().stream();
memory::Copy(platform::CUDAPlace(), dst + j * row_numel,
platform::CUDAPlace(), src + rows_idx[j] * row_numel,
platform::CUDAPlace(),
src + outs_dense_idx[i][j] * row_numel,
sizeof(T) * row_numel, stream);
#else
PADDLE_THROW("Paddle is not compiled with GPU");
......
......@@ -76,9 +76,15 @@ class SumOp : public framework::OperatorWithKernel {
static_cast<framework::proto::VarType::Type>(dtype),
ctx.device_context());
} else if (x_vars[0]->IsType<framework::SelectedRows>()) {
return framework::OpKernelType(
framework::ToDataType(
x_vars[0]->Get<framework::SelectedRows>().value().type()),
for (auto& var : x_vars) {
auto& value = var->Get<framework::SelectedRows>().value();
if (value.IsInitialized()) {
return framework::OpKernelType(framework::ToDataType(value.type()),
ctx.device_context());
}
}
// if input sparse vars are not initialized, use an default kernel type.
return framework::OpKernelType(framework::proto::VarType::FP32,
ctx.device_context());
} else if (x_vars[0]->IsType<framework::LoDTensorArray>()) {
for (auto& x_var : x_vars) {
......
......@@ -109,6 +109,12 @@ class SumKernel : public framework::OpKernel<T> {
in_dim[0] = static_cast<int64_t>(first_dim);
out_value->Resize(framework::make_ddim(in_dim));
// if all the input sparse vars are empty, no need to
// merge these vars.
if (first_dim == 0UL) {
return;
}
out_value->mutable_data<T>(context.GetPlace());
math::SelectedRowsAddTo<DeviceContext, T> functor;
......@@ -116,7 +122,7 @@ class SumKernel : public framework::OpKernel<T> {
int64_t offset = 0;
for (int i = 0; i < N; i++) {
auto &sel_row = get_selected_row(i);
if (!sel_row.value().IsInitialized() || sel_row.rows().size() == 0) {
if (sel_row.rows().size() == 0) {
continue;
}
PADDLE_ENFORCE_EQ(out->height(), sel_row.height());
......
......@@ -15,6 +15,8 @@ limitations under the License. */
#pragma once
#include <vector>
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/dynload/cudnn.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/macros.h"
......@@ -282,5 +284,17 @@ class ScopedPoolingDescriptor {
DISABLE_COPY_AND_ASSIGN(ScopedPoolingDescriptor);
};
inline bool CanCUDNNBeUsed(const framework::ExecutionContext& ctx) {
bool use_cudnn = ctx.Attr<bool>("use_cudnn");
use_cudnn &= paddle::platform::is_gpu_place(ctx.GetPlace());
#ifdef PADDLE_WITH_CUDA
if (use_cudnn) {
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
use_cudnn &= dev_ctx.cudnn_handle() != nullptr;
}
#endif
return use_cudnn;
}
} // namespace platform
} // namespace paddle
......@@ -33,9 +33,15 @@ DeviceContextPool::DeviceContextPool(
PADDLE_ENFORCE_GT(places.size(), 0);
for (size_t i = 0; i < places.size(); i++) {
if (platform::is_cpu_place(places[i])) {
#ifdef PADDLE_WITH_MKLDNN
device_contexts_.emplace(places[i],
new platform::MKLDNNDeviceContext(
boost::get<platform::CPUPlace>(places[i])));
#else
device_contexts_.emplace(places[i],
new platform::CPUDeviceContext(
boost::get<platform::CPUPlace>(places[i])));
#endif
} else if (platform::is_gpu_place(places[i])) {
#ifdef PADDLE_WITH_CUDA
device_contexts_.emplace(places[i],
......@@ -121,6 +127,8 @@ class EigenCudaStreamDevice : public Eigen::StreamInterface {
CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : place_(place) {
SetDeviceId(place_.device);
multi_process = GetCUDAMultiProcessors(place_.device);
max_threads_per_mp = GetCUDAMaxThreadsPerMultiProcessor(place_.device);
PADDLE_ENFORCE(cudaStreamCreate(&stream_));
eigen_stream_.reset(new EigenCudaStreamDevice());
eigen_stream_->Reinitialize(&stream_, place);
......@@ -154,6 +162,10 @@ void CUDADeviceContext::Wait() const {
PADDLE_ENFORCE(cudaGetLastError());
}
int CUDADeviceContext::GetMaxPhysicalThreadCount() const {
return multi_process * max_threads_per_mp;
}
Eigen::GpuDevice* CUDADeviceContext::eigen_device() const {
return eigen_device_.get();
}
......@@ -170,64 +182,38 @@ cudaStream_t CUDADeviceContext::stream() const { return stream_; }
#ifdef PADDLE_WITH_MKLDNN
MKLDNNDeviceContext::MKLDNNDeviceContext(CPUPlace place)
: CPUDeviceContext(place), ready_(false) {
stream_.reset(new mkldnn::stream(mkldnn::stream::kind::eager));
engine_.reset(new mkldnn::engine(mkldnn::engine::cpu, 0));
: CPUDeviceContext(place), engine_(mkldnn::engine::cpu, 0), p_blobs_() {
p_blobs_.reset(new std::unordered_map<std::string, std::shared_ptr<void>>());
}
template <typename T>
void MKLDNNDeviceContext::AddElement(const std::string& op_key,
const T& value) {
if (GetElement<T>(op_key)) {
return;
}
GetElementPool<T>().emplace(op_key, std::move(value));
}
void MKLDNNDeviceContext::SetBlob(const std::string& name,
std::shared_ptr<void> data) const {
std::unordered_map<std::string, std::shared_ptr<void>>* p;
p = p_blobs_.get();
template <typename T>
const T& MKLDNNDeviceContext::GetElement(const std::string& op_key) const {
auto it = GetElementPool<T>().find(op_key);
return it == GetElementPool<T>().end() ? nullptr : it->second;
}
auto it = p->find(name);
template <>
const std::unordered_map<const std::string, const MKLDNNMemoryPtr,
std::hash<std::string>>&
MKLDNNDeviceContext::GetElementPool<MKLDNNMemoryPtr>() const {
return memory_pool_;
}
if (it == p->end()) {
(*p)[name] = data; // create new blob
} else {
it->second = data; // set data to existing blob
}
template <>
const std::unordered_map<const std::string, const MKLDNNPrimitivePtr,
std::hash<std::string>>&
MKLDNNDeviceContext::GetElementPool<MKLDNNPrimitivePtr>() const {
return primitive_pool_;
return;
}
template <>
const std::unordered_map<const std::string, const MKLDNNPrimitiveDescPtr,
std::hash<std::string>>&
MKLDNNDeviceContext::GetElementPool<MKLDNNPrimitiveDescPtr>() const {
return primitive_desc_pool_;
}
std::shared_ptr<void> MKLDNNDeviceContext::GetBlob(
const std::string& name) const {
std::unordered_map<std::string, std::shared_ptr<void>>* p;
p = p_blobs_.get();
void MKLDNNDeviceContext::Execute(bool block) {
if (pipeline_.empty()) {
return;
}
ResetStream();
stream_->submit(pipeline_).wait(block);
ready_ = false;
pipeline_.clear();
}
auto it = p->find(name);
void MKLDNNDeviceContext::ResetStream() {
if (ready_) {
return;
if (it != p->end()) {
return it->second;
}
// TODO(TJ): change me when mkldnn have specific method to reset this state
stream_.reset(new mkldnn::stream(mkldnn::stream::kind::eager));
ready_ = true;
return nullptr;
}
#endif
......
......@@ -22,7 +22,7 @@ limitations under the License. */
#endif
#ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h"
#include <mkldnn.hpp>
#endif
#include "paddle/fluid/platform/enforce.h"
......@@ -79,6 +79,9 @@ class CUDADeviceContext : public DeviceContext {
/*! \brief Return place in the device context. */
Place GetPlace() const override;
/*! \brief Return the max physical thread count in the device context */
int GetMaxPhysicalThreadCount() const;
/*! \brief Return eigen device in the device context. */
Eigen::GpuDevice* eigen_device() const;
......@@ -100,6 +103,9 @@ class CUDADeviceContext : public DeviceContext {
cudaStream_t stream_;
cudnnHandle_t cudnn_handle_;
cublasHandle_t cublas_handle_;
int multi_process;
int max_threads_per_mp;
};
template <>
......@@ -114,46 +120,19 @@ class MKLDNNDeviceContext : public CPUDeviceContext {
public:
explicit MKLDNNDeviceContext(CPUPlace place);
/* \brief Add new element: memory, primitive or primitive desc */
template <typename T>
void AddElement(const std::string& op_key, const T& value);
/* \brief Get existed element: memory, primitive or primitive desc */
template <typename T>
const T& GetElement(const std::string& op_key) const;
/* \brief Get element pool: memory, primitive or primitive desc pool */
template <typename T>
const std::unordered_map<const std::string, const T, std::hash<std::string>>&
GetElementPool() const;
/* \brief Get the active engine */
const MKLDNNEngine& engine() const { return *engine_; }
/* \brief Submit primitive to pipeline */
void Submit(const MKLDNNPrimitivePtr& p) { pipeline_.push_back(*p); }
const mkldnn::engine& GetEngine() const { return engine_; }
/*! \brief Execute all submitted primitives in pipeline */
void Execute(bool block = true);
// Set data to blob (i.e. name/data pair). Create blob if not existing
void SetBlob(const std::string& name, std::shared_ptr<void> data) const;
protected:
/*! \brief Reset the stream to prepare next exectue */
void ResetStream();
// Find a saved blob. Return nullptr if not found
std::shared_ptr<void> GetBlob(const std::string& name) const;
private:
std::unordered_map<const std::string, const MKLDNNMemoryPtr,
std::hash<std::string>>
memory_pool_;
std::unordered_map<const std::string, const MKLDNNPrimitivePtr,
std::hash<std::string>>
primitive_pool_;
std::unordered_map<const std::string, const MKLDNNPrimitiveDescPtr,
std::hash<std::string>>
primitive_desc_pool_;
std::vector<MKLDNNPrimitive> pipeline_;
MKLDNNStreamPtr stream_;
MKLDNNEnginePtr engine_;
bool ready_;
mkldnn::engine engine_;
std::shared_ptr<std::unordered_map<std::string, std::shared_ptr<void>>>
p_blobs_;
};
#endif
......
......@@ -192,21 +192,36 @@ class DeviceTracerImpl : public DeviceTracer {
}
void AddCPURecords(const char *anno, uint64_t start_ns, uint64_t end_ns) {
if (!anno) {
// TODO(panyx0718): Currently, it doesn't support nested situation
// Up-level can be cleared by low-level and therefore get nullptr
// here.
return;
}
std::lock_guard<std::mutex> l(trace_mu_);
cpu_records_.push_back(
CPURecord{anno, start_ns, end_ns,
std::hash<std::thread::id>{}(std::this_thread::get_id())});
cpu_records_.push_back(CPURecord{anno, start_ns, end_ns, 0});
}
void AddMemRecords(const std::string &name, uint64_t start_ns,
uint64_t end_ns, uint32_t device_id, uint32_t stream_id,
uint32_t correlation_id, uint64_t bytes) {
// 0 means timestamp information could not be collected for the kernel.
if (start_ns == 0 || end_ns == 0) {
VLOG(3) << name << " cannot be traced";
return;
}
std::lock_guard<std::mutex> l(trace_mu_);
mem_records_.push_back(MemRecord{name, start_ns, end_ns, device_id,
stream_id, correlation_id, bytes});
}
void AddKernelRecords(uint64_t start, uint64_t end, uint32_t device_id,
uint32_t stream_id, uint32_t correlation_id) {
// 0 means timestamp information could not be collected for the kernel.
if (start == 0 || end == 0) {
VLOG(3) << correlation_id << " cannot be traced";
return;
}
std::lock_guard<std::mutex> l(trace_mu_);
kernel_records_.push_back(
KernelRecord{start, end, device_id, stream_id, correlation_id});
......@@ -279,10 +294,10 @@ class DeviceTracerImpl : public DeviceTracer {
event->set_device_id(r.device_id);
event->mutable_memcopy()->set_bytes(r.bytes);
}
std::string profile_str;
google::protobuf::TextFormat::PrintToString(profile_pb, &profile_str);
std::ofstream profile_f;
profile_f.open(profile_path, std::ios::out | std::ios::trunc);
std::string profile_str;
profile_pb.SerializeToString(&profile_str);
profile_f << profile_str;
profile_f.close();
return profile_pb;
......
......@@ -68,6 +68,8 @@ extern void *cublas_dso_handle;
__macro(cublasDgemv_v2); \
__macro(cublasSgemm_v2); \
__macro(cublasDgemm_v2); \
__macro(cublasHgemm); \
__macro(cublasSgemmEx); \
__macro(cublasSgeam_v2); \
__macro(cublasDgeam_v2); \
__macro(cublasCreate_v2); \
......@@ -83,6 +85,7 @@ extern void *cublas_dso_handle;
__macro(cublasDgemmStridedBatched); \
__macro(cublasCgemmStridedBatched); \
__macro(cublasZgemmStridedBatched); \
__macro(cublasHgemmStridedBatched); \
__macro(cublasSgetrfBatched); \
__macro(cublasSgetriBatched); \
__macro(cublasDgetrfBatched); \
......
......@@ -20,10 +20,6 @@ limitations under the License. */
#include <cuda.h>
#endif // PADDLE_WITH_CUDA
#include "unsupported/Eigen/CXX11/Tensor"
#include "paddle/fluid/platform/hostdevice.h"
#ifdef __GNUC__
#define PADDLE_GNUC_VER (__GNUC__ * 10 + __GNUC_MINOR__)
#else
......@@ -64,6 +60,18 @@ limitations under the License. */
namespace paddle {
namespace platform {
// Forward declare float16 for eigen.h
struct float16;
} // namespace platform
} // namespace paddle
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/platform/hostdevice.h"
namespace paddle {
namespace platform {
// Use PADDLE_ALIGNED(2) to ensure that each float16 will be allocated
// and aligned at least on a 2-byte boundary, which leads to efficient
// memory access of float16 struct and also makes float16 compatible
......@@ -729,6 +737,22 @@ HOSTDEVICE inline bool operator>=(const float16& a, const float16& b) {
}
#endif
HOSTDEVICE inline bool(isnan)(const float16& a) {
#if defined(PADDLE_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
return __hisnan(half(a));
#else
return (a.x & 0x7fff) > 0x7c00;
#endif
}
HOSTDEVICE inline bool(isinf)(const float16& a) {
return (a.x & 0x7fff) == 0x7c00;
}
HOSTDEVICE inline bool(isfinite)(const float16& a) {
return !((isnan)(a)) && !((isinf)(a));
}
} // namespace platform
} // namespace paddle
......@@ -750,3 +774,27 @@ struct is_pod<paddle::platform::float16> {
};
} // namespace std
namespace Eigen {
namespace numext {
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool(isnan)(
const paddle::platform::float16& a) {
return (paddle::platform::isnan)(a);
}
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool(isinf)(
const paddle::platform::float16& a) {
return (paddle::platform::isinf)(a);
}
template <>
EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool(isfinite)(
const paddle::platform::float16& a) {
return (paddle::platform::isfinite)(a);
}
} // namespace numext
} // namespace Eigen
......@@ -33,6 +33,26 @@ int GetCUDADeviceCount() {
return count;
}
int GetCUDAMultiProcessors(int id) {
PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count");
int count;
PADDLE_ENFORCE(
cudaDeviceGetAttribute(&count, cudaDevAttrMultiProcessorCount, id),
"cudaDeviceGetAttribute failed in "
"paddle::platform::GetCUDAMultiProcessors");
return count;
}
int GetCUDAMaxThreadsPerMultiProcessor(int id) {
PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count");
int count;
PADDLE_ENFORCE(cudaDeviceGetAttribute(
&count, cudaDevAttrMaxThreadsPerMultiProcessor, id),
"cudaDeviceGetAttribute failed in "
"paddle::platform::GetCUDAMaxThreadsPerMultiProcessor");
return count;
}
int GetCurrentDeviceId() {
int device_id;
PADDLE_ENFORCE(
......
......@@ -30,6 +30,12 @@ const std::string kEnvFractionGpuMemoryToUse =
//! Get the total number of GPU devices in system.
int GetCUDADeviceCount();
//! Get the MultiProcessors of the ith GPU.
int GetCUDAMultiProcessors(int i);
//! Get the MaxThreads of each MultiProcessor of the ith GPU.
int GetCUDAMaxThreadsPerMultiProcessor(int i);
//! Get the current GPU device id in system.
int GetCurrentDeviceId();
......
......@@ -16,12 +16,15 @@ limitations under the License. */
#include <mkldnn.hpp>
#include "paddle/fluid/framework/operator.h"
namespace paddle {
namespace platform {
using MKLDNNStream = mkldnn::stream;
using MKLDNNEngine = mkldnn::engine;
using MKLDNNMemory = mkldnn::memory;
using MKLDNNMemoryDescriptor = mkldnn::memory::desc;
using MKLDNNPrimitive = mkldnn::primitive;
using MKLDNNPrimitiveDesc = mkldnn::handle<mkldnn_primitive_desc_t>;
......@@ -31,5 +34,17 @@ typedef std::unique_ptr<MKLDNNMemory> MKLDNNMemoryPtr;
typedef std::unique_ptr<MKLDNNPrimitive> MKLDNNPrimitivePtr;
typedef std::unique_ptr<MKLDNNPrimitiveDesc> MKLDNNPrimitiveDescPtr;
inline mkldnn::memory::desc MKLDNNMemDesc(const std::vector<int>& dims,
mkldnn::memory::data_type data_type,
mkldnn::memory::format format) {
mkldnn::memory::dims tz = dims;
return mkldnn::memory::desc({tz}, data_type, format);
}
inline bool CanMKLDNNBeUsed(const framework::ExecutionContext& ctx) {
bool use_mkldnn = ctx.Attr<bool>("use_mkldnn");
return use_mkldnn && platform::is_cpu_place(ctx.GetPlace());
}
} // namespace platform
} // namespace paddle
......@@ -178,7 +178,7 @@ void EnableProfiler(ProfilerState state) {
}
#ifdef PADDLE_WITH_CUDA
if (g_state == ProfilerState::kCUDA) {
// Generate some dummy evenets first to reduce the startup overhead.
// Generate some dummy events first to reduce the startup overhead.
for (int i = 0; i < 5; i++) {
ForEachDevice([](int d) {
DeviceContext* dev_ctx = new CUDADeviceContext(CUDAPlace(d));
......
......@@ -15,7 +15,7 @@ limitations under the License. */
syntax = "proto2";
package paddle.platform.proto;
message MemCopy { optional uint64 bytes = 3; }
message MemCopy { optional uint64 bytes = 1; }
message Event {
optional string name = 1;
......
......@@ -75,6 +75,7 @@ TEST(RecordEvent, RecordEvent) {
* ...
* PopEvent(evt_name, dev_ctx);
*/
LOG(INFO) << "Usage 1: PushEvent & PopEvent";
for (int loop = 0; loop < 3; ++loop) {
for (int i = 1; i < 5; ++i) {
std::string name = "op_" + std::to_string(i);
......@@ -93,6 +94,7 @@ TEST(RecordEvent, RecordEvent) {
* ...
* }
*/
LOG(INFO) << "Usage 2: RecordEvent";
for (int i = 1; i < 5; ++i) {
std::string name = "evs_op_" + std::to_string(i);
RecordEvent record_event(name, dev_ctx);
......@@ -100,6 +102,34 @@ TEST(RecordEvent, RecordEvent) {
while (counter != i * 1000) counter++;
}
/* Usage 3
* {
* RecordEvent record_event(name1, dev_ctx);
* ...
* code to be analyzed
* ...
* {
* RecordEvent nested_record_event(name2, dev_ctx);
* ...
* code to be analyzed
* ...
* }
* }
*/
LOG(INFO) << "Usage 3: nested RecordEvent";
for (int i = 1; i < 5; ++i) {
std::string name = "ano_evs_op_" + std::to_string(i);
RecordEvent record_event(name, dev_ctx);
int counter = 1;
while (counter != i * 100) counter++;
{
std::string nested_name = "nested_ano_evs_op_" + std::to_string(i);
RecordEvent nested_record_event(nested_name, dev_ctx);
int nested_counter = 1;
while (nested_counter != i * 100) nested_counter++;
}
}
// Bad Usage:
PushEvent("event_without_pop", dev_ctx);
PopEvent("event_without_push", dev_ctx);
......
# internal library.
cc_library(header SRCS header.cc)
cc_test(header_test SRCS header_test.cc DEPS header)
cc_library(chunk SRCS chunk.cc DEPS snappystream snappy header zlib)
cc_test(chunk_test SRCS chunk_test.cc DEPS chunk)
cc_library(recordio DEPS chunk header)
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/recordio/chunk.h"
#include <memory>
#include <sstream>
#include "paddle/fluid/platform/enforce.h"
#include "snappystream.hpp"
#include "zlib.h"
namespace paddle {
namespace recordio {
constexpr size_t kMaxBufSize = 1024;
template <typename Callback>
static void ReadStreamByBuf(std::istream& in, int limit, Callback callback) {
char buf[kMaxBufSize];
std::streamsize actual_size;
size_t counter = 0;
do {
auto actual_max =
limit > 0 ? std::min(limit - counter, kMaxBufSize) : kMaxBufSize;
actual_size = in.readsome(buf, actual_max);
if (actual_size == 0) {
break;
}
callback(buf, actual_size);
if (limit > 0) {
counter += actual_size;
}
} while (actual_size == kMaxBufSize);
}
static void PipeStream(std::istream& in, std::ostream& os) {
ReadStreamByBuf(
in, -1, [&os](const char* buf, size_t len) { os.write(buf, len); });
}
static uint32_t Crc32Stream(std::istream& in, int limit = -1) {
auto crc = crc32(0, nullptr, 0);
ReadStreamByBuf(in, limit, [&crc](const char* buf, size_t len) {
crc = crc32(crc, reinterpret_cast<const Bytef*>(buf), len);
});
return crc;
}
bool Chunk::Write(std::ostream& os, Compressor ct) const {
// NOTE(dzhwinter): don't check records.numBytes instead, because
// empty records are allowed.
if (records_.empty()) {
return false;
}
std::stringstream sout;
std::unique_ptr<std::ostream> compressed_stream;
switch (ct) {
case Compressor::kNoCompress:
break;
case Compressor::kSnappy:
compressed_stream.reset(new snappy::oSnappyStream(sout));
break;
default:
PADDLE_THROW("Not implemented");
}
std::ostream& buf_stream = compressed_stream ? *compressed_stream : sout;
for (auto& record : records_) {
size_t sz = record.size();
buf_stream.write(reinterpret_cast<const char*>(&sz), sizeof(uint32_t))
.write(record.data(), record.size());
}
if (compressed_stream) {
compressed_stream.reset();
}
auto end_pos = sout.tellg();
sout.seekg(0, std::ios::beg);
uint32_t len = static_cast<uint32_t>(end_pos - sout.tellg());
uint32_t crc = Crc32Stream(sout);
sout.seekg(0, std::ios::beg);
Header hdr(static_cast<uint32_t>(records_.size()), crc, ct, len);
hdr.Write(os);
PipeStream(sout, os);
return true;
}
void Chunk::Parse(std::istream& sin) {
Header hdr;
hdr.Parse(sin);
auto beg_pos = sin.tellg();
auto crc = Crc32Stream(sin, hdr.CompressSize());
PADDLE_ENFORCE_EQ(hdr.Checksum(), crc);
Clear();
sin.seekg(beg_pos, std::ios::beg);
std::unique_ptr<std::istream> compressed_stream;
switch (hdr.CompressType()) {
case Compressor::kNoCompress:
break;
case Compressor::kSnappy:
compressed_stream.reset(new snappy::iSnappyStream(sin));
break;
default:
PADDLE_THROW("Not implemented");
}
std::istream& stream = compressed_stream ? *compressed_stream : sin;
for (uint32_t i = 0; i < hdr.NumRecords(); ++i) {
uint32_t rec_len;
stream.read(reinterpret_cast<char*>(&rec_len), sizeof(uint32_t));
std::string buf;
buf.resize(rec_len);
stream.read(&buf[0], rec_len);
Add(buf);
}
}
} // namespace recordio
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <vector>
#include "paddle/fluid/platform/macros.h"
#include "paddle/fluid/recordio/header.h"
namespace paddle {
namespace recordio {
// A Chunk contains the Header and optionally compressed records.
class Chunk {
public:
Chunk() : num_bytes_(0) {}
void Add(std::string buf) {
records_.push_back(buf);
num_bytes_ += buf.size();
}
// dump the chunk into w, and clears the chunk and makes it ready for
// the next add invocation.
bool Write(std::ostream& fo, Compressor ct) const;
void Clear() {
records_.clear();
num_bytes_ = 0;
}
void Parse(std::istream& sin);
size_t NumBytes() { return num_bytes_; }
const std::string& Record(int i) const { return records_[i]; }
private:
std::vector<std::string> records_;
// sum of record lengths in bytes.
size_t num_bytes_;
DISABLE_COPY_AND_ASSIGN(Chunk);
};
size_t CompressData(const char* in, size_t in_length, Compressor ct, char* out);
void DeflateData(const char* in, size_t in_length, Compressor ct, char* out);
} // namespace recordio
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/recordio/chunk.h"
#include <sstream>
#include "gtest/gtest.h"
using namespace paddle::recordio;
TEST(Chunk, SaveLoad) {
Chunk ch;
ch.Add(std::string("12345", 6));
ch.Add(std::string("123", 4));
std::stringstream ss;
ch.Write(ss, Compressor::kNoCompress);
ch.Clear();
ch.Parse(ss);
ASSERT_EQ(ch.NumBytes(), 10U);
}
TEST(Chunk, Compressor) {
Chunk ch;
ch.Add(std::string("12345", 6));
ch.Add(std::string("123", 4));
ch.Add(std::string("123", 4));
ch.Add(std::string("123", 4));
std::stringstream ss;
ch.Write(ss, Compressor::kSnappy);
std::stringstream ss2;
ch.Write(ss2, Compressor::kNoCompress);
ASSERT_LE(ss.tellp(), ss2.tellp()); // Compress should contain less data;
ch.Clear();
ch.Parse(ss);
ASSERT_EQ(ch.NumBytes(), 18);
}
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/recordio/header.h"
namespace paddle {
namespace recordio {
Header::Header()
: num_records_(0),
checksum_(0),
compressor_(Compressor::kNoCompress),
compress_size_(0) {}
Header::Header(uint32_t num, uint32_t sum, Compressor c, uint32_t cs)
: num_records_(num), checksum_(sum), compressor_(c), compress_size_(cs) {}
void Header::Parse(std::istream& is) {
is.read(reinterpret_cast<char*>(&num_records_), sizeof(uint32_t))
.read(reinterpret_cast<char*>(&checksum_), sizeof(uint32_t))
.read(reinterpret_cast<char*>(&compressor_), sizeof(uint32_t))
.read(reinterpret_cast<char*>(&compress_size_), sizeof(uint32_t));
}
void Header::Write(std::ostream& os) const {
os.write(reinterpret_cast<const char*>(&num_records_), sizeof(uint32_t))
.write(reinterpret_cast<const char*>(&checksum_), sizeof(uint32_t))
.write(reinterpret_cast<const char*>(&compressor_), sizeof(uint32_t))
.write(reinterpret_cast<const char*>(&compress_size_), sizeof(uint32_t));
}
std::ostream& operator<<(std::ostream& os, Header h) {
os << h.NumRecords() << h.Checksum()
<< static_cast<uint32_t>(h.CompressType()) << h.CompressSize();
return os;
}
bool operator==(Header l, Header r) {
return l.NumRecords() == r.NumRecords() && l.Checksum() == r.Checksum() &&
l.CompressType() == r.CompressType() &&
l.CompressSize() == r.CompressSize();
}
} // namespace recordio
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <sstream>
namespace paddle {
namespace recordio {
// Default ChunkSize
constexpr size_t kDefaultMaxChunkSize = 32 * 1024 * 1024;
// MagicNumber for memory checking
constexpr uint32_t kMagicNumber = 0x01020304;
enum class Compressor : uint32_t {
// NoCompression means writing raw chunk data into files.
// With other choices, chunks are compressed before written.
kNoCompress = 0,
// Snappy had been the default compressing algorithm widely
// used in Google. It compromises between speech and
// compression ratio.
kSnappy = 1,
// Gzip is a well-known compression algorithm. It is
// recommmended only you are looking for compression ratio.
kGzip = 2,
};
// Header is the metadata of Chunk
class Header {
public:
Header();
Header(uint32_t num, uint32_t sum, Compressor ct, uint32_t cs);
void Write(std::ostream& os) const;
void Parse(std::istream& is);
uint32_t NumRecords() const { return num_records_; }
uint32_t Checksum() const { return checksum_; }
Compressor CompressType() const { return compressor_; }
uint32_t CompressSize() const { return compress_size_; }
private:
uint32_t num_records_;
uint32_t checksum_;
Compressor compressor_;
uint32_t compress_size_;
};
// Allow Header Loggable
std::ostream& operator<<(std::ostream& os, Header h);
bool operator==(Header l, Header r);
} // namespace recordio
} // namespace paddle
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/recordio/header.h"
#include <sstream>
#include "gtest/gtest.h"
using namespace paddle::recordio;
TEST(Recordio, ChunkHead) {
Header hdr(0, 1, Compressor::kGzip, 3);
std::stringstream ss;
hdr.Write(ss);
ss.seekg(0, std::ios::beg);
Header hdr2;
hdr2.Parse(ss);
EXPECT_TRUE(hdr == hdr2);
}
......@@ -213,7 +213,7 @@ function gen_fluid_inference_lib() {
if [ ${WITH_C_API:-OFF} == "OFF" ] ; then
cat <<EOF
========================================
Building fluid inference library ...
Deploying fluid inference library ...
========================================
EOF
make inference_lib_dist
......
......@@ -28,6 +28,7 @@ import nets
import optimizer
import backward
import regularizer
import average
from param_attr import ParamAttr, WeightNormParamAttr
from data_feeder import DataFeeder
from core import LoDTensor, CPUPlace, CUDAPlace
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
"""
Class of all kinds of Average.
All Averages are accomplished via Python totally.
They do not change Paddle's Program, nor do anything to
modify NN model's configuration. They are completely
wrappers of Python functions.
"""
def _is_number_(var):
return isinstance(var, int) or isinstance(var, float) or (isinstance(
var, np.ndarray) and var.shape == (1, ))
def _is_number_or_matrix_(var):
return _is_number_(var) or isinstance(var, np.ndarray)
class WeightedAverage(object):
def __init__(self):
self.reset()
def reset(self):
self.numerator = None
self.denominator = None
def add(self, value, weight):
if not _is_number_or_matrix_(value):
raise ValueError(
"The 'value' must be a number(int, float) or a numpy ndarray.")
if not _is_number_(weight):
raise ValueError("The 'weight' must be a number(int, float).")
if self.numerator is None or self.denominator is None:
self.numerator = value * weight
self.denominator = weight
else:
self.numerator += value * weight
self.denominator += weight
def eval(self):
if self.numerator is None or self.denominator is None:
raise ValueError(
"There is no data to be averaged in WeightedAverage.")
return self.numerator / self.denominator
......@@ -486,7 +486,7 @@ def append_backward(loss, parameter_list=None, no_grad_set=None,
params_and_grads = []
for param in parameters:
if param not in grad_info_map:
raise ValueError("param %s is not in map" % param)
continue
grad_info = grad_info_map[param]
grad_block = grad_info[1]
if not grad_block.has_var(grad_info[0]):
......
......@@ -108,44 +108,6 @@ class Evaluator(object):
return state
class Accuracy(Evaluator):
"""
Average Accuracy for multiple mini-batches.
"""
def __init__(self, input, label, k=1, **kwargs):
super(Accuracy, self).__init__("accuracy", **kwargs)
main_program = self.helper.main_program
if main_program.current_block().idx != 0:
raise ValueError("You can only invoke Evaluator in root block")
self.total = self.create_state(dtype='int64', shape=[1], suffix='total')
self.correct = self.create_state(
dtype='int64', shape=[1], suffix='correct')
total = self.helper.create_tmp_variable(dtype='int')
correct = self.helper.create_tmp_variable(dtype='int')
acc = layers.accuracy(
input=input, label=label, k=k, total=total, correct=correct)
total = layers.cast(x=total, dtype='int64')
correct = layers.cast(x=correct, dtype='int64')
layers.sums(input=[self.total, total], out=self.total)
layers.sums(input=[self.correct, correct], out=self.correct)
self.metrics.append(acc)
def eval(self, executor, eval_program=None):
if eval_program is None:
eval_program = Program()
block = eval_program.current_block()
with program_guard(main_program=eval_program):
total = _clone_var_(block, self.total)
correct = _clone_var_(block, self.correct)
total = layers.cast(total, dtype='float32')
correct = layers.cast(correct, dtype='float32')
out = layers.elementwise_div(x=correct, y=total)
return np.array(executor.run(eval_program, fetch_list=[out])[0])
class ChunkEvaluator(Evaluator):
"""
Accumulate counter numbers output by chunk_eval from mini-batches and
......@@ -312,6 +274,10 @@ class DetectionMAP(Evaluator):
bounding box (bbox), which is a LoDTensor [N, 1].
gt_box (Variable): The ground truth bounding box (bbox), which is a
LoDTensor with shape [N, 6]. The layout is [xmin, ymin, xmax, ymax].
class_num (int): The class number.
background_label (int): The index of background label, the background
label will be ignored. If set to -1, then all categories will be
considered, 0 by defalut.
overlap_threshold (float): The threshold for deciding true/false
positive, 0.5 by defalut.
evaluate_difficult (bool): Whether to consider difficult ground truth
......@@ -345,6 +311,8 @@ class DetectionMAP(Evaluator):
gt_label,
gt_box,
gt_difficult,
class_num,
background_label=0,
overlap_threshold=0.5,
evaluate_difficult=True,
ap_version='integral'):
......@@ -358,6 +326,8 @@ class DetectionMAP(Evaluator):
map = layers.detection_map(
input,
label,
class_num,
background_label,
overlap_threshold=overlap_threshold,
evaluate_difficult=evaluate_difficult,
ap_version=ap_version)
......@@ -377,6 +347,8 @@ class DetectionMAP(Evaluator):
accum_map = layers.detection_map(
input,
label,
class_num,
background_label,
overlap_threshold=overlap_threshold,
evaluate_difficult=evaluate_difficult,
has_state=self.has_state,
......
......@@ -102,6 +102,9 @@ def save_vars(executor,
save_var_map = {}
for each_var in vars:
# NOTE: don't save the variable which type is RAW
if each_var.type == core.VarDesc.VarType.RAW:
continue
new_var = _clone_var_in_block_(save_block, each_var)
if filename is None:
save_block.append_op(
......
......@@ -28,6 +28,8 @@ import math_op_patch
from math_op_patch import *
import detection
from detection import *
import metric
from metric import *
from learning_rate_scheduler import *
__all__ = []
......@@ -39,4 +41,5 @@ __all__ += control_flow.__all__
__all__ += ops.__all__
__all__ += device.__all__
__all__ += detection.__all__
__all__ += metric.__all__
__all__ += learning_rate_scheduler.__all__
......@@ -151,6 +151,8 @@ def detection_output(loc,
@autodoc()
def detection_map(detect_res,
label,
class_num,
background_label=0,
overlap_threshold=0.3,
evaluate_difficult=True,
has_state=None,
......@@ -192,7 +194,8 @@ def detection_map(detect_res,
attrs={
'overlap_threshold': overlap_threshold,
'evaluate_difficult': evaluate_difficult,
'ap_type': ap_version
'ap_type': ap_version,
'class_num': class_num,
})
return map_out
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
All layers just related to metric.
"""
from ..layer_helper import LayerHelper
from ..initializer import Normal, Constant
from ..framework import Variable
from ..param_attr import ParamAttr
__all__ = ['accuracy']
def accuracy(input, label, k=1, correct=None, total=None):
"""
This function computes the accuracy using the input and label.
The output is the top_k inputs and their indices.
"""
helper = LayerHelper("accuracy", **locals())
topk_out = helper.create_tmp_variable(dtype=input.dtype)
topk_indices = helper.create_tmp_variable(dtype="int64")
helper.append_op(
type="top_k",
inputs={"X": [input]},
outputs={"Out": [topk_out],
"Indices": [topk_indices]},
attrs={"k": k})
acc_out = helper.create_tmp_variable(dtype="float32")
if correct is None:
correct = helper.create_tmp_variable(dtype="int64")
if total is None:
total = helper.create_tmp_variable(dtype="int64")
helper.append_op(
type="accuracy",
inputs={
"Out": [topk_out],
"Indices": [topk_indices],
"Label": [label]
},
outputs={
"Accuracy": [acc_out],
"Correct": [correct],
"Total": [total],
})
return acc_out
......@@ -35,7 +35,6 @@ __all__ = [
'cos_sim',
'cross_entropy',
'square_error_cost',
'accuracy',
'chunk_eval',
'sequence_conv',
'conv2d',
......@@ -1022,40 +1021,6 @@ def square_error_cost(input, label):
return square_out
def accuracy(input, label, k=1, correct=None, total=None):
"""
This function computes the accuracy using the input and label.
The output is the top_k inputs and their indices.
"""
helper = LayerHelper("accuracy", **locals())
topk_out = helper.create_tmp_variable(dtype=input.dtype)
topk_indices = helper.create_tmp_variable(dtype="int64")
helper.append_op(
type="top_k",
inputs={"X": [input]},
outputs={"Out": [topk_out],
"Indices": [topk_indices]},
attrs={"k": k})
acc_out = helper.create_tmp_variable(dtype="float32")
if correct is None:
correct = helper.create_tmp_variable(dtype="int64")
if total is None:
total = helper.create_tmp_variable(dtype="int64")
helper.append_op(
type="accuracy",
inputs={
"Out": [topk_out],
"Indices": [topk_indices],
"Label": [label]
},
outputs={
"Accuracy": [acc_out],
"Correct": [correct],
"Total": [total],
})
return acc_out
def chunk_eval(input,
label,
chunk_scheme,
......@@ -1146,6 +1111,7 @@ def conv2d(input,
param_attr=None,
bias_attr=None,
use_cudnn=True,
use_mkldnn=False,
act=None):
"""
**Convlution2D Layer**
......@@ -1287,7 +1253,8 @@ def conv2d(input,
'strides': stride,
'paddings': padding,
'groups': groups,
'use_cudnn': use_cudnn
'use_cudnn': use_cudnn,
'use_mkldnn': use_mkldnn
})
pre_act = helper.append_bias_op(pre_bias, dim_start=1, dim_end=2)
......@@ -1438,6 +1405,8 @@ def pool2d(input,
pool_padding=0,
global_pooling=False,
use_cudnn=True,
ceil_mode=False,
use_mkldnn=False,
name=None):
"""
This function adds the operator for pooling in 2 dimensions, using the
......@@ -1474,7 +1443,9 @@ def pool2d(input,
"global_pooling": global_pooling,
"strides": pool_stride,
"paddings": pool_padding,
"use_cudnn": use_cudnn
"use_cudnn": use_cudnn,
"ceil_mode": ceil_mode,
"use_mkldnn": use_mkldnn
})
return pool_out
......@@ -3180,7 +3151,7 @@ def smooth_l1(x, y, inside_weight=None, outside_weight=None, sigma=None):
data = fluid.layers.data(name='data', shape=[128], dtype='float32')
label = fluid.layers.data(name='label', shape=[100], dtype='int64')
fc = fluid.layers.fc(input=data, size=100)
out = fluid.layers.smooth_l1(logits=fc, label=label)
out = fluid.layers.smooth_l1(x=fc, y=label)
"""
helper = LayerHelper('smooth_l1_loss', **locals())
diff = helper.create_tmp_variable(dtype=x.dtype)
......
......@@ -31,6 +31,8 @@ dtype_to_size = {
sub_block_ops = ["while", "while_grad", "parallel_do", "parallel_do_grad"]
PRINT_LOG = False
class ControlFlowGraph(object):
def __init__(self, Program, ops, forward_num, skip_opt):
......@@ -171,7 +173,9 @@ class ControlFlowGraph(object):
# TODO(qijun): actually, we should compare dtype_to_size[x_dtype]
# and dtype_to_size[cache_dtype]
if x_dtype == cache_dtype:
print(("Hit Cache !!!! cache pool index "
if PRINT_LOG:
print(
("Hit Cache !!!! cache pool index "
"is %d, var name is %s, "
"cached var name is %s, "
"var shape is %s ") %
......@@ -277,7 +281,9 @@ def _get_cfgs(input_program):
return cfgs
def memory_optimize(input_program):
def memory_optimize(input_program, print_log=False):
global PRINT_LOG
PRINT_LOG = print_log
cfgs = _get_cfgs(input_program)
for cfg in cfgs:
cfg.memory_optimize()
......@@ -29,21 +29,24 @@ def simple_img_conv_pool(input,
act,
param_attr=None,
pool_type='max',
use_cudnn=True):
use_cudnn=True,
use_mkldnn=False):
conv_out = layers.conv2d(
input=input,
num_filters=num_filters,
filter_size=filter_size,
param_attr=param_attr,
act=act,
use_cudnn=use_cudnn)
use_cudnn=use_cudnn,
use_mkldnn=use_mkldnn)
pool_out = layers.pool2d(
input=conv_out,
pool_size=pool_size,
pool_type=pool_type,
pool_stride=pool_stride,
use_cudnn=use_cudnn)
use_cudnn=use_cudnn,
use_mkldnn=use_mkldnn)
return pool_out
......@@ -58,7 +61,8 @@ def img_conv_group(input,
conv_batchnorm_drop_rate=0.0,
pool_stride=1,
pool_type=None,
use_cudnn=True):
use_cudnn=True,
use_mkldnn=False):
"""
Image Convolution Group, Used for vgg net.
"""
......@@ -90,7 +94,8 @@ def img_conv_group(input,
padding=conv_padding[i],
param_attr=param_attr[i],
act=local_conv_act,
use_cudnn=use_cudnn)
use_cudnn=use_cudnn,
use_mkldnn=use_mkldnn)
if conv_with_batchnorm[i]:
tmp = layers.batch_norm(input=tmp, act=conv_act)
......@@ -103,7 +108,8 @@ def img_conv_group(input,
pool_size=pool_size,
pool_type=pool_type,
pool_stride=pool_stride,
use_cudnn=use_cudnn)
use_cudnn=use_cudnn,
use_mkldnn=use_mkldnn)
return pool_out
......
......@@ -49,7 +49,7 @@ avg_cost = fluid.layers.mean(x=cost)
sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.01)
sgd_optimizer.minimize(avg_cost)
fluid.memory_optimize(fluid.default_main_program())
fluid.memory_optimize(fluid.default_main_program(), print_log=True)
BATCH_SIZE = 200
......
......@@ -122,7 +122,8 @@ avg_cost = fluid.layers.mean(cost)
optimizer = fluid.optimizer.Adam(learning_rate=0.001)
opts = optimizer.minimize(avg_cost)
accuracy = fluid.evaluator.Accuracy(input=predict, label=label)
batch_size = fluid.layers.create_tensor(dtype='int64')
batch_acc = fluid.layers.accuracy(input=predict, label=label, total=batch_size)
fluid.memory_optimize(fluid.default_main_program())
......@@ -144,13 +145,17 @@ feeder = fluid.DataFeeder(place=place, feed_list=[images, label])
exe.run(fluid.default_startup_program())
i = 0
accuracy = fluid.average.WeightedAverage()
for pass_id in range(PASS_NUM):
accuracy.reset(exe)
accuracy.reset()
for data in train_reader():
loss, acc = exe.run(fluid.default_main_program(),
loss, acc, weight = exe.run(
fluid.default_main_program(),
feed=feeder.feed(data),
fetch_list=[avg_cost] + accuracy.metrics)
pass_acc = accuracy.eval(exe)
fetch_list=[avg_cost, batch_acc, batch_size])
accuracy.add(value=acc, weight=weight)
pass_acc = accuracy.eval()
print("loss:" + str(loss) + " acc:" + str(acc) + " pass_acc:" + str(
pass_acc))
# this model is slow, so if we can train two mini batch, we think it works properly.
......
......@@ -15,16 +15,30 @@
import paddle.v2 as paddle
import paddle.fluid as fluid
import numpy as np
import sys
prog = fluid.framework.Program()
block = prog.current_block()
startup_prog = fluid.framework.Program()
startup_block = startup_prog.current_block()
random_reader = block.create_var(
random_reader = startup_block.create_var(
type=fluid.core.VarDesc.VarType.READER, name="RandomDataGenerator")
random_reader.desc.set_dtypes(
[fluid.core.VarDesc.VarType.FP32, fluid.core.VarDesc.VarType.FP32])
random_reader.persistable = True
shuffle_reader = startup_block.create_var(
type=fluid.core.VarDesc.VarType.READER, name="ShuffleReader")
shuffle_reader.persistable = True
batch_reader = startup_block.create_var(
type=fluid.core.VarDesc.VarType.READER, name="BatchReader")
batch_reader.persistable = True
double_buffer = startup_block.create_var(
type=fluid.core.VarDesc.VarType.READER, name="DoubleBuffer")
double_buffer.persistable = True
main_prog = startup_prog.clone()
main_block = main_prog.current_block()
create_random_data_generator_op = block.append_op(
create_random_data_generator_op = startup_block.append_op(
type="create_random_data_generator",
outputs={"Out": random_reader},
attrs={
......@@ -34,37 +48,45 @@ create_random_data_generator_op = block.append_op(
"max": 1.0,
'lod_levels': [0, 0]
})
shuffle_reader = block.create_var(
type=fluid.core.VarDesc.VarType.READER, name="ShuffleReader")
create_shuffle_reader_op = block.append_op(
create_shuffle_reader_op = startup_block.append_op(
type="create_shuffle_reader",
inputs={"UnderlyingReader": random_reader},
outputs={"Out": shuffle_reader},
attrs={"buffer_size": 7})
batch_reader = block.create_var(
type=fluid.core.VarDesc.VarType.READER, name="BatchReader")
create_batch_reader_op = block.append_op(
create_batch_reader_op = startup_block.append_op(
type="create_batch_reader",
inputs={"UnderlyingReader": shuffle_reader},
outputs={"Out": batch_reader},
attrs={"batch_size": 10})
out1 = block.create_var(type=fluid.core.VarDesc.VarType.LOD_TENSOR, name="Out1")
out2 = block.create_var(type=fluid.core.VarDesc.VarType.LOD_TENSOR, name="Out2")
create_double_buffer_reader_op = startup_block.append_op(
type="create_double_buffer_reader",
inputs={"UnderlyingReader": batch_reader},
outputs={"Out": double_buffer})
out1 = main_block.create_var(
type=fluid.core.VarDesc.VarType.LOD_TENSOR, name="Out1")
out2 = main_block.create_var(
type=fluid.core.VarDesc.VarType.LOD_TENSOR, name="Out2")
read_op = block.append_op(
type="read", inputs={"Reader": batch_reader},
main_block.var("DoubleBuffer").desc.set_shapes(double_buffer.desc.shapes())
main_block.var("DoubleBuffer").desc.set_dtypes(double_buffer.desc.dtypes())
main_block.var("DoubleBuffer").desc.set_lod_levels(
double_buffer.desc.lod_levels())
read_op = main_block.append_op(
type="read",
inputs={"Reader": double_buffer},
outputs={"Out": [out1, out2]})
place = fluid.CPUPlace()
exe = fluid.Executor(place)
[res1, res2] = exe.run(prog, fetch_list=[out1, out2])
exe.run(startup_prog)
if not (res1.shape == (10, 2) and res2.shape == (10, 1)):
for i in range(1, 100):
[res1, res2] = exe.run(main_prog, fetch_list=[out1, out2])
if not (res1.shape == (10, 2) and res2.shape == (10, 1)):
exit(1)
exit(0)
......@@ -158,7 +158,7 @@ class TestDetectionMAP(unittest.TestCase):
append_batch_size=False,
dtype='float32')
map_out = layers.detection_map(detect_res=detect_res, label=label)
map_out = layers.detection_map(detect_res, label, 21)
self.assertIsNotNone(map_out)
self.assertEqual(map_out.shape, (1, ))
print(str(program))
......
......@@ -64,6 +64,7 @@ def conv2d_forward_naive(input, filter, group, conv_param):
class TestConv2dOp(OpTest):
def setUp(self):
self.use_cudnn = False
self.use_mkldnn = False
self.init_op_type()
self.init_group()
self.init_dilation()
......@@ -85,7 +86,8 @@ class TestConv2dOp(OpTest):
'paddings': self.pad,
'groups': self.groups,
'dilations': self.dilations,
'use_cudnn': self.use_cudnn
'use_cudnn': self.use_cudnn,
'use_mkldnn': self.use_mkldnn
}
self.outputs = {'Output': output}
......@@ -290,5 +292,25 @@ class TestDepthwiseConv2(TestConv2dOp):
# def init_op_type(self):
# self.op_type = "conv_cudnn"
#----------------Conv2dMKLDNN----------------
class TestMKLDNN(TestConv2dOp):
def init_op_type(self):
self.use_mkldnn = True
self.op_type = "conv2d"
class TestMKLDNNWithPad(TestWithPad):
def init_op_type(self):
self.use_mkldnn = True
self.op_type = "conv2d"
class TestMKLDNNWithStride(TestWithStride):
def init_op_type(self):
self.use_mkldnn = True
self.op_type = "conv2d"
if __name__ == '__main__':
unittest.main()
......@@ -22,8 +22,8 @@ from op_test import OpTest
class TestDetectionMAPOp(OpTest):
def set_data(self):
self.class_num = 4
self.init_test_case()
self.mAP = [self.calc_map(self.tf_pos, self.tf_pos_lod)]
self.label = np.array(self.label).astype('float32')
self.detect = np.array(self.detect).astype('float32')
......@@ -53,7 +53,8 @@ class TestDetectionMAPOp(OpTest):
self.attrs = {
'overlap_threshold': self.overlap_threshold,
'evaluate_difficult': self.evaluate_difficult,
'ap_type': self.ap_type
'ap_type': self.ap_type,
'class_num': self.class_num
}
self.out_class_pos_count = np.array(self.out_class_pos_count).astype(
......@@ -126,12 +127,7 @@ class TestDetectionMAPOp(OpTest):
return class_pos_count_dict, true_pos_dict, false_pos_dict
def get_output_pos(label_count, true_pos, false_pos):
max_label = 0
for (label, label_pos_num) in label_count.items():
if max_label < label:
max_label = label
label_number = max_label + 1
label_number = self.class_num
out_class_pos_count = []
out_true_pos_lod = [0]
......@@ -220,11 +216,16 @@ class TestDetectionMAPOp(OpTest):
mAP += average_precisions
count += 1
self.out_class_pos_count, self.out_true_pos, self.out_true_pos_lod, self.out_false_pos, self.out_false_pos_lod = get_output_pos(
label_count, true_pos, false_pos)
pcnt, tp, tp_lod, fp, fp_lod = get_output_pos(label_count, true_pos,
false_pos)
self.out_class_pos_count = pcnt
self.out_true_pos = tp
self.out_true_pos_lod = tp_lod
self.out_false_pos = fp
self.out_false_pos_lod = fp_lod
if count != 0:
mAP /= count
return mAP * 100.0
return mAP
def setUp(self):
self.op_type = "detection_map"
......
......@@ -19,6 +19,7 @@ import unittest
import paddle.fluid as fluid
import paddle.fluid.layers as layers
import paddle.fluid.framework as framework
import paddle.fluid.core as core
def exponential_decay(learning_rate,
......@@ -81,6 +82,16 @@ def piecewise_decay(global_step, boundaries, values):
class TestLearningRateDecay(unittest.TestCase):
def check_decay(self, python_decay_fn, fluid_decay_fn, kwargs):
places = [fluid.CPUPlace()]
if core.is_compiled_with_cuda():
places.append(fluid.CUDAPlace(0))
for place in places:
self.check_decay_with_place(place, python_decay_fn, fluid_decay_fn,
kwargs)
def check_decay_with_place(self, place, python_decay_fn, fluid_decay_fn,
kwargs):
decayed_lr = fluid_decay_fn(**kwargs)
place = fluid.CPUPlace()
......
......@@ -19,12 +19,21 @@ import paddle.fluid.core as core
from op_test import OpTest
def max_pool2D_forward_naive(x, ksize, strides, paddings, global_pool=0):
def max_pool2D_forward_naive(x,
ksize,
strides,
paddings,
global_pool=0,
ceil_mode=False):
N, C, H, W = x.shape
if global_pool == 1:
ksize = [H, W]
H_out = (H - ksize[0] + 2 * paddings[0]) / strides[0] + 1
W_out = (W - ksize[1] + 2 * paddings[1]) / strides[1] + 1
H_out = (H - ksize[0] + 2 * paddings[0] + strides[0] - 1
) / strides[0] + 1 if ceil_mode else (H - ksize[0] + 2 *
paddings[0]) / strides[0] + 1
W_out = (W - ksize[1] + 2 * paddings[1] + strides[1] - 1
) / strides[1] + 1 if ceil_mode else (W - ksize[1] + 2 *
paddings[1]) / strides[1] + 1
out = np.zeros((N, C, H_out, W_out))
for i in xrange(H_out):
for j in xrange(W_out):
......@@ -38,12 +47,21 @@ def max_pool2D_forward_naive(x, ksize, strides, paddings, global_pool=0):
return out
def avg_pool2D_forward_naive(x, ksize, strides, paddings, global_pool=0):
def avg_pool2D_forward_naive(x,
ksize,
strides,
paddings,
global_pool=0,
ceil_mode=False):
N, C, H, W = x.shape
if global_pool == 1:
ksize = [H, W]
H_out = (H - ksize[0] + 2 * paddings[0]) / strides[0] + 1
W_out = (W - ksize[1] + 2 * paddings[1]) / strides[1] + 1
H_out = (H - ksize[0] + 2 * paddings[0] + strides[0] - 1
) / strides[0] + 1 if ceil_mode else (H - ksize[0] + 2 *
paddings[0]) / strides[0] + 1
W_out = (W - ksize[1] + 2 * paddings[1] + strides[1] - 1
) / strides[1] + 1 if ceil_mode else (W - ksize[1] + 2 *
paddings[1]) / strides[1] + 1
out = np.zeros((N, C, H_out, W_out))
for i in xrange(H_out):
for j in xrange(W_out):
......@@ -61,16 +79,18 @@ def avg_pool2D_forward_naive(x, ksize, strides, paddings, global_pool=0):
class TestPool2d_Op(OpTest):
def setUp(self):
self.use_cudnn = False
self.use_mkldnn = False
self.init_test_case()
self.init_global_pool()
self.init_op_type()
self.init_pool_type()
self.init_ceil_mode()
if self.global_pool:
self.paddings = [0 for _ in range(len(self.paddings))]
input = np.random.random(self.shape).astype("float32")
output = self.pool2D_forward_naive(input, self.ksize, self.strides,
self.paddings,
self.global_pool).astype("float32")
self.paddings, self.global_pool,
self.ceil_mode).astype("float32")
self.inputs = {'X': input}
self.attrs = {
......@@ -80,6 +100,8 @@ class TestPool2d_Op(OpTest):
'pooling_type': self.pool_type,
'global_pooling': self.global_pool,
'use_cudnn': self.use_cudnn,
'use_mkldnn': self.use_mkldnn,
'ceil_mode': self.ceil_mode,
'data_format': 'AnyLayout' # TODO(dzhwinter) : should be fix latter
}
......@@ -116,6 +138,9 @@ class TestPool2d_Op(OpTest):
def init_global_pool(self):
self.global_pool = True
def init_ceil_mode(self):
self.ceil_mode = False
class TestCase1(TestPool2d_Op):
def init_test_case(self):
......@@ -217,5 +242,62 @@ class TestCUDNNCase6(TestCase5):
self.op_type = "pool2d"
class TestCeilModeCase1(TestCUDNNCase1):
def init_ceil_mode(self):
self.ceil_mode = True
class TestCeilModeCase2(TestCUDNNCase2):
def init_ceil_mode(self):
self.ceil_mode = True
class TestCeilModeCase3(TestCase1):
def init_ceil_mode(self):
self.ceil_mode = True
class TestCeilModeCase4(TestCase2):
def init_ceil_mode(self):
self.ceil_mode = True
#--------------------test pool2d MKLDNN--------------------
class TestMKLDNNCase1(TestPool2d_Op):
def init_op_type(self):
self.use_mkldnn = True
self.op_type = "pool2d"
class TestMKLDNNCase2(TestCase1):
def init_op_type(self):
self.use_mkldnn = True
self.op_type = "pool2d"
class TestMKLDNNCase3(TestCase2):
def init_op_type(self):
self.use_mkldnn = True
self.op_type = "pool2d"
class TestMKLDNNCase4(TestCase3):
def init_op_type(self):
self.use_mkldnn = True
self.op_type = "pool2d"
class TestMKLDNNCase5(TestCase4):
def init_op_type(self):
self.use_mkldnn = True
self.op_type = "pool2d"
class TestMKLDNNCase6(TestCase5):
def init_op_type(self):
self.use_mkldnn = True
self.op_type = "pool2d"
if __name__ == '__main__':
unittest.main()
......@@ -19,13 +19,24 @@ import paddle.fluid.core as core
from op_test import OpTest
def max_pool3D_forward_naive(x, ksize, strides, paddings, global_pool=0):
def max_pool3D_forward_naive(x,
ksize,
strides,
paddings,
global_pool=0,
ceil_mode=False):
N, C, D, H, W = x.shape
if global_pool == 1:
ksize = [D, H, W]
D_out = (D - ksize[0] + 2 * paddings[0]) / strides[0] + 1
H_out = (H - ksize[1] + 2 * paddings[1]) / strides[1] + 1
W_out = (W - ksize[2] + 2 * paddings[2]) / strides[2] + 1
D_out = (D - ksize[0] + 2 * paddings[0] + strides[0] - 1
) / strides[0] + 1 if ceil_mode else (H - ksize[0] + 2 *
paddings[0]) / strides[0] + 1
H_out = (H - ksize[1] + 2 * paddings[1] + strides[1] - 1
) / strides[1] + 1 if ceil_mode else (W - ksize[1] + 2 *
paddings[1]) / strides[1] + 1
W_out = (W - ksize[2] + 2 * paddings[2] + strides[2] - 1
) / strides[2] + 1 if ceil_mode else (W - ksize[2] + 2 *
paddings[2]) / strides[2] + 1
out = np.zeros((N, C, D_out, H_out, W_out))
for k in xrange(D_out):
d_start = np.max((k * strides[0] - paddings[0], 0))
......@@ -42,13 +53,24 @@ def max_pool3D_forward_naive(x, ksize, strides, paddings, global_pool=0):
return out
def avg_pool3D_forward_naive(x, ksize, strides, paddings, global_pool=0):
def avg_pool3D_forward_naive(x,
ksize,
strides,
paddings,
global_pool=0,
ceil_mode=False):
N, C, D, H, W = x.shape
if global_pool == 1:
ksize = [D, H, W]
D_out = (D - ksize[0] + 2 * paddings[0]) / strides[0] + 1
H_out = (H - ksize[1] + 2 * paddings[1]) / strides[1] + 1
W_out = (W - ksize[2] + 2 * paddings[2]) / strides[2] + 1
D_out = (D - ksize[0] + 2 * paddings[0] + strides[0] - 1
) / strides[0] + 1 if ceil_mode else (H - ksize[0] + 2 *
paddings[0]) / strides[0] + 1
H_out = (H - ksize[1] + 2 * paddings[1] + strides[1] - 1
) / strides[1] + 1 if ceil_mode else (W - ksize[1] + 2 *
paddings[1]) / strides[1] + 1
W_out = (W - ksize[2] + 2 * paddings[2] + strides[2] - 1
) / strides[2] + 1 if ceil_mode else (W - ksize[2] + 2 *
paddings[2]) / strides[2] + 1
out = np.zeros((N, C, D_out, H_out, W_out))
for k in xrange(D_out):
d_start = np.max((k * strides[0] - paddings[0], 0))
......@@ -73,13 +95,14 @@ class TestPool3d_Op(OpTest):
self.init_global_pool()
self.init_op_type()
self.init_pool_type()
self.init_ceil_mode()
if self.global_pool:
self.paddings = [0 for _ in range(len(self.paddings))]
input = np.random.random(self.shape).astype("float32")
output = self.pool3D_forward_naive(input, self.ksize, self.strides,
self.paddings,
self.global_pool).astype("float32")
self.paddings, self.global_pool,
self.ceil_mode).astype("float32")
self.inputs = {'X': input}
self.attrs = {
......@@ -89,6 +112,7 @@ class TestPool3d_Op(OpTest):
'pooling_type': self.pool_type,
'global_pooling': self.global_pool,
'use_cudnn': self.use_cudnn,
'ceil_mode': self.ceil_mode,
'data_format': 'AnyLayout' # TODO(dzhwinter) : should be fix latter
}
......@@ -125,6 +149,9 @@ class TestPool3d_Op(OpTest):
def init_global_pool(self):
self.global_pool = True
def init_ceil_mode(self):
self.ceil_mode = False
class TestCase1(TestPool3d_Op):
def init_test_case(self):
......@@ -227,5 +254,25 @@ class TestCUDNNCase6(TestCase5):
self.op_type = "pool3d"
class TestCeilModeCase1(TestCUDNNCase1):
def init_ceil_mode(self):
self.ceil_mode = True
class TestCeilModeCase2(TestCUDNNCase2):
def init_ceil_mode(self):
self.ceil_mode = True
class TestCeilModeCase3(TestCase1):
def init_ceil_mode(self):
self.ceil_mode = True
class TestCeilModeCase4(TestCase2):
def init_ceil_mode(self):
self.ceil_mode = True
if __name__ == '__main__':
unittest.main()
......@@ -37,7 +37,9 @@ class TestProfiler(unittest.TestCase):
label = fluid.layers.data(name='y', shape=[1], dtype='int64')
cost = fluid.layers.cross_entropy(input=predict, label=label)
avg_cost = fluid.layers.mean(cost)
accuracy = fluid.evaluator.Accuracy(input=predict, label=label)
batch_size = fluid.layers.create_tensor(dtype='int64')
batch_acc = fluid.layers.accuracy(
input=predict, label=label, total=batch_size)
optimizer = fluid.optimizer.Momentum(learning_rate=0.001, momentum=0.9)
opts = optimizer.minimize(avg_cost, startup_program=startup_program)
......@@ -46,7 +48,7 @@ class TestProfiler(unittest.TestCase):
exe = fluid.Executor(place)
exe.run(startup_program)
accuracy.reset(exe)
pass_acc_calculator = fluid.average.WeightedAverage()
with profiler.profiler(state, 'total', profile_path) as prof:
for iter in range(10):
if iter == 2:
......@@ -57,9 +59,11 @@ class TestProfiler(unittest.TestCase):
outs = exe.run(main_program,
feed={'x': x,
'y': y},
fetch_list=[avg_cost] + accuracy.metrics)
fetch_list=[avg_cost, batch_acc, batch_size])
acc = np.array(outs[1])
pass_acc = accuracy.eval(exe)
b_size = np.array(outs[2])
pass_acc_calculator.add(value=acc, weight=b_size)
pass_acc = pass_acc_calculator.eval()
def test_cpu_profiler(self):
self.net_profiler('CPU')
......
......@@ -60,8 +60,8 @@ class TestSpliteSelectedRows(unittest.TestCase):
# expected output selected rows
expected_out0_rows = [0, 4]
expected_out1_rows = [5, 7]
expected_out4_rows = [20]
expected_out1_rows = [0, 2]
expected_out4_rows = [0]
op = Operator(
"split_selected_rows",
......@@ -101,7 +101,7 @@ class TestSpliteSelectedRows(unittest.TestCase):
out0_grad_tensor.set(np_array, place)
out1_grad = scope.var("out1@GRAD").get_selected_rows()
rows1 = [7, 5]
rows1 = [2, 0]
out1_grad.set_rows(rows1)
out1_grad.set_height(height)
out1_grad_tensor = out1_grad.get_tensor()
......
......@@ -13,8 +13,10 @@ ENV PATH /opt/rh/devtoolset-2/root/usr/bin:$PATH
ENV LD_LIBRARY_PATH /opt/rh/devtoolset-2/root/usr/lib64:/opt/rh/devtoolset-2/root/usr/lib:/usr/local/lib64:/usr/local/lib:${LD_LIBRARY_PATH}
ENV PKG_CONFIG_PATH=/usr/local/lib/pkgconfig
RUN yum install -y sqlite-devel zlib-devel openssl-devel pcre-devel vim tk-devel tkinter libtool xz
COPY build_scripts /build_scripts
RUN bash build_scripts/build.sh && rm -r build_scripts
RUN bash build_scripts/build.sh && \
bash build_scripts/install_nccl2.sh && rm -r build_scripts
ENV SSL_CERT_FILE=/opt/_internal/certs.pem
......@@ -34,9 +36,6 @@ RUN cd /opt && wget -q --no-check-certificate https://github.com/google/protobuf
tar xzf protobuf-cpp-3.1.0.tar.gz && \
cd protobuf-3.1.0 && ./configure && make -j4 && make install && cd .. && rm -f protobuf-cpp-3.1.0.tar.gz
RUN yum install -y sqlite-devel zlib-devel openssl-devel pcre-devel vim tk-devel tkinter libtool
RUN wget -O /root/requirements.txt https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/python/requirements.txt
RUN LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs4/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27mu/bin/pip install -r /root/requirements.txt && \
......@@ -47,10 +46,7 @@ RUN LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs4/lib:${LD_LIBRARY_PATH} /o
RUN LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs4/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27mu/bin/pip install pre-commit 'ipython==5.3.0' opencv-python && \
LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs2/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27m/bin/pip install pre-commit 'ipython==5.3.0' opencv-python
RUN wget -O /opt/swig-2.0.12.tar.gz https://sourceforge.net/projects/swig/files/swig/swig-2.0.12/swig-2.0.12.tar.gz/download && \
RUN wget -O /opt/swig-2.0.12.tar.gz https://cytranet.dl.sourceforge.net/project/swig/swig/swig-2.0.12/swig-2.0.12.tar.gz && \
cd /opt && tar xzf swig-2.0.12.tar.gz && cd /opt/swig-2.0.12 && ./configure && make && make install && cd /opt && rm swig-2.0.12.tar.gz
RUN mkdir -p /src && cd /src && git clone https://github.com/NVIDIA/nccl.git nccl && cd nccl &&\
make -j `nproc` install <NCCL_MAKE_OPTS> && cd .. && rm -rf nccl
CMD ["bash", "/paddle/paddle/scripts/docker/build.sh"]
#!/bin/bash
DEB="nccl-repo-ubuntu1604-2.1.4-ga-cuda8.0_1-1_amd64.deb"
DIR="/nccl2"
mkdir -p $DIR
# we cached the nccl2 deb package in BOS, so we can download it with wget
# install nccl2: http://docs.nvidia.com/deeplearning/sdk/nccl-install-guide/index.html#down
wget -O $DIR/$DEB \
"http://nccl2-deb.gz.bcebos.com/nccl-repo-ubuntu1604-2.1.4-ga-cuda8.0_1-1_amd64.deb?responseContentDisposition=attachment"
cd $DIR && ar x $DEB && tar xf data.tar.xz
DEBS=$(find ./var/ -name "*.deb")
for sub_deb in $DEBS; do
echo $sub_deb
ar x $sub_deb && tar xf data.tar.xz
done
mv -f usr/include/nccl.h /usr/local/include/
mv -f usr/lib/libnccl* /usr/local/lib/
rm -rf $DIR
......@@ -159,7 +159,7 @@ if args.timeline_path:
with open(profile_path, 'r') as f:
profile_s = f.read()
profile_pb = profiler_pb2.Profile()
text_format.Merge(profile_s, profile_pb)
profile_pb.ParseFromString(profile_s)
tl = Timeline(profile_pb)
with open(timeline_path, 'w') as f:
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册