diff --git a/.travis.yml b/.travis.yml index 9dd5f48164a3417940880f2b15bf7d9906453fb8..bf6a41d13c4eabc2d8543ab821ce0ff747a061df 100644 --- a/.travis.yml +++ b/.travis.yml @@ -56,7 +56,7 @@ script: export DEPLOY_DOCS_SH=https://raw.githubusercontent.com/PaddlePaddle/PaddlePaddle.org/master/scripts/deploy/deploy_docs.sh export DOCS_DIR=`pwd` cd .. - curl $DEPLOY_DOCS_SH | bash -s $CONTENT_DEC_PASSWD $TRAVIS_BRANCH $DOCS_DIR $DOCS_DIR/build/doc/v2 + curl $DEPLOY_DOCS_SH | bash -s $CONTENT_DEC_PASSWD $TRAVIS_BRANCH $DOCS_DIR $DOCS_DIR/build/doc/ notifications: email: on_success: change diff --git a/CMakeLists.txt b/CMakeLists.txt index 469af0f7859b9ea79d1fc4c53e19cc29bfe28ce8..c86889c05c8cf0d521dce9adbf3e918ba91729a1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -144,6 +144,8 @@ include(external/eigen) # download eigen3 include(external/pybind11) # download pybind11 include(external/cares) include(external/grpc) +include(external/snappy) # download snappy +include(external/snappystream) include(cudnn) # set cudnn libraries, must before configure include(cupti) diff --git a/benchmark/cluster/vgg16/vgg16_fluid.py b/benchmark/cluster/vgg16/vgg16_fluid.py index 53e71998f1517a8f2cfa4509426231fb0f0177e8..786f224608f7d41c438411de0e09fedbcf2264b8 100644 --- a/benchmark/cluster/vgg16/vgg16_fluid.py +++ b/benchmark/cluster/vgg16/vgg16_fluid.py @@ -1,11 +1,11 @@ # Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# +# # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at -# +# # http://www.apache.org/licenses/LICENSE-2.0 -# +# # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
@@ -138,13 +138,14 @@ def main(): avg_cost = fluid.layers.mean(x=cost) # Evaluator - accuracy = fluid.evaluator.Accuracy(input=predict, label=label) + batch_size = fluid.layers.create_tensor(dtype='int64') + batch_acc = fluid.layers.accuracy( + input=predict, label=label, total=batch_size) # inference program inference_program = fluid.default_main_program().clone() with fluid.program_guard(inference_program): - test_target = accuracy.metrics + accuracy.states - inference_program = fluid.io.get_inference_program(test_target) + inference_program = fluid.io.get_inference_program(batch_acc) # Optimization optimizer = fluid.optimizer.Adam(learning_rate=args.learning_rate) @@ -157,27 +158,30 @@ def main(): # test def test(exe): - accuracy.reset(exe) + test_pass_acc = fluid.average.WeightedAverage() for batch_id, data in enumerate(test_reader()): img_data = np.array(map(lambda x: x[0].reshape(data_shape), data)).astype("float32") y_data = np.array(map(lambda x: x[1], data)).astype("int64") y_data = y_data.reshape([-1, 1]) - exe.run(inference_program, - feed={"pixel": img_data, - "label": y_data}) + outs = exe.run(inference_program, + feed={"pixel": img_data, + "label": y_data}, + fetch_list=[batch_acc, batch_size]) + test_pass_acc.add(value=np.array(outs[0]), weight=np.array(outs[1])) - return accuracy.eval(exe) + return test_pass_acc.eval() def train_loop(exe, trainer_prog): iters = 0 ts = time.time() + train_pass_acc = fluid.average.WeightedAverage() for pass_id in range(args.num_passes): # train start_time = time.time() num_samples = 0 - accuracy.reset(exe) + train_pass_acc.reset() with profiler.profiler("CPU", 'total') as prof: for batch_id, data in enumerate(train_reader()): ts = time.time() @@ -187,13 +191,14 @@ def main(): y_data = np.array(map(lambda x: x[1], data)).astype("int64") y_data = y_data.reshape([-1, 1]) - loss, acc = exe.run( + loss, acc, b_size = exe.run( trainer_prog, feed={"pixel": img_data, "label": y_data}, - fetch_list=[avg_cost] + accuracy.metrics) + fetch_list=[avg_cost, batch_acc, batch_size]) iters += 1 num_samples += len(data) + train_pass_acc.add(value=acc, weight=b_size) print( "Pass = %d, Iters = %d, Loss = %f, Accuracy = %f, Speed = %.2f img/s" % (pass_id, iters, loss, acc, @@ -201,7 +206,7 @@ def main(): ) # The accuracy is the accumulation of batches, but not the current batch. 
pass_elapsed = time.time() - start_time
-            pass_train_acc = accuracy.eval(exe)
+            pass_train_acc = train_pass_acc.eval()
             pass_test_acc = test(exe)
             print(
                 "Pass = %d, Training performance = %f imgs/s, Train accuracy = %f, Test accuracy = %f\n"
diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake
index e2b7ef8d54748de46370e808e28f9b9fcc988a84..8af2765f58717408e3a1ef6b500bb01511bfd8d3 100644
--- a/cmake/external/openblas.cmake
+++ b/cmake/external/openblas.cmake
@@ -77,7 +77,8 @@ IF(NOT ${CBLAS_FOUND})
         INSTALL_DIR     ${CBLAS_INSTALL_DIR}
         BUILD_IN_SOURCE 1
         BUILD_COMMAND   ${CMAKE_MAKE_PROGRAM} ${COMMON_ARGS} ${OPTIONAL_ARGS}
-        INSTALL_COMMAND ${CMAKE_MAKE_PROGRAM} install NO_SHARED=1 NO_LAPACK=1 PREFIX=<INSTALL_DIR>
+        INSTALL_COMMAND ${CMAKE_MAKE_PROGRAM} install NO_SHARED=1 NO_LAPACK=1 PREFIX=<INSTALL_DIR>
+                        && rm -r ${CBLAS_INSTALL_DIR}/lib/cmake ${CBLAS_INSTALL_DIR}/lib/pkgconfig
         UPDATE_COMMAND  ""
         CONFIGURE_COMMAND ""
     )
@@ -100,11 +101,6 @@ IF(NOT ${CBLAS_FOUND})
         \"${CBLAS_INSTALL_DIR}/lib -> ${CMAKE_INSTALL_PREFIX}/${TMP_INSTALL_DIR}\"
         )"
     )
-    INSTALL(CODE "execute_process(
-        COMMAND rm -r ${CMAKE_INSTALL_PREFIX}/${TMP_INSTALL_DIR}/cmake
-                      ${CMAKE_INSTALL_PREFIX}/${TMP_INSTALL_DIR}/pkgconfig
-        )"
-    )
   ENDIF()
 ENDIF(NOT ${CBLAS_FOUND})
diff --git a/cmake/external/snappy.cmake b/cmake/external/snappy.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..71f54c425d4c38e271a8f1b78887d95a27252443
--- /dev/null
+++ b/cmake/external/snappy.cmake
@@ -0,0 +1,58 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+IF(MOBILE_INFERENCE)
+  return()
+ENDIF()
+
+include (ExternalProject)
+
+# NOTE: snappy is needed when linking with recordio
+
+SET(SNAPPY_SOURCES_DIR ${THIRD_PARTY_PATH}/snappy)
+SET(SNAPPY_INSTALL_DIR ${THIRD_PARTY_PATH}/install/snappy)
+SET(SNAPPY_INCLUDE_DIR "${SNAPPY_INSTALL_DIR}/include/" CACHE PATH "snappy include directory."
FORCE) + +ExternalProject_Add( + extern_snappy + GIT_REPOSITORY "https://github.com/google/snappy" + GIT_TAG "1.1.7" + PREFIX ${SNAPPY_SOURCES_DIR} + UPDATE_COMMAND "" + CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} + -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} + -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} + -DCMAKE_INSTALL_PREFIX=${SNAPPY_INSTALL_DIR} + -DCMAKE_INSTALL_LIBDIR=${SNAPPY_INSTALL_DIR}/lib + -DCMAKE_POSITION_INDEPENDENT_CODE=ON + -DBUILD_TESTING=OFF + -DSNAPPY_BUILD_TESTS:BOOL=OFF + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} + ${EXTERNAL_OPTIONAL_ARGS} + CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${SNAPPY_INSTALL_DIR} + -DCMAKE_INSTALL_LIBDIR:PATH=${SNAPPY_INSTALL_DIR}/lib + -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON + -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} + BUILD_COMMAND make -j8 + INSTALL_COMMAND make install +) + +add_library(snappy STATIC IMPORTED GLOBAL) +set_property(TARGET snappy PROPERTY IMPORTED_LOCATION + "${SNAPPY_INSTALL_DIR}/lib/libsnappy.a") + +include_directories(${SNAPPY_INCLUDE_DIR}) +add_dependencies(snappy extern_snappy) diff --git a/cmake/external/snappystream.cmake b/cmake/external/snappystream.cmake new file mode 100644 index 0000000000000000000000000000000000000000..5377a0b046a796cd6f0bb1fb466e1cd0b4b678bf --- /dev/null +++ b/cmake/external/snappystream.cmake @@ -0,0 +1,58 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +IF(MOBILE_INFERENCE) + return() +ENDIF() + +include (ExternalProject) + +# NOTE: snappy is needed when linking with recordio + +SET(SNAPPYSTREAM_SOURCES_DIR ${THIRD_PARTY_PATH}/snappy_stream) +SET(SNAPPYSTREAM_INSTALL_DIR ${THIRD_PARTY_PATH}/install/snappy_stream) +SET(SNAPPYSTREAM_INCLUDE_DIR "${SNAPPYSTREAM_INSTALL_DIR}/include/" CACHE PATH "snappy stream include directory." 
FORCE) + +ExternalProject_Add( + extern_snappystream + GIT_REPOSITORY "https://github.com/hoxnox/snappystream.git" + GIT_TAG "0.2.8" + PREFIX ${SNAPPYSTREAM_SOURCES_DIR} + UPDATE_COMMAND "" + CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} + -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} + -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} + -DCMAKE_INSTALL_PREFIX=${SNAPPY_INSTALL_DIR} + -DCMAKE_INSTALL_LIBDIR=${SNAPPY_INSTALL_DIR}/lib + -DCMAKE_POSITION_INDEPENDENT_CODE=ON + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} + -DSNAPPY_ROOT=${SNAPPY_INSTALL_DIR} + ${EXTERNAL_OPTIONAL_ARGS} + CMAKE_CACHE_ARGS + -DCMAKE_INSTALL_PREFIX:PATH=${SNAPPYSTREAM_INSTALL_DIR} + -DCMAKE_INSTALL_LIBDIR:PATH=${SNAPPYSTREAM_INSTALL_DIR}/lib + -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} + BUILD_COMMAND make -j8 + INSTALL_COMMAND make install + DEPENDS snappy +) + +add_library(snappystream STATIC IMPORTED GLOBAL) +set_property(TARGET snappystream PROPERTY IMPORTED_LOCATION + "${SNAPPYSTREAM_INSTALL_DIR}/lib/libsnappystream.a") + +include_directories(${SNAPPYSTREAM_INCLUDE_DIR}) +add_dependencies(snappystream extern_snappystream) diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 356da582d1f8b6a8858af90ccdf5af2100e5db87..d0b5eaec2e2a50acf17e5dd1d1aeb0ec3e614fbf 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -186,7 +186,9 @@ function(cc_library TARGET_NAME) add_library(${TARGET_NAME} SHARED ${cc_library_SRCS}) else() add_library(${TARGET_NAME} STATIC ${cc_library_SRCS}) + find_fluid_modules(${TARGET_NAME}) endif() + if(cc_library_DEPS) # Don't need link libwarpctc.so if("${cc_library_DEPS};" MATCHES "warpctc;") @@ -263,7 +265,8 @@ function(nv_library TARGET_NAME) if (nv_library_SHARED OR nv_library_shared) # build *.so cuda_add_library(${TARGET_NAME} SHARED ${nv_library_SRCS}) else() - cuda_add_library(${TARGET_NAME} STATIC ${nv_library_SRCS}) + cuda_add_library(${TARGET_NAME} STATIC ${nv_library_SRCS}) + find_fluid_modules(${TARGET_NAME}) endif() if (nv_library_DEPS) add_dependencies(${TARGET_NAME} ${nv_library_DEPS}) diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index 4471df36b0717171da4dff92ca0ec98b4f981028..6b2237b858380f384be0aa3c6ae24a4c83ad646d 100644 --- a/cmake/inference_lib.cmake +++ b/cmake/inference_lib.cmake @@ -1,9 +1,22 @@ +set_property(GLOBAL PROPERTY FLUID_MODULES "") +# find all fluid modules is used for paddle fluid static library +function(find_fluid_modules TARGET_NAME) + get_filename_component(__target_path ${TARGET_NAME} ABSOLUTE) + string(FIND "${__target_path}" "fluid" pos) + if(pos GREATER 1) + get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES) + set(fluid_modules ${fluid_modules} ${TARGET_NAME}) + set_property(GLOBAL PROPERTY FLUID_MODULES "${fluid_modules}") + endif() +endfunction(find_fluid_modules) + # make package for paddle fluid shared and static library function(copy TARGET) set(options "") set(oneValueArgs "") set(multiValueArgs SRCS DSTS DEPS) cmake_parse_arguments(copy_lib "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + set(inference_lib_dist_dep ${TARGET} ${inference_lib_dist_dep} PARENT_SCOPE) list(LENGTH copy_lib_SRCS copy_lib_SRCS_len) list(LENGTH copy_lib_DSTS copy_lib_DSTS_len) @@ -42,13 +55,21 @@ copy(glog_lib DSTS ${dst_dir} ${dst_dir}/lib ) -IF(NOT PROTOBUF_FOUND) +if(NOT PROTOBUF_FOUND) set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/protobuf") copy(protobuf_lib - SRCS ${PROTOBUF_INCLUDE_DIR} ${PROTOBUF_LITE_LIBRARY} + SRCS ${PROTOBUF_INCLUDE_DIR} 
${PROTOBUF_LIBRARY}
     DSTS ${dst_dir} ${dst_dir}/lib
   )
-ENDIF(NOT PROTOBUF_FOUND)
+endif()
+
+if(NOT CBLAS_FOUND)
+  set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/openblas")
+  copy(openblas_lib
+    SRCS ${CBLAS_INSTALL_DIR}/lib ${CBLAS_INSTALL_DIR}/include
+    DSTS ${dst_dir} ${dst_dir}
+  )
+endif()
 
 # paddle fluid module
 set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid")
@@ -66,8 +87,8 @@ copy(memory_lib
 )
 
 set(module "inference")
-copy(inference_lib DEPENDS paddle_fluid_shared
-  SRCS ${src_dir}/${module}/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_fluid.so
+copy(inference_lib DEPS paddle_fluid_shared paddle_fluid
+  SRCS ${src_dir}/${module}/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_fluid.*
   DSTS ${dst_dir}/${module} ${dst_dir}/${module}
 )
@@ -83,6 +104,4 @@ copy(string_lib
   DSTS ${dst_dir}/${module} ${dst_dir}/${module}/tinyformat
 )
 
-add_custom_target(inference_lib_dist DEPENDS
-  inference_lib framework_lib memory_lib platform_lib string_lib
-  gflags_lib glog_lib protobuf_lib eigen3_lib)
+add_custom_target(inference_lib_dist DEPENDS ${inference_lib_dist_dep})
diff --git a/doc/design/cpp_data_feeding.md b/doc/design/cpp_data_feeding.md
index 40205350f99722f0b71bfa6f390fe9d01d831966..22c2a925eb8c5e1dd8451e1d3cba261ce471ec51 100644
--- a/doc/design/cpp_data_feeding.md
+++ b/doc/design/cpp_data_feeding.md
@@ -20,9 +20,8 @@ class ReaderBase {
     PADDLE_ENFORCE(!shapes_.empty());
   }
   // Read the next batch of data. (A 'batch' can be only one instance)
+  // If the next batch doesn't exist, '*out' will be an empty std::vector.
   virtual void ReadNext(std::vector<LoDTensor>* out) = 0;
-  // Show whether the next bacth exists.
-  virtual bool HasNext() const = 0;
   // Reinitialize the reader and read the file from the beginning.
   virtual void ReInit() = 0;
diff --git a/doc/fluid/dev/use_eigen_cn.md b/doc/fluid/dev/use_eigen_cn.md
index 1367323b71277984834d9d4f0d9bea0f69478479..f36843b4408c21bdca1fa83853e5b0a40116791c 100644
--- a/doc/fluid/dev/use_eigen_cn.md
+++ b/doc/fluid/dev/use_eigen_cn.md
@@ -107,7 +107,7 @@ void Compute(const framework::ExecutionContext& context) const override {
 
 ### paddle::framework::Tensor到EigenTensor的转换
 
-如上一小节所示,在具体的计算中,我们需要先把输入Tensor和输出Tensor转换为Eigen支持的格式。我们在[eigen.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/eigen.h)中提供了一些全局函数用来实现paddle::framework::Tensor到EigenTensor/EigenMatrix/EigenVector/EigenScalar的转换。
+如上一小节所示,在具体的计算中,我们需要先把输入Tensor和输出Tensor转换为Eigen支持的格式。我们在[eigen.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/eigen.h)中提供了一些全局函数用来实现paddle::framework::Tensor到EigenTensor/EigenMatrix/EigenVector/EigenScalar的转换。
 
 以EigenTensor为例,做一个介绍
 
@@ -125,7 +125,7 @@ From是EigenTensor模板提供的一个接口,可以实现从paddle::framework
 
 在Eigen中,不同rank的Tensor是不同类型,Vector是rank为1的Tensor。需要额外注意的是,EigenVector::From方法是把paddle中的一维Tensor转为Eigen的一维Tensor,在这里用EigenVector来表示;而EigenVector::Flatten方法是把paddle中的一个Tensor进行reshape操作,压扁成为Eigen的一维Tensor,类型仍然为EigenVector。
 
-更多的转换方法请参考eigen_test.cc中的[单元测试](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/eigen_test.cc)。
+更多的转换方法请参考eigen_test.cc中的[单元测试](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/eigen_test.cc)。
diff --git a/doc/fluid/dev/use_eigen_en.md b/doc/fluid/dev/use_eigen_en.md
index e169106e12f5d62696f1f0e7163562793b32c18c..3a466f73d1f9b94a29b171015279c782ca50bd02 100644
--- a/doc/fluid/dev/use_eigen_en.md
+++ b/doc/fluid/dev/use_eigen_en.md
@@ -107,7 +107,7 @@ void Compute(const framework::ExecutionContext& context) const override {
 
 ### paddle::framework::Tensor到EigenTensor的转换
 
-As shown above, in actual computation, we need to transform the input and output `Tensor`s into formats Eigen supports. We show some functions in [eigen.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/eigen.h) to implement the transformation from `paddle::framework::Tensor`to `EigenTensor/EigenMatrix/EigenVector/EigenScalar`.
+As shown above, in actual computation, we need to transform the input and output `Tensor`s into formats Eigen supports. We show some functions in [eigen.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/eigen.h) to implement the transformation from `paddle::framework::Tensor` to `EigenTensor/EigenMatrix/EigenVector/EigenScalar`.
 
 Using EigenTensor as an example:
 
@@ -125,7 +125,7 @@ EigenTensor<float, 3>::Type et = EigenTensor<float, 3>::From(t);
 
 In Eigen, tensors with different ranks are different types, with `Vector` being a rank-1 instance. Note that `EigenVector::From` uses a transformation from a 1-dimensional Paddle tensor to a 1-dimensional Eigen tensor while `EigenVector::Flatten` reshapes a paddle tensor and flattens it into a 1-dimensional Eigen tensor. Both resulting tensors are still typed EigenVector.
 
-For more transformations, see the [unit tests](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/eigen_test.cc) in the `eigen_test.cc` file.
+For more transformations, see the [unit tests](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/eigen_test.cc) in the `eigen_test.cc` file.
diff --git a/doc/v2/howto/optimization/pprof_1.png b/doc/fluid/howto/optimization/pprof_1.png
similarity index 100%
rename from doc/v2/howto/optimization/pprof_1.png
rename to doc/fluid/howto/optimization/pprof_1.png
diff --git a/doc/v2/howto/optimization/pprof_2.png b/doc/fluid/howto/optimization/pprof_2.png
similarity index 100%
rename from doc/v2/howto/optimization/pprof_2.png
rename to doc/fluid/howto/optimization/pprof_2.png
diff --git a/doc/fluid/howto/optimization/timeline.jpeg b/doc/fluid/howto/optimization/timeline.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..38ec3f80c982857531f30a8bb0fa26ea5bf05385
Binary files /dev/null and b/doc/fluid/howto/optimization/timeline.jpeg differ
diff --git a/doc/fluid/howto/optimization/timeline.md b/doc/fluid/howto/optimization/timeline.md
new file mode 100644
index 0000000000000000000000000000000000000000..9d9565a3e698a83ca465c5da83ff892360c33b8f
--- /dev/null
+++ b/doc/fluid/howto/optimization/timeline.md
@@ -0,0 +1,27 @@
+## How to use the timeline tool to profile
+
+1. Add `with profiler.profiler(...)` to the main training loop. After the run, the code will generate a profile record file `/tmp/profile`. **Warning**: Please do not run too many batches when using the profiler to record timeline information, since the profile record grows with the number of batches.
+
+    ```python
+    with profiler.profiler('All', 'total', '/tmp/profile') as prof:
+        for pass_id in range(pass_num):
+            for batch_id, data in enumerate(train_reader()):
+                exe.run(fluid.default_main_program(),
+                        feed=feeder.feed(data),
+                        fetch_list=[],
+                        use_program_cache=True)
+                ...
+    ```
+
+1. Run `python paddle/tools/timeline.py` to process `/tmp/profile`; it will generate a
+file `/tmp/timeline` by default. You can change the output path with a command-line parameter; please take a look at
+[timeline.py](https://github.com/PaddlePaddle/Paddle/blob/develop/tools/timeline.py) for details.
+
+1. 
Open Chrome, visit `chrome://tracing/`, and use the `load` button to load the generated `timeline` file.
+
+    ![chrome tracing](./tracing.jpeg)
+
+1. The resulting timeline should look like this:
+
+    ![chrome timeline](./timeline.jpeg)
diff --git a/doc/fluid/howto/optimization/tracing.jpeg b/doc/fluid/howto/optimization/tracing.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..3a49fc4f8a401a9463b0157e2f38c164ca02dcc5
Binary files /dev/null and b/doc/fluid/howto/optimization/tracing.jpeg differ
diff --git a/doc/fluid/read_source.md b/doc/fluid/read_source.md
index edf46aff8c6cc9fc01d26c6453b3a8123238ef91..bb6d4563f5617fb98af055bca2f6f0479bdb4393 100644
--- a/doc/fluid/read_source.md
+++ b/doc/fluid/read_source.md
@@ -2,17 +2,17 @@
 
 Examples: https://github.com/PaddlePaddle/Paddle/tree/develop/python/paddle/fluid/tests/book
 
-Core: https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/framework
+Core: https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/framework
 
-Operator: https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators
+Operator: https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/operators
 
-Memory: https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/memory
+Memory: https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/memory
 
-Platform: https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/platform
+Platform: https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/platform
 
 # Compile Time
 
-The following **defines** the NN. The definition goes into this [protocol buffer](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/framework.proto).
+The following **defines** the NN. The definition goes into this [protocol buffer](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/framework.proto).
 
 ```python
 x = fluid.layers.data(name='x', shape=[13], dtype='float32')
@@ -29,10 +29,10 @@ sgd_optimizer.minimize(avg_cost)
 - Variables: `x`, `y`, `y_predict`, `cost` and `avg_cost`. [Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/framework.py#)
 - Layers: `fluid.layers.data`, `fluid.layers.fc` and `fluid.layers.mean` are layers. [Python](https://github.com/PaddlePaddle/Paddle/tree/develop/python/paddle/fluid/layers)
   - Every Layer has one or more operators and variables/parameters
-  - All the operators are defined at [`paddle/operators/`](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators). Other worth-looking files:
-    - Base class: [`paddle/framework/operator.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/operator.h)
-    - Operator Registration: [`paddle/framework/op_registry.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/op_registry.h)
-    - Operator Lookup: [`paddle/framework/op_info.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/op_info.h)
+  - All the operators are defined at [`paddle/fluid/operators/`](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/operators). 
Other worth-looking files: + - Base class: [`paddle/fluid/framework/operator.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/operator.h) + - Operator Registration: [`paddle/fluid/framework/op_registry.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/op_registry.h) + - Operator Lookup: [`paddle/fluid/framework/op_info.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/op_info.h) - Optimizer: `fluid.optimizer.SGD`. It does the following - Add backward operators. [[Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/backward.py)] - Add optimizer operators. [[Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/optimizer.py)] @@ -55,13 +55,13 @@ exe.run(fluid.default_main_program(), fetch_list=[avg_cost]) ``` -- Place: `place`. one of CPU, GPU or FPGA. [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/place.h) - - The device handle are at [paddle/platform/device_context.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/platform/device_context.h) -- Executor: `fluid.Executor(place)`. [[Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/executor.py), [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/executor.cc)] +- Place: `place`. one of CPU, GPU or FPGA. [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/platform/place.h) + - The device handle are at [paddle/fluid/platform/device_context.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/platform/device_context.h) +- Executor: `fluid.Executor(place)`. [[Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/executor.py), [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/executor.cc)] - Feeds the data: `feed=feeder.feed(data)` - Evaluates all the operators - Fetches the result: `fetch_list=[avg_cost]` - Other worth looking files: - - Scope: [paddle/framework/scope.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/scope.h). Where all the variables live - - Variable: [paddle/framework/variable.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/variable.h). Where all the data (most likely tensors) live - - Tensor: [paddle/framework/tensor.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/tensor.h). Where we allocate memory through [`paddle/memory/`](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/memory) + - Scope: [paddle/fluid/framework/scope.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/scope.h). Where all the variables live + - Variable: [paddle/fluid/framework/variable.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/variable.h). Where all the data (most likely tensors) live + - Tensor: [paddle/fluid/framework/tensor.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/framework/tensor.h). 
Where we allocate memory through [`paddle/fluid/memory/`](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/memory) diff --git a/doc/v2/build_and_install/pip_install_cn.rst b/doc/v2/build_and_install/pip_install_cn.rst index 8e4165da6b8135d083766c650f1092158f9d01c2..b3d882743785e8ee301b71b696230531d2b7ba58 100644 --- a/doc/v2/build_and_install/pip_install_cn.rst +++ b/doc/v2/build_and_install/pip_install_cn.rst @@ -34,15 +34,15 @@ PaddlePaddle可以使用常用的Python包管理工具 :align: center .. csv-table:: 各个版本最新的whl包 - :header: "版本说明", "cp27-cp27mu", "cp27-cp27m", "C-API" - :widths: 1, 3, 3, 3 - - "cpu_avx_mkl", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_" - "cpu_avx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl `_", "暂无" - "cpu_noavx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl `_", "暂无" - "cuda7.5_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_" - "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_" - "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_" + :header: "版本说明", "cp27-cp27mu", "cp27-cp27m" + :widths: 1, 3, 3 + + "cpu_avx_mkl", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl `_" + "cpu_avx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl `_" + "cpu_noavx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl `_" + "cuda7.5_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl `_" + "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl `_" + "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl `_" .. _pip_dependency: diff --git a/doc/v2/build_and_install/pip_install_en.rst b/doc/v2/build_and_install/pip_install_en.rst index 0d4c925b6e2731bdfd76582309aa7e8adbafa6ae..1e409d86b9775094998f72f92954f4bbc1013ea1 100644 --- a/doc/v2/build_and_install/pip_install_en.rst +++ b/doc/v2/build_and_install/pip_install_en.rst @@ -37,15 +37,15 @@ If the links below shows up the login form, just click "Log in as guest" to star :align: center .. 
csv-table:: whl package of each version - :header: "version", "cp27-cp27mu", "cp27-cp27m", "C-API" - :widths: 1, 3, 3, 3 - - "cpu_avx_mkl", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_" - "cpu_avx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl `_", "Not Available" - "cpu_noavx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl `_", "Not Available" - "cuda7.5_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_" - "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_" - "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl `_", "`paddle.tgz `_" + :header: "version", "cp27-cp27mu", "cp27-cp27m" + :widths: 1, 3, 3 + + "cpu_avx_mkl", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl `_" + "cpu_avx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl `_" + "cpu_noavx_openblas", "`paddlepaddle-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle-0.11.0-cp27-cp27m-linux_x86_64.whl `_" + "cuda7.5_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl `_" + "cuda8.0_cudnn5_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl `_" + "cuda8.0_cudnn7_avx_mkl", "`paddlepaddle_gpu-0.11.0-cp27-cp27mu-linux_x86_64.whl `_", "`paddlepaddle_gpu-0.11.0-cp27-cp27m-linux_x86_64.whl `_" .. _pip_dependency: diff --git a/doc/v2/getstarted/index_cn.rst b/doc/v2/getstarted/index_cn.rst index 1dc141396b95bda776aeff87ac30fad6baf37bd2..75af7354be93a6eeabfa9ccf86903505402a7ca6 100644 --- a/doc/v2/getstarted/index_cn.rst +++ b/doc/v2/getstarted/index_cn.rst @@ -1,8 +1,19 @@ 新手入门 ============ + +如果需要快速了解PaddlePaddle的使用,可以参考以下指南。 + .. toctree:: :maxdepth: 1 quickstart_cn.rst + + +在使用PaddlePaddle构建应用时,需要了解一些基本概念。 +这里以一个线性回归为例子,详细介绍了PaddlePaddle的使用流程,包括数据格式,模型配置与训练等。 + +.. toctree:: + :maxdepth: 1 + concepts/use_concepts_cn.rst diff --git a/doc/v2/howto/capi/compile_paddle_lib_cn.md b/doc/v2/howto/capi/compile_paddle_lib_cn.md index fd8dec8164580b9dcb716e69f3cc5357639f17d3..e223fd33a8420abcdfdad53d1cfc5ed160a1b37e 100644 --- a/doc/v2/howto/capi/compile_paddle_lib_cn.md +++ b/doc/v2/howto/capi/compile_paddle_lib_cn.md @@ -1,22 +1,80 @@ -## 安装与编译C-API预测库 - -### 概述 - -使用 C-API 进行预测依赖于将 PaddlePaddle 核心代码编译成链接库,只需在编译时需配制下面这些编译选项: - -必须配置选项: -- `WITH_C_API`,必须配置为`ON`。 - -推荐配置选项: -- `WITH_PYTHON`,推荐配置为`OFF` -- `WITH_SWIG_PY`,推荐配置为`OFF` -- `WITH_GOLANG`,推荐设置为`OFF` - -可选配置选项: -- `WITH_GPU`,可配置为`ON/OFF` -- `WITH_MKL`,可配置为`ON/OFF` - -对推荐配置中的选项建议按照设置,以避免链接不必要的库。其它可选编译选项按需进行设定。 +## 安装、编译与链接C-API预测库 + +### 直接下载安装 + +从CI系统中下载最新的C-API开发包进行安装,用户可以从下面的表格中找到需要的版本: + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+<tr><th>版本说明</th><th>C-API</th></tr>
+<tr><td>cpu_avx_mkl</td><td>paddle.tgz</td></tr>
+<tr><td>cpu_avx_openblas</td><td>暂无</td></tr>
+<tr><td>cpu_noavx_openblas</td><td>paddle.tgz</td></tr>
+<tr><td>cuda7.5_cudnn5_avx_mkl</td><td>paddle.tgz</td></tr>
+<tr><td>cuda8.0_cudnn5_avx_mkl</td><td>paddle.tgz</td></tr>
+<tr><td>cuda8.0_cudnn7_avx_mkl</td><td>paddle.tgz</td></tr>
+ +### 从源码编译 + +用户也可以从 PaddlePaddle 核心代码编译C-API链接库,只需在编译时配制下面这些编译选项: + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+<tr><th>选项</th><th>值</th></tr>
+<tr><td>WITH_C_API</td><td>ON</td></tr>
+<tr><td>WITH_PYTHON</td><td>OFF(推荐)</td></tr>
+<tr><td>WITH_SWIG_PY</td><td>OFF(推荐)</td></tr>
+<tr><td>WITH_GOLANG</td><td>OFF(推荐)</td></tr>
+<tr><td>WITH_GPU</td><td>ON/OFF</td></tr>
+<tr><td>WITH_MKL</td><td>ON/OFF</td></tr>
+ +建议按照推荐值设置,以避免链接不必要的库。其它可选编译选项按需进行设定。 下面的代码片段从github拉取最新代码,配制编译选项(需要将PADDLE_ROOT替换为PaddlePaddle预测库的安装路径): @@ -100,23 +158,19 @@ cmake -DCMAKE_INSTALL_PREFIX=$PADDLE_ROOT \ 目前提供三种链接方式: -1. 链接`libpaddle_capi_shared.so` 动态库 - - 使用 PaddlePaddle C-API 开发预测程序链接`libpaddle_capi_shared.so`时,需注意: - 1. 如果编译时指定编译CPU版本,且使用`OpenBLAS`数学库,在使用C-API开发预测程序时,只需要链接`libpaddle_capi_shared.so`这一个库。 - 1. 如果是用编译时指定CPU版本,且使用`MKL`数学库,由于`MKL`库有自己独立的动态库文件,在使用PaddlePaddle C-API开发预测程序时,需要自己链接MKL链接库。 - 1. 如果编译时指定编译GPU版本,CUDA相关库会在预测程序运行时动态装载,需要将CUDA相关的库设置到`LD_LIBRARY_PATH`环境变量中。 - - 这种方式最为简便,链接相对容易,**在无特殊需求情况下,推荐使用此方式**。 - -2. 链接静态库 `libpaddle_capi_whole.a` - - 使用PaddlePaddle C-API 开发预测程序链接`libpaddle_capi_whole.a`时,需注意: - 1. 需要指定`-Wl,--whole-archive`链接选项。 - 1. 需要显式地链接 `gflags`、`glog`、`libz`、`protobuf` 等第三方库,可在`PADDLE_ROOT/third_party`下找到。 - 1. 如果在编译 C-API 时使用OpenBLAS数学库,需要显示地链接`libopenblas.a`。 - 1. 如果在编译 C-API 是使用MKL数学库,需要显示地链接MKL的动态库。 - -3. 链接静态库 `libpaddle_capi_layers.a`和`libpaddle_capi_engine.a` - - 使用PaddlePaddle C-API 开发预测程序链接`libpaddle_capi_whole.a`时,需注意: - 1. 这种链接方式主要用于移动端预测。 - 1. 为了减少生成链接库的大小把`libpaddle_capi_whole.a`拆成以上两个静态链接库。 - 1. 需指定`-Wl,--whole-archive -lpaddle_capi_layers` 和 `-Wl,--no-whole-archive -lpaddle_capi_engine` 进行链接。 - 1. 第三方依赖库需要按照与方式2同样方法显示地进行链接。 +1. 链接`libpaddle_capi_shared.so` 动态库(这种方式最为简便,链接相对容易,**在无特殊需求情况下,推荐使用此方式**),需注意: + 1. 如果编译时指定编译CPU版本,且使用`OpenBLAS`数学库,在使用C-API开发预测程序时,只需要链接`libpaddle_capi_shared.so`这一个库。 + 1. 如果是用编译时指定CPU版本,且使用`MKL`数学库,由于`MKL`库有自己独立的动态库文件,在使用PaddlePaddle C-API开发预测程序时,需要自己链接MKL链接库。 + 1. 如果编译时指定编译GPU版本,CUDA相关库会在预测程序运行时动态装载,需要将CUDA相关的库设置到`LD_LIBRARY_PATH`环境变量中。 + +2. 链接静态库 `libpaddle_capi_whole.a`,需注意: + 1. 需要指定`-Wl,--whole-archive`链接选项。 + 1. 需要显式地链接 `gflags`、`glog`、`libz`、`protobuf` 等第三方库,可在`PADDLE_ROOT/third_party`下找到。 + 1. 如果在编译 C-API 时使用OpenBLAS数学库,需要显示地链接`libopenblas.a`。 + 1. 如果在编译 C-API 是使用MKL数学库,需要显示地链接MKL的动态库。 + +3. 链接静态库 `libpaddle_capi_layers.a`和`libpaddle_capi_engine.a`,需注意: + 1. 这种链接方式主要用于移动端预测。 + 1. 为了减少生成链接库的大小把`libpaddle_capi_whole.a`拆成以上两个静态链接库。 + 1. 需指定`-Wl,--whole-archive -lpaddle_capi_layers` 和 `-Wl,--no-whole-archive -lpaddle_capi_engine` 进行链接。 + 1. 第三方依赖库需要按照与方式2同样方法显示地进行链接。 diff --git a/doc/v2/howto/index_cn.rst b/doc/v2/howto/index_cn.rst index 0c534f107b6e047035c424ed2ea59f3982799b63..b0268907bceb11cd53a4630c3f8b8e0424abe247 100644 --- a/doc/v2/howto/index_cn.rst +++ b/doc/v2/howto/index_cn.rst @@ -1,11 +1,37 @@ 进阶使用 ======== +PaddlePaddle支持用户灵活地设置各种命令行参数,以实现对模型训练或预测流程的控制。使用方式请参考: + .. toctree:: :maxdepth: 1 cmd_parameter/index_cn.rst + +PaddlePaddle支持在fabric集群、MPI集群、kubernetes集群上分布式训练任务,具体环境配置和使用说明请参考: + +.. toctree:: + :maxdepth: 1 + cluster/index_cn.rst + +PaddlePaddle提供了用于预测的C-API,关于C-API的使用,我们提供了如下指南: + +.. toctree:: + :maxdepth: 1 + capi/index_cn.rst + +PaddlePaddle支持多种灵活和高效的循环神经网络,具体配置使用方式请参考: + +.. toctree:: + :maxdepth: 1 + rnn/index_cn.rst + +关于如何使用内置的定时工具、nvprof 或 nvvp 来运行性能分析和调优,请参考: + +.. toctree:: + :maxdepth: 1 + optimization/gpu_profiling_cn.rst diff --git a/doc/v2/howto/rnn/hierarchical_layer_cn.rst b/doc/v2/howto/rnn/hierarchical_layer_cn.rst index e05173c2006ff47ecb6ca5a4fe1502de750acc59..2f8f408b40299890da694862a7b9418cf9ff07f2 100644 --- a/doc/v2/howto/rnn/hierarchical_layer_cn.rst +++ b/doc/v2/howto/rnn/hierarchical_layer_cn.rst @@ -22,7 +22,7 @@ pooling ======== -pooling 的使用示例如下,详细见 :ref:`api_v2.layer_pooling` 配置API。 +pooling 的使用示例如下。 .. 
code-block:: bash @@ -47,7 +47,7 @@ pooling 的使用示例如下,详细见 :ref:`api_v2.layer_pooling` 配置API last_seq 和 first_seq ===================== -last_seq 的使用示例如下( :ref:`api_v2.layer_first_seq` 类似),详细见 :ref:`api_v2.layer_last_seq` 配置API。 +last_seq 的使用示例如下(first_seq 类似)。 .. code-block:: bash @@ -68,7 +68,7 @@ last_seq 的使用示例如下( :ref:`api_v2.layer_first_seq` 类似),详 expand ====== -expand 的使用示例如下,详细见 :ref:`api_v2.layer_expand` 配置API。 +expand 的使用示例如下。 .. code-block:: bash diff --git a/doc/v2/howto/rnn/hrnn_rnn_api_compare_cn.rst b/doc/v2/howto/rnn/hrnn_rnn_api_compare_cn.rst index efdc44455ea4dc81a87b4d4fc8a81e78b15cb06a..b05b66415fbb829f471b1491b9881f65137bfe17 100644 --- a/doc/v2/howto/rnn/hrnn_rnn_api_compare_cn.rst +++ b/doc/v2/howto/rnn/hrnn_rnn_api_compare_cn.rst @@ -4,7 +4,7 @@ 单双层RNN API对比介绍 ##################### -本文以PaddlePaddle的双层RNN单元测试为示例,用多对效果完全相同的、分别使用单双层RNN作为网络配置的模型,来讲解如何使用双层RNN。本文中所有的例子,都只是介绍双层RNN的API接口,并不是使用双层RNN解决实际的问题。如果想要了解双层RNN在具体问题中的使用,请参考\ :ref:`algo_hrnn_demo`\ 。本文中示例所使用的单元测试文件是\ `test_RecurrentGradientMachine.cpp `_\ 。 +本文以PaddlePaddle的双层RNN单元测试为示例,用多对效果完全相同的、分别使用单双层RNN作为网络配置的模型,来讲解如何使用双层RNN。本文中所有的例子,都只是介绍双层RNN的API接口,并不是使用双层RNN解决实际的问题。如果想要了解双层RNN在具体问题中的使用,请参考\ :ref:`algo_hrnn_demo`\ 。本文中示例所使用的单元测试文件是\ `test_RecurrentGradientMachine.cpp `_\ 。 示例1:双层RNN,子序列间无Memory ================================ @@ -166,11 +166,6 @@ 在上面代码中,单层和双层序列的使用和示例2中的示例类似,区别是同时处理了两个输入。而对于双层序列,两个输入的子序列长度也并不相同。但是,我们使用了\ :code:`targetInlink`\ 参数设置了外层\ :code:`recurrent_group`\ 的输出格式。所以外层输出的序列形状,和\ :code:`emb2`\ 的序列形状一致。 -示例4:beam_search的生成 -======================== - -TBD - 词汇表 ====== diff --git a/doc/v2/howto/rnn/index_cn.rst b/doc/v2/howto/rnn/index_cn.rst index bcc8c2f46eb662ec3650e829a77992224dbbb8e7..2032fb9e296ab024c68da1348064580c8c88d5be 100644 --- a/doc/v2/howto/rnn/index_cn.rst +++ b/doc/v2/howto/rnn/index_cn.rst @@ -1,10 +1,34 @@ RNN模型 =========== +循环神经网络(RNN)是对序列数据建模的重要工具。PaddlePaddle提供了灵活的接口以支持复杂循环神经网络的构建。 +这里将分为以下四个部分详细介绍如何使用PaddlePaddle搭建循环神经网络。 + +第一部分由浅入深的展示了使用PaddlePaddle搭建循环神经网络的全貌:首先以简单的循环神经网络(vanilla RNN)为例, +说明如何封装配置循环神经网络组件;然后更进一步的通过序列到序列(sequence to sequence)模型,逐步讲解如何构建完整而复杂的循环神经网络模型。 .. toctree:: :maxdepth: 1 rnn_config_cn.rst + +Recurrent Group是PaddlePaddle中实现复杂循环神经网络的关键,第二部分阐述了PaddlePaddle中Recurrent Group的相关概念和原理, +对Recurrent Group接口进行了详细说明。另外,对双层RNN(对应的输入为双层序列)及Recurrent Group在其中的使用进行了介绍。 + +.. toctree:: + :maxdepth: 1 + recurrent_group_cn.md + +第三部分对双层序列进行了解释说明,列出了PaddlePaddle中支持双层序列作为输入的Layer,并对其使用进行了逐一介绍。 + +.. toctree:: + :maxdepth: 1 + hierarchical_layer_cn.rst + +第四部分以PaddlePaddle的双层RNN单元测试中的网络配置为示例,辅以效果相同的单层RNN网络配置作为对比,讲解了多种情况下双层RNN的使用。 + +.. 
toctree:: + :maxdepth: 1 + hrnn_rnn_api_compare_cn.rst diff --git a/paddle/fluid/CMakeLists.txt b/paddle/fluid/CMakeLists.txt index 7405ef17d3e01e4837412c96385e13a9e605a75b..d725763b01d5953985f8e090605f68a8419b5498 100644 --- a/paddle/fluid/CMakeLists.txt +++ b/paddle/fluid/CMakeLists.txt @@ -5,3 +5,4 @@ add_subdirectory(operators) add_subdirectory(pybind) add_subdirectory(inference) add_subdirectory(string) +add_subdirectory(recordio) diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 82c7d4a2ec648449fc65ca2ae0de397b2f6fa120..48713f2c2ac62a37b7b7a4602f7f6a325aecb0b8 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -5,14 +5,14 @@ cc_library(ddim SRCS ddim.cc DEPS eigen3 boost) cc_test(ddim_test SRCS ddim_test.cc DEPS ddim) nv_test(dim_test SRCS dim_test.cu DEPS ddim) -if (WITH_GPU) +if(WITH_GPU) nv_library(tensor SRCS tensor.cc tensor_util.cu DEPS ddim place paddle_memory device_context framework_proto) else() cc_library(tensor SRCS tensor.cc tensor_util.cc DEPS ddim place paddle_memory device_context framework_proto) -endif () +endif() cc_test(tensor_test SRCS tensor_test.cc DEPS tensor) -if (WITH_GPU) +if(WITH_GPU) nv_test(tensor_util_test SRCS tensor_util_test.cc tensor_util_test.cu DEPS tensor) else() cc_test(tensor_util_test SRCS tensor_util_test.cc DEPS tensor) @@ -39,8 +39,13 @@ cc_library(data_device_transform SRCS data_device_transform.cc DEPS tensor) nv_test(data_device_transform_test SRCS data_device_transform_test.cu DEPS operator op_registry init math_function) -cc_library(data_type_transform SRCS data_type_transform.cc DEPS tensor) -cc_test(data_type_transform_test SRCS data_type_transform_test.cc DEPS data_type_transform) +if(WITH_GPU) + nv_library(data_type_transform SRCS data_type_transform.cu DEPS tensor) + nv_test(data_type_transform_test SRCS data_type_transform_test.cc data_type_transform_test.cu DEPS data_type_transform) +else() + cc_library(data_type_transform SRCS data_type_transform.cc DEPS tensor) + cc_test(data_type_transform_test SRCS data_type_transform_test.cc DEPS data_type_transform) +endif() cc_library(data_layout_transform SRCS data_layout_transform.cc DEPS tensor math_function) cc_test(data_layout_transform_test SRCS data_layout_transform_test.cc DEPS data_layout_transform) diff --git a/paddle/fluid/framework/channel.h b/paddle/fluid/framework/channel.h index bda1bfb23b18f8c6b9f1c3eded461a9322a154af..9f8fb12098d622058a86f83c1c42a1feb1cfb2e2 100644 --- a/paddle/fluid/framework/channel.h +++ b/paddle/fluid/framework/channel.h @@ -28,24 +28,19 @@ class Channel { virtual bool Send(T*) = 0; virtual bool Receive(T*) = 0; virtual size_t Cap() = 0; + virtual void Lock() = 0; + virtual void Unlock() = 0; virtual void Close() = 0; virtual ~Channel() {} }; // Forward declaration of channel implementations. 
-namespace details {
 template <typename T>
-class Buffered;
-template <typename T>
-class UnBuffered;
-}  // namespace details
+class ChannelImpl;
 
 template <typename T>
 Channel<T>* MakeChannel(size_t buffer_size) {
-  if (buffer_size > 0) {
-    return new details::Buffered<T>(buffer_size);
-  }
-  return new details::UnBuffered<T>();
+  return new ChannelImpl<T>(buffer_size);
 }
 
 template <typename T>
@@ -89,6 +84,19 @@ class ChannelHolder {
     if (IsInitialized()) holder_->Close();
   }
 
+  size_t Cap() {
+    if (IsInitialized()) return holder_->Cap();
+    return -1;
+  }
+
+  void Lock() {
+    if (IsInitialized()) holder_->Lock();
+  }
+
+  void Unlock() {
+    if (IsInitialized()) holder_->Unlock();
+  }
+
   inline bool IsInitialized() const { return holder_ != nullptr; }
 
   inline const std::type_index Type() {
@@ -106,6 +114,9 @@ class ChannelHolder {
     virtual const std::type_index Type() const = 0;
     virtual void* Ptr() const = 0;
     virtual void Close() = 0;
+    virtual void Lock() = 0;
+    virtual void Unlock() = 0;
+    virtual size_t Cap() = 0;
   };
 
   template <typename T>
@@ -115,11 +126,28 @@ class ChannelHolder {
     }
 
     virtual const std::type_index Type() const { return type_; }
+
     virtual void* Ptr() const { return static_cast<void*>(channel_.get()); }
+
     virtual void Close() {
       if (channel_) channel_->Close();
     }
 
+    virtual size_t Cap() {
+      if (channel_)
+        return channel_->Cap();
+      else
+        return -1;
+    }
+
+    virtual void Lock() {
+      if (channel_) channel_->Lock();
+    }
+
+    virtual void Unlock() {
+      if (channel_) channel_->Unlock();
+    }
+
     std::unique_ptr<Channel<T>> channel_;
     const std::type_index type_;
   };
@@ -131,5 +159,4 @@ class ChannelHolder {
 }  // namespace framework
 }  // namespace paddle
 
-#include "paddle/fluid/framework/details/buffered_channel.h"
-#include "paddle/fluid/framework/details/unbuffered_channel.h"
+#include "paddle/fluid/framework/channel_impl.h"
diff --git a/paddle/fluid/framework/channel_impl.h b/paddle/fluid/framework/channel_impl.h
new file mode 100644
index 0000000000000000000000000000000000000000..a4561031fd8c49613269e7008ce558f25f9765e4
--- /dev/null
+++ b/paddle/fluid/framework/channel_impl.h
@@ -0,0 +1,229 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
*/
+
+#pragma once
+#include <stddef.h>  // for size_t
+#include <atomic>
+#include <condition_variable>
+#include <deque>
+#include "paddle/fluid/framework/channel.h"
+#include "paddle/fluid/platform/enforce.h"
+
+namespace paddle {
+namespace framework {
+
+template <typename T>
+class ChannelImpl : public paddle::framework::Channel<T> {
+  friend Channel<T> *paddle::framework::MakeChannel<T>(size_t);
+  friend void paddle::framework::CloseChannel<T>(Channel<T> *);
+
+ public:
+  virtual bool Send(T *);
+  virtual bool Receive(T *);
+  virtual size_t Cap() { return cap_; }
+  virtual void Lock();
+  virtual void Unlock();
+  virtual void Close();
+
+  ChannelImpl(size_t);
+  virtual ~ChannelImpl();
+
+ private:
+  struct QueueMessage {
+    T *data;
+    std::condition_variable_any cond;
+    bool chan_closed = false;
+    bool completed = false;
+
+    QueueMessage(T *item) : data(item) {}
+
+    void Wait(std::unique_lock<std::recursive_mutex> &lock) {
+      cond.wait(lock, [this]() { return completed; });
+    }
+
+    void Notify() {
+      completed = true;
+      cond.notify_all();
+    }
+  };
+
+  bool send_return(bool value) {
+    send_ctr--;
+    destructor_cond_.notify_all();
+    return value;
+  }
+
+  bool recv_return(bool value) {
+    recv_ctr--;
+    destructor_cond_.notify_all();
+    return value;
+  }
+
+  size_t cap_;
+  std::recursive_mutex mu_;
+  bool closed_;
+  std::deque<T> buf_;
+  std::deque<std::shared_ptr<QueueMessage>> recvq;
+  std::deque<std::shared_ptr<QueueMessage>> sendq;
+  std::atomic<unsigned> send_ctr{0};
+  std::atomic<unsigned> recv_ctr{0};
+  std::condition_variable_any destructor_cond_;
+};
+
+template <typename T>
+ChannelImpl<T>::ChannelImpl(size_t capacity)
+    : cap_(capacity), closed_(false), send_ctr(0), recv_ctr(0) {
+  PADDLE_ENFORCE_GE(capacity, 0);
+}
+
+template <typename T>
+bool ChannelImpl<T>::Send(T *item) {
+  send_ctr++;
+  std::unique_lock<std::recursive_mutex> lock{mu_};
+
+  // If channel is closed, do nothing
+  if (closed_) {
+    lock.unlock();
+    // TODO(abhinavarora) Should panic on closed channel
+    return send_return(false);
+  }
+
+  // If there is a receiver, directly pass the value we want
+  // to send to the receiver, bypassing the channel buffer if any
+  if (!recvq.empty()) {
+    std::shared_ptr<QueueMessage> m = recvq.front();
+    recvq.pop_front();
+    // Do the data transfer
+    *(m->data) = std::move(*item);
+    // Wake up the blocked process and unlock
+    m->Notify();
+    lock.unlock();
+    return send_return(true);
+  }
+
+  // Unbuffered channel will always bypass this
+  // If buffered channel has space in buffer,
+  // write the element to the buffer.
+  if (buf_.size() < cap_) {
+    // Copy to buffer
+    buf_.push_back(std::move(*item));
+    // Release lock and return true
+    lock.unlock();
+    return send_return(true);
+  }
+
+  // Block on channel, because some receiver will complete
+  // the operation for us
+  auto m = std::make_shared<QueueMessage>(item);
+  sendq.push_back(m);
+  m->Wait(lock);
+  // TODO(abhinavarora) Should panic on closed channel
+  return send_return(!m->chan_closed);
+}
+
+template <typename T>
+bool ChannelImpl<T>::Receive(T *item) {
+  recv_ctr++;
+  std::unique_lock<std::recursive_mutex> lock{mu_};
+
+  // If channel is closed and buffer is empty or
+  // channel is unbuffered
+  if (closed_ && buf_.empty()) {
+    lock.unlock();
+    return recv_return(false);
+  }
+
+  // If there is a sender, directly receive the value we want
+  // from the sender, bypassing the channel buffer if any
+  if (!sendq.empty()) {
+    std::shared_ptr<QueueMessage> m = sendq.front();
+    sendq.pop_front();
+    // Do the data transfer
+    *item = std::move(*(m->data));
+    // Wake up the blocked process and unlock
+    m->Notify();
+    lock.unlock();
+    return recv_return(true);
+  }
+
+  // If this is a buffered channel and there are items in buffer
+  if (buf_.size() > 0) {
+    // Directly read from buffer
+    *item = std::move(buf_.front());
+    buf_.pop_front();
+    // Release lock and return true
+    lock.unlock();
+    return recv_return(true);
+  }
+
+  // No sender available, block on this channel
+  // Some sender will complete the operation for us
+  auto m = std::make_shared<QueueMessage>(item);
+  recvq.push_back(m);
+  m->Wait(lock);
+
+  return recv_return(!m->chan_closed);
+}
+
+template <typename T>
+void ChannelImpl<T>::Lock() {
+  mu_.lock();
+}
+
+template <typename T>
+void ChannelImpl<T>::Unlock() {
+  mu_.unlock();
+}
+
+template <typename T>
+void ChannelImpl<T>::Close() {
+  std::unique_lock<std::recursive_mutex> lock{mu_};
+
+  if (closed_) {
+    // TODO(abhinavarora): closing an already closed channel should panic
+    lock.unlock();
+    return;
+  }
+
+  closed_ = true;
+
+  // Empty the readers
+  while (!recvq.empty()) {
+    std::shared_ptr<QueueMessage> m = recvq.front();
+    recvq.pop_front();
+    m->chan_closed = true;
+    m->Notify();
+  }
+
+  // Empty the senders
+  while (!sendq.empty()) {
+    std::shared_ptr<QueueMessage> m = sendq.front();
+    sendq.pop_front();
+    m->chan_closed = true;
+    m->Notify();
+  }
+}
+
+template <typename T>
+ChannelImpl<T>::~ChannelImpl() {
+  Close();
+  // The destructor must wait for all readers and writers to complete their
+  // task. The channel has been closed, so we will not accept new readers and
+  // writers.
+  std::unique_lock<std::recursive_mutex> lock{mu_};
+  destructor_cond_.wait(lock,
+                        [this]() { return send_ctr == 0 && recv_ctr == 0; });
+}
+
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/channel_test.cc b/paddle/fluid/framework/channel_test.cc
index 695169fcb9e93b5e69d3d4ae6f63f8e4c2d1605f..edfb41c72489113d9803c2957baed1ce44f8296d 100644
--- a/paddle/fluid/framework/channel_test.cc
+++ b/paddle/fluid/framework/channel_test.cc
@@ -23,8 +23,19 @@ using paddle::framework::Channel;
 using paddle::framework::ChannelHolder;
 using paddle::framework::MakeChannel;
 using paddle::framework::CloseChannel;
-using paddle::framework::details::Buffered;
-using paddle::framework::details::UnBuffered;
+
+TEST(Channel, ChannelCapacityTest) {
+  const size_t buffer_size = 10;
+  auto ch = MakeChannel<size_t>(buffer_size);
+  EXPECT_EQ(ch->Cap(), buffer_size);
+  CloseChannel(ch);
+  delete ch;
+
+  ch = MakeChannel<size_t>(0);
+  EXPECT_EQ(ch->Cap(), 0U);
+  CloseChannel(ch);
+  delete ch;
+}
 
 void RecevingOrderEqualToSendingOrder(Channel<int> *ch) {
   unsigned sum_send = 0;
   std::thread t([&]() {
    for (int i = 0; i < 5; i++) {
      EXPECT_EQ(ch->Send(&i), true);
      sum_send += i;
    }
  });
  for (int i = 0; i < 5; i++) {
- int recv; + int recv = 999; EXPECT_EQ(ch->Receive(&recv), true); EXPECT_EQ(recv, i); } - + std::this_thread::sleep_for(std::chrono::milliseconds(200)); CloseChannel(ch); t.join(); EXPECT_EQ(sum_send, 10U); delete ch; } -TEST(Channel, MakeAndClose) { - using paddle::framework::details::Buffered; - using paddle::framework::details::UnBuffered; - { - // MakeChannel should return a buffered channel is buffer_size > 0. - auto ch = MakeChannel(10); - EXPECT_NE(dynamic_cast *>(ch), nullptr); - EXPECT_EQ(dynamic_cast *>(ch), nullptr); - CloseChannel(ch); - delete ch; - } - { - // MakeChannel should return an un-buffered channel is buffer_size = 0. - auto ch = MakeChannel(0); - EXPECT_EQ(dynamic_cast *>(ch), nullptr); - EXPECT_NE(dynamic_cast *>(ch), nullptr); - CloseChannel(ch); - delete ch; - } -} - TEST(Channel, SufficientBufferSizeDoesntBlock) { const size_t buffer_size = 10; auto ch = MakeChannel(buffer_size); @@ -166,7 +156,6 @@ TEST(Channel, ReceiveFromBufferedChannelReturnResidualValuesTest) { TEST(Channel, ConcurrentSendNonConcurrentReceiveWithSufficientBufferSize) { const size_t buffer_size = 10; auto ch = MakeChannel(buffer_size); - size_t sum = 0; std::thread t([&]() { // Try to write more than buffer size. for (size_t i = 0; i < 2 * buffer_size; ++i) { @@ -174,12 +163,9 @@ TEST(Channel, ConcurrentSendNonConcurrentReceiveWithSufficientBufferSize) { EXPECT_EQ(ch->Send(&i), true); // should block after 10 iterations else EXPECT_EQ(ch->Send(&i), false); - sum += i; } }); - std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait 0.1 sec - EXPECT_EQ(sum, 45U); - + std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait 0.2 sec CloseChannel(ch); t.join(); delete ch; @@ -211,7 +197,7 @@ void ChannelCloseUnblocksReceiversTest(Channel *ch) { }, &thread_ended[i]); } - std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait 0.1 sec + std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait 0.2 sec // Verify that all the threads are blocked for (size_t i = 0; i < num_threads; i++) { @@ -222,7 +208,7 @@ void ChannelCloseUnblocksReceiversTest(Channel *ch) { // This should unblock all receivers CloseChannel(ch); - std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait 0.1 sec + std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait 0.2 sec // Verify that all threads got unblocked for (size_t i = 0; i < num_threads; i++) { @@ -232,10 +218,7 @@ void ChannelCloseUnblocksReceiversTest(Channel *ch) { for (size_t i = 0; i < num_threads; i++) t[i].join(); } -void ChannelCloseUnblocksSendersTest(Channel *ch) { - using paddle::framework::details::Buffered; - using paddle::framework::details::UnBuffered; - +void ChannelCloseUnblocksSendersTest(Channel *ch, bool isBuffered) { size_t num_threads = 5; std::thread t[num_threads]; bool thread_ended[num_threads]; @@ -253,9 +236,9 @@ void ChannelCloseUnblocksSendersTest(Channel *ch) { }, &thread_ended[i], &send_success[i]); } - std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait + std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait - if (dynamic_cast *>(ch)) { + if (isBuffered) { // If ch is Buffered, atleast 4 threads must be blocked. 
int ct = 0; for (size_t i = 0; i < num_threads; i++) { @@ -272,14 +255,14 @@ void ChannelCloseUnblocksSendersTest(Channel *ch) { // This should unblock all senders CloseChannel(ch); - std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait + std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait // Verify that all threads got unblocked for (size_t i = 0; i < num_threads; i++) { EXPECT_EQ(thread_ended[i], true); } - if (dynamic_cast *>(ch)) { + if (isBuffered) { // Verify that only 1 send was successful int ct = 0; for (size_t i = 0; i < num_threads; i++) { @@ -304,7 +287,7 @@ TEST(Channel, BufferedChannelCloseUnblocksReceiversTest) { // any senders waiting for channel to have write space TEST(Channel, BufferedChannelCloseUnblocksSendersTest) { auto ch = MakeChannel(1); - ChannelCloseUnblocksSendersTest(ch); + ChannelCloseUnblocksSendersTest(ch, true); delete ch; } @@ -320,7 +303,7 @@ TEST(Channel, UnbufferedChannelCloseUnblocksReceiversTest) { // unblocks any senders waiting for senders TEST(Channel, UnbufferedChannelCloseUnblocksSendersTest) { auto ch = MakeChannel(0); - ChannelCloseUnblocksReceiversTest(ch); + ChannelCloseUnblocksSendersTest(ch, false); delete ch; } @@ -342,7 +325,7 @@ TEST(Channel, UnbufferedLessReceiveMoreSendTest) { ch->Receive(&recv); EXPECT_EQ(recv, i); } - std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait 0.5 sec + std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait 0.2 sec EXPECT_EQ(sum_send, 3U); CloseChannel(ch); @@ -368,7 +351,7 @@ TEST(Channel, UnbufferedMoreReceiveLessSendTest) { ch->Send(&i); sum_send += i; } - std::this_thread::sleep_for(std::chrono::milliseconds(500)); // wait 0.5 sec + std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait 0.2 sec EXPECT_EQ(sum_send, 10U); EXPECT_EQ(sum_receive, 10U); // send three more elements @@ -386,7 +369,7 @@ TEST(Channel, UnbufferedMoreReceiveLessSendTest) { // This tests that destroying a channel unblocks // any senders waiting for channel to have write space -void ChannelDestroyUnblockSenders(Channel *ch) { +void ChannelDestroyUnblockSenders(Channel *ch, bool isBuffered) { size_t num_threads = 5; std::thread t[num_threads]; bool thread_ended[num_threads]; @@ -405,11 +388,9 @@ void ChannelDestroyUnblockSenders(Channel *ch) { &thread_ended[i], &send_success[i]); } - std::this_thread::sleep_for(std::chrono::milliseconds(500)); // wait 0.5 sec - bool is_buffered_channel = false; - if (dynamic_cast *>(ch)) is_buffered_channel = true; + std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait 0.2 sec - if (is_buffered_channel) { + if (isBuffered) { // If channel is buffered, verify that atleast 4 threads are blocked int ct = 0; for (size_t i = 0; i < num_threads; i++) { @@ -432,13 +413,13 @@ void ChannelDestroyUnblockSenders(Channel *ch) { EXPECT_EQ(thread_ended[i], true); } - // Count number of successfuld sends + // Count number of successful sends int ct = 0; for (size_t i = 0; i < num_threads; i++) { if (send_success[i]) ct++; } - if (is_buffered_channel) { + if (isBuffered) { // Only 1 send must be successful EXPECT_EQ(ct, 1); } else { @@ -495,7 +476,7 @@ TEST(Channel, BufferedChannelDestroyUnblocksReceiversTest) { TEST(Channel, BufferedChannelDestroyUnblocksSendersTest) { size_t buffer_size = 1; auto ch = MakeChannel(buffer_size); - ChannelDestroyUnblockSenders(ch); + ChannelDestroyUnblockSenders(ch, true); } // This tests that destroying an unbuffered channel also unblocks @@ -507,7 +488,20 @@ TEST(Channel, 
UnbufferedChannelDestroyUnblocksReceiversTest) { TEST(Channel, UnbufferedChannelDestroyUnblocksSendersTest) { auto ch = MakeChannel(0); - ChannelDestroyUnblockSenders(ch); + ChannelDestroyUnblockSenders(ch, false); +} + +TEST(ChannelHolder, ChannelHolderCapacityTest) { + const size_t buffer_size = 10; + ChannelHolder *ch = new ChannelHolder(); + ch->Reset(buffer_size); + EXPECT_EQ(ch->Cap(), buffer_size); + delete ch; + + ch = new ChannelHolder(); + ch->Reset(0); + EXPECT_EQ(ch->Cap(), 0U); + delete ch; } void ChannelHolderSendReceive(ChannelHolder *ch) { @@ -641,7 +635,7 @@ void ChannelHolderCloseUnblocksReceiversTest(ChannelHolder *ch) { }, &thread_ended[i]); } - std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait 0.1 sec + std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait 0.2 sec // Verify that all the threads are blocked for (size_t i = 0; i < num_threads; i++) { @@ -652,7 +646,7 @@ void ChannelHolderCloseUnblocksReceiversTest(ChannelHolder *ch) { // This should unblock all receivers ch->close(); - std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait 0.1 sec + std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait 0.2 sec // Verify that all threads got unblocked for (size_t i = 0; i < num_threads; i++) { @@ -663,9 +657,6 @@ void ChannelHolderCloseUnblocksReceiversTest(ChannelHolder *ch) { } void ChannelHolderCloseUnblocksSendersTest(ChannelHolder *ch, bool isBuffered) { - using paddle::framework::details::Buffered; - using paddle::framework::details::UnBuffered; - size_t num_threads = 5; std::thread t[num_threads]; bool thread_ended[num_threads]; @@ -683,7 +674,7 @@ void ChannelHolderCloseUnblocksSendersTest(ChannelHolder *ch, bool isBuffered) { }, &thread_ended[i], &send_success[i]); } - std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait + std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait if (isBuffered) { // If ch is Buffered, atleast 4 threads must be blocked. 
@@ -702,7 +693,7 @@ void ChannelHolderCloseUnblocksSendersTest(ChannelHolder *ch, bool isBuffered) { // This should unblock all senders ch->close(); - std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait + std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait // Verify that all threads got unblocked for (size_t i = 0; i < num_threads; i++) { @@ -775,7 +766,7 @@ void ChannelHolderDestroyUnblockSenders(ChannelHolder *ch, bool isBuffered) { &thread_ended[i], &send_success[i]); } - std::this_thread::sleep_for(std::chrono::milliseconds(500)); // wait 0.5 sec + std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait 0.2 sec if (isBuffered) { // If channel is buffered, verify that atleast 4 threads are blocked int ct = 0; @@ -836,7 +827,7 @@ void ChannelHolderDestroyUnblockReceivers(ChannelHolder *ch) { }, &thread_ended[i]); } - std::this_thread::sleep_for(std::chrono::milliseconds(100)); // wait + std::this_thread::sleep_for(std::chrono::milliseconds(200)); // wait // Verify that all threads are blocked for (size_t i = 0; i < num_threads; i++) { diff --git a/paddle/fluid/framework/data_transform.cc b/paddle/fluid/framework/data_transform.cc index 0475fc1d9aaeda39b3fe845b70461c434bcaafc4..bfad9ac1e9cad1936ed961ad1da55787d2faa23e 100644 --- a/paddle/fluid/framework/data_transform.cc +++ b/paddle/fluid/framework/data_transform.cc @@ -42,6 +42,7 @@ void DataTransform(const OpKernelType& expected_kernel_type, PassTensorData(&out, &in); } + // do data type transform if (expected_kernel_type.data_type_ != kernel_type_for_var.data_type_) { TransDataType(kernel_type_for_var, expected_kernel_type, in, &out); transformed = true; diff --git a/paddle/fluid/framework/data_type.h b/paddle/fluid/framework/data_type.h index 1dec766a345d8d6c6fdafc1b50450a9dde91fe5d..4c1b3e7581fe716271c62389c6053a24158913d2 100644 --- a/paddle/fluid/framework/data_type.h +++ b/paddle/fluid/framework/data_type.h @@ -16,13 +16,16 @@ limitations under the License. 
*/ #include #include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/float16.h" namespace paddle { namespace framework { inline proto::VarType::Type ToDataType(std::type_index type) { using namespace paddle::framework::proto; - if (typeid(float).hash_code() == type.hash_code()) { + if (typeid(platform::float16).hash_code() == type.hash_code()) { + return proto::VarType::FP16; + } else if (typeid(float).hash_code() == type.hash_code()) { return proto::VarType::FP32; } else if (typeid(double).hash_code() == type.hash_code()) { return proto::VarType::FP64; @@ -40,6 +43,8 @@ inline proto::VarType::Type ToDataType(std::type_index type) { inline std::type_index ToTypeIndex(proto::VarType::Type type) { using namespace paddle::framework::proto; switch (type) { + case proto::VarType::FP16: + return typeid(platform::float16); case proto::VarType::FP32: return typeid(float); case proto::VarType::FP64: @@ -59,6 +64,9 @@ template inline void VisitDataType(proto::VarType::Type type, Visitor visitor) { using namespace paddle::framework::proto; switch (type) { + case proto::VarType::FP16: + visitor.template operator()(); + break; case proto::VarType::FP32: visitor.template operator()(); break; diff --git a/paddle/fluid/framework/data_type_transform.cc b/paddle/fluid/framework/data_type_transform.cc index 54cc1575d880242c01cd4752f522380a673b7c75..c0523f3c795b103c0c27081ec5dc717f6a0f11e0 100644 --- a/paddle/fluid/framework/data_type_transform.cc +++ b/paddle/fluid/framework/data_type_transform.cc @@ -47,9 +47,16 @@ struct CastDataType { auto* context = static_cast(ctx_); trans(*context, in_begin, in_end, out_begin, CastDataTypeFunctor()); +#ifdef __NVCC__ + } else if (platform::is_gpu_place(in_.place())) { + platform::Transform trans; + auto* context = static_cast(ctx_); + trans(*context, in_begin, in_end, out_begin, + CastDataTypeFunctor()); + context->Wait(); +#endif } else { - // TODO(dzhwinter): enhance Copy CPU<->GPU with different data type? 
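Since the angle brackets are hard to follow in diff form: the data_type.h hunks above extend two plain dispatch tables, one from std::type_index to the enum tag and one from the tag back to a compile-time type. A standalone sketch of both idioms, where VarType and float16 are stand-ins for proto::VarType::Type and platform::float16:

```cpp
#include <cstdint>
#include <stdexcept>
#include <typeindex>
#include <typeinfo>

enum class VarType { FP16, FP32, FP64 };
struct float16 { uint16_t x; };

// The real ToDataType compares hash_code(); std::type_index equality is the
// equivalent, tidier form.
VarType ToDataType(std::type_index type) {
  if (type == std::type_index(typeid(float16))) return VarType::FP16;
  if (type == std::type_index(typeid(float))) return VarType::FP32;
  if (type == std::type_index(typeid(double))) return VarType::FP64;
  throw std::runtime_error("Not supported");
}

// Same shape as VisitDataType: map the runtime tag to a compile-time type
// and forward to a templated call operator.
template <typename Visitor>
void VisitDataType(VarType type, Visitor visitor) {
  switch (type) {
    case VarType::FP16: visitor.template operator()<float16>(); break;
    case VarType::FP32: visitor.template operator()<float>(); break;
    case VarType::FP64: visitor.template operator()<double>(); break;
  }
}
```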
- PADDLE_THROW("Unsupport CPU <-> GPU!"); + PADDLE_THROW("Unsupported place!"); } } }; @@ -65,6 +72,10 @@ void TransDataType(const OpKernelType& kernel_type_for_var, auto ctx = pool.Get(in.place()); switch (src_type) { + case proto::VarType::FP16: + framework::VisitDataType(dst_type, + CastDataType(in, out, ctx)); + break; case proto::VarType::FP32: framework::VisitDataType(dst_type, CastDataType(in, out, ctx)); break; diff --git a/paddle/fluid/framework/data_type_transform.cu b/paddle/fluid/framework/data_type_transform.cu new file mode 120000 index 0000000000000000000000000000000000000000..f46491293ef4ad688c1bce9327f5f28011dec809 --- /dev/null +++ b/paddle/fluid/framework/data_type_transform.cu @@ -0,0 +1 @@ +data_type_transform.cc \ No newline at end of file diff --git a/paddle/fluid/framework/data_type_transform_test.cc b/paddle/fluid/framework/data_type_transform_test.cc index 724c8c301f25cca21704de398fc416dff46c330c..6b9a8f5e28b372c45abfaa2c20575a55d9a9dd03 100644 --- a/paddle/fluid/framework/data_type_transform_test.cc +++ b/paddle/fluid/framework/data_type_transform_test.cc @@ -22,32 +22,145 @@ TEST(DataTypeTransform, CPUTransform) { auto place = CPUPlace(); - Tensor in; - Tensor out; - - float* ptr = in.mutable_data(make_ddim({2, 3}), place); - int data_number = 2 * 3; - - for (int i = 0; i < data_number; ++i) { - ptr[i] = i / 3; - } - + auto kernel_fp16 = OpKernelType(proto::VarType::FP16, place, + DataLayout::kAnyLayout, LibraryType::kPlain); auto kernel_fp32 = OpKernelType(proto::VarType::FP32, place, DataLayout::kAnyLayout, LibraryType::kPlain); auto kernel_fp64 = OpKernelType(proto::VarType::FP64, place, DataLayout::kAnyLayout, LibraryType::kPlain); auto kernel_int32 = OpKernelType(proto::VarType::INT32, place, DataLayout::kAnyLayout, LibraryType::kPlain); + auto kernel_int64 = OpKernelType(proto::VarType::INT64, place, + DataLayout::kAnyLayout, LibraryType::kPlain); + auto kernel_bool = OpKernelType(proto::VarType::BOOL, place, + DataLayout::kAnyLayout, LibraryType::kPlain); - TransDataType(kernel_fp32, kernel_fp64, in, &out); - double* out_data_double = out.data(); - for (int i = 0; i < data_number; ++i) { - ASSERT_EQ(out_data_double[i], static_cast(i / 3)); + // data type transform from float32 + { + Tensor in; + Tensor out; + + float* ptr = in.mutable_data(make_ddim({2, 3}), place); + int data_number = 2 * 3; + + for (int i = 0; i < data_number; ++i) { + ptr[i] = i / 3; + } + + TransDataType(kernel_fp32, kernel_fp64, in, &out); + double* out_data_double = out.data(); + for (int i = 0; i < data_number; ++i) { + EXPECT_EQ(out_data_double[i], static_cast(i / 3)); + } + + TransDataType(kernel_fp32, kernel_int32, in, &out); + int* out_data_int = out.data(); + for (int i = 0; i < data_number; ++i) { + EXPECT_EQ(out_data_int[i], static_cast(i / 3)); + } } - TransDataType(kernel_fp32, kernel_int32, in, &out); - int* out_data_int = out.data(); - for (int i = 0; i < data_number; ++i) { - ASSERT_EQ(out_data_int[i], static_cast(i / 3)); + // data type transform from/to float16 + { + Tensor in; + Tensor out; + + float16* ptr = in.mutable_data(make_ddim({2, 3}), place); + int data_number = 2 * 3; + + for (int i = 0; i < data_number; ++i) { + ptr[i] = i; + } + + // transform from float16 to other data types + TransDataType(kernel_fp16, kernel_fp32, in, &out); + float* out_data_float = out.data(); + for (int i = 0; i < data_number; ++i) { + EXPECT_EQ(out_data_float[i], static_cast(ptr[i])); + } + + TransDataType(kernel_fp16, kernel_fp64, in, &out); + double* out_data_double = out.data(); 
+ for (int i = 0; i < data_number; ++i) { + EXPECT_EQ(out_data_double[i], static_cast(ptr[i])); + } + + TransDataType(kernel_fp16, kernel_int32, in, &out); + int* out_data_int = out.data(); + for (int i = 0; i < data_number; ++i) { + EXPECT_EQ(out_data_int[i], static_cast(ptr[i])); + } + + TransDataType(kernel_fp16, kernel_int64, in, &out); + int64_t* out_data_int64 = out.data(); + for (int i = 0; i < data_number; ++i) { + EXPECT_EQ(out_data_int64[i], static_cast(ptr[i])); + } + + TransDataType(kernel_fp16, kernel_bool, in, &out); + bool* out_data_bool = out.data(); + for (int i = 0; i < data_number; ++i) { + EXPECT_EQ(out_data_bool[i], static_cast(ptr[i])); + } + + // transform float to float16 + float* in_data_float = in.mutable_data(make_ddim({2, 3}), place); + for (int i = 0; i < data_number; ++i) { + in_data_float[i] = i; + } + + TransDataType(kernel_fp32, kernel_fp16, in, &out); + ptr = out.data(); + for (int i = 0; i < data_number; ++i) { + EXPECT_EQ(ptr[i].x, static_cast(in_data_float[i]).x); + } + + // transform double to float16 + double* in_data_double = in.mutable_data(make_ddim({2, 3}), place); + for (int i = 0; i < data_number; ++i) { + in_data_double[i] = i; + } + + TransDataType(kernel_fp64, kernel_fp16, in, &out); + ptr = out.data(); + for (int i = 0; i < data_number; ++i) { + EXPECT_EQ(ptr[i].x, static_cast(in_data_double[i]).x); + } + + // transform int to float16 + int* in_data_int = in.mutable_data(make_ddim({2, 3}), place); + for (int i = 0; i < data_number; ++i) { + in_data_int[i] = i; + } + + TransDataType(kernel_int32, kernel_fp16, in, &out); + ptr = out.data(); + for (int i = 0; i < data_number; ++i) { + EXPECT_EQ(ptr[i].x, static_cast(in_data_int[i]).x); + } + + // transform int64 to float16 + int64_t* in_data_int64 = in.mutable_data(make_ddim({2, 3}), place); + for (int i = 0; i < data_number; ++i) { + in_data_int64[i] = i; + } + + TransDataType(kernel_int64, kernel_fp16, in, &out); + ptr = out.data(); + for (int i = 0; i < data_number; ++i) { + EXPECT_EQ(ptr[i].x, static_cast(in_data_int64[i]).x); + } + + // transform bool to float16 + bool* in_data_bool = in.mutable_data(make_ddim({2, 3}), place); + for (int i = 0; i < data_number; ++i) { + in_data_bool[i] = i; + } + + TransDataType(kernel_bool, kernel_fp16, in, &out); + ptr = out.data(); + for (int i = 0; i < data_number; ++i) { + EXPECT_EQ(ptr[i].x, static_cast(in_data_bool[i]).x); + } } } diff --git a/paddle/fluid/framework/data_type_transform_test.cu b/paddle/fluid/framework/data_type_transform_test.cu new file mode 100644 index 0000000000000000000000000000000000000000..de389ddabcb86de0155757406a406e44086c5474 --- /dev/null +++ b/paddle/fluid/framework/data_type_transform_test.cu @@ -0,0 +1,222 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/data_type_transform.h" +#include "paddle/fluid/framework/tensor_util.h" + +#include "gtest/gtest.h" + +TEST(DataTypeTransform, GPUTransform) { + using namespace paddle::framework; + using namespace paddle::platform; + + auto cpu_place = CPUPlace(); + auto gpu_place = CUDAPlace(0); + CUDADeviceContext context(gpu_place); + + auto kernel_fp16 = OpKernelType(proto::VarType::FP16, gpu_place, + DataLayout::kAnyLayout, LibraryType::kPlain); + auto kernel_fp32 = OpKernelType(proto::VarType::FP32, gpu_place, + DataLayout::kAnyLayout, LibraryType::kPlain); + auto kernel_fp64 = OpKernelType(proto::VarType::FP64, gpu_place, + DataLayout::kAnyLayout, LibraryType::kPlain); + auto kernel_int32 = OpKernelType(proto::VarType::INT32, gpu_place, + DataLayout::kAnyLayout, LibraryType::kPlain); + auto kernel_int64 = OpKernelType(proto::VarType::INT64, gpu_place, + DataLayout::kAnyLayout, LibraryType::kPlain); + auto kernel_bool = OpKernelType(proto::VarType::BOOL, gpu_place, + DataLayout::kAnyLayout, LibraryType::kPlain); + + // data type transform from float32 + { + Tensor in; + Tensor in_gpu; + Tensor out_gpu; + Tensor out; + + float* in_ptr = in.mutable_data(make_ddim({2, 3}), cpu_place); + float arr[6] = {0, 1, 2, 3, 4, 5}; + int data_number = sizeof(arr) / sizeof(arr[0]); + memcpy(in_ptr, arr, sizeof(arr)); + + TensorCopy(in, gpu_place, context, &in_gpu); + context.Wait(); + TransDataType(kernel_fp32, kernel_fp64, in_gpu, &out_gpu); + TensorCopy(out_gpu, cpu_place, context, &out); + context.Wait(); + + double* out_data_double = out.data(); + for (int i = 0; i < data_number; ++i) { + EXPECT_EQ(out_data_double[i], static_cast(arr[i])); + } + + TransDataType(kernel_fp32, kernel_int32, in_gpu, &out_gpu); + TensorCopy(out_gpu, cpu_place, context, &out); + context.Wait(); + + int* out_data_int = out.data(); + for (int i = 0; i < data_number; ++i) { + EXPECT_EQ(out_data_int[i], static_cast(arr[i])); + } + } + + // data type transform from/to float16 + { + Tensor in; + Tensor in_gpu; + Tensor out_gpu; + Tensor out; + + float16* ptr = in.mutable_data(make_ddim({2, 3}), cpu_place); + float16 arr[6] = {float16(0), float16(1), float16(2), + float16(3), float16(4), float16(5)}; + int data_number = sizeof(arr) / sizeof(arr[0]); + memcpy(ptr, arr, sizeof(arr)); + TensorCopy(in, gpu_place, context, &in_gpu); + context.Wait(); + + // transform from float16 to other data types + TransDataType(kernel_fp16, kernel_fp32, in_gpu, &out_gpu); + TensorCopy(out_gpu, cpu_place, context, &out); + context.Wait(); + + float* out_data_float = out.data(); + for (int i = 0; i < data_number; ++i) { + EXPECT_EQ(out_data_float[i], static_cast(ptr[i])); + } + + TransDataType(kernel_fp16, kernel_fp64, in_gpu, &out_gpu); + TensorCopy(out_gpu, cpu_place, context, &out); + context.Wait(); + + double* out_data_double = out.data(); + for (int i = 0; i < data_number; ++i) { + EXPECT_EQ(out_data_double[i], static_cast(ptr[i])); + } + + TransDataType(kernel_fp16, kernel_int32, in_gpu, &out_gpu); + TensorCopy(out_gpu, cpu_place, context, &out); + context.Wait(); + + int* out_data_int = out.data(); + for (int i = 0; i < data_number; ++i) { + EXPECT_EQ(out_data_int[i], static_cast(ptr[i])); + } + + TransDataType(kernel_fp16, kernel_int64, in_gpu, &out_gpu); + TensorCopy(out_gpu, cpu_place, context, &out); + context.Wait(); + + int64_t* out_data_int64 = out.data(); + for (int i = 0; i < data_number; ++i) { + EXPECT_EQ(out_data_int64[i], static_cast(ptr[i])); + } + + TransDataType(kernel_fp16, kernel_bool, in_gpu, 
&out_gpu); + TensorCopy(out_gpu, cpu_place, context, &out); + context.Wait(); + + bool* out_data_bool = out.data(); + for (int i = 0; i < data_number; ++i) { + EXPECT_EQ(out_data_bool[i], static_cast(ptr[i])); + } + + // transform float to float16 + float* in_data_float = in.mutable_data(make_ddim({2, 3}), cpu_place); + for (int i = 0; i < data_number; ++i) { + in_data_float[i] = i; + } + + TensorCopy(in, gpu_place, context, &in_gpu); + context.Wait(); + TransDataType(kernel_fp32, kernel_fp16, in_gpu, &out_gpu); + TensorCopy(out_gpu, cpu_place, context, &out); + context.Wait(); + + ptr = out.data(); + for (int i = 0; i < data_number; ++i) { + EXPECT_EQ(ptr[i].x, static_cast(in_data_float[i]).x); + } + + // transform double to float16 + double* in_data_double = + in.mutable_data(make_ddim({2, 3}), cpu_place); + for (int i = 0; i < data_number; ++i) { + in_data_double[i] = i; + } + + TensorCopy(in, gpu_place, context, &in_gpu); + context.Wait(); + TransDataType(kernel_fp64, kernel_fp16, in_gpu, &out_gpu); + TensorCopy(out_gpu, cpu_place, context, &out); + context.Wait(); + + ptr = out.data(); + for (int i = 0; i < data_number; ++i) { + EXPECT_EQ(ptr[i].x, static_cast(in_data_double[i]).x); + } + + // transform int to float16 + int* in_data_int = in.mutable_data(make_ddim({2, 3}), cpu_place); + for (int i = 0; i < data_number; ++i) { + in_data_int[i] = i; + } + + TensorCopy(in, gpu_place, context, &in_gpu); + context.Wait(); + TransDataType(kernel_int32, kernel_fp16, in_gpu, &out_gpu); + TensorCopy(out_gpu, cpu_place, context, &out); + context.Wait(); + + ptr = out.data(); + for (int i = 0; i < data_number; ++i) { + EXPECT_EQ(ptr[i].x, static_cast(in_data_int[i]).x); + } + + // transform int64 to float16 + int64_t* in_data_int64 = + in.mutable_data(make_ddim({2, 3}), cpu_place); + for (int i = 0; i < data_number; ++i) { + in_data_int64[i] = i; + } + + TensorCopy(in, gpu_place, context, &in_gpu); + context.Wait(); + TransDataType(kernel_int64, kernel_fp16, in_gpu, &out_gpu); + TensorCopy(out_gpu, cpu_place, context, &out); + context.Wait(); + + ptr = out.data(); + for (int i = 0; i < data_number; ++i) { + EXPECT_EQ(ptr[i].x, static_cast(in_data_int64[i]).x); + } + + // transform bool to float16 + bool* in_data_bool = in.mutable_data(make_ddim({2, 3}), cpu_place); + for (int i = 0; i < data_number; ++i) { + in_data_bool[i] = i; + } + + TensorCopy(in, gpu_place, context, &in_gpu); + context.Wait(); + TransDataType(kernel_bool, kernel_fp16, in_gpu, &out_gpu); + TensorCopy(out_gpu, cpu_place, context, &out); + context.Wait(); + + ptr = out.data(); + for (int i = 0; i < data_number; ++i) { + EXPECT_EQ(ptr[i].x, static_cast(in_data_bool[i]).x); + } + } +} diff --git a/paddle/fluid/framework/details/buffered_channel.h b/paddle/fluid/framework/details/buffered_channel.h deleted file mode 100644 index 88faf3acf7c17b0cb3770a8910e400a1f6688f5f..0000000000000000000000000000000000000000 --- a/paddle/fluid/framework/details/buffered_channel.h +++ /dev/null @@ -1,142 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include -#include -#include - -#include "paddle/fluid/framework/channel.h" -#include "paddle/fluid/platform/enforce.h" - -namespace paddle { -namespace framework { -namespace details { - -// Four of the properties of Buffered Channel: -// - A send to a full channel blocks temporarily until a receive from the -// channel or the channel is closed. -// - A receive from an empty channel blocks temporarily until a send to the -// channel or the channel is closed. -// - A send to a closed channel returns false immediately. -// - A receive from a closed channel returns false immediately. - -template -class Buffered : public paddle::framework::Channel { - friend Channel* paddle::framework::MakeChannel(size_t); - friend void paddle::framework::CloseChannel(Channel*); - - public: - virtual bool Send(T*); - virtual bool Receive(T*); - virtual size_t Cap() { return cap_; } - virtual void Close(); - virtual ~Buffered(); - - private: - size_t cap_; - std::mutex mu_; - std::condition_variable empty_cond_var_; - std::condition_variable full_cond_var_; - std::condition_variable destructor_cond_var_; - std::deque channel_; - std::atomic closed_{false}; - std::atomic send_ctr{0}; - std::atomic recv_ctr{0}; - - Buffered(size_t cap) : cap_(cap), closed_(false) { - PADDLE_ENFORCE_GT(cap, 0); - } - - void NotifyAllParticipants(std::unique_lock*); -}; - -template -bool Buffered::Send(T* item) { - bool ret = false; - if (closed_) { - return ret; - } - send_ctr++; - std::unique_lock lock(mu_); - full_cond_var_.wait(lock, - [this]() { return channel_.size() < cap_ || closed_; }); - if (!closed_) { - channel_.push_back(std::move(*item)); - lock.unlock(); - empty_cond_var_.notify_one(); - ret = true; - } - send_ctr--; - destructor_cond_var_.notify_one(); - return ret; -} - -template -bool Buffered::Receive(T* item) { - bool ret = false; - // Once the channel has been closed and all data has been consumed, - // just return false. Don't even try acquiring the mutex. 
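Buffered<T>::Receive continues below; together with Send above it is a textbook bounded buffer built from one mutex and two condition variables. Since this patch deletes the class wholesale, a compilable reduction of the protocol it implemented, close semantics included, is worth keeping on record:

```cpp
#include <condition_variable>
#include <cstddef>
#include <deque>
#include <mutex>

template <typename T>
class BoundedQueue {
 public:
  explicit BoundedQueue(std::size_t cap) : cap_(cap) {}

  bool Send(T v) {
    std::unique_lock<std::mutex> lock(mu_);
    not_full_.wait(lock, [this] { return q_.size() < cap_ || closed_; });
    if (closed_) return false;  // a send to a closed channel fails
    q_.push_back(std::move(v));
    not_empty_.notify_one();
    return true;
  }

  bool Receive(T* v) {
    std::unique_lock<std::mutex> lock(mu_);
    not_empty_.wait(lock, [this] { return !q_.empty() || closed_; });
    if (q_.empty()) return false;  // closed and fully drained
    *v = std::move(q_.front());
    q_.pop_front();
    not_full_.notify_one();
    return true;
  }

  void Close() {
    std::lock_guard<std::mutex> lock(mu_);
    closed_ = true;
    not_full_.notify_all();
    not_empty_.notify_all();
  }

 private:
  std::size_t cap_;
  bool closed_ = false;
  std::deque<T> q_;
  std::mutex mu_;
  std::condition_variable not_full_, not_empty_;
};
```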
- if (closed_ && channel_.empty()) { - return false; - } - recv_ctr++; - std::unique_lock lock(mu_); - empty_cond_var_.wait(lock, [this]() { return !channel_.empty() || closed_; }); - if (!channel_.empty()) { - *item = std::move(channel_.front()); - channel_.pop_front(); - full_cond_var_.notify_one(); - ret = true; - } - recv_ctr--; - destructor_cond_var_.notify_one(); - return ret; -} - -template -void Buffered::Close() { - if (closed_) { - return; - } - std::unique_lock lock(mu_); - closed_ = true; - NotifyAllParticipants(&lock); -} - -template -Buffered::~Buffered() { - std::unique_lock lock(mu_); - closed_ = true; - channel_.clear(); - NotifyAllParticipants(&lock); - - // The destructor must wait for all readers and writers to complete their task - // The channel has been closed, so we will not accept new readers and writers - lock.lock(); - destructor_cond_var_.wait( - lock, [this]() { return send_ctr == 0 && recv_ctr == 0; }); -} - -template -void Buffered::NotifyAllParticipants(std::unique_lock* lock) { - lock->unlock(); - full_cond_var_.notify_all(); - empty_cond_var_.notify_all(); -} - -} // namespace details -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/details/unbuffered_channel.h b/paddle/fluid/framework/details/unbuffered_channel.h deleted file mode 100644 index 5c9424928cb7029aac813e7b2f29f81a0093f836..0000000000000000000000000000000000000000 --- a/paddle/fluid/framework/details/unbuffered_channel.h +++ /dev/null @@ -1,174 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include -#include - -#include "paddle/fluid/framework/channel.h" - -namespace paddle { -namespace framework { -namespace details { - -// Four of the properties of UnBuffered Channel: -// - A send to a channel blocks temporarily until a receive from the -// channel or the channel is closed. -// - A receive from a channel blocks temporarily until a send to the -// channel or the channel is closed. -// - A send to a closed channel returns false immediately. -// - A receive from a closed channel returns false immediately. 
-template -class UnBuffered : public paddle::framework::Channel { - friend Channel* paddle::framework::MakeChannel(size_t); - friend void paddle::framework::CloseChannel(Channel*); - - public: - virtual bool Send(T*); - virtual bool Receive(T*); - virtual size_t Cap() { return 0; } - virtual void Close(); - virtual ~UnBuffered(); - - private: - std::mutex mu_ch_; - // Mutex for readers and writers who are waiting for other reader - // and writer to complete execution - std::recursive_mutex mu_read_, mu_write_; - // reader_found_ is set true when a reader is ready to accept data - // writer_found_ is set true when a writer is ready to send data - // A transaction occurs only when both are true - std::atomic reader_found_{false}, writer_found_{false}; - std::condition_variable cv_channel_; - std::condition_variable_any cv_reader_, cv_writer_, cv_destructor_; - T* item{nullptr}; - std::atomic closed_{false}; - std::atomic send_ctr{0}; - std::atomic recv_ctr{0}; - - UnBuffered() : closed_(false) {} - - void NotifyAllParticipants(std::unique_lock*); -}; - -// This function implements the concept of how data should -// be sent from a writer to a reader. -template -bool UnBuffered::Send(T* data) { - bool ret = false; - if (closed_) { - return ret; - } - send_ctr++; - // Prevent other writers from entering - std::unique_lock writer_lock(mu_write_); - writer_found_ = true; - std::unique_lock cv_lock(mu_write_); - // If writer comes first, it should wait till a reader arrives - cv_writer_.wait(cv_lock, - [this]() { return reader_found_ == true || closed_; }); - cv_reader_.notify_one(); - if (!closed_) { - std::unique_lock channel_lock(mu_ch_); - item = data; - channel_lock.unlock(); - cv_channel_.notify_one(); - channel_lock.lock(); - cv_channel_.wait(channel_lock, - [this]() { return item == nullptr || closed_; }); - ret = true; - } - writer_found_ = false; - send_ctr--; - cv_destructor_.notify_one(); - return ret; -} - -// This function implements the concept of how -// data that was sent by a writer is read from a reader. -template -bool UnBuffered::Receive(T* data) { - bool ret = false; - // If channel is closed, we don't even want any reader to enter. - // Unlike a buffered channel, an unbuffered channel does not allow - // readers to read after closing because there is no buffer to be consumed. - if (closed_) return ret; - recv_ctr++; - // Prevent other readers from entering - std::unique_lock read_lock{mu_read_}; - reader_found_ = true; - std::unique_lock cv_lock{mu_read_}; - // If reader comes first, it should wait till a writer arrives - cv_reader_.wait(cv_lock, - [this]() { return writer_found_ == true || closed_; }); - cv_writer_.notify_one(); - if (!closed_) { - std::unique_lock lock_ch{mu_ch_}; - // Reader should wait for the writer to first write its data - cv_channel_.wait(lock_ch, [this]() { return item != nullptr || closed_; }); - if (!closed_) { - *data = std::move(*item); - item = nullptr; - lock_ch.unlock(); - ret = true; - } - cv_channel_.notify_one(); - } - reader_found_ = false; - recv_ctr--; - cv_destructor_.notify_one(); - return ret; -} - -// This function implements the sequence of events -// that take place once the channel is closed. -template -void UnBuffered::Close() { - if (closed_) { - return; - } - std::unique_lock lock(mu_ch_); - item = nullptr; - closed_ = true; - NotifyAllParticipants(&lock); -} - -// This function implements the sequence of events -// that are executed once the object of an UnBuffered -// channel is destroyed. 
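The destructor that follows, like Buffered's above, ends with the same shutdown discipline: mark the channel closed, wake every waiter, then block until the in-flight counters reach zero, so no freshly unblocked thread can race with member destruction. The idiom in isolation, with Enter/Exit standing in for the send_ctr/recv_ctr bookkeeping:

```cpp
#include <condition_variable>
#include <mutex>

// Enter/Exit bracket every Send/Receive; the destructor cannot return while
// any caller is still inside a member function.
class Draining {
 public:
  void Enter() {
    std::lock_guard<std::mutex> lock(mu_);
    ++in_flight_;
  }
  void Exit() {
    std::lock_guard<std::mutex> lock(mu_);
    --in_flight_;
    drained_.notify_all();
  }
  ~Draining() {
    std::unique_lock<std::mutex> lock(mu_);
    closed_ = true;  // new entrants would check this flag first
    drained_.wait(lock, [this] { return in_flight_ == 0; });
  }

 private:
  std::mutex mu_;
  std::condition_variable drained_;
  int in_flight_ = 0;
  bool closed_ = false;
};
```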
-template -UnBuffered::~UnBuffered() { - std::unique_lock lock(mu_ch_); - item = nullptr; - closed_ = true; - NotifyAllParticipants(&lock); - lock.lock(); - cv_destructor_.wait(lock, - [this]() { return send_ctr == 0 && recv_ctr == 0; }); -} - -// This function notifies all the readers, writers and -// the channel condition variables. -template -void UnBuffered::NotifyAllParticipants(std::unique_lock* lock) { - lock->unlock(); - cv_writer_.notify_all(); - cv_channel_.notify_all(); - cv_reader_.notify_all(); -} - -} // namespace details -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index 961e3e22f278d6e0346defd90190c53fd31ede08..f8e7d0d99074936ad81c4ddc52be6907ead4c27d 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -125,8 +125,9 @@ void Executor::Run(const ProgramDesc& pdesc, Scope* scope, int block_id, for (auto& op_desc : block.AllOps()) { auto op = paddle::framework::OpRegistry::CreateOp(*op_desc); - VLOG(3) << place_ << " " << op->DebugStringEx(local_scope); + VLOG(4) << place_ << " " << op->DebugStringEx(local_scope); op->Run(*local_scope, place_); + VLOG(3) << place_ << " " << op->DebugStringEx(local_scope); if (FLAGS_benchmark) { VLOG(2) << "Memory used after operator " + op->Type() + " running: " diff --git a/paddle/fluid/framework/reader.cc b/paddle/fluid/framework/reader.cc index dc1caa72a4c09c7045c87575abab0faac7b5bbc8..91879d6d45868bb37ca44baafb8b0e8677cd6d1a 100644 --- a/paddle/fluid/framework/reader.cc +++ b/paddle/fluid/framework/reader.cc @@ -25,92 +25,5 @@ DDim ReaderBase::shape(size_t idx) const { return shapes_[idx]; } -void ShuffleReader::ReadNext(std::vector* out) { - if (iteration_pos_ >= buffer_.size()) { - // Reload buffer with new data - buffer_.clear(); - buffer_.reserve(buffer_size_); - for (int i = 0; i < buffer_size_; ++i) { - if (reader_->HasNext()) { - buffer_.push_back(std::vector()); - reader_->ReadNext(&buffer_.back()); - } else { - break; - } - } - // TODO(fengjiayi): 'std::random_shuffle' can be very slow. It needs to be - // optimize. - std::random_shuffle(buffer_.begin(), buffer_.end()); - iteration_pos_ = 0; - } - out->clear(); - if (!buffer_.empty()) { - std::swap(*out, buffer_[iteration_pos_++]); - } - // if buffer_ is empty, the 'out' will return as an empty vector. -} - -void BatchReader::ReadNext(std::vector* out) { - buffer_.clear(); - buffer_.reserve(batch_size_); - for (int i = 0; i < batch_size_; ++i) { - if (reader_->HasNext()) { - buffer_.push_back(std::vector()); - reader_->ReadNext(&buffer_.back()); - } else { - break; - } - } - // Concat instances - out->clear(); - if (buffer_.empty()) { - // if buffer_ is empty, the 'out' will return as an empty vector. 
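An aside on the ShuffleReader::ReadNext deleted just above: its TODO flagged std::random_shuffle as potentially slow, and that algorithm is also deprecated in C++14 and removed in C++17, so any reimplementation would reach for std::shuffle with an explicit engine:

```cpp
#include <algorithm>
#include <random>
#include <vector>

// Sketch: shuffle a reload buffer the modern way; seeding once per process
// (as here) trades reproducibility for simplicity.
void ShuffleBuffer(std::vector<int>* buffer) {
  static std::mt19937 engine{std::random_device{}()};
  std::shuffle(buffer->begin(), buffer->end(), engine);
}

int main() {
  std::vector<int> buf = {1, 2, 3, 4, 5};
  ShuffleBuffer(&buf);
}
```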
- return; - } - int out_num = buffer_[0].size(); - out->reserve(out_num); - for (int j = 0; j < out_num; ++j) { - // Merge shape and check date type - std::type_index batch_type = buffer_[0][j].type(); - DDim batch_shape = buffer_[0][j].dims(); - for (size_t i = 1; i < buffer_.size(); ++i) { - std::type_index ins_type = buffer_[i][j].type(); - DDim ins_shape = buffer_[i][j].dims(); - PADDLE_ENFORCE_EQ(batch_type, ins_type); - PADDLE_ENFORCE_EQ(slice_ddim(batch_shape, 1, batch_shape.size()), - slice_ddim(ins_shape, 1, ins_shape.size())); - PADDLE_ENFORCE_GT(ins_shape[0], 0); - batch_shape[0] += ins_shape[0]; - } - - LoDTensor out_tensor; - out_tensor.Resize(batch_shape); - out_tensor.mutable_data(platform::CPUPlace(), batch_type); - int64_t dst_offset = 0; - - // Merge lod and data - LoD batch_lod; - for (size_t i = 0; i < buffer_.size(); ++i) { - DDim ins_shape = buffer_[i][j].dims(); - LoD ins_lod = buffer_[i][j].lod(); - if (i == 0) { - batch_lod = ins_lod; - } else { - PADDLE_ENFORCE_EQ(batch_lod.size(), ins_lod.size()); - for (size_t level_idx = 0; level_idx < batch_lod.size(); ++level_idx) { - auto& lod_level = batch_lod[level_idx]; - for (size_t k = 1; k < ins_lod[level_idx].size(); ++k) { - lod_level.push_back(ins_lod[level_idx][k] + lod_level.back()); - } - } - } - Tensor dst = out_tensor.Slice(dst_offset, dst_offset + ins_shape[0]); - TensorCopy(buffer_[i][j], platform::CPUPlace(), &dst); - dst_offset += ins_shape[0]; - } - out_tensor.set_lod(batch_lod); - out->push_back(out_tensor); - } -} } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/reader.h b/paddle/fluid/framework/reader.h index 4a5eba5fb733b3e9da2b245b4dda18725c9b0895..1be3f4ef1f46bd8d72a285afa69b52d6f519ccf5 100644 --- a/paddle/fluid/framework/reader.h +++ b/paddle/fluid/framework/reader.h @@ -26,7 +26,6 @@ class ReaderBase { PADDLE_ENFORCE(!shapes_.empty()); } virtual void ReadNext(std::vector* out) = 0; - virtual bool HasNext() const = 0; virtual void ReInit() = 0; @@ -52,91 +51,14 @@ class DecoratedReader : public ReaderBase { PADDLE_ENFORCE_NOT_NULL(reader_); } - bool HasNext() const override { return reader_->HasNext(); } - void ReInit() override { reader_->ReInit(); } protected: ReaderBase* reader_; }; -// file readers - -template -class RandomDataGenerator : public FileReader { - public: - RandomDataGenerator(const std::vector& shapes, float min, float max) - : FileReader(shapes), min_(min), max_(max) { - PADDLE_ENFORCE_LE( - min, max, "'min' shouldn't be greater than 'max'.(%f vs %f)", min, max); - unsigned int seed = std::random_device()(); - engine_.seed(seed); - dist_ = std::uniform_real_distribution(min_, max_); - } - - void ReadNext(std::vector* out) override { - out->clear(); - out->reserve(shapes_.size()); - for (const DDim& shape : shapes_) { - PADDLE_ENFORCE_GE( - shape.size(), 2, - "The rank of reader's output data should be 2 at least.(Now it's %d)", - shape.size()); - LoDTensor out_tensor; - out_tensor.Resize(shape); - T* data = out_tensor.mutable_data(platform::CPUPlace()); - int64_t numel = product(shape); - for (int64_t i = 0; i < numel; ++i) { - data[i] = dist_(engine_); - } - out->push_back(out_tensor); - } - } - - bool HasNext() const override { return true; } - - void ReInit() override { return; } - - private: - float min_; - float max_; - std::minstd_rand engine_; - std::uniform_real_distribution dist_; -}; - -// decorated readers - -class ShuffleReader : public DecoratedReader { - public: - ShuffleReader(ReaderBase* reader, int buffer_size) - : 
DecoratedReader(reader), buffer_size_(buffer_size), iteration_pos_(0) { - buffer_.reserve(buffer_size); - } - - void ReadNext(std::vector<LoDTensor>* out) override; - - private: - int buffer_size_; - std::vector<std::vector<LoDTensor>> buffer_; - size_t iteration_pos_; -}; - -class BatchReader : public DecoratedReader { - public: - BatchReader(ReaderBase* reader, int batch_size) - : DecoratedReader(reader), batch_size_(batch_size) { - buffer_.reserve(batch_size_); - } - - void ReadNext(std::vector<LoDTensor>* out) override; - - private: - int batch_size_; - std::vector<std::vector<LoDTensor>> buffer_; -}; - -// The ReaderHolder is used as readers' unified wrapper, -// making it easier to access different type readers in Variables. +// The ReaderHolder is used as a reader's unified wrapper, +// making it easier to access different type readers in Variables. class ReaderHolder { public: void Reset(ReaderBase* reader) { reader_.reset(reader); } @@ -144,7 +66,6 @@ class ReaderHolder { ReaderBase* Get() const { return reader_.get(); } void ReadNext(std::vector<LoDTensor>* out) { reader_->ReadNext(out); } - bool HasNext() const { return reader_->HasNext(); } void ReInit() { reader_->ReInit(); } DDim shape(size_t idx) const { return reader_->shape(idx); } diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index 9b465b85b0a02ffe990ab669a22f78e923e24f99..8b7533ce712b0a01060842b6f71449ed6bd23e2c 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -187,7 +187,6 @@ bool TensorContainsInf(const framework::Tensor& tensor) { void TensorToStream(std::ostream& os, const Tensor& tensor, const platform::DeviceContext& dev_ctx) { - // TODO(typhoonzero): serialize to ostream { // the 1st field, uint32_t version constexpr uint32_t version = 0; os.write(reinterpret_cast<const char*>(&version), sizeof(version)); diff --git a/paddle/fluid/framework/tensor_util_test.cc b/paddle/fluid/framework/tensor_util_test.cc index 8aebfcb3b624f49b747e0874e807782b8d4a9951..9687a86ca25be7886e67028a38e54b3065c8e4b5 100644 --- a/paddle/fluid/framework/tensor_util_test.cc +++ b/paddle/fluid/framework/tensor_util_test.cc @@ -235,27 +235,53 @@ TEST(TensorToVector, Tensor) { TEST(TensorContainsNAN, CPU) { using namespace paddle::framework; using namespace paddle::platform; - Tensor src; - float* buf = src.mutable_data<float>({3}, CPUPlace()); - buf[0] = 0.0; - buf[1] = NAN; - buf[2] = 0.0; - ASSERT_TRUE(TensorContainsNAN(src)); - buf[1] = 0.0; - ASSERT_FALSE(TensorContainsNAN(src)); + { + Tensor src; + float* buf = src.mutable_data<float>({3}, CPUPlace()); + buf[0] = 0.0; + buf[1] = NAN; + buf[2] = 0.0; + ASSERT_TRUE(TensorContainsNAN(src)); + buf[1] = 0.0; + ASSERT_FALSE(TensorContainsNAN(src)); + } + + { + Tensor src; + float16* buf = src.mutable_data<float16>({3}, CPUPlace()); + buf[0] = 0.0; + buf[1].x = 0x7fff; + buf[2] = 0.0; + ASSERT_TRUE(TensorContainsNAN(src)); + buf[1] = 0.0; + ASSERT_FALSE(TensorContainsNAN(src)); + } } TEST(TensorContainsInf, CPU) { using namespace paddle::framework; using namespace paddle::platform; - Tensor src; - double* buf = src.mutable_data<double>({3}, CPUPlace()); - buf[0] = 1.0; - buf[1] = INFINITY; - buf[2] = 0.0; - ASSERT_TRUE(TensorContainsInf(src)); - buf[1] = 1.0; - ASSERT_FALSE(TensorContainsInf(src)); + { + Tensor src; + double* buf = src.mutable_data<double>({3}, CPUPlace()); + buf[0] = 1.0; + buf[1] = INFINITY; + buf[2] = 0.0; + ASSERT_TRUE(TensorContainsInf(src)); + buf[1] = 1.0; + ASSERT_FALSE(TensorContainsInf(src)); + } + + { + Tensor src; + float16* buf = src.mutable_data<float16>({3}, CPUPlace()); + buf[0] = 1.0; + buf[1].x = 0x7c00; +
buf[2] = 0.0; + ASSERT_TRUE(TensorContainsInf(src)); + buf[1] = 1.0; + ASSERT_FALSE(TensorContainsInf(src)); + } } TEST(Tensor, FromAndToStream) { diff --git a/paddle/fluid/framework/tensor_util_test.cu b/paddle/fluid/framework/tensor_util_test.cu index d630ec44a2aa6f2567eedbb58b9ae181700f5fc9..4766ec28aa3cff6be3259f258f1c9543ae471f5d 100644 --- a/paddle/fluid/framework/tensor_util_test.cu +++ b/paddle/fluid/framework/tensor_util_test.cu @@ -25,32 +25,65 @@ static __global__ void FillNAN(float* buf) { buf[1] = 0.1; buf[2] = NAN; } + static __global__ void FillInf(float* buf) { buf[0] = 0.0; buf[1] = INFINITY; buf[2] = 0.5; } +static __global__ void FillNAN(platform::float16* buf) { + buf[0] = 0.0; + buf[1] = 0.1; + buf[2].x = 0x7fff; +} + +static __global__ void FillInf(platform::float16* buf) { + buf[0] = 0.0; + buf[1].x = 0x7c00; + buf[2] = 0.5; +} + TEST(TensorContainsNAN, GPU) { - Tensor tensor; - platform::CUDAPlace gpu(0); - auto& pool = platform::DeviceContextPool::Instance(); + using namespace paddle::platform; + CUDAPlace gpu(0); + auto& pool = DeviceContextPool::Instance(); auto* cuda_ctx = pool.GetByPlace(gpu); - float* buf = tensor.mutable_data<float>({3}, gpu); - FillNAN<<<1, 1, 0, cuda_ctx->stream()>>>(buf); - cuda_ctx->Wait(); - ASSERT_TRUE(TensorContainsNAN(tensor)); + { + Tensor tensor; + float* buf = tensor.mutable_data<float>({3}, gpu); + FillNAN<<<1, 1, 0, cuda_ctx->stream()>>>(buf); + cuda_ctx->Wait(); + ASSERT_TRUE(TensorContainsNAN(tensor)); + } + { + Tensor tensor; + float16* buf = tensor.mutable_data<float16>({3}, gpu); + FillNAN<<<1, 1, 0, cuda_ctx->stream()>>>(buf); + cuda_ctx->Wait(); + ASSERT_TRUE(TensorContainsNAN(tensor)); + } } TEST(TensorContainsInf, GPU) { - Tensor tensor; - platform::CUDAPlace gpu(0); - auto& pool = platform::DeviceContextPool::Instance(); + using namespace paddle::platform; + CUDAPlace gpu(0); + auto& pool = DeviceContextPool::Instance(); auto* cuda_ctx = pool.GetByPlace(gpu); - float* buf = tensor.mutable_data<float>({3}, gpu); - FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf); - cuda_ctx->Wait(); - ASSERT_TRUE(TensorContainsInf(tensor)); + { + Tensor tensor; + float* buf = tensor.mutable_data<float>({3}, gpu); + FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf); + cuda_ctx->Wait(); + ASSERT_TRUE(TensorContainsInf(tensor)); + } + { + Tensor tensor; + float16* buf = tensor.mutable_data<float16>({3}, gpu); + FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf); + cuda_ctx->Wait(); + ASSERT_TRUE(TensorContainsInf(tensor)); + } } } // namespace framework diff --git a/paddle/fluid/framework/threadpool.h b/paddle/fluid/framework/threadpool.h index 3adc260caf5eaaa1408a26ed91457666e6c4adce..df51fb24a588c84788d7d0b671f932ff4c40f9c2 100644 --- a/paddle/fluid/framework/threadpool.h +++ b/paddle/fluid/framework/threadpool.h @@ -67,10 +67,10 @@ class ThreadPool { } catch (platform::EnforceNotMet ex) { return std::unique_ptr<platform::EnforceNotMet>( new platform::EnforceNotMet(ex)); - } catch (...) { - LOG(FATAL) - << "Unexpected exception is catched in thread pool. All " - "throwable exception in Fluid should be an EnforceNotMet."; + } catch (const std::exception& e) { + LOG(FATAL) << "Unexpected exception is caught in thread pool. All " "throwable exceptions in Fluid should be EnforceNotMet."
+ << e.what(); } return nullptr; }); diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt index bdb147955ca0700dc0854b54c38d961caf8845f3..17ccca8cdcbcaabaddbbc0ca1d3ca4fdf054b0fb 100644 --- a/paddle/fluid/inference/CMakeLists.txt +++ b/paddle/fluid/inference/CMakeLists.txt @@ -5,7 +5,8 @@ cc_library(paddle_fluid_api DEPS ${FLUID_CORE_MODULES} ${GLOB_OP_LIB}) # Create static library -cc_library(paddle_fluid DEPS paddle_fluid_api ${FLUID_CORE_MODULES} ${GLOB_OP_LIB}) +get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES) +cc_library(paddle_fluid DEPS ${fluid_modules}) # Create shared library cc_library(paddle_fluid_shared SHARED diff --git a/paddle/fluid/inference/io.cc b/paddle/fluid/inference/io.cc index 80eb9889670744ae527ea29609b33631a021bfa8..52e9c0baa64508f82d0a86a88c8c5f8d23f9f7f2 100644 --- a/paddle/fluid/inference/io.cc +++ b/paddle/fluid/inference/io.cc @@ -22,14 +22,14 @@ namespace paddle { namespace inference { void ReadBinaryFile(const std::string& filename, std::string& contents) { - VLOG(3) << "loading model from " << filename; - std::ifstream inputfs(filename, std::ios::in | std::ios::binary); - inputfs.seekg(0, std::ios::end); + std::ifstream fin(filename, std::ios::in | std::ios::binary); + PADDLE_ENFORCE(static_cast(fin), "Cannot open file %s", filename); + fin.seekg(0, std::ios::end); contents.clear(); - contents.resize(inputfs.tellg()); - inputfs.seekg(0, std::ios::beg); - inputfs.read(&contents[0], contents.size()); - inputfs.close(); + contents.resize(fin.tellg()); + fin.seekg(0, std::ios::beg); + fin.read(&contents[0], contents.size()); + fin.close(); } bool IsPersistable(const framework::VarDesc* var) { @@ -97,6 +97,7 @@ std::unique_ptr Load(framework::Executor& executor, const std::string& dirname) { std::string model_filename = dirname + "/__model__"; std::string program_desc_str; + VLOG(3) << "loading model from " << model_filename; ReadBinaryFile(model_filename, program_desc_str); std::unique_ptr main_program( diff --git a/paddle/fluid/inference/tests/book/test_inference_image_classification.cc b/paddle/fluid/inference/tests/book/test_inference_image_classification.cc index d6fc51301bc45d3ce01142c0f015e4bb69e519bf..e9a27171f1cd68e7b10c860fb4a1417b930ed565 100644 --- a/paddle/fluid/inference/tests/book/test_inference_image_classification.cc +++ b/paddle/fluid/inference/tests/book/test_inference_image_classification.cc @@ -17,10 +17,13 @@ limitations under the License. */ #include "paddle/fluid/inference/tests/test_helper.h" DEFINE_string(dirname, "", "Directory of the inference model."); +DEFINE_int32(batch_size, 1, "Batch size of input data"); +DEFINE_int32(repeat, 1, "Running the inference program repeat times"); TEST(inference, image_classification) { - if (FLAGS_dirname.empty()) { - LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model"; + if (FLAGS_dirname.empty() || FLAGS_batch_size < 1 || FLAGS_repeat < 1) { + LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model " + "--batch_size=1 --repeat=1"; } LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl; @@ -29,13 +32,11 @@ TEST(inference, image_classification) { // 0. Call `paddle::framework::InitDevices()` initialize all the devices // In unittests, this is done in paddle/testing/paddle_gtest_main.cc - int64_t batch_size = 1; - paddle::framework::LoDTensor input; // Use normilized image pixels as input data, // which should be in the range [0.0, 1.0]. 
SetupTensor(input, - {batch_size, 3, 32, 32}, + {FLAGS_batch_size, 3, 32, 32}, static_cast(0), static_cast(1)); std::vector cpu_feeds; @@ -46,7 +47,9 @@ TEST(inference, image_classification) { cpu_fetchs1.push_back(&output1); // Run inference on CPU - TestInference(dirname, cpu_feeds, cpu_fetchs1); + LOG(INFO) << "--- CPU Runs: ---"; + TestInference( + dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat); LOG(INFO) << output1.dims(); #ifdef PADDLE_WITH_CUDA @@ -55,7 +58,9 @@ TEST(inference, image_classification) { cpu_fetchs2.push_back(&output2); // Run inference on CUDA GPU - TestInference(dirname, cpu_feeds, cpu_fetchs2); + LOG(INFO) << "--- GPU Runs: ---"; + TestInference( + dirname, cpu_feeds, cpu_fetchs2, FLAGS_repeat); LOG(INFO) << output2.dims(); CheckError(output1, output2); diff --git a/paddle/fluid/inference/tests/book/test_inference_recognize_digits.cc b/paddle/fluid/inference/tests/book/test_inference_recognize_digits.cc index 99bee94cb82633219df7a4a5c5bada15d2d3ce64..1fb0f9e77797cf6e61e918700763ee33a495cb96 100644 --- a/paddle/fluid/inference/tests/book/test_inference_recognize_digits.cc +++ b/paddle/fluid/inference/tests/book/test_inference_recognize_digits.cc @@ -17,10 +17,13 @@ limitations under the License. */ #include "paddle/fluid/inference/tests/test_helper.h" DEFINE_string(dirname, "", "Directory of the inference model."); +DEFINE_int32(batch_size, 1, "Batch size of input data"); +DEFINE_int32(repeat, 1, "Running the inference program repeat times"); TEST(inference, recognize_digits) { - if (FLAGS_dirname.empty()) { - LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model"; + if (FLAGS_dirname.empty() || FLAGS_batch_size < 1 || FLAGS_repeat < 1) { + LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model " + "--batch_size=1 --repeat=1"; } LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl; @@ -29,77 +32,39 @@ TEST(inference, recognize_digits) { // 0. Call `paddle::framework::InitDevices()` initialize all the devices // In unittests, this is done in paddle/testing/paddle_gtest_main.cc - int64_t batch_size = 1; - paddle::framework::LoDTensor input; // Use normilized image pixels as input data, // which should be in the range [-1.0, 1.0]. 
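The rewritten test below folds the old recognize_digits_combine duplicate into a loop over is_combined; what the loop actually switches between is the pair of paddle::inference::Load overloads exercised in the test_helper.h hunk later in this patch. In sketch form, with the paddle headers assumed and the file names hard-coded as in the unittest:

```cpp
#include <memory>
#include <string>
// assumes "paddle/fluid/inference/io.h" and its two Load overloads

std::unique_ptr<paddle::framework::ProgramDesc> LoadModel(
    paddle::framework::Executor& executor, paddle::framework::Scope& scope,
    const std::string& dirname, bool is_combined) {
  if (is_combined) {
    // Program and all parameters live in two well-known files.
    return paddle::inference::Load(executor, scope,
                                   dirname + "/__model_combined__",
                                   dirname + "/__params_combined__");
  }
  // Each parameter is a separate file inside `dirname`.
  return paddle::inference::Load(executor, scope, dirname);
}
```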
SetupTensor(input, - {batch_size, 1, 28, 28}, + {FLAGS_batch_size, 1, 28, 28}, static_cast(-1), static_cast(1)); std::vector cpu_feeds; cpu_feeds.push_back(&input); - paddle::framework::LoDTensor output1; - std::vector cpu_fetchs1; - cpu_fetchs1.push_back(&output1); + for (auto is_combined : {false, true}) { + paddle::framework::LoDTensor output1; + std::vector cpu_fetchs1; + cpu_fetchs1.push_back(&output1); - // Run inference on CPU - TestInference(dirname, cpu_feeds, cpu_fetchs1); - LOG(INFO) << output1.dims(); + // Run inference on CPU + LOG(INFO) << "--- CPU Runs: is_combined=" << is_combined << " ---"; + TestInference( + dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat, is_combined); + LOG(INFO) << output1.dims(); #ifdef PADDLE_WITH_CUDA - paddle::framework::LoDTensor output2; - std::vector cpu_fetchs2; - cpu_fetchs2.push_back(&output2); + paddle::framework::LoDTensor output2; + std::vector cpu_fetchs2; + cpu_fetchs2.push_back(&output2); - // Run inference on CUDA GPU - TestInference(dirname, cpu_feeds, cpu_fetchs2); - LOG(INFO) << output2.dims(); + // Run inference on CUDA GPU + LOG(INFO) << "--- GPU Runs: is_combined=" << is_combined << " ---"; + TestInference( + dirname, cpu_feeds, cpu_fetchs2, FLAGS_repeat, is_combined); + LOG(INFO) << output2.dims(); - CheckError(output1, output2); + CheckError(output1, output2); #endif -} - -TEST(inference, recognize_digits_combine) { - if (FLAGS_dirname.empty()) { - LOG(FATAL) << "Usage: ./example --dirname=path/to/your/model"; } - - LOG(INFO) << "FLAGS_dirname: " << FLAGS_dirname << std::endl; - std::string dirname = FLAGS_dirname; - - // 0. Call `paddle::framework::InitDevices()` initialize all the devices - // In unittests, this is done in paddle/testing/paddle_gtest_main.cc - - paddle::framework::LoDTensor input; - // Use normilized image pixels as input data, - // which should be in the range [-1.0, 1.0]. - SetupTensor( - input, {1, 1, 28, 28}, static_cast(-1), static_cast(1)); - std::vector cpu_feeds; - cpu_feeds.push_back(&input); - - paddle::framework::LoDTensor output1; - std::vector cpu_fetchs1; - cpu_fetchs1.push_back(&output1); - - // Run inference on CPU - TestInference( - dirname, cpu_feeds, cpu_fetchs1); - LOG(INFO) << output1.dims(); - -#ifdef PADDLE_WITH_CUDA - paddle::framework::LoDTensor output2; - std::vector cpu_fetchs2; - cpu_fetchs2.push_back(&output2); - - // Run inference on CUDA GPU - TestInference( - dirname, cpu_feeds, cpu_fetchs2); - LOG(INFO) << output2.dims(); - - CheckError(output1, output2); -#endif } diff --git a/paddle/fluid/inference/tests/test_helper.h b/paddle/fluid/inference/tests/test_helper.h index 49518e50d8541477234f17ac5b8709aeb57662ff..0f5fe6d0aa9a5522c67a3c06f8677f1f2f259eb3 100644 --- a/paddle/fluid/inference/tests/test_helper.h +++ b/paddle/fluid/inference/tests/test_helper.h @@ -15,6 +15,7 @@ limitations under the License. */ #include #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/inference/io.h" +#include "paddle/fluid/platform/profiler.h" template void SetupTensor(paddle::framework::LoDTensor& input, @@ -87,31 +88,60 @@ void CheckError(paddle::framework::LoDTensor& output1, EXPECT_EQ(count, 0U) << "There are " << count << " different elements."; } -template +template void TestInference(const std::string& dirname, const std::vector& cpu_feeds, - std::vector& cpu_fetchs) { + std::vector& cpu_fetchs, + const int repeat = 1, + const bool is_combined = false) { // 1. 
Define place, executor, scope auto place = Place(); auto executor = paddle::framework::Executor(place); auto* scope = new paddle::framework::Scope(); + // Profile the performance + paddle::platform::ProfilerState state; + if (paddle::platform::is_cpu_place(place)) { + state = paddle::platform::ProfilerState::kCPU; + } else { +#ifdef PADDLE_WITH_CUDA + state = paddle::platform::ProfilerState::kCUDA; + // The default device_id of paddle::platform::CUDAPlace is 0. + // Users can get the device_id using: + // int device_id = place.GetDeviceId(); + paddle::platform::SetDeviceId(0); +#else + PADDLE_THROW("'CUDAPlace' is not supported in CPU only device."); +#endif + } + + // Enable the profiler + paddle::platform::EnableProfiler(state); + // 2. Initialize the inference_program and load parameters std::unique_ptr inference_program; - if (IsCombined) { - // All parameters are saved in a single file. - // Hard-coding the file names of program and parameters in unittest. - // The file names should be consistent with that used in Python API - // `fluid.io.save_inference_model`. - std::string prog_filename = "__model_combined__"; - std::string param_filename = "__params_combined__"; - inference_program = paddle::inference::Load(executor, - *scope, - dirname + "/" + prog_filename, - dirname + "/" + param_filename); - } else { - // Parameters are saved in separate files sited in the specified `dirname`. - inference_program = paddle::inference::Load(executor, *scope, dirname); + { + paddle::platform::RecordEvent record_event( + "init_program", + paddle::platform::DeviceContextPool::Instance().Get(place)); + + if (is_combined) { + // All parameters are saved in a single file. + // Hard-coding the file names of program and parameters in unittest. + // The file names should be consistent with that used in Python API + // `fluid.io.save_inference_model`. + std::string prog_filename = "__model_combined__"; + std::string param_filename = "__params_combined__"; + inference_program = + paddle::inference::Load(executor, + *scope, + dirname + "/" + prog_filename, + dirname + "/" + param_filename); + } else { + // Parameters are saved in separate files sited in the specified + // `dirname`. + inference_program = paddle::inference::Load(executor, *scope, dirname); + } } // 3. Get the feed_target_names and fetch_target_names @@ -134,7 +164,21 @@ void TestInference(const std::string& dirname, } // 6. 
Run the inference program - executor.Run(*inference_program, scope, feed_targets, fetch_targets); + { + // Run repeat times to profile the performance + for (int i = 0; i < repeat; ++i) { + paddle::platform::RecordEvent record_event( + "run_inference", + paddle::platform::DeviceContextPool::Instance().Get(place)); + + executor.Run(*inference_program, scope, feed_targets, fetch_targets); + } + } + + // Disable the profiler and print the timing information + paddle::platform::DisableProfiler(paddle::platform::EventSortingKey::kDefault, + "profiler.txt"); + paddle::platform::ResetProfiler(); delete scope; } diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index e1c02ec1613a7c31258f2d305b040ea627cebcdd..5d436a7e0c3752c889c19820507589f34d3bee94 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -1,5 +1,7 @@ file(GLOB GENERAL_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*_op.cc") +string(REPLACE "_mkldnn" "" GENERAL_OPS "${GENERAL_OPS}") string(REPLACE ".cc" "" GENERAL_OPS "${GENERAL_OPS}") +list(REMOVE_DUPLICATES GENERAL_OPS) set(DEPS_OPS "") set(pybind_file ${PADDLE_SOURCE_DIR}/paddle/fluid/pybind/pybind.h) file(WRITE ${pybind_file} "// Generated by the paddle/operator/CMakeLists.txt. DO NOT EDIT!\n\n") @@ -13,6 +15,8 @@ function(op_library TARGET) set(cu_cc_srcs) set(cudnn_cu_cc_srcs) set(CUDNN_FILE) + set(mkldnn_cc_srcs) + set(MKLDNN_FILE) set(op_common_deps operator op_registry math_function) set(options "") set(oneValueArgs "") @@ -36,12 +40,20 @@ function(op_library TARGET) if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${CUDNN_FILE}.cu.cc) list(APPEND cudnn_cu_cc_srcs ${CUDNN_FILE}.cu.cc) endif() + if(WITH_MKLDNN) + string(REPLACE "_op" "_mkldnn_op" MKLDNN_FILE "${TARGET}") + if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${MKLDNN_FILE}.cc) + list(APPEND mkldnn_cc_srcs ${MKLDNN_FILE}.cc) + endif() + endif() else() foreach(src ${op_library_SRCS}) if (${src} MATCHES ".*\\.cu$") list(APPEND cu_srcs ${src}) elseif(${src} MATCHES ".*_cudnn_op.cu.cc$") list(APPEND cudnn_cu_cc_srcs ${src}) + elseif(WITH_MKLDNN AND ${src} MATCHES ".*_mkldnn_op.cc$") + list(APPEND mkldnn_cc_srcs ${src}) elseif(${src} MATCHES ".*\\.cu.cc$") list(APPEND cu_cc_srcs ${src}) elseif(${src} MATCHES ".*\\.cc$") @@ -62,15 +74,15 @@ function(op_library TARGET) set(DEPS_OPS ${TARGET} ${DEPS_OPS} PARENT_SCOPE) endif() if (WITH_GPU) - nv_library(${TARGET} SRCS ${cc_srcs} ${cu_cc_srcs} ${cudnn_cu_cc_srcs} ${cu_srcs} DEPS ${op_library_DEPS} + nv_library(${TARGET} SRCS ${cc_srcs} ${cu_cc_srcs} ${cudnn_cu_cc_srcs} ${mkldnn_cc_srcs} ${cu_srcs} DEPS ${op_library_DEPS} ${op_common_deps}) else() - cc_library(${TARGET} SRCS ${cc_srcs} DEPS ${op_library_DEPS} - ${op_common_deps}) + cc_library(${TARGET} SRCS ${cc_srcs} ${mkldnn_cc_srcs} DEPS ${op_library_DEPS} + ${op_common_deps}) endif() # Define operators that don't need pybind here. 
- foreach(manual_pybind_op "net_op" "compare_op" "logical_op" "nccl_op" "tensor_array_read_write_op" "create_reader_op") + foreach(manual_pybind_op "net_op" "compare_op" "logical_op" "nccl_op" "tensor_array_read_write_op") if ("${TARGET}" STREQUAL "${manual_pybind_op}") set(pybind_flag 1) endif() @@ -101,7 +113,8 @@ function(op_library TARGET) # pybind USE_CPU_ONLY_OP list(LENGTH cu_srcs cu_srcs_len) list(LENGTH cu_cc_srcs cu_cc_srcs_len) - if (${pybind_flag} EQUAL 0 AND ${cu_srcs_len} EQUAL 0 AND ${cu_cc_srcs_len} EQUAL 0) + list(LENGTH mkldnn_cc_srcs mkldnn_cc_srcs_len) + if (${pybind_flag} EQUAL 0 AND ${mkldnn_cc_srcs_len} EQUAL 0 AND ${cu_srcs_len} EQUAL 0 AND ${cu_cc_srcs_len} EQUAL 0) file(APPEND ${pybind_file} "USE_CPU_ONLY_OP(${TARGET});\n") set(pybind_flag 1) endif() @@ -112,6 +125,11 @@ function(op_library TARGET) file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, CUDNN);\n") endif() + # pybind USE_OP_DEVICE_KERNEL for MKLDNN + if (WITH_MKLDNN AND ${mkldnn_cc_srcs_len} GREATER 0) + file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, MKLDNN);\n") + endif() + # pybind USE_OP if (${pybind_flag} EQUAL 0) file(APPEND ${pybind_file} "USE_OP(${TARGET});\n") @@ -128,8 +146,8 @@ else() set(DEPS_OPS ${DEPS_OPS} nccl_op) endif() +add_subdirectory(detail) if(WITH_DISTRIBUTE) - add_subdirectory(detail) set(DISTRIBUTE_DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf) set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") op_library(send_op DEPS ${DISTRIBUTE_DEPS}) @@ -170,26 +188,31 @@ op_library(recurrent_op DEPS executor) op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale) op_library(cos_sim_op DEPS cos_sim_functor) op_library(parallel_do_op DEPS executor) -op_library(create_reader_op DEPS reader) if (WITH_GPU) - op_library(conv_op DEPS vol2col depthwise_conv) + op_library(conv_op DEPS vol2col depthwise_conv im2col) else() - op_library(conv_op DEPS vol2col) + op_library(conv_op DEPS vol2col im2col) endif() -op_library(conv_transpose_op DEPS vol2col) +op_library(conv_transpose_op DEPS vol2col im2col) # FIXME(typhoonzero): save/load depends lodtensor serialization functions op_library(save_op DEPS lod_tensor) op_library(load_op DEPS lod_tensor) op_library(save_combine_op DEPS lod_tensor) op_library(load_combine_op DEPS lod_tensor) +op_library(concat_op DEPS concat) list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS}) foreach(src ${GENERAL_OPS}) op_library(${src}) endforeach() -file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(logical_and);\nUSE_NO_KERNEL_OP(read_from_array);\nUSE_NO_KERNEL_OP(create_random_data_generator);\n") +file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(logical_and);\nUSE_NO_KERNEL_OP(read_from_array);\n") + +add_subdirectory(reader) +foreach(src ${READER_LIBRARY}) + set(OP_LIBRARY ${src} ${OP_LIBRARY}) +endforeach() set(GLOB_OP_LIB ${OP_LIBRARY} CACHE INTERNAL "Global OP library") diff --git a/paddle/fluid/operators/cast_op.cc b/paddle/fluid/operators/cast_op.cc index a5ec47d84fe423f77de494e342931a6d278049d2..72f8cb04f2de3af4ee526c3d9b86ff96e34f0b0a 100644 --- a/paddle/fluid/operators/cast_op.cc +++ b/paddle/fluid/operators/cast_op.cc @@ -63,13 +63,27 @@ class CastOpGradMaker : public framework::SingleGradOpDescMaker { } }; +class CastOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext 
&ctx) const override { + framework::OpKernelType kt = OperatorWithKernel::GetExpectedKernelType(ctx); + // CastOp kernel's device type is decided by input tensor place + kt.place_ = ctx.Input("X")->place(); + return kt; + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; using CPU = paddle::platform::CPUDeviceContext; -REGISTER_OP_WITH_KERNEL(cast, ops::CastOpGradMaker, ops::CastOpInferShape, - ops::CastOpProtoMaker); +REGISTER_OPERATOR(cast, ops::CastOp, ops::CastOpGradMaker, + ops::CastOpInferShape, ops::CastOpProtoMaker); REGISTER_OP_CPU_KERNEL(cast, ops::CastOpKernel, ops::CastOpKernel, ops::CastOpKernel, diff --git a/paddle/fluid/operators/concat_op.cc b/paddle/fluid/operators/concat_op.cc index bdce8f0a6f5f817d2010e8b4de28a7d1be4cc08b..0eedd8ee51ebfff6f553d8e19e97c3a45a95fa6a 100644 --- a/paddle/fluid/operators/concat_op.cc +++ b/paddle/fluid/operators/concat_op.cc @@ -100,7 +100,8 @@ class ConcatOpGrad : public framework::OperatorWithKernel { namespace ops = paddle::operators; REGISTER_OP_EX(concat, ops::ConcatOp, ops::ConcatOpMaker, concat_grad, ops::ConcatOpGrad, false) -REGISTER_OP_CPU_KERNEL(concat, - ops::ConcatKernel) -REGISTER_OP_CPU_KERNEL(concat_grad, - ops::ConcatGradKernel) +REGISTER_OP_CPU_KERNEL( + concat, ops::ConcatKernel) +REGISTER_OP_CPU_KERNEL( + concat_grad, + ops::ConcatGradKernel) diff --git a/paddle/fluid/operators/concat_op.h b/paddle/fluid/operators/concat_op.h index 208a4481c6afe1b8f62e8f675c951c3349639f46..92c8ab6d9ff11ec6acd46a39877eb67d624748a9 100644 --- a/paddle/fluid/operators/concat_op.h +++ b/paddle/fluid/operators/concat_op.h @@ -17,6 +17,7 @@ limitations under the License. */ #include #include #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/concat.h" #include "paddle/fluid/operators/strided_memcpy.h" namespace paddle { @@ -27,54 +28,30 @@ class ConcatKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { auto ins = ctx.MultiInput("X"); - auto* out = ctx.Output("Out"); + framework::Tensor* out = ctx.Output("Out"); int64_t axis = static_cast(ctx.Attr("axis")); auto place = ctx.GetPlace(); out->mutable_data(place); - auto out_stride = framework::stride_numel(out->dims()); - - size_t output_offset = 0; - - // If axis >=1, copy to out immediately need to call many times - // of cuda memcpy. Copy the input to cpu and do the stride copy, - // then copy to gpu output. 
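
A note on the CastOp change above: the operator keeps the default kernel-type deduction and only overrides the place, so the cast kernel runs wherever its input tensor already lives instead of forcing a transfer to the executor's place. A minimal sketch of the pattern for any OperatorWithKernel subclass (a sketch, assuming the op's input "X" is a LoDTensor):

framework::OpKernelType GetExpectedKernelType(
    const framework::ExecutionContext& ctx) const override {
  // Start from the default deduction of data type / layout / library.
  framework::OpKernelType kt = OperatorWithKernel::GetExpectedKernelType(ctx);
  // Pin the kernel to the input's place so no implicit device copy is
  // needed before the cast runs.
  kt.place_ = ctx.Input<framework::LoDTensor>("X")->place();
  return kt;
}
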
- - if (platform::is_gpu_place(place) && axis >= 1) { - platform::CPUPlace copy_place; - auto& cpu_ctx = *platform::DeviceContextPool::Instance().Get(copy_place); - framework::Tensor cpu_out; - cpu_out.Resize(out->dims()); - cpu_out.mutable_data(copy_place); - auto& dev_ctx = ctx.device_context(); - std::vector> cpu_ins; - for (auto* in : ins) { - std::unique_ptr cpu_in(new framework::Tensor); - framework::TensorCopy(*in, copy_place, dev_ctx, cpu_in.get()); - cpu_ins.emplace_back(std::move(cpu_in)); - } - // TODO(dzhwinter): overlap copy and compute stream - // https://devblogs.nvidia.com/how-overlap-data-transfers-cuda-cc/ - dev_ctx.Wait(); - - for (auto& in : cpu_ins) { - auto& cpu_in = *in.get(); - auto in_stride = framework::stride_numel(cpu_in.dims()); - - StridedNumelCopyWithAxis( - cpu_ctx, axis, cpu_out.data() + output_offset, out_stride, - cpu_in.data(), in_stride, in_stride[axis]); - output_offset += in_stride[axis]; - } - framework::TensorCopy(cpu_out, place, dev_ctx, out); - } else { + // Sometimes direct copies will be faster, this maybe need deeply analysis. + if (axis == 0 && ins.size() < 10) { + size_t output_offset = 0; for (auto* in : ins) { auto in_stride = framework::stride_numel(in->dims()); + auto out_stride = framework::stride_numel(out->dims()); StridedNumelCopyWithAxis(ctx.device_context(), axis, out->data() + output_offset, out_stride, in->data(), in_stride, in_stride[axis]); output_offset += in_stride[axis]; } + } else { + std::vector inputs(ins.size()); + for (size_t j = 0; j < ins.size(); ++j) { + inputs[j] = *ins[j]; + } + auto& dev_ctx = ctx.template device_context(); + paddle::operators::math::ConcatFunctor concat_functor; + concat_functor(dev_ctx, inputs, static_cast(axis), out); } } }; @@ -86,16 +63,31 @@ class ConcatGradKernel : public framework::OpKernel { auto* in = ctx.Input(framework::GradVarName("Out")); auto outs = ctx.MultiOutput(framework::GradVarName("X")); int64_t axis = static_cast(ctx.Attr("axis")); - size_t input_offset = 0; - auto in_stride = framework::stride_numel(in->dims()); - for (auto& out : outs) { - out->mutable_data(ctx.GetPlace()); - auto out_stride = framework::stride_numel(out->dims()); - StridedNumelCopyWithAxis(ctx.device_context(), axis, out->data(), - out_stride, in->data() + input_offset, - in_stride, out_stride[axis]); - input_offset += out_stride[axis]; + // Sometimes direct copies will be faster, this maybe need deeply analysis. + if (axis == 0 && outs.size() < 10) { + size_t input_offset = 0; + auto in_stride = framework::stride_numel(in->dims()); + + for (auto& out : outs) { + out->mutable_data(ctx.GetPlace()); + auto out_stride = framework::stride_numel(out->dims()); + StridedNumelCopyWithAxis(ctx.device_context(), axis, out->data(), + out_stride, in->data() + input_offset, + in_stride, out_stride[axis]); + input_offset += out_stride[axis]; + } + } else { + std::vector outputs(outs.size()); + for (size_t j = 0; j < outs.size(); ++j) { + outs[j]->mutable_data(ctx.GetPlace()); + outputs[j] = *outs[j]; + } + + auto& dev_ctx = ctx.template device_context(); + paddle::operators::math::ConcatGradFunctor + concat_grad_functor; + concat_grad_functor(dev_ctx, *in, static_cast(axis), outputs); } } }; diff --git a/paddle/fluid/operators/conv_mkldnn_op.cc b/paddle/fluid/operators/conv_mkldnn_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..0a8a5d4c71c4510f04eea2f7ef12f836d1fd9c9b --- /dev/null +++ b/paddle/fluid/operators/conv_mkldnn_op.cc @@ -0,0 +1,276 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. 
All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include "paddle/fluid/operators/conv_op.h" +#include "paddle/fluid/platform/mkldnn_helper.h" + +namespace paddle { +namespace operators { + +template +class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { + public: + void Compute(const paddle::framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()), + "It must use CPUPlace."); + + auto& dev_ctx = + ctx.template device_context(); + const auto& mkldnn_engine = dev_ctx.GetEngine(); + + auto* input = ctx.Input("Input"); + auto* filter = ctx.Input("Filter"); + auto* output = ctx.Output("Output"); + + // Get an unique name from "argument" name of "Output" variable + // This name will be used as key when saving info into device context + const std::string key = ctx.op().Output("Output"); + const std::string key_conv_pd = key + "@conv_pd"; + + std::vector strides = ctx.Attr>("strides"); + std::vector paddings = ctx.Attr>("paddings"); + std::vector dilations = ctx.Attr>("dilations"); + int groups = ctx.Attr("groups"); + + // TODO(pzelazko-intel) add support for group convolution and dilation + PADDLE_ENFORCE(groups == 1, "group convolution is not implemented yet"); + PADDLE_ENFORCE( + dilations.size() == 2 && dilations[0] == 1 && dilations[1] == 1, + "dilation in convolution is not implemented yet"); + + const T* input_data = input->data(); + const T* filter_data = filter->data(); + T* output_data = output->mutable_data(ctx.GetPlace()); + + PADDLE_ENFORCE(input->dims().size() == 4, + "Input must be with 4 dimensions, i.e. NCHW"); + PADDLE_ENFORCE(filter->dims().size() == 4, + "Filter must be with 4 dimensions, i.e. 
OIHW"); + + std::vector src_tz = paddle::framework::vectorize2int(input->dims()); + std::vector weights_tz = + paddle::framework::vectorize2int(filter->dims()); + std::vector dst_tz = paddle::framework::vectorize2int(output->dims()); + + // TODO(pzelazko-intel): support more formats + auto src_md = platform::MKLDNNMemDesc( + src_tz, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw); + auto weights_md = + platform::MKLDNNMemDesc(weights_tz, mkldnn::memory::data_type::f32, + mkldnn::memory::format::oihw); + auto dst_md = platform::MKLDNNMemDesc( + dst_tz, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw); + + auto src_memory = + mkldnn::memory({src_md, mkldnn_engine}, (void*)input_data); + auto weights_memory = + mkldnn::memory({weights_md, mkldnn_engine}, (void*)filter_data); + auto dst_memory = mkldnn::memory({dst_md, mkldnn_engine}, output_data); + + std::shared_ptr conv_pd = + ConvFwdPrimitiveDesc(src_md, weights_md, dst_md, strides, paddings, + mkldnn_engine); + + // save conv_pd into global device context to be referred in backward path + dev_ctx.SetBlob(key_conv_pd, conv_pd); + + // create convolution op primitive + auto conv_prim = mkldnn::convolution_forward(*conv_pd, src_memory, + weights_memory, dst_memory); + + // push primitive to stream and wait until it's executed + std::vector pipeline{conv_prim}; + mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); + } + + private: + std::unique_ptr + ConvFwdPrimitiveDesc(const mkldnn::memory::desc& src, + const mkldnn::memory::desc& weights, + const mkldnn::memory::desc& dst, + const std::vector& strides, + const std::vector& paddings, + const mkldnn::engine& engine) const { + mkldnn::memory::dims stride_dims = {strides[0], strides[1]}; + mkldnn::memory::dims padding_dims = {paddings[0], paddings[1]}; + + auto conv_desc = mkldnn::convolution_forward::desc( + mkldnn::prop_kind::forward, mkldnn::convolution_direct, src, weights, + dst, stride_dims, padding_dims, padding_dims, + mkldnn::padding_kind::zero); + + auto p_conv_pd = + new mkldnn::convolution_forward::primitive_desc(conv_desc, engine); + + return std::unique_ptr( + p_conv_pd); + } +}; + +template +class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel { + public: + void Compute(const paddle::framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()), + "It must use CPUPlace."); + + auto& dev_ctx = + ctx.template device_context(); + const auto& mkldnn_engine = dev_ctx.GetEngine(); + + const Tensor* input = ctx.Input("Input"); + const Tensor* filter = ctx.Input("Filter"); + const Tensor* output = ctx.Input("Output"); + const Tensor* output_grad = + ctx.Input(framework::GradVarName("Output")); + Tensor* input_grad = ctx.Output(framework::GradVarName("Input")); + Tensor* filter_grad = ctx.Output(framework::GradVarName("Filter")); + + if (!input_grad && !filter_grad) return; + + // Get an unique name from "argument" name of "Output" variable + // This name will be used as key when saving info into device context + const std::string key = ctx.op().Input("Output"); + const std::string key_conv_pd = key + "@conv_pd"; + + std::vector strides = ctx.Attr>("strides"); + std::vector paddings = ctx.Attr>("paddings"); + + const T* input_data = input->data(); + const T* filter_data = filter->data(); + const T* output_grad_data = output_grad->data(); + T* input_grad_data = nullptr; + T* filter_grad_data = nullptr; + + if (input_grad) { + input_grad_data = 
input_grad->mutable_data(ctx.GetPlace()); + } + if (filter_grad) { + filter_grad_data = filter_grad->mutable_data(ctx.GetPlace()); + } + + std::vector src_tz = paddle::framework::vectorize2int(input->dims()); + std::vector weights_tz = + paddle::framework::vectorize2int(filter->dims()); + std::vector dst_tz = paddle::framework::vectorize2int(output->dims()); + + // TODO(pzelazko-intel): support more formats + auto src_md = platform::MKLDNNMemDesc( + src_tz, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw); + auto diff_src_md = platform::MKLDNNMemDesc( + src_tz, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw); + auto weights_md = + platform::MKLDNNMemDesc(weights_tz, mkldnn::memory::data_type::f32, + mkldnn::memory::format::oihw); + auto diff_weights_md = + platform::MKLDNNMemDesc(weights_tz, mkldnn::memory::data_type::f32, + mkldnn::memory::format::oihw); + auto diff_dst_md = platform::MKLDNNMemDesc( + dst_tz, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw); + + // create memory + auto diff_dst_memory = mkldnn::memory({diff_weights_md, mkldnn_engine}, + (void*)output_grad_data); + // Retrieve conv_pd from device context + auto conv_pd = + std::static_pointer_cast( + dev_ctx.GetBlob(key_conv_pd)); + PADDLE_ENFORCE(conv_pd != nullptr, + "Fail to find conv_pd in device context"); + + // create backward conv primitive for weights + if (filter_grad) { + // create primitive descriptor + mkldnn::convolution_backward_weights::primitive_desc conv_bwd_weights_pd = + ConvBwdWeightsPrimitiveDesc(src_md, diff_weights_md, diff_dst_md, + strides, paddings, *conv_pd, + mkldnn_engine); + + // create memory + auto diff_weights_memory = mkldnn::memory( + {diff_weights_md, mkldnn_engine}, (void*)filter_grad_data); + auto src_memory = + mkldnn::memory({src_md, mkldnn_engine}, (void*)input_data); + + // create backward conv primitive for weights + auto conv_bwd_weights_prim = mkldnn::convolution_backward_weights( + conv_bwd_weights_pd, src_memory, diff_dst_memory, + diff_weights_memory); + + // push primitive and execute it + std::vector pipeline{conv_bwd_weights_prim}; + mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); + } + + if (input_grad) { + // create primitive descriptor + mkldnn::convolution_backward_data::primitive_desc conv_bwd_data_pd = + ConvBwdDataPrimitiveDesc(diff_src_md, weights_md, diff_dst_md, + strides, paddings, *conv_pd, mkldnn_engine); + + // create memory + auto diff_src_memory = + mkldnn::memory({diff_src_md, mkldnn_engine}, (void*)input_grad_data); + auto weights_memory = + mkldnn::memory({weights_md, mkldnn_engine}, (void*)filter_data); + + // create backward conv primitive for data + auto conv_bwd_data_prim = mkldnn::convolution_backward_data( + conv_bwd_data_pd, diff_dst_memory, weights_memory, diff_src_memory); + + // push primitive to stream and wait until it's executed + std::vector pipeline{conv_bwd_data_prim}; + mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); + } + } // Compute() + + private: + mkldnn::convolution_backward_weights::primitive_desc + ConvBwdWeightsPrimitiveDesc( + const mkldnn::memory::desc& src, const mkldnn::memory::desc& diff_weights, + const mkldnn::memory::desc& diff_dst, const std::vector& strides, + const std::vector& paddings, + const mkldnn::convolution_forward::primitive_desc& conv_pd, + const mkldnn::engine& engine) const { + auto conv_bwd_weights_desc = mkldnn::convolution_backward_weights::desc( + mkldnn::convolution_direct, src, diff_weights, diff_dst, strides, + 
paddings, paddings, mkldnn::padding_kind::zero); + return mkldnn::convolution_backward_weights::primitive_desc( + conv_bwd_weights_desc, engine, conv_pd); + } + + mkldnn::convolution_backward_data::primitive_desc ConvBwdDataPrimitiveDesc( + const mkldnn::memory::desc& diff_src, const mkldnn::memory::desc& weights, + const mkldnn::memory::desc& diff_dst, const std::vector& strides, + const std::vector& paddings, + const mkldnn::convolution_forward::primitive_desc& conv_pd, + const mkldnn::engine& engine) const { + auto conv_bwd_data_desc = mkldnn::convolution_backward_data::desc( + mkldnn::convolution_direct, diff_src, weights, diff_dst, strides, + paddings, paddings, mkldnn::padding_kind::zero); + return mkldnn::convolution_backward_data::primitive_desc(conv_bwd_data_desc, + engine, conv_pd); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; + +REGISTER_OP_KERNEL(conv2d, MKLDNN, ::paddle::platform::CPUPlace, + ops::ConvMKLDNNOpKernel); + +REGISTER_OP_KERNEL(conv2d_grad, MKLDNN, ::paddle::platform::CPUPlace, + ops::ConvMKLDNNGradOpKernel); diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc index 83b7708bf337b70f97c5e9126efd142b9b957b00..4b02b80d7772fa15d2333692551da5e59d93765f 100644 --- a/paddle/fluid/operators/conv_op.cc +++ b/paddle/fluid/operators/conv_op.cc @@ -13,6 +13,12 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/conv_op.h" +#ifdef PADDLE_WITH_CUDA +#include "paddle/fluid/platform/cudnn_helper.h" +#endif +#ifdef PADDLE_WITH_MKLDNN +#include "paddle/fluid/platform/mkldnn_helper.h" +#endif namespace paddle { namespace operators { @@ -64,22 +70,21 @@ void ConvOp::InferShape(framework::InferShapeContext* ctx) const { framework::OpKernelType ConvOp::GetExpectedKernelType( const framework::ExecutionContext& ctx) const { - bool use_cudnn = ctx.Attr("use_cudnn"); - use_cudnn &= platform::is_gpu_place(ctx.GetPlace()); + framework::LibraryType library_{framework::LibraryType::kPlain}; #ifdef PADDLE_WITH_CUDA - if (platform::is_gpu_place(ctx.GetPlace())) { - auto& dev_ctx = ctx.template device_context(); - use_cudnn &= dev_ctx.cudnn_handle() != nullptr; + if (platform::CanCUDNNBeUsed(ctx)) { + library_ = framework::LibraryType::kCUDNN; } #endif - framework::LibraryType library_; - if (use_cudnn) { - library_ = framework::LibraryType::kCUDNN; - } else { - library_ = framework::LibraryType::kPlain; +#ifdef PADDLE_WITH_MKLDNN + if (library_ == framework::LibraryType::kPlain && + platform::CanMKLDNNBeUsed(ctx)) { + library_ = framework::LibraryType::kMKLDNN; } +#endif std::string data_format = ctx.Attr("data_format"); + // TODO(pzelazko-intel): enable MKLDNN layout when it's ready framework::DataLayout layout_ = framework::StringToDataLayout(data_format); return framework::OpKernelType( framework::ToDataType(ctx.Input("Input")->type()), ctx.GetPlace(), @@ -131,6 +136,9 @@ Conv2DOpMaker::Conv2DOpMaker(OpProto* proto, OpAttrChecker* op_checker) "use_cudnn", "(bool, default false) Only used in cudnn kernel, need install cudnn") .SetDefault(false); + AddAttr("use_mkldnn", + "(bool, default false) Only used in mkldnn kernel") + .SetDefault(false); AddAttr( "data_format", "(string, default NCHW) Only used in " @@ -224,6 +232,9 @@ Conv3DOpMaker::Conv3DOpMaker(OpProto* proto, OpAttrChecker* op_checker) "use_cudnn", "(bool, default false) Only used in cudnn kernel, need install cudnn") .SetDefault(false); + AddAttr("use_mkldnn", + 
"(bool, default false) Only used in mkldnn kernel") + .SetDefault(false); AddAttr( "data_format", "(string, default NCHW) Only used in " @@ -284,23 +295,21 @@ void ConvOpGrad::InferShape(framework::InferShapeContext* ctx) const { framework::OpKernelType ConvOpGrad::GetExpectedKernelType( const framework::ExecutionContext& ctx) const { - bool use_cudnn = ctx.Attr("use_cudnn"); - use_cudnn &= platform::is_gpu_place(ctx.GetPlace()); + framework::LibraryType library_{framework::LibraryType::kPlain}; #ifdef PADDLE_WITH_CUDA - if (platform::is_gpu_place(ctx.GetPlace())) { - auto& dev_ctx = ctx.template device_context(); - use_cudnn &= dev_ctx.cudnn_handle() != nullptr; + if (platform::CanCUDNNBeUsed(ctx)) { + library_ = framework::LibraryType::kCUDNN; } #endif - - framework::LibraryType library_; - if (use_cudnn) { - library_ = framework::LibraryType::kCUDNN; - } else { - library_ = framework::LibraryType::kPlain; +#ifdef PADDLE_WITH_MKLDNN + if (library_ == framework::LibraryType::kPlain && + platform::CanMKLDNNBeUsed(ctx)) { + library_ = framework::LibraryType::kMKLDNN; } +#endif std::string data_format = ctx.Attr("data_format"); + // TODO(pzelazko-intel): enable MKLDNN layout when it's ready framework::DataLayout layout_ = framework::StringToDataLayout(data_format); return framework::OpKernelType( framework::ToDataType(ctx.Input("Input")->type()), ctx.GetPlace(), diff --git a/paddle/fluid/operators/create_reader_op.cc b/paddle/fluid/operators/create_reader_op.cc deleted file mode 100644 index 17ed7e24eca9ae923ef71cca7d4c0005eda271f1..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/create_reader_op.cc +++ /dev/null @@ -1,246 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/reader.h" - -namespace paddle { -namespace operators { - -static std::vector RestoreShapes( - const std::vector& shape_concat, const std::vector& ranks) { - std::vector res; - int offset = 0; - for (int len : ranks) { - auto start_it = shape_concat.begin() + offset; - auto end_it = start_it + len; - res.push_back(framework::make_ddim(std::vector(start_it, end_it))); - offset += len; - } - return res; -} - -// general infershape for file readers -class CreateFileReaderInferShape : public framework::InferShapeBase { - public: - void operator()(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE(ctx->HasOutput("Out"), - "The output file reader should not be null."); - const auto shape_concat = - ctx->Attrs().Get>("shape_concat"); - const auto ranks = ctx->Attrs().Get>("ranks"); - std::vector shapes = RestoreShapes(shape_concat, ranks); - ctx->SetReaderDims("Out", shapes); - - if (ctx->IsRuntime()) { - const auto lod_levels = ctx->Attrs().Get>("lod_levels"); - PADDLE_ENFORCE_EQ( - lod_levels.size(), shapes.size(), - "The number of 'lod_levels'(%d) doesn't match the number " - "of 'shapes'(%d).", - lod_levels.size(), shapes.size()); - framework::VarDesc* reader = - boost::get(ctx->GetOutputVarPtrs("Out")[0]); - reader->SetLoDLevels(lod_levels); - } - } -}; - -// general infershape for decorated readers -class CreateDecoratedReaderInferShape : public framework::InferShapeBase { - public: - void operator()(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE(ctx->HasInput("UnderlyingReader"), - "Input(UnderlyingReader) should not be null."); - PADDLE_ENFORCE(ctx->HasOutput("Out"), - "The output decorated reader should not be null."); - ctx->SetReaderDims("Out", ctx->GetReaderDims("UnderlyingReader")); - - if (ctx->IsRuntime()) { - framework::VarDesc* in_reader = boost::get( - ctx->GetInputVarPtrs("UnderlyingReader")[0]); - framework::VarDesc* out_reader = - boost::get(ctx->GetOutputVarPtrs("Out")[0]); - out_reader->SetLoDLevels(in_reader->GetLoDLevels()); - } - } -}; - -// general var type inference for file readers -class CreateFileReaderInferVarType : public framework::VarTypeInference { - public: - void operator()(const framework::OpDesc& op_desc, - framework::BlockDesc* block) const override { - std::string reader_name = op_desc.Output("Out")[0]; - framework::VarDesc* reader = block->FindVarRecursive(reader_name); - reader->SetType(framework::proto::VarType::READER); - } -}; - -// general var type inference for decorated readers -class CreateDecoratedReaderInferVarType : public framework::VarTypeInference { - public: - void operator()(const framework::OpDesc& op_desc, - framework::BlockDesc* block) const override { - std::string in_reader_name = op_desc.Input("UnderlyingReader")[0]; - framework::VarDesc* in_reader = block->FindVarRecursive(in_reader_name); - std::string out_reader_name = op_desc.Output("Out")[0]; - framework::VarDesc* out_reader = block->FindVarRecursive(out_reader_name); - out_reader->SetType(framework::proto::VarType::READER); - out_reader->SetDataTypes(in_reader->GetDataTypes()); - } -}; - -template -class CreateRandomDataGeneratorOp : public framework::OperatorBase { - public: - using framework::OperatorBase::OperatorBase; - - private: - void RunImpl(const framework::Scope& scope, - const platform::Place& dev_place) const override { - const auto& shape_concat = Attr>("shape_concat"); - const auto& ranks = Attr>("ranks"); - 
PADDLE_ENFORCE(!shape_concat.empty() && !ranks.empty()); - PADDLE_ENFORCE_EQ(std::accumulate(ranks.begin(), ranks.end(), 0), - int(shape_concat.size()), - "The accumulate of all ranks should be equal to the " - "shape concat's length."); - std::vector shapes = RestoreShapes(shape_concat, ranks); - auto* out = scope.FindVar(Output("Out")) - ->template GetMutable(); - out->Reset(new framework::RandomDataGenerator(shapes, Attr("min"), - Attr("max"))); - } -}; - -class CreateRandomDataGeneratorOpMaker - : public framework::OpProtoAndCheckerMaker { - public: - CreateRandomDataGeneratorOpMaker(OpProto* op_proto, OpAttrChecker* op_checker) - : OpProtoAndCheckerMaker(op_proto, op_checker) { - AddOutput("Out", "(ReaderHolder) The created random reader."); - AddAttr>("shape_concat", - "The concat of all data's shapes."); - AddAttr>( - "ranks", - "The ranks of each data." - "e.g." - "shape_concat = [2,3,4,5,6]" - "ranks = [3,2]" - "It means the reader will generate two data each time," - "whose shapes are [2,3,4] and [5,6] respectively."); - AddAttr>("lod_levels", "The LoD levels of each data."); - AddAttr("min", "The lower bound of reader's uniform distribution."); - AddAttr("max", "The upper bound of reader's uniform distribution."); - AddComment(R"DOC( - CreateRandomDataGenerator Operator - - This Op creates a random reader. - The reader generates random data instead of really reading from files. - Generated data follow an uniform distribution between 'min' and 'max'. - )DOC"); - } -}; - -class CreateShuffleReaderOp : public framework::OperatorBase { - public: - using framework::OperatorBase::OperatorBase; - - private: - void RunImpl(const framework::Scope& scope, - const platform::Place& dev_place) const override { - const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader")) - ->Get(); - auto* out = scope.FindVar(Output("Out")) - ->template GetMutable(); - out->Reset(new framework::ShuffleReader(underlying_reader.Get(), - Attr("buffer_size"))); - } -}; - -class CreateShuffleReaderOpMaker : public framework::OpProtoAndCheckerMaker { - public: - CreateShuffleReaderOpMaker(OpProto* op_proto, OpAttrChecker* op_checker) - : OpProtoAndCheckerMaker(op_proto, op_checker) { - AddInput( - "UnderlyingReader", - "(ReaderHolder) The underlying reader for creating a shuffle reader."); - AddOutput("Out", "(ReaderHolder) The created shuffle reader."); - AddAttr("buffer_size", "The shuffle buffer size.").GreaterThan(0); - AddComment(R"DOC( - CreateShuffleReader Operator - - A shuffle reader takes another reader as its 'underlying reader' - and yields the underlying reader's outputs in a shuffled order. 
- )DOC"); - } -}; - -class CreateBatchReaderOp : public framework::OperatorBase { - public: - using framework::OperatorBase::OperatorBase; - - private: - void RunImpl(const framework::Scope& scope, - const platform::Place& dev_place) const override { - const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader")) - ->Get(); - auto* out = scope.FindVar(Output("Out")) - ->template GetMutable(); - out->Reset(new framework::BatchReader(underlying_reader.Get(), - Attr("batch_size"))); - } -}; - -class CreateBatchReaderOpMaker : public framework::OpProtoAndCheckerMaker { - public: - CreateBatchReaderOpMaker(OpProto* op_proto, OpAttrChecker* op_checker) - : OpProtoAndCheckerMaker(op_proto, op_checker) { - AddInput( - "UnderlyingReader", - "(ReaderHolder) The underlying reader for creating a batch reader."); - AddOutput("Out", "(ReaderHolder) The created batch reader."); - AddAttr("batch_size", - "How many instances the batch reader yields each time.") - .GreaterThan(0); - AddComment(R"DOC( - CreateBatchReader Operator - - A batch reader takes another reader as its 'underlying reader', - gathers the underlying reader's outputs and then yields them in batches. - )DOC"); - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OPERATOR(create_random_data_generator, - ops::CreateRandomDataGeneratorOp, - ops::CreateFileReaderInferShape, - ops::CreateRandomDataGeneratorOpMaker, - paddle::framework::EmptyGradOpMaker, - ops::CreateFileReaderInferVarType); -REGISTER_OPERATOR(create_shuffle_reader, ops::CreateShuffleReaderOp, - ops::CreateDecoratedReaderInferShape, - ops::CreateShuffleReaderOpMaker, - paddle::framework::EmptyGradOpMaker, - ops::CreateDecoratedReaderInferVarType); -REGISTER_OPERATOR(create_batch_reader, ops::CreateBatchReaderOp, - ops::CreateDecoratedReaderInferShape, - ops::CreateBatchReaderOpMaker, - paddle::framework::EmptyGradOpMaker, - ops::CreateDecoratedReaderInferVarType); diff --git a/paddle/fluid/operators/detail/CMakeLists.txt b/paddle/fluid/operators/detail/CMakeLists.txt index 571a75c9dcd903672d460f192bf28ddbeaea7c78..94395ccfbcbd74ee40552a5c70dc8b8063a5f851 100644 --- a/paddle/fluid/operators/detail/CMakeLists.txt +++ b/paddle/fluid/operators/detail/CMakeLists.txt @@ -1 +1,6 @@ -grpc_library(sendrecvop_grpc SRCS sendrecvop_utils.cc grpc_client.cc grpc_server.cc PROTO send_recv.proto DEPS lod_tensor selected_rows) +if(WITH_DISTRIBUTE) + grpc_library(sendrecvop_grpc SRCS bytebuffer_stream.cc sendrecvop_utils.cc grpc_client.cc grpc_server.cc PROTO send_recv.proto DEPS lod_tensor selected_rows) + set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") + set_source_files_properties(test_serde.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + cc_test(serde_test SRCS test_serde.cc DEPS grpc++_unsecure grpc_unsecure gpr cares zlib protobuf sendrecvop_grpc) +endif() diff --git a/paddle/fluid/operators/detail/bytebuffer_stream.cc b/paddle/fluid/operators/detail/bytebuffer_stream.cc new file mode 100644 index 0000000000000000000000000000000000000000..a9488156e073e515926240c9bb66d7b6edf8f82e --- /dev/null +++ b/paddle/fluid/operators/detail/bytebuffer_stream.cc @@ -0,0 +1,88 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +// NOTE: This file was originally created by tensorflow +// (https://github.com/tensorflow/tensorflow/) we borrow this +// file and did some modifications so that we can send gRPC +// requests without too much copying of the tensor data. + +#include "bytebuffer_stream.h" + +namespace paddle { +namespace operators { +namespace detail { + +GrpcByteBufferSource::GrpcByteBufferSource() {} + +bool GrpcByteBufferSource::Init(const grpc::ByteBuffer& src) { + cur_ = -1; + left_ = 0; + ptr_ = nullptr; + byte_count_ = 0; + bool ok = src.Dump(&slices_).ok(); + if (!ok) { + slices_.clear(); + } + return ok; +} + +bool GrpcByteBufferSource::Next(const void** data, int* size) { + // Use loop instead of if in case buffer contained empty slices. + while (left_ == 0) { + // Advance to next slice. + cur_++; + if (cur_ >= slices_.size()) { + return false; + } + const ::grpc::Slice& s = slices_[cur_]; + left_ = s.size(); + ptr_ = reinterpret_cast(s.begin()); + } + + *data = ptr_; + *size = left_; + byte_count_ += left_; + ptr_ += left_; + left_ = 0; + return true; +} + +void GrpcByteBufferSource::BackUp(int count) { + ptr_ -= count; + left_ += count; + byte_count_ -= count; +} + +bool GrpcByteBufferSource::Skip(int count) { + const void* data; + int size; + while (Next(&data, &size)) { + if (size >= count) { + BackUp(size - count); + return true; + } + // size < count; + count -= size; + } + // error or we have too large count; + return false; +} + +google::protobuf::int64 GrpcByteBufferSource::ByteCount() const { + return byte_count_; +} + +} // namespace detail +} // namespace operators +} // namespace paddle \ No newline at end of file diff --git a/paddle/fluid/operators/detail/bytebuffer_stream.h b/paddle/fluid/operators/detail/bytebuffer_stream.h new file mode 100644 index 0000000000000000000000000000000000000000..099deb12d0e436427c147ab9b1eb553b712e14fb --- /dev/null +++ b/paddle/fluid/operators/detail/bytebuffer_stream.h @@ -0,0 +1,51 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +// NOTE: This file was originally created by tensorflow +// (https://github.com/tensorflow/tensorflow/) we borrow this +// file and did some modifications so that we can send gRPC +// requests without too much copying of the tensor data. + +#pragma once + +#include +#include "google/protobuf/io/coded_stream.h" +#include "google/protobuf/io/zero_copy_stream.h" + +namespace paddle { +namespace operators { +namespace detail { + +// A ZeroCopyInputStream that reads from a grpc::ByteBuffer. 
+class GrpcByteBufferSource + : public ::google::protobuf::io::ZeroCopyInputStream { + public: + GrpcByteBufferSource(); + bool Init(const ::grpc::ByteBuffer& src); // Can be called multiple times. + bool Next(const void** data, int* size) override; + void BackUp(int count) override; + bool Skip(int count) override; + ::google::protobuf::int64 ByteCount() const override; + + private: + std::vector<::grpc::Slice> slices_; + size_t cur_; // Current slice index. + int left_; // Number of bytes in slices_[cur_] left to yield. + const char* ptr_; // Address of next byte in slices_[cur_] to yield. + ::google::protobuf::int64 byte_count_; +}; + +} // namespace detail +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/detail/proto_encoder_helper.h b/paddle/fluid/operators/detail/proto_encoder_helper.h new file mode 100644 index 0000000000000000000000000000000000000000..4a7bfb8bd586fe84c9243bc64117d146c4386674 --- /dev/null +++ b/paddle/fluid/operators/detail/proto_encoder_helper.h @@ -0,0 +1,147 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +// NOTE: This file was originally created by tensorflow +// (https://github.com/tensorflow/tensorflow/) we borrow this +// file and did some modifications so that we can send gRPC +// requests without too much copying of the tensor data. 
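
bytebuffer_stream.h above is the decoding half of the zero-copy path, and proto_encoder_helper.h below is the encoding half. A short usage sketch of the decoder, mirroring how DeserializeFromByteBuffer further down drives it (byte_buffer is an illustrative ::grpc::ByteBuffer):

// Parse a VariableMessage straight out of a grpc::ByteBuffer without
// first flattening the slices into an intermediate std::string.
sendrecv::VariableMessage meta;
paddle::operators::detail::GrpcByteBufferSource source;
PADDLE_ENFORCE(source.Init(byte_buffer));  // Dump() failed if this is false
::google::protobuf::io::CodedInputStream input(&source);
PADDLE_ENFORCE(meta.ParseFromCodedStream(&input));  // zero-copy parse
PADDLE_ENFORCE(input.ConsumedEntireMessage());      // no trailing bytes
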
+ +#pragma once + +#include +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace operators { +namespace detail { + +char* EncodeVarint32(char* dst, uint32_t v) { + // Operate on characters as unsigneds + unsigned char* ptr = reinterpret_cast(dst); + static const int B = 128; + if (v < (1 << 7)) { + *(ptr++) = v; + } else if (v < (1 << 14)) { + *(ptr++) = v | B; + *(ptr++) = v >> 7; + } else if (v < (1 << 21)) { + *(ptr++) = v | B; + *(ptr++) = (v >> 7) | B; + *(ptr++) = v >> 14; + } else if (v < (1 << 28)) { + *(ptr++) = v | B; + *(ptr++) = (v >> 7) | B; + *(ptr++) = (v >> 14) | B; + *(ptr++) = v >> 21; + } else { + *(ptr++) = v | B; + *(ptr++) = (v >> 7) | B; + *(ptr++) = (v >> 14) | B; + *(ptr++) = (v >> 21) | B; + *(ptr++) = v >> 28; + } + return reinterpret_cast(ptr); +} + +char* EncodeVarint64(char* dst, uint64_t v) { + static const int B = 128; + unsigned char* ptr = reinterpret_cast(dst); + while (v >= B) { + *(ptr++) = (v & (B - 1)) | B; + v >>= 7; + } + *(ptr++) = static_cast(v); + return reinterpret_cast(ptr); +} + +int VarintLength(uint64_t v) { + int len = 1; + while (v >= 128) { + v >>= 7; + len++; + } + return len; +} + +class ProtoEncodeHelper { + public: + ProtoEncodeHelper(char* buf, int max_size) + : base_(buf), p_(buf), limit_(base_ + max_size) {} + + ~ProtoEncodeHelper() { + // Make sure callers didn't do operations that went over max_size promised + PADDLE_ENFORCE_LE(p_, limit_); + } + + const char* data() const { return base_; } + size_t size() const { return p_ - base_; } + + void WriteUint64(int tag, uint64_t v) { + Encode32(combine(tag, WIRETYPE_VARINT)); + Encode64(v); + } + void WriteBool(int tag, bool v) { + Encode32(combine(tag, WIRETYPE_VARINT)); + EncodeBool(v); + } + void WriteString(int tag, const std::string& v) { + Encode32(combine(tag, WIRETYPE_LENGTH_DELIMITED)); + Encode32(v.size()); + EncodeBytes(v.data(), v.size()); + } + void WriteVarlengthBeginning(int tag, uint32_t len) { + Encode32(combine(tag, WIRETYPE_LENGTH_DELIMITED)); + Encode32(len); + } + void WriteRawBytes(const std::string& v) { EncodeBytes(v.data(), v.size()); } + + private: + // Note: this module's behavior must match the protocol buffer wire encoding + // format. + enum { + WIRETYPE_VARINT = 0, + WIRETYPE_LENGTH_DELIMITED = 2, + }; + static uint32_t combine(uint32_t tag, uint32_t type) { + return ((tag << 3) | type); + } + inline void Encode32(uint32_t v) { + if (v < 128) { + // Fast path for single-byte values. Many of the calls will use a + // constant value for v, so the comparison will get optimized away + // when Encode32 is inlined into the caller. + *p_ = v; + p_++; + } else { + p_ = EncodeVarint32(p_, v); + } + } + void Encode64(uint64_t v) { p_ = EncodeVarint64(p_, v); } + void EncodeBool(bool v) { + *p_ = (v ? 
1 : 0); // Equal to varint32 encoding of 0 or 1 + p_++; + } + void EncodeBytes(const char* bytes, int N) { + memcpy(p_, bytes, N); + p_ += N; + } + + char* base_; + char* p_; + char* limit_; // Just for CHECKs +}; + +} // detail +} // operators +} // paddle diff --git a/paddle/fluid/operators/detail/send_recv.proto b/paddle/fluid/operators/detail/send_recv.proto index 8f962b4c69cc83dc2ab98b7dc27e18bc4b42bf18..b0215d4a80c9440f09c35434903fd6166b03e8b0 100644 --- a/paddle/fluid/operators/detail/send_recv.proto +++ b/paddle/fluid/operators/detail/send_recv.proto @@ -33,10 +33,34 @@ enum VarType { } message VariableMessage { + enum Type { + // Pod Types + BOOL = 0; + INT16 = 1; + INT32 = 2; + INT64 = 3; + FP16 = 4; + FP32 = 5; + FP64 = 6; + } + + message LodData { repeated int64 lod_data = 1; } + string varname = 1; // TODO(Yancey1989): reference framework::proto::VarDesc::VarType VarType type = 2; - bytes serialized = 3; + // bool persistable is not needed for sending. + // tensor info: + Type data_type = 3; + repeated int64 dims = 4; + + // lod details: + int64 lod_level = 5; + repeated LodData lod = 6; + // tensor data + bytes serialized = 7; + // selected_rows data + bytes rows = 8; } message VoidMessage {} diff --git a/paddle/fluid/operators/detail/sendrecvop_utils.cc b/paddle/fluid/operators/detail/sendrecvop_utils.cc index 169fd40fd950a74e61a4ed06a370f25b533957db..f196fc9862d2374583d50820a6c3b63c866bf048 100644 --- a/paddle/fluid/operators/detail/sendrecvop_utils.cc +++ b/paddle/fluid/operators/detail/sendrecvop_utils.cc @@ -13,6 +13,11 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/detail/sendrecvop_utils.h" +#include "google/protobuf/io/coded_stream.h" +#include "google/protobuf/io/zero_copy_stream.h" +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/operators/detail/bytebuffer_stream.h" +#include "paddle/fluid/operators/detail/proto_encoder_helper.h" namespace paddle { namespace operators { @@ -63,6 +68,233 @@ void DeserializeFromMessage(const sendrecv::VariableMessage& msg, } } +void SerializeToByteBuffer(const std::string& name, framework::Variable* var, + const platform::DeviceContext& ctx, + ::grpc::ByteBuffer* msg) { + using VarMsg = sendrecv::VariableMessage; + sendrecv::VariableMessage request; + std::string header; + request.AppendToString(&header); + // When using GPU, need to free the copied CPU buffer + // when the ByteBuffer destroies + // TODO(typhoonzero): add unref here, if we have dependent + // parallelism execution, need to know when to free the tensor. 
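
SerializeToByteBuffer below drives ProtoEncodeHelper by hand, so it helps to have the wire format concrete: each field is a key varint equal to (field_number << 3) | wire_type followed by its payload, and a varint stores 7 bits per byte, least-significant group first, with the high bit as a continuation flag. A worked example using the EncodeVarint32 defined above (a sketch; needs <cassert>):

// Key byte for field 1 with wire type 2 (length-delimited):
//   (1 << 3) | 2 = 0x0A
// Varint encoding of the length 300 (binary 1'0010'1100):
//   low 7 bits  0101100 -> 0x2C, continuation bit set -> 0xAC
//   next 7 bits 0000010 -> 0x02, no continuation      -> 0x02
// So a 300-byte field 1 is framed as: 0x0A 0xAC 0x02 <300 payload bytes>.
char buf[8];
char* end = EncodeVarint32(buf, 300);
assert(end - buf == 2);
assert(static_cast<unsigned char>(buf[0]) == 0xAC && buf[1] == 0x02);
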
+ DestroyCallback destroy_callback = [](void* backing) {}; + + void* buf = malloc(1024); + void* payload; + size_t payload_size; + ProtoEncodeHelper e((char*)buf, 1024); + e.WriteString(VarMsg::kVarnameFieldNumber, name); + if (var->IsType()) { + e.WriteUint64(VarMsg::kTypeFieldNumber, 0); + } else if (var->IsType()) { + e.WriteUint64(VarMsg::kTypeFieldNumber, 1); + } + + switch (framework::ToVarType(var->Type())) { + case framework::proto::VarType_Type_LOD_TENSOR: { + auto tensor = var->Get(); + e.WriteUint64(VarMsg::kDataTypeFieldNumber, + framework::ToDataType(tensor.type())); + for (auto& dim : framework::vectorize(tensor.dims())) { + e.WriteUint64(VarMsg::kDimsFieldNumber, dim); + } + auto lod = tensor.lod(); // std::vector> + if (lod.size() > 0) { + e.WriteUint64(VarMsg::kLodLevelFieldNumber, lod.size()); + + for (auto& each : lod) { + e.WriteVarlengthBeginning(VarMsg::kLodFieldNumber, + 2 + // tag + varintlength of submessage + 1 + // kLodDataFieldNumber + each.size()); + // auto copied from GPU + for (auto& d : each) { + e.WriteUint64(VarMsg::LodData::kLodDataFieldNumber, d); + } + } + } + if (platform::is_gpu_place(ctx.GetPlace())) { +#ifdef PADDLE_WITH_CUDA + PADDLE_ENFORCE(platform::is_gpu_place(tensor.place())); + platform::CPUPlace cpu; + auto& gpu_dev_ctx = + static_cast(ctx); + auto copy_size = tensor.memory_size(); + payload = memory::Alloc(cpu, copy_size); + memory::Copy(cpu, payload, + boost::get(tensor.place()), + reinterpret_cast(tensor.data()), + copy_size, gpu_dev_ctx.stream()); + ctx.Wait(); + destroy_callback = [](void* backing) { + platform::CPUPlace cpu; + memory::Free(cpu, backing); + }; +#endif + } else { + payload = tensor.data(); + } + payload_size = tensor.memory_size(); + e.WriteVarlengthBeginning(VarMsg::kSerializedFieldNumber, payload_size); + } break; + case framework::proto::VarType_Type_SELECTED_ROWS: { + // TODO(typhoonzero): selectedrows implement should not use unique_ptr + auto* slr = var->GetMutable(); + e.WriteUint64(VarMsg::kDataTypeFieldNumber, + framework::ToDataType(slr->value().type())); + for (auto& dim : framework::vectorize(slr->value().dims())) { + e.WriteUint64(VarMsg::kDimsFieldNumber, dim); + } + e.WriteUint64(VarMsg::kLodLevelFieldNumber, 0); + auto* tensor = slr->mutable_value(); + if (platform::is_gpu_place(ctx.GetPlace())) { +#ifdef PADDLE_WITH_CUDA + platform::CPUPlace cpu; + auto& gpu_dev_ctx = + static_cast(ctx); + auto copy_size = tensor->memory_size(); + payload = memory::Alloc(cpu, copy_size); + memory::Copy(cpu, payload, + boost::get(tensor->place()), + reinterpret_cast(tensor->data()), + copy_size, gpu_dev_ctx.stream()); + ctx.Wait(); + destroy_callback = [](void* backing) { + platform::CPUPlace cpu; + memory::Free(cpu, backing); + }; +#endif + } else { + payload = slr->mutable_value()->data(); + } + payload_size = tensor->memory_size(); + e.WriteVarlengthBeginning(VarMsg::kSerializedFieldNumber, payload_size); + } break; + default: + PADDLE_THROW("Serialize does not support type: %s", + typeid(var->Type()).name()); + break; + } + // steal reference of tensor data + ::grpc::Slice slices[4]; // metadata, tensor, rows meta, rows + int num_slices = 2; // only SelectedRows have rows buffer + slices[0] = ::grpc::Slice(e.size()); + memcpy(const_cast(slices[0].begin()), e.data(), e.size()); + slices[1] = ::grpc::Slice( + grpc_slice_new_with_user_data(payload, payload_size, destroy_callback, + static_cast(payload)), + ::grpc::Slice::STEAL_REF); + + if (framework::ToVarType(var->Type()) == + 
framework::proto::VarType_Type_SELECTED_ROWS) { + auto* slr = var->GetMutable(); + + ProtoEncodeHelper e2((char*)buf, 128); + // NOTE: rows is of type int64_t + size_t rows_memory_size = + slr->rows().capacity() * framework::SizeOfType(typeid(int64_t)); + e2.WriteVarlengthBeginning(VarMsg::kRowsFieldNumber, rows_memory_size); + slices[2] = ::grpc::Slice(e2.size()); + memcpy(const_cast(slices[2].begin()), e2.data(), e2.size()); + + slices[3] = ::grpc::Slice( + grpc_slice_new_with_user_data( + const_cast( + reinterpret_cast(slr->rows().data())), + rows_memory_size, + [](void* backing) { + // TODO(typhoonzero): add unref here, same as above. + }, + const_cast( + reinterpret_cast(slr->rows().data()))), + ::grpc::Slice::STEAL_REF); + num_slices = 4; + } + + ::grpc::ByteBuffer tmp(&slices[0], num_slices); + msg->Swap(&tmp); +} + +void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg, + const platform::DeviceContext& ctx, + framework::Variable* var) { + sendrecv::VariableMessage meta; + GrpcByteBufferSource source; + source.Init(msg); + ::google::protobuf::io::CodedInputStream input(&source); + // do zerocopy parsing + PADDLE_ENFORCE(meta.ParseFromCodedStream(&input)); + PADDLE_ENFORCE(input.ConsumedEntireMessage()); + // dims is needed by both tensor and selectedrows + std::vector vecdims; + for (auto& d : meta.dims()) { + vecdims.push_back(d); + } + framework::DDim dims = framework::make_ddim(vecdims); + + if (meta.type() == sendrecv::LOD_TENSOR) { + auto* tensor = var->GetMutable(); + tensor->Resize(dims); + void* tensor_data = tensor->mutable_data( + ctx.GetPlace(), + paddle::operators::detail::ToTypeIndex(meta.data_type())); + framework::LoD lod; + for (int i = 0; i < meta.lod_level(); ++i) { + framework::Vector v; + for (int j = 0; j < meta.lod(i).lod_data_size(); ++j) { + v.push_back(meta.lod(i).lod_data(j)); + } + lod.push_back(v); + } + tensor->set_lod(lod); + // How to avoid copying and use the message buffer directly? + // Maybe need to find a way to release all memory except tensor content. 
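
The send side above already avoids copies: the ByteBuffer is assembled from up to four slices, and only the metadata slices are freshly encoded. For reference while reading the receive-side copies below, the layout is:

::grpc::Slice slices[4];
// slices[0]: metadata (varname, type, dims, lod) encoded by ProtoEncodeHelper.
// slices[1]: the tensor payload, wrapped via grpc_slice_new_with_user_data
//            with STEAL_REF, so gRPC sends the tensor memory in place;
//            destroy_callback frees the CPU staging buffer in the GPU path.
// slices[2]: (SelectedRows only) length-delimited framing for the rows field.
// slices[3]: (SelectedRows only) the raw int64_t row indices, also by reference.
::grpc::ByteBuffer out(&slices[0], num_slices);  // num_slices is 2 or 4
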
+ if (platform::is_gpu_place(ctx.GetPlace())) { +#ifdef PADDLE_WITH_CUDA + platform::CPUPlace cpu; + auto& gpu_dev_ctx = static_cast(ctx); + memory::Copy(boost::get(tensor->place()), + tensor_data, cpu, + reinterpret_cast(meta.serialized().data()), + meta.serialized().size(), gpu_dev_ctx.stream()); + ctx.Wait(); +#endif + } else { + memcpy(tensor_data, + reinterpret_cast(meta.serialized().data()), + meta.serialized().size()); + } + } else if (meta.type() == sendrecv::SELECTED_ROWS) { + auto* slr = var->GetMutable(); + auto* tensor = slr->mutable_value(); + int64_t* rows_data = slr->mutable_rows()->data(); + tensor->Resize(dims); + void* tensor_data = tensor->mutable_data( + ctx.GetPlace(), + paddle::operators::detail::ToTypeIndex(meta.data_type())); + if (platform::is_gpu_place(ctx.GetPlace())) { +#ifdef PADDLE_WITH_CUDA + platform::CPUPlace cpu; + auto& gpu_dev_ctx = static_cast(ctx); + memory::Copy(boost::get(tensor->place()), + tensor_data, cpu, + reinterpret_cast(meta.serialized().data()), + meta.serialized().size(), gpu_dev_ctx.stream()); + ctx.Wait(); +#endif + } else { + memcpy(tensor_data, + reinterpret_cast(meta.serialized().data()), + meta.serialized().size()); + } + // copy rows CPU data, GPU data will be copied lazly + memcpy(rows_data, reinterpret_cast(meta.rows().data()), + meta.rows().size()); + } +} + } // namespace detail } // namespace operators -} // namespace paddle +} // namespace paddle \ No newline at end of file diff --git a/paddle/fluid/operators/detail/sendrecvop_utils.h b/paddle/fluid/operators/detail/sendrecvop_utils.h index 670d0e162473750d0a5f8e9025ef1cf9a9ef407c..5208091e54b4da2bb0265f84827ce23b57e954dc 100644 --- a/paddle/fluid/operators/detail/sendrecvop_utils.h +++ b/paddle/fluid/operators/detail/sendrecvop_utils.h @@ -33,6 +33,8 @@ namespace detail { #define LISTEN_TERMINATE_MESSAGE "TERMINATE@RECV" #define BATCH_BARRIER_MESSAGE "BATCH_BARRIER@RECV" +typedef void (*DestroyCallback)(void*); + void SerializeToMessage(const std::string& name, const framework::Variable* var, const platform::DeviceContext& ctx, sendrecv::VariableMessage* msg); @@ -40,6 +42,32 @@ void SerializeToMessage(const std::string& name, const framework::Variable* var, void DeserializeFromMessage(const sendrecv::VariableMessage& msg, const platform::DeviceContext& ctx, framework::Variable* var); + +void SerializeToByteBuffer(const std::string& name, framework::Variable* var, + const platform::DeviceContext& ctx, + ::grpc::ByteBuffer* msg); + +void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg, + const platform::DeviceContext& ctx, + framework::Variable* var); + +inline std::type_index ToTypeIndex(sendrecv::VariableMessage::Type type) { + switch (type) { + case sendrecv::VariableMessage::FP32: + return typeid(float); // NOLINT + case sendrecv::VariableMessage::FP64: + return typeid(double); // NOLINT + case sendrecv::VariableMessage::INT32: + return typeid(int); // NOLINT + case sendrecv::VariableMessage::INT64: + return typeid(int64_t); // NOLINT + case sendrecv::VariableMessage::BOOL: + return typeid(bool); // NOLINT + default: + PADDLE_THROW("Not support type %d", type); + } +} + } // namespace detail } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/detail/test_serde.cc b/paddle/fluid/operators/detail/test_serde.cc new file mode 100644 index 0000000000000000000000000000000000000000..2f06e5a686b996858d21930a1afa2861efca4a9b --- /dev/null +++ b/paddle/fluid/operators/detail/test_serde.cc @@ -0,0 +1,186 @@ +/* Copyright (c) 2016 PaddlePaddle 
Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include + +#include "gtest/gtest.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/framework/variable.h" +#include "paddle/fluid/operators/detail/sendrecvop_utils.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/fluid/string/printf.h" + +namespace framework = paddle::framework; +namespace platform = paddle::platform; +namespace operators = paddle::operators; +namespace math = paddle::operators::math; +namespace memory = paddle::memory; + +void RunSerdeTestTensor(platform::Place place) { + // serialize var to ByteBuffer + framework::Variable var; + auto* tensor = var.GetMutable(); + tensor->Resize(framework::make_ddim({4, 8, 4, 2})); + framework::LoD lod; + lod.push_back(framework::Vector({1, 3, 8})); + tensor->set_lod(lod); + int tensor_numel = 4 * 8 * 4 * 2; + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto& ctx = *pool.Get(place); + tensor->mutable_data(place); + math::set_constant(ctx, tensor, 31.9); + + ::grpc::ByteBuffer msg; + operators::detail::SerializeToByteBuffer("myvar", &var, ctx, &msg); + EXPECT_GT(msg.Length(), 0); + + // deserialize + std::vector<::grpc::Slice> slices; + (void)msg.Dump(&slices); + std::string tmp; + for (const auto& s : slices) { + tmp.append(reinterpret_cast(s.begin()), s.size()); + } + sendrecv::VariableMessage varmsg; + EXPECT_TRUE(varmsg.ParseFromString(tmp)); + EXPECT_EQ(varmsg.varname(), "myvar"); + EXPECT_EQ(varmsg.type(), 0); + EXPECT_EQ(varmsg.dims()[0], 4); + EXPECT_EQ(varmsg.dims()[1], 8); + EXPECT_EQ(varmsg.dims()[2], 4); + EXPECT_EQ(varmsg.dims()[3], 2); + EXPECT_EQ(varmsg.lod_level(), 1); + EXPECT_EQ(varmsg.lod(0).lod_data(0), 1); + EXPECT_EQ(varmsg.lod(0).lod_data(1), 3); + EXPECT_EQ(varmsg.lod(0).lod_data(2), 8); + + const float* tensor_data = + reinterpret_cast(varmsg.serialized().data()); + for (int i = 0; i < tensor_numel; ++i) { + EXPECT_FLOAT_EQ(tensor_data[i], 31.9); + } + + // deserialize zero-copy + framework::Variable var2; + operators::detail::DeserializeFromByteBuffer(msg, ctx, &var2); + auto tensor2 = var2.Get(); + float* tensor_data2 = nullptr; + framework::Tensor tmp_tensor; + + if (platform::is_gpu_place(ctx.GetPlace())) { + platform::CPUPlace cpu; + framework::TensorCopy(tensor2, cpu, &tmp_tensor); + tensor_data2 = tmp_tensor.data(); + } else { + tensor_data2 = const_cast(tensor2.data()); + } + + EXPECT_EQ(varmsg.lod_level(), 1); + EXPECT_EQ(varmsg.lod(0).lod_data(0), 1); + EXPECT_EQ(varmsg.lod(0).lod_data(1), 3); + EXPECT_EQ(varmsg.lod(0).lod_data(2), 8); + for (int i = 0; i < tensor_numel; ++i) EXPECT_FLOAT_EQ(tensor_data2[i], 31.9); +} + +void RunSerdeTestSelectedRows(platform::Place place) { + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto& ctx = *pool.Get(place); + + // serialize var to ByteBuffer + framework::Variable var; + 
auto* slr = var.GetMutable(); + auto* tensor = slr->mutable_value(); + auto* rows = slr->mutable_rows(); + tensor->Resize(framework::make_ddim({2, 10})); + tensor->mutable_data(place); + int tensor_numel = 2 * 10; + math::set_constant(ctx, tensor, 32.7); + rows->push_back(3); + rows->push_back(10); + + ::grpc::ByteBuffer msg; + operators::detail::SerializeToByteBuffer("myvar", &var, ctx, &msg); + EXPECT_GT(msg.Length(), 0); + + // deserialize + std::vector<::grpc::Slice> slices; + (void)msg.Dump(&slices); + std::string tmp; + for (const auto& s : slices) { + tmp.append(reinterpret_cast(s.begin()), s.size()); + } + sendrecv::VariableMessage varmsg; + EXPECT_TRUE(varmsg.ParseFromString(tmp)); + + EXPECT_EQ(varmsg.varname(), "myvar"); + EXPECT_EQ(varmsg.type(), 1); + + const float* tensor_data = + reinterpret_cast(varmsg.serialized().data()); + const int64_t* rows_data = + reinterpret_cast(varmsg.rows().data()); + for (int i = 0; i < tensor_numel; ++i) { + EXPECT_FLOAT_EQ(tensor_data[i], 32.7); + } + EXPECT_EQ(rows_data[0], 3); + EXPECT_EQ(rows_data[1], 10); + // deserialize zero-copy + framework::Variable var2; + operators::detail::DeserializeFromByteBuffer(msg, ctx, &var2); + + auto* slr2 = var2.GetMutable(); + auto* tensor2 = slr2->mutable_value(); + auto* rows2 = slr2->mutable_rows(); + float* tensor_data2 = nullptr; + framework::Tensor tmp_tensor; + + if (platform::is_gpu_place(ctx.GetPlace())) { + platform::CPUPlace cpu; + framework::TensorCopy(*tensor2, cpu, &tmp_tensor); + tensor_data2 = tmp_tensor.data(); + } else { + tensor_data2 = const_cast(tensor2->data()); + } + const int64_t* rows_data2 = rows2->data(); + + for (int i = 0; i < tensor_numel; ++i) { + EXPECT_FLOAT_EQ(tensor_data2[i], 32.7); + } + EXPECT_EQ(rows_data2[0], 3); + EXPECT_EQ(rows_data2[1], 10); +} + +TEST(SelectedRows, CPU) { + platform::CPUPlace place; + RunSerdeTestSelectedRows(place); +} + +TEST(SelectedRows, GPU) { + platform::CUDAPlace place; + RunSerdeTestSelectedRows(place); +} + +TEST(Tensor, CPU) { + platform::CPUPlace place; + RunSerdeTestTensor(place); +} + +TEST(Tensor, GPU) { + platform::CUDAPlace place; + RunSerdeTestTensor(place); +} \ No newline at end of file diff --git a/paddle/fluid/operators/detection_map_op.cc b/paddle/fluid/operators/detection_map_op.cc index 880bfe3b04394f20079af98d07dc2a12db920b97..73c84c2fe0155d21d7059938330e44fa3668c6df 100644 --- a/paddle/fluid/operators/detection_map_op.cc +++ b/paddle/fluid/operators/detection_map_op.cc @@ -71,7 +71,7 @@ class DetectionMAPOp : public framework::OperatorWithKernel { return framework::OpKernelType( framework::ToDataType( ctx.Input("DetectRes")->type()), - ctx.device_context()); + platform::CPUPlace()); } }; @@ -142,7 +142,15 @@ class DetectionMAPOpMaker : public framework::OpProtoAndCheckerMaker { AddOutput("MAP", "(Tensor) A tensor with shape [1], store the mAP evaluate " "result of the detection."); - + AddAttr("class_num", + "(int) " + "The class number."); + AddAttr( + "background_label", + "(int, defalut: 0) " + "The index of background label, the background label will be ignored. 
" + "If set to -1, then all categories will be considered.") + .SetDefault(0); AddAttr( "overlap_threshold", "(float) " diff --git a/paddle/fluid/operators/detection_map_op.h b/paddle/fluid/operators/detection_map_op.h index b2b0995b35bf16432e73ebcfd3341adb00c11fd8..a009e9dfce130bfd6c506c71b58a418a569bdf7a 100644 --- a/paddle/fluid/operators/detection_map_op.h +++ b/paddle/fluid/operators/detection_map_op.h @@ -69,6 +69,7 @@ class DetectionMAPOpKernel : public framework::OpKernel { float overlap_threshold = ctx.Attr("overlap_threshold"); float evaluate_difficult = ctx.Attr("evaluate_difficult"); auto ap_type = GetAPType(ctx.Attr("ap_type")); + int class_num = ctx.Attr("class_num"); auto label_lod = in_label->lod(); auto detect_lod = in_detect->lod(); @@ -95,17 +96,19 @@ class DetectionMAPOpKernel : public framework::OpKernel { if (in_pos_count != nullptr && state) { GetInputPos(*in_pos_count, *in_true_pos, *in_false_pos, label_pos_count, - true_pos, false_pos); + true_pos, false_pos, class_num); } CalcTrueAndFalsePositive(gt_boxes, detect_boxes, evaluate_difficult, overlap_threshold, label_pos_count, true_pos, false_pos); - T map = CalcMAP(ap_type, label_pos_count, true_pos, false_pos); + int background_label = ctx.Attr("background_label"); + T map = CalcMAP(ap_type, label_pos_count, true_pos, false_pos, + background_label); GetOutputPos(ctx, label_pos_count, true_pos, false_pos, *out_pos_count, - *out_true_pos, *out_false_pos); + *out_true_pos, *out_false_pos, class_num); T* map_data = out_map->mutable_data(ctx.GetPlace()); map_data[0] = map; @@ -141,6 +144,15 @@ class DetectionMAPOpKernel : public framework::OpKernel { } } + inline void ClipBBox(const Box& bbox, Box* clipped_bbox) const { + T one = static_cast(1.0); + T zero = static_cast(0.0); + clipped_bbox->xmin = std::max(std::min(bbox.xmin, one), zero); + clipped_bbox->ymin = std::max(std::min(bbox.ymin, one), zero); + clipped_bbox->xmax = std::max(std::min(bbox.xmax, one), zero); + clipped_bbox->ymax = std::max(std::min(bbox.ymax, one), zero); + } + void GetBoxes(const framework::LoDTensor& input_label, const framework::LoDTensor& input_detect, std::vector>>& gt_boxes, @@ -190,24 +202,20 @@ class DetectionMAPOpKernel : public framework::OpKernel { const std::map>>& false_pos, framework::Tensor& output_pos_count, framework::LoDTensor& output_true_pos, - framework::LoDTensor& output_false_pos) const { - int max_class_id = 0; + framework::LoDTensor& output_false_pos, const int class_num) const { int true_pos_count = 0; int false_pos_count = 0; - for (auto it = label_pos_count.begin(); it != label_pos_count.end(); ++it) { - int label = it->first; - if (label > max_class_id) max_class_id = label; - int label_num_pos = it->second; - if (label_num_pos == 0 || true_pos.find(label) == true_pos.end()) - continue; - auto label_true_pos = true_pos.find(label)->second; - auto label_false_pos = false_pos.find(label)->second; - true_pos_count += label_true_pos.size(); - false_pos_count += label_false_pos.size(); + for (auto it = true_pos.begin(); it != true_pos.end(); ++it) { + auto tp = it->second; + true_pos_count += tp.size(); + } + for (auto it = false_pos.begin(); it != false_pos.end(); ++it) { + auto fp = it->second; + false_pos_count += fp.size(); } int* pos_count_data = output_pos_count.mutable_data( - framework::make_ddim({max_class_id + 1, 1}), ctx.GetPlace()); + framework::make_ddim({class_num, 1}), ctx.GetPlace()); T* true_pos_data = output_true_pos.mutable_data( framework::make_ddim({true_pos_count, 2}), ctx.GetPlace()); @@ -217,7 
+225,7 @@ class DetectionMAPOpKernel : public framework::OpKernel { false_pos_count = 0; std::vector true_pos_starts = {0}; std::vector false_pos_starts = {0}; - for (int i = 0; i <= max_class_id; ++i) { + for (int i = 0; i < class_num; ++i) { auto it_count = label_pos_count.find(i); pos_count_data[i] = 0; if (it_count != label_pos_count.end()) { @@ -258,17 +266,16 @@ class DetectionMAPOpKernel : public framework::OpKernel { return; } - void GetInputPos( - const framework::Tensor& input_pos_count, - const framework::LoDTensor& input_true_pos, - const framework::LoDTensor& input_false_pos, - std::map& label_pos_count, - std::map>>& true_pos, - std::map>>& false_pos) const { + void GetInputPos(const framework::Tensor& input_pos_count, + const framework::LoDTensor& input_true_pos, + const framework::LoDTensor& input_false_pos, + std::map& label_pos_count, + std::map>>& true_pos, + std::map>>& false_pos, + const int class_num) const { constexpr T kEPS = static_cast(1e-6); - int class_number = input_pos_count.dims()[0]; const int* pos_count_data = input_pos_count.data(); - for (int i = 0; i < class_number; ++i) { + for (int i = 0; i < class_num; ++i) { label_pos_count[i] = pos_count_data[i]; } @@ -362,7 +369,9 @@ class DetectionMAPOpKernel : public framework::OpKernel { size_t max_idx = 0; auto score = pred_boxes[i].first; for (size_t j = 0; j < matched_bboxes.size(); ++j) { - T overlap = JaccardOverlap(pred_boxes[i].second, matched_bboxes[j]); + Box& pred_box = pred_boxes[i].second; + ClipBBox(pred_box, &pred_box); + T overlap = JaccardOverlap(pred_box, matched_bboxes[j]); if (overlap > max_overlap) { max_overlap = overlap; max_idx = j; @@ -391,17 +400,19 @@ class DetectionMAPOpKernel : public framework::OpKernel { } } - T CalcMAP( - APType ap_type, const std::map& label_pos_count, - const std::map>>& true_pos, - const std::map>>& false_pos) const { + T CalcMAP(APType ap_type, const std::map& label_pos_count, + const std::map>>& true_pos, + const std::map>>& false_pos, + const int background_label) const { T mAP = 0.0; int count = 0; for (auto it = label_pos_count.begin(); it != label_pos_count.end(); ++it) { int label = it->first; int label_num_pos = it->second; - if (label_num_pos == 0 || true_pos.find(label) == true_pos.end()) + if (label_num_pos == background_label || + true_pos.find(label) == true_pos.end()) { continue; + } auto label_true_pos = true_pos.find(label)->second; auto label_false_pos = false_pos.find(label)->second; // Compute average precision. 
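Two hunks up, CalcMAP gained a background_label parameter, and the hunk just below additionally stops scaling the result by 100. For readers who want the accumulated metric spelled out, here is a minimal, self-contained sketch of the integral average precision CalcMAP computes per class; the function name and the (score, is-true-positive) input layout are illustrative, not part of the patch.

#include <algorithm>
#include <utility>
#include <vector>

// Integral AP for one class: walk detections from highest to lowest score,
// accumulating area under the precision-recall curve. `num_gt` is the number
// of ground-truth boxes for this class (label_pos_count in the operator).
double IntegralAveragePrecision(std::vector<std::pair<float, bool>> dets,
                                int num_gt) {
  if (num_gt == 0) return 0.0;  // no ground truth: class is skipped
  std::sort(dets.begin(), dets.end(),
            [](const std::pair<float, bool>& a,
               const std::pair<float, bool>& b) { return a.first > b.first; });
  int tp = 0, fp = 0;
  double ap = 0.0, prev_recall = 0.0;
  for (const auto& d : dets) {
    d.second ? ++tp : ++fp;
    const double precision = static_cast<double>(tp) / (tp + fp);
    const double recall = static_cast<double>(tp) / num_gt;
    ap += precision * (recall - prev_recall);  // only TPs move recall
    prev_recall = recall;
  }
  return ap;  // in [0, 1], matching the patched `return mAP;` below
}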
@@ -450,7 +461,7 @@ class DetectionMAPOpKernel : public framework::OpKernel { } } if (count != 0) mAP /= count; - return mAP * 100; + return mAP; } }; // namespace operators diff --git a/paddle/fluid/operators/elementwise_div_op.h b/paddle/fluid/operators/elementwise_div_op.h index 6bcc577456b13f7930e07c24564953e93c5339ed..95649ac46e6bd41b9e1a865794cdec3ae1e6e247 100644 --- a/paddle/fluid/operators/elementwise_div_op.h +++ b/paddle/fluid/operators/elementwise_div_op.h @@ -41,77 +41,14 @@ class ElementwiseDivKernel : public framework::OpKernel { }; template -struct ElementwiseDivGradFunctor { - template - void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz) { - auto y_e = framework::EigenVector::Flatten(*y); - auto z_e = framework::EigenVector::Flatten(*z); - auto dz_e = framework::EigenVector::Flatten(*dz); - - if (dx) { - auto dx_e = framework::EigenVector::Flatten(*dx); - dx_e.device(d) = dz_e / y_e; - } - - if (dy) { - auto dy_e = framework::EigenVector::Flatten(*dy); - dy_e.device(d) = -1.0 * dz_e * z_e / y_e; - } - } -}; - -template -struct ElementwiseDivBroadCastGradFunctor { - template - void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n) { - auto x_e = framework::EigenVector::Flatten(*x); - auto y_e = framework::EigenVector::Flatten(*y); - auto dz_e = framework::EigenVector::Flatten(*dz); - - auto y_e_bcast = y_e.reshape(Eigen::DSizes(1, n)) - .broadcast(Eigen::DSizes(pre, 1)) - .reshape(Eigen::DSizes(x_e.size())); - - if (dx) { - auto dx_e = framework::EigenVector::Flatten(*dx); - dx_e.device(d) = dz_e / y_e_bcast; - } - - if (dy) { - auto dy_e = framework::EigenVector::Flatten(*dy); - dy_e.device(d) = (-1.0 * (x_e * dz_e) / (y_e_bcast * y_e_bcast)) - .reshape(Eigen::DSizes(pre, n)) - .sum(Eigen::array{{0}}); - } - } +struct DivGradDX { + HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout / y; } }; template -struct ElementwiseDivBroadCast2GradFunctor { - template - void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n, - Post post) { - auto x_e = framework::EigenVector::Flatten(*x); - auto y_e = framework::EigenVector::Flatten(*y); - auto dz_e = framework::EigenVector::Flatten(*dz); - - auto y_e_bcast = y_e.reshape(Eigen::DSizes(1, n, 1)) - .broadcast(Eigen::DSizes(pre, 1, post)) - .reshape(Eigen::DSizes(x_e.size())); - if (dx) { - auto dx_e = framework::EigenVector::Flatten(*dx); - dx_e.device(d) = dz_e / y_e_bcast; - } - - if (dy) { - auto dy_e = framework::EigenVector::Flatten(*dy); - dy_e.device(d) = (-1.0 * (x_e * dz_e) / (y_e_bcast * y_e_bcast)) - .reshape(Eigen::DSizes(pre, n, post)) - .sum(Eigen::array{{0, 2}}); - } +struct DivGradDY { + HOSTDEVICE T operator()(T x, T y, T out, T dout) const { + return -dout * x / (y * y); } }; @@ -128,10 +65,8 @@ class ElementwiseDivGradKernel : public framework::OpKernel { auto* dx = ctx.Output(framework::GradVarName("X")); auto* dy = ctx.Output(framework::GradVarName("Y")); int axis = ctx.Attr("axis"); - ElementwiseGradCompute, - ElementwiseDivBroadCastGradFunctor, - ElementwiseDivBroadCast2GradFunctor>( - ctx, x, y, out, dout, axis, dx, dy); + ElemwiseGradCompute, DivGradDY>( + ctx, *x, *y, *out, *dout, axis, dx, dy, DivGradDX(), DivGradDY()); } }; diff --git a/paddle/fluid/operators/elementwise_max_op.h b/paddle/fluid/operators/elementwise_max_op.h index ab3a3d58275e8a48a92aa0c8a50af072e83ac968..527a18ee3ba88a158a13266a7fbcdafe59ec69d9 100644 --- a/paddle/fluid/operators/elementwise_max_op.h +++ b/paddle/fluid/operators/elementwise_max_op.h @@ -41,76 +41,16 @@ 
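The rewrite above replaces three Eigen broadcast functors with two scalar functors; elementwise_max, min, mul, and sub below receive the identical treatment. As a rough sketch of what ElemwiseGradCompute does on the same-shape fast path (simplified, free-function form with an illustrative name; compare the one-line ElemwiseGradNoBroadcast fix in elementwise_op_function.h further down):

#include <cstddef>

// Scalar gradient functors, as in the patch: each consumes (x, y, out, dout)
// for one element and returns that element's gradient contribution.
template <typename T>
struct DivGradDX {
  T operator()(T x, T y, T out, T dout) const { return dout / y; }
};
template <typename T>
struct DivGradDY {
  T operator()(T x, T y, T out, T dout) const { return -dout * x / (y * y); }
};

// Same-shape (no-broadcast) backward pass: one functor call per element.
// Note dy uses dy_op -- exactly the bug fixed below, where dx_op_ had been
// applied to both outputs.
template <typename T, typename DX_OP, typename DY_OP>
void ElemwiseGradNoBroadcastSketch(const T* x, const T* y, const T* out,
                                   const T* dout, size_t n, DX_OP dx_op,
                                   DY_OP dy_op, T* dx, T* dy) {
  for (size_t i = 0; i < n; ++i) {
    if (dx != nullptr) dx[i] = dx_op(x[i], y[i], out[i], dout[i]);
    if (dy != nullptr) dy[i] = dy_op(x[i], y[i], out[i], dout[i]);
  }
}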
class ElementwiseMaxKernel : public framework::OpKernel { }; template -struct ElementwiseMaxGradFunctor { - template - void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz) { - auto x_e = framework::EigenVector::Flatten(*x); - auto y_e = framework::EigenVector::Flatten(*y); - auto dz_e = framework::EigenVector::Flatten(*dz); - - if (dx) { - auto dx_e = framework::EigenVector::Flatten(*dx); - dx_e.device(d) = (x_e > y_e).template cast() * dz_e; - } - if (dy) { - auto dy_e = framework::EigenVector::Flatten(*dy); - dy_e.device(d) = (x_e <= y_e).template cast() * dz_e; - } +struct MaxGradDx { + HOSTDEVICE T operator()(T x, T y, T out, T dout) const { + return dout * (x > y); } }; template -struct ElementwiseMaxBroadCastGradFunctor { - template - void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n) { - auto x_e = framework::EigenVector::Flatten(*x); - auto y_e = framework::EigenVector::Flatten(*y); - auto dz_e = framework::EigenVector::Flatten(*dz); - - auto y_e_bcast = y_e.reshape(Eigen::DSizes(1, n)) - .broadcast(Eigen::DSizes(pre, 1)) - .reshape(Eigen::DSizes(x_e.size())); - - if (dx) { - auto dx_e = framework::EigenVector::Flatten(*dx); - dx_e.device(d) = (x_e > y_e_bcast).template cast() * dz_e; - } - - if (dy) { - auto dy_e = framework::EigenVector::Flatten(*dy); - dy_e.device(d) = ((x_e <= y_e_bcast).template cast() * dz_e) - .reshape(Eigen::DSizes(pre, n)) - .sum(Eigen::array{{0}}); - } - } -}; - -template -struct ElementwiseMaxBroadCast2GradFunctor { - template - void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n, - Post post) { - auto x_e = framework::EigenVector::Flatten(*x); - auto y_e = framework::EigenVector::Flatten(*y); - auto dz_e = framework::EigenVector::Flatten(*dz); - - auto y_e_bcast = y_e.reshape(Eigen::DSizes(1, n, 1)) - .broadcast(Eigen::DSizes(pre, 1, post)) - .reshape(Eigen::DSizes(x_e.size())); - if (dx) { - auto dx_e = framework::EigenVector::Flatten(*dx); - dx_e.device(d) = (x_e > y_e_bcast).template cast() * dz_e; - } - - if (dy) { - auto dy_e = framework::EigenVector::Flatten(*dy); - dy_e.device(d) = ((x_e <= y_e_bcast).template cast() * dz_e) - .reshape(Eigen::DSizes(pre, n, post)) - .sum(Eigen::array{{0, 2}}); - } +struct MaxGradDy { + HOSTDEVICE T operator()(T x, T y, T out, T dout) const { + return dout * (x <= y); } }; @@ -127,12 +67,9 @@ class ElementwiseMaxGradKernel : public framework::OpKernel { auto* dx = ctx.Output(framework::GradVarName("X")); auto* dy = ctx.Output(framework::GradVarName("Y")); int axis = ctx.Attr("axis"); - ElementwiseGradCompute, - ElementwiseMaxBroadCastGradFunctor, - ElementwiseMaxBroadCast2GradFunctor>( - ctx, x, y, out, dout, axis, dx, dy); + ElemwiseGradCompute, MaxGradDy>( + ctx, *x, *y, *out, *dout, axis, dx, dy, MaxGradDx(), MaxGradDy()); } }; - } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/elementwise_min_op.h b/paddle/fluid/operators/elementwise_min_op.h index f0eec9d2468b519473f813d71127d670adb08250..d4e5831463f3e54c72789b6876ea696cf1b4ef4b 100644 --- a/paddle/fluid/operators/elementwise_min_op.h +++ b/paddle/fluid/operators/elementwise_min_op.h @@ -41,76 +41,16 @@ class ElementwiseMinKernel : public framework::OpKernel { }; template -struct ElementwiseMinGradFunctor { - template - void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz) { - auto x_e = framework::EigenVector::Flatten(*x); - auto y_e = framework::EigenVector::Flatten(*y); - auto dz_e = framework::EigenVector::Flatten(*dz); - - if (dx) { - auto dx_e = 
framework::EigenVector::Flatten(*dx); - dx_e.device(d) = (x_e < y_e).template cast() * dz_e; - } - if (dy) { - auto dy_e = framework::EigenVector::Flatten(*dy); - dy_e.device(d) = (x_e >= y_e).template cast() * dz_e; - } +struct MinGradDx { + HOSTDEVICE T operator()(T x, T y, T out, T dout) const { + return dout * (x < y); } }; template -struct ElementwiseMinBroadCastGradFunctor { - template - void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n) { - auto x_e = framework::EigenVector::Flatten(*x); - auto y_e = framework::EigenVector::Flatten(*y); - auto dz_e = framework::EigenVector::Flatten(*dz); - - auto y_e_bcast = y_e.reshape(Eigen::DSizes(1, n)) - .broadcast(Eigen::DSizes(pre, 1)) - .reshape(Eigen::DSizes(x_e.size())); - - if (dx) { - auto dx_e = framework::EigenVector::Flatten(*dx); - dx_e.device(d) = (x_e < y_e_bcast).template cast() * dz_e; - } - - if (dy) { - auto dy_e = framework::EigenVector::Flatten(*dy); - dy_e.device(d) = ((x_e >= y_e_bcast).template cast() * dz_e) - .reshape(Eigen::DSizes(pre, n)) - .sum(Eigen::array{{0}}); - } - } -}; - -template -struct ElementwiseMinBroadCast2GradFunctor { - template - void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n, - Post post) { - auto x_e = framework::EigenVector::Flatten(*x); - auto y_e = framework::EigenVector::Flatten(*y); - auto dz_e = framework::EigenVector::Flatten(*dz); - - auto y_e_bcast = y_e.reshape(Eigen::DSizes(1, n, 1)) - .broadcast(Eigen::DSizes(pre, 1, post)) - .reshape(Eigen::DSizes(x_e.size())); - if (dx) { - auto dx_e = framework::EigenVector::Flatten(*dx); - dx_e.device(d) = (x_e < y_e_bcast).template cast() * dz_e; - } - - if (dy) { - auto dy_e = framework::EigenVector::Flatten(*dy); - dy_e.device(d) = ((x_e >= y_e_bcast).template cast() * dz_e) - .reshape(Eigen::DSizes(pre, n, post)) - .sum(Eigen::array{{0, 2}}); - } +struct MinGradDy { + HOSTDEVICE T operator()(T x, T y, T out, T dout) const { + return dout * (x >= y); } }; @@ -127,12 +67,9 @@ class ElementwiseMinGradKernel : public framework::OpKernel { auto* dx = ctx.Output(framework::GradVarName("X")); auto* dy = ctx.Output(framework::GradVarName("Y")); int axis = ctx.Attr("axis"); - ElementwiseGradCompute, - ElementwiseMinBroadCastGradFunctor, - ElementwiseMinBroadCast2GradFunctor>( - ctx, x, y, out, dout, axis, dx, dy); + ElemwiseGradCompute, MinGradDy>( + ctx, *x, *y, *out, *dout, axis, dx, dy, MinGradDx(), MinGradDy()); } }; - } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/elementwise_mul_op.h b/paddle/fluid/operators/elementwise_mul_op.h index 46d69ed87d4c2eeae674730b00858b6358b97773..dc73cb6f23614504640283af01981d3f69e89126 100644 --- a/paddle/fluid/operators/elementwise_mul_op.h +++ b/paddle/fluid/operators/elementwise_mul_op.h @@ -40,78 +40,13 @@ class ElementwiseMulKernel : public framework::OpKernel { }; template -struct ElementwiseMulGradFunctor { - template - void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz) { - auto x_e = framework::EigenVector::Flatten(*x); - auto y_e = framework::EigenVector::Flatten(*y); - auto dz_e = framework::EigenVector::Flatten(*dz); - - if (dx) { - auto dx_e = framework::EigenVector::Flatten(*dx); - dx_e.device(d) = dz_e * y_e; - } - - if (dy) { - auto dy_e = framework::EigenVector::Flatten(*dy); - dy_e.device(d) = x_e * dz_e; - } - } -}; - -template -struct ElementwiseMulBroadCastGradFunctor { - template - void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n) { - auto x_e = 
framework::EigenVector::Flatten(*x); - auto y_e = framework::EigenVector::Flatten(*y); - auto dz_e = framework::EigenVector::Flatten(*dz); - - auto y_e_bcast = y_e.reshape(Eigen::DSizes(1, n)) - .broadcast(Eigen::DSizes(pre, 1)) - .reshape(Eigen::DSizes(x_e.size())); - - if (dx) { - auto dx_e = framework::EigenVector::Flatten(*dx); - dx_e.device(d) = dz_e * y_e_bcast; - } - - if (dy) { - auto dy_e = framework::EigenVector::Flatten(*dy); - dy_e.device(d) = (x_e * dz_e) - .reshape(Eigen::DSizes(pre, n)) - .sum(Eigen::array{{0}}); - } - } +struct MulGradDX { + HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout * y; } }; template -struct ElementwiseMulBroadCast2GradFunctor { - template - void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n, - Post post) { - auto x_e = framework::EigenVector::Flatten(*x); - auto y_e = framework::EigenVector::Flatten(*y); - auto dz_e = framework::EigenVector::Flatten(*dz); - - auto y_e_bcast = y_e.reshape(Eigen::DSizes(1, n, 1)) - .broadcast(Eigen::DSizes(pre, 1, post)) - .reshape(Eigen::DSizes(x_e.size())); - if (dx) { - auto dx_e = framework::EigenVector::Flatten(*dx); - dx_e.device(d) = dz_e * y_e_bcast; - } - - if (dy) { - auto dy_e = framework::EigenVector::Flatten(*dy); - dy_e.device(d) = (x_e * dz_e) - .reshape(Eigen::DSizes(pre, n, post)) - .sum(Eigen::array{{0, 2}}); - } - } +struct MulGradDY { + HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout * x; } }; template @@ -127,12 +62,9 @@ class ElementwiseMulGradKernel : public framework::OpKernel { auto* dx = ctx.Output(framework::GradVarName("X")); auto* dy = ctx.Output(framework::GradVarName("Y")); int axis = ctx.Attr("axis"); - ElementwiseGradCompute, - ElementwiseMulBroadCastGradFunctor, - ElementwiseMulBroadCast2GradFunctor>( - ctx, x, y, out, dout, axis, dx, dy); + ElemwiseGradCompute, MulGradDY>( + ctx, *x, *y, *out, *dout, axis, dx, dy, MulGradDX(), MulGradDY()); } }; - } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/elementwise_op_function.h b/paddle/fluid/operators/elementwise_op_function.h index ffda53a383ced411415e528886a23f28f6a62648..0b4238436ffcc586fe8bc7abbe4cfbc1654dcb88 100644 --- a/paddle/fluid/operators/elementwise_op_function.h +++ b/paddle/fluid/operators/elementwise_op_function.h @@ -301,7 +301,7 @@ struct ElemwiseGradNoBroadcast { dx_[i] = dx_op_(x_[i], y_[i], out_[i], dout_[i]); } if (dy_ != nullptr) { - dy_[i] = dx_op_(x_[i], y_[i], out_[i], dout_[i]); + dy_[i] = dy_op_(x_[i], y_[i], out_[i], dout_[i]); } } diff --git a/paddle/fluid/operators/elementwise_sub_op.h b/paddle/fluid/operators/elementwise_sub_op.h index a8fc242ed79ecee77dc5bb56882a1a15f872fccf..fe088b8203722a43b9aba7be3878b8f4ca68ba12 100644 --- a/paddle/fluid/operators/elementwise_sub_op.h +++ b/paddle/fluid/operators/elementwise_sub_op.h @@ -40,61 +40,13 @@ class ElementwiseSubKernel : public framework::OpKernel { }; template -struct ElementwiseSubGradFunctor { - template - void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz) { - auto dz_e = framework::EigenVector::Flatten(*dz); - if (dx) { - auto dx_e = framework::EigenVector::Flatten(*dx); - dx_e.device(d) = dz_e; - } - if (dy) { - auto dy_e = framework::EigenVector::Flatten(*dy); - dy_e.device(d) = (-1.0) * dz_e; - } - } +struct SubGradDX { + HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return dout; } }; template -struct ElementwiseSubBroadCastGradFunctor { - template - void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n) { - auto 
dz_e = framework::EigenVector::Flatten(*dz); - if (dx) { - auto dx_e = framework::EigenVector::Flatten(*dx); - dx_e.device(d) = dz_e; - } - - if (dy) { - auto dy_e = framework::EigenVector::Flatten(*dy); - dy_e.device(d) = (-1.0) * - dz_e.reshape(Eigen::DSizes(pre, n)) - .sum(Eigen::array{{0}}); - } - } -}; - -template -struct ElementwiseSubBroadCast2GradFunctor { - template - void operator()(Device d, X x, Y y, Z z, dX dx, dY dy, dZ dz, Pre pre, N n, - Post post) { - auto dz_e = framework::EigenVector::Flatten(*dz); - if (dx) { - auto dx_e = framework::EigenVector::Flatten(*dx); - dx_e.device(d) = dz_e; - } - - if (dy) { - auto dy_e = framework::EigenVector::Flatten(*dy); - dy_e.device(d) = (-1.0) * - dz_e.reshape(Eigen::DSizes(pre, n, post)) - .sum(Eigen::array{{0, 2}}); - } - } +struct SubGradDY { + HOSTDEVICE T operator()(T x, T y, T out, T dout) const { return -dout; } }; template @@ -110,12 +62,9 @@ class ElementwiseSubGradKernel : public framework::OpKernel { auto* dx = ctx.Output(framework::GradVarName("X")); auto* dy = ctx.Output(framework::GradVarName("Y")); int axis = ctx.Attr("axis"); - ElementwiseGradCompute, - ElementwiseSubBroadCastGradFunctor, - ElementwiseSubBroadCast2GradFunctor>( - ctx, x, y, out, dout, axis, dx, dy); + ElemwiseGradCompute, SubGradDY>( + ctx, *x, *y, *out, *dout, axis, dx, dy, SubGradDX(), SubGradDY()); } }; - } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt index 768106fadf355ea6fb148491e232dc0ef1453a75..a181d802262d15b188060dae4330cec0e24714ab 100644 --- a/paddle/fluid/operators/math/CMakeLists.txt +++ b/paddle/fluid/operators/math/CMakeLists.txt @@ -1,46 +1,59 @@ add_subdirectory(detail) -if(WITH_GPU) - nv_library(math_function SRCS math_function.cc math_function.cu im2col.cc im2col.cu DEPS cblas device_context framework_proto) - nv_test(math_function_gpu_test SRCS math_function_test.cu DEPS math_function tensor) - nv_library(selected_rows_functor SRCS selected_rows_functor.cc selected_rows_functor.cu DEPS selected_rows math_function) - nv_test(selected_rows_functor_gpu_test SRCS selected_rows_functor_test.cu DEPS selected_rows_functor) - nv_library(softmax SRCS softmax.cc softmax.cu DEPS device_context) - nv_library(cross_entropy SRCS cross_entropy.cc cross_entropy.cu DEPS device_context) - nv_library(pooling SRCS pooling.cc pooling.cu DEPS device_context) - nv_library(depthwise_conv SRCS depthwise_conv.cu DEPS device_context) - nv_library(sequence_pooling SRCS sequence_pooling.cc sequence_pooling.cu DEPS device_context math_function) - nv_library(vol2col SRCS vol2col.cc vol2col.cu DEPS device_context tensor) - nv_library(context_project SRCS context_project.cc context_project.cu DEPS device_context math_function) - nv_library(sequence2batch SRCS sequence2batch.cc sequence2batch.cu DEPS device_context tensor math_function) - nv_library(sequence_padding SRCS sequence_padding.cc sequence_padding.cu DEPS lod_tensor device_context) - nv_library(sequence_scale SRCS sequence_scale.cc sequence_scale.cu DEPS lod_tensor device_context) - nv_library(lstm_compute SRCS lstm_compute.cc lstm_compute.cu DEPS device_context activation_functions) - nv_library(maxouting SRCS maxouting.cc maxouting.cu DEPS device_context) - nv_library(unpooling SRCS unpooling.cc unpooling.cu DEPS device_context) - nv_library(gru_compute SRCS gru_compute.cc gru_compute.cu DEPS device_context activation_functions math_function) - nv_library(cos_sim_functor SRCS cos_sim_functor.cc 
cos_sim_functor.cu DEPS device_context) -else() - cc_library(math_function SRCS math_function.cc im2col.cc DEPS cblas device_context framework_proto) - cc_library(selected_rows_functor SRCS selected_rows_functor.cc DEPS selected_rows math_function) - cc_library(softmax SRCS softmax.cc DEPS device_context) - cc_library(cross_entropy SRCS cross_entropy.cc DEPS device_context) - cc_library(pooling SRCS pooling.cc DEPS device_context) - cc_library(sequence_pooling SRCS sequence_pooling.cc DEPS device_context math_function) - cc_library(vol2col SRCS vol2col.cc DEPS device_context tensor) - cc_library(context_project SRCS context_project.cc DEPS device_context math_function) - cc_library(sequence2batch SRCS sequence2batch.cc DEPS device_context tensor math_function) - cc_library(sequence_padding SRCS sequence_padding.cc DEPS lod_tensor device_context) - cc_library(sequence_scale SRCS sequence_scale.cc DEPS lod_tensor device_context) - cc_library(lstm_compute SRCS lstm_compute.cc DEPS device_context activation_functions) - cc_library(maxouting SRCS maxouting.cc DEPS device_context) - cc_library(unpooling SRCS unpooling.cc DEPS device_context) - cc_library(gru_compute SRCS gru_compute.cc DEPS device_context activation_functions math_function) - cc_library(cos_sim_functor SRCS cos_sim_functor.cc DEPS device_context) -endif() +function(math_library TARGET) + # math_library is a function to create a math library. + # The interface is the same as cc_library. + # But it handles splitting GPU/CPU code and links some common libraries. + set(cc_srcs) + set(cu_srcs) + set(math_common_deps device_context framework_proto) + set(multiValueArgs DEPS) + cmake_parse_arguments(math_library "${options}" "${oneValueArgs}" + "${multiValueArgs}" ${ARGN}) + + if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cc) + list(APPEND cc_srcs ${TARGET}.cc) + endif() + if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu) + list(APPEND cu_srcs ${TARGET}.cu) + endif() + + list(LENGTH cc_srcs cc_srcs_len) + if (WITH_GPU) + nv_library(${TARGET} SRCS ${cc_srcs} ${cu_srcs} DEPS ${math_library_DEPS} ${math_common_deps}) + elseif(${cc_srcs_len} GREATER 0) + cc_library(${TARGET} SRCS ${cc_srcs} DEPS ${math_library_DEPS} ${math_common_deps}) + endif() +endfunction() -cc_test(math_function_test SRCS math_function_test.cc DEPS math_function tensor) +# Please add new math_library targets in alphabetical order. +math_library(concat) +math_library(context_project DEPS im2col math_function) +math_library(cross_entropy) +math_library(cos_sim_functor) +math_library(depthwise_conv) +math_library(gru_compute DEPS activation_functions math_function) +math_library(im2col) +math_library(lstm_compute DEPS activation_functions) +math_library(math_function DEPS cblas) +math_library(maxouting) +math_library(pooling) +math_library(selected_rows_functor DEPS selected_rows) +math_library(sequence2batch) +math_library(sequence_padding) +math_library(sequence_pooling DEPS math_function) +math_library(sequence_scale) +math_library(softmax) +math_library(unpooling) +math_library(vol2col) + +cc_test(math_function_test SRCS math_function_test.cc) cc_test(selected_rows_functor_test SRCS selected_rows_functor_test.cc DEPS selected_rows_functor) -cc_test(im2col_test SRCS im2col_test.cc DEPS math_function tensor) -cc_test(vol2col_test SRCS vol2col_test.cc DEPS vol2col tensor) +cc_test(im2col_test SRCS im2col_test.cc DEPS im2col) +cc_test(vol2col_test SRCS vol2col_test.cc DEPS vol2col) cc_test(sequence_padding_test SRCS sequence_padding_test.cc DEPS sequence_padding) +if(WITH_GPU) +
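# (Editor's note on the function above: math_library(foo DEPS bar) picks up
#  foo.cc and/or foo.cu automatically. With WITH_GPU both sources go into an
#  nv_library; otherwise only the .cc goes into a cc_library, so a CUDA-only
#  target such as depthwise_conv simply builds nothing in a CPU-only build.)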
nv_test(math_function_gpu_test SRCS math_function_test.cu) + nv_test(selected_rows_functor_gpu_test SRCS selected_rows_functor_test.cu DEPS selected_rows_functor) +endif() +cc_test(concat_test SRCS concat_test.cc DEPS concat) diff --git a/paddle/fluid/operators/math/concat.cc b/paddle/fluid/operators/math/concat.cc new file mode 100644 index 0000000000000000000000000000000000000000..b542143419e05e9baf29e9a2322447f32ddd9829 --- /dev/null +++ b/paddle/fluid/operators/math/concat.cc @@ -0,0 +1,119 @@ +/* Copyright (c) 2018 paddlepaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/math/concat.h" + +namespace paddle { +namespace operators { +namespace math { + +/* + * All tensors' dimension should be the same and the values of + * each dimension are the same, except the axis dimension. + */ +template +class ConcatFunctor { + public: + void operator()(const platform::CPUDeviceContext& context, + const std::vector& input, const int axis, + framework::Tensor* output) { + // TODO(zcd): Add input data validity checking + int num = input.size(); + + int rows = 1; + auto dim_0 = input[0].dims(); + for (int i = 0; i < axis; ++i) { + rows *= dim_0[i]; + } + int out_rows = rows, out_cols = 0; + + std::vector input_cols(input.size()); + for (int i = 0; i < num; ++i) { + int t_cols = input[i].numel() / rows; + out_cols += t_cols; + input_cols[i] = t_cols; + } + auto& cpu_place = boost::get(context.GetPlace()); + + // computation + for (int k = 0; k < out_rows; ++k) { + T* dst_ptr = output->data() + k * out_cols; + int col_idx = 0; + for (int j = 0; j < num; ++j) { + int col_len = input_cols[j]; + const T* src_prt = input[j].data() + k * col_len; + memory::Copy(cpu_place, dst_ptr + col_idx, cpu_place, src_prt, + sizeof(T) * col_len); + col_idx += col_len; + } + } + } +}; + +/* + * All tensors' dimension should be the same and the values of + * each dimension are the same, except the axis dimension. 
+ */ +template +class ConcatGradFunctor { + public: + void operator()(const platform::CPUDeviceContext& context, + const framework::Tensor& input, const int axis, + std::vector& outputs) { + // TODO(zcd): Add input data validity checking + int num = outputs.size(); + + int input_rows = 1; + auto dim_0 = outputs[0].dims(); + for (int i = 0; i < axis; ++i) { + input_rows *= dim_0[i]; + } + int input_cols = 0; + + std::vector output_cols(outputs.size()); + for (int i = 0; i < num; ++i) { + int t_cols = outputs[i].numel() / input_rows; + input_cols += t_cols; + output_cols[i] = t_cols; + } + auto& cpu_place = boost::get(context.GetPlace()); + + // computation + for (int k = 0; k < input_rows; ++k) { + const T* src_ptr = input.data() + k * input_cols; + int col_idx = 0; + for (int j = 0; j < num; ++j) { + int col_len = output_cols[j]; + T* dst_ptr = outputs[j].data() + k * col_len; + memory::Copy(cpu_place, dst_ptr, cpu_place, src_ptr + col_idx, + sizeof(T) * col_len); + col_idx += col_len; + } + } + } +}; + +template class ConcatFunctor; +template class ConcatFunctor; +template class ConcatFunctor; +template class ConcatFunctor; + +template class ConcatGradFunctor; +template class ConcatGradFunctor; +template class ConcatGradFunctor; +template class ConcatGradFunctor; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/concat.cu b/paddle/fluid/operators/math/concat.cu new file mode 100644 index 0000000000000000000000000000000000000000..60b266f08fb2d4217c5933902d69de96fc2abe22 --- /dev/null +++ b/paddle/fluid/operators/math/concat.cu @@ -0,0 +1,280 @@ +/* Copyright (c) 2018 paddlepaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/mixed_vector.h" +#include "paddle/fluid/operators/math/concat.h" +#include "paddle/fluid/platform/cuda_helper.h" + +namespace paddle { +namespace operators { +namespace math { + +template +__device__ T upper_bound(const T* first, T count, T val) { + const T* orig = first; + const T* it = nullptr; + T step = 0; + while (count > 0) { + it = first; + step = count / 2; + it += step; + if (!(val < *it)) { + first = ++it; + count -= step + 1; + } else { + count = step; + } + } + return first - orig; +} + +template +__global__ void KernelConcat(T** inputs, const int* input_cols, int col_size, + const int output_rows, const int output_cols, + T* output) { + int tid_x = blockIdx.x * blockDim.x + threadIdx.x; + int segment = upper_bound(input_cols, col_size, tid_x) - 1; + + int curr_offset = input_cols[segment]; + int curr_segment = segment; + for (; tid_x < output_cols; tid_x += blockDim.x * gridDim.x) { + T curr_col_offset; + while ((curr_col_offset = input_cols[curr_segment + 1]) <= tid_x) { + curr_offset = curr_col_offset; + ++curr_segment; + } + + int local_col = tid_x - curr_offset; + int segment_width = curr_col_offset - curr_offset; + T* input_ptr = inputs[curr_segment]; + int tid_y = blockIdx.y * blockDim.y + threadIdx.y; + for (; tid_y < output_rows; tid_y += blockDim.y * gridDim.y) + output[tid_y * output_cols + tid_x] = + input_ptr[tid_y * segment_width + local_col]; + } +} + +template +__global__ void KernelConcat(T** inputs, const int input_col, + const int output_rows, const int output_cols, + T* output) { + int tid_x = blockIdx.x * blockDim.x + threadIdx.x; + double inv_input_col = 1.0 / input_col; + for (; tid_x < output_cols; tid_x += blockDim.x * gridDim.x) { + int split = tid_x * inv_input_col; + int in_offset = tid_x - split * input_col; + T* input_ptr = inputs[split]; + int tid_y = blockIdx.y * blockDim.y + threadIdx.y; + for (; tid_y < output_rows; tid_y += blockDim.y * gridDim.y) { + output[tid_y * output_cols + tid_x] = + input_ptr[tid_y * input_col + in_offset]; + } + } +} + +template +__global__ void KernelConcatGrad(const T* input, const int input_row, + const int input_col, const int* output_cols, + int col_size, T** outputs) { + int tid_x = blockIdx.x * blockDim.x + threadIdx.x; + int segment = upper_bound(output_cols, col_size, tid_x) - 1; + int curr_offset = output_cols[segment]; + int curr_segment = segment; + for (; tid_x < input_col; tid_x += blockDim.x * gridDim.x) { + T curr_col_offset; + while ((curr_col_offset = output_cols[curr_segment + 1]) <= tid_x) { + curr_offset = curr_col_offset; + ++curr_segment; + } + + int local_col = tid_x - curr_offset; + int segment_width = curr_col_offset - curr_offset; + T* output_ptr = outputs[curr_segment]; + int tid_y = blockIdx.y * blockDim.y + threadIdx.y; + for (; tid_y < input_row; tid_y += blockDim.y * gridDim.y) + output_ptr[tid_y * segment_width + local_col] = + input[tid_y * input_col + tid_x]; + } +} + +template +__global__ void KernelConcatGrad(const T* input, const int input_row, + const int input_col, const int output_cols, + T** outputs) { + int tid_x = blockIdx.x * blockDim.x + threadIdx.x; + double inv_input_col = 1.0 / input_col; + for (; tid_x < input_col; tid_x += blockDim.x * gridDim.x) { + int split = tid_x * inv_input_col; + int in_offset = tid_x - split * input_col; + T* output_ptr = outputs[split]; + int tid_y = blockIdx.y * blockDim.y + threadIdx.y; + for (; tid_y < input_row; tid_y += blockDim.y * gridDim.y) + output_ptr[tid_y * output_cols + in_offset] = + 
input[tid_y * input_col + tid_x]; + } +} + +/* + * All tensors' dimension should be the same and the values of + * each dimension are the same, except the axis dimension. + */ +template +class ConcatFunctor { + public: + void operator()(const platform::CUDADeviceContext& context, + const std::vector& input, const int axis, + framework::Tensor* output) { + // TODO(zcd): Add input data validity checking + int num = input.size(); + int rows = 1; + auto dim_0 = input[0].dims(); + for (int i = 0; i < axis; ++i) { + rows *= dim_0[i]; + } + int cols = input[0].numel() / rows; + int out_rows = rows, out_cols = 0; + + framework::Vector inputs_data(num * sizeof(T*) / 2); + framework::Vector inputs_cols(num + 1); + inputs_cols[0] = 0; + T** inputs_ptr = reinterpret_cast(inputs_data.data()); + + bool sameShape = true; + for (int i = 0; i < num; ++i) { + int t_cols = input[i].numel() / rows; + if (sameShape) { + if (t_cols != cols) sameShape = false; + } + out_cols += t_cols; + inputs_cols[i + 1] = out_cols; + inputs_ptr[i] = const_cast(input[i].data()); + } + + T** ins_gpu = + reinterpret_cast(inputs_data.CUDAMutableData(context.GetPlace())); + const int* ins_col_gpu = inputs_cols.CUDAData(context.GetPlace()); + + // computation + // set the thread block and grid according to CurrentDeviceId + const int kThreadsPerBlock = 1024; + int block_cols = kThreadsPerBlock; + if (out_cols < kThreadsPerBlock) { // block_cols is aligned by 32. + block_cols = ((out_cols + 31) >> 5) << 5; + } + int block_rows = kThreadsPerBlock / block_cols; + dim3 block_size = dim3(block_cols, block_rows, 1); + + int max_threads = context.GetMaxPhysicalThreadCount(); + int max_blocks = std::max(max_threads / kThreadsPerBlock, 1); + + int grid_cols = + std::min((out_cols + block_cols - 1) / block_cols, max_blocks); + int grid_rows = + std::min(max_blocks / grid_cols, std::max(out_rows / block_rows, 1)); + dim3 grid_size = dim3(grid_cols, grid_rows, 1); + + if (sameShape) { + KernelConcat<<>>( + ins_gpu, cols, out_rows, out_cols, output->data()); + } else { + KernelConcat<<>>( + ins_gpu, ins_col_gpu, static_cast(inputs_cols.size()), out_rows, + out_cols, output->data()); + } + } +}; + +/* + * All tensors' dimension should be the same and the values of + * each dimension are the same, except the axis dimension. + */ +template +class ConcatGradFunctor { + public: + void operator()(const platform::CUDADeviceContext& context, + const framework::Tensor& input, const int axis, + std::vector& outputs) { + // TODO(zcd): Add input data validity checking + int num = outputs.size(); + int input_row = 1; + auto dim_0 = outputs[0].dims(); + for (int i = 0; i < axis; ++i) { + input_row *= dim_0[i]; + } + + int output_col_0 = outputs[0].numel() / input_row; + int input_col = 0; + bool sameShape = true; + + framework::Vector outputs_data(num * sizeof(T*) / 2); + framework::Vector outputs_cols(num + 1); + outputs_cols[0] = 0; + T** outputs_ptr = reinterpret_cast(outputs_data.data()); + + for (int i = 0; i < num; ++i) { + int t_col = outputs[i].numel() / input_row; + if (sameShape) { + if (t_col != output_col_0) sameShape = false; + } + input_col += t_col; + outputs_cols[i + 1] = input_col; + outputs_ptr[i] = outputs[i].data(); + } + + T** outs_gpu = + reinterpret_cast(outputs_data.CUDAMutableData(context.GetPlace())); + const int* outs_col_gpu = outputs_cols.CUDAData(context.GetPlace()); + + // computation + const int kThreadsPerBlock = 1024; + int block_cols = kThreadsPerBlock; + if (input_col < kThreadsPerBlock) { // block_cols is aligned by 32. 
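// (Editor's sketch of the launch geometry used here and in ConcatFunctor
//  above: clamp the block's x-dimension to the column count rounded up to a
//  warp of 32 threads, give the remaining threads of the 1024-thread block
//  to rows, then size the grid so roughly GetMaxPhysicalThreadCount()
//  threads are resident:
//    block_cols = min(1024, round_up_to_32(cols));
//    block_rows = 1024 / block_cols;
//    grid_cols  = min(ceil_div(cols, block_cols), max_blocks);
//    grid_rows  = min(max_blocks / grid_cols, max(rows / block_rows, 1));)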
+ block_cols = ((input_col + 31) >> 5) << 5; + } + int block_rows = kThreadsPerBlock / block_cols; + dim3 block_size = dim3(block_cols, block_rows, 1); + + int max_threads = context.GetMaxPhysicalThreadCount(); + int max_blocks = std::max(max_threads / kThreadsPerBlock, 1); + + int grid_cols = + std::min((input_col + block_cols - 1) / block_cols, max_blocks); + int grid_rows = + std::min(max_blocks / grid_cols, std::max(input_row / block_rows, 1)); + dim3 grid_size = dim3(grid_cols, grid_rows, 1); + + if (sameShape) { + KernelConcatGrad<<>>( + input.data(), input_row, input_col, output_col_0, outs_gpu); + } else { + KernelConcatGrad<<>>( + input.data(), input_row, input_col, outs_col_gpu, + static_cast(outputs_cols.size()), outs_gpu); + } + } +}; + +template class ConcatFunctor; +template class ConcatFunctor; +template class ConcatFunctor; +template class ConcatFunctor; + +template class ConcatGradFunctor; +template class ConcatGradFunctor; +template class ConcatGradFunctor; +template class ConcatGradFunctor; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/concat.h b/paddle/fluid/operators/math/concat.h new file mode 100644 index 0000000000000000000000000000000000000000..22147d79e4b1eeee76f7445dd963bf5062049a34 --- /dev/null +++ b/paddle/fluid/operators/math/concat.h @@ -0,0 +1,63 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/tensor.h" + +namespace paddle { +namespace operators { +namespace math { + +/* + * \brief Concatenate the input tensors along the dimension axis. + * TODO(zcd): maybe it needs to be more detailed. + * Examples: + * Input[0] = [[1,2],[3,4]] + * Input[1] = [[5,6]] + * axis = 0 + * + * Output = [[1,2], + * [3,4], + * [5,6]] + */ +template +class ConcatFunctor { + public: + void operator()(const DeviceContext& context, + const std::vector& input, const int axis, + framework::Tensor* output); +}; + +/* + * \brief Split the input tensors along the dimension axis into outputs. + * TODO(zcd): maybe it needs to be more detailed. + * Examples: + * Input = [[1,2], + * [3,4], + * [5,6]] + * axis = 0 + * + * Output[0] = [[1,2],[3,4]] + * Output[1] = [[5,6]] + */ +template +class ConcatGradFunctor { + public: + void operator()(const DeviceContext& context, const framework::Tensor& input, + const int axis, std::vector& outputs); +}; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/concat_test.cc b/paddle/fluid/operators/math/concat_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..1741af8148bb90863f294ba4930006a58b5ddbf9 --- /dev/null +++ b/paddle/fluid/operators/math/concat_test.cc @@ -0,0 +1,336 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/math/concat.h" +#include <gtest/gtest.h> +#include <vector> +#include "paddle/fluid/framework/tensor_util.h" + +using namespace paddle::framework; +using namespace paddle::platform; + +template <typename DeviceContext, typename Place> +void testConcat() { + Tensor input_a_cpu; + Tensor input_b_cpu; + Tensor out_cpu; + Tensor input_a; + Tensor input_b; + Tensor out; + + DeviceContext* context = new DeviceContext(Place()); + // DeviceContext context(Place()); + + /** + * case 1: + * inputs: + * t_a.shape: [2, 3, 4] + * t_b.shape: [3, 3, 4] + * output: + * out.shape: [5, 3, 4] + */ + auto dim_a = make_ddim({2, 3, 4}); + auto dim_b = make_ddim({3, 3, 4}); + auto dim_out = make_ddim({5, 3, 4}); + + input_a.mutable_data<int>(dim_a, Place()); + input_b.mutable_data<int>(dim_b, Place()); + out.mutable_data<int>(dim_out, Place()); + + if (paddle::platform::is_gpu_place(Place())) { + input_a_cpu.mutable_data<int>(dim_a, CPUPlace()); + input_b_cpu.mutable_data<int>(dim_b, CPUPlace()); + out_cpu.mutable_data<int>(dim_out, CPUPlace()); + } + + int* a_ptr; + int* b_ptr; + if (paddle::platform::is_gpu_place(Place())) { + a_ptr = input_a_cpu.data<int>(); + b_ptr = input_b_cpu.data<int>(); + } else { + a_ptr = input_a.data<int>(); + b_ptr = input_b.data<int>(); + } + + for (int i = 0; i < 2 * 3 * 4; ++i) { + a_ptr[i] = i; + } + for (int i = 0; i < 3 * 3 * 4; ++i) { + b_ptr[i] = i; + } + + if (paddle::platform::is_gpu_place(Place())) { + TensorCopy(input_a_cpu, Place(), *context, &input_a); + TensorCopy(input_b_cpu, Place(), *context, &input_b); + } + + std::vector<Tensor> input; + input.push_back(input_a); + input.push_back(input_b); + + paddle::operators::math::ConcatFunctor<DeviceContext, int> concat_functor; + concat_functor(*context, input, 0, &out); + + // check the dim of input_a, input_b + PADDLE_ENFORCE_EQ(input_a.dims(), dim_a); + PADDLE_ENFORCE_EQ(input_b.dims(), dim_b); + + int* out_ptr; + if (paddle::platform::is_gpu_place(Place())) { + TensorCopy(out, CPUPlace(), *context, &out_cpu); + out_ptr = out_cpu.data<int>(); + } else { + out_ptr = out.data<int>(); + } + + int cols = 2 * 3 * 4; + int idx_a = 0, idx_b = 0; + for (int j = 0; j < 5 * 3 * 4; ++j) { + if (j >= cols) { + PADDLE_ENFORCE_EQ(out_ptr[j], b_ptr[idx_b]); + ++idx_b; + } else { + PADDLE_ENFORCE_EQ(out_ptr[j], a_ptr[idx_a]); + ++idx_a; + } + } + // + /** + * case 2: + * inputs: + * t_a.shape: [2, 3, 4] + * t_b.shape: [2, 4, 4] + * output: + * out.shape: [2, 7, 4] + */ + dim_a = make_ddim({2, 3, 4}); + dim_b = make_ddim({2, 4, 4}); + dim_out = make_ddim({2, 7, 4}); + + input_a.Resize(dim_a); + input_b.Resize(dim_b); + out.Resize(dim_out); + if (paddle::platform::is_gpu_place(Place())) { + input_a_cpu.Resize(dim_a); + input_b_cpu.Resize(dim_b); + out_cpu.Resize(dim_out); + } + + if (paddle::platform::is_gpu_place(Place())) { + a_ptr = input_a_cpu.data<int>(); + b_ptr = input_b_cpu.data<int>(); + } else { + a_ptr = input_a.data<int>(); + b_ptr = input_b.data<int>(); + } + + for (int i = 0; i < 2 * 3 * 4; ++i) { + a_ptr[i] = i; + } + for (int i = 0; i < 2 * 4 * 4; ++i) { + b_ptr[i] = i; + } + + if (paddle::platform::is_gpu_place(Place())) { + TensorCopy(input_a_cpu, Place(), *context, &input_a); + TensorCopy(input_b_cpu, Place(), *context, &input_b); + } + + input.clear(); + input.push_back(input_a); + input.push_back(input_b); + + concat_functor(*context, input, 1, &out); + + // check the dim of input_a, input_b + PADDLE_ENFORCE_EQ(input_a.dims(), dim_a); + PADDLE_ENFORCE_EQ(input_b.dims(), dim_b); + + if (paddle::platform::is_gpu_place(Place())) { + TensorCopy(out, CPUPlace(), *context, &out_cpu); + out_ptr = out_cpu.data<int>(); + } else { + out_ptr = out.data<int>(); + } + + cols = 3 * 4; + idx_a = 0, idx_b = 0; + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 28; ++j) { + if (j >= cols) { + PADDLE_ENFORCE_EQ(out_ptr[i * 28 + j], b_ptr[idx_b]); + ++idx_b; + } else { + PADDLE_ENFORCE_EQ(out_ptr[i * 28 + j], a_ptr[idx_a]); + ++idx_a; + } + } + } + + /** + * case 3: + * inputs: + * t_a.shape: [2, 3, 4] + * t_b.shape: [2, 3, 5] + * output: + * out.shape: [2, 3, 9] + */ + dim_a = make_ddim({2, 3, 4}); + dim_b = make_ddim({2, 3, 5}); + dim_out = make_ddim({2, 3, 9}); + + input_a.Resize(dim_a); + input_b.Resize(dim_b); + out.Resize(dim_out); + if (paddle::platform::is_gpu_place(Place())) { + input_a_cpu.Resize(dim_a); + input_b_cpu.Resize(dim_b); + out_cpu.Resize(dim_out); + } + + if (paddle::platform::is_gpu_place(Place())) { + a_ptr = input_a_cpu.data<int>(); + b_ptr = input_b_cpu.data<int>(); + } else { + a_ptr = input_a.data<int>(); + b_ptr = input_b.data<int>(); + } + + for (int i = 0; i < 2 * 3 * 4; ++i) { + a_ptr[i] = i; + } + for (int i = 0; i < 2 * 3 * 5; ++i) { + b_ptr[i] = i; + } + + if (paddle::platform::is_gpu_place(Place())) { + TensorCopy(input_a_cpu, Place(), *context, &input_a); + TensorCopy(input_b_cpu, Place(), *context, &input_b); + } + + input.clear(); + input.push_back(input_a); + input.push_back(input_b); + + concat_functor(*context, input, 2, &out); + + // check the dim of input_a, input_b + PADDLE_ENFORCE_EQ(input_a.dims(), dim_a); + PADDLE_ENFORCE_EQ(input_b.dims(), dim_b); + + if (paddle::platform::is_gpu_place(Place())) { + TensorCopy(out, CPUPlace(), *context, &out_cpu); + out_ptr = out_cpu.data<int>(); + } else { + out_ptr = out.data<int>(); + } + + // check the data + cols = 4; + idx_a = 0, idx_b = 0; + for (int i = 0; i < 6; ++i) { + for (int j = 0; j < 9; ++j) { + if (j >= cols) { + PADDLE_ENFORCE_EQ(out_ptr[i * 9 + j], b_ptr[idx_b]); + ++idx_b; + } else { + PADDLE_ENFORCE_EQ(out_ptr[i * 9 + j], a_ptr[idx_a]); + ++idx_a; + } + } + } + + /** + * case 4: + * inputs: + * axis = 1 + * t_a.shape: [2, 3, 4] + * t_b.shape: [2, 3, 4] + * output: + * out.shape: [2, 6, 4] + */ + dim_a = make_ddim({2, 3, 4}); + dim_b = make_ddim({2, 3, 4}); + dim_out = make_ddim({2, 6, 4}); + + input_a.Resize(dim_a); + input_b.Resize(dim_b); + out.Resize(dim_out); + if (paddle::platform::is_gpu_place(Place())) { + input_a_cpu.Resize(dim_a); + input_b_cpu.Resize(dim_b); + out_cpu.Resize(dim_out); + } + + if (paddle::platform::is_gpu_place(Place())) { + a_ptr = input_a_cpu.data<int>(); + b_ptr = input_b_cpu.data<int>(); + } else { + a_ptr = input_a.data<int>(); + b_ptr = input_b.data<int>(); + } + + for (int i = 0; i < 2 * 3 * 4; ++i) { + a_ptr[i] = i; + } + for (int i = 0; i < 2 * 3 * 4; ++i) { + b_ptr[i] = i; + } + + if (paddle::platform::is_gpu_place(Place())) { + TensorCopy(input_a_cpu, Place(), *context, &input_a); + TensorCopy(input_b_cpu, Place(), *context, &input_b); + } + + input.clear(); + input.push_back(input_a); + input.push_back(input_b); + + concat_functor(*context, input, 1, &out); + + // check the dim of input_a, input_b + PADDLE_ENFORCE_EQ(input_a.dims(), dim_a); + PADDLE_ENFORCE_EQ(input_b.dims(), dim_b); + + if (paddle::platform::is_gpu_place(Place())) { + TensorCopy(out, CPUPlace(), *context,
&out_cpu); + out_ptr = out_cpu.data(); + } else { + out_ptr = out.data(); + } + + // check the data + cols = 12; + idx_a = 0, idx_b = 0; + for (int i = 0; i < 2; ++i) { + for (int j = 0; j < 24; ++j) { + if (j >= cols) { + PADDLE_ENFORCE_EQ(out_ptr[i * 24 + j], b_ptr[idx_b]); + ++idx_b; + } else { + PADDLE_ENFORCE_EQ(out_ptr[i * 24 + j], a_ptr[idx_a]); + ++idx_a; + } + } + } +} + +TEST(math, concat) { + testConcat(); +#ifdef PADDLE_WITH_CUDA + testConcat(); +#endif +} diff --git a/paddle/fluid/operators/math/math_function.cc b/paddle/fluid/operators/math/math_function.cc index 41eab3ade207a592e1a72cd22580f18723167eff..35d251f71a0cb631d5900498ea3188b5ddeae334 100644 --- a/paddle/fluid/operators/math/math_function.cc +++ b/paddle/fluid/operators/math/math_function.cc @@ -15,11 +15,23 @@ limitations under the License. */ #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/operators/math/math_function_impl.h" +#include "paddle/fluid/platform/float16.h" namespace paddle { namespace operators { namespace math { +using float16 = paddle::platform::float16; + +template <> +void gemm( + const platform::CPUDeviceContext& context, const CBLAS_TRANSPOSE transA, + const CBLAS_TRANSPOSE transB, const int M, const int N, const int K, + const float16 alpha, const float16* A, const float16* B, const float16 beta, + float16* C) { + PADDLE_THROW("float16 GEMM not supported on CPU"); +} + template <> void gemm( const platform::CPUDeviceContext& context, const CBLAS_TRANSPOSE transA, @@ -46,6 +58,15 @@ void gemm( beta, C, ldc); } +template <> +void gemm( + const platform::CPUDeviceContext& context, const bool transA, + const bool transB, const int M, const int N, const int K, + const float16 alpha, const float16* A, const int lda, const float16* B, + const int ldb, const float16 beta, float16* C, const int ldc) { + PADDLE_THROW("float16 GEMM not supported on CPU"); +} + template <> void gemm( const platform::CPUDeviceContext& context, const bool transA, @@ -68,6 +89,15 @@ void gemm( lda, B, ldb, beta, C, ldc); } +template <> +void matmul( + const platform::CPUDeviceContext& context, + const framework::Tensor& matrix_a, bool trans_a, + const framework::Tensor& matrix_b, bool trans_b, float16 alpha, + framework::Tensor* matrix_out, float16 beta) { + PADDLE_THROW("float16 matmul not supported on CPU"); +} + template <> void matmul( const platform::CPUDeviceContext& context, @@ -126,6 +156,15 @@ void matmul( matrix_b.data(), beta, matrix_out->data()); } +template <> +void batched_gemm( + const platform::CPUDeviceContext& context, const CBLAS_TRANSPOSE transA, + const CBLAS_TRANSPOSE transB, const int M, const int N, const int K, + const float16 alpha, const float16* A, const float16* B, const float16 beta, + float16* C, const int batchCount, const int strideA, const int strideB) { + PADDLE_THROW("float16 batched_gemm not supported on CPU"); +} + #ifdef PADDLE_WITH_MKLML // Use cblas_{s,d}gemm_batched if available: Run with 1 group of size batchSize. 
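// (Editor's note: the float16 specializations added above intentionally
//  PADDLE_THROW rather than compute. Providing explicit "unsupported"
//  specializations lets templated callers instantiate for every dtype while
//  turning a missing CPU fp16 kernel into a clear runtime error instead of a
//  link failure; the real fp16 paths live in math_function.cu below.)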
template <> @@ -245,11 +284,13 @@ template struct SetConstant; template struct SetConstant; template struct SetConstant; -#define DEFINE_CPU_TRANS(RANK) \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ +#define DEFINE_CPU_TRANS(RANK) \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ template struct Transpose; DEFINE_CPU_TRANS(1); diff --git a/paddle/fluid/operators/math/math_function.cu b/paddle/fluid/operators/math/math_function.cu index f8d0349ac5cd96791bcb508230f5ff6d594c0e05..36655508be2ea9e748333171073c7dc258de52f2 100644 --- a/paddle/fluid/operators/math/math_function.cu +++ b/paddle/fluid/operators/math/math_function.cu @@ -16,11 +16,40 @@ limitations under the License. */ #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function_impl.h" +#include "paddle/fluid/platform/float16.h" namespace paddle { namespace operators { namespace math { +using float16 = paddle::platform::float16; + +template <> +void gemm( + const platform::CUDADeviceContext& context, const CBLAS_TRANSPOSE transA, + const CBLAS_TRANSPOSE transB, const int M, const int N, const int K, + const float16 alpha, const float16* A, const float16* B, const float16 beta, + float16* C) { + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; + cublasOperation_t cuTransA = + (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasOperation_t cuTransB = + (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + + const half h_alpha = static_cast(alpha); + const half h_beta = static_cast(beta); + const half* h_A = reinterpret_cast(A); + const half* h_B = reinterpret_cast(B); + half* h_C = reinterpret_cast(C); + + PADDLE_ENFORCE(platform::dynload::cublasHgemm( + context.cublas_handle(), cuTransB, cuTransA, N, M, K, &h_alpha, h_B, ldb, + h_A, lda, &h_beta, h_C, N)); +} + template <> void gemm( const platform::CUDADeviceContext& context, const CBLAS_TRANSPOSE transA, @@ -60,6 +89,28 @@ void gemm( lda, &beta, C, N)); } +template <> +void gemm( + const platform::CUDADeviceContext& context, const bool transA, + const bool transB, const int M, const int N, const int K, + const float16 alpha, const float16* A, const int lda, const float16* B, + const int ldb, const float16 beta, float16* C, const int ldc) { + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + cublasOperation_t cuTransA = transA == false ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasOperation_t cuTransB = transB == false ? 
CUBLAS_OP_N : CUBLAS_OP_T; + + const half h_alpha = static_cast(alpha); + const half h_beta = static_cast(beta); + const half* h_A = reinterpret_cast(A); + const half* h_B = reinterpret_cast(B); + half* h_C = reinterpret_cast(C); + + PADDLE_ENFORCE(platform::dynload::cublasHgemm( + context.cublas_handle(), cuTransB, cuTransA, N, M, K, &h_alpha, h_B, ldb, + h_A, lda, &h_beta, h_C, ldc)); +} + template <> void gemm( const platform::CUDADeviceContext& context, const bool transA, @@ -90,6 +141,35 @@ void gemm( lda, &beta, C, ldc)); } +template <> +void matmul( + const platform::CUDADeviceContext& context, + const framework::Tensor& matrix_a, bool trans_a, + const framework::Tensor& matrix_b, bool trans_b, float16 alpha, + framework::Tensor* matrix_out, float16 beta) { + auto dim_a = matrix_a.dims(); + auto dim_b = matrix_b.dims(); + auto dim_out = matrix_out->dims(); + PADDLE_ENFORCE(dim_a.size() == 2 && dim_b.size() == 2 && dim_out.size() == 2, + "The input and output of matmul be matrix"); + + PADDLE_ENFORCE(platform::is_gpu_place(matrix_a.place()) && + platform::is_gpu_place(matrix_b.place()) && + platform::is_gpu_place(matrix_out->place()), + "Matrix must all be in CUDAPlace"); + + int M = dim_out[0]; + int N = dim_out[1]; + int K = (trans_a == false) ? dim_a[1] : dim_a[0]; + + CBLAS_TRANSPOSE transA = (trans_a == false) ? CblasNoTrans : CblasTrans; + CBLAS_TRANSPOSE transB = (trans_b == false) ? CblasNoTrans : CblasTrans; + + gemm( + context, transA, transB, M, N, K, alpha, matrix_a.data(), + matrix_b.data(), beta, matrix_out->data()); +} + template <> void matmul( const platform::CUDADeviceContext& context, @@ -148,6 +228,34 @@ void matmul( matrix_b.data(), beta, matrix_out->data()); } +template <> +void batched_gemm( + const platform::CUDADeviceContext& context, const CBLAS_TRANSPOSE transA, + const CBLAS_TRANSPOSE transB, const int M, const int N, const int K, + const float16 alpha, const float16* A, const float16* B, const float16 beta, + float16* C, const int batchCount, const int strideA, const int strideB) { + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + int lda = (transA == CblasNoTrans) ? K : M; + int ldb = (transB == CblasNoTrans) ? N : K; + int ldc = N; + cublasOperation_t cuTransA = + (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasOperation_t cuTransB = + (transB == CblasNoTrans) ? 
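// (Editor's note on the argument order used by every cuBLAS call in this
//  file: cuBLAS expects column-major data, but these tensors are row-major.
//  A row-major C = op(A) * op(B) is computed as the column-major product
//  C^T = op(B)^T * op(A)^T, hence B is passed before A and M/N are swapped,
//  e.g. cublasHgemm(handle, cuTransB, cuTransA, N, M, K, ...). No data is
//  physically transposed.)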
CUBLAS_OP_N : CUBLAS_OP_T; + const int strideC = M * N; + + const half h_alpha = static_cast(alpha); + const half h_beta = static_cast(beta); + const half* h_A = reinterpret_cast(A); + const half* h_B = reinterpret_cast(B); + half* h_C = reinterpret_cast(C); + + PADDLE_ENFORCE(platform::dynload::cublasHgemmStridedBatched( + context.cublas_handle(), cuTransB, cuTransA, N, M, K, &h_alpha, h_B, ldb, + strideB, h_A, lda, strideA, &h_beta, h_C, ldc, strideC, batchCount)); +} + template <> void batched_gemm( const platform::CUDADeviceContext& context, const CBLAS_TRANSPOSE transA, diff --git a/paddle/fluid/operators/math/math_function_test.cu b/paddle/fluid/operators/math/math_function_test.cu index 207d6a87bce178e446253ae2d880b6dd743cfc83..442e62d563ebd40316d001914c93447c102cbf61 100644 --- a/paddle/fluid/operators/math/math_function_test.cu +++ b/paddle/fluid/operators/math/math_function_test.cu @@ -14,30 +14,41 @@ #include "gtest/gtest.h" #include "paddle/fluid/operators/math/math_function.h" -TEST(math_function, notrans_mul_trans) { - paddle::framework::Tensor input1; - paddle::framework::Tensor input1_gpu; - paddle::framework::Tensor input2_gpu; - paddle::framework::Tensor out_gpu; - paddle::framework::Tensor out; - - auto* cpu_place = new paddle::platform::CPUPlace(); - float* input1_ptr = input1.mutable_data({2, 3}, *cpu_place); +void fill_fp16_data(paddle::platform::float16* in_ptr, size_t size, + const std::vector& data) { + PADDLE_ENFORCE_EQ(size, data.size()); + for (size_t i = 0; i < data.size(); ++i) { + in_ptr[i] = paddle::platform::float16(data[i]); + } +} + +TEST(math_function, notrans_mul_trans_fp32) { + using namespace paddle::framework; + using namespace paddle::platform; + + Tensor input1; + Tensor input1_gpu; + Tensor input2_gpu; + Tensor out_gpu; + Tensor out; + + CPUPlace cpu_place; + CUDAPlace gpu_place(0); + CUDADeviceContext context(gpu_place); + + float* input1_ptr = input1.mutable_data({2, 3}, cpu_place); float arr[6] = {0, 1, 2, 3, 4, 5}; memcpy(input1_ptr, arr, 6 * sizeof(float)); - auto* gpu_place = new paddle::platform::CUDAPlace(0); - paddle::platform::CUDADeviceContext context(*gpu_place); - - paddle::framework::TensorCopy(input1, *gpu_place, context, &input1_gpu); - paddle::framework::TensorCopy(input1, *gpu_place, context, &input2_gpu); + TensorCopy(input1, gpu_place, context, &input1_gpu); + TensorCopy(input1, gpu_place, context, &input2_gpu); - out_gpu.mutable_data({2, 2}, *gpu_place); + out_gpu.mutable_data({2, 2}, gpu_place); - paddle::operators::math::matmul( + paddle::operators::math::matmul( context, input1_gpu, false, input2_gpu, true, 1, &out_gpu, 0); - paddle::framework::TensorCopy(out_gpu, *cpu_place, context, &out); + TensorCopy(out_gpu, cpu_place, context, &out); float* out_ptr = out.data(); context.Wait(); @@ -45,33 +56,71 @@ TEST(math_function, notrans_mul_trans) { EXPECT_EQ(out_ptr[1], 14); EXPECT_EQ(out_ptr[2], 14); EXPECT_EQ(out_ptr[3], 50); - delete gpu_place; } -TEST(math_function, trans_mul_notrans) { - paddle::framework::Tensor input1; - paddle::framework::Tensor input1_gpu; - paddle::framework::Tensor input2_gpu; - paddle::framework::Tensor out_gpu; - paddle::framework::Tensor out; +TEST(math_function, notrans_mul_trans_fp16) { + using namespace paddle::framework; + using namespace paddle::platform; + + Tensor input1; + Tensor input1_gpu; + Tensor input2_gpu; + Tensor out_gpu; + Tensor out; + + CPUPlace cpu_place; + CUDAPlace gpu_place(0); + CUDADeviceContext context(gpu_place); + + float16* input1_ptr = input1.mutable_data({2, 3}, 
cpu_place); + fill_fp16_data(input1_ptr, input1.numel(), {0, 1, 2, 3, 4, 5}); + + TensorCopy(input1, gpu_place, context, &input1_gpu); + TensorCopy(input1, gpu_place, context, &input2_gpu); + + out_gpu.mutable_data({2, 2}, gpu_place); + + paddle::operators::math::matmul( + context, input1_gpu, false, input2_gpu, true, float16(1), &out_gpu, + float16(0)); + + TensorCopy(out_gpu, cpu_place, context, &out); + + float16* out_ptr = out.data(); + context.Wait(); + EXPECT_EQ(static_cast(out_ptr[0]), 5); + EXPECT_EQ(static_cast(out_ptr[1]), 14); + EXPECT_EQ(static_cast(out_ptr[2]), 14); + EXPECT_EQ(static_cast(out_ptr[3]), 50); +} + +TEST(math_function, trans_mul_notrans_fp32) { + using namespace paddle::framework; + using namespace paddle::platform; + + Tensor input1; + Tensor input1_gpu; + Tensor input2_gpu; + Tensor out_gpu; + Tensor out; + + CPUPlace cpu_place; + CUDAPlace gpu_place(0); + CUDADeviceContext context(gpu_place); - auto* cpu_place = new paddle::platform::CPUPlace(); - float* input1_ptr = input1.mutable_data({2, 3}, *cpu_place); + float* input1_ptr = input1.mutable_data({2, 3}, cpu_place); float arr[6] = {0, 1, 2, 3, 4, 5}; memcpy(input1_ptr, arr, 6 * sizeof(float)); - auto* gpu_place = new paddle::platform::CUDAPlace(0); - paddle::platform::CUDADeviceContext context(*gpu_place); + TensorCopy(input1, gpu_place, context, &input1_gpu); + TensorCopy(input1, gpu_place, context, &input2_gpu); - paddle::framework::TensorCopy(input1, *gpu_place, context, &input1_gpu); - paddle::framework::TensorCopy(input1, *gpu_place, context, &input2_gpu); - - out_gpu.mutable_data({3, 3}, *gpu_place); + out_gpu.mutable_data({3, 3}, gpu_place); paddle::operators::math::matmul( context, input1_gpu, true, input2_gpu, false, 1, &out_gpu, 0); - paddle::framework::TensorCopy(out_gpu, *cpu_place, context, &out); + TensorCopy(out_gpu, cpu_place, context, &out); float* out_ptr = out.data(); context.Wait(); @@ -84,45 +133,88 @@ TEST(math_function, trans_mul_notrans) { EXPECT_EQ(out_ptr[6], 15); EXPECT_EQ(out_ptr[7], 22); EXPECT_EQ(out_ptr[8], 29); - delete gpu_place; } -TEST(math_function, gemm_notrans_cublas) { - paddle::framework::Tensor input1; - paddle::framework::Tensor input2; - paddle::framework::Tensor input3; - paddle::framework::Tensor input1_gpu; - paddle::framework::Tensor input2_gpu; - paddle::framework::Tensor input3_gpu; +TEST(math_function, trans_mul_notrans_fp16) { + using namespace paddle::framework; + using namespace paddle::platform; + + Tensor input1; + Tensor input1_gpu; + Tensor input2_gpu; + Tensor out_gpu; + Tensor out; + + CPUPlace cpu_place; + CUDAPlace gpu_place(0); + CUDADeviceContext context(gpu_place); + + float16* input1_ptr = input1.mutable_data({2, 3}, cpu_place); + fill_fp16_data(input1_ptr, input1.numel(), {0, 1, 2, 3, 4, 5}); + + TensorCopy(input1, gpu_place, context, &input1_gpu); + TensorCopy(input1, gpu_place, context, &input2_gpu); + + out_gpu.mutable_data({3, 3}, gpu_place); + + paddle::operators::math::matmul( + context, input1_gpu, true, input2_gpu, false, float16(1), &out_gpu, + float16(0)); + + TensorCopy(out_gpu, cpu_place, context, &out); + + float16* out_ptr = out.data(); + context.Wait(); + EXPECT_EQ(static_cast(out_ptr[0]), 9); + EXPECT_EQ(static_cast(out_ptr[1]), 12); + EXPECT_EQ(static_cast(out_ptr[2]), 15); + EXPECT_EQ(static_cast(out_ptr[3]), 12); + EXPECT_EQ(static_cast(out_ptr[4]), 17); + EXPECT_EQ(static_cast(out_ptr[5]), 22); + EXPECT_EQ(static_cast(out_ptr[6]), 15); + EXPECT_EQ(static_cast(out_ptr[7]), 22); + EXPECT_EQ(static_cast(out_ptr[8]), 29); +} + 
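Both the fp16 and fp32 matmul tests above exercise the cuBLAS wrappers, and every wrapper carries the same remark about Fortran order: cuBLAS is column-major while these tensors are row-major, so each call passes (cuTransB, cuTransA, N, M, K) with the operand pointers swapped. A self-contained sketch of why that operand swap works, with a naive column-major gemm standing in for cublasHgemm (plain C++, no CUDA or PaddlePaddle dependencies; all names here are hypothetical):

```cpp
#include <cassert>
#include <vector>

// Naive column-major GEMM, C = A * B, the convention cuBLAS uses:
// A is m x k with leading dimension lda, B is k x n (ldb), C is m x n (ldc).
void colmajor_gemm(int m, int n, int k, const float* A, int lda,
                   const float* B, int ldb, float* C, int ldc) {
  for (int j = 0; j < n; ++j) {
    for (int i = 0; i < m; ++i) {
      float sum = 0.f;
      for (int p = 0; p < k; ++p) sum += A[i + p * lda] * B[p + j * ldb];
      C[i + j * ldc] = sum;
    }
  }
}

int main() {
  const int M = 2, N = 3, K = 4;
  std::vector<float> A(M * K), B(K * N), C(M * N);  // all row-major
  for (int i = 0; i < M * K; ++i) A[i] = static_cast<float>(i);
  for (int i = 0; i < K * N; ++i) B[i] = static_cast<float>(i);

  // A row-major buffer viewed column-major is the transpose, so row-major
  // C = A * B is column-major C^T = B^T * A^T. Swapping the operands and
  // passing (N, M, K) with ldb = N, lda = K, ldc = N reproduces the exact
  // argument order of the cublasHgemm calls in this patch.
  colmajor_gemm(N, M, K, B.data(), N, A.data(), K, C.data(), N);

  // Compare against a direct row-major reference.
  for (int i = 0; i < M; ++i) {
    for (int j = 0; j < N; ++j) {
      float ref = 0.f;
      for (int p = 0; p < K; ++p) ref += A[i * K + p] * B[p * N + j];
      assert(C[i * N + j] == ref);
    }
  }
  return 0;
}
```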
+TEST(math_function, gemm_notrans_cublas_fp32) { + using namespace paddle::framework; + using namespace paddle::platform; + + Tensor input1; + Tensor input2; + Tensor input3; + Tensor input1_gpu; + Tensor input2_gpu; + Tensor input3_gpu; + + CPUPlace cpu_place; + CUDAPlace gpu_place(0); + CUDADeviceContext context(gpu_place); int m = 2; int n = 3; int k = 3; - auto* cpu_place = new paddle::platform::CPUPlace(); - float* input1_ptr = input1.mutable_data({2, 3}, *cpu_place); + float* input1_ptr = input1.mutable_data({2, 3}, cpu_place); float arr1[6] = {0, 1, 2, 3, 4, 5}; memcpy(input1_ptr, arr1, 6 * sizeof(float)); - float* input2_ptr = input2.mutable_data({3, 4}, *cpu_place); + float* input2_ptr = input2.mutable_data({3, 4}, cpu_place); float arr2[12] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}; memcpy(input2_ptr, arr2, 12 * sizeof(float)); - float* input3_ptr = input3.mutable_data({2, 4}, *cpu_place); + float* input3_ptr = input3.mutable_data({2, 4}, cpu_place); float arr3[8] = {0, 1, 2, 3, 4, 5, 6, 7}; memcpy(input3_ptr, arr3, 8 * sizeof(float)); - auto* gpu_place = new paddle::platform::CUDAPlace(0); - paddle::platform::CUDADeviceContext context(*gpu_place); - - paddle::framework::TensorCopy(input1, *gpu_place, context, &input1_gpu); - paddle::framework::TensorCopy(input2, *gpu_place, context, &input2_gpu); - paddle::framework::TensorCopy(input3, *gpu_place, context, &input3_gpu); + TensorCopy(input1, gpu_place, context, &input1_gpu); + TensorCopy(input2, gpu_place, context, &input2_gpu); + TensorCopy(input3, gpu_place, context, &input3_gpu); float* a = input1_gpu.data(); float* b = input2_gpu.data(); - float* c = input3_gpu.mutable_data(*gpu_place); + float* c = input3_gpu.mutable_data(gpu_place); paddle::operators::math::gemm( context, false, false, m, n, k, 1, a, 3, b + 1, 4, 1, c + 1, 4); - paddle::framework::TensorCopy(input3_gpu, *cpu_place, context, &input3); + TensorCopy(input3_gpu, cpu_place, context, &input3); // numpy code: // a = np.arange(6).reshape(2, 3) @@ -139,47 +231,105 @@ TEST(math_function, gemm_notrans_cublas) { EXPECT_EQ(input3_ptr[5], 73); EXPECT_EQ(input3_ptr[6], 86); EXPECT_EQ(input3_ptr[7], 99); - delete gpu_place; } -TEST(math_function, gemm_trans_cublas) { - paddle::framework::Tensor input1; - paddle::framework::Tensor input2; - paddle::framework::Tensor input3; - paddle::framework::Tensor input1_gpu; - paddle::framework::Tensor input2_gpu; - paddle::framework::Tensor input3_gpu; +TEST(math_function, gemm_notrans_cublas_fp16) { + using namespace paddle::framework; + using namespace paddle::platform; + + Tensor input1; + Tensor input2; + Tensor input3; + Tensor input1_gpu; + Tensor input2_gpu; + Tensor input3_gpu; + + CPUPlace cpu_place; + CUDAPlace gpu_place(0); + CUDADeviceContext context(gpu_place); + + int m = 2; + int n = 3; + int k = 3; + float16* input1_ptr = input1.mutable_data({2, 3}, cpu_place); + fill_fp16_data(input1_ptr, input1.numel(), {0, 1, 2, 3, 4, 5}); + float16* input2_ptr = input2.mutable_data({3, 4}, cpu_place); + fill_fp16_data(input2_ptr, input2.numel(), + {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}); + float16* input3_ptr = input3.mutable_data({2, 4}, cpu_place); + fill_fp16_data(input3_ptr, input3.numel(), {0, 1, 2, 3, 4, 5, 6, 7}); + + TensorCopy(input1, gpu_place, context, &input1_gpu); + TensorCopy(input2, gpu_place, context, &input2_gpu); + TensorCopy(input3, gpu_place, context, &input3_gpu); + float16* a = input1_gpu.data(); + float16* b = input2_gpu.data(); + float16* c = input3_gpu.mutable_data(gpu_place); + + 
paddle::operators::math::gemm( + context, false, false, m, n, k, float16(1), a, 3, b + 1, 4, float16(1), + c + 1, 4); + + TensorCopy(input3_gpu, cpu_place, context, &input3); + + // numpy code: + // a = np.arange(6).reshape(2, 3) + // b = np.arange(12).reshape(3, 4)[:, 1:] + // c = np.arange(8).reshape(2, 4)[:, 1:] + // out = np.arange(8).reshape(2, 4) + // out[:, 1:] = np.dot(a, b) + c + context.Wait(); + EXPECT_EQ(static_cast(input3_ptr[0]), 0); + EXPECT_EQ(static_cast(input3_ptr[1]), 24); + EXPECT_EQ(static_cast(input3_ptr[2]), 28); + EXPECT_EQ(static_cast(input3_ptr[3]), 32); + EXPECT_EQ(static_cast(input3_ptr[4]), 4); + EXPECT_EQ(static_cast(input3_ptr[5]), 73); + EXPECT_EQ(static_cast(input3_ptr[6]), 86); + EXPECT_EQ(static_cast(input3_ptr[7]), 99); +} + +TEST(math_function, gemm_trans_cublas_fp32) { + using namespace paddle::framework; + using namespace paddle::platform; + + Tensor input1; + Tensor input2; + Tensor input3; + Tensor input1_gpu; + Tensor input2_gpu; + Tensor input3_gpu; + + CPUPlace cpu_place; + CUDAPlace gpu_place(0); + CUDADeviceContext context(gpu_place); int m = 2; int n = 3; int k = 3; - auto* cpu_place = new paddle::platform::CPUPlace(); - float* input1_ptr = input1.mutable_data({2, 3}, *cpu_place); + float* input1_ptr = input1.mutable_data({2, 3}, cpu_place); float arr1[6] = {0, 1, 2, 3, 4, 5}; memcpy(input1_ptr, arr1, 6 * sizeof(float)); - float* input2_ptr = input2.mutable_data({4, 3}, *cpu_place); + float* input2_ptr = input2.mutable_data({4, 3}, cpu_place); float arr2[12] = {0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11}; memcpy(input2_ptr, arr2, 12 * sizeof(float)); - float* input3_ptr = input3.mutable_data({2, 4}, *cpu_place); + float* input3_ptr = input3.mutable_data({2, 4}, cpu_place); float arr3[8] = {0, 1, 2, 3, 4, 5, 6, 7}; memcpy(input3_ptr, arr3, 8 * sizeof(float)); - auto* gpu_place = new paddle::platform::CUDAPlace(0); - paddle::platform::CUDADeviceContext context(*gpu_place); - - paddle::framework::TensorCopy(input1, *gpu_place, context, &input1_gpu); - paddle::framework::TensorCopy(input2, *gpu_place, context, &input2_gpu); - paddle::framework::TensorCopy(input3, *gpu_place, context, &input3_gpu); + TensorCopy(input1, gpu_place, context, &input1_gpu); + TensorCopy(input2, gpu_place, context, &input2_gpu); + TensorCopy(input3, gpu_place, context, &input3_gpu); float* a = input1_gpu.data(); float* b = input2_gpu.data(); - float* c = input3_gpu.mutable_data(*gpu_place); + float* c = input3_gpu.mutable_data(gpu_place); paddle::operators::math::gemm( context, false, true, m, n, k, 1, a, 3, b + 3, 3, 1, c + 1, 4); - paddle::framework::TensorCopy(input3_gpu, *cpu_place, context, &input3); - context.Wait(); + TensorCopy(input3_gpu, cpu_place, context, &input3); + context.Wait(); EXPECT_EQ(input3_ptr[0], 0); EXPECT_EQ(input3_ptr[1], 24); EXPECT_EQ(input3_ptr[2], 28); @@ -188,27 +338,81 @@ TEST(math_function, gemm_trans_cublas) { EXPECT_EQ(input3_ptr[5], 73); EXPECT_EQ(input3_ptr[6], 86); EXPECT_EQ(input3_ptr[7], 99); - delete gpu_place; +} + +TEST(math_function, gemm_trans_cublas_fp16) { + using namespace paddle::framework; + using namespace paddle::platform; + + Tensor input1; + Tensor input2; + Tensor input3; + Tensor input1_gpu; + Tensor input2_gpu; + Tensor input3_gpu; + + CPUPlace cpu_place; + CUDAPlace gpu_place(0); + CUDADeviceContext context(gpu_place); + + int m = 2; + int n = 3; + int k = 3; + float16* input1_ptr = input1.mutable_data({2, 3}, cpu_place); + fill_fp16_data(input1_ptr, input1.numel(), {0, 1, 2, 3, 4, 5}); + float16* input2_ptr = 
input2.mutable_data({4, 3}, cpu_place); + fill_fp16_data(input2_ptr, input2.numel(), + {0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11}); + float16* input3_ptr = input3.mutable_data({2, 4}, cpu_place); + fill_fp16_data(input3_ptr, input3.numel(), {0, 1, 2, 3, 4, 5, 6, 7}); + + TensorCopy(input1, gpu_place, context, &input1_gpu); + TensorCopy(input2, gpu_place, context, &input2_gpu); + TensorCopy(input3, gpu_place, context, &input3_gpu); + float16* a = input1_gpu.data(); + float16* b = input2_gpu.data(); + float16* c = input3_gpu.mutable_data(gpu_place); + + paddle::operators::math::gemm( + context, false, true, m, n, k, float16(1), a, 3, b + 3, 3, float16(1), + c + 1, 4); + + TensorCopy(input3_gpu, cpu_place, context, &input3); + + context.Wait(); + EXPECT_EQ(static_cast(input3_ptr[0]), 0); + EXPECT_EQ(static_cast(input3_ptr[1]), 24); + EXPECT_EQ(static_cast(input3_ptr[2]), 28); + EXPECT_EQ(static_cast(input3_ptr[3]), 32); + EXPECT_EQ(static_cast(input3_ptr[4]), 4); + EXPECT_EQ(static_cast(input3_ptr[5]), 73); + EXPECT_EQ(static_cast(input3_ptr[6]), 86); + EXPECT_EQ(static_cast(input3_ptr[7]), 99); } template void GemvTest(int m, int n, bool trans) { - paddle::framework::Tensor mat_a; - paddle::framework::Tensor vec_b; - paddle::framework::Tensor vec_c; - auto* cpu_place = new paddle::platform::CPUPlace(); - - T* data_a = mat_a.mutable_data({m, n}, *cpu_place); - T* data_b = vec_b.mutable_data({trans ? m : n}, *cpu_place); - T* data_c = vec_c.mutable_data({trans ? n : m}, *cpu_place); - - auto* gpu_place = new paddle::platform::CUDAPlace(0); - paddle::framework::Tensor g_mat_a; - paddle::framework::Tensor g_vec_b; - paddle::framework::Tensor g_vec_c; - T* g_data_a = g_mat_a.mutable_data(mat_a.dims(), *gpu_place); - T* g_data_b = g_vec_b.mutable_data(vec_b.dims(), *gpu_place); - T* g_data_c = g_vec_c.mutable_data(vec_c.dims(), *gpu_place); + using namespace paddle::framework; + using namespace paddle::platform; + + Tensor mat_a; + Tensor vec_b; + Tensor vec_c; + + CPUPlace cpu_place; + CUDAPlace gpu_place(0); + CUDADeviceContext context(gpu_place); + + T* data_a = mat_a.mutable_data({m, n}, cpu_place); + T* data_b = vec_b.mutable_data({trans ? m : n}, cpu_place); + T* data_c = vec_c.mutable_data({trans ? 
n : m}, cpu_place); + + Tensor g_mat_a; + Tensor g_vec_b; + Tensor g_vec_c; + T* g_data_a = g_mat_a.mutable_data(mat_a.dims(), gpu_place); + T* g_data_b = g_vec_b.mutable_data(vec_b.dims(), gpu_place); + T* g_data_c = g_vec_c.mutable_data(vec_c.dims(), gpu_place); for (int i = 0; i < mat_a.numel(); ++i) { data_a[i] = static_cast(i); @@ -217,16 +421,14 @@ void GemvTest(int m, int n, bool trans) { data_b[i] = static_cast(i); } - paddle::platform::CUDADeviceContext context(*gpu_place); - paddle::framework::TensorCopy(mat_a, *gpu_place, context, &g_mat_a); - paddle::framework::TensorCopy(vec_b, *gpu_place, context, &g_vec_b); + TensorCopy(mat_a, gpu_place, context, &g_mat_a); + TensorCopy(vec_b, gpu_place, context, &g_vec_b); - paddle::operators::math::gemv( + paddle::operators::math::gemv( context, trans, static_cast(m), static_cast(n), 1., g_data_a, g_data_b, 0., g_data_c); - paddle::framework::TensorCopy(g_vec_c, paddle::platform::CPUPlace(), context, - &vec_c); + TensorCopy(g_vec_c, cpu_place, context, &vec_c); if (!trans) { for (int i = 0; i < m; ++i) { diff --git a/paddle/fluid/operators/math/sequence2batch.cc b/paddle/fluid/operators/math/sequence2batch.cc index 72bf2ab17016157f22bcf180090660346ed4f60d..8899abff360ea867872d3433722cdb37ef358500 100644 --- a/paddle/fluid/operators/math/sequence2batch.cc +++ b/paddle/fluid/operators/math/sequence2batch.cc @@ -13,7 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/math/sequence2batch.h" -#include "paddle/fluid/operators/math/math_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/multiclass_nms_op.cc b/paddle/fluid/operators/multiclass_nms_op.cc index c4e70cde6f8c6bdf1f28b010b0b90091772fdffb..0f80f752c95e97ed4d6d299788734de9d29713db 100644 --- a/paddle/fluid/operators/multiclass_nms_op.cc +++ b/paddle/fluid/operators/multiclass_nms_op.cc @@ -324,7 +324,7 @@ class MultiClassNMSOpMaker : public framework::OpProtoAndCheckerMaker { " Please note, M is equal to the 1st dimension of BBoxes. "); AddAttr( "background_label", - "(int64_t, defalut: 0) " + "(int, defalut: 0) " "The index of background label, the background label will be ignored. " "If set to -1, then all categories will be considered.") .SetDefault(0); diff --git a/paddle/fluid/operators/pool_mkldnn_op.cc b/paddle/fluid/operators/pool_mkldnn_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..c88578570c1acdecaa97dd8b12a702778fef2b7e --- /dev/null +++ b/paddle/fluid/operators/pool_mkldnn_op.cc @@ -0,0 +1,217 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/pool_op.h" +#include "paddle/fluid/platform/mkldnn_helper.h" + +namespace paddle { +namespace operators { + +template +class PoolMKLDNNOpKernel : public paddle::framework::OpKernel { + public: + void Compute(const paddle::framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()), + "It must use CPUPlace."); + + auto& dev_ctx = + ctx.template device_context(); + const auto& mkldnn_engine = dev_ctx.GetEngine(); + + const Tensor* input = ctx.Input("X"); + Tensor* output = ctx.Output("Out"); + + // Get an unique name from "argument" name of "Out" variable + // This name will be used as key when saving info into device context + const std::string key = ctx.op().Output("Out"); + const std::string key_pool_pd = key + "@pool_pd"; + const std::string key_pool_workspace_memory = + key + "@pool_workspace_memory"; + + std::string pooling_type = ctx.Attr("pooling_type"); + std::vector ksize = ctx.Attr>("ksize"); + std::vector strides = ctx.Attr>("strides"); + std::vector paddings = ctx.Attr>("paddings"); + if (ctx.Attr("global_pooling")) { + for (size_t i = 0; i < ksize.size(); ++i) { + paddings[i] = 0; + ksize[i] = static_cast(input->dims()[i + 2]); + } + } + + // Only 2D pooling is supported now + PADDLE_ENFORCE(ksize.size() == 2, "ksize must be 2D, i.e. 2D pooling"); + PADDLE_ENFORCE(pooling_type == "max" || pooling_type == "avg", + "pooling_type must be 'max' or 'avg'"); + PADDLE_ENFORCE(input->dims().size() == 4, + "Input dim must be with 4, i.e. NCHW"); + + const T* input_data = input->data(); + T* output_data = output->mutable_data(ctx.GetPlace()); + + std::vector src_tz = paddle::framework::vectorize2int(input->dims()); + std::vector dst_tz = paddle::framework::vectorize2int(output->dims()); + + // TODO(pzelazko-intel): support more formats + auto src_md = platform::MKLDNNMemDesc(src_tz, mkldnn::memory::f32, + mkldnn::memory::format::nchw); + auto dst_md = platform::MKLDNNMemDesc(dst_tz, mkldnn::memory::f32, + mkldnn::memory::format::nchw); + + std::shared_ptr pool_pd = + CreatePrimitiveDesc(src_md, dst_md, strides, paddings, ksize, + pooling_type, mkldnn_engine); + + // save pool_pd into global device context to be referred in backward path + dev_ctx.SetBlob(key_pool_pd, pool_pd); + + std::shared_ptr workspace_memory = + CreateWorkspaceMemory(pool_pd, pooling_type, mkldnn_engine); + + // save pool_workspace_memory to be referred in backward path + dev_ctx.SetBlob(key_pool_workspace_memory, workspace_memory); + + auto src_memory = + mkldnn::memory({src_md, mkldnn_engine}, (void*)input_data); + auto dst_memory = + mkldnn::memory({dst_md, mkldnn_engine}, (void*)output_data); + + auto pool_prim = mkldnn::pooling_forward(*pool_pd, src_memory, dst_memory, + *workspace_memory); + + // push primitive to stream and wait until it's executed + std::vector pipeline{pool_prim}; + mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); + } + + private: + std::unique_ptr CreatePrimitiveDesc( + const mkldnn::memory::desc& src, const mkldnn::memory::desc& dst, + const std::vector& stride, const std::vector& padding, + const std::vector& kernel, const std::string& pooling_type, + const mkldnn::engine& engine) const { + auto pool_desc = mkldnn::pooling_forward::desc( + mkldnn::prop_kind::forward, + pooling_type == "max" ? 
mkldnn::algorithm::pooling_max + : mkldnn::algorithm::pooling_avg, + src, dst, stride, kernel, padding, padding, mkldnn::padding_kind::zero); + + auto p_pool_pd = + new mkldnn::pooling_forward::primitive_desc(pool_desc, engine); + return std::unique_ptr(p_pool_pd); + } + + std::unique_ptr CreateWorkspaceMemory( + std::shared_ptr pool_pd, + const std::string& pooling_type, const mkldnn::engine& engine) const { + mkldnn::memory::primitive_desc workspace_md = + pooling_type == "max" + ? pool_pd->workspace_primitive_desc() + : mkldnn::memory::primitive_desc( + {{}, mkldnn::memory::f32, mkldnn::memory::format::nchw}, + engine); + + auto p_workspace_memory = new mkldnn::memory(workspace_md); + return std::unique_ptr(p_workspace_memory); + } +}; + +template +class PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel { + public: + void Compute(const paddle::framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()), + "It must use CPUPlace."); + + const Tensor* in_x = ctx.Input("X"); + const Tensor* out_grad = ctx.Input(framework::GradVarName("Out")); + Tensor* in_x_grad = ctx.Output(framework::GradVarName("X")); + + // Get an unique name from "argument" name of "Out" variable + // This name will be used as key when referring info from device context + const std::string key = ctx.op().Input("Out"); + const std::string key_pool_pd = key + "@pool_pd"; + const std::string key_pool_workspace_memory = + key + "@pool_workspace_memory"; + + std::string pooling_type = ctx.Attr("pooling_type"); + std::vector ksize = ctx.Attr>("ksize"); + std::vector strides = ctx.Attr>("strides"); + std::vector paddings = ctx.Attr>("paddings"); + + if (ctx.Attr("global_pooling")) { + for (size_t i = 0; i < ksize.size(); ++i) { + paddings[i] = 0; + ksize[i] = static_cast(in_x->dims()[i + 2]); + } + } + + auto& dev_ctx = + ctx.template device_context(); + const mkldnn::engine& mkldnn_engine = dev_ctx.GetEngine(); + + const T* out_grad_data = out_grad->data(); + T* in_x_grad_data = in_x_grad->mutable_data(ctx.GetPlace()); + + std::vector diff_src_tz = + paddle::framework::vectorize2int(in_x_grad->dims()); + std::vector diff_dst_tz = + paddle::framework::vectorize2int(out_grad->dims()); + + auto diff_src_md = platform::MKLDNNMemDesc(diff_src_tz, mkldnn::memory::f32, + mkldnn::memory::format::nchw); + auto diff_dst_md = platform::MKLDNNMemDesc(diff_dst_tz, mkldnn::memory::f32, + mkldnn::memory::format::nchw); + + // Retrieve pool_pd/pool_workspace_memory from device context + auto pool_pd = + std::static_pointer_cast( + dev_ctx.GetBlob(key_pool_pd)); + PADDLE_ENFORCE(pool_pd != nullptr, + "Fail to find pool_pd in device context"); + + auto workspace_memory = std::static_pointer_cast( + dev_ctx.GetBlob(key_pool_workspace_memory)); + PADDLE_ENFORCE(workspace_memory != nullptr, + "Fail to find workspace_memory in device context"); + + auto pool_bwd_desc = mkldnn::pooling_backward::desc( + pooling_type == "max" ? 
mkldnn::algorithm::pooling_max + : mkldnn::algorithm::pooling_avg, + diff_src_md, diff_dst_md, strides, ksize, paddings, paddings, + mkldnn::padding_kind::zero); + auto pool_bwd_pd = mkldnn::pooling_backward::primitive_desc( + pool_bwd_desc, mkldnn_engine, *pool_pd); + + auto diff_src_memory = + mkldnn::memory({diff_src_md, mkldnn_engine}, (void*)in_x_grad_data); + auto diff_dst_memory = + mkldnn::memory({diff_dst_md, mkldnn_engine}, (void*)out_grad_data); + + auto bwd_prim = mkldnn::pooling_backward( + pool_bwd_pd, diff_dst_memory, *workspace_memory, diff_src_memory); + + // push primitive to stream and wait until it's executed + std::vector pipeline{bwd_prim}; + mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait(); + } // Compute() +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OP_KERNEL(pool2d, MKLDNN, ::paddle::platform::CPUPlace, + paddle::operators::PoolMKLDNNOpKernel); +REGISTER_OP_KERNEL(pool2d_grad, MKLDNN, ::paddle::platform::CPUPlace, + paddle::operators::PoolMKLDNNGradOpKernel); diff --git a/paddle/fluid/operators/pool_op.cc b/paddle/fluid/operators/pool_op.cc index a87a3511ee46dd657c27da26feb43ba43a08f25d..d78da10016a0e2b1d9a0ca9f3dfe4e8009bbe61d 100644 --- a/paddle/fluid/operators/pool_op.cc +++ b/paddle/fluid/operators/pool_op.cc @@ -13,12 +13,25 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/pool_op.h" +#ifdef PADDLE_WITH_CUDA +#include "paddle/fluid/platform/cudnn_helper.h" +#endif +#ifdef PADDLE_WITH_MKLDNN +#include "paddle/fluid/platform/mkldnn_helper.h" +#endif namespace paddle { namespace operators { -int PoolOutputSize(int input_size, int filter_size, int padding, int stride) { - int output_size = (input_size - filter_size + 2 * padding) / stride + 1; +int PoolOutputSize(int input_size, int filter_size, int padding, int stride, + bool ceil_mode) { + int output_size; + if (!ceil_mode) { + output_size = (input_size - filter_size + 2 * padding) / stride + 1; + } else { + output_size = + (input_size - filter_size + 2 * padding + stride - 1) / stride + 1; + } PADDLE_ENFORCE(output_size > 0, "Due to the settings of padding(%d), filter_size(%d) and " "stride(%d), the output size is less than 0, please check " @@ -38,6 +51,7 @@ void PoolOp::InferShape(framework::InferShapeContext *ctx) const { std::vector ksize = ctx->Attrs().Get>("ksize"); std::vector strides = ctx->Attrs().Get>("strides"); std::vector paddings = ctx->Attrs().Get>("paddings"); + bool ceil_mode = ctx->Attrs().Get("ceil_mode"); PADDLE_ENFORCE(in_x_dims.size() == 4 || in_x_dims.size() == 5, "Pooling intput should be 4-D or 5-D tensor."); @@ -59,8 +73,8 @@ void PoolOp::InferShape(framework::InferShapeContext *ctx) const { std::vector output_shape({in_x_dims[0], in_x_dims[1]}); for (size_t i = 0; i < ksize.size(); ++i) { - output_shape.push_back( - PoolOutputSize(in_x_dims[i + 2], ksize[i], paddings[i], strides[i])); + output_shape.push_back(PoolOutputSize(in_x_dims[i + 2], ksize[i], + paddings[i], strides[i], ceil_mode)); } ctx->SetOutputDim("Out", framework::make_ddim(output_shape)); ctx->ShareLoD("X", "Out"); @@ -68,20 +82,18 @@ void PoolOp::InferShape(framework::InferShapeContext *ctx) const { framework::OpKernelType PoolOp::GetExpectedKernelType( const framework::ExecutionContext &ctx) const { - bool use_cudnn = ctx.Attr("use_cudnn"); - use_cudnn &= platform::is_gpu_place(ctx.GetPlace()); + framework::LibraryType library_{framework::LibraryType::kPlain}; #ifdef PADDLE_WITH_CUDA - if 
(platform::is_gpu_place(ctx.GetPlace())) { - auto &dev_ctx = ctx.template device_context(); - use_cudnn &= dev_ctx.cudnn_handle() != nullptr; + if (platform::CanCUDNNBeUsed(ctx)) { + library_ = framework::LibraryType::kCUDNN; } #endif - framework::LibraryType library_; - if (use_cudnn) { - library_ = framework::LibraryType::kCUDNN; - } else { - library_ = framework::LibraryType::kPlain; +#ifdef PADDLE_WITH_MKLDNN + if (library_ == framework::LibraryType::kPlain && + platform::CanMKLDNNBeUsed(ctx)) { + library_ = framework::LibraryType::kMKLDNN; } +#endif std::string data_format = ctx.Attr("data_format"); framework::DataLayout layout_ = framework::StringToDataLayout(data_format); @@ -99,20 +111,18 @@ void PoolOpGrad::InferShape(framework::InferShapeContext *ctx) const { framework::OpKernelType PoolOpGrad::GetExpectedKernelType( const framework::ExecutionContext &ctx) const { - bool use_cudnn = ctx.Attr("use_cudnn"); - use_cudnn &= platform::is_gpu_place(ctx.GetPlace()); + framework::LibraryType library_{framework::LibraryType::kPlain}; #ifdef PADDLE_WITH_CUDA - if (platform::is_gpu_place(ctx.GetPlace())) { - auto &dev_ctx = ctx.template device_context(); - use_cudnn &= dev_ctx.cudnn_handle() != nullptr; + if (platform::CanCUDNNBeUsed(ctx)) { + library_ = framework::LibraryType::kCUDNN; } #endif - framework::LibraryType library_; - if (use_cudnn) { - library_ = framework::LibraryType::kCUDNN; - } else { - library_ = framework::LibraryType::kPlain; +#ifdef PADDLE_WITH_MKLDNN + if (library_ == framework::LibraryType::kPlain && + platform::CanMKLDNNBeUsed(ctx)) { + library_ = framework::LibraryType::kMKLDNN; } +#endif std::string data_format = ctx.Attr("data_format"); framework::DataLayout layout_ = framework::StringToDataLayout(data_format); @@ -167,6 +177,15 @@ Pool2dOpMaker::Pool2dOpMaker(OpProto *proto, OpAttrChecker *op_checker) "use_cudnn", "(bool, default false) Only used in cudnn kernel, need install cudnn") .SetDefault(false); + AddAttr( + "ceil_mode", + "(bool, default false) Wether to use the ceil function to calculate " + "output height and width. False is the default. If it is set to False, " + "the floor function will be used.") + .SetDefault(false); + AddAttr("use_mkldnn", + "(bool, default false) Only used in mkldnn kernel") + .SetDefault(false); AddAttr( "data_format", "(string, default NCHW) Only used in " @@ -187,16 +206,21 @@ Parameters(ksize, strides, paddings) are two elements. These two elements represent height and width, respectively. The input(X) size and output(Out) size may be different. -Example: +Example: Input: X shape: $(N, C, H_{in}, W_{in})$ Output: Out shape: $(N, C, H_{out}, W_{out})$ - Where - $$ + For ceil_mode = false: + $$ H_{out} = \frac{(H_{in} - ksize[0] + 2 * paddings[0])}{strides[0]} + 1 \\ W_{out} = \frac{(W_{in} - ksize[1] + 2 * paddings[1])}{strides[1]} + 1 $$ + For ceil_mode = true: + $$ + H_{out} = \frac{(H_{in} - ksize[0] + 2 * paddings[0] + strides[0] - 1)}{strides[0]} + 1 \\ + W_{out} = \frac{(W_{in} - ksize[1] + 2 * paddings[1] + strides[1] - 1)}{strides[1]} + 1 + $$ )DOC"); } @@ -251,6 +275,15 @@ Pool3dOpMaker::Pool3dOpMaker(OpProto *proto, OpAttrChecker *op_checker) "use_cudnn", "(bool, default false) Only used in cudnn kernel, need install cudnn") .SetDefault(false); + AddAttr( + "ceil_mode", + "(bool, default false) Wether to use the ceil function to calculate " + "output height and width. False is the default. 
If it is set to False, " + "the floor function will be used.") + .SetDefault(false); + AddAttr("use_mkldnn", + "(bool, default false) Only used in mkldnn kernel") + .SetDefault(false); AddAttr( "data_format", "(string, default NCHW) Only used in " @@ -267,8 +300,8 @@ The pooling3d operation calculates the output based on the input, pooling_type, ksize, strides, and paddings parameters. Input(X) and output(Out) are in NCDHW format, where N is batch size, C is the number of channels, and D, H and W are the depth, height and -width of the feature, respectively. Parameters(ksize, strides, paddings) -are three elements. These three elements represent depth, height and +width of the feature, respectively. Parameters(ksize, strides, paddings) +are three elements. These three elements represent depth, height and width, respectively. The input(X) size and output(Out) size may be different. Example: @@ -276,12 +309,18 @@ Example: X shape: $(N, C, D_{in}, H_{in}, W_{in})$ Output: Out shape: $(N, C, D_{out}, H_{out}, W_{out})$ - Where + For ceil_mode = false: $$ D_{out} = \frac{(D_{in} - ksize[0] + 2 * paddings[0])}{strides[0]} + 1 \\ H_{out} = \frac{(H_{in} - ksize[1] + 2 * paddings[1])}{strides[1]} + 1 \\ W_{out} = \frac{(W_{in} - ksize[2] + 2 * paddings[2])}{strides[2]} + 1 $$ + For ceil_mode = true: + $$ + D_{out} = \frac{(D_{in} - ksize[0] + 2 * paddings[0] + strides[0] -1)}{strides[0]} + 1 \\ + H_{out} = \frac{(H_{in} - ksize[1] + 2 * paddings[1] + strides[1] -1)}{strides[1]} + 1 \\ + W_{out} = \frac{(W_{in} - ksize[2] + 2 * paddings[2] + strides[2] -1)}{strides[2]} + 1 + $$ )DOC"); } diff --git a/paddle/fluid/operators/read_op.cc b/paddle/fluid/operators/read_op.cc index 62beab82d4f2b0b795d5d32f50352172de6870cc..2a5605e0d378a184ae132e657b2872279784855d 100644 --- a/paddle/fluid/operators/read_op.cc +++ b/paddle/fluid/operators/read_op.cc @@ -60,15 +60,16 @@ class ReadOp : public framework::OperatorBase { const platform::Place& dev_place) const override { framework::ReaderHolder* reader = scope.FindVar(Input("Reader"))->GetMutable(); - if (!reader->HasNext()) { + std::vector out_arg_names = Outputs("Out"); + std::vector ins; + reader->ReadNext(&ins); + if (ins.empty()) { reader->ReInit(); + reader->ReadNext(&ins); PADDLE_ENFORCE( - reader->HasNext(), + !ins.empty(), "Reader can not read the next data even it has been re-initialized."); } - std::vector out_arg_names = Outputs("Out"); - std::vector ins; - reader->ReadNext(&ins); PADDLE_ENFORCE_EQ(ins.size(), out_arg_names.size()); for (size_t i = 0; i < ins.size(); ++i) { auto* out = diff --git a/paddle/fluid/operators/reader/CMakeLists.txt b/paddle/fluid/operators/reader/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..335c5b26a864381bf87a2824b78f521cdce063e4 --- /dev/null +++ b/paddle/fluid/operators/reader/CMakeLists.txt @@ -0,0 +1,6 @@ +cc_library(reader_op_registry SRCS reader_op_registry.cc DEPS operator op_registry reader) +op_library(create_random_data_generator_op SRCS create_random_data_generator_op.cc DEPS reader_op_registry) +op_library(create_shuffle_reader_op SRCS create_shuffle_reader_op.cc DEPS reader_op_registry) +op_library(create_batch_reader_op SRCS create_batch_reader_op.cc DEPS reader_op_registry) +op_library(create_double_buffer_reader_op SRCS create_double_buffer_reader_op.cc DEPS reader_op_registry) +set(READER_LIBRARY create_random_data_generator_op create_shuffle_reader_op create_batch_reader_op create_double_buffer_reader_op PARENT_SCOPE) diff --git 
a/paddle/fluid/operators/reader/create_batch_reader_op.cc b/paddle/fluid/operators/reader/create_batch_reader_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..277f2856c07b3fec2113486539aec1d9139fae92
--- /dev/null
+++ b/paddle/fluid/operators/reader/create_batch_reader_op.cc
@@ -0,0 +1,137 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/reader/reader_op_registry.h"
+
+namespace paddle {
+namespace operators {
+namespace reader {
+
+class BatchReader : public framework::DecoratedReader {
+ public:
+  BatchReader(ReaderBase* reader, int batch_size)
+      : DecoratedReader(reader), batch_size_(batch_size) {
+    buffer_.reserve(batch_size_);
+  }
+
+  void ReadNext(std::vector<framework::LoDTensor>* out) override;
+
+ private:
+  int batch_size_;
+  std::vector<std::vector<framework::LoDTensor>> buffer_;
+};
+
+class CreateBatchReaderOp : public framework::OperatorBase {
+ public:
+  using framework::OperatorBase::OperatorBase;
+
+ private:
+  void RunImpl(const framework::Scope& scope,
+               const platform::Place& dev_place) const override {
+    const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader"))
+                                        ->Get<framework::ReaderHolder>();
+    auto* out = scope.FindVar(Output("Out"))
+                    ->template GetMutable<framework::ReaderHolder>();
+    out->Reset(
+        new BatchReader(underlying_reader.Get(), Attr<int>("batch_size")));
+  }
+};
+
+class CreateBatchReaderOpMaker : public DecoratedReaderMakerBase {
+ public:
+  CreateBatchReaderOpMaker(OpProto* op_proto, OpAttrChecker* op_checker)
+      : DecoratedReaderMakerBase(op_proto, op_checker) {
+    AddAttr<int>("batch_size",
+                 "How many instances the batch reader yields each time.")
+        .GreaterThan(0);
+    AddComment(R"DOC(
+      CreateBatchReader Operator
+
+      A batch reader takes another reader as its 'underlying reader',
+      gathers the underlying reader's outputs and then yields them in batches.
+    )DOC");
+  }
+};
+
+void BatchReader::ReadNext(std::vector<framework::LoDTensor>* out) {
+  buffer_.clear();
+  buffer_.reserve(batch_size_);
+  for (int i = 0; i < batch_size_; ++i) {
+    buffer_.push_back(std::vector<framework::LoDTensor>());
+    reader_->ReadNext(&buffer_.back());
+    if (buffer_.back().empty()) {
+      buffer_.pop_back();
+      break;
+    }
+  }
+  // Concat instances
+  out->clear();
+  if (buffer_.empty()) {
+    // If buffer_ is empty, 'out' will be returned as an empty vector.
+    return;
+  }
+  int out_num = buffer_[0].size();
+  out->reserve(out_num);
+  for (int j = 0; j < out_num; ++j) {
+    // Merge shape and check data type
+    std::type_index batch_type = buffer_[0][j].type();
+    framework::DDim batch_shape = buffer_[0][j].dims();
+    for (size_t i = 1; i < buffer_.size(); ++i) {
+      std::type_index ins_type = buffer_[i][j].type();
+      framework::DDim ins_shape = buffer_[i][j].dims();
+      PADDLE_ENFORCE_EQ(batch_type, ins_type);
+      PADDLE_ENFORCE_EQ(slice_ddim(batch_shape, 1, batch_shape.size()),
+                        slice_ddim(ins_shape, 1, ins_shape.size()));
+      PADDLE_ENFORCE_GT(ins_shape[0], 0);
+      batch_shape[0] += ins_shape[0];
+    }
+
+    framework::LoDTensor out_tensor;
+    out_tensor.Resize(batch_shape);
+    out_tensor.mutable_data(platform::CPUPlace(), batch_type);
+    int64_t dst_offset = 0;
+
+    // Merge lod and data
+    framework::LoD batch_lod;
+    for (size_t i = 0; i < buffer_.size(); ++i) {
+      framework::DDim ins_shape = buffer_[i][j].dims();
+      framework::LoD ins_lod = buffer_[i][j].lod();
+      if (i == 0) {
+        batch_lod = ins_lod;
+      } else {
+        PADDLE_ENFORCE_EQ(batch_lod.size(), ins_lod.size());
+        for (size_t level_idx = 0; level_idx < batch_lod.size(); ++level_idx) {
+          auto& lod_level = batch_lod[level_idx];
+          for (size_t k = 1; k < ins_lod[level_idx].size(); ++k) {
+            lod_level.push_back(ins_lod[level_idx][k] + lod_level.back());
+          }
+        }
+      }
+      auto dst = out_tensor.Slice(dst_offset, dst_offset + ins_shape[0]);
+      TensorCopy(buffer_[i][j], platform::CPUPlace(), &dst);
+      dst_offset += ins_shape[0];
+    }
+    out_tensor.set_lod(batch_lod);
+    out->push_back(out_tensor);
+  }
+}
+
+}  // namespace reader
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators::reader;
+REGISTER_DECORATED_READER_OPERATOR(create_batch_reader,
+                                   ops::CreateBatchReaderOp,
+                                   ops::CreateBatchReaderOpMaker);
diff --git a/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b6a0609a1e23195ececee0f16a69daa1c1c46ed8
--- /dev/null
+++ b/paddle/fluid/operators/reader/create_double_buffer_reader_op.cc
@@ -0,0 +1,116 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
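+//
+// The DoubleBufferReader below decorates an underlying reader with a bounded
+// channel (capacity kDoubleBufferSize) and a detached prefetch thread: the
+// thread keeps calling the underlying ReadNext() and Send()s each batch into
+// the channel, while the consumer-side ReadNext() simply Receive()s from it,
+// so reading overlaps with training. Closing the channel (on EOF or on
+// ReInit()) is what makes the prefetch thread terminate.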
+
+#include <thread>
+#include "paddle/fluid/framework/channel.h"
+#include "paddle/fluid/operators/reader/reader_op_registry.h"
+
+namespace paddle {
+namespace operators {
+namespace reader {
+
+static constexpr size_t kDoubleBufferSize = 2;
+
+class DoubleBufferReader : public framework::DecoratedReader {
+ public:
+  explicit DoubleBufferReader(ReaderBase* reader)
+      : DecoratedReader(reader),
+        buffer_(framework::MakeChannel<std::vector<framework::LoDTensor>>(
+            kDoubleBufferSize)) {
+    std::thread prefetch(&DoubleBufferReader::PrefetchThreadFunc, this);
+    prefetch.detach();
+  }
+
+  void ReadNext(std::vector<framework::LoDTensor>* out) override;
+  void ReInit() override;
+
+  ~DoubleBufferReader() { buffer_->Close(); }
+
+ private:
+  void PrefetchThreadFunc();
+
+  framework::Channel<std::vector<framework::LoDTensor>>* buffer_;
+};
+
+class CreateDoubleBufferReaderOp : public framework::OperatorBase {
+ public:
+  using framework::OperatorBase::OperatorBase;
+
+ private:
+  void RunImpl(const framework::Scope& scope,
+               const platform::Place& dev_place) const override {
+    const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader"))
+                                        ->Get<framework::ReaderHolder>();
+    auto* out = scope.FindVar(Output("Out"))
+                    ->template GetMutable<framework::ReaderHolder>();
+    out->Reset(new DoubleBufferReader(underlying_reader.Get()));
+  }
+};
+
+class CreateDoubleBufferReaderOpMaker : public DecoratedReaderMakerBase {
+ public:
+  CreateDoubleBufferReaderOpMaker(OpProto* op_proto, OpAttrChecker* op_checker)
+      : DecoratedReaderMakerBase(op_proto, op_checker) {
+    AddComment(R"DOC(
+      CreateDoubleBufferReader Operator
+
+      A double buffer reader takes another reader as its 'underlying reader'.
+      It launches another thread to execute the 'underlying reader'
+      asynchronously, which prevents the reading process from blocking
+      subsequent training.
+    )DOC");
+  }
+};
+
+void DoubleBufferReader::ReadNext(std::vector<framework::LoDTensor>* out) {
+  out->clear();
+  buffer_->Receive(out);
+}
+
+void DoubleBufferReader::ReInit() {
+  reader_->ReInit();
+  buffer_->Close();
+  // The existing prefetch thread will terminate because buffer_ is closed.
+  buffer_ = framework::MakeChannel<std::vector<framework::LoDTensor>>(
+      kDoubleBufferSize);
+  std::thread prefetch(&DoubleBufferReader::PrefetchThreadFunc, this);
+  prefetch.detach();
+}
+
+void DoubleBufferReader::PrefetchThreadFunc() {
+  VLOG(5) << "A new prefetch thread starts.";
+  while (true) {
+    std::vector<framework::LoDTensor> batch;
+    reader_->ReadNext(&batch);
+    if (batch.empty()) {
+      // EOF
+      buffer_->Close();
+      VLOG(5) << "Reached the end of the file. The prefetch thread terminates.";
+      break;
+    }
+    if (!buffer_->Send(&batch)) {
+      VLOG(5) << "WARNING: The double buffer channel has been closed. The "
+                 "prefetch thread terminates.";
+      break;
+    }
+  }
+}
+
+}  // namespace reader
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators::reader;
+REGISTER_DECORATED_READER_OPERATOR(create_double_buffer_reader,
+                                   ops::CreateDoubleBufferReaderOp,
+                                   ops::CreateDoubleBufferReaderOpMaker);
diff --git a/paddle/fluid/operators/reader/create_random_data_generator_op.cc b/paddle/fluid/operators/reader/create_random_data_generator_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..73c39b5da4484b27f75aeba3c8171c5ffed2398f
--- /dev/null
+++ b/paddle/fluid/operators/reader/create_random_data_generator_op.cc
@@ -0,0 +1,108 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/reader/reader_op_registry.h"
+
+namespace paddle {
+namespace operators {
+namespace reader {
+
+template <typename T>
+class RandomDataGenerator : public framework::FileReader {
+ public:
+  RandomDataGenerator(const std::vector<framework::DDim>& shapes, float min,
+                      float max)
+      : FileReader(shapes), min_(min), max_(max) {
+    PADDLE_ENFORCE_LE(min, max,
+                      "'min' shouldn't be greater than 'max'. (%f vs %f)", min,
+                      max);
+    unsigned int seed = std::random_device()();
+    engine_.seed(seed);
+    dist_ = std::uniform_real_distribution<float>(min_, max_);
+  }
+
+  void ReadNext(std::vector<framework::LoDTensor>* out) override {
+    out->clear();
+    out->reserve(shapes_.size());
+    for (const framework::DDim& shape : shapes_) {
+      PADDLE_ENFORCE_GE(
+          shape.size(), 2,
+          "The rank of reader's output data should be at least 2. "
+          "(Now it's %d)",
+          shape.size());
+      framework::LoDTensor out_tensor;
+      out_tensor.Resize(shape);
+      T* data = out_tensor.mutable_data<T>(platform::CPUPlace());
+      int64_t numel = framework::product(shape);
+      for (int64_t i = 0; i < numel; ++i) {
+        data[i] = dist_(engine_);
+      }
+      out->push_back(out_tensor);
+    }
+  }
+
+  void ReInit() override { return; }
+
+ private:
+  float min_;
+  float max_;
+  std::minstd_rand engine_;
+  std::uniform_real_distribution<float> dist_;
+};
+
+template <typename T>
+class CreateRandomDataGeneratorOp : public framework::OperatorBase {
+ public:
+  using framework::OperatorBase::OperatorBase;
+
+ private:
+  void RunImpl(const framework::Scope& scope,
+               const platform::Place& dev_place) const override {
+    const auto& shape_concat = Attr<std::vector<int>>("shape_concat");
+    const auto& ranks = Attr<std::vector<int>>("ranks");
+    PADDLE_ENFORCE(!shape_concat.empty() && !ranks.empty());
+    PADDLE_ENFORCE_EQ(std::accumulate(ranks.begin(), ranks.end(), 0),
+                      int(shape_concat.size()),
+                      "The sum of all ranks should be equal to the length of "
+                      "shape_concat.");
+    std::vector<framework::DDim> shapes = RestoreShapes(shape_concat, ranks);
+    auto* out = scope.FindVar(Output("Out"))
+                    ->template GetMutable<framework::ReaderHolder>();
+    out->Reset(new RandomDataGenerator<T>(shapes, Attr<float>("min"),
+                                          Attr<float>("max")));
+  }
+};
+
+class CreateRandomDataGeneratorOpMaker : public FileReaderMakerBase {
+ public:
+  CreateRandomDataGeneratorOpMaker(OpProto* op_proto, OpAttrChecker* op_checker)
+      : FileReaderMakerBase(op_proto, op_checker) {
+    AddAttr<float>("min", "The lower bound of reader's uniform distribution.");
+    AddAttr<float>("max", "The upper bound of reader's uniform distribution.");
+    AddComment(R"DOC(
+      CreateRandomDataGenerator Operator
+
+      This Op creates a random reader.
+      The reader generates random data instead of actually reading from files.
+      Generated data follow a uniform distribution between 'min' and 'max'.
+    )DOC");
+  }
+};
+
+}  // namespace reader
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators::reader;
+REGISTER_FILE_READER_OPERATOR(create_random_data_generator,
+                              ops::CreateRandomDataGeneratorOp<float>,
+                              ops::CreateRandomDataGeneratorOpMaker);
diff --git a/paddle/fluid/operators/reader/create_shuffle_reader_op.cc b/paddle/fluid/operators/reader/create_shuffle_reader_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..4dac3831109beeed660d32f08fb27c7adf62ac2b
--- /dev/null
+++ b/paddle/fluid/operators/reader/create_shuffle_reader_op.cc
@@ -0,0 +1,97 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/reader/reader_op_registry.h"
+
+namespace paddle {
+namespace operators {
+namespace reader {
+
+class ShuffleReader : public framework::DecoratedReader {
+ public:
+  ShuffleReader(ReaderBase* reader, int buffer_size)
+      : DecoratedReader(reader), buffer_size_(buffer_size), iteration_pos_(0) {
+    buffer_.reserve(buffer_size);
+  }
+
+  void ReadNext(std::vector<framework::LoDTensor>* out) override;
+
+ private:
+  int buffer_size_;
+  std::vector<std::vector<framework::LoDTensor>> buffer_;
+  size_t iteration_pos_;
+};
+
+void ShuffleReader::ReadNext(std::vector<framework::LoDTensor>* out) {
+  if (iteration_pos_ >= buffer_.size()) {
+    // Reload buffer with new data
+    buffer_.clear();
+    buffer_.reserve(buffer_size_);
+    for (int i = 0; i < buffer_size_; ++i) {
+      buffer_.push_back(std::vector<framework::LoDTensor>());
+      reader_->ReadNext(&buffer_.back());
+      if (buffer_.back().empty()) {
+        buffer_.pop_back();
+        break;
+      }
+    }
+    // TODO(fengjiayi): 'std::random_shuffle' can be very slow. It needs to be
+    // optimized.
+    std::random_shuffle(buffer_.begin(), buffer_.end());
+    iteration_pos_ = 0;
+  }
+  out->clear();
+  if (!buffer_.empty()) {
+    std::swap(*out, buffer_[iteration_pos_++]);
+  }
+  // If buffer_ is empty, 'out' will be returned as an empty vector.
+}
+
+class CreateShuffleReaderOp : public framework::OperatorBase {
+ public:
+  using framework::OperatorBase::OperatorBase;
+
+ private:
+  void RunImpl(const framework::Scope& scope,
+               const platform::Place& dev_place) const override {
+    const auto& underlying_reader = scope.FindVar(Input("UnderlyingReader"))
+                                        ->Get<framework::ReaderHolder>();
+    auto* out = scope.FindVar(Output("Out"))
+                    ->template GetMutable<framework::ReaderHolder>();
+    out->Reset(
+        new ShuffleReader(underlying_reader.Get(), Attr<int>("buffer_size")));
+  }
+};
+
+class CreateShuffleReaderOpMaker : public DecoratedReaderMakerBase {
+ public:
+  CreateShuffleReaderOpMaker(OpProto* op_proto, OpAttrChecker* op_checker)
+      : DecoratedReaderMakerBase(op_proto, op_checker) {
+    AddAttr<int>("buffer_size", "The shuffle buffer size.").GreaterThan(0);
+    AddComment(R"DOC(
+      CreateShuffleReader Operator
+
+      A shuffle reader takes another reader as its 'underlying reader'
+      and yields the underlying reader's outputs in a shuffled order.
+ )DOC"); + } +}; +} // namespace reader +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators::reader; +REGISTER_DECORATED_READER_OPERATOR(create_shuffle_reader, + ops::CreateShuffleReaderOp, + ops::CreateShuffleReaderOpMaker); diff --git a/paddle/fluid/operators/reader/reader_op_registry.cc b/paddle/fluid/operators/reader/reader_op_registry.cc new file mode 100644 index 0000000000000000000000000000000000000000..7ea4f4b8d9feecac5bc2d0338bbbe9ab7a532040 --- /dev/null +++ b/paddle/fluid/operators/reader/reader_op_registry.cc @@ -0,0 +1,116 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "reader_op_registry.h" + +namespace paddle { +namespace operators { +namespace reader { + +std::vector RestoreShapes(const std::vector& shape_concat, + const std::vector& ranks) { + std::vector res; + int offset = 0; + for (int len : ranks) { + auto start_it = shape_concat.begin() + offset; + auto end_it = start_it + len; + res.push_back(framework::make_ddim(std::vector(start_it, end_it))); + offset += len; + } + return res; +} + +FileReaderMakerBase::FileReaderMakerBase( + framework::OpProtoAndCheckerMaker::OpProto* op_proto, + framework::OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(op_proto, op_checker) { + AddOutput("Out", "(ReaderHolder) The created random reader."); + AddAttr>("shape_concat", "The concat of all data's shapes."); + AddAttr>( + "ranks", + "The ranks of each data." + "e.g." 
+ "shape_concat = [2,3,4,5,6]" + "ranks = [3,2]" + "It means the reader will generate two data each time," + "whose shapes are [2,3,4] and [5,6] respectively."); + AddAttr>("lod_levels", "The LoD levels of each data."); +} + +void FileReaderInferShape::operator()(framework::InferShapeContext* ctx) const { + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "The output file reader should not be null."); + const auto shape_concat = ctx->Attrs().Get>("shape_concat"); + const auto ranks = ctx->Attrs().Get>("ranks"); + std::vector shapes = RestoreShapes(shape_concat, ranks); + ctx->SetReaderDims("Out", shapes); + + if (ctx->IsRuntime()) { + const auto lod_levels = ctx->Attrs().Get>("lod_levels"); + PADDLE_ENFORCE_EQ(lod_levels.size(), shapes.size(), + "The number of 'lod_levels'(%d) doesn't match the number " + "of 'shapes'(%d).", + lod_levels.size(), shapes.size()); + framework::VarDesc* reader = + boost::get(ctx->GetOutputVarPtrs("Out")[0]); + reader->SetLoDLevels(lod_levels); + } +} + +void FileReaderInferVarType::operator()(const framework::OpDesc& op_desc, + framework::BlockDesc* block) const { + std::string reader_name = op_desc.Output("Out")[0]; + framework::VarDesc* reader = block->FindVarRecursive(reader_name); + reader->SetType(framework::proto::VarType::READER); +} + +void DecoratedReaderInferShape::operator()( + framework::InferShapeContext* ctx) const { + PADDLE_ENFORCE(ctx->HasInput("UnderlyingReader"), + "Input(UnderlyingReader) should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "The output decorated reader should not be null."); + ctx->SetReaderDims("Out", ctx->GetReaderDims("UnderlyingReader")); + + if (ctx->IsRuntime()) { + framework::VarDesc* in_reader = boost::get( + ctx->GetInputVarPtrs("UnderlyingReader")[0]); + framework::VarDesc* out_reader = + boost::get(ctx->GetOutputVarPtrs("Out")[0]); + out_reader->SetLoDLevels(in_reader->GetLoDLevels()); + } +} +void DecoratedReaderInferVarType::operator()( + const framework::OpDesc& op_desc, framework::BlockDesc* block) const { + std::string in_reader_name = op_desc.Input("UnderlyingReader")[0]; + framework::VarDesc* in_reader = block->FindVarRecursive(in_reader_name); + std::string out_reader_name = op_desc.Output("Out")[0]; + framework::VarDesc* out_reader = block->FindVarRecursive(out_reader_name); + out_reader->SetType(framework::proto::VarType::READER); + out_reader->SetDataTypes(in_reader->GetDataTypes()); +} + +DecoratedReaderMakerBase::DecoratedReaderMakerBase( + framework::OpProtoAndCheckerMaker::OpProto* op_proto, + framework::OpAttrChecker* op_checker) + : OpProtoAndCheckerMaker(op_proto, op_checker) { + AddInput("UnderlyingReader", + "(ReaderHolder) The underlying reader for creating a batch reader."); + AddOutput("Out", "(ReaderHolder) The created batch reader."); +} + +} // namespace reader + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/reader/reader_op_registry.h b/paddle/fluid/operators/reader/reader_op_registry.h new file mode 100644 index 0000000000000000000000000000000000000000..d1f0498f4692247cda72fbcbdd5070ddfaa11553 --- /dev/null +++ b/paddle/fluid/operators/reader/reader_op_registry.h @@ -0,0 +1,75 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/reader.h"
+
+namespace paddle {
+namespace operators {
+namespace reader {
+
+extern std::vector<framework::DDim> RestoreShapes(
+    const std::vector<int64_t>& shape_concat, const std::vector<int>& ranks);
+
+class FileReaderMakerBase : public framework::OpProtoAndCheckerMaker {
+ public:
+  FileReaderMakerBase(OpProto* op_proto, OpAttrChecker* op_checker);
+};
+
+class FileReaderInferShape : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext* ctx) const override;
+};
+
+class FileReaderInferVarType : public framework::VarTypeInference {
+ public:
+  void operator()(const framework::OpDesc& op_desc,
+                  framework::BlockDesc* block) const override;
+};
+
+// general infershape for decorated reader
+class DecoratedReaderInferShape : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext* ctx) const override;
+};
+
+// general var type inference for decorated reader
+class DecoratedReaderInferVarType : public framework::VarTypeInference {
+ public:
+  void operator()(const framework::OpDesc& op_desc,
+                  framework::BlockDesc* block) const override;
+};
+
+class DecoratedReaderMakerBase : public framework::OpProtoAndCheckerMaker {
+ public:
+  DecoratedReaderMakerBase(OpProto* op_proto, OpAttrChecker* op_checker);
+};
+
+}  // namespace reader
+}  // namespace operators
+}  // namespace paddle
+
+#define REGISTER_FILE_READER_OPERATOR(op_name, ...)                   \
+  REGISTER_OPERATOR(op_name, __VA_ARGS__,                             \
+                    paddle::operators::reader::FileReaderInferShape,  \
+                    paddle::framework::EmptyGradOpMaker,              \
+                    paddle::operators::reader::FileReaderInferVarType)
+
+#define REGISTER_DECORATED_READER_OPERATOR(op_name, ...)                   \
+  REGISTER_OPERATOR(op_name, __VA_ARGS__,                                  \
+                    paddle::operators::reader::DecoratedReaderInferShape,  \
+                    paddle::framework::EmptyGradOpMaker,                   \
+                    paddle::operators::reader::DecoratedReaderInferVarType)
diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc
index 2ad49437a97be5c69f81f0242b6ee23b90df955b..b094e649c3dcdd9ac85979230570d46af308f7ca 100644
--- a/paddle/fluid/operators/reshape_op.cc
+++ b/paddle/fluid/operators/reshape_op.cc
@@ -118,6 +118,9 @@ class ReshapeOpMaker : public framework::OpProtoAndCheckerMaker {
     AddAttr<std::vector<int>>(
         "shape", "(std::vector<int>) Target shape of reshape operator.")
         .SetDefault(std::vector<int>());
+    AddAttr<bool>("inplace",
+                  "Change the source tensor's shape without copying memory.")
+        .SetDefault(true);
     AddComment(R"DOC(
 Reshape Operator.
diff --git a/paddle/fluid/operators/reshape_op.h b/paddle/fluid/operators/reshape_op.h
index 0c97dc639f07522a5ffd5e5cb27426d65bc9ab75..23fbf1655c7566bb229a2be608ee91c8ef024af7 100644
--- a/paddle/fluid/operators/reshape_op.h
+++ b/paddle/fluid/operators/reshape_op.h
@@ -38,9 +38,15 @@ class ReshapeKernel : public framework::OpKernel<T> {
       out_dims = out->dims();
     }
 
-    out->mutable_data<T>(ctx.GetPlace());
-    framework::TensorCopy(*in, ctx.GetPlace(), ctx.device_context(), out);
-    out->Resize(out_dims);
+    bool inplace = ctx.Attr<bool>("inplace");
+    if (!inplace) {
+      out->mutable_data<T>(ctx.GetPlace());
+      framework::TensorCopy(*in, ctx.GetPlace(), ctx.device_context(), out);
+      out->Resize(out_dims);
+    } else {
+      out->ShareDataWith(*in);
+      out->Resize(out_dims);
+    }
   }
 
  private:
diff --git a/paddle/fluid/operators/send_op.cc b/paddle/fluid/operators/send_op.cc
index 178976f96fdbd08cead7b7c518ea1fbaaa2a5db8..8fdd08eae6b22cd57506d6e75182c1a7e2022562 100644
--- a/paddle/fluid/operators/send_op.cc
+++ b/paddle/fluid/operators/send_op.cc
@@ -24,15 +24,15 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-static bool IsVariableInitialized(const framework::Scope& scope,
-                                  const std::string& varname) {
+static bool NeedSend(const framework::Scope& scope,
+                     const std::string& varname) {
   auto* var = scope.FindVar(varname);
   PADDLE_ENFORCE_NOT_NULL(var, "Can not find variable '%s' in the send side.",
                           varname);
   if (var->IsType<framework::LoDTensor>()) {
     return var->Get<framework::LoDTensor>().IsInitialized();
   } else if (var->IsType<framework::SelectedRows>()) {
-    return var->Get<framework::SelectedRows>().value().IsInitialized();
+    return var->Get<framework::SelectedRows>().rows().size() > 0UL;
   } else {
     PADDLE_THROW(
         "Variable type in send side should be in "
@@ -67,7 +67,7 @@ class SendOp : public framework::OperatorBase {
     detail::RPCClient* rpc_client = client_var->GetMutable<detail::RPCClient>();
 
     for (size_t i = 0; i < ins.size(); i++) {
-      if (IsVariableInitialized(scope, ins[i])) {
+      if (NeedSend(scope, ins[i])) {
         VLOG(3) << "sending " << ins[i] << " to " << epmap[i];
         rpc_client->AsyncSendVariable(epmap[i], ctx, scope, ins[i]);
       } else {
diff --git a/paddle/fluid/operators/sgd_op.cc b/paddle/fluid/operators/sgd_op.cc
index 7cc73de8788e3dceb763b6f5a1519459d0fb05dd..d0aa2f9cbadaadf4e7e625628d9db5677d50d277 100644
--- a/paddle/fluid/operators/sgd_op.cc
+++ b/paddle/fluid/operators/sgd_op.cc
@@ -39,6 +39,14 @@ class SGDOp : public framework::OperatorWithKernel {
     // and run time.
     ctx->SetOutputDim("ParamOut", param_dim);
   }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::ToDataType(ctx.Input<framework::LoDTensor>("Param")->type()),
+        ctx.GetPlace());
+  }
 };
 
 class SGDOpMaker : public framework::OpProtoAndCheckerMaker {
diff --git a/paddle/fluid/operators/sgd_op.h b/paddle/fluid/operators/sgd_op.h
index 2fec84815a9ecc63675de88816b23cfaa75aca65..0ad801079400f1830d85a945e57a434a86adeb00 100644
--- a/paddle/fluid/operators/sgd_op.h
+++ b/paddle/fluid/operators/sgd_op.h
@@ -47,6 +47,12 @@ class SGDOpKernel : public framework::OpKernel<T> {
       PADDLE_ENFORCE_EQ(param, param_out);
       auto* grad = ctx.Input<framework::SelectedRows>("Grad");
 
+      // for distributed training, a sparse var may be empty,
+      // just skip updating.
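+      // (e.g. with sharded parameters, a trainer may receive a SelectedRows
+      // gradient that carries no rows for the shard this op updates)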
+      if (grad->rows().size() == 0) {
+        return;
+      }
+
       auto in_height = grad->height();
       auto out_dims = param_out->dims();
       PADDLE_ENFORCE_EQ(in_height, out_dims[0]);
@@ -60,13 +66,15 @@ class SGDOpKernel : public framework::OpKernel<T> {
       auto* in_data = in_value.data<T>();
       auto* out_data = param_out->data<T>();
       auto* lr = learning_rate->data<T>();
-
       for (size_t i = 0; i < in_rows.size(); i++) {
+        PADDLE_ENFORCE(in_rows[i] < in_height,
+                       "Input rows index should be less than height");
         for (int64_t j = 0; j < in_row_numel; j++) {
           out_data[in_rows[i] * in_row_numel + j] -=
               lr[0] * in_data[i * in_row_numel + j];
         }
       }
+
     } else {
       PADDLE_THROW("Unsupported Variable Type of Grad");
     }
diff --git a/paddle/fluid/operators/split_selected_rows_op.h b/paddle/fluid/operators/split_selected_rows_op.h
index 23baf8e72eca87f3865fc2b63ce2de96f799dce3..0e9ce165b98845f4745ee70b028513ea31cc6657 100644
--- a/paddle/fluid/operators/split_selected_rows_op.h
+++ b/paddle/fluid/operators/split_selected_rows_op.h
@@ -21,15 +21,24 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
-static int FindOutIdx(int row, const std::vector<int>& height_sections) {
-  int offset = 0;
-  for (size_t i = 0; i < height_sections.size(); ++i) {
-    if (row >= offset && row < (offset + height_sections[i])) {
-      return i;
+static int FindOutIdx(int row, const std::vector<int>& abs_sections) {
+  for (size_t i = 1; i < abs_sections.size(); ++i) {
+    if (row < abs_sections[i]) {
+      return i - 1;
     }
-    offset += height_sections[i];
   }
-  return -1;
+  return abs_sections.size() - 1;
+}
+
+static std::vector<int> ToAbsoluteSection(
+    const std::vector<int>& height_sections) {
+  std::vector<int> abs_sections;
+  abs_sections.resize(height_sections.size());
+  abs_sections[0] = 0;
+  for (size_t i = 1; i < height_sections.size(); ++i) {
+    abs_sections[i] = height_sections[i - 1] + abs_sections[i - 1];
+  }
+  return abs_sections;
 }
 
 template <typename DeviceContext, typename T>
@@ -40,16 +49,23 @@ class SplitSelectedRowsOpKernel : public framework::OpKernel<T> {
     auto outs = ctx.MultiOutput<framework::SelectedRows>("Out");
     auto height_sections = ctx.Attr<std::vector<int>>("height_sections");
 
+    auto abs_sections = ToAbsoluteSection(height_sections);
+
     auto x_rows = x->rows();
     std::vector<std::vector<int64_t>> outs_rows_idx;
+    std::vector<std::vector<int64_t>> outs_dense_idx;
+    outs_rows_idx.resize(outs.size());
+    outs_dense_idx.resize(outs.size());
 
     auto row_numel = x->value().numel() / x->value().dims()[0];
     auto src = x->value().data<T>();
 
+    // split rows index into output sparse vars
+    // (e.g. height_sections = [20, 30] gives abs_sections = [0, 20], so
+    // global row 25 belongs to output 1 and becomes local row 25 - 20 = 5)
    for (size_t i = 0; i < x_rows.size(); ++i) {
-      int out_idx = FindOutIdx(x_rows[i], height_sections);
-      outs_rows_idx[out_idx].push_back(i);
+      int out_idx = FindOutIdx(x_rows[i], abs_sections);
+      outs_rows_idx[out_idx].push_back(x_rows[i]);
+      outs_dense_idx[out_idx].push_back(i);
     }
     auto place = ctx.GetPlace();
 
@@ -61,19 +77,20 @@ class SplitSelectedRowsOpKernel : public framework::OpKernel<T> {
       dims[0] = rows_idx.size();
       outs[i]->mutable_value()->mutable_data<T>(dims, x->place());
       for (auto idx : rows_idx) {
-        outs[i]->mutable_rows()->push_back(x_rows[idx]);
+        outs[i]->mutable_rows()->push_back(idx - abs_sections[i]);
       }
       auto dst = outs[i]->mutable_value()->mutable_data<T>(ctx.GetPlace());
       for (size_t j = 0; j < rows_idx.size(); j++) {
         if (platform::is_cpu_place(place)) {
-          memory::Copy(platform::CPUPlace(), dst + j * row_numel,
-                       platform::CPUPlace(), src + rows_idx[j] * row_numel,
-                       sizeof(T) * row_numel);
+          memory::Copy(
+              platform::CPUPlace(), dst + j * row_numel, platform::CPUPlace(),
+              src + outs_dense_idx[i][j] * row_numel, sizeof(T) * row_numel);
         } else {
 #ifdef PADDLE_WITH_CUDA
           auto stream = ctx.cuda_device_context().stream();
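+          // note: the destination row j is filled sequentially, while the
+          // source offset uses outs_dense_idx[i][j], i.e. the position of
+          // that row inside the input x, not its (section-relative) row id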
           memory::Copy(platform::CUDAPlace(), dst + j * row_numel,
-                       platform::CUDAPlace(), src + rows_idx[j] * row_numel,
+                       platform::CUDAPlace(),
+                       src + outs_dense_idx[i][j] * row_numel,
                        sizeof(T) * row_numel, stream);
 #else
           PADDLE_THROW("Paddle is not compiled with GPU");
diff --git a/paddle/fluid/operators/sum_op.cc b/paddle/fluid/operators/sum_op.cc
index c3abb3ea4a53126c22c817069e8ad955b202f09d..d3d5c8a3429e2070c5472355b4440401eaa699cb 100644
--- a/paddle/fluid/operators/sum_op.cc
+++ b/paddle/fluid/operators/sum_op.cc
@@ -76,10 +76,16 @@ class SumOp : public framework::OperatorWithKernel {
           static_cast<framework::proto::VarType::Type>(dtype),
           ctx.device_context());
     } else if (x_vars[0]->IsType<framework::SelectedRows>()) {
-      return framework::OpKernelType(
-          framework::ToDataType(
-              x_vars[0]->Get<framework::SelectedRows>().value().type()),
-          ctx.device_context());
+      for (auto& var : x_vars) {
+        auto& value = var->Get<framework::SelectedRows>().value();
+        if (value.IsInitialized()) {
+          return framework::OpKernelType(framework::ToDataType(value.type()),
+                                         ctx.device_context());
+        }
+      }
+      // if input sparse vars are not initialized, use a default kernel type.
+      return framework::OpKernelType(framework::proto::VarType::FP32,
+                                     ctx.device_context());
     } else if (x_vars[0]->IsType<framework::LoDTensorArray>()) {
       for (auto& x_var : x_vars) {
         auto& array = x_var->Get<framework::LoDTensorArray>();
diff --git a/paddle/fluid/operators/sum_op.h b/paddle/fluid/operators/sum_op.h
index 48b2d2779aeeb168cf87e61557e01d5cbde476b3..e7e5346cdca5efaf81c2b0fddedde7406e3b874d 100644
--- a/paddle/fluid/operators/sum_op.h
+++ b/paddle/fluid/operators/sum_op.h
@@ -109,6 +109,12 @@ class SumKernel : public framework::OpKernel<T> {
       in_dim[0] = static_cast<int64_t>(first_dim);
 
       out_value->Resize(framework::make_ddim(in_dim));
+
+      // if all the input sparse vars are empty, no need to
+      // merge these vars.
+      if (first_dim == 0UL) {
+        return;
+      }
       out_value->mutable_data<T>(context.GetPlace());
 
       math::SelectedRowsAddTo<DeviceContext, T> functor;
@@ -116,7 +122,7 @@ class SumKernel : public framework::OpKernel<T> {
       int64_t offset = 0;
       for (int i = 0; i < N; i++) {
         auto &sel_row = get_selected_row(i);
-        if (!sel_row.value().IsInitialized() || sel_row.rows().size() == 0) {
+        if (sel_row.rows().size() == 0) {
           continue;
         }
         PADDLE_ENFORCE_EQ(out->height(), sel_row.height());
diff --git a/paddle/fluid/platform/cudnn_helper.h b/paddle/fluid/platform/cudnn_helper.h
index 48c967de1155a705b76f0285a4baed52d7ce1f58..1842ecd745e3f5cb75600ce00d89018f81682632 100644
--- a/paddle/fluid/platform/cudnn_helper.h
+++ b/paddle/fluid/platform/cudnn_helper.h
@@ -15,6 +15,8 @@ limitations under the License.
 */
 
 #pragma once
 
 #include <cudnn.h>
+
+#include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/platform/dynload/cudnn.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/macros.h"
@@ -282,5 +284,17 @@ class ScopedPoolingDescriptor {
   DISABLE_COPY_AND_ASSIGN(ScopedPoolingDescriptor);
 };
 
+inline bool CanCUDNNBeUsed(const framework::ExecutionContext& ctx) {
+  bool use_cudnn = ctx.Attr<bool>("use_cudnn");
+  use_cudnn &= paddle::platform::is_gpu_place(ctx.GetPlace());
+#ifdef PADDLE_WITH_CUDA
+  if (use_cudnn) {
+    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
+    use_cudnn &= dev_ctx.cudnn_handle() != nullptr;
+  }
+#endif
+  return use_cudnn;
+}
+
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc
index 7da6e04d0a8b81bcb5fb6b105ebdd5b908cf8f1d..bb9fbd468f38fffc94107e321e777fc0e772fbe6 100644
--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
@@ -33,9 +33,15 @@ DeviceContextPool::DeviceContextPool(
   PADDLE_ENFORCE_GT(places.size(), 0);
   for (size_t i = 0; i < places.size(); i++) {
     if (platform::is_cpu_place(places[i])) {
+#ifdef PADDLE_WITH_MKLDNN
+      device_contexts_.emplace(places[i],
+                               new platform::MKLDNNDeviceContext(
+                                   boost::get<platform::CPUPlace>(places[i])));
+#else
       device_contexts_.emplace(places[i],
                                new platform::CPUDeviceContext(
                                    boost::get<platform::CPUPlace>(places[i])));
+#endif
     } else if (platform::is_gpu_place(places[i])) {
 #ifdef PADDLE_WITH_CUDA
       device_contexts_.emplace(places[i],
@@ -121,6 +127,8 @@ class EigenCudaStreamDevice : public Eigen::StreamInterface {
 
 CUDADeviceContext::CUDADeviceContext(CUDAPlace place) : place_(place) {
   SetDeviceId(place_.device);
+  multi_process = GetCUDAMultiProcessors(place_.device);
+  max_threads_per_mp = GetCUDAMaxThreadsPerMultiProcessor(place_.device);
   PADDLE_ENFORCE(cudaStreamCreate(&stream_));
   eigen_stream_.reset(new EigenCudaStreamDevice());
   eigen_stream_->Reinitialize(&stream_, place);
@@ -154,6 +162,10 @@ void CUDADeviceContext::Wait() const {
   PADDLE_ENFORCE(cudaGetLastError());
 }
 
+int CUDADeviceContext::GetMaxPhysicalThreadCount() const {
+  return multi_process * max_threads_per_mp;
+}
+
 Eigen::GpuDevice* CUDADeviceContext::eigen_device() const {
   return eigen_device_.get();
 }
@@ -170,64 +182,38 @@ cudaStream_t CUDADeviceContext::stream() const { return stream_; }
 
 #ifdef PADDLE_WITH_MKLDNN
 MKLDNNDeviceContext::MKLDNNDeviceContext(CPUPlace place)
-    : CPUDeviceContext(place), ready_(false) {
-  stream_.reset(new mkldnn::stream(mkldnn::stream::kind::eager));
-  engine_.reset(new mkldnn::engine(mkldnn::engine::cpu, 0));
+    : CPUDeviceContext(place), engine_(mkldnn::engine::cpu, 0), p_blobs_() {
+  p_blobs_.reset(new std::unordered_map<std::string, std::shared_ptr<void>>());
 }
 
-template <typename T>
-void MKLDNNDeviceContext::AddElement(const std::string& op_key,
-                                     const T& value) {
-  if (GetElement<T>(op_key)) {
-    return;
-  }
-  GetElementPool<T>().emplace(op_key, std::move(value));
-}
+void MKLDNNDeviceContext::SetBlob(const std::string& name,
+                                  std::shared_ptr<void> data) const {
+  std::unordered_map<std::string, std::shared_ptr<void>>* p;
+  p = p_blobs_.get();
 
-template <typename T>
-const T& MKLDNNDeviceContext::GetElement(const std::string& op_key) const {
-  auto it = GetElementPool<T>().find(op_key);
-  return it == GetElementPool<T>().end() ? nullptr : it->second;
-}
+  auto it = p->find(name);
 
-template <>
-const std::unordered_map<std::string, MKLDNNMemoryPtr>&
-MKLDNNDeviceContext::GetElementPool<MKLDNNMemoryPtr>() const {
-  return memory_pool_;
-}
+  if (it == p->end()) {
+    (*p)[name] = data;  // create new blob
+  } else {
+    it->second = data;  // set data to existing blob
+  }
 
-template <>
-const std::unordered_map<std::string, MKLDNNPrimitivePtr>&
-MKLDNNDeviceContext::GetElementPool<MKLDNNPrimitivePtr>() const {
-  return primitive_pool_;
+  return;
 }
 
-template <>
-const std::unordered_map<std::string, MKLDNNPrimitiveDescPtr>&
-MKLDNNDeviceContext::GetElementPool<MKLDNNPrimitiveDescPtr>() const {
-  return primitive_desc_pool_;
-}
+std::shared_ptr<void> MKLDNNDeviceContext::GetBlob(
+    const std::string& name) const {
+  std::unordered_map<std::string, std::shared_ptr<void>>* p;
+  p = p_blobs_.get();
 
-void MKLDNNDeviceContext::Execute(bool block) {
-  if (pipeline_.empty()) {
-    return;
-  }
-  ResetStream();
-  stream_->submit(pipeline_).wait(block);
-  ready_ = false;
-  pipeline_.clear();
-}
+  auto it = p->find(name);
 
-void MKLDNNDeviceContext::ResetStream() {
-  if (ready_) {
-    return;
+  if (it != p->end()) {
+    return it->second;
   }
-  // TODO(TJ): change me when mkldnn have specific method to reset this state
-  stream_.reset(new mkldnn::stream(mkldnn::stream::kind::eager));
-  ready_ = true;
+
+  return nullptr;
 }
 #endif
diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h
index a294ba5101528c9ac0007bdcfc5255a0c2674aad..e779644190de1246cd650fbf91eeaeb03494643f 100644
--- a/paddle/fluid/platform/device_context.h
+++ b/paddle/fluid/platform/device_context.h
@@ -22,7 +22,7 @@ limitations under the License. */
 #endif
 
 #ifdef PADDLE_WITH_MKLDNN
-#include "paddle/fluid/platform/mkldnn_helper.h"
+#include <mkldnn.hpp>
 #endif
 
 #include "paddle/fluid/platform/enforce.h"
@@ -79,6 +79,9 @@ class CUDADeviceContext : public DeviceContext {
   /*! \brief  Return place in the device context. */
   Place GetPlace() const override;
 
+  /*! \brief  Return the max physical thread count in the device context */
+  int GetMaxPhysicalThreadCount() const;
+
   /*! \brief  Return eigen device in the device context. */
   Eigen::GpuDevice* eigen_device() const;
 
@@ -100,6 +103,9 @@ class CUDADeviceContext : public DeviceContext {
   cudaStream_t stream_;
   cudnnHandle_t cudnn_handle_;
   cublasHandle_t cublas_handle_;
+
+  int multi_process;
+  int max_threads_per_mp;
 };
 
 template <>
@@ -114,46 +120,19 @@ class MKLDNNDeviceContext : public CPUDeviceContext {
  public:
   explicit MKLDNNDeviceContext(CPUPlace place);
 
-  /* \brief  Add new element: memory, primitive or primitive desc */
-  template <typename T>
-  void AddElement(const std::string& op_key, const T& value);
-
-  /* \brief  Get existed element: memory, primitive or primitive desc */
-  template <typename T>
-  const T& GetElement(const std::string& op_key) const;
-
-  /* \brief  Get element pool: memory, primitive or primitive desc pool */
-  template <typename T>
-  const std::unordered_map<std::string, T>&
-  GetElementPool() const;
-
   /* \brief  Get the active engine */
-  const MKLDNNEngine& engine() const { return *engine_; }
-
-  /* \brief  Submit primitive to pipeline */
-  void Submit(const MKLDNNPrimitivePtr& p) { pipeline_.push_back(*p); }
+  const mkldnn::engine& GetEngine() const { return engine_; }
 
-  /*! \brief  Execute all submitted primitives in pipeline */
-  void Execute(bool block = true);
+  // Set data to blob (i.e. name/data pair). Create blob if not existing
+  void SetBlob(const std::string& name, std::shared_ptr<void> data) const;
 
- protected:
-  /*! \brief  Reset the stream to prepare next execute */
-  void ResetStream();
+  // Find a saved blob. Return nullptr if not found
+  std::shared_ptr<void> GetBlob(const std::string& name) const;
 
  private:
-  std::unordered_map<std::string, MKLDNNMemoryPtr> memory_pool_;
-  std::unordered_map<std::string, MKLDNNPrimitivePtr> primitive_pool_;
-  std::unordered_map<std::string, MKLDNNPrimitiveDescPtr> primitive_desc_pool_;
-  std::vector<MKLDNNPrimitive> pipeline_;
-  MKLDNNStreamPtr stream_;
-  MKLDNNEnginePtr engine_;
-  bool ready_;
+  mkldnn::engine engine_;
+  std::shared_ptr<std::unordered_map<std::string, std::shared_ptr<void>>>
+      p_blobs_;
 };
 #endif
diff --git a/paddle/fluid/platform/device_tracer.cc b/paddle/fluid/platform/device_tracer.cc
index 265343573b3c85335f0b1c9a55e60cf9cf13f3c3..78e00d5420bbea40c9bea4be919ec4ce5ececdcb 100644
--- a/paddle/fluid/platform/device_tracer.cc
+++ b/paddle/fluid/platform/device_tracer.cc
@@ -192,21 +192,36 @@ class DeviceTracerImpl : public DeviceTracer {
   }
 
   void AddCPURecords(const char *anno, uint64_t start_ns, uint64_t end_ns) {
+    if (!anno) {
+      // TODO(panyx0718): Currently, nested annotations are not supported.
+      // An upper-level annotation can be cleared by a lower-level one,
+      // so we may get a nullptr here.
+      return;
+    }
     std::lock_guard<std::mutex> l(trace_mu_);
-    cpu_records_.push_back(
-        CPURecord{anno, start_ns, end_ns,
-                  std::hash<std::thread::id>{}(std::this_thread::get_id())});
+    cpu_records_.push_back(CPURecord{anno, start_ns, end_ns, 0});
   }
 
   void AddMemRecords(const std::string &name, uint64_t start_ns,
                      uint64_t end_ns, uint32_t device_id, uint32_t stream_id,
                      uint32_t correlation_id, uint64_t bytes) {
+    // 0 means timestamp information could not be collected for the kernel.
+    if (start_ns == 0 || end_ns == 0) {
+      VLOG(3) << name << " cannot be traced";
+      return;
+    }
+
     std::lock_guard<std::mutex> l(trace_mu_);
     mem_records_.push_back(MemRecord{name, start_ns, end_ns, device_id,
                                      stream_id, correlation_id, bytes});
   }
 
   void AddKernelRecords(uint64_t start, uint64_t end, uint32_t device_id,
                         uint32_t stream_id, uint32_t correlation_id) {
+    // 0 means timestamp information could not be collected for the kernel.
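+    // Such records are skipped here, mirroring AddMemRecords() above.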
+    if (start == 0 || end == 0) {
+      VLOG(3) << correlation_id << " cannot be traced";
+      return;
+    }
     std::lock_guard<std::mutex> l(trace_mu_);
     kernel_records_.push_back(
         KernelRecord{start, end, device_id, stream_id, correlation_id});
@@ -279,10 +294,10 @@ class DeviceTracerImpl : public DeviceTracer {
       event->set_device_id(r.device_id);
       event->mutable_memcopy()->set_bytes(r.bytes);
     }
-    std::string profile_str;
-    google::protobuf::TextFormat::PrintToString(profile_pb, &profile_str);
     std::ofstream profile_f;
     profile_f.open(profile_path, std::ios::out | std::ios::trunc);
+    std::string profile_str;
+    profile_pb.SerializeToString(&profile_str);
     profile_f << profile_str;
     profile_f.close();
     return profile_pb;
diff --git a/paddle/fluid/platform/dynload/cublas.h b/paddle/fluid/platform/dynload/cublas.h
index 580ed9bb57fca942bb76b9f7bf76f48ea281a0de..fa9041134d863ebfd8d1e00379da3b92323ae6e3 100644
--- a/paddle/fluid/platform/dynload/cublas.h
+++ b/paddle/fluid/platform/dynload/cublas.h
@@ -68,6 +68,8 @@ extern void *cublas_dso_handle;
   __macro(cublasDgemv_v2);                \
   __macro(cublasSgemm_v2);                \
   __macro(cublasDgemm_v2);                \
+  __macro(cublasHgemm);                   \
+  __macro(cublasSgemmEx);                 \
   __macro(cublasSgeam_v2);                \
   __macro(cublasDgeam_v2);                \
   __macro(cublasCreate_v2);               \
@@ -83,6 +85,7 @@ extern void *cublas_dso_handle;
   __macro(cublasDgemmStridedBatched);     \
   __macro(cublasCgemmStridedBatched);     \
   __macro(cublasZgemmStridedBatched);     \
+  __macro(cublasHgemmStridedBatched);     \
   __macro(cublasSgetrfBatched);           \
   __macro(cublasSgetriBatched);           \
   __macro(cublasDgetrfBatched);           \
diff --git a/paddle/fluid/platform/float16.h b/paddle/fluid/platform/float16.h
index 5832bd9ce303d48496e8c15324fd7e4a39484eb4..52fb8c2531357ad7a2b2f8613e5c7fbcef52c6bb 100644
--- a/paddle/fluid/platform/float16.h
+++ b/paddle/fluid/platform/float16.h
@@ -20,10 +20,6 @@ limitations under the License. */
 #include <cuda.h>
 #endif  // PADDLE_WITH_CUDA
 
-#include "unsupported/Eigen/CXX11/Tensor"
-
-#include "paddle/fluid/platform/hostdevice.h"
-
 #ifdef __GNUC__
 #define PADDLE_GNUC_VER (__GNUC__ * 10 + __GNUC_MINOR__)
 #else
@@ -64,6 +60,18 @@ limitations under the License.
 */
 namespace paddle {
 namespace platform {
 
+// Forward declare float16 for eigen.h
+struct float16;
+
+}  // namespace platform
+}  // namespace paddle
+
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/platform/hostdevice.h"
+
+namespace paddle {
+namespace platform {
+
 // Use PADDLE_ALIGNED(2) to ensure that each float16 will be allocated
 // and aligned at least on a 2-byte boundary, which leads to efficient
 // memory access of float16 struct and also makes float16 compatible
@@ -729,6 +737,22 @@ HOSTDEVICE inline bool operator>=(const float16& a, const float16& b) {
 }
 #endif
 
+HOSTDEVICE inline bool(isnan)(const float16& a) {
+#if defined(PADDLE_CUDA_FP16) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530
+  return __hisnan(half(a));
+#else
+  return (a.x & 0x7fff) > 0x7c00;
+#endif
+}
+
+HOSTDEVICE inline bool(isinf)(const float16& a) {
+  return (a.x & 0x7fff) == 0x7c00;
+}
+
+HOSTDEVICE inline bool(isfinite)(const float16& a) {
+  return !((isnan)(a)) && !((isinf)(a));
+}
+
 }  // namespace platform
 }  // namespace paddle
 
@@ -750,3 +774,27 @@ struct is_pod {
 };
 
 }  // namespace std
+
+namespace Eigen {
+namespace numext {
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool(isnan)(
+    const paddle::platform::float16& a) {
+  return (paddle::platform::isnan)(a);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool(isinf)(
+    const paddle::platform::float16& a) {
+  return (paddle::platform::isinf)(a);
+}
+
+template <>
+EIGEN_DEVICE_FUNC EIGEN_ALWAYS_INLINE bool(isfinite)(
+    const paddle::platform::float16& a) {
+  return (paddle::platform::isfinite)(a);
+}
+
+}  // namespace numext
+}  // namespace Eigen
diff --git a/paddle/fluid/platform/gpu_info.cc b/paddle/fluid/platform/gpu_info.cc
index 05e1eae853e20b3fd86438c03f52628179a311ca..da4041bad0d82fe1c8c7a12fd0c7177e6dbddef3 100644
--- a/paddle/fluid/platform/gpu_info.cc
+++ b/paddle/fluid/platform/gpu_info.cc
@@ -33,6 +33,26 @@ int GetCUDADeviceCount() {
   return count;
 }
 
+int GetCUDAMultiProcessors(int id) {
+  PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must be less than GPU count");
+  int count;
+  PADDLE_ENFORCE(
+      cudaDeviceGetAttribute(&count, cudaDevAttrMultiProcessorCount, id),
+      "cudaDeviceGetAttribute failed in "
+      "paddle::platform::GetCUDAMultiProcessors");
+  return count;
+}
+
+int GetCUDAMaxThreadsPerMultiProcessor(int id) {
+  PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must be less than GPU count");
+  int count;
+  PADDLE_ENFORCE(cudaDeviceGetAttribute(
+                     &count, cudaDevAttrMaxThreadsPerMultiProcessor, id),
+                 "cudaDeviceGetAttribute failed in "
+                 "paddle::platform::GetCUDAMaxThreadsPerMultiProcessor");
+  return count;
+}
+
 int GetCurrentDeviceId() {
   int device_id;
   PADDLE_ENFORCE(
diff --git a/paddle/fluid/platform/gpu_info.h b/paddle/fluid/platform/gpu_info.h
index 3d4883d8078daa2b55d8ea792b47e93e4f4feec8..c38ccf0f2ade1d2405177b541b33fd84283726ff 100644
--- a/paddle/fluid/platform/gpu_info.h
+++ b/paddle/fluid/platform/gpu_info.h
@@ -30,6 +30,12 @@ const std::string kEnvFractionGpuMemoryToUse =
 //! Get the total number of GPU devices in system.
 int GetCUDADeviceCount();
 
+//! Get the number of MultiProcessors of the ith GPU.
+int GetCUDAMultiProcessors(int i);
+
+//! Get the MaxThreads of each MultiProcessor of the ith GPU.
+int GetCUDAMaxThreadsPerMultiProcessor(int i);
+
 //! Get the current GPU device id in system.
 int GetCurrentDeviceId();
diff --git a/paddle/fluid/platform/mkldnn_helper.h b/paddle/fluid/platform/mkldnn_helper.h
index 6d71f352c6eda12e2bc032ecbc26cfefe320f703..90b78142b845e7e12c0c7dfb391f6aa3bd848436 100644
--- a/paddle/fluid/platform/mkldnn_helper.h
+++ b/paddle/fluid/platform/mkldnn_helper.h
@@ -16,12 +16,15 @@ limitations under the License. */
 
 #include <mkldnn.hpp>
 
+#include "paddle/fluid/framework/operator.h"
+
 namespace paddle {
 namespace platform {
 
 using MKLDNNStream = mkldnn::stream;
 using MKLDNNEngine = mkldnn::engine;
 using MKLDNNMemory = mkldnn::memory;
+using MKLDNNMemoryDescriptor = mkldnn::memory::desc;
 using MKLDNNPrimitive = mkldnn::primitive;
 using MKLDNNPrimitiveDesc = mkldnn::handle<mkldnn_primitive_desc_t>;
 
@@ -31,5 +34,17 @@ typedef std::unique_ptr<MKLDNNMemory> MKLDNNMemoryPtr;
 typedef std::unique_ptr<MKLDNNPrimitive> MKLDNNPrimitivePtr;
 typedef std::unique_ptr<MKLDNNPrimitiveDesc> MKLDNNPrimitiveDescPtr;
 
+inline mkldnn::memory::desc MKLDNNMemDesc(const std::vector<int>& dims,
+                                          mkldnn::memory::data_type data_type,
+                                          mkldnn::memory::format format) {
+  mkldnn::memory::dims tz = dims;
+  return mkldnn::memory::desc({tz}, data_type, format);
+}
+
+inline bool CanMKLDNNBeUsed(const framework::ExecutionContext& ctx) {
+  bool use_mkldnn = ctx.Attr<bool>("use_mkldnn");
+  return use_mkldnn && platform::is_cpu_place(ctx.GetPlace());
+}
+
 }  // namespace platform
 }  // namespace paddle
diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc
index 094f9224f719f6656fd0c44aa4a620730432ccac..28ef3e04b1c50e0d42eeb27608259c6449429da5 100644
--- a/paddle/fluid/platform/profiler.cc
+++ b/paddle/fluid/platform/profiler.cc
@@ -178,7 +178,7 @@ void EnableProfiler(ProfilerState state) {
   }
 #ifdef PADDLE_WITH_CUDA
   if (g_state == ProfilerState::kCUDA) {
-    // Generate some dummy evenets first to reduce the startup overhead.
+    // Generate some dummy events first to reduce the startup overhead.
     for (int i = 0; i < 5; i++) {
       ForEachDevice([](int d) {
         DeviceContext* dev_ctx = new CUDADeviceContext(CUDAPlace(d));
diff --git a/paddle/fluid/platform/profiler.proto b/paddle/fluid/platform/profiler.proto
index 06db7ed63895d5385952169e0672bf9888d7abae..71b5a9b12ef4a045ebfd3ee3d06ee25032083ff5 100644
--- a/paddle/fluid/platform/profiler.proto
+++ b/paddle/fluid/platform/profiler.proto
@@ -15,7 +15,7 @@ limitations under the License. */
 syntax = "proto2";
 package paddle.platform.proto;
 
-message MemCopy { optional uint64 bytes = 3; }
+message MemCopy { optional uint64 bytes = 1; }
 
 message Event {
   optional string name = 1;
diff --git a/paddle/fluid/platform/profiler_test.cc b/paddle/fluid/platform/profiler_test.cc
index 4a86d8ec622772d0b62d5f4755a7b6138529b1d1..fc77e0f3213da776e0b05ad5b5da9081665cdf6e 100644
--- a/paddle/fluid/platform/profiler_test.cc
+++ b/paddle/fluid/platform/profiler_test.cc
@@ -75,6 +75,7 @@ TEST(RecordEvent, RecordEvent) {
    *   ...
    *   PopEvent(evt_name, dev_ctx);
    */
+  LOG(INFO) << "Usage 1: PushEvent & PopEvent";
   for (int loop = 0; loop < 3; ++loop) {
     for (int i = 1; i < 5; ++i) {
       std::string name = "op_" + std::to_string(i);
@@ -93,6 +94,7 @@ TEST(RecordEvent, RecordEvent) {
    *   ...
    * }
    */
+  LOG(INFO) << "Usage 2: RecordEvent";
   for (int i = 1; i < 5; ++i) {
     std::string name = "evs_op_" + std::to_string(i);
     RecordEvent record_event(name, dev_ctx);
@@ -100,6 +102,34 @@ TEST(RecordEvent, RecordEvent) {
     while (counter != i * 1000) counter++;
   }
 
+  /* Usage 3
+   * {
+   *   RecordEvent record_event(name1, dev_ctx);
+   *   ...
+   *   code to be analyzed
+   *   ...
+   *   {
+   *     RecordEvent nested_record_event(name2, dev_ctx);
+   *     ...
+   *     code to be analyzed
+   *     ...
+   *   }
+   * }
+   */
+  LOG(INFO) << "Usage 3: nested RecordEvent";
+  for (int i = 1; i < 5; ++i) {
+    std::string name = "ano_evs_op_" + std::to_string(i);
+    RecordEvent record_event(name, dev_ctx);
+    int counter = 1;
+    while (counter != i * 100) counter++;
+    {
+      std::string nested_name = "nested_ano_evs_op_" + std::to_string(i);
+      RecordEvent nested_record_event(nested_name, dev_ctx);
+      int nested_counter = 1;
+      while (nested_counter != i * 100) nested_counter++;
+    }
+  }
+
   // Bad Usage:
   PushEvent("event_without_pop", dev_ctx);
   PopEvent("event_without_push", dev_ctx);
diff --git a/paddle/fluid/recordio/CMakeLists.txt b/paddle/fluid/recordio/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..e1e7c2cdb3d0c960d5cd408420b5aaead73e70d7
--- /dev/null
+++ b/paddle/fluid/recordio/CMakeLists.txt
@@ -0,0 +1,6 @@
+# internal library.
+cc_library(header SRCS header.cc)
+cc_test(header_test SRCS header_test.cc DEPS header)
+cc_library(chunk SRCS chunk.cc DEPS snappystream snappy header zlib)
+cc_test(chunk_test SRCS chunk_test.cc DEPS chunk)
+cc_library(recordio DEPS chunk header)
diff --git a/paddle/fluid/recordio/chunk.cc b/paddle/fluid/recordio/chunk.cc
new file mode 100644
index 0000000000000000000000000000000000000000..587fd375c38ca83e1c65cb3ccc20b3509b6348c7
--- /dev/null
+++ b/paddle/fluid/recordio/chunk.cc
@@ -0,0 +1,134 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/recordio/chunk.h"
+
+#include <algorithm>
+#include <sstream>
+#include "paddle/fluid/platform/enforce.h"
+#include "snappystream.hpp"
+#include "zlib.h"
+
+namespace paddle {
+namespace recordio {
+constexpr size_t kMaxBufSize = 1024;
+
+template <typename Callback>
+static void ReadStreamByBuf(std::istream& in, int limit, Callback callback) {
+  char buf[kMaxBufSize];
+  std::streamsize actual_size;
+  size_t counter = 0;
+  do {
+    auto actual_max =
+        limit > 0 ? std::min(limit - counter, kMaxBufSize) : kMaxBufSize;
+    actual_size = in.readsome(buf, actual_max);
+    if (actual_size == 0) {
+      break;
+    }
+    callback(buf, actual_size);
+    if (limit > 0) {
+      counter += actual_size;
+    }
+  } while (actual_size == kMaxBufSize);
+}
+
+static void PipeStream(std::istream& in, std::ostream& os) {
+  ReadStreamByBuf(
+      in, -1, [&os](const char* buf, size_t len) { os.write(buf, len); });
+}
+static uint32_t Crc32Stream(std::istream& in, int limit = -1) {
+  auto crc = crc32(0, nullptr, 0);
+  ReadStreamByBuf(in, limit, [&crc](const char* buf, size_t len) {
+    crc = crc32(crc, reinterpret_cast<const Bytef*>(buf), len);
+  });
+  return crc;
+}
+
+bool Chunk::Write(std::ostream& os, Compressor ct) const {
+  // NOTE(dzhwinter): don't check num_bytes_ here, because
+  // empty records are allowed.
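+  // (a chunk whose records are all empty strings still has entries to
+  // write, even though NumBytes() == 0)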
+  if (records_.empty()) {
+    return false;
+  }
+  std::stringstream sout;
+  std::unique_ptr<std::ostream> compressed_stream;
+  switch (ct) {
+    case Compressor::kNoCompress:
+      break;
+    case Compressor::kSnappy:
+      compressed_stream.reset(new snappy::oSnappyStream(sout));
+      break;
+    default:
+      PADDLE_THROW("Not implemented");
+  }
+
+  std::ostream& buf_stream = compressed_stream ? *compressed_stream : sout;
+
+  for (auto& record : records_) {
+    size_t sz = record.size();
+    buf_stream.write(reinterpret_cast<const char*>(&sz), sizeof(uint32_t))
+        .write(record.data(), record.size());
+  }
+
+  if (compressed_stream) {
+    compressed_stream.reset();
+  }
+
+  auto end_pos = sout.tellg();
+  sout.seekg(0, std::ios::beg);
+  uint32_t len = static_cast<uint32_t>(end_pos - sout.tellg());
+  uint32_t crc = Crc32Stream(sout);
+  sout.seekg(0, std::ios::beg);
+
+  Header hdr(static_cast<uint32_t>(records_.size()), crc, ct, len);
+  hdr.Write(os);
+  PipeStream(sout, os);
+  return true;
+}
+
+void Chunk::Parse(std::istream& sin) {
+  Header hdr;
+  hdr.Parse(sin);
+  auto beg_pos = sin.tellg();
+  auto crc = Crc32Stream(sin, hdr.CompressSize());
+  PADDLE_ENFORCE_EQ(hdr.Checksum(), crc);
+
+  Clear();
+
+  sin.seekg(beg_pos, std::ios::beg);
+  std::unique_ptr<std::istream> compressed_stream;
+  switch (hdr.CompressType()) {
+    case Compressor::kNoCompress:
+      break;
+    case Compressor::kSnappy:
+      compressed_stream.reset(new snappy::iSnappyStream(sin));
+      break;
+    default:
+      PADDLE_THROW("Not implemented");
+  }
+
+  std::istream& stream = compressed_stream ? *compressed_stream : sin;
+
+  for (uint32_t i = 0; i < hdr.NumRecords(); ++i) {
+    uint32_t rec_len;
+    stream.read(reinterpret_cast<char*>(&rec_len), sizeof(uint32_t));
+    std::string buf;
+    buf.resize(rec_len);
+    stream.read(&buf[0], rec_len);
+    Add(buf);
+  }
+}
+
+}  // namespace recordio
+}  // namespace paddle
diff --git a/paddle/fluid/recordio/chunk.h b/paddle/fluid/recordio/chunk.h
new file mode 100644
index 0000000000000000000000000000000000000000..0ba9c63abbe72e7a51ddb1af5f0d206aa9f6cc5b
--- /dev/null
+++ b/paddle/fluid/recordio/chunk.h
@@ -0,0 +1,56 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <string>
+#include <vector>
+
+#include "paddle/fluid/platform/macros.h"
+#include "paddle/fluid/recordio/header.h"
+
+namespace paddle {
+namespace recordio {
+
+// A Chunk contains the Header and optionally compressed records.
+class Chunk {
+public:
+  Chunk() : num_bytes_(0) {}
+  void Add(std::string buf) {
+    records_.push_back(buf);
+    num_bytes_ += buf.size();
+  }
+  // Dump the chunk into fo. Note that Write() itself is const and does not
+  // clear the chunk; call Clear() to make it ready for the next Add().
+  bool Write(std::ostream& fo, Compressor ct) const;
+  void Clear() {
+    records_.clear();
+    num_bytes_ = 0;
+  }
+  void Parse(std::istream& sin);
+  size_t NumBytes() { return num_bytes_; }
+  const std::string& Record(int i) const { return records_[i]; }
+
+private:
+  std::vector<std::string> records_;
+  // sum of record lengths in bytes.
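+  // (payload bytes only; the per-record uint32_t length prefix written by
+  // Write() is not counted)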
+  size_t num_bytes_;
+  DISABLE_COPY_AND_ASSIGN(Chunk);
+};
+
+size_t CompressData(const char* in, size_t in_length, Compressor ct, char* out);
+
+void DeflateData(const char* in, size_t in_length, Compressor ct, char* out);
+
+}  // namespace recordio
+}  // namespace paddle
diff --git a/paddle/fluid/recordio/chunk_test.cc b/paddle/fluid/recordio/chunk_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a67ba32ed6ab8bda230d1414975c96a0be6d682b
--- /dev/null
+++ b/paddle/fluid/recordio/chunk_test.cc
@@ -0,0 +1,49 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/recordio/chunk.h"
+
+#include <sstream>
+
+#include "gtest/gtest.h"
+
+using namespace paddle::recordio;
+
+TEST(Chunk, SaveLoad) {
+  Chunk ch;
+  ch.Add(std::string("12345", 6));
+  ch.Add(std::string("123", 4));
+  std::stringstream ss;
+  ch.Write(ss, Compressor::kNoCompress);
+  ch.Clear();
+  ch.Parse(ss);
+  ASSERT_EQ(ch.NumBytes(), 10U);
+}
+
+TEST(Chunk, Compressor) {
+  Chunk ch;
+  ch.Add(std::string("12345", 6));
+  ch.Add(std::string("123", 4));
+  ch.Add(std::string("123", 4));
+  ch.Add(std::string("123", 4));
+  std::stringstream ss;
+  ch.Write(ss, Compressor::kSnappy);
+  std::stringstream ss2;
+  ch.Write(ss2, Compressor::kNoCompress);
+  ASSERT_LE(ss.tellp(), ss2.tellp());  // compressed output should be smaller
+
+  ch.Clear();
+  ch.Parse(ss);
+  ASSERT_EQ(ch.NumBytes(), 18);
+}
diff --git a/paddle/fluid/recordio/header.cc b/paddle/fluid/recordio/header.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3641caaa8981020519cbc31e5362348c02d3bbce
--- /dev/null
+++ b/paddle/fluid/recordio/header.cc
@@ -0,0 +1,56 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ +#include "paddle/fluid/recordio/header.h" + +namespace paddle { +namespace recordio { + +Header::Header() + : num_records_(0), + checksum_(0), + compressor_(Compressor::kNoCompress), + compress_size_(0) {} + +Header::Header(uint32_t num, uint32_t sum, Compressor c, uint32_t cs) + : num_records_(num), checksum_(sum), compressor_(c), compress_size_(cs) {} + +void Header::Parse(std::istream& is) { + is.read(reinterpret_cast(&num_records_), sizeof(uint32_t)) + .read(reinterpret_cast(&checksum_), sizeof(uint32_t)) + .read(reinterpret_cast(&compressor_), sizeof(uint32_t)) + .read(reinterpret_cast(&compress_size_), sizeof(uint32_t)); +} + +void Header::Write(std::ostream& os) const { + os.write(reinterpret_cast(&num_records_), sizeof(uint32_t)) + .write(reinterpret_cast(&checksum_), sizeof(uint32_t)) + .write(reinterpret_cast(&compressor_), sizeof(uint32_t)) + .write(reinterpret_cast(&compress_size_), sizeof(uint32_t)); +} + +std::ostream& operator<<(std::ostream& os, Header h) { + os << h.NumRecords() << h.Checksum() + << static_cast(h.CompressType()) << h.CompressSize(); + return os; +} + +bool operator==(Header l, Header r) { + return l.NumRecords() == r.NumRecords() && l.Checksum() == r.Checksum() && + l.CompressType() == r.CompressType() && + l.CompressSize() == r.CompressSize(); +} + +} // namespace recordio +} // namespace paddle diff --git a/paddle/fluid/recordio/header.h b/paddle/fluid/recordio/header.h new file mode 100644 index 0000000000000000000000000000000000000000..cbd52642a668d1eaeeafb672e50af1a476975080 --- /dev/null +++ b/paddle/fluid/recordio/header.h @@ -0,0 +1,66 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +namespace paddle { +namespace recordio { + +// Default ChunkSize +constexpr size_t kDefaultMaxChunkSize = 32 * 1024 * 1024; +// MagicNumber for memory checking +constexpr uint32_t kMagicNumber = 0x01020304; + +enum class Compressor : uint32_t { + // NoCompression means writing raw chunk data into files. + // With other choices, chunks are compressed before written. + kNoCompress = 0, + // Snappy had been the default compressing algorithm widely + // used in Google. It compromises between speech and + // compression ratio. + kSnappy = 1, + // Gzip is a well-known compression algorithm. It is + // recommmended only you are looking for compression ratio. 
+  kGzip = 2,
+};
+
+// Header is the metadata of Chunk
+class Header {
+public:
+  Header();
+  Header(uint32_t num, uint32_t sum, Compressor ct, uint32_t cs);
+
+  void Write(std::ostream& os) const;
+  void Parse(std::istream& is);
+
+  uint32_t NumRecords() const { return num_records_; }
+  uint32_t Checksum() const { return checksum_; }
+  Compressor CompressType() const { return compressor_; }
+  uint32_t CompressSize() const { return compress_size_; }
+
+private:
+  uint32_t num_records_;
+  uint32_t checksum_;
+  Compressor compressor_;
+  uint32_t compress_size_;
+};
+
+// Allow Header to be logged and compared
+std::ostream& operator<<(std::ostream& os, Header h);
+bool operator==(Header l, Header r);
+
+}  // namespace recordio
+}  // namespace paddle
diff --git a/paddle/fluid/recordio/header_test.cc b/paddle/fluid/recordio/header_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a7d627c3eb4a7af1954795f77e5f24739edadae8
--- /dev/null
+++ b/paddle/fluid/recordio/header_test.cc
@@ -0,0 +1,31 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/recordio/header.h"
+
+#include <sstream>
+
+#include "gtest/gtest.h"
+
+using namespace paddle::recordio;
+
+TEST(Recordio, ChunkHead) {
+  Header hdr(0, 1, Compressor::kGzip, 3);
+  std::stringstream ss;
+  hdr.Write(ss);
+  ss.seekg(0, std::ios::beg);
+  Header hdr2;
+  hdr2.Parse(ss);
+  EXPECT_TRUE(hdr == hdr2);
+}
diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh
index 06319fc638984f8f8ed897c362f516e1534bc8db..6be2bd8fad9e33cf4e1dcafdd6b8f39111bdbe88 100644
--- a/paddle/scripts/docker/build.sh
+++ b/paddle/scripts/docker/build.sh
@@ -213,7 +213,7 @@ function gen_fluid_inference_lib() {
     if [ ${WITH_C_API:-OFF} == "OFF" ] ; then
        cat < && cd ..
&& rm -rf nccl - CMD ["bash", "/paddle/paddle/scripts/docker/build.sh"] diff --git a/tools/manylinux1/build_scripts/install_nccl2.sh b/tools/manylinux1/build_scripts/install_nccl2.sh new file mode 100644 index 0000000000000000000000000000000000000000..7efc1fe8651a02684c92fa4b924a4f4416503710 --- /dev/null +++ b/tools/manylinux1/build_scripts/install_nccl2.sh @@ -0,0 +1,18 @@ +#!/bin/bash +DEB="nccl-repo-ubuntu1604-2.1.4-ga-cuda8.0_1-1_amd64.deb" +DIR="/nccl2" +mkdir -p $DIR +# we cached the nccl2 deb package in BOS, so we can download it with wget +# install nccl2: http://docs.nvidia.com/deeplearning/sdk/nccl-install-guide/index.html#down +wget -O $DIR/$DEB \ + "http://nccl2-deb.gz.bcebos.com/nccl-repo-ubuntu1604-2.1.4-ga-cuda8.0_1-1_amd64.deb?responseContentDisposition=attachment" + +cd $DIR && ar x $DEB && tar xf data.tar.xz +DEBS=$(find ./var/ -name "*.deb") +for sub_deb in $DEBS; do + echo $sub_deb + ar x $sub_deb && tar xf data.tar.xz +done +mv -f usr/include/nccl.h /usr/local/include/ +mv -f usr/lib/libnccl* /usr/local/lib/ +rm -rf $DIR diff --git a/tools/timeline.py b/tools/timeline.py index d1d1dae2bd85e85b069dc41bdd3e3e54579f9c49..ee83a1baecdd4243bb6c546486a837393980fb65 100644 --- a/tools/timeline.py +++ b/tools/timeline.py @@ -159,7 +159,7 @@ if args.timeline_path: with open(profile_path, 'r') as f: profile_s = f.read() profile_pb = profiler_pb2.Profile() - text_format.Merge(profile_s, profile_pb) + profile_pb.ParseFromString(profile_s) tl = Timeline(profile_pb) with open(timeline_path, 'w') as f: