diff --git a/README.md b/README.md index c535e9514e1cac9aff51edfcd9bcdc5d34ccd9fd..32a302cc5431a62b310d4812b545bd929f090e0a 100644 --- a/README.md +++ b/README.md @@ -19,6 +19,15 @@ Our vision is to enable deep learning for everyone via PaddlePaddle. Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddle/releases) to track the latest feature of PaddlePaddle. +欢迎来到 PaddlePaddle GitHub + +PaddlePaddle (PArallel Distributed Deep LEarning) 是一个简单易用、高效灵活、可扩展的深度学习平台,最初由百度科学家和工程师共同开发,目的是将深度学习技术应用到百度的众多产品中。 + +我们的愿景是让每个人都能通过PaddlePaddle接触深度学习 + +跟进PaddlePaddle最新特性请参考我们的[版本说明](https://github.com/PaddlePaddle/Paddle/releases) + + ### Latest PaddlePaddle Release: [Fluid 1.2.0](https://github.com/PaddlePaddle/Paddle/tree/release/1.2) ### Install Latest Stable Release: ``` @@ -34,6 +43,23 @@ pip install paddlepaddle-gpu==1.2.0.post85 # For installation on other platform, refer to http://paddlepaddle.org/ ``` + +### PaddlePaddle最新版本: [Fluid 1.2.0](https://github.com/PaddlePaddle/Paddle/tree/release/1.2) +### 安装最新稳定版本: +``` +# Linux CPU +pip install paddlepaddle +# Linux GPU cuda9cudnn7 +pip install paddlepaddle-gpu +# Linux GPU cuda8cudnn7 +pip install paddlepaddle-gpu==1.2.0.post87 +# Linux GPU cuda8cudnn5 +pip install paddlepaddle-gpu==1.2.0.post85 + +# 其他平台上的安装指引请参考 http://paddlepaddle.org/ +``` + + ## Features - **Flexibility** @@ -74,10 +100,38 @@ pip install paddlepaddle-gpu==1.2.0.post85 Baidu and it has achieved a significant impact. We hope you can also explore the capability of PaddlePaddle to make an impact on your product. +## 特点 + +- **灵活性** + + PaddlePaddle支持丰富的神经网络架构和优化算法。易于配置复杂模型,例如带有注意力机制或复杂记忆连接的神经网络机器翻译模型。 + +- **高效性** + + 为了高效使用异步计算资源,PaddlePaddle对框架的不同层进行优化,包括计算、存储、架构和通信。下面是一些样例: + + - 通过SSE/AVX 内置函数、BLAS库(例如MKL、OpenBLAS、cuBLAS)或定制的CPU/GPU内核优化数学操作。 + - 通过MKL-DNN库优化CNN网络 + - 高度优化循环网络,无需执行 `padding` 操作即可处理 **变长** 序列 + - 针对高维稀疏数据模型,优化了局部和分布式训练。 + + +- **稳定性** + + 有了 PaddlePaddle,使得利用各种CPU/GPU和机器来加速训练变得简单。PaddlePaddle 通过优化通信可以实现巨大吞吐量和快速执行。 + +- **连接产品** + + 另外,PaddlePaddle 的设计也易于部署。在百度,PaddlePaddle 已经部署到含有巨大用户量的产品和服务上,包括广告点击率(CTR)预测、大规模图像分类、光学字符识别(OCR)、搜索排序,计算机病毒检测、推荐系统等等。PaddlePaddle广泛应用于百度产品中,产生了非常重要的影响。我们希望您也能探索 PaddlePaddle 的能力,为您的产品创造新的影响力和效果。 + ## Installation It is recommended to read [this doc](http://paddlepaddle.org/documentation/docs/zh/1.2/beginners_guide/install/index_cn.html) on our website. +## 安装 + +推荐阅读官网上的[安装说明](http://paddlepaddle.org/documentation/docs/zh/1.2/beginners_guide/install/index_cn.html) + ## Documentation We provide [English](http://paddlepaddle.org/documentation/docs/en/1.2/getstarted/index_en.html) and @@ -99,10 +153,37 @@ We provide [English](http://paddlepaddle.org/documentation/docs/en/1.2/getstarte We appreciate your contributions! +## 文档 + +我们提供[英文](http://paddlepaddle.org/documentation/docs/en/1.2/getstarted/index_en.html)和 +[中文](http://paddlepaddle.org/documentation/docs/zh/1.2/beginners_guide/index.html) 文档 + +- [深度学习101](https://github.com/PaddlePaddle/book) + + 或许您想从这个在线交互式书籍开始,可以在Jupyter Notebook中运行 + +- [分布式训练](http://paddlepaddle.org/documentation/docs/zh/1.2/user_guides/howto/training/cluster_howto.html) + + 可以在MPI集群上运行分布式训练任务 + +- [Python API](http://paddlepaddle.org/documentation/docs/zh/1.2/api_cn/index_cn.html) + + 新的API支持代码更少更简洁的程序 + +- [贡献方式](http://paddlepaddle.org/documentation/docs/zh/1.2/advanced_usage/development/contribute_to_paddle/index_cn.html) + + 欢迎您的贡献! ## Ask Questions You are welcome to submit questions and bug reports as [Github Issues](https://github.com/PaddlePaddle/Paddle/issues). 
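As a small complement to the Quick Start / "Deep Learning 101" material referenced in the documentation section above, here is a minimal sketch of a Fluid program. It is illustrative only and not part of this patch; it assumes the Fluid 1.2 Python API (`fluid.layers.data`, `fluid.layers.fc`, `fluid.Executor`) described in the linked docs.

```
# Minimal Fluid sketch (illustrative, not part of this patch): build one fc
# layer, run the startup program once, then execute a single forward pass.
import numpy as np
import paddle.fluid as fluid

x = fluid.layers.data(name='x', shape=[13], dtype='float32')
y_predict = fluid.layers.fc(input=x, size=1, act=None)

place = fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())

out, = exe.run(fluid.default_main_program(),
               feed={'x': np.random.random((8, 13)).astype('float32')},
               fetch_list=[y_predict])
print(out.shape)  # (8, 1)
```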
+## 答疑 + +欢迎您将问题和bug报告以[Github Issues](https://github.com/PaddlePaddle/Paddle/issues)的形式提交 + ## Copyright and License PaddlePaddle is provided under the [Apache-2.0 license](LICENSE). + +## 版权和许可证 +PaddlePaddle由[Apache-2.0 license](LICENSE)提供 diff --git a/benchmark/fluid/fluid_benchmark.py b/benchmark/fluid/fluid_benchmark.py index 5f3ce300acc44ad8d2898c27296b866c403f3cc8..10b633a4fc1063aab5c0d34b994f9c233e228f17 100644 --- a/benchmark/fluid/fluid_benchmark.py +++ b/benchmark/fluid/fluid_benchmark.py @@ -81,9 +81,11 @@ def dist_transpile(trainer_id, args, train_prog, startup_prog): # the role, should be either PSERVER or TRAINER training_role = os.getenv("PADDLE_TRAINING_ROLE") - config = distribute_transpiler.DistributeTranspilerConfig() + config = fluid.DistributeTranspilerConfig() config.slice_var_up = not args.no_split_var + config.min_block_size = 1048576 t = distribute_transpiler.DistributeTranspiler(config=config) + t.transpile( trainer_id, # NOTE: *MUST* use train_prog, for we are using with guard to diff --git a/cmake/external/brpc.cmake b/cmake/external/brpc.cmake index 30b227b6452abf44171a1a4e04569e66b16e67a4..6b50cff7a66a33d9413627bfbc663cca06ba86f3 100644 --- a/cmake/external/brpc.cmake +++ b/cmake/external/brpc.cmake @@ -14,14 +14,16 @@ INCLUDE(ExternalProject) -find_library(SSL_LIBRARY NAMES ssl) +find_package(OpenSSL REQUIRED) + +message(STATUS "ssl:" ${OPENSSL_SSL_LIBRARY}) +message(STATUS "crypto:" ${OPENSSL_CRYPTO_LIBRARY}) + ADD_LIBRARY(ssl SHARED IMPORTED GLOBAL) -SET_PROPERTY(TARGET ssl PROPERTY IMPORTED_LOCATION ${SSL_LIBRARY}) +SET_PROPERTY(TARGET ssl PROPERTY IMPORTED_LOCATION ${OPENSSL_SSL_LIBRARY}) -find_library(CRYPTO_LIBRARY NAMES crypto) ADD_LIBRARY(crypto SHARED IMPORTED GLOBAL) -SET_PROPERTY(TARGET crypto PROPERTY IMPORTED_LOCATION ${CRYPTO_LIBRARY}) - +SET_PROPERTY(TARGET crypto PROPERTY IMPORTED_LOCATION ${OPENSSL_CRYPTO_LIBRARY}) SET(BRPC_SOURCES_DIR ${THIRD_PARTY_PATH}/brpc) SET(BRPC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/brpc) @@ -31,14 +33,15 @@ SET(BRPC_LIBRARIES "${BRPC_INSTALL_DIR}/lib/libbrpc.a" CACHE FILEPATH "brpc libr INCLUDE_DIRECTORIES(${BRPC_INCLUDE_DIR}) # Reference https://stackoverflow.com/questions/45414507/pass-a-list-of-prefix-paths-to-externalproject-add-in-cmake-args -set(prefix_path "${THIRD_PARTY_PATH}/install/gflags|${THIRD_PARTY_PATH}/install/leveldb|${THIRD_PARTY_PATH}/install/snappy|${THIRD_PARTY_PATH}/install/gtest|${THIRD_PARTY_PATH}/install/protobuf|${THIRD_PARTY_PATH}/install/zlib") +set(prefix_path "${THIRD_PARTY_PATH}/install/gflags|${THIRD_PARTY_PATH}/install/leveldb|${THIRD_PARTY_PATH}/install/snappy|${THIRD_PARTY_PATH}/install/gtest|${THIRD_PARTY_PATH}/install/protobuf|${THIRD_PARTY_PATH}/install/zlib|${THIRD_PARTY_PATH}/install/glog") # If minimal .a is need, you can set WITH_DEBUG_SYMBOLS=OFF ExternalProject_Add( extern_brpc ${EXTERNAL_PROJECT_LOG_ARGS} + # TODO(gongwb): change to de newst repo when they changed. 
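The fluid_benchmark.py hunk above switches to `fluid.DistributeTranspilerConfig` and sets `min_block_size` before calling `transpile`. A hedged sketch of how such a configuration is typically wired into the transpiler on the trainer side follows; the endpoint strings and trainer counts are placeholders, not values from this patch.

```
# Illustrative sketch only: configure and run the distribute transpiler.
# Endpoints and trainer counts below are placeholders.
import paddle.fluid as fluid

config = fluid.DistributeTranspilerConfig()
config.slice_var_up = True        # split large variables across pservers
config.min_block_size = 1048576   # do not split blocks smaller than ~1MB

t = fluid.DistributeTranspiler(config=config)
t.transpile(trainer_id=0,
            pservers="127.0.0.1:6174,127.0.0.1:6175",
            trainers=2)

trainer_prog = t.get_trainer_program()
pserver_prog = t.get_pserver_program("127.0.0.1:6174")
```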
GIT_REPOSITORY "https://github.com/gongweibao/brpc" - GIT_TAG "7dc04defad1fd4173aae170c3fcbde131b65155a" + GIT_TAG "e9b67ec1b7458f2af5fae76451afe1e27e01b4b4" PREFIX ${BRPC_SOURCES_DIR} UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} @@ -50,7 +53,7 @@ ExternalProject_Add( -DCMAKE_POSITION_INDEPENDENT_CODE=ON -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} -DCMAKE_PREFIX_PATH=${prefix_path} - -DBRPC_WITH_GLOG=ON + -DWITH_GLOG=ON -DIOBUF_WITH_HUGE_BLOCK=ON -DBRPC_WITH_RDMA=${WITH_BRPC_RDMA} ${EXTERNAL_OPTIONAL_ARGS} @@ -65,5 +68,6 @@ ADD_LIBRARY(brpc STATIC IMPORTED GLOBAL) SET_PROPERTY(TARGET brpc PROPERTY IMPORTED_LOCATION ${BRPC_LIBRARIES}) ADD_DEPENDENCIES(brpc extern_brpc) +add_definitions(-DBRPC_WITH_GLOG) LIST(APPEND external_project_dependencies brpc) diff --git a/cmake/external/gtest.cmake b/cmake/external/gtest.cmake index 4fe9c13fb7f2c04ae04e985252996dfa308ac304..9be625b620287cd4c644ae6908000fd5eec5d5c7 100644 --- a/cmake/external/gtest.cmake +++ b/cmake/external/gtest.cmake @@ -12,8 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. -IF(WITH_TESTING) - ENABLE_TESTING() +#FIXME:(gongwb) Move brpc's gtest dependency. +IF(WITH_TESTING OR (WITH_DISTRIBUTE AND NOT WITH_GRPC)) + IF(WITH_TESTING) + ENABLE_TESTING() + ENDIF(WITH_TESTING) + INCLUDE(ExternalProject) SET(GTEST_SOURCES_DIR ${THIRD_PARTY_PATH}/gtest) @@ -76,4 +80,4 @@ IF(WITH_TESTING) ADD_DEPENDENCIES(gtest_main extern_gtest) LIST(APPEND external_project_dependencies gtest gtest_main) -ENDIF(WITH_TESTING) +ENDIF(WITH_TESTING OR (WITH_DISTRIBUTE AND NOT WITH_GRPC)) diff --git a/cmake/external/leveldb.cmake b/cmake/external/leveldb.cmake index fb5091731da02b497a14f119e944905eee4979d5..0df61b01ab64c8b751bdc3893dd5294ad39ab928 100644 --- a/cmake/external/leveldb.cmake +++ b/cmake/external/leveldb.cmake @@ -24,8 +24,8 @@ ExternalProject_Add( extern_leveldb ${EXTERNAL_PROJECT_LOG_ARGS} PREFIX ${LEVELDB_SOURCES_DIR} - URL "https://github.com/google/leveldb/archive/v1.18.tar.gz" - URL_MD5 "73770de34a2a5ab34498d2e05b2b7fa0" + GIT_REPOSITORY "https://github.com/google/leveldb" + GIT_TAG v1.18 CONFIGURE_COMMAND "" BUILD_COMMAND CXXFLAGS=-fPIC make -j ${NUM_OF_PROCESSOR} libleveldb.a INSTALL_COMMAND mkdir -p ${LEVELDB_INSTALL_DIR}/lib/ diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index fd4cf92d85d5daa891d602d4365122c870920bba..d1171c4d48d0184f0cbdabd5d8dd0ea98a6818be 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -77,6 +77,8 @@ paddle.fluid.layers.sequence_softmax ArgSpec(args=['input', 'use_cudnn', 'name'] paddle.fluid.layers.softmax ArgSpec(args=['input', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=(True, None)) paddle.fluid.layers.pool2d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name', 'exclusive'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None, True)) paddle.fluid.layers.pool3d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name', 'exclusive'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None, True)) +paddle.fluid.layers.adaptive_pool2d ArgSpec(args=['input', 'pool_size', 'pool_type', 'require_index', 'name'], varargs=None, keywords=None, defaults=('max', False, None)) +paddle.fluid.layers.adaptive_pool3d ArgSpec(args=['input', 'pool_size', 'pool_type', 'require_index', 'name'], 
varargs=None, keywords=None, defaults=('max', False, None)) paddle.fluid.layers.batch_norm ArgSpec(args=['input', 'act', 'is_test', 'momentum', 'epsilon', 'param_attr', 'bias_attr', 'data_layout', 'in_place', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var', 'fuse_with_relu', 'use_global_stats'], varargs=None, keywords=None, defaults=(None, False, 0.9, 1e-05, None, None, 'NCHW', False, None, None, None, False, False, False)) paddle.fluid.layers.beam_search_decode ArgSpec(args=['ids', 'scores', 'beam_size', 'end_id', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.conv2d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None)) @@ -198,6 +200,7 @@ paddle.fluid.layers.bilinear_tensor_product ArgSpec(args=['x', 'y', 'size', 'act paddle.fluid.layers.merge_selected_rows ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.get_tensor_from_selected_rows ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.lstm ArgSpec(args=['input', 'init_h', 'init_c', 'max_len', 'hidden_size', 'num_layers', 'dropout_prob', 'is_bidirec', 'is_test', 'name', 'default_initializer', 'seed'], varargs=None, keywords=None, defaults=(0.0, False, False, None, None, -1)) +paddle.fluid.layers.psroi_pool ArgSpec(args=['input', 'rois', 'output_channels', 'spatial_scale', 'pooled_height', 'pooled_width', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True)) paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None)) paddle.fluid.layers.read_file ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None) diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index ce429fefa77b81dff9bf997ba092e92d97cb0dc0..cea4a448574cfa822ef1498cc3ea1d58a7aea43f 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -72,6 +72,8 @@ cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor memory) nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor) +cc_library(garbage_collector SRCS garbage_collector.cc DEPS device_context memory) + cc_library(reader SRCS reader.cc DEPS lod_tensor ddim) cc_test(reader_test SRCS reader_test.cc DEPS reader) @@ -167,9 +169,12 @@ cc_library(variable_helper SRCS variable_helper.cc DEPS lod_tensor) cc_library(naive_executor SRCS naive_executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper) if(WITH_DISTRIBUTE) - cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method sendrecvop_grpc cares grpc++_unsecure grpc_unsecure gpr graph_to_program_pass variable_helper) - set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor 
-Wno-error=delete-non-virtual-dtor") - set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog + lod_rank_table feed_fetch_method sendrecvop_rpc ${GLOB_DISTRIBUTE_DEPS} graph_to_program_pass variable_helper) + + set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") + set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + else() if(WITH_NGRAPH) if(NOT WIN32) @@ -183,6 +188,8 @@ else() cc_test(test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor elementwise_add_op) endif() +target_link_libraries(executor garbage_collector) + cc_library(parallel_executor SRCS parallel_executor.cc DEPS threaded_ssa_graph_executor scope_buffered_ssa_graph_executor graph build_strategy diff --git a/paddle/fluid/framework/data_type_test.cc b/paddle/fluid/framework/data_type_test.cc index 92639dfc611d111cee2c666acc6103be4bed952d..2a380201f297f42dd82a6809bef9a72660066819 100644 --- a/paddle/fluid/framework/data_type_test.cc +++ b/paddle/fluid/framework/data_type_test.cc @@ -35,6 +35,6 @@ TEST(DataType, float16) { EXPECT_EQ(f::SizeOfType(dtype), 2u); // test debug info - std::string type = "float16"; + std::string type = "::paddle::platform::float16"; EXPECT_STREQ(f::DataTypeToString(dtype).c_str(), type.c_str()); } diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index 2f76cb714fcf7297c17552bcff965c6039253a2e..97f7713d97494b1d6b11ef77796fbf4f75e598dc 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -12,12 +12,19 @@ cc_library(multi_devices_graph_check_pass SRCS multi_devices_graph_check_pass.cc cc_library(variable_visitor SRCS variable_visitor.cc DEPS lod_tensor selected_rows) +if(WITH_DISTRIBUTE) + if(NOT WITH_GRPC) + set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") + set_source_files_properties(reduce_op_handle.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + endif() +endif() + if(WITH_GPU) nv_library(all_reduce_op_handle SRCS all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory dynload_cuda variable_visitor) if(WITH_DISTRIBUTE) nv_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope - ddim dynload_cuda selected_rows_functor sendrecvop_grpc) + ddim dynload_cuda selected_rows_functor sendrecvop_rpc) else() nv_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope ddim dynload_cuda selected_rows_functor) @@ -30,7 +37,7 @@ else() variable_visitor) if(WITH_DISTRIBUTE) cc_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope - ddim selected_rows_functor sendrecvop_grpc) + ddim selected_rows_functor sendrecvop_rpc) else() cc_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope ddim selected_rows_functor) @@ -45,10 +52,10 @@ cc_library(fuse_vars_op_handle SRCS fuse_vars_op_handle.cc DEPS op_handle_base s cc_library(modify_op_lock_and_record_event_pass SRCS modify_op_lock_and_record_event_pass.cc DEPS computation_op_handle op_graph_view multi_devices_helper) -if (WITH_GPU) - cc_library(reference_count_pass SRCS reference_count_pass.cc DEPS computation_op_handle scale_loss_grad_op_handle rpc_op_handle 
- all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle graph graph_helper pass) -endif() +cc_library(reference_count_pass_helper SRCS reference_count_pass_helper.cc DEPS garbage_collector computation_op_handle) +cc_library(eager_deletion_op_handle SRCS eager_deletion_op_handle.cc DEPS lod_tensor selected_rows reference_count_pass_helper) +cc_library(eager_deletion_pass SRCS eager_deletion_pass.cc DEPS computation_op_handle eager_deletion_op_handle graph graph_helper pass) +cc_library(reference_count_pass SRCS reference_count_pass.cc DEPS computation_op_handle graph graph_helper pass op_graph_view reference_count_pass_helper) cc_library(sequential_execution_pass SRCS sequential_execution_pass.cc DEPS graph graph_helper pass) cc_library(all_reduce_deps_pass SRCS all_reduce_deps_pass.cc DEPS graph graph_helper pass) @@ -56,10 +63,7 @@ cc_library(all_reduce_deps_pass SRCS all_reduce_deps_pass.cc DEPS graph graph_he cc_library(multi_devices_graph_pass SRCS multi_devices_graph_pass.cc DEPS multi_devices_helper computation_op_handle scale_loss_grad_op_handle rpc_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle fused_broadcast_op_handle) -set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto sequential_execution_pass modify_op_lock_and_record_event_pass all_reduce_deps_pass) -if (WITH_GPU) - list(APPEND SSA_GRAPH_EXECUTOR_DEPS reference_count_pass) -endif() +set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto sequential_execution_pass modify_op_lock_and_record_event_pass all_reduce_deps_pass reference_count_pass eager_deletion_pass) cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ${SSA_GRAPH_EXECUTOR_DEPS}) diff --git a/paddle/fluid/framework/details/computation_op_handle.cc b/paddle/fluid/framework/details/computation_op_handle.cc index 7ad1e40c600c6e70cea822fac777ff20163078e6..7beb8c8de9fc49aebc66ca44de8736240aabbc30 100644 --- a/paddle/fluid/framework/details/computation_op_handle.cc +++ b/paddle/fluid/framework/details/computation_op_handle.cc @@ -20,11 +20,13 @@ namespace paddle { namespace framework { namespace details { ComputationOpHandle::ComputationOpHandle(ir::Node *node, Scope *scope, - platform::Place place) + platform::Place place, + size_t scope_idx) : OpHandleBase(node), op_(framework::OpRegistry::CreateOp(*node->Op())), scope_(scope), - place_(place) {} + place_(place), + scope_idx_(scope_idx) {} void ComputationOpHandle::RunImpl() { WaitInputVarGenerated(place_); diff --git a/paddle/fluid/framework/details/computation_op_handle.h b/paddle/fluid/framework/details/computation_op_handle.h index 662a91d6b4dfcfed563fdf2e46c22f83f90b40af..601ae4f8c6de11b0bf25d4f9a92ef8eada67be3d 100644 --- a/paddle/fluid/framework/details/computation_op_handle.h +++ b/paddle/fluid/framework/details/computation_op_handle.h @@ -28,7 +28,8 @@ namespace framework { namespace details { struct ComputationOpHandle : public OpHandleBase { public: - ComputationOpHandle(ir::Node *node, Scope *scope, platform::Place place); + ComputationOpHandle(ir::Node *node, Scope *scope, platform::Place place, + size_t scope_idx); std::string Name() const override; @@ -38,6 +39,8 @@ struct ComputationOpHandle : public OpHandleBase { void SetLockAndRecordEventFree(bool b) { is_lock_and_record_event_free_ = b; } + size_t GetScopeIdx() const { return scope_idx_; } + protected: void RunImpl() override; @@ -47,6 +50,7 @@ struct ComputationOpHandle : public OpHandleBase { std::unique_ptr op_; Scope *scope_; platform::Place place_; + size_t scope_idx_; 
bool is_lock_and_record_event_free_{false}; }; } // namespace details diff --git a/paddle/fluid/framework/details/eager_deletion_op_handle.cc b/paddle/fluid/framework/details/eager_deletion_op_handle.cc new file mode 100644 index 0000000000000000000000000000000000000000..abacb11e3b018308c20a67630e3ff34cca7d3387 --- /dev/null +++ b/paddle/fluid/framework/details/eager_deletion_op_handle.cc @@ -0,0 +1,122 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/details/eager_deletion_op_handle.h" +#include "paddle/fluid/framework/lod_tensor_array.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/selected_rows.h" +#ifdef PADDLE_WITH_CUDA +#include "paddle/fluid/platform/cuda_device_guard.h" +#endif + +namespace paddle { +namespace framework { +namespace details { + +EagerDeletionOpHandle::EagerDeletionOpHandle( + ir::Node *node, const Scope *scope, const platform::Place &place, + const std::unordered_set &var_names, GarbageCollector *gc, + AtomicReferenceCountMap *ref_cnts) + : OpHandleBase(node), + scope_(scope), + var_names_(var_names), + gc_(gc), + ref_cnts_(ref_cnts) { +#ifdef PADDLE_WITH_CUDA + if (platform::is_gpu_place(place)) { + dev_ctx_ = reinterpret_cast( + platform::DeviceContextPool::Instance().Get(place)); + if (dynamic_cast(gc_)) { + platform::CUDADeviceGuard guard( + boost::get(place).device); + PADDLE_ENFORCE(cudaEventCreateWithFlags(&event_, cudaEventDisableTiming)); + PADDLE_ENFORCE_NOT_NULL(event_); + } + } +#endif +} + +EagerDeletionOpHandle::~EagerDeletionOpHandle() { +#ifdef PADDLE_WITH_CUDA + if (event_) { + auto gpu_place = boost::get(dev_ctx_->GetPlace()); + platform::CUDADeviceGuard guard(gpu_place.device); + PADDLE_ENFORCE(cudaEventDestroy(event_)); + } +#endif +} + +std::string EagerDeletionOpHandle::Name() const { return "eager_deletion"; } + +void EagerDeletionOpHandle::RunImpl() { + auto *exec_scope = scope_->FindVar(kLocalExecScopeName)->Get(); + std::deque> garbages; + for (auto &name : var_names_) { + auto it = ref_cnts_->find(name); + // Var not found, not reference count has not decreased to 0 + if (it == ref_cnts_->end() || it->second.fetch_sub(1) != 1) { + continue; + } + + auto *var = exec_scope->FindVar(name); + if (var == nullptr) { + continue; + } + + VLOG(2) << "Erase variable " << name; + + if (var->IsType()) { + garbages.emplace_back(var->GetMutable()->MoveMemoryHolder()); + } else if (var->IsType()) { + garbages.emplace_back( + var->GetMutable()->mutable_value()->MoveMemoryHolder()); + } else if (var->IsType()) { + auto *tensor_arr = var->GetMutable(); + for (auto &t : *tensor_arr) { + garbages.emplace_back(t.MoveMemoryHolder()); + } + } else { + PADDLE_THROW("Type %s of %s is not supported eager deletion", + var->Type().name(), name); + } + } + + if (!garbages.empty()) { + ClearGarbages(&garbages); + } +} + +void EagerDeletionOpHandle::ClearGarbages( + std::deque> *garbages) { +#ifdef PADDLE_WITH_CUDA + if 
(event_) { + auto compute_stream = dev_ctx_->stream(); + auto callback_stream = + reinterpret_cast(gc_)->stream(); + auto callback_func = [=]() { + PADDLE_ENFORCE(cudaEventRecord(event_, compute_stream)); + PADDLE_ENFORCE(cudaStreamWaitEvent(callback_stream, event_, 0)); + }; + gc_->Add(std::move(*garbages), callback_func); + } else { +#endif + gc_->Add(std::move(*garbages)); +#ifdef PADDLE_WITH_CUDA + } +#endif +} + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/eager_deletion_op_handle.h b/paddle/fluid/framework/details/eager_deletion_op_handle.h new file mode 100644 index 0000000000000000000000000000000000000000..64867afad5b70a2ba31e5cb315daffcf433b5935 --- /dev/null +++ b/paddle/fluid/framework/details/eager_deletion_op_handle.h @@ -0,0 +1,58 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include "paddle/fluid/framework/details/op_handle_base.h" +#include "paddle/fluid/framework/details/reference_count_pass_helper.h" + +namespace paddle { +namespace framework { +class Scope; + +namespace details { + +class EagerDeletionOpHandle : public OpHandleBase { + public: + EagerDeletionOpHandle(ir::Node *node, const Scope *scope, + const platform::Place &place, + const std::unordered_set &var_names, + GarbageCollector *gc, + AtomicReferenceCountMap *ref_cnts); + + ~EagerDeletionOpHandle(); + + std::string Name() const override; + + protected: + void RunImpl() override; + + private: + void ClearGarbages(std::deque> *garbages); + + const Scope *scope_; + std::unordered_set var_names_; + GarbageCollector *gc_; // not own + AtomicReferenceCountMap *ref_cnts_; // not own +#ifdef PADDLE_WITH_CUDA + platform::CUDADeviceContext *dev_ctx_{nullptr}; + cudaEvent_t event_{nullptr}; +#endif +}; + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/eager_deletion_pass.cc b/paddle/fluid/framework/details/eager_deletion_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..4e42d0b4972d567dd769cad6ff8b9d45380ab77a --- /dev/null +++ b/paddle/fluid/framework/details/eager_deletion_pass.cc @@ -0,0 +1,101 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
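`EagerDeletionOpHandle::RunImpl` above decrements an atomic reference count per variable and, once a count reaches zero, moves the underlying memory holders into a garbage collector (on GPU, `ClearGarbages` records a CUDA event on the compute stream and makes the collector's stream wait on it). A language-agnostic sketch of that control flow, written in Python purely for illustration — every name below is a placeholder, not a Paddle API:

```
# Illustrative sketch of the eager-deletion flow above; all names are
# placeholders, not real Paddle APIs.
class EagerDeletionOp(object):
    def __init__(self, scope, var_names, ref_cnts, gc):
        self.scope = scope          # maps variable name -> variable
        self.var_names = var_names  # variables this op may free
        self.ref_cnts = ref_cnts    # name -> remaining reader count
        self.gc = gc                # collects buffers asynchronously

    def run(self):
        garbages = []
        for name in self.var_names:
            # Skip unknown variables or ones other readers still reference.
            if name not in self.ref_cnts:
                continue
            self.ref_cnts[name] -= 1
            if self.ref_cnts[name] != 0:
                continue
            var = self.scope.get(name)
            if var is None:
                continue
            # Hand the raw buffer(s) to the collector instead of freeing now.
            garbages.extend(var.move_memory_holders())
        if garbages:
            self.gc.add(garbages)
```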
+ +#include +#include +#include + +#include "paddle/fluid/framework/details/computation_op_handle.h" +#include "paddle/fluid/framework/details/eager_deletion_op_handle.h" +#include "paddle/fluid/framework/details/eager_deletion_pass.h" +#include "paddle/fluid/framework/details/multi_devices_helper.h" +#include "paddle/fluid/framework/ir/graph_helper.h" + +namespace paddle { +namespace framework { +namespace details { + +std::unique_ptr EagerDeletionPass::ApplyImpl( + std::unique_ptr graph) const { + auto &ref_cnts = + Get>(kRuntimeReferenceCount); + PADDLE_ENFORCE(ref_cnts.empty(), + "kRuntimeReferenceCount should be initialized here!"); + + const auto &vars = graph->Get(kGraphVars); + ref_cnts.resize(vars.size()); + + const auto &last_live_ops = + Get>(kLastLiveOpsOfVars); + const auto &gcs = Get(kGarbageCollector); + const auto &places = Get>(kAllPlaces); + + // a reverse map of last_live_ops + // i.e., last op --> variable names which can be deleted. + std::unordered_map> + op_vars_map; + + for (auto &var_ops_map : last_live_ops) { + for (auto &var_ops_pair : var_ops_map) { + const std::string &var_name = var_ops_pair.first; + for (auto *op : var_ops_pair.second) { + op_vars_map[op].insert(var_name); + } + } + } + + for (auto &pair : op_vars_map) { + auto *op = pair.first; + auto &var_names = pair.second; + + auto *eager_deletion_node = + graph->CreateEmptyNode("eager_deletion", ir::Node::Type::kOperation); + auto *eager_deletion_op = new EagerDeletionOpHandle( + eager_deletion_node, op->GetScope(), op->GetPlace(), var_names, + gcs.at(places[op->GetScopeIdx()]).get(), + &(ref_cnts[op->GetScopeIdx()])); + + auto it = std::find_if( + op->Outputs().begin(), op->Outputs().end(), [](VarHandleBase *var) { + return dynamic_cast(var) != nullptr; + }); + + if (it != op->Outputs().end()) { + eager_deletion_op->AddInput(*it); + } else { + auto *dep_var = new DummyVarHandle(graph->CreateControlDepVar()); + graph->Get(kGraphDepVars).emplace(dep_var); + op->AddOutput(dep_var); + eager_deletion_op->AddInput(dep_var); + } + + auto *dummy_leaf = new DummyVarHandle(graph->CreateControlDepVar()); + graph->Get(kGraphDepVars).emplace(dummy_leaf); + eager_deletion_op->AddOutput(dummy_leaf); + } + + VLOG(10) << "Create " << op_vars_map.size() << " EagerDeletionOpHandle(s)"; + return graph; +} + +} // namespace details +} // namespace framework +} // namespace paddle + +REGISTER_PASS(eager_deletion_pass, + paddle::framework::details::EagerDeletionPass) + .RequirePassAttr(paddle::framework::details::kRuntimeReferenceCount) + .RequirePassAttr(paddle::framework::details::kLastLiveOpsOfVars) + .RequirePassAttr(paddle::framework::details::kAllPlaces) + .RequirePassAttr(paddle::framework::details::kGarbageCollector); diff --git a/paddle/fluid/framework/details/eager_deletion_pass.h b/paddle/fluid/framework/details/eager_deletion_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..d7a7a9709d970841060778806451bc21cb2c7571 --- /dev/null +++ b/paddle/fluid/framework/details/eager_deletion_pass.h @@ -0,0 +1,32 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/pass.h" + +namespace paddle { +namespace framework { +namespace details { + +class EagerDeletionPass : public ir::Pass { + protected: + std::unique_ptr ApplyImpl( + std::unique_ptr graph) const override; +}; + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc index cbae5321d9a3fdcaebca2bd9111fff6a10b235a8..8af1d62dea89343ff2d41dd7c6ac837459df7685 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc @@ -565,7 +565,7 @@ void MultiDevSSAGraphBuilder::CreateComputationalOp(ir::Graph *result, int dev_id) const { result->Get(kGraphOps).emplace_back( new ComputationOpHandle(result->CreateOpNode(node->Op()), - local_scopes_[dev_id], places_[dev_id])); + local_scopes_[dev_id], places_[dev_id], dev_id)); CreateOpHandleIOs(result, node, dev_id); } @@ -688,8 +688,8 @@ void MultiDevSSAGraphBuilder::CreateComputationalOps(ir::Graph *result, for (size_t scope_idx = 0; scope_idx < num_places; ++scope_idx) { auto p = places_[scope_idx]; auto s = local_scopes_[scope_idx]; - result->Get(kGraphOps).emplace_back( - new ComputationOpHandle(result->CreateOpNode(node->Op()), s, p)); + result->Get(kGraphOps).emplace_back(new ComputationOpHandle( + result->CreateOpNode(node->Op()), s, p, scope_idx)); CreateOpHandleIOs(result, node, scope_idx); } } diff --git a/paddle/fluid/framework/details/op_graph_view.cc b/paddle/fluid/framework/details/op_graph_view.cc index 4838c4198ff35ba3fb562f3a7c0563ee60179e3b..d3865c2c2919c2d43521e4f51013e5fa1b10416d 100644 --- a/paddle/fluid/framework/details/op_graph_view.cc +++ b/paddle/fluid/framework/details/op_graph_view.cc @@ -23,6 +23,8 @@ namespace details { OpGraphView::OpGraphView(const std::vector &ops) { Build(ops); } void OpGraphView::Build(const std::vector &ops) { + preceding_ops_.clear(); + pending_ops_.clear(); for (auto &op : ops) { preceding_ops_[op]; pending_ops_[op]; @@ -40,6 +42,7 @@ void OpGraphView::Build(const std::vector &ops) { std::unordered_set OpGraphView::AllOps() const { std::unordered_set ret; + ret.reserve(preceding_ops_.size()); for (auto &pair : preceding_ops_) { ret.insert(pair.first); } diff --git a/paddle/fluid/framework/details/op_graph_view.h b/paddle/fluid/framework/details/op_graph_view.h index afb3e8e59461eeba10d7027fc70b89cc170c1805..77aa02eba56acb3bb20a5c5a55c75af78a3c1c81 100644 --- a/paddle/fluid/framework/details/op_graph_view.h +++ b/paddle/fluid/framework/details/op_graph_view.h @@ -14,7 +14,7 @@ #pragma once -#include +#include #include #include #include @@ -34,6 +34,11 @@ class OpGraphView { bool HasOp(OpHandleBase *op) const; + // Use a visitor to visit all pending ops of op + // Stop when callback returns false + template + bool VisitAllPendingOps(OpHandleBase *op, Callback &&callback) const; + private: void Build(const std::vector &ops); void EnforceHasOp(OpHandleBase *op) const; 
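The `VisitAllPendingOps` template declared above walks the pending (downstream) ops of a given op and stops as soon as the callback returns false; the C++ body follows in the next hunk. A short Python sketch of the same breadth-first traversal pattern, with placeholder names:

```
# Illustrative BFS over pending (downstream) ops with early stopping.
# `pending_ops` maps op -> iterable of ops that directly depend on it.
from collections import deque

def visit_all_pending_ops(start_op, pending_ops, callback):
    visited = set()
    queue = deque([start_op])
    while queue:
        op = queue.popleft()
        for nxt in pending_ops[op]:
            if nxt in visited:
                continue
            visited.add(nxt)
            if not callback(nxt):
                return False   # caller asked to stop early
            queue.append(nxt)
    return True
```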
@@ -44,6 +49,28 @@ class OpGraphView { pending_ops_; }; +template +bool OpGraphView::VisitAllPendingOps(OpHandleBase *op, + Callback &&callback) const { + EnforceHasOp(op); + std::unordered_set visited; + std::queue q; + q.push(op); + do { + op = q.front(); + q.pop(); + for (auto &pending_op : pending_ops_.at(op)) { + if (visited.count(pending_op) == 0) { + visited.insert(pending_op); + if (!callback(pending_op)) { + return false; + } + } + } + } while (!q.empty()); + return true; +} + } // namespace details } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/details/reference_count_op_handle.h b/paddle/fluid/framework/details/reference_count_op_handle.h deleted file mode 100644 index cc4ccfbdfc720284e683a8f3f59a4aa57a3a9eb1..0000000000000000000000000000000000000000 --- a/paddle/fluid/framework/details/reference_count_op_handle.h +++ /dev/null @@ -1,138 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include -#include -#include -#include - -#include "paddle/fluid/framework/details/op_handle_base.h" -#include "paddle/fluid/framework/garbage_collector.h" -#include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/framework/selected_rows.h" -#include "paddle/fluid/framework/tensor.h" - -namespace paddle { -namespace framework { -namespace details { - -using ReferenceCountMap = std::unordered_map; -using AtomicReferenceCountMap = - std::unordered_map>; -using DeviceReferenceCountMap = - std::unordered_map>; -using AtomicDeviceReferenceCountMap = - std::unordered_map>; -using DeviceGarbageCollectorMap = - std::unordered_map>>; - -class ReferenceCountOpHandle : public OpHandleBase { - public: - ReferenceCountOpHandle(ir::Node *node, const Scope *scope, - const platform::CUDAPlace &place, - const std::vector &var_names, - GarbageCollector *gc, - AtomicReferenceCountMap *ref_cnts) - : OpHandleBase(node), scope_(scope), gc_(gc), ref_cnts_(ref_cnts) { - dev_ctx_ = static_cast( - platform::DeviceContextPool::Instance().Get(place)); - if (IsStreamGarabageCollector()) { - platform::SetDeviceId(place.device); - PADDLE_ENFORCE(cudaEventCreateWithFlags(&event_, cudaEventDisableTiming)); - } - - for (auto &name : var_names) AddVar(name); - } - - ~ReferenceCountOpHandle() { - if (IsStreamGarabageCollector()) { - auto gpu_place = boost::get(dev_ctx_->GetPlace()); - platform::SetDeviceId(gpu_place.device); - PADDLE_ENFORCE(cudaEventDestroy(event_)); - } - } - - std::string Name() const override { return "reference_count"; } - - void AddVar(const std::string &name) { - auto it = var_names_.find(name); - if (it != var_names_.end()) - ++(it->second); - else - var_names_[name] = 1; - } - - protected: - void RunImpl() override { - auto *exec_scope = scope_->FindVar(kLocalExecScopeName)->Get(); - std::vector tensors; - for (auto &pair : var_names_) { - auto &name = pair.first; - auto it = ref_cnts_->find(name); - if (it == ref_cnts_->end()) continue; - - auto *var = 
exec_scope->FindVar(name); - if (var == nullptr) continue; - - if (var->IsType()) { - if (it->second.fetch_sub(pair.second) <= pair.second) { - tensors.emplace_back(var->GetMutable()); - } - } else if (var->IsType()) { - if (it->second.fetch_sub(pair.second) <= pair.second) { - tensors.emplace_back( - var->GetMutable()->mutable_value()); - } - } - } - - if (!tensors.empty()) { - ClearTensors(tensors); - } - } - - private: - void ClearTensors(const std::vector &tensors) { - auto *gc = dynamic_cast *>(gc_); - if (gc != nullptr) { - auto compute_stream = dev_ctx_->stream(); - auto callback_stream = gc->stream(); - auto callback_func = [=]() { - PADDLE_ENFORCE(cudaEventRecord(event_, compute_stream)); - PADDLE_ENFORCE(cudaStreamWaitEvent(callback_stream, event_, 0)); - }; - gc_->Add(tensors, callback_func); - } else { - gc_->Add(tensors); - } - } - - bool IsStreamGarabageCollector() const { - return dynamic_cast *>(gc_) != nullptr; - } - - const Scope *scope_; - platform::CUDADeviceContext *dev_ctx_; - std::unordered_map var_names_; - GarbageCollector *gc_; // not own - AtomicReferenceCountMap *ref_cnts_; // not own - cudaEvent_t event_; -}; - -} // namespace details -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/details/reference_count_pass.cc b/paddle/fluid/framework/details/reference_count_pass.cc index 08783fb5f8b18329c9167edb0dac39b7dd42a746..13a042d8e6ed7f18c76387b666d681df0eabd0b5 100644 --- a/paddle/fluid/framework/details/reference_count_pass.cc +++ b/paddle/fluid/framework/details/reference_count_pass.cc @@ -14,187 +14,240 @@ #include #include +#include #include #include "paddle/fluid/framework/details/computation_op_handle.h" +#include "paddle/fluid/framework/details/eager_deletion_op_handle.h" #include "paddle/fluid/framework/details/multi_devices_helper.h" +#include "paddle/fluid/framework/details/op_graph_view.h" #include "paddle/fluid/framework/details/reference_count_pass.h" +#include "paddle/fluid/framework/details/reference_count_pass_helper.h" #include "paddle/fluid/framework/ir/graph_helper.h" namespace paddle { namespace framework { namespace details { -static ComputationOpHandle *FindNextComputationOpHandle(VarHandle *var_in) { - std::queue queue; - queue.push(var_in); - do { - auto *var = queue.front(); - queue.pop(); - for (auto *op : var->PendingOps()) { - auto *compute_op = dynamic_cast(op); - if (compute_op != nullptr && compute_op->GetPlace() == var_in->place_) { - return compute_op; +// A functor to shrink/remove operators who depend on other operators in a set +class ShrinkDepsOpFunctor { + private: + enum RelationShip { kSame = 0, kNoDeps = 1, kBefore = 2, kAfter = 3 }; + + public: + explicit ShrinkDepsOpFunctor(const std::vector &all_ops) + : graph_(all_ops) {} + + template + OpSet operator()(const OpSet &op_set) const { + using KeyType = typename OpSet::key_type; + static_assert( + std::is_base_of::type>::value, + "Key type of OpSet must be OpHandleBase, or derived of OpHandleBase"); + + if (op_set.size() <= 1) return op_set; + std::vector ops(op_set.begin(), op_set.end()); + OpSet ret; + auto rels = GetRelations(ops); + auto not_before = [](RelationShip r) { return r != kBefore; }; + for (size_t i = 0; i < rels.size(); ++i) { + if (std::all_of(rels[i].begin(), rels[i].end(), not_before)) { + ret.emplace(static_cast(ops[i])); } - for (auto *out_var : op->Outputs()) { - queue.push(out_var); + } + return ret; + } + + private: + std::vector> GetRelations( + const std::vector &ops) const { + std::unordered_map op_to_idx; + for 
(size_t i = 0; i < ops.size(); ++i) { + PADDLE_ENFORCE(graph_.HasOp(ops[i]), "Op does not exist in graph"); + op_to_idx[ops[i]] = i; + } + + PADDLE_ENFORCE(op_to_idx.size() == ops.size(), "Duplicate ops"); + + std::vector> ret(ops.size()); + for (auto &e : ret) { + e.assign(ops.size(), kSame); + } + + size_t found_num = ops.size(); + size_t total_num = ops.size() * ops.size(); + auto visitor = [&](OpHandleBase *op, size_t i) { + auto it = op_to_idx.find(op); + if (it != op_to_idx.end()) { + size_t j = it->second; + if (i != j && ret[i][j] == kSame) { + ret[i][j] = kBefore; + ret[j][i] = kAfter; + found_num += 2; + if (found_num == total_num) { + return false; + } + } + } + return true; + }; + + for (size_t i = 0; i < ops.size(); ++i) { + auto sub_visitor = [&, i](OpHandleBase *op) { return visitor(op, i); }; + if (!graph_.VisitAllPendingOps(ops[i], sub_visitor)) { + break; + } + } + + for (size_t i = 0; i < ops.size(); ++i) { + for (size_t j = i + 1; j < ops.size(); ++j) { + if (ret[i][j] != kSame) continue; + ret[i][j] = kNoDeps; + ret[j][i] = kNoDeps; + } + } + + return ret; + } + + const OpGraphView graph_; +}; + +/** + * Find the nearest downstream computation op handle. If the op is a + * computation op, just return itself. + */ +static ComputationOpHandle *FindNextComputationOpHandleOrReturnItself( + OpHandleBase *op, size_t scope_idx) { + std::queue q; + std::unordered_set visited; + q.push(op); + do { + auto *op = q.front(); + q.pop(); + auto *compute_op = dynamic_cast(op); + if (compute_op != nullptr && compute_op->GetScopeIdx() == scope_idx) { + return compute_op; + } + for (auto *out_var : op->Outputs()) { + for (auto *pending_op : out_var->PendingOps()) { + if (visited.count(pending_op)) continue; + visited.insert(pending_op); } } - } while (!queue.empty()); + } while (!q.empty()); return nullptr; } -static void AddDependencyBetween(OpHandleBase *in, OpHandleBase *out, - ir::Graph *graph) { - auto it = std::find_if( - in->Outputs().begin(), in->Outputs().end(), [](VarHandleBase *var) { - return dynamic_cast(var) != nullptr; - }); - - if (it != in->Outputs().end()) { - out->AddInput(*it); - } else { - auto *dep_var = new DummyVarHandle(graph->CreateControlDepVar()); - graph->Get(kGraphDepVars).emplace(dep_var); - in->AddOutput(dep_var); - out->AddInput(dep_var); +static std::unordered_set +ExtractComputationOpFromLastLivedVar(VarHandle *var, size_t scope_idx, + const ShrinkDepsOpFunctor &shrink_func, + bool *ok) { + // stage one. Get last op for variable. + std::unordered_set candidates; + { + if (var->PendingOps().empty() && var->GeneratedOp()) { + // No operator depends on this variable. So the last operator is the op + // who generates this variable. + candidates.emplace(var->GeneratedOp()); + } else { + candidates = var->PendingOps(); + } + + // No pending ops or generated op is nullptr + if (candidates.empty()) { + *ok = false; + return {}; + } + } + + // stage two. Try to cast them to computation op. + // return (*ok=false) when failed. + // + // The reason why we cannot make any types of op handle to be the last lived + // op is: + // some op handle may operate on many DeviceContext, however, our garbage + // collector can only wait one DeviceContext for now. So currently, we wait + // the nearest compute op. 
+ std::unordered_set computation_op; + { + for (auto *op : candidates) { + auto *compute_op = + FindNextComputationOpHandleOrReturnItself(op, scope_idx); + if (compute_op == nullptr) { + *ok = false; + return {}; + } + computation_op.emplace(compute_op); + } } + + // stage three. Try to shrink computation op if they depend on each other. + // Get the smallest set of the most ops. + *ok = true; + return shrink_func(computation_op); +} + +static VarDesc *TryGetLatestVarDesc(const std::vector &vars) { + VarDesc *var_desc = nullptr; + std::find_if(vars.rbegin(), vars.rend(), [&](VarHandle *var_handle) -> bool { + var_desc = var_handle->Node()->Var(); + return var_desc != nullptr; + }); + return var_desc; } std::unique_ptr ReferenceCountPass::ApplyImpl( std::unique_ptr graph) const { - auto &ref_cnts = Get(kGlobalReferenceCount); - auto &cur_ref_cnts = Get(kCurReferenceCount); - auto &gcs = Get(kGarbageCollector); - - // It is not easy to find the right reference counts of varaibles in graph - // Step 1: Find all variables in computation ops - // Step 2: Find all variables in non-computation ops which refers to variables - // in computation ops - std::unordered_set names; - std::unordered_map - compute_ref_cnt_map; - - auto get_ref_cnts_from_compute_op = [&]( - OpHandleBase *op, const std::vector &vars) { - std::vector var_names_in_op; - auto *compute_op = dynamic_cast(op); - if (compute_op == nullptr || - !platform::is_gpu_place(compute_op->GetPlace())) - return var_names_in_op; - auto place = boost::get(compute_op->GetPlace()); - for (VarHandleBase *var_handle_base : vars) { - auto *var_handle = dynamic_cast(var_handle_base); - if (var_handle == nullptr || !var_handle->Node()->IsVar()) continue; - - if (!platform::is_gpu_place(var_handle->place_) || - boost::get(var_handle->place_) != place) - continue; + auto &ref_cnts = Get>(kGlobalReferenceCount); + auto &last_live_ops_of_vars = + Get>(kLastLiveOpsOfVars); + + PADDLE_ENFORCE(last_live_ops_of_vars.empty() && ref_cnts.empty(), + "Last Live Ops and Reference Counts of vars should be " + "initialized at here."); - VarDesc *var_desc = var_handle->Node()->Var(); - auto var_name = var_handle->Node()->Name(); + const auto &vars = graph->Get(kGraphVars); - // This is weird but there is really some variables without var_desc - // in computation_op - if (var_desc == nullptr) { - var_desc = compute_op->Node()->Op()->Block()->FindVar(var_name); - if (var_desc == nullptr) continue; + last_live_ops_of_vars.resize(vars.size()); + ref_cnts.resize(vars.size()); + + ShrinkDepsOpFunctor shrink_func( + ir::FilterByNodeWrapper(*graph)); + + for (size_t i = 0; i < vars.size(); ++i) { + for (auto &name_var_pair : vars[i]) { + // Whether this variable can be reused or deleted? If not, we do not + // compute reference counts and dependencies. 
+ VarDesc *var_desc = TryGetLatestVarDesc(name_var_pair.second); + + if (var_desc == nullptr || var_desc->Persistable()) { + continue; } - if (var_desc->Persistable()) continue; auto var_type = var_desc->Proto()->type().type(); if (var_type != proto::VarType::LOD_TENSOR && - var_type != proto::VarType::SELECTED_ROWS) { + var_type != proto::VarType::SELECTED_ROWS && + var_type != proto::VarType::LOD_TENSOR_ARRAY) { + // Var type cannot be deleted continue; } - // compute op only runs in one device - if (ref_cnts[place.device]->count(var_name)) - ++(*ref_cnts[place.device])[var_name]; - else - (*ref_cnts[place.device])[var_name] = 1; + bool ok; + auto result = ExtractComputationOpFromLastLivedVar( + name_var_pair.second.back(), i, shrink_func, &ok); - names.insert(var_name); - var_names_in_op.push_back(var_name); - } - return var_names_in_op; - }; - - auto update_ref_cnts_from_non_compute_op = [&]( - OpHandleBase *op, const std::vector &vars) { - if (dynamic_cast(op) != nullptr) return; - for (VarHandleBase *var_handle_base : vars) { - auto *var_handle = dynamic_cast(var_handle_base); - if (var_handle == nullptr || !var_handle->Node()->IsVar()) continue; - - auto var_name = var_handle->Node()->Name(); - auto var_place = var_handle->place_; - if (!platform::is_gpu_place(var_place)) continue; - auto place = boost::get(var_place); - if (names.count(var_name) == 0) continue; - if (ref_cnts.count(place.device) && - ref_cnts[place.device]->count(var_name)) { - ++(*ref_cnts[place.device])[var_name]; - - auto *next_compute_op = FindNextComputationOpHandle(var_handle); - if (next_compute_op != nullptr) { - if (compute_ref_cnt_map.count(next_compute_op)) { - compute_ref_cnt_map[next_compute_op]->AddVar(var_name); - VLOG(5) << "Add reference count of " << var_name << " to Operator " - << next_compute_op->Name(); - } else { - // Create new reference_count_op_handle - ir::Node *ref_cnt_node = graph->CreateEmptyNode( - "reference_count", ir::Node::Type::kOperation); - auto *ref_cnt_handle = new ReferenceCountOpHandle( - ref_cnt_node, next_compute_op->GetScope(), place, {var_name}, - gcs[place.device].get(), cur_ref_cnts[place.device].get()); - AddDependencyBetween(next_compute_op, ref_cnt_handle, graph.get()); - compute_ref_cnt_map[next_compute_op] = ref_cnt_handle; - } - } + if (ok) { + auto &var_name = name_var_pair.first; + PADDLE_ENFORCE(!result.empty(), "Last living ops of %s cannot be empty", + var_name); + ref_cnts[i].emplace(var_name, result.size()); + last_live_ops_of_vars[i].emplace(var_name, std::move(result)); } } - }; - - auto all_ops = ir::FilterByNodeWrapper(*graph); - for (auto &op : all_ops) { - auto in_var_names = get_ref_cnts_from_compute_op(op, op->Inputs()); - auto out_var_names = get_ref_cnts_from_compute_op(op, op->Outputs()); - if (in_var_names.empty() && out_var_names.empty()) continue; - in_var_names.insert(in_var_names.end(), out_var_names.begin(), - out_var_names.end()); - auto *compute_op = dynamic_cast(op); - auto place = boost::get(compute_op->GetPlace()); - ir::Node *ref_cnt_node = - graph->CreateEmptyNode("reference_count", ir::Node::Type::kOperation); - auto *ref_cnt_handle = new ReferenceCountOpHandle( - ref_cnt_node, compute_op->GetScope(), place, in_var_names, - gcs[place.device].get(), cur_ref_cnts[place.device].get()); - AddDependencyBetween(compute_op, ref_cnt_handle, graph.get()); - compute_ref_cnt_map[compute_op] = ref_cnt_handle; - } - - for (auto &op : all_ops) { - update_ref_cnts_from_non_compute_op(op, op->Inputs()); - update_ref_cnts_from_non_compute_op(op, 
op->Outputs()); - } - - std::vector new_all_ops; - new_all_ops.reserve(compute_ref_cnt_map.size() + all_ops.size()); - for (auto &op : all_ops) { - new_all_ops.emplace_back(std::move(op)); - auto it = compute_ref_cnt_map.find(new_all_ops.back()); - if (it != compute_ref_cnt_map.end()) { - // Add LeafNode to ReferenceCountOpHandle - auto *dummy_leaf = new DummyVarHandle(graph->CreateControlDepVar()); - graph->Get(kGraphDepVars).emplace(dummy_leaf); - it->second->AddOutput(dummy_leaf); - new_all_ops.emplace_back(std::move(it->second)); - } } - all_ops.swap(new_all_ops); return graph; } @@ -205,5 +258,4 @@ std::unique_ptr ReferenceCountPass::ApplyImpl( REGISTER_PASS(reference_count_pass, paddle::framework::details::ReferenceCountPass) .RequirePassAttr(paddle::framework::details::kGlobalReferenceCount) - .RequirePassAttr(paddle::framework::details::kCurReferenceCount) - .RequirePassAttr(paddle::framework::details::kGarbageCollector); + .RequirePassAttr(paddle::framework::details::kLastLiveOpsOfVars); diff --git a/paddle/fluid/framework/details/reference_count_pass.h b/paddle/fluid/framework/details/reference_count_pass.h index 7081280b0600b9c1985987d02d679c298ad4b8bd..bcbef027354ef5a5fcc7da28103a9565982c7631 100644 --- a/paddle/fluid/framework/details/reference_count_pass.h +++ b/paddle/fluid/framework/details/reference_count_pass.h @@ -14,7 +14,6 @@ #pragma once -#include "paddle/fluid/framework/details/reference_count_op_handle.h" #include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/pass.h" @@ -22,10 +21,6 @@ namespace paddle { namespace framework { namespace details { -constexpr char kGlobalReferenceCount[] = "reference_count"; -constexpr char kCurReferenceCount[] = "current_reference_count"; -constexpr char kGarbageCollector[] = "garbage_collector"; - class ReferenceCountPass : public ir::Pass { protected: std::unique_ptr ApplyImpl( diff --git a/paddle/fluid/framework/details/reference_count_pass_helper.cc b/paddle/fluid/framework/details/reference_count_pass_helper.cc new file mode 100644 index 0000000000000000000000000000000000000000..89bd08c2d041d795205b29bb29aba311d1dbd932 --- /dev/null +++ b/paddle/fluid/framework/details/reference_count_pass_helper.cc @@ -0,0 +1,21 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/details/reference_count_pass_helper.h" + +namespace paddle { +namespace framework { +namespace details {} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/reference_count_pass_helper.h b/paddle/fluid/framework/details/reference_count_pass_helper.h new file mode 100644 index 0000000000000000000000000000000000000000..1c083dbf001b08e40a54cc89b21c3dea1f18f16a --- /dev/null +++ b/paddle/fluid/framework/details/reference_count_pass_helper.h @@ -0,0 +1,51 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/garbage_collector.h" + +namespace paddle { +namespace framework { +namespace details { + +class ComputationOpHandle; + +using ReferenceCountMap = std::unordered_map; + +using AtomicReferenceCountMap = + std::unordered_map>; + +using GarbageCollectorMap = + std::map>; + +const char kGlobalReferenceCount[] = "global_reference_count"; +const char kRuntimeReferenceCount[] = "runtime_reference_count"; +const char kGarbageCollector[] = "garbage_collector"; +const char kAllPlaces[] = "all_places"; + +using LastLiveOpsOfVars = + std::unordered_map>; +const char kLastLiveOpsOfVars[] = "last_live_ops_of_var"; + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc index 499246a9856bb3ba67a155c6f00c3ad06af50edf..57f6fc66c57e2a53d9cf30d7761626a50bc379ea 100644 --- a/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.cc @@ -18,9 +18,6 @@ #include #include "paddle/fluid/framework/variable_helper.h" #include "paddle/fluid/platform/profiler.h" -#ifdef PADDLE_WITH_CUDA -#include "paddle/fluid/framework/details/reference_count_op_handle.h" -#endif namespace paddle { namespace framework { @@ -69,27 +66,12 @@ FeedFetchList ScopeBufferedSSAGraphExecutor::Run( platform::RecordEvent e("ScopeBufferedSSAGraphExecutorAfterRun", nullptr); drop_scope_counter_ += 1; -#ifdef PADDLE_WITH_CUDA - const std::string gc_name = "garbage_collector"; - DeviceGarbageCollectorMap *gc = - Graph().Has(gc_name) ? &(Graph().Get(gc_name)) - : nullptr; -#endif - if (!fetch_tensors.empty() || drop_scope_counter_ == strategy_.num_iteration_per_drop_scope_) { drop_scope_counter_ = 0; // Wait All computational streams for (auto p : places_) { platform::DeviceContextPool::Instance().Get(p)->Wait(); -#ifdef PADDLE_WITH_CUDA - if (gc != nullptr && platform::is_gpu_place(p)) { - auto gpu_place = boost::get(p); - auto &gc_at_place = gc->at(gpu_place.device); - gc_at_place->Wait(); - gc_at_place->Reset(); - } -#endif } for (auto &scope : local_scopes_) { auto &local_scope = diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index e97cf44c75cfdc2e7df22aa870916866b18b3b5a..8c3912120b5b405eb14da8493aaae6e7d12537a0 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/framework/executor.h" +#include #include "paddle/fluid/framework/feed_fetch_method.h" #include "paddle/fluid/framework/lod_rank_table.h" @@ -41,11 +42,43 @@ namespace { int kProgramId = -1; } // namespace +static std::unordered_map GetNonPersistableReferenceCounts( + const BlockDesc& block, const std::vector& skip_var_list) { + std::unordered_map ref_cnts; + std::unordered_set skip_vars(skip_var_list.begin(), + skip_var_list.end()); + + auto update_ref_cnts = [&](OpDesc* op_desc, const VariableNameMap& name_map) { + for (auto& name_pair : name_map) { + for (auto& name : name_pair.second) { + if (skip_vars.count(name)) continue; + auto* var_desc = block.FindVar(name); + if (var_desc == nullptr || var_desc->Persistable()) continue; + auto type = var_desc->Proto()->type().type(); + if (type != proto::VarType::LOD_TENSOR && + type != proto::VarType::SELECTED_ROWS && + type != proto::VarType::LOD_TENSOR_ARRAY) { + continue; + } + ++ref_cnts[name]; + } + } + }; + + for (auto op_desc : block.AllOps()) { + update_ref_cnts(op_desc, op_desc->Inputs()); + update_ref_cnts(op_desc, op_desc->Outputs()); + } + return ref_cnts; +} + ExecutorPrepareContext::ExecutorPrepareContext( - const framework::ProgramDesc& prog, size_t block_id) + const framework::ProgramDesc& prog, size_t block_id, + const std::vector& skip_ref_cnt_vars) : prog_(prog), block_id_(block_id) { if (GetEagerDeletionThreshold() >= 0) { - ref_cnts_ = GetNonPersistableReferenceCount(prog_, block_id_); + global_ref_cnts_ = GetNonPersistableReferenceCounts(prog.Block(block_id), + skip_ref_cnt_vars); } } @@ -53,28 +86,40 @@ ExecutorPrepareContext::~ExecutorPrepareContext() { VLOG(5) << "destroy ExecutorPrepareContext"; } -template -static void DeleteUnusedTensors(const Scope& scope, const OperatorBase* op, - GarbageCollector* gc, - RefCntMap* ref_cnts) { - std::unordered_set erase_tensors; +static void DeleteUnusedTensors( + const Scope& scope, const OperatorBase* op, GarbageCollector* gc, + std::unordered_map* ref_cnts) { + std::deque> garbages; auto handler = [&](const VariableNameMap& name_map) { for (auto& name_pair : name_map) { for (auto& name : name_pair.second) { auto it = ref_cnts->find(name); if (it == ref_cnts->end()) continue; - if ((it->second)-- == 1) { - auto* var = scope.FindVar(name); - if (var != nullptr) { - VLOG(10) << "Erase tensor \'" << name << "\'"; - if (var->IsType()) { - erase_tensors.insert(var->GetMutable()); - } else if (var->IsType()) { - erase_tensors.insert( - var->GetMutable()->mutable_value()); - } + if (--(it->second) != 0) { + continue; + } + auto* var = scope.FindVar(name); + if (var != nullptr) { + continue; + } + + VLOG(2) << "Erase variable " << name; + if (var->IsType()) { + garbages.emplace_back( + var->GetMutable()->MoveMemoryHolder()); + } else if (var->IsType()) { + garbages.emplace_back(var->GetMutable() + ->mutable_value() + ->MoveMemoryHolder()); + } else if (var->IsType()) { + auto* lod_tensor_arr = var->GetMutable(); + for (auto& t : *lod_tensor_arr) { + garbages.emplace_back(t.MoveMemoryHolder()); } + } else { + PADDLE_THROW("Type %s of %s is not supported eager deletion", + var->Type().name(), name); } } } @@ -83,8 +128,8 @@ static void DeleteUnusedTensors(const Scope& scope, const OperatorBase* op, handler(op->Inputs()); handler(op->Outputs()); - if (!erase_tensors.empty()) { - gc->Add(erase_tensors); + if (!garbages.empty()) { + gc->Add(std::move(garbages)); } } @@ -112,9 +157,9 @@ void Executor::Close() { #ifdef PADDLE_WITH_DISTRIBUTE // TODO(typhoonzero): 
complete message will need to use real trainer_id, // except 0. - ::paddle::operators::distributed::RPCClient::GetInstance< - ::paddle::operators::distributed::GRPCClient>(0) - ->SendComplete(); + auto client = + paddle::operators::distributed::RPCClient::GetInstance(0); + client->SendComplete(); #endif } @@ -325,9 +370,10 @@ void Executor::Run(const ProgramDesc& program, Scope* scope, } std::unique_ptr Executor::Prepare( - const ProgramDesc& program, int block_id) { + const ProgramDesc& program, int block_id, + const std::vector& skip_ref_cnt_vars) { std::unique_ptr ctx( - new ExecutorPrepareContext(program, block_id)); + new ExecutorPrepareContext(program, block_id, skip_ref_cnt_vars)); PADDLE_ENFORCE_LT(static_cast(block_id), program.Size()); auto& block = program.Block(block_id); for (auto& op_desc : block.AllOps()) { @@ -338,16 +384,28 @@ std::unique_ptr Executor::Prepare( } std::vector> Executor::Prepare( - const ProgramDesc& program, const std::vector& block_ids) { + const ProgramDesc& program, const std::vector& block_ids, + const std::vector>& skip_ref_cnt_vars) { + PADDLE_ENFORCE( + skip_ref_cnt_vars.empty() || skip_ref_cnt_vars.size() == block_ids.size(), + "skip_ref_cnt_vars should be either empty or equals to block number %d", + block_ids.size()); std::vector> result; + size_t idx = 0; for (auto& bid : block_ids) { - auto* ctx = new ExecutorPrepareContext(program, bid); + ExecutorPrepareContext* ctx; + if (skip_ref_cnt_vars.empty()) { + ctx = new ExecutorPrepareContext(program, bid); + } else { + ctx = new ExecutorPrepareContext(program, bid, skip_ref_cnt_vars[idx]); + } PADDLE_ENFORCE_LT(static_cast(bid), program.Size()); auto& block = program.Block(bid); for (auto& op_desc : block.AllOps()) { ctx->ops_.push_back(OpRegistry::CreateOp(*op_desc)); } result.push_back(std::shared_ptr(ctx)); + ++idx; } return result; } @@ -365,22 +423,23 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, } int64_t max_memory_size = GetEagerDeletionThreshold(); - std::unique_ptr> gc; - // WhileOp would set keep_kids to true, - // because WhileGradOp needs the scopes created in WhileOp. - // Perhaps, we should not perform eager deletion in WhileOp - // The scopes and variables created by WhileOp would be deleted - // in WhileGradOp. 
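The executor-side changes follow the same idea at runtime: a global reference count is computed once per block, copied into a per-run table, decremented after every op, and a variable whose count reaches zero is handed to the garbage collector. A simplified, standalone version of that loop (toy types, persistable-variable filtering omitted) might look like:

```
#include <iostream>
#include <string>
#include <unordered_map>
#include <vector>

struct ToyOp {
  std::string name;
  std::vector<std::string> used_vars;  // inputs and outputs combined
};

int main() {
  std::vector<ToyOp> ops = {{"fc", {"x", "w", "fc_out"}},
                            {"relu", {"fc_out", "relu_out"}},
                            {"softmax", {"relu_out", "y"}}};

  // "Global" reference counts: how many ops touch each variable.
  std::unordered_map<std::string, int> global_ref_cnts;
  for (const auto &op : ops)
    for (const auto &v : op.used_vars) ++global_ref_cnts[v];

  // Copied into a runtime table before each run, like ResetReferenceCount().
  auto runtime_ref_cnts = global_ref_cnts;

  for (const auto &op : ops) {
    std::cout << "run " << op.name << "\n";
    for (const auto &v : op.used_vars) {
      if (--runtime_ref_cnts[v] == 0) {
        // In the real code the tensor's memory holder is moved into a
        // GarbageCollector here instead of being printed.
        std::cout << "  release " << v << "\n";
      }
    }
  }
  return 0;
}
```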
+ std::unique_ptr gc; + // skip while_op and while_grad_op temporarily if (max_memory_size >= 0 && !keep_kids) { ctx->ResetReferenceCount(); #ifdef PADDLE_WITH_CUDA if (platform::is_gpu_place(place_)) { - gc.reset(new DefaultStreamGarbageCollector( - boost::get(place_), max_memory_size)); - } else { + if (IsFastEagerDeletionModeEnabled()) { + gc.reset(new UnsafeFastGPUGarbageCollector( + boost::get(place_), max_memory_size)); + } else { + gc.reset(new DefaultStreamGarbageCollector( + boost::get(place_), max_memory_size)); + } + } else if (platform::is_cpu_place(place_)) { #endif - gc.reset(new CPUGarbageCollector( - boost::get(place_), max_memory_size)); + gc.reset(new CPUGarbageCollector(boost::get(place_), + max_memory_size)); #ifdef PADDLE_WITH_CUDA } #endif @@ -389,17 +448,13 @@ void Executor::RunPreparedContext(ExecutorPrepareContext* ctx, Scope* scope, for (auto& op : ctx->ops_) { op->Run(*local_scope, place_); - if (gc != nullptr) { + if (gc) { DeleteUnusedTensors(*local_scope, op.get(), gc.get(), - &(ctx->cur_ref_cnts_)); + &(ctx->runtime_ref_cnts_)); } } - if (gc != nullptr) { - gc->Wait(); - } else { - platform::DeviceContextPool::Instance().Get(place_)->Wait(); - } + platform::DeviceContextPool::Instance().Get(place_)->Wait(); if (local_scope != scope) { scope->DeleteScope(local_scope); diff --git a/paddle/fluid/framework/executor.h b/paddle/fluid/framework/executor.h index 2d47903ffbd8d821b7c31386b225fe5e65ca2720..5a040ac641588ad4d89d1f6e4c0d6c296eff38eb 100644 --- a/paddle/fluid/framework/executor.h +++ b/paddle/fluid/framework/executor.h @@ -27,52 +27,21 @@ limitations under the License. */ namespace paddle { namespace framework { -template -std::unordered_map GetNonPersistableReferenceCount( - const ProgramDesc& prog, size_t block_id) { - auto& block = prog.Block(block_id); - std::unordered_map ref_cnts; - - auto update_ref_cnts = [&](OpDesc* op_desc, const VariableNameMap& name_map) { - for (auto& name_pair : name_map) { - for (auto& name : name_pair.second) { - auto* var_desc = block.FindVar(name); - if (var_desc == nullptr || var_desc->Persistable()) continue; - auto type = var_desc->Proto()->type().type(); - if (type != proto::VarType::LOD_TENSOR && - type != proto::VarType::SELECTED_ROWS) { - continue; - } - - auto it = ref_cnts.find(name); - if (it != ref_cnts.end()) { - ++it->second; - } else { - ref_cnts[name] = 1; - } - } - } - }; - - for (auto op_desc : block.AllOps()) { - update_ref_cnts(op_desc, op_desc->Inputs()); - update_ref_cnts(op_desc, op_desc->Outputs()); - } - return ref_cnts; -} - struct ExecutorPrepareContext { - ExecutorPrepareContext(const framework::ProgramDesc& prog, size_t block_id); + ExecutorPrepareContext(const framework::ProgramDesc& prog, size_t block_id, + const std::vector& skip_ref_cnt_vars = + std::vector()); + ~ExecutorPrepareContext(); - void ResetReferenceCount() { cur_ref_cnts_ = ref_cnts_; } + void ResetReferenceCount() { runtime_ref_cnts_ = global_ref_cnts_; } const framework::ProgramDesc& prog_; size_t block_id_; std::vector> ops_; - std::unordered_map ref_cnts_; - std::unordered_map cur_ref_cnts_; + std::unordered_map global_ref_cnts_; + std::unordered_map runtime_ref_cnts_; }; class Executor { @@ -108,10 +77,14 @@ class Executor { const std::string& fetch_holder_name = "fetch"); static std::unique_ptr Prepare( - const ProgramDesc& program, int block_id); + const ProgramDesc& program, int block_id, + const std::vector& skip_ref_cnt_vars = + std::vector()); static std::vector> Prepare( - const ProgramDesc& program, const 
std::vector& block_ids); + const ProgramDesc& program, const std::vector& block_ids, + const std::vector>& skip_ref_cnt_vars = + std::vector>()); void CreateVariables(const ProgramDesc& pdesc, Scope* scope, int block_id); diff --git a/paddle/fluid/framework/executor_thread_worker.cc b/paddle/fluid/framework/executor_thread_worker.cc index f03f39dfc6d75673274588d625e275aad7f10dab..8725ed46d595a2edb16a71ed5f23e706448e3ecd 100644 --- a/paddle/fluid/framework/executor_thread_worker.cc +++ b/paddle/fluid/framework/executor_thread_worker.cc @@ -26,6 +26,7 @@ limitations under the License. */ #include "paddle/fluid/framework/reader.h" #include "paddle/fluid/framework/variable_helper.h" #include "paddle/fluid/inference/io.h" +#include "paddle/fluid/platform/cpu_helper.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/pybind/pybind.h" namespace paddle { @@ -154,6 +155,8 @@ static void print_fetch_var(Scope* scope, const std::string& var_name) { } void ExecutorThreadWorker::TrainFiles() { + platform::SetNumThreads(1); + // todo: configurable SetDevice(); diff --git a/paddle/fluid/framework/garbage_collector.cc b/paddle/fluid/framework/garbage_collector.cc new file mode 100644 index 0000000000000000000000000000000000000000..54d9d0dc018b08decb2ff8965659bab98e81f3ab --- /dev/null +++ b/paddle/fluid/framework/garbage_collector.cc @@ -0,0 +1,89 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
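The reworked GarbageCollector is no longer templated on a tensor type: it owns memory holders directly and releases a whole batch once the accumulated size crosses max_memory_size_. The batching idea, stripped of locking and CUDA stream callbacks, can be sketched as follows (ToyGarbageCollector is invented for this illustration and is not the real class):

```
#include <deque>
#include <iostream>
#include <memory>
#include <vector>

// Garbage is queued and only released once the accumulated size crosses a
// threshold, so frees are amortized instead of happening per tensor.
class ToyGarbageCollector {
 public:
  explicit ToyGarbageCollector(size_t max_bytes) : max_bytes_(max_bytes) {}

  void Add(std::shared_ptr<std::vector<char>> buf) {
    cur_bytes_ += buf->size();
    garbages_.push_back(std::move(buf));
    if (cur_bytes_ >= max_bytes_) {
      std::cout << "freeing " << garbages_.size() << " buffers, " << cur_bytes_
                << " bytes\n";
      garbages_.clear();  // the real class defers this to a ClearCallback
      cur_bytes_ = 0;
    }
  }

 private:
  std::deque<std::shared_ptr<std::vector<char>>> garbages_;
  size_t cur_bytes_ = 0;
  const size_t max_bytes_;
};

int main() {
  ToyGarbageCollector gc(1 << 20);  // 1 MB threshold
  for (int i = 0; i < 10; ++i) {
    gc.Add(std::make_shared<std::vector<char>>(256 * 1024));  // 256 KB each
  }
  return 0;
}
```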
+ +#include +#ifdef PADDLE_WITH_CUDA +#include "paddle/fluid/platform/cuda_device_guard.h" +#endif +#include "paddle/fluid/framework/garbage_collector.h" + +namespace paddle { +namespace framework { + +GarbageCollector::GarbageCollector(const platform::Place &place, + size_t max_memory_size) + : max_memory_size_((std::max)(max_memory_size, static_cast(1))) { + garbages_.reset(new GarbageQueue()); + dev_ctx_ = platform::DeviceContextPool::Instance().Get(place); +} + +CPUGarbageCollector::CPUGarbageCollector(const platform::CPUPlace &place, + size_t max_memory_size) + : GarbageCollector(place, max_memory_size) {} + +void CPUGarbageCollector::ClearCallback(const std::function &callback) { + callback(); +} + +#ifdef PADDLE_WITH_CUDA +UnsafeFastGPUGarbageCollector::UnsafeFastGPUGarbageCollector( + const platform::CUDAPlace &place, size_t max_memory_size) + : GarbageCollector(place, max_memory_size) {} + +void UnsafeFastGPUGarbageCollector::ClearCallback( + const std::function &callback) { + callback(); +} + +DefaultStreamGarbageCollector::DefaultStreamGarbageCollector( + const platform::CUDAPlace &place, size_t max_memory_size) + : GarbageCollector(place, max_memory_size) {} + +void DefaultStreamGarbageCollector::Wait() const { + static_cast(this->dev_ctx_) + ->WaitStreamCallback(); +} + +void DefaultStreamGarbageCollector::ClearCallback( + const std::function &callback) { + static_cast(this->dev_ctx_) + ->AddStreamCallback(callback); +} + +StreamGarbageCollector::StreamGarbageCollector(const platform::CUDAPlace &place, + size_t max_memory_size) + : GarbageCollector(place, max_memory_size) { + platform::CUDADeviceGuard guard(place.device); + PADDLE_ENFORCE(cudaStreamCreate(&stream_)); + callback_manager_.reset(new platform::StreamCallbackManager(stream_)); +} + +StreamGarbageCollector::~StreamGarbageCollector() { + auto place = boost::get(this->dev_ctx_->GetPlace()); + platform::CUDADeviceGuard guard(place.device); + PADDLE_ENFORCE(cudaStreamSynchronize(stream_)); + PADDLE_ENFORCE(cudaStreamDestroy(stream_)); +} + +cudaStream_t StreamGarbageCollector::stream() const { return stream_; } + +void StreamGarbageCollector::Wait() const { callback_manager_->Wait(); } + +void StreamGarbageCollector::ClearCallback( + const std::function &callback) { + callback_manager_->AddCallback(callback); +} +#endif +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/garbage_collector.h b/paddle/fluid/framework/garbage_collector.h index 818b3334ea4171fd7a9cbaa896ee1672e8ecca51..2768671029c06562aa0d2e5eea3d3ff61d900ab5 100644 --- a/paddle/fluid/framework/garbage_collector.h +++ b/paddle/fluid/framework/garbage_collector.h @@ -14,7 +14,6 @@ #pragma once -#include #include #include #include @@ -24,134 +23,74 @@ namespace paddle { namespace framework { -// T should have memory_size() and clear() method -template class GarbageCollector { public: - GarbageCollector(const platform::Place &place, size_t max_memory_size) - : max_memory_size_((std::max)(max_memory_size, static_cast(1))) { - garbages_.reset(new std::deque()); - dev_ctx_ = platform::DeviceContextPool::Instance().Get(place); - } + using GarbageQueue = std::deque>; - virtual ~GarbageCollector() {} + GarbageCollector(const platform::Place &place, size_t max_memory_size); - void Reset() { - std::lock_guard guard(mutex_); - garbages_.reset(new std::deque()); - cur_memory_size_ = 0; - } + virtual ~GarbageCollector() = default; + + virtual void Wait() const {} template - void Add(const Container &objs) { - Add(objs, []() {}); - } + 
void Add(Container &&objs); template - void Add(const Container &objs, Callback &&callback) { - std::shared_ptr> clear_deque; - { - std::lock_guard guard(mutex_); - for (auto *obj : objs) { - garbages_->push_back(obj); - cur_memory_size_ += obj->memory_size(); - } - if (cur_memory_size_ >= max_memory_size_) { - cur_memory_size_ = 0; - clear_deque = garbages_; - garbages_.reset(new std::deque()); - } - } - - if (clear_deque != nullptr) { - callback(); - ClearCallback([=]() { - for (auto *obj : *clear_deque) obj->clear(); - }); - } - } - - virtual void Wait() const {} + void Add(Container &&objs, Callback &&callback); protected: virtual void ClearCallback(const std::function &callback) = 0; platform::DeviceContext *dev_ctx_; - std::shared_ptr> garbages_; + std::unique_ptr garbages_; mutable std::mutex mutex_; const size_t max_memory_size_; - size_t cur_memory_size_ = 0; + size_t cur_memory_size_{0}; }; -template -class CPUGarbageCollector : public GarbageCollector { +class CPUGarbageCollector : public GarbageCollector { public: - CPUGarbageCollector(const platform::CPUPlace &place, size_t max_memory_size) - : GarbageCollector(place, max_memory_size) {} + CPUGarbageCollector(const platform::CPUPlace &place, size_t max_memory_size); protected: - void ClearCallback(const std::function &callback) override { - callback(); - } + void ClearCallback(const std::function &callback) override; }; #ifdef PADDLE_WITH_CUDA -template -class DefaultStreamGarbageCollector : public GarbageCollector { +class UnsafeFastGPUGarbageCollector : public GarbageCollector { public: - DefaultStreamGarbageCollector(const platform::CUDAPlace &place, - size_t max_memory_size) - : GarbageCollector(place, max_memory_size) {} + UnsafeFastGPUGarbageCollector(const platform::CUDAPlace &place, + size_t max_memory_size); - cudaStream_t stream() const { - return static_cast(this->dev_ctx_) - ->stream(); - } + protected: + void ClearCallback(const std::function &callback) override; +}; - void Wait() const override { - this->dev_ctx_->Wait(); - static_cast(this->dev_ctx_) - ->WaitStreamCallback(); - } +class DefaultStreamGarbageCollector : public GarbageCollector { + public: + DefaultStreamGarbageCollector(const platform::CUDAPlace &place, + size_t max_memory_size); + + void Wait() const override; protected: - void ClearCallback(const std::function &callback) override { - static_cast(this->dev_ctx_) - ->AddStreamCallback(callback); - } + void ClearCallback(const std::function &callback) override; }; -template -class StreamGarbageCollector : public GarbageCollector { +class StreamGarbageCollector : public GarbageCollector { public: StreamGarbageCollector(const platform::CUDAPlace &place, - size_t max_memory_size) - : GarbageCollector(place, max_memory_size) { - PADDLE_ENFORCE(cudaSetDevice(place.device)); - PADDLE_ENFORCE(cudaStreamCreate(&stream_)); - callback_manager_.reset(new platform::StreamCallbackManager(stream_)); - } + size_t max_memory_size); - ~StreamGarbageCollector() { - auto place = boost::get(this->dev_ctx_->GetPlace()); - PADDLE_ENFORCE(cudaSetDevice(place.device)); - PADDLE_ENFORCE(cudaStreamSynchronize(stream_)); - PADDLE_ENFORCE(cudaStreamDestroy(stream_)); - } + ~StreamGarbageCollector(); - void Wait() const override { - PADDLE_ENFORCE(cudaStreamSynchronize(stream_)); - std::lock_guard guard(this->mutex_); - callback_manager_->Wait(); - } + void Wait() const override; - cudaStream_t stream() const { return stream_; } + cudaStream_t stream() const; protected: - void ClearCallback(const std::function &callback) 
override { - std::lock_guard guard(this->mutex_); - callback_manager_->AddCallback(callback); - } + void ClearCallback(const std::function &callback) override; private: cudaStream_t stream_; @@ -159,5 +98,33 @@ class StreamGarbageCollector : public GarbageCollector { }; #endif +template +void GarbageCollector::Add(Container &&objs) { + Add(std::forward(objs), []() {}); +} + +template +void GarbageCollector::Add(Container &&objs, Callback &&callback) { + GarbageQueue *garbage_queue = nullptr; + { + std::lock_guard guard(mutex_); + for (auto &obj : objs) { + if (!obj) continue; + cur_memory_size_ += obj->size(); + garbages_->push_back(std::move(obj)); + } + if (cur_memory_size_ >= max_memory_size_) { + cur_memory_size_ = 0; + garbage_queue = garbages_.release(); + garbages_.reset(new GarbageQueue()); + } + } + + if (garbage_queue) { + callback(); + ClearCallback([garbage_queue]() { delete garbage_queue; }); + } +} + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index 883575e41db2d883e9b969978419a10ffc58b97e..be4151b54b6087c10989c093bf007ccf3006bd65 100644 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -42,6 +42,8 @@ pass_library(multi_batch_merge_pass base) pass_library(conv_bn_fuse_pass inference) pass_library(seqconv_eltadd_relu_fuse_pass inference) pass_library(is_test_pass base) +pass_library(conv_elementwise_add_act_fuse_pass inference) +pass_library(conv_elementwise_add2_act_fuse_pass inference) if(WITH_MKLDNN) pass_library(mkldnn_placement_pass base) pass_library(depthwise_conv_mkldnn_pass base) diff --git a/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse.cc b/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse.cc new file mode 100644 index 0000000000000000000000000000000000000000..6e9905b7ecdba653bb4d8a4aa82234ffba5a9528 --- /dev/null +++ b/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse.cc @@ -0,0 +1,106 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include "paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.h" + +namespace paddle { +namespace framework { +namespace ir { + +#define GET_IR_NODE(node__) GET_IR_NODE_FROM_SUBGRAPH(node__, node__, pattern); +#define GET_NODES \ + GET_IR_NODE(conv_op); \ + GET_IR_NODE(conv_out); \ + GET_IR_NODE(conv_filter); \ + GET_IR_NODE(elementwise_add_op); \ + GET_IR_NODE(elementwise_add_in_y); \ + GET_IR_NODE(elementwise_add_out); \ + GET_IR_NODE(elementwise_add_op_1); \ + GET_IR_NODE(elementwise_add_in_y_1); \ + GET_IR_NODE(elementwise_add_out_1); \ + GET_IR_NODE(act_op); \ + GET_IR_NODE(act_out); + +// Inherient the basic infomation from `base_desc`, and modify some fields. 
+framework::proto::OpDesc PrepareOpDesc( + const framework::proto::OpDesc& base_desc, const std::string& bias, + const std::string& bias1, const std::string& activation, + const std::string& output) { + auto proto = base_desc; + framework::OpDesc desc(proto, nullptr); + desc.SetInput("Bias", {bias}); + desc.SetInput("ResidualData", {bias1}); + desc.SetAttr("activation", activation); + desc.SetOutput("Output", {output}); + desc.SetAttr("is_test", true); + desc.SetAttr("use_cudnn", false); + + return *desc.Proto(); +} + +std::unique_ptr ConvElementwiseAddActFusePass::ApplyImpl( + std::unique_ptr graph) const { + const std::string pattern_name = "conv_elementwise_add_act_fuse"; + FusePassBase::Init(pattern_name, graph.get()); + + GraphPatternDetector gpd; + auto* x = gpd.mutable_pattern()->NewNode("x")->AsInput()->assert_is_op_input( + "conv2d", "Input"); + + patterns::ConvElementwiseaddAct pattern(gpd.mutable_pattern(), pattern_name); + pattern(x); + + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + GET_NODES; + + auto base_op_desc = *conv_op->Op()->Proto(); + std::string bias_name = elementwise_add_in_y->Name(); + std::string bias1_name = elementwise_add_in_y_1->Name(); + std::string act_op_type = act_op->Op()->Type(); + std::string act_op_out = act_out->Name(); + + auto new_op_proto = PrepareOpDesc(base_op_desc, bias_name, bias1_name, + act_op_type, act_op_out); + framework::OpDesc new_op_desc(new_op_proto, nullptr); + + // Create a new node for the fused op. + auto new_conv_op = graph->CreateOpNode(&new_op_desc); + + // Link inputs and outputs. + PADDLE_ENFORCE(subgraph.count(x)); + auto* conv_in_node = subgraph.at(x); + + IR_NODE_LINK_TO(conv_in_node, new_conv_op); // Input + IR_NODE_LINK_TO(conv_filter, new_conv_op); // Filter + IR_NODE_LINK_TO(elementwise_add_in_y, new_conv_op); // Bias + IR_NODE_LINK_TO(elementwise_add_in_y_1, new_conv_op); // ResidualData + IR_NODE_LINK_TO(new_conv_op, act_out); // Output + + // Delete the unneeded nodes. + GraphSafeRemoveNodes(graph.get(), + {conv_op, elementwise_add_op, elementwise_add_op_1, + elementwise_add_out}); + }; + gpd(graph.get(), handler); + return graph; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(conv_elementwise_add2_act_fuse_pass, + paddle::framework::ir::ConvElementwiseAdd2ActFusePass); diff --git a/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc b/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..23f343f631628b432d91d4504019895ed4bac4a5 --- /dev/null +++ b/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc @@ -0,0 +1,105 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
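What these fuse passes buy is that the bias add, the residual add, and the activation are folded into a single fused conv op, so the intermediate elementwise_add outputs never materialize. Ignoring the convolution itself and channel broadcasting, the fused computation per element is simply act(conv_out + bias + residual); the following small function is purely illustrative, not Paddle's kernel:

```
#include <algorithm>
#include <iostream>
#include <vector>

// One pass over the data instead of three: no intermediate elementwise_add
// output tensors are written. `conv_out` stands for the convolution result.
std::vector<float> FusedBiasResidualRelu(const std::vector<float>& conv_out,
                                         const std::vector<float>& bias,
                                         const std::vector<float>& residual) {
  std::vector<float> out(conv_out.size());
  for (size_t i = 0; i < conv_out.size(); ++i) {
    out[i] = std::max(0.0f, conv_out[i] + bias[i] + residual[i]);
  }
  return out;
}

int main() {
  std::vector<float> conv_out = {-1.0f, 0.5f, 2.0f};
  std::vector<float> bias = {0.1f, 0.1f, 0.1f};
  std::vector<float> residual = {0.0f, -1.0f, 1.0f};
  for (float v : FusedBiasResidualRelu(conv_out, bias, residual)) {
    std::cout << v << " ";
  }
  std::cout << "\n";  // prints: 0 0 3.1
  return 0;
}
```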
+ +#include "paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.h" +#include + +namespace paddle { +namespace framework { +namespace ir { + +#define GET_IR_NODE(node__) GET_IR_NODE_FROM_SUBGRAPH(node__, node__, pattern); +#define GET_NODES \ + GET_IR_NODE(conv_op); \ + GET_IR_NODE(conv_out); \ + GET_IR_NODE(conv_filter); \ + GET_IR_NODE(elementwise_add_op); \ + GET_IR_NODE(elementwise_add_in_y); \ + GET_IR_NODE(elementwise_add_out); \ + GET_IR_NODE(elementwise_add_op_1); \ + GET_IR_NODE(elementwise_add_in_y_1); \ + GET_IR_NODE(elementwise_add_out_1); \ + GET_IR_NODE(act_op); \ + GET_IR_NODE(act_out); + +// Inherient the basic infomation from `base_desc`, and modify some fields. +framework::proto::OpDesc PrepareOpDesc( + const framework::proto::OpDesc& base_desc, const std::string& bias, + const std::string& bias1, const std::string& activation, + const std::string& output) { + auto proto = base_desc; + framework::OpDesc desc(proto, nullptr); + desc.SetInput("Bias", {bias}); + desc.SetInput("ResidualData", {bias1}); + desc.SetAttr("activation", activation); + desc.SetOutput("Output", {output}); + desc.SetAttr("is_test", true); + + return *desc.Proto(); +} + +std::unique_ptr ConvElementwiseAdd2ActFusePass::ApplyImpl( + std::unique_ptr graph) const { + const std::string pattern_name = "conv_elementwise_add_act_fuse"; + FusePassBase::Init(pattern_name, graph.get()); + + GraphPatternDetector gpd; + auto* x = gpd.mutable_pattern()->NewNode("x")->AsInput()->assert_is_op_input( + "conv2d", "Input"); + + patterns::ConvElementwiseadd2Act pattern(gpd.mutable_pattern(), pattern_name); + pattern(x); + + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + GET_NODES; + + auto base_op_desc = *conv_op->Op()->Proto(); + std::string bias_name = elementwise_add_in_y->Name(); + std::string bias1_name = elementwise_add_in_y_1->Name(); + std::string act_op_type = act_op->Op()->Type(); + std::string act_op_out = act_out->Name(); + + auto new_op_proto = PrepareOpDesc(base_op_desc, bias_name, bias1_name, + act_op_type, act_op_out); + framework::OpDesc new_op_desc(new_op_proto, nullptr); + + // Create a new node for the fused op. + graph->CreateOpNode(&new_op_desc); + + // Link inputs and outputs. + PADDLE_ENFORCE(subgraph.count(x)); + auto* conv_in_node = subgraph.at(x); + + IR_NODE_LINK_TO(conv_in_node, conv_op); // Input + IR_NODE_LINK_TO(conv_filter, conv_op); // Filter + IR_NODE_LINK_TO(conv_op, conv_out); // Output + IR_NODE_LINK_TO(elementwise_add_in_y, conv_op); // Bias + IR_NODE_LINK_TO(elementwise_add_in_y_1, conv_op); // Bias + + // Delete the unneeded nodes. + GraphSafeRemoveNodes(graph.get(), + {conv_op, elementwise_add_op, elementwise_add_op_1, + elementwise_add_out}); + }; + gpd(graph.get(), handler); + return graph; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(conv_elementwise_add2_act_fuse_pass, + paddle::framework::ir::ConvElementwiseAdd2ActFusePass); diff --git a/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.h b/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..3b40a5a92665c07bc2b66e6a96721f573d40393f --- /dev/null +++ b/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.h @@ -0,0 +1,33 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" + +namespace paddle { +namespace framework { +namespace ir { + +class ConvElementwiseAdd2ActFusePass : public FusePassBase { + public: + virtual ~ConvElementwiseAdd2ActFusePass() {} + + protected: + std::unique_ptr ApplyImpl(std::unique_ptr graph) const; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.cc b/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..fe3b4fca79f372d570634a3c182a9ec3cf5522e1 --- /dev/null +++ b/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.cc @@ -0,0 +1,104 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.h" +#include +#include "paddle/fluid/framework/ir/graph_viz_pass.h" + +namespace paddle { +namespace framework { +namespace ir { + +#define GET_IR_NODE(node__) GET_IR_NODE_FROM_SUBGRAPH(node__, node__, pattern); +#define GET_NODES \ + GET_IR_NODE(conv_op); \ + GET_IR_NODE(conv_out); \ + GET_IR_NODE(conv_filter); \ + GET_IR_NODE(elementwise_add_op); \ + GET_IR_NODE(elementwise_add_in_y); \ + GET_IR_NODE(elementwise_add_out); \ + GET_IR_NODE(act_op); \ + GET_IR_NODE(act_out); + +// Inherient the basic infomation from `base_desc`, and modify some fields. 
+framework::proto::OpDesc PrepareOpDesc( + const framework::proto::OpDesc& base_desc, const std::string& bias, + const std::string& activation, const std::string& output) { + auto proto = base_desc; + framework::OpDesc desc(proto, nullptr); + desc.SetType("conv2d_fusion"); + desc.SetInput("Bias", {bias}); + desc.SetInput("ResidualData", {}); + desc.SetAttr("activation", activation); + desc.SetOutput("Output", {output}); + desc.SetAttr("is_test", true); + desc.SetAttr("use_cudnn", false); + desc.Flush(); + return *desc.Proto(); +} + +std::unique_ptr ConvElementwiseAddActFusePass::ApplyImpl( + std::unique_ptr graph) const { + const std::string pattern_name = "conv_elementwise_add_act_fuse"; + FusePassBase::Init(pattern_name, graph.get()); + + GraphPatternDetector gpd; + auto* x = gpd.mutable_pattern() + ->NewNode("x") + ->assert_is_op_input("conv2d", "Input") + ->AsInput(); + + patterns::ConvElementwiseaddAct pattern(gpd.mutable_pattern(), pattern_name); + pattern(x); + + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + GET_NODES; + + auto base_op_desc = *conv_op->Op()->Proto(); + std::string bias_name = elementwise_add_in_y->Name(); + std::string act_op_type = act_op->Op()->Type(); + std::string act_op_out = act_out->Name(); + + auto new_op_proto = + PrepareOpDesc(base_op_desc, bias_name, act_op_type, act_op_out); + framework::OpDesc new_op_desc(new_op_proto, nullptr); + + // Create a new node for the fused op. + auto* new_conv_op = graph->CreateOpNode(&new_op_desc); + + // Link inputs and outputs. + PADDLE_ENFORCE(subgraph.count(x)); + auto* conv_in_node = subgraph.at(x); + + IR_NODE_LINK_TO(conv_in_node, new_conv_op); // Input + IR_NODE_LINK_TO(conv_filter, new_conv_op); // Filter + IR_NODE_LINK_TO(elementwise_add_in_y, new_conv_op); // Bias + IR_NODE_LINK_TO(new_conv_op, act_out); // Output + + // Delete the unneeded nodes. + GraphSafeRemoveNodes(graph.get(), {conv_op, conv_out, elementwise_add_op, + elementwise_add_out, act_op}); + }; + + gpd(graph.get(), handler); + return graph; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(conv_elementwise_add_act_fuse_pass, + paddle::framework::ir::ConvElementwiseAddActFusePass); diff --git a/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.h b/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..ac69aa6458fc8c19b670dea2af1251c44dc353a8 --- /dev/null +++ b/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.h @@ -0,0 +1,33 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
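Both fuse passes follow the usual GraphPatternDetector recipe: declare PDNodes with assertions, run the detector, and let the handler splice in the fused op and remove the matched nodes. Reduced to a toy chain of op names (no real ir::Graph involved; the function below is invented for the sketch), the detect-and-rewrite shape looks like:

```
#include <iostream>
#include <string>
#include <vector>

// A toy linear "graph": a chain of op types. The real pass works on a full
// ir::Graph; this only shows the detect-then-rewrite structure of such passes.
std::vector<std::string> FuseAddRelu(std::vector<std::string> ops) {
  std::vector<std::string> fused;
  for (size_t i = 0; i < ops.size(); ++i) {
    // Pattern: elementwise_add immediately followed by relu.
    if (i + 1 < ops.size() && ops[i] == "elementwise_add" &&
        ops[i + 1] == "relu") {
      fused.push_back("add_relu_fused");  // handler: emit the fused op
      ++i;                                // skip the consumed relu
    } else {
      fused.push_back(ops[i]);
    }
  }
  return fused;
}

int main() {
  std::vector<std::string> ops = {"conv2d", "elementwise_add", "relu",
                                  "softmax"};
  for (const auto& op : FuseAddRelu(ops)) std::cout << op << " ";
  std::cout << "\n";  // conv2d add_relu_fused softmax
  return 0;
}
```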
+ +#pragma once +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" + +namespace paddle { +namespace framework { +namespace ir { + +class ConvElementwiseAddActFusePass : public FusePassBase { + public: + virtual ~ConvElementwiseAddActFusePass() {} + + protected: + std::unique_ptr ApplyImpl(std::unique_ptr graph) const; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/graph.h b/paddle/fluid/framework/ir/graph.h index bb2d953afbd64dc33d735f483a8eafd9ce951530..47fcf96a3f92b1f915e5254fff36feb8b2870730 100644 --- a/paddle/fluid/framework/ir/graph.h +++ b/paddle/fluid/framework/ir/graph.h @@ -73,14 +73,21 @@ class Graph { } bool Has(const std::string &attr_name) const { - return attrs_.find(attr_name) != attrs_.end(); + return attrs_.count(attr_name) > 0; } template AttrType &Get(const std::string &attr_name) const { PADDLE_ENFORCE(Has(attr_name), "%s attr not registered for graph.", attr_name); - return *boost::any_cast(attrs_.at(attr_name)); + try { + return *boost::any_cast(attrs_.at(attr_name)); + } catch (boost::bad_any_cast &) { + PADDLE_THROW( + "Invalid attribute type of %s error, expected: %s, actual: %s", + attr_name, typeid(AttrType *).name(), + attrs_.at(attr_name).type().name()); + } } template diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index 0118019df2f779a6409365555b530ae3b6d3971f..bf12d12459c59304167d9c52059a068b50de3980 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -17,6 +17,7 @@ #include #include +#include "graph_pattern_detector.h" #include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/framework/ir/graph_traits.h" @@ -25,6 +26,7 @@ #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/string/pretty_log.h" #include "paddle/fluid/string/printf.h" + namespace paddle { namespace framework { namespace ir { @@ -104,7 +106,7 @@ bool GraphPatternDetector::MarkPDNodesInGraph(const ir::Graph &graph) { for (auto &node : GraphTraits::DFS(graph)) { for (const auto &pdnode : pattern_.nodes()) { if (pdnode->Tell(&node)) { - VLOG(4) << "pdnode " << pdnode->name() << " marked"; + VLOG(4) << "Node " << node.Name() << " marked as " << pdnode->name(); pdnodes2nodes_[pdnode.get()].insert(&node); } } @@ -1099,6 +1101,115 @@ PDNode *patterns::ElementwiseAdd::operator()(PDNode *x_var, PDNode *y_var) { return out_var; } + +std::unordered_set conv_act_set({"identity", "sigmoid", "relu", + "relu6", "relux", "tanh", + "band_pass"}); + +PDNode *patterns::ConvElementwiseaddAct::operator()(PDNode *conv_in) { + conv_in->AsInput(); + auto conv_op = pattern->NewNode(conv_op_repr())->assert_is_op("conv2d"); + auto conv_out = pattern->NewNode(conv_out_repr()) + ->assert_is_op_output("conv2d") + ->assert_is_op_input("elementwise_add", "X") + ->AsIntermediate(); + auto conv_filter = pattern->NewNode(conv_filter_repr()) + ->assert_is_op_input("conv2d", "Filter") + ->AsInput(); + auto elementwise_add_op = pattern->NewNode(elementwise_add_op_repr()) + ->assert_is_op("elementwise_add"); + auto elementwise_add_in_y = pattern->NewNode(elementwise_add_in_y_repr()) + ->assert_is_op_input("elementwise_add", "Y") + ->AsInput(); + auto elementwise_add_out = pattern->NewNode(elementwise_add_out_repr()) + ->assert_is_op_output("elementwise_add") + 
->AsIntermediate(); + + auto act_op = pattern->NewNode(act_op_repr()) + ->assert_is_op() + ->assert_more([&](Node *node) { + auto op_type = node->Name(); + return conv_act_set.count(op_type); + }); + + auto act_out = pattern->NewNode(act_out_repr()) + ->assert_is_var() + // is activation op's output. + ->assert_more([&](Node *node) { + for (auto *in_op : node->inputs) { + if (conv_act_set.count(in_op->Name())) { + return true; + } + } + return false; + }) + ->AsOutput(); + + conv_op->LinksFrom({conv_in, conv_filter}); + conv_out->LinksFrom({conv_op}); + elementwise_add_op->LinksFrom({conv_out, elementwise_add_in_y}) + .LinksTo({elementwise_add_out}); + act_op->LinksFrom({elementwise_add_out}).LinksTo({act_out}); + + return act_out; +} + +PDNode *patterns::ConvElementwiseadd2Act::operator()(PDNode *conv_in) { + auto conv_op = pattern->NewNode(conv_op_repr())->assert_is_op("conv2d"); + auto conv_filter = pattern->NewNode(conv_filter_repr()) + ->assert_is_op_input("conv2d", "Filter") + ->AsInput(); + auto conv_out = pattern->NewNode(conv_out_repr()) + ->assert_is_op_output("conv2d") + ->assert_is_op_input("elementwise_add", "X") + ->AsIntermediate(); + auto elementwise_add_op = pattern->NewNode(elementwise_add_op_repr()) + ->assert_is_op("elementwise_add"); + auto elementwise_add_in_y = pattern->NewNode(elementwise_add_in_y_repr()) + ->assert_is_op_input("elementwise_add", "Y") + ->AsInput(); + auto elementwise_add_out = pattern->NewNode(elementwise_add_out_repr()) + ->assert_is_op_output("elementwise_add") + ->assert_is_op_input("elementwise_add", "X") + ->AsIntermediate(); + + auto elementwise_add_op_1 = pattern->NewNode(elementwise_add_op_1_repr()) + ->assert_is_op("elementwise_add"); + auto elementwise_add_in_y_1 = pattern->NewNode(elementwise_add_in_y_1_repr()) + ->assert_is_op_input("elementwise_add", "Y") + ->AsInput(); + auto elementwise_add_out_1 = pattern->NewNode(elementwise_add_out_1_repr()) + ->assert_is_op_output("elementwise_add") + ->AsIntermediate(); + + auto act_op = pattern->NewNode(act_op_repr()) + ->assert_is_op() + ->assert_more([&](Node *node) { + auto op_type = node->Name(); + return conv_act_set.count(op_type); + }); + auto act_out = pattern->NewNode(act_out_repr()) + ->assert_is_var() + // is activation op's output. + ->assert_more([&](Node *node) { + for (auto *in_op : node->inputs) { + if (conv_act_set.count(in_op->Name())) { + return true; + } + } + return false; + }) + ->AsOutput(); + + conv_op->LinksFrom({conv_in, conv_filter}).LinksTo({conv_out}); + elementwise_add_op->LinksFrom({conv_out, elementwise_add_in_y}) + .LinksTo({elementwise_add_out}); + elementwise_add_op_1->LinksFrom( + {elementwise_add_out, elementwise_add_in_y_1}); + act_op->LinksFrom({elementwise_add_out_1}).LinksTo({act_out}); + return act_out; +} + } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h index d044802f22d02372e0ddb72c6fd702aebf2f82c3..0fee2f1c1852b296b3599d4b7219a032062a1d49 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.h +++ b/paddle/fluid/framework/ir/graph_pattern_detector.h @@ -671,6 +671,51 @@ struct ElementwiseAdd : public PatternBase { PATTERN_DECL_NODE(elementwise_add_y); PATTERN_DECL_NODE(elementwise_add_out); }; + +// Conv + ElementwiseAdd + an activation +// This pattern can futher fuse the conv related ops after the conv+bn fusion. 
+struct ConvElementwiseaddAct : public PatternBase { + ConvElementwiseaddAct(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "conv_elementwiseadd_act") {} + + PDNode* operator()(PDNode* conv_in); + + PATTERN_DECL_NODE(conv_op); + PATTERN_DECL_NODE(conv_out); + PATTERN_DECL_NODE(conv_filter); + + PATTERN_DECL_NODE(elementwise_add_op); + PATTERN_DECL_NODE(elementwise_add_in_y); // input + PATTERN_DECL_NODE(elementwise_add_out); + + PATTERN_DECL_NODE(act_op); + PATTERN_DECL_NODE(act_out); +}; + +// Conv + ElementwiseAdd + ElementwiseAdd + Activation +struct ConvElementwiseadd2Act : public PatternBase { + ConvElementwiseadd2Act(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, + "conv_elementwiseadd2_elementwiseadd_act") {} + + PDNode* operator()(PDNode* conv_in); + + PATTERN_DECL_NODE(conv_op); + PATTERN_DECL_NODE(conv_filter); + PATTERN_DECL_NODE(conv_out); + + PATTERN_DECL_NODE(elementwise_add_op); + PATTERN_DECL_NODE(elementwise_add_in_y); // input + PATTERN_DECL_NODE(elementwise_add_out); + + PATTERN_DECL_NODE(elementwise_add_op_1); + PATTERN_DECL_NODE(elementwise_add_in_y_1); // input + PATTERN_DECL_NODE(elementwise_add_out_1); + + PATTERN_DECL_NODE(act_op); + PATTERN_DECL_NODE(act_out); +}; + } // namespace patterns // Link two ir::Nodes from each other. diff --git a/paddle/fluid/framework/ir/pass.h b/paddle/fluid/framework/ir/pass.h index a3559247db6703d486ed01ce9f2058e671443096..27746ff1453b1b336da8c31497c066c338843b68 100644 --- a/paddle/fluid/framework/ir/pass.h +++ b/paddle/fluid/framework/ir/pass.h @@ -51,11 +51,18 @@ class Pass { AttrType &Get(const std::string &attr_name) const { PADDLE_ENFORCE(attrs_.find(attr_name) != attrs_.end(), "%s attr not registered for pass.", attr_name); - return *boost::any_cast(attrs_.at(attr_name)); + try { + return *boost::any_cast(attrs_.at(attr_name)); + } catch (boost::bad_any_cast &) { + PADDLE_THROW( + "Invalid attribute type of %s error, expected: %s, actual: %s", + attr_name, typeid(AttrType *).name(), + attrs_.at(attr_name).type().name()); + } } bool Has(const std::string &attr_name) const { - return attrs_.find(attr_name) != attrs_.end(); + return attrs_.count(attr_name) > 0; } void Erase(const std::string &attr_name) { diff --git a/paddle/fluid/framework/op_kernel_type_test.cc b/paddle/fluid/framework/op_kernel_type_test.cc index 3e17a512ce154de88ac890f3b29f03385595d95c..40db85400d2c8776b82ce0fa2fb4deed993b0255 100644 --- a/paddle/fluid/framework/op_kernel_type_test.cc +++ b/paddle/fluid/framework/op_kernel_type_test.cc @@ -34,7 +34,8 @@ TEST(OpKernelType, ToString) { OpKernelType op_kernel_type2(DataType::FP16, CUDAPlace(0), DataLayout::kNCHW, LibraryType::kCUDNN); ASSERT_EQ(paddle::framework::KernelTypeToString(op_kernel_type2), - "data_type[float16]:data_layout[NCHW]:place[CUDAPlace(0)]:library_" + "data_type[::paddle::platform::float16]:data_layout[NCHW]:place[" + "CUDAPlace(0)]:library_" "type[CUDNN]"); } diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 05ab48412a8af4032580e31d07b376d42065c508..a62afe248baa2ab57edabfaac05bb858a1a1280c 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -879,6 +879,8 @@ proto::VarType::Type OperatorWithKernel::IndicateDataType( t = &(var->Get().value()); } if (t != nullptr) { + PADDLE_ENFORCE(t->IsInitialized(), "Input %s is not initialized: %s", + ipt_name, DebugString()); int tmp = static_cast(t->type()); PADDLE_ENFORCE( tmp == data_type || 
data_type == -1, diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 28a4b14b27bda65c0ff8e3e73df1fab62439af2f..eb4baa06b5284512eab128e57f893bad43afda97 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -26,6 +26,7 @@ limitations under the License. */ #include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h" #include "paddle/fluid/framework/details/multi_devices_helper.h" +#include "paddle/fluid/framework/details/reference_count_pass_helper.h" #include "paddle/fluid/framework/details/scope_buffered_ssa_graph_executor.h" #include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h" #include "paddle/fluid/platform/profiler.h" @@ -72,6 +73,26 @@ class ParallelExecutorPrivate { } } } + + std::unique_ptr PrepareGCAndRefCnts( + std::unique_ptr graph, size_t max_memory_size); + + inline bool HasGarbageCollectors() const { return !gcs_.empty(); } + + void ResetRuntimeReferenceCount(const std::vector &fetch_tensors, + const std::string &fetched_var_name) { + for (size_t i = 0; i < runtime_ref_cnts_.size(); ++i) { + for (auto &pair : global_ref_cnts_[i]) { + runtime_ref_cnts_[i][pair.first] = pair.second; + } + + for (auto &fetch_name : fetch_tensors) { + runtime_ref_cnts_[i].erase(fetch_name); + } + runtime_ref_cnts_[i].erase(fetched_var_name); + } + } + std::vector places_; std::vector local_scopes_; Scope *global_scope_; // not owned @@ -83,8 +104,76 @@ class ParallelExecutorPrivate { bool own_local_scope_; bool use_cuda_; bool use_all_reduce_; + + // global_ref_cnts_ is only initialized when ParallelExecutor constructs, and + // then keeps unchanged + // Before each iteration, runtime_ref_cnts_ is reset to global_ref_cnts_ + std::vector global_ref_cnts_; + std::vector runtime_ref_cnts_; + details::GarbageCollectorMap gcs_; }; +std::unique_ptr ParallelExecutorPrivate::PrepareGCAndRefCnts( + std::unique_ptr graph, size_t max_memory_size) { + for (size_t i = 0; i < places_.size(); ++i) { + auto &place = places_[i]; + if (gcs_.count(place) > 0) { + continue; + } + std::unique_ptr gc; +#ifdef PADDLE_WITH_CUDA + if (platform::is_gpu_place(place)) { + if (IsFastEagerDeletionModeEnabled()) { + gc.reset(new UnsafeFastGPUGarbageCollector( + boost::get(place), max_memory_size)); + } else { + gc.reset(new StreamGarbageCollector( + boost::get(place), max_memory_size)); + } + VLOG(10) << "Created " << i << "-th GarbageCollector at " << place; + } else { +#endif + if (platform::is_cpu_place(place)) { + gc.reset(new CPUGarbageCollector(boost::get(place), + max_memory_size)); + VLOG(10) << "Created GarbageCollector at " << place; + } else { + PADDLE_THROW("Unsupported place for garbage collection"); + } +#ifdef PADDLE_WITH_CUDA + } +#endif + + gcs_.emplace(place, std::move(gc)); + } + + if (!gcs_.empty()) { + std::vector last_live_ops_of_vars; + + auto ref_cnt_pass = + ir::PassRegistry::Instance().Get("reference_count_pass"); + ref_cnt_pass->SetNotOwned(details::kGlobalReferenceCount, + &global_ref_cnts_); + ref_cnt_pass->SetNotOwned(details::kLastLiveOpsOfVars, + &last_live_ops_of_vars); + graph = ref_cnt_pass->Apply(std::move(graph)); + VLOG(10) << "ReferenceCountPass Applied"; + + auto eager_deletion_pass = + ir::PassRegistry::Instance().Get("eager_deletion_pass"); + eager_deletion_pass->SetNotOwned(details::kRuntimeReferenceCount, + &runtime_ref_cnts_); + eager_deletion_pass->SetNotOwned(details::kGarbageCollector, &gcs_); + 
eager_deletion_pass->SetNotOwned(details::kLastLiveOpsOfVars, + &last_live_ops_of_vars); + eager_deletion_pass->SetNotOwned(details::kAllPlaces, &places_); + graph = eager_deletion_pass->Apply(std::move(graph)); + VLOG(10) << "EagerDeletionPass Applied"; + } + + return graph; +} + std::vector &ParallelExecutor::GetLocalScopes() { return member_->local_scopes_; } @@ -151,36 +240,18 @@ ParallelExecutor::ParallelExecutor( std::unique_ptr graph = build_strategy.Apply( main_program, member_->places_, loss_var_name, params, member_->local_scopes_, member_->use_cuda_, member_->nccl_ctxs_.get()); - - auto max_memory_size = GetEagerDeletionThreshold(); - if (max_memory_size >= 0) { - for (auto &place : member_->places_) { - if (!platform::is_gpu_place(place)) continue; - auto gpu_place = boost::get(place); - if (gcs_[gpu_place.device] == nullptr) { - ref_cnts_[gpu_place.device].reset(new details::ReferenceCountMap()); - cur_ref_cnts_[gpu_place.device].reset( - new details::AtomicReferenceCountMap()); - gcs_[gpu_place.device].reset( - new StreamGarbageCollector(gpu_place, max_memory_size)); - } - } - if (!gcs_.empty()) { - auto ref_cnt_pass = - ir::PassRegistry::Instance().Get("reference_count_pass"); - ref_cnt_pass->SetNotOwned(details::kGlobalReferenceCount, &ref_cnts_); - ref_cnt_pass->SetNotOwned(details::kCurReferenceCount, &cur_ref_cnts_); - ref_cnt_pass->SetNotOwned(details::kGarbageCollector, &gcs_); - graph = ref_cnt_pass->Apply(std::move(graph)); - graph->SetNotOwned("garbage_collector", &gcs_); - } - } #else std::unique_ptr graph = build_strategy.Apply(main_program, member_->places_, loss_var_name, params, member_->local_scopes_, member_->use_cuda_); #endif + auto max_memory_size = GetEagerDeletionThreshold(); + if (max_memory_size >= 0) { + graph = member_->PrepareGCAndRefCnts(std::move(graph), + static_cast(max_memory_size)); + } + // Step 3. Create vars in each scope. Passes may also create new vars. // skip control vars and empty vars std::vector var_infos; @@ -300,18 +371,9 @@ void ParallelExecutor::Run(const std::vector &fetch_tensors, #endif platform::RecordBlock b(0); -#ifdef PADDLE_WITH_CUDA - if (!gcs_.empty()) { - ResetReferenceCount(); - for (auto &pair : cur_ref_cnts_) { - auto &name_map = *(pair.second); - for (auto &fetch_name : fetch_tensors) { - name_map.erase(fetch_name); - } - name_map.erase(fetched_var_name); - } + if (member_->HasGarbageCollectors()) { + member_->ResetRuntimeReferenceCount(fetch_tensors, fetched_var_name); } -#endif auto fetch_data = member_->executor_->Run(fetch_tensors); *member_->global_scope_->Var(fetched_var_name)->GetMutable() = fetch_data; @@ -355,13 +417,11 @@ ParallelExecutor::~ParallelExecutor() { for (auto &p : member_->places_) { platform::DeviceContextPool::Instance().Get(p)->Wait(); } - // member_ must be destructed before gcs_ since the destructor of - // ReferenceCountOpHandle use raw pointers of gcs_ inside. - member_.reset(); + delete member_; } } // namespace framework } // namespace paddle -#ifdef PADDLE_WITH_CUDA + USE_PASS(reference_count_pass); -#endif +USE_PASS(eager_deletion_pass); diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h index ef09b98b2aa91a9d729b94d15dbb676dde4092b6..1fc17a0d64d50eb70ce66cacd4752a5b96d5e894 100644 --- a/paddle/fluid/framework/parallel_executor.h +++ b/paddle/fluid/framework/parallel_executor.h @@ -14,7 +14,6 @@ limitations under the License. */ #pragma once -#include #include #include #include @@ -29,10 +28,6 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/platform/device_context.h" -#ifdef PADDLE_WITH_CUDA -#include "paddle/fluid/framework/details/reference_count_pass.h" -#endif - namespace paddle { namespace framework { @@ -75,24 +70,7 @@ class ParallelExecutor { private: void BCastParamsToDevices(const std::unordered_set &vars) const; - std::unique_ptr member_; - -#ifdef PADDLE_WITH_CUDA - // ref_cnts_ is only initialized when ParallelExecutor constructs, and then - // keeps unchanged - // Before each iteration, cur_ref_cnts_ is reset to ref_cnts_ - details::DeviceReferenceCountMap ref_cnts_; - details::AtomicDeviceReferenceCountMap cur_ref_cnts_; - details::DeviceGarbageCollectorMap gcs_; - - void ResetReferenceCount() { - for (auto &pair1 : ref_cnts_) { - for (auto &pair2 : *(pair1.second)) { - (*(cur_ref_cnts_[pair1.first]))[pair2.first] = pair2.second; - } - } - } -#endif + ParallelExecutorPrivate *member_; }; } // namespace framework diff --git a/paddle/fluid/framework/scope.cc b/paddle/fluid/framework/scope.cc index 0d261dd7ccc323abddd2c3ef13f1874661a8ca75..6fa5e99f9f3a7e871f1a742a30803853988ea6eb 100644 --- a/paddle/fluid/framework/scope.cc +++ b/paddle/fluid/framework/scope.cc @@ -38,6 +38,10 @@ DEFINE_double( "Memory size threshold (GB) when the garbage collector clear tensors." "Disabled when this value is less than 0"); +DEFINE_bool(fast_eager_deletion_mode, false, + "Fast eager deletion mode. If enabled, memory would release " + "immediately without waiting GPU kernel ends."); + // When in inference scenario, the scopes will not be written by two threads in // a mean time, but a scope may be read by multiple threads concurrently, and // the mutex will cause serious performance issue. @@ -58,6 +62,8 @@ int64_t GetEagerDeletionThreshold() { (static_cast(1) << 30)); } +bool IsFastEagerDeletionModeEnabled() { return FLAGS_fast_eager_deletion_mode; } + Scope::~Scope() { DropKids(); } Scope& Scope::NewScope() const { diff --git a/paddle/fluid/framework/scope.h b/paddle/fluid/framework/scope.h index 1901ffbe57e0d85193c3a218f06eba06a0f287a5..aded1f771cedbf2442ad36d7fab3e6e6caffdc24 100644 --- a/paddle/fluid/framework/scope.h +++ b/paddle/fluid/framework/scope.h @@ -27,6 +27,7 @@ namespace paddle { namespace framework { int64_t GetEagerDeletionThreshold(); +bool IsFastEagerDeletionModeEnabled(); class Scope; diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h index 057fe1f98cddcb4ed6ace89cfbd2307c8625fa97..6a1cbe5cd567429c922156f8bce7ca710b15a0f5 100644 --- a/paddle/fluid/framework/tensor.h +++ b/paddle/fluid/framework/tensor.h @@ -14,15 +14,14 @@ limitations under the License. */ #pragma once -#include #include #include #include #include #include - #include "paddle/fluid/framework/data_layout.h" #include "paddle/fluid/framework/ddim.h" +#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/memory/memory.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/enforce.h" @@ -159,6 +158,10 @@ class Tensor { const std::shared_ptr& Holder() const { return holder_; } size_t offset() const { return offset_; } + std::shared_ptr MoveMemoryHolder() { + return std::move(holder_); + } + private: /*! holds the memory block if allocated. 
*/ std::shared_ptr holder_; diff --git a/paddle/fluid/inference/api/analysis_predictor_tester.cc b/paddle/fluid/inference/api/analysis_predictor_tester.cc index d67305670c91bb0814b8771332641e96974ade9d..a361b34437ade36dfba2c99db800a7d77ada8704 100644 --- a/paddle/fluid/inference/api/analysis_predictor_tester.cc +++ b/paddle/fluid/inference/api/analysis_predictor_tester.cc @@ -55,7 +55,12 @@ TEST(AnalysisPredictor, analysis_off) { } TEST(AnalysisPredictor, analysis_on) { - AnalysisConfig config(false); +#ifdef PADDLE_WITH_CUDA + AnalysisConfig config(true); + config.fraction_of_gpu_memory = 0.15; +#else + AnalysisConfig config; +#endif config.model_dir = FLAGS_dirname; config.enable_ir_optim = true; diff --git a/paddle/fluid/inference/api/api_impl_tester.cc b/paddle/fluid/inference/api/api_impl_tester.cc index 191225493c37903c5063ba2680d8c49e94996441..78396397397c3125c3990073d6b2887ebb477ff2 100644 --- a/paddle/fluid/inference/api/api_impl_tester.cc +++ b/paddle/fluid/inference/api/api_impl_tester.cc @@ -39,7 +39,7 @@ PaddleTensor LodTensorToPaddleTensor(framework::LoDTensor* t) { if (t->type() == framework::proto::VarType::INT64) { pt.data.Reset(t->data(), t->numel() * sizeof(int64_t)); pt.dtype = PaddleDType::INT64; - } else if (t->type() == framework::proto::VarType::INT32) { + } else if (t->type() == framework::proto::VarType::FP32) { pt.data.Reset(t->data(), t->numel() * sizeof(float)); pt.dtype = PaddleDType::FLOAT32; } else { diff --git a/paddle/fluid/inference/api/paddle_pass_builder.h b/paddle/fluid/inference/api/paddle_pass_builder.h index bc5139a7e54eaf7133ea96ae3b36915a236a2c5e..e6e7de24783b160769e0c9f43d8f0700a035c314 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.h +++ b/paddle/fluid/inference/api/paddle_pass_builder.h @@ -118,7 +118,10 @@ class GpuPassStrategy : public PassStrategy { public: GpuPassStrategy() : PassStrategy({}) { passes_.assign({ - "infer_clean_graph_pass", "conv_bn_fuse_pass", + "infer_clean_graph_pass", // + "conv_bn_fuse_pass", // + "conv_elementwise_add_act_fuse_pass", // + "conv_elementwise_add2_act_fuse_pass", // }); } diff --git a/paddle/fluid/inference/io.cc b/paddle/fluid/inference/io.cc index 24d15f12f9cd4a9280cd316bd727fdbccb831b9b..ae72a74acce826c3635d5d537540eaad79ff8199 100644 --- a/paddle/fluid/inference/io.cc +++ b/paddle/fluid/inference/io.cc @@ -79,7 +79,7 @@ void LoadPersistables(framework::Executor* executor, framework::Scope* scope, for (auto* var : global_block.AllVars()) { if (IsPersistable(var)) { - VLOG(3) << "persistable variable's name: " << var->Name(); + VLOG(4) << "persistable variable's name: " << var->Name(); framework::VarDesc* new_var = load_block->Var(var->Name()); new_var->SetShape(var->GetShape()); diff --git a/paddle/fluid/inference/tests/api/trt_models_tester.cc b/paddle/fluid/inference/tests/api/trt_models_tester.cc index 9eb3fb5da1065f14d9ad1c3520f6415fbadfdca1..d3bd035c1c49c926fc9f5ed83085b2e6d9ca8c93 100644 --- a/paddle/fluid/inference/tests/api/trt_models_tester.cc +++ b/paddle/fluid/inference/tests/api/trt_models_tester.cc @@ -78,6 +78,7 @@ void profile(std::string model_dir, bool use_analysis, bool use_tensorrt) { std::vector outputs; if (use_analysis || use_tensorrt) { contrib::AnalysisConfig config(true); + config.pass_builder()->TurnOnDebug(); SetConfig(&config, model_dir, true, use_tensorrt, FLAGS_batch_size); TestPrediction(reinterpret_cast(&config), @@ -141,9 +142,31 @@ TEST(TensorRT_resnext50, profile) { profile(model_dir, /* use_analysis */ true, FLAGS_use_tensorrt); } +TEST(resnext50, 
compare_analysis_native) { + std::string model_dir = FLAGS_infer_model + "/resnext50"; + compare(model_dir, false /*use tensorrt*/); +} + TEST(TensorRT_mobilenet, analysis) { std::string model_dir = FLAGS_infer_model + "/" + "mobilenet"; - compare(model_dir, /* use_tensorrt */ false); + compare(model_dir, false /* use_tensorrt */); +} + +TEST(AnalysisPredictor, use_gpu) { + std::string model_dir = FLAGS_infer_model + "/" + "mobilenet"; + AnalysisConfig config(true); + config.model_dir = model_dir; + config.fraction_of_gpu_memory = 0.15; + config.pass_builder()->TurnOnDebug(); + + std::vector> inputs_all; + auto predictor = CreatePaddlePredictor(config); + SetFakeImageInput(&inputs_all, model_dir, false, "__model__", ""); + + std::vector outputs; + for (auto& input : inputs_all) { + ASSERT_TRUE(predictor->Run(input, &outputs)); + } } } // namespace inference diff --git a/paddle/fluid/operators/controlflow/CMakeLists.txt b/paddle/fluid/operators/controlflow/CMakeLists.txt index b1c2ee22951a3881b4ce5b82f9ff7eb01fde6e9e..b614e9b03502634a29333f331e25201a0f77ba38 100644 --- a/paddle/fluid/operators/controlflow/CMakeLists.txt +++ b/paddle/fluid/operators/controlflow/CMakeLists.txt @@ -1,4 +1,4 @@ include(operators) -register_operators() +register_operators(DEPS naive_executor) file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(logical_and);\nUSE_NO_KERNEL_OP(read_from_array);\n") diff --git a/paddle/fluid/operators/controlflow/while_op.cc b/paddle/fluid/operators/controlflow/while_op.cc index 66f8508f029f06502851bbf4121067a1741b30a6..e91d9ef7765568a842b31ba682dc1b7e0d8ffa08 100644 --- a/paddle/fluid/operators/controlflow/while_op.cc +++ b/paddle/fluid/operators/controlflow/while_op.cc @@ -32,6 +32,20 @@ static constexpr char kStepScopes[] = "StepScopes"; static constexpr char kX[] = "X"; static constexpr char kXGRAD[] = "X@GRAD"; static constexpr char kOutputs[] = "Out"; +static constexpr char kSkipEagerDeletionVars[] = "skip_eager_deletion_vars"; + +namespace { // NOLINT +static std::string GetSkipEagerDeletionVarsDebugString( + const std::vector &vars) { + std::string str = "Skip " + std::to_string(vars.size()) + + " var(s) in eager deletion mode: "; + for (auto &var : vars) { + str.append(var); + str.push_back(' '); + } + return str; +} +} // NOLINT class WhileOp : public framework::OperatorBase { public: @@ -59,7 +73,10 @@ class WhileOp : public framework::OperatorBase { "Condition of while op must in CPU memory."); bool is_test = Attr("is_test"); - auto ctx = executor.Prepare(*program, block->ID()); + auto &skip_vars = Attr>(kSkipEagerDeletionVars); + VLOG(2) << GetSkipEagerDeletionVarsDebugString(skip_vars); + + auto ctx = executor.Prepare(*program, block->ID(), skip_vars); while (cond.data()[0]) { auto ¤t_scope = scope.NewScope(); step_scopes->push_back(¤t_scope); @@ -96,6 +113,10 @@ class WhileOpMaker : public framework::OpProtoAndCheckerMaker { "(bool, default false) Set to true for inference only, false " "for training. Some layers may run faster when this is true.") .SetDefault(false); + AddAttr>(kSkipEagerDeletionVars, + "Vars that would skip eager deletion." 
+ "Users should not set this manually.") + .SetDefault(std::vector()); AddComment(R"DOC( )DOC"); } @@ -119,7 +140,10 @@ class WhileGradOp : public framework::OperatorBase { framework::Executor executor(dev_place); auto *block = Attr(kStepBlock); auto *program = block->Program(); - auto ctx = executor.Prepare(*program, block->ID()); + + auto &skip_vars = Attr>(kSkipEagerDeletionVars); + VLOG(2) << GetSkipEagerDeletionVarsDebugString(skip_vars); + auto ctx = executor.Prepare(*program, block->ID(), skip_vars); auto *step_scopes = scope.FindVar(Input(kStepScopes))->GetMutable(); @@ -341,6 +365,8 @@ class WhileGradOpDescMaker : public framework::SingleGradOpDescMaker { // while operator could be renamed. while_grad->SetAttr("original_output_grad", output_grads_list); + while_grad->SetAttr(kSkipEagerDeletionVars, std::vector()); + return std::unique_ptr(while_grad); } }; diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc index 183850db18c3f9c4995cab8ca6259fb95b7367c1..8e0d2824953a372b96d5819be658636f9a3d78ba 100644 --- a/paddle/fluid/operators/conv_op.cc +++ b/paddle/fluid/operators/conv_op.cc @@ -44,7 +44,9 @@ void ConvOp::InferShape(framework::InferShapeContext* ctx) const { std::vector dilations = ctx->Attrs().Get>("dilations"); PADDLE_ENFORCE(in_dims.size() == 4 || in_dims.size() == 5, - "Conv intput should be 4-D or 5-D tensor."); + "Conv intput should be 4-D or 5-D tensor, get %u", + in_dims.size()); + PADDLE_ENFORCE_EQ( in_dims.size(), filter_dims.size(), "Conv input dimension and filter dimension should be the same."); diff --git a/paddle/fluid/operators/cudnn_lstm_op.cu.cc b/paddle/fluid/operators/cudnn_lstm_op.cu.cc index dd64cc327fc383937bc9a9d6e7daa0cec488e4cc..f2ba75485c58789de848b8833a1a527d45ced83c 100644 --- a/paddle/fluid/operators/cudnn_lstm_op.cu.cc +++ b/paddle/fluid/operators/cudnn_lstm_op.cu.cc @@ -300,9 +300,11 @@ class CudnnLSTMGPUKernel : public framework::OpKernel { } CudnnRNNCache *cudnn_rnn_cache = nullptr; if (cache_var->IsInitialized()) { + // const_cast is usually bad. cudnn_rnn_cache = const_cast(cache_var) ->GetMutable(); } else { + // const_cast is usually bad. 
cudnn_rnn_cache = const_cast(cache_var) ->GetMutable(); std::random_device rnd; diff --git a/paddle/fluid/operators/distributed/CMakeLists.txt b/paddle/fluid/operators/distributed/CMakeLists.txt index 101dbe9c89616b7025337261469e2b1aa3e8bc76..eab4297c737bbf2424fff8814b73942cd520d778 100644 --- a/paddle/fluid/operators/distributed/CMakeLists.txt +++ b/paddle/fluid/operators/distributed/CMakeLists.txt @@ -12,7 +12,7 @@ configure_file(send_recv.proto.in ${CMAKE_CURRENT_SOURCE_DIR}/send_recv.proto @O set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") if(WITH_GRPC) - grpc_library(sendrecvop_grpc SRCS grpc_bytebuffer_stream.cc sendrecvop_utils.cc grpc_client.cc + grpc_library(sendrecvop_rpc SRCS grpc_bytebuffer_stream.cc sendrecvop_utils.cc grpc_client.cc request_handler_impl.cc rpc_client.cc rpc_server.cc grpc_server.cc variable_response.cc grpc_variable_response.cc grpc_serde.cc collective_client.cc collective_server.cc PROTO send_recv.proto DEPS lod_tensor selected_rows_functor memory) @@ -20,36 +20,43 @@ if(WITH_GRPC) set_source_files_properties(grpc_serde_test.cc rpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) cc_test(grpc_serde_test SRCS grpc_serde_test.cc - DEPS grpc++_unsecure grpc_unsecure gpr cares zlib protobuf sendrecvop_grpc scope profiler math_function SERIAL) + DEPS grpc++_unsecure grpc_unsecure gpr cares zlib protobuf sendrecvop_rpc scope profiler math_function SERIAL) cc_test(rpc_server_test SRCS rpc_server_test.cc - DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor proto_desc lookup_sparse_table_op SERIAL) + DEPS sendrecvop_rpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor proto_desc lookup_sparse_table_op SERIAL) cc_test(varhandle_test SRCS varhandle_test.cc DEPS profiler) if(WITH_GPU) cc_test(collective_server_test SRCS collective_server_test.cc - DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor + DEPS sendrecvop_rpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor selected_rows_functor scope math_function SERIAL) endif() - cc_library(parameter_prefetch SRCS parameter_prefetch.cc DEPS sendrecvop_grpc memory) + cc_library(parameter_prefetch SRCS parameter_prefetch.cc DEPS sendrecvop_rpc memory) else() - set_source_files_properties(brpc_server.cc brpc_client.cc rpc_server_test.cc brpc_serde_test.cc - brpc_variable_response.cc brpc_sendrecvop_utils.cc brpc_rdma_pool.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + set_source_files_properties(brpc_server.cc parameter_prefetch.cc brpc_client.cc rpc_server_test.cc brpc_serde_test.cc + brpc_variable_response.cc brpc_sendrecvop_utils.cc brpc_rdma_pool.cc collective_server.cc collective_server_test.cc + collective_client.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) - brpc_library(sendrecvop_brpc SRCS brpc_client.cc brpc_server.cc rpc_server.cc rpc_client.cc request_handler_impl.cc brpc_sendrecvop_utils.cc - brpc_variable_response.cc variable_response.cc sendrecvop_utils.cc brpc_rdma_pool.cc + brpc_library(sendrecvop_rpc SRCS brpc_client.cc brpc_server.cc rpc_server.cc rpc_client.cc request_handler_impl.cc brpc_sendrecvop_utils.cc + brpc_variable_response.cc variable_response.cc sendrecvop_utils.cc brpc_rdma_pool.cc collective_client.cc collective_server.cc PROTO send_recv.proto DEPS lod_tensor selected_rows memory) - cc_library(parameter_prefetch SRCS parameter_prefetch.cc DEPS sendrecvop_brpc memory) + 
cc_library(parameter_prefetch SRCS parameter_prefetch.cc DEPS sendrecvop_rpc memory) - set(brpc_test_depends sendrecvop_brpc brpc ssl crypto protobuf leveldb gflags glog executor proto_desc lookup_table_op snappystream snappy) + set(brpc_test_depends sendrecvop_rpc brpc ssl crypto protobuf leveldb gflags glog executor + proto_desc lookup_sparse_table_op snappystream snappy zlib) - cc_test(brpc_server_test SRCS rpc_server_test.cc + cc_test(rpc_server_test SRCS rpc_server_test.cc DEPS ${brpc_test_depends} SERIAL) cc_test(brpc_serde_test SRCS brpc_serde_test.cc DEPS ${brpc_test_depends} SERIAL) + + if(WITH_GPU) + cc_test(collective_server_test SRCS collective_server_test.cc + DEPS ${brpc_test_depends} selected_rows_functor scope math_function SERIAL) + endif() endif() diff --git a/paddle/fluid/operators/distributed/brpc_client.cc b/paddle/fluid/operators/distributed/brpc_client.cc index 350969f74be258ffbfef687b56083a9c6508bc81..62e32977b8cd7e70ddd9f5d879c6844ff346ce80 100644 --- a/paddle/fluid/operators/distributed/brpc_client.cc +++ b/paddle/fluid/operators/distributed/brpc_client.cc @@ -14,135 +14,316 @@ #include "paddle/fluid/operators/distributed/brpc_client.h" #include "paddle/fluid/framework/threadpool.h" +#include "paddle/fluid/operators/distributed/brpc_sendrecvop_utils.h" +#include "paddle/fluid/platform/profiler.h" namespace paddle { namespace operators { namespace distributed { -DEFINE_int32(brpc_channel_num, 24, - "Number of channels to send requests connected to one server"); DEFINE_int32(timeout_ms, 30000, "RPC timeout in milliseconds"); DEFINE_int32(max_retry, 3, "Max retries(not including the first RPC)"); BRPCClient::~BRPCClient() { Wait(); } -void HandleSendResponse(brpc::Controller* cntl, - sendrecv::VoidMessage* response) { +void HandleSendResponse(brpc::Controller* cntl, sendrecv::VoidMessage* response, + VarHandlePtr var_h, ChannelQueuePtr ch_ptr, + ChannelContextPtr ch_ctx, BRPCClient* cls) { // std::unique_ptr makes sure cntl/response will be deleted before returning. std::unique_ptr cntl_guard(cntl); std::unique_ptr response_guard(response); + // this channel can be used by other now. 
+ ch_ptr->Push(ch_ctx); + if (cntl->Failed()) { - LOG(WARNING) << "Fail to send EchoRequest, " << cntl->ErrorText(); + LOG(FATAL) << "Fail to send SendVar: " << var_h->name() + << ", error text: " << cntl->ErrorText(); + var_h->Finish(false); + cls->DecreaseReqCount(); return; } - LOG(INFO) << "Received response from " << cntl->remote_side() - << " latency=" << cntl->latency_us() << "us"; + var_h->Finish(true); + cls->DecreaseReqCount(); + + VLOG(4) << "HandleSendResponse from: " << cntl->remote_side() + << ", varname: " << var_h->name() + << ", latency: " << cntl->latency_us() << "us"; + VLOG(4) << "Finish HandleSendResponse"; } -bool BRPCClient::AsyncSendVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, int64_t time_out) { +VarHandlePtr BRPCClient::AsyncSendVar(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& var_name, + int64_t time_out) { const platform::DeviceContext* p_ctx = &ctx; const std::string ep_val = ep; const std::string var_name_val = var_name; const framework::Scope* p_scope = &scope; const auto ch_ptr = GetChannel(ep_val); + const std::string method = "SendRPC"; + VarHandlePtr var_h(new VarHandle(ep, method, var_name_val, p_ctx, p_scope)); + + framework::AsyncIO([=] { + auto ch_ctx = ch_ptr->Pop(); + brpc::Controller* cntl = new brpc::Controller(); + sendrecv::VoidMessage* response = new sendrecv::VoidMessage(); + cntl->set_timeout_ms(time_out); - framework::AsyncIO( - [var_name_val, p_ctx, ep_val, p_scope, time_out, ch_ptr, this] { - auto ch_ctx = ch_ptr->Pop(); - brpc::Controller* cntl = new brpc::Controller(); - sendrecv::VoidMessage* response = new sendrecv::VoidMessage(); - cntl->set_timeout_ms(time_out); + auto* var = p_scope->FindVar(var_name_val); + sendrecv::VariableMessage request; + distributed::SerializeToIOBuf(var_name_val, var, *p_ctx, &request, + &cntl->request_attachment(), "", false, + trainer_id_); - google::protobuf::Closure* done = - brpc::NewCallback(&HandleSendResponse, cntl, response); + google::protobuf::Closure* done = brpc::NewCallback( + &HandleSendResponse, cntl, response, var_h, ch_ptr, ch_ctx, this); - sendrecv::VariableMessage request; - ch_ctx->stub->SendVariable(cntl, &request, response, done); - }); + platform::RecordRPCEvent record_event(method, p_ctx); + + ch_ctx->stub->SendVariable(cntl, &request, response, done); + + if (UNLIKELY(platform::IsProfileEnabled())) { + var_h->Wait(); + } + }); req_count_++; - return true; + return var_h; } +void HandleFetchBarrierResponse(brpc::Controller* cntl, + sendrecv::VariableMessage* response, + VarHandlePtr var_h, ChannelQueuePtr ch_ptr, + ChannelContextPtr ch_ctx, BRPCClient* cls) { + // std::unique_ptr makes sure cntl/response will be deleted before returning. + std::unique_ptr cntl_guard(cntl); + std::unique_ptr response_guard(response); + + // this channel can be used other now. 
+ ch_ptr->Push(ch_ctx); + if (cntl->Failed()) { + LOG(FATAL) << "Fail to get HandleFetchBarrierResponse: " << var_h->name() + << ", error text: " << cntl->ErrorText(); + var_h->Finish(false); + cls->DecreaseReqCount(); + return; + } + + var_h->Finish(true); + cls->DecreaseReqCount(); + + VLOG(4) << "HandleFetchBarrierResponse from: " << cntl->remote_side() + << ", varname: " << var_h->name() + << ", latency: " << cntl->latency_us() << "us"; + VLOG(4) << "Finish HandleFetchBarrierResponse"; +} void HandleGetResponse(brpc::Controller* cntl, - sendrecv::VariableMessage* response) { + sendrecv::VariableMessage* response, VarHandlePtr var_h, + ChannelQueuePtr ch_ptr, ChannelContextPtr ch_ctx, + BRPCClient* cls) { // std::unique_ptr makes sure cntl/response will be deleted before returning. std::unique_ptr cntl_guard(cntl); std::unique_ptr response_guard(response); + // this channel can be used other now. + ch_ptr->Push(ch_ctx); + if (cntl->Failed()) { - LOG(WARNING) << "Fail to send EchoRequest, " << cntl->ErrorText(); + LOG(FATAL) << "Fail to GetVar: " << var_h->name() + << ", error text: " << cntl->ErrorText(); + cls->DecreaseReqCount(); + var_h->Finish(false); return; } - LOG(INFO) << "Received response from " << cntl->remote_side() - << " latency=" << cntl->latency_us() << "us"; - // framework::Variable* outvar = nullptr; - // DeserializeFromByteBuffer(ret_msg, *var_h.ctx, var_h.scope, &outvar); + VLOG(4) << "HandleGetResponse from: " << cntl->remote_side() + << ", varname: " << var_h->name() + << ", latency: " << cntl->latency_us() << "us"; + + framework::Variable* outvar = nullptr; + int trainer_id; + distributed::DeserializeFromIOBuf(*response, cntl->response_attachment(), + *var_h->ctx(), var_h->scope(), &outvar, + &trainer_id); + VLOG(4) << "Finish HandleGetResponse"; + cls->DecreaseReqCount(); + var_h->Finish(true); } -bool BRPCClient::AsyncGetVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, int64_t time_out) { +VarHandlePtr BRPCClient::_AsyncGetVar(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& var_name, + const std::string& method_name, + int64_t time_out) { const platform::DeviceContext* p_ctx = &ctx; const std::string ep_val = ep; const std::string var_name_val = var_name; const framework::Scope* p_scope = &scope; - const auto ch = GetChannel(ep_val); + const auto ch_ptr = GetChannel(ep_val); + const std::string method = "GetRPC"; + VarHandlePtr var_h(new VarHandle(ep, method, var_name_val, p_ctx, p_scope)); + + framework::AsyncIO([=] { + auto ch_ctx = ch_ptr->Pop(); + + brpc::Controller* cntl = new brpc::Controller(); + sendrecv::VariableMessage* response = new sendrecv::VariableMessage(); + cntl->set_timeout_ms(time_out); - framework::AsyncIO( - [var_name_val, ep_val, p_scope, p_ctx, time_out, ch, this] {}); + sendrecv::VariableMessage req; + req.set_varname(var_name_val); + req.set_trainer_id(trainer_id_); + + google::protobuf::Closure* done = brpc::NewCallback( + &HandleGetResponse, cntl, response, var_h, ch_ptr, ch_ctx, this); + + platform::RecordRPCEvent record_event(method, p_ctx); + + if (method_name == "GetMonomerVariable") { + ch_ctx->stub->GetMonomerVariable(cntl, &req, response, done); + } else { + ch_ctx->stub->GetVariable(cntl, &req, response, done); + } + + if (UNLIKELY(platform::IsProfileEnabled())) { + var_h->Wait(); + } + }); req_count_++; - return true; + return var_h; +} + +VarHandlePtr 
BRPCClient::AsyncGetMonomerVariable( + const std::string& ep, const platform::DeviceContext& ctx, + const framework::Scope& scope, const std::string& var_name, + int64_t time_out) { + return _AsyncGetVar(ep, ctx, scope, var_name, "GetMonomerVariable", time_out); +} + +VarHandlePtr BRPCClient::AsyncGetMonomerBarrier(const std::string& ep, + const std::string& var_name, + int64_t time_out) { + return AsyncSendMessage(ep, "GetMonomerBarrier", var_name, time_out); } -bool BRPCClient::AsyncPrefetchVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& in_var_name, - const std::string& out_var_name, - int64_t time_out) { +VarHandlePtr BRPCClient::AsyncGetVar(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& var_name, + int64_t time_out) { + return _AsyncGetVar(ep, ctx, scope, var_name, "GetVariable", time_out); +} + +VarHandlePtr BRPCClient::AsyncPrefetchVar(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& in_var_name, + const std::string& out_var_name, + const std::string& table_name, + int64_t time_out) { const platform::DeviceContext* p_ctx = &ctx; const std::string ep_val = ep; const std::string in_var_name_val = in_var_name; const std::string out_var_name_val = out_var_name; + const std::string table_name_val = table_name; const framework::Scope* p_scope = &scope; - const auto ch = GetChannel(ep_val); + const auto ch_ptr = GetChannel(ep_val); + + const std::string method = "PrefetchRPC"; + + VarHandlePtr var_h( + new VarHandle(ep, method, out_var_name_val, p_ctx, p_scope)); + + framework::AsyncIO([=] { + auto ch_ctx = ch_ptr->Pop(); + + brpc::Controller* cntl = new brpc::Controller(); + sendrecv::VariableMessage* response = new sendrecv::VariableMessage(); + cntl->set_timeout_ms(time_out); + + auto* var = p_scope->FindVar(in_var_name_val); + sendrecv::VariableMessage req; + distributed::SerializeToIOBuf(in_var_name_val, var, *p_ctx, &req, + &cntl->request_attachment(), out_var_name_val, + false, 0, table_name_val); + + platform::RecordRPCEvent record_event(method, p_ctx); + + google::protobuf::Closure* done = brpc::NewCallback( + &HandleGetResponse, cntl, response, var_h, ch_ptr, ch_ctx, this); - framework::AsyncIO([in_var_name_val, out_var_name_val, ep_val, p_scope, p_ctx, - time_out, ch, this] {}); + ch_ctx->stub->PrefetchVariable(cntl, &req, response, done); + + if (UNLIKELY(platform::IsProfileEnabled())) { + var_h->Wait(); + } + }); req_count_++; - return true; + return var_h; } -void BRPCClient::AsyncSendBatchBarrier(const std::string& ep, - int64_t time_out) { - req_count_++; +VarHandlePtr BRPCClient::AsyncSendBatchBarrier(const std::string& ep, + int64_t time_out) { + return AsyncSendMessage(ep, "BatchBarrierRPC", BATCH_BARRIER_MESSAGE, + time_out); } -void BRPCClient::AsyncSendFetchBarrier(const std::string& ep, - int64_t time_out) { +VarHandlePtr BRPCClient::AsyncSendFetchBarrier(const std::string& ep, + int64_t time_out) { + auto ch_ptr = GetChannel(ep); + auto ch_ctx = ch_ptr->Pop(); + + brpc::Controller* cntl = new brpc::Controller(); + sendrecv::VariableMessage* response = new sendrecv::VariableMessage(); + cntl->set_timeout_ms(time_out); + + sendrecv::VariableMessage req; + req.set_varname(FETCH_BARRIER_MESSAGE); + + const std::string method = "FetchBarrierRPC"; + // var handle + VarHandlePtr var_h( + new VarHandle(ep, method, FETCH_BARRIER_MESSAGE, nullptr, nullptr)); + + 
platform::RecordRPCEvent record_event(method, nullptr); + + google::protobuf::Closure* done = brpc::NewCallback( + &HandleFetchBarrierResponse, cntl, response, var_h, ch_ptr, ch_ctx, this); + + ch_ctx->stub->GetVariable(cntl, &req, response, done); + req_count_++; + + if (UNLIKELY(platform::IsProfileEnabled())) { + var_h->Wait(); + } + + return var_h; } -void BRPCClient::Wait() { - std::unique_lock lk(sync_mutex_); - sync_cond_.wait(lk, [this] { return req_count_ == 0; }); +bool BRPCClient::Wait() { + VLOG(9) << "begin to brpcclient wait"; + { + std::unique_lock lk(sync_mutex_); + sync_cond_.wait(lk, [this] { return req_count_ == 0; }); + } + VLOG(9) << "end to brpcclient wait"; + return true; } ChannelQueuePtr BRPCClient::GetChannel(const std::string& ep) { + VLOG(4) << "begin to GetChannel:" << ep; { std::lock_guard guard(chan_mutex_); auto it = channels_.find(ep); if (it != channels_.end()) { + VLOG(4) << "end to GetChannel:" << ep; return it->second; } } @@ -150,12 +331,20 @@ ChannelQueuePtr BRPCClient::GetChannel(const std::string& ep) { ChannelQueuePtr q(new framework::BlockingQueue()); brpc::ChannelOptions options; +#ifdef PADDLE_WITH_BRPC_RDMA + options.use_rdma = true; +#endif options.protocol = "baidu_std"; - options.connection_type = "pooled"; - options.connect_timeout_ms = 100; + // don't use pooled type. the server can't afford that. + options.connection_type = "single"; + options.connect_timeout_ms = 1000; options.timeout_ms = FLAGS_timeout_ms /*milliseconds*/; options.max_retry = FLAGS_max_retry; - for (int i = 0; i < FLAGS_brpc_channel_num; ++i) { + + VLOG(1) << "create " << brpc_channel_num_per_server_ + << " brpc channels to pserver:" << ep; + + for (int i = 0; i < brpc_channel_num_per_server_; ++i) { std::shared_ptr c(new ChannelContext()); if (c->channel.Init(ep.c_str(), &options) != 0) { LOG(FATAL) << "Fail to initialize channel"; @@ -172,9 +361,75 @@ ChannelQueuePtr BRPCClient::GetChannel(const std::string& ep) { channels_[ep] = q; } + VLOG(4) << "end to GetChannel:" << ep; return q; } +VarHandlePtr BRPCClient::AsyncSendComplete(const std::string& ep, + int64_t time_out) { + return AsyncSendMessage(ep, "SendCompleteRPC", COMPLETE_MESSAGE, time_out); +} + +void BRPCClient::SendComplete() { + for (auto& kv : channels_) { + AsyncSendComplete(kv.first); + } +} + +VarHandlePtr BRPCClient::AsyncSendVarMessage( + const std::string& ep, const std::string& method_name, + const sendrecv::VariableMessage& req, int64_t time_out) { + auto ch_ptr = GetChannel(ep); + auto ch_ctx = ch_ptr->Pop(); + + brpc::Controller* cntl = new brpc::Controller(); + sendrecv::VoidMessage* response = new sendrecv::VoidMessage(); + cntl->set_timeout_ms(time_out); + + platform::RecordRPCEvent record_event(method_name, nullptr); + + VarHandlePtr var_h( + new VarHandle(ep, method_name, req.varname(), nullptr, nullptr)); + + google::protobuf::Closure* done = brpc::NewCallback( + &HandleSendResponse, cntl, response, var_h, ch_ptr, ch_ctx, this); + + if (method_name == "CheckPointNotifyRPC") { + ch_ctx->stub->CheckpointNotify(cntl, &req, response, done); + } else if (method_name == "GetMonomerBarrier") { + ch_ctx->stub->GetMonomerBarrier(cntl, &req, response, done); + } else { + ch_ctx->stub->SendVariable(cntl, &req, response, done); + } + req_count_++; + + if (UNLIKELY(platform::IsProfileEnabled())) { + var_h->Wait(); + } + + return var_h; +} + +VarHandlePtr BRPCClient::AsyncSendMessage(const std::string& ep, + const std::string& method_name, + const std::string& message, + int64_t time_out) { + 
sendrecv::VariableMessage req; + req.set_varname(message); + + return AsyncSendVarMessage(ep, method_name, req, time_out); +} + +VarHandlePtr BRPCClient::AsyncCheckpointNotify(const std::string& ep, + const std::string& dir, + int64_t time_out) { + sendrecv::VariableMessage req; + req.set_varname(CHECKPOINT_SAVE_MESSAGE); + req.set_out_varname(dir); + + return AsyncSendVarMessage(ep, "CheckPointNotifyRPC", req, time_out); +} + } // namespace distributed } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/distributed/brpc_client.h b/paddle/fluid/operators/distributed/brpc_client.h index 8ff1f0a6076b3574c42065edcbac50eb75b3b483..80cc81bff37916b558aa87104f194eb37a7fc309 100644 --- a/paddle/fluid/operators/distributed/brpc_client.h +++ b/paddle/fluid/operators/distributed/brpc_client.h @@ -31,6 +31,8 @@ limitations under the License. */ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/operators/distributed/brpc_sendrecvop_utils.h" +#include "paddle/fluid/operators/distributed/request_handler.h" #include "paddle/fluid/operators/distributed/rpc_client.h" #include "paddle/fluid/operators/distributed/send_recv.pb.h" #include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN @@ -53,33 +55,94 @@ class BRPCClient : public RPCClient { BRPCClient() {} virtual ~BRPCClient(); - bool AsyncSendVar(const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& var_name, - int64_t time_out = FLAGS_rpc_deadline) override; + VarHandlePtr AsyncSendVar(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& var_name, + int64_t time_out = FLAGS_rpc_deadline) override; - bool AsyncGetVar(const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& var_name, - int64_t time_out = FLAGS_rpc_deadline) override; + VarHandlePtr AsyncGetVar(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& var_name, + int64_t time_out = FLAGS_rpc_deadline) override; - bool AsyncPrefetchVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& in_var_name, - const std::string& out_var_name, - int64_t time_out = FLAGS_rpc_deadline) override; + VarHandlePtr AsyncGetMonomerBarrier( + const std::string& ep, const std::string& var_name, + int64_t time_out = FLAGS_rpc_deadline) override; - void AsyncSendBatchBarrier(const std::string& ep, - int64_t time_out = FLAGS_rpc_deadline) override; + VarHandlePtr AsyncGetMonomerVariable( + const std::string& ep, const platform::DeviceContext& ctx, + const framework::Scope& scope, const std::string& var_name, + int64_t time_out = FLAGS_rpc_deadline) override; - void AsyncSendFetchBarrier(const std::string& ep, - int64_t time_out = FLAGS_rpc_deadline) override; + VarHandlePtr AsyncPrefetchVar(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& in_var_name, + const std::string& out_var_name, + const std::string& table_name = "", + int64_t time_out = FLAGS_rpc_deadline) override; - void Wait() override; + VarHandlePtr AsyncSendBatchBarrier( + const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) override; + + VarHandlePtr AsyncSendFetchBarrier( + const std::string& ep, int64_t time_out = 
FLAGS_rpc_deadline) override; + + VarHandlePtr AsyncCheckpointNotify( + const std::string& ep, const std::string& dir, + int64_t time_out = FLAGS_rpc_deadline) override; + + bool Wait() override; + + void SendComplete() override; private: + VarHandlePtr _AsyncGetVar(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& var_name, + const std::string& method_name, + int64_t time_out = FLAGS_rpc_deadline); + void Proceed(); ChannelQueuePtr GetChannel(const std::string& ep); + VarHandlePtr AsyncSendComplete(const std::string& ep, + int64_t time_out = FLAGS_rpc_deadline); + + VarHandlePtr AsyncSendMessage(const std::string& ep, + const std::string& method_name, + const std::string& message, int64_t time_out); + + VarHandlePtr AsyncSendVarMessage(const std::string& ep, + const std::string& method_name, + const sendrecv::VariableMessage& req, + int64_t time_out); + + friend void HandleSendResponse(brpc::Controller* cntl, + sendrecv::VoidMessage* response, + VarHandlePtr var_h, ChannelQueuePtr ch_ptr, + ChannelContextPtr ch_ctx, BRPCClient* cls); + + friend void HandleGetResponse(brpc::Controller* cntl, + sendrecv::VariableMessage* response, + VarHandlePtr var_h, ChannelQueuePtr ch_ptr, + ChannelContextPtr ch_ctx, BRPCClient* cls); + + friend void HandleFetchBarrierResponse(brpc::Controller* cntl, + sendrecv::VariableMessage* response, + VarHandlePtr var_h, + ChannelQueuePtr ch_ptr, + ChannelContextPtr ch_ctx, + BRPCClient* cls); + void DecreaseReqCount() { + if (--req_count_ <= 0) { + sync_cond_.notify_all(); + } + } + private: std::unordered_map channels_; @@ -88,6 +151,8 @@ class BRPCClient : public RPCClient { std::condition_variable sync_cond_; std::atomic req_count_{0}; + static constexpr int brpc_channel_num_per_server_ = 4; + // mutex for GetChannel thread safety std::mutex chan_mutex_; DISABLE_COPY_AND_ASSIGN(BRPCClient); diff --git a/paddle/fluid/operators/distributed/brpc_rdma_pool.cc b/paddle/fluid/operators/distributed/brpc_rdma_pool.cc new file mode 100644 index 0000000000000000000000000000000000000000..e1be5673dfbc51e6a7beb8e3b8b2f162872a1988 --- /dev/null +++ b/paddle/fluid/operators/distributed/brpc_rdma_pool.cc @@ -0,0 +1,84 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
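
Each of the `Async*` methods in the `BRPCClient` interface above now returns a `VarHandlePtr` instead of a bare `bool`, so a caller can wait on an individual request or drain them all with `Wait()`. The snippet below is a minimal usage sketch, not part of this patch; the endpoint, variable names, and the helper function itself are made up for illustration.

```
#include "paddle/fluid/operators/distributed/brpc_client.h"

namespace distributed = paddle::operators::distributed;

// Hypothetical helper: push one gradient and pull one parameter from `ep`.
void SendAndFetch(distributed::BRPCClient* client,
                  const paddle::platform::DeviceContext& ctx,
                  const paddle::framework::Scope& scope,
                  const std::string& ep) {
  // Both calls return immediately; the real work runs on the AsyncIO pool.
  auto send_h = client->AsyncSendVar(ep, ctx, scope, "w@GRAD");
  auto get_h = client->AsyncGetVar(ep, ctx, scope, "w");

  // Wait on the individual handles ...
  send_h->Wait();
  get_h->Wait();
  // ... or block until every outstanding request has finished.
  client->Wait();
}
```
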
+ +#ifdef PADDLE_WITH_BRPC_RDMA + +#include "paddle/fluid/operators/distributed/brpc_rdma_pool.h" +#include "brpc/channel.h" +#include "brpc/rdma/rdma_helper.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace operators { +namespace distributed { + +RdmaMemPool& RdmaMemPool::Instance() { + static RdmaMemPool* g_rdma_mem_pool = new RdmaMemPool(); + return *g_rdma_mem_pool; +} + +void* RdmaMemPool::Find(const std::string& varname, int64_t size) { + pthread_rwlock_rdlock(&access_); + auto it = pool_.find(varname); + if (it == pool_.end()) { + pthread_rwlock_unlock(&access_); + return nullptr; + } + + auto info = it->second; + if (info.data_size != size) { + pthread_rwlock_unlock(&access_); + PADDLE_ENFORCE(false, "var:%s size:%ld != %ld", varname, size, + info.data_size); + return nullptr; + } + + pthread_rwlock_unlock(&access_); + return info.data; +} + +void RdmaMemPool::Register(const std::string& varname, void* data, + int64_t data_size) { + void* old = Find(varname, data_size); + if (old != nullptr) { + if (data != old) { + PADDLE_ENFORCE(false, "var:%s data:%ld != %ld", varname, data, old); + } + VLOG(7) << "Find on rdma:" << varname << " data:" << data + << " data_size:" << data_size; + return; + } + + VarInfo info; + info.data = data; + info.data_size = data_size; + + pthread_rwlock_wrlock(&access_); + pool_[varname] = info; + pthread_rwlock_unlock(&access_); + + if (brpc::rdma::RegisterMemoryForRdma(data, data_size)) { + LOG(FATAL) << "register " << varname << " data:" << data + << " data_size:" << data_size << " error"; + } + + VLOG(4) << "register on rdma:" << varname << " data:" << data + << " data_size:" << data_size; +} + +} // namespace distributed +} // namespace operators +} // namespace paddle + +#endif diff --git a/paddle/fluid/operators/distributed/brpc_rdma_pool.h b/paddle/fluid/operators/distributed/brpc_rdma_pool.h new file mode 100644 index 0000000000000000000000000000000000000000..156a93ec5784715c0a68c1af2e31d640dfc60277 --- /dev/null +++ b/paddle/fluid/operators/distributed/brpc_rdma_pool.h @@ -0,0 +1,56 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#ifdef PADDLE_WITH_BRPC_RDMA + +#include // NOLINT +#include +#include + +namespace paddle { +namespace operators { +namespace distributed { + +/* + * This class is used to avoid duplicated registion of brpc::rdma. 
+ */ +class RdmaMemPool { + public: + static RdmaMemPool& Instance(); + RdmaMemPool() : access_(PTHREAD_RWLOCK_INITIALIZER) {} + + virtual ~RdmaMemPool() { pthread_rwlock_destroy(&access_); } + + void Register(const std::string& varname, void* data, int64_t size); + void* Find(const std::string& varname, int64_t size); + + private: + struct VarInfo { + void* data; + int64_t data_size; + + VarInfo() : data(nullptr), data_size(0) {} + }; + + private: + std::unordered_map pool_; + pthread_rwlock_t access_; +}; + +} // namespace distributed +} // namespace operators +} // namespace paddle + +#endif diff --git a/paddle/fluid/operators/distributed/brpc_sendrecvop_utils.cc b/paddle/fluid/operators/distributed/brpc_sendrecvop_utils.cc new file mode 100644 index 0000000000000000000000000000000000000000..6fed9ba92c16477c79509a30911920e67b9f9bdd --- /dev/null +++ b/paddle/fluid/operators/distributed/brpc_sendrecvop_utils.cc @@ -0,0 +1,196 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef PADDLE_WITH_CUDA +#include +#endif +#include +#include // NOLINT + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/operators/distributed/brpc_rdma_pool.h" +#include "paddle/fluid/operators/distributed/brpc_sendrecvop_utils.h" +#include "paddle/fluid/operators/distributed/brpc_variable_response.h" +#include "paddle/fluid/operators/distributed/send_recv.pb.h" +#include "paddle/fluid/platform/profiler.h" + +namespace paddle { +namespace operators { +namespace distributed { + +class IOBufWriter { + public: + static void Append(butil::IOBuf* iobuf, int k, const char* v, int64_t vlen) { + iobuf->append(reinterpret_cast(&k), 4); + iobuf->append(reinterpret_cast(&vlen), 8); + iobuf->append(v, vlen); + } + + static void AppendTCPZeroCopy(butil::IOBuf* iobuf, int k, const char* v, + int64_t vlen, bool in_cuda_pinned, + void (*destroy)(void*), void* user_data) { + VLOG(7) << "AppendTCPZeroCopy " + << " k:" << k + << " data:" << static_cast(const_cast(v)) + << " data_size:" << vlen << " in_cuda_pinned:" << in_cuda_pinned; + + iobuf->append(reinterpret_cast(&k), 4); + iobuf->append(reinterpret_cast(&vlen), 8); + + // FIXME(gongwb): use append_zerocopy + /* + if (in_cuda_pinned) { + iobuf->append_zerocopy(v, vlen, IOBufWriter::FreeMemory); + } else { + iobuf->append_zerocopy(v, vlen, nullptr); + } + */ + iobuf->append(v, vlen); + destroy(user_data); + } + +#ifdef PADDLE_WITH_BRPC_RDMA + static void AppendRdmaZeroCopy(const std::string varname, butil::IOBuf* iobuf, + int k, const char* v, int64_t vlen, + bool in_cuda_pinned, void (*destroy)(void*), + void* user_data) { + VLOG(7) << "AppendRdmaZeroCopy varname:" << varname << " k:" << k + << " data:" << static_cast(const_cast(v)) + << " data_size:" << vlen << " in_cuda_pinned:" << in_cuda_pinned; + + iobuf->append(reinterpret_cast(&k), 4); + iobuf->append(reinterpret_cast(&vlen), 8); + + RdmaMemPool::Instance().Register( + varname, static_cast(const_cast(v)), vlen); + + // FIXME(gongwb): use 
append_zerocopy + // iobuf->append_zerocopy(v, vlen, nullptr); + iobuf->append(v, vlen); + destroy(user_data); + return; + } +#endif + + static void AppendZeroCopy(const std::string varname, butil::IOBuf* iobuf, + int k, const char* v, int64_t vlen, + bool in_cuda_pinned, void (*destroy)(void*), + void* user_data) { +#ifdef PADDLE_WITH_BRPC_RDMA + IOBufWriter::AppendRdmaZeroCopy(varname, iobuf, k, v, vlen, in_cuda_pinned, + destroy, user_data); +#else + IOBufWriter::AppendTCPZeroCopy(iobuf, k, v, vlen, in_cuda_pinned, destroy, + user_data); +#endif + } +}; + +void SerializeToIOBuf(const std::string& name, framework::Variable* var, + const platform::DeviceContext& ctx, VarMsg* request, + butil::IOBuf* iobuf, const std::string& out_varname, + bool var_is_not_stable, int trainer_id, + const std::string& table_name) { + std::unique_ptr payload; + + request->set_varname(name); + request->set_trainer_id(trainer_id); + // Note: normally the profiler is enabled in 1 trainer, hence only + // 1 trainer returns true for ShouldSendProfileState(). It tells PS + // servers the trainer's profiling state so that PS can follow the + // trainer. + if (platform::ShouldSendProfileState()) { + if (platform::IsProfileEnabled()) { + request->set_profile(platform::kEnableProfiler); + } else { + request->set_profile(platform::kDisableProfiler); + } + } + if (!out_varname.empty()) { + request->set_out_varname(out_varname); + } + if (!table_name.empty()) { + request->set_table_name(table_name); + } + if (var->IsType()) { + request->set_type(::sendrecv::LOD_TENSOR); + payload.reset(new TensorPayload(GetTensorPayload(var, ctx, request))); + } else if (var->IsType()) { + request->set_type(::sendrecv::SELECTED_ROWS); + payload.reset(new TensorPayload(GetSelectedRowsPayload(var, ctx, request))); +#ifdef PADDLE_WITH_CUDA + } else if (var->IsType()) { + request->set_type(::sendrecv::NCCL_ID); + const ncclUniqueId& uid = var->Get(); + // TODO(gongwb): use append_zero to avoid data copy. + IOBufWriter::Append(iobuf, + sendrecv::VariableMessage::kSerializedFieldNumber, + uid.internal, NCCL_UNIQUE_ID_BYTES); + return; +#endif + } else { + PADDLE_THROW("Serialize does not support type: %s", + typeid(var->Type()).name()); + } + + PADDLE_ENFORCE_NOT_NULL(payload); + + // FIXME(gongwb): it seems that can use zero copy. 
+ if (var_is_not_stable) { + IOBufWriter::Append( + iobuf, ::sendrecv::VariableMessage::kSerializedFieldNumber, + static_cast(payload->ptr()), payload->memory_size()); + } else { + if (platform::is_gpu_place(ctx.GetPlace())) { +#ifdef PADDLE_WITH_CUDA + IOBufWriter::AppendZeroCopy( + name, iobuf, ::sendrecv::VariableMessage::kSerializedFieldNumber, + static_cast(payload->ptr()), payload->memory_size(), + true, SerializeDestroyCallback, static_cast(payload.get())); + payload.release(); +#endif + } else { + IOBufWriter::AppendZeroCopy( + name, iobuf, ::sendrecv::VariableMessage::kSerializedFieldNumber, + static_cast(payload->ptr()), payload->memory_size(), + false, SerializeDestroyCallback, static_cast(payload.get())); + payload.release(); + } + } + + if (var->IsType()) { + auto* slr = var->GetMutable(); + size_t rows_memory_size = + slr->rows().size() * framework::SizeOfType(typeid(int64_t)); + + IOBufWriter::Append(iobuf, ::sendrecv::VariableMessage::kRowsFieldNumber, + reinterpret_cast(slr->rows().data()), + static_cast(rows_memory_size)); + } +} + +void DeserializeFromIOBuf(const ::sendrecv::VariableMessage& meta, + const butil::IOBuf& iobuf, + const platform::DeviceContext& ctx, + const framework::Scope* scope, + framework::Variable** var, int* trainer_id) { + operators::distributed::BRPCVariableResponse resp(scope, &ctx); + PADDLE_ENFORCE(resp.Parse(iobuf, meta) == 0, "parse iobuf to tensor error!"); + *var = resp.GetVar(); + *trainer_id = resp.GetTrainerId(); +} + +} // namespace distributed +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/distributed/brpc_sendrecvop_utils.h b/paddle/fluid/operators/distributed/brpc_sendrecvop_utils.h new file mode 100644 index 0000000000000000000000000000000000000000..ffaf44222422882d6c9a3cb8efbd335e8072fd49 --- /dev/null +++ b/paddle/fluid/operators/distributed/brpc_sendrecvop_utils.h @@ -0,0 +1,49 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include +#include +#include +#include + +#include "brpc/channel.h" +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/framework/var_type.h" +#include "paddle/fluid/operators/distributed/send_recv.pb.h" +#include "paddle/fluid/operators/distributed/sendrecvop_utils.h" + +namespace paddle { +namespace operators { +namespace distributed { + +void SerializeToIOBuf(const std::string& name, framework::Variable* var, + const platform::DeviceContext& ctx, VarMsg* request, + butil::IOBuf* iobuf, const std::string& out_varname, + bool var_is_not_stable, const int trainer_id = 0, + const std::string& table_name = std::string()); + +void DeserializeFromIOBuf(const VarMsg& meta, const butil::IOBuf& iobuf, + const platform::DeviceContext& ctx, + const framework::Scope* scope, + framework::Variable** var, int* trainer_id); + +} // namespace distributed +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/distributed/brpc_serde_test.cc b/paddle/fluid/operators/distributed/brpc_serde_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..2a2dc72150a3295205724cb04b4a246efe32d5e8 --- /dev/null +++ b/paddle/fluid/operators/distributed/brpc_serde_test.cc @@ -0,0 +1,175 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include // NOLINT + +#include "brpc/channel.h" +#include "google/protobuf/text_format.h" +#include "gtest/gtest.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/framework/variable.h" +#include "paddle/fluid/operators/distributed/brpc_sendrecvop_utils.h" +#include "paddle/fluid/operators/distributed/brpc_variable_response.h" +#include "paddle/fluid/operators/distributed/sendrecvop_utils.h" +#include "paddle/fluid/operators/distributed/variable_response.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/fluid/string/printf.h" + +namespace framework = paddle::framework; +namespace platform = paddle::platform; +namespace operators = paddle::operators; +namespace math = paddle::operators::math; +namespace memory = paddle::memory; + +void RunSerdeTestSelectedRows(platform::Place place) { + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto& ctx = *pool.Get(place); + + butil::IOBuf iobuf; + sendrecv::VariableMessage msg; + int tensor_numel = 564 * 128; + + // serialize var to IOBuf + { + framework::Variable var; + auto* slr = var.GetMutable(); + slr->set_height(1000); + auto* tensor = slr->mutable_value(); + auto* rows = slr->mutable_rows(); + tensor->Resize(framework::make_ddim({564, 128})); + tensor->mutable_data(place); + math::set_constant(ctx, tensor, 32.7); + for (int i = 0; i < 564; ++i) rows->push_back(i); + + operators::distributed::SerializeToIOBuf("myvar", &var, ctx, &msg, &iobuf, + "", false); + } + + // desrialize + { + framework::Scope scope; + scope.Var("myvar"); + operators::distributed::BRPCVariableResponse resp(&scope, &ctx); + EXPECT_EQ(resp.Parse(iobuf, msg), 0); + + framework::Variable* var2 = resp.GetVar(); + + auto* slr2 = var2->GetMutable(); + auto* tensor2 = slr2->mutable_value(); + auto* rows2 = slr2->mutable_rows(); + float* tensor_data2 = nullptr; + framework::Tensor tmp_tensor; + + if (platform::is_gpu_place(ctx.GetPlace())) { + platform::CPUPlace cpu; + framework::TensorCopy(*tensor2, cpu, &tmp_tensor); + tensor_data2 = tmp_tensor.data(); + } else { + tensor_data2 = const_cast(tensor2->data()); + } + const int64_t* rows_data2 = rows2->data(); + + for (int i = 0; i < tensor_numel; ++i) { + EXPECT_FLOAT_EQ(tensor_data2[i], 32.7); + } + for (size_t i = 0; i < rows2->size(); ++i) { + EXPECT_EQ(rows_data2[i], static_cast(i)); + } + EXPECT_EQ(slr2->height(), 1000); + } +} + +void RunTestLodTensor(platform::Place place) { + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto& ctx = *pool.Get(place); + + // serialize var to ByteBuffer + butil::IOBuf iobuf; + sendrecv::VariableMessage msg; + int tensor_numel = 512 * 8 * 4 * 2; + { + framework::Variable var; + auto* tensor = var.GetMutable(); + tensor->Resize(framework::make_ddim({512, 8, 4, 2})); + framework::LoD lod; + lod.push_back(framework::Vector({1, 3, 8})); + tensor->set_lod(lod); + tensor->mutable_data(place); + math::set_constant(ctx, tensor, 31.9); + + operators::distributed::SerializeToIOBuf("myvar", &var, ctx, &msg, &iobuf, + "", false); + } + + // check sendrecv::VariableMessage meta data + { + EXPECT_EQ(msg.varname(), "myvar"); + EXPECT_EQ(msg.type(), 0); + EXPECT_EQ(msg.dims()[0], 512); + EXPECT_EQ(msg.dims()[1], 8); + EXPECT_EQ(msg.dims()[2], 4); + EXPECT_EQ(msg.dims()[3], 2); + EXPECT_EQ(msg.lod_level(), 1); + EXPECT_EQ(msg.lod(0).lod_data(0), 1); + 
EXPECT_EQ(msg.lod(0).lod_data(1), 3); + EXPECT_EQ(msg.lod(0).lod_data(2), 8); + } + + // deserialize + { + framework::Scope scope; + scope.Var("myvar"); + operators::distributed::BRPCVariableResponse resp(&scope, &ctx); + EXPECT_EQ(resp.Parse(iobuf, msg), 0); + + framework::Variable* var2 = resp.GetVar(); + + auto tensor2 = var2->Get(); + float* tensor_data2 = nullptr; + framework::Tensor tmp_tensor; + + if (platform::is_gpu_place(ctx.GetPlace())) { + platform::CPUPlace cpu; + framework::TensorCopy(tensor2, cpu, &tmp_tensor); + tensor_data2 = tmp_tensor.data(); + } else { + tensor_data2 = const_cast(tensor2.data()); + } + + for (int i = 0; i < tensor_numel; ++i) + EXPECT_FLOAT_EQ(tensor_data2[i], 31.9); + } +} + +TEST(LodTensor, Run) { + platform::CPUPlace place; + RunTestLodTensor(place); +#ifdef PADDLE_WITH_CUDA + platform::CUDAPlace gpu(0); + RunTestLodTensor(gpu); +#endif +} + +TEST(SelectedRows, Run) { + platform::CPUPlace place; + RunSerdeTestSelectedRows(place); +#ifdef PADDLE_WITH_CUDA + platform::CUDAPlace gpu; + RunSerdeTestSelectedRows(gpu); +#endif +} diff --git a/paddle/fluid/operators/distributed/brpc_server.cc b/paddle/fluid/operators/distributed/brpc_server.cc index 862167f02084cfe81db1c0936bbfb0415fa85721..78d41aeac50a31d13fb3304909e7aed07f6a852a 100644 --- a/paddle/fluid/operators/distributed/brpc_server.cc +++ b/paddle/fluid/operators/distributed/brpc_server.cc @@ -13,84 +13,287 @@ // limitations under the License. #include "paddle/fluid/operators/distributed/brpc_server.h" +#include "paddle/fluid/framework/threadpool.h" +#include "paddle/fluid/operators/distributed/brpc_sendrecvop_utils.h" +#include "paddle/fluid/operators/distributed/brpc_variable_response.h" #include "paddle/fluid/operators/distributed/request_handler.h" namespace sendrecv { -typedef std::unordered_map +namespace distributed = paddle::operators::distributed; + +typedef std::unordered_map HandlerMap; class BRPCServiceImpl : public SendRecvService { public: - explicit BRPCServiceImpl(const HandlerMap& rpc_call_map) - : request_send_h_(nullptr), - request_get_h_(nullptr), - request_prefetch_h_(nullptr) { - auto it = rpc_call_map.find(paddle::operators::distributed::kRequestSend); + explicit BRPCServiceImpl(const HandlerMap& rpc_call_map, + distributed::RPCServer* rpc_server) + : rpc_server_(rpc_server) { + VLOG(3) << "BRPCServiceImpl size: " << rpc_call_map.size(); + auto it = rpc_call_map.find(distributed::kRequestSend); if (it != rpc_call_map.end()) { request_send_h_ = it->second; + send_threads_.reset(new paddle::framework::ThreadPool( + rpc_server_->GetThreadNum(distributed::kRequestSend))); } - it = rpc_call_map.find(paddle::operators::distributed::kRequestSend); + it = rpc_call_map.find(distributed::kRequestGet); if (it != rpc_call_map.end()) { request_get_h_ = it->second; + get_threads_.reset(new paddle::framework::ThreadPool( + rpc_server_->GetThreadNum(distributed::kRequestGet))); } - it = rpc_call_map.find(paddle::operators::distributed::kRequestPrefetch); + it = rpc_call_map.find(distributed::kRequestPrefetch); if (it != rpc_call_map.end()) { request_prefetch_h_ = it->second; + prefetch_threads_.reset(new paddle::framework::ThreadPool( + rpc_server_->GetThreadNum(distributed::kRequestPrefetch))); + } + + it = rpc_call_map.find(distributed::kRequestCheckpoint); + if (it != rpc_call_map.end()) { + request_checkpoint_h_ = it->second; + checkpoint_notify_threads_.reset(new paddle::framework::ThreadPool( + rpc_server_->GetThreadNum(distributed::kRequestPrefetch))); + } + + it = 
rpc_call_map.find(distributed::kRequestGetMonomerVariable); + if (it != rpc_call_map.end()) { + request_get_monomer_handler_h_ = it->second; + } + + it = rpc_call_map.find(distributed::kRequestGetMonomerBarrier); + if (it != rpc_call_map.end()) { + request_get_monomer_barrier_handler_h_ = it->second; } } virtual ~BRPCServiceImpl() {} - void SendVariable(google::protobuf::RpcController* cntl_butil, const VariableMessage* request, VoidMessage* response, google::protobuf::Closure* done) override { + send_threads_->Run( + [=] { _SendVariable(cntl_butil, request, response, done); }); + } + + void _SendVariable(google::protobuf::RpcController* cntl_butil, + const VariableMessage* request, VoidMessage* response, + google::protobuf::Closure* done) { PADDLE_ENFORCE(request_send_h_ != nullptr, "RequestSend handler should be registed first!"); brpc::ClosureGuard done_guard(done); - - paddle::framework::Scope* local_scope = request_send_h_->scope(); - paddle::framework::Variable* outvar = nullptr; - paddle::framework::Variable* invar = nullptr; + brpc::Controller* cntl = static_cast(cntl_butil); std::string varname = request->varname(); + VLOG(3) << "RequestSend var_name:" << varname + << ", trainer_id:" << request->trainer_id() + << ", from:" << cntl->remote_side(); - if (!request_send_h_->sync_mode()) { - local_scope = &request_send_h_->scope()->NewScope(); - invar = local_scope->Var(varname); - } else { - invar = local_scope->FindVar(varname); - } + distributed::BRPCVariableResponse resp(request_send_h_->scope(), + request_send_h_->dev_ctx(), + !request_send_h_->sync_mode()); + PADDLE_ENFORCE(resp.Parse(cntl->request_attachment(), *request) == 0, + "parse iobuf to tensor error!"); - request_send_h_->Handle(varname, local_scope, invar, &outvar); + auto scope = resp.GetMutableLocalScope(); + auto invar = resp.GetVar(); + int trainer_id = request->trainer_id(); + paddle::framework::Variable* outvar = nullptr; - if (!request_send_h_->sync_mode()) { - request_send_h_->scope()->DeleteScope(local_scope); - } + request_send_h_->Handle(varname, scope, invar, &outvar, trainer_id); } void GetVariable(google::protobuf::RpcController* cntl_butil, const VariableMessage* request, VariableMessage* response, google::protobuf::Closure* done) override { + get_threads_->Run( + [=] { _GetVariable(cntl_butil, request, response, done); }); + } + + void _GetVariable(google::protobuf::RpcController* cntl_butil, + const VariableMessage* request, VariableMessage* response, + google::protobuf::Closure* done) { PADDLE_ENFORCE(request_get_h_ != nullptr, "RequestGet handler should be registed first!"); - } + brpc::ClosureGuard done_guard(done); + brpc::Controller* cntl = static_cast(cntl_butil); + + std::string varname = request->varname(); + VLOG(3) << "RequestGet varname:" << varname + << ", trainer_id:" << request->trainer_id() + << ", from:" << cntl->remote_side(); + + auto scope = request_get_h_->scope(); + auto invar = scope->FindVar(varname); + int trainer_id = request->trainer_id(); + paddle::framework::Variable* outvar = nullptr; + + request_get_h_->Handle(varname, scope, invar, &outvar, trainer_id); + + if (outvar) { + distributed::SerializeToIOBuf(varname, outvar, *request_get_h_->dev_ctx(), + response, &cntl->response_attachment(), "", + false); + } + } void PrefetchVariable(google::protobuf::RpcController* cntl_butil, const VariableMessage* request, VariableMessage* response, google::protobuf::Closure* done) override { + prefetch_threads_->Run( + [=] { _PrefetchVariable(cntl_butil, request, response, done); }); + } 
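
The service methods above all follow the same dispatch shape: the brpc entry point hands the request to a per-request-type `paddle::framework::ThreadPool` and returns at once, while the worker takes the `brpc::ClosureGuard`, so the response is only sent after the handler has finished. Below is a condensed sketch of that shape with a made-up handler name, assuming the surrounding `BRPCServiceImpl` members; it is illustrative only and not a method added by this patch.

```
void HandleOnPool(google::protobuf::RpcController* cntl_butil,
                  const sendrecv::VariableMessage* request,
                  sendrecv::VoidMessage* response,
                  google::protobuf::Closure* done) {
  send_threads_->Run([=] {
    // Owning the guard here (not in the dispatcher) keeps brpc's IO thread
    // free while still guaranteeing `done` runs after the handler completes.
    brpc::ClosureGuard done_guard(done);
    auto* cntl = static_cast<brpc::Controller*>(cntl_butil);
    VLOG(3) << "handle " << request->varname()
            << " from " << cntl->remote_side();
    // ... parse cntl->request_attachment(), call the RequestHandler,
    //     and fill `response` before the guard fires ...
  });
}
```
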
+ + void _PrefetchVariable(google::protobuf::RpcController* cntl_butil, + const VariableMessage* request, + VariableMessage* response, + google::protobuf::Closure* done) { PADDLE_ENFORCE(request_prefetch_h_ != nullptr, "kRequestPrefetch handler should be registed first!"); + + brpc::ClosureGuard done_guard(done); + brpc::Controller* cntl = static_cast(cntl_butil); + + // prefetch process... + std::string in_var_name = request->varname(); + std::string out_var_name = request->out_varname(); + VLOG(3) << "RequestPrefetch, in_var_name: " << in_var_name + << ", out_var_name: " << out_var_name + << ", trainer_id:" << request->trainer_id() + << ", from:" << cntl->remote_side(); + + distributed::BRPCVariableResponse resp( + request_prefetch_h_->scope(), request_prefetch_h_->dev_ctx(), true); + + PADDLE_ENFORCE(resp.Parse(cntl->request_attachment(), *request) == 0, + "parse iobuf to tensor error!"); + + auto scope = resp.GetMutableLocalScope(); + auto invar = scope->FindVar(in_var_name); + std::string table_name = request->table_name(); + int trainer_id = request->trainer_id(); + paddle::framework::Variable* outvar = scope->Var(out_var_name); + + request_prefetch_h_->Handle(in_var_name, scope, invar, &outvar, trainer_id, + out_var_name, table_name); + + distributed::SerializeToIOBuf(out_var_name, outvar, + *request_prefetch_h_->dev_ctx(), response, + &cntl->response_attachment(), "", true); + } + + void CheckpointNotify(google::protobuf::RpcController* cntl_butil, + const VariableMessage* request, VoidMessage* response, + google::protobuf::Closure* done) override { + checkpoint_notify_threads_->Run( + [=] { _CheckpointNotify(cntl_butil, request, response, done); }); + } + + void _CheckpointNotify(google::protobuf::RpcController* cntl_butil, + const VariableMessage* request, VoidMessage* response, + google::protobuf::Closure* done) { + PADDLE_ENFORCE( + request_checkpoint_h_ != nullptr, + "kRequestCheckpointNotify handler should be registed first!"); + + brpc::ClosureGuard done_guard(done); + brpc::Controller* cntl = static_cast(cntl_butil); + + distributed::BRPCVariableResponse resp(request_checkpoint_h_->scope(), + request_checkpoint_h_->dev_ctx()); + + auto scope = resp.GetMutableLocalScope(); + + std::string checkpoint_notify = request->varname(); + std::string checkpoint_dir = request->out_varname(); + int trainer_id = request->trainer_id(); + + VLOG(4) << "RequestCheckpointNotify notify: " << checkpoint_notify + << ", dir: " << checkpoint_dir + << ", trainer_id:" << request->trainer_id() + << ", from:" << cntl->remote_side(); + + request_checkpoint_h_->Handle(checkpoint_notify, scope, nullptr, nullptr, + trainer_id, checkpoint_dir); + } + + void GetMonomerVariable(google::protobuf::RpcController* cntl_butil, + const VariableMessage* request, + VariableMessage* response, + google::protobuf::Closure* done) override { + PADDLE_ENFORCE( + request_get_monomer_handler_h_ != nullptr, + "kRequestGetMonomerVariable handler should be registed first!"); + + brpc::ClosureGuard done_guard(done); + brpc::Controller* cntl = static_cast(cntl_butil); + + // proc request. 
+ std::string varname = request->varname(); + VLOG(3) << "GetMonomerVariable " << varname + << ", trainer_id:" << request->trainer_id() + << ", from:" << cntl->remote_side(); + + rpc_server_->WaitVarCond(varname); + distributed::MonomerHandle h = rpc_server_->GetMonomer(varname); + + auto scope = h.scope_; + auto invar = scope->FindVar(varname); + paddle::framework::Variable* outvar = nullptr; + + request_get_monomer_handler_h_->Handle(varname, scope, invar, &outvar, + request->trainer_id()); + + if (outvar) { + distributed::SerializeToIOBuf(varname, outvar, *h.dev_ctx_, response, + &cntl->response_attachment(), "", false); + } + } + + void GetMonomerBarrier(google::protobuf::RpcController* cntl_butil, + const VariableMessage* request, VoidMessage* response, + google::protobuf::Closure* done) override { + PADDLE_ENFORCE( + request_get_monomer_barrier_handler_h_ != nullptr, + "RequestGetMonomerBarrier handler should be registed first!"); + + brpc::ClosureGuard done_guard(done); + brpc::Controller* cntl = static_cast(cntl_butil); + + std::string varname = request->varname(); + VLOG(3) << "RequestGetMonomerBarrier var_name:" << varname + << ", trainer_id:" << request->trainer_id() + << ", from:" << cntl->remote_side(); + + rpc_server_->WaitVarCond(varname); + distributed::MonomerHandle h = rpc_server_->GetMonomer(varname); + + paddle::framework::Scope* scope = nullptr; + paddle::framework::Variable* invar = nullptr; + paddle::framework::Variable* outvar = nullptr; + + request_get_monomer_barrier_handler_h_->Handle( + varname, scope, invar, &outvar, request->trainer_id()); } private: - paddle::operators::distributed::RequestHandler* request_send_h_; - paddle::operators::distributed::RequestHandler* request_get_h_; - paddle::operators::distributed::RequestHandler* request_prefetch_h_; + distributed::RequestHandler* request_send_h_{nullptr}; + distributed::RequestHandler* request_get_h_{nullptr}; + distributed::RequestHandler* request_prefetch_h_{nullptr}; + distributed::RequestHandler* request_checkpoint_h_{nullptr}; + distributed::RequestHandler* request_get_monomer_handler_h_{nullptr}; + distributed::RequestHandler* request_get_monomer_barrier_handler_h_{nullptr}; + + distributed::RPCServer* rpc_server_{nullptr}; + + // FIXME(gongwb): brpc should support process one rpce use one threadpool. + std::unique_ptr send_threads_; + std::unique_ptr get_threads_; + std::unique_ptr prefetch_threads_; + std::unique_ptr checkpoint_notify_threads_; }; } // namespace sendrecv @@ -100,7 +303,7 @@ namespace distributed { void AsyncBRPCServer::StartServer() { // Instance of your service. - sendrecv::BRPCServiceImpl service_impl(rpc_call_map_); + sendrecv::BRPCServiceImpl service_impl(rpc_call_map_, this); // Add the service into server. 
Notice the second parameter, because the // service is put on stack, we don't want server to delete it, otherwise @@ -111,6 +314,9 @@ void AsyncBRPCServer::StartServer() { } brpc::ServerOptions options; +#ifdef PADDLE_WITH_BRPC_RDMA + options.use_rdma = true; +#endif options.idle_timeout_sec = idle_timeout_s_; options.max_concurrency = max_concurrency_; if (server_.Start(bind_address_.c_str(), &options) != 0) { diff --git a/paddle/fluid/operators/distributed/brpc_variable_response.cc b/paddle/fluid/operators/distributed/brpc_variable_response.cc new file mode 100644 index 0000000000000000000000000000000000000000..75306d72334abc98f1527d4557509bf2a1f8a9bb --- /dev/null +++ b/paddle/fluid/operators/distributed/brpc_variable_response.cc @@ -0,0 +1,73 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#include "paddle/fluid/operators/distributed/brpc_variable_response.h" +#include "paddle/fluid/operators/distributed/send_recv.pb.h" + +namespace paddle { +namespace operators { +namespace distributed { + +namespace pb = ::google::protobuf; +using vr = ::sendrecv::VariableMessage; + +int BRPCVariableResponse::Parse(Source* source) { + pb::io::ZeroCopyInputStream* input_stream = source->contents(); + pb::io::CodedInputStream input(input_stream); + input.SetTotalBytesLimit(INT_MAX, INT_MAX); + + while (1) { + unsigned int tag = 0; + if (!input.ReadLittleEndian32(&tag)) { + break; + } + + uint64_t num_bytes = 0; + if (!input.ReadLittleEndian64(&num_bytes)) { + break; + } + + int field = static_cast(tag); + int ret = field == 0 ? -1 : field; + switch (field) { + case vr::kSerializedFieldNumber: { + if (!ProcSerializedField(field, &input, num_bytes)) { + return ret; + } + break; + } + case vr::kRowsFieldNumber: { + PADDLE_ENFORCE((meta_.type() == sendrecv::SELECTED_ROWS || + meta_.type() == sendrecv::LOD_TENSOR) && + meta_.varname() != "", + "meta info should be got first!"); + + if (!CopySelectRowsData(&input, *dev_ctx_, num_bytes)) { + return ret; + } + break; + } + default: { + PADDLE_ENFORCE(false, "not surpported %u fieldnumber", field); + return ret; + } + } + } + + return 0; +} +} // namespace distributed +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/distributed/brpc_variable_response.h b/paddle/fluid/operators/distributed/brpc_variable_response.h new file mode 100644 index 0000000000000000000000000000000000000000..b0b91a42a01c79bc76aa19c8745ae8f7d3e9a297 --- /dev/null +++ b/paddle/fluid/operators/distributed/brpc_variable_response.h @@ -0,0 +1,67 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
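
The `BRPCVariableResponse::Parse` method added in brpc_variable_response.cc above walks the IOBuf attachment as framed fields: a little-endian 32-bit field number, then a little-endian 64-bit byte count, then that many payload bytes, dispatching on the field number (serialized tensor bytes vs. selected-rows data). The sketch below reproduces only that framing with protobuf's `CodedOutputStream`/`CodedInputStream`; the field numbers and payload strings are made up for illustration, and no Paddle tensor deserialization is attempted.

```
// Standalone sketch of the length-prefixed framing walked by the parse loop
// above: [field: u32 LE][num_bytes: u64 LE][payload]. Field numbers and
// payloads here are invented for illustration.
#include <cstdint>
#include <iostream>
#include <string>

#include "google/protobuf/io/coded_stream.h"
#include "google/protobuf/io/zero_copy_stream_impl_lite.h"

namespace pbio = ::google::protobuf::io;

// Append one framed record to *out.
void AppendRecord(uint32_t field, const std::string& payload,
                  std::string* out) {
  pbio::StringOutputStream raw(out);
  pbio::CodedOutputStream coded(&raw);
  coded.WriteLittleEndian32(field);
  coded.WriteLittleEndian64(static_cast<uint64_t>(payload.size()));
  coded.WriteRaw(payload.data(), static_cast<int>(payload.size()));
}

// Mirror the while-loop structure of Parse: read field, read length, consume.
int ParseRecords(const std::string& buf) {
  pbio::ArrayInputStream raw(buf.data(), static_cast<int>(buf.size()));
  pbio::CodedInputStream input(&raw);
  while (true) {
    uint32_t field = 0;
    if (!input.ReadLittleEndian32(&field)) break;  // clean end of stream
    uint64_t num_bytes = 0;
    if (!input.ReadLittleEndian64(&num_bytes)) return -1;  // truncated header
    std::string payload;
    if (!input.ReadString(&payload, static_cast<int>(num_bytes))) return -1;
    std::cout << "field " << field << ": " << payload << "\n";
  }
  return 0;
}

int main() {
  std::string buf;
  AppendRecord(1, "serialized-tensor-bytes", &buf);
  AppendRecord(2, "selected-rows-index-bytes", &buf);
  return ParseRecords(buf);
}
```

The explicit byte count on every field is what lets `ProcSerializedField` and `CopySelectRowsData` in the real parser consume the payload straight from the stream instead of re-parsing the whole message.
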
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +#include "brpc/channel.h" +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/var_type.h" + +#include "paddle/fluid/operators/distributed/send_recv.pb.h" + +#include "google/protobuf/io/coded_stream.h" +#include "google/protobuf/io/zero_copy_stream.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/operators/distributed/variable_response.h" + +namespace paddle { +namespace operators { +namespace distributed { + +class BRPCSourceWrapper : public Source { + public: + explicit BRPCSourceWrapper(const butil::IOBuf& iobuf) : source_(iobuf) {} + ::google::protobuf::io::ZeroCopyInputStream* contents() override { + return &source_; + } + + private: + butil::IOBufAsZeroCopyInputStream source_; +}; + +class BRPCVariableResponse : public VariableResponse { + public: + BRPCVariableResponse(const framework::Scope* scope, + const platform::DeviceContext* dev_ctx, + bool create_scope = false) + : VariableResponse(scope, dev_ctx, create_scope) {} + + virtual ~BRPCVariableResponse() {} + + // parse attachment from iobuf + int Parse(Source* source) override; + int Parse(const butil::IOBuf& iobuf, const sendrecv::VariableMessage& meta) { + BRPCSourceWrapper wrapper(iobuf); + return VariableResponse::Parse(&wrapper, meta); + } +}; + +}; // namespace distributed +}; // namespace operators +}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/grpc_client.cc b/paddle/fluid/operators/distributed/grpc_client.cc index f14dfcdb238a9580affde96e4d5a0093743eb6c8..78956c9ea4942098002dd30e6f2f471ae49ab8d1 100644 --- a/paddle/fluid/operators/distributed/grpc_client.cc +++ b/paddle/fluid/operators/distributed/grpc_client.cc @@ -293,8 +293,7 @@ VarHandlePtr GRPCClient::AsyncGetMonomerBarrier(const std::string& ep, const auto ch = GetChannel(ep); BatchBarrierProcessor* s = new BatchBarrierProcessor(ch); const std::string method = "SendMonomerFetchBarrierRPC"; - VarHandlePtr h( - new VarHandle(ep, method, FETCH_BARRIER_MESSAGE, nullptr, nullptr)); + VarHandlePtr h(new VarHandle(ep, method, var_name, nullptr, nullptr)); s->Prepare(h, time_out); VLOG(30) << s->GetVarHandlePtr()->String() << " begin"; diff --git a/paddle/fluid/operators/distributed/grpc_serde.cc b/paddle/fluid/operators/distributed/grpc_serde.cc index 94bf0a113be62613e081b7bf05763e48a0a6124c..299dfe35438c35ec922dcdc75bf774a00f111bbb 100644 --- a/paddle/fluid/operators/distributed/grpc_serde.cc +++ b/paddle/fluid/operators/distributed/grpc_serde.cc @@ -32,13 +32,6 @@ namespace paddle { namespace operators { namespace distributed { -static void SerializeDestroyCallback(void* payload) { - if (payload != nullptr) { - auto* shared_payload = reinterpret_cast(payload); - delete shared_payload; - } -} - void SerializeToByteBuffer(const std::string& name, framework::Variable* var, const platform::DeviceContext& ctx, ::grpc::ByteBuffer* msg, const std::string& out_name, diff --git 
a/paddle/fluid/operators/distributed/rpc_server.h b/paddle/fluid/operators/distributed/rpc_server.h index 45d1d3479ce731894c26bbff40f456bbfdc13d44..8c7b7f1d7eeeced24d2ade2bcff4261b24587624 100644 --- a/paddle/fluid/operators/distributed/rpc_server.h +++ b/paddle/fluid/operators/distributed/rpc_server.h @@ -75,6 +75,10 @@ class RPCServer { void RegisterRPC(const std::string& rpc_name, RequestHandler* handler, int thread_num = 5); + int GetThreadNum(const std::string& rpc_name) { + return rpc_thread_num_[rpc_name]; + } + // Wait util all the clients have reached the barrier for one // rpc method. This function should be called in the // RequestHandler if you want to run the server/client in a diff --git a/paddle/fluid/operators/distributed/sendrecvop_utils.cc b/paddle/fluid/operators/distributed/sendrecvop_utils.cc index 5fd42e884ab4704a1bf63ae7e2b12d7c11d585e8..25e2f77fb74f273c1cc5263046202b3fbf1084a3 100644 --- a/paddle/fluid/operators/distributed/sendrecvop_utils.cc +++ b/paddle/fluid/operators/distributed/sendrecvop_utils.cc @@ -18,6 +18,7 @@ limitations under the License. */ #include // NOLINT #include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/operators/distributed/brpc_rdma_pool.h" #include "paddle/fluid/operators/distributed/sendrecvop_utils.h" #include "paddle/fluid/operators/distributed/variable_response.h" #include "paddle/fluid/platform/port.h" @@ -45,7 +46,6 @@ static TensorPayload GetCommunicationAllocationFromTensor( memory::Copy(cuda_pinned, result->ptr(), boost::get(tensor.place()), tensor.data(), copy_size, gpu_dev_ctx.stream()); - ctx.Wait(); return TensorPayload(result); #else diff --git a/paddle/fluid/operators/distributed/sendrecvop_utils.h b/paddle/fluid/operators/distributed/sendrecvop_utils.h index 710c83916601756aa81d98c2a829be348ebd2daa..33eded0e6c0d90dadeeb63594983224d795fa244 100644 --- a/paddle/fluid/operators/distributed/sendrecvop_utils.h +++ b/paddle/fluid/operators/distributed/sendrecvop_utils.h @@ -50,6 +50,13 @@ class TensorPayload final { size_t memory_size_; }; +inline void SerializeDestroyCallback(void* payload) { + if (payload != nullptr) { + auto* shared_payload = reinterpret_cast(payload); + delete shared_payload; + } +} + TensorPayload GetTensorPayload(framework::Variable* var, const platform::DeviceContext& ctx, VarMsg* request); diff --git a/paddle/fluid/operators/distributed_ops/CMakeLists.txt b/paddle/fluid/operators/distributed_ops/CMakeLists.txt index 28bb90af5675b2fe14813675ad001c0cf1d71e12..3c0b7ff24f9cf0b128d32aa185e5e3cb47d9c4b9 100644 --- a/paddle/fluid/operators/distributed_ops/CMakeLists.txt +++ b/paddle/fluid/operators/distributed_ops/CMakeLists.txt @@ -2,9 +2,9 @@ include(operators) set(DISTRIBUTE_DEPS "") if(WITH_GRPC) - set(DISTRIBUTE_DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf node) + set(DISTRIBUTE_DEPS sendrecvop_rpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf node) else() - set(DISTRIBUTE_DEPS sendrecvop_brpc brpc leveldb snappystream snappy protobuf ssl crypto zlib node) + set(DISTRIBUTE_DEPS sendrecvop_rpc brpc leveldb snappystream snappy protobuf ssl crypto zlib node) if(WITH_BRPC_RDMA) find_library(IBVERBS_LIBRARY NAMES ibverbs) ADD_LIBRARY(ibverbs SHARED IMPORTED GLOBAL) diff --git a/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc b/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc index ab92ad4506d26020963b90d95cab5b5bc1d38ceb..20870ea07ebf673f1632230f9346c36b264eba30 100644 --- a/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc 
+++ b/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc @@ -26,10 +26,11 @@ limitations under the License. */ #include "paddle/fluid/operators/distributed/request_handler_impl.h" #include "paddle/fluid/operators/distributed_ops/listen_and_serv_op.h" +#include "paddle/fluid/platform/profiler.h" -DEFINE_int32(rpc_send_thread_num, 5, "number of threads for rpc send"); -DEFINE_int32(rpc_get_thread_num, 5, "number of threads for rpc get"); -DEFINE_int32(rpc_prefetch_thread_num, 5, "number of threads for rpc prefetch"); +DEFINE_int32(rpc_send_thread_num, 12, "number of threads for rpc send"); +DEFINE_int32(rpc_get_thread_num, 12, "number of threads for rpc get"); +DEFINE_int32(rpc_prefetch_thread_num, 12, "number of threads for rpc prefetch"); namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/distributed_ops/send_op.cc b/paddle/fluid/operators/distributed_ops/send_op.cc index 58a3ca827221a626da8657640e039552fb91d56c..0bf4bebbc90028246acff940be7436fa5ba08c6c 100644 --- a/paddle/fluid/operators/distributed_ops/send_op.cc +++ b/paddle/fluid/operators/distributed_ops/send_op.cc @@ -58,7 +58,9 @@ class SendOp : public framework::OperatorBase { } if (sync_send) { for (size_t i = 0; i < rets.size(); i++) { + VLOG(7) << "before sync_send " << ins[i] << "from " << epmap[i]; PADDLE_ENFORCE(rets[i]->Wait(), "internal error in RPCClient"); + VLOG(7) << "after sync_send " << ins[i] << "from " << epmap[i]; } } } diff --git a/paddle/fluid/operators/math/pooling.cc b/paddle/fluid/operators/math/pooling.cc index 8df43bb616179e2487534e0acabb71b09b87e1af..30873e9f87f22fa5b39cbf519760a9ec3979f98b 100644 --- a/paddle/fluid/operators/math/pooling.cc +++ b/paddle/fluid/operators/math/pooling.cc @@ -31,7 +31,7 @@ class Pool2dFunctor { const framework::Tensor& input, const std::vector& ksize, const std::vector& strides, const std::vector& paddings, PoolProcess pool_process, - bool exclusive, framework::Tensor* output) { + bool exclusive, bool adaptive, framework::Tensor* output) { const int batch_size = input.dims()[0]; const int input_height = input.dims()[2]; const int input_width = input.dims()[3]; @@ -51,16 +51,28 @@ class Pool2dFunctor { const T* input_data = input.data(); T* output_data = output->mutable_data(context.GetPlace()); + int hstart, hend; + int wstart, wend; for (int i = 0; i < batch_size; i++) { for (int c = 0; c < output_channels; ++c) { for (int ph = 0; ph < output_height; ++ph) { - int hstart = ph * stride_height - padding_height; - int hend = std::min(hstart + ksize_height, input_height); - hstart = std::max(hstart, 0); + if (adaptive) { + hstart = AdaptStartIndex(ph, input_height, output_height); + hend = AdaptEndIndex(ph, input_height, output_height); + } else { + hstart = ph * stride_height - padding_height; + hend = std::min(hstart + ksize_height, input_height); + hstart = std::max(hstart, 0); + } for (int pw = 0; pw < output_width; ++pw) { - int wstart = pw * stride_width - padding_width; - int wend = std::min(wstart + ksize_width, input_width); - wstart = std::max(wstart, 0); + if (adaptive) { + wstart = AdaptStartIndex(pw, input_width, output_width); + wend = AdaptEndIndex(pw, input_width, output_width); + } else { + wstart = pw * stride_width - padding_width; + wend = std::min(wstart + ksize_width, input_width); + wstart = std::max(wstart, 0); + } T ele = pool_process.initial(); for (int h = hstart; h < hend; ++h) { @@ -68,8 +80,9 @@ class Pool2dFunctor { pool_process.compute(input_data[h * input_width + w], &ele); } } - int pool_size = exclusive ? 
(hend - hstart) * (wend - wstart) - : ksize_height * ksize_width; + int pool_size = (exclusive || adaptive) + ? (hend - hstart) * (wend - wstart) + : ksize_height * ksize_width; pool_process.finalize(static_cast(pool_size), &ele); output_data[ph * output_width + pw] = ele; } @@ -94,7 +107,7 @@ class Pool2dGradFunctor { const framework::Tensor& output, const framework::Tensor& output_grad, const std::vector& ksize, const std::vector& strides, const std::vector& paddings, PoolProcess pool_grad_process, - bool exclusive, framework::Tensor* input_grad) { + bool exclusive, bool adaptive, framework::Tensor* input_grad) { const int batch_size = input.dims()[0]; const int input_height = input.dims()[2]; const int input_width = input.dims()[3]; @@ -115,18 +128,31 @@ class Pool2dGradFunctor { const T* output_grad_data = output_grad.data(); T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + int hstart, hend; + int wstart, wend; for (int i = 0; i < batch_size; i++) { for (int c = 0; c < output_channels; ++c) { for (int ph = 0; ph < output_height; ++ph) { - int hstart = ph * stride_height - padding_height; - int hend = std::min(hstart + ksize_height, input_height); - hstart = std::max(hstart, 0); + if (adaptive) { + hstart = AdaptStartIndex(ph, input_height, output_height); + hend = AdaptEndIndex(ph, input_height, output_height); + } else { + hstart = ph * stride_height - padding_height; + hend = std::min(hstart + ksize_height, input_height); + hstart = std::max(hstart, 0); + } for (int pw = 0; pw < output_width; ++pw) { - int wstart = pw * stride_width - padding_width; - int wend = std::min(wstart + ksize_width, input_width); - wstart = std::max(wstart, 0); - int pool_size = exclusive ? (hend - hstart) * (wend - wstart) - : ksize_height * ksize_width; + if (adaptive) { + wstart = AdaptStartIndex(pw, input_width, output_width); + wend = AdaptEndIndex(pw, input_width, output_width); + } else { + wstart = pw * stride_width - padding_width; + wend = std::min(wstart + ksize_width, input_width); + wstart = std::max(wstart, 0); + } + int pool_size = (exclusive || adaptive) + ? 
(hend - hstart) * (wend - wstart) + : ksize_height * ksize_width; float scale = 1.0 / pool_size; for (int h = hstart; h < hend; ++h) { for (int w = wstart; w < wend; ++w) { @@ -251,7 +277,7 @@ class Pool3dFunctor { const framework::Tensor& input, const std::vector& ksize, const std::vector& strides, const std::vector& paddings, PoolProcess pool_process, - bool exclusive, framework::Tensor* output) { + bool exclusive, bool adaptive, framework::Tensor* output) { const int batch_size = input.dims()[0]; const int input_depth = input.dims()[2]; const int input_height = input.dims()[3]; @@ -276,20 +302,38 @@ class Pool3dFunctor { const T* input_data = input.data(); T* output_data = output->mutable_data(context.GetPlace()); + int dstart, dend; + int hstart, hend; + int wstart, wend; for (int i = 0; i < batch_size; i++) { for (int c = 0; c < output_channels; ++c) { for (int pd = 0; pd < output_depth; ++pd) { - int dstart = pd * stride_depth - padding_depth; - int dend = std::min(dstart + ksize_depth, input_depth); - dstart = std::max(dstart, 0); + if (adaptive) { + dstart = AdaptStartIndex(pd, input_depth, output_depth); + dend = AdaptEndIndex(pd, input_depth, output_depth); + } else { + dstart = pd * stride_depth - padding_depth; + dend = std::min(dstart + ksize_depth, input_depth); + dstart = std::max(dstart, 0); + } for (int ph = 0; ph < output_height; ++ph) { - int hstart = ph * stride_height - padding_height; - int hend = std::min(hstart + ksize_height, input_height); - hstart = std::max(hstart, 0); + if (adaptive) { + hstart = AdaptStartIndex(ph, input_height, output_height); + hend = AdaptEndIndex(ph, input_height, output_height); + } else { + hstart = ph * stride_height - padding_height; + hend = std::min(hstart + ksize_height, input_height); + hstart = std::max(hstart, 0); + } for (int pw = 0; pw < output_width; ++pw) { - int wstart = pw * stride_width - padding_width; - int wend = std::min(wstart + ksize_width, input_width); - wstart = std::max(wstart, 0); + if (adaptive) { + wstart = AdaptStartIndex(pw, input_width, output_width); + wend = AdaptEndIndex(pw, input_width, output_width); + } else { + wstart = pw * stride_width - padding_width; + wend = std::min(wstart + ksize_width, input_width); + wstart = std::max(wstart, 0); + } int output_idx = (pd * output_height + ph) * output_width + pw; T ele = pool_process.initial(); for (int d = dstart; d < dend; ++d) { @@ -302,7 +346,7 @@ class Pool3dFunctor { } } int pool_size = - exclusive + (exclusive || adaptive) ? 
(dend - dstart) * (hend - hstart) * (wend - wstart) : ksize_depth * ksize_height * ksize_width; pool_process.finalize(static_cast(pool_size), &ele); @@ -330,7 +374,7 @@ class Pool3dGradFunctor { const framework::Tensor& output, const framework::Tensor& output_grad, const std::vector& ksize, const std::vector& strides, const std::vector& paddings, PoolProcess pool_grad_process, - bool exclusive, framework::Tensor* input_grad) { + bool exclusive, bool adaptive, framework::Tensor* input_grad) { const int batch_size = input.dims()[0]; const int input_depth = input.dims()[2]; const int input_height = input.dims()[3]; @@ -356,24 +400,41 @@ class Pool3dGradFunctor { const T* output_grad_data = output_grad.data(); T* input_grad_data = input_grad->mutable_data(context.GetPlace()); + int dstart, dend; + int hstart, hend; + int wstart, wend; for (int i = 0; i < batch_size; i++) { for (int c = 0; c < output_channels; ++c) { for (int pd = 0; pd < output_depth; ++pd) { - int dstart = pd * stride_depth - padding_depth; - int dend = std::min(dstart + ksize_depth, input_depth); - dstart = std::max(dstart, 0); + if (adaptive) { + dstart = AdaptStartIndex(pd, input_depth, output_depth); + dend = AdaptEndIndex(pd, input_depth, output_depth); + } else { + dstart = pd * stride_depth - padding_depth; + dend = std::min(dstart + ksize_depth, input_depth); + dstart = std::max(dstart, 0); + } for (int ph = 0; ph < output_height; ++ph) { - int hstart = ph * stride_height - padding_height; - int hend = std::min(hstart + ksize_height, input_height); - hstart = std::max(hstart, 0); - + if (adaptive) { + hstart = AdaptStartIndex(ph, input_height, output_height); + hend = AdaptEndIndex(ph, input_height, output_height); + } else { + hstart = ph * stride_height - padding_height; + hend = std::min(hstart + ksize_height, input_height); + hstart = std::max(hstart, 0); + } for (int pw = 0; pw < output_width; ++pw) { - int wstart = pw * stride_width - padding_width; - int wend = std::min(wstart + ksize_width, input_width); - wstart = std::max(wstart, 0); + if (adaptive) { + wstart = AdaptStartIndex(pw, input_width, output_width); + wend = AdaptEndIndex(pw, input_width, output_width); + } else { + wstart = pw * stride_width - padding_width; + wend = std::min(wstart + ksize_width, input_width); + wstart = std::max(wstart, 0); + } int pool_size = - exclusive + (exclusive || adaptive) ? 
(dend - dstart) * (hend - hstart) * (wend - wstart) : ksize_depth * ksize_height * ksize_width; float scale = 1.0 / pool_size; @@ -517,8 +578,8 @@ class MaxPool2dWithIndexFunctor { void operator()(const platform::CPUDeviceContext& context, const framework::Tensor& input, const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, framework::Tensor* output, - framework::Tensor* mask) { + const std::vector& paddings, bool adaptive, + framework::Tensor* output, framework::Tensor* mask) { const int batch_size = input.dims()[0]; const int input_height = input.dims()[2]; const int input_width = input.dims()[3]; @@ -538,16 +599,28 @@ class MaxPool2dWithIndexFunctor { T1* output_data = output->mutable_data(context.GetPlace()); T2* mask_data = mask->mutable_data(context.GetPlace()); + int hstart, hend; + int wstart, wend; for (int i = 0; i < batch_size; i++) { for (int c = 0; c < output_channels; ++c) { for (int ph = 0; ph < output_height; ++ph) { - int hstart = ph * stride_height - padding_height; - int hend = std::min(hstart + ksize_height, input_height); - hstart = std::max(hstart, 0); + if (adaptive) { + hstart = AdaptStartIndex(ph, input_height, output_height); + hend = AdaptEndIndex(ph, input_height, output_height); + } else { + hstart = ph * stride_height - padding_height; + hend = std::min(hstart + ksize_height, input_height); + hstart = std::max(hstart, 0); + } for (int pw = 0; pw < output_width; ++pw) { - int wstart = pw * stride_width - padding_width; - int wend = std::min(wstart + ksize_width, input_width); - wstart = std::max(wstart, 0); + if (adaptive) { + wstart = AdaptStartIndex(pw, input_width, output_width); + wend = AdaptEndIndex(pw, input_width, output_width); + } else { + wstart = pw * stride_width - padding_width; + wend = std::min(wstart + ksize_width, input_width); + wstart = std::max(wstart, 0); + } T1 ele = static_cast(-FLT_MAX); int index = -1; @@ -584,7 +657,7 @@ class MaxPool2dWithIndexGradFunctor { const framework::Tensor& output_grad, const framework::Tensor& mask, const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, + const std::vector& paddings, bool adaptive, framework::Tensor* input_grad) { const int batch_size = input_grad->dims()[0]; const int input_height = input_grad->dims()[2]; @@ -637,8 +710,8 @@ class MaxPool3dWithIndexFunctor { void operator()(const platform::CPUDeviceContext& context, const framework::Tensor& input, const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, framework::Tensor* output, - framework::Tensor* mask) { + const std::vector& paddings, bool adaptive, + framework::Tensor* output, framework::Tensor* mask) { const int batch_size = input.dims()[0]; const int input_depth = input.dims()[2]; const int input_height = input.dims()[3]; @@ -663,20 +736,38 @@ class MaxPool3dWithIndexFunctor { T1* output_data = output->mutable_data(context.GetPlace()); T2* mask_data = mask->mutable_data(context.GetPlace()); + int dstart, dend; + int hstart, hend; + int wstart, wend; for (int i = 0; i < batch_size; i++) { for (int c = 0; c < output_channels; ++c) { for (int pd = 0; pd < output_depth; ++pd) { - int dstart = pd * stride_depth - padding_depth; - int dend = std::min(dstart + ksize_depth, input_depth); - dstart = std::max(dstart, 0); + if (adaptive) { + dstart = AdaptStartIndex(pd, input_depth, output_depth); + dend = AdaptEndIndex(pd, input_depth, output_depth); + } else { + dstart = pd * stride_depth - padding_depth; + dend = std::min(dstart + ksize_depth, 
input_depth); + dstart = std::max(dstart, 0); + } for (int ph = 0; ph < output_height; ++ph) { - int hstart = ph * stride_height - padding_height; - int hend = std::min(hstart + ksize_height, input_height); - hstart = std::max(hstart, 0); + if (adaptive) { + hstart = AdaptStartIndex(ph, input_height, output_height); + hend = AdaptEndIndex(ph, input_height, output_height); + } else { + hstart = ph * stride_height - padding_height; + hend = std::min(hstart + ksize_height, input_height); + hstart = std::max(hstart, 0); + } for (int pw = 0; pw < output_width; ++pw) { - int wstart = pw * stride_width - padding_width; - int wend = std::min(wstart + ksize_width, input_width); - wstart = std::max(wstart, 0); + if (adaptive) { + wstart = AdaptStartIndex(pw, input_width, output_width); + wend = AdaptEndIndex(pw, input_width, output_width); + } else { + wstart = pw * stride_width - padding_width; + wend = std::min(wstart + ksize_width, input_width); + wstart = std::max(wstart, 0); + } int output_idx = (pd * output_height + ph) * output_width + pw; T1 ele = static_cast(-FLT_MAX); @@ -718,7 +809,7 @@ class MaxPool3dWithIndexGradFunctor { const framework::Tensor& output_grad, const framework::Tensor& mask, const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, + const std::vector& paddings, bool adaptive, framework::Tensor* input_grad) { const int batch_size = input_grad->dims()[0]; const int input_depth = input_grad->dims()[2]; diff --git a/paddle/fluid/operators/math/pooling.cu b/paddle/fluid/operators/math/pooling.cu index cdc79e207aa9a2e59e25a07002134c12ad5a1df8..efce3f899a449c72ae3298f7ce0defb166ee8329 100644 --- a/paddle/fluid/operators/math/pooling.cu +++ b/paddle/fluid/operators/math/pooling.cu @@ -29,7 +29,7 @@ __global__ void KernelPool2D(const int nthreads, const T* input_data, const int ksize_width, const int stride_height, const int stride_width, const int padding_height, const int padding_width, PoolProcess pool_process, - bool exclusive, T* output_data) { + bool exclusive, bool adaptive, T* output_data) { for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads; index += blockDim.x * gridDim.x) { int pw = index % output_width; @@ -37,13 +37,23 @@ __global__ void KernelPool2D(const int nthreads, const T* input_data, int c = (index / output_width / output_height) % channels; int batch_idx = index / output_width / output_height / channels; - int hstart = ph * stride_height - padding_height; - int hend = min(hstart + ksize_height, input_height); - hstart = max(hstart, 0); - - int wstart = pw * stride_width - padding_width; - int wend = min(wstart + ksize_width, input_width); - wstart = max(wstart, 0); + int hstart, hend; + int wstart, wend; + if (adaptive) { + hstart = AdaptStartIndex(ph, input_height, output_height); + hend = AdaptEndIndex(ph, input_height, output_height); + + wstart = AdaptStartIndex(pw, input_width, output_width); + wend = AdaptEndIndex(pw, input_width, output_width); + } else { + hstart = ph * stride_height - padding_height; + hend = min(hstart + ksize_height, input_height); + hstart = max(hstart, 0); + + wstart = pw * stride_width - padding_width; + wend = min(wstart + ksize_width, input_width); + wstart = max(wstart, 0); + } input_data += (batch_idx * channels + c) * input_height * input_width; T ele = pool_process.initial(); @@ -52,8 +62,8 @@ __global__ void KernelPool2D(const int nthreads, const T* input_data, pool_process.compute(input_data[h * input_width + w], &ele); } } - int pool_size = exclusive ? 
(hend - hstart) * (wend - wstart) - : ksize_height * ksize_width; + int pool_size = (exclusive || adaptive) ? (hend - hstart) * (wend - wstart) + : ksize_height * ksize_width; pool_process.finalize(static_cast(pool_size), &ele); output_data[index] = ele; } @@ -66,22 +76,33 @@ __global__ void KernelPool2DGrad( const int input_width, const int output_height, const int output_width, const int ksize_height, const int ksize_width, const int stride_height, const int stride_width, const int padding_height, const int padding_width, - PoolProcess pool_process, bool exclusive, T* input_grad) { + PoolProcess pool_process, bool exclusive, bool adaptive, T* input_grad) { for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads; index += blockDim.x * gridDim.x) { - int offsetW = index % input_width + padding_width; - int offsetH = (index / input_width) % input_height + padding_height; + int w_offset = index % input_width + padding_width; + int h_offset = (index / input_width) % input_height + padding_height; int offsetC = (index / input_width / input_height) % channels; int batch_idx = index / input_width / input_height / channels; - int phstart = (offsetH < ksize_height) - ? 0 - : (offsetH - ksize_height) / stride_height + 1; - int pwstart = (offsetW < ksize_width) - ? 0 - : (offsetW - ksize_width) / stride_width + 1; - int phend = min(offsetH / stride_height + 1, output_height); - int pwend = min(offsetW / stride_width + 1, output_width); + int phstart, phend; + int pwstart, pwend; + if (adaptive) { + phstart = h_offset * output_height / input_height; + phend = + min((h_offset + 1) * output_height / input_height + 1, output_height); + pwstart = w_offset * output_width / input_width; + pwend = + min((w_offset + 1) * output_width / input_width + 1, output_width); + } else { + phstart = (h_offset < ksize_height) + ? 0 + : (h_offset - ksize_height) / stride_height + 1; + pwstart = (w_offset < ksize_width) + ? 0 + : (w_offset - ksize_width) / stride_width + 1; + phend = min(h_offset / stride_height + 1, output_height); + pwend = min(w_offset / stride_width + 1, output_width); + } T gradient = 0; T input = input_data[index]; int output_idx = @@ -90,14 +111,22 @@ __global__ void KernelPool2DGrad( output_grad += output_idx; for (int ph = phstart; ph < phend; ++ph) { for (int pw = pwstart; pw < pwend; ++pw) { - int hstart = ph * stride_height - padding_height; - int wstart = pw * stride_width - padding_width; - int hend = min(hstart + ksize_height, input_height); - int wend = min(wstart + ksize_width, input_width); - hstart = max(hstart, 0); - wstart = max(wstart, 0); - int pool_size = exclusive ? (hend - hstart) * (wend - wstart) - : ksize_height * ksize_width; + int pool_size; + if (adaptive) { + pool_size = static_cast(ceil(static_cast(input_height) / + ksize_height)) * + static_cast( + ceil(static_cast(input_width) / ksize_width)); + } else { + int hstart = ph * stride_height - padding_height; + int wstart = pw * stride_width - padding_width; + int hend = min(hstart + ksize_height, input_height); + int wend = min(wstart + ksize_width, input_width); + hstart = max(hstart, 0); + wstart = max(wstart, 0); + pool_size = exclusive ? 
(hend - hstart) * (wend - wstart) + : ksize_height * ksize_width; + } int output_sub_idx = ph * output_width + pw; pool_process.compute(input, output_data[output_sub_idx], output_grad[output_sub_idx], @@ -181,7 +210,7 @@ void Pool2dDirectCUDAFunctor::operator()( KernelPool2D<<>>( nthreads, input, input_channels, input_height, input_width, output_height, output_width, ksize_height, ksize_width, stride_height, stride_width, - padding_height, padding_width, pool_compute, exclusive, output); + padding_height, padding_width, pool_compute, exclusive, false, output); } /* @@ -196,7 +225,7 @@ class Pool2dFunctor { const framework::Tensor& input, const std::vector& ksize, const std::vector& strides, const std::vector& paddings, PoolProcess pool_process, - bool exclusive, framework::Tensor* output) { + bool exclusive, bool adaptive, framework::Tensor* output) { const int batch_size = input.dims()[0]; const int input_channels = input.dims()[1]; const int input_height = input.dims()[2]; @@ -223,7 +252,7 @@ class Pool2dFunctor { nthreads, input_data, input_channels, input_height, input_width, output_height, output_width, ksize_height, ksize_width, stride_height, stride_width, padding_height, padding_width, pool_process, exclusive, - output_data); + adaptive, output_data); } }; @@ -242,7 +271,8 @@ class Pool2dGradFunctor { const std::vector& ksize, const std::vector& strides, const std::vector& paddings, PoolProcess pool_process, - bool exclusive, framework::Tensor* input_grad) { + bool exclusive, bool adaptive, + framework::Tensor* input_grad) { const int batch_size = input.dims()[0]; const int input_channels = input.dims()[1]; const int input_height = input.dims()[2]; @@ -270,7 +300,7 @@ class Pool2dGradFunctor { nthreads, input_data, output_data, output_grad_data, input_channels, input_height, input_width, output_height, output_width, ksize_height, ksize_width, stride_height, stride_width, padding_height, padding_width, - pool_process, exclusive, input_grad_data); + pool_process, exclusive, adaptive, input_grad_data); } }; @@ -359,7 +389,7 @@ __global__ void KernelPool3D( const int ksize_depth, const int ksize_height, const int ksize_width, const int stride_depth, const int stride_height, const int stride_width, const int padding_depth, const int padding_height, const int padding_width, - PoolProcess pool_process, bool exclusive, T* output_data) { + PoolProcess pool_process, bool exclusive, bool adaptive, T* output_data) { for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads; index += blockDim.x * gridDim.x) { int pw = index % output_width; @@ -368,15 +398,30 @@ __global__ void KernelPool3D( int c = (index / output_width / output_height / output_depth) % channels; int batch_idx = index / output_width / output_height / output_depth / channels; - int dstart = pd * stride_depth - padding_depth; - int hstart = ph * stride_height - padding_height; - int wstart = pw * stride_width - padding_width; - int dend = min(dstart + ksize_depth, input_depth); - int hend = min(hstart + ksize_height, input_height); - int wend = min(wstart + ksize_width, input_width); - dstart = max(dstart, 0); - hstart = max(hstart, 0); - wstart = max(wstart, 0); + + int dstart, dend; + int hstart, hend; + int wstart, wend; + if (adaptive) { + dstart = AdaptStartIndex(pd, input_depth, output_depth); + dend = AdaptEndIndex(pd, input_depth, output_depth); + + hstart = AdaptStartIndex(ph, input_height, output_height); + hend = AdaptEndIndex(ph, input_height, output_height); + + wstart = AdaptStartIndex(pw, 
input_width, output_width); + wend = AdaptEndIndex(pw, input_width, output_width); + } else { + dstart = pd * stride_depth - padding_depth; + hstart = ph * stride_height - padding_height; + wstart = pw * stride_width - padding_width; + dend = min(dstart + ksize_depth, input_depth); + hend = min(hstart + ksize_height, input_height); + wend = min(wstart + ksize_width, input_width); + dstart = max(dstart, 0); + hstart = max(hstart, 0); + wstart = max(wstart, 0); + } T ele = pool_process.initial(); input_data += (batch_idx * channels + c) * input_depth * input_height * input_width; @@ -388,7 +433,7 @@ __global__ void KernelPool3D( } } } - int pool_size = exclusive + int pool_size = (exclusive || adaptive) ? (dend - dstart) * (hend - hstart) * (wend - wstart) : ksize_depth * ksize_height * ksize_width; pool_process.finalize(static_cast(pool_size), &ele); @@ -405,28 +450,43 @@ __global__ void KernelPool3DGrad( const int ksize_height, const int ksize_width, const int stride_depth, const int stride_height, const int stride_width, const int padding_depth, const int padding_height, const int padding_width, PoolProcess pool_process, - bool exclusive, T* input_grad) { + bool exclusive, bool adaptive, T* input_grad) { for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads; index += blockDim.x * gridDim.x) { - int offsetW = index % input_width + padding_width; - int offsetH = (index / input_width) % input_height + padding_height; - int offsetD = + int w_offset = index % input_width + padding_width; + int h_offset = (index / input_width) % input_height + padding_height; + int d_offset = (index / input_width / input_height) % input_depth + padding_depth; int offsetC = (index / input_width / input_height / input_depth) % channels; int batch_idx = index / input_width / input_height / input_depth / channels; - int pdstart = (offsetD < ksize_depth) - ? 0 - : (offsetD - ksize_depth) / stride_depth + 1; - int phstart = (offsetH < ksize_height) - ? 0 - : (offsetH - ksize_height) / stride_height + 1; - int pwstart = (offsetW < ksize_width) - ? 0 - : (offsetW - ksize_width) / stride_width + 1; - int pdend = min((offsetD) / stride_depth + 1, output_depth); - int phend = min((offsetH) / stride_height + 1, output_height); - int pwend = min((offsetW) / stride_width + 1, output_width); + int pdstart, pdend; + int phstart, phend; + int pwstart, pwend; + if (adaptive) { + pdstart = d_offset * output_depth / input_depth; + pdend = + min((d_offset + 1) * output_depth / input_depth + 1, output_depth); + phstart = h_offset * output_height / input_height; + phend = + min((h_offset + 1) * output_height / input_height + 1, output_height); + pwstart = w_offset * output_width / input_width; + pwend = + min((w_offset + 1) * output_width / input_width + 1, output_width); + } else { + pdstart = (d_offset < ksize_depth) + ? 0 + : (d_offset - ksize_depth) / stride_depth + 1; + phstart = (h_offset < ksize_height) + ? 0 + : (h_offset - ksize_height) / stride_height + 1; + pwstart = (w_offset < ksize_width) + ? 
0 + : (w_offset - ksize_width) / stride_width + 1; + pdend = min((d_offset) / stride_depth + 1, output_depth); + phend = min((h_offset) / stride_height + 1, output_height); + pwend = min((w_offset) / stride_width + 1, output_width); + } T gradient = 0; T input = input_data[index]; @@ -439,18 +499,29 @@ __global__ void KernelPool3DGrad( for (int ph = phstart; ph < phend; ++ph) { for (int pw = pwstart; pw < pwend; ++pw) { // figure out the pooling size - int dstart = pd * stride_depth - padding_depth; - int hstart = ph * stride_height - padding_height; - int wstart = pw * stride_width - padding_width; - int dend = min(dstart + ksize_depth, input_depth); - int hend = min(hstart + ksize_height, input_height); - int wend = min(wstart + ksize_width, input_width); - dstart = max(dstart, 0); - hstart = max(hstart, 0); - wstart = max(wstart, 0); - int pool_size = - exclusive ? (dend - dstart) * (hend - hstart) * (wend - wstart) - : ksize_depth * ksize_height * ksize_width; + int pool_size; + if (adaptive) { + pool_size = + static_cast( + ceil(static_cast(input_depth) / ksize_depth)) * + static_cast( + ceil(static_cast(input_height) / ksize_height)) * + static_cast( + ceil(static_cast(input_width) / ksize_width)); + } else { + int dstart = pd * stride_depth - padding_depth; + int hstart = ph * stride_height - padding_height; + int wstart = pw * stride_width - padding_width; + int dend = min(dstart + ksize_depth, input_depth); + int hend = min(hstart + ksize_height, input_height); + int wend = min(wstart + ksize_width, input_width); + dstart = max(dstart, 0); + hstart = max(hstart, 0); + wstart = max(wstart, 0); + pool_size = + exclusive ? (dend - dstart) * (hend - hstart) * (wend - wstart) + : ksize_depth * ksize_height * ksize_width; + } int output_sub_idx = (pd * output_height + ph) * output_width + pw; pool_process.compute(input, output_data[output_sub_idx], output_grad[output_sub_idx], @@ -525,7 +596,7 @@ class Pool3dFunctor { const framework::Tensor& input, const std::vector& ksize, const std::vector& strides, const std::vector& paddings, PoolProcess pool_process, - bool exclusive, framework::Tensor* output) { + bool exclusive, bool adaptive, framework::Tensor* output) { const int batch_size = input.dims()[0]; const int input_channels = input.dims()[1]; const int input_depth = input.dims()[2]; @@ -559,7 +630,7 @@ class Pool3dFunctor { input_width, output_depth, output_height, output_width, ksize_depth, ksize_height, ksize_width, stride_depth, stride_height, stride_width, padding_depth, padding_height, padding_width, pool_process, exclusive, - output_data); + adaptive, output_data); } }; @@ -578,7 +649,8 @@ class Pool3dGradFunctor { const std::vector& ksize, const std::vector& strides, const std::vector& paddings, PoolProcess pool_process, - bool exclusive, framework::Tensor* input_grad) { + bool exclusive, bool adaptive, + framework::Tensor* input_grad) { const int batch_size = input.dims()[0]; const int input_channels = input.dims()[1]; const int input_depth = input.dims()[2]; @@ -614,7 +686,7 @@ class Pool3dGradFunctor { input_depth, input_height, input_width, output_depth, output_height, output_width, ksize_depth, ksize_height, ksize_width, stride_depth, stride_height, stride_width, padding_depth, padding_height, - padding_width, pool_process, exclusive, input_grad_data); + padding_width, pool_process, exclusive, adaptive, input_grad_data); } }; @@ -703,7 +775,7 @@ __global__ void KernelMaxPool2dWithIdx( const int input_height, const int input_width, const int output_height, const int 
output_width, const int ksize_height, const int ksize_width, const int stride_height, const int stride_width, const int padding_height, - const int padding_width, T1* output_data, T2* mask_data) { + const int padding_width, bool adaptive, T1* output_data, T2* mask_data) { for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads; index += blockDim.x * gridDim.x) { int pw = index % output_width; @@ -711,13 +783,23 @@ __global__ void KernelMaxPool2dWithIdx( int c = (index / output_width / output_height) % channels; int batch_idx = index / output_width / output_height / channels; - int hstart = ph * stride_height - padding_height; - int hend = min(hstart + ksize_height, input_height); - hstart = max(hstart, 0); - - int wstart = pw * stride_width - padding_width; - int wend = min(wstart + ksize_width, input_width); - wstart = max(wstart, 0); + int hstart, hend; + int wstart, wend; + if (adaptive) { + hstart = AdaptStartIndex(ph, input_height, output_height); + hend = AdaptEndIndex(ph, input_height, output_height); + + wstart = AdaptStartIndex(pw, input_width, output_width); + wend = AdaptEndIndex(pw, input_width, output_width); + } else { + hstart = ph * stride_height - padding_height; + hend = min(hstart + ksize_height, input_height); + hstart = max(hstart, 0); + + wstart = pw * stride_width - padding_width; + wend = min(wstart + ksize_width, input_width); + wstart = max(wstart, 0); + } input_data += (batch_idx * channels + c) * input_height * input_width; T1 ele = -FLT_MAX; @@ -742,36 +824,47 @@ __global__ void KernelMaxPool2DWithIdxGrad( const int channels, const int input_height, const int input_width, const int output_height, const int output_width, const int ksize_height, const int ksize_width, const int stride_height, const int stride_width, - const int padding_height, const int padding_width, T1* input_grad) { + const int padding_height, const int padding_width, bool adaptive, + T1* input_grad) { for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads; index += blockDim.x * gridDim.x) { int w_offset = index % input_width; int h_offset = (index / input_width) % input_height; - int c_offset = (index / input_width / input_height) % channels; + int offsetC = (index / input_width / input_height) % channels; int batch_idx = index / input_width / input_height / channels; - int ph_start = - (h_offset + padding_height < ksize_height) - ? 0 - : (h_offset + padding_height - ksize_height) / stride_height + 1; - int pw_start = - (w_offset + padding_width < ksize_width) - ? 0 - : (w_offset + padding_width - ksize_width) / stride_width + 1; - int ph_end = - min((h_offset + padding_height) / stride_height + 1, output_height); - int pw_end = - min((w_offset + padding_width) / stride_width + 1, output_width); + int phstart, phend; + int pwstart, pwend; + if (adaptive) { + phstart = h_offset * output_height / input_height; + phend = + min((h_offset + 1) * output_height / input_height + 1, output_height); + pwstart = w_offset * output_width / input_width; + pwend = + min((w_offset + 1) * output_width / input_width + 1, output_width); + } else { + phstart = + (h_offset + padding_height < ksize_height) + ? 0 + : (h_offset + padding_height - ksize_height) / stride_height + 1; + pwstart = + (w_offset + padding_width < ksize_width) + ? 
0 + : (w_offset + padding_width - ksize_width) / stride_width + 1; + phend = + min((h_offset + padding_height) / stride_height + 1, output_height); + pwend = min((w_offset + padding_width) / stride_width + 1, output_width); + } T1 gradient = 0; int input_current_featuremap_idx = h_offset * input_width + w_offset; int output_idx = - (batch_idx * channels + c_offset) * output_height * output_width; + (batch_idx * channels + offsetC) * output_height * output_width; mask_data += output_idx; output_grad += output_idx; - for (int ph = ph_start; ph < ph_end; ++ph) { - for (int pw = pw_start; pw < pw_end; ++pw) { + for (int ph = phstart; ph < phend; ++ph) { + for (int pw = pwstart; pw < pwend; ++pw) { if (mask_data[ph * output_width + pw] == input_current_featuremap_idx) gradient += output_grad[ph * output_width + pw]; } @@ -791,8 +884,8 @@ class MaxPool2dWithIndexFunctor { void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& input, const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, framework::Tensor* output, - framework::Tensor* mask) { + const std::vector& paddings, bool adaptive, + framework::Tensor* output, framework::Tensor* mask) { const int batch_size = input.dims()[0]; const int input_channels = input.dims()[1]; const int input_height = input.dims()[2]; @@ -819,7 +912,8 @@ class MaxPool2dWithIndexFunctor { KernelMaxPool2dWithIdx<<>>( nthreads, input_data, input_channels, input_height, input_width, output_height, output_width, ksize_height, ksize_width, stride_height, - stride_width, padding_height, padding_width, output_data, mask_data); + stride_width, padding_height, padding_width, adaptive, output_data, + mask_data); } }; @@ -835,7 +929,7 @@ class MaxPool2dWithIndexGradFunctor { const framework::Tensor& output_grad, const framework::Tensor& mask, const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, + const std::vector& paddings, bool adaptive, framework::Tensor* input_grad) { const int batch_size = input_grad->dims()[0]; const int input_channels = input_grad->dims()[1]; @@ -862,7 +956,7 @@ class MaxPool2dWithIndexGradFunctor { KernelMaxPool2DWithIdxGrad<<>>( nthreads, output_grad_data, mask_data, input_channels, input_height, input_width, output_height, output_width, ksize_height, ksize_width, - stride_height, stride_width, padding_height, padding_width, + stride_height, stride_width, padding_height, padding_width, adaptive, input_grad_data); } }; @@ -884,7 +978,7 @@ __global__ void KernelMaxPool3DWithIdx( const int ksize_depth, const int ksize_height, const int ksize_width, const int stride_depth, const int stride_height, const int stride_width, const int padding_depth, const int padding_height, const int padding_width, - T1* output_data, T2* mask_data) { + bool adaptive, T1* output_data, T2* mask_data) { for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads; index += blockDim.x * gridDim.x) { int pw = index % output_width; @@ -894,15 +988,29 @@ __global__ void KernelMaxPool3DWithIdx( int batch_idx = index / output_width / output_height / output_depth / channels; - int dstart = pd * stride_depth - padding_depth; - int hstart = ph * stride_height - padding_height; - int wstart = pw * stride_width - padding_width; - int dend = min(dstart + ksize_depth, input_depth); - int hend = min(hstart + ksize_height, input_height); - int wend = min(wstart + ksize_width, input_width); - dstart = max(dstart, 0); - hstart = max(hstart, 0); - wstart = max(wstart, 0); + int dstart, dend; + 
int hstart, hend; + int wstart, wend; + if (adaptive) { + dstart = AdaptStartIndex(pd, input_depth, output_depth); + dend = AdaptEndIndex(pd, input_depth, output_depth); + + hstart = AdaptStartIndex(ph, input_height, output_height); + hend = AdaptEndIndex(ph, input_height, output_height); + + wstart = AdaptStartIndex(pw, input_width, output_width); + wend = AdaptEndIndex(pw, input_width, output_width); + } else { + dstart = pd * stride_depth - padding_depth; + hstart = ph * stride_height - padding_height; + wstart = pw * stride_width - padding_width; + dend = min(dstart + ksize_depth, input_depth); + hend = min(hstart + ksize_height, input_height); + wend = min(wstart + ksize_width, input_width); + dstart = max(dstart, 0); + hstart = max(hstart, 0); + wstart = max(wstart, 0); + } T1 ele = -FLT_MAX; int max_index = -1; @@ -932,46 +1040,58 @@ __global__ void KernelMaxPool3DWithIdxGrad( const int output_width, const int ksize_depth, const int ksize_height, const int ksize_width, const int stride_depth, const int stride_height, const int stride_width, const int padding_depth, const int padding_height, - const int padding_width, T1* input_grad) { + const int padding_width, bool adaptive, T1* input_grad) { for (int index = blockIdx.x * blockDim.x + threadIdx.x; index < nthreads; index += blockDim.x * gridDim.x) { int w_offset = index % input_width; int h_offset = (index / input_width) % input_height; int d_offset = (index / input_width / input_height) % input_depth; - int c_offset = - (index / input_width / input_height / input_depth) % channels; + int offsetC = (index / input_width / input_height / input_depth) % channels; int batch_idx = index / input_width / input_height / input_depth / channels; - int pd_start = - (d_offset + padding_depth < ksize_depth) - ? 0 - : (d_offset + padding_depth - ksize_depth) / stride_depth + 1; - int ph_start = - (h_offset + padding_height < ksize_height) - ? 0 - : (h_offset + padding_height - ksize_height) / stride_height + 1; - int pw_start = - (w_offset + padding_width < ksize_width) - ? 0 - : (w_offset + padding_width - ksize_width) / stride_width + 1; - int pd_end = - min((d_offset + padding_depth) / stride_depth + 1, output_depth); - int ph_end = - min((h_offset + padding_height) / stride_height + 1, output_height); - int pw_end = - min((w_offset + padding_width) / stride_width + 1, output_width); + int pdstart, pdend; + int phstart, phend; + int pwstart, pwend; + if (adaptive) { + pdstart = d_offset * output_depth / input_depth; + pdend = + min((d_offset + 1) * output_depth / input_depth + 1, output_depth); + phstart = h_offset * output_height / input_height; + phend = + min((h_offset + 1) * output_height / input_height + 1, output_height); + pwstart = w_offset * output_width / input_width; + pwend = + min((w_offset + 1) * output_width / input_width + 1, output_width); + } else { + pdstart = + (d_offset + padding_depth < ksize_depth) + ? 0 + : (d_offset + padding_depth - ksize_depth) / stride_depth + 1; + phstart = + (h_offset + padding_height < ksize_height) + ? 0 + : (h_offset + padding_height - ksize_height) / stride_height + 1; + pwstart = + (w_offset + padding_width < ksize_width) + ? 
0 + : (w_offset + padding_width - ksize_width) / stride_width + 1; + pdend = min((d_offset + padding_depth) / stride_depth + 1, output_depth); + phend = + min((h_offset + padding_height) / stride_height + 1, output_height); + pwend = min((w_offset + padding_width) / stride_width + 1, output_width); + } T1 gradient = 0; int input_current_feature_map_idx = (d_offset * input_height + h_offset) * input_width + w_offset; - int output_idx = (batch_idx * channels + c_offset) * output_depth * + int output_idx = (batch_idx * channels + offsetC) * output_depth * output_height * output_width; mask += output_idx; output_grad += output_idx; - for (int pd = pd_start; pd < pd_end; ++pd) { - for (int ph = ph_start; ph < ph_end; ++ph) { - for (int pw = pw_start; pw < pw_end; ++pw) { + for (int pd = pdstart; pd < pdend; ++pd) { + for (int ph = phstart; ph < phend; ++ph) { + for (int pw = pwstart; pw < pwend; ++pw) { if (mask[(pd * output_height + ph) * output_width + pw] == input_current_feature_map_idx) gradient += @@ -994,8 +1114,8 @@ class MaxPool3dWithIndexFunctor { void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& input, const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, framework::Tensor* output, - framework::Tensor* mask) { + const std::vector& paddings, bool adaptive, + framework::Tensor* output, framework::Tensor* mask) { const int batch_size = input.dims()[0]; const int input_channels = input.dims()[1]; const int input_depth = input.dims()[2]; @@ -1029,7 +1149,8 @@ class MaxPool3dWithIndexFunctor { nthreads, input_data, input_channels, input_depth, input_height, input_width, output_depth, output_height, output_width, ksize_depth, ksize_height, ksize_width, stride_depth, stride_height, stride_width, - padding_depth, padding_height, padding_width, output_data, mask_data); + padding_depth, padding_height, padding_width, adaptive, output_data, + mask_data); } }; @@ -1045,7 +1166,7 @@ class MaxPool3dWithIndexGradFunctor { const framework::Tensor& output_grad, const framework::Tensor& mask, const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, + const std::vector& paddings, bool adaptive, framework::Tensor* input_grad) { const int batch_size = input_grad->dims()[0]; const int input_channels = input_grad->dims()[1]; @@ -1079,7 +1200,7 @@ class MaxPool3dWithIndexGradFunctor { nthreads, output_grad_data, mask_data, input_channels, input_depth, input_height, input_width, output_depth, output_height, output_width, ksize_depth, ksize_height, ksize_width, stride_depth, stride_height, - stride_width, padding_depth, padding_height, padding_width, + stride_width, padding_depth, padding_height, padding_width, adaptive, input_grad_data); } }; diff --git a/paddle/fluid/operators/math/pooling.h b/paddle/fluid/operators/math/pooling.h index 923babd4c248364b735bb09def7bf12f2762f305..e1f8e6df1d19b519e48bff326bc1aa9548c96905 100644 --- a/paddle/fluid/operators/math/pooling.h +++ b/paddle/fluid/operators/math/pooling.h @@ -68,6 +68,18 @@ class AvgPoolGrad { } }; +/* used for adaptive pool to calculate start and end index of each divided grid + */ +HOSTDEVICE inline int AdaptStartIndex(int ph, int input_size, int output_size) { + return static_cast( + floor(static_cast(ph * input_size) / output_size)); +} + +HOSTDEVICE inline int AdaptEndIndex(int ph, int input_size, int output_size) { + return static_cast( + ceil(static_cast((ph + 1) * input_size) / output_size)); +} + /* * \brief Getting pooling results, and calculating 
gradient. * @@ -102,7 +114,7 @@ class Pool2dFunctor { const std::vector& ksize, const std::vector& strides, const std::vector& paddings, PoolProcess pool_compute, - bool exclusive, framework::Tensor* output); + bool exclusive, bool adaptive, framework::Tensor* output); }; template @@ -114,7 +126,7 @@ class Pool2dGradFunctor { const std::vector& ksize, const std::vector& strides, const std::vector& paddings, PoolProcess pool_compute, - bool exclusive, framework::Tensor* input_grad); + bool exclusive, bool adaptive, framework::Tensor* input_grad); }; template @@ -136,7 +148,7 @@ class Pool3dFunctor { const std::vector& ksize, const std::vector& strides, const std::vector& paddings, PoolProcess pool_compute, - bool exclusive, framework::Tensor* output); + bool exclusive, bool adaptive, framework::Tensor* output); }; template @@ -148,7 +160,7 @@ class Pool3dGradFunctor { const std::vector& ksize, const std::vector& strides, const std::vector& paddings, PoolProcess pool_compute, - bool exclusive, framework::Tensor* input_grad); + bool exclusive, bool adaptive, framework::Tensor* input_grad); }; template @@ -176,8 +188,8 @@ class MaxPool2dWithIndexFunctor { void operator()(const DeviceContext& context, const framework::Tensor& input, const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, framework::Tensor* output, - framework::Tensor* mask); + const std::vector& paddings, bool adaptive, + framework::Tensor* output, framework::Tensor* mask); }; template @@ -187,7 +199,7 @@ class MaxPool2dWithIndexGradFunctor { const framework::Tensor& output_grad, const framework::Tensor& mask, const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, + const std::vector& paddings, bool adaptive, framework::Tensor* input_grad); }; @@ -197,8 +209,8 @@ class MaxPool3dWithIndexFunctor { void operator()(const DeviceContext& context, const framework::Tensor& input, const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, framework::Tensor* output, - framework::Tensor* mask); + const std::vector& paddings, bool adaptive, + framework::Tensor* output, framework::Tensor* mask); }; template @@ -208,7 +220,7 @@ class MaxPool3dWithIndexGradFunctor { const framework::Tensor& output_grad, const framework::Tensor& mask, const std::vector& ksize, const std::vector& strides, - const std::vector& paddings, + const std::vector& paddings, bool adaptive, framework::Tensor* input_grad); }; diff --git a/paddle/fluid/operators/pool_op.cc b/paddle/fluid/operators/pool_op.cc index 62599548496b0d788bdfa9868efc0193dccd55c8..5399ae556e7f38a551d680704d8d825e2fdba88a 100644 --- a/paddle/fluid/operators/pool_op.cc +++ b/paddle/fluid/operators/pool_op.cc @@ -52,6 +52,7 @@ void PoolOp::InferShape(framework::InferShapeContext* ctx) const { std::vector strides = ctx->Attrs().Get>("strides"); std::vector paddings = ctx->Attrs().Get>("paddings"); bool ceil_mode = ctx->Attrs().Get("ceil_mode"); + bool adaptive = ctx->Attrs().Get("adaptive"); PADDLE_ENFORCE(in_x_dims.size() == 4 || in_x_dims.size() == 5, "Pooling intput should be 4-D or 5-D tensor."); @@ -72,9 +73,13 @@ void PoolOp::InferShape(framework::InferShapeContext* ctx) const { "Paddings size and pooling size should be the same."); std::vector output_shape({in_x_dims[0], in_x_dims[1]}); - for (size_t i = 0; i < ksize.size(); ++i) { - output_shape.push_back(PoolOutputSize(in_x_dims[i + 2], ksize[i], - paddings[i], strides[i], ceil_mode)); + if (adaptive) { + output_shape.insert(output_shape.end(), ksize.begin(), 
ksize.end()); + } else { + for (size_t i = 0; i < ksize.size(); ++i) { + output_shape.push_back(PoolOutputSize( + in_x_dims[i + 2], ksize[i], paddings[i], strides[i], ceil_mode)); + } } ctx->SetOutputDim("Out", framework::make_ddim(output_shape)); ctx->ShareLoD("X", "Out"); @@ -185,6 +190,14 @@ void Pool2dOpMaker::Make() { "averaging calculating, otherwise, include the zero-padding. Note, it " "is only used when pooling_type is avg. The defalut is True.") .SetDefault(true); + AddAttr( + "adaptive", + "(bool, default False) When true, will perform adaptive pooling instead, " + "output shape in H and W dimensions will be same as ksize, input data " + "will be divided into grids specify by ksize averagely and perform " + "pooling in each grid area to get output pooling value.") + .SetDefault(false); + AddAttr( "use_cudnn", "(bool, default false) Only used in cudnn kernel, need install cudnn") @@ -263,6 +276,14 @@ Example: Output(i ,j) = \\frac{sum(Input[hstart:hend, wstart:wend])}{(hend - hstart) * (wend - wstart)} $$ + For adaptive = true: + $$ + hstart = floor(i * H_{in} / H_{out}) + hend = ceil((i + 1) * H_{in} / H_{out}) + wstart = floor(j * W_{in} / W_{out}) + wend = ceil((j + 1) * W_{in} / W_{out}) + Output(i ,j) = \\frac{sum(Input[hstart:hend, wstart:wend])}{(hend - hstart) * (wend - wstart)} + $$ )DOC"); } @@ -324,6 +345,13 @@ void Pool3dOpMaker::Make() { "averaging calculating, otherwise, include the zero-padding. Note, it " "is only used when pooling_type is avg. The defalut is True.") .SetDefault(true); + AddAttr( + "adaptive", + "(bool, default False) When true, will perform adaptive pooling instead, " + "output shape in H and W dimensions will be same as ksize, input data " + "will be divided into grids specify by ksize averagely and perform " + "pooling in each grid area to get output pooling value.") + .SetDefault(false); AddAttr( "use_cudnn", @@ -375,6 +403,37 @@ Example: H_{out} = \frac{(H_{in} - ksize[1] + 2 * paddings[1] + strides[1] -1)}{strides[1]} + 1 \\ W_{out} = \frac{(W_{in} - ksize[2] + 2 * paddings[2] + strides[2] -1)}{strides[2]} + 1 $$ + For exclusive = true: + $$ + dstart = i * strides[0] - paddings[0] + dend = dstart + ksize[0] + hstart = j * strides[1] - paddings[1] + hend = hstart + ksize[1] + wstart = k * strides[2] - paddings[2] + wend = wstart + ksize[2] + Output(i ,j, k) = \\frac{sum(Input[dstart:dend, hstart:hend, wstart:wend])}{ksize[0] * ksize[1] * ksize[2]} + $$ + For exclusive = false: + $$ + dstart = max(0, i * strides[0] - paddings[0]) + dend = min(D, dstart + ksize[0]) + hstart = max(0, j * strides[1] - paddings[1]) + hend = min(H, hstart + ksize[1]) + wstart = max(0, k * strides[2] - paddings[2]) + wend = min(W, wstart + ksize[2]) + Output(i ,j, k) = \\frac{sum(Input[dstart:dend, hstart:hend, wstart:wend])}{(dend - dstart) * (hend - hstart) * (wend - wstart)} + $$ + + For adaptive = true: + $$ + dstart = floor(i * D_{in} / D_{out}) + dend = ceil((i + 1) * D_{in} / D_{out}) + hstart = floor(j * H_{in} / H_{out}) + hend = ceil((j + 1) * H_{in} / H_{out}) + wstart = floor(k * W_{in} / W_{out}) + wend = ceil((k + 1) * W_{in} / W_{out}) + Output(i ,j, k) = \\frac{sum(Input[dstart:dend, hstart:hend, wstart:wend])}{(dend - dstart) * (hend - hstart) * (wend - wstart)} + $$ )DOC"); } diff --git a/paddle/fluid/operators/pool_op.h b/paddle/fluid/operators/pool_op.h index c0594b7e3cc5602a44bb01951a22c2135ba5c7ce..6c5900bd0f55bb817834de6d1f3c5e4eb7f282b9 100644 --- a/paddle/fluid/operators/pool_op.h +++ b/paddle/fluid/operators/pool_op.h @@ -70,6 +70,7 @@ 
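The `adaptive = true` formulas in the Pool2d comment above can be checked against a plain NumPy reference. The sketch below assumes NCHW input and an average-pool reduction; `adaptive_avg_pool2d_ref` is an illustrative name, not a Paddle symbol. Note how the output spatial dimensions equal `pool_size`, matching the InferShape branch that copies `ksize` into the output shape; the 3-D case is analogous with a depth loop added.

```python
import math
import numpy as np

def adaptive_avg_pool2d_ref(x, pool_size):
    """Reference adaptive average pooling over an NCHW array (illustrative only)."""
    n, c, h_in, w_in = x.shape
    h_out, w_out = pool_size
    out = np.zeros((n, c, h_out, w_out), dtype=x.dtype)
    for i in range(h_out):
        hstart = int(math.floor(i * h_in / h_out))
        hend = int(math.ceil((i + 1) * h_in / h_out))
        for j in range(w_out):
            wstart = int(math.floor(j * w_in / w_out))
            wend = int(math.ceil((j + 1) * w_in / w_out))
            # average over the adaptive bin; the divisor is the actual bin area
            out[:, :, i, j] = x[:, :, hstart:hend, wstart:wend].mean(axis=(2, 3))
    return out

x = np.random.rand(2, 3, 32, 32).astype("float32")
y = adaptive_avg_pool2d_ref(x, (7, 7))
assert y.shape == (2, 3, 7, 7)  # spatial dims equal pool_size when adaptive
```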
class PoolKernel : public framework::OpKernel { std::vector strides = context.Attr>("strides"); std::vector paddings = context.Attr>("paddings"); bool exclusive = context.Attr("exclusive"); + bool adaptive = context.Attr("adaptive"); if (context.Attr("global_pooling")) { for (size_t i = 0; i < ksize.size(); ++i) { paddings[i] = 0; @@ -85,7 +86,7 @@ class PoolKernel : public framework::OpKernel { pool2d_forward; paddle::operators::math::MaxPool pool_process; pool2d_forward(dev_ctx, *in_x, ksize, strides, paddings, pool_process, - true, out); + true, false, out); } else if (pooling_type == "avg") { paddle::operators::math::Pool2dFunctor< @@ -93,7 +94,7 @@ class PoolKernel : public framework::OpKernel { pool2d_forward; paddle::operators::math::AvgPool pool_process; pool2d_forward(dev_ctx, *in_x, ksize, strides, paddings, pool_process, - exclusive, out); + exclusive, adaptive, out); } } break; case 3: { @@ -103,14 +104,14 @@ class PoolKernel : public framework::OpKernel { pool3d_forward; paddle::operators::math::MaxPool pool_process; pool3d_forward(dev_ctx, *in_x, ksize, strides, paddings, pool_process, - true, out); + true, false, out); } else if (pooling_type == "avg") { paddle::operators::math::Pool3dFunctor< DeviceContext, paddle::operators::math::AvgPool, T> pool3d_forward; paddle::operators::math::AvgPool pool_process; pool3d_forward(dev_ctx, *in_x, ksize, strides, paddings, pool_process, - exclusive, out); + exclusive, adaptive, out); } } break; default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); } @@ -133,6 +134,7 @@ class PoolGradKernel : public framework::OpKernel { std::vector strides = context.Attr>("strides"); std::vector paddings = context.Attr>("paddings"); bool exclusive = context.Attr("exclusive"); + bool adaptive = context.Attr("adaptive"); if (context.Attr("global_pooling")) { for (size_t i = 0; i < ksize.size(); ++i) { @@ -159,7 +161,8 @@ class PoolGradKernel : public framework::OpKernel { pool2d_backward; paddle::operators::math::AvgPoolGrad pool_process; pool2d_backward(dev_ctx, *in_x, *out, *out_grad, ksize, strides, - paddings, pool_process, exclusive, in_x_grad); + paddings, pool_process, exclusive, adaptive, + in_x_grad); } } break; case 3: { @@ -174,7 +177,8 @@ class PoolGradKernel : public framework::OpKernel { pool3d_backward; paddle::operators::math::AvgPoolGrad pool_process; pool3d_backward(dev_ctx, *in_x, *out, *out_grad, ksize, strides, - paddings, pool_process, exclusive, in_x_grad); + paddings, pool_process, exclusive, adaptive, + in_x_grad); } } break; default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); } diff --git a/paddle/fluid/operators/pool_with_index_op.cc b/paddle/fluid/operators/pool_with_index_op.cc index 179ee96e01c7eb5eb9c7b65666e2cd7d3d05311a..91bd2a902f7cc53f76682d99195ed0d2c08352a3 100644 --- a/paddle/fluid/operators/pool_with_index_op.cc +++ b/paddle/fluid/operators/pool_with_index_op.cc @@ -40,6 +40,7 @@ class MaxPoolWithIndexOp : public framework::OperatorWithKernel { std::vector ksize = ctx->Attrs().Get>("ksize"); std::vector strides = ctx->Attrs().Get>("strides"); std::vector paddings = ctx->Attrs().Get>("paddings"); + bool adaptive = ctx->Attrs().Get("adaptive"); PADDLE_ENFORCE(in_x_dims.size() == 4 || in_x_dims.size() == 5, "Pooling intput should be 4-D or 5-D tensor."); @@ -60,9 +61,13 @@ class MaxPoolWithIndexOp : public framework::OperatorWithKernel { "Paddings size and pooling size should be the same."); std::vector output_shape({in_x_dims[0], in_x_dims[1]}); - for (size_t i = 0; i < ksize.size(); ++i) { 
- output_shape.push_back(MaxPoolOutputSize(in_x_dims[i + 2], ksize[i], - paddings[i], strides[i])); + if (adaptive) { + output_shape.insert(output_shape.end(), ksize.begin(), ksize.end()); + } else { + for (size_t i = 0; i < ksize.size(); ++i) { + output_shape.push_back(MaxPoolOutputSize(in_x_dims[i + 2], ksize[i], + paddings[i], strides[i])); + } } ctx->SetOutputDim("Out", framework::make_ddim(output_shape)); ctx->SetOutputDim("Mask", framework::make_ddim(output_shape)); @@ -131,6 +136,14 @@ class MaxPool2dWithIndexOpMaker : public framework::OpProtoAndCheckerMaker { "(bool, default:false) Whether to use the global pooling. " "If global_pooling = true, ksize and paddings will be ignored.") .SetDefault(false); + AddAttr( + "adaptive", + "(bool, default False) When true, will perform adaptive pooling " + "instead, " + "output shape in H and W dimensions will be same as ksize, input data " + "will be divided into grids specify by ksize averagely and perform " + "pooling in each grid area to get output pooling value.") + .SetDefault(false); AddAttr>("strides", "(vector, default {1, 1}), strides(height, " "width) of pooling operator.") @@ -167,6 +180,12 @@ Example: H_{out} = \frac{(H_{in} - ksize[0] + 2 * paddings[0])}{strides[0]} + 1 \\ W_{out} = \frac{(W_{in} - ksize[1] + 2 * paddings[1])}{strides[1]} + 1 $$ + + For adaptive = true: + $$ + H_{out} = ksize[0] W_{out} = ksize[1] + $$ + )DOC"); } @@ -207,6 +226,14 @@ class MaxPool3dWithIndexOpMaker : public framework::OpProtoAndCheckerMaker { "(bool, default false) Whether to use the global pooling. " "If global_pooling = true, ksize and paddings will be ignored.") .SetDefault(false); + AddAttr( + "adaptive", + "(bool, default False) When true, will perform adaptive pooling " + "instead, " + "output shape in H and W dimensions will be same as ksize, input data " + "will be divided into grids specify by ksize averagely and perform " + "pooling in each grid area to get output pooling value.") + .SetDefault(false); AddAttr>("strides", "(vector, default {1,1,1}), strides(depth, " "height, width) of pooling operator.") @@ -244,6 +271,11 @@ Example: H_{out} = \frac{(H_{in} - ksize[1] + 2 * paddings[1])}{strides[1]} + 1 \\ W_{out} = \frac{(W_{in} - ksize[2] + 2 * paddings[2])}{strides[2]} + 1 $$ + + For adaptive = true: + $$ + D_{out} = ksize[0] H_{out} = ksize[1] W_{out} = ksize[2] + $$ )DOC"); } diff --git a/paddle/fluid/operators/pool_with_index_op.h b/paddle/fluid/operators/pool_with_index_op.h index b55fa76eae34c3179d40f31ed6a57d3ecbbaaccf..a6bec121d4ff002ec80a0f47510e4431176e0ddc 100644 --- a/paddle/fluid/operators/pool_with_index_op.h +++ b/paddle/fluid/operators/pool_with_index_op.h @@ -36,6 +36,7 @@ class MaxPoolWithIndexKernel : public framework::OpKernel { std::vector ksize = context.Attr>("ksize"); std::vector strides = context.Attr>("strides"); std::vector paddings = context.Attr>("paddings"); + bool adaptive = context.Attr("adaptive"); auto& dev_ctx = context.template device_context(); if (context.Attr("global_pooling")) { @@ -50,13 +51,15 @@ class MaxPoolWithIndexKernel : public framework::OpKernel { paddle::operators::math::MaxPool2dWithIndexFunctor pool2d_forward; - pool2d_forward(dev_ctx, *in_x, ksize, strides, paddings, out, mask); + pool2d_forward(dev_ctx, *in_x, ksize, strides, paddings, adaptive, out, + mask); } break; case 3: { paddle::operators::math::MaxPool3dWithIndexFunctor pool3d_forward; - pool3d_forward(dev_ctx, *in_x, ksize, strides, paddings, out, mask); + pool3d_forward(dev_ctx, *in_x, ksize, strides, paddings, 
adaptive, out, + mask); } break; default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); } } @@ -75,6 +78,7 @@ class MaxPoolWithIndexGradKernel : public framework::OpKernel { std::vector ksize = context.Attr>("ksize"); std::vector strides = context.Attr>("strides"); std::vector paddings = context.Attr>("paddings"); + bool adaptive = context.Attr("adaptive"); if (context.Attr("global_pooling")) { for (size_t i = 0; i < ksize.size(); ++i) { paddings[i] = 0; @@ -93,14 +97,14 @@ class MaxPoolWithIndexGradKernel : public framework::OpKernel { T1, T2> pool2d_backward; pool2d_backward(device_ctx, *out_grad, *mask, ksize, strides, - paddings, in_x_grad); + paddings, adaptive, in_x_grad); } break; case 3: { paddle::operators::math::MaxPool3dWithIndexGradFunctor pool3d_backward; pool3d_backward(device_ctx, *out_grad, *mask, ksize, strides, - paddings, in_x_grad); + paddings, adaptive, in_x_grad); } break; default: { PADDLE_THROW("Pool op only supports 2D and 3D input."); } } diff --git a/paddle/fluid/operators/psroi_pool_op.cc b/paddle/fluid/operators/psroi_pool_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..78989582b7a0da5b7ff326cea1606df9993bed4c --- /dev/null +++ b/paddle/fluid/operators/psroi_pool_op.cc @@ -0,0 +1,171 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/psroi_pool_op.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; + +class PSROIPoolOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "(Tensor), " + "the input of PSROIPoolOp. " + "The format of input tensor is NCHW. Where N is the batch size, " + "C is the number of input channels, " + "H is the height of the input feature map, and " + "W is the width."); + AddInput("ROIs", + "(LoDTensor), " + "ROIs (Regions of Interest) to pool over. " + "should be a 2-D LoDTensor of shape (num_rois, 4) " + "given as [(x1, y1, x2, y2), ...]. " + "where (x1, y1) is the top left coordinates, and " + "(x2, y2) is the bottom right coordinates. " + "The roi batch index can be calculated from LoD."); + AddOutput("Out", + "(Tensor), " + "the output of PSROIPoolOp is a 4-D Tensor with shape " + "(num_rois, output_channels, pooled_h, pooled_w)."); + AddAttr( + "output_channels", + "(int), " + "the number of channels of the output feature map. 
" + "For a task of C classes of objects, output_channels should be " + "(C + 1) for classification only."); + AddAttr("spatial_scale", + "(float, default 1.0), " + "Multiplicative spatial scale factor " + "to translate ROI coords from their input scale " + "to the scale used when pooling.") + .SetDefault(1.0); + AddAttr("pooled_height", + "(int, default 1), " + "the pooled output height.") + .SetDefault(1); + AddAttr("pooled_width", + "(int, default 1), " + "the pooled output width.") + .SetDefault(1); + AddComment(R"Doc( +**PSROIPool Operator** + +Position sensitive region of interest pooling (also known as PSROIPooling) is to perform +position-sensitive average pooling on regions of interest specified by input, takes as +input N position-sensitive score maps and a list of num_rois regions of interest. + +PSROIPooling for R-FCN. Please refer to https://arxiv.org/abs/1605.06409 for more details. + )Doc"); + } +}; + +class PSROIPoolOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of PSROIPoolOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("ROIs"), + "Input(ROIs) of PSROIPoolOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of PSROIPoolOp should not be null."); + auto input_dims = ctx->GetInputDim("X"); + auto rois_dims = ctx->GetInputDim("ROIs"); + + PADDLE_ENFORCE(input_dims.size() == 4, + "The format of input tensor is NCHW"); + PADDLE_ENFORCE(rois_dims.size() == 2, + "ROIs should be a 2-D LoDTensor of shape (num_rois, 4) " + "given as [(x1, y1, x2, y2), ...]"); + PADDLE_ENFORCE(rois_dims[1] == 4, + "ROIs should be a 2-D LoDTensor of shape (num_rois, 4) " + "given as [(x1, y1, x2, y2), ...]"); + + int pooled_height = ctx->Attrs().Get("pooled_height"); + int pooled_width = ctx->Attrs().Get("pooled_width"); + int output_channels = ctx->Attrs().Get("output_channels"); + float spatial_scale = ctx->Attrs().Get("spatial_scale"); + + PADDLE_ENFORCE( + input_dims[1] == output_channels * pooled_height * pooled_width, + "the channel of X(%d) should be equal to the product of " + "output_channels(%d), pooled_height(%d) and pooled_width(%d)", + input_dims[1], output_channels, pooled_height, pooled_width); + + PADDLE_ENFORCE_GT(pooled_height, 0, + "The pooled output height must be greater than 0"); + PADDLE_ENFORCE_GT(pooled_width, 0, + "The pooled output width must be greater than 0"); + PADDLE_ENFORCE_GT(output_channels, 1, + "The pooled output channels must greater than 1"); + PADDLE_ENFORCE_GT(spatial_scale, 0.0f, + "The spatial scale must greater than 0."); + + auto out_dims = input_dims; + out_dims[0] = rois_dims[0]; + out_dims[1] = + output_channels; // input_dims[1] / (pooled_height * pooled_width); + out_dims[2] = pooled_height; + out_dims[3] = pooled_width; + ctx->SetOutputDim("Out", out_dims); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.device_context()); + } +}; + +class PSROIPoolGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "The gradient of Out should not be null."); + 
PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), + "The gradient of X should not be null."); + ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.device_context()); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(psroi_pool, ops::PSROIPoolOp, ops::PSROIPoolOpMaker, + paddle::framework::DefaultGradOpDescMaker); +REGISTER_OPERATOR(psroi_pool_grad, ops::PSROIPoolGradOp); +REGISTER_OP_CPU_KERNEL( + psroi_pool, + ops::CPUPSROIPoolOpKernel, + ops::CPUPSROIPoolOpKernel); +REGISTER_OP_CPU_KERNEL( + psroi_pool_grad, + ops::CPUPSROIPoolGradOpKernel, + ops::CPUPSROIPoolGradOpKernel); diff --git a/paddle/fluid/operators/psroi_pool_op.cu b/paddle/fluid/operators/psroi_pool_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..22fec3244fabe5ca466202784c0cce372d0bf6e5 --- /dev/null +++ b/paddle/fluid/operators/psroi_pool_op.cu @@ -0,0 +1,294 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/psroi_pool_op.h" +#include "paddle/fluid/platform/cuda_primitives.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; + +static constexpr int kNumCUDAThreads = 512; +static constexpr int kNumMaximumNumBlocks = 4096; + +static inline int NumBlocks(const int N) { + return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads, + kNumMaximumNumBlocks); +} + +template +__global__ void GPUPSROIPoolForward( + const int nthreads, const T* input_data, const T* input_rois, + const float spatial_scale, const int input_channels, const int height, + const int width, const int output_channels, const int pooled_height, + const int pooled_width, const int* rois_batch_id_data, T* output_data) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + int offset = blockDim.x * gridDim.x; + for (size_t i = index; i < nthreads; i += offset) { + // The output is in order (n, c, ph, pw) + int pw = i % pooled_width; + int ph = (i / pooled_width) % pooled_height; + int c = (i / pooled_width / pooled_height) % output_channels; + int n = i / pooled_width / pooled_height / output_channels; + + // set roi_batch_id + int roi_batch_id = rois_batch_id_data[n]; + + // [start, end) interval for spatial sampling + const T* offset_input_rois = input_rois + n * 4; + T roi_start_w = static_cast(round(offset_input_rois[0])) * spatial_scale; + T roi_start_h = static_cast(round(offset_input_rois[1])) * spatial_scale; + T roi_end_w = + static_cast(round(offset_input_rois[2]) + 1.) * spatial_scale; + T roi_end_h = + static_cast(round(offset_input_rois[3]) + 1.) 
* spatial_scale; + + // Force too small ROIs to be 1x1 + T roi_height = max(roi_end_h - roi_start_h, (T)0.1); // avoid 0 + T roi_width = max(roi_end_w - roi_start_w, (T)0.1); + + // Compute w and h at input feature map + T bin_size_h = roi_height / static_cast(pooled_height); + T bin_size_w = roi_width / static_cast(pooled_width); + + int hstart = floor(bin_size_h * static_cast(ph) + roi_start_h); + int wstart = floor(bin_size_w * static_cast(pw) + roi_start_w); + int hend = ceil(bin_size_h * static_cast(ph + 1) + roi_start_h); + int wend = ceil(bin_size_w * static_cast(pw + 1) + roi_start_w); + + // Add roi offsets and clip to input boundaries + hstart = min(max(hstart, 0), height); + hend = min(max(hend, 0), height); + wstart = min(max(wstart, 0), width); + wend = min(max(wend, 0), width); + bool is_empty = (hend <= hstart) || (wend <= wstart); + + int input_channel = (c * pooled_height + ph) * pooled_width + pw; + const T* offset_input_data = + input_data + + (roi_batch_id * input_channels + input_channel) * height * width; + T outsum = 0; + + for (int ih = hstart; ih < hend; ++ih) { + for (int iw = wstart; iw < wend; ++iw) { + int input_index = ih * width + iw; + outsum += offset_input_data[input_index]; + } + } + + T bin_area = static_cast((hend - hstart) * (wend - wstart)); + output_data[i] = is_empty ? 0. : outsum / bin_area; + } +} + +template +__global__ void GPUPSROIPoolBackward( + const int nthreads, const T* input_rois, const T* output_grad_data, + const float spatial_scale, const int input_channels, const int height, + const int width, const int output_channels, const int pooled_height, + const int pooled_width, const int* rois_batch_id_data, T* input_grad_data) { + int index = blockIdx.x * blockDim.x + threadIdx.x; + int offset = blockDim.x * gridDim.x; + for (int i = index; i < nthreads; i += offset) { + // The output is in order (n, c, ph, pw) + int pw = i % pooled_width; + int ph = (i / pooled_width) % pooled_height; + int c = (i / pooled_width / pooled_height) % output_channels; + int n = i / pooled_width / pooled_height / output_channels; + + // set roi_batch_id + int roi_batch_id = rois_batch_id_data[n]; + int input_channel = (c * pooled_height + ph) * pooled_width + pw; + int input_offset = + (roi_batch_id * input_channels + input_channel) * height * width; + T* offset_input_grad_data = input_grad_data + input_offset; + + // [start, end) interval for spatial sampling + const T* offset_input_rois = input_rois + n * 4; + T roi_start_w = static_cast(round(offset_input_rois[0])) * spatial_scale; + T roi_start_h = static_cast(round(offset_input_rois[1])) * spatial_scale; + T roi_end_w = + static_cast(round(offset_input_rois[2]) + 1.) * spatial_scale; + T roi_end_h = + static_cast(round(offset_input_rois[3]) + 1.) 
* spatial_scale; + + // Force too small ROIs to be 1x1 + T roi_height = max(roi_end_h - roi_start_h, (T)0.1); // avoid 0 + T roi_width = max(roi_end_w - roi_start_w, (T)0.1); + + // Compute w and h at input feature map + T bin_size_h = roi_height / static_cast(pooled_height); + T bin_size_w = roi_width / static_cast(pooled_width); + + int hstart = floor(bin_size_h * static_cast(ph) + roi_start_h); + int wstart = floor(bin_size_w * static_cast(pw) + roi_start_w); + int hend = ceil(bin_size_h * static_cast(ph + 1) + roi_start_h); + int wend = ceil(bin_size_w * static_cast(pw + 1) + roi_start_w); + + // Add roi offsets and clip to input boundaries + hstart = min(max(hstart, 0), height); + hend = min(max(hend, 0), height); + wstart = min(max(wstart, 0), width); + wend = min(max(wend, 0), width); + bool is_empty = (hend <= hstart) || (wend <= wstart); + + // Accumulate diff_val into input data + T bin_area = static_cast((hend - hstart) * (wend - wstart)); + T diff_val = is_empty ? 0. : output_grad_data[i] / bin_area; + for (int ih = hstart; ih < hend; ++ih) { + for (int iw = wstart; iw < wend; ++iw) { + int input_index = ih * width + iw; + platform::CudaAtomicAdd(offset_input_grad_data + input_index, diff_val); + } + } + } +} + +template +class GPUPSROIPoolOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* in = ctx.Input("X"); + auto* rois = ctx.Input("ROIs"); + auto* out = ctx.Output("Out"); + + auto pooled_height = ctx.Attr("pooled_height"); + auto pooled_width = ctx.Attr("pooled_width"); + auto output_channels = ctx.Attr("output_channels"); + auto spatial_scale = ctx.Attr("spatial_scale"); + + auto in_dims = in->dims(); + int batch_size = in_dims[0]; + int input_channels = in_dims[1]; + int height = in_dims[2]; + int width = in_dims[3]; + + PADDLE_ENFORCE_EQ(input_channels, + output_channels * pooled_height * pooled_width, + "the channels of input X should equal the product of " + "output_channels x pooled_height x pooled_width"); + + int rois_num = rois->dims()[0]; + if (rois_num == 0) return; + + auto rois_lod = rois->lod().back(); + int rois_batch_size = rois_lod.size() - 1; + PADDLE_ENFORCE_EQ( + rois_batch_size, batch_size, + "The rois_batch_size and input(X) batch_size must be the same."); + int rois_num_with_lod = rois_lod[rois_batch_size]; + PADDLE_ENFORCE_EQ(rois_num, rois_num_with_lod, + "The rois_num from input and lod must be the same."); + + // set rois batch id + framework::Tensor rois_batch_id_list; + rois_batch_id_list.Resize({rois_num}); + int* rois_batch_id_data = + rois_batch_id_list.mutable_data(platform::CPUPlace()); + for (int n = 0; n < rois_batch_size; ++n) { + for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { + rois_batch_id_data[i] = n; + } + } + + framework::Tensor rois_batch_id_list_gpu; + framework::TensorCopy(rois_batch_id_list, ctx.GetPlace(), + ctx.device_context(), &rois_batch_id_list_gpu); + + int output_size = out->numel(); + int blocks = NumBlocks(output_size); + int threads = kNumCUDAThreads; + + // call cuda kernel function + GPUPSROIPoolForward< + T><<>>( + output_size, in->data(), rois->data(), spatial_scale, + input_channels, height, width, output_channels, pooled_height, + pooled_width, rois_batch_id_list_gpu.data(), + out->mutable_data(ctx.GetPlace())); + } +}; + +template +class GPUPSROIPoolGradOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* in = ctx.Input("X"); + auto* rois = 
ctx.Input("ROIs"); + + auto* output_grad = ctx.Input(framework::GradVarName("Out")); + auto* input_grad = ctx.Output(framework::GradVarName("X")); + + auto pooled_height = ctx.Attr("pooled_height"); + auto pooled_width = ctx.Attr("pooled_width"); + auto output_channels = ctx.Attr("output_channels"); + auto spatial_scale = ctx.Attr("spatial_scale"); + + int rois_num = rois->dims()[0]; + int input_channels = in->dims()[1]; + int height = in->dims()[2]; + int width = in->dims()[3]; + + if (input_grad) { + // set roi batch id + framework::Tensor rois_batch_id_list; + rois_batch_id_list.Resize({rois_num}); + int* rois_batch_id_data = + rois_batch_id_list.mutable_data(platform::CPUPlace()); + auto rois_lod = rois->lod().back(); + int rois_batch_size = rois_lod.size() - 1; + for (int n = 0; n < rois_batch_size; ++n) { + for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { + rois_batch_id_data[i] = n; + } + } + + framework::Tensor rois_batch_id_list_gpu; + framework::TensorCopy(rois_batch_id_list, ctx.GetPlace(), + ctx.device_context(), &rois_batch_id_list_gpu); + + input_grad->mutable_data(ctx.GetPlace()); + math::SetConstant set_zero; + set_zero(ctx.cuda_device_context(), input_grad, static_cast(0)); + + int output_grad_size = output_grad->numel(); + int blocks = NumBlocks(output_grad_size); + int threads = kNumCUDAThreads; + + if (output_grad_size > 0) { + GPUPSROIPoolBackward< + T><<>>( + output_grad_size, rois->data(), output_grad->data(), + spatial_scale, input_channels, height, width, output_channels, + pooled_height, pooled_width, rois_batch_id_list_gpu.data(), + input_grad->mutable_data(ctx.GetPlace())); + } + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + psroi_pool, + ops::GPUPSROIPoolOpKernel, + ops::GPUPSROIPoolOpKernel); +REGISTER_OP_CUDA_KERNEL( + psroi_pool_grad, + ops::GPUPSROIPoolGradOpKernel, + ops::GPUPSROIPoolGradOpKernel); diff --git a/paddle/fluid/operators/psroi_pool_op.h b/paddle/fluid/operators/psroi_pool_op.h new file mode 100644 index 0000000000000000000000000000000000000000..1a424728f7f6c4034242fb998d5121804e38702b --- /dev/null +++ b/paddle/fluid/operators/psroi_pool_op.h @@ -0,0 +1,253 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +template +class CPUPSROIPoolOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* in = ctx.Input("X"); + auto* rois = ctx.Input("ROIs"); + auto* out = ctx.Output("Out"); + + auto pooled_height = ctx.Attr("pooled_height"); + auto pooled_width = ctx.Attr("pooled_width"); + auto spatial_scale = ctx.Attr("spatial_scale"); + auto output_channels = ctx.Attr("output_channels"); + + auto in_dims = in->dims(); + int batch_size = in_dims[0]; + int input_channels = in_dims[1]; + int height = in_dims[2]; + int width = in_dims[3]; + int rois_num = rois->dims()[0]; + + auto in_stride = framework::stride(in_dims); + auto roi_stride = framework::stride(rois->dims()); + auto out_stride = framework::stride(out->dims()); + + const T* input_data = in->data(); + + framework::Tensor rois_batch_id_list; + rois_batch_id_list.Resize({rois_num}); + int* rois_batch_id_data = + rois_batch_id_list.mutable_data(ctx.GetPlace()); + + auto rois_lod = rois->lod().back(); + int rois_batch_size = rois_lod.size() - 1; + PADDLE_ENFORCE_EQ( + rois_batch_size, batch_size, + "the rois_batch_size and input(X) batch_size should be the same."); + int rois_num_with_lod = rois_lod[rois_batch_size]; + PADDLE_ENFORCE_EQ(rois_num_with_lod, rois_num, + "the rois_num from input and lod must be the same"); + + PADDLE_ENFORCE_EQ(input_channels, + output_channels * pooled_height * pooled_width, + "the channels of input X should equal the product of " + "output_channels x pooled_height x pooled_width"); + + // calculate batch id index for each roi according to LoD + for (int n = 0; n < rois_batch_size; ++n) { + for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { + rois_batch_id_data[i] = n; + } + } + + T* output_data = out->mutable_data(ctx.GetPlace()); + const T* input_rois = rois->data(); + + // calculate psroipooling, parallel processing can be implemented per ROI + for (int n = 0; n < rois_num; ++n) { + // set roi batch id + int roi_batch_id = rois_batch_id_data[n]; + + // [start, end) interval for spatial sampling + const T* offset_input_rois = input_rois + n * 4; + T roi_start_w = + static_cast(round(offset_input_rois[0])) * spatial_scale; + T roi_start_h = + static_cast(round(offset_input_rois[1])) * spatial_scale; + T roi_end_w = + static_cast(round(offset_input_rois[2]) + 1.) * spatial_scale; + T roi_end_h = + static_cast(round(offset_input_rois[3]) + 1.) * spatial_scale; + + // Force too small rois to be 1 x 1 + T roi_height = std::max(roi_end_h - roi_start_h, (T)0.1); // avoid 0 + T roi_width = std::max(roi_end_w - roi_start_w, (T)0.1); + + // Compute bin size w and h at input feature map + T bin_size_h = roi_height / static_cast(pooled_height); + T bin_size_w = roi_width / static_cast(pooled_width); + + // calculate each pixel of the output feature map. 
+ int out_roi_offset = n * out_stride[0]; + for (int c = 0; c < output_channels; ++c) { + // per category + int out_plane_offset = out_roi_offset + c * out_stride[1]; + for (int ph = 0; ph < pooled_height; ++ph) { + int out_row_offset = out_plane_offset + ph * out_stride[2]; + for (int pw = 0; pw < pooled_width; ++pw) { + // calculate w and h at input feature map + int hstart = floor(static_cast(ph) * bin_size_h + roi_start_h); + int wstart = floor(static_cast(pw) * bin_size_w + roi_start_w); + int hend = ceil(static_cast(ph + 1) * bin_size_h + roi_start_h); + int wend = ceil(static_cast(pw + 1) * bin_size_w + roi_start_w); + // Add roi offsets and clip to input boundaries + hstart = std::min(std::max(hstart, 0), height); + wstart = std::min(std::max(wstart, 0), width); + hend = std::min(std::max(hend, 0), height); + wend = std::min(std::max(wend, 0), width); + + int output_index = out_row_offset + pw; + int input_channel = (c * pooled_height + ph) * pooled_width + pw; + int input_plane_offset = + roi_batch_id * in_stride[0] + input_channel * in_stride[1]; + const T* offset_input_data = input_data + input_plane_offset; + T out_sum = 0.; + bool is_empty = (hend <= hstart) || (wend <= wstart); + for (int ih = hstart; ih < hend; ++ih) { + for (int iw = wstart; iw < wend; ++iw) { + int input_index = ih * in_stride[2] + iw; + out_sum += offset_input_data[input_index]; + } + } + T bin_area = (hend - hstart) * (wend - wstart); + output_data[output_index] = is_empty ? 0. : out_sum / bin_area; + } + } + } + } + return; + } +}; + +template +class CPUPSROIPoolGradOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* in = ctx.Input("X"); + auto* rois = ctx.Input("ROIs"); + auto* output_grad = + ctx.Input(framework::GradVarName("Out")); + auto* input_grad = + ctx.Output(framework::GradVarName("X")); + + auto pooled_height = ctx.Attr("pooled_height"); + auto pooled_width = ctx.Attr("pooled_width"); + auto output_channels = ctx.Attr("output_channels"); + auto spatial_scale = ctx.Attr("spatial_scale"); + + if (input_grad) { + auto in_dims = in->dims(); + int input_channels = in_dims[1]; + int height = in_dims[2]; + int width = in_dims[3]; + int rois_num = rois->dims()[0]; + + // set roi batch id + framework::Tensor rois_batch_id_list; + rois_batch_id_list.Resize({rois_num}); + int* rois_batch_id_data = + rois_batch_id_list.mutable_data(ctx.GetPlace()); + auto rois_lod = rois->lod().back(); + int rois_batch_size = rois_lod.size() - 1; + // calculate batch id index for each roi according to LoD + for (int n = 0; n < rois_batch_size; ++n) { + for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { + rois_batch_id_data[i] = n; + } + } + + const T* input_rois = rois->data(); + const T* output_grad_data = output_grad->data(); + T* input_grad_data = input_grad->mutable_data(ctx.GetPlace()); + + // set gradient of X to be 0. before backpropagate. 
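Each output element of the CPU kernel above averages one position-sensitive slice of the input over one ROI bin: the input channel is selected by (c, ph, pw) and the bin boundaries come from the ROI coordinates scaled by spatial_scale. A NumPy sketch of that per-bin computation for a single ROI (function and variable names are illustrative):

```python
import math
import numpy as np

def psroi_bin_value(x, roi, c, ph, pw, pooled_height, pooled_width,
                    spatial_scale=1.0):
    """Average one position-sensitive bin of NCHW input x for one ROI (sketch only)."""
    _, _, height, width = x.shape
    x1, y1, x2, y2 = roi
    roi_start_w = round(x1) * spatial_scale
    roi_start_h = round(y1) * spatial_scale
    roi_end_w = (round(x2) + 1.0) * spatial_scale
    roi_end_h = (round(y2) + 1.0) * spatial_scale
    # force too small ROIs to be at least 0.1 wide/high, as in the kernel
    roi_h = max(roi_end_h - roi_start_h, 0.1)
    roi_w = max(roi_end_w - roi_start_w, 0.1)
    bin_h = roi_h / pooled_height
    bin_w = roi_w / pooled_width
    hstart = min(max(int(math.floor(ph * bin_h + roi_start_h)), 0), height)
    hend = min(max(int(math.ceil((ph + 1) * bin_h + roi_start_h)), 0), height)
    wstart = min(max(int(math.floor(pw * bin_w + roi_start_w)), 0), width)
    wend = min(max(int(math.ceil((pw + 1) * bin_w + roi_start_w)), 0), width)
    if hend <= hstart or wend <= wstart:
        return 0.0  # empty bin
    # position-sensitive channel: a distinct input channel per (c, ph, pw) cell
    in_c = (c * pooled_height + ph) * pooled_width + pw
    patch = x[0, in_c, hstart:hend, wstart:wend]
    return float(patch.sum() / patch.size)

# e.g. 2 output channels with a 7x7 grid -> 98 input channels
x = np.arange(98 * 8 * 8, dtype="float64").reshape(1, 98, 8, 8)
print(psroi_bin_value(x, roi=(1, 1, 5, 5), c=0, ph=3, pw=2,
                      pooled_height=7, pooled_width=7))
```

The gradient kernel that follows does the inverse: it spreads output_grad / bin_area uniformly back over the same window.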
+ math::SetConstant set_zero; + set_zero(ctx.template device_context(), input_grad, + static_cast(0)); + + // backpropagate gradient per output pixel + int output_grad_size = output_grad->numel(); + for (int i = 0; i < output_grad_size; ++i) { + // The output is in order (n, c, ph, pw) + int pw = i % pooled_width; + int ph = (i / pooled_width) % pooled_height; + int c = (i / pooled_width / pooled_height) % output_channels; + int n = i / pooled_width / pooled_height / output_channels; + + // set roi_batch_id + int roi_batch_id = rois_batch_id_data[n]; + int input_channel = (c * pooled_height + ph) * pooled_width + pw; + int input_offset = + (roi_batch_id * input_channels + input_channel) * height * width; + T* offset_input_grad_data = input_grad_data + input_offset; + + // [start, end) interval for spatial sampling + const T* offset_input_rois = input_rois + n * 4; + T roi_start_w = + static_cast(round(offset_input_rois[0])) * spatial_scale; + T roi_start_h = + static_cast(round(offset_input_rois[1])) * spatial_scale; + T roi_end_w = + static_cast(round(offset_input_rois[2]) + 1.) * spatial_scale; + T roi_end_h = + static_cast(round(offset_input_rois[3]) + 1.) * spatial_scale; + + // Force too small ROIs to be 1x1 + T roi_height = std::max(roi_end_h - roi_start_h, (T)0.1); // avoid 0 + T roi_width = std::max(roi_end_w - roi_start_w, (T)0.1); + + // Compute w and h at input feature map + T bin_size_h = roi_height / static_cast(pooled_height); + T bin_size_w = roi_width / static_cast(pooled_width); + + int hstart = floor(bin_size_h * static_cast(ph) + roi_start_h); + int wstart = floor(bin_size_w * static_cast(pw) + roi_start_w); + int hend = ceil(bin_size_h * static_cast(ph + 1) + roi_start_h); + int wend = ceil(bin_size_w * static_cast(pw + 1) + roi_start_w); + + // Add roi offsets and clip to input boundaries + hstart = std::min(std::max(hstart, 0), height); + hend = std::min(std::max(hend, 0), height); + wstart = std::min(std::max(wstart, 0), width); + wend = std::min(std::max(wend, 0), width); + bool is_empty = (hend <= hstart) || (wend <= wstart); + + // Accumulate diff_val into input data + T bin_area = static_cast((hend - hstart) * (wend - wstart)); + T diff_val = is_empty ? 0. : output_grad_data[i] / bin_area; + for (int ih = hstart; ih < hend; ++ih) { + for (int iw = wstart; iw < wend; ++iw) { + int input_index = ih * width + iw; + offset_input_grad_data[input_index] += diff_val; + } + } + } + } + return; + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/reader/ctr_reader.h b/paddle/fluid/operators/reader/ctr_reader.h index 9b2a11bae12d242880829628faa089e1638424b0..7fc07efe7304701794595c9fa63f4a306d61e230 100644 --- a/paddle/fluid/operators/reader/ctr_reader.h +++ b/paddle/fluid/operators/reader/ctr_reader.h @@ -16,6 +16,7 @@ #include +#include #include // NOLINT #include #include @@ -55,8 +56,7 @@ class CTRReader : public framework::FileReader { PADDLE_ENFORCE_GT(thread_num, 0, "thread num should be larger then 0!"); PADDLE_ENFORCE(queue != nullptr, "LoDTensorBlockingQueue must not be null"); PADDLE_ENFORCE_GT(file_list.size(), 0, "file list should not be empty"); - thread_num_ = - file_list_.size() > thread_num ? 
thread_num : file_list_.size(); + thread_num_ = std::min(file_list_.size(), thread_num); queue_ = queue; SplitFiles(); for (size_t i = 0; i < thread_num_; ++i) { @@ -95,10 +95,10 @@ class CTRReader : public framework::FileReader { queue_->ReOpen(); VLOG(3) << "reopen success"; VLOG(3) << "thread_num " << thread_num_; - for (int thread_id = 0; thread_id < thread_num_; thread_id++) { - read_threads_.emplace_back(new std::thread( - std::bind(&ReadThread, file_groups_[thread_id], slots_, batch_size_, - thread_id, &read_thread_status_, queue_))); + for (size_t thread_id = 0; thread_id < thread_num_; thread_id++) { + read_threads_.emplace_back(new std::thread(std::bind( + &ReadThread, file_groups_[thread_id], slots_, batch_size_, + static_cast(thread_id), &read_thread_status_, queue_))); } monitor_thread_.reset(new std::thread( std::bind(&MonitorThread, &read_thread_status_, queue_))); diff --git a/paddle/fluid/operators/spp_op.h b/paddle/fluid/operators/spp_op.h index 35d9737ee01fe1505cbe30e8ed735e6b92cb8df2..3c2d51ec9111e649632dda89290f21a0988db6dd 100644 --- a/paddle/fluid/operators/spp_op.h +++ b/paddle/fluid/operators/spp_op.h @@ -56,13 +56,13 @@ class SppKernel : public framework::OpKernel { math::Pool2dFunctor, T> pool_forward; math::MaxPool max_process; pool_forward(context.template device_context(), *in_x, - kernel_size, strides, paddings, max_process, true, + kernel_size, strides, paddings, max_process, true, false, &out_level); } else if (pooling_type == "avg") { math::Pool2dFunctor, T> pool_forward; math::AvgPool avg_process; pool_forward(context.template device_context(), *in_x, - kernel_size, strides, paddings, avg_process, true, + kernel_size, strides, paddings, avg_process, true, false, &out_level); } // flatten pooling output shape @@ -156,7 +156,7 @@ class SppGradKernel : public framework::OpKernel { math::AvgPoolGrad avg_process; pool_backward(context.template device_context(), *in_x, *&out_level, *&outgrad_level, kernel_size, strides, - paddings, avg_process, true, in_x_grad); + paddings, avg_process, true, false, in_x_grad); } } } diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index 93cb5eb2dc0b3480ebd05dcc6b36d8915d057bab..23c7ebe84221986a5f7ac7583c3a8e17d04fe4af 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -56,9 +56,16 @@ ELSE() set(MKLDNN_CTX_DEPS) ENDIF() +nv_library(stream_callback_manager SRCS stream_callback_manager.cc DEPS simple_threadpool enforce) +IF(WITH_GPU) + set(STREAM_CALLBACK_DEPS stream_callback_manager) +ELSE() + set(STREAM_CALLBACK_DEPS) +ENDIF() + # memcpy depends on device_context, here add deps individually for # avoiding cycle dependencies -cc_library(device_context SRCS device_context.cc init.cc DEPS simple_threadpool malloc +cc_library(device_context SRCS device_context.cc init.cc DEPS simple_threadpool malloc ${STREAM_CALLBACK_DEPS} place eigen3 stringpiece cpu_helper cpu_info framework_proto ${GPU_CTX_DEPS} ${MKLDNN_CTX_DEPS}) nv_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_info) diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index bd81d4dd1f1073edffcb9fd4a02b455db27361d5..d2e23d80f437e1df9216fa36e99a9be394dda074 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -3,6 +3,7 @@ Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index ce1494f1702aa2bbcb255dec122cdae776cbc4a0..812e56f1f966d03207cf83ad47cb88e9fa5d55bb 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -222,14 +222,10 @@ class CUDADeviceContext : public DeviceContext { template void AddStreamCallback(Callback&& callback) const { - std::lock_guard guard(callback_mtx_); callback_manager_->AddCallback(callback); } - void WaitStreamCallback() const { - std::lock_guard guard(callback_mtx_); - callback_manager_->Wait(); - } + void WaitStreamCallback() const { callback_manager_->Wait(); } #if CUDA_VERSION >= 9000 /*! \brief CublasCall may need to change cublas's config, @@ -260,9 +256,7 @@ class CUDADeviceContext : public DeviceContext { mutable std::mutex mtx_; - // This lock is only used by callback - // If we use mtx_ for StreamCallbackManager, deadlock may occur sometimes - mutable std::mutex callback_mtx_; + // StreamCallbackManager is thread-safe std::unique_ptr callback_manager_; mutable std::mutex cublas_mtx_; diff --git a/paddle/fluid/platform/stream_callback_manager.cc b/paddle/fluid/platform/stream_callback_manager.cc new file mode 100644 index 0000000000000000000000000000000000000000..466c77469ef256179c52442d21c1d62dfc4ef1bb --- /dev/null +++ b/paddle/fluid/platform/stream_callback_manager.cc @@ -0,0 +1,63 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/platform/stream_callback_manager.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace platform { + +#if CUDA_VERSION >= 10000 +static void CUDART_CB StreamCallbackFunc(void *user_data); +#else +static void CUDART_CB StreamCallbackFunc(cudaStream_t stream, + cudaError_t status, void *user_data) +#endif +{ + std::unique_ptr> func( + reinterpret_cast *>(user_data)); + (*func)(); +} + +StreamCallbackManager::StreamCallbackManager(const cudaStream_t stream) + : stream_(stream), thread_pool_(1) {} + +void StreamCallbackManager::AddCallback(std::function callback) const { + auto *callback_func = new std::function(std::move(callback)); + auto *func = new std::function([this, callback_func] { + std::lock_guard lock(mtx_); + last_future_ = thread_pool_.enqueue([callback_func] { + std::unique_ptr> releaser(callback_func); + (*callback_func)(); + }); + }); +#if CUDA_VERSION >= 10000 + PADDLE_ENFORCE(cudaLaunchHostFunc(stream_, StreamCallbackFunc, func)); +#else + PADDLE_ENFORCE(cudaStreamAddCallback(stream_, StreamCallbackFunc, func, 0)); +#endif +} + +void StreamCallbackManager::Wait() const { + PADDLE_ENFORCE(cudaStreamSynchronize(stream_)); + { + std::lock_guard lock(mtx_); + if (last_future_.valid()) { + last_future_.wait(); + } + } +} + +} // namespace platform +} // namespace paddle diff --git a/paddle/fluid/platform/stream_callback_manager.h b/paddle/fluid/platform/stream_callback_manager.h index ed8734c98cb996721a3002523c9276e6d7f492ae..8668bcb1131719e882ecbccb08ad00b63409eb28 100644 --- a/paddle/fluid/platform/stream_callback_manager.h +++ b/paddle/fluid/platform/stream_callback_manager.h @@ -18,67 +18,32 @@ #include #include #include +#include // NOLINT #include +#include // NOLINT + #include "paddle/fluid/platform/enforce.h" namespace paddle { namespace platform { -class StreamCallbackManager; - -struct StreamCallbackContext { - template - inline StreamCallbackContext(const StreamCallbackManager *manager, - Callback &&callback) - : manager_(manager), callback_(callback) {} - - const StreamCallbackManager *manager_; // do not own - std::function callback_; -}; - +// NOTE(zjl): clean StreamCallbackManager to make compilation faster +// Make StreamCallbackManager thread-safe class StreamCallbackManager { public: - explicit inline StreamCallbackManager(cudaStream_t stream = nullptr) - : stream_(stream), thread_pool_(new ThreadPool(1)) {} + explicit StreamCallbackManager(const cudaStream_t stream); + + ~StreamCallbackManager() = default; - template - inline void AddCallback(Callback &&callback) const { - auto *stream_callback_context = - new StreamCallbackContext(this, std::forward(callback)); -#if CUDA_VERSION >= 10000 - PADDLE_ENFORCE(cudaLaunchHostFunc(stream_, - StreamCallbackManager::StreamCallbackFunc, - stream_callback_context)); // NOLINT -#else - PADDLE_ENFORCE(cudaStreamAddCallback( - stream_, StreamCallbackManager::StreamCallbackFunc, - stream_callback_context, 0)); // NOLINT -#endif - } + void AddCallback(std::function callback) const; - void Wait() const { thread_pool_.reset(new ThreadPool(1)); } + void Wait() const; private: const cudaStream_t stream_; - mutable std::unique_ptr thread_pool_; - -// cudaStreamCallback cannot call CUDA API inside, so we have to use -// thread_pool here -#if CUDA_VERSION >= 10000 - static void CUDART_CB StreamCallbackFunc(void *user_data) -#else - static void CUDART_CB StreamCallbackFunc(cudaStream_t stream, - cudaError_t status, void *user_data) -#endif - { - auto *callback_context_ptr = - 
reinterpret_cast(user_data); - callback_context_ptr->manager_->thread_pool_->enqueue([=]() { - std::unique_ptr callback_context( - callback_context_ptr); - callback_context->callback_(); - }); - } + mutable ::ThreadPool thread_pool_; + mutable std::mutex mtx_; + mutable std::future last_future_; }; } // namespace platform diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 399f949e7009e1e55b56aeff1dbb537ffe20d90e..017598e1707ffdde1396e7d2bb0bbcbdfa547e71 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -81,6 +81,14 @@ bool IsCompiledWithCUDA() { #endif } +bool IsCompiledWithBrpc() { +#if defined(PADDLE_WITH_BRPC) || defined(PADDLE_WITH_BRPC_RDMA) + return true; +#else + return false; +#endif +} + bool IsCompiledWithDIST() { #ifdef PADDLE_WITH_DISTRIBUTE return true; @@ -631,6 +639,7 @@ All parameter, weight, gradient are variables in Paddle. [](bool init_p2p) { framework::InitDevices(init_p2p); }); m.def("is_compiled_with_cuda", IsCompiledWithCUDA); + m.def("is_compiled_with_brpc", IsCompiledWithBrpc); m.def("is_compiled_with_dist", IsCompiledWithDIST); #ifdef PADDLE_WITH_CUDA m.def("is_float16_supported", [](const platform::CUDAPlace &place) -> bool { diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h index 5e91f5b30181eec8934d22a8c40d88f9f5c04ae8..ecdc8f3dc75cc8b72520e0fd1411e23d2dbb07e2 100644 --- a/paddle/fluid/pybind/tensor_py.h +++ b/paddle/fluid/pybind/tensor_py.h @@ -162,7 +162,7 @@ void PyCPUTensorSetFromArray( paddle::platform::CPUPlace place) { std::vector dims; dims.reserve(array.ndim()); - for (size_t i = 0; i < array.ndim(); ++i) { + for (decltype(array.ndim()) i = 0; i < array.ndim(); ++i) { dims.push_back(static_cast(array.shape()[i])); } @@ -182,7 +182,7 @@ inline void PyCPUTensorSetFromArray( paddle::platform::CPUPlace place) { std::vector dims; dims.reserve(array.ndim()); - for (int i = 0; i < array.ndim(); ++i) { + for (decltype(array.ndim()) i = 0; i < array.ndim(); ++i) { dims.push_back(static_cast(array.shape()[i])); } @@ -200,7 +200,7 @@ void PyCUDATensorSetFromArray( paddle::platform::CUDAPlace place) { std::vector dims; dims.reserve(array.ndim()); - for (size_t i = 0; i < array.ndim(); ++i) { + for (decltype(array.ndim()) i = 0; i < array.ndim(); ++i) { dims.push_back(static_cast(array.shape()[i])); } @@ -221,7 +221,7 @@ inline void PyCUDATensorSetFromArray( paddle::platform::CUDAPlace place) { std::vector dims; dims.reserve(array.ndim()); - for (size_t i = 0; i < array.ndim(); ++i) { + for (decltype(array.ndim()) i = 0; i < array.ndim(); ++i) { dims.push_back(static_cast(array.shape()[i])); } @@ -240,7 +240,7 @@ void PyCUDAPinnedTensorSetFromArray( const paddle::platform::CUDAPinnedPlace &place) { std::vector dims; dims.reserve(array.ndim()); - for (size_t i = 0; i < array.ndim(); ++i) { + for (decltype(array.ndim()) i = 0; i < array.ndim(); ++i) { dims.push_back(static_cast(array.shape()[i])); } @@ -260,7 +260,7 @@ inline void PyCUDAPinnedTensorSetFromArray( const paddle::platform::CUDAPinnedPlace &place) { std::vector dims; dims.reserve(array.ndim()); - for (size_t i = 0; i < array.ndim(); ++i) { + for (decltype(array.ndim()) i = 0; i < array.ndim(); ++i) { dims.push_back(static_cast(array.shape()[i])); } diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 6299b166af8a5f65cf587ae282c955f33db0044b..a0da89d3195b3e9a7bbac19a3985baa8c1fb55d0 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -517,6 +517,18 @@ 
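The rewritten StreamCallbackManager funnels every callback through a single-worker thread pool and has Wait() synchronize the stream and then join the last enqueued future, which is why the separate callback_mtx_ in CUDADeviceContext could be dropped. As a rough, CUDA-free Python analogy of that design (class and method names are made up for illustration):

```python
import threading
from concurrent.futures import ThreadPoolExecutor

class SerialCallbackManager:
    """Single-worker callback runner: callbacks run in order, wait() joins the last one."""

    def __init__(self):
        self._pool = ThreadPoolExecutor(max_workers=1)  # one worker => FIFO execution
        self._mtx = threading.Lock()
        self._last_future = None

    def add_callback(self, fn):
        with self._mtx:
            self._last_future = self._pool.submit(fn)

    def wait(self):
        with self._mtx:
            fut = self._last_future
        if fut is not None:
            fut.result()  # earlier callbacks finished first, since the pool is serial

mgr = SerialCallbackManager()
mgr.add_callback(lambda: print("callback 1"))
mgr.add_callback(lambda: print("callback 2"))
mgr.wait()
```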
function assert_api_spec_approvals() { fi fi done + + HAS_CONST_CAST=`git diff -U0 upstream/$BRANCH |grep -o -m 1 "const_cast" || true` + if [ ${HAS_CONST_CAST} ] && [ "${GIT_PR_ID}" != "" ]; then + APPROVALS=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000 | \ + python ${PADDLE_ROOT}/tools/check_pr_approval.py 2 7845005 2887803 728699 13348433` + echo "current pr ${GIT_PR_ID} got approvals: ${APPROVALS}" + if [ "${APPROVALS}" == "FALSE" ]; then + echo "You must have at least 2 approvals for the const_cast" + exit 1 + fi + fi + } diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index a532f94c6dd08de68c56e7af974dbf9a371cf121..2dea71d7af9dd8d9bdf544876be6035c414a8c52 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -126,9 +126,9 @@ def __bootstrap__(): 'check_nan_inf', 'benchmark', 'eager_delete_scope', 'use_mkldnn', 'use_ngraph', 'initial_cpu_memory_in_mb', 'init_allocated_mem', 'free_idle_memory', 'paddle_num_threads', "dist_threadpool_size", - 'eager_delete_tensor_gb', 'allocator_strategy', - 'reader_queue_speed_test_mode', 'print_sub_graph_dir', - 'pe_profile_fname' + 'eager_delete_tensor_gb', 'fast_eager_deletion_mode', + 'allocator_strategy', 'reader_queue_speed_test_mode', + 'print_sub_graph_dir', 'pe_profile_fname' ] if 'Darwin' not in sysstr: read_env_flags.append('use_pinned_memory') @@ -152,6 +152,7 @@ def __bootstrap__(): 'enable_cublas_tensor_op_math', 'conv_workspace_size_limit', 'cudnn_exhaustive_search', 'selected_gpus' ] + core.init_gflags([sys.argv[0]] + ["--tryfromenv=" + ",".join(read_env_flags)]) core.init_glog(sys.argv[0]) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index e25eaaa9fda6add9d8e81d9e6bdfb711cee3648e..9e6cd1a0ab54d9fd2764b787416e4f5c86755a68 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -52,6 +52,8 @@ __all__ = [ 'softmax', 'pool2d', 'pool3d', + 'adaptive_pool2d', + 'adaptive_pool3d', 'batch_norm', 'beam_search_decode', 'conv2d_transpose', @@ -173,6 +175,7 @@ __all__ = [ 'merge_selected_rows', 'get_tensor_from_selected_rows', 'lstm', + 'psroi_pool', ] kIgnoreIndex = -100 @@ -2499,6 +2502,204 @@ def pool3d(input, return pool_out +@templatedoc(op_type="pool2d") +def adaptive_pool2d(input, + pool_size, + pool_type="max", + require_index=False, + name=None): + """ + ${comment} + + Args: + input (Variable): The input tensor of pooling operator. The format of + input tensor is NCHW, where N is batch size, C is + the number of channels, H is the height of the + feature, and W is the width of the feature. + pool_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, + it must contain two integers, (pool_size_Height, pool_size_Width). + pool_type: ${pooling_type_comment} + require_index (bool): If true, the index of max pooling point along with outputs. + it cannot be set in average pooling type. + name (str|None): A name for this layer(optional). If set None, the + layer will be named automatically. + + Returns: + Variable: The pooling result. + + Raises: + ValueError: 'pool_type' is not 'max' nor 'avg'. + ValueError: invalid setting 'require_index' true when 'pool_type' is 'avg'. + ValueError: 'pool_size' should be a list or tuple with length as 2. + + Examples: + .. 
code-block:: python + + # suppose input data in shape of [N, C, H, W], `pool_size` is [m, n], + # output shape is [N, C, m, n], adaptive pool divides H and W dimensions + # of input data into m * n grids evenly and performs pooling in each + # grid to get output. + # adaptive average pool performs calculations as follows: + # + # for i in range(m): + # for j in range(n): + # hstart = floor(i * H / m) + # hend = ceil((i + 1) * H / m) + # wstart = floor(j * W / n) + # wend = ceil((j + 1) * W / n) + # output[:, :, i, j] = avg(input[:, :, hstart: hend, wstart: wend]) + # + data = fluid.layers.data( + name='data', shape=[3, 32, 32], dtype='float32') + pool_out = fluid.layers.adaptive_pool2d( + input=data, + pool_size=[3, 3], + pool_type='avg') + """ + if pool_type not in ["max", "avg"]: + raise ValueError( + "Unknown pool_type: '%s'. It can only be 'max' or 'avg'." % + str(pool_type)) + + if pool_type == "avg" and require_index: + raise ValueError( + "invalid setting 'require_index' true when 'pool_type' is 'avg'.") + + def _is_list_or_tuple_(data): + return (isinstance(data, list) or isinstance(data, tuple)) + + if not _is_list_or_tuple_(pool_size) or len(pool_size) != 2: + raise ValueError( + "'pool_size' should be a list or tuple with length as 2.") + + if pool_type == "max": + l_type = 'max_pool2d_with_index' + else: + l_type = "pool2d" + + helper = LayerHelper(l_type, **locals()) + dtype = helper.input_dtype() + pool_out = helper.create_variable_for_type_inference(dtype) + + outputs = {"Out": pool_out} + if pool_type == "max": + mask = helper.create_variable_for_type_inference(dtype) + outputs["Mask"] = mask + + helper.append_op( + type=l_type, + inputs={"X": input}, + outputs=outputs, + attrs={ + "pooling_type": pool_type, + "ksize": pool_size, + "adaptive": True, + }) + + return (pool_out, mask) if require_index else pool_out + + +@templatedoc(op_type="pool3d") +def adaptive_pool3d(input, + pool_size, + pool_type="max", + require_index=False, + name=None): + """ + ${comment} + + Args: + input (Variable): The input tensor of pooling operator. The format of + input tensor is NCDHW, where N is batch size, C is + the number of channels, D is the depth of the feature, + H is the height of the feature, and W is the width of the feature. + pool_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, + it must contain three integers, (Depth, Height, Width). + pool_type: ${pooling_type_comment} + require_index (bool): If true, the index of max pooling point along with outputs. + it cannot be set in average pooling type. + name (str|None): A name for this layer(optional). If set None, the + layer will be named automatically. + + Returns: + Variable: The pooling result. + + Raises: + ValueError: 'pool_type' is not 'max' nor 'avg'. + ValueError: invalid setting 'require_index' true when 'pool_type' is 'avg'. + ValueError: 'pool_size' should be a list or tuple with length as 3. + + Examples: + .. code-block:: python + + # suppose input data in shape of [N, C, D, H, W], `pool_size` is [l, m, n], + # output shape is [N, C, l, m, n], adaptive pool divides D, H and W dimensions + # of input data into l * m * n grids evenly and performs pooling in each + # grid to get output.
+ # adaptive average pool performs calculations as follows: + # + # for i in range(l): + # for j in range(m): + # for k in range(n): + # dstart = floor(i * D / l) + # dend = ceil((i + 1) * D / l) + # hstart = floor(j * H / m) + # hend = ceil((j + 1) * H / m) + # wstart = floor(k * W / n) + # wend = ceil((k + 1) * W / n) + # output[:, :, i, j, k] = + # avg(input[:, :, dstart:dend, hstart: hend, wstart: wend]) + # + data = fluid.layers.data( + name='data', shape=[3, 32, 32, 32], dtype='float32') + pool_out = fluid.layers.adaptive_pool3d( + input=data, + pool_size=[3, 3, 3], + pool_type='avg') + """ + if pool_type not in ["max", "avg"]: + raise ValueError( + "Unknown pool_type: '%s'. It can only be 'max' or 'avg'." % + str(pool_type)) + + if pool_type == "avg" and require_index: + raise ValueError( + "invalid setting 'require_index' true when 'pool_type' is 'avg'.") + + def _is_list_or_tuple_(data): + return (isinstance(data, list) or isinstance(data, tuple)) + + if not _is_list_or_tuple_(pool_size) or len(pool_size) != 3: + raise ValueError( + "'pool_size' should be a list or tuple with length as 3.") + + if pool_type == "max": + l_type = 'max_pool3d_with_index' + else: + l_type = "pool3d" + + helper = LayerHelper(l_type, **locals()) + dtype = helper.input_dtype() + pool_out = helper.create_variable_for_type_inference(dtype) + + outputs = {"Out": pool_out} + if pool_type == "max": + mask = helper.create_variable_for_type_inference(dtype) + outputs["Mask"] = mask + + helper.append_op( + type=l_type, + inputs={"X": input}, + outputs=outputs, + attrs={ + "pooling_type": pool_type, + "ksize": pool_size, + "adaptive": True, + }) + + return (pool_out, mask) if require_index else pool_out + + def batch_norm(input, act=None, is_test=False, @@ -9122,3 +9323,57 @@ def get_tensor_from_selected_rows(x, name=None): outputs={'Out': out}, attrs={}) return out + + +@templatedoc() +def psroi_pool(input, + rois, + output_channels, + spatial_scale, + pooled_height, + pooled_width, + name=None): + """ + ${comment} + + Args: + input (Variable): ${x_comment} + rois (Variable): ROIs (Regions of Interest) to pool over. + output_channels (integer): ${output_channels_comment} + spatial_scale (float): ${spatial_scale_comment} Default: 1.0 + pooled_height (integer): ${pooled_height_comment} Default: 1 + pooled_width (integer): ${pooled_width_comment} Default: 1 + name (str, default None): The name of this layer. + + Returns: + Variable: ${out_comment}. + + Examples: + ..
code-block:: python + + pool_out = fluid.layers.psroi_pool(input=x, rois=rois, output_channels=490, spatial_scale=1.0, pooled_height=7, pooled_width=7) + """ + helper = LayerHelper('psroi_pool', **locals()) + # check attrs + if not isinstance(output_channels, int): + raise TypeError("output_channels must be int type") + if not isinstance(spatial_scale, float): + raise TypeError("spatial_scale must be float type") + if not isinstance(pooled_height, int): + raise TypeError("pooled_height must be int type") + if not isinstance(pooled_width, int): + raise TypeError("pooled_width must be int type") + dtype = helper.input_dtype() + out = helper.create_variable_for_type_inference(dtype) + helper.append_op( + type='psroi_pool', + inputs={'X': input, + 'ROIs': rois}, + outputs={'Out': out}, + attrs={ + 'output_channels': output_channels, + 'spatial_scale': spatial_scale, + 'pooled_height': pooled_height, + 'pooled_width': pooled_width + }) + return out
diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index a4089ba3ca08bed5702a66ed370da52ecd9b58c6..6d6fe245d8a0d9b3a29f11171e7d945e09a4133c 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -63,9 +63,9 @@ function(py_test_modules TARGET_NAME) set(multiValueArgs MODULES DEPS ENVS) cmake_parse_arguments(py_test_modules "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) add_test(NAME ${TARGET_NAME} - COMMAND ${CMAKE_COMMAND} -E env PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_modules_ENVS} - ${PYTHON_EXECUTABLE} ${PADDLE_SOURCE_DIR}/tools/test_runner.py ${py_test_modules_MODULES} - WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) + COMMAND ${CMAKE_COMMAND} -E env PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_modules_ENVS} + ${PYTHON_EXECUTABLE} ${PADDLE_SOURCE_DIR}/tools/test_runner.py ${py_test_modules_MODULES} + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) if (py_test_modules_SERIAL) set_property(TEST ${TARGET_NAME} PROPERTY RUN_SERIAL 1) endif() @@ -111,3 +111,7 @@ py_test_modules(test_parallel_executor_transformer MODULES test_parallel_executo if(NOT APPLE) py_test_modules(test_image_classification_resnet MODULES test_image_classification_resnet SERIAL) endif() + +if (WITH_NGRAPH) + add_subdirectory(ngraph) +endif()
diff --git a/python/paddle/fluid/tests/unittests/ngraph/CMakeLists.txt b/python/paddle/fluid/tests/unittests/ngraph/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..5ed2d0aa80cd0462d3ac1902a2ec13fc2c1bd844 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ngraph/CMakeLists.txt @@ -0,0 +1,6 @@ +file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") +string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") + +foreach(TEST_OP ${TEST_OPS}) + py_test_modules(${TEST_OP} MODULES ${TEST_OP} ENVS FLAGS_use_ngraph=true) +endforeach(TEST_OP)
diff --git a/python/paddle/fluid/tests/unittests/ngraph/__init__.py b/python/paddle/fluid/tests/unittests/ngraph/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..b94a21a7e406b833797f8f521c62a2351c2bc30a --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ngraph/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py index cedb3383ed4728306c61d7f987850000506457c7..07cc44aaa266af39fbf3d726ee51a9afc5cb3756 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_base.py +++ b/python/paddle/fluid/tests/unittests/test_dist_base.py @@ -227,6 +227,7 @@ class TestDistBase(unittest.TestCase): def setUp(self): self._trainers = 2 self._pservers = 2 + self._port_set = set() self._ps_endpoints = "127.0.0.1:%s,127.0.0.1:%s" % ( self._find_free_port(), self._find_free_port()) self._python_interp = sys.executable @@ -242,9 +243,17 @@ class TestDistBase(unittest.TestCase): self._after_setup_config() def _find_free_port(self): - with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s: - s.bind(('', 0)) - return s.getsockname()[1] + def __free_port(): + with closing(socket.socket(socket.AF_INET, + socket.SOCK_STREAM)) as s: + s.bind(('', 0)) + return s.getsockname()[1] + + while True: + port = __free_port() + if port not in self._port_set: + self._port_set.add(port) + return port def start_pserver(self, model_file, check_error_log, required_envs): ps0_ep, ps1_ep = self._ps_endpoints.split(",") diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_dynamic_rnn_base.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_dynamic_rnn_base.py new file mode 100644 index 0000000000000000000000000000000000000000..e91cfe0b45ab7e4e56fccf8d49eb381fbbd199d1 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_dynamic_rnn_base.py @@ -0,0 +1,86 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
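+# NOTE: the FLAGS_eager_delete_tensor_gb and CPU_NUM values set just below must go into +# os.environ before `paddle.fluid` is imported, because fluid's __bootstrap__ only picks +# these flags up from the environment (via --tryfromenv) once, at import time.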
+ +import os +os.environ['FLAGS_eager_delete_tensor_gb'] = '0.0' +os.environ['CPU_NUM'] = '2' + +import six +import unittest + +import paddle +import paddle.fluid.core as core +import paddle.fluid as fluid + + +def train(network, use_cuda, use_parallel_executor, batch_size=32, pass_num=2): + if use_cuda and not core.is_compiled_with_cuda(): + print('Skip use_cuda=True because Paddle is not compiled with cuda') + return + + word_dict = paddle.dataset.imdb.word_dict() + train_reader = paddle.batch( + paddle.dataset.imdb.train(word_dict), batch_size=batch_size) + + data = fluid.layers.data( + name="words", shape=[1], dtype="int64", lod_level=1) + + label = fluid.layers.data(name="label", shape=[1], dtype="int64") + + cost = network(data, label, len(word_dict)) + optimizer = fluid.optimizer.Adagrad(learning_rate=0.2) + optimizer.minimize(cost) + + place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + feeder = fluid.DataFeeder(feed_list=[data, label], place=place) + reader = feeder.decorate_reader( + train_reader, multi_devices=use_parallel_executor) + + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + + if use_parallel_executor: + train_exe = fluid.ParallelExecutor( + use_cuda=use_cuda, loss_name=cost.name) + fetch_list = [cost.name] + else: + train_exe = exe + fetch_list = [cost] + + for pass_id in six.moves.xrange(pass_num): + batch_id = 0 + for data in reader(): + train_exe.run(feed=data, + fetch_list=fetch_list if batch_id % 4 == 0 else []) + batch_id += 1 + if batch_id > 16: + break + + +class TestBase(unittest.TestCase): + def setUp(self): + self.net = None + + def test_network(self): + if self.net is None: + return + + for use_cuda in [True, False]: + for use_parallel_executor in [False, True]: + print('network: {}, use_cuda: {}, use_parallel_executor: {}'. + format(self.net.__name__, use_cuda, + use_parallel_executor)) + with fluid.program_guard(fluid.Program(), fluid.Program()): + with fluid.scope_guard(core.Scope()): + train(self.net, use_cuda, use_parallel_executor) diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_gru_net.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_gru_net.py new file mode 100644 index 0000000000000000000000000000000000000000..5ed3d9fdf3bf765f1b9ef8ba1ef2a5795f1874c7 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_gru_net.py @@ -0,0 +1,49 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
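+# gru_net below stacks embedding, fc, dynamic_gru, a sequence max-pool and fc layers into +# a small text-classification model; TestBase.test_network from the shared base module +# trains it with eager deletion enabled, so variable-length (LoD) intermediate tensors +# are freed while training runs.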
+ +import unittest +from test_eager_deletion_dynamic_rnn_base import TestBase +import paddle.fluid as fluid + + +def gru_net(data, + label, + dict_dim, + emb_dim=128, + hid_dim=128, + hid_dim2=96, + class_dim=2, + emb_lr=400.0): + emb = fluid.layers.embedding( + input=data, + size=[dict_dim, emb_dim], + param_attr=fluid.ParamAttr(learning_rate=emb_lr)) + fc0 = fluid.layers.fc(input=emb, size=hid_dim * 3) + gru_h = fluid.layers.dynamic_gru(input=fc0, size=hid_dim, is_reverse=False) + gru_max = fluid.layers.sequence_pool(input=gru_h, pool_type='max') + gru_max_tanh = fluid.layers.tanh(gru_max) + fc1 = fluid.layers.fc(input=gru_max_tanh, size=hid_dim2, act='tanh') + prediction = fluid.layers.fc(input=fc1, size=class_dim, act='softmax') + cost = fluid.layers.cross_entropy(input=prediction, label=label) + avg_cost = fluid.layers.mean(x=cost) + return avg_cost + + +class GRUTest(TestBase): + def setUp(self): + self.net = gru_net + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_lstm_net.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_lstm_net.py new file mode 100644 index 0000000000000000000000000000000000000000..8462c06aa56e0469fd06c7dc4b2ed514f7eb51ba --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_lstm_net.py @@ -0,0 +1,50 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from test_eager_deletion_dynamic_rnn_base import TestBase +import paddle.fluid as fluid +import unittest + + +def lstm_net(data, + label, + dict_dim, + emb_dim=128, + hid_dim=128, + hid_dim2=96, + class_dim=2, + emb_lr=30.0): + emb = fluid.layers.embedding( + input=data, + size=[dict_dim, emb_dim], + param_attr=fluid.ParamAttr(learning_rate=emb_lr)) + fc0 = fluid.layers.fc(input=emb, size=hid_dim * 4) + lstm_h, c = fluid.layers.dynamic_lstm( + input=fc0, size=hid_dim * 4, is_reverse=False) + lstm_max = fluid.layers.sequence_pool(input=lstm_h, pool_type='max') + lstm_max_tanh = fluid.layers.tanh(lstm_max) + fc1 = fluid.layers.fc(input=lstm_max_tanh, size=hid_dim2, act='tanh') + prediction = fluid.layers.fc(input=fc1, size=class_dim, act='softmax') + cost = fluid.layers.cross_entropy(input=prediction, label=label) + avg_cost = fluid.layers.mean(x=cost) + return avg_cost + + +class LSTMTest(TestBase): + def setUp(self): + self.net = lstm_net + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_mnist.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_mnist.py new file mode 100644 index 0000000000000000000000000000000000000000..7ec1f0ae753724dac5c4675926ead87a097a7a99 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_mnist.py @@ -0,0 +1,27 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import unittest +os.environ['FLAGS_eager_delete_tensor_gb'] = "0.0" + +from test_parallel_executor_mnist import TestMNIST + + +class EagerDeletionTestMNIST(TestMNIST): + pass + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_transformer.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..754d5fd40953311a5deb466fa42216f72671a65a --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_transformer.py @@ -0,0 +1,27 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import unittest +os.environ['FLAGS_eager_delete_tensor_gb'] = "0.0" + +from test_parallel_executor_transformer import TestTransformer + + +class EagerDeletionTestTransformer(TestTransformer): + pass + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index 10e8bb5a86691d8654c5ae48794e49f30f47500d..e180822c2b4b7cceaf9f66e7819477b48bf4941b 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -233,6 +233,29 @@ class TestBook(unittest.TestCase): pool_stride=[1, 2], pool_padding=(2, 1))) + def test_adaptive_pool2d(self): + program = Program() + with program_guard(program): + x = layers.data(name='x', shape=[3, 224, 224], dtype='float32') + self.assertIsNotNone( + layers.adaptive_pool2d( + x, [3, 3], pool_type='avg')) + pool, mask = layers.adaptive_pool2d(x, [3, 3], require_index=True) + self.assertIsNotNone(pool) + self.assertIsNotNone(mask) + + def test_adaptive_pool3d(self): + program = Program() + with program_guard(program): + x = layers.data(name='x', shape=[3, 244, 224, 224], dtype='float32') + self.assertIsNotNone( + layers.adaptive_pool3d( + x, [3, 3, 3], pool_type='avg')) + pool, mask = layers.adaptive_pool3d( + x, [3, 3, 3], require_index=True) + self.assertIsNotNone(pool) + self.assertIsNotNone(mask) + def test_lstm_unit(self): program = Program() with program_guard(program): @@ -511,6 +534,16 @@ class TestBook(unittest.TestCase): self.assertIsNotNone(output) print(str(program)) + def test_psroi_pool(self): + program = Program() + with program_guard(program): + x = layers.data(name="x", shape=[245, 30, 30], dtype="float32") + rois = layers.data( + name="rois", shape=[4], dtype="float32", lod_level=1) + output = layers.psroi_pool(x, rois, 5, 0.25, 7, 7) + 
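+ # 245 input channels = output_channels (5) * pooled_height (7) * pooled_width (7), + # which is the channel layout position-sensitive ROI pooling expects.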
self.assertIsNotNone(output) + print(str(program)) + def test_roi_align(self): program = Program() with program_guard(program): diff --git a/python/paddle/fluid/tests/unittests/test_pool2d_op.py b/python/paddle/fluid/tests/unittests/test_pool2d_op.py index 47b2e71a4e52a327831fde7494bd7a2306b6f2ea..5ccdf082e8a4f8aabcd55b6b470a77690ee6f61f 100644 --- a/python/paddle/fluid/tests/unittests/test_pool2d_op.py +++ b/python/paddle/fluid/tests/unittests/test_pool2d_op.py @@ -13,6 +13,7 @@ # limitations under the License. from __future__ import print_function +from __future__ import division import unittest import numpy as np @@ -21,29 +22,47 @@ import paddle.fluid.core as core from op_test import OpTest +def adaptive_start_index(index, input_size, output_size): + return int(np.floor(index * input_size / output_size)) + + +def adaptive_end_index(index, input_size, output_size): + return int(np.ceil((index + 1) * input_size / output_size)) + + def max_pool2D_forward_naive(x, ksize, strides, paddings, global_pool=0, ceil_mode=False, - exclusive=True): + exclusive=True, + adaptive=False): N, C, H, W = x.shape if global_pool == 1: ksize = [H, W] - H_out = (H - ksize[0] + 2 * paddings[0] + strides[0] - 1 - ) // strides[0] + 1 if ceil_mode else ( - H - ksize[0] + 2 * paddings[0]) // strides[0] + 1 - W_out = (W - ksize[1] + 2 * paddings[1] + strides[1] - 1 - ) // strides[1] + 1 if ceil_mode else ( - W - ksize[1] + 2 * paddings[1]) // strides[1] + 1 + if adaptive: + H_out, W_out = ksize + else: + H_out = (H - ksize[0] + 2 * paddings[0] + strides[0] - 1 + ) // strides[0] + 1 if ceil_mode else ( + H - ksize[0] + 2 * paddings[0]) // strides[0] + 1 + W_out = (W - ksize[1] + 2 * paddings[1] + strides[1] - 1 + ) // strides[1] + 1 if ceil_mode else ( + W - ksize[1] + 2 * paddings[1]) // strides[1] + 1 out = np.zeros((N, C, H_out, W_out)) for i in range(H_out): for j in range(W_out): - r_start = np.max((i * strides[0] - paddings[0], 0)) - r_end = np.min((i * strides[0] + ksize[0] - paddings[0], H)) - c_start = np.max((j * strides[1] - paddings[1], 0)) - c_end = np.min((j * strides[1] + ksize[1] - paddings[1], W)) + if adaptive: + r_start = adaptive_start_index(i, H, ksize[0]) + r_end = adaptive_end_index(i, H, ksize[0]) + c_start = adaptive_start_index(j, W, ksize[1]) + c_end = adaptive_end_index(j, W, ksize[1]) + else: + r_start = np.max((i * strides[0] - paddings[0], 0)) + r_end = np.min((i * strides[0] + ksize[0] - paddings[0], H)) + c_start = np.max((j * strides[1] - paddings[1], 0)) + c_end = np.min((j * strides[1] + ksize[1] - paddings[1], W)) x_masked = x[:, :, r_start:r_end, c_start:c_end] out[:, :, i, j] = np.max(x_masked, axis=(2, 3)) @@ -56,27 +75,37 @@ def avg_pool2D_forward_naive(x, paddings, global_pool=0, ceil_mode=False, - exclusive=True): + exclusive=True, + adaptive=False): N, C, H, W = x.shape if global_pool == 1: ksize = [H, W] - H_out = (H - ksize[0] + 2 * paddings[0] + strides[0] - 1 - ) // strides[0] + 1 if ceil_mode else ( - H - ksize[0] + 2 * paddings[0]) // strides[0] + 1 - W_out = (W - ksize[1] + 2 * paddings[1] + strides[1] - 1 - ) // strides[1] + 1 if ceil_mode else ( - W - ksize[1] + 2 * paddings[1]) // strides[1] + 1 + if adaptive: + H_out, W_out = ksize + else: + H_out = (H - ksize[0] + 2 * paddings[0] + strides[0] - 1 + ) // strides[0] + 1 if ceil_mode else ( + H - ksize[0] + 2 * paddings[0]) // strides[0] + 1 + W_out = (W - ksize[1] + 2 * paddings[1] + strides[1] - 1 + ) // strides[1] + 1 if ceil_mode else ( + W - ksize[1] + 2 * paddings[1]) // strides[1] + 1 out = np.zeros((N, C, 
H_out, W_out)) for i in range(H_out): for j in range(W_out): - r_start = np.max((i * strides[0] - paddings[0], 0)) - r_end = np.min((i * strides[0] + ksize[0] - paddings[0], H)) - c_start = np.max((j * strides[1] - paddings[1], 0)) - c_end = np.min((j * strides[1] + ksize[1] - paddings[1], W)) + if adaptive: + r_start = adaptive_start_index(i, H, ksize[0]) + r_end = adaptive_end_index(i, H, ksize[0]) + c_start = adaptive_start_index(j, W, ksize[1]) + c_end = adaptive_end_index(j, W, ksize[1]) + else: + r_start = np.max((i * strides[0] - paddings[0], 0)) + r_end = np.min((i * strides[0] + ksize[0] - paddings[0], H)) + c_start = np.max((j * strides[1] - paddings[1], 0)) + c_end = np.min((j * strides[1] + ksize[1] - paddings[1], W)) x_masked = x[:, :, r_start:r_end, c_start:c_end] - field_size = ((r_end - r_start) * (c_end - c_start)) if exclusive \ - else (ksize[0] * ksize[1]) + field_size = ((r_end - r_start) * (c_end - c_start)) \ + if (exclusive or adaptive) else (ksize[0] * ksize[1]) out[:, :, i, j] = np.sum(x_masked, axis=(2, 3)) / field_size return out @@ -93,12 +122,13 @@ class TestPool2D_Op(OpTest): self.init_pool_type() self.init_ceil_mode() self.init_exclusive() + self.init_adaptive() if self.global_pool: self.paddings = [0 for _ in range(len(self.paddings))] input = np.random.random(self.shape).astype(self.dtype) output = self.pool2D_forward_naive( input, self.ksize, self.strides, self.paddings, self.global_pool, - self.ceil_mode, self.exclusive).astype(self.dtype) + self.ceil_mode, self.exclusive, self.adaptive).astype(self.dtype) self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(input)} self.attrs = { @@ -112,7 +142,8 @@ class TestPool2D_Op(OpTest): 'ceil_mode': self.ceil_mode, 'data_format': 'AnyLayout', # TODO(dzhwinter) : should be fix latter - 'exclusive': self.exclusive + 'exclusive': self.exclusive, + 'adaptive': self.adaptive } self.outputs = {'Out': output} @@ -159,6 +190,9 @@ class TestPool2D_Op(OpTest): def init_exclusive(self): self.exclusive = True + def init_adaptive(self): + self.adaptive = False + class TestCase1(TestPool2D_Op): def init_test_case(self): @@ -315,5 +349,10 @@ class TestCUDNNAvgInclude(TestCase2): self.exclusive = False +class TestAvgPoolAdaptive(TestCase1): + def init_adaptive(self): + self.adaptive = True + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_pool3d_op.py b/python/paddle/fluid/tests/unittests/test_pool3d_op.py index f05f8ccb3985be162d89da099496d5b2baf4afdc..47a5b2d1abe11a37d24624ff52d05ea135befe7c 100644 --- a/python/paddle/fluid/tests/unittests/test_pool3d_op.py +++ b/python/paddle/fluid/tests/unittests/test_pool3d_op.py @@ -13,6 +13,7 @@ # limitations under the License. 
from __future__ import print_function +from __future__ import division import unittest import numpy as np @@ -21,35 +22,59 @@ import paddle.fluid.core as core from op_test import OpTest +def adaptive_start_index(index, input_size, output_size): + return int(np.floor(index * input_size / output_size)) + + +def adaptive_end_index(index, input_size, output_size): + return int(np.ceil((index + 1) * input_size / output_size)) + + def max_pool3D_forward_naive(x, ksize, strides, paddings, global_pool=0, ceil_mode=False, - exclusive=True): + exclusive=True, + adaptive=False): N, C, D, H, W = x.shape if global_pool == 1: ksize = [D, H, W] - D_out = (D - ksize[0] + 2 * paddings[0] + strides[0] - 1 - ) // strides[0] + 1 if ceil_mode else ( - H - ksize[0] + 2 * paddings[0]) // strides[0] + 1 - H_out = (H - ksize[1] + 2 * paddings[1] + strides[1] - 1 - ) // strides[1] + 1 if ceil_mode else ( - W - ksize[1] + 2 * paddings[1]) // strides[1] + 1 - W_out = (W - ksize[2] + 2 * paddings[2] + strides[2] - 1 - ) // strides[2] + 1 if ceil_mode else ( - W - ksize[2] + 2 * paddings[2]) // strides[2] + 1 + if adaptive: + D_out, H_out, W_out = ksize + else: + D_out = (D - ksize[0] + 2 * paddings[0] + strides[0] - 1 + ) // strides[0] + 1 if ceil_mode else ( + H - ksize[0] + 2 * paddings[0]) // strides[0] + 1 + H_out = (H - ksize[1] + 2 * paddings[1] + strides[1] - 1 + ) // strides[1] + 1 if ceil_mode else ( + W - ksize[1] + 2 * paddings[1]) // strides[1] + 1 + W_out = (W - ksize[2] + 2 * paddings[2] + strides[2] - 1 + ) // strides[2] + 1 if ceil_mode else ( + W - ksize[2] + 2 * paddings[2]) // strides[2] + 1 out = np.zeros((N, C, D_out, H_out, W_out)) for k in range(D_out): - d_start = np.max((k * strides[0] - paddings[0], 0)) - d_end = np.min((k * strides[0] + ksize[0] - paddings[0], D)) + if adaptive: + d_start = adaptive_start_index(k, D, ksize[0]) + d_end = adaptive_end_index(k, D, ksize[0]) + else: + d_start = np.max((k * strides[0] - paddings[0], 0)) + d_end = np.min((k * strides[0] + ksize[0] - paddings[0], D)) for i in range(H_out): - h_start = np.max((i * strides[0] - paddings[0], 0)) - h_end = np.min((i * strides[0] + ksize[0] - paddings[0], H)) + if adaptive: + h_start = adaptive_start_index(i, H, ksize[1]) + h_end = adaptive_end_index(i, H, ksize[1]) + else: + h_start = np.max((i * strides[1] - paddings[1], 0)) + h_end = np.min((i * strides[1] + ksize[1] - paddings[1], H)) for j in range(W_out): - w_start = np.max((j * strides[1] - paddings[1], 0)) - w_end = np.min((j * strides[1] + ksize[1] - paddings[1], W)) + if adaptive: + w_start = adaptive_start_index(j, W, ksize[2]) + w_end = adaptive_end_index(j, W, ksize[2]) + else: + w_start = np.max((j * strides[2] - paddings[2], 0)) + w_end = np.min((j * strides[2] + ksize[2] - paddings[2], W)) x_masked = x[:, :, d_start:d_end, h_start:h_end, w_start:w_end] out[:, :, k, i, j] = np.max(x_masked, axis=(2, 3, 4)) @@ -62,33 +87,49 @@ def avg_pool3D_forward_naive(x, paddings, global_pool=0, ceil_mode=False, - exclusive=True): + exclusive=True, + adaptive=False): N, C, D, H, W = x.shape if global_pool == 1: ksize = [D, H, W] - D_out = (D - ksize[0] + 2 * paddings[0] + strides[0] - 1 - ) // strides[0] + 1 if ceil_mode else ( - H - ksize[0] + 2 * paddings[0]) // strides[0] + 1 - H_out = (H - ksize[1] + 2 * paddings[1] + strides[1] - 1 - ) // strides[1] + 1 if ceil_mode else ( - W - ksize[1] + 2 * paddings[1]) // strides[1] + 1 - W_out = (W - ksize[2] + 2 * paddings[2] + strides[2] - 1 - ) // strides[2] + 1 if ceil_mode else ( - W - ksize[2] + 2 * paddings[2]) // 
strides[2] + 1 + if adaptive: + D_out, H_out, W_out = ksize + else: + D_out = (D - ksize[0] + 2 * paddings[0] + strides[0] - 1 + ) // strides[0] + 1 if ceil_mode else ( + H - ksize[0] + 2 * paddings[0]) // strides[0] + 1 + H_out = (H - ksize[1] + 2 * paddings[1] + strides[1] - 1 + ) // strides[1] + 1 if ceil_mode else ( + W - ksize[1] + 2 * paddings[1]) // strides[1] + 1 + W_out = (W - ksize[2] + 2 * paddings[2] + strides[2] - 1 + ) // strides[2] + 1 if ceil_mode else ( + W - ksize[2] + 2 * paddings[2]) // strides[2] + 1 out = np.zeros((N, C, D_out, H_out, W_out)) for k in range(D_out): - d_start = np.max((k * strides[0] - paddings[0], 0)) - d_end = np.min((k * strides[0] + ksize[0] - paddings[0], D)) + if adaptive: + d_start = adaptive_start_index(k, D, ksize[0]) + d_end = adaptive_end_index(k, D, ksize[0]) + else: + d_start = np.max((k * strides[0] - paddings[0], 0)) + d_end = np.min((k * strides[0] + ksize[0] - paddings[0], D)) for i in range(H_out): - h_start = np.max((i * strides[0] - paddings[0], 0)) - h_end = np.min((i * strides[0] + ksize[0] - paddings[0], H)) + if adaptive: + h_start = adaptive_start_index(i, H, ksize[1]) + h_end = adaptive_end_index(i, H, ksize[1]) + else: + h_start = np.max((i * strides[1] - paddings[1], 0)) + h_end = np.min((i * strides[1] + ksize[1] - paddings[1], H)) for j in range(W_out): - w_start = np.max((j * strides[1] - paddings[1], 0)) - w_end = np.min((j * strides[1] + ksize[1] - paddings[1], W)) + if adaptive: + w_start = adaptive_start_index(j, W, ksize[2]) + w_end = adaptive_end_index(j, W, ksize[2]) + else: + w_start = np.max((j * strides[2] - paddings[2], 0)) + w_end = np.min((j * strides[2] + ksize[2] - paddings[2], W)) x_masked = x[:, :, d_start:d_end, h_start:h_end, w_start:w_end] field_size = (d_end - d_start) * (h_end - h_start) * (w_end - w_start) \ - if exclusive else ksize[0] * ksize[1] * ksize[2] + if (exclusive or adaptive) else ksize[0] * ksize[1] * ksize[2] out[:, :, k, i, j] = np.sum(x_masked, axis=(2, 3, 4)) / field_size return out @@ -105,13 +146,14 @@ class TestPool3d_Op(OpTest): self.init_pool_type() self.init_ceil_mode() self.init_exclusive() + self.init_adaptive() if self.global_pool: self.paddings = [0 for _ in range(len(self.paddings))] input = np.random.random(self.shape).astype(self.dtype) output = self.pool3D_forward_naive( input, self.ksize, self.strides, self.paddings, self.global_pool, - self.ceil_mode, self.exclusive).astype(self.dtype) + self.ceil_mode, self.exclusive, self.adaptive).astype(self.dtype) self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(input)} self.attrs = { @@ -124,7 +166,8 @@ class TestPool3d_Op(OpTest): 'ceil_mode': self.ceil_mode, 'data_format': 'AnyLayout', # TODO(dzhwinter) : should be fix latter - 'exclusive': self.exclusive + 'exclusive': self.exclusive, + 'adaptive': self.adaptive } self.outputs = {'Out': output} @@ -171,6 +214,9 @@ class TestPool3d_Op(OpTest): def init_exclusive(self): self.exclusive = True + def init_adaptive(self): + self.adaptive = False + class TestCase1(TestPool3d_Op): def init_test_case(self): @@ -353,5 +399,10 @@ class TestCUDNNAvgInclude(TestCUDNNCase3): self.exclusive = False +class TestAvgPoolAdaptive(TestCase1): + def init_adaptive(self): + self.adaptive = True + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_pool_max_op.py b/python/paddle/fluid/tests/unittests/test_pool_max_op.py index 488ff431d4f2ef76ce0c9486d8c307b4e01b5544..6575c408eeaa43d4f7caf257b2ebd77a942aecda 100644 --- 
a/python/paddle/fluid/tests/unittests/test_pool_max_op.py +++ b/python/paddle/fluid/tests/unittests/test_pool_max_op.py @@ -13,33 +13,62 @@ # limitations under the License. from __future__ import print_function +from __future__ import division import unittest import numpy as np from op_test import OpTest -def max_pool3D_forward_naive(x, ksize, strides, paddings, global_pool=False): +def adaptive_start_index(index, input_size, output_size): + return int(np.floor(index * input_size / output_size)) + + +def adaptive_end_index(index, input_size, output_size): + return int(np.ceil((index + 1) * input_size / output_size)) + + +def max_pool3D_forward_naive(x, + ksize, + strides, + paddings, + global_pool=False, + adaptive=False): N, C, D, H, W = x.shape if global_pool: ksize = [D, H, W] paddings = [0, 0, 0] - D_out = (D - ksize[0] + 2 * paddings[0]) // strides[0] + 1 - H_out = (H - ksize[1] + 2 * paddings[1]) // strides[1] + 1 - W_out = (W - ksize[2] + 2 * paddings[2]) // strides[2] + 1 + if adaptive: + D_out, H_out, W_out = ksize + else: + D_out = (D - ksize[0] + 2 * paddings[0]) // strides[0] + 1 + H_out = (H - ksize[1] + 2 * paddings[1]) // strides[1] + 1 + W_out = (W - ksize[2] + 2 * paddings[2]) // strides[2] + 1 out = np.zeros((N, C, D_out, H_out, W_out)) mask = np.zeros((N, C, D_out, H_out, W_out)) for k in range(D_out): - d_start = np.max((k * strides[0] - paddings[0], 0)) - d_end = np.min((k * strides[0] + ksize[0] - paddings[0], D)) + if adaptive: + d_start = adaptive_start_index(k, D, ksize[0]) + d_end = adaptive_end_index(k, D, ksize[0]) + else: + d_start = np.max((k * strides[0] - paddings[0], 0)) + d_end = np.min((k * strides[0] + ksize[0] - paddings[0], D)) for i in range(H_out): - h_start = np.max((i * strides[0] - paddings[0], 0)) - h_end = np.min((i * strides[0] + ksize[0] - paddings[0], H)) + if adaptive: + h_start = adaptive_start_index(i, H, ksize[1]) + h_end = adaptive_end_index(i, H, ksize[1]) + else: + h_start = np.max((i * strides[1] - paddings[1], 0)) + h_end = np.min((i * strides[1] + ksize[1] - paddings[1], H)) for j in range(W_out): - w_start = np.max((j * strides[1] - paddings[1], 0)) - w_end = np.min((j * strides[1] + ksize[1] - paddings[1], W)) + if adaptive: + w_start = adaptive_start_index(j, W, ksize[2]) + w_end = adaptive_end_index(j, W, ksize[2]) + else: + w_start = np.max((j * strides[2] - paddings[2], 0)) + w_end = np.min((j * strides[2] + ksize[2] - paddings[2], W)) x_masked = x[:, :, d_start:d_end, h_start:h_end, w_start:w_end] out[:, :, k, i, j] = np.max(x_masked, axis=(2, 3, 4)) @@ -58,23 +87,37 @@ def max_pool3D_forward_naive(x, ksize, strides, paddings, global_pool=False): return out, mask -def max_pool2D_forward_naive(x, ksize, strides, paddings, global_pool=False): +def max_pool2D_forward_naive(x, + ksize, + strides, + paddings, + global_pool=False, + adaptive=False): N, C, H, W = x.shape if global_pool: ksize = [H, W] paddings = [0, 0] - H_out = (H - ksize[0] + 2 * paddings[0]) // strides[0] + 1 - W_out = (W - ksize[1] + 2 * paddings[1]) // strides[1] + 1 + if adaptive: + H_out, W_out = ksize + else: + H_out = (H - ksize[0] + 2 * paddings[0]) // strides[0] + 1 + W_out = (W - ksize[1] + 2 * paddings[1]) // strides[1] + 1 out = np.zeros((N, C, H_out, W_out)) mask = np.zeros((N, C, H_out, W_out)) for i in range(H_out): for j in range(W_out): - r_start = np.max((i * strides[0] - paddings[0], 0)) - r_end = np.min((i * strides[0] + ksize[0] - paddings[0], H)) - c_start = np.max((j * strides[1] - paddings[1], 0)) - c_end = np.min((j * strides[1] + ksize[1] 
- paddings[1], W)) + if adaptive: + r_start = adaptive_start_index(i, H, ksize[0]) + r_end = adaptive_end_index(i, H, ksize[0]) + c_start = adaptive_start_index(j, W, ksize[1]) + c_end = adaptive_end_index(j, W, ksize[1]) + else: + r_start = np.max((i * strides[0] - paddings[0], 0)) + r_end = np.min((i * strides[0] + ksize[0] - paddings[0], H)) + c_start = np.max((j * strides[1] - paddings[1], 0)) + c_end = np.min((j * strides[1] + ksize[1] - paddings[1], W)) x_masked = x[:, :, r_start:r_end, c_start:c_end] out[:, :, i, j] = np.max(x_masked, axis=(2, 3)) @@ -95,10 +138,12 @@ class TestMaxPoolWithIndex_Op(OpTest): def setUp(self): self.init_test_case() self.init_global() + self.init_adaptive() input = np.random.random(self.shape).astype("float32") output, mask = self.pool_forward_naive(input, self.ksize, self.strides, - self.paddings, self.global_pool) + self.paddings, self.global_pool, + self.adaptive) output = output.astype("float32") mask = mask.astype("int32") @@ -107,6 +152,7 @@ class TestMaxPoolWithIndex_Op(OpTest): 'paddings': self.paddings, 'ksize': self.ksize, 'global_pooling': self.global_pool, + 'adaptive': self.adaptive, } self.inputs = {'X': input} @@ -129,6 +175,9 @@ class TestMaxPoolWithIndex_Op(OpTest): def init_global(self): self.global_pool = False + def init_adaptive(self): + self.adaptive = False + class TestCase1(TestMaxPoolWithIndex_Op): def init_global(self): @@ -190,5 +239,15 @@ class TestCase7(TestCase6): self.global_pool = False +class TestCastAdaptive2d(TestCase6): + def init_adaptive(self): + self.adaptive = True + + +class TestCastAdaptive3d(TestMaxPoolWithIndex_Op): + def init_adaptive(self): + self.adaptive = True + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_psroi_pool_op.py b/python/paddle/fluid/tests/unittests/test_psroi_pool_op.py new file mode 100644 index 0000000000000000000000000000000000000000..abe014a38c6ecfd008b0f1028536bfb49b628fb4 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_psroi_pool_op.py @@ -0,0 +1,134 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
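+# TestPSROIPoolOp builds a pure-NumPy reference of position-sensitive ROI pooling: each +# ROI is scaled by spatial_scale, split into a pooled_height x pooled_width grid, and +# bin (ph, pw) of output channel c is averaged from input channel +# (c * pooled_height + ph) * pooled_width + pw; the op's forward output is compared with +# this reference and its gradient w.r.t. X is checked numerically.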
+ +from __future__ import print_function + +import math +import numpy as np +import unittest +from op_test import OpTest + + +class TestPSROIPoolOp(OpTest): + def set_data(self): + self.init_test_case() + self.make_rois() + self.calc_psroi_pool() + self.inputs = {'X': self.x, 'ROIs': (self.rois[:, 1:5], self.rois_lod)} + self.attrs = { + 'output_channels': self.output_channels, + 'spatial_scale': self.spatial_scale, + 'pooled_height': self.pooled_height, + 'pooled_width': self.pooled_width + } + self.outputs = {'Out': self.outs} + + def init_test_case(self): + self.batch_size = 3 + self.channels = 3 * 2 * 2 + self.height = 6 + self.width = 4 + + self.x_dim = [self.batch_size, self.channels, self.height, self.width] + + self.spatial_scale = 1.0 / 4.0 + self.output_channels = 3 + self.pooled_height = 2 + self.pooled_width = 2 + + self.x = np.random.random(self.x_dim).astype('float32') + + def make_rois(self): + rois = [] + self.rois_lod = [[]] + for bno in range(self.batch_size): + self.rois_lod[0].append(bno + 1) + for i in range(bno + 1): + x1 = np.random.random_integers( + 0, self.width // self.spatial_scale - self.pooled_width) + y1 = np.random.random_integers( + 0, self.height // self.spatial_scale - self.pooled_height) + + x2 = np.random.random_integers(x1 + self.pooled_width, + self.width // self.spatial_scale) + y2 = np.random.random_integers( + y1 + self.pooled_height, self.height // self.spatial_scale) + roi = [bno, x1, y1, x2, y2] + rois.append(roi) + self.rois_num = len(rois) + self.rois = np.array(rois).astype('float32') + + def calc_psroi_pool(self): + output_shape = (self.rois_num, self.output_channels, self.pooled_height, + self.pooled_width) + out_data = np.zeros(output_shape) + for i in range(self.rois_num): + roi = self.rois[i] + roi_batch_id = int(roi[0]) + roi_start_w = round(roi[1]) * self.spatial_scale + roi_start_h = round(roi[2]) * self.spatial_scale + roi_end_w = (round(roi[3]) + 1.) * self.spatial_scale + roi_end_h = (round(roi[4]) + 1.) * self.spatial_scale + + roi_height = max(roi_end_h - roi_start_h, 0.1) + roi_width = max(roi_end_w - roi_start_w, 0.1) + + bin_size_h = roi_height / float(self.pooled_height) + bin_size_w = roi_width / float(self.pooled_width) + + x_i = self.x[roi_batch_id] + + for c in range(self.output_channels): + for ph in range(self.pooled_height): + for pw in range(self.pooled_width): + hstart = int( + math.floor(float(ph) * bin_size_h + roi_start_h)) + wstart = int( + math.floor(float(pw) * bin_size_w + roi_start_w)) + hend = int( + math.ceil( + float(ph + 1) * bin_size_h + roi_start_h)) + wend = int( + math.ceil( + float(pw + 1) * bin_size_w + roi_start_w)) + hstart = min(max(hstart, 0), self.height) + hend = min(max(hend, 0), self.height) + wstart = min(max(wstart, 0), self.width) + wend = min(max(wend, 0), self.width) + + c_in = (c * self.pooled_height + ph + ) * self.pooled_width + pw + is_empty = (hend <= hstart) or (wend <= wstart) + out_sum = 0. + for ih in range(hstart, hend): + for iw in range(wstart, wend): + out_sum += x_i[c_in, ih, iw] + bin_area = (hend - hstart) * (wend - wstart) + out_data[i, c, ph, pw] = 0. if is_empty else ( + out_sum / float(bin_area)) + self.outs = out_data.astype('float32') + + def setUp(self): + self.op_type = 'psroi_pool' + self.set_data() + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Out') + + +if __name__ == '__main__': + unittest.main()
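Taken together, the `nn.py` changes and the unit tests above introduce `adaptive_pool2d`, `adaptive_pool3d`, and `psroi_pool` as Fluid layers. The sketch below is a minimal illustration of how the new layers can be wired into a program; the variable names, shapes, and `spatial_scale` value are illustrative only, and it assumes the `fluid.layers.data` API used elsewhere in this patch.

```
import paddle.fluid as fluid

# N x C x H x W feature map; adaptive pooling always produces a pool_size output grid,
# whatever the spatial size of the input is.
data = fluid.layers.data(name='data', shape=[3, 32, 32], dtype='float32')
avg_out = fluid.layers.adaptive_pool2d(input=data, pool_size=[3, 3], pool_type='avg')
max_out, mask = fluid.layers.adaptive_pool2d(
    input=data, pool_size=[3, 3], pool_type='max', require_index=True)

# Position-sensitive ROI pooling: the 490 input channels are consumed as
# output_channels (10) * pooled_height (7) * pooled_width (7).
x = fluid.layers.data(name='x', shape=[490, 28, 28], dtype='float32')
rois = fluid.layers.data(name='rois', shape=[4], dtype='float32', lod_level=1)
psroi_out = fluid.layers.psroi_pool(
    input=x, rois=rois, output_channels=10, spatial_scale=0.0625,
    pooled_height=7, pooled_width=7)
```

As in the docstrings above, `require_index=True` is only accepted for max pooling, and `adaptive_pool3d` follows the same pattern with a 5-D NCDHW input and a three-element `pool_size`.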