diff --git a/CMakeLists.txt b/CMakeLists.txt index 1594e798a2ba3f735a28a43ef933d80b3b3f8964..c31f51a3f7371bd7b1b0ca3234091a35868806ce 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -65,6 +65,7 @@ option(WITH_GOLANG "Compile PaddlePaddle with GOLANG" OFF) option(GLIDE_INSTALL "Download and install go dependencies " ON) option(USE_NNPACK "Compile PaddlePaddle with NNPACK library" OFF) option(WITH_DISTRIBUTE "Compile with distributed support" OFF) +option(WITH_PSLIB "Compile with pslib support" OFF) option(USE_EIGEN_FOR_BLAS "Use matrix multiplication in Eigen" OFF) option(EIGEN_USE_THREADS "Compile with multi-threaded Eigen" OFF) option(WITH_ARM_FP16 "Use half precision support on armv8.2-a cpu" OFF) @@ -131,8 +132,6 @@ if (APPLE OR WIN32) endif() if (WIN32) - set(WITH_AVX OFF CACHE STRING - "Disable AVX when compiling for Windows" FORCE) set(WITH_DSO OFF CACHE STRING "Disable DSO when compiling for Windows" FORCE) set(WITH_MKL OFF CACHE STRING @@ -209,14 +208,20 @@ include(external/xxhash) # download xxhash include(external/dlpack) include(external/snappy) # download snappy include(external/snappystream) # download snappystream +include(external/warpctc) # download, build, install warpctc if (NOT WIN32) -# there is no official support of warpctc, nccl, cupti in windows -include(external/warpctc) # download, build, install warpctc +# there is no official support of nccl, cupti in windows include(cupti) include(external/gzstream) endif (NOT WIN32) +if(WITH_PSLIB) + include(external/libmct) + include(external/pslib_brpc) + include(external/pslib) +endif(WITH_PSLIB) + if(WITH_DISTRIBUTE) if(WITH_GRPC) include(external/grpc) @@ -284,6 +289,12 @@ set(EXTERNAL_LIBS ${PYTHON_LIBRARIES} ) +if(WITH_PSLIB) + list(APPEND EXTERNAL_LIBS pslib) + list(APPEND EXTERNAL_LIBS pslib_brpc) + list(APPEND EXTERNAL_LIBS libmct) +endif(WITH_PSLIB) + if(WITH_AMD_GPU) find_package(HIP) include(hip) diff --git a/README.md b/README.md index c535e9514e1cac9aff51edfcd9bcdc5d34ccd9fd..32a302cc5431a62b310d4812b545bd929f090e0a 100644 --- a/README.md +++ b/README.md @@ -19,6 +19,15 @@ Our vision is to enable deep learning for everyone via PaddlePaddle. Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddle/releases) to track the latest feature of PaddlePaddle. +欢迎来到 PaddlePaddle GitHub + +PaddlePaddle (PArallel Distributed Deep LEarning) 是一个简单易用、高效灵活、可扩展的深度学习平台,最初由百度科学家和工程师共同开发,目的是将深度学习技术应用到百度的众多产品中。 + +我们的愿景是让每个人都能通过PaddlePaddle接触深度学习 + +跟进PaddlePaddle最新特性请参考我们的[版本说明](https://github.com/PaddlePaddle/Paddle/releases) + + ### Latest PaddlePaddle Release: [Fluid 1.2.0](https://github.com/PaddlePaddle/Paddle/tree/release/1.2) ### Install Latest Stable Release: ``` @@ -34,6 +43,23 @@ pip install paddlepaddle-gpu==1.2.0.post85 # For installation on other platform, refer to http://paddlepaddle.org/ ``` + +### PaddlePaddle最新版本: [Fluid 1.2.0](https://github.com/PaddlePaddle/Paddle/tree/release/1.2) +### 安装最新稳定版本: +``` +# Linux CPU +pip install paddlepaddle +# Linux GPU cuda9cudnn7 +pip install paddlepaddle-gpu +# Linux GPU cuda8cudnn7 +pip install paddlepaddle-gpu==1.2.0.post87 +# Linux GPU cuda8cudnn5 +pip install paddlepaddle-gpu==1.2.0.post85 + +# 其他平台上的安装指引请参考 http://paddlepaddle.org/ +``` + + ## Features - **Flexibility** @@ -74,10 +100,38 @@ pip install paddlepaddle-gpu==1.2.0.post85 Baidu and it has achieved a significant impact. We hope you can also explore the capability of PaddlePaddle to make an impact on your product. +## 特点 + +- **灵活性** + + PaddlePaddle支持丰富的神经网络架构和优化算法。易于配置复杂模型,例如带有注意力机制或复杂记忆连接的神经网络机器翻译模型。 + +- **高效性** + + 为了高效使用异步计算资源,PaddlePaddle对框架的不同层进行优化,包括计算、存储、架构和通信。下面是一些样例: + + - 通过SSE/AVX 内置函数、BLAS库(例如MKL、OpenBLAS、cuBLAS)或定制的CPU/GPU内核优化数学操作。 + - 通过MKL-DNN库优化CNN网络 + - 高度优化循环网络,无需执行 `padding` 操作即可处理 **变长** 序列 + - 针对高维稀疏数据模型,优化了局部和分布式训练。 + + +- **稳定性** + + 有了 PaddlePaddle,使得利用各种CPU/GPU和机器来加速训练变得简单。PaddlePaddle 通过优化通信可以实现巨大吞吐量和快速执行。 + +- **连接产品** + + 另外,PaddlePaddle 的设计也易于部署。在百度,PaddlePaddle 已经部署到含有巨大用户量的产品和服务上,包括广告点击率(CTR)预测、大规模图像分类、光学字符识别(OCR)、搜索排序,计算机病毒检测、推荐系统等等。PaddlePaddle广泛应用于百度产品中,产生了非常重要的影响。我们希望您也能探索 PaddlePaddle 的能力,为您的产品创造新的影响力和效果。 + ## Installation It is recommended to read [this doc](http://paddlepaddle.org/documentation/docs/zh/1.2/beginners_guide/install/index_cn.html) on our website. +## 安装 + +推荐阅读官网上的[安装说明](http://paddlepaddle.org/documentation/docs/zh/1.2/beginners_guide/install/index_cn.html) + ## Documentation We provide [English](http://paddlepaddle.org/documentation/docs/en/1.2/getstarted/index_en.html) and @@ -99,10 +153,37 @@ We provide [English](http://paddlepaddle.org/documentation/docs/en/1.2/getstarte We appreciate your contributions! +## 文档 + +我们提供[英文](http://paddlepaddle.org/documentation/docs/en/1.2/getstarted/index_en.html)和 +[中文](http://paddlepaddle.org/documentation/docs/zh/1.2/beginners_guide/index.html) 文档 + +- [深度学习101](https://github.com/PaddlePaddle/book) + + 或许您想从这个在线交互式书籍开始,可以在Jupyter Notebook中运行 + +- [分布式训练](http://paddlepaddle.org/documentation/docs/zh/1.2/user_guides/howto/training/cluster_howto.html) + + 可以在MPI集群上运行分布式训练任务 + +- [Python API](http://paddlepaddle.org/documentation/docs/zh/1.2/api_cn/index_cn.html) + + 新的API支持代码更少更简洁的程序 + +- [贡献方式](http://paddlepaddle.org/documentation/docs/zh/1.2/advanced_usage/development/contribute_to_paddle/index_cn.html) + + 欢迎您的贡献! ## Ask Questions You are welcome to submit questions and bug reports as [Github Issues](https://github.com/PaddlePaddle/Paddle/issues). +## 答疑 + +欢迎您将问题和bug报告以[Github Issues](https://github.com/PaddlePaddle/Paddle/issues)的形式提交 + ## Copyright and License PaddlePaddle is provided under the [Apache-2.0 license](LICENSE). + +## 版权和许可证 +PaddlePaddle由[Apache-2.0 license](LICENSE)提供 diff --git a/benchmark/fluid/fluid_benchmark.py b/benchmark/fluid/fluid_benchmark.py index 5f3ce300acc44ad8d2898c27296b866c403f3cc8..10b633a4fc1063aab5c0d34b994f9c233e228f17 100644 --- a/benchmark/fluid/fluid_benchmark.py +++ b/benchmark/fluid/fluid_benchmark.py @@ -81,9 +81,11 @@ def dist_transpile(trainer_id, args, train_prog, startup_prog): # the role, should be either PSERVER or TRAINER training_role = os.getenv("PADDLE_TRAINING_ROLE") - config = distribute_transpiler.DistributeTranspilerConfig() + config = fluid.DistributeTranspilerConfig() config.slice_var_up = not args.no_split_var + config.min_block_size = 1048576 t = distribute_transpiler.DistributeTranspiler(config=config) + t.transpile( trainer_id, # NOTE: *MUST* use train_prog, for we are using with guard to diff --git a/cmake/configure.cmake b/cmake/configure.cmake index 51f7a61631d7102b60646abe1c6dd7775692f157..4ee2fdcf2db6bfa373f814ee4c0ab4d708486ea8 100644 --- a/cmake/configure.cmake +++ b/cmake/configure.cmake @@ -84,6 +84,10 @@ if(NOT WITH_GOLANG) add_definitions(-DPADDLE_WITHOUT_GOLANG) endif(NOT WITH_GOLANG) +if(WITH_PSLIB) + add_definitions(-DPADDLE_WITH_PSLIB) +endif() + if(WITH_GPU) add_definitions(-DPADDLE_WITH_CUDA) add_definitions(-DEIGEN_USE_GPU) diff --git a/cmake/external/brpc.cmake b/cmake/external/brpc.cmake index 30b227b6452abf44171a1a4e04569e66b16e67a4..6b50cff7a66a33d9413627bfbc663cca06ba86f3 100644 --- a/cmake/external/brpc.cmake +++ b/cmake/external/brpc.cmake @@ -14,14 +14,16 @@ INCLUDE(ExternalProject) -find_library(SSL_LIBRARY NAMES ssl) +find_package(OpenSSL REQUIRED) + +message(STATUS "ssl:" ${OPENSSL_SSL_LIBRARY}) +message(STATUS "crypto:" ${OPENSSL_CRYPTO_LIBRARY}) + ADD_LIBRARY(ssl SHARED IMPORTED GLOBAL) -SET_PROPERTY(TARGET ssl PROPERTY IMPORTED_LOCATION ${SSL_LIBRARY}) +SET_PROPERTY(TARGET ssl PROPERTY IMPORTED_LOCATION ${OPENSSL_SSL_LIBRARY}) -find_library(CRYPTO_LIBRARY NAMES crypto) ADD_LIBRARY(crypto SHARED IMPORTED GLOBAL) -SET_PROPERTY(TARGET crypto PROPERTY IMPORTED_LOCATION ${CRYPTO_LIBRARY}) - +SET_PROPERTY(TARGET crypto PROPERTY IMPORTED_LOCATION ${OPENSSL_CRYPTO_LIBRARY}) SET(BRPC_SOURCES_DIR ${THIRD_PARTY_PATH}/brpc) SET(BRPC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/brpc) @@ -31,14 +33,15 @@ SET(BRPC_LIBRARIES "${BRPC_INSTALL_DIR}/lib/libbrpc.a" CACHE FILEPATH "brpc libr INCLUDE_DIRECTORIES(${BRPC_INCLUDE_DIR}) # Reference https://stackoverflow.com/questions/45414507/pass-a-list-of-prefix-paths-to-externalproject-add-in-cmake-args -set(prefix_path "${THIRD_PARTY_PATH}/install/gflags|${THIRD_PARTY_PATH}/install/leveldb|${THIRD_PARTY_PATH}/install/snappy|${THIRD_PARTY_PATH}/install/gtest|${THIRD_PARTY_PATH}/install/protobuf|${THIRD_PARTY_PATH}/install/zlib") +set(prefix_path "${THIRD_PARTY_PATH}/install/gflags|${THIRD_PARTY_PATH}/install/leveldb|${THIRD_PARTY_PATH}/install/snappy|${THIRD_PARTY_PATH}/install/gtest|${THIRD_PARTY_PATH}/install/protobuf|${THIRD_PARTY_PATH}/install/zlib|${THIRD_PARTY_PATH}/install/glog") # If minimal .a is need, you can set WITH_DEBUG_SYMBOLS=OFF ExternalProject_Add( extern_brpc ${EXTERNAL_PROJECT_LOG_ARGS} + # TODO(gongwb): change to de newst repo when they changed. GIT_REPOSITORY "https://github.com/gongweibao/brpc" - GIT_TAG "7dc04defad1fd4173aae170c3fcbde131b65155a" + GIT_TAG "e9b67ec1b7458f2af5fae76451afe1e27e01b4b4" PREFIX ${BRPC_SOURCES_DIR} UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} @@ -50,7 +53,7 @@ ExternalProject_Add( -DCMAKE_POSITION_INDEPENDENT_CODE=ON -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} -DCMAKE_PREFIX_PATH=${prefix_path} - -DBRPC_WITH_GLOG=ON + -DWITH_GLOG=ON -DIOBUF_WITH_HUGE_BLOCK=ON -DBRPC_WITH_RDMA=${WITH_BRPC_RDMA} ${EXTERNAL_OPTIONAL_ARGS} @@ -65,5 +68,6 @@ ADD_LIBRARY(brpc STATIC IMPORTED GLOBAL) SET_PROPERTY(TARGET brpc PROPERTY IMPORTED_LOCATION ${BRPC_LIBRARIES}) ADD_DEPENDENCIES(brpc extern_brpc) +add_definitions(-DBRPC_WITH_GLOG) LIST(APPEND external_project_dependencies brpc) diff --git a/cmake/external/gtest.cmake b/cmake/external/gtest.cmake index 4fe9c13fb7f2c04ae04e985252996dfa308ac304..9be625b620287cd4c644ae6908000fd5eec5d5c7 100644 --- a/cmake/external/gtest.cmake +++ b/cmake/external/gtest.cmake @@ -12,8 +12,12 @@ # See the License for the specific language governing permissions and # limitations under the License. -IF(WITH_TESTING) - ENABLE_TESTING() +#FIXME:(gongwb) Move brpc's gtest dependency. +IF(WITH_TESTING OR (WITH_DISTRIBUTE AND NOT WITH_GRPC)) + IF(WITH_TESTING) + ENABLE_TESTING() + ENDIF(WITH_TESTING) + INCLUDE(ExternalProject) SET(GTEST_SOURCES_DIR ${THIRD_PARTY_PATH}/gtest) @@ -76,4 +80,4 @@ IF(WITH_TESTING) ADD_DEPENDENCIES(gtest_main extern_gtest) LIST(APPEND external_project_dependencies gtest gtest_main) -ENDIF(WITH_TESTING) +ENDIF(WITH_TESTING OR (WITH_DISTRIBUTE AND NOT WITH_GRPC)) diff --git a/cmake/external/leveldb.cmake b/cmake/external/leveldb.cmake index fb5091731da02b497a14f119e944905eee4979d5..0df61b01ab64c8b751bdc3893dd5294ad39ab928 100644 --- a/cmake/external/leveldb.cmake +++ b/cmake/external/leveldb.cmake @@ -24,8 +24,8 @@ ExternalProject_Add( extern_leveldb ${EXTERNAL_PROJECT_LOG_ARGS} PREFIX ${LEVELDB_SOURCES_DIR} - URL "https://github.com/google/leveldb/archive/v1.18.tar.gz" - URL_MD5 "73770de34a2a5ab34498d2e05b2b7fa0" + GIT_REPOSITORY "https://github.com/google/leveldb" + GIT_TAG v1.18 CONFIGURE_COMMAND "" BUILD_COMMAND CXXFLAGS=-fPIC make -j ${NUM_OF_PROCESSOR} libleveldb.a INSTALL_COMMAND mkdir -p ${LEVELDB_INSTALL_DIR}/lib/ diff --git a/cmake/external/libmct.cmake b/cmake/external/libmct.cmake new file mode 100644 index 0000000000000000000000000000000000000000..27cff8cfb6315c9b4fa5677ad9062bee73a0e5d8 --- /dev/null +++ b/cmake/external/libmct.cmake @@ -0,0 +1,78 @@ +# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +IF(NOT ${WITH_LIBMCT}) + return() +ENDIF(NOT ${WITH_LIBMCT}) + +IF(WIN32 OR APPLE) + MESSAGE(WARNING + "Windows or Mac is not supported with LIBMCT in Paddle yet." + "Force WITH_LIBMCT=OFF") + SET(WITH_LIBMCT OFF CACHE STRING "Disable LIBMCT package in Windows and MacOS" FORCE) + return() +ENDIF() + +INCLUDE(ExternalProject) + +SET(LIBMCT_PROJECT "extern_libmct") +IF((NOT DEFINED LIBMCT_VER) OR (NOT DEFINED LIBMCT_URL)) + MESSAGE(STATUS "use pre defined download url") + SET(LIBMCT_VER "0.1.0" CACHE STRING "" FORCE) + SET(LIBMCT_NAME "libmct" CACHE STRING "" FORCE) + SET(LIBMCT_URL "https://raw.githubusercontent.com/PaddlePaddle/Fleet/release/${LIBMCT_VER}/${LIBMCT_NAME}.tar.gz" CACHE STRING "" FORCE) +ENDIF() +MESSAGE(STATUS "LIBMCT_NAME: ${LIBMCT_NAME}, LIBMCT_URL: ${LIBMCT_URL}") +SET(LIBMCT_SOURCE_DIR "${THIRD_PARTY_PATH}/libmct") +SET(LIBMCT_DOWNLOAD_DIR "${LIBMCT_SOURCE_DIR}/src/${LIBMCT_PROJECT}") +SET(LIBMCT_DST_DIR "libmct") +SET(LIBMCT_INSTALL_ROOT "${THIRD_PARTY_PATH}/install") +SET(LIBMCT_INSTALL_DIR ${LIBMCT_INSTALL_ROOT}/${LIBMCT_DST_DIR}) +SET(LIBMCT_ROOT ${LIBMCT_INSTALL_DIR}) +SET(LIBMCT_INC_DIR ${LIBMCT_ROOT}/include) +SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${LIBMCT_ROOT}/lib") + +INCLUDE_DIRECTORIES(${LIBMCT_INC_DIR}) + +FILE(WRITE ${LIBMCT_DOWNLOAD_DIR}/CMakeLists.txt + "PROJECT(LIBMCT)\n" + "cmake_minimum_required(VERSION 3.0)\n" + "install(DIRECTORY ${LIBMCT_NAME}/include ${LIBMCT_NAME}/lib \n" + " DESTINATION ${LIBMCT_DST_DIR})\n") + +ExternalProject_Add( + ${LIBMCT_PROJECT} + ${EXTERNAL_PROJECT_LOG_ARGS} + PREFIX ${LIBMCT_SOURCE_DIR} + DOWNLOAD_DIR ${LIBMCT_DOWNLOAD_DIR} + DOWNLOAD_COMMAND wget --no-check-certificate ${LIBMCT_URL} -c -q -O ${LIBMCT_NAME}.tar.gz + && tar zxvf ${LIBMCT_NAME}.tar.gz + DOWNLOAD_NO_PROGRESS 1 + UPDATE_COMMAND "" + CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${LIBMCT_INSTALL_ROOT} + CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${LIBMCT_INSTALL_ROOT} +) + +if (${CMAKE_VERSION} VERSION_LESS "3.3.0" OR NOT WIN32) + set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/boost_dummy.c) + file(WRITE ${dummyfile} "const char *dummy = \"${dummyfile}\";") + add_library(libmct STATIC ${dummyfile}) +else() + add_library(libmct INTERFACE) +endif() + +#ADD_LIBRARY(libmct SHARED IMPORTED GLOBAL) +ADD_DEPENDENCIES(libmct ${LIBMCT_PROJECT}) +LIST(APPEND external_project_dependencies libmct) + diff --git a/cmake/external/pslib.cmake b/cmake/external/pslib.cmake new file mode 100644 index 0000000000000000000000000000000000000000..3b495d78e2c61f90418adbc5746792bc6e49d90b --- /dev/null +++ b/cmake/external/pslib.cmake @@ -0,0 +1,77 @@ +# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +IF(NOT ${WITH_PSLIB}) + return() +ENDIF(NOT ${WITH_PSLIB}) + +IF(WIN32 OR APPLE) + MESSAGE(WARNING + "Windows or Mac is not supported with PSLIB in Paddle yet." + "Force WITH_PSLIB=OFF") + SET(WITH_PSLIB OFF CACHE STRING "Disable PSLIB package in Windows and MacOS" FORCE) + return() +ENDIF() + +INCLUDE(ExternalProject) + +SET(PSLIB_PROJECT "extern_pslib") +IF((NOT DEFINED PSLIB_VER) OR (NOT DEFINED PSLIB_URL)) + MESSAGE(STATUS "use pre defined download url") + SET(PSLIB_VER "0.1.0" CACHE STRING "" FORCE) + SET(PSLIB_NAME "pslib" CACHE STRING "" FORCE) + SET(PSLIB_URL "https://raw.githubusercontent.com/PaddlePaddle/Fleet/release/${PSLIB_VER}/${PSLIB_NAME}.tar.gz" CACHE STRING "" FORCE) +ENDIF() +MESSAGE(STATUS "PSLIB_NAME: ${PSLIB_NAME}, PSLIB_URL: ${PSLIB_URL}") +SET(PSLIB_SOURCE_DIR "${THIRD_PARTY_PATH}/pslib") +SET(PSLIB_DOWNLOAD_DIR "${PSLIB_SOURCE_DIR}/src/${PSLIB_PROJECT}") +SET(PSLIB_DST_DIR "pslib") +SET(PSLIB_INSTALL_ROOT "${THIRD_PARTY_PATH}/install") +SET(PSLIB_INSTALL_DIR ${PSLIB_INSTALL_ROOT}/${PSLIB_DST_DIR}) +SET(PSLIB_ROOT ${PSLIB_INSTALL_DIR}) +SET(PSLIB_INC_DIR ${PSLIB_ROOT}/include) +SET(PSLIB_LIB_DIR ${PSLIB_ROOT}/lib) +SET(PSLIB_LIB ${PSLIB_LIB_DIR}/libps.so) +SET(PSLIB_IOMP_LIB ${PSLIB_LIB_DIR}/libiomp5.so) #todo what is this +SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${PSLIB_ROOT}/lib") + +INCLUDE_DIRECTORIES(${PSLIB_INC_DIR}) + +FILE(WRITE ${PSLIB_DOWNLOAD_DIR}/CMakeLists.txt + "PROJECT(PSLIB)\n" + "cmake_minimum_required(VERSION 3.0)\n" + "install(DIRECTORY ${PSLIB_NAME}/include ${PSLIB_NAME}/lib \n" + " DESTINATION ${PSLIB_DST_DIR})\n") + +ExternalProject_Add( + ${PSLIB_PROJECT} + ${EXTERNAL_PROJECT_LOG_ARGS} + PREFIX ${PSLIB_SOURCE_DIR} + DOWNLOAD_DIR ${PSLIB_DOWNLOAD_DIR} + DOWNLOAD_COMMAND wget --no-check-certificate ${PSLIB_URL} -c -q -O ${PSLIB_NAME}.tar.gz + && tar zxvf ${PSLIB_NAME}.tar.gz + DOWNLOAD_NO_PROGRESS 1 + UPDATE_COMMAND "" + CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${PSLIB_INSTALL_ROOT} + CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${PSLIB_INSTALL_ROOT} +) + +ADD_LIBRARY(pslib SHARED IMPORTED GLOBAL) +SET_PROPERTY(TARGET pslib PROPERTY IMPORTED_LOCATION ${PSLIB_LIB}) +ADD_DEPENDENCIES(pslib ${PSLIB_PROJECT}) +LIST(APPEND external_project_dependencies pslib) + +IF(WITH_C_API) + INSTALL(FILES ${PSLIB_LIB} ${PSLIB_IOMP_LIB} DESTINATION lib) +ENDIF() diff --git a/cmake/external/pslib_brpc.cmake b/cmake/external/pslib_brpc.cmake new file mode 100644 index 0000000000000000000000000000000000000000..7ff5a8aca187240108164900638f5a376e9fbc93 --- /dev/null +++ b/cmake/external/pslib_brpc.cmake @@ -0,0 +1,77 @@ +# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +IF(NOT ${WITH_PSLIB_BRPC}) + return() +ENDIF(NOT ${WITH_PSLIB_BRPC}) + +IF(WIN32 OR APPLE) + MESSAGE(WARNING + "Windows or Mac is not supported with PSLIB_BRPC in Paddle yet." + "Force WITH_PSLIB_BRPC=OFF") + SET(WITH_PSLIB_BRPC OFF CACHE STRING "Disable PSLIB_BRPC package in Windows and MacOS" FORCE) + return() +ENDIF() + +INCLUDE(ExternalProject) + +SET(PSLIB_BRPC_PROJECT "extern_pslib_brpc") +IF((NOT DEFINED PSLIB_BRPC_NAME) OR (NOT DEFINED PSLIB_BRPC_URL)) + MESSAGE(STATUS "use pre defined download url") + SET(PSLIB_BRPC_VER "0.1.0" CACHE STRING "" FORCE) + SET(PSLIB_BRPC_NAME "pslib_brpc" CACHE STRING "" FORCE) + SET(PSLIB_BRPC_URL "https://raw.githubusercontent.com/PaddlePaddle/Fleet/release/${PSLIB_BRPC_VER}/${PSLIB_BRPC_NAME}.tar.gz" CACHE STRING "" FORCE) +ENDIF() +MESSAGE(STATUS "PSLIB_BRPC_NAME: ${PSLIB_BRPC_NAME}, PSLIB_BRPC_URL: ${PSLIB_BRPC_URL}") +SET(PSLIB_BRPC_SOURCE_DIR "${THIRD_PARTY_PATH}/pslib_brpc") +SET(PSLIB_BRPC_DOWNLOAD_DIR "${PSLIB_BRPC_SOURCE_DIR}/src/${PSLIB_BRPC_PROJECT}") +SET(PSLIB_BRPC_DST_DIR "pslib_brpc") +SET(PSLIB_BRPC_INSTALL_ROOT "${THIRD_PARTY_PATH}/install") +SET(PSLIB_BRPC_INSTALL_DIR ${PSLIB_BRPC_INSTALL_ROOT}/${PSLIB_BRPC_DST_DIR}) +SET(PSLIB_BRPC_ROOT ${PSLIB_BRPC_INSTALL_DIR}) +SET(PSLIB_BRPC_INC_DIR ${PSLIB_BRPC_ROOT}/include) +SET(PSLIB_BRPC_LIB_DIR ${PSLIB_BRPC_ROOT}/lib) +SET(PSLIB_BRPC_LIB ${PSLIB_BRPC_LIB_DIR}/libbrpc.a) +SET(PSLIB_BRPC_IOMP_LIB ${PSLIB_BRPC_LIB_DIR}/libiomp5.so) #todo what is this +SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${PSLIB_BRPC_ROOT}/lib") + +INCLUDE_DIRECTORIES(${PSLIB_BRPC_INC_DIR}) + +FILE(WRITE ${PSLIB_BRPC_DOWNLOAD_DIR}/CMakeLists.txt + "PROJECT(PSLIB_BRPC)\n" + "cmake_minimum_required(VERSION 3.0)\n" + "install(DIRECTORY ${PSLIB_BRPC_NAME}/include ${PSLIB_BRPC_NAME}/lib \n" + " DESTINATION ${PSLIB_BRPC_DST_DIR})\n") + +ExternalProject_Add( + ${PSLIB_BRPC_PROJECT} + ${EXTERNAL_PROJECT_LOG_ARGS} + PREFIX ${PSLIB_BRPC_SOURCE_DIR} + DOWNLOAD_DIR ${PSLIB_BRPC_DOWNLOAD_DIR} + DOWNLOAD_COMMAND wget --no-check-certificate ${PSLIB_BRPC_URL} -c -q -O ${PSLIB_BRPC_NAME}.tar.gz + && tar zxvf ${PSLIB_BRPC_NAME}.tar.gz + DOWNLOAD_NO_PROGRESS 1 + UPDATE_COMMAND "" + CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${PSLIB_BRPC_INSTALL_ROOT} + CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${PSLIB_BRPC_INSTALL_ROOT} +) + +ADD_LIBRARY(pslib_brpc SHARED IMPORTED GLOBAL) +SET_PROPERTY(TARGET pslib_brpc PROPERTY IMPORTED_LOCATION ${PSLIB_BRPC_LIB}) +ADD_DEPENDENCIES(pslib_brpc ${PSLIB_BRPC_PROJECT}) +LIST(APPEND external_project_dependencies pslib_brpc) + +IF(WITH_C_API) + INSTALL(FILES ${PSLIB_BRPC_LIB} ${PSLIB_BRPC_IOMP_LIB} DESTINATION lib) +ENDIF() diff --git a/cmake/external/python.cmake b/cmake/external/python.cmake index a3599dd798c07f57ed82e3f25b6bb9fc4f8bdc3a..623c53f4f75bbd217c157bcdda0cb12c510269ee 100644 --- a/cmake/external/python.cmake +++ b/cmake/external/python.cmake @@ -18,8 +18,8 @@ ENDIF() INCLUDE(python_module) -FIND_PACKAGE(PythonInterp ${PY_VERSION}) -FIND_PACKAGE(PythonLibs ${PY_VERSION}) +FIND_PACKAGE(PythonInterp ${PY_VERSION} REQUIRED) +FIND_PACKAGE(PythonLibs ${PY_VERSION} REQUIRED) if(WIN32) execute_process(COMMAND "${PYTHON_EXECUTABLE}" "-c" @@ -79,6 +79,5 @@ IF(PYTHONINTERP_FOUND) "please use pip to upgrade protobuf. pip install -U protobuf") ENDIF() ENDIF(PYTHONINTERP_FOUND) - INCLUDE_DIRECTORIES(${PYTHON_INCLUDE_DIR}) INCLUDE_DIRECTORIES(${PYTHON_NUMPY_INCLUDE_DIR}) diff --git a/cmake/external/snappy.cmake b/cmake/external/snappy.cmake index b30403d2d81ce471f39b4d92e24a500fe41eeebb..f9d4cd97400a68e613e3dd5467191a0d42a9942e 100644 --- a/cmake/external/snappy.cmake +++ b/cmake/external/snappy.cmake @@ -24,12 +24,6 @@ set(SNAPPY_SOURCES_DIR ${THIRD_PARTY_PATH}/snappy) set(SNAPPY_INSTALL_DIR ${THIRD_PARTY_PATH}/install/snappy) set(SNAPPY_INCLUDE_DIR "${SNAPPY_INSTALL_DIR}/include" CACHE PATH "snappy include directory." FORCE) -if (WIN32) - set(SNAPPY_LIBRARIES "${SNAPPY_INSTALL_DIR}/lib/snappy.lib") -else(WIN32) - set(SNAPPY_LIBRARIES "${SNAPPY_INSTALL_DIR}/lib/libsnappy.a") -endif (WIN32) - ExternalProject_Add( extern_snappy GIT_REPOSITORY "https://github.com/google/snappy" @@ -56,6 +50,16 @@ ExternalProject_Add( -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} ) +IF(WIN32) + IF(NOT EXISTS "${SNAPPY_INSTALL_DIR}/lib/libsnappy.lib") + add_custom_command(TARGET extern_snappy POST_BUILD + COMMAND cmake -E copy ${SNAPPY_INSTALL_DIR}/lib/snappy.lib ${SNAPPY_INSTALL_DIR}/lib/libsnappy.lib + ) + ENDIF() + set(SNAPPY_LIBRARIES "${SNAPPY_INSTALL_DIR}/lib/libsnappy.lib") +else(WIN32) + set(SNAPPY_LIBRARIES "${SNAPPY_INSTALL_DIR}/lib/libsnappy.a") +endif (WIN32) add_library(snappy STATIC IMPORTED GLOBAL) set_property(TARGET snappy PROPERTY IMPORTED_LOCATION ${SNAPPY_LIBRARIES}) diff --git a/cmake/external/warpctc.cmake b/cmake/external/warpctc.cmake index 07e1137e16afc1e4e9ab9640e1ccaea8008a0cd2..7b937c93febdfa1d7d5f4c73fc2a5830322688e5 100644 --- a/cmake/external/warpctc.cmake +++ b/cmake/external/warpctc.cmake @@ -26,25 +26,33 @@ SET(WARPCTC_INCLUDE_DIR "${WARPCTC_INSTALL_DIR}/include" # Used in unit test test_WarpCTCLayer SET(WARPCTC_LIB_DIR "${WARPCTC_INSTALL_DIR}/lib" CACHE PATH "Warp-ctc Library Directory" FORCE) -SET(WARPCTC_LIBRARIES "${WARPCTC_INSTALL_DIR}/lib/libwarpctc${CMAKE_SHARED_LIBRARY_SUFFIX}" - CACHE FILEPATH "Warp-ctc Library" FORCE) -IF(CMAKE_CXX_COMPILER_ID STREQUAL "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" ) +IF(CMAKE_CXX_COMPILER_ID STREQUAL "Clang" OR CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" OR WIN32) SET(USE_OMP OFF) ELSE() SET(USE_OMP ON) ENDIF() +IF(WIN32) + SET(WARPCTC_REPOSITORY "https://github.com/wopeizl/warp-ctc.git") +ELSE() + SET(WARPCTC_REPOSITORY "https://github.com/dzhwinter/warp-ctc.git") +ENDIF() + ExternalProject_Add( extern_warpctc ${EXTERNAL_PROJECT_LOG_ARGS} - GIT_REPOSITORY "https://github.com/dzhwinter/warp-ctc.git" + GIT_REPOSITORY ${WARPCTC_REPOSITORY} PREFIX ${WARPCTC_SOURCES_DIR} UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} - -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} + -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG} + -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE} + -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} + -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} + -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} -DCMAKE_INSTALL_PREFIX=${WARPCTC_INSTALL_DIR} -DWITH_GPU=${WITH_GPU} -DWITH_OMP=${USE_OMP} @@ -59,6 +67,18 @@ ExternalProject_Add( -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON -DCMAKE_INSTALL_PREFIX:PATH=${WARPCTC_INSTALL_DIR} ) +IF(WIN32) + IF(NOT EXISTS "${WARPCTC_INSTALL_DIR}/lib/warpctc${CMAKE_SHARED_LIBRARY_SUFFIX}") + add_custom_command(TARGET extern_warpctc POST_BUILD + COMMAND cmake -E copy ${WARPCTC_INSTALL_DIR}/bin/warpctc${CMAKE_SHARED_LIBRARY_SUFFIX} ${WARPCTC_INSTALL_DIR}/lib/warpctc${CMAKE_SHARED_LIBRARY_SUFFIX} + ) + ENDIF() + SET(WARPCTC_LIBRARIES "${WARPCTC_INSTALL_DIR}/lib/warpctc${CMAKE_SHARED_LIBRARY_SUFFIX}" + CACHE FILEPATH "Warp-ctc Library" FORCE) +else(WIN32) + SET(WARPCTC_LIBRARIES "${WARPCTC_INSTALL_DIR}/lib/libwarpctc${CMAKE_SHARED_LIBRARY_SUFFIX}" + CACHE FILEPATH "Warp-ctc Library" FORCE) +ENDIF(WIN32) MESSAGE(STATUS "warp-ctc library: ${WARPCTC_LIBRARIES}") INCLUDE_DIRECTORIES(${WARPCTC_INCLUDE_DIR}) # For warpctc code to include its headers. diff --git a/cmake/external/xxhash.cmake b/cmake/external/xxhash.cmake index 4c2d64f627401071098e72bfb930fb5d62fa042d..c3e1212d8f8358e0148b5e00223414c9696686ee 100644 --- a/cmake/external/xxhash.cmake +++ b/cmake/external/xxhash.cmake @@ -56,7 +56,12 @@ else() endif() if (WIN32) - set(XXHASH_LIBRARIES "${XXHASH_INSTALL_DIR}/lib/xxhash.lib") + IF(NOT EXISTS "${XXHASH_INSTALL_DIR}/lib/libxxhash.lib") + add_custom_command(TARGET extern_xxhash POST_BUILD + COMMAND cmake -E copy ${XXHASH_INSTALL_DIR}/lib/xxhash.lib ${XXHASH_INSTALL_DIR}/lib/libxxhash.lib + ) + ENDIF() + set(XXHASH_LIBRARIES "${XXHASH_INSTALL_DIR}/lib/libxxhash.lib") else() set(XXHASH_LIBRARIES "${XXHASH_INSTALL_DIR}/lib/libxxhash.a") endif () diff --git a/cmake/external/zlib.cmake b/cmake/external/zlib.cmake index c3d73235453c8c9fd2859c3ab142888e8bda2dbe..d35073753725cd5772de3fc7a23af5ba69a65558 100644 --- a/cmake/external/zlib.cmake +++ b/cmake/external/zlib.cmake @@ -19,12 +19,6 @@ SET(ZLIB_INSTALL_DIR ${THIRD_PARTY_PATH}/install/zlib) SET(ZLIB_ROOT ${ZLIB_INSTALL_DIR} CACHE FILEPATH "zlib root directory." FORCE) SET(ZLIB_INCLUDE_DIR "${ZLIB_INSTALL_DIR}/include" CACHE PATH "zlib include directory." FORCE) -IF(WIN32) - SET(ZLIB_LIBRARIES "${ZLIB_INSTALL_DIR}/lib/zlibstatic.lib" CACHE FILEPATH "zlib library." FORCE) -ELSE(WIN32) - SET(ZLIB_LIBRARIES "${ZLIB_INSTALL_DIR}/lib/libz.a" CACHE FILEPATH "zlib library." FORCE) -ENDIF(WIN32) - INCLUDE_DIRECTORIES(${ZLIB_INCLUDE_DIR}) # For zlib code to include its own headers. INCLUDE_DIRECTORIES(${THIRD_PARTY_PATH}/install) # For Paddle code to include zlib.h. @@ -49,6 +43,16 @@ ExternalProject_Add( -DCMAKE_POSITION_INDEPENDENT_CODE:BOOL=ON -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} ) +IF(WIN32) + IF(NOT EXISTS "${ZLIB_INSTALL_DIR}/lib/libz.lib") + add_custom_command(TARGET extern_zlib POST_BUILD + COMMAND cmake -E copy ${ZLIB_INSTALL_DIR}/lib/zlibstatic.lib ${ZLIB_INSTALL_DIR}/lib/libz.lib + ) + ENDIF() + SET(ZLIB_LIBRARIES "${ZLIB_INSTALL_DIR}/lib/libz.lib" CACHE FILEPATH "zlib library." FORCE) +ELSE(WIN32) + SET(ZLIB_LIBRARIES "${ZLIB_INSTALL_DIR}/lib/libz.a" CACHE FILEPATH "zlib library." FORCE) +ENDIF(WIN32) ADD_LIBRARY(zlib STATIC IMPORTED GLOBAL) SET_PROPERTY(TARGET zlib PROPERTY IMPORTED_LOCATION ${ZLIB_LIBRARIES}) diff --git a/cmake/inference_lib.cmake b/cmake/inference_lib.cmake index c679d8507d8a9d3bce48b7f38491dadd9f2fb7f6..9f0adef7aa603ec5a3c8a5aa347613f462c43e60 100644 --- a/cmake/inference_lib.cmake +++ b/cmake/inference_lib.cmake @@ -32,24 +32,35 @@ function(copy TARGET) list(GET copy_lib_SRCS ${index} src) list(GET copy_lib_DSTS ${index} dst) if (WIN32) - # windows cmd shell will not expand wildcard automatically. - # below expand the files,libs and copy them by rules. - file(GLOB header_files ${src} "*.h") - file(GLOB static_lib_files ${src} "*.lib") - file(GLOB dll_lib_files ${src} "*.dll") - set(src_files ${header_files} ${static_lib_files} ${dll_lib_files}) - - if (NOT "${src_files}" STREQUAL "") - list(REMOVE_DUPLICATES src_files) - endif () - add_custom_command(TARGET ${TARGET} PRE_BUILD - COMMAND ${CMAKE_COMMAND} -E make_directory "${dst}" - ) - foreach (src_file ${src_files}) + if(IS_DIRECTORY ${src}) + get_filename_component(last_path ${src} NAME) + string(APPEND dst "/" ${last_path}) add_custom_command(TARGET ${TARGET} PRE_BUILD - COMMAND ${CMAKE_COMMAND} -E copy "${src_file}" "${dst}" - COMMENT "copying ${src_file} -> ${dst}") - endforeach () + COMMAND ${CMAKE_COMMAND} -E make_directory "${dst}" + ) + if(EXISTS ${src}) + add_custom_command(TARGET ${TARGET} PRE_BUILD + COMMAND cmake -E copy_directory "${src}" "${dst}" + COMMENT "copying ${src} -> ${dst}") + else() + message(WARNING "${src} not exist!") + endif() + else() + # windows cmd shell will not expand wildcard automatically. + # below expand the files, and copy them by rules. + file(GLOB src_files ${src}) + if (NOT "${src_files}" STREQUAL "") + list(REMOVE_DUPLICATES src_files) + endif () + add_custom_command(TARGET ${TARGET} PRE_BUILD + COMMAND ${CMAKE_COMMAND} -E make_directory "${dst}" + ) + foreach (src_file ${src_files}) + add_custom_command(TARGET ${TARGET} PRE_BUILD + COMMAND ${CMAKE_COMMAND} -E copy "${src_file}" "${dst}" + COMMENT "copying ${src_file} -> ${dst}") + endforeach () + endif() else (WIN32) # not windows add_custom_command(TARGET ${TARGET} PRE_BUILD COMMAND mkdir -p "${dst}" @@ -95,7 +106,7 @@ copy(xxhash_lib DEPS xxhash ) -if (NOT PROTOBUF_FOUND) +if (NOT PROTOBUF_FOUND OR WIN32) set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/protobuf") copy(protobuf_lib SRCS ${PROTOBUF_INCLUDE_DIR} ${PROTOBUF_LIBRARY} @@ -138,27 +149,25 @@ if (WITH_NGRAPH) ) endif () -if (NOT WIN32) - if (NOT MOBILE_INFERENCE AND NOT RPI) - set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/snappy") - copy(snappy_lib - SRCS ${SNAPPY_INCLUDE_DIR} ${SNAPPY_LIBRARIES} - DSTS ${dst_dir} ${dst_dir}/lib - DEPS snappy) +if (NOT MOBILE_INFERENCE AND NOT RPI) + set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/snappy") + copy(snappy_lib + SRCS ${SNAPPY_INCLUDE_DIR} ${SNAPPY_LIBRARIES} + DSTS ${dst_dir} ${dst_dir}/lib + DEPS snappy) - set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/snappystream") - copy(snappystream_lib - SRCS ${SNAPPYSTREAM_INCLUDE_DIR} ${SNAPPYSTREAM_LIBRARIES} - DSTS ${dst_dir} ${dst_dir}/lib - DEPS snappystream) + set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/snappystream") + copy(snappystream_lib + SRCS ${SNAPPYSTREAM_INCLUDE_DIR} ${SNAPPYSTREAM_LIBRARIES} + DSTS ${dst_dir} ${dst_dir}/lib + DEPS snappystream) - set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/zlib") - copy(zlib_lib - SRCS ${ZLIB_INCLUDE_DIR} ${ZLIB_LIBRARIES} - DSTS ${dst_dir} ${dst_dir}/lib - DEPS zlib) - endif () -endif (NOT WIN32) + set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/zlib") + copy(zlib_lib + SRCS ${ZLIB_INCLUDE_DIR} ${ZLIB_LIBRARIES} + DSTS ${dst_dir} ${dst_dir}/lib + DEPS zlib) +endif () # paddle fluid module set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid") @@ -191,9 +200,21 @@ if (WITH_ANAKIN AND WITH_MKL) list(APPEND inference_deps anakin_inference_lib) endif () +if (TENSORRT_FOUND) + copy(tensorrt_lib DEPS ${inference_deps} + SRCS ${TENSORRT_ROOT}/include/Nv*.h ${TENSORRT_ROOT}/lib/libnvinfer* + DSTS ${FLUID_INSTALL_DIR}/third_party/install/tensorrt/include ${FLUID_INSTALL_DIR}/third_party/install/tensorrt/lib) +endif () + + set(module "inference") +if(WIN32) + set(paddle_fluid_lib ${PADDLE_BINARY_DIR}/paddle/fluid/inference/${CMAKE_BUILD_TYPE}/libpaddle_fluid.*) +else(WIN32) + set(paddle_fluid_lib ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_fluid.*) +endif(WIN32) copy(inference_lib DEPS ${inference_deps} - SRCS ${src_dir}/${module}/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/inference/libpaddle_fluid.* + SRCS ${src_dir}/${module}/*.h ${paddle_fluid_lib} ${src_dir}/${module}/api/paddle_*.h DSTS ${dst_dir}/${module} ${dst_dir}/${module} ${dst_dir}/${module} ) @@ -233,7 +254,7 @@ copy(third_party DEPS fluid_lib_dist # only need libpaddle_fluid.so/a and paddle_*.h for inference-only library copy(inference_api_lib DEPS fluid_lib_dist - SRCS ${FLUID_INSTALL_DIR}/paddle/fluid/inference/libpaddle_fluid.* + SRCS ${paddle_fluid_lib} ${FLUID_INSTALL_DIR}/paddle/fluid/inference/paddle_*.h DSTS ${FLUID_INFERENCE_INSTALL_DIR}/paddle/lib ${FLUID_INFERENCE_INSTALL_DIR}/paddle/include ) diff --git a/cmake/operators.cmake b/cmake/operators.cmake index 2ced43f9e6c60da642f7a6252f889d9c9ab9748f..70d159b4f3549662e080794efad8af943ce1f0bc 100644 --- a/cmake/operators.cmake +++ b/cmake/operators.cmake @@ -84,7 +84,7 @@ function(op_library TARGET) endif() if (WIN32) # remove windows unsupported op, because windows has no nccl, no warpctc such ops. - foreach(windows_unsupport_op "nccl_op" "gen_nccl_id_op" "warpctc_op") + foreach(windows_unsupport_op "nccl_op" "gen_nccl_id_op") if ("${TARGET}" STREQUAL "${windows_unsupport_op}") return() endif() diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index 8e6482ca981e1473a552efcc3ee043aeda137780..5e9901bb87c9a454a393a913b6da6e82266ee719 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -37,8 +37,16 @@ paddle.fluid.DataFeedDesc.desc ArgSpec(args=['self'], varargs=None, keywords=Non paddle.fluid.DataFeedDesc.set_batch_size ArgSpec(args=['self', 'batch_size'], varargs=None, keywords=None, defaults=None) paddle.fluid.DataFeedDesc.set_dense_slots ArgSpec(args=['self', 'dense_slots_name'], varargs=None, keywords=None, defaults=None) paddle.fluid.DataFeedDesc.set_use_slots ArgSpec(args=['self', 'use_slots_name'], varargs=None, keywords=None, defaults=None) -paddle.fluid.AsyncExecutor.__init__ ArgSpec(args=['self', 'place'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.AsyncExecutor.run ArgSpec(args=['self', 'program', 'data_feed', 'filelist', 'thread_num', 'fetch', 'debug'], varargs=None, keywords=None, defaults=(False,)) +paddle.fluid.AsyncExecutor.__init__ ArgSpec(args=['self', 'place', 'run_mode'], varargs=None, keywords=None, defaults=(None, '')) +paddle.fluid.AsyncExecutor.config_distributed_nodes ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) +paddle.fluid.AsyncExecutor.download_data ArgSpec(args=['self', 'afs_path', 'local_path', 'fs_default_name', 'ugi', 'file_cnt', 'hadoop_home', 'process_num'], varargs=None, keywords=None, defaults=('$HADOOP_HOME', 12)) +paddle.fluid.AsyncExecutor.get_instance ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) +paddle.fluid.AsyncExecutor.init_model ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) +paddle.fluid.AsyncExecutor.init_server ArgSpec(args=['self', 'dist_desc'], varargs=None, keywords=None, defaults=None) +paddle.fluid.AsyncExecutor.init_worker ArgSpec(args=['self', 'dist_desc', 'startup_program'], varargs=None, keywords=None, defaults=None) +paddle.fluid.AsyncExecutor.run ArgSpec(args=['self', 'program', 'data_feed', 'filelist', 'thread_num', 'fetch', 'mode', 'debug'], varargs=None, keywords=None, defaults=('', False)) +paddle.fluid.AsyncExecutor.save_model ArgSpec(args=['self', 'save_path'], varargs=None, keywords=None, defaults=None) +paddle.fluid.AsyncExecutor.stop ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.io.save_vars ArgSpec(args=['executor', 'dirname', 'main_program', 'vars', 'predicate', 'filename'], varargs=None, keywords=None, defaults=(None, None, None, None)) paddle.fluid.io.save_params ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None)) paddle.fluid.io.save_persistables ArgSpec(args=['executor', 'dirname', 'main_program', 'filename'], varargs=None, keywords=None, defaults=(None, None)) @@ -77,6 +85,8 @@ paddle.fluid.layers.sequence_softmax ArgSpec(args=['input', 'use_cudnn', 'name'] paddle.fluid.layers.softmax ArgSpec(args=['input', 'use_cudnn', 'name'], varargs=None, keywords=None, defaults=(True, None)) paddle.fluid.layers.pool2d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name', 'exclusive'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None, True)) paddle.fluid.layers.pool3d ArgSpec(args=['input', 'pool_size', 'pool_type', 'pool_stride', 'pool_padding', 'global_pooling', 'use_cudnn', 'ceil_mode', 'name', 'exclusive'], varargs=None, keywords=None, defaults=(-1, 'max', 1, 0, False, True, False, None, True)) +paddle.fluid.layers.adaptive_pool2d ArgSpec(args=['input', 'pool_size', 'pool_type', 'require_index', 'name'], varargs=None, keywords=None, defaults=('max', False, None)) +paddle.fluid.layers.adaptive_pool3d ArgSpec(args=['input', 'pool_size', 'pool_type', 'require_index', 'name'], varargs=None, keywords=None, defaults=('max', False, None)) paddle.fluid.layers.batch_norm ArgSpec(args=['input', 'act', 'is_test', 'momentum', 'epsilon', 'param_attr', 'bias_attr', 'data_layout', 'in_place', 'name', 'moving_mean_name', 'moving_variance_name', 'do_model_average_for_mean_and_var', 'fuse_with_relu', 'use_global_stats'], varargs=None, keywords=None, defaults=(None, False, 0.9, 1e-05, None, None, 'NCHW', False, None, None, None, False, False, False)) paddle.fluid.layers.beam_search_decode ArgSpec(args=['ids', 'scores', 'beam_size', 'end_id', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.conv2d_transpose ArgSpec(args=['input', 'num_filters', 'output_size', 'filter_size', 'padding', 'stride', 'dilation', 'groups', 'param_attr', 'bias_attr', 'use_cudnn', 'act', 'name'], varargs=None, keywords=None, defaults=(None, None, 0, 1, 1, None, None, None, True, None, None)) @@ -199,6 +209,7 @@ paddle.fluid.layers.merge_selected_rows ArgSpec(args=['x', 'name'], varargs=None paddle.fluid.layers.get_tensor_from_selected_rows ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.lstm ArgSpec(args=['input', 'init_h', 'init_c', 'max_len', 'hidden_size', 'num_layers', 'dropout_prob', 'is_bidirec', 'is_test', 'name', 'default_initializer', 'seed'], varargs=None, keywords=None, defaults=(0.0, False, False, None, None, -1)) paddle.fluid.layers.psroi_pool ArgSpec(args=['input', 'rois', 'output_channels', 'spatial_scale', 'pooled_height', 'pooled_width', 'name'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.huber_loss ArgSpec(args=['input', 'label', 'delta'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True)) paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None)) paddle.fluid.layers.read_file ArgSpec(args=['reader'], varargs=None, keywords=None, defaults=None) @@ -365,7 +376,7 @@ paddle.fluid.optimizer.MomentumOptimizer.__init__ ArgSpec(args=['self', 'learnin paddle.fluid.optimizer.MomentumOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.optimizer.AdagradOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(1e-06, None, None)) paddle.fluid.optimizer.AdagradOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)) -paddle.fluid.optimizer.AdamOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.001, 0.9, 0.999, 1e-08, None, None)) +paddle.fluid.optimizer.AdamOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon', 'regularization', 'name', 'lazy_mode'], varargs=None, keywords=None, defaults=(0.001, 0.9, 0.999, 1e-08, None, None, False)) paddle.fluid.optimizer.AdamOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.optimizer.AdamaxOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.001, 0.9, 0.999, 1e-08, None, None)) paddle.fluid.optimizer.AdamaxOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)) diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 023118d74078d3afba206a5bbaabf5e8b4bb0656..f43e908d17b13af91f841271e60dd182007994bc 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -1,17 +1,18 @@ -# windows treat symbolic file as a real file, which is different with unix -# We create a hidden file and compile it instead of origin source file. +#windows treat symbolic file as a real file, which is different with unix +#We create a hidden file and compile it instead of origin source file. function(windows_symbolic TARGET) set(oneValueArgs "") - set(multiValueArgs SRCS DEPS) + set(multiValueArgs SRCS PATH) cmake_parse_arguments(windows_symbolic "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + set(final_path ${CMAKE_CURRENT_SOURCE_DIR}/${windows_symbolic_PATH}) foreach(src ${windows_symbolic_SRCS}) get_filename_component(src ${src} NAME_WE) if (NOT EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${src}.cc OR NOT EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${src}.cu) message(FATAL " ${src}.cc and ${src}.cu must exsits, and ${src}.cu must be symbolic file.") endif() - # only copy the xx.cu to .xx.cu when the content are modified +#only copy the xx.cu to.xx.cu when the content are modified set(copy_flag 1) if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/.${src}.cu) file(READ ${CMAKE_CURRENT_SOURCE_DIR}/${src}.cc SOURCE_STR) @@ -32,7 +33,7 @@ endfunction() add_subdirectory(ir) add_subdirectory(details) -# ddim lib +#ddim lib proto_library(framework_proto SRCS framework.proto) proto_library(async_executor_proto SRCS data_feed.proto) @@ -91,8 +92,8 @@ nv_test(data_device_transform_test SRCS data_device_transform_test.cu if(WITH_GPU) if (WIN32) - # windows treat symbolic file as a real file, which is different with unix - # We create a hidden file and compile it instead of origin source file. +#windows treat symbolic file as a real file, which is different with unix +#We create a hidden file and compile it instead of origin source file. windows_symbolic(hidden_file SRCS data_type_transform.cu) nv_library(data_type_transform SRCS .data_type_transform.cu DEPS tensor) add_dependencies(data_type_transform hidden_file) @@ -143,7 +144,8 @@ cc_library(op_registry SRCS op_registry.cc DEPS op_proto_maker op_info operator nv_test(op_registry_test SRCS op_registry_test.cc DEPS op_registry) py_proto_compile(framework_py_proto SRCS framework.proto data_feed.proto) -# Generate an empty __init__.py to make framework_py_proto as a valid python module. +#Generate an empty \ + #__init__.py to make framework_py_proto as a valid python module. add_custom_target(framework_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py) add_dependencies(framework_py_proto framework_py_proto_init) if (NOT WIN32) @@ -169,9 +171,12 @@ cc_library(variable_helper SRCS variable_helper.cc DEPS lod_tensor) cc_library(naive_executor SRCS naive_executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass variable_helper) if(WITH_DISTRIBUTE) - cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method sendrecvop_grpc cares grpc++_unsecure grpc_unsecure gpr graph_to_program_pass variable_helper) - set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") - set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + cc_library(executor SRCS executor.cc DEPS op_registry device_context scope framework_proto glog + lod_rank_table feed_fetch_method sendrecvop_rpc ${GLOB_DISTRIBUTE_DEPS} graph_to_program_pass variable_helper) + + set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") + set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + else() if(WITH_NGRAPH) if(NOT WIN32) @@ -192,7 +197,12 @@ cc_library(parallel_executor SRCS parallel_executor.cc DEPS graph build_strategy fast_threaded_ssa_graph_executor variable_helper) -cc_library(async_executor SRCS async_executor.cc data_feed.cc data_feed_factory.cc executor_thread_worker.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass async_executor_proto variable_helper) +if(WITH_PSLIB) + cc_library(async_executor SRCS async_executor.cc data_feed.cc data_feed_factory.cc executor_thread_worker.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass async_executor_proto variable_helper pslib_brpc pslib) +else() + cc_library(async_executor SRCS async_executor.cc data_feed.cc data_feed_factory.cc executor_thread_worker.cc DEPS op_registry device_context scope framework_proto glog lod_rank_table feed_fetch_method graph_to_program_pass async_executor_proto variable_helper) +endif(WITH_PSLIB) + cc_test(data_feed_test SRCS data_feed_test.cc DEPS async_executor) cc_library(prune SRCS prune.cc DEPS framework_proto) diff --git a/paddle/fluid/framework/async_executor.cc b/paddle/fluid/framework/async_executor.cc index afb2dd2f064384da39904f6aceead4fa915a80f2..ee3c5e01f87eeb123f43f867296e35cc8adb7e8e 100644 --- a/paddle/fluid/framework/async_executor.cc +++ b/paddle/fluid/framework/async_executor.cc @@ -29,6 +29,9 @@ limitations under the License. */ #include "paddle/fluid/inference/io.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/pybind/pybind.h" +#ifdef PADDLE_WITH_PSLIB +#include +#endif namespace paddle { namespace framework { @@ -47,6 +50,11 @@ void AsyncExecutor::CreateThreads( worker->SetDataFeed(reader); worker->SetFetchVarNames(fetch_var_names); worker->BindingDataFeedMemory(); +#ifdef PADDLE_WITH_PSLIB + worker->SetPSlibPtr(_pslib_ptr); + worker->SetPullDenseThread(_pull_dense_thread); + worker->SetParamConfig(&_param_config); +#endif } void PrepareReaders(std::vector>& readers, // NOLINT @@ -60,12 +68,177 @@ void PrepareReaders(std::vector>& readers, // NOLINT readers[0]->SetFileList(filelist); } +#ifdef PADDLE_WITH_PSLIB +void AsyncExecutor::InitServer(const std::string& dist_desc, int index) { + _pslib_ptr = std::shared_ptr( + new paddle::distributed::PSlib()); + _pslib_ptr->init_server(dist_desc, index); + InitParamConfig(); +} + +void AsyncExecutor::InitWorker(const std::string& dist_desc, + const std::vector& host_sign_list, + int node_num, int index) { + _pslib_ptr = std::shared_ptr( + new paddle::distributed::PSlib()); + _pslib_ptr->init_worker( + dist_desc, const_cast(host_sign_list.data()), node_num, index); + + InitParamConfig(); +} + +uint64_t AsyncExecutor::StartServer() { return _pslib_ptr->run_server(); } + +void AsyncExecutor::StopServer() { _pslib_ptr->stop_server(); } + +void AsyncExecutor::GatherServers(const std::vector& host_sign_list, + int node_num) { + _pslib_ptr->gather_servers(const_cast(host_sign_list.data()), + node_num); +} + +void AsyncExecutor::InitParamConfig() { + for (int i = 0; i < _pslib_ptr->get_param() + ->server_param() + .downpour_server_param() + .downpour_table_param_size(); + ++i) { + if (_pslib_ptr->get_param() + ->server_param() + .downpour_server_param() + .downpour_table_param(i) + .table_class() + .find("SparseTable") != -1) { + _param_config.fea_dim = _pslib_ptr->get_param() + ->server_param() + .downpour_server_param() + .downpour_table_param(i) + .accessor() + .fea_dim(); + break; + } + } + _param_config.slot_dim = _param_config.fea_dim - 2; + _param_config.tmp_push_dense_wait_times = static_cast( + _pslib_ptr->get_param()->trainer_param().push_dense_per_batch()); + _param_config.tmp_push_sparse_wait_times = static_cast( + _pslib_ptr->get_param()->trainer_param().push_sparse_per_batch()); + + for (auto t = 0u; t < _pslib_ptr->get_param()->trainer_param().skip_op_size(); + ++t) { + _param_config.skip_op.push_back( + _pslib_ptr->get_param()->trainer_param().skip_op(t)); + } + + for (auto t = 0u; + t < _pslib_ptr->get_param()->trainer_param().sparse_table_size(); ++t) { + auto& table = _pslib_ptr->get_param()->trainer_param().sparse_table(t); + std::vector tmp_sparse_variable_name; + for (int i = 0u; i < table.slot_value_size(); ++i) { + tmp_sparse_variable_name.push_back(table.slot_value(i)); + _param_config.slot_alias_to_table[table.slot_key(i)] = table.table_id(); + } + std::vector tmp_sparse_gradient_variable_name; + for (auto i = 0u; i < table.slot_gradient_size(); ++i) { + tmp_sparse_gradient_variable_name.push_back(table.slot_gradient(i)); + } + _param_config.slot_input_vec[table.table_id()] = + std::move(tmp_sparse_variable_name); + _param_config.gradient_var[table.table_id()] = + std::move(tmp_sparse_gradient_variable_name); + _param_config.sparse_table_id.push_back(table.table_id()); + } + + for (auto t = 0u; + t < _pslib_ptr->get_param()->trainer_param().dense_table_size(); ++t) { + auto& table = _pslib_ptr->get_param()->trainer_param().dense_table(t); + std::vector tmp_dense_variable_name; + for (int i = 0u; i < table.dense_variable_name_size(); ++i) { + tmp_dense_variable_name.push_back(table.dense_variable_name(i)); + } + std::vector tmp_dense_gradient_variable_name; + for (auto i = 0u; i < table.dense_gradient_variable_name_size(); ++i) { + tmp_dense_gradient_variable_name.push_back( + table.dense_gradient_variable_name(i)); + } + _param_config.dense_variable_name[table.table_id()] = + std::move(tmp_dense_variable_name); + _param_config.dense_gradient_variable_name[table.table_id()] = + std::move(tmp_dense_gradient_variable_name); + _param_config.dense_table_id.push_back(table.table_id()); + _param_config.dense_table_size.push_back(table.fea_dim()); + } +} + +void AsyncExecutor::InitModel() { + for (auto table_id : _param_config.dense_table_id) { + std::vector regions; + for (auto& t : _param_config.dense_variable_name[table_id]) { + Variable* var = root_scope_->FindVar(t); + CHECK(var != nullptr) << "var[" << t << "] not found"; + LoDTensor* tensor = var->GetMutable(); + + float* g = tensor->data(); + CHECK(g != nullptr) << "var[" << t << "] value not initialized"; + + float init_range = 0.2; + int rown = tensor->dims()[0]; + init_range /= sqrt(rown); + + std::normal_distribution ndistr(0.0, 1.0); + for (auto i = 0u; i < tensor->numel(); ++i) { + g[i] = ndistr(local_random_engine()) * init_range; + } + + paddle::ps::Region reg(g, tensor->numel()); + regions.emplace_back(std::move(reg)); + } + + auto push_status = _pslib_ptr->_worker_ptr->push_dense_param( + regions.data(), regions.size(), table_id); + push_status.wait(); + auto status = push_status.get(); + if (status != 0) { + LOG(FATAL) << "push dense param failed, status[" << status << "]"; + exit(-1); + } + } +} + +void AsyncExecutor::SaveModel(const std::string& path) { + auto ret = _pslib_ptr->_worker_ptr->flush(); + ret.wait(); + ret = _pslib_ptr->_worker_ptr->save(path, 0); + ret.wait(); + int32_t feasign_cnt = ret.get(); + if (feasign_cnt == -1) { // (colourful-tree) TODO should be feasign_cnt < 0 + LOG(FATAL) << "save model failed"; + exit(-1); + } +} + +void AsyncExecutor::PrepareDenseThread(const std::string& mode) { + if (mode == "mpi") { + DensePullThreadParam param; + param.ps_client = _pslib_ptr->_worker_ptr; + param.threshold = 1; + param.training_thread_num = actual_thread_num; + param.root_scope = root_scope_; + param.dense_params = &_param_config.dense_variable_name; + + _pull_dense_thread = + std::shared_ptr(new DensePullThread(param)); + _pull_dense_thread->start(); + } +} +#endif + void AsyncExecutor::RunFromFile(const ProgramDesc& main_program, const std::string& data_feed_desc_str, const std::vector& filelist, const int thread_num, const std::vector& fetch_var_names, - const bool debug) { + const std::string& mode, const bool debug) { std::vector threads; auto& block = main_program.Block(0); @@ -82,7 +255,7 @@ void AsyncExecutor::RunFromFile(const ProgramDesc& main_program, google::protobuf::TextFormat::ParseFromString(data_feed_desc_str, &data_feed_desc); - int actual_thread_num = thread_num; + actual_thread_num = thread_num; int file_cnt = filelist.size(); PADDLE_ENFORCE(file_cnt > 0, "File list cannot be empty"); @@ -106,11 +279,21 @@ void AsyncExecutor::RunFromFile(const ProgramDesc& main_program, // todo: should be factory method for creating datafeed std::vector> readers; PrepareReaders(readers, actual_thread_num, data_feed_desc, filelist); - +#ifdef PADDLE_WITH_PSLIB + PrepareDenseThread(mode); +#endif std::vector> workers; workers.resize(actual_thread_num); for (auto& worker : workers) { +#ifdef PADDLE_WITH_PSLIB + if (mode == "mpi") { + worker.reset(new AsyncExecutorThreadWorker); + } else { + worker.reset(new ExecutorThreadWorker); + } +#else worker.reset(new ExecutorThreadWorker); +#endif } // prepare thread resource here @@ -128,7 +311,11 @@ void AsyncExecutor::RunFromFile(const ProgramDesc& main_program, for (auto& th : threads) { th.join(); } - +#ifdef PADDLE_WITH_PSLIB + if (mode == "mpi") { + _pull_dense_thread->stop(); + } +#endif root_scope_->DropKids(); return; diff --git a/paddle/fluid/framework/async_executor.h b/paddle/fluid/framework/async_executor.h index f4d2a79ac592e02f49ec0b988c824dc98883fbf6..95c8472b2f3b6b0c2d95fcf0c0b6f00e7f39b032 100644 --- a/paddle/fluid/framework/async_executor.h +++ b/paddle/fluid/framework/async_executor.h @@ -14,9 +14,11 @@ limitations under the License. */ #pragma once +#include #include #include -#include // NOLINT +#include // NOLINT +#include // local_random_engine #include #include #include // NOLINT @@ -30,6 +32,31 @@ limitations under the License. */ namespace paddle { namespace framework { + +inline double current_realtime() { +#if !defined(_WIN32) + struct timespec tp; + clock_gettime(CLOCK_REALTIME, &tp); + return tp.tv_sec + tp.tv_nsec * 1e-9; +#else + return 0.0; +#endif +} + +inline std::default_random_engine& local_random_engine() { + struct engine_wrapper_t { + std::default_random_engine engine; + engine_wrapper_t() { + static std::atomic x(0); + std::seed_seq sseq = {x++, x++, x++, + static_cast(current_realtime() * 1000)}; + engine.seed(sseq); + } + }; + thread_local engine_wrapper_t r; + return r.engine; +} + class AsyncExecutor { public: AsyncExecutor(Scope* scope, const platform::Place& place); @@ -39,7 +66,19 @@ class AsyncExecutor { const std::vector& filelist, const int thread_num, const std::vector& fetch_names, - const bool debug = false); + const std::string& mode, const bool debug = false); +#ifdef PADDLE_WITH_PSLIB + void InitServer(const std::string& dist_desc, int index); + void InitWorker(const std::string& dist_desc, + const std::vector& host_sign_list, int node_num, + int index); + uint64_t StartServer(); + void StopServer(); + void GatherServers(const std::vector& host_sign_list, int node_num); + void InitModel(); + void SaveModel(const std::string& path); + void InitParamConfig(); +#endif private: void CreateThreads(ExecutorThreadWorker* worker, @@ -48,10 +87,21 @@ class AsyncExecutor { const std::vector& fetch_var_names, Scope* root_scope, const int thread_index, const bool debug); +#ifdef PADDLE_WITH_PSLIB + void PrepareDenseThread(const std::string& mode); +#endif public: +#ifdef PADDLE_WITH_PSLIB + std::shared_ptr _pslib_ptr; + std::shared_ptr _pull_dense_thread; + AsyncWorkerParamConfig _param_config; +#endif Scope* root_scope_; platform::Place place_; + + private: + int actual_thread_num; }; } // namespace framework diff --git a/paddle/fluid/framework/data_feed.cc b/paddle/fluid/framework/data_feed.cc index a99cf53b410433c6e4b8a19821779f28c25e678f..41155cfb7714b10fa51bc56fc90af4ee3d8b4a1a 100644 --- a/paddle/fluid/framework/data_feed.cc +++ b/paddle/fluid/framework/data_feed.cc @@ -64,6 +64,7 @@ bool DataFeed::PickOneFile(std::string* filename) { return false; } *filename = filelist_[file_idx_++]; + LOG(ERROR) << "pick file:" << *filename; return true; } diff --git a/paddle/fluid/framework/data_layout_transform.cc b/paddle/fluid/framework/data_layout_transform.cc index 5467f6d1b23c0058f06387e3da97c4193dd5ca6c..72c50518af08b9c1b2f97e6864e5836e806c77fc 100644 --- a/paddle/fluid/framework/data_layout_transform.cc +++ b/paddle/fluid/framework/data_layout_transform.cc @@ -85,7 +85,7 @@ void TransDataLayout(const OpKernelType& kernel_type_for_var, out->mutable_data(expected_kernel_type.place_, in.type()); framework::VisitDataType( - framework::ToDataType(in.type()), + in.type(), CastDataLayout(pool.Get(expected_kernel_type.place_), axis, in, out)); out->set_layout(expected_kernel_type.data_layout_); @@ -101,7 +101,7 @@ void* GetDataFromTensor(const Tensor& tensor, mkldnn::memory::data_type type) { case mkldnn::memory::data_type::f32: return platform::to_void_cast(tensor.data()); case mkldnn::memory::data_type::s8: - return platform::to_void_cast(tensor.data()); + return platform::to_void_cast(tensor.data()); case mkldnn::memory::data_type::u8: return platform::to_void_cast(tensor.data()); case mkldnn::memory::data_type::s16: @@ -144,7 +144,7 @@ void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var, memory::data_type in_type = ToMKLDNNDataType(in.type()); PADDLE_ENFORCE(in_type != memory::data_type::data_undef, - "Input tensor type is not supported: ", in.type().name()); + "Input tensor type is not supported: %s", in.type()); memory::data_type out_type = in_type; auto in_format = platform::MKLDNNFormatForSize(in_tz.size(), in.format()); diff --git a/paddle/fluid/framework/data_layout_transform.h b/paddle/fluid/framework/data_layout_transform.h index 90bb206ec6b698bc23ad1a5c9609a25186ec6de8..2479de4fd46802148af09d34b627a8804276cacf 100644 --- a/paddle/fluid/framework/data_layout_transform.h +++ b/paddle/fluid/framework/data_layout_transform.h @@ -50,14 +50,14 @@ inline DataLayout ToPaddleLayout(const MKLDNNFormat& format) { } } -inline MKLDNNDataType ToMKLDNNDataType(const std::type_index type) { - static const std::map dict{ - {std::type_index(typeid(float)), MKLDNNDataType::f32}, // NOLINT - {std::type_index(typeid(char)), MKLDNNDataType::s8}, // NOLINT - {std::type_index(typeid(unsigned char)), MKLDNNDataType::u8}, - {std::type_index(typeid(int16_t)), MKLDNNDataType::s16}, - {std::type_index(typeid(int32_t)), MKLDNNDataType::s32}}; - auto iter = dict.find(type); +inline MKLDNNDataType ToMKLDNNDataType(proto::VarType::Type type) { + static std::unordered_map dict{ + {DataTypeTrait::DataType, MKLDNNDataType::f32}, + {DataTypeTrait::DataType, MKLDNNDataType::s8}, + {DataTypeTrait::DataType, MKLDNNDataType::u8}, + {DataTypeTrait::DataType, MKLDNNDataType::s16}, + {DataTypeTrait::DataType, MKLDNNDataType::s32}}; + auto iter = dict.find(static_cast(type)); if (iter != dict.end()) return iter->second; return MKLDNNDataType::data_undef; } diff --git a/paddle/fluid/framework/data_type.cc b/paddle/fluid/framework/data_type.cc index 28f3da88fa18021f6b71e458fdb467be86d4dbf0..a0248cf3c75690fb9ec3fcc22596af245d042d80 100644 --- a/paddle/fluid/framework/data_type.cc +++ b/paddle/fluid/framework/data_type.cc @@ -26,7 +26,7 @@ struct DataTypeMap { std::unordered_map cpp_to_proto_; std::unordered_map proto_to_cpp_; std::unordered_map proto_to_str_; - std::unordered_map cpp_to_size_; + std::unordered_map proto_to_size_; }; static DataTypeMap* InitDataTypeMap(); @@ -45,7 +45,7 @@ static inline void RegisterType(DataTypeMap* map, map->proto_to_cpp_.emplace(static_cast(proto_type), typeid(T)); map->cpp_to_proto_.emplace(typeid(T), proto_type); map->proto_to_str_.emplace(static_cast(proto_type), name); - map->cpp_to_size_.emplace(typeid(T), sizeof(T)); + map->proto_to_size_.emplace(static_cast(proto_type), sizeof(T)); } static DataTypeMap* InitDataTypeMap() { @@ -54,17 +54,7 @@ static DataTypeMap* InitDataTypeMap() { #define RegType(cc_type, proto_type) \ RegisterType(retv, proto_type, #cc_type) - // NOTE: Add your customize type here. - RegType(float16, proto::VarType::FP16); - RegType(float, proto::VarType::FP32); - RegType(double, proto::VarType::FP64); - RegType(int, proto::VarType::INT32); - RegType(int64_t, proto::VarType::INT64); - RegType(bool, proto::VarType::BOOL); - RegType(size_t, proto::VarType::SIZE_T); - RegType(int16_t, proto::VarType::INT16); - RegType(uint8_t, proto::VarType::UINT8); - RegType(int8_t, proto::VarType::INT8); + _ForEachDataType_(RegType); #undef RegType return retv; @@ -96,12 +86,12 @@ std::string DataTypeToString(const proto::VarType::Type type) { static_cast(type)); } -size_t SizeOfType(std::type_index type) { - auto it = gDataTypeMap().cpp_to_size_.find(type); - if (it != gDataTypeMap().cpp_to_size_.end()) { +size_t SizeOfType(proto::VarType::Type type) { + auto it = gDataTypeMap().proto_to_size_.find(static_cast(type)); + if (it != gDataTypeMap().proto_to_size_.end()) { return it->second; } - PADDLE_THROW("Not support %s as tensor type", type.name()); + PADDLE_THROW("Not support %s as tensor type", DataTypeToString(type)); } } // namespace framework diff --git a/paddle/fluid/framework/data_type.h b/paddle/fluid/framework/data_type.h index d5be43b33edab7871e1bba930a4fc6cd1e293825..76df78ea5e17c7eaf1e8ce7a7dc2282a5a4ed579 100644 --- a/paddle/fluid/framework/data_type.h +++ b/paddle/fluid/framework/data_type.h @@ -22,46 +22,59 @@ limitations under the License. */ namespace paddle { namespace framework { +template +struct DataTypeTrait {}; + +// Stub handle for void +template <> +struct DataTypeTrait { + constexpr static auto DataType = proto::VarType::RAW; +}; + +#define _ForEachDataTypeHelper_(callback, cpp_type, proto_type) \ + callback(cpp_type, ::paddle::framework::proto::VarType::proto_type); + +#define _ForEachDataType_(callback) \ + _ForEachDataTypeHelper_(callback, float, FP32); \ + _ForEachDataTypeHelper_(callback, ::paddle::platform::float16, FP16); \ + _ForEachDataTypeHelper_(callback, double, FP64); \ + _ForEachDataTypeHelper_(callback, int, INT32); \ + _ForEachDataTypeHelper_(callback, int64_t, INT64); \ + _ForEachDataTypeHelper_(callback, bool, BOOL); \ + _ForEachDataTypeHelper_(callback, uint8_t, UINT8); \ + _ForEachDataTypeHelper_(callback, int16_t, INT16); \ + _ForEachDataTypeHelper_(callback, int8_t, INT8) + +#define DefineDataTypeTrait(cpp_type, proto_type) \ + template <> \ + struct DataTypeTrait { \ + constexpr static auto DataType = proto_type; \ + } + +_ForEachDataType_(DefineDataTypeTrait); + +#undef DefineDataTypeTrait + extern proto::VarType::Type ToDataType(std::type_index type); extern std::type_index ToTypeIndex(proto::VarType::Type type); template inline void VisitDataType(proto::VarType::Type type, Visitor visitor) { - switch (type) { - case proto::VarType::FP16: - visitor.template apply(); - break; - case proto::VarType::FP32: - visitor.template apply(); - break; - case proto::VarType::FP64: - visitor.template apply(); - break; - case proto::VarType::INT32: - visitor.template apply(); - break; - case proto::VarType::INT64: - visitor.template apply(); - break; - case proto::VarType::BOOL: - visitor.template apply(); - break; - case proto::VarType::UINT8: - visitor.template apply(); - break; - case proto::VarType::INT16: - visitor.template apply(); - break; - case proto::VarType::INT8: - visitor.template apply(); - break; - default: - PADDLE_THROW("Not supported %d", type); - } +#define VisitDataTypeCallback(cpp_type, proto_type) \ + do { \ + if (type == proto_type) { \ + visitor.template apply(); \ + return; \ + } \ + } while (0) + + _ForEachDataType_(VisitDataTypeCallback); +#undef VisitDataTypeCallback + PADDLE_THROW("Not supported %d", type); } extern std::string DataTypeToString(const proto::VarType::Type type); -extern size_t SizeOfType(std::type_index type); +extern size_t SizeOfType(proto::VarType::Type type); inline std::ostream& operator<<(std::ostream& out, const proto::VarType::Type& type) { out << DataTypeToString(type); diff --git a/paddle/fluid/framework/data_type_test.cc b/paddle/fluid/framework/data_type_test.cc index 54c41c55ba63c0b2001cfcb6a9e94fbb0036d437..2a380201f297f42dd82a6809bef9a72660066819 100644 --- a/paddle/fluid/framework/data_type_test.cc +++ b/paddle/fluid/framework/data_type_test.cc @@ -26,15 +26,15 @@ TEST(DataType, float16) { Tensor tensor; CPUPlace cpu; - tensor.mutable_data(cpu, f::ToTypeIndex(dtype)); + tensor.mutable_data(cpu, dtype); // test fp16 tensor - EXPECT_EQ(tensor.type(), std::type_index(typeid(float16))); + EXPECT_EQ(tensor.type(), f::ToDataType(typeid(float16))); // test fp16 size - EXPECT_EQ(f::SizeOfType(f::ToTypeIndex(dtype)), 2u); + EXPECT_EQ(f::SizeOfType(dtype), 2u); // test debug info - std::string type = "float16"; + std::string type = "::paddle::platform::float16"; EXPECT_STREQ(f::DataTypeToString(dtype).c_str(), type.c_str()); } diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index a927a3afcddb52f571543462e485b682aac163ae..63a68ba3a5c289be7c2d352717fe5911539df8a7 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -12,12 +12,19 @@ cc_library(multi_devices_graph_check_pass SRCS multi_devices_graph_check_pass.cc cc_library(variable_visitor SRCS variable_visitor.cc DEPS lod_tensor selected_rows) +if(WITH_DISTRIBUTE) + if(NOT WITH_GRPC) + set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") + set_source_files_properties(reduce_op_handle.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + endif() +endif() + if(WITH_GPU) nv_library(all_reduce_op_handle SRCS all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory dynload_cuda variable_visitor) if(WITH_DISTRIBUTE) nv_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope - ddim dynload_cuda selected_rows_functor sendrecvop_grpc) + ddim dynload_cuda selected_rows_functor sendrecvop_rpc) else() nv_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope ddim dynload_cuda selected_rows_functor) @@ -30,7 +37,7 @@ else() variable_visitor) if(WITH_DISTRIBUTE) cc_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope - ddim selected_rows_functor sendrecvop_grpc) + ddim selected_rows_functor sendrecvop_rpc) else() cc_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope ddim selected_rows_functor) @@ -43,8 +50,10 @@ cc_library(data_balance_op_handle SRCS data_balance_op_handle.cc DEPS op_handle_ cc_library(gather_op_handle SRCS gather_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor) cc_library(fuse_vars_op_handle SRCS fuse_vars_op_handle.cc DEPS op_handle_base scope) +cc_library(memory_optimize_pass SRCS analysis_var_pass.cc memory_reuse_types.cc DEPS graph graph_helper pass) cc_library(modify_op_lock_and_record_event_pass SRCS modify_op_lock_and_record_event_pass.cc DEPS computation_op_handle op_graph_view multi_devices_helper) - +cc_library(memory_early_delete_pass SRCS memory_early_delete_pass.cc DEPS memory_optimize_pass computation_op_handle scale_loss_grad_op_handle rpc_op_handle + all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle graph graph_helper pass) cc_library(reference_count_pass_helper SRCS reference_count_pass_helper.cc DEPS garbage_collector computation_op_handle) cc_library(eager_deletion_op_handle SRCS eager_deletion_op_handle.cc DEPS lod_tensor selected_rows reference_count_pass_helper) cc_library(eager_deletion_pass SRCS eager_deletion_pass.cc DEPS computation_op_handle eager_deletion_op_handle graph graph_helper pass) @@ -56,7 +65,12 @@ cc_library(all_reduce_deps_pass SRCS all_reduce_deps_pass.cc DEPS graph graph_he cc_library(multi_devices_graph_pass SRCS multi_devices_graph_pass.cc DEPS multi_devices_helper computation_op_handle scale_loss_grad_op_handle rpc_op_handle all_reduce_op_handle reduce_op_handle broadcast_op_handle data_balance_op_handle fused_broadcast_op_handle) -set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto sequential_execution_pass modify_op_lock_and_record_event_pass all_reduce_deps_pass reference_count_pass eager_deletion_pass) +set(SSA_GRAPH_EXECUTOR_DEPS graph framework_proto sequential_execution_pass modify_op_lock_and_record_event_pass all_reduce_deps_pass reference_count_pass eager_deletion_pass memory_optimize_pass memory_early_delete_pass) +if (WITH_GPU) + list(APPEND SSA_GRAPH_EXECUTOR_DEPS reference_count_pass) +endif() +cc_test(memory_reuse_types_test SRCS memory_reuse_types_test.cc memory_reuse_types.cc DEPS framework_proto graph) +cc_test(analysis_var_pass_test SRCS analysis_var_pass_test.cc analysis_var_pass.cc memory_reuse_types.cc DEPS framework_proto graph graph_helper op_registry pass) cc_library(ssa_graph_executor SRCS ssa_graph_executor.cc DEPS ${SSA_GRAPH_EXECUTOR_DEPS}) @@ -77,4 +91,5 @@ cc_test(fused_broadcast_op_test SRCS fused_broadcast_op_handle_test.cc DEPS fuse cc_library(build_strategy SRCS build_strategy.cc DEPS graph_viz_pass multi_devices_graph_pass multi_devices_graph_print_pass multi_devices_graph_check_pass - fuse_elewise_add_act_pass multi_batch_merge_pass) + fuse_elewise_add_act_pass multi_batch_merge_pass + memory_optimize_pass) diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.cc b/paddle/fluid/framework/details/all_reduce_op_handle.cc index e8bf53e160e7382122c3c2f92a152fea058a032e..9eaff1f560147ad053ac599cf141be8a66a5c353 100644 --- a/paddle/fluid/framework/details/all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc @@ -127,7 +127,7 @@ void AllReduceOpHandle::RunImpl() { // Reduce All Tensor to trg in CPU ReduceLoDTensor func(lod_tensors, &trg); - VisitDataType(ToDataType(lod_tensors[0]->type()), func); + VisitDataType(lod_tensors[0]->type(), func); for (size_t i = 1; i < local_scopes_.size(); ++i) { auto &scope = diff --git a/paddle/fluid/framework/details/analysis_var_pass.cc b/paddle/fluid/framework/details/analysis_var_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..223b9da3cfba33fc32d1334cddccb9f503bd0bef --- /dev/null +++ b/paddle/fluid/framework/details/analysis_var_pass.cc @@ -0,0 +1,656 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/details/analysis_var_pass.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "gflags/gflags.h" +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_helper.h" + +DEFINE_bool(enable_subgraph_optimize, false, + "SubGraph also reuse global graph variables, it will reduce the " + "memory occupation" + "but a higher risk of memory reuse error. default disabled."); +DEFINE_string(memory_optimize_debug, "", + "debug the operator output variable when do the variable reuse." + "memory reuse pass." + "only for debug, default disabled."); + +namespace paddle { +namespace framework { +namespace details { + +static inline bool IsSameDesc(OpDesc* op1, OpDesc* op2) { + return op1->Type() == op2->Type() && op1->Inputs() == op2->Inputs() && + op1->Outputs() == op2->Outputs(); +} + +template +class FilterVariableImpl { + public: + void operator()(const Container& nodes, Callback callback) { + for (auto* node : nodes) { + callback(node); + } + } +}; + +// filter var node for op->inputs/outputs +template +class FilterVariableImpl, Callback> { + public: + void operator()(const std::vector& nodes, Callback callback) { + for (auto* var : nodes) { + if (var->IsVar() && !var->IsCtrlVar()) { + callback(var); + } + } + } +}; + +template +void FilterVariables(const Container& nodes, Callback callback) { + FilterVariableImpl()(nodes, callback); +} + +std::unique_ptr AnalysisVarPass::ApplyImpl( + std::unique_ptr graph) const { + auto nodes = graph->Nodes(); + auto subblock_vars = GetSubBlockVars(nodes); + skip_set_.insert(subblock_vars.begin(), subblock_vars.end()); + + cfg_.reset(new details::ControlFlowGraph(*graph)); + cfg_->LiveVariableAnalysis(); + InitSSAGraphNodes(); + + int reuse_id = 0; + for (size_t idx = 0; idx < cfg_->Ops().size(); ++idx) { + auto& op = cfg_->Ops()[idx]; + auto* op_desc = op->Op(); + // some op in graph has no op desc + if (op_desc == nullptr) continue; + if (OpHasSubBlock(op_desc)) { + if (FLAGS_enable_subgraph_optimize) { + SubGraphOptimize(op_desc); + } else { + VLOG(3) << op->Name() + << " has subblock, but disable subgraph optimize. skipped."; + continue; + } + } + + for (auto& var : op->outputs) { + if (NodeCanReused(var) && cfg_->Use(op).count(var->Name()) == 0) { + ir::Node* cache = pool_.NodeMatch(var); + if (var->Name() == FLAGS_memory_optimize_debug) { + VLOG(3) << "start match var " << DebugString(var) << " of op " + << op->Name(); + VLOG(3) << pool_.ToString(); + VLOG(3) << "matched in pool : " + << ((cache == nullptr) ? "False" : "True"); + } + if (cache != nullptr) { + if (var->Name() == cache->Name()) { + VLOG(3) << "The same cache variable is cascade reused." + << var->Name() << " is re-filled to the pool after" + << "the reused op is finished. Current op can not " + << "replace it again. Skip this candidate."; + continue; + } + + int node_idx_in_pool = pool_.GetIndex(cache); + VLOG(3) << string::Sprintf( + "!!! %s, %s => %s, cache idx %d, pool size %d", + std::to_string(reuse_id++), DebugString(var), DebugString(cache), + node_idx_in_pool, static_cast(pool_.size())); + // update CFG Graph on the fly. + // reused var maybe re-fill into the pool + cfg_->RenameVarInCFGGraph(var->Name(), cache->Name(), idx); + // NOTE(dzhwinter): we need to both update the ProgramDesc + // and IR Graph. because op_desc/var_desc is used in CreateOp, + // CreateVar when running happens. But IR Graph + // define the dependence relationship between nodes. + RenameVarInGraphDesc(var->Name(), cache->Name(), idx); + RenameVarInGraphNode(var->Name(), cache->Name(), idx, graph.get()); + + pool_.Erase(cache); + } + } + } + // fill the pool + for (auto var : cfg_->LiveIn(op)) { + if (cfg_->LiveOut(op).count(var) == 0) { + ir::Node* var_node = cfg_->GetNodeFromVarName(var, op); + if (var_node == nullptr) continue; + if (NodeCanReused(var_node) && !pool_.Has(var_node)) { + pool_.Insert(var_node, op); + } + } + } + } + graph->ResolveHazard(var_nodes_); + + // For early delete pass. use GraphNodePool load the unlived vars. + // 1. find all deps op for each unlived var in memory pool. + for (auto& op : graph->Nodes()) { + for (auto& var : op->inputs) { + if (pool_.Has(var)) { + pool_.Insert(var, op); + } + } + } + // 2. convert ir node based memory pool to graph node + // because Node* maybe released bettwen passes. + auto& graph_pool = graph->Get(kGraphNodePool); + for (auto it = pool_.begin(); it != pool_.end(); ++it) { + std::unordered_set descs; + for (auto& op : it->second) { + PADDLE_ENFORCE(op->IsOp()); + descs.insert(op->Op()); + } + graph_pool.push_back(std::make_pair(it->first->Name(), descs)); + } + + return graph; +} + +void AnalysisVarPass::SubGraphOptimize(OpDesc* op_desc) const { + // conditional block, while op and their grad op + auto* sub_block_desc = + AttrReader(op_desc->GetAttrMap()).Get("sub_block"); + + // create a mirror block to construct an IR Graph. + ProgramDesc prog; + auto* copy_block = prog.MutableBlock(0); + for (auto* op : sub_block_desc->AllOps()) { + auto* copy_op = copy_block->AppendOp(); + copy_op->CopyFrom(*op); + copy_op->Flush(); + } + + for (auto* var : sub_block_desc->AllVars()) { + auto* copy_var = copy_block->Var(var->Name()); + copy_var->SetDataType(var->GetDataType()); + // only lod tensor can be reused. So ignore the multiple dims case. + copy_var->SetType(var->GetType()); + copy_var->SetShape(var->GetShape()); + copy_var->SetPersistable(var->Persistable()); + } + + ir::Graph sub_graph(prog); + std::unordered_set sub_graph_all_ops; + FilterVariables(sub_graph.Nodes(), [&](ir::Node* var) { + // sub_graph_all_ops.emplace(var); + if (var->IsVar() && !var->IsCtrlVar()) { + sub_graph_all_ops.emplace(var); + } + }); + int sub_reuse_id = 0; + // subgraph nodes is unordered, reuse need to follow the desc order. + // find the right op node through the descs + for (auto* sub_op_desc : sub_block_desc->AllOps()) { + ir::Node* sub_op = nullptr; + for (auto* node : sub_graph_all_ops) { + if (node->Op() == sub_op_desc) { + sub_op = node; + break; + } + } + PADDLE_ENFORCE(sub_op != nullptr); + for (auto* var : sub_op->outputs) { + if (NodeCanReused(var)) { + ir::Node* cache = pool_.NodeMatch(var); + if (cache != nullptr) { + if (var->Var()->GetDataType() != cache->Var()->GetDataType()) { + continue; + } + int node_idx_in_pool = pool_.GetIndex(cache); + VLOG(3) << string::Sprintf( + "!!! %s, %s => %s, cache idx %d, pool size %d", + std::to_string(sub_reuse_id++), DebugString(var), + DebugString(cache), node_idx_in_pool, + static_cast(pool_.size())); + // NOTE(dzh): subblock is not in IR graph. Modify the block_desc + // immediately to make the subblock variable reuse strategy take + // effect. Because it is a single op in graph. No need to + // update the ir nodes. + sub_op_desc->Rename(var->Name(), cache->Name()); + if (sub_op_desc->Block()->HasVar(var->Name())) { + sub_op_desc->Block()->RemoveVar(var->Name()); + } + } + } + } + } +} + +std::unordered_set AnalysisVarPass::GetSubBlockVars( + const std::unordered_set& nodes) const { + std::unordered_set vars; + for (auto& op : nodes) { + if (!op->IsOp() || op->Op() == nullptr) continue; + auto* op_desc = op->Op(); + if (OpHasSubBlock(op_desc)) { + auto inputs = op_desc->InputArgumentNames(); + auto outputs = op_desc->OutputArgumentNames(); + vars.insert(inputs.begin(), inputs.end()); + vars.insert(outputs.begin(), outputs.end()); + } + } + return vars; +} + +void AnalysisVarPass::RenameVarInGraphDesc(const std::string& var, + const std::string& cache_var, + size_t idx) const { + for (size_t i = idx; i < cfg_->Ops().size(); ++i) { + auto* op = cfg_->Ops()[i]; + PADDLE_ENFORCE(op->IsOp() && op->Op()); + auto* op_desc = op->Op(); + op_desc->RenameInput(var, cache_var); + op_desc->RenameOutput(var, cache_var); + if (op_desc->Block()->HasVar(var)) op_desc->Block()->RemoveVar(var); + op_desc->Flush(); + } +} + +void AnalysisVarPass::InitSSAGraphNodes() const { + std::unordered_map> all_vars; + if (var_nodes_.empty()) { + for (auto* op : cfg_->Ops()) { + for (auto* node : op->inputs) { + if (all_vars[node->Name()].count(node) == 0) { + all_vars[node->Name()].emplace(node); + var_nodes_[node->Name()].emplace_back(node); + } + } + for (auto* node : op->outputs) { + if (all_vars[node->Name()].count(node) == 0) { + all_vars[node->Name()].emplace(node); + var_nodes_[node->Name()].emplace_back(node); + } + } + } + } +} + +void AnalysisVarPass::RenameVarInGraphNode(const std::string& var, + const std::string& cache_var, + size_t idx, ir::Graph* graph) const { + // if replace happens, we need to create a newer version cache_var + // but use the same dims/data_type with var. + PADDLE_ENFORCE(var_nodes_[var].size() >= 1 && + var_nodes_[var].at(0)->Var() != nullptr); + std::unique_ptr var_desc(new VarDesc(*var_nodes_[var].at(0)->Var())); + var_desc->SetName(cache_var); + + for (size_t i = idx; i < cfg_->Ops().size(); ++i) { + auto* op = cfg_->Ops()[i]; + + // redirect the input to the latest version of cache_var + for (auto* node : op->inputs) { + if (node->Name() == var) { + ir::Node* cache_node = graph->CreateVarNode(var_desc.get()); + var_nodes_[cache_var].emplace_back(cache_node); + + // swap node to cache_node + cache_node->outputs.insert(cache_node->outputs.end(), + node->outputs.begin(), node->outputs.end()); + PADDLE_ENFORCE(node->inputs.size() == 1 && node->inputs[0]->IsOp()); + auto* prev_op = node->inputs[0]; + std::replace(prev_op->outputs.begin(), prev_op->outputs.end(), node, + cache_node); + cache_node->inputs.emplace_back(prev_op); + for (auto* next_op : node->outputs) { + std::replace(next_op->inputs.begin(), next_op->inputs.end(), node, + cache_node); + } + } + } + + // if we need to rename the output, + // always create a newer version of cache_var + for (auto* node : op->outputs) { + if (node->Name() == var) { + ir::Node* cache_node = graph->CreateVarNode(var_desc.get()); + var_nodes_[cache_var].emplace_back(cache_node); + + // swap node to cache node + cache_node->outputs.insert(cache_node->outputs.end(), + node->outputs.begin(), node->outputs.end()); + cache_node->inputs.emplace_back(op); + std::replace(op->outputs.begin(), op->outputs.end(), node, cache_node); + for (auto* next_op : node->outputs) { + std::replace(next_op->inputs.begin(), next_op->inputs.end(), node, + cache_node); + } + } + } + } + + // release node of unused var in graph + for (auto* node : var_nodes_[var]) { + graph->RemoveNode(node); + } + var_nodes_.at(var).clear(); +} + +bool AnalysisVarPass::NodeCanReused(ir::Node* node) const { + if (!node->IsVar() || node->IsCtrlVar()) return false; + auto* desc = node->Var(); + auto type = desc->GetType(); + if (desc->Persistable() || type != proto::VarType::LOD_TENSOR || + desc->GetShape().empty()) { + return false; + } + // vars can be @EMPTY@, @LR_DECAY_REUSE_ID@. For example, while_grad + std::string name = node->Name(); + if (!name.empty() && name[0] == '@' && name[name.size() - 1] == '@') + return false; + if (skip_set_.count(name)) return false; + for (auto* op : node->inputs) { + if (op->Op()->HasAttr("force_cpu")) { + // op output force generated in cpu, can not be reused. + return framework::AttrReader(op->Op()->GetAttrMap()) + .Get("force_cpu") == 0; + } + } + return true; +} + +bool AnalysisVarPass::OpHasSubBlock(OpDesc* desc) const { + const AttributeMap& attrs = desc->GetAttrMap(); + for (auto& attr : attrs) { + if (attr.second.type() == typeid(BlockDesc*) || // NOLINT + attr.second.type() == typeid(std::vector)) // NOLINT + return true; + } + return false; +} + +std::vector SortOpLikeDescOrder(const ir::Graph& graph) { + PADDLE_ENFORCE(graph.Has(kAllOpDescs), + "Graph has no attribute of kAllOpDescs."); + // 1. get op desc order + auto& op_descs = graph.Get>(kAllOpDescs); + + // 2. topology sort order + auto nodes = graph.Nodes(); + std::deque ops; + FilterVariables(nodes, [&](ir::Node* op) { + if (op->IsOp() && op->Op() != nullptr) { + ops.emplace_back(op); + } + }); + std::unordered_map op_deps; + std::list ready_ops; + std::unordered_map> pending_ops; + + for (auto* op : ops) { + std::unordered_set preceding_op; + for (auto* in : op->inputs) { + if (in->inputs.empty()) continue; + PADDLE_ENFORCE(in->inputs.size() == 1 && in->inputs[0]->IsOp()); + preceding_op.emplace(in->inputs[0]); + pending_ops[in->inputs[0]].emplace(op); + } + op_deps[op] = preceding_op.size(); + if (preceding_op.empty()) { + ready_ops.emplace_back(op); + } + } + + // 3. generated op list based desc order and the topology order + std::vector ret; + std::list op_descs_list(op_descs.begin(), op_descs.end()); + + auto update_by_found_node = [&](ir::Node* found_node) { + for (auto* pending_op : pending_ops[found_node]) { + if (--op_deps[pending_op] == 0) { + ready_ops.emplace_back(pending_op); + } + } + ready_ops.remove(found_node); + ret.emplace_back(found_node); + }; + + while (!ready_ops.empty()) { + bool all_of_ready_op_unmatched = true; + for (auto it = op_descs_list.begin(); it != op_descs_list.end();) { + auto op_desc = *it; + ir::Node* found_node = nullptr; + for (auto* op : ready_ops) { + if (IsSameDesc(op->Op(), op_desc)) { + found_node = op; + break; + } + } + + // 3.1 op desc deleted by other pass + if (found_node == nullptr) { + ++it; + continue; + } else { + all_of_ready_op_unmatched = false; + it = op_descs_list.erase(it); + } + update_by_found_node(found_node); + } + + // 3.2 op descs are added by other pass + // preceding op non empty means some new op descs are + // created, but not contained in return node list. + // these new op desc may depend on each other. + std::list prev_ready_ops(ready_ops); + if (all_of_ready_op_unmatched) { + for (auto op : prev_ready_ops) { + update_by_found_node(op); + } + } + } + + PADDLE_ENFORCE(std::all_of( + op_deps.begin(), op_deps.end(), + [&](const std::pair& p) { return p.second == 0; })); + + return ret; +} + +ControlFlowGraph::ControlFlowGraph(const ir::Graph& graph) { + ops_ = SortOpLikeDescOrder(graph); + ConnectNodes(); +} + +void ControlFlowGraph::BuildCFGGraph() { + // FIXME(dzh): same effect with ConnectNodes, but use the control + // link to build dependency graph, it goes wrong in transformer. + for (ir::Node* op : ops_) { + for (auto& input_var : op->inputs) { + if (!input_var->inputs.empty()) { + PADDLE_ENFORCE( + input_var->inputs.size() == 1 && input_var->inputs[0]->IsOp(), + "Preceding Op Node of Var Node must be unique"); + auto* pred_op = input_var->inputs[0]; + if (pred_op->Op() != nullptr) { + predecessors_[op].insert(pred_op); + successors_[pred_op].insert(op); + } + } + if (input_var->IsVar() && !input_var->IsCtrlVar()) { + uses_[op].insert(input_var->Name()); + } + } + for (auto& output_var : op->outputs) { + // output var may be used by many op + for (auto* succ_op : output_var->outputs) { + if (succ_op->Op() != nullptr) { + successors_[op].insert(succ_op); + predecessors_[succ_op].insert(op); + } + } + if (output_var->IsVar() && !output_var->IsCtrlVar()) { + defs_[op].insert(output_var->Name()); + } + } + } +} + +void ControlFlowGraph::ConnectNodes() { + for (size_t i = 0; i < ops_.size(); ++i) { + auto& op = ops_[i]; + try { + auto& next_op = ops_.at(i + 1); + successors_[op].insert(next_op); + predecessors_[next_op].insert(op); + } catch (...) { + // do nothing + } + + FilterVariables(op->inputs, + [&](ir::Node* var) { uses_[op].emplace(var->Name()); }); + + FilterVariables(op->outputs, + [&](ir::Node* var) { defs_[op].emplace(var->Name()); }); + } +} + +void ControlFlowGraph::LiveVariableAnalysis() { + // NOTE(dzh): variable liveless analysis (a.k.a reversed_ops algorithm) + // compute the liveness of for each variable though reversed_ops algorithm. + // It iterates the operators from end to begin, compute the live in/live out + // variable set for each op, then the diff between in/out will be used for + // the variable reuse. For detail refer to + // http://www.cs.cornell.edu/courses/cs4120/2013fa/lectures/lec26-fa13.pdf + std::list work_list(ops_.rbegin(), ops_.rend()); + while (!work_list.empty()) { + ir::Node* op = work_list.front(); + work_list.pop_front(); + // get the live_in calculated before. Empty if first. + auto prev_live_in = std::move(live_in_[op]); + for (auto& s : successors_[op]) { + for (auto& var : live_in_[s]) { + live_out_[op].insert(var); + } + } + for (auto& var : uses_[op]) { + live_in_[op].insert(var); + } + for (auto& var : live_out_[op]) { + live_in_[op].insert(var); + } + for (auto& var : defs_[op]) { + live_in_[op].erase(var); + } + + // If the live_in is not changed, then the liveness analysis of + // predecessors is completed. + // + // Otherwise, recalculate the predecessors liveness + if (live_in_[op] != prev_live_in) { + for (auto& pre : predecessors_[op]) { + work_list.push_back(pre); + } + } + } +} + +void ControlFlowGraph::RenameVarInCFGGraph(const std::string& old_node, + const std::string& new_node, + int begin_idx) { + // update graph from begin idx to the end + for (size_t i = begin_idx; i != ops_.size(); ++i) { + auto* op = ops_[i]; + if (uses_[op].find(old_node) != uses_[op].end()) { + uses_[op].erase(old_node); + uses_[op].insert(new_node); + } + if (defs_[op].find(old_node) != defs_[op].end()) { + defs_[op].erase(old_node); + defs_[op].insert(new_node); + } + if (live_in_[op].find(old_node) != live_in_[op].end()) { + live_in_[op].erase(old_node); + live_in_[op].insert(new_node); + } + if (live_out_[op].find(old_node) != live_out_[op].end()) { + live_out_[op].erase(old_node); + live_out_[op].insert(new_node); + } + } +} + +const std::set ControlFlowGraph::LiveIn(ir::Node* op) const { + auto it = live_in_.find(op); + PADDLE_ENFORCE( + it != live_in_.end(), + string::Sprintf("Expect %s in live_in, but Not Found.", op->Name())); + return it->second; +} + +const std::set ControlFlowGraph::LiveOut(ir::Node* op) const { + auto it = live_out_.find(op); + PADDLE_ENFORCE( + it != live_out_.end(), + string::Sprintf("Expect %s in live_out, but Not Found.", op->Name())); + return it->second; +} + +const std::set ControlFlowGraph::Use(ir::Node* op) const { + auto it = uses_.find(op); + PADDLE_ENFORCE( + it != uses_.end(), + string::Sprintf("Expect %s in live_out, but Not Found.", op->Name())); + return it->second; +} + +const std::vector ControlFlowGraph::Ops() const { return ops_; } + +std::vector& ControlFlowGraph::Ops() { return ops_; } + +ir::Node* ControlFlowGraph::GetNodeFromVarName(const std::string& name, + ir::Node* op) const { + // in ssa-graph, different version nodes have same name, + // this function get the latest version var before target op + // It may return nullptr, such as data node. + ir::Node* found_node = nullptr; + for (auto* node : ops_) { + if (node == op) break; + for (auto& output : node->outputs) { + if (output->Name() == name) { + found_node = output; + } + } + } + return found_node; +} + +} // namespace details +} // namespace framework +} // namespace paddle + +REGISTER_PASS(analysis_var_pass, paddle::framework::details::AnalysisVarPass) + .RequireGraphAttr(paddle::framework::details::kGraphNodePool) + .RequireGraphAttr(paddle::framework::details::kAllOpDescs); diff --git a/paddle/fluid/framework/details/analysis_var_pass.h b/paddle/fluid/framework/details/analysis_var_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..144204beafb341351172c29e3b4cd41db49be6f9 --- /dev/null +++ b/paddle/fluid/framework/details/analysis_var_pass.h @@ -0,0 +1,120 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/details/memory_reuse_types.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/pass.h" + +namespace paddle { +namespace framework { +namespace details { +constexpr char kAllOpDescs[] = "all_op_descs"; + +std::vector SortOpLikeDescOrder(const ir::Graph& graph); +// sort op in bfs order +std::vector BFSSortGraphOps(const ir::Graph& graph); + +class ControlFlowGraph; + +class AnalysisVarPass : public ir::Pass { + protected: + std::unique_ptr ApplyImpl( + std::unique_ptr graph) const override; + + private: + // fill the variable map(var_nodes) by version. + void InitSSAGraphNodes() const; + // update program descs + void RenameVarInGraphDesc(const std::string& var, + const std::string& cache_var, size_t idx) const; + // update ir nodes + void RenameVarInGraphNode(const std::string& var, + const std::string& cache_var, size_t idx, + ir::Graph* graph) const; + + void SubGraphOptimize(OpDesc* op_desc) const; + // valid a tensor can be reuse or not + bool NodeCanReused(ir::Node* node) const; + // scan subblock and collect the output/input variables. + std::unordered_set GetSubBlockVars( + const std::unordered_set&) const; + // check op has subblock or not + bool OpHasSubBlock(OpDesc* desc) const; + + private: + // Reuse Node Pool, Owned. + mutable OrderedNodePairPool pool_; + // controlflow Graph + mutable std::unique_ptr cfg_; + // skip set + mutable std::unordered_set skip_set_; + // var nodes + mutable std::map> var_nodes_; +}; + +class ControlFlowGraph { + public: + ControlFlowGraph() = default; + // For IR Graph in parallelexecutor + explicit ControlFlowGraph(const ir::Graph& graph); + + void LiveVariableAnalysis(); + + void RenameVarInCFGGraph(const std::string& old_node, + const std::string& new_node, int begin_idx); + + const std::set LiveIn(ir::Node* op) const; + const std::set LiveOut(ir::Node* op) const; + const std::set Use(ir::Node* op) const; + const std::vector Ops() const; + std::vector& Ops(); + + // for ssa-graph nodes + ir::Node* GetNodeFromVarName(const std::string& name, ir::Node* op) const; + + private: + void BuildCFGGraph(); + void ConnectNodes(); + using NodeListMap = std::unordered_map>; + using VarSetMap = std::map>; + // successors ops use the output variables. + NodeListMap successors_; + // predecessors ops generated input variables. + NodeListMap predecessors_; + // variables lived before run current op. + VarSetMap live_in_; + // variables lived after run current op. + VarSetMap live_out_; + VarSetMap uses_; // op inputs + VarSetMap defs_; // op outputs + + std::vector ops_; // op sequence by topology sort +}; + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/analysis_var_pass_test.cc b/paddle/fluid/framework/details/analysis_var_pass_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..9bc4fd33f7058949ca60983ea666a21cb4877b3e --- /dev/null +++ b/paddle/fluid/framework/details/analysis_var_pass_test.cc @@ -0,0 +1,470 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/details/analysis_var_pass.h" +#include +#include +#include +#include "glog/logging.h" +#include "gtest/gtest.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_helper.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/program_desc.h" + +namespace paddle { +namespace framework { + +class DummyOp : public OperatorBase { + public: + DummyOp(const std::string& type, const VariableNameMap& inputs, + const VariableNameMap& outputs, const AttributeMap& attrs) + : OperatorBase(type, inputs, outputs, attrs) {} + + private: + void RunImpl(const Scope& scope, + const platform::Place& place) const override {} +}; + +class SumOpMaker : public OpProtoAndCheckerMaker { + public: + void Make() { + AddInput("X", "").AsDuplicable(); + AddOutput("Out", ""); + AddComment(""); + } +}; + +class AssignOpMaker : public OpProtoAndCheckerMaker { + public: + void Make() { + AddInput("X", "").AsDuplicable(); + AddOutput("Out", ""); + AddComment(""); + } +}; + +class DummyVarTypeInference : public VarTypeInference { + public: + void operator()(const OpDesc& op_desc, BlockDesc* block) const override { + auto& inputs = op_desc.Input("X"); + auto type = block->Var(inputs.front())->GetType(); + auto out_var_name = op_desc.Output("Out").front(); + block->Var(out_var_name)->SetType(type); + } +}; + +} // namespace framework +} // namespace paddle + +REGISTER_OPERATOR(sum, paddle::framework::DummyOp, + paddle::framework::SumOpMaker, + paddle::framework::DummyVarTypeInference); +REGISTER_OPERATOR(assign, paddle::framework::DummyOp, + paddle::framework::AssignOpMaker, + paddle::framework::DummyVarTypeInference); +REGISTER_OPERATOR(dummy, paddle::framework::DummyOp, + paddle::framework::SumOpMaker, + paddle::framework::DummyVarTypeInference); +/* + https://en.wikipedia.org/wiki/Live_variable_analysis + Create a customed classical dependency graph, left row is the instruction + number. + 1. a = 1 + 2. b = a + 3. c = a + 4. d = b + c + 5. e = d + + a--------+ + | | + b c + | | + d--------+ + | + e + Then analysis these variable's liveness range + */ + +namespace paddle { +namespace framework { +namespace details { + +static inline bool IsSameDesc(OpDesc* op1, OpDesc* op2) { + return op1->Type() == op2->Type() && op1->Inputs() == op2->Inputs() && + op1->Outputs() == op2->Outputs(); +} + +inline static ProgramDesc FillProgramDesc() { + ProgramDesc prog; + prog.MutableBlock(0)->Var("a")->SetType(proto::VarType::LOD_TENSOR); + prog.MutableBlock(0)->Var("b")->SetType(proto::VarType::LOD_TENSOR); + prog.MutableBlock(0)->Var("c")->SetType(proto::VarType::LOD_TENSOR); + prog.MutableBlock(0)->Var("d")->SetType(proto::VarType::LOD_TENSOR); + prog.MutableBlock(0)->Var("e")->SetType(proto::VarType::LOD_TENSOR); + { + auto* op = prog.MutableBlock(0)->AppendOp(); + op->SetType("assign"); + op->SetInput("X", {"a"}); + op->SetOutput("Out", {"b"}); + } + { + auto* op = prog.MutableBlock(0)->AppendOp(); + op->SetType("assign"); + op->SetInput("X", {"a"}); + op->SetOutput("Out", {"c"}); + } + { + auto* op = prog.MutableBlock(0)->AppendOp(); + op->SetType("sum"); + op->SetInput("X", {"b", "c"}); + op->SetOutput("Out", {"d"}); + } + { + auto* op = prog.MutableBlock(0)->AppendOp(); + op->SetType("assign"); + op->SetInput("X", {"d"}); + op->SetOutput("Out", {"e"}); + } + return prog; +} + +template +inline static std::string DebugString(const Container& c) { + std::stringstream ss; + for (auto& item : c) { + ss << item << " "; + } + return ss.str(); +} + +TEST(CFGGraph, IRGraph) { + // prepare ir graph + auto prog = FillProgramDesc(); + ir::Graph graph(prog); + const std::vector* all_op_descs = + new std::vector(prog.Block(0).AllOps()); + graph.Set(details::kAllOpDescs, all_op_descs); // take ownership + + ControlFlowGraph cfg(graph); + cfg.LiveVariableAnalysis(); + + // test assign op + ASSERT_TRUE((std::set{"a"} == cfg.LiveIn(cfg.Ops()[0]))); + ASSERT_TRUE((std::set{"a", "b"} == cfg.LiveOut(cfg.Ops()[0]))); + + // test assign op + ASSERT_TRUE((std::set{"a", "b"} == cfg.LiveIn(cfg.Ops()[1]))); + ASSERT_TRUE((std::set{"b", "c"} == cfg.LiveOut(cfg.Ops()[1]))); + + // test sum op + ASSERT_TRUE((std::set{"b", "c"} == cfg.LiveIn(cfg.Ops()[2]))); + ASSERT_TRUE((std::set{"d"} == cfg.LiveOut(cfg.Ops()[2]))); + + // test assign op + ASSERT_TRUE((std::set{"d"} == cfg.LiveIn(cfg.Ops()[3]))); + ASSERT_TRUE((std::set{} == cfg.LiveOut(cfg.Ops()[3]))); +} + +// 1. normal test +TEST(SortOpLikeDescOrder, NormalTest) { + auto prog = FillProgramDesc(); + ir::Graph graph(prog); + const std::vector* all_op_descs = + new std::vector(prog.Block(0).AllOps()); + graph.Set(details::kAllOpDescs, all_op_descs); // take ownership + + auto nodes = SortOpLikeDescOrder(graph); + auto op_descs = prog.Block(0).AllOps(); + for (size_t i = 0; i < nodes.size(); ++i) { + auto node = nodes[i]; + auto op_desc = op_descs[i]; + ASSERT_TRUE(IsSameDesc(node->Op(), op_desc)); + } +} + +// 2. remove some op_desc +TEST(SortOpLikeDescOrder, RemoveOpDesc) { + auto prog = FillProgramDesc(); + ir::Graph graph(prog); + const std::vector* all_op_descs = + new std::vector(prog.Block(0).AllOps()); + graph.Set(details::kAllOpDescs, all_op_descs); // take ownership + auto nodes = graph.Nodes(); + auto op_descs = prog.Block(0).AllOps(); + ir::Node* found_node = nullptr; + for (auto node : nodes) { + if (node->IsOp() && node->outputs.back()->Name() == "e") { + found_node = node; + break; + } + } + PADDLE_ENFORCE(found_node != nullptr); + for (auto it = op_descs.begin(); it != op_descs.end();) { + if (IsSameDesc(*it, found_node->Op())) { + it = op_descs.erase(it); + } else { + ++it; + } + } + + auto find_node_in_graph = [&](std::string s) { + ir::Node* ret = nullptr; + for (auto n : graph.Nodes()) { + if (n->Name() == s) { + ret = n; + break; + } + } + PADDLE_ENFORCE(ret != nullptr); + return ret; + }; + + ir::Node* e = find_node_in_graph("e"); + ir::Node* d = find_node_in_graph("d"); + std::remove(d->outputs.begin(), d->outputs.end(), found_node); + graph.RemoveNode(found_node); + graph.RemoveNode(e); + + // other node keeps the same order + auto remain_nodes = SortOpLikeDescOrder(graph); + for (size_t i = 0; i < remain_nodes.size(); ++i) { + auto node = remain_nodes[i]; + auto op_desc = op_descs[i]; + ASSERT_TRUE(IsSameDesc(node->Op(), op_desc)); + } +} + +// 3. add some op_desc +TEST(SortOpLikeDescOrder, AddOpDesc) { + auto prog = FillProgramDesc(); + const std::vector* all_op_descs = + new std::vector(prog.Block(0).AllOps()); + ir::Graph graph(prog); + + auto find_node_in_graph = [&](std::string s) { + ir::Node* ret = nullptr; + for (auto n : graph.Nodes()) { + if (n->Name() == s) { + ret = n; + break; + } + } + PADDLE_ENFORCE(ret != nullptr); + return ret; + }; + + // cached desc different with real one + // mimic the intermidiete pass modify the programdesc. + graph.Set(details::kAllOpDescs, all_op_descs); // take ownership + + auto op_descs = prog.Block(0).AllOps(); + + auto op = prog.MutableBlock(0)->AppendOp(); + prog.MutableBlock(0)->Var("d1")->SetType(proto::VarType::LOD_TENSOR); + op->SetType("sum"); + op->SetInput("X", {"b", "c"}); + op->SetOutput("Out", {"d1"}); + ir::Node* node = graph.CreateOpNode(op); + ir::Node* d1 = graph.CreateVarNode(prog.MutableBlock(0)->Var("d1")); + ir::Node* b = find_node_in_graph("b"); + ir::Node* c = find_node_in_graph("c"); + node->outputs.emplace_back(d1); + node->inputs.emplace_back(b); + node->inputs.emplace_back(c); + d1->inputs.emplace_back(node); + b->outputs.emplace_back(node); + c->outputs.emplace_back(node); + op_descs.insert(op_descs.begin() + 4, op); + + auto nodes = SortOpLikeDescOrder(graph); + + for (size_t i = 0; i < nodes.size(); ++i) { + auto node = nodes[i]; + auto op_desc = op_descs[i]; + ASSERT_TRUE(IsSameDesc(node->Op(), op_desc)); + } +} + +// 4. add and delete some op_desc +TEST(SortOpLikeDescOrder, AddAndDeleteOpDesc) { + auto prog = FillProgramDesc(); + ir::Graph graph(prog); + const std::vector* all_op_descs = + new std::vector(prog.Block(0).AllOps()); + graph.Set(details::kAllOpDescs, all_op_descs); // take ownership + + auto find_node_in_graph = [&](std::string s) { + ir::Node* ret = nullptr; + for (auto n : graph.Nodes()) { + if (n->Name() == s) { + ret = n; + break; + } + } + PADDLE_ENFORCE(ret != nullptr); + return ret; + }; + + // remove sum node + auto op_descs = prog.Block(0).AllOps(); + ir::Node* found_node = nullptr; + auto nodes = graph.Nodes(); + for (auto node : nodes) { + if (node->Name() == "sum") { + found_node = node; + break; + } + } + PADDLE_ENFORCE(found_node != nullptr); + for (auto it = op_descs.begin(); it != op_descs.end();) { + if (IsSameDesc(*it, found_node->Op())) { + it = op_descs.erase(it); + } else { + ++it; + } + } + { + ir::Node* d = find_node_in_graph("d"); + ir::Node* c = find_node_in_graph("c"); + ir::Node* e = find_node_in_graph("e"); + std::remove(d->outputs.begin(), d->outputs.end(), found_node); + std::remove(c->outputs.begin(), c->outputs.end(), found_node); + ir::Node* pending_op = found_node->outputs[0]->outputs[0]; + graph.RemoveNode(e); + graph.RemoveNode(pending_op); + graph.RemoveNode(found_node); + } + + // add node + auto op = prog.MutableBlock(0)->AppendOp(); + prog.MutableBlock(0)->Var("d1")->SetType(proto::VarType::LOD_TENSOR); + op->SetType("sum"); + op->SetInput("X", {"b", "c"}); + op->SetOutput("Out", {"d1"}); + { + ir::Node* node = graph.CreateOpNode(op); + ir::Node* d1 = graph.CreateVarNode(prog.MutableBlock(0)->Var("d1")); + ir::Node* b = find_node_in_graph("b"); + ir::Node* c = find_node_in_graph("c"); + node->outputs.emplace_back(d1); + node->inputs.emplace_back(b); + node->inputs.emplace_back(c); + b->outputs.emplace_back(node); + c->outputs.emplace_back(node); + } + op_descs.insert(op_descs.begin() + 2, op); + + // check the order + auto mynodes = SortOpLikeDescOrder(graph); + for (size_t i = 0; i < mynodes.size(); ++i) { + auto node = mynodes[i]; + auto op_desc = op_descs[i]; + ASSERT_TRUE(IsSameDesc(node->Op(), op_desc)); + } +} + +// 5. add and replace some op_desc inplace. +TEST(SortOpLikeDescOrder, AddAndReplaceOpDescInplace) { + auto prog = FillProgramDesc(); + ir::Graph graph(prog); + const std::vector* all_op_descs = + new std::vector(prog.Block(0).AllOps()); + graph.Set(details::kAllOpDescs, all_op_descs); // take ownership + + auto find_node_in_graph = [&](std::string s) { + ir::Node* ret = nullptr; + for (auto n : graph.Nodes()) { + if (n->Name() == s) { + ret = n; + break; + } + } + PADDLE_ENFORCE(ret != nullptr); + return ret; + }; + + auto op_descs = prog.Block(0).AllOps(); + // add node + auto op = prog.MutableBlock(0)->AppendOp(); + prog.MutableBlock(0)->Var("d1")->SetType(proto::VarType::LOD_TENSOR); + op->SetType("sum"); + op->SetInput("X", {"b", "c"}); + op->SetOutput("Out", {"d1"}); + { + ir::Node* node = graph.CreateOpNode(op); + ir::Node* d1 = graph.CreateVarNode(prog.MutableBlock(0)->Var("d1")); + ir::Node* b = find_node_in_graph("b"); + ir::Node* c = find_node_in_graph("c"); + node->outputs.emplace_back(d1); + node->inputs.emplace_back(b); + node->inputs.emplace_back(c); + d1->inputs.emplace_back(node); + b->outputs.emplace_back(node); + c->outputs.emplace_back(node); + } + + op_descs.emplace_back(op); + + // replace op_desc inplace + auto nodes = graph.Nodes(); + ir::Node* found_node = nullptr; + for (auto node : nodes) { + if (node->IsOp() && node->Op() && node->Name() == "assign") { + if (node->outputs.size() == 1 && node->outputs[0]->Name() == "e") { + found_node = node; + break; + } + } + } + { + ir::Node* d = find_node_in_graph("d"); + ir::Node* e = find_node_in_graph("e"); + std::remove(d->outputs.begin(), d->outputs.end(), found_node); + std::remove(e->inputs.begin(), e->inputs.end(), found_node); + graph.RemoveNode(found_node); + } + op_descs.erase(op_descs.begin() + 3); + + auto replace_op = prog.MutableBlock(0)->AppendOp(); + replace_op->SetType("sum"); + replace_op->SetInput("X", {"d", "d1"}); + replace_op->SetOutput("Out", {"e"}); + { + ir::Node* sum2 = graph.CreateOpNode(replace_op); + ir::Node* e = find_node_in_graph("e"); + ir::Node* d = find_node_in_graph("d"); + ir::Node* d1 = find_node_in_graph("d1"); + sum2->inputs.emplace_back(d); + sum2->inputs.emplace_back(d1); + sum2->outputs.emplace_back(e); + e->inputs.emplace_back(sum2); + d->outputs.emplace_back(sum2); + d1->outputs.emplace_back(sum2); + } + + op_descs.emplace_back(replace_op); + // compare op order + auto graph_nodes = SortOpLikeDescOrder(graph); + for (size_t i = 0; i < graph_nodes.size(); ++i) { + auto node = graph_nodes[i]; + auto op_desc = op_descs[i]; + ASSERT_TRUE(IsSameDesc(node->Op(), op_desc)); + } +} + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc index d8526b3f2492992c5c0f6f5e0a85cffca7398700..779a9ed52365e66d8141f7e3a1183ef6d7832e4b 100644 --- a/paddle/fluid/framework/details/build_strategy.cc +++ b/paddle/fluid/framework/details/build_strategy.cc @@ -14,11 +14,16 @@ limitations under the License. */ #include "paddle/fluid/framework/details/build_strategy.h" +#include +#include + +#include "paddle/fluid/framework/details/memory_reuse_types.h" #include "paddle/fluid/framework/details/multi_devices_graph_check_pass.h" #include "paddle/fluid/framework/details/multi_devices_graph_print_pass.h" #include "paddle/fluid/framework/details/reduce_op_handle.h" #include "paddle/fluid/framework/details/sequential_execution_pass.h" #include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_helper.h" #include "paddle/fluid/framework/ir/graph_viz_pass.h" namespace paddle { @@ -69,6 +74,14 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { } VLOG(1) << "CollectiveContext:" << context->String(); + // NOTE(dzh): memory optimize should be a runtime pass. + // However, after multi_devices_pass, VarHandle, OpHandle is + // the de-fact IR, any reuse on Graph is meaningless. + // A side-effect of that, memory optimize cannot forsee the fetched vars + // , so fetchlist should be set persistable before call the Run interface. + if (strategy.memory_optimize_) { + auto analysis_var_pass = AppendPass("analysis_var_pass"); + } // Convert graph to run on multi-devices. auto multi_devices_pass = AppendPass("multi_devices_pass"); multi_devices_pass->SetNotOwned("strategy", @@ -79,8 +92,11 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { // Add a graph print pass to record a graph with device info. if (!strategy_.debug_graphviz_path_.empty()) { auto multi_devices_print_pass = AppendPass("multi_devices_print_pass"); - multi_devices_print_pass->SetNotOwned( - "debug_graphviz_path", &strategy_.debug_graphviz_path_); + const std::string graph_path = + string::Sprintf("%s%s", strategy_.debug_graphviz_path_.c_str(), + "_multi_devices_graph"); + multi_devices_print_pass->Set(kGraphvizPath, + new std::string(graph_path)); multi_devices_print_pass->Set( "graph_printer", new details::GraphvizSSAGraphPrinter); } @@ -127,7 +143,6 @@ std::unique_ptr BuildStrategy::Apply( CreatePassesFromStrategy(false); std::unique_ptr graph(new ir::Graph(main_program)); - for (std::shared_ptr &pass : pass_builder_->AllPasses()) { if (pass->Type() == "multi_devices_pass") { pass->Erase("places"); @@ -145,6 +160,17 @@ std::unique_ptr BuildStrategy::Apply( pass->Erase("nccl_ctxs"); pass->SetNotOwned("nccl_ctxs", nctx); #endif + } else if (pass->Type() == "analysis_var_pass") { + const std::vector *all_op_descs = + new std::vector(main_program.Block(0).AllOps()); + graph->Set>(kAllOpDescs, + all_op_descs); // take ownership + graph->Set(kGraphNodePool, + new GraphNodePool); // take ownership + + pass->Erase(kAllOpDescs); + pass->SetNotOwned>(kAllOpDescs, all_op_descs); + } else if (pass->Type() == "sequential_execution_pass") { LOG(INFO) << "set enable_sequential_execution:" << enable_sequential_execution_; @@ -166,6 +192,7 @@ std::unique_ptr BuildStrategy::Apply( } return graph; } + } // namespace details } // namespace framework } // namespace paddle @@ -176,6 +203,7 @@ USE_PASS(multi_batch_merge_pass); USE_PASS(multi_devices_pass); USE_PASS(multi_devices_check_pass); USE_PASS(multi_devices_print_pass); +USE_PASS(analysis_var_pass); USE_PASS(sequential_execution_pass); USE_PASS(all_reduce_deps_pass); USE_PASS(modify_op_lock_and_record_event_pass); diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h index c97be169575f578dfd18a6290230d1b3f3bd7596..29396501dc0efedd31a42b77f915fd66c9943985 100644 --- a/paddle/fluid/framework/details/build_strategy.h +++ b/paddle/fluid/framework/details/build_strategy.h @@ -60,8 +60,15 @@ struct BuildStrategy { kCustomized = 2, }; + enum class OptimizeStrategy { + // To be Implemented,bruteforce, recursive compute unused var names. + kBruteForce = 0, + kControlFlowGraph = 1, // use cfg_graph algorithm, faster speed. + }; + ReduceStrategy reduce_{ReduceStrategy::kAllReduce}; GradientScaleStrategy gradient_scale_{GradientScaleStrategy::kCoeffNumDevice}; + OptimizeStrategy strategy_{OptimizeStrategy::kControlFlowGraph}; std::string debug_graphviz_path_{""}; @@ -69,6 +76,10 @@ struct BuildStrategy { bool enable_data_balance_{false}; + bool memory_optimize_{false}; + + bool memory_early_delete_{false}; + bool enable_sequential_execution_{false}; bool fuse_broadcast_op_{false}; diff --git a/paddle/fluid/framework/details/early_delete_op_handle.h b/paddle/fluid/framework/details/early_delete_op_handle.h new file mode 100644 index 0000000000000000000000000000000000000000..c8382d34b790ba7c95415acdf0b55dc97a9cd265 --- /dev/null +++ b/paddle/fluid/framework/details/early_delete_op_handle.h @@ -0,0 +1,140 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include "paddle/fluid/framework/details/computation_op_handle.h" +#include "paddle/fluid/framework/details/op_handle_base.h" +#include "paddle/fluid/framework/details/var_handle.h" +#include "paddle/fluid/framework/garbage_collector.h" +#include "paddle/fluid/framework/lod_tensor_array.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/tensor.h" + +namespace paddle { +namespace framework { +namespace details { + +class EarlyDeleteOpHandle : public OpHandleBase { + public: + EarlyDeleteOpHandle(ir::Node* node, const Scope* scope, + const platform::Place& place, + const std::vector& names, + GarbageCollector* gc) + : OpHandleBase(node), + scope_(scope), + place_(place), + names_(names), + gc_(gc) { +#ifdef PADDLE_WITH_CUDA + if (IsStreamGarabageCollector()) { + auto gpu_place = boost::get(place); + PADDLE_ENFORCE(cudaSetDevice(gpu_place.device)); + PADDLE_ENFORCE(cudaEventCreateWithFlags(&event_, cudaEventDisableTiming)); + } +#endif + } + ~EarlyDeleteOpHandle() { +#ifdef PADDLE_WITH_CUDA + if (IsStreamGarabageCollector()) { + auto gpu_place = boost::get(dev_ctx_->GetPlace()); + PADDLE_ENFORCE(cudaSetDevice(gpu_place.device)); + PADDLE_ENFORCE(cudaEventDestroy(event_)); + } +#endif + } + + std::string Name() const override { return "early_delete"; } + + protected: + void RunImpl() override { + std::vector> tensors; + auto* local_scope = scope_->FindVar(kLocalExecScopeName)->Get(); + for (auto& var_name : names_) { + auto* var = local_scope->FindVar(var_name); + PADDLE_ENFORCE(var != nullptr, + string::Sprintf("Local Scope not has var %s", var_name)); + if (var->IsType()) { + tensors.emplace_back(var->GetMutable()->MoveMemoryHolder()); + } else if (var->IsType()) { + tensors.emplace_back(var->GetMutable() + ->mutable_value() + ->MoveMemoryHolder()); + } else if (var->IsType()) { + LoDTensorArray* tensor_array = var->GetMutable(); + for (auto& tensor : *tensor_array) { + tensors.emplace_back(tensor.MoveMemoryHolder()); + } + } + } + if (!tensors.empty()) { + ClearTensors(tensors); + } + } + + private: + void ClearTensors( + const std::vector>& tensors) { + if (platform::is_cpu_place(place_)) { + ClearCPUTensors(tensors); + } else { + ClearGPUTensors(tensors); + } + } + + void ClearCPUTensors( + const std::vector>& tensors) { + auto* gc = dynamic_cast(gc_); + if (gc != nullptr) { + gc->Add(tensors); + } + } + + void ClearGPUTensors( + const std::vector>& tensors) { +#ifdef PADDLE_WITH_CUDA + auto* gc = dynamic_cast(gc_); + if (gc != nullptr) { + auto compute_stream = dev_ctx_->stream(); + auto callback_stream = gc->stream(); + auto callback_func = [=]() { + PADDLE_ENFORCE(cudaEventRecord(event_, compute_stream)); + PADDLE_ENFORCE(cudaStreamWaitEvent(callback_stream, event_, 0)); + }; + gc_->Add(tensors, callback_func); + } else { + gc_->Add(tensors); + } + } + + bool IsStreamGarabageCollector() const { + return dynamic_cast(gc_) != nullptr; +#endif + } + + const Scope* scope_; + const platform::Place place_; + std::vector names_; + GarbageCollector* gc_; +#ifdef PADDLE_WITH_CUDA + platform::CUDADeviceContext* dev_ctx_; + cudaEvent_t event_; +#endif +}; + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/fuse_vars_op_handle.h b/paddle/fluid/framework/details/fuse_vars_op_handle.h index 3f360c510a4fdc0caaeb15d862b217ef41b8ea6e..b40b01df36479543e8b2779762210ae144d7d9be 100644 --- a/paddle/fluid/framework/details/fuse_vars_op_handle.h +++ b/paddle/fluid/framework/details/fuse_vars_op_handle.h @@ -33,7 +33,7 @@ struct FuseVarsOpHandle : public OpHandleBase { FuseVarsOpHandle(ir::Node *node, Scope *local_scope, const platform::Place &place, const std::unordered_map &inputs_numel, - const std::type_index &var_type) + const proto::VarType::Type var_type) : OpHandleBase(node), local_scope_(local_scope), place_(place), @@ -57,7 +57,7 @@ struct FuseVarsOpHandle : public OpHandleBase { Scope *local_scope_; const platform::Place place_; const std::unordered_map inputs_numel_; - const std::type_index type_; + const proto::VarType::Type type_; int64_t total_numel_; }; } // namespace details diff --git a/paddle/fluid/framework/details/memory_early_delete_pass.cc b/paddle/fluid/framework/details/memory_early_delete_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..06a2451c136e3243ba41661fa691f9a6ef8b52ac --- /dev/null +++ b/paddle/fluid/framework/details/memory_early_delete_pass.cc @@ -0,0 +1,117 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/details/memory_early_delete_pass.h" +#include +#include +#include +#include "paddle/fluid/framework/details/memory_reuse_types.h" +#include "paddle/fluid/framework/details/multi_devices_helper.h" +#include "paddle/fluid/framework/details/reference_count_pass_helper.h" +#include "paddle/fluid/framework/ir/graph_helper.h" + +namespace paddle { +namespace framework { +namespace details { + +static ComputationOpHandle* FindNextComputationOpHandle(VarHandle* var_in) { + std::queue queue; + queue.push(var_in); + do { + auto* var = queue.front(); + queue.pop(); + for (auto* op : var->PendingOps()) { + auto* compute_op = dynamic_cast(op); + if (compute_op != nullptr && compute_op->GetPlace() == var_in->place_) { + return compute_op; + } + for (auto* out_var : op->Outputs()) { + queue.push(out_var); + } + } + } while (!queue.empty()); + return nullptr; +} + +std::unique_ptr MemoryEarlyDeletePass::ApplyImpl( + std::unique_ptr graph) const { + auto& graph_pool = Get(kGraphNodePool); + auto& gcs = Get(kGarbageCollector); + + std::unordered_map> unlived_vars; + unlived_vars.reserve(graph_pool.size()); + for (auto& pair : graph_pool) { + unlived_vars.insert(std::make_pair(pair.first, pair.second)); + } + + auto compare_and_insert_early_delete_op = [&]( + OpHandleBase* op, const std::vector& vars) { + if (unlived_vars.empty()) return; + // unlived vars can be deleted after the last used op has finished. + auto* compute_op = dynamic_cast(op); + const auto& places = Get>(kAllPlaces); + for (auto& var : vars) { + auto* var_handle = dynamic_cast(var); + auto var_name = var->Node()->Name(); + auto& var_place = var_handle->place_; + if (unlived_vars.count(var_name) == 0) continue; + if (!unlived_vars[var_name].empty()) { + if (compute_op != nullptr && + unlived_vars[var_name].count(compute_op->Node()->Op()) != 0) { + unlived_vars[var_name].erase(compute_op->Node()->Op()); + } + continue; + } + + if (var_handle == nullptr || !var_handle->Node()->IsVar() || + var_handle->Node()->IsCtrlVar()) + continue; + + // shameless copyed from reference count pass. + if (compute_op == nullptr) { + // use next computation op scope + compute_op = FindNextComputationOpHandle(var_handle); + } + auto* early_delete_node = + graph->CreateEmptyNode("early_delete", ir::Node::Type::kOperation); + GarbageCollector* gc = gcs.at(places[compute_op->GetScopeIdx()]).get(); + auto* early_delete_handle = new EarlyDeleteOpHandle( + early_delete_node, compute_op->GetScope(), var_place, {var_name}, gc); + if (compute_op->Outputs().empty()) { + auto* dep_var = new DummyVarHandle(graph->CreateControlDepVar()); + compute_op->AddOutput(dep_var); + graph->Get(kGraphDepVars).emplace(dep_var); + } + early_delete_handle->AddInput(compute_op->Outputs().front()); + VLOG(5) << "Add early delete op " << var_name << " to Operator" + << compute_op->Name(); + } + }; + + auto all_ops = ir::FilterByNodeWrapper(*graph); + for (auto& op : all_ops) { + compare_and_insert_early_delete_op(op, op->Inputs()); + compare_and_insert_early_delete_op(op, op->Outputs()); + } + return graph; +} + +} // namespace details +} // namespace framework +} // namespace paddle + +REGISTER_PASS(memory_early_delete_pass, + paddle::framework::details::MemoryEarlyDeletePass) + .RequireGraphAttr(paddle::framework::details::kGraphNodePool) + .RequireGraphAttr(paddle::framework::details::kGarbageCollector); diff --git a/paddle/fluid/framework/details/memory_early_delete_pass.h b/paddle/fluid/framework/details/memory_early_delete_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..8215aa1b2baa223a111f9050d5488c5fc8ac0e6e --- /dev/null +++ b/paddle/fluid/framework/details/memory_early_delete_pass.h @@ -0,0 +1,32 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "paddle/fluid/framework/details/early_delete_op_handle.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/pass.h" + +namespace paddle { +namespace framework { +namespace details { + +class MemoryEarlyDeletePass : public ir::Pass { + protected: + std::unique_ptr ApplyImpl( + std::unique_ptr graph) const override; +}; + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/memory_reuse_types.cc b/paddle/fluid/framework/details/memory_reuse_types.cc new file mode 100644 index 0000000000000000000000000000000000000000..2b9ff518b9adcd366cc877998400a8bdc05fa033 --- /dev/null +++ b/paddle/fluid/framework/details/memory_reuse_types.cc @@ -0,0 +1,155 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/details/memory_reuse_types.h" +#include +#include +#include + +namespace paddle { +namespace framework { +namespace details { + +size_t NodeSizeInBytes(ir::Node* n) { + auto* desc = FindVarDescInBlock(n); + auto shape = desc->GetShape(); + size_t type_size = SizeOfType(desc->GetDataType()); + int size = 1; + for (auto& s : shape) { + size *= s; + } + return type_size * std::abs(size); +} + +std::string DebugStringImpl(VarDesc* var) { + std::stringstream ss; + ss << var->Name(); + ss << "["; + try { + auto shape = var->GetShape(); + for (size_t i = 0; i < shape.size(); ++i) { + if (i != shape.size() - 1) { + ss << shape[i] << ","; + } else { + ss << shape[i]; + } + } + ss << "]"; + } catch (...) { + ss << "Var has no VarDesc !!! Name:" << var->Name(); + } + return ss.str(); +} + +std::string DebugString(ir::Node* var) { + return DebugStringImpl(FindVarDescInBlock(var)); +} +// return DebugString(var->Var()); } + +// NOTE(dzh): based ir node, if a large node has been reused +// by a small size node, then next time it appear in pool, it will +// have the small size. Find the original node shap from blockdesc. +VarDesc* FindVarDescInBlock(ir::Node* n) { + PADDLE_ENFORCE(n->IsVar() && !n->IsCtrlVar() && n->inputs.size() == 1); + BlockDesc* block = n->inputs[0]->Op()->Block(); + PADDLE_ENFORCE(block->HasVar(n->Name()), + string::Sprintf("Block do not has var %s", n->Name())); + return block->FindVar(n->Name()); +} + +struct NodeComparator { + bool operator()(ir::Node* lhs, ir::Node* rhs) const { + auto* lhs_desc = FindVarDescInBlock(lhs); + auto* rhs_desc = FindVarDescInBlock(rhs); + auto lhs_shape = lhs_desc->GetShape(); + auto rhs_shape = rhs_desc->GetShape(); + if ((lhs_shape[0] == -1 && rhs_shape[0] == -1) || + (lhs_shape[0] != -1 && rhs_shape[0] != -1)) { + return NodeSizeInBytes(lhs) <= NodeSizeInBytes(rhs); + } else { + return false; + } + } +}; + +void OrderedNodePairPool::Insert(ir::Node* var, ir::Node* op) { + PADDLE_ENFORCE(var->IsVar() && !var->IsCtrlVar()); + PADDLE_ENFORCE(op->IsOp()); + if (mark_table_.count(var->Name()) != 0) { + mark_table_[var->Name()]->second.insert(op); + return; + } + + auto* var_desc = FindVarDescInBlock(var); + auto var_shape = var_desc->GetShape(); + int batch_size = static_cast(var_shape[0]); + + NodeComparator compare_node; + Iter it = nodes_.begin(); + while (it != nodes_.end()) { + auto* cache_desc = FindVarDescInBlock(it->first); + int cache_batch_size = cache_desc->GetShape()[0]; + if ((cache_batch_size == -1 && batch_size == -1) || + (cache_batch_size != -1 && batch_size != -1)) { + if (compare_node(it->first, var)) { + ++it; + } else { + break; + } + } else if (cache_batch_size == -1 && batch_size != -1) { + ++it; + } else if (cache_batch_size != -1 && batch_size == -1) { + break; + } + } + + it = + nodes_.insert(it, std::make_pair(var, std::unordered_set{op})); + mark_table_[var->Name()] = it; +} + +int OrderedNodePairPool::GetIndex(ir::Node* var) { + return std::distance(nodes_.begin(), mark_table_[var->Name()]); +} + +ir::Node* OrderedNodePairPool::NodeMatch(ir::Node* var) const { + ir::Node* found_node = nullptr; + NodeComparator compare_node; + + for (auto it = nodes_.begin(); it != nodes_.end(); ++it) { + if (compare_node(var, it->first)) { + found_node = it->first; + break; + } + } + return found_node; +} + +void OrderedNodePairPool::Erase(ir::Node* var) { + PADDLE_ENFORCE(mark_table_.count(var->Name())); + nodes_.erase(mark_table_[var->Name()]); + mark_table_.erase(var->Name()); +} + +std::string OrderedNodePairPool::ToString() const { + std::stringstream ss; + for (auto it = nodes_.begin(); it != nodes_.end(); ++it) { + ss << DebugString(it->first) << " "; + } + return ss.str(); +} + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/memory_reuse_types.h b/paddle/fluid/framework/details/memory_reuse_types.h new file mode 100644 index 0000000000000000000000000000000000000000..9a9c1d948e869016717fea9ff6b8236adfc29845 --- /dev/null +++ b/paddle/fluid/framework/details/memory_reuse_types.h @@ -0,0 +1,87 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include +#include +#include +#include +#include +#include +#include +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/ir/graph.h" + +namespace paddle { +namespace framework { +namespace details { + +constexpr char kFetchedVars[] = "fetched_vars"; +constexpr char kGraphNodePool[] = "graph_node_pool"; + +// NOTE(dzh): Variable and the operators use the var. +// for early delete pass. +// Because analysis var pass build base on ir::Node, which maybe released +// or modified between passes, so we use OpDesc* to mark ops. +using GraphNodePool = std::vector< + std::pair /* ops */>>; + +// NOTE(dzh): by default, it sort node in ascend order(by node bytes size). +// in fluid, -1 means the batch_size is determined in runtime. +// the node batch_size equal -1 always ranking in the front than the node not. +// For example, +// node0[-1, 1] node1[-1, 1, 1], node2[1,1], node3[1,1024], .. +// O(1) insert, delete +class OrderedNodePairPool { + public: + using NodePair = std::pair>; + using Iter = typename std::list::iterator; + using ConstIter = typename std::list::const_iterator; + + void Insert(ir::Node* var, ir::Node* op); + + void Erase(ir::Node* var); + + bool Has(ir::Node* var) { return mark_table_.count(var->Name()); } + + ir::Node* NodeMatch(ir::Node* var) const; + // map store non-const iterator, can not promise const + int GetIndex(ir::Node* var); + // pool all node to string + std::string ToString() const; + + Iter begin() { return nodes_.begin(); } + Iter end() { return nodes_.end(); } + ConstIter begin() const { return nodes_.begin(); } + ConstIter end() const { return nodes_.end(); } + size_t size() const { return nodes_.size(); } + + private: + // for searching. + std::unordered_map mark_table_; + // node swap pairs. var -> ops dep var + std::list nodes_; +}; + +// node memory size in bytes +size_t NodeSizeInBytes(ir::Node* n); + +std::string DebugString(ir::Node* var); + +// std::string DebugString(VarDesc* var); +VarDesc* FindVarDescInBlock(ir::Node* n); + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/memory_reuse_types_test.cc b/paddle/fluid/framework/details/memory_reuse_types_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..d2fabf5ce068e0f752b86c0d02b971f18fc65f01 --- /dev/null +++ b/paddle/fluid/framework/details/memory_reuse_types_test.cc @@ -0,0 +1,99 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/details/memory_reuse_types.h" +#include +#include +#include +#include +#include +#include +#include +#include "glog/logging.h" +#include "gtest/gtest.h" + +namespace paddle { +namespace framework { +namespace details { + +TEST(OrderedNodePairPool, Normal) { + OrderedNodePairPool pool; + std::vector> nodes; + + // clang-format off + std::vector> shapes = {{-1, 10}, + {-1, 20}, + {1, 2}, + {5, 2}, + {10, 20}, + {-1, 2, 5}, + {-1, 1, 5}, + {-1, 1}}; + // clang-format on + const int COUNT = shapes.size(); + ProgramDesc prog; + BlockDesc* block_desc = prog.MutableBlock(0); + auto* op_desc = block_desc->AppendOp(); + op_desc->SetType("dummy"); + std::unique_ptr op = ir::CreateNodeForTest(op_desc); + + for (int i = 0; i < COUNT; ++i) { + auto desc = block_desc->Var(std::to_string(i)); + desc->SetShape(shapes[i]); + std::unique_ptr node = ir::CreateNodeForTest(desc); + node->inputs.emplace_back(op.get()); + nodes.emplace_back(std::move(node)); + } + + for (auto& node : nodes) { + pool.Insert(node.get(), op.get()); + } + + // assert its order and interface. + std::cout << pool.ToString() << std::endl; + pool.Erase(nodes.front().get()); + std::cout << pool.ToString() << std::endl; + + ASSERT_EQ(pool.size(), static_cast(COUNT - 1)); + ASSERT_EQ(pool.GetIndex(nodes.back().get()), 0); + + { + auto v1 = block_desc->Var("11"); + v1->SetShape({-1, 256, 56, 56}); + std::unique_ptr node1 = ir::CreateNodeForTest(v1); + node1->inputs.emplace_back(op.get()); + auto* cache = pool.NodeMatch(node1.get()); + ASSERT_EQ(cache, nullptr); + } + { + auto v2 = block_desc->Var("12"); + v2->SetShape({-1, 2, 5}); + std::unique_ptr node1 = ir::CreateNodeForTest(v2); + node1->inputs.emplace_back(op.get()); + auto* cache = pool.NodeMatch(node1.get()); + ASSERT_EQ(pool.GetIndex(cache), 2); // match 6:[-1,2,5] + } + { + auto v3 = block_desc->Var("13"); + v3->SetShape({2, 5}); + std::unique_ptr node1 = ir::CreateNodeForTest(v3); + node1->inputs.emplace_back(op.get()); + auto* cache = pool.NodeMatch(node1.get()); + ASSERT_EQ(pool.GetIndex(cache), 5); // match 4:[5,2] + } +} + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/multi_devices_graph_print_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_print_pass.cc index 8f92f0948d7d397ab0f20c01eae9e313f739adec..c203073845375c879a0fc10564f5dad0f19ceae4 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_print_pass.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_print_pass.cc @@ -85,4 +85,5 @@ void GraphvizSSAGraphPrinter::Print(const ir::Graph &graph, } // namespace paddle REGISTER_PASS(multi_devices_print_pass, - paddle::framework::details::SSAGraghBuilderWithPrinter); + paddle::framework::details::SSAGraghBuilderWithPrinter) + .RequirePassAttr(paddle::framework::details::kGraphvizPath); diff --git a/paddle/fluid/framework/details/multi_devices_graph_print_pass.h b/paddle/fluid/framework/details/multi_devices_graph_print_pass.h index c00685fa1629c0722c315c726053c2cba8bf17e7..b06c87a5c185c550818af0bdeacd0070d1d90e4e 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_print_pass.h +++ b/paddle/fluid/framework/details/multi_devices_graph_print_pass.h @@ -14,6 +14,7 @@ #pragma once +#include #include #include #include @@ -24,6 +25,8 @@ namespace paddle { namespace framework { namespace details { +constexpr char kGraphvizPath[] = "debug_graphviz_path"; + class SSAGraphPrinter { public: virtual ~SSAGraphPrinter() {} @@ -40,7 +43,7 @@ class SSAGraghBuilderWithPrinter : public ir::Pass { std::unique_ptr ApplyImpl( std::unique_ptr graph) const override { std::unique_ptr fout( - new std::ofstream(Get("debug_graphviz_path"))); + new std::ofstream(Get(kGraphvizPath))); PADDLE_ENFORCE(fout->good()); Get("graph_printer").Print(*graph, *fout); return graph; diff --git a/paddle/fluid/framework/details/op_handle_base.h b/paddle/fluid/framework/details/op_handle_base.h index ba12ca3c61c05b3e856fffa8353d4ec5bf79bc39..b1a82e8771b92f2d0af4a1c7732ff2da54d496a8 100644 --- a/paddle/fluid/framework/details/op_handle_base.h +++ b/paddle/fluid/framework/details/op_handle_base.h @@ -25,7 +25,7 @@ namespace paddle { namespace framework { namespace details { -constexpr char kLocalExecScopeName[] = "@LCOAL_SCOPE@"; +constexpr char kLocalExecScopeName[] = "@LOCAL_SCOPE@"; // Wraps ir::Node and provide helper utilities. // It's responsible for populating necessary fields of ir::Node. diff --git a/paddle/fluid/framework/details/reduce_op_handle.cc b/paddle/fluid/framework/details/reduce_op_handle.cc index cb864848b938e249ecd9d09e2a02f683959ce413..7a5f7de57ef20b4b909894ff8d742a65ea05874d 100644 --- a/paddle/fluid/framework/details/reduce_op_handle.cc +++ b/paddle/fluid/framework/details/reduce_op_handle.cc @@ -218,18 +218,18 @@ void ReduceOpHandle::RunImpl() { } #if defined PADDLE_WITH_CUDA && defined PADDLE_WITH_DISTRIBUTE - if (framework::IsType(in_selected_rows[0]->value().type())) { + if (in_selected_rows[0]->value().type() == + framework::proto::VarType::FP32) { GatherSelectedRows( in_selected_rows, in_places, dev_ctxes_, out_var_handle, t_out_p, out_var->GetMutable()); - } else if (framework::IsType( - in_selected_rows[0]->value().type())) { + } else if (in_selected_rows[0]->value().type() == + framework::proto::VarType::FP64) { GatherSelectedRows( in_selected_rows, in_places, dev_ctxes_, out_var_handle, t_out_p, out_var->GetMutable()); } else { - PADDLE_ENFORCE(false, - "only support double or float when gahter SelectedRows"); + PADDLE_THROW("only support double or float when gather SelectedRows"); } #endif }); @@ -246,7 +246,7 @@ void ReduceOpHandle::RunImpl() { if (!FLAGS_cpu_deterministic) { ReduceLoDTensor func(lod_tensors, out_var->GetMutable()); - VisitDataType(ToDataType(lod_tensors[0]->type()), func); + VisitDataType(lod_tensors[0]->type(), func); } else { // We sum lod_tensors to reduce_sum_trg which is in local_scopes_0 // here, but it doesn't mean reduce_sum_trg must be in local_scopes_0. @@ -256,7 +256,7 @@ void ReduceOpHandle::RunImpl() { ->FindVar(out_var_handle->name_) ->GetMutable(); ReduceLoDTensor func(lod_tensors, &reduce_sum_trg); - VisitDataType(ToDataType(lod_tensors[0]->type()), func); + VisitDataType(lod_tensors[0]->type(), func); auto trg = out_var->GetMutable(); if (reduce_sum_trg.data() != trg->data()) { diff --git a/paddle/fluid/framework/dlpack_tensor.cc b/paddle/fluid/framework/dlpack_tensor.cc index 5014fcd06a0eed65a7f4b4754457ab27c71b38dc..39652706c43fb51da99170b361b3e1a6e04c6fc9 100644 --- a/paddle/fluid/framework/dlpack_tensor.cc +++ b/paddle/fluid/framework/dlpack_tensor.cc @@ -13,7 +13,7 @@ // limitations under the License. #include "paddle/fluid/framework/dlpack_tensor.h" - +#include "paddle/fluid/framework/data_type.h" namespace paddle { namespace framework { @@ -36,26 +36,23 @@ static ::DLDataType GetDLDataTypeCode() { return dtype; } -static DLDataType GetDLDataTypeFromTypeIndex(const std::type_index &type) { -#define REG_DL_DATA_TYPE(type) \ - { std::type_index(typeid(type)), GetDLDataTypeCode() } - static const std::unordered_map - type_to_dtype_map({ - REG_DL_DATA_TYPE(platform::float16), // NOLINT - REG_DL_DATA_TYPE(float), // NOLINT - REG_DL_DATA_TYPE(double), // NOLINT - REG_DL_DATA_TYPE(int), // NOLINT - REG_DL_DATA_TYPE(int64_t), // NOLINT - REG_DL_DATA_TYPE(bool), // NOLINT - REG_DL_DATA_TYPE(size_t), // NOLINT - REG_DL_DATA_TYPE(int16_t), // NOLINT - REG_DL_DATA_TYPE(uint8_t), // NOLINT - REG_DL_DATA_TYPE(int8_t) // NOLINT - }); +static std::unordered_map CreateDLDataTypeMap() { + static std::unordered_map result; + +#define REG_DL_DATA_TYPE(cpp_type, proto_type) \ + result[static_cast(proto_type)] = GetDLDataTypeCode() + + _ForEachDataType_(REG_DL_DATA_TYPE); +#undef REG_DL_DATA_TYPE + return result; +} + +static DLDataType GetDLDataTypeFromTypeIndex(proto::VarType::Type type) { + static auto type_to_dtype_map = CreateDLDataTypeMap(); static auto type_to_dtype_map_end_it = type_to_dtype_map.end(); - auto it = type_to_dtype_map.find(type); - PADDLE_ENFORCE(it != type_to_dtype_map_end_it, "Unsupported data type %s", - type.name()); + auto it = type_to_dtype_map.find(static_cast(type)); + PADDLE_ENFORCE(it != type_to_dtype_map_end_it, "Unsupported data type %d", + type); return it->second; #undef REG_DL_DATA_TYPE } diff --git a/paddle/fluid/framework/dlpack_tensor_test.cc b/paddle/fluid/framework/dlpack_tensor_test.cc index 938b05635004fcc417f753d5912269333e3ebc01..c0a8e1bcdfa3a54aea061f1a0815fc1405c76d9c 100644 --- a/paddle/fluid/framework/dlpack_tensor_test.cc +++ b/paddle/fluid/framework/dlpack_tensor_test.cc @@ -91,23 +91,11 @@ void TestMainLoop() { } } } +TEST(dlpack, test_all) { +#define TestCallback(cpp_type, proto_type) TestMainLoop() -#define PADDLE_DLPACK_TEST(type) \ - TEST(dlpack, test_##type) { TestMainLoop(); } - -using float16 = platform::float16; -PADDLE_DLPACK_TEST(float16); -PADDLE_DLPACK_TEST(float); -PADDLE_DLPACK_TEST(double); -PADDLE_DLPACK_TEST(int); -PADDLE_DLPACK_TEST(int64_t); -PADDLE_DLPACK_TEST(bool); -PADDLE_DLPACK_TEST(size_t); -PADDLE_DLPACK_TEST(int16_t); -PADDLE_DLPACK_TEST(uint8_t); -PADDLE_DLPACK_TEST(int8_t); - -#undef PADDLE_DLPACK_TEST + _ForEachDataType_(TestCallback); +} } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc index 0c4bd336c5b36eff7d93e4099c96cd3c5990ac17..da9556c6c1f3468208db02f2958ad6ad137c6566 100644 --- a/paddle/fluid/framework/executor.cc +++ b/paddle/fluid/framework/executor.cc @@ -100,7 +100,7 @@ static void DeleteUnusedTensors( continue; } auto* var = scope.FindVar(name); - if (var != nullptr) { + if (var == nullptr) { continue; } @@ -157,9 +157,9 @@ void Executor::Close() { #ifdef PADDLE_WITH_DISTRIBUTE // TODO(typhoonzero): complete message will need to use real trainer_id, // except 0. - ::paddle::operators::distributed::RPCClient::GetInstance< - ::paddle::operators::distributed::GRPCClient>(0) - ->SendComplete(); + auto client = + paddle::operators::distributed::RPCClient::GetInstance(0); + client->SendComplete(); #endif } diff --git a/paddle/fluid/framework/executor_thread_worker.cc b/paddle/fluid/framework/executor_thread_worker.cc index 5fc5aeb662ae4af37321db88661cfc8a4dabe4d3..2eb9e564f87807e88def536ee875ebe0d1e83cd6 100644 --- a/paddle/fluid/framework/executor_thread_worker.cc +++ b/paddle/fluid/framework/executor_thread_worker.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/executor_thread_worker.h" +#include #include "google/protobuf/io/zero_copy_stream_impl.h" #include "google/protobuf/message.h" #include "google/protobuf/text_format.h" @@ -32,6 +33,89 @@ limitations under the License. */ namespace paddle { namespace framework { +#ifdef PADDLE_WITH_PSLIB +int DensePullThread::start() { + _running = true; + _t = std::thread(&DensePullThread::run, this); + return 0; +} + +void DensePullThread::run() { + while (_running) { + _pull_dense_status.resize(0); + for (auto& t : _dense_variable_name) { + if (check_update_param(t.first)) { + auto status = pull_dense(t.first); + _pull_dense_status.emplace_back(std::move(status)); + reset_thread_version(t.first); + } + } + if (_pull_dense_status.size() != 0) { + wait_all(); + } + + usleep(_sleep_time_ms * 1000); + } +} +bool DensePullThread::check_update_param(uint64_t table_id) { + { + std::lock_guard lock(_mutex_for_version); + auto& version = _training_versions[table_id]; + _current_version[table_id] = + *(std::min_element(version.begin(), version.end())); + } + if (_current_version[table_id] - _last_versions[table_id] < _threshold) { + return false; + } + return true; +} + +void DensePullThread::reset_thread_version(uint64_t table_id) { + std::lock_guard lock(_mutex_for_version); + _last_versions[table_id] = _current_version[table_id]; +} +std::future DensePullThread::pull_dense(uint64_t table_id) { + auto& regions = _regions[table_id]; + regions.clear(); + auto& variables = _dense_variable_name[table_id]; + regions.resize(variables.size()); + + for (auto i = 0u; i < variables.size(); ++i) { + auto& t = variables[i]; + Variable* var = _root_scope->FindVar(t); + LoDTensor* tensor = var->GetMutable(); + + float* w = tensor->data(); + paddle::ps::Region reg(w, tensor->numel()); + regions[i] = std::move(reg); + } + return _ps_client->pull_dense(regions.data(), regions.size(), table_id); +} + +void DensePullThread::wait_all() { + for (auto& t : _pull_dense_status) { + t.wait(); + auto status = t.get(); + if (status != 0) { + LOG(WARNING) << "pull dense failed times:" << ++_pull_dense_fail_times; + } + } + + if (_pull_dense_fail_times > 20) { + LOG(FATAL) << "pull dense failed times more than 20 times"; + exit(-1); + } + + _pull_dense_status.resize(0); +} + +void DensePullThread::increase_thread_version(int thread_id, + uint64_t table_id) { + std::lock_guard lock(_mutex_for_version); + _training_versions[table_id][thread_id]++; +} +#endif + void ExecutorThreadWorker::CreateThreadOperators(const ProgramDesc& program) { auto& block = program.Block(0); op_names_.clear(); @@ -139,39 +223,19 @@ void print_lod_tensor(std::string var_name, const LoDTensor& lod_tensor) { std::cout << sstream.str() << std::endl; } -void print_fetch_var(Scope* scope, std::string var_name) { - const LoDTensor& tensor = scope->FindVar(var_name)->Get(); - - if (std::type_index(tensor.type()) == - std::type_index(typeid(platform::float16))) { - print_lod_tensor(var_name, tensor); - } else if (std::type_index(tensor.type()) == std::type_index(typeid(float))) { - print_lod_tensor(var_name, tensor); - } else if (std::type_index(tensor.type()) == - std::type_index(typeid(double))) { - print_lod_tensor(var_name, tensor); - } else if (std::type_index(tensor.type()) == std::type_index(typeid(int))) { - print_lod_tensor(var_name, tensor); - } else if (std::type_index(tensor.type()) == - std::type_index(typeid(int64_t))) { - print_lod_tensor(var_name, tensor); - } else if (std::type_index(tensor.type()) == std::type_index(typeid(bool))) { - print_lod_tensor(var_name, tensor); - } else if (std::type_index(tensor.type()) == - std::type_index(typeid(uint8_t))) { - print_lod_tensor(var_name, tensor); - } else if (std::type_index(tensor.type()) == - std::type_index(typeid(int16_t))) { - print_lod_tensor(var_name, tensor); - } else if (std::type_index(tensor.type()) == - std::type_index(typeid(int8_t))) { - print_lod_tensor(var_name, tensor); - } else { - VLOG(1) << "print_fetch_var: unrecognized data type:" - << tensor.type().name(); - } +static void print_fetch_var(Scope* scope, const std::string& var_name) { + auto& tensor = scope->FindVar(var_name)->Get(); - return; +#define PrintLoDTensorCallback(cpp_type, proto_type) \ + do { \ + if (tensor.type() == proto_type) { \ + print_lod_tensor(var_name, tensor); \ + return; \ + } \ + } while (0) + + _ForEachDataType_(PrintLoDTensorCallback); + VLOG(1) << "print_fetch_var: unrecognized data type:" << tensor.type(); } void ExecutorThreadWorker::TrainFiles() { @@ -222,5 +286,358 @@ void ExecutorThreadWorker::SetRootScope(Scope* g_scope) { root_scope_ = g_scope; } +#ifdef PADDLE_WITH_PSLIB +// AsyncExecutor +void AsyncExecutorThreadWorker::TrainFiles() { + SetDevice(); + + int fetch_var_num = fetch_var_names_.size(); + fetch_values_.clear(); + fetch_values_.resize(fetch_var_num); + + thread_reader_->Start(); + + int cur_batch; + int batch_cnt = 0; + while ((cur_batch = thread_reader_->Next()) > 0) { + // executor run here + TrainOneNetwork(); + + ++batch_cnt; + thread_scope_->DropKids(); + + if (debug_ == false || thread_id_ != 0) { + continue; + } + + for (int i = 0; i < fetch_var_num; ++i) { + print_fetch_var(thread_scope_, fetch_var_names_[i]); + } // end for (int i = 0...) + } // end while () +} + +void AsyncExecutorThreadWorker::SetPSlibPtr( + std::shared_ptr pslib_ptr) { + _pslib_ptr = pslib_ptr; +} +void AsyncExecutorThreadWorker::SetPullDenseThread( + std::shared_ptr dpt) { + _pull_dense_thread = dpt; +} +void AsyncExecutorThreadWorker::TrainOneNetwork() { + PrepareParams(); + + for (auto& op : ops_) { + if (op->Type().find("sgd") != std::string::npos) { + continue; + } + bool need_skip = false; + for (auto t = 0u; t < _param_config->skip_op.size(); ++t) { + if (op->Type().find(_param_config->skip_op[t]) != std::string::npos) { + need_skip = true; + break; + } + } + if (!need_skip) { + op->Run(*thread_scope_, place_); + } + } + UpdateParams(); +} + +void AsyncExecutorThreadWorker::SetParamConfig( + AsyncWorkerParamConfig* param_config) { + _param_config = param_config; +} + +void AsyncExecutorThreadWorker::PrepareParams() { + for (auto table_id : _param_config->sparse_table_id) { + PullSparse(table_id); + for (auto& t : _pull_sparse_status) { + t.wait(); + auto status = t.get(); + if (status != 0) { + LOG(ERROR) << "pull sparse failed, status[" << status << "]"; + exit(-1); + } + } + } + _pull_sparse_status.resize(0); + + for (auto table_id : _param_config->sparse_table_id) { + FillSparse(table_id); + } +} + +void AsyncExecutorThreadWorker::UpdateParams() { + for (auto i : _param_config->sparse_table_id) { + PushSparse(i); + } + for (auto i : _param_config->dense_table_id) { + PushDense(i); + } + int32_t tmp_push_dense_wait_times = -1; + int32_t tmp_push_sparse_wait_times = -1; + static uint32_t push_dense_wait_times = + static_cast(tmp_push_dense_wait_times); + static uint32_t push_sparse_wait_times = + static_cast(tmp_push_sparse_wait_times); + + if (_push_dense_status.size() >= push_dense_wait_times) { + for (auto& t : _push_dense_status) { + t.wait(); + } + _push_dense_status.resize(0); + } + if (tmp_push_dense_wait_times == -1) { + _push_dense_status.resize(0); + } + if (_push_sparse_status.size() >= push_sparse_wait_times) { + for (auto& t : _push_sparse_status) { + t.wait(); + } + _push_sparse_status.resize(0); + } + if (tmp_push_sparse_wait_times == -1) { + _push_sparse_status.resize(0); + } + for (auto dense_table_id : _param_config->dense_table_id) { + _pull_dense_thread->increase_thread_version(thread_id_, dense_table_id); + } +} + +void AsyncExecutorThreadWorker::PushDense(int table_id) { + std::vector regions; + for (auto& t : _param_config->dense_gradient_variable_name[table_id]) { + Variable* var = thread_scope_->FindVar(t); + CHECK(var != nullptr) << "var[" << t << "] not found"; + LoDTensor* tensor = var->GetMutable(); + int count = tensor->numel(); + float* g = tensor->data(); + paddle::ps::Region reg(g, count); + regions.emplace_back(std::move(reg)); + } + + auto status = _pslib_ptr->_worker_ptr->push_dense(regions.data(), + regions.size(), table_id); + _push_dense_status.push_back(std::move(status)); +} + +void AsyncExecutorThreadWorker::PullSparse(int table_id) { + auto& features = _features[table_id]; + auto& feature_value = _feature_value[table_id]; + auto fea_dim = _param_config->fea_dim; + // slot id starts from 1 + features.clear(); + features.resize(0); + features.reserve(MAX_FEASIGN_NUM); + const std::vector& feed_vec = thread_reader_->GetUseSlotAlias(); + // slot_idx = 0 is label TODO + for (auto slot_idx = 1u; slot_idx < feed_vec.size(); ++slot_idx) { + Variable* var = thread_scope_->FindVar(feed_vec[slot_idx]); + LoDTensor* tensor = var->GetMutable(); + int64_t* ids = tensor->data(); + int len = tensor->numel(); + for (auto i = 0u; i < len; ++i) { + // todo(colourful-tree): current trick - filter feasign=use_slot_mod( + // bug: datafeed fill use_slot_mod for empty slot) + if (ids[i] == 0u) { + continue; + } + features.push_back(static_cast(ids[i])); + } + } + check_pull_push_memory(features, &feature_value, fea_dim); + + std::vector pull_feature_value; + for (auto i = 0u; i < features.size(); ++i) { + pull_feature_value.push_back(feature_value[i].data()); + } + + auto status = _pslib_ptr->_worker_ptr->pull_sparse( + pull_feature_value.data(), table_id, features.data(), features.size()); + _pull_sparse_status.push_back(std::move(status)); + + auto& push_g = _feature_push_value[table_id]; + check_pull_push_memory(features, &push_g, fea_dim); + + collect_feasign_info(table_id); +} + +void AsyncExecutorThreadWorker::FillSparse(int table_id) { + auto slot_dim = _param_config->slot_dim; + auto fea_dim = _param_config->fea_dim; + auto& features = _features[table_id]; + auto& fea_value = _feature_value[table_id]; + + CHECK(features.size() > 0) << "feature size check failed"; + + auto fea_idx = 0u; + + std::vector init_value(fea_dim); + + const std::vector& feed_vec = thread_reader_->GetUseSlotAlias(); + // slot_idx = 0 is label TODO + for (auto slot_idx = 1u; slot_idx < feed_vec.size(); ++slot_idx) { + Variable* var = thread_scope_->FindVar(feed_vec[slot_idx]); + LoDTensor* tensor = var->GetMutable(); + int64_t* ids = tensor->data(); + int len = tensor->numel(); + Variable* var_emb = thread_scope_->FindVar( + _param_config->slot_input_vec[table_id][slot_idx - 1]); + LoDTensor* tensor_emb = var_emb->GetMutable(); + float* ptr = + tensor_emb->mutable_data({len, slot_dim}, platform::CPUPlace()); + memset(ptr, 0, sizeof(float) * len * slot_dim); + auto& tensor_lod = tensor->lod()[0]; + + LoD data_lod{tensor_lod}; + tensor_emb->set_lod(data_lod); + + for (auto index = 0u; index < len; ++index) { + if (ids[index] == 0u) { + memcpy(ptr + slot_dim * index, init_value.data() + 2, + sizeof(float) * slot_dim); + continue; + } + memcpy(ptr + slot_dim * index, fea_value[fea_idx].data() + 2, + sizeof(float) * slot_dim); + fea_idx++; + } + } +} + +void AsyncExecutorThreadWorker::PushSparse(int table_id) { + auto slot_dim = _param_config->slot_dim; + auto fea_dim = _param_config->fea_dim; + auto& features = _features[table_id]; + auto& push_g = _feature_push_value[table_id]; + check_pull_push_memory(features, &push_g, fea_dim); + CHECK(push_g.size() == features.size() + 1) + << "push_g size:" << push_g.size() + << " features size:" << features.size(); + uint64_t fea_idx = 0u; + auto& fea_info = _fea_info[table_id]; + int offset = 2; + const std::vector& feed_vec = thread_reader_->GetUseSlotAlias(); + // slot_idx = 0 is label + for (auto slot_idx = 1u; slot_idx < feed_vec.size(); ++slot_idx) { + if (_param_config->slot_alias_to_table.find(feed_vec[slot_idx]) == + _param_config->slot_alias_to_table.end()) { + LOG(ERROR) << "ERROR slot_idx:" << slot_idx + << " name:" << feed_vec[slot_idx]; + } else if (_param_config->slot_alias_to_table[feed_vec[slot_idx]] != + table_id) { + continue; + } + Variable* g_var = thread_scope_->FindVar( + _param_config->gradient_var[table_id][slot_idx - 1]); + CHECK(g_var != nullptr) + << "var[" << _param_config->gradient_var[table_id][slot_idx - 1] + << "] not found"; + LoDTensor* g_tensor = g_var->GetMutable(); + if (g_tensor == NULL) { + LOG(ERROR) << "var[" + << _param_config->gradient_var[table_id][slot_idx - 1] + << "] not found"; + exit(-1); + } + float* g = g_tensor->data(); + + Variable* var = thread_scope_->FindVar(feed_vec[slot_idx]); + CHECK(var != nullptr) << "var[" << feed_vec[slot_idx] << "] not found"; + LoDTensor* tensor = var->GetMutable(); + if (tensor == NULL) { + LOG(ERROR) << "var[" << feed_vec[slot_idx] << "] not found"; + exit(-1); + } + int len = tensor->numel(); + CHECK(slot_dim * len == g_tensor->numel()) + << "len:" << len << " g_numel:" << g_tensor->numel(); + CHECK(len == tensor->numel()) << "len:" << len + << "t_numel:" << tensor->numel(); + int64_t* ids = tensor->data(); + for (auto id_idx = 0u; id_idx < len; ++id_idx) { + if (ids[id_idx] == 0) { + g += slot_dim; + continue; + } + memcpy(push_g[fea_idx].data() + offset, g, sizeof(float) * slot_dim); + push_g[fea_idx][0] = 1.0f; + CHECK(fea_idx < fea_info.size()) << "fea_idx:" << fea_idx + << " size:" << fea_info.size(); + push_g[fea_idx][1] = static_cast(fea_info[fea_idx].label); + g += slot_dim; + fea_idx++; + } + } + CHECK(fea_idx == features.size()) << "fea_idx:" << fea_idx + << " features size:" << features.size(); + CHECK_GT(features.size(), 0); + + std::vector push_g_vec; + for (auto i = 0u; i < features.size(); ++i) { + push_g_vec.push_back(push_g[i].data()); + } + auto status = _pslib_ptr->_worker_ptr->push_sparse( + table_id, features.data(), (const float**)push_g_vec.data(), + features.size()); + _push_sparse_status.push_back(std::move(status)); +} + +void AsyncExecutorThreadWorker::collect_feasign_info(int table_id) { + auto& fea_info = _fea_info[table_id]; + auto& feature = _features[table_id]; + fea_info.resize(feature.size()); + const std::vector& feed_vec = thread_reader_->GetUseSlotAlias(); + Variable* var = thread_scope_->FindVar(feed_vec[0]); + LoDTensor* tensor = var->GetMutable(); + int64_t* label = tensor->data(); + + int global_index = 0; + for (auto slot_idx = 1u; slot_idx < feed_vec.size(); ++slot_idx) { + Variable* var = thread_scope_->FindVar(feed_vec[slot_idx]); + LoDTensor* tensor = var->GetMutable(); + int64_t* ids = tensor->data(); + + int fea_idx = 0; + for (auto ins_idx = 1u; ins_idx < tensor->lod()[0].size(); ++ins_idx) { + for (; fea_idx < tensor->lod()[0][ins_idx]; ++fea_idx) { + if (ids[fea_idx] == 0u) { + continue; + } + FeasignInfo info{slot_idx, ins_idx, label[ins_idx - 1]}; + + fea_info[global_index++] = std::move(info); + } + } + } + CHECK(global_index == feature.size()) + << "expect fea info size:" << feature.size() << " real:" << global_index; +} + +void AsyncExecutorThreadWorker::check_pull_push_memory( + const std::vector& features, + std::vector>* push_g, int dim) { + push_g->resize(features.size() + 1); + for (auto& t : *push_g) { + t.resize(dim); + } +} + +void AsyncExecutorThreadWorker::check_pull_push_memory( + const std::vector& features, std::vector* push_g, + int dim) { + if (features.size() > push_g->size()) { + push_g->reserve(features.size() + 1); + auto size = features.size() - push_g->size() + 1; + for (auto i = 0u; i < size; ++i) { + float* ptr = new float[dim]; + push_g->push_back(ptr); + } + } +} +#endif + } // einit_modelnd namespace framework } // end namespace paddle diff --git a/paddle/fluid/framework/executor_thread_worker.h b/paddle/fluid/framework/executor_thread_worker.h index 13ec2442c46459116320236bf98f23c91340f389..30b81ad88035eacc7a8efbe6d20f03d362122003 100644 --- a/paddle/fluid/framework/executor_thread_worker.h +++ b/paddle/fluid/framework/executor_thread_worker.h @@ -25,16 +25,119 @@ limitations under the License. */ #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" +#ifdef PADDLE_WITH_PSLIB +#include +#endif namespace paddle { namespace framework { + void CreateTensor(Variable* var, proto::VarType::Type var_type); +#ifdef PADDLE_WITH_PSLIB +static const uint32_t MAX_FEASIGN_NUM = 1000 * 100 * 100; + +struct AsyncWorkerParamConfig { + int slot_dim; + int fea_dim; + int32_t tmp_push_dense_wait_times; + int32_t tmp_push_sparse_wait_times; + + std::vector skip_op; + + std::map> dense_variable_name; + std::map> dense_gradient_variable_name; + std::vector dense_table_id; + // fea_dim for each dense table + std::vector dense_table_size; + std::vector sparse_table_id; + std::map> slot_input_vec; + std::map> gradient_var; + std::map slot_alias_to_table; +}; + +struct DensePullThreadParam { + std::shared_ptr ps_client; + int threshold; + int training_thread_num; + Scope* root_scope; + std::map>* dense_params; + int sleep_time_ms = 2; +}; + +class DensePullThread { + public: + explicit DensePullThread(const DensePullThreadParam& param) + : _running(false) { + _ps_client = param.ps_client; + _threshold = param.threshold; + _thread_num = param.training_thread_num; + _root_scope = param.root_scope; + _sleep_time_ms = param.sleep_time_ms; + + for (auto& t : *param.dense_params) { + _dense_variable_name[t.first].insert(_dense_variable_name[t.first].end(), + t.second.begin(), t.second.end()); + _training_versions[t.first].resize(_thread_num, 0); + _last_versions[t.first] = 0; + _current_version[t.first] = 0; + } + } + + int start(); + + void stop() { + if (_running) { + _running = false; + _t.join(); + } + } + + void increase_thread_version(int thread_id, uint64_t table_id); + void reset_thread_version(uint64_t table_id); + std::future pull_dense(uint64_t table_id); + void pull_dense2(uint64_t table_id); + void wait_all(); + + private: + void run(); + bool check_update_param(uint64_t table_id); + + private: + std::shared_ptr _ps_client; + int _thread_num; + int _threshold; + int _sleep_time_ms; + Scope* _root_scope; + bool _running; + + std::map _last_versions; + std::map _current_version; + std::mutex _mutex_for_version; + std::map> _training_versions; + std::map> _dense_variable_name; + + std::thread _t; + + std::vector<::std::future> _pull_dense_status; + + std::map> _regions; + uint32_t _pull_dense_fail_times = 0; + + std::vector _base_norm_param; + std::vector _mean; + std::vector _scale; + float _squared_sum_epsilon = 1e-4; + std::mutex _mutex_for_mean_scale; + + float _total_batch_num = 0; +}; +#endif class ExecutorThreadWorker { public: ExecutorThreadWorker() : thread_id_(-1), root_scope_(NULL), thread_scope_(NULL), debug_(false) {} - ~ExecutorThreadWorker() {} + virtual ~ExecutorThreadWorker() {} void CreateThreadResource(const framework::ProgramDesc& program, const paddle::platform::Place& place); @@ -51,9 +154,15 @@ class ExecutorThreadWorker { // set data feed declared in executor void SetDataFeed(const std::shared_ptr& datafeed); // A multi-thread training function - void TrainFiles(); + virtual void TrainFiles(); // set fetch variable names from python interface assigned by users void SetFetchVarNames(const std::vector& fetch_var_names); +#ifdef PADDLE_WITH_PSLIB + virtual void SetPSlibPtr( + std::shared_ptr pslib_ptr) {} + virtual void SetPullDenseThread(std::shared_ptr dpt) {} + virtual void SetParamConfig(AsyncWorkerParamConfig* param_config) {} +#endif private: void CreateThreadScope(const framework::ProgramDesc& program); @@ -77,12 +186,58 @@ class ExecutorThreadWorker { Scope* root_scope_; // a thread scope, father scope is global score which is shared Scope* thread_scope_; - - private: std::vector fetch_var_names_; std::vector> fetch_values_; bool debug_; }; +#ifdef PADDLE_WITH_PSLIB +class AsyncExecutorThreadWorker : public ExecutorThreadWorker { + public: + AsyncExecutorThreadWorker() {} + virtual ~AsyncExecutorThreadWorker() {} + void SetPSlibPtr(std::shared_ptr pslib_ptr); + void SetPullDenseThread(std::shared_ptr dpt); + void SetParamConfig(AsyncWorkerParamConfig* param_config); + void TrainFiles(); + void TrainOneNetwork(); + void PrepareParams(); + void UpdateParams(); + void PullSparse(int table_id); + void FillSparse(int table_id); + void PushSparse(int table_id); + void PushDense(int table_id); + + void check_pull_push_memory(const std::vector& features, + std::vector* push_g, int dim); + void check_pull_push_memory(const std::vector& features, + std::vector>* push_g, int dim); + void collect_feasign_info(int table_id); + + private: + struct FeasignInfo { + uint32_t slot; + uint32_t ins; + int64_t label; + }; + + std::map> _features; + std::map> _fea_info; + std::map>> _feature_value; + std::map>> _feature_push_value; + + std::shared_ptr _pslib_ptr; + + std::shared_ptr _pull_dense_thread; + + std::vector<::std::future> _pull_sparse_status; + std::vector<::std::future> _pull_dense_status; + std::vector<::std::future> _push_sparse_status; + std::vector<::std::future> _push_dense_status; + + AsyncWorkerParamConfig* _param_config; +}; +#endif + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index 883575e41db2d883e9b969978419a10ffc58b97e..b7f7e2ee8ef590c0d0d8307de4400a8ce8ad4e7d 100644 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -42,6 +42,9 @@ pass_library(multi_batch_merge_pass base) pass_library(conv_bn_fuse_pass inference) pass_library(seqconv_eltadd_relu_fuse_pass inference) pass_library(is_test_pass base) +pass_library(conv_elementwise_add_act_fuse_pass inference) +pass_library(conv_elementwise_add2_act_fuse_pass inference) +pass_library(conv_elementwise_add_fuse_pass inference) if(WITH_MKLDNN) pass_library(mkldnn_placement_pass base) pass_library(depthwise_conv_mkldnn_pass base) diff --git a/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse.cc b/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse.cc new file mode 100644 index 0000000000000000000000000000000000000000..6e9905b7ecdba653bb4d8a4aa82234ffba5a9528 --- /dev/null +++ b/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse.cc @@ -0,0 +1,106 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include "paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.h" + +namespace paddle { +namespace framework { +namespace ir { + +#define GET_IR_NODE(node__) GET_IR_NODE_FROM_SUBGRAPH(node__, node__, pattern); +#define GET_NODES \ + GET_IR_NODE(conv_op); \ + GET_IR_NODE(conv_out); \ + GET_IR_NODE(conv_filter); \ + GET_IR_NODE(elementwise_add_op); \ + GET_IR_NODE(elementwise_add_in_y); \ + GET_IR_NODE(elementwise_add_out); \ + GET_IR_NODE(elementwise_add_op_1); \ + GET_IR_NODE(elementwise_add_in_y_1); \ + GET_IR_NODE(elementwise_add_out_1); \ + GET_IR_NODE(act_op); \ + GET_IR_NODE(act_out); + +// Inherient the basic infomation from `base_desc`, and modify some fields. +framework::proto::OpDesc PrepareOpDesc( + const framework::proto::OpDesc& base_desc, const std::string& bias, + const std::string& bias1, const std::string& activation, + const std::string& output) { + auto proto = base_desc; + framework::OpDesc desc(proto, nullptr); + desc.SetInput("Bias", {bias}); + desc.SetInput("ResidualData", {bias1}); + desc.SetAttr("activation", activation); + desc.SetOutput("Output", {output}); + desc.SetAttr("is_test", true); + desc.SetAttr("use_cudnn", false); + + return *desc.Proto(); +} + +std::unique_ptr ConvElementwiseAddActFusePass::ApplyImpl( + std::unique_ptr graph) const { + const std::string pattern_name = "conv_elementwise_add_act_fuse"; + FusePassBase::Init(pattern_name, graph.get()); + + GraphPatternDetector gpd; + auto* x = gpd.mutable_pattern()->NewNode("x")->AsInput()->assert_is_op_input( + "conv2d", "Input"); + + patterns::ConvElementwiseaddAct pattern(gpd.mutable_pattern(), pattern_name); + pattern(x); + + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + GET_NODES; + + auto base_op_desc = *conv_op->Op()->Proto(); + std::string bias_name = elementwise_add_in_y->Name(); + std::string bias1_name = elementwise_add_in_y_1->Name(); + std::string act_op_type = act_op->Op()->Type(); + std::string act_op_out = act_out->Name(); + + auto new_op_proto = PrepareOpDesc(base_op_desc, bias_name, bias1_name, + act_op_type, act_op_out); + framework::OpDesc new_op_desc(new_op_proto, nullptr); + + // Create a new node for the fused op. + auto new_conv_op = graph->CreateOpNode(&new_op_desc); + + // Link inputs and outputs. + PADDLE_ENFORCE(subgraph.count(x)); + auto* conv_in_node = subgraph.at(x); + + IR_NODE_LINK_TO(conv_in_node, new_conv_op); // Input + IR_NODE_LINK_TO(conv_filter, new_conv_op); // Filter + IR_NODE_LINK_TO(elementwise_add_in_y, new_conv_op); // Bias + IR_NODE_LINK_TO(elementwise_add_in_y_1, new_conv_op); // ResidualData + IR_NODE_LINK_TO(new_conv_op, act_out); // Output + + // Delete the unneeded nodes. + GraphSafeRemoveNodes(graph.get(), + {conv_op, elementwise_add_op, elementwise_add_op_1, + elementwise_add_out}); + }; + gpd(graph.get(), handler); + return graph; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(conv_elementwise_add2_act_fuse_pass, + paddle::framework::ir::ConvElementwiseAdd2ActFusePass); diff --git a/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc b/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..23f343f631628b432d91d4504019895ed4bac4a5 --- /dev/null +++ b/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.cc @@ -0,0 +1,105 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.h" +#include + +namespace paddle { +namespace framework { +namespace ir { + +#define GET_IR_NODE(node__) GET_IR_NODE_FROM_SUBGRAPH(node__, node__, pattern); +#define GET_NODES \ + GET_IR_NODE(conv_op); \ + GET_IR_NODE(conv_out); \ + GET_IR_NODE(conv_filter); \ + GET_IR_NODE(elementwise_add_op); \ + GET_IR_NODE(elementwise_add_in_y); \ + GET_IR_NODE(elementwise_add_out); \ + GET_IR_NODE(elementwise_add_op_1); \ + GET_IR_NODE(elementwise_add_in_y_1); \ + GET_IR_NODE(elementwise_add_out_1); \ + GET_IR_NODE(act_op); \ + GET_IR_NODE(act_out); + +// Inherient the basic infomation from `base_desc`, and modify some fields. +framework::proto::OpDesc PrepareOpDesc( + const framework::proto::OpDesc& base_desc, const std::string& bias, + const std::string& bias1, const std::string& activation, + const std::string& output) { + auto proto = base_desc; + framework::OpDesc desc(proto, nullptr); + desc.SetInput("Bias", {bias}); + desc.SetInput("ResidualData", {bias1}); + desc.SetAttr("activation", activation); + desc.SetOutput("Output", {output}); + desc.SetAttr("is_test", true); + + return *desc.Proto(); +} + +std::unique_ptr ConvElementwiseAdd2ActFusePass::ApplyImpl( + std::unique_ptr graph) const { + const std::string pattern_name = "conv_elementwise_add_act_fuse"; + FusePassBase::Init(pattern_name, graph.get()); + + GraphPatternDetector gpd; + auto* x = gpd.mutable_pattern()->NewNode("x")->AsInput()->assert_is_op_input( + "conv2d", "Input"); + + patterns::ConvElementwiseadd2Act pattern(gpd.mutable_pattern(), pattern_name); + pattern(x); + + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + GET_NODES; + + auto base_op_desc = *conv_op->Op()->Proto(); + std::string bias_name = elementwise_add_in_y->Name(); + std::string bias1_name = elementwise_add_in_y_1->Name(); + std::string act_op_type = act_op->Op()->Type(); + std::string act_op_out = act_out->Name(); + + auto new_op_proto = PrepareOpDesc(base_op_desc, bias_name, bias1_name, + act_op_type, act_op_out); + framework::OpDesc new_op_desc(new_op_proto, nullptr); + + // Create a new node for the fused op. + graph->CreateOpNode(&new_op_desc); + + // Link inputs and outputs. + PADDLE_ENFORCE(subgraph.count(x)); + auto* conv_in_node = subgraph.at(x); + + IR_NODE_LINK_TO(conv_in_node, conv_op); // Input + IR_NODE_LINK_TO(conv_filter, conv_op); // Filter + IR_NODE_LINK_TO(conv_op, conv_out); // Output + IR_NODE_LINK_TO(elementwise_add_in_y, conv_op); // Bias + IR_NODE_LINK_TO(elementwise_add_in_y_1, conv_op); // Bias + + // Delete the unneeded nodes. + GraphSafeRemoveNodes(graph.get(), + {conv_op, elementwise_add_op, elementwise_add_op_1, + elementwise_add_out}); + }; + gpd(graph.get(), handler); + return graph; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(conv_elementwise_add2_act_fuse_pass, + paddle::framework::ir::ConvElementwiseAdd2ActFusePass); diff --git a/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.h b/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..3b40a5a92665c07bc2b66e6a96721f573d40393f --- /dev/null +++ b/paddle/fluid/framework/ir/conv_elementwise_add2_act_fuse_pass.h @@ -0,0 +1,33 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" + +namespace paddle { +namespace framework { +namespace ir { + +class ConvElementwiseAdd2ActFusePass : public FusePassBase { + public: + virtual ~ConvElementwiseAdd2ActFusePass() {} + + protected: + std::unique_ptr ApplyImpl(std::unique_ptr graph) const; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.cc b/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..fe3b4fca79f372d570634a3c182a9ec3cf5522e1 --- /dev/null +++ b/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.cc @@ -0,0 +1,104 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.h" +#include +#include "paddle/fluid/framework/ir/graph_viz_pass.h" + +namespace paddle { +namespace framework { +namespace ir { + +#define GET_IR_NODE(node__) GET_IR_NODE_FROM_SUBGRAPH(node__, node__, pattern); +#define GET_NODES \ + GET_IR_NODE(conv_op); \ + GET_IR_NODE(conv_out); \ + GET_IR_NODE(conv_filter); \ + GET_IR_NODE(elementwise_add_op); \ + GET_IR_NODE(elementwise_add_in_y); \ + GET_IR_NODE(elementwise_add_out); \ + GET_IR_NODE(act_op); \ + GET_IR_NODE(act_out); + +// Inherient the basic infomation from `base_desc`, and modify some fields. +framework::proto::OpDesc PrepareOpDesc( + const framework::proto::OpDesc& base_desc, const std::string& bias, + const std::string& activation, const std::string& output) { + auto proto = base_desc; + framework::OpDesc desc(proto, nullptr); + desc.SetType("conv2d_fusion"); + desc.SetInput("Bias", {bias}); + desc.SetInput("ResidualData", {}); + desc.SetAttr("activation", activation); + desc.SetOutput("Output", {output}); + desc.SetAttr("is_test", true); + desc.SetAttr("use_cudnn", false); + desc.Flush(); + return *desc.Proto(); +} + +std::unique_ptr ConvElementwiseAddActFusePass::ApplyImpl( + std::unique_ptr graph) const { + const std::string pattern_name = "conv_elementwise_add_act_fuse"; + FusePassBase::Init(pattern_name, graph.get()); + + GraphPatternDetector gpd; + auto* x = gpd.mutable_pattern() + ->NewNode("x") + ->assert_is_op_input("conv2d", "Input") + ->AsInput(); + + patterns::ConvElementwiseaddAct pattern(gpd.mutable_pattern(), pattern_name); + pattern(x); + + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + GET_NODES; + + auto base_op_desc = *conv_op->Op()->Proto(); + std::string bias_name = elementwise_add_in_y->Name(); + std::string act_op_type = act_op->Op()->Type(); + std::string act_op_out = act_out->Name(); + + auto new_op_proto = + PrepareOpDesc(base_op_desc, bias_name, act_op_type, act_op_out); + framework::OpDesc new_op_desc(new_op_proto, nullptr); + + // Create a new node for the fused op. + auto* new_conv_op = graph->CreateOpNode(&new_op_desc); + + // Link inputs and outputs. + PADDLE_ENFORCE(subgraph.count(x)); + auto* conv_in_node = subgraph.at(x); + + IR_NODE_LINK_TO(conv_in_node, new_conv_op); // Input + IR_NODE_LINK_TO(conv_filter, new_conv_op); // Filter + IR_NODE_LINK_TO(elementwise_add_in_y, new_conv_op); // Bias + IR_NODE_LINK_TO(new_conv_op, act_out); // Output + + // Delete the unneeded nodes. + GraphSafeRemoveNodes(graph.get(), {conv_op, conv_out, elementwise_add_op, + elementwise_add_out, act_op}); + }; + + gpd(graph.get(), handler); + return graph; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(conv_elementwise_add_act_fuse_pass, + paddle::framework::ir::ConvElementwiseAddActFusePass); diff --git a/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.h b/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..ac69aa6458fc8c19b670dea2af1251c44dc353a8 --- /dev/null +++ b/paddle/fluid/framework/ir/conv_elementwise_add_act_fuse_pass.h @@ -0,0 +1,33 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" + +namespace paddle { +namespace framework { +namespace ir { + +class ConvElementwiseAddActFusePass : public FusePassBase { + public: + virtual ~ConvElementwiseAddActFusePass() {} + + protected: + std::unique_ptr ApplyImpl(std::unique_ptr graph) const; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.cc b/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.cc new file mode 100644 index 0000000000000000000000000000000000000000..476c9dbc353f865916d0065bbce653d7b7204dce --- /dev/null +++ b/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.cc @@ -0,0 +1,91 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include "paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.h" +#include "paddle/fluid/framework/ir/graph_viz_pass.h" + +namespace paddle { +namespace framework { +namespace ir { + +#define GET_IR_NODE(node__) GET_IR_NODE_FROM_SUBGRAPH(node__, node__, pattern); +#define GET_NODES \ + GET_IR_NODE(conv_op); \ + GET_IR_NODE(conv_out); \ + GET_IR_NODE(conv_filter); \ + GET_IR_NODE(elementwise_add_op); \ + GET_IR_NODE(elementwise_add_in_y); \ + GET_IR_NODE(elementwise_add_out); + +std::unique_ptr ConvElementwiseAddFusePass::ApplyImpl( + std::unique_ptr graph) const { + const std::string pattern_name = "conv_elementwise_add_fuse"; + FusePassBase::Init(pattern_name, graph.get()); + + GraphPatternDetector gpd; + auto* x = gpd.mutable_pattern() + ->NewNode("x") + ->assert_is_op_input("conv2d", "Input") + ->AsInput(); + + patterns::ConvElementwiseadd pattern(gpd.mutable_pattern(), pattern_name); + pattern(x); + + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + GET_NODES; + + auto base_op_desc = *conv_op->Op()->Proto(); + std::string bias_name = elementwise_add_in_y->Name(); + std::string output_name = elementwise_add_out->Name(); + + std::string act_type = "identity"; + framework::OpDesc new_op_desc(base_op_desc, nullptr); + new_op_desc.SetType("conv2d_fusion"); + new_op_desc.SetInput("Bias", {bias_name}); + new_op_desc.SetInput("ResidualData", {}); + new_op_desc.SetAttr("activation", act_type); + new_op_desc.SetOutput("Output", {output_name}); + new_op_desc.SetAttr("is_test", true); + new_op_desc.SetAttr("use_cudnn", false); + new_op_desc.Flush(); + + // Create a new node for the fused op. + auto* new_conv_op = graph->CreateOpNode(&new_op_desc); + + // Link inputs and outputs. + PADDLE_ENFORCE(subgraph.count(x)); + auto* conv_in_node = subgraph.at(x); + + IR_NODE_LINK_TO(conv_in_node, new_conv_op); // Input + IR_NODE_LINK_TO(conv_filter, new_conv_op); // Filter + IR_NODE_LINK_TO(elementwise_add_in_y, new_conv_op); // Bias + IR_NODE_LINK_TO(new_conv_op, elementwise_add_out); // Output + + // Delete the unneeded nodes. + GraphSafeRemoveNodes(graph.get(), {conv_op, conv_out, elementwise_add_op}); + }; + + gpd(graph.get(), handler); + return graph; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(conv_elementwise_add_fuse_pass, + paddle::framework::ir::ConvElementwiseAddFusePass); diff --git a/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.h b/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.h new file mode 100644 index 0000000000000000000000000000000000000000..f234603f5856a9238164f7fb0e5cc81ea9b7ed60 --- /dev/null +++ b/paddle/fluid/framework/ir/conv_elementwise_add_fuse_pass.h @@ -0,0 +1,33 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" + +namespace paddle { +namespace framework { +namespace ir { + +class ConvElementwiseAddFusePass : public FusePassBase { + public: + virtual ~ConvElementwiseAddFusePass() {} + + protected: + std::unique_ptr ApplyImpl(std::unique_ptr graph) const; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/graph.cc b/paddle/fluid/framework/ir/graph.cc index 8679118fe28b1c68aea30caf711441823b5255c0..8670dcfed7e40473e06cd12cecc1157dd4f54aa0 100644 --- a/paddle/fluid/framework/ir/graph.cc +++ b/paddle/fluid/framework/ir/graph.cc @@ -162,7 +162,10 @@ void Graph::ResolveHazard( (*it_new)->inputs.empty() ? nullptr : (*it_new)->inputs[0]; const auto &read_ops = (*it_old)->outputs; - PADDLE_ENFORCE(write_op, "The write_op should not be empty."); + PADDLE_ENFORCE( + write_op, + string::Sprintf("The write_op of var %s should not be empty.", + (*it_new)->Name())); // Add write after write dependence ir::Node *upstream_op = diff --git a/paddle/fluid/framework/ir/graph_helper.cc b/paddle/fluid/framework/ir/graph_helper.cc index d2d28793c4320e3664bb69c65dab4fec830e4d02..d99f856d8f46ea760ce07533446ce3bec95d7d27 100644 --- a/paddle/fluid/framework/ir/graph_helper.cc +++ b/paddle/fluid/framework/ir/graph_helper.cc @@ -18,6 +18,7 @@ limitations under the License. */ #include #include #include +#include #include DEFINE_string(print_sub_graph_dir, "", @@ -121,7 +122,7 @@ std::map> BuildOperationAdjList( } size_t GraphNum(const Graph &graph) { - std::unordered_set nodes = graph.Nodes(); + std::unordered_set nodes(graph.Nodes()); std::unordered_set visited_nodes; visited_nodes.reserve(nodes.size()); std::deque q_nodes; diff --git a/paddle/fluid/framework/ir/graph_helper.h b/paddle/fluid/framework/ir/graph_helper.h index 8d92c406689ab3a97596a8666ceb452aec4be170..be525151f9f9749b913a7e5111e5622d868bd266 100644 --- a/paddle/fluid/framework/ir/graph_helper.h +++ b/paddle/fluid/framework/ir/graph_helper.h @@ -24,6 +24,7 @@ limitations under the License. */ namespace paddle { namespace framework { namespace ir { + // Test if the graph contains circle. bool HasCircle(const Graph &graph); diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index 0118019df2f779a6409365555b530ae3b6d3971f..13d752e5167c039ec8d9e4300b190a726bb02a63 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -25,6 +25,7 @@ #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/string/pretty_log.h" #include "paddle/fluid/string/printf.h" + namespace paddle { namespace framework { namespace ir { @@ -104,7 +105,7 @@ bool GraphPatternDetector::MarkPDNodesInGraph(const ir::Graph &graph) { for (auto &node : GraphTraits::DFS(graph)) { for (const auto &pdnode : pattern_.nodes()) { if (pdnode->Tell(&node)) { - VLOG(4) << "pdnode " << pdnode->name() << " marked"; + VLOG(4) << "Node " << node.Name() << " marked as " << pdnode->name(); pdnodes2nodes_[pdnode.get()].insert(&node); } } @@ -1099,6 +1100,142 @@ PDNode *patterns::ElementwiseAdd::operator()(PDNode *x_var, PDNode *y_var) { return out_var; } + +std::unordered_set conv_act_set({"identity", "sigmoid", "relu", + "relu6", "relux", "tanh", + "band_pass"}); + +PDNode *patterns::ConvElementwiseaddAct::operator()(PDNode *conv_in) { + conv_in->AsInput(); + auto conv_op = pattern->NewNode(conv_op_repr())->assert_is_op("conv2d"); + auto conv_out = pattern->NewNode(conv_out_repr()) + ->assert_is_op_output("conv2d") + ->assert_is_op_input("elementwise_add", "X") + ->AsIntermediate(); + auto conv_filter = pattern->NewNode(conv_filter_repr()) + ->assert_is_op_input("conv2d", "Filter") + ->AsInput(); + auto elementwise_add_op = pattern->NewNode(elementwise_add_op_repr()) + ->assert_is_op("elementwise_add"); + auto elementwise_add_in_y = pattern->NewNode(elementwise_add_in_y_repr()) + ->assert_is_op_input("elementwise_add", "Y") + ->AsInput(); + auto elementwise_add_out = pattern->NewNode(elementwise_add_out_repr()) + ->assert_is_op_output("elementwise_add") + ->AsIntermediate(); + + auto act_op = pattern->NewNode(act_op_repr()) + ->assert_is_op() + ->assert_more([&](Node *node) { + auto op_type = node->Name(); + return conv_act_set.count(op_type); + }); + + auto act_out = pattern->NewNode(act_out_repr()) + ->assert_is_var() + // is activation op's output. + ->assert_more([&](Node *node) { + for (auto *in_op : node->inputs) { + if (conv_act_set.count(in_op->Name())) { + return true; + } + } + return false; + }) + ->AsOutput(); + + conv_op->LinksFrom({conv_in, conv_filter}); + conv_out->LinksFrom({conv_op}); + elementwise_add_op->LinksFrom({conv_out, elementwise_add_in_y}) + .LinksTo({elementwise_add_out}); + act_op->LinksFrom({elementwise_add_out}).LinksTo({act_out}); + + return act_out; +} + +PDNode *patterns::ConvElementwiseadd2Act::operator()(PDNode *conv_in) { + auto conv_op = pattern->NewNode(conv_op_repr())->assert_is_op("conv2d"); + auto conv_filter = pattern->NewNode(conv_filter_repr()) + ->assert_is_op_input("conv2d", "Filter") + ->AsInput(); + auto conv_out = pattern->NewNode(conv_out_repr()) + ->assert_is_op_output("conv2d") + ->assert_is_op_input("elementwise_add", "X") + ->AsIntermediate(); + auto elementwise_add_op = pattern->NewNode(elementwise_add_op_repr()) + ->assert_is_op("elementwise_add"); + auto elementwise_add_in_y = pattern->NewNode(elementwise_add_in_y_repr()) + ->assert_is_op_input("elementwise_add", "Y") + ->AsInput(); + auto elementwise_add_out = pattern->NewNode(elementwise_add_out_repr()) + ->assert_is_op_output("elementwise_add") + ->assert_is_op_input("elementwise_add", "X") + ->AsIntermediate(); + + auto elementwise_add_op_1 = pattern->NewNode(elementwise_add_op_1_repr()) + ->assert_is_op("elementwise_add"); + auto elementwise_add_in_y_1 = pattern->NewNode(elementwise_add_in_y_1_repr()) + ->assert_is_op_input("elementwise_add", "Y") + ->AsInput(); + auto elementwise_add_out_1 = pattern->NewNode(elementwise_add_out_1_repr()) + ->assert_is_op_output("elementwise_add") + ->AsIntermediate(); + + auto act_op = pattern->NewNode(act_op_repr()) + ->assert_is_op() + ->assert_more([&](Node *node) { + auto op_type = node->Name(); + return conv_act_set.count(op_type); + }); + auto act_out = pattern->NewNode(act_out_repr()) + ->assert_is_var() + // is activation op's output. + ->assert_more([&](Node *node) { + for (auto *in_op : node->inputs) { + if (conv_act_set.count(in_op->Name())) { + return true; + } + } + return false; + }) + ->AsOutput(); + + conv_op->LinksFrom({conv_in, conv_filter}).LinksTo({conv_out}); + elementwise_add_op->LinksFrom({conv_out, elementwise_add_in_y}) + .LinksTo({elementwise_add_out}); + elementwise_add_op_1->LinksFrom( + {elementwise_add_out, elementwise_add_in_y_1}); + act_op->LinksFrom({elementwise_add_out_1}).LinksTo({act_out}); + return act_out; +} + +PDNode *patterns::ConvElementwiseadd::operator()(PDNode *conv_in) { + conv_in->AsInput(); + auto conv_op = pattern->NewNode(conv_op_repr())->assert_is_op("conv2d"); + auto conv_out = pattern->NewNode(conv_out_repr()) + ->assert_is_op_output("conv2d") + ->assert_is_op_input("elementwise_add", "X") + ->AsIntermediate(); + auto conv_filter = pattern->NewNode(conv_filter_repr()) + ->assert_is_op_input("conv2d", "Filter") + ->AsInput(); + auto elementwise_add_op = pattern->NewNode(elementwise_add_op_repr()) + ->assert_is_op("elementwise_add"); + auto elementwise_add_in_y = pattern->NewNode(elementwise_add_in_y_repr()) + ->assert_is_op_input("elementwise_add", "Y") + ->AsInput(); + auto elementwise_add_out = pattern->NewNode(elementwise_add_out_repr()) + ->assert_is_op_output("elementwise_add") + ->AsOutput(); + + conv_op->LinksFrom({conv_in, conv_filter}); + conv_out->LinksFrom({conv_op}); + elementwise_add_op->LinksFrom({conv_out, elementwise_add_in_y}) + .LinksTo({elementwise_add_out}); + + return elementwise_add_out; +} + } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h index d044802f22d02372e0ddb72c6fd702aebf2f82c3..eaedd9d08e0fab820481d6eaacb6e7bfc1ab6d1d 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.h +++ b/paddle/fluid/framework/ir/graph_pattern_detector.h @@ -671,6 +671,69 @@ struct ElementwiseAdd : public PatternBase { PATTERN_DECL_NODE(elementwise_add_y); PATTERN_DECL_NODE(elementwise_add_out); }; + +// Conv + ElementwiseAdd + an activation +// This pattern can futher fuse the conv related ops after the conv+bn fusion. +struct ConvElementwiseaddAct : public PatternBase { + ConvElementwiseaddAct(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "conv_elementwiseadd_act") {} + + PDNode* operator()(PDNode* conv_in); + + PATTERN_DECL_NODE(conv_op); + PATTERN_DECL_NODE(conv_out); + PATTERN_DECL_NODE(conv_filter); + + PATTERN_DECL_NODE(elementwise_add_op); + PATTERN_DECL_NODE(elementwise_add_in_y); // input + PATTERN_DECL_NODE(elementwise_add_out); + + PATTERN_DECL_NODE(act_op); + PATTERN_DECL_NODE(act_out); +}; + +// Conv + ElementwiseAdd + ElementwiseAdd + Activation +struct ConvElementwiseadd2Act : public PatternBase { + ConvElementwiseadd2Act(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, + "conv_elementwiseadd2_elementwiseadd_act") {} + + PDNode* operator()(PDNode* conv_in); + + PATTERN_DECL_NODE(conv_op); + PATTERN_DECL_NODE(conv_filter); + PATTERN_DECL_NODE(conv_out); + + PATTERN_DECL_NODE(elementwise_add_op); + PATTERN_DECL_NODE(elementwise_add_in_y); // input + PATTERN_DECL_NODE(elementwise_add_out); + + PATTERN_DECL_NODE(elementwise_add_op_1); + PATTERN_DECL_NODE(elementwise_add_in_y_1); // input + PATTERN_DECL_NODE(elementwise_add_out_1); + + PATTERN_DECL_NODE(act_op); + PATTERN_DECL_NODE(act_out); +}; + +// Conv + ElementwiseAdd +// This pattern should be used after ConvElementwiseadd2Act or +// ConvElementwiseadd pass +struct ConvElementwiseadd : public PatternBase { + ConvElementwiseadd(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "conv_elementwiseadd") {} + + PDNode* operator()(PDNode* conv_in); + + PATTERN_DECL_NODE(conv_op); + PATTERN_DECL_NODE(conv_out); + PATTERN_DECL_NODE(conv_filter); + + PATTERN_DECL_NODE(elementwise_add_op); + PATTERN_DECL_NODE(elementwise_add_in_y); + PATTERN_DECL_NODE(elementwise_add_out); +}; + } // namespace patterns // Link two ir::Nodes from each other. diff --git a/paddle/fluid/framework/ir/node.cc b/paddle/fluid/framework/ir/node.cc index eac67108e2106e986cbe1255a64c956153bc5560..45d81b937392244f678fbd01395b3ffffd07f710 100644 --- a/paddle/fluid/framework/ir/node.cc +++ b/paddle/fluid/framework/ir/node.cc @@ -30,6 +30,14 @@ std::unique_ptr CreateNodeForTest(const std::string &name, return std::unique_ptr(new Node(name, type)); } +std::unique_ptr CreateNodeForTest(VarDesc *var_desc) { + return std::unique_ptr(new Node(var_desc)); +} + +std::unique_ptr CreateNodeForTest(OpDesc *op_desc) { + return std::unique_ptr(new Node(op_desc)); +} + } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/node.h b/paddle/fluid/framework/ir/node.h index d2a393b3f19e9aab79098757dae663d030b0fa2b..89dcc677b57eba356c0b6af857f9f8ff6273a683 100644 --- a/paddle/fluid/framework/ir/node.h +++ b/paddle/fluid/framework/ir/node.h @@ -18,7 +18,6 @@ limitations under the License. */ #include #include #include - #include "paddle/fluid/framework/op_desc.h" #include "paddle/fluid/framework/var_desc.h" #include "paddle/fluid/platform/macros.h" @@ -125,6 +124,8 @@ class Node { friend class Graph; friend std::unique_ptr CreateNodeForTest(const std::string& name, Node::Type type); + friend std::unique_ptr CreateNodeForTest(VarDesc* var_desc); + friend std::unique_ptr CreateNodeForTest(OpDesc* op_desc); explicit Node(const std::string& name, Type type) : name_(name), var_desc_(nullptr), op_desc_(nullptr), type_(type) {} @@ -152,7 +153,9 @@ class Node { std::unique_ptr CreateNodeForTest(const std::string& name, Node::Type type); +std::unique_ptr CreateNodeForTest(VarDesc* var_desc); +std::unique_ptr CreateNodeForTest(OpDesc* op_desc); } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/lod_tensor.cc b/paddle/fluid/framework/lod_tensor.cc index 9b2eeaf59a536b5e8577f513b392648ed4270aab..6c8bec32de2a8c1d59155b812c05d5181acb82be 100644 --- a/paddle/fluid/framework/lod_tensor.cc +++ b/paddle/fluid/framework/lod_tensor.cc @@ -70,9 +70,9 @@ std::ostream &operator<<(std::ostream &os, const LoDTensor &t) { // only print first ten elements int64_t size = t.numel() < 10 ? t.numel() : 10; for (int64_t i = 0; i < size; ++i) { - if (IsType(t.type())) { + if (t.type() == proto::VarType::FP32) { os << t.data()[i] << " "; - } else if (IsType(t.type())) { + } else if (t.type() == proto::VarType::INT64) { os << t.data()[i] << " "; } else { PADDLE_THROW("LoDTensor data type not in [float, int64_t]"); @@ -387,7 +387,7 @@ void LoDTensor::MergeLoDTensor( PADDLE_ENFORCE(!lod_tensors.empty()); framework::DDim new_dim = lod_tensors[0]->dims(); - std::type_index new_type = lod_tensors[0]->type(); + auto new_type = lod_tensors[0]->type(); framework::DataLayout new_layout = lod_tensors[0]->layout(); LoD new_lod = lod_tensors[0]->lod(); for (size_t i = 1; i < lod_tensors.size(); ++i) { diff --git a/paddle/fluid/framework/ngraph_bridge.cc b/paddle/fluid/framework/ngraph_bridge.cc index a5acfd70449e92663cb66ef90a141c087ff6ec88..5fcb17b9f3ac390548aba33db7d0b8350cde7e00 100644 --- a/paddle/fluid/framework/ngraph_bridge.cc +++ b/paddle/fluid/framework/ngraph_bridge.cc @@ -16,100 +16,25 @@ limitations under the License. */ #include #include +#include "ngraph/ngraph.hpp" #include "paddle/fluid/framework/ngraph_bridge.h" #include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/operators/ngraph/ngraph_ops.h" #include "paddle/fluid/platform/enforce.h" - -#include "ngraph/ngraph.hpp" +#include "paddle/fluid/platform/ngraph_helper.h" namespace paddle { namespace framework { -static std::shared_ptr GetNode( - const std::shared_ptr& op, const std::string name, - const VariableNameMap& var_map, - std::shared_ptr< - std::unordered_map>> - ngb_node_map) { - auto& var_names = var_map.at(name); - PADDLE_ENFORCE_EQ(var_names.size(), 1, - "op %s name %s expects one associated var", op->Type(), - name); - if (ngb_node_map->find(var_names[0]) != ngb_node_map->end()) { - return (*ngb_node_map)[var_names[0]]; - } else { - return nullptr; - } -} - -static std::shared_ptr GetInputNode( - const std::shared_ptr& op, const std::string name, - std::shared_ptr< - std::unordered_map>> - ngb_node_map) { - return GetNode(op, name, op->Inputs(), ngb_node_map); -} - -static std::shared_ptr GetOutputNode( - const std::shared_ptr& op, const std::string name, - std::shared_ptr< - std::unordered_map>> - ngb_node_map) { - return GetNode(op, name, op->Outputs(), ngb_node_map); -} - -static void SetOutputNode( - const std::shared_ptr& op, const std::string name, - std::shared_ptr node, - std::shared_ptr< - std::unordered_map>> - ngb_node_map) { - auto& var_names = op->Outputs().at(name); - if (var_names.size() == 1) { - (*ngb_node_map)[var_names[0]] = node; - } else if (var_names.size() == 0) { - (*ngb_node_map)[""] = node; - } else { - PADDLE_THROW("name %s has more than 1 var_names.", name); - } -} - -static bool HasOutput(const std::shared_ptr& op, - const std::string name) { - auto& outputs = op->Outputs(); - if (outputs.find(name) == outputs.end()) return false; - return outputs.at(name).size() > 0; -} - -template -static void BuildBinaryNode( - const std::shared_ptr& op, - std::shared_ptr< - std::unordered_map>> - ngb_node_map) { - auto x = GetInputNode(op, "X", ngb_node_map); - auto y = GetInputNode(op, "Y", ngb_node_map); - auto out = std::make_shared(x, y); - SetOutputNode(op, "Out", out, ngb_node_map); -} - -template -static void BuildUnaryNode( - const std::shared_ptr& op, - std::shared_ptr< - std::unordered_map>> - ngb_node_map) { - auto input = GetInputNode(op, "X", ngb_node_map); - auto out = std::make_shared(input); - SetOutputNode(op, "Out", out, ngb_node_map); -} - std::map&, std::shared_ptr>>)>> - NgraphBridge::NG_NODE_MAP = {{"relu", BuildUnaryNode}, - {"tanh", BuildUnaryNode}}; + NgraphBridge::NG_NODE_MAP = { + {"mul", paddle::operators::ngraphs::BuildMulNode}, + {"mul_grad", paddle::operators::ngraphs::BuildMulGradNode}, + {"relu", paddle::operators::ngraphs::BuildUnaryNode}, + {"tanh", paddle::operators::ngraphs::BuildUnaryNode}}; void NgraphBridge::BuildNgNode(const std::shared_ptr& op) { auto& op_type = op->Type(); diff --git a/paddle/fluid/framework/ngraph_operator.cc b/paddle/fluid/framework/ngraph_operator.cc index 253de4c61160e52202a0192215a93284f27e5896..23f681ce886fd0d8c113ffe4e80e25e6a803e31b 100644 --- a/paddle/fluid/framework/ngraph_operator.cc +++ b/paddle/fluid/framework/ngraph_operator.cc @@ -278,7 +278,8 @@ std::shared_ptr NgraphEngine::backend_ = ngraph::runtime::Backend::create("CPU"); void NgraphEngine::GetNgInputShape(std::shared_ptr op) { - op->RuntimeInferShape(scope_, place_); + RuntimeContext ctx(op->Inputs(), op->Outputs(), scope_); + op->RuntimeInferShape(scope_, place_, ctx); for (auto& var_name_item : op->Inputs()) { for (auto& var_name : var_name_item.second) { auto* var = scope_.FindVar(var_name); @@ -471,27 +472,23 @@ void NgraphEngine::Run(const Scope& scope, const platform::Place& place) const { auto* tensor_pd = GetLoDTensorOrSelectedRowsValueFromVar(*var); PADDLE_ENFORCE(sp == Ddim2Shape(tensor_pd->dims()), "Ensure ngraph tensor layout align with paddle tensor"); - if (tensor_pd->type().hash_code() == - typeid(float).hash_code()) { // NOLINT + if (tensor_pd->type() == proto::VarType::FP32) { const float* arr = tensor_pd->data(); ti = backend_->create_tensor(ngraph::element::f32, sp, const_cast(arr)); - } else if (tensor_pd->type().hash_code() == - typeid(int).hash_code()) { // NOLINT + } else if (tensor_pd->type() == proto::VarType::INT32) { const int* arr = tensor_pd->data(); ti = backend_->create_tensor(ngraph::element::i32, sp, const_cast(arr)); - } else if (tensor_pd->type().hash_code() == typeid(int64_t).hash_code()) { + } else if (tensor_pd->type() == proto::VarType::INT64) { const int64_t* arr = tensor_pd->data(); ti = backend_->create_tensor(ngraph::element::i64, sp, const_cast(arr)); - } else if (tensor_pd->type().hash_code() == - typeid(double).hash_code()) { // NOLINT + } else if (tensor_pd->type() == proto::VarType::FP64) { const double* arr = tensor_pd->data(); ti = backend_->create_tensor(ngraph::element::f64, sp, const_cast(arr)); - } else if (tensor_pd->type().hash_code() == - typeid(bool).hash_code()) { // NOLINT + } else if (tensor_pd->type() == proto::VarType::BOOL) { const bool* arr = tensor_pd->data(); ti = backend_->create_tensor(ngraph::element::boolean, sp, const_cast(arr)); diff --git a/paddle/fluid/framework/op_kernel_type_test.cc b/paddle/fluid/framework/op_kernel_type_test.cc index 3e17a512ce154de88ac890f3b29f03385595d95c..40db85400d2c8776b82ce0fa2fb4deed993b0255 100644 --- a/paddle/fluid/framework/op_kernel_type_test.cc +++ b/paddle/fluid/framework/op_kernel_type_test.cc @@ -34,7 +34,8 @@ TEST(OpKernelType, ToString) { OpKernelType op_kernel_type2(DataType::FP16, CUDAPlace(0), DataLayout::kNCHW, LibraryType::kCUDNN); ASSERT_EQ(paddle::framework::KernelTypeToString(op_kernel_type2), - "data_type[float16]:data_layout[NCHW]:place[CUDAPlace(0)]:library_" + "data_type[::paddle::platform::float16]:data_layout[NCHW]:place[" + "CUDAPlace(0)]:library_" "type[CUDNN]"); } diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 66055e6f1d8c8685c27e7d63c3d79c607fb30104..8c83748668e9f91e9c333beb8494c3ef1db875dc 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -43,10 +43,9 @@ std::vector> kKernelPriority = { proto::VarType::Type GetDataTypeOfVar(const Variable* var) { if (var->IsType()) { - return framework::ToDataType(var->Get().type()); + return var->Get().type(); } else if (var->IsType()) { - return framework::ToDataType( - var->Get().value().type()); + return var->Get().value().type(); } else { PADDLE_THROW("Var should be LoDTensor or SelectedRows"); } @@ -93,13 +92,13 @@ static std::string GetDtype(const Scope& scope, const std::string& name) { if (UNLIKELY(!tensor.IsInitialized())) { return ""; } - return DataTypeToString(ToDataType(tensor.type())); + return DataTypeToString(tensor.type()); } else if (var->IsType()) { auto tensor = var->Get().value(); if (UNLIKELY(!tensor.IsInitialized())) { return "uninited"; } else { - return DataTypeToString(ToDataType(tensor.type())); + return DataTypeToString(tensor.type()); } } else { return ""; @@ -138,6 +137,23 @@ static LoD GetLoD(const Scope& scope, const std::string& name) { } } +RuntimeContext::RuntimeContext(const VariableNameMap& innames, + const VariableNameMap& outnames, + const Scope& scope) { + for (auto& var_name_item : innames) { + std::vector& input_vars = inputs[var_name_item.first]; + for (auto& var_name : var_name_item.second) { + input_vars.push_back(scope.FindVar(var_name)); + } + } + for (auto& var_name_item : outnames) { + std::vector& output_vars = outputs[var_name_item.first]; + for (auto& var_name : var_name_item.second) { + output_vars.push_back(scope.FindVar(var_name)); + } + } +} + void OperatorBase::Run(const Scope& scope, const platform::Place& place) { VLOG(4) << place << " " << DebugStringEx(&scope); if (platform::is_gpu_place(place)) { @@ -413,11 +429,48 @@ bool ExecutionContext::HasOutput(const std::string& name) const { return var != nullptr; } +const Variable* ExecutionContext::InputVar(const std::string& name) const { + auto it = ctx_.inputs.find(name); + if (it == ctx_.inputs.end()) return nullptr; + + PADDLE_ENFORCE_LE(it->second.size(), 1UL, + "Operator %s's input %s should contain only one variable.", + op_.Type(), name); + return it->second.empty() ? nullptr : it->second[0]; +} + +const Variable* ExecutionContext::LegacyInputVar( + const std::string& name) const { + auto ipt = op_.Input(name); + return ipt == kEmptyVarName ? nullptr : scope_.FindVar(ipt); +} + +Variable* ExecutionContext::OutputVar(const std::string& name) const { + auto it = ctx_.outputs.find(name); + if (it == ctx_.outputs.end()) return nullptr; + + PADDLE_ENFORCE_LE(it->second.size(), 1UL, + "Operator %s's output %s should contain only one variable.", + op_.Type(), name); + return it->second.empty() ? nullptr : it->second[0]; +} + +Variable* ExecutionContext::LegacyOutputVar(const std::string& name) const { + auto opt = op_.Output(name); + return opt == kEmptyVarName ? nullptr : scope_.FindVar(opt); +} + template <> const Tensor* ExecutionContext::Input(const std::string& name) const { return Input(name); } +template <> +const Tensor* ExecutionContext::LegacyInput( + const std::string& name) const { + return LegacyInput(name); +} + template <> const std::vector ExecutionContext::MultiInput( const std::string& name) const { @@ -442,6 +495,11 @@ Tensor* ExecutionContext::Output(const std::string& name) const { return Output(name); } +template <> +Tensor* ExecutionContext::LegacyOutput(const std::string& name) const { + return LegacyOutput(name); +} + template <> std::vector ExecutionContext::MultiOutput( const std::string& name) const { @@ -478,23 +536,22 @@ bool OpSupportGPU(const std::string& op_type) { class RuntimeInferShapeContext : public InferShapeContext { public: - RuntimeInferShapeContext(const OperatorBase& op, const Scope& scope) - : op_(op), scope_(scope) {} + RuntimeInferShapeContext(const OperatorBase& op, const Scope& scope, + const RuntimeContext& ctx) + : op_(op), scope_(scope), ctx_(ctx) {} bool HasInput(const std::string& name) const override { // has only one input - const auto& ins = op_.Inputs(); + const auto& ins = ctx_.inputs; auto it = ins.find(name); if (it == ins.end()) { return false; } const auto& in = it->second; - if (in.size() == 0 || in[0] == kEmptyVarName) { - return false; - } + if (in.size() == 0) return false; PADDLE_ENFORCE_EQ(in.size(), 1UL, "Input %s should not have more than one inputs", name); - return scope_.FindVar(in[0]) != nullptr; + return in[0] != nullptr; } bool HasOutput(const std::string& name) const override { @@ -679,6 +736,7 @@ class RuntimeInferShapeContext : public InferShapeContext { private: const OperatorBase& op_; const Scope& scope_; + const RuntimeContext& ctx_; }; static void CheckTensorNANOrInf(const std::string& name, @@ -686,7 +744,8 @@ static void CheckTensorNANOrInf(const std::string& name, if (tensor.memory_size() == 0) { return; } - if (!IsType(tensor.type()) && !IsType(tensor.type())) { + if (tensor.type() != proto::VarType::FP32 && + tensor.type() != proto::VarType::FP64) { return; } PADDLE_ENFORCE(!framework::TensorContainsInf(tensor), @@ -696,15 +755,15 @@ static void CheckTensorNANOrInf(const std::string& name, } void OperatorWithKernel::RuntimeInferShape(const Scope& scope, - const platform::Place& place) const { - RuntimeInferShapeContext infer_shape_ctx(*this, scope); + const platform::Place& place, + const RuntimeContext& ctx) const { + RuntimeInferShapeContext infer_shape_ctx(*this, scope, ctx); this->InferShape(&infer_shape_ctx); } void OperatorWithKernel::RunImpl(const Scope& scope, const platform::Place& place) const { - RuntimeInferShapeContext infer_shape_ctx(*this, scope); - this->InferShape(&infer_shape_ctx); + RuntimeContext ctx(Inputs(), Outputs(), scope); platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); auto* dev_ctx = pool.Get(place); @@ -718,15 +777,8 @@ void OperatorWithKernel::RunImpl(const Scope& scope, OpKernelMap& kernels = kernels_iter->second; - // TODO(dzhwinter) : kernel fallback mechanism will be added when all the - // transform functions are ready. - - // for (auto& candidate : kKernelPriority) { - // Do selection - // } - - auto expected_kernel_key = - this->GetExpectedKernelType(ExecutionContext(*this, scope, *dev_ctx)); + auto expected_kernel_key = this->GetExpectedKernelType( + ExecutionContext(*this, scope, *dev_ctx, ctx)); VLOG(3) << "expected_kernel_key:" << expected_kernel_key; auto kernel_iter = kernels.find(expected_kernel_key); @@ -748,7 +800,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope, // do data transformScope &transfer_scope; std::vector transfered_inplace_vars; auto* transfer_scope = - TryTransferData(scope, expected_kernel_key, &transfered_inplace_vars); + PrepareData(scope, expected_kernel_key, &transfered_inplace_vars, &ctx); // exec scope is the scope that kernel actually executed on. const Scope& exec_scope = @@ -758,7 +810,11 @@ void OperatorWithKernel::RunImpl(const Scope& scope, dev_ctx = pool.Get(expected_kernel_key.place_); } - kernel_iter->second(ExecutionContext(*this, exec_scope, *dev_ctx)); + RuntimeInferShapeContext infer_shape_ctx(*this, exec_scope, ctx); + this->InferShape(&infer_shape_ctx); + // TODO(panyx0718): ExecutionContext should only depend on RuntimeContext + // not Scope. Imperative mode only pass inputs and get outputs. + kernel_iter->second(ExecutionContext(*this, exec_scope, *dev_ctx, ctx)); if (!transfered_inplace_vars.empty()) { // there is inplace variable has been transfered. @@ -782,6 +838,7 @@ void OperatorWithKernel::RunImpl(const Scope& scope, } } } + void OperatorWithKernel::TransferInplaceVarsBack( const Scope& scope, const std::vector& inplace_vars, const Scope& transfer_scope) const { @@ -797,13 +854,19 @@ void OperatorWithKernel::TransferInplaceVarsBack( } } -Scope* OperatorWithKernel::TryTransferData( +Scope* OperatorWithKernel::PrepareData( const Scope& scope, const OpKernelType& expected_kernel_key, - std::vector* transfered_inplace_vars) const { + std::vector* transfered_inplace_vars, + RuntimeContext* ctx) const { Scope* new_scope = nullptr; for (auto& var_name_item : Inputs()) { - for (auto& var_name : var_name_item.second) { + std::vector& input_vars = ctx->inputs[var_name_item.first]; + + for (size_t i = 0; i < var_name_item.second.size(); ++i) { + auto& var_name = var_name_item.second[i]; auto* var = scope.FindVar(var_name); + input_vars[i] = var; + // Only tensor can be tranfer to another device. if (var == nullptr || !VarIsTensor(*var)) { continue; @@ -851,6 +914,7 @@ Scope* OperatorWithKernel::TryTransferData( } auto* trans_var = new_scope->Var(var_name); + input_vars[i] = trans_var; Tensor out; TransformData(expected_kernel_key, kernel_type_for_var, *tensor_in, &out); @@ -881,7 +945,7 @@ proto::VarType::Type OperatorWithKernel::IndicateDataType( if (t != nullptr) { PADDLE_ENFORCE(t->IsInitialized(), "Input %s is not initialized: %s", ipt_name, DebugString()); - int tmp = static_cast(ToDataType(t->type())); + int tmp = static_cast(t->type()); PADDLE_ENFORCE( tmp == data_type || data_type == -1, "DataType of Paddle Op %s must be the same. Get %s(%d) != %s(%d)", diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index 0a6a28a5bce01d71cf56f25f5556033db94452c2..39190d07b4ccdd5ffd03e2d50bb0e577ac00af75 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -70,6 +70,15 @@ Tensor* GetMutableLoDTensorOrSelectedRowsValueFromVar(Variable* var); class OperatorBase; class ExecutionContext; +class RuntimeContext { + public: + RuntimeContext(const VariableNameMap& innames, + const VariableNameMap& outnames, const Scope& scope); + + VariableValueMap inputs; + VariableValueMap outputs; +}; + /** * OperatorBase has the basic elements that Net will call to do computation. * Only CreateOperator from OpRegistry will new Operator directly. User @@ -129,7 +138,8 @@ class OperatorBase { void SetIsCalledByExecutor(bool x) { run_by_executor_ = x; } virtual void RuntimeInferShape(const Scope& scope, - const platform::Place& place) const {} + const platform::Place& place, + const RuntimeContext& ctx) const {} protected: std::string type_; @@ -156,8 +166,9 @@ class OperatorBase { class ExecutionContext { public: ExecutionContext(const OperatorBase& op, const Scope& scope, - const platform::DeviceContext& device_context) - : op_(op), scope_(scope), device_context_(device_context) {} + const platform::DeviceContext& device_context, + const RuntimeContext& ctx) + : op_(op), scope_(scope), device_context_(device_context), ctx_(ctx) {} const OperatorBase& op() const { return op_; } @@ -180,15 +191,9 @@ class ExecutionContext { return op_.Outputs(name).size(); } - const Variable* InputVar(const std::string& name) const { - auto ipt = op_.Input(name); - return ipt == kEmptyVarName ? nullptr : scope_.FindVar(ipt); - } + const Variable* InputVar(const std::string& name) const; - Variable* OutputVar(const std::string& name) const { - auto opt = op_.Output(name); - return opt == kEmptyVarName ? nullptr : scope_.FindVar(opt); - } + Variable* OutputVar(const std::string& name) const; const std::vector MultiInputVar( const std::string& name) const { @@ -227,6 +232,22 @@ class ExecutionContext { return var == nullptr ? nullptr : var->GetMutable(); } + template + const T* LegacyInput(const std::string& name) const { + auto* var = LegacyInputVar(name); + return var == nullptr ? nullptr : &var->Get(); + } + + template + T* LegacyOutput(const std::string& name) const { + auto var = LegacyOutputVar(name); + return var == nullptr ? nullptr : var->GetMutable(); + } + + const Variable* LegacyInputVar(const std::string& name) const; + + Variable* LegacyOutputVar(const std::string& name) const; + template const std::vector MultiInput(const std::string& name) const { auto names = op_.Inputs(name); @@ -286,11 +307,16 @@ class ExecutionContext { const OperatorBase& op_; const Scope& scope_; const platform::DeviceContext& device_context_; + const RuntimeContext& ctx_; }; template <> const Tensor* ExecutionContext::Input(const std::string& name) const; +template <> +const Tensor* ExecutionContext::LegacyInput( + const std::string& name) const; + template <> const std::vector ExecutionContext::MultiInput( const std::string& name) const; @@ -298,6 +324,9 @@ const std::vector ExecutionContext::MultiInput( template <> Tensor* ExecutionContext::Output(const std::string& name) const; +template <> +Tensor* ExecutionContext::LegacyOutput(const std::string& name) const; + template <> std::vector ExecutionContext::MultiOutput( const std::string& name) const; @@ -350,8 +379,8 @@ class OperatorWithKernel : public OperatorBase { OpInfoMap::Instance().Get(Type()).infer_shape_(ctx); } - void RuntimeInferShape(const Scope& scope, - const platform::Place& place) const override; + void RuntimeInferShape(const Scope& scope, const platform::Place& place, + const RuntimeContext& ctx) const override; protected: virtual OpKernelType GetExpectedKernelType(const ExecutionContext& ctx) const; @@ -371,9 +400,10 @@ class OperatorWithKernel : public OperatorBase { * * * transfered_inplace_vars is a output vector. */ - Scope* TryTransferData( - const Scope& scope, const OpKernelType& expected_kernel_key, - std::vector* transfered_inplace_vars) const; + Scope* PrepareData(const Scope& scope, + const OpKernelType& expected_kernel_key, + std::vector* transfered_inplace_vars, + RuntimeContext* ctx) const; void TransferInplaceVarsBack(const Scope& scope, const std::vector& inplace_vars, diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index eb4baa06b5284512eab128e57f893bad43afda97..7e3fe02eaf5560ef03e42c6b82ed338edc30b0ab 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/parallel_executor.h" +#include #include #include #include @@ -93,6 +94,7 @@ class ParallelExecutorPrivate { } } + BuildStrategy build_strategy_; std::vector places_; std::vector local_scopes_; Scope *global_scope_; // not owned @@ -169,6 +171,14 @@ std::unique_ptr ParallelExecutorPrivate::PrepareGCAndRefCnts( eager_deletion_pass->SetNotOwned(details::kAllPlaces, &places_); graph = eager_deletion_pass->Apply(std::move(graph)); VLOG(10) << "EagerDeletionPass Applied"; + + if (build_strategy_.memory_early_delete_) { + auto early_delete_pass = + ir::PassRegistry::Instance().Get("memory_early_delete_pass"); + early_delete_pass->SetNotOwned(details::kGarbageCollector, &gcs_); + graph = early_delete_pass->Apply(std::move(graph)); + } + VLOG(10) << "MemoryEarlyDeletePass Applied."; } return graph; @@ -189,6 +199,7 @@ ParallelExecutor::ParallelExecutor( : member_(new ParallelExecutorPrivate(places)) { member_->global_scope_ = scope; member_->use_cuda_ = exec_strategy.use_cuda_; + member_->build_strategy_ = build_strategy; member_->use_all_reduce_ = build_strategy.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce; @@ -245,7 +256,6 @@ ParallelExecutor::ParallelExecutor( build_strategy.Apply(main_program, member_->places_, loss_var_name, params, member_->local_scopes_, member_->use_cuda_); #endif - auto max_memory_size = GetEagerDeletionThreshold(); if (max_memory_size >= 0) { graph = member_->PrepareGCAndRefCnts(std::move(graph), @@ -280,10 +290,12 @@ ParallelExecutor::ParallelExecutor( if (exec_strategy.type_ == ExecutionStrategy::kDefault) { member_->executor_.reset(new details::ThreadedSSAGraphExecutor( - exec_strategy, member_->local_scopes_, places, std::move(graph))); + exec_strategy, member_->local_scopes_, member_->places_, + std::move(graph))); } else { member_->executor_.reset(new details::FastThreadedSSAGraphExecutor( - exec_strategy, member_->local_scopes_, places, std::move(graph))); + exec_strategy, member_->local_scopes_, member_->places_, + std::move(graph))); } member_->executor_.reset(new details::ScopeBufferedSSAGraphExecutor( @@ -423,5 +435,6 @@ ParallelExecutor::~ParallelExecutor() { } // namespace framework } // namespace paddle +USE_PASS(memory_early_delete_pass); USE_PASS(reference_count_pass); USE_PASS(eager_deletion_pass); diff --git a/paddle/fluid/framework/selected_rows.cc b/paddle/fluid/framework/selected_rows.cc index 62a30815d4f75a742447d974a34c7e6046871771..54a818250b45e593de4110f56e42a04a9ea65e00 100644 --- a/paddle/fluid/framework/selected_rows.cc +++ b/paddle/fluid/framework/selected_rows.cc @@ -218,11 +218,11 @@ void SelectedRows::Get(const framework::Tensor& ids, framework::Tensor* value, if (index < 0) { VLOG(5) << "id " << id << " not in the table, return 0"; framework::VisitDataType( - framework::ToDataType(value_->type()), + value_->type(), TensorFillVisitor(value, i * value_width, value_width, 0.0)); } else { framework::VisitDataType( - framework::ToDataType(value_->type()), + value_->type(), TensorCopyVisitor(value, i * value_width, *value_.get(), index * value_width, value_width)); } diff --git a/paddle/fluid/framework/tensor.cc b/paddle/fluid/framework/tensor.cc index 41566800e5781d576120ccf5dfbb3024bf4bea24..57335847a1931de6599560c6e9395a910282b0ee 100644 --- a/paddle/fluid/framework/tensor.cc +++ b/paddle/fluid/framework/tensor.cc @@ -16,7 +16,7 @@ limitations under the License. */ namespace paddle { namespace framework { -extern size_t SizeOfType(std::type_index type); +extern size_t SizeOfType(proto::VarType::Type type); void Tensor::check_memory_size() const { PADDLE_ENFORCE_NOT_NULL( holder_, "Tensor holds no memory. Call Tensor::mutable_data first."); @@ -31,7 +31,7 @@ size_t Tensor::memory_size() const { return holder_ == nullptr ? 0UL : holder_->size() - offset_; } -void* Tensor::mutable_data(platform::Place place, std::type_index type, +void* Tensor::mutable_data(platform::Place place, proto::VarType::Type type, memory::Allocator::Attr attr, size_t requested_size) { type_ = type; diff --git a/paddle/fluid/framework/tensor.h b/paddle/fluid/framework/tensor.h index 153222506af02911b792907d392a42e5499c5e7a..6a1cbe5cd567429c922156f8bce7ca710b15a0f5 100644 --- a/paddle/fluid/framework/tensor.h +++ b/paddle/fluid/framework/tensor.h @@ -19,9 +19,9 @@ limitations under the License. */ #include #include #include - #include "paddle/fluid/framework/data_layout.h" #include "paddle/fluid/framework/ddim.h" +#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/memory/memory.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/enforce.h" @@ -67,7 +67,7 @@ class Tensor { friend struct EigenVector; public: - Tensor() : type_(typeid(float)), offset_(0) {} + Tensor() : type_(proto::VarType::FP32), offset_(0) {} /*! Return a pointer to mutable memory block. */ template @@ -88,7 +88,7 @@ class Tensor { memory::Allocator::Attr attr = memory::Allocator::kDefault, size_t requested_size = 0); - void* mutable_data(platform::Place place, std::type_index type, + void* mutable_data(platform::Place place, proto::VarType::Type type, memory::Allocator::Attr attr = memory::Allocator::kDefault, size_t requested_size = 0); @@ -138,7 +138,7 @@ class Tensor { return holder_->place(); } - std::type_index type() const { + proto::VarType::Type type() const { PADDLE_ENFORCE_NOT_NULL( holder_, "Tensor not initialized yet when Tensor::type() is called."); return type_; @@ -165,7 +165,7 @@ class Tensor { private: /*! holds the memory block if allocated. */ std::shared_ptr holder_; - std::type_index type_; + proto::VarType::Type type_; /** * @brief points to elements dimensions. * diff --git a/paddle/fluid/framework/tensor_impl.h b/paddle/fluid/framework/tensor_impl.h index 0c9c0d782fc73bd8278b82bebf7fd84a4f297b94..ce3ad18b1fb1c6304eaa60173e6dfad5e9dafb2d 100644 --- a/paddle/fluid/framework/tensor_impl.h +++ b/paddle/fluid/framework/tensor_impl.h @@ -24,9 +24,8 @@ template inline const T* Tensor::data() const { check_memory_size(); bool valid = - std::is_same::value || type_ == std::type_index(typeid(T)); - PADDLE_ENFORCE(valid, "Tensor holds the wrong type, it holds %s", - type_.name()); + std::is_same::value || type_ == DataTypeTrait::DataType; + PADDLE_ENFORCE(valid, "Tensor holds the wrong type, it holds %d", type_); return reinterpret_cast( reinterpret_cast(holder_->ptr()) + offset_); @@ -38,9 +37,8 @@ template inline T* Tensor::data() { check_memory_size(); bool valid = - std::is_same::value || type_ == std::type_index(typeid(T)); - PADDLE_ENFORCE(valid, "Tensor holds the wrong type, it holds %s", - type_.name()); + std::is_same::value || type_ == DataTypeTrait::DataType; + PADDLE_ENFORCE(valid, "Tensor holds the wrong type, it holds %s", type_); return reinterpret_cast(reinterpret_cast(holder_->ptr()) + offset_); } @@ -60,7 +58,7 @@ inline T* Tensor::mutable_data(platform::Place place, size_t requested_size) { static_assert(std::is_pod::value, "T must be POD"); return reinterpret_cast( - mutable_data(place, typeid(T), attr, requested_size)); + mutable_data(place, DataTypeTrait::DataType, attr, requested_size)); } inline Tensor ReshapeToMatrix(const Tensor& src, int num_col_dims) { diff --git a/paddle/fluid/framework/tensor_test.cc b/paddle/fluid/framework/tensor_test.cc index a0a9a573603ceb6b577529101cb331adbc81337a..83dea8639010f77619a6fc2a81e092ae513c6e79 100644 --- a/paddle/fluid/framework/tensor_test.cc +++ b/paddle/fluid/framework/tensor_test.cc @@ -74,6 +74,22 @@ TEST(Tensor, MutableData) { p2 = src_tensor.mutable_data(framework::make_ddim({2, 2}), platform::CPUPlace()); EXPECT_EQ(p1, p2); + + float* p3 = nullptr; + float* p4 = nullptr; + // set src_tensor a different type but smaller size. + // memory block is supposed to be unchanged. + auto* tmp = src_tensor.mutable_data(framework::make_ddim({2, 2}), + platform::CPUPlace()); + p3 = reinterpret_cast(tmp); + EXPECT_EQ(p1, p3); + + // set src_tensor a different type but bigger size. + // memory block is supposed to be changed. + auto* tmp2 = src_tensor.mutable_data( + framework::make_ddim({2, 2, 3}), platform::CPUPlace()); + p4 = reinterpret_cast(tmp2); + EXPECT_NE(p1, p4); } // Not sure if it's desired, but currently, Tensor type can be changed. { diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc index ca1e01c89f07c4ffc3979a6a6c3728328e0a1819..85d15c5d3faa5a3d021b12396f9f8ea7735f9148 100644 --- a/paddle/fluid/framework/tensor_util.cc +++ b/paddle/fluid/framework/tensor_util.cc @@ -186,8 +186,8 @@ struct AnyDTypeVisitor { template inline void AnyImpl(Predicate predicate, const framework::Tensor& tensor, const DevCtx& ctx, framework::Tensor* out) { - VisitDataType(ToDataType(tensor.type()), AnyDTypeVisitor( - predicate, tensor, ctx, out)); + VisitDataType(tensor.type(), AnyDTypeVisitor( + predicate, tensor, ctx, out)); } template @@ -379,7 +379,7 @@ void TensorToStream(std::ostream& os, const Tensor& tensor, // int32_t size // void* protobuf message proto::VarType::TensorDesc desc; - desc.set_data_type(framework::ToDataType(tensor.type())); + desc.set_data_type(tensor.type()); auto dims = framework::vectorize(tensor.dims()); auto* pb_dims = desc.mutable_dims(); pb_dims->Resize(static_cast(dims.size()), 0); @@ -461,9 +461,7 @@ void TensorFromStream(std::istream& is, Tensor* tensor, tensor->Resize(framework::make_ddim(dims)); void* buf; auto ctx = platform::CPUDeviceContext(); - size_t size = - tensor->numel() * - framework::SizeOfType(framework::ToTypeIndex(desc.data_type())); + size_t size = tensor->numel() * framework::SizeOfType(desc.data_type()); if (platform::is_gpu_place(dev_ctx.GetPlace())) { #ifdef PADDLE_WITH_CUDA Tensor cpu_tensor; diff --git a/paddle/fluid/framework/type_defs.h b/paddle/fluid/framework/type_defs.h index 2de6233a9e0d320ec9a06d547db3575eb61925c0..938e2024c3359c2acd65a1aa4af875a8350e4c58 100644 --- a/paddle/fluid/framework/type_defs.h +++ b/paddle/fluid/framework/type_defs.h @@ -28,8 +28,11 @@ class OperatorBase; class OpDesc; class InferShapeContext; class BlockDesc; +class Variable; using VariableNameMap = std::map>; +// TODO(panyx0718): Replace vector with something like gtl::Vector. +using VariableValueMap = std::map>; // The order should be as same as framework.proto using Attribute = diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc index 612503768079472ba233ee3fcd43a47fdba9a0cc..342cb68ab2bf8ceb543317ed8d8f2356ef6b2cde 100644 --- a/paddle/fluid/imperative/layer.cc +++ b/paddle/fluid/imperative/layer.cc @@ -188,11 +188,13 @@ std::vector OpBase::ApplyGrad(framework::Scope* scope) { std::vector ret; for (size_t i = 0; i < input_vars_->size(); ++i) { bool found = false; + VarBase* origin_var = (*input_vars_)[i]; for (const std::string& outvar : grad_op_desc_->OutputArgumentNames()) { Variable* var = scope->FindVar(outvar); - VarBase* origin_var = (*input_vars_)[i]; std::string orig_var = grad_to_var_->at(outvar); - PADDLE_ENFORCE(origin_var->var_desc_->Name() == orig_var); + if (origin_var->var_desc_->Name() != orig_var) { + continue; + } VLOG(3) << "apply grad " << outvar << " with origin " << orig_var; origin_var->ApplyGrad(scope, var); found = true; diff --git a/paddle/fluid/imperative/tracer.h b/paddle/fluid/imperative/tracer.h index 433d07c0e5aa0986ab1e9fe349ef865d2851c0c0..97772dc110135d9d2533e1574933d49f7c8cd346 100644 --- a/paddle/fluid/imperative/tracer.h +++ b/paddle/fluid/imperative/tracer.h @@ -43,9 +43,12 @@ void CreateGradOp(const framework::OpDesc& op_desc, class Tracer { public: - explicit Tracer(framework::BlockDesc* root_block) : root_block_(root_block) { + explicit Tracer(framework::BlockDesc* root_block, + framework::BlockDesc* startup_block) + : root_block_(root_block), startup_block_(startup_block) { root_scope_ = new framework::Scope(); scopes_[root_block_] = root_scope_; + scopes_[startup_block_] = root_scope_; } virtual ~Tracer() { delete root_scope_; } @@ -80,6 +83,8 @@ class Tracer { } else { op->pre_ops_->push_back(nullptr); } + VLOG(3) << "input vname " << vname << " " + << var->Get().dims().size(); } *op->output_vars_ = outputs; @@ -98,12 +103,19 @@ class Tracer { outputs[i]->pre_op_ = op; outputs[i]->pre_op_out_idx_ = i; } + + VLOG(3) << "tracer running " << op_desc->Type(); op_base->Run(*scope, platform::CPUPlace()); - framework::OpDesc* grad_op_desc; - auto grad_to_var = new std::unordered_map(); - CreateGradOp(*op_desc, {}, {block}, &grad_op_desc, grad_to_var); - op->grad_op_desc_ = grad_op_desc; - op->grad_to_var_ = grad_to_var; + if (block == startup_block_) { + op->grad_op_desc_ = nullptr; + op->grad_to_var_ = nullptr; + } else { + framework::OpDesc* grad_op_desc; + auto grad_to_var = new std::unordered_map(); + CreateGradOp(*op_desc, {}, {block}, &grad_op_desc, grad_to_var); + op->grad_op_desc_ = grad_op_desc; + op->grad_to_var_ = grad_to_var; + } op->block_ = block; } @@ -121,6 +133,7 @@ class Tracer { private: std::map scopes_; framework::BlockDesc* root_block_; + framework::BlockDesc* startup_block_; framework::Scope* root_scope_; }; diff --git a/paddle/fluid/inference/CMakeLists.txt b/paddle/fluid/inference/CMakeLists.txt index 058a5b5f460d2bd3c4c0248929dd0c87f7506930..b80e7ef752c5251e3ea3f9d9c11f6a2b1422cd34 100644 --- a/paddle/fluid/inference/CMakeLists.txt +++ b/paddle/fluid/inference/CMakeLists.txt @@ -26,9 +26,6 @@ endif(WIN32) # paddle_fluid_origin exclude inference api interface if(WIN32) sep_library(paddle_fluid_origin DEPS ${fluid_modules} paddle_fluid_api) - if(WITH_GPU AND NOT WITH_DSO) - target_link_libraries(paddle_fluid_origin ${cuda_modules}) - endif(WITH_GPU AND NOT WITH_DSO) else(WIN32) cc_library(paddle_fluid_origin DEPS ${fluid_modules} paddle_fluid_api) endif(WIN32) @@ -44,9 +41,6 @@ set(SHARED_INFERENCE_SRCS if(WIN32) sep_library(paddle_fluid DEPS ${fluid_modules} ${STATIC_INFERENCE_APIS} zero_copy_tensor reset_tensor_array analysis_config paddle_pass_builder) - if(WITH_GPU AND NOT WITH_DSO) - target_link_libraries(paddle_fluid ${cuda_modules}) - endif(WITH_GPU AND NOT WITH_DSO) else(WIN32) cc_library(paddle_fluid DEPS ${fluid_modules} ${STATIC_INFERENCE_APIS} zero_copy_tensor reset_tensor_array analysis_config paddle_pass_builder) @@ -63,9 +57,6 @@ if(WIN32) sep_library(paddle_fluid_shared SHARED SRCS ${SHARED_INFERENCE_SRCS} DEPS ${fluid_modules} paddle_fluid_api reset_tensor_array analysis_config paddle_pass_builder) target_link_libraries(paddle_fluid_shared shlwapi) - if(WITH_GPU AND NOT WITH_DSO) - target_link_libraries(paddle_fluid_origin ${cuda_modules}) - endif(WITH_GPU AND NOT WITH_DSO) else(WIN32) cc_library(paddle_fluid_shared SHARED SRCS ${SHARED_INFERENCE_SRCS} DEPS ${fluid_modules} paddle_fluid_api reset_tensor_array analysis_config paddle_pass_builder) diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc index 4ffe5f575c232ccfc0089cb86e28737e56b32f94..9c42b83e7add348433635b1899087324e4e370d4 100644 --- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc @@ -63,7 +63,6 @@ std::unique_ptr analysis::TensorRtSubgraphPass::ApplyImpl( void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node, Graph *graph) const { auto *op_desc = node->Op(); - static int counter{0}; auto &subgraph = *Agent(node).subgraph(); PADDLE_ENFORCE(!subgraph.empty()); @@ -192,8 +191,6 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node, block_desc.Proto()->SerializeAsString()); SetAttr(op_desc->Proto(), "max_batch_size", Get("max_batch_size")); SetAttr(op_desc->Proto(), "workspace_size", Get("workspace_size")); - SetAttr(op_desc->Proto(), "engine_uniq_key", - "trt-" + std::to_string(counter++)); SetAttr(op_desc->Proto(), "parameters", ExtractParameters(graph->Nodes())); SetAttr(op_desc->Proto(), "output_name_mapping", output_mapping); } diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index be51e7fc1f01c5fc4a48c7f32db15bb82a5ddc07..c751e8515829d06970c55f097f50de8bf33ee2a4 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -289,10 +289,10 @@ bool AnalysisPredictor::GetFetch(std::vector *outputs, auto type = fetch.type(); auto output = &(outputs->at(i)); output->name = fetchs_[idx]->Input("X")[0]; - if (type == typeid(float)) { + if (type == framework::proto::VarType::FP32) { GetFetchOne(fetch, output); output->dtype = PaddleDType::FLOAT32; - } else if (type == typeid(int64_t)) { + } else if (type == framework::proto::VarType::INT64) { GetFetchOne(fetch, output); output->dtype = PaddleDType::INT64; } else { diff --git a/paddle/fluid/inference/api/analysis_predictor_tester.cc b/paddle/fluid/inference/api/analysis_predictor_tester.cc index d67305670c91bb0814b8771332641e96974ade9d..a361b34437ade36dfba2c99db800a7d77ada8704 100644 --- a/paddle/fluid/inference/api/analysis_predictor_tester.cc +++ b/paddle/fluid/inference/api/analysis_predictor_tester.cc @@ -55,7 +55,12 @@ TEST(AnalysisPredictor, analysis_off) { } TEST(AnalysisPredictor, analysis_on) { - AnalysisConfig config(false); +#ifdef PADDLE_WITH_CUDA + AnalysisConfig config(true); + config.fraction_of_gpu_memory = 0.15; +#else + AnalysisConfig config; +#endif config.model_dir = FLAGS_dirname; config.enable_ir_optim = true; diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc index 4c5b412a2c1717b8edbb17c238caaa11aeccebd3..3d121e046004dfe6fc6953e0b23852b9ecda5c1b 100644 --- a/paddle/fluid/inference/api/api_impl.cc +++ b/paddle/fluid/inference/api/api_impl.cc @@ -266,10 +266,10 @@ bool NativePaddlePredictor::GetFetch(std::vector *outputs, auto type = fetch.type(); auto output = &(outputs->at(i)); output->name = fetchs_[idx]->Input("X")[0]; - if (type == typeid(float)) { + if (type == framework::DataTypeTrait::DataType) { GetFetchOne(fetch, output); output->dtype = PaddleDType::FLOAT32; - } else if (type == typeid(int64_t)) { + } else if (type == framework::DataTypeTrait::DataType) { GetFetchOne(fetch, output); output->dtype = PaddleDType::INT64; } else { diff --git a/paddle/fluid/inference/api/api_impl_tester.cc b/paddle/fluid/inference/api/api_impl_tester.cc index 014bdc6a379744463e535df97af4c9c2e1651656..78396397397c3125c3990073d6b2887ebb477ff2 100644 --- a/paddle/fluid/inference/api/api_impl_tester.cc +++ b/paddle/fluid/inference/api/api_impl_tester.cc @@ -36,10 +36,10 @@ namespace paddle { PaddleTensor LodTensorToPaddleTensor(framework::LoDTensor* t) { PaddleTensor pt; - if (t->type() == typeid(int64_t)) { + if (t->type() == framework::proto::VarType::INT64) { pt.data.Reset(t->data(), t->numel() * sizeof(int64_t)); pt.dtype = PaddleDType::INT64; - } else if (t->type() == typeid(float)) { + } else if (t->type() == framework::proto::VarType::FP32) { pt.data.Reset(t->data(), t->numel() * sizeof(float)); pt.dtype = PaddleDType::FLOAT32; } else { diff --git a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt index ec93729cd2b379dc2ac39b51df6799b74c8529b6..8d0d96d391efd7f0f11e9d48f5a6221431bd3824 100644 --- a/paddle/fluid/inference/api/demo_ci/CMakeLists.txt +++ b/paddle/fluid/inference/api/demo_ci/CMakeLists.txt @@ -15,12 +15,43 @@ macro(safe_set_static_flag) endforeach(flag_var) endmacro() +if(NOT DEFINED PADDLE_LIB) + message(FATAL_ERROR "please set PADDLE_LIB with -DPADDLE_LIB=/path/paddle/lib") +endif() +if(NOT DEFINED DEMO_NAME) + message(FATAL_ERROR "please set DEMO_NAME with -DDEMO_NAME=demo_name") +endif() + +include_directories("${PADDLE_LIB}/") +include_directories("${PADDLE_LIB}/fluid_inference_install_dir/") +include_directories("${PADDLE_LIB}/third_party/install/protobuf/include") +include_directories("${PADDLE_LIB}/third_party/install/glog/include") +include_directories("${PADDLE_LIB}/third_party/install/gflags/include") +include_directories("${PADDLE_LIB}/third_party/install/xxhash/include") +include_directories("${PADDLE_LIB}/third_party/install/snappy/include") +include_directories("${PADDLE_LIB}/third_party/install/snappystream/include") +include_directories("${PADDLE_LIB}/third_party/install/zlib/include") +include_directories("${PADDLE_LIB}/third_party/boost") +include_directories("${PADDLE_LIB}/third_party/eigen3") + +link_directories("${PADDLE_LIB}/third_party/install/snappy/lib") +link_directories("${PADDLE_LIB}/third_party/install/snappystream/lib") +link_directories("${PADDLE_LIB}/third_party/install/zlib/lib") +link_directories("${PADDLE_LIB}/third_party/install/protobuf/lib") +link_directories("${PADDLE_LIB}/third_party/install/glog/lib") +link_directories("${PADDLE_LIB}/third_party/install/gflags/lib") +link_directories("${PADDLE_LIB}/third_party/install/xxhash/lib") +link_directories("${PADDLE_LIB}/paddle/lib") + if (WIN32) + add_definitions("/DGOOGLE_GLOG_DLL_DECL=") + set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} /bigobj /MTd") + set(CMAKE_C_FLAGS_RELEASE "${CMAKE_C_FLAGS_RELEASE} /bigobj /MT") + set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} /bigobj /MTd") + set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /bigobj /MT") if (WITH_STATIC_LIB) safe_set_static_flag() add_definitions(-DSTATIC_LIB) - set(CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS} "/w") - set(CMAKE_CXX_FLAGS_RELEASE ${CMAKE_CXX_FLAGS_RELEASE} "/w") endif() set(CMAKE_STATIC_LIBRARY_PREFIX "lib") else() @@ -29,36 +60,15 @@ else() endif() message("flags" ${CMAKE_CXX_FLAGS}) -if(NOT DEFINED PADDLE_LIB) - message(FATAL_ERROR "please set PADDLE_LIB with -DPADDLE_LIB=/path/paddle/lib") -endif() -if(NOT DEFINED DEMO_NAME) - message(FATAL_ERROR "please set DEMO_NAME with -DDEMO_NAME=demo_name") -endif() - - if(WITH_GPU) if(NOT WIN32) set(CUDA_LIB "/usr/local/cuda/lib64/" CACHE STRING "CUDA Library") else() if(CUDA_LIB STREQUAL "") - set(CUDA_LIB "C:\\Program\ Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v8.0\\lib\\x64") + set(CUDA_LIB "C:\\Program\ Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v8.0\\lib\\x64") endif() endif(NOT WIN32) endif() -include_directories("${PADDLE_LIB}") -include_directories("${PADDLE_LIB}/third_party/install/protobuf/include") -include_directories("${PADDLE_LIB}/third_party/install/glog/include") -include_directories("${PADDLE_LIB}/third_party/install/gflags/include") -include_directories("${PADDLE_LIB}/third_party/install/xxhash/include") -if (NOT WIN32) -include_directories("${PADDLE_LIB}/third_party/install/snappy/include") -include_directories("${PADDLE_LIB}/third_party/install/snappystream/include") -include_directories("${PADDLE_LIB}/third_party/install/zlib/include") -endif(NOT WIN32) - -include_directories("${PADDLE_LIB}/third_party/boost") -include_directories("${PADDLE_LIB}/third_party/eigen3") if (NOT WIN32) if (USE_TENSORRT AND WITH_GPU) @@ -67,18 +77,6 @@ if (NOT WIN32) endif() endif(NOT WIN32) -if (NOT WIN32) -link_directories("${PADDLE_LIB}/third_party/install/snappy/lib") -link_directories("${PADDLE_LIB}/third_party/install/snappystream/lib") -link_directories("${PADDLE_LIB}/third_party/install/zlib/lib") -endif(NOT WIN32) - -link_directories("${PADDLE_LIB}/third_party/install/protobuf/lib") -link_directories("${PADDLE_LIB}/third_party/install/glog/lib") -link_directories("${PADDLE_LIB}/third_party/install/gflags/lib") -link_directories("${PADDLE_LIB}/third_party/install/xxhash/lib") -link_directories("${PADDLE_LIB}/paddle/lib") - if (NOT WIN32) set(NGRAPH_PATH "${PADDLE_LIB}/third_party/install/ngraph") if(EXISTS ${NGRAPH_PATH}) @@ -89,8 +87,6 @@ if (NOT WIN32) endif() endif() -add_executable(${DEMO_NAME} ${DEMO_NAME}.cc) - if(WITH_MKL) include_directories("${PADDLE_LIB}/third_party/install/mklml/include") set(MATH_LIB ${PADDLE_LIB}/third_party/install/mklml/lib/libmklml_intel${CMAKE_SHARED_LIBRARY_SUFFIX} @@ -106,26 +102,25 @@ endif() # Note: libpaddle_inference_api.so/a must put before libpaddle_fluid.so/a if(WITH_STATIC_LIB) - set(DEPS - ${PADDLE_LIB}/paddle/lib/libpaddle_fluid${CMAKE_STATIC_LIBRARY_SUFFIX}) + set(DEPS ${PADDLE_LIB}/paddle/lib/libpaddle_fluid${CMAKE_STATIC_LIBRARY_SUFFIX}) else() - set(DEPS - ${PADDLE_LIB}/paddle/lib/libpaddle_fluid${CMAKE_SHARED_LIBRARY_SUFFIX}) + set(DEPS ${PADDLE_LIB}/paddle/lib/libpaddle_fluid${CMAKE_SHARED_LIBRARY_SUFFIX}) endif() if (NOT WIN32) -set(EXTERNAL_LIB "-lrt -ldl -lpthread") -set(DEPS ${DEPS} - ${MATH_LIB} ${MKLDNN_LIB} ${NGRAPH_LIB} - glog gflags protobuf snappystream snappy z xxhash - ${EXTERNAL_LIB}) + set(EXTERNAL_LIB "-lrt -ldl -lpthread") + set(DEPS ${DEPS} + ${MATH_LIB} ${MKLDNN_LIB} ${NGRAPH_LIB} + glog gflags protobuf snappystream snappy z xxhash + ${EXTERNAL_LIB}) else() -set(DEPS ${DEPS} - ${MATH_LIB} ${MKLDNN_LIB} - ${CMAKE_STATIC_LIBRARY_PREFIX}glog ${CMAKE_STATIC_LIBRARY_PREFIX}gflags ${CMAKE_STATIC_LIBRARY_PREFIX}protobuf - ${EXTERNAL_LIB}) -# NOTE(dzhwinter) shlwapi is deprecated. -set(DEPS ${DEPS} libcmt shlwapi) + set(DEPS ${DEPS} + ${MATH_LIB} ${MKLDNN_LIB} + ${CMAKE_STATIC_LIBRARY_PREFIX}glog ${CMAKE_STATIC_LIBRARY_PREFIX}gflags ${CMAKE_STATIC_LIBRARY_PREFIX}protobuf + ${CMAKE_STATIC_LIBRARY_PREFIX}snappy ${CMAKE_STATIC_LIBRARY_PREFIX}z ${CMAKE_STATIC_LIBRARY_PREFIX}xxhash + snappystream ${EXTERNAL_LIB}) + # NOTE(dzhwinter) shlwapi is deprecated. + set(DEPS ${DEPS} libcmt shlwapi) endif(NOT WIN32) if(WITH_GPU) @@ -137,9 +132,10 @@ if(WITH_GPU) set(DEPS ${DEPS} ${CUDA_LIB}/libcudart${CMAKE_SHARED_LIBRARY_SUFFIX}) else() set(DEPS ${DEPS} ${CUDA_LIB}/cudart${CMAKE_STATIC_LIBRARY_SUFFIX} ) - set(DEPS ${DEPS} ${CUDA_LIB}/cublas${CMAKE_STATIC_LIBRARY_SUFFIX} ) - set(DEPS ${DEPS} ${CUDA_LIB}/cudnn${CMAKE_STATIC_LIBRARY_SUFFIX} ) + set(DEPS ${DEPS} ${CUDA_LIB}/cublas${CMAKE_STATIC_LIBRARY_SUFFIX} ) + set(DEPS ${DEPS} ${CUDA_LIB}/cudnn${CMAKE_STATIC_LIBRARY_SUFFIX} ) endif() endif() +add_executable(${DEMO_NAME} ${DEMO_NAME}.cc) target_link_libraries(${DEMO_NAME} ${DEPS}) diff --git a/paddle/fluid/inference/api/paddle_pass_builder.h b/paddle/fluid/inference/api/paddle_pass_builder.h index bc5139a7e54eaf7133ea96ae3b36915a236a2c5e..40ca0d287ccde113a20abb1036af289a36f54e6c 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.h +++ b/paddle/fluid/inference/api/paddle_pass_builder.h @@ -118,7 +118,11 @@ class GpuPassStrategy : public PassStrategy { public: GpuPassStrategy() : PassStrategy({}) { passes_.assign({ - "infer_clean_graph_pass", "conv_bn_fuse_pass", + "infer_clean_graph_pass", // + "conv_bn_fuse_pass", // + "conv_elementwise_add_act_fuse_pass", // + "conv_elementwise_add2_act_fuse_pass", // + "conv_elementwise_add_fuse_pass", // }); } diff --git a/paddle/fluid/inference/io.cc b/paddle/fluid/inference/io.cc index 24d15f12f9cd4a9280cd316bd727fdbccb831b9b..ae72a74acce826c3635d5d537540eaad79ff8199 100644 --- a/paddle/fluid/inference/io.cc +++ b/paddle/fluid/inference/io.cc @@ -79,7 +79,7 @@ void LoadPersistables(framework::Executor* executor, framework::Scope* scope, for (auto* var : global_block.AllVars()) { if (IsPersistable(var)) { - VLOG(3) << "persistable variable's name: " << var->Name(); + VLOG(4) << "persistable variable's name: " << var->Name(); framework::VarDesc* new_var = load_block->Var(var->Name()); new_var->SetShape(var->GetShape()); diff --git a/paddle/fluid/inference/tensorrt/convert/op_converter.h b/paddle/fluid/inference/tensorrt/convert/op_converter.h index d61d635ed707bc455d495f2420925a3585234b5c..91670ba8ac5332fe6e83b7bff14cb1a349d7e2a2 100644 --- a/paddle/fluid/inference/tensorrt/convert/op_converter.h +++ b/paddle/fluid/inference/tensorrt/convert/op_converter.h @@ -103,6 +103,7 @@ class OpConverter { void ConvertBlock(const framework::proto::BlockDesc& block, const std::unordered_set& parameters, const framework::Scope& scope, TensorRTEngine* engine) { + std::unique_lock lk(mut_); for (int i = 0; i < block.ops_size(); i++) { const auto& op = block.ops(i); ConvertOp(op, parameters, scope, engine); @@ -125,6 +126,7 @@ class OpConverter { std::unordered_map converters_; // fluid inference scope framework::Scope* scope_{nullptr}; + std::mutex mut_; }; } // namespace tensorrt diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index 8a4bc04b67879918c6ac8d1b40dae68a107034d4..46ce61b73611d05369f90e7d8f97e9b6724b860f 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -30,6 +30,13 @@ function(inference_analysis_api_test_with_fake_data target install_dir filename ARGS --infer_model=${install_dir}/model) endfunction() +function(inference_analysis_api_test_with_refer_result target install_dir filename) + inference_analysis_test(${target} SRCS ${filename} + EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} + ARGS --infer_model=${install_dir}/model --infer_data=${install_dir}/data.txt + --refer_result=${install_dir}/result.txt) +endfunction() + # RNN1 if(NOT APPLE AND WITH_MKLML) set(RNN1_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/rnn1") @@ -83,14 +90,21 @@ set(OCR_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/ocr") if (NOT EXISTS ${OCR_INSTALL_DIR}) inference_download_and_uncompress(${OCR_INSTALL_DIR} "http://paddlemodels.cdn.bcebos.com/" "inference-vis-demos%2Focr.tar.gz") endif() -inference_analysis_api_test(test_analyzer_ocr ${OCR_INSTALL_DIR} analyzer_vis_tester.cc) +inference_analysis_api_test_with_refer_result(test_analyzer_ocr ${OCR_INSTALL_DIR} analyzer_vis_tester.cc) + +# mobilenet with transpose op +set(MOBILENET_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/mobilenet") +if (NOT EXISTS ${MOBILENET_INSTALL_DIR}) + inference_download_and_uncompress(${MOBILENET_INSTALL_DIR} "http://paddlemodels.cdn.bcebos.com/" "inference-vis-demos%2Fmobilenet.tar.gz") +endif() +inference_analysis_api_test_with_refer_result(test_analyzer_mobilenet_transpose ${MOBILENET_INSTALL_DIR} analyzer_vis_tester.cc) # resnet50 inference_analysis_api_test_with_fake_data(test_analyzer_resnet50 "${INFERENCE_DEMO_INSTALL_DIR}/resnet50" analyzer_resnet50_tester.cc "resnet50_model.tar.gz") # mobilenet with depthwise_conv op -inference_analysis_api_test_with_fake_data(test_analyzer_mobilenet +inference_analysis_api_test_with_fake_data(test_analyzer_mobilenet_depthwise_conv "${INFERENCE_DEMO_INSTALL_DIR}/mobilenet_depthwise_conv" analyzer_resnet50_tester.cc "mobilenet_model.tar.gz") # anakin diff --git a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc index adaa338e289936a7e6915bd23eba86863481dd06..a8f7d5c4461964bcb18bc8df24e282ea89264aa8 100644 --- a/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_vis_tester.cc @@ -93,18 +93,20 @@ void profile(bool use_mkldnn = false) { SetInput(&input_slots_all); TestPrediction(reinterpret_cast(&cfg), input_slots_all, &outputs, FLAGS_num_threads); - if (FLAGS_num_threads == 1 && !FLAGS_test_all_data) { - const float ocr_result_data[] = { - 5.273636460856323538e-08, 3.296741795111302054e-07, - 1.873261190610264748e-08, 3.403730275408634043e-08, - 3.383312474625199684e-08}; - PADDLE_ENFORCE_EQ(outputs.size(), 1UL); - size_t size = GetSize(outputs[0]); - PADDLE_ENFORCE_GT(size, 0); - float *result = static_cast(outputs[0].data.data()); - for (size_t i = 0; i < std::min(5UL, size); i++) { - EXPECT_NEAR(result[i], ocr_result_data[i], 1e-3); + std::string line; + std::ifstream file(FLAGS_refer_result); + std::getline(file, line); + auto refer = ProcessALine(line); + file.close(); + + auto &output = outputs.front(); + size_t numel = output.data.length() / PaddleDtypeSize(output.dtype); + CHECK_EQ(numel, refer.data.size()); + for (size_t i = 0; i < numel; ++i) { + CHECK_LT( + fabs(static_cast(output.data.data())[i] - refer.data[i]), + 1e-5); } } } diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h index 8209a049f4614fe31c22c4e83c1968411b749b49..b07949c196ca1d41bb33a0b0499ebb3204d1be4a 100644 --- a/paddle/fluid/inference/tests/api/tester_helper.h +++ b/paddle/fluid/inference/tests/api/tester_helper.h @@ -36,6 +36,7 @@ DEFINE_string(model_name, "", "model name"); DEFINE_string(infer_model, "", "model path"); DEFINE_string(infer_data, "", "data file"); +DEFINE_string(refer_result, "", "reference result for comparison"); DEFINE_int32(batch_size, 1, "batch size."); DEFINE_int32(repeat, 1, "Running the inference program repeat times."); DEFINE_bool(test_all_data, false, "Test the all dataset in data file."); @@ -373,7 +374,7 @@ static bool CompareTensorData(const framework::LoDTensor &a, } for (size_t i = 0; i < a_size; i++) { - if (a.type() == typeid(float)) { + if (a.type() == framework::proto::VarType::FP32) { const auto *a_data = a.data(); const auto *b_data = b.data(); if (std::abs(a_data[i] - b_data[i]) > 1e-3) { @@ -382,7 +383,7 @@ static bool CompareTensorData(const framework::LoDTensor &a, b_data[i]); return false; } - } else if (a.type() == typeid(int64_t)) { + } else if (a.type() == framework::proto::VarType::INT64) { const auto *a_data = a.data(); const auto *b_data = b.data(); if (std::abs(a_data[i] - b_data[i]) > 1e-3) { diff --git a/paddle/fluid/inference/tests/api/trt_models_tester.cc b/paddle/fluid/inference/tests/api/trt_models_tester.cc index 9eb3fb5da1065f14d9ad1c3520f6415fbadfdca1..d3bd035c1c49c926fc9f5ed83085b2e6d9ca8c93 100644 --- a/paddle/fluid/inference/tests/api/trt_models_tester.cc +++ b/paddle/fluid/inference/tests/api/trt_models_tester.cc @@ -78,6 +78,7 @@ void profile(std::string model_dir, bool use_analysis, bool use_tensorrt) { std::vector outputs; if (use_analysis || use_tensorrt) { contrib::AnalysisConfig config(true); + config.pass_builder()->TurnOnDebug(); SetConfig(&config, model_dir, true, use_tensorrt, FLAGS_batch_size); TestPrediction(reinterpret_cast(&config), @@ -141,9 +142,31 @@ TEST(TensorRT_resnext50, profile) { profile(model_dir, /* use_analysis */ true, FLAGS_use_tensorrt); } +TEST(resnext50, compare_analysis_native) { + std::string model_dir = FLAGS_infer_model + "/resnext50"; + compare(model_dir, false /*use tensorrt*/); +} + TEST(TensorRT_mobilenet, analysis) { std::string model_dir = FLAGS_infer_model + "/" + "mobilenet"; - compare(model_dir, /* use_tensorrt */ false); + compare(model_dir, false /* use_tensorrt */); +} + +TEST(AnalysisPredictor, use_gpu) { + std::string model_dir = FLAGS_infer_model + "/" + "mobilenet"; + AnalysisConfig config(true); + config.model_dir = model_dir; + config.fraction_of_gpu_memory = 0.15; + config.pass_builder()->TurnOnDebug(); + + std::vector> inputs_all; + auto predictor = CreatePaddlePredictor(config); + SetFakeImageInput(&inputs_all, model_dir, false, "__model__", ""); + + std::vector outputs; + for (auto& input : inputs_all) { + ASSERT_TRUE(predictor->Run(input, &outputs)); + } } } // namespace inference diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 257bfc0a3f926d20abc4647b27e8e9cc2c49e014..d9b0c66e5727e80486423ab065dccf9105775127 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -64,9 +64,7 @@ endif() set(COMMON_OP_DEPS ${OP_HEADER_DEPS}) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} selected_rows_functor selected_rows lod_tensor maxouting unpooling pooling lod_rank_table context_project sequence_pooling executor) -if (NOT WIN32) - set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc) -endif() +set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_functor memory jit_kernel concat_and_split cross_entropy softmax vol2col im2col sampler) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions) if (WITH_GPU) diff --git a/paddle/fluid/operators/affine_grid_op.cc b/paddle/fluid/operators/affine_grid_op.cc index 6f7da445fc84fc1f14b01a633af0e886aec6f8ed..1de59a5165c83a314a0ff8f4e4351aa3326beb67 100644 --- a/paddle/fluid/operators/affine_grid_op.cc +++ b/paddle/fluid/operators/affine_grid_op.cc @@ -78,7 +78,7 @@ class AffineGridOp : public framework::OperatorWithKernel { library = framework::LibraryType::kCUDNN; } #endif - auto data_type = framework::ToDataType(ctx.Input("Theta")->type()); + auto data_type = ctx.Input("Theta")->type(); return framework::OpKernelType(data_type, ctx.GetPlace(), framework::DataLayout::kAnyLayout, library); } @@ -188,9 +188,9 @@ class AffineGridOpGrad : public framework::OperatorWithKernel { library_ = framework::LibraryType::kCUDNN; } #endif - return framework::OpKernelType( - framework::ToDataType(ctx.Input("Theta")->type()), - ctx.GetPlace(), framework::DataLayout::kAnyLayout, library_); + return framework::OpKernelType(ctx.Input("Theta")->type(), + ctx.GetPlace(), + framework::DataLayout::kAnyLayout, library_); } }; diff --git a/paddle/fluid/operators/arg_max_op.cc b/paddle/fluid/operators/arg_max_op.cc index 8174d3735859b1fac40cd4c07545f34874d31ab7..7fe9a0df7467970286fb0efc7c5ce7aaf01ac28b 100644 --- a/paddle/fluid/operators/arg_max_op.cc +++ b/paddle/fluid/operators/arg_max_op.cc @@ -28,6 +28,5 @@ REGISTER_OP_CPU_KERNEL( int32_t>, paddle::operators::ArgMaxKernel, - paddle::operators::ArgMaxKernel, paddle::operators::ArgMaxKernel); diff --git a/paddle/fluid/operators/arg_max_op.cu b/paddle/fluid/operators/arg_max_op.cu index a147d77a9e9c577984028e1a6ed9582dda622069..85e4f98173511435a52b32e506afc8d5b772f74f 100644 --- a/paddle/fluid/operators/arg_max_op.cu +++ b/paddle/fluid/operators/arg_max_op.cu @@ -25,7 +25,5 @@ REGISTER_OP_CUDA_KERNEL( int32_t>, paddle::operators::ArgMaxKernel, - paddle::operators::ArgMaxKernel, paddle::operators::ArgMaxKernel); diff --git a/paddle/fluid/operators/arg_min_op.cc b/paddle/fluid/operators/arg_min_op.cc index 41f188029f17dbe8717afc0ca0760a39edc24b54..23b24735cd0ba17afd30b95c329cb0530a1f0104 100644 --- a/paddle/fluid/operators/arg_min_op.cc +++ b/paddle/fluid/operators/arg_min_op.cc @@ -28,6 +28,5 @@ REGISTER_OP_CPU_KERNEL( int32_t>, paddle::operators::ArgMinKernel, - paddle::operators::ArgMinKernel, paddle::operators::ArgMinKernel); diff --git a/paddle/fluid/operators/arg_min_op.cu b/paddle/fluid/operators/arg_min_op.cu index 4d020508505a6ebac8be41ce1e4f99d436b67ab5..47d7c8b12243c6c5c501188af7f48f125c266009 100644 --- a/paddle/fluid/operators/arg_min_op.cu +++ b/paddle/fluid/operators/arg_min_op.cu @@ -25,7 +25,5 @@ REGISTER_OP_CUDA_KERNEL( int32_t>, paddle::operators::ArgMinKernel, - paddle::operators::ArgMinKernel, paddle::operators::ArgMinKernel); diff --git a/paddle/fluid/operators/array_to_lod_tensor_op.cc b/paddle/fluid/operators/array_to_lod_tensor_op.cc index 6257e04b010d8c580e69e466759e8e80d344c105..d942391b8644959f63ac58f6a7122bbd3c0ddf84 100644 --- a/paddle/fluid/operators/array_to_lod_tensor_op.cc +++ b/paddle/fluid/operators/array_to_lod_tensor_op.cc @@ -58,7 +58,7 @@ struct ArrayToLoDFunctor : public boost::static_visitor { ArrayToLoDFunctorImpl functor; functor.dev_ctx_ = dev_ctx; functor.prev_functor_ = this; - framework::VisitDataType(framework::ToDataType(out->type()), functor); + framework::VisitDataType(out->type(), functor); } }; @@ -91,7 +91,7 @@ class ArrayToLoDTensorOp : public framework::OperatorBase { PADDLE_ENFORCE(!x.empty(), "There's no element in the input array."); int rank = x[0].dims().size(); platform::Place place = x[0].place(); - std::type_index data_type = x[0].type(); + auto data_type = x[0].type(); int64_t batch_size = x[0].dims()[0]; framework::DDim ins_dims = rank > 1 ? framework::slice_ddim(x[0].dims(), 1, rank) diff --git a/paddle/fluid/operators/attention_lstm_op.cc b/paddle/fluid/operators/attention_lstm_op.cc index 75fc59125f21901b6781315eb3d7dba36b7f11f2..b6996be4b0984bcee3b16da268d79708a68b65b3 100644 --- a/paddle/fluid/operators/attention_lstm_op.cc +++ b/paddle/fluid/operators/attention_lstm_op.cc @@ -121,9 +121,8 @@ void AttentionLSTMOp::InferShape(framework::InferShapeContext* ctx) const { framework::OpKernelType AttentionLSTMOp::GetExpectedKernelType( const framework::ExecutionContext& ctx) const { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), - ctx.device_context()); + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.device_context()); } void AttentionLSTMOpMaker::Make() { diff --git a/paddle/fluid/operators/average_accumulates_op.cc b/paddle/fluid/operators/average_accumulates_op.cc index f389eab605e087c535b9918264e6502217062505..0922b03b5f5fbd2a7a62b0a325ebed9600767497 100644 --- a/paddle/fluid/operators/average_accumulates_op.cc +++ b/paddle/fluid/operators/average_accumulates_op.cc @@ -103,9 +103,8 @@ class AverageAccumulatesOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("param")->type()), - ctx.GetPlace()); + return framework::OpKernelType(ctx.Input("param")->type(), + ctx.GetPlace()); } }; diff --git a/paddle/fluid/operators/batch_norm_op.cc b/paddle/fluid/operators/batch_norm_op.cc index f66813989c64737a4b41e3f653d9ca654be72dd6..8b672e09b2c5c203c1a1447fbbd14a45ef7ba257 100644 --- a/paddle/fluid/operators/batch_norm_op.cc +++ b/paddle/fluid/operators/batch_norm_op.cc @@ -72,8 +72,7 @@ class BatchNormOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { - auto input_data_type = - framework::ToDataType(ctx.Input("X")->type()); + auto input_data_type = ctx.Input("X")->type(); // By default, the type of the scale, bias, mean, // and var tensors should both be float. (For float or float16 input tensor) // or double (For double input tensor). @@ -81,17 +80,13 @@ class BatchNormOp : public framework::OperatorWithKernel { if (input_data_type == framework::proto::VarType::FP64) { bn_param_type = framework::proto::VarType::FP64; } - PADDLE_ENFORCE_EQ(bn_param_type, - framework::ToDataType(ctx.Input("Scale")->type()), + PADDLE_ENFORCE_EQ(bn_param_type, ctx.Input("Scale")->type(), "Scale input should be of float type"); - PADDLE_ENFORCE_EQ(bn_param_type, - framework::ToDataType(ctx.Input("Bias")->type()), + PADDLE_ENFORCE_EQ(bn_param_type, ctx.Input("Bias")->type(), "Bias input should be of float type"); - PADDLE_ENFORCE_EQ(bn_param_type, - framework::ToDataType(ctx.Input("Mean")->type()), + PADDLE_ENFORCE_EQ(bn_param_type, ctx.Input("Mean")->type(), "Mean input should be of float type"); - PADDLE_ENFORCE_EQ(bn_param_type, framework::ToDataType( - ctx.Input("Variance")->type()), + PADDLE_ENFORCE_EQ(bn_param_type, ctx.Input("Variance")->type(), "Variance input should be of float type"); // TODO(pzelazko-intel): enable MKLDNN layout when it's ready @@ -413,9 +408,8 @@ class BatchNormGradOp : public framework::OperatorWithKernel { } #endif - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), ctx.GetPlace(), - layout, library); + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.GetPlace(), layout, library); } }; diff --git a/paddle/fluid/operators/beam_search_decode_op.cc b/paddle/fluid/operators/beam_search_decode_op.cc index 0d32cae0e1e5ff274793df50e854283d8e2f7bf8..7f2bde55c98277b9fd4b3374657001c42d673d43 100644 --- a/paddle/fluid/operators/beam_search_decode_op.cc +++ b/paddle/fluid/operators/beam_search_decode_op.cc @@ -122,7 +122,8 @@ class BeamSearchDecodeOp : public framework::OperatorBase { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); auto& dev_ctx = *pool.Get(dev_place); - framework::ExecutionContext ctx(*this, scope, dev_ctx); + framework::RuntimeContext run_ctx(Inputs(), Outputs(), scope); + framework::ExecutionContext ctx(*this, scope, dev_ctx, run_ctx); const LoDTensorArray* ids = ctx.Input("Ids"); const LoDTensorArray* scores = ctx.Input("Scores"); @@ -145,7 +146,7 @@ class BeamSearchDecodeOp : public framework::OperatorBase { LoDTensor* sentenceScores = ctx.Output("SentenceScores"); framework::VisitDataType( - framework::ToDataType(scores->at(0).type()), + scores->at(0).type(), BeamSearchDecodeFunctor(*ids, *scores, sentenceIds, sentenceScores, beam_size, end_id)); } diff --git a/paddle/fluid/operators/beam_search_op.cc b/paddle/fluid/operators/beam_search_op.cc index 62771d09f112785ca1ba741a0ba239b1f0234633..30f700f1d91c5a81f39594b6dab7e5e717c9818f 100644 --- a/paddle/fluid/operators/beam_search_op.cc +++ b/paddle/fluid/operators/beam_search_op.cc @@ -282,8 +282,7 @@ class BeamSearchOp : public framework::OperatorWithKernel { framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { framework::OpKernelType kt = framework::OpKernelType( - framework::ToDataType( - ctx.Input("pre_ids")->type()), + ctx.Input("pre_ids")->type(), platform::CPUPlace()); return kt; } diff --git a/paddle/fluid/operators/bpr_loss_op.cc b/paddle/fluid/operators/bpr_loss_op.cc index 9258d7c7e83122149c7cbc42e4a4bdd84903ce67..f349c51d8a99aaab43a15580a8904d4e4fd0d9b7 100644 --- a/paddle/fluid/operators/bpr_loss_op.cc +++ b/paddle/fluid/operators/bpr_loss_op.cc @@ -47,9 +47,8 @@ class BprLossOp : public framework::OperatorWithKernel { // is determined by its input "X". framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), - platform::CPUPlace()); + return framework::OpKernelType(ctx.Input("X")->type(), + platform::CPUPlace()); } }; @@ -94,9 +93,8 @@ class BprLossGradientOp : public framework::OperatorWithKernel { // is determined by its input "X". framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), - platform::CPUPlace()); + return framework::OpKernelType(ctx.Input("X")->type(), + platform::CPUPlace()); } }; diff --git a/paddle/fluid/operators/controlflow/CMakeLists.txt b/paddle/fluid/operators/controlflow/CMakeLists.txt index b1c2ee22951a3881b4ce5b82f9ff7eb01fde6e9e..b614e9b03502634a29333f331e25201a0f77ba38 100644 --- a/paddle/fluid/operators/controlflow/CMakeLists.txt +++ b/paddle/fluid/operators/controlflow/CMakeLists.txt @@ -1,4 +1,4 @@ include(operators) -register_operators() +register_operators(DEPS naive_executor) file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(logical_and);\nUSE_NO_KERNEL_OP(read_from_array);\n") diff --git a/paddle/fluid/operators/controlflow/conditional_block_op.cc b/paddle/fluid/operators/controlflow/conditional_block_op.cc index 135254ce6b6bf9add7bb1f0c3f645ed47081fba4..dd28f82b65403550c67418cae535bbfeeef4476e 100644 --- a/paddle/fluid/operators/controlflow/conditional_block_op.cc +++ b/paddle/fluid/operators/controlflow/conditional_block_op.cc @@ -48,13 +48,12 @@ class ConditionalOp : public framework::OperatorBase { if (!(ips.size() == 1UL && ips[0]->IsInitialized())) { PADDLE_THROW("should have one initialized input as condition"); } - if (!(framework::IsType(ips[0]->type()) && // NOLINT - ips[0]->numel() == 1)) { - PADDLE_THROW( - "condition input's data type should be bool, " - "numel should be 1, actual numel is %d", - ips[0]->numel()); - } + + PADDLE_ENFORCE(ips[0]->type() == framework::proto::VarType::BOOL && + ips[0]->numel() == 1, + "condition input's data type should be bool, " + "numel should be 1, actual numel is %d", + ips[0]->numel()); bool res = false; if (platform::is_gpu_place(ips[0]->place())) { #ifdef PADDLE_WITH_CUDA diff --git a/paddle/fluid/operators/controlflow/parallel_do_op.cc b/paddle/fluid/operators/controlflow/parallel_do_op.cc deleted file mode 100644 index ab25628d45699dbcfc1fc5792958bae9e42e72a3..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/controlflow/parallel_do_op.cc +++ /dev/null @@ -1,426 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include - -#include "paddle/fluid/framework/executor.h" -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/threadpool.h" -#include "paddle/fluid/operators/detail/safe_ref.h" - -namespace paddle { -namespace operators { - -static constexpr char kInputs[] = "inputs"; -static constexpr char kParameters[] = "parameters"; -static constexpr char kPlaces[] = "places"; - -static constexpr char kOutputs[] = "outputs"; -static constexpr char kParallelScopes[] = "parallel_scopes"; - -static constexpr char kParallelBlock[] = "sub_block"; -static constexpr char kUseNCCL[] = "use_nccl"; - -using LoDTensor = framework::LoDTensor; -using SelectedRows = framework::SelectedRows; - -static void SplitTensorAndMoveTensorToScopes( - const framework::Scope &scope, std::vector *sub_scopes, - const std::vector &places, - const std::vector &names) { - size_t num_sub_scopes = 0; - for (auto &argu : names) { - const auto &tensor = - detail::Ref(scope.FindVar(argu), - "Cannot find variable %s in the parent scope", argu) - .Get(); - auto lod_tensors = tensor.SplitLoDTensor(places); - - for (auto &lod : lod_tensors) { - VLOG(3) << lod.dims(); - } - if (num_sub_scopes == 0) { - num_sub_scopes = lod_tensors.size(); - } else { - PADDLE_ENFORCE_EQ(num_sub_scopes, lod_tensors.size()); - } - PADDLE_ENFORCE_NE(num_sub_scopes, 0); - if (sub_scopes->size() == 0) { - sub_scopes->reserve(num_sub_scopes); - for (size_t i = 0; i < num_sub_scopes; ++i) { - sub_scopes->emplace_back(&scope.NewScope()); - } - } - - for (size_t i = 0; i < lod_tensors.size(); ++i) { - *detail::Ref(sub_scopes->at(i)->Var(argu), - "Cannot find variable in the sub-scope", argu) - .GetMutable() = lod_tensors[i]; - } - } -} - -inline void CopyOrShare(const framework::Variable &src, - const platform::Place &dst_place, - framework::Variable *dst) { - if (src.IsType()) { - if (src.Get().place() == dst_place) { - dst->GetMutable()->ShareDataWith(src.Get()); - dst->GetMutable()->set_lod(src.Get().lod()); - } else { - TensorCopy(src.Get(), dst_place, dst->GetMutable()); - } - } else if (src.IsType()) { - auto &src_sr = src.Get(); - auto *dst_sr = dst->GetMutable(); - dst_sr->set_height(src_sr.height()); - if (src_sr.value().place() == dst_place) { - dst_sr->mutable_value()->ShareDataWith(src_sr.value()); - dst_sr->set_rows(src_sr.rows()); - } else { - TensorCopy(src_sr.value(), dst_place, dst_sr->mutable_value()); - } - } else { - PADDLE_THROW("Expect LoDTensor/SelectedRows, get %s", src.Type().name()); - } -} - -void WaitOnPlace(const platform::Place place) { - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto &dev_ctx = *pool.Get(place); - dev_ctx.Wait(); -} - -void WaitOnPlaces(const std::vector places) { - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - - for (auto &place : places) { - auto &dev_ctx = *pool.Get(place); - dev_ctx.Wait(); - } -} - -class ParallelDoOp : public framework::OperatorBase { - public: - ParallelDoOp(const std::string &type, - const framework::VariableNameMap &inputs, - const framework::VariableNameMap &outputs, - const framework::AttributeMap &attrs) - : framework::OperatorBase(type, inputs, outputs, attrs) {} - - private: - void RunImpl(const framework::Scope &scope, - const platform::Place &place) const override { - // get device context from pool - platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); - auto &dev_ctx = *pool.Get(place); - - auto *block = Attr(kParallelBlock); - auto *program = block->Program(); - - auto &places = scope.FindVar(Input(kPlaces))->Get(); - - auto &sub_scopes = *scope.FindVar(Output(kParallelScopes)) - ->GetMutable>(); - - // split input - SplitTensorAndMoveTensorToScopes(scope, &sub_scopes, places, - Inputs(kInputs)); - - // copy parameter - for (auto ¶m : Inputs(kParameters)) { - PADDLE_ENFORCE(scope.FindVar(param)->IsType(), - "Only support parameter type as LoDTensor"); - auto &src = scope.FindVar(param)->Get(); - - auto *sub_scope0 = sub_scopes[0]; - auto *dst0 = sub_scope0->Var(param)->GetMutable(); - dst0->ShareDataWith(src); - - for (size_t i = 1; i < sub_scopes.size(); ++i) { - auto &place = places[i]; - auto *sub_scope = sub_scopes[i]; - auto *dst = sub_scope->Var(param)->GetMutable(); - framework::TensorCopy(src, place, dst); - } - } - WaitOnPlaces(places); - - std::vector> workers; - workers.reserve(places.size()); - for (size_t place_idx = 0; place_idx < sub_scopes.size(); ++place_idx) { - auto &place = places[place_idx]; - auto *cur_scope = sub_scopes[place_idx]; - - workers.emplace_back(framework::Async([program, cur_scope, place, block] { - framework::Executor executor(place); - executor.Run(*program, cur_scope, block->ID(), - false /*create_local_scope*/); - })); - } - for (auto &worker : workers) { - worker.wait(); - } - WaitOnPlaces(places); - - // merge output - for (auto &o_name : Outputs(kOutputs)) { - std::vector lod_tensors; - lod_tensors.reserve(sub_scopes.size()); - for (auto *sub_scope : sub_scopes) { - lod_tensors.emplace_back(&sub_scope->FindVar(o_name)->Get()); - } - - auto *lod_tensor_to_be_merged = - scope.FindVar(o_name)->GetMutable(); - lod_tensor_to_be_merged->MergeLoDTensor(lod_tensors, dev_ctx.GetPlace()); - } - WaitOnPlaces(places); - } -}; - -class ParallelDoOpProtoMaker : public framework::OpProtoAndCheckerMaker { - public: - void Make() override { - AddInput(kInputs, "").AsDuplicable(); - AddInput(kParameters, "").AsDuplicable(); - AddInput(kPlaces, ""); - AddOutput(kOutputs, "").AsDuplicable(); - AddOutput(kParallelScopes, ""); - AddAttr(kParallelBlock, ""); - AddAttr(kUseNCCL, "true if we use nccl on backward") - .SetDefault(false); - AddComment(R"DOC( -ParallelDo Operator. -)DOC"); - } -}; - -class ParallelDoGradOp : public framework::OperatorBase { - public: - ParallelDoGradOp(const std::string &type, - const framework::VariableNameMap &inputs, - const framework::VariableNameMap &outputs, - const framework::AttributeMap &attrs) - : framework::OperatorBase(type, inputs, outputs, attrs) {} - - private: - void RunImpl(const framework::Scope &scope, - const platform::Place &place) const override { - auto *block = Attr(kParallelBlock); - auto *program = block->Program(); - - auto &sub_scopes = scope.FindVar(Input(kParallelScopes)) - ->Get>(); - auto &places = scope.FindVar(Input(kPlaces))->Get(); - - // feed output@grad - SplitTensorAndMoveTensorToScopes( - scope, const_cast *>(&sub_scopes), - places, Inputs(framework::GradVarName(kOutputs))); - WaitOnPlaces(places); - - // exe run - std::vector> workers; - for (size_t i = 0; i < sub_scopes.size(); ++i) { - auto &place = places[i]; - auto *cur_scope = sub_scopes[i]; - - // execute - workers.emplace_back(framework::Async([program, cur_scope, place, block] { - framework::Executor executor(place); - executor.Run(*program, cur_scope, block->ID(), - false /*create_local_scope*/); - })); - } - for (auto &worker : workers) { - worker.wait(); - } - WaitOnPlaces(places); - - // NCCL allreduce op will be added by backward, - // so no need to explicitly accumulate grad - if (!(Attr(kUseNCCL))) { - AccumulateGrad(scope, place, sub_scopes, places); - } else { - for (auto &place : places) { - PADDLE_ENFORCE(platform::is_gpu_place(place), - "NCCL only supports cuda place"); - } - } - for (auto &s : Outputs(framework::GradVarName(kParameters))) { - if (s == framework::kEmptyVarName) { - continue; - } - VLOG(3) << "Moving " << s; - CopyOrShare(*sub_scopes[0]->FindVar(s), place, scope.FindVar(s)); - } - WaitOnPlaces(places); - } - - void AccumulateGrad(const framework::Scope &scope, - const platform::Place &place, - const std::vector &sub_scopes, - const platform::PlaceList &places) const { - for (auto &s : Outputs(framework::GradVarName(kParameters))) { - if (s == framework::kEmptyVarName) { - continue; - } - VLOG(3) << "Accumulating " << s; - if (s == framework::kEmptyVarName) continue; - std::string tmp_name; - auto *tmp = sub_scopes[0]->Var(&tmp_name); - - for (size_t i = 1; i < sub_scopes.size(); ++i) { - CopyOrShare(*sub_scopes[i]->FindVar(s), places[0], tmp); - WaitOnPlaces(places); - - auto sum_op = framework::OpRegistry::CreateOp( - "sum", {{"X", {s, tmp_name}}}, {{"Out", {s}}}, - framework::AttributeMap{{"use_mkldnn", {false}}}); - VLOG(10) << sum_op->DebugStringEx(sub_scopes[0]); - sum_op->Run(*sub_scopes[0], places[0]); - WaitOnPlace(places[0]); - } - - CopyOrShare(*sub_scopes[0]->FindVar(s), place, scope.FindVar(s)); - } - WaitOnPlaces(places); - } -}; - -std::ostream &operator<<(std::ostream &sout, - const std::vector &strs) { - std::copy(strs.begin(), strs.end(), - std::ostream_iterator(sout, ",")); - return sout; -} - -class ParallelDoGradOpDescMaker : public framework::SingleGradOpDescMaker { - public: - using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; - - protected: - virtual std::unique_ptr Apply() const { - auto *grad = new framework::OpDesc(); - grad->SetType("parallel_do_grad"); - for (auto &input_param : this->InputNames()) { - VLOG(3) << input_param; - grad->SetInput(input_param, this->Input(input_param)); - if (input_param != kPlaces) { - grad->SetOutput(framework::GradVarName(input_param), - this->InputGrad(input_param, false)); - } - } - auto *g_block = this->grad_block_[0]; - - // All variable name that needed by gradient operators - std::unordered_set all_inputs_in_grad_blocks; - - for (size_t i = 0; i < g_block->OpSize(); ++i) { - auto *op = g_block->Op(i); - for (auto &var_name : op->InputArgumentNames()) { - all_inputs_in_grad_blocks.insert(var_name); - } - } - - for (auto &output_param : this->OutputNames()) { - if (output_param == kParallelScopes) { - grad->SetInput(output_param, this->Output(output_param)); - grad->SetInput(framework::GradVarName(output_param), - this->Output(output_param)); - } else { - grad->SetInput(output_param, this->Output(output_param)); - std::vector og_names; - for (auto &og_name : this->OutputGrad(output_param)) { - if (all_inputs_in_grad_blocks.count(og_name) != 0) { - // there are some gradient operators who need the OG. So make this - // OG as an input of parallel.do - og_names.push_back(og_name); - } - // else, there is no operator who need the OG. Do not use this OG as - // an input - } - grad->SetInput(framework::GradVarName(output_param), og_names); - } - } - grad->SetInput("Communicator", {"nccl_com__do_not_change_"}); - grad->SetAttrMap(this->Attrs()); - grad->SetBlockAttr(kParallelBlock, grad_block_[0]); - - return std::unique_ptr(grad); - } -}; - -class ParallelDoGradOpShapeInference : public framework::InferShapeBase { - public: - void operator()(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE(ctx->HasInputs(kParameters)); - PADDLE_ENFORCE(ctx->HasInputs(kInputs)); - PADDLE_ENFORCE(ctx->HasInputs(kOutputs)); - - ctx->SetOutputsDim(framework::GradVarName(kParameters), - ctx->GetInputsDim(kParameters)); - - auto i_dims = ctx->GetInputsDim(kInputs); - auto ig_names = ctx->Outputs(framework::GradVarName(kInputs)); - - for (size_t i = 0; i < ig_names.size(); ++i) { - auto &ig_name = ig_names[i]; - if (ig_name == framework::kEmptyVarName) { - continue; - } - - ctx->SetDims({ig_name}, {i_dims[i]}); - } - - auto p_dims = ctx->GetInputsDim(kParameters); - auto pg_names = ctx->Outputs(framework::GradVarName(kParameters)); - for (size_t i = 0; i < pg_names.size(); ++i) { - auto &pg_name = pg_names[i]; - if (pg_name == framework::kEmptyVarName) { - continue; - } - ctx->SetDims({pg_name}, {p_dims[i]}); - } - } -}; - -class ParallelDoGradOpVarTypeInference : public framework::VarTypeInference { - public: - void operator()(const framework::OpDesc &op_desc, - framework::BlockDesc *block) const override { - framework::BlockDesc *sub_block = - boost::get(op_desc.GetAttr(kParallelBlock)); - for (auto &out_vars : op_desc.Outputs()) { - for (auto &out_var : out_vars.second) { - auto &var = block->FindRecursiveOrCreateVar(out_var); - auto sub_var = sub_block->FindRecursiveOrCreateVar(out_var); - if (sub_var.GetType() != var.GetType()) { - var.SetType(sub_var.GetType()); - } - } - } - } -}; - -} // namespace operators -} // namespace paddle - -REGISTER_OPERATOR(parallel_do, paddle::operators::ParallelDoOp, - paddle::operators::ParallelDoOpProtoMaker, - paddle::operators::ParallelDoGradOpDescMaker); -REGISTER_OPERATOR(parallel_do_grad, paddle::operators::ParallelDoGradOp, - paddle::operators::ParallelDoGradOpShapeInference, - paddle::operators::ParallelDoGradOpVarTypeInference); diff --git a/paddle/fluid/operators/controlflow/while_op.cc b/paddle/fluid/operators/controlflow/while_op.cc index 5ab0918c486cc56c7d55f24f4952a013044971ee..e91d9ef7765568a842b31ba682dc1b7e0d8ffa08 100644 --- a/paddle/fluid/operators/controlflow/while_op.cc +++ b/paddle/fluid/operators/controlflow/while_op.cc @@ -261,7 +261,7 @@ class WhileGradOp : public framework::OperatorBase { if (var->IsType()) { auto &inside_tensor = var->Get(); framework::AttributeMap attrs; - attrs["dtype"] = framework::ToDataType(inside_tensor.type()); + attrs["dtype"] = inside_tensor.type(); attrs["shape"] = framework::vectorize2int(inside_tensor.dims()); attrs["value"] = 0.0f; diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc index d7b876628855b8b76b340cd1e6115896ead4aa6c..8e0d2824953a372b96d5819be658636f9a3d78ba 100644 --- a/paddle/fluid/operators/conv_op.cc +++ b/paddle/fluid/operators/conv_op.cc @@ -44,7 +44,9 @@ void ConvOp::InferShape(framework::InferShapeContext* ctx) const { std::vector dilations = ctx->Attrs().Get>("dilations"); PADDLE_ENFORCE(in_dims.size() == 4 || in_dims.size() == 5, - "Conv intput should be 4-D or 5-D tensor."); + "Conv intput should be 4-D or 5-D tensor, get %u", + in_dims.size()); + PADDLE_ENFORCE_EQ( in_dims.size(), filter_dims.size(), "Conv input dimension and filter dimension should be the same."); @@ -95,10 +97,8 @@ framework::OpKernelType ConvOp::GetExpectedKernelType( } #endif - auto input_data_type = - framework::ToDataType(ctx.Input("Input")->type()); - auto filter_data_type = - framework::ToDataType(ctx.Input("Filter")->type()); + auto input_data_type = ctx.Input("Input")->type(); + auto filter_data_type = ctx.Input("Filter")->type(); PADDLE_ENFORCE_EQ(input_data_type, filter_data_type, "input and filter data type should be consistent"); @@ -382,9 +382,9 @@ framework::OpKernelType ConvOpGrad::GetExpectedKernelType( } #endif - return framework::OpKernelType( - framework::ToDataType(ctx.Input("Input")->type()), ctx.GetPlace(), - layout_, library_, customized_type_value); + return framework::OpKernelType(ctx.Input("Input")->type(), + ctx.GetPlace(), layout_, library_, + customized_type_value); } } // namespace operators diff --git a/paddle/fluid/operators/conv_transpose_op.cc b/paddle/fluid/operators/conv_transpose_op.cc index 2fdfc40d194224f0328161f5689da6246b1aae7f..86a140f15219001126283aa8b3f76d72fddb28fc 100644 --- a/paddle/fluid/operators/conv_transpose_op.cc +++ b/paddle/fluid/operators/conv_transpose_op.cc @@ -104,9 +104,8 @@ framework::OpKernelType ConvTransposeOp::GetExpectedKernelType( } #endif - return framework::OpKernelType( - framework::ToDataType(ctx.Input("Input")->type()), ctx.GetPlace(), - layout_, library_); + return framework::OpKernelType(ctx.Input("Input")->type(), + ctx.GetPlace(), layout_, library_); } void Conv2DTransposeOpMaker::Make() { @@ -335,9 +334,8 @@ framework::OpKernelType ConvTransposeOpGrad::GetExpectedKernelType( std::string data_format = ctx.Attr("data_format"); framework::DataLayout layout_ = framework::StringToDataLayout(data_format); - return framework::OpKernelType( - framework::ToDataType(ctx.Input("Input")->type()), ctx.GetPlace(), - layout_, library_); + return framework::OpKernelType(ctx.Input("Input")->type(), + ctx.GetPlace(), layout_, library_); } } // namespace operators diff --git a/paddle/fluid/operators/crf_decoding_op.cc b/paddle/fluid/operators/crf_decoding_op.cc index c27befe1143baa68add4b56f3572eab75272c3a5..81c9e9e543191d9b2d606217d726cc783be97fea 100644 --- a/paddle/fluid/operators/crf_decoding_op.cc +++ b/paddle/fluid/operators/crf_decoding_op.cc @@ -118,9 +118,8 @@ class CRFDecodingOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("Emission")->type()), - platform::CPUPlace()); + return framework::OpKernelType(ctx.Input("Emission")->type(), + platform::CPUPlace()); } }; } // namespace operators diff --git a/paddle/fluid/operators/crop_op.cc b/paddle/fluid/operators/crop_op.cc index a2a871efa850df5101be7c27ebd81456acace7e1..97d20681b8136c13d512c0b86a7ff15b24367db2 100644 --- a/paddle/fluid/operators/crop_op.cc +++ b/paddle/fluid/operators/crop_op.cc @@ -51,9 +51,8 @@ class CropOp : public framework::OperatorWithKernel { framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), - ctx.device_context()); + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.device_context()); } }; @@ -174,9 +173,7 @@ class CropOpGrad : public framework::OperatorWithKernel { framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( - framework::ToDataType( - ctx.Input(framework::GradVarName("Out")) - ->type()), + ctx.Input(framework::GradVarName("Out"))->type(), ctx.device_context()); } }; diff --git a/paddle/fluid/operators/cross_entropy_op.cc b/paddle/fluid/operators/cross_entropy_op.cc index a904dd91302c951560dc32ac107d4d73b6024c25..1968e54b00601139e252f0480ca3ae1fc08904f4 100644 --- a/paddle/fluid/operators/cross_entropy_op.cc +++ b/paddle/fluid/operators/cross_entropy_op.cc @@ -57,9 +57,8 @@ class CrossEntropyOp : public framework::OperatorWithKernel { // is determined by its input "X". framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), - ctx.device_context()); + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.device_context()); } }; @@ -111,9 +110,8 @@ class CrossEntropyGradientOp : public framework::OperatorWithKernel { // is determined by its input "X". framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), - ctx.device_context()); + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.device_context()); } }; diff --git a/paddle/fluid/operators/ctc_align_op.cc b/paddle/fluid/operators/ctc_align_op.cc index d2b440d9d2e50340af7a7bb4e76e55beea1bcb46..e7c472f8c0ce2cfe70b24be3c6930093922b0e27 100644 --- a/paddle/fluid/operators/ctc_align_op.cc +++ b/paddle/fluid/operators/ctc_align_op.cc @@ -36,9 +36,8 @@ class CTCAlignOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("Input")->type()), - ctx.device_context()); + return framework::OpKernelType(ctx.Input("Input")->type(), + ctx.device_context()); } }; diff --git a/paddle/fluid/operators/cudnn_lstm_op.cu.cc b/paddle/fluid/operators/cudnn_lstm_op.cu.cc index 744d149714c62713c0138617f55f569626d21cdb..7fc0294f8d2e2f7a3ca5e23deb91f2bf7a451435 100644 --- a/paddle/fluid/operators/cudnn_lstm_op.cu.cc +++ b/paddle/fluid/operators/cudnn_lstm_op.cu.cc @@ -300,9 +300,11 @@ class CudnnLSTMGPUKernel : public framework::OpKernel { } CudnnRNNCache *cudnn_rnn_cache = nullptr; if (cache_var->IsInitialized()) { + // const_cast is usually bad. cudnn_rnn_cache = const_cast(cache_var) ->GetMutable(); } else { + // const_cast is usually bad. cudnn_rnn_cache = const_cast(cache_var) ->GetMutable(); std::random_device rnd; diff --git a/paddle/fluid/operators/detection/anchor_generator_op.cc b/paddle/fluid/operators/detection/anchor_generator_op.cc index 0c0155a0a977846b1300d93b4c3fef0e71fc1d26..f2984d1af2f26d901bc30ecfd519d5268a60278a 100644 --- a/paddle/fluid/operators/detection/anchor_generator_op.cc +++ b/paddle/fluid/operators/detection/anchor_generator_op.cc @@ -53,8 +53,7 @@ class AnchorGeneratorOp : public framework::OperatorWithKernel { framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( - framework::ToDataType(ctx.Input("Input")->type()), - ctx.device_context()); + ctx.Input("Input")->type(), ctx.device_context()); } }; diff --git a/paddle/fluid/operators/detection/bipartite_match_op.cc b/paddle/fluid/operators/detection/bipartite_match_op.cc index c23b65fe4dead3ca01a447d03877e3359b19e656..b7da1261a8f9780028bf2d36903e54d7e270bec0 100644 --- a/paddle/fluid/operators/detection/bipartite_match_op.cc +++ b/paddle/fluid/operators/detection/bipartite_match_op.cc @@ -45,9 +45,8 @@ class BipartiteMatchOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("DistMat")->type()), - platform::CPUPlace()); + return framework::OpKernelType(ctx.Input("DistMat")->type(), + platform::CPUPlace()); } }; diff --git a/paddle/fluid/operators/detection/density_prior_box_op.cc b/paddle/fluid/operators/detection/density_prior_box_op.cc index 1012ba3652ddfb06af9292e8061684481f1dbef3..cacd47ed4a80489c59cdd80747d69c70bd5ea286 100644 --- a/paddle/fluid/operators/detection/density_prior_box_op.cc +++ b/paddle/fluid/operators/detection/density_prior_box_op.cc @@ -66,8 +66,7 @@ class DensityPriorBoxOp : public framework::OperatorWithKernel { framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( - framework::ToDataType(ctx.Input("Input")->type()), - ctx.GetPlace()); + ctx.Input("Input")->type(), ctx.GetPlace()); } }; diff --git a/paddle/fluid/operators/detection/density_prior_box_op.cu b/paddle/fluid/operators/detection/density_prior_box_op.cu index 3b7c781795f02b9d9c9f2ead51034193ceb2a745..6a92762896b89a06a91cd11fb38587f7df69e6c3 100644 --- a/paddle/fluid/operators/detection/density_prior_box_op.cu +++ b/paddle/fluid/operators/detection/density_prior_box_op.cu @@ -146,7 +146,8 @@ class DensityPriorBoxOpCUDAKernel : public framework::OpKernel { // At least use 32 threads, at most 512 threads. // blockx is multiple of 32. - int blockx = std::min(((feature_width * num_priors + 31) >> 5) << 5, 512L); + int blockx = std::min( + static_cast(((feature_width * num_priors + 31) >> 5) << 5), 512L); int gridx = (feature_width * num_priors + blockx - 1) / blockx; dim3 threads(blockx, 1); dim3 grids(gridx, feature_height); diff --git a/paddle/fluid/operators/detection/generate_proposals_op.cc b/paddle/fluid/operators/detection/generate_proposals_op.cc index f1975a9a4be69525c5145d958d930c8bbddbe4d7..06e48f1262a74dfdfd6d38e71cd02116f3e6eca5 100644 --- a/paddle/fluid/operators/detection/generate_proposals_op.cc +++ b/paddle/fluid/operators/detection/generate_proposals_op.cc @@ -60,9 +60,8 @@ class GenerateProposalsOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("Anchors")->type()), - ctx.device_context()); + return framework::OpKernelType(ctx.Input("Anchors")->type(), + ctx.device_context()); } }; diff --git a/paddle/fluid/operators/detection/mine_hard_examples_op.cc b/paddle/fluid/operators/detection/mine_hard_examples_op.cc index 54a4b87ec8f13c4d474aad4cc0b8159cd5f59d1c..f70e6adb5b4aefc02dabd4425ee4d633fff82e31 100644 --- a/paddle/fluid/operators/detection/mine_hard_examples_op.cc +++ b/paddle/fluid/operators/detection/mine_hard_examples_op.cc @@ -249,8 +249,7 @@ class MineHardExamplesOp : public framework::OperatorWithKernel { framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( - framework::ToDataType(ctx.Input("ClsLoss")->type()), - platform::CPUPlace()); + ctx.Input("ClsLoss")->type(), platform::CPUPlace()); } }; diff --git a/paddle/fluid/operators/detection/multiclass_nms_op.cc b/paddle/fluid/operators/detection/multiclass_nms_op.cc index f0f8851be0ec2b532c570dc82b8ed5c290981aab..2395b181485429784e0f3dff6d056b84268ef245 100644 --- a/paddle/fluid/operators/detection/multiclass_nms_op.cc +++ b/paddle/fluid/operators/detection/multiclass_nms_op.cc @@ -65,8 +65,7 @@ class MultiClassNMSOp : public framework::OperatorWithKernel { framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( - framework::ToDataType( - ctx.Input("Scores")->type()), + ctx.Input("Scores")->type(), platform::CPUPlace()); } }; diff --git a/paddle/fluid/operators/detection/prior_box_op.cc b/paddle/fluid/operators/detection/prior_box_op.cc index b5cb6a724c095eb849f3a184f13843e1a0cca92f..3e75c0394f971d0c8ab5edc88467e56c86db8815 100644 --- a/paddle/fluid/operators/detection/prior_box_op.cc +++ b/paddle/fluid/operators/detection/prior_box_op.cc @@ -72,8 +72,7 @@ class PriorBoxOp : public framework::OperatorWithKernel { framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( - framework::ToDataType(ctx.Input("Input")->type()), - ctx.device_context()); + ctx.Input("Input")->type(), ctx.device_context()); } }; diff --git a/paddle/fluid/operators/detection/roi_perspective_transform_op.cc b/paddle/fluid/operators/detection/roi_perspective_transform_op.cc index 42c720e701fbabacf1280dec2f78d3f6b99dfea2..3796854fe67389c8ea68cd0098d31551fb50b957 100644 --- a/paddle/fluid/operators/detection/roi_perspective_transform_op.cc +++ b/paddle/fluid/operators/detection/roi_perspective_transform_op.cc @@ -498,9 +498,8 @@ class ROIPerspectiveTransformOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), - ctx.device_context()); + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.device_context()); } }; @@ -519,9 +518,8 @@ class ROIPerspectiveTransformGradOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), - ctx.device_context()); + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.device_context()); } }; diff --git a/paddle/fluid/operators/detection/roi_perspective_transform_op.cu b/paddle/fluid/operators/detection/roi_perspective_transform_op.cu index 2d262f932aed9761143f7983c9a38f7a97c374ea..862d664d42e03d2ae968ea0bdec8ae8e50bf7fb3 100644 --- a/paddle/fluid/operators/detection/roi_perspective_transform_op.cu +++ b/paddle/fluid/operators/detection/roi_perspective_transform_op.cu @@ -35,12 +35,12 @@ namespace operators { template __device__ bool GT_E(T a, T b) { - return (a > b) || fabs(a - b) < 1e-4; + return (a > b) || Eigen::numext::abs(a - b) < 1e-4; } template __device__ bool LT_E(T a, T b) { - return (a < b) || fabs(a - b) < 1e-4; + return (a < b) || Eigen::numext::abs(a - b) < 1e-4; } template diff --git a/paddle/fluid/operators/detection/rpn_target_assign_op.cc b/paddle/fluid/operators/detection/rpn_target_assign_op.cc index fd5d75ba527a5c98b283f691fe426c2e48529557..0b8053e8d03c426e5a1b619e67bc8dae21c5c024 100644 --- a/paddle/fluid/operators/detection/rpn_target_assign_op.cc +++ b/paddle/fluid/operators/detection/rpn_target_assign_op.cc @@ -77,8 +77,7 @@ class RpnTargetAssignOp : public framework::OperatorWithKernel { framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( - framework::ToDataType( - ctx.Input("Anchor")->type()), + ctx.Input("Anchor")->type(), platform::CPUPlace()); } }; diff --git a/paddle/fluid/operators/detection/target_assign_op.cc b/paddle/fluid/operators/detection/target_assign_op.cc index 367001939251114a9cf442fd85c734958ccb2da8..c057c82ce0f5eef67c09d0ed719ddd24382f451d 100644 --- a/paddle/fluid/operators/detection/target_assign_op.cc +++ b/paddle/fluid/operators/detection/target_assign_op.cc @@ -57,9 +57,8 @@ class TargetAssignOp : public framework::OperatorWithKernel { protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), - ctx.device_context()); + return framework::OpKernelType(ctx.Input("X")->type(), + ctx.device_context()); } }; diff --git a/paddle/fluid/operators/detection_map_op.cc b/paddle/fluid/operators/detection_map_op.cc index d7f49a9590e4ef4ca4d2ad5a92572c70e6bfb6ac..e1d113f8542da8827b9e36e44fc1bac6c07c9257 100644 --- a/paddle/fluid/operators/detection_map_op.cc +++ b/paddle/fluid/operators/detection_map_op.cc @@ -71,8 +71,7 @@ class DetectionMAPOp : public framework::OperatorWithKernel { framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( - framework::ToDataType( - ctx.Input("DetectRes")->type()), + ctx.Input("DetectRes")->type(), platform::CPUPlace()); } }; diff --git a/paddle/fluid/operators/distributed/CMakeLists.txt b/paddle/fluid/operators/distributed/CMakeLists.txt index 101dbe9c89616b7025337261469e2b1aa3e8bc76..eab4297c737bbf2424fff8814b73942cd520d778 100644 --- a/paddle/fluid/operators/distributed/CMakeLists.txt +++ b/paddle/fluid/operators/distributed/CMakeLists.txt @@ -12,7 +12,7 @@ configure_file(send_recv.proto.in ${CMAKE_CURRENT_SOURCE_DIR}/send_recv.proto @O set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") if(WITH_GRPC) - grpc_library(sendrecvop_grpc SRCS grpc_bytebuffer_stream.cc sendrecvop_utils.cc grpc_client.cc + grpc_library(sendrecvop_rpc SRCS grpc_bytebuffer_stream.cc sendrecvop_utils.cc grpc_client.cc request_handler_impl.cc rpc_client.cc rpc_server.cc grpc_server.cc variable_response.cc grpc_variable_response.cc grpc_serde.cc collective_client.cc collective_server.cc PROTO send_recv.proto DEPS lod_tensor selected_rows_functor memory) @@ -20,36 +20,43 @@ if(WITH_GRPC) set_source_files_properties(grpc_serde_test.cc rpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) cc_test(grpc_serde_test SRCS grpc_serde_test.cc - DEPS grpc++_unsecure grpc_unsecure gpr cares zlib protobuf sendrecvop_grpc scope profiler math_function SERIAL) + DEPS grpc++_unsecure grpc_unsecure gpr cares zlib protobuf sendrecvop_rpc scope profiler math_function SERIAL) cc_test(rpc_server_test SRCS rpc_server_test.cc - DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor proto_desc lookup_sparse_table_op SERIAL) + DEPS sendrecvop_rpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor proto_desc lookup_sparse_table_op SERIAL) cc_test(varhandle_test SRCS varhandle_test.cc DEPS profiler) if(WITH_GPU) cc_test(collective_server_test SRCS collective_server_test.cc - DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor + DEPS sendrecvop_rpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor selected_rows_functor scope math_function SERIAL) endif() - cc_library(parameter_prefetch SRCS parameter_prefetch.cc DEPS sendrecvop_grpc memory) + cc_library(parameter_prefetch SRCS parameter_prefetch.cc DEPS sendrecvop_rpc memory) else() - set_source_files_properties(brpc_server.cc brpc_client.cc rpc_server_test.cc brpc_serde_test.cc - brpc_variable_response.cc brpc_sendrecvop_utils.cc brpc_rdma_pool.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + set_source_files_properties(brpc_server.cc parameter_prefetch.cc brpc_client.cc rpc_server_test.cc brpc_serde_test.cc + brpc_variable_response.cc brpc_sendrecvop_utils.cc brpc_rdma_pool.cc collective_server.cc collective_server_test.cc + collective_client.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) - brpc_library(sendrecvop_brpc SRCS brpc_client.cc brpc_server.cc rpc_server.cc rpc_client.cc request_handler_impl.cc brpc_sendrecvop_utils.cc - brpc_variable_response.cc variable_response.cc sendrecvop_utils.cc brpc_rdma_pool.cc + brpc_library(sendrecvop_rpc SRCS brpc_client.cc brpc_server.cc rpc_server.cc rpc_client.cc request_handler_impl.cc brpc_sendrecvop_utils.cc + brpc_variable_response.cc variable_response.cc sendrecvop_utils.cc brpc_rdma_pool.cc collective_client.cc collective_server.cc PROTO send_recv.proto DEPS lod_tensor selected_rows memory) - cc_library(parameter_prefetch SRCS parameter_prefetch.cc DEPS sendrecvop_brpc memory) + cc_library(parameter_prefetch SRCS parameter_prefetch.cc DEPS sendrecvop_rpc memory) - set(brpc_test_depends sendrecvop_brpc brpc ssl crypto protobuf leveldb gflags glog executor proto_desc lookup_table_op snappystream snappy) + set(brpc_test_depends sendrecvop_rpc brpc ssl crypto protobuf leveldb gflags glog executor + proto_desc lookup_sparse_table_op snappystream snappy zlib) - cc_test(brpc_server_test SRCS rpc_server_test.cc + cc_test(rpc_server_test SRCS rpc_server_test.cc DEPS ${brpc_test_depends} SERIAL) cc_test(brpc_serde_test SRCS brpc_serde_test.cc DEPS ${brpc_test_depends} SERIAL) + + if(WITH_GPU) + cc_test(collective_server_test SRCS collective_server_test.cc + DEPS ${brpc_test_depends} selected_rows_functor scope math_function SERIAL) + endif() endif() diff --git a/paddle/fluid/operators/distributed/brpc_client.cc b/paddle/fluid/operators/distributed/brpc_client.cc index 350969f74be258ffbfef687b56083a9c6508bc81..62e32977b8cd7e70ddd9f5d879c6844ff346ce80 100644 --- a/paddle/fluid/operators/distributed/brpc_client.cc +++ b/paddle/fluid/operators/distributed/brpc_client.cc @@ -14,135 +14,316 @@ #include "paddle/fluid/operators/distributed/brpc_client.h" #include "paddle/fluid/framework/threadpool.h" +#include "paddle/fluid/operators/distributed/brpc_sendrecvop_utils.h" +#include "paddle/fluid/platform/profiler.h" namespace paddle { namespace operators { namespace distributed { -DEFINE_int32(brpc_channel_num, 24, - "Number of channels to send requests connected to one server"); DEFINE_int32(timeout_ms, 30000, "RPC timeout in milliseconds"); DEFINE_int32(max_retry, 3, "Max retries(not including the first RPC)"); BRPCClient::~BRPCClient() { Wait(); } -void HandleSendResponse(brpc::Controller* cntl, - sendrecv::VoidMessage* response) { +void HandleSendResponse(brpc::Controller* cntl, sendrecv::VoidMessage* response, + VarHandlePtr var_h, ChannelQueuePtr ch_ptr, + ChannelContextPtr ch_ctx, BRPCClient* cls) { // std::unique_ptr makes sure cntl/response will be deleted before returning. std::unique_ptr cntl_guard(cntl); std::unique_ptr response_guard(response); + // this channel can be used by other now. + ch_ptr->Push(ch_ctx); + if (cntl->Failed()) { - LOG(WARNING) << "Fail to send EchoRequest, " << cntl->ErrorText(); + LOG(FATAL) << "Fail to send SendVar: " << var_h->name() + << ", error text: " << cntl->ErrorText(); + var_h->Finish(false); + cls->DecreaseReqCount(); return; } - LOG(INFO) << "Received response from " << cntl->remote_side() - << " latency=" << cntl->latency_us() << "us"; + var_h->Finish(true); + cls->DecreaseReqCount(); + + VLOG(4) << "HandleSendResponse from: " << cntl->remote_side() + << ", varname: " << var_h->name() + << ", latency: " << cntl->latency_us() << "us"; + VLOG(4) << "Finish HandleSendResponse"; } -bool BRPCClient::AsyncSendVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, int64_t time_out) { +VarHandlePtr BRPCClient::AsyncSendVar(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& var_name, + int64_t time_out) { const platform::DeviceContext* p_ctx = &ctx; const std::string ep_val = ep; const std::string var_name_val = var_name; const framework::Scope* p_scope = &scope; const auto ch_ptr = GetChannel(ep_val); + const std::string method = "SendRPC"; + VarHandlePtr var_h(new VarHandle(ep, method, var_name_val, p_ctx, p_scope)); + + framework::AsyncIO([=] { + auto ch_ctx = ch_ptr->Pop(); + brpc::Controller* cntl = new brpc::Controller(); + sendrecv::VoidMessage* response = new sendrecv::VoidMessage(); + cntl->set_timeout_ms(time_out); - framework::AsyncIO( - [var_name_val, p_ctx, ep_val, p_scope, time_out, ch_ptr, this] { - auto ch_ctx = ch_ptr->Pop(); - brpc::Controller* cntl = new brpc::Controller(); - sendrecv::VoidMessage* response = new sendrecv::VoidMessage(); - cntl->set_timeout_ms(time_out); + auto* var = p_scope->FindVar(var_name_val); + sendrecv::VariableMessage request; + distributed::SerializeToIOBuf(var_name_val, var, *p_ctx, &request, + &cntl->request_attachment(), "", false, + trainer_id_); - google::protobuf::Closure* done = - brpc::NewCallback(&HandleSendResponse, cntl, response); + google::protobuf::Closure* done = brpc::NewCallback( + &HandleSendResponse, cntl, response, var_h, ch_ptr, ch_ctx, this); - sendrecv::VariableMessage request; - ch_ctx->stub->SendVariable(cntl, &request, response, done); - }); + platform::RecordRPCEvent record_event(method, p_ctx); + + ch_ctx->stub->SendVariable(cntl, &request, response, done); + + if (UNLIKELY(platform::IsProfileEnabled())) { + var_h->Wait(); + } + }); req_count_++; - return true; + return var_h; } +void HandleFetchBarrierResponse(brpc::Controller* cntl, + sendrecv::VariableMessage* response, + VarHandlePtr var_h, ChannelQueuePtr ch_ptr, + ChannelContextPtr ch_ctx, BRPCClient* cls) { + // std::unique_ptr makes sure cntl/response will be deleted before returning. + std::unique_ptr cntl_guard(cntl); + std::unique_ptr response_guard(response); + + // this channel can be used other now. + ch_ptr->Push(ch_ctx); + if (cntl->Failed()) { + LOG(FATAL) << "Fail to get HandleFetchBarrierResponse: " << var_h->name() + << ", error text: " << cntl->ErrorText(); + var_h->Finish(false); + cls->DecreaseReqCount(); + return; + } + + var_h->Finish(true); + cls->DecreaseReqCount(); + + VLOG(4) << "HandleFetchBarrierResponse from: " << cntl->remote_side() + << ", varname: " << var_h->name() + << ", latency: " << cntl->latency_us() << "us"; + VLOG(4) << "Finish HandleFetchBarrierResponse"; +} void HandleGetResponse(brpc::Controller* cntl, - sendrecv::VariableMessage* response) { + sendrecv::VariableMessage* response, VarHandlePtr var_h, + ChannelQueuePtr ch_ptr, ChannelContextPtr ch_ctx, + BRPCClient* cls) { // std::unique_ptr makes sure cntl/response will be deleted before returning. std::unique_ptr cntl_guard(cntl); std::unique_ptr response_guard(response); + // this channel can be used other now. + ch_ptr->Push(ch_ctx); + if (cntl->Failed()) { - LOG(WARNING) << "Fail to send EchoRequest, " << cntl->ErrorText(); + LOG(FATAL) << "Fail to GetVar: " << var_h->name() + << ", error text: " << cntl->ErrorText(); + cls->DecreaseReqCount(); + var_h->Finish(false); return; } - LOG(INFO) << "Received response from " << cntl->remote_side() - << " latency=" << cntl->latency_us() << "us"; - // framework::Variable* outvar = nullptr; - // DeserializeFromByteBuffer(ret_msg, *var_h.ctx, var_h.scope, &outvar); + VLOG(4) << "HandleGetResponse from: " << cntl->remote_side() + << ", varname: " << var_h->name() + << ", latency: " << cntl->latency_us() << "us"; + + framework::Variable* outvar = nullptr; + int trainer_id; + distributed::DeserializeFromIOBuf(*response, cntl->response_attachment(), + *var_h->ctx(), var_h->scope(), &outvar, + &trainer_id); + VLOG(4) << "Finish HandleGetResponse"; + cls->DecreaseReqCount(); + var_h->Finish(true); } -bool BRPCClient::AsyncGetVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, int64_t time_out) { +VarHandlePtr BRPCClient::_AsyncGetVar(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& var_name, + const std::string& method_name, + int64_t time_out) { const platform::DeviceContext* p_ctx = &ctx; const std::string ep_val = ep; const std::string var_name_val = var_name; const framework::Scope* p_scope = &scope; - const auto ch = GetChannel(ep_val); + const auto ch_ptr = GetChannel(ep_val); + const std::string method = "GetRPC"; + VarHandlePtr var_h(new VarHandle(ep, method, var_name_val, p_ctx, p_scope)); + + framework::AsyncIO([=] { + auto ch_ctx = ch_ptr->Pop(); + + brpc::Controller* cntl = new brpc::Controller(); + sendrecv::VariableMessage* response = new sendrecv::VariableMessage(); + cntl->set_timeout_ms(time_out); - framework::AsyncIO( - [var_name_val, ep_val, p_scope, p_ctx, time_out, ch, this] {}); + sendrecv::VariableMessage req; + req.set_varname(var_name_val); + req.set_trainer_id(trainer_id_); + + google::protobuf::Closure* done = brpc::NewCallback( + &HandleGetResponse, cntl, response, var_h, ch_ptr, ch_ctx, this); + + platform::RecordRPCEvent record_event(method, p_ctx); + + if (method_name == "GetMonomerVariable") { + ch_ctx->stub->GetMonomerVariable(cntl, &req, response, done); + } else { + ch_ctx->stub->GetVariable(cntl, &req, response, done); + } + + if (UNLIKELY(platform::IsProfileEnabled())) { + var_h->Wait(); + } + }); req_count_++; - return true; + return var_h; +} + +VarHandlePtr BRPCClient::AsyncGetMonomerVariable( + const std::string& ep, const platform::DeviceContext& ctx, + const framework::Scope& scope, const std::string& var_name, + int64_t time_out) { + return _AsyncGetVar(ep, ctx, scope, var_name, "GetMonomerVariable", time_out); +} + +VarHandlePtr BRPCClient::AsyncGetMonomerBarrier(const std::string& ep, + const std::string& var_name, + int64_t time_out) { + return AsyncSendMessage(ep, "GetMonomerBarrier", var_name, time_out); } -bool BRPCClient::AsyncPrefetchVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& in_var_name, - const std::string& out_var_name, - int64_t time_out) { +VarHandlePtr BRPCClient::AsyncGetVar(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& var_name, + int64_t time_out) { + return _AsyncGetVar(ep, ctx, scope, var_name, "GetVariable", time_out); +} + +VarHandlePtr BRPCClient::AsyncPrefetchVar(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& in_var_name, + const std::string& out_var_name, + const std::string& table_name, + int64_t time_out) { const platform::DeviceContext* p_ctx = &ctx; const std::string ep_val = ep; const std::string in_var_name_val = in_var_name; const std::string out_var_name_val = out_var_name; + const std::string table_name_val = table_name; const framework::Scope* p_scope = &scope; - const auto ch = GetChannel(ep_val); + const auto ch_ptr = GetChannel(ep_val); + + const std::string method = "PrefetchRPC"; + + VarHandlePtr var_h( + new VarHandle(ep, method, out_var_name_val, p_ctx, p_scope)); + + framework::AsyncIO([=] { + auto ch_ctx = ch_ptr->Pop(); + + brpc::Controller* cntl = new brpc::Controller(); + sendrecv::VariableMessage* response = new sendrecv::VariableMessage(); + cntl->set_timeout_ms(time_out); + + auto* var = p_scope->FindVar(in_var_name_val); + sendrecv::VariableMessage req; + distributed::SerializeToIOBuf(in_var_name_val, var, *p_ctx, &req, + &cntl->request_attachment(), out_var_name_val, + false, 0, table_name_val); + + platform::RecordRPCEvent record_event(method, p_ctx); + + google::protobuf::Closure* done = brpc::NewCallback( + &HandleGetResponse, cntl, response, var_h, ch_ptr, ch_ctx, this); - framework::AsyncIO([in_var_name_val, out_var_name_val, ep_val, p_scope, p_ctx, - time_out, ch, this] {}); + ch_ctx->stub->PrefetchVariable(cntl, &req, response, done); + + if (UNLIKELY(platform::IsProfileEnabled())) { + var_h->Wait(); + } + }); req_count_++; - return true; + return var_h; } -void BRPCClient::AsyncSendBatchBarrier(const std::string& ep, - int64_t time_out) { - req_count_++; +VarHandlePtr BRPCClient::AsyncSendBatchBarrier(const std::string& ep, + int64_t time_out) { + return AsyncSendMessage(ep, "BatchBarrierRPC", BATCH_BARRIER_MESSAGE, + time_out); } -void BRPCClient::AsyncSendFetchBarrier(const std::string& ep, - int64_t time_out) { +VarHandlePtr BRPCClient::AsyncSendFetchBarrier(const std::string& ep, + int64_t time_out) { + auto ch_ptr = GetChannel(ep); + auto ch_ctx = ch_ptr->Pop(); + + brpc::Controller* cntl = new brpc::Controller(); + sendrecv::VariableMessage* response = new sendrecv::VariableMessage(); + cntl->set_timeout_ms(time_out); + + sendrecv::VariableMessage req; + req.set_varname(FETCH_BARRIER_MESSAGE); + + const std::string method = "FetchBarrierRPC"; + // var handle + VarHandlePtr var_h( + new VarHandle(ep, method, FETCH_BARRIER_MESSAGE, nullptr, nullptr)); + + platform::RecordRPCEvent record_event(method, nullptr); + + google::protobuf::Closure* done = brpc::NewCallback( + &HandleFetchBarrierResponse, cntl, response, var_h, ch_ptr, ch_ctx, this); + + ch_ctx->stub->GetVariable(cntl, &req, response, done); + req_count_++; + + if (UNLIKELY(platform::IsProfileEnabled())) { + var_h->Wait(); + } + + return var_h; } -void BRPCClient::Wait() { - std::unique_lock lk(sync_mutex_); - sync_cond_.wait(lk, [this] { return req_count_ == 0; }); +bool BRPCClient::Wait() { + VLOG(9) << "begin to brpcclient wait"; + { + std::unique_lock lk(sync_mutex_); + sync_cond_.wait(lk, [this] { return req_count_ == 0; }); + } + VLOG(9) << "end to brpcclient wait"; + return true; } ChannelQueuePtr BRPCClient::GetChannel(const std::string& ep) { + VLOG(4) << "begin to GetChannel:" << ep; { std::lock_guard guard(chan_mutex_); auto it = channels_.find(ep); if (it != channels_.end()) { + VLOG(4) << "end to GetChannel:" << ep; return it->second; } } @@ -150,12 +331,20 @@ ChannelQueuePtr BRPCClient::GetChannel(const std::string& ep) { ChannelQueuePtr q(new framework::BlockingQueue()); brpc::ChannelOptions options; +#ifdef PADDLE_WITH_BRPC_RDMA + options.use_rdma = true; +#endif options.protocol = "baidu_std"; - options.connection_type = "pooled"; - options.connect_timeout_ms = 100; + // don't use pooled type. the server can't afford that. + options.connection_type = "single"; + options.connect_timeout_ms = 1000; options.timeout_ms = FLAGS_timeout_ms /*milliseconds*/; options.max_retry = FLAGS_max_retry; - for (int i = 0; i < FLAGS_brpc_channel_num; ++i) { + + VLOG(1) << "create " << brpc_channel_num_per_server_ + << " brpc channels to pserver:" << ep; + + for (int i = 0; i < brpc_channel_num_per_server_; ++i) { std::shared_ptr c(new ChannelContext()); if (c->channel.Init(ep.c_str(), &options) != 0) { LOG(FATAL) << "Fail to initialize channel"; @@ -172,9 +361,75 @@ ChannelQueuePtr BRPCClient::GetChannel(const std::string& ep) { channels_[ep] = q; } + VLOG(4) << "end to GetChannel:" << ep; return q; } +VarHandlePtr BRPCClient::AsyncSendComplete(const std::string& ep, + int64_t time_out) { + return AsyncSendMessage(ep, "SendCompleteRPC", COMPLETE_MESSAGE, time_out); +} + +void BRPCClient::SendComplete() { + for (auto& kv : channels_) { + AsyncSendComplete(kv.first); + } +} + +VarHandlePtr BRPCClient::AsyncSendVarMessage( + const std::string& ep, const std::string& method_name, + const sendrecv::VariableMessage& req, int64_t time_out) { + auto ch_ptr = GetChannel(ep); + auto ch_ctx = ch_ptr->Pop(); + + brpc::Controller* cntl = new brpc::Controller(); + sendrecv::VoidMessage* response = new sendrecv::VoidMessage(); + cntl->set_timeout_ms(time_out); + + platform::RecordRPCEvent record_event(method_name, nullptr); + + VarHandlePtr var_h( + new VarHandle(ep, method_name, req.varname(), nullptr, nullptr)); + + google::protobuf::Closure* done = brpc::NewCallback( + &HandleSendResponse, cntl, response, var_h, ch_ptr, ch_ctx, this); + + if (method_name == "CheckPointNotifyRPC") { + ch_ctx->stub->CheckpointNotify(cntl, &req, response, done); + } else if (method_name == "GetMonomerBarrier") { + ch_ctx->stub->GetMonomerBarrier(cntl, &req, response, done); + } else { + ch_ctx->stub->SendVariable(cntl, &req, response, done); + } + req_count_++; + + if (UNLIKELY(platform::IsProfileEnabled())) { + var_h->Wait(); + } + + return var_h; +} + +VarHandlePtr BRPCClient::AsyncSendMessage(const std::string& ep, + const std::string& method_name, + const std::string& message, + int64_t time_out) { + sendrecv::VariableMessage req; + req.set_varname(message); + + return AsyncSendVarMessage(ep, method_name, req, time_out); +} + +VarHandlePtr BRPCClient::AsyncCheckpointNotify(const std::string& ep, + const std::string& dir, + int64_t time_out) { + sendrecv::VariableMessage req; + req.set_varname(CHECKPOINT_SAVE_MESSAGE); + req.set_out_varname(dir); + + return AsyncSendVarMessage(ep, "CheckPointNotifyRPC", req, time_out); +} + } // namespace distributed } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/distributed/brpc_client.h b/paddle/fluid/operators/distributed/brpc_client.h index 8ff1f0a6076b3574c42065edcbac50eb75b3b483..80cc81bff37916b558aa87104f194eb37a7fc309 100644 --- a/paddle/fluid/operators/distributed/brpc_client.h +++ b/paddle/fluid/operators/distributed/brpc_client.h @@ -31,6 +31,8 @@ limitations under the License. */ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/operators/distributed/brpc_sendrecvop_utils.h" +#include "paddle/fluid/operators/distributed/request_handler.h" #include "paddle/fluid/operators/distributed/rpc_client.h" #include "paddle/fluid/operators/distributed/send_recv.pb.h" #include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN @@ -53,33 +55,94 @@ class BRPCClient : public RPCClient { BRPCClient() {} virtual ~BRPCClient(); - bool AsyncSendVar(const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& var_name, - int64_t time_out = FLAGS_rpc_deadline) override; + VarHandlePtr AsyncSendVar(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& var_name, + int64_t time_out = FLAGS_rpc_deadline) override; - bool AsyncGetVar(const std::string& ep, const platform::DeviceContext& ctx, - const framework::Scope& scope, const std::string& var_name, - int64_t time_out = FLAGS_rpc_deadline) override; + VarHandlePtr AsyncGetVar(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& var_name, + int64_t time_out = FLAGS_rpc_deadline) override; - bool AsyncPrefetchVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& in_var_name, - const std::string& out_var_name, - int64_t time_out = FLAGS_rpc_deadline) override; + VarHandlePtr AsyncGetMonomerBarrier( + const std::string& ep, const std::string& var_name, + int64_t time_out = FLAGS_rpc_deadline) override; - void AsyncSendBatchBarrier(const std::string& ep, - int64_t time_out = FLAGS_rpc_deadline) override; + VarHandlePtr AsyncGetMonomerVariable( + const std::string& ep, const platform::DeviceContext& ctx, + const framework::Scope& scope, const std::string& var_name, + int64_t time_out = FLAGS_rpc_deadline) override; - void AsyncSendFetchBarrier(const std::string& ep, - int64_t time_out = FLAGS_rpc_deadline) override; + VarHandlePtr AsyncPrefetchVar(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& in_var_name, + const std::string& out_var_name, + const std::string& table_name = "", + int64_t time_out = FLAGS_rpc_deadline) override; - void Wait() override; + VarHandlePtr AsyncSendBatchBarrier( + const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) override; + + VarHandlePtr AsyncSendFetchBarrier( + const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) override; + + VarHandlePtr AsyncCheckpointNotify( + const std::string& ep, const std::string& dir, + int64_t time_out = FLAGS_rpc_deadline) override; + + bool Wait() override; + + void SendComplete() override; private: + VarHandlePtr _AsyncGetVar(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& var_name, + const std::string& method_name, + int64_t time_out = FLAGS_rpc_deadline); + void Proceed(); ChannelQueuePtr GetChannel(const std::string& ep); + VarHandlePtr AsyncSendComplete(const std::string& ep, + int64_t time_out = FLAGS_rpc_deadline); + + VarHandlePtr AsyncSendMessage(const std::string& ep, + const std::string& method_name, + const std::string& message, int64_t time_out); + + VarHandlePtr AsyncSendVarMessage(const std::string& ep, + const std::string& method_name, + const sendrecv::VariableMessage& req, + int64_t time_out); + + friend void HandleSendResponse(brpc::Controller* cntl, + sendrecv::VoidMessage* response, + VarHandlePtr var_h, ChannelQueuePtr ch_ptr, + ChannelContextPtr ch_ctx, BRPCClient* cls); + + friend void HandleGetResponse(brpc::Controller* cntl, + sendrecv::VariableMessage* response, + VarHandlePtr var_h, ChannelQueuePtr ch_ptr, + ChannelContextPtr ch_ctx, BRPCClient* cls); + + friend void HandleFetchBarrierResponse(brpc::Controller* cntl, + sendrecv::VariableMessage* response, + VarHandlePtr var_h, + ChannelQueuePtr ch_ptr, + ChannelContextPtr ch_ctx, + BRPCClient* cls); + void DecreaseReqCount() { + if (--req_count_ <= 0) { + sync_cond_.notify_all(); + } + } + private: std::unordered_map channels_; @@ -88,6 +151,8 @@ class BRPCClient : public RPCClient { std::condition_variable sync_cond_; std::atomic req_count_{0}; + static constexpr int brpc_channel_num_per_server_ = 4; + // mutex for GetChannel thread safety std::mutex chan_mutex_; DISABLE_COPY_AND_ASSIGN(BRPCClient); diff --git a/paddle/fluid/operators/distributed/brpc_rdma_pool.cc b/paddle/fluid/operators/distributed/brpc_rdma_pool.cc new file mode 100644 index 0000000000000000000000000000000000000000..e1be5673dfbc51e6a7beb8e3b8b2f162872a1988 --- /dev/null +++ b/paddle/fluid/operators/distributed/brpc_rdma_pool.cc @@ -0,0 +1,84 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifdef PADDLE_WITH_BRPC_RDMA + +#include "paddle/fluid/operators/distributed/brpc_rdma_pool.h" +#include "brpc/channel.h" +#include "brpc/rdma/rdma_helper.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace operators { +namespace distributed { + +RdmaMemPool& RdmaMemPool::Instance() { + static RdmaMemPool* g_rdma_mem_pool = new RdmaMemPool(); + return *g_rdma_mem_pool; +} + +void* RdmaMemPool::Find(const std::string& varname, int64_t size) { + pthread_rwlock_rdlock(&access_); + auto it = pool_.find(varname); + if (it == pool_.end()) { + pthread_rwlock_unlock(&access_); + return nullptr; + } + + auto info = it->second; + if (info.data_size != size) { + pthread_rwlock_unlock(&access_); + PADDLE_ENFORCE(false, "var:%s size:%ld != %ld", varname, size, + info.data_size); + return nullptr; + } + + pthread_rwlock_unlock(&access_); + return info.data; +} + +void RdmaMemPool::Register(const std::string& varname, void* data, + int64_t data_size) { + void* old = Find(varname, data_size); + if (old != nullptr) { + if (data != old) { + PADDLE_ENFORCE(false, "var:%s data:%ld != %ld", varname, data, old); + } + VLOG(7) << "Find on rdma:" << varname << " data:" << data + << " data_size:" << data_size; + return; + } + + VarInfo info; + info.data = data; + info.data_size = data_size; + + pthread_rwlock_wrlock(&access_); + pool_[varname] = info; + pthread_rwlock_unlock(&access_); + + if (brpc::rdma::RegisterMemoryForRdma(data, data_size)) { + LOG(FATAL) << "register " << varname << " data:" << data + << " data_size:" << data_size << " error"; + } + + VLOG(4) << "register on rdma:" << varname << " data:" << data + << " data_size:" << data_size; +} + +} // namespace distributed +} // namespace operators +} // namespace paddle + +#endif diff --git a/paddle/fluid/operators/distributed/brpc_rdma_pool.h b/paddle/fluid/operators/distributed/brpc_rdma_pool.h new file mode 100644 index 0000000000000000000000000000000000000000..156a93ec5784715c0a68c1af2e31d640dfc60277 --- /dev/null +++ b/paddle/fluid/operators/distributed/brpc_rdma_pool.h @@ -0,0 +1,56 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once +#ifdef PADDLE_WITH_BRPC_RDMA + +#include // NOLINT +#include +#include + +namespace paddle { +namespace operators { +namespace distributed { + +/* + * This class is used to avoid duplicated registion of brpc::rdma. + */ +class RdmaMemPool { + public: + static RdmaMemPool& Instance(); + RdmaMemPool() : access_(PTHREAD_RWLOCK_INITIALIZER) {} + + virtual ~RdmaMemPool() { pthread_rwlock_destroy(&access_); } + + void Register(const std::string& varname, void* data, int64_t size); + void* Find(const std::string& varname, int64_t size); + + private: + struct VarInfo { + void* data; + int64_t data_size; + + VarInfo() : data(nullptr), data_size(0) {} + }; + + private: + std::unordered_map pool_; + pthread_rwlock_t access_; +}; + +} // namespace distributed +} // namespace operators +} // namespace paddle + +#endif diff --git a/paddle/fluid/operators/distributed/brpc_sendrecvop_utils.cc b/paddle/fluid/operators/distributed/brpc_sendrecvop_utils.cc new file mode 100644 index 0000000000000000000000000000000000000000..e4604db3a381616c7420f816f0b49a015c925bd4 --- /dev/null +++ b/paddle/fluid/operators/distributed/brpc_sendrecvop_utils.cc @@ -0,0 +1,207 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef PADDLE_WITH_CUDA +#include +#endif +#include +#include +#include // NOLINT + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/operators/distributed/brpc_rdma_pool.h" +#include "paddle/fluid/operators/distributed/brpc_sendrecvop_utils.h" +#include "paddle/fluid/operators/distributed/brpc_variable_response.h" +#include "paddle/fluid/operators/distributed/send_recv.pb.h" +#include "paddle/fluid/platform/profiler.h" + +namespace paddle { +namespace operators { +namespace distributed { + +class IOBufWriter { + public: + static void Append(const std::string& varname, butil::IOBuf* iobuf, int k, + const char* v, int64_t vlen) { + if (vlen >= std::numeric_limits::max() || vlen < 0) { + LOG(FATAL) << "AppendZeroCopy varname:" << varname << ", vlen:" << vlen; + } + + iobuf->append(reinterpret_cast(&k), 4); + iobuf->append(reinterpret_cast(&vlen), 8); + iobuf->append(v, vlen); + } + + static void AppendTCPZeroCopy(butil::IOBuf* iobuf, int k, const char* v, + int64_t vlen, bool in_cuda_pinned, + void (*destroy)(void*), void* user_data) { + VLOG(7) << "AppendTCPZeroCopy " + << " k:" << k + << " data:" << static_cast(const_cast(v)) + << " data_size:" << vlen << " in_cuda_pinned:" << in_cuda_pinned; + + iobuf->append(reinterpret_cast(&k), 4); + iobuf->append(reinterpret_cast(&vlen), 8); + + // FIXME(gongwb): use append_zerocopy + /* + if (in_cuda_pinned) { + iobuf->append_zerocopy(v, vlen, IOBufWriter::FreeMemory); + } else { + iobuf->append_zerocopy(v, vlen, nullptr); + } + */ + iobuf->append(v, vlen); + destroy(user_data); + } + +#ifdef PADDLE_WITH_BRPC_RDMA + static void AppendRdmaZeroCopy(const std::string varname, butil::IOBuf* iobuf, + int k, const char* v, int64_t vlen, + bool in_cuda_pinned, void (*destroy)(void*), + void* user_data) { + VLOG(7) << "AppendRdmaZeroCopy varname:" << varname << " k:" << k + << " data:" << static_cast(const_cast(v)) + << " data_size:" << vlen << " in_cuda_pinned:" << in_cuda_pinned; + + iobuf->append(reinterpret_cast(&k), 4); + iobuf->append(reinterpret_cast(&vlen), 8); + + RdmaMemPool::Instance().Register( + varname, static_cast(const_cast(v)), vlen); + + // FIXME(gongwb): use append_zerocopy + // iobuf->append_zerocopy(v, vlen, nullptr); + iobuf->append(v, vlen); + destroy(user_data); + return; + } +#endif + + static void AppendZeroCopy(const std::string varname, butil::IOBuf* iobuf, + int k, const char* v, int64_t vlen, + bool in_cuda_pinned, void (*destroy)(void*), + void* user_data) { + if (vlen >= std::numeric_limits::max() || vlen < 0) { + LOG(FATAL) << "AppendZeroCopy varname:" << varname << ", vlen:" << vlen; + } + +#ifdef PADDLE_WITH_BRPC_RDMA + IOBufWriter::AppendRdmaZeroCopy(varname, iobuf, k, v, vlen, in_cuda_pinned, + destroy, user_data); +#else + IOBufWriter::AppendTCPZeroCopy(iobuf, k, v, vlen, in_cuda_pinned, destroy, + user_data); +#endif + } +}; + +void SerializeToIOBuf(const std::string& name, framework::Variable* var, + const platform::DeviceContext& ctx, VarMsg* request, + butil::IOBuf* iobuf, const std::string& out_varname, + bool var_is_not_stable, int trainer_id, + const std::string& table_name) { + std::unique_ptr payload; + + request->set_varname(name); + request->set_trainer_id(trainer_id); + // Note: normally the profiler is enabled in 1 trainer, hence only + // 1 trainer returns true for ShouldSendProfileState(). It tells PS + // servers the trainer's profiling state so that PS can follow the + // trainer. + if (platform::ShouldSendProfileState()) { + if (platform::IsProfileEnabled()) { + request->set_profile(platform::kEnableProfiler); + } else { + request->set_profile(platform::kDisableProfiler); + } + } + if (!out_varname.empty()) { + request->set_out_varname(out_varname); + } + if (!table_name.empty()) { + request->set_table_name(table_name); + } + if (var->IsType()) { + request->set_type(::sendrecv::LOD_TENSOR); + payload.reset(new TensorPayload(GetTensorPayload(var, ctx, request))); + } else if (var->IsType()) { + request->set_type(::sendrecv::SELECTED_ROWS); + payload.reset(new TensorPayload(GetSelectedRowsPayload(var, ctx, request))); +#ifdef PADDLE_WITH_CUDA + } else if (var->IsType()) { + request->set_type(::sendrecv::NCCL_ID); + const ncclUniqueId& uid = var->Get(); + // TODO(gongwb): use append_zero to avoid data copy. + IOBufWriter::Append(name, iobuf, + sendrecv::VariableMessage::kSerializedFieldNumber, + uid.internal, NCCL_UNIQUE_ID_BYTES); + return; +#endif + } else { + PADDLE_THROW("Serialize does not support type: %s", + typeid(var->Type()).name()); + } + + PADDLE_ENFORCE_NOT_NULL(payload); + + // FIXME(gongwb): it seems that can use zero copy. + if (var_is_not_stable) { + IOBufWriter::Append( + name, iobuf, ::sendrecv::VariableMessage::kSerializedFieldNumber, + static_cast(payload->ptr()), payload->memory_size()); + } else { + if (platform::is_gpu_place(ctx.GetPlace())) { +#ifdef PADDLE_WITH_CUDA + IOBufWriter::AppendZeroCopy( + name, iobuf, ::sendrecv::VariableMessage::kSerializedFieldNumber, + static_cast(payload->ptr()), payload->memory_size(), + true, SerializeDestroyCallback, static_cast(payload.get())); + payload.release(); +#endif + } else { + IOBufWriter::AppendZeroCopy( + name, iobuf, ::sendrecv::VariableMessage::kSerializedFieldNumber, + static_cast(payload->ptr()), payload->memory_size(), + false, SerializeDestroyCallback, static_cast(payload.get())); + payload.release(); + } + } + + if (var->IsType()) { + auto* slr = var->GetMutable(); + PADDLE_ENFORCE(VectorElemName(slr->rows()) == typeid(int64_t).name()); + size_t rows_memory_size = slr->rows().size() * sizeof(int64_t); + + IOBufWriter::Append(name, iobuf, + ::sendrecv::VariableMessage::kRowsFieldNumber, + reinterpret_cast(slr->rows().data()), + static_cast(rows_memory_size)); + } +} + +void DeserializeFromIOBuf(const ::sendrecv::VariableMessage& meta, + const butil::IOBuf& iobuf, + const platform::DeviceContext& ctx, + const framework::Scope* scope, + framework::Variable** var, int* trainer_id) { + operators::distributed::BRPCVariableResponse resp(scope, &ctx); + PADDLE_ENFORCE(resp.Parse(iobuf, meta) == 0, "parse iobuf to tensor error!"); + *var = resp.GetVar(); + *trainer_id = resp.GetTrainerId(); +} + +} // namespace distributed +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/distributed/brpc_sendrecvop_utils.h b/paddle/fluid/operators/distributed/brpc_sendrecvop_utils.h new file mode 100644 index 0000000000000000000000000000000000000000..ffaf44222422882d6c9a3cb8efbd335e8072fd49 --- /dev/null +++ b/paddle/fluid/operators/distributed/brpc_sendrecvop_utils.h @@ -0,0 +1,49 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include + +#include "brpc/channel.h" +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/framework/var_type.h" +#include "paddle/fluid/operators/distributed/send_recv.pb.h" +#include "paddle/fluid/operators/distributed/sendrecvop_utils.h" + +namespace paddle { +namespace operators { +namespace distributed { + +void SerializeToIOBuf(const std::string& name, framework::Variable* var, + const platform::DeviceContext& ctx, VarMsg* request, + butil::IOBuf* iobuf, const std::string& out_varname, + bool var_is_not_stable, const int trainer_id = 0, + const std::string& table_name = std::string()); + +void DeserializeFromIOBuf(const VarMsg& meta, const butil::IOBuf& iobuf, + const platform::DeviceContext& ctx, + const framework::Scope* scope, + framework::Variable** var, int* trainer_id); + +} // namespace distributed +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/distributed/brpc_serde_test.cc b/paddle/fluid/operators/distributed/brpc_serde_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..2a2dc72150a3295205724cb04b4a246efe32d5e8 --- /dev/null +++ b/paddle/fluid/operators/distributed/brpc_serde_test.cc @@ -0,0 +1,175 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include // NOLINT + +#include "brpc/channel.h" +#include "google/protobuf/text_format.h" +#include "gtest/gtest.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/framework/variable.h" +#include "paddle/fluid/operators/distributed/brpc_sendrecvop_utils.h" +#include "paddle/fluid/operators/distributed/brpc_variable_response.h" +#include "paddle/fluid/operators/distributed/sendrecvop_utils.h" +#include "paddle/fluid/operators/distributed/variable_response.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/place.h" +#include "paddle/fluid/string/printf.h" + +namespace framework = paddle::framework; +namespace platform = paddle::platform; +namespace operators = paddle::operators; +namespace math = paddle::operators::math; +namespace memory = paddle::memory; + +void RunSerdeTestSelectedRows(platform::Place place) { + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto& ctx = *pool.Get(place); + + butil::IOBuf iobuf; + sendrecv::VariableMessage msg; + int tensor_numel = 564 * 128; + + // serialize var to IOBuf + { + framework::Variable var; + auto* slr = var.GetMutable(); + slr->set_height(1000); + auto* tensor = slr->mutable_value(); + auto* rows = slr->mutable_rows(); + tensor->Resize(framework::make_ddim({564, 128})); + tensor->mutable_data(place); + math::set_constant(ctx, tensor, 32.7); + for (int i = 0; i < 564; ++i) rows->push_back(i); + + operators::distributed::SerializeToIOBuf("myvar", &var, ctx, &msg, &iobuf, + "", false); + } + + // desrialize + { + framework::Scope scope; + scope.Var("myvar"); + operators::distributed::BRPCVariableResponse resp(&scope, &ctx); + EXPECT_EQ(resp.Parse(iobuf, msg), 0); + + framework::Variable* var2 = resp.GetVar(); + + auto* slr2 = var2->GetMutable(); + auto* tensor2 = slr2->mutable_value(); + auto* rows2 = slr2->mutable_rows(); + float* tensor_data2 = nullptr; + framework::Tensor tmp_tensor; + + if (platform::is_gpu_place(ctx.GetPlace())) { + platform::CPUPlace cpu; + framework::TensorCopy(*tensor2, cpu, &tmp_tensor); + tensor_data2 = tmp_tensor.data(); + } else { + tensor_data2 = const_cast(tensor2->data()); + } + const int64_t* rows_data2 = rows2->data(); + + for (int i = 0; i < tensor_numel; ++i) { + EXPECT_FLOAT_EQ(tensor_data2[i], 32.7); + } + for (size_t i = 0; i < rows2->size(); ++i) { + EXPECT_EQ(rows_data2[i], static_cast(i)); + } + EXPECT_EQ(slr2->height(), 1000); + } +} + +void RunTestLodTensor(platform::Place place) { + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto& ctx = *pool.Get(place); + + // serialize var to ByteBuffer + butil::IOBuf iobuf; + sendrecv::VariableMessage msg; + int tensor_numel = 512 * 8 * 4 * 2; + { + framework::Variable var; + auto* tensor = var.GetMutable(); + tensor->Resize(framework::make_ddim({512, 8, 4, 2})); + framework::LoD lod; + lod.push_back(framework::Vector({1, 3, 8})); + tensor->set_lod(lod); + tensor->mutable_data(place); + math::set_constant(ctx, tensor, 31.9); + + operators::distributed::SerializeToIOBuf("myvar", &var, ctx, &msg, &iobuf, + "", false); + } + + // check sendrecv::VariableMessage meta data + { + EXPECT_EQ(msg.varname(), "myvar"); + EXPECT_EQ(msg.type(), 0); + EXPECT_EQ(msg.dims()[0], 512); + EXPECT_EQ(msg.dims()[1], 8); + EXPECT_EQ(msg.dims()[2], 4); + EXPECT_EQ(msg.dims()[3], 2); + EXPECT_EQ(msg.lod_level(), 1); + EXPECT_EQ(msg.lod(0).lod_data(0), 1); + EXPECT_EQ(msg.lod(0).lod_data(1), 3); + EXPECT_EQ(msg.lod(0).lod_data(2), 8); + } + + // deserialize + { + framework::Scope scope; + scope.Var("myvar"); + operators::distributed::BRPCVariableResponse resp(&scope, &ctx); + EXPECT_EQ(resp.Parse(iobuf, msg), 0); + + framework::Variable* var2 = resp.GetVar(); + + auto tensor2 = var2->Get(); + float* tensor_data2 = nullptr; + framework::Tensor tmp_tensor; + + if (platform::is_gpu_place(ctx.GetPlace())) { + platform::CPUPlace cpu; + framework::TensorCopy(tensor2, cpu, &tmp_tensor); + tensor_data2 = tmp_tensor.data(); + } else { + tensor_data2 = const_cast(tensor2.data()); + } + + for (int i = 0; i < tensor_numel; ++i) + EXPECT_FLOAT_EQ(tensor_data2[i], 31.9); + } +} + +TEST(LodTensor, Run) { + platform::CPUPlace place; + RunTestLodTensor(place); +#ifdef PADDLE_WITH_CUDA + platform::CUDAPlace gpu(0); + RunTestLodTensor(gpu); +#endif +} + +TEST(SelectedRows, Run) { + platform::CPUPlace place; + RunSerdeTestSelectedRows(place); +#ifdef PADDLE_WITH_CUDA + platform::CUDAPlace gpu; + RunSerdeTestSelectedRows(gpu); +#endif +} diff --git a/paddle/fluid/operators/distributed/brpc_server.cc b/paddle/fluid/operators/distributed/brpc_server.cc index 862167f02084cfe81db1c0936bbfb0415fa85721..78d41aeac50a31d13fb3304909e7aed07f6a852a 100644 --- a/paddle/fluid/operators/distributed/brpc_server.cc +++ b/paddle/fluid/operators/distributed/brpc_server.cc @@ -13,84 +13,287 @@ // limitations under the License. #include "paddle/fluid/operators/distributed/brpc_server.h" +#include "paddle/fluid/framework/threadpool.h" +#include "paddle/fluid/operators/distributed/brpc_sendrecvop_utils.h" +#include "paddle/fluid/operators/distributed/brpc_variable_response.h" #include "paddle/fluid/operators/distributed/request_handler.h" namespace sendrecv { -typedef std::unordered_map +namespace distributed = paddle::operators::distributed; + +typedef std::unordered_map HandlerMap; class BRPCServiceImpl : public SendRecvService { public: - explicit BRPCServiceImpl(const HandlerMap& rpc_call_map) - : request_send_h_(nullptr), - request_get_h_(nullptr), - request_prefetch_h_(nullptr) { - auto it = rpc_call_map.find(paddle::operators::distributed::kRequestSend); + explicit BRPCServiceImpl(const HandlerMap& rpc_call_map, + distributed::RPCServer* rpc_server) + : rpc_server_(rpc_server) { + VLOG(3) << "BRPCServiceImpl size: " << rpc_call_map.size(); + auto it = rpc_call_map.find(distributed::kRequestSend); if (it != rpc_call_map.end()) { request_send_h_ = it->second; + send_threads_.reset(new paddle::framework::ThreadPool( + rpc_server_->GetThreadNum(distributed::kRequestSend))); } - it = rpc_call_map.find(paddle::operators::distributed::kRequestSend); + it = rpc_call_map.find(distributed::kRequestGet); if (it != rpc_call_map.end()) { request_get_h_ = it->second; + get_threads_.reset(new paddle::framework::ThreadPool( + rpc_server_->GetThreadNum(distributed::kRequestGet))); } - it = rpc_call_map.find(paddle::operators::distributed::kRequestPrefetch); + it = rpc_call_map.find(distributed::kRequestPrefetch); if (it != rpc_call_map.end()) { request_prefetch_h_ = it->second; + prefetch_threads_.reset(new paddle::framework::ThreadPool( + rpc_server_->GetThreadNum(distributed::kRequestPrefetch))); + } + + it = rpc_call_map.find(distributed::kRequestCheckpoint); + if (it != rpc_call_map.end()) { + request_checkpoint_h_ = it->second; + checkpoint_notify_threads_.reset(new paddle::framework::ThreadPool( + rpc_server_->GetThreadNum(distributed::kRequestPrefetch))); + } + + it = rpc_call_map.find(distributed::kRequestGetMonomerVariable); + if (it != rpc_call_map.end()) { + request_get_monomer_handler_h_ = it->second; + } + + it = rpc_call_map.find(distributed::kRequestGetMonomerBarrier); + if (it != rpc_call_map.end()) { + request_get_monomer_barrier_handler_h_ = it->second; } } virtual ~BRPCServiceImpl() {} - void SendVariable(google::protobuf::RpcController* cntl_butil, const VariableMessage* request, VoidMessage* response, google::protobuf::Closure* done) override { + send_threads_->Run( + [=] { _SendVariable(cntl_butil, request, response, done); }); + } + + void _SendVariable(google::protobuf::RpcController* cntl_butil, + const VariableMessage* request, VoidMessage* response, + google::protobuf::Closure* done) { PADDLE_ENFORCE(request_send_h_ != nullptr, "RequestSend handler should be registed first!"); brpc::ClosureGuard done_guard(done); - - paddle::framework::Scope* local_scope = request_send_h_->scope(); - paddle::framework::Variable* outvar = nullptr; - paddle::framework::Variable* invar = nullptr; + brpc::Controller* cntl = static_cast(cntl_butil); std::string varname = request->varname(); + VLOG(3) << "RequestSend var_name:" << varname + << ", trainer_id:" << request->trainer_id() + << ", from:" << cntl->remote_side(); - if (!request_send_h_->sync_mode()) { - local_scope = &request_send_h_->scope()->NewScope(); - invar = local_scope->Var(varname); - } else { - invar = local_scope->FindVar(varname); - } + distributed::BRPCVariableResponse resp(request_send_h_->scope(), + request_send_h_->dev_ctx(), + !request_send_h_->sync_mode()); + PADDLE_ENFORCE(resp.Parse(cntl->request_attachment(), *request) == 0, + "parse iobuf to tensor error!"); - request_send_h_->Handle(varname, local_scope, invar, &outvar); + auto scope = resp.GetMutableLocalScope(); + auto invar = resp.GetVar(); + int trainer_id = request->trainer_id(); + paddle::framework::Variable* outvar = nullptr; - if (!request_send_h_->sync_mode()) { - request_send_h_->scope()->DeleteScope(local_scope); - } + request_send_h_->Handle(varname, scope, invar, &outvar, trainer_id); } void GetVariable(google::protobuf::RpcController* cntl_butil, const VariableMessage* request, VariableMessage* response, google::protobuf::Closure* done) override { + get_threads_->Run( + [=] { _GetVariable(cntl_butil, request, response, done); }); + } + + void _GetVariable(google::protobuf::RpcController* cntl_butil, + const VariableMessage* request, VariableMessage* response, + google::protobuf::Closure* done) { PADDLE_ENFORCE(request_get_h_ != nullptr, "RequestGet handler should be registed first!"); - } + brpc::ClosureGuard done_guard(done); + brpc::Controller* cntl = static_cast(cntl_butil); + + std::string varname = request->varname(); + VLOG(3) << "RequestGet varname:" << varname + << ", trainer_id:" << request->trainer_id() + << ", from:" << cntl->remote_side(); + + auto scope = request_get_h_->scope(); + auto invar = scope->FindVar(varname); + int trainer_id = request->trainer_id(); + paddle::framework::Variable* outvar = nullptr; + + request_get_h_->Handle(varname, scope, invar, &outvar, trainer_id); + + if (outvar) { + distributed::SerializeToIOBuf(varname, outvar, *request_get_h_->dev_ctx(), + response, &cntl->response_attachment(), "", + false); + } + } void PrefetchVariable(google::protobuf::RpcController* cntl_butil, const VariableMessage* request, VariableMessage* response, google::protobuf::Closure* done) override { + prefetch_threads_->Run( + [=] { _PrefetchVariable(cntl_butil, request, response, done); }); + } + + void _PrefetchVariable(google::protobuf::RpcController* cntl_butil, + const VariableMessage* request, + VariableMessage* response, + google::protobuf::Closure* done) { PADDLE_ENFORCE(request_prefetch_h_ != nullptr, "kRequestPrefetch handler should be registed first!"); + + brpc::ClosureGuard done_guard(done); + brpc::Controller* cntl = static_cast(cntl_butil); + + // prefetch process... + std::string in_var_name = request->varname(); + std::string out_var_name = request->out_varname(); + VLOG(3) << "RequestPrefetch, in_var_name: " << in_var_name + << ", out_var_name: " << out_var_name + << ", trainer_id:" << request->trainer_id() + << ", from:" << cntl->remote_side(); + + distributed::BRPCVariableResponse resp( + request_prefetch_h_->scope(), request_prefetch_h_->dev_ctx(), true); + + PADDLE_ENFORCE(resp.Parse(cntl->request_attachment(), *request) == 0, + "parse iobuf to tensor error!"); + + auto scope = resp.GetMutableLocalScope(); + auto invar = scope->FindVar(in_var_name); + std::string table_name = request->table_name(); + int trainer_id = request->trainer_id(); + paddle::framework::Variable* outvar = scope->Var(out_var_name); + + request_prefetch_h_->Handle(in_var_name, scope, invar, &outvar, trainer_id, + out_var_name, table_name); + + distributed::SerializeToIOBuf(out_var_name, outvar, + *request_prefetch_h_->dev_ctx(), response, + &cntl->response_attachment(), "", true); + } + + void CheckpointNotify(google::protobuf::RpcController* cntl_butil, + const VariableMessage* request, VoidMessage* response, + google::protobuf::Closure* done) override { + checkpoint_notify_threads_->Run( + [=] { _CheckpointNotify(cntl_butil, request, response, done); }); + } + + void _CheckpointNotify(google::protobuf::RpcController* cntl_butil, + const VariableMessage* request, VoidMessage* response, + google::protobuf::Closure* done) { + PADDLE_ENFORCE( + request_checkpoint_h_ != nullptr, + "kRequestCheckpointNotify handler should be registed first!"); + + brpc::ClosureGuard done_guard(done); + brpc::Controller* cntl = static_cast(cntl_butil); + + distributed::BRPCVariableResponse resp(request_checkpoint_h_->scope(), + request_checkpoint_h_->dev_ctx()); + + auto scope = resp.GetMutableLocalScope(); + + std::string checkpoint_notify = request->varname(); + std::string checkpoint_dir = request->out_varname(); + int trainer_id = request->trainer_id(); + + VLOG(4) << "RequestCheckpointNotify notify: " << checkpoint_notify + << ", dir: " << checkpoint_dir + << ", trainer_id:" << request->trainer_id() + << ", from:" << cntl->remote_side(); + + request_checkpoint_h_->Handle(checkpoint_notify, scope, nullptr, nullptr, + trainer_id, checkpoint_dir); + } + + void GetMonomerVariable(google::protobuf::RpcController* cntl_butil, + const VariableMessage* request, + VariableMessage* response, + google::protobuf::Closure* done) override { + PADDLE_ENFORCE( + request_get_monomer_handler_h_ != nullptr, + "kRequestGetMonomerVariable handler should be registed first!"); + + brpc::ClosureGuard done_guard(done); + brpc::Controller* cntl = static_cast(cntl_butil); + + // proc request. + std::string varname = request->varname(); + VLOG(3) << "GetMonomerVariable " << varname + << ", trainer_id:" << request->trainer_id() + << ", from:" << cntl->remote_side(); + + rpc_server_->WaitVarCond(varname); + distributed::MonomerHandle h = rpc_server_->GetMonomer(varname); + + auto scope = h.scope_; + auto invar = scope->FindVar(varname); + paddle::framework::Variable* outvar = nullptr; + + request_get_monomer_handler_h_->Handle(varname, scope, invar, &outvar, + request->trainer_id()); + + if (outvar) { + distributed::SerializeToIOBuf(varname, outvar, *h.dev_ctx_, response, + &cntl->response_attachment(), "", false); + } + } + + void GetMonomerBarrier(google::protobuf::RpcController* cntl_butil, + const VariableMessage* request, VoidMessage* response, + google::protobuf::Closure* done) override { + PADDLE_ENFORCE( + request_get_monomer_barrier_handler_h_ != nullptr, + "RequestGetMonomerBarrier handler should be registed first!"); + + brpc::ClosureGuard done_guard(done); + brpc::Controller* cntl = static_cast(cntl_butil); + + std::string varname = request->varname(); + VLOG(3) << "RequestGetMonomerBarrier var_name:" << varname + << ", trainer_id:" << request->trainer_id() + << ", from:" << cntl->remote_side(); + + rpc_server_->WaitVarCond(varname); + distributed::MonomerHandle h = rpc_server_->GetMonomer(varname); + + paddle::framework::Scope* scope = nullptr; + paddle::framework::Variable* invar = nullptr; + paddle::framework::Variable* outvar = nullptr; + + request_get_monomer_barrier_handler_h_->Handle( + varname, scope, invar, &outvar, request->trainer_id()); } private: - paddle::operators::distributed::RequestHandler* request_send_h_; - paddle::operators::distributed::RequestHandler* request_get_h_; - paddle::operators::distributed::RequestHandler* request_prefetch_h_; + distributed::RequestHandler* request_send_h_{nullptr}; + distributed::RequestHandler* request_get_h_{nullptr}; + distributed::RequestHandler* request_prefetch_h_{nullptr}; + distributed::RequestHandler* request_checkpoint_h_{nullptr}; + distributed::RequestHandler* request_get_monomer_handler_h_{nullptr}; + distributed::RequestHandler* request_get_monomer_barrier_handler_h_{nullptr}; + + distributed::RPCServer* rpc_server_{nullptr}; + + // FIXME(gongwb): brpc should support process one rpce use one threadpool. + std::unique_ptr send_threads_; + std::unique_ptr get_threads_; + std::unique_ptr prefetch_threads_; + std::unique_ptr checkpoint_notify_threads_; }; } // namespace sendrecv @@ -100,7 +303,7 @@ namespace distributed { void AsyncBRPCServer::StartServer() { // Instance of your service. - sendrecv::BRPCServiceImpl service_impl(rpc_call_map_); + sendrecv::BRPCServiceImpl service_impl(rpc_call_map_, this); // Add the service into server. Notice the second parameter, because the // service is put on stack, we don't want server to delete it, otherwise @@ -111,6 +314,9 @@ void AsyncBRPCServer::StartServer() { } brpc::ServerOptions options; +#ifdef PADDLE_WITH_BRPC_RDMA + options.use_rdma = true; +#endif options.idle_timeout_sec = idle_timeout_s_; options.max_concurrency = max_concurrency_; if (server_.Start(bind_address_.c_str(), &options) != 0) { diff --git a/paddle/fluid/operators/distributed/brpc_variable_response.cc b/paddle/fluid/operators/distributed/brpc_variable_response.cc new file mode 100644 index 0000000000000000000000000000000000000000..75306d72334abc98f1527d4557509bf2a1f8a9bb --- /dev/null +++ b/paddle/fluid/operators/distributed/brpc_variable_response.cc @@ -0,0 +1,73 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#include "paddle/fluid/operators/distributed/brpc_variable_response.h" +#include "paddle/fluid/operators/distributed/send_recv.pb.h" + +namespace paddle { +namespace operators { +namespace distributed { + +namespace pb = ::google::protobuf; +using vr = ::sendrecv::VariableMessage; + +int BRPCVariableResponse::Parse(Source* source) { + pb::io::ZeroCopyInputStream* input_stream = source->contents(); + pb::io::CodedInputStream input(input_stream); + input.SetTotalBytesLimit(INT_MAX, INT_MAX); + + while (1) { + unsigned int tag = 0; + if (!input.ReadLittleEndian32(&tag)) { + break; + } + + uint64_t num_bytes = 0; + if (!input.ReadLittleEndian64(&num_bytes)) { + break; + } + + int field = static_cast(tag); + int ret = field == 0 ? -1 : field; + switch (field) { + case vr::kSerializedFieldNumber: { + if (!ProcSerializedField(field, &input, num_bytes)) { + return ret; + } + break; + } + case vr::kRowsFieldNumber: { + PADDLE_ENFORCE((meta_.type() == sendrecv::SELECTED_ROWS || + meta_.type() == sendrecv::LOD_TENSOR) && + meta_.varname() != "", + "meta info should be got first!"); + + if (!CopySelectRowsData(&input, *dev_ctx_, num_bytes)) { + return ret; + } + break; + } + default: { + PADDLE_ENFORCE(false, "not surpported %u fieldnumber", field); + return ret; + } + } + } + + return 0; +} +} // namespace distributed +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/distributed/brpc_variable_response.h b/paddle/fluid/operators/distributed/brpc_variable_response.h new file mode 100644 index 0000000000000000000000000000000000000000..b0b91a42a01c79bc76aa19c8745ae8f7d3e9a297 --- /dev/null +++ b/paddle/fluid/operators/distributed/brpc_variable_response.h @@ -0,0 +1,67 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include + +#include "brpc/channel.h" +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/selected_rows.h" +#include "paddle/fluid/framework/var_type.h" + +#include "paddle/fluid/operators/distributed/send_recv.pb.h" + +#include "google/protobuf/io/coded_stream.h" +#include "google/protobuf/io/zero_copy_stream.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/operators/distributed/variable_response.h" + +namespace paddle { +namespace operators { +namespace distributed { + +class BRPCSourceWrapper : public Source { + public: + explicit BRPCSourceWrapper(const butil::IOBuf& iobuf) : source_(iobuf) {} + ::google::protobuf::io::ZeroCopyInputStream* contents() override { + return &source_; + } + + private: + butil::IOBufAsZeroCopyInputStream source_; +}; + +class BRPCVariableResponse : public VariableResponse { + public: + BRPCVariableResponse(const framework::Scope* scope, + const platform::DeviceContext* dev_ctx, + bool create_scope = false) + : VariableResponse(scope, dev_ctx, create_scope) {} + + virtual ~BRPCVariableResponse() {} + + // parse attachment from iobuf + int Parse(Source* source) override; + int Parse(const butil::IOBuf& iobuf, const sendrecv::VariableMessage& meta) { + BRPCSourceWrapper wrapper(iobuf); + return VariableResponse::Parse(&wrapper, meta); + } +}; + +}; // namespace distributed +}; // namespace operators +}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/grpc_client.cc b/paddle/fluid/operators/distributed/grpc_client.cc index f14dfcdb238a9580affde96e4d5a0093743eb6c8..78956c9ea4942098002dd30e6f2f471ae49ab8d1 100644 --- a/paddle/fluid/operators/distributed/grpc_client.cc +++ b/paddle/fluid/operators/distributed/grpc_client.cc @@ -293,8 +293,7 @@ VarHandlePtr GRPCClient::AsyncGetMonomerBarrier(const std::string& ep, const auto ch = GetChannel(ep); BatchBarrierProcessor* s = new BatchBarrierProcessor(ch); const std::string method = "SendMonomerFetchBarrierRPC"; - VarHandlePtr h( - new VarHandle(ep, method, FETCH_BARRIER_MESSAGE, nullptr, nullptr)); + VarHandlePtr h(new VarHandle(ep, method, var_name, nullptr, nullptr)); s->Prepare(h, time_out); VLOG(30) << s->GetVarHandlePtr()->String() << " begin"; diff --git a/paddle/fluid/operators/distributed/grpc_serde.cc b/paddle/fluid/operators/distributed/grpc_serde.cc index 31fac2133cf159719474207407c52bb96e80e131..a9dea9cfd2eeaa7e7ed8f052d2f51f5893c1e2e3 100644 --- a/paddle/fluid/operators/distributed/grpc_serde.cc +++ b/paddle/fluid/operators/distributed/grpc_serde.cc @@ -15,6 +15,7 @@ limitations under the License. */ #ifdef PADDLE_WITH_CUDA #include #endif +#include #include // NOLINT #include "google/protobuf/io/coded_stream.h" @@ -32,13 +33,6 @@ namespace paddle { namespace operators { namespace distributed { -static void SerializeDestroyCallback(void* payload) { - if (payload != nullptr) { - auto* shared_payload = reinterpret_cast(payload); - delete shared_payload; - } -} - void SerializeToByteBuffer(const std::string& name, framework::Variable* var, const platform::DeviceContext& ctx, ::grpc::ByteBuffer* msg, const std::string& out_name, @@ -109,6 +103,10 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var, e.WriteVarlengthBeginning(VarMsg::kSerializedFieldNumber, payload->memory_size()); + if (payload->memory_size() >= std::numeric_limits::max()) { + LOG(FATAL) << "AppendZeroCopy varname:" << name + << ", vlen:" << payload->memory_size(); + } // steal reference of tensor data ::grpc::Slice slices[4]; // metadata, tensor, rows meta, rows int num_slices = 2; // only SelectedRows have rows buffer @@ -122,8 +120,10 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var, if (var->IsType()) { auto* slr = var->GetMutable(); ProtoEncodeHelper e2(static_cast(buf), 128); - size_t rows_memory_size = - slr->rows().size() * framework::SizeOfType(typeid(int64_t)); + + PADDLE_ENFORCE(VectorElemName(slr->rows()) == typeid(int64_t).name()); + size_t rows_memory_size = slr->rows().size() * sizeof(int64_t); + e2.WriteVarlengthBeginning(VarMsg::kRowsFieldNumber, rows_memory_size); slices[2] = ::grpc::Slice(e2.size()); memcpy(const_cast(slices[2].begin()), e2.data(), e2.size()); diff --git a/paddle/fluid/operators/distributed/grpc_server.cc b/paddle/fluid/operators/distributed/grpc_server.cc index c3974138f4d4665c46bdfccaef09c0bd84b9d028..cda102e78d2de2876d54418574b7e07211fc92b4 100644 --- a/paddle/fluid/operators/distributed/grpc_server.cc +++ b/paddle/fluid/operators/distributed/grpc_server.cc @@ -488,7 +488,7 @@ void AsyncGRPCServer::HandleRequest( while (true) { VLOG(4) << "HandleRequest " << rpc_name << " wait next"; if (!cq->Next(&tag, &ok)) { - VLOG(3) << "CompletionQueue " << rpc_name << " shutdown!"; + LOG(WARNING) << "CompletionQueue " << rpc_name << " shutdown!"; break; } @@ -511,9 +511,8 @@ void AsyncGRPCServer::HandleRequest( // https://groups.google.com/forum/#!topic/grpc-io/xftlRy-IQwM // https://groups.google.com/forum/#!topic/grpc-io/ywATt88Ef_I if (!ok) { - LOG(WARNING) << "completion queue:" << rpc_name - << " recv no regular event" - << " context:" << base->Status2String(rpc_name); + VLOG(4) << "completion queue:" << rpc_name << " recv no regular event" + << " context:" << base->Status2String(rpc_name); TryToRegisterNewOne(rpc_name, req_id); delete base; continue; diff --git a/paddle/fluid/operators/distributed/rpc_server.h b/paddle/fluid/operators/distributed/rpc_server.h index 45d1d3479ce731894c26bbff40f456bbfdc13d44..8c7b7f1d7eeeced24d2ade2bcff4261b24587624 100644 --- a/paddle/fluid/operators/distributed/rpc_server.h +++ b/paddle/fluid/operators/distributed/rpc_server.h @@ -75,6 +75,10 @@ class RPCServer { void RegisterRPC(const std::string& rpc_name, RequestHandler* handler, int thread_num = 5); + int GetThreadNum(const std::string& rpc_name) { + return rpc_thread_num_[rpc_name]; + } + // Wait util all the clients have reached the barrier for one // rpc method. This function should be called in the // RequestHandler if you want to run the server/client in a diff --git a/paddle/fluid/operators/distributed/sendrecvop_utils.cc b/paddle/fluid/operators/distributed/sendrecvop_utils.cc index 6ba883ba01f73bd941e7b1b58afb777e2588764b..25e2f77fb74f273c1cc5263046202b3fbf1084a3 100644 --- a/paddle/fluid/operators/distributed/sendrecvop_utils.cc +++ b/paddle/fluid/operators/distributed/sendrecvop_utils.cc @@ -18,6 +18,7 @@ limitations under the License. */ #include // NOLINT #include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/operators/distributed/brpc_rdma_pool.h" #include "paddle/fluid/operators/distributed/sendrecvop_utils.h" #include "paddle/fluid/operators/distributed/variable_response.h" #include "paddle/fluid/platform/port.h" @@ -45,7 +46,6 @@ static TensorPayload GetCommunicationAllocationFromTensor( memory::Copy(cuda_pinned, result->ptr(), boost::get(tensor.place()), tensor.data(), copy_size, gpu_dev_ctx.stream()); - ctx.Wait(); return TensorPayload(result); #else @@ -61,8 +61,7 @@ TensorPayload GetTensorPayload(framework::Variable* var, auto tensor = var->Get(); // FIXME(wuyi): data types in send_recv.proto is copied from // framework.proto - request->set_data_type( - static_cast(framework::ToDataType(tensor.type()))); + request->set_data_type(static_cast(tensor.type())); for (auto& dim : framework::vectorize(tensor.dims())) { request->add_dims(dim); } @@ -83,8 +82,7 @@ TensorPayload GetSelectedRowsPayload(framework::Variable* var, const platform::DeviceContext& ctx, VarMsg* request) { auto* slr = var->GetMutable(); - request->set_data_type( - static_cast(framework::ToDataType(slr->value().type()))); + request->set_data_type(static_cast(slr->value().type())); request->set_lod_level(0); request->set_slr_height(slr->height()); diff --git a/paddle/fluid/operators/distributed/sendrecvop_utils.h b/paddle/fluid/operators/distributed/sendrecvop_utils.h index 523e56fe3e4eaf9d4faa8f0bf91d0b6075e0b0e5..6a87178be5daa02444c41a26f6e6c067713dd96f 100644 --- a/paddle/fluid/operators/distributed/sendrecvop_utils.h +++ b/paddle/fluid/operators/distributed/sendrecvop_utils.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include #include +#include #include #include "paddle/fluid/framework/data_type.h" @@ -23,9 +24,8 @@ limitations under the License. */ #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/framework/var_type.h" -#include "paddle/fluid/platform/port.h" - #include "paddle/fluid/operators/distributed/send_recv.pb.h" +#include "paddle/fluid/platform/port.h" namespace paddle { namespace operators { @@ -50,6 +50,13 @@ class TensorPayload final { size_t memory_size_; }; +inline void SerializeDestroyCallback(void* payload) { + if (payload != nullptr) { + auto* shared_payload = reinterpret_cast(payload); + delete shared_payload; + } +} + TensorPayload GetTensorPayload(framework::Variable* var, const platform::DeviceContext& ctx, VarMsg* request); @@ -58,23 +65,29 @@ TensorPayload GetSelectedRowsPayload(framework::Variable* var, const platform::DeviceContext& ctx, VarMsg* request); -inline std::type_index ToTypeIndex(sendrecv::VariableMessage::Type type) { +inline framework::proto::VarType::Type ToVarType( + sendrecv::VariableMessage::Type type) { switch (type) { case sendrecv::VariableMessage::FP32: - return typeid(float); // NOLINT + return framework::proto::VarType::FP32; // NOLINT case sendrecv::VariableMessage::FP64: - return typeid(double); // NOLINT + return framework::proto::VarType::FP64; // NOLINT case sendrecv::VariableMessage::INT32: - return typeid(int); // NOLINT + return framework::proto::VarType::INT32; // NOLINT case sendrecv::VariableMessage::INT64: - return typeid(int64_t); // NOLINT + return framework::proto::VarType::INT64; // NOLINT case sendrecv::VariableMessage::BOOL: - return typeid(bool); // NOLINT + return framework::proto::VarType::BOOL; // NOLINT default: PADDLE_THROW("Not support type %d", type); } } +template