未验证 提交 98069d99 编写于 作者: L lujun 提交者: GitHub

Merge pull request #9 from PaddlePaddle/develop

merge to local
...@@ -44,6 +44,7 @@ ...@@ -44,6 +44,7 @@
| qingqing01 | Qing-Qing Dang | | qingqing01 | Qing-Qing Dang |
| reyoung | Yang Yu | | reyoung | Yang Yu |
| Sand3r- | Michal Gallus | | Sand3r- | Michal Gallus |
| sfraczek | Sylwester Fraczek |
| Superjom | Chun-Wei Yan | | Superjom | Chun-Wei Yan |
| tensor-tang | Jian Tang | | tensor-tang | Jian Tang |
| tianbingsz | Tian-Bing Xu | | tianbingsz | Tian-Bing Xu |
...@@ -54,6 +55,7 @@ ...@@ -54,6 +55,7 @@
| wangyang59 | Yang Wang | | wangyang59 | Yang Wang |
| wangzhen-nlp | Zhen Wang | | wangzhen-nlp | Zhen Wang |
| wen-bo-yang | Wen-Bo Yang | | wen-bo-yang | Wen-Bo Yang |
| wojtuss | Wojciech Uss |
| wwhu | Wei-Wei Hu | | wwhu | Wei-Wei Hu |
| xinghai-sun | Xing-Hai Sun | | xinghai-sun | Xing-Hai Sun |
| Xreki | Yi-Qun Liu | | Xreki | Yi-Qun Liu |
......
...@@ -54,23 +54,12 @@ option(WITH_NGRAPH "Compile PaddlePaddle with nGraph support." OFF) ...@@ -54,23 +54,12 @@ option(WITH_NGRAPH "Compile PaddlePaddle with nGraph support." OFF)
option(WITH_DSO "Compile PaddlePaddle with dynamic linked CUDA" ON) option(WITH_DSO "Compile PaddlePaddle with dynamic linked CUDA" ON)
option(WITH_TESTING "Compile PaddlePaddle with unit testing" OFF) option(WITH_TESTING "Compile PaddlePaddle with unit testing" OFF)
option(WITH_PYTHON "Compile PaddlePaddle with python interpreter" ON) option(WITH_PYTHON "Compile PaddlePaddle with python interpreter" ON)
option(WITH_DOUBLE "Compile PaddlePaddle with double precision" OFF)
option(WITH_RDMA "Compile PaddlePaddle with RDMA support" OFF)
option(WITH_TIMER "Compile PaddlePaddle with stats timer" OFF)
option(WITH_PROFILER "Compile PaddlePaddle with GPU profiler and gperftools" OFF) option(WITH_PROFILER "Compile PaddlePaddle with GPU profiler and gperftools" OFF)
option(WITH_JEMALLOC "Compile PaddlePaddle with jemalloc" OFF) option(WITH_JEMALLOC "Compile PaddlePaddle with jemalloc" OFF)
option(WITH_DOC "Compile PaddlePaddle with documentation" OFF)
option(WITH_COVERAGE "Compile PaddlePaddle with code coverage" OFF) option(WITH_COVERAGE "Compile PaddlePaddle with code coverage" OFF)
option(COVERALLS_UPLOAD "Package code coverage data to coveralls" OFF) option(COVERALLS_UPLOAD "Package code coverage data to coveralls" OFF)
option(ON_TRAVIS "Exclude special unit test on Travis CI" OFF)
option(WITH_FLUID_ONLY "Compile PaddlePaddle fluid only" OFF)
option(WITH_GOLANG "Compile PaddlePaddle with GOLANG" OFF)
option(GLIDE_INSTALL "Download and install go dependencies " ON)
option(WITH_DISTRIBUTE "Compile with distributed support" OFF) option(WITH_DISTRIBUTE "Compile with distributed support" OFF)
option(WITH_PSLIB "Compile with pslib support" OFF) option(WITH_PSLIB "Compile with pslib support" OFF)
option(USE_EIGEN_FOR_BLAS "Use matrix multiplication in Eigen" OFF)
option(EIGEN_USE_THREADS "Compile with multi-threaded Eigen" OFF)
option(WITH_ARM_FP16 "Use half precision support on armv8.2-a cpu" OFF)
option(WITH_CONTRIB "Compile the third-party contributation" OFF) option(WITH_CONTRIB "Compile the third-party contributation" OFF)
option(REPLACE_ENFORCE_GLOG "Replace PADDLE_ENFORCE with glog/CHECK for better debug." OFF) option(REPLACE_ENFORCE_GLOG "Replace PADDLE_ENFORCE with glog/CHECK for better debug." OFF)
option(WITH_ANAKIN "Compile with Anakin library" OFF) option(WITH_ANAKIN "Compile with Anakin library" OFF)
...@@ -105,8 +94,6 @@ endif() ...@@ -105,8 +94,6 @@ endif()
if (WIN32) if (WIN32)
set(WITH_DISTRIBUTE OFF CACHE STRING set(WITH_DISTRIBUTE OFF CACHE STRING
"Disable DISTRIBUTE when compiling for Windows" FORCE) "Disable DISTRIBUTE when compiling for Windows" FORCE)
set(WITH_FLUID_ONLY ON CACHE STRING
"Enable FLUID_ONLY when compiling for Windows" FORCE)
endif() endif()
set(THIRD_PARTY_PATH "${CMAKE_BINARY_DIR}/third_party" CACHE STRING set(THIRD_PARTY_PATH "${CMAKE_BINARY_DIR}/third_party" CACHE STRING
...@@ -148,7 +135,6 @@ include(external/openblas) # download, build, install openblas ...@@ -148,7 +135,6 @@ include(external/openblas) # download, build, install openblas
include(external/mkldnn) # download, build, install mkldnn include(external/mkldnn) # download, build, install mkldnn
include(external/ngraph) # download, build, install nGraph include(external/ngraph) # download, build, install nGraph
include(external/boost) # download boost include(external/boost) # download boost
include(external/any) # download libn::any
include(external/eigen) # download eigen3 include(external/eigen) # download eigen3
include(external/pybind11) # download pybind11 include(external/pybind11) # download pybind11
include(external/cares) include(external/cares)
...@@ -225,7 +211,6 @@ include(generic) # simplify cmake module ...@@ -225,7 +211,6 @@ include(generic) # simplify cmake module
include(package) # set paddle packages include(package) # set paddle packages
include(ccache) # set ccache for compilation include(ccache) # set ccache for compilation
include(util) # set unittest and link libs include(util) # set unittest and link libs
include(rdma) # set rdma libraries
include(version) # set PADDLE_VERSION include(version) # set PADDLE_VERSION
include(coveralls) # set code coverage include(coveralls) # set code coverage
include(inference_lib) # add paddle fluid inference libraries include(inference_lib) # add paddle fluid inference libraries
...@@ -233,38 +218,11 @@ include(inference_lib) # add paddle fluid inference libraries ...@@ -233,38 +218,11 @@ include(inference_lib) # add paddle fluid inference libraries
include_directories("${PADDLE_SOURCE_DIR}") include_directories("${PADDLE_SOURCE_DIR}")
set(EXTERNAL_LIBS
gflags
glog
${CBLAS_LIBRARIES}
protobuf
zlib
${PYTHON_LIBRARIES}
)
if(WITH_PSLIB)
list(APPEND EXTERNAL_LIBS pslib)
list(APPEND EXTERNAL_LIBS pslib_brpc)
list(APPEND EXTERNAL_LIBS libmct)
endif(WITH_PSLIB)
if(WITH_AMD_GPU) if(WITH_AMD_GPU)
find_package(HIP) find_package(HIP)
include(hip) include(hip)
endif(WITH_AMD_GPU) endif(WITH_AMD_GPU)
if(WITH_MKLML)
list(APPEND EXTERNAL_LIBS ${MKLML_IOMP_LIB})
endif()
if(WITH_LIBXSMM)
list(APPEND EXTERNAL_LIBS ${LIBXSMM_LIBS})
endif()
if(WITH_MKLDNN)
list(APPEND EXTERNAL_LIBS ${MKLDNN_LIB})
endif()
set(PADDLE_PYTHON_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/python/build") set(PADDLE_PYTHON_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/python/build")
set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG") set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG")
......
...@@ -20,31 +20,10 @@ if(WITH_DSO) ...@@ -20,31 +20,10 @@ if(WITH_DSO)
add_definitions(-DPADDLE_USE_DSO) add_definitions(-DPADDLE_USE_DSO)
endif(WITH_DSO) endif(WITH_DSO)
if(WITH_DOUBLE)
add_definitions(-DPADDLE_TYPE_DOUBLE)
endif(WITH_DOUBLE)
if(WITH_ARM_FP16)
add_definitions(-DPADDLE_ARM_FP16)
add_definitions("-march=armv8.2-a+fp16+simd")
endif(WITH_ARM_FP16)
if(WITH_TESTING) if(WITH_TESTING)
add_definitions(-DPADDLE_WITH_TESTING) add_definitions(-DPADDLE_WITH_TESTING)
endif(WITH_TESTING) endif(WITH_TESTING)
if(NOT WITH_TIMER)
add_definitions(-DPADDLE_DISABLE_TIMER)
endif(NOT WITH_TIMER)
if(USE_EIGEN_FOR_BLAS)
add_definitions(-DPADDLE_USE_EIGEN_FOR_BLAS)
endif(USE_EIGEN_FOR_BLAS)
if(EIGEN_USE_THREADS)
add_definitions(-DEIGEN_USE_THREADS)
endif(EIGEN_USE_THREADS)
if(NOT WITH_PROFILER) if(NOT WITH_PROFILER)
add_definitions(-DPADDLE_DISABLE_PROFILER) add_definitions(-DPADDLE_DISABLE_PROFILER)
endif(NOT WITH_PROFILER) endif(NOT WITH_PROFILER)
...@@ -78,10 +57,6 @@ if(WIN32) ...@@ -78,10 +57,6 @@ if(WIN32)
endif(NOT MSVC) endif(NOT MSVC)
endif(WIN32) endif(WIN32)
if(NOT WITH_GOLANG)
add_definitions(-DPADDLE_WITHOUT_GOLANG)
endif(NOT WITH_GOLANG)
if(WITH_PSLIB) if(WITH_PSLIB)
add_definitions(-DPADDLE_WITH_PSLIB) add_definitions(-DPADDLE_WITH_PSLIB)
endif() endif()
...@@ -171,55 +146,6 @@ if(WITH_DISTRIBUTE) ...@@ -171,55 +146,6 @@ if(WITH_DISTRIBUTE)
add_definitions(-DPADDLE_WITH_DISTRIBUTE) add_definitions(-DPADDLE_WITH_DISTRIBUTE)
endif() endif()
if(WITH_GOLANG)
  # The Paddle source tree must be visible inside GOPATH through a symlink.
  # Otherwise any Go code that depends on Paddle makes `go get ./...` download
  # a fresh Paddle repo from Github, which lacks the changes in the current
  # Paddle repo that is being built.
  set(GOPATH "${CMAKE_CURRENT_BINARY_DIR}/go")
  set(PADDLE_IN_GOPATH "${GOPATH}/src/github.com/PaddlePaddle/Paddle")
  set(PADDLE_GO_PATH "${CMAKE_SOURCE_DIR}/go")
  file(MAKE_DIRECTORY ${GOPATH} "${PADDLE_IN_GOPATH}")

  # go_path: (re)creates the symlink from GOPATH to the source tree.
  #
  # Dependencies are deliberately NOT fetched per target with
  # `go get -d ./...`: several `go get` processes can not run concurrently,
  # while make has to be able to run with multiple jobs.
  add_custom_target(go_path)
  add_custom_command(TARGET go_path
    # Make sure the parent directories exist, drop whatever currently sits at
    # the link location, then symlink the Paddle directory into GOPATH.
    COMMAND mkdir -p ${PADDLE_IN_GOPATH}
    COMMAND rm -rf ${PADDLE_IN_GOPATH}
    COMMAND ln -sf ${CMAKE_SOURCE_DIR} ${PADDLE_IN_GOPATH}
    WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
  )

  if(GLIDE_INSTALL)
    # glide is looked up in the GOPATH of the calling environment, not in the
    # build-tree GOPATH set above.
    if(NOT EXISTS $ENV{GOPATH}/bin/glide)
      message(FATAL_ERROR "no glide executeble found: $ENV{GOPATH}/bin/glide")
    endif()
    set(GLIDE "$ENV{GOPATH}/bin/glide")

    # Stamp-file rule: runs only when glide.lock is missing or has changed,
    # or when the stamp ${CMAKE_BINARY_DIR}/glide itself is missing.
    add_custom_command(OUTPUT ${CMAKE_BINARY_DIR}/glide
      COMMAND env GOPATH=${GOPATH} ${GLIDE} install
      COMMAND touch ${CMAKE_BINARY_DIR}/glide
      DEPENDS ${PADDLE_SOURCE_DIR}/go/glide.lock
      WORKING_DIRECTORY "${PADDLE_IN_GOPATH}/go"
    )

    # go_vendor hangs off the stamp file produced above, so `glide install`
    # does not need to run every time this target is built.
    add_custom_target(go_vendor DEPENDS ${CMAKE_BINARY_DIR}/glide go_path)
  endif()
endif(WITH_GOLANG)
if(WITH_GRPC) if(WITH_GRPC)
add_definitions(-DPADDLE_WITH_GRPC) add_definitions(-DPADDLE_WITH_GRPC)
endif(WITH_GRPC) endif(WITH_GRPC)
......
...@@ -168,10 +168,7 @@ elseif (${CUDA_VERSION} LESS 11.0) # CUDA 10.x ...@@ -168,10 +168,7 @@ elseif (${CUDA_VERSION} LESS 11.0) # CUDA 10.x
endif() endif()
include_directories(${CUDA_INCLUDE_DIRS}) include_directories(${CUDA_INCLUDE_DIRS})
list(APPEND EXTERNAL_LIBS ${CUDA_LIBRARIES} ${CUDA_rt_LIBRARY})
if(NOT WITH_DSO) if(NOT WITH_DSO)
# TODO(panyx0718): CUPTI only allows DSO?
list(APPEND EXTERNAL_LIBS ${CUDNN_LIBRARY} ${CUPTI_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY} ${NCCL_LIBRARY})
if(WIN32) if(WIN32)
set_property(GLOBAL PROPERTY CUDA_MODULES ${CUDNN_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY}) set_property(GLOBAL PROPERTY CUDA_MODULES ${CUDNN_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY})
endif(WIN32) endif(WIN32)
......
...@@ -74,5 +74,3 @@ add_dependencies(anakin_shared extern_anakin) ...@@ -74,5 +74,3 @@ add_dependencies(anakin_shared extern_anakin)
add_library(anakin_saber SHARED IMPORTED GLOBAL) add_library(anakin_saber SHARED IMPORTED GLOBAL)
set_property(TARGET anakin_saber PROPERTY IMPORTED_LOCATION ${ANAKIN_SABER_LIB}) set_property(TARGET anakin_saber PROPERTY IMPORTED_LOCATION ${ANAKIN_SABER_LIB})
add_dependencies(anakin_saber extern_anakin) add_dependencies(anakin_saber extern_anakin)
list(APPEND external_project_dependencies anakin_shared anakin_saber)
include(ExternalProject)

# Where the `any` sources are checked out, and the include path that exposes
# them to the rest of the build.
set(ANY_SOURCE_DIR ${THIRD_PARTY_PATH}/any)
include_directories(${ANY_SOURCE_DIR}/src/extern_lib_any)

# extern_lib_any: fetches a pinned commit of PaddlePaddle/any. The update,
# configure, build, install and test steps are all set to empty commands, so
# only the checkout itself does any work.
ExternalProject_Add(
  extern_lib_any
  ${EXTERNAL_PROJECT_LOG_ARGS}
  GIT_REPOSITORY    "https://github.com/PaddlePaddle/any.git"
  GIT_TAG           "15595d8324be9e8a9a80d9ae442fdd12bd66df5d"
  PREFIX            ${ANY_SOURCE_DIR}
  UPDATE_COMMAND    ""
  CONFIGURE_COMMAND ""
  BUILD_COMMAND     ""
  INSTALL_COMMAND   ""
  TEST_COMMAND      ""
)

# lib_any: the target other code depends on. From CMake 3.3.0 on it is an
# INTERFACE library; on older versions a STATIC library built from a generated
# one-line C file is used instead.
# NOTE(review): presumably because add_dependencies() on INTERFACE libraries
# is only accepted from CMake 3.3 - confirm.
if(NOT CMAKE_VERSION VERSION_LESS "3.3.0")
  add_library(lib_any INTERFACE)
else()
  set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/lib_any_dummy.c)
  file(WRITE ${dummyfile} "const char * dummy_any = \"${dummyfile}\";")
  add_library(lib_any STATIC ${dummyfile})
endif()

# Make sure the checkout happens before anything that depends on lib_any.
add_dependencies(lib_any extern_lib_any)

add_definitions(-DANY_IMPL_ANY_CAST_MOVEABLE)

# Register lib_any in the global list of external project dependencies.
list(APPEND external_project_dependencies lib_any)
...@@ -57,5 +57,4 @@ else() ...@@ -57,5 +57,4 @@ else()
endif() endif()
add_dependencies(boost ${BOOST_PROJECT}) add_dependencies(boost ${BOOST_PROJECT})
list(APPEND external_project_dependencies boost)
set(Boost_INCLUDE_DIR ${BOOST_INCLUDE_DIR}) set(Boost_INCLUDE_DIR ${BOOST_INCLUDE_DIR})
...@@ -69,5 +69,3 @@ SET_PROPERTY(TARGET brpc PROPERTY IMPORTED_LOCATION ${BRPC_LIBRARIES}) ...@@ -69,5 +69,3 @@ SET_PROPERTY(TARGET brpc PROPERTY IMPORTED_LOCATION ${BRPC_LIBRARIES})
ADD_DEPENDENCIES(brpc extern_brpc) ADD_DEPENDENCIES(brpc extern_brpc)
add_definitions(-DBRPC_WITH_GLOG) add_definitions(-DBRPC_WITH_GLOG)
LIST(APPEND external_project_dependencies brpc)
...@@ -31,5 +31,3 @@ else() ...@@ -31,5 +31,3 @@ else()
endif() endif()
add_dependencies(cub extern_cub) add_dependencies(cub extern_cub)
LIST(APPEND external_project_dependencies cub)
...@@ -27,5 +27,3 @@ else() ...@@ -27,5 +27,3 @@ else()
endif() endif()
add_dependencies(dlpack extern_dlpack) add_dependencies(dlpack extern_dlpack)
LIST(APPEND external_project_dependencies dlpack)
...@@ -52,5 +52,3 @@ else() ...@@ -52,5 +52,3 @@ else()
endif() endif()
add_dependencies(eigen3 extern_eigen3) add_dependencies(eigen3 extern_eigen3)
LIST(APPEND external_project_dependencies eigen3)
...@@ -61,8 +61,6 @@ ADD_LIBRARY(gflags STATIC IMPORTED GLOBAL) ...@@ -61,8 +61,6 @@ ADD_LIBRARY(gflags STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET gflags PROPERTY IMPORTED_LOCATION ${GFLAGS_LIBRARIES}) SET_PROPERTY(TARGET gflags PROPERTY IMPORTED_LOCATION ${GFLAGS_LIBRARIES})
ADD_DEPENDENCIES(gflags extern_gflags) ADD_DEPENDENCIES(gflags extern_gflags)
LIST(APPEND external_project_dependencies gflags)
# On Windows (including MinGW), the Shlwapi library is used by gflags if available. # On Windows (including MinGW), the Shlwapi library is used by gflags if available.
if (WIN32) if (WIN32)
include(CheckIncludeFileCXX) include(CheckIncludeFileCXX)
......
...@@ -72,5 +72,3 @@ ADD_LIBRARY(glog STATIC IMPORTED GLOBAL) ...@@ -72,5 +72,3 @@ ADD_LIBRARY(glog STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET glog PROPERTY IMPORTED_LOCATION ${GLOG_LIBRARIES}) SET_PROPERTY(TARGET glog PROPERTY IMPORTED_LOCATION ${GLOG_LIBRARIES})
ADD_DEPENDENCIES(glog extern_glog gflags) ADD_DEPENDENCIES(glog extern_glog gflags)
LINK_LIBRARIES(glog gflags) LINK_LIBRARIES(glog gflags)
LIST(APPEND external_project_dependencies glog)
...@@ -79,5 +79,4 @@ IF(WITH_TESTING OR (WITH_DISTRIBUTE AND NOT WITH_GRPC)) ...@@ -79,5 +79,4 @@ IF(WITH_TESTING OR (WITH_DISTRIBUTE AND NOT WITH_GRPC))
SET_PROPERTY(TARGET gtest_main PROPERTY IMPORTED_LOCATION ${GTEST_MAIN_LIBRARIES}) SET_PROPERTY(TARGET gtest_main PROPERTY IMPORTED_LOCATION ${GTEST_MAIN_LIBRARIES})
ADD_DEPENDENCIES(gtest_main extern_gtest) ADD_DEPENDENCIES(gtest_main extern_gtest)
LIST(APPEND external_project_dependencies gtest gtest_main)
ENDIF(WITH_TESTING OR (WITH_DISTRIBUTE AND NOT WITH_GRPC)) ENDIF(WITH_TESTING OR (WITH_DISTRIBUTE AND NOT WITH_GRPC))
...@@ -39,6 +39,3 @@ ADD_DEPENDENCIES(extern_leveldb snappy) ...@@ -39,6 +39,3 @@ ADD_DEPENDENCIES(extern_leveldb snappy)
ADD_LIBRARY(leveldb STATIC IMPORTED GLOBAL) ADD_LIBRARY(leveldb STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET leveldb PROPERTY IMPORTED_LOCATION ${LEVELDB_LIBRARIES}) SET_PROPERTY(TARGET leveldb PROPERTY IMPORTED_LOCATION ${LEVELDB_LIBRARIES})
ADD_DEPENDENCIES(leveldb extern_leveldb) ADD_DEPENDENCIES(leveldb extern_leveldb)
LIST(APPEND external_project_dependencies leveldb)
...@@ -72,7 +72,4 @@ else() ...@@ -72,7 +72,4 @@ else()
add_library(libmct INTERFACE) add_library(libmct INTERFACE)
endif() endif()
#ADD_LIBRARY(libmct SHARED IMPORTED GLOBAL)
ADD_DEPENDENCIES(libmct ${LIBMCT_PROJECT}) ADD_DEPENDENCIES(libmct ${LIBMCT_PROJECT})
LIST(APPEND external_project_dependencies libmct)
...@@ -53,5 +53,3 @@ MESSAGE(STATUS "Libxsmm library: ${LIBXSMM_LIBS}") ...@@ -53,5 +53,3 @@ MESSAGE(STATUS "Libxsmm library: ${LIBXSMM_LIBS}")
include_directories(${LIBXSMM_INCLUDE_DIR}) include_directories(${LIBXSMM_INCLUDE_DIR})
ADD_DEFINITIONS(-DPADDLE_WITH_LIBXSMM) ADD_DEFINITIONS(-DPADDLE_WITH_LIBXSMM)
ADD_DEPENDENCIES(libxsmm extern_libxsmm) ADD_DEPENDENCIES(libxsmm extern_libxsmm)
LIST(APPEND external_project_dependencies libxsmm)
...@@ -89,7 +89,6 @@ SET_PROPERTY(TARGET shared_mkldnn PROPERTY IMPORTED_LOCATION ${MKLDNN_LIB}) ...@@ -89,7 +89,6 @@ SET_PROPERTY(TARGET shared_mkldnn PROPERTY IMPORTED_LOCATION ${MKLDNN_LIB})
ADD_DEPENDENCIES(shared_mkldnn ${MKLDNN_PROJECT}) ADD_DEPENDENCIES(shared_mkldnn ${MKLDNN_PROJECT})
MESSAGE(STATUS "MKLDNN library: ${MKLDNN_LIB}") MESSAGE(STATUS "MKLDNN library: ${MKLDNN_LIB}")
add_definitions(-DPADDLE_WITH_MKLDNN) add_definitions(-DPADDLE_WITH_MKLDNN)
LIST(APPEND external_project_dependencies shared_mkldnn)
# generate a static dummy target to track mkldnn dependencies # generate a static dummy target to track mkldnn dependencies
# for cc_library(xxx SRCS xxx.c DEPS mkldnn) # for cc_library(xxx SRCS xxx.c DEPS mkldnn)
......
...@@ -40,7 +40,9 @@ IF(WIN32) ...@@ -40,7 +40,9 @@ IF(WIN32)
SET(MKLML_SHARED_LIB ${MKLML_LIB_DIR}/mklml.dll) SET(MKLML_SHARED_LIB ${MKLML_LIB_DIR}/mklml.dll)
SET(MKLML_SHARED_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5md.dll) SET(MKLML_SHARED_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5md.dll)
ELSE() ELSE()
SET(MKLML_VER "mklml_lnx_${TIME_VERSION}" CACHE STRING "" FORCE) #TODO(intel-huying):
# For now, temporarily use the mklml library build that enables the Erf function; it will be updated to the official version later.
SET(MKLML_VER "VsErf_mklml_lnx_${TIME_VERSION}" CACHE STRING "" FORCE)
SET(MKLML_URL "http://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.tgz" CACHE STRING "" FORCE) SET(MKLML_URL "http://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.tgz" CACHE STRING "" FORCE)
SET(MKLML_LIB ${MKLML_LIB_DIR}/libmklml_intel.so) SET(MKLML_LIB ${MKLML_LIB_DIR}/libmklml_intel.so)
SET(MKLML_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5.so) SET(MKLML_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5.so)
...@@ -73,4 +75,3 @@ INCLUDE_DIRECTORIES(${MKLML_INC_DIR}) ...@@ -73,4 +75,3 @@ INCLUDE_DIRECTORIES(${MKLML_INC_DIR})
ADD_LIBRARY(mklml SHARED IMPORTED GLOBAL) ADD_LIBRARY(mklml SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET mklml PROPERTY IMPORTED_LOCATION ${MKLML_LIB}) SET_PROPERTY(TARGET mklml PROPERTY IMPORTED_LOCATION ${MKLML_LIB})
ADD_DEPENDENCIES(mklml ${MKLML_PROJECT}) ADD_DEPENDENCIES(mklml ${MKLML_PROJECT})
LIST(APPEND external_project_dependencies mklml)
...@@ -77,4 +77,3 @@ add_dependencies(ngraph ${NGRAPH_PROJECT}) ...@@ -77,4 +77,3 @@ add_dependencies(ngraph ${NGRAPH_PROJECT})
target_compile_definitions(ngraph INTERFACE -DPADDLE_WITH_NGRAPH) target_compile_definitions(ngraph INTERFACE -DPADDLE_WITH_NGRAPH)
target_include_directories(ngraph INTERFACE ${NGRAPH_INC_DIR}) target_include_directories(ngraph INTERFACE ${NGRAPH_INC_DIR})
target_link_libraries(ngraph INTERFACE ${NGRAPH_SHARED_LIB}) target_link_libraries(ngraph INTERFACE ${NGRAPH_SHARED_LIB})
LIST(APPEND external_project_dependencies ngraph)
...@@ -11,11 +11,6 @@ ...@@ -11,11 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
IF(USE_EIGEN_FOR_BLAS)
return()
ENDIF(USE_EIGEN_FOR_BLAS)
INCLUDE(cblas) INCLUDE(cblas)
IF(NOT ${CBLAS_FOUND}) IF(NOT ${CBLAS_FOUND})
...@@ -91,7 +86,6 @@ ENDIF() ...@@ -91,7 +86,6 @@ ENDIF()
IF(NOT ${CBLAS_FOUND}) IF(NOT ${CBLAS_FOUND})
ADD_DEPENDENCIES(cblas extern_openblas) ADD_DEPENDENCIES(cblas extern_openblas)
LIST(APPEND external_project_dependencies cblas)
ELSE() ELSE()
IF("${CBLAS_PROVIDER}" STREQUAL "MKLML") IF("${CBLAS_PROVIDER}" STREQUAL "MKLML")
ADD_DEPENDENCIES(cblas mklml) ADD_DEPENDENCIES(cblas mklml)
......
...@@ -129,7 +129,6 @@ macro(PROMPT_PROTOBUF_LIB) ...@@ -129,7 +129,6 @@ macro(PROMPT_PROTOBUF_LIB)
ADD_DEPENDENCIES(protoc ${dep}) ADD_DEPENDENCIES(protoc ${dep})
ENDFOREACH() ENDFOREACH()
LIST(APPEND external_project_dependencies protobuf)
RETURN() RETURN()
endmacro() endmacro()
macro(SET_PROTOBUF_VERSION) macro(SET_PROTOBUF_VERSION)
...@@ -203,7 +202,7 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST) ...@@ -203,7 +202,7 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST)
ENDIF() ENDIF()
SET(PROTOBUF_REPO "https://github.com/google/protobuf.git") SET(PROTOBUF_REPO "https://github.com/google/protobuf.git")
SET(PROTOBUF_TAG "v3.6.1") SET(PROTOBUF_TAG "9f75c5aa851cd877fb0d93ccc31b8567a6706546")
ExternalProject_Add( ExternalProject_Add(
${TARGET_NAME} ${TARGET_NAME}
...@@ -231,7 +230,7 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST) ...@@ -231,7 +230,7 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST)
) )
ENDFUNCTION() ENDFUNCTION()
SET(PROTOBUF_VERSION 3.6.1) SET(PROTOBUF_VERSION 3.1.0)
IF(NOT PROTOBUF_FOUND) IF(NOT PROTOBUF_FOUND)
build_protobuf(extern_protobuf FALSE) build_protobuf(extern_protobuf FALSE)
......
...@@ -70,4 +70,3 @@ ExternalProject_Add( ...@@ -70,4 +70,3 @@ ExternalProject_Add(
ADD_LIBRARY(pslib SHARED IMPORTED GLOBAL) ADD_LIBRARY(pslib SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET pslib PROPERTY IMPORTED_LOCATION ${PSLIB_LIB}) SET_PROPERTY(TARGET pslib PROPERTY IMPORTED_LOCATION ${PSLIB_LIB})
ADD_DEPENDENCIES(pslib ${PSLIB_PROJECT}) ADD_DEPENDENCIES(pslib ${PSLIB_PROJECT})
LIST(APPEND external_project_dependencies pslib)
...@@ -70,4 +70,3 @@ ExternalProject_Add( ...@@ -70,4 +70,3 @@ ExternalProject_Add(
ADD_LIBRARY(pslib_brpc SHARED IMPORTED GLOBAL) ADD_LIBRARY(pslib_brpc SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET pslib_brpc PROPERTY IMPORTED_LOCATION ${PSLIB_BRPC_LIB}) SET_PROPERTY(TARGET pslib_brpc PROPERTY IMPORTED_LOCATION ${PSLIB_BRPC_LIB})
ADD_DEPENDENCIES(pslib_brpc ${PSLIB_BRPC_PROJECT}) ADD_DEPENDENCIES(pslib_brpc ${PSLIB_BRPC_PROJECT})
LIST(APPEND external_project_dependencies pslib_brpc)
...@@ -74,8 +74,8 @@ IF(PYTHONINTERP_FOUND) ...@@ -74,8 +74,8 @@ IF(PYTHONINTERP_FOUND)
find_python_module(wheel REQUIRED) find_python_module(wheel REQUIRED)
find_python_module(google.protobuf REQUIRED) find_python_module(google.protobuf REQUIRED)
FIND_PACKAGE(NumPy REQUIRED) FIND_PACKAGE(NumPy REQUIRED)
IF(${PY_GOOGLE.PROTOBUF_VERSION} AND ${PY_GOOGLE.PROTOBUF_VERSION} VERSION_LESS "3.6.1") IF(${PY_GOOGLE.PROTOBUF_VERSION} AND ${PY_GOOGLE.PROTOBUF_VERSION} VERSION_LESS "3.0.0")
MESSAGE(FATAL_ERROR "Found Python Protobuf ${PY_GOOGLE.PROTOBUF_VERSION} < 3.6.1, " MESSAGE(FATAL_ERROR "Found Python Protobuf ${PY_GOOGLE.PROTOBUF_VERSION} < 3.0.0, "
"please use pip to upgrade protobuf. pip install -U protobuf") "please use pip to upgrade protobuf. pip install -U protobuf")
ENDIF() ENDIF()
ENDIF(PYTHONINTERP_FOUND) ENDIF(PYTHONINTERP_FOUND)
......
...@@ -26,5 +26,3 @@ else() ...@@ -26,5 +26,3 @@ else()
endif() endif()
add_dependencies(simple_threadpool extern_threadpool) add_dependencies(simple_threadpool extern_threadpool)
LIST(APPEND external_project_dependencies simple_threadpool)
...@@ -83,5 +83,3 @@ INCLUDE_DIRECTORIES(${THIRD_PARTY_PATH}/install) # For Paddle code to include wa ...@@ -83,5 +83,3 @@ INCLUDE_DIRECTORIES(${THIRD_PARTY_PATH}/install) # For Paddle code to include wa
ADD_LIBRARY(warpctc SHARED IMPORTED GLOBAL) ADD_LIBRARY(warpctc SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET warpctc PROPERTY IMPORTED_LOCATION ${WARPCTC_LIBRARIES}) SET_PROPERTY(TARGET warpctc PROPERTY IMPORTED_LOCATION ${WARPCTC_LIBRARIES})
ADD_DEPENDENCIES(warpctc extern_warpctc) ADD_DEPENDENCIES(warpctc extern_warpctc)
LIST(APPEND external_project_dependencies warpctc)
...@@ -55,4 +55,3 @@ else() ...@@ -55,4 +55,3 @@ else()
endif() endif()
add_dependencies(xbyak ${XBYAK_PROJECT}) add_dependencies(xbyak ${XBYAK_PROJECT})
list(APPEND external_project_dependencies xbyak)
...@@ -71,5 +71,3 @@ add_library(xxhash STATIC IMPORTED GLOBAL) ...@@ -71,5 +71,3 @@ add_library(xxhash STATIC IMPORTED GLOBAL)
set_property(TARGET xxhash PROPERTY IMPORTED_LOCATION ${XXHASH_LIBRARIES}) set_property(TARGET xxhash PROPERTY IMPORTED_LOCATION ${XXHASH_LIBRARIES})
include_directories(${XXHASH_INCLUDE_DIR}) include_directories(${XXHASH_INCLUDE_DIR})
add_dependencies(xxhash extern_xxhash) add_dependencies(xxhash extern_xxhash)
LIST(APPEND external_project_dependencies xxhash)
...@@ -57,5 +57,3 @@ ENDIF(WIN32) ...@@ -57,5 +57,3 @@ ENDIF(WIN32)
ADD_LIBRARY(zlib STATIC IMPORTED GLOBAL) ADD_LIBRARY(zlib STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET zlib PROPERTY IMPORTED_LOCATION ${ZLIB_LIBRARIES}) SET_PROPERTY(TARGET zlib PROPERTY IMPORTED_LOCATION ${ZLIB_LIBRARIES})
ADD_DEPENDENCIES(zlib extern_zlib) ADD_DEPENDENCIES(zlib extern_zlib)
LIST(APPEND external_project_dependencies zlib)
...@@ -11,8 +11,6 @@ include_directories("/opt/rocm/rocrand/include") ...@@ -11,8 +11,6 @@ include_directories("/opt/rocm/rocrand/include")
include_directories("/opt/rocm/rccl/include") include_directories("/opt/rocm/rccl/include")
include_directories("/opt/rocm/thrust") include_directories("/opt/rocm/thrust")
list(APPEND EXTERNAL_LIBS "-L/opt/rocm/lib/ -lhip_hcc")
set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -fPIC -DPADDLE_WITH_HIP -std=c++11" ) set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -fPIC -DPADDLE_WITH_HIP -std=c++11" )
if(WITH_DSO) if(WITH_DSO)
...@@ -31,22 +29,12 @@ if(WITH_GRPC) ...@@ -31,22 +29,12 @@ if(WITH_GRPC)
set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_WITH_GRPC") set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_WITH_GRPC")
endif(WITH_GRPC) endif(WITH_GRPC)
if(NOT WITH_GOLANG)
set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_WITHOUT_GOLANG")
endif(NOT WITH_GOLANG)
if(WITH_MKLDNN) if(WITH_MKLDNN)
set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_WITH_MKLDNN") set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_WITH_MKLDNN")
endif(WITH_MKLDNN) endif(WITH_MKLDNN)
set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DANY_IMPL_ANY_CAST_MOVEABLE") set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DANY_IMPL_ANY_CAST_MOVEABLE")
if(NOT WITH_RDMA)
set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_DISABLE_RDMA")
endif(NOT WITH_RDMA)
if(CMAKE_BUILD_TYPE STREQUAL "Debug") if(CMAKE_BUILD_TYPE STREQUAL "Debug")
list(APPEND HIP_HCC_FLAGS ${CMAKE_CXX_FLAGS_DEBUG}) list(APPEND HIP_HCC_FLAGS ${CMAKE_CXX_FLAGS_DEBUG})
elseif(CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo") elseif(CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo")
......
# RDMA support.
#
# The RDMA sources are not fetched by the build; download them manually from
# the Subversion repository first:
#   svn co https://svn.baidu.com/sys/ip/trunk/rdma/sockrdmav1 rdma/
#   svn co https://svn.baidu.com/sys/ip/trunk/rdma/thirdparty rdma/
# The static output shipped in those repositories is used to avoid implicit
# bugs caused by a non-standard runtime environment.
#
# With WITH_RDMA on, this module sets RDMA_INC_DIR, RDMA_LIBS and
# RDMA_LD_FLAGS and adds the RDMA include directories; configuration fails if
# any header or library is missing. With WITH_RDMA off, RDMA_LIBS and
# RDMA_LD_FLAGS are set to empty strings and PADDLE_DISABLE_RDMA is defined.
if(WITH_RDMA)
  set(RDMA_ROOT $ENV{RDMA_ROOT} CACHE PATH "Folder contains RDMA sock library and thirdparty library")

  # generate_rdma_links()
  #
  # Creates ${CMAKE_CURRENT_BINARY_DIR}/librdma and fills it with symlinks to
  # the system libibverbs, librdmacm and libnl shared objects.
  #
  # Redirecting to a directory of our own isolates the build from whatever
  # else lives in the system runtime environment and gives unified control
  # across different gcc environments. E.g. gcc48 does not search /usr/lib64
  # by default, which may contain low-version runtime libraries that crash
  # the process when loaded; the redirect works around that.
  function(generate_rdma_links)
    # The directory must exist before any link is created. Do not fold these
    # steps into one execute_process() with several COMMANDs: those run
    # concurrently as a pipeline, so the mkdir could lose the race.
    file(MAKE_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/librdma")

    # Each entry is "<link target>|<link name inside librdma/>".
    foreach(link_spec
        "/usr/lib64/libibverbs.so.1.0.0|libibverbs.so.1"
        "/usr/lib64/libibverbs.so.1.0.0|libibverbs.so"
        "/usr/lib64/librdmacm.so.1.0.0|librdmacm.so.1"
        "/usr/lib64/librdmacm.so.1.0.0|librdmacm.so"
        "/lib64/libnl.so.1.1.4|libnl.so.1"
        "/lib64/libnl.so.1.1.4|libnl.so")
      string(REPLACE "|" ";" link_pair "${link_spec}")
      list(GET link_pair 0 link_target)
      list(GET link_pair 1 link_name)
      execute_process(
        COMMAND ln -s -f ${link_target} librdma/${link_name}
        WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
      )
    endforeach()
  endfunction(generate_rdma_links)

  # Locate the headers.
  find_path(RDMA_INC_SXISOCK sxi_sock.h PATHS ${RDMA_ROOT}/sockrdmav1/output/include)
  find_path(RDMA_INC_XIO libxio.h PATHS ${RDMA_ROOT}/thirdparty/output/accelio)
  find_path(RDMA_INC_EVENT event2 PATHS ${RDMA_ROOT}/thirdparty/output/libevent)
  find_path(RDMA_INC_NUMA numa.h PATHS ${RDMA_ROOT}/thirdparty/output/libnuma)

  # Locate the libraries.
  find_library(RDMA_LIB_SXISOCK NAMES sxisock PATHS ${RDMA_ROOT}/sockrdmav1/output)
  find_library(RDMA_LIB_XIO NAMES xio PATHS ${RDMA_ROOT}/thirdparty/output/accelio)
  find_library(RDMA_LIB_EVENT NAMES event PATHS ${RDMA_ROOT}/thirdparty/output/libevent)
  find_library(RDMA_LIB_EVENT_CORE NAMES event_core PATHS ${RDMA_ROOT}/thirdparty/output/libevent)
  find_library(RDMA_LIB_EVENT_EXTRA NAMES event_extra PATHS ${RDMA_ROOT}/thirdparty/output/libevent)
  find_library(RDMA_LIB_EVENT_PTHREADS NAMES event_pthreads PATHS ${RDMA_ROOT}/thirdparty/output/libevent)
  find_library(RDMA_LIB_NUMA NAMES numa PATHS ${RDMA_ROOT}/thirdparty/output/libnuma)

  # Every header and every library has to be present; a partial installation
  # is treated the same as a missing one.
  if(
      RDMA_INC_SXISOCK AND
      RDMA_INC_XIO AND
      RDMA_INC_EVENT AND
      RDMA_INC_NUMA AND
      RDMA_LIB_SXISOCK AND
      RDMA_LIB_XIO AND
      RDMA_LIB_EVENT AND
      RDMA_LIB_EVENT_CORE AND
      RDMA_LIB_EVENT_EXTRA AND
      RDMA_LIB_EVENT_PTHREADS AND
      RDMA_LIB_NUMA
  )
    set(RDMA_INC_DIR
      ${RDMA_INC_SXISOCK}
      ${RDMA_INC_XIO}
      ${RDMA_INC_EVENT}
      ${RDMA_INC_NUMA})
    set(RDMA_LIBS
      ${RDMA_LIB_SXISOCK}
      ${RDMA_LIB_XIO}
      ${RDMA_LIB_EVENT}
      ${RDMA_LIB_EVENT_CORE}
      ${RDMA_LIB_EVENT_EXTRA}
      ${RDMA_LIB_EVENT_PTHREADS}
      ${RDMA_LIB_NUMA}
    )
    set(RDMA_LD_FLAGS "-L./librdma -libverbs -lrdmacm -Xlinker -rpath ./librdma")
    include_directories("${RDMA_INC_DIR}")
  else()
    # Without this branch RDMA_INC_DIR and RDMA_LIBS would be left unset even
    # though the top-level module always refers to them, so stop here.
    # FATAL_ERROR must be a bare keyword: with a trailing comma it is not
    # recognised as a mode and configuration would carry on.
    message(FATAL_ERROR "RDMA libraries are not found, try to set RDMA_ROOT or check all related libraries.")
  endif()
else(WITH_RDMA)
  # RDMA disabled: keep the variables defined (empty) and tell the C++ code.
  set(RDMA_LIBS "")
  set(RDMA_LD_FLAGS "")
  add_definitions(-DPADDLE_DISABLE_RDMA)
endif(WITH_RDMA)
...@@ -33,6 +33,5 @@ if(TENSORRT_FOUND) ...@@ -33,6 +33,5 @@ if(TENSORRT_FOUND)
message(STATUS "Current TensorRT header is ${TENSORRT_INCLUDE_DIR}/NvInfer.h. " message(STATUS "Current TensorRT header is ${TENSORRT_INCLUDE_DIR}/NvInfer.h. "
"Current TensorRT version is v${TENSORRT_MAJOR_VERSION}. ") "Current TensorRT version is v${TENSORRT_MAJOR_VERSION}. ")
include_directories(${TENSORRT_INCLUDE_DIR}) include_directories(${TENSORRT_INCLUDE_DIR})
list(APPEND EXTERNAL_LIBS ${TENSORRT_LIBRARY})
add_definitions(-DPADDLE_WITH_TENSORRT) add_definitions(-DPADDLE_WITH_TENSORRT)
endif() endif()
...@@ -14,9 +14,7 @@ cmake .. -DWITH_AVX=OFF \ ...@@ -14,9 +14,7 @@ cmake .. -DWITH_AVX=OFF \
-DWITH_MKL=OFF \ -DWITH_MKL=OFF \
-DWITH_GPU=ON \ -DWITH_GPU=ON \
-DWITH_TESTING=ON \ -DWITH_TESTING=ON \
-DWITH_TIMER=ON \
-DWITH_PROFILER=ON \ -DWITH_PROFILER=ON \
-DWITH_FLUID_ONLY=ON
make -j `nproc` make -j `nproc`
pip install -U "$WHEEL_PATH/$(ls $WHEEL_PATH)" pip install -U "$WHEEL_PATH/$(ls $WHEEL_PATH)"
......
...@@ -427,7 +427,7 @@ paddle.fluid.optimizer.MomentumOptimizer.__init__ ArgSpec(args=['self', 'learnin ...@@ -427,7 +427,7 @@ paddle.fluid.optimizer.MomentumOptimizer.__init__ ArgSpec(args=['self', 'learnin
paddle.fluid.optimizer.MomentumOptimizer.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None) paddle.fluid.optimizer.MomentumOptimizer.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None)
paddle.fluid.optimizer.MomentumOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)) paddle.fluid.optimizer.MomentumOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None))
paddle.fluid.optimizer.MomentumOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.optimizer.MomentumOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.optimizer.AdagradOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(1e-06, None, None)) paddle.fluid.optimizer.AdagradOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'epsilon', 'regularization', 'name', 'initial_accumulator_value'], varargs=None, keywords=None, defaults=(1e-06, None, None, 0.0))
paddle.fluid.optimizer.AdagradOptimizer.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None) paddle.fluid.optimizer.AdagradOptimizer.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None)
paddle.fluid.optimizer.AdagradOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)) paddle.fluid.optimizer.AdagradOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None))
paddle.fluid.optimizer.AdagradOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.optimizer.AdagradOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
......
...@@ -30,8 +30,6 @@ namespace paddle { ...@@ -30,8 +30,6 @@ namespace paddle {
namespace framework { namespace framework {
namespace details { namespace details {
static constexpr char kAllOpDescs[] = "all_op_descs";
VarHandle* GetValidInput(const OpHandleBase* a) { VarHandle* GetValidInput(const OpHandleBase* a) {
for (auto p : a->Inputs()) { for (auto p : a->Inputs()) {
VarHandle* b = dynamic_cast<VarHandle*>(p); VarHandle* b = dynamic_cast<VarHandle*>(p);
......
...@@ -53,7 +53,7 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node, ...@@ -53,7 +53,7 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
#endif #endif
void AllReduceOpHandle::RunImpl() { void AllReduceOpHandle::RunImpl() {
platform::RecordEvent record_event(Name(), dev_ctxes_.cbegin()->second); platform::RecordEvent record_event(Name());
WaitInputVarGenerated(); WaitInputVarGenerated();
auto in_var_handles = DynamicCast<VarHandle>(this->Inputs()); auto in_var_handles = DynamicCast<VarHandle>(this->Inputs());
......
...@@ -22,7 +22,7 @@ namespace framework { ...@@ -22,7 +22,7 @@ namespace framework {
namespace details { namespace details {
void BroadcastOpHandle::RunImpl() { void BroadcastOpHandle::RunImpl() {
platform::RecordEvent record_event(Name(), dev_ctxes_.begin()->second); platform::RecordEvent record_event(Name());
if (places_.size() == 1) return; if (places_.size() == 1) return;
...@@ -30,7 +30,7 @@ void BroadcastOpHandle::RunImpl() { ...@@ -30,7 +30,7 @@ void BroadcastOpHandle::RunImpl() {
VarHandle *in_var_handle; VarHandle *in_var_handle;
{ {
auto in_var_handles = DynamicCast<VarHandle>(inputs_); auto in_var_handles = DynamicCast<VarHandle>(inputs_);
PADDLE_ENFORCE_EQ(in_var_handles.size(), 1, PADDLE_ENFORCE_EQ(in_var_handles.size(), 1UL,
"The number of input should be one."); "The number of input should be one.");
in_var_handle = in_var_handles[0]; in_var_handle = in_var_handles[0];
} }
......
...@@ -34,9 +34,11 @@ namespace details { ...@@ -34,9 +34,11 @@ namespace details {
static inline bool SeqOnlyAllReduceOps(const BuildStrategy &strategy) { static inline bool SeqOnlyAllReduceOps(const BuildStrategy &strategy) {
// Should fix the allreduce op order if scheduling // Should fix the allreduce op order if scheduling
// them in multiple threads or processes to avoid hang. // them in multiple threads or processes to avoid hang.
// NOTE: ParallelGraph would execute this pass on each graph, so
// don't need to append it here.
return (!strategy.enable_sequential_execution_ && return (!strategy.enable_sequential_execution_ &&
strategy.num_trainers_ > 1) || strategy.num_trainers_ > 1) &&
strategy.enable_parallel_graph_; !strategy.enable_parallel_graph_;
} }
class ParallelExecutorPassBuilder : public ir::PassBuilder { class ParallelExecutorPassBuilder : public ir::PassBuilder {
......
...@@ -86,7 +86,7 @@ std::vector<std::array<int, 3>> DataBalanceOpHandle::GetBalancePlan( ...@@ -86,7 +86,7 @@ std::vector<std::array<int, 3>> DataBalanceOpHandle::GetBalancePlan(
} }
void DataBalanceOpHandle::RunImpl() { void DataBalanceOpHandle::RunImpl() {
PADDLE_ENFORCE_GT(places_.size(), 1, PADDLE_ENFORCE_GT(places_.size(), 1UL,
"Data balance can only be enabled when the number of " "Data balance can only be enabled when the number of "
"places to run larger than 1."); "places to run larger than 1.");
auto in_var_handles = DynamicCast<VarHandle>(this->Inputs()); auto in_var_handles = DynamicCast<VarHandle>(this->Inputs());
......
...@@ -23,7 +23,7 @@ void FuseVarsOpHandle::RunImpl() { ...@@ -23,7 +23,7 @@ void FuseVarsOpHandle::RunImpl() {
auto in_var_handles = DynamicCast<VarHandle>(this->Inputs()); auto in_var_handles = DynamicCast<VarHandle>(this->Inputs());
auto out_var_handles = DynamicCast<VarHandle>(this->Outputs()); auto out_var_handles = DynamicCast<VarHandle>(this->Outputs());
PADDLE_ENFORCE_EQ(in_var_handles.size(), 0); PADDLE_ENFORCE_EQ(in_var_handles.size(), 0UL);
PADDLE_ENFORCE_EQ(out_var_handles.size() - 1, inputs_numel_.size(), ""); PADDLE_ENFORCE_EQ(out_var_handles.size() - 1, inputs_numel_.size(), "");
auto scope = local_scope_->FindVar(kLocalExecScopeName)->Get<Scope *>(); auto scope = local_scope_->FindVar(kLocalExecScopeName)->Get<Scope *>();
......
...@@ -22,7 +22,7 @@ namespace framework { ...@@ -22,7 +22,7 @@ namespace framework {
namespace details { namespace details {
void FusedBroadcastOpHandle::RunImpl() { void FusedBroadcastOpHandle::RunImpl() {
platform::RecordEvent record_event(Name(), dev_ctxes_.begin()->second); platform::RecordEvent record_event(Name());
if (places_.size() == 1UL) return; if (places_.size() == 1UL) return;
......
...@@ -129,7 +129,13 @@ size_t NodeSize(const VarDesc& node) { ...@@ -129,7 +129,13 @@ size_t NodeSize(const VarDesc& node) {
} }
size_t NodeSize(ir::Node* n) { size_t NodeSize(ir::Node* n) {
auto* desc = FindVarDescInBlock(n); VarDesc* desc = nullptr;
// some op do not have block pointer
if (n->inputs[0]->Op() != nullptr) {
desc = FindVarDescInBlock(n);
} else {
desc = n->Var();
}
return NodeSize(*desc); return NodeSize(*desc);
} }
......
...@@ -29,8 +29,6 @@ namespace paddle { ...@@ -29,8 +29,6 @@ namespace paddle {
namespace framework { namespace framework {
namespace details { namespace details {
constexpr char kAllOpDescs[] = "all_op_descs";
std::vector<ir::Node*> SortOpLikeDescOrder(const ir::Graph& graph); std::vector<ir::Node*> SortOpLikeDescOrder(const ir::Graph& graph);
// NOTE(dzh): A ordered set for node reuse in memory optimize. // NOTE(dzh): A ordered set for node reuse in memory optimize.
......
...@@ -194,7 +194,8 @@ void MemoryOptimizePass::SubGraphOptimize(OpDesc* op_desc) const { ...@@ -194,7 +194,8 @@ void MemoryOptimizePass::SubGraphOptimize(OpDesc* op_desc) const {
// effect. Because it is a single op in graph. No need to // effect. Because it is a single op in graph. No need to
// update the ir nodes. // update the ir nodes.
sub_op_desc->Rename(var->Name(), cache->Name()); sub_op_desc->Rename(var->Name(), cache->Name());
if (sub_op_desc->Block()->HasVar(var->Name())) { if (sub_op_desc->Block() != nullptr &&
sub_op_desc->Block()->HasVar(var->Name())) {
sub_op_desc->Block()->RemoveVar(var->Name()); sub_op_desc->Block()->RemoveVar(var->Name());
} }
} }
...@@ -235,7 +236,13 @@ void MemoryOptimizePass::RenameVarInGraphDesc(const std::string& var, ...@@ -235,7 +236,13 @@ void MemoryOptimizePass::RenameVarInGraphDesc(const std::string& var,
auto* op_desc = op->Op(); auto* op_desc = op->Op();
op_desc->RenameInput(var, cache_var); op_desc->RenameInput(var, cache_var);
op_desc->RenameOutput(var, cache_var); op_desc->RenameOutput(var, cache_var);
if (op_desc->Block()->HasVar(var)) op_desc->Block()->RemoveVar(var); if (op_desc->Block() != nullptr) {
op_desc->Block()->RemoveVar(var);
} else {
LOG(WARNING) << "op " << op->Name() << " not know its block."
<< "Is the op_desc created without block pointer? "
<< "Can not find " << var << " in Block(0)";
}
op_desc->Flush(); op_desc->Flush();
} }
} }
......
...@@ -392,20 +392,32 @@ void MultiDevSSAGraphBuilderBase::CreateComputationalOp(ir::Graph *result, ...@@ -392,20 +392,32 @@ void MultiDevSSAGraphBuilderBase::CreateComputationalOp(ir::Graph *result,
void MultiDevSSAGraphBuilderBase::CreateAllReduceOp( void MultiDevSSAGraphBuilderBase::CreateAllReduceOp(
ir::Graph *result, const std::string &og) const { ir::Graph *result, const std::string &og) const {
OpHandleBase *op_handle = nullptr;
auto append_allreduce_op = [&](
const std::vector<Scope *> &scopes,
const std::vector<platform::Place> &places) -> OpHandleBase * {
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
result->Get<GraphOps>(kGraphOps).emplace_back(new AllReduceOpHandle( result->Get<GraphOps>(kGraphOps).emplace_back(new AllReduceOpHandle(
result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation), result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation),
local_scopes_, places_, nccl_ctxs_)); scopes, places, nccl_ctxs_));
#else #else
result->Get<GraphOps>(kGraphOps).emplace_back(new AllReduceOpHandle( result->Get<GraphOps>(kGraphOps).emplace_back(new AllReduceOpHandle(
result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation), result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation),
local_scopes_, places_)); scopes, places));
#endif #endif
auto *op_handle = result->Get<GraphOps>(kGraphOps).back(); return result->Get<GraphOps>(kGraphOps).back();
};
if (!strategy_.enable_parallel_graph_)
op_handle = append_allreduce_op(local_scopes_, places_);
for (size_t i = 0; i < places_.size(); ++i) { for (size_t i = 0; i < places_.size(); ++i) {
auto &p = places_[i]; if (strategy_.enable_parallel_graph_) {
SetCommunicationContext(op_handle, p); op_handle = append_allreduce_op({local_scopes_[i]}, {places_[i]});
}
SetCommunicationContext(op_handle, places_[i]);
auto &vars = result->Get<GraphVars>(kGraphVars)[i][og]; auto &vars = result->Get<GraphVars>(kGraphVars)[i][og];
PADDLE_ENFORCE(!vars.empty()); PADDLE_ENFORCE(!vars.empty());
auto &prev_grad = vars.back(); auto &prev_grad = vars.back();
...@@ -413,7 +425,7 @@ void MultiDevSSAGraphBuilderBase::CreateAllReduceOp( ...@@ -413,7 +425,7 @@ void MultiDevSSAGraphBuilderBase::CreateAllReduceOp(
auto var = auto var =
new VarHandle(result->CreateEmptyNode(og, ir::Node::Type::kVariable), new VarHandle(result->CreateEmptyNode(og, ir::Node::Type::kVariable),
vars.size(), i, og, p); vars.size(), i, og, places_[i]);
vars.emplace_back(var); vars.emplace_back(var);
op_handle->AddOutput(var); op_handle->AddOutput(var);
} }
......
...@@ -36,13 +36,14 @@ namespace details { ...@@ -36,13 +36,14 @@ namespace details {
// map from variable name to variables. The variables, who have the same name, // map from variable name to variables. The variables, who have the same name,
// will have a different version. The offset in the // will have a different version. The offset in the
// `std::vector<VarHandle*>` is the version of variables. // `std::vector<VarHandle*>` is the version of variables.
typedef std::vector<std::unordered_map<std::string, std::vector<VarHandle*>>> typedef std::vector<std::unordered_map<std::string, std::vector<VarHandle *>>>
GraphVars; GraphVars;
const char kGraphVars[] = "vars"; const char kGraphVars[] = "vars";
// aux variables to represent dependency. Useful to resolve data hazard. // aux variables to represent dependency. Useful to resolve data hazard.
typedef std::unordered_set<VarHandleBase*> GraphDepVars; typedef std::unordered_set<VarHandleBase *> GraphDepVars;
const char kGraphDepVars[] = "dep_vars"; const char kGraphDepVars[] = "dep_vars";
} // namespace details } // namespace details
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
...@@ -70,6 +70,9 @@ class OpHandleBase { ...@@ -70,6 +70,9 @@ class OpHandleBase {
auto it = dev_ctxes_.find(place); auto it = dev_ctxes_.find(place);
return it != dev_ctxes_.end() ? it->second : nullptr; return it != dev_ctxes_.end() ? it->second : nullptr;
} }
const std::map<platform::Place, platform::DeviceContext *> &DeviceContext() {
return dev_ctxes_;
}
void SetDeviceContext(platform::Place place, platform::DeviceContext *ctx_) { void SetDeviceContext(platform::Place place, platform::DeviceContext *ctx_) {
dev_ctxes_[place] = ctx_; dev_ctxes_[place] = ctx_;
......
...@@ -13,22 +13,92 @@ ...@@ -13,22 +13,92 @@
// limitations under the License. // limitations under the License.
#include "paddle/fluid/framework/details/parallel_ssa_graph_executor.h" #include "paddle/fluid/framework/details/parallel_ssa_graph_executor.h"
#include "paddle/fluid/framework/ir/graph_helper.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
namespace details { namespace details {
// Splits the multi-device SSA graph into one sub-graph per entry of places_.
//
// Each op handle node is moved into the sub-graph selected by the CUDA device
// id of the first place in the op's device-context map, together with any
// DummyVarHandle inputs/outputs of that op (which are also recorded in the
// sub-graph's GraphDepVars). Afterwards the per-device variable table of
// `graph` is copied into slot 0 of the matching sub-graph's GraphVars and the
// corresponding variable nodes are moved over. Nodes are transferred with
// RemoveNode + AddNode, so they no longer belong to `graph` on return.
//
// NOTE(review): the CUDA device id is used directly as the index into the
// per-place containers; this presumably relies on places_ being devices
// 0..N-1 in order -- confirm against callers. An id outside that range now
// fails an explicit check instead of indexing out of bounds.
std::vector<std::unique_ptr<ir::Graph>>
ParallelSSAGraphExecutor::SeparateMultiDevicesGraph(
    std::unique_ptr<ir::Graph> &&graph) {
  std::vector<std::unique_ptr<ir::Graph>> graphs;
  graphs.reserve(places_.size());
  for (size_t i = 0; i < places_.size(); ++i) {
    ProgramDesc empty;
    graphs.emplace_back(std::unique_ptr<ir::Graph>(new ir::Graph(empty)));
    auto &g = graphs.back();
    // Each sub-graph describes exactly one device, hence a single var table.
    g->Set(kGraphVars, new GraphVars(1UL));
    g->Set(kGraphDepVars, new GraphDepVars);
  }

  auto op_handles = ir::FilterByNodeWrapper<OpHandleBase>(*graph);
  for (auto &op : op_handles) {
    auto &dev_ctx = op->DeviceContext();
    // begin() on an empty map must not be dereferenced.
    PADDLE_ENFORCE(!dev_ctx.empty(),
                   "The op handle should have at least one device context.");
    auto &p = dev_ctx.begin()->first;
    int dev_id = boost::get<platform::CUDAPlace>(p).device;
    // dev_id indexes the per-place sub-graphs; reject ids that do not map to
    // one of them rather than reading past the end of the vector.
    PADDLE_ENFORCE(dev_id >= 0 && static_cast<size_t>(dev_id) < graphs.size(),
                   "The device id of the op handle is out of the range of "
                   "places.");

    auto &dev_graph = graphs[dev_id];
    auto &dev_dummys = dev_graph->Get<GraphDepVars>(kGraphDepVars);
    dev_graph->AddNode(graph->RemoveNode(op->Node()).release());

    // Dummy vars only express dependencies; they follow the op that touches
    // them. A dummy shared by several ops is moved once (first op wins), which
    // is why ownership by the source graph is re-checked before moving.
    auto move_if_dummy = [&](VarHandleBase *var) {
      if (dynamic_cast<DummyVarHandle *>(var) == nullptr) return;
      dev_dummys.insert(var);
      if (graph->Nodes().count(var->Node())) {
        dev_graph->AddNode(graph->RemoveNode(var->Node()).release());
      }
    };
    for (auto &var : op->Inputs()) move_if_dummy(var);
    for (auto &var : op->Outputs()) move_if_dummy(var);
  }

  for (size_t dev_id = 0; dev_id < places_.size(); ++dev_id) {
    auto &dev_vars = graphs[dev_id]->Get<GraphVars>(kGraphVars)[0];
    auto &origin_vars = graph->Get<GraphVars>(kGraphVars)[dev_id];
    for (auto &name_pair : origin_vars) {
      dev_vars.emplace(name_pair.first, name_pair.second);
      // Move the node of every version of this variable that the source graph
      // still owns.
      for (auto &version_pair : name_pair.second) {
        if (graph->Nodes().count(version_pair->Node())) {
          graphs[dev_id]->AddNode(
              graph->RemoveNode(version_pair->Node()).release());
        }
      }
    }
  }

  return graphs;
}
ParallelSSAGraphExecutor::ParallelSSAGraphExecutor( ParallelSSAGraphExecutor::ParallelSSAGraphExecutor(
const ExecutionStrategy &strategy, const std::vector<Scope *> &local_scopes, const ExecutionStrategy &strategy, const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places, const std::vector<platform::Place> &places,
std::vector<std::unique_ptr<ir::Graph>> &&graphs) const framework::ProgramDesc &main_prog, std::unique_ptr<ir::Graph> &&graph)
: strategy_(std::move(strategy)), : strategy_(std::move(strategy)),
local_scopes_(std::move(local_scopes)), local_scopes_(std::move(local_scopes)),
pool_(places.size() >= 2 ? new ::ThreadPool(places.size()) : nullptr), pool_(places.size() >= 2 ? new ::ThreadPool(places.size()) : nullptr),
places_(std::move(places)), places_(std::move(places)),
graphs_(std::move(graphs)) { main_prog_(main_prog),
// TODO(Yancey1989): Copying graphs is not safely since it deleted the
// attrs.
graphs_(SeparateMultiDevicesGraph(std::move(graph))) {
PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size()); PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size());
auto seq_allreduce_pass =
ir::PassRegistry::Instance().Get("all_reduce_deps_pass");
seq_allreduce_pass->Erase(details::kAllOpDescs);
seq_allreduce_pass->Set<const std::vector<OpDesc *>>(
details::kAllOpDescs,
new std::vector<OpDesc *>(main_prog_.Block(0).AllOps()));
for (size_t i = 0; i < graphs_.size(); ++i) {
graphs_[i] = seq_allreduce_pass->Apply(std::move(graphs_[i]));
}
// set the correct size of thread pool to each device. // set the correct size of thread pool to each device.
strategy_.num_threads_ = strategy_.num_threads_ < places_.size() strategy_.num_threads_ = strategy_.num_threads_ < places_.size()
? 1UL ? 1UL
...@@ -37,7 +107,7 @@ ParallelSSAGraphExecutor::ParallelSSAGraphExecutor( ...@@ -37,7 +107,7 @@ ParallelSSAGraphExecutor::ParallelSSAGraphExecutor(
<< " to run the operators of the graph on each device."; << " to run the operators of the graph on each device.";
for (size_t i = 0; i < places.size(); ++i) { for (size_t i = 0; i < places.size(); ++i) {
executors_.emplace_back(new details::ThreadedSSAGraphExecutor( executors_.emplace_back(new details::ThreadedSSAGraphExecutor(
strategy_, {local_scopes_[i]}, {places_[i]}, std::move(graphs_[i]))); strategy_, local_scopes_, {places_[i]}, std::move(graphs_.at(i))));
} }
} }
......
...@@ -18,7 +18,9 @@ ...@@ -18,7 +18,9 @@
#include <vector> #include <vector>
#include "ThreadPool.h" #include "ThreadPool.h"
#include "paddle/fluid/framework/details/multi_devices_helper.h"
#include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h" #include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h"
#include "paddle/fluid/framework/ir/graph.h"
namespace paddle { namespace paddle {
namespace framework { namespace framework {
...@@ -29,17 +31,23 @@ class ParallelSSAGraphExecutor : public SSAGraphExecutor { ...@@ -29,17 +31,23 @@ class ParallelSSAGraphExecutor : public SSAGraphExecutor {
ParallelSSAGraphExecutor(const ExecutionStrategy &strategy, ParallelSSAGraphExecutor(const ExecutionStrategy &strategy,
const std::vector<Scope *> &local_scopes, const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places, const std::vector<platform::Place> &places,
std::vector<std::unique_ptr<ir::Graph>> &&graphs); const framework::ProgramDesc &main_prog,
std::unique_ptr<ir::Graph> &&graph);
~ParallelSSAGraphExecutor() final = default; ~ParallelSSAGraphExecutor() final = default;
const ir::Graph &Graph() const override { return *graphs_[0]; } const ir::Graph &Graph() const override { return *graphs_[0]; }
FeedFetchList Run(const std::vector<std::string> &fetch_tensors) override; FeedFetchList Run(const std::vector<std::string> &fetch_tensors) override;
private: private:
std::vector<std::unique_ptr<ir::Graph>> SeparateMultiDevicesGraph(
std::unique_ptr<ir::Graph> &&graph);
ExecutionStrategy strategy_; ExecutionStrategy strategy_;
std::vector<Scope *> local_scopes_; std::vector<Scope *> local_scopes_;
std::unique_ptr<::ThreadPool> pool_{nullptr}; std::unique_ptr<::ThreadPool> pool_{nullptr};
std::vector<platform::Place> places_; std::vector<platform::Place> places_;
framework::ProgramDesc main_prog_;
std::vector<std::unique_ptr<ir::Graph>> graphs_; std::vector<std::unique_ptr<ir::Graph>> graphs_;
std::vector<std::unique_ptr<details::ThreadedSSAGraphExecutor>> executors_; std::vector<std::unique_ptr<details::ThreadedSSAGraphExecutor>> executors_;
......
...@@ -139,7 +139,7 @@ void ReduceOpHandle::GatherSelectedRows( ...@@ -139,7 +139,7 @@ void ReduceOpHandle::GatherSelectedRows(
#endif #endif
void ReduceOpHandle::RunImpl() { void ReduceOpHandle::RunImpl() {
platform::RecordEvent record_event(Name(), dev_ctxes_.cbegin()->second); platform::RecordEvent record_event(Name());
if (places_.size() == 1) return; if (places_.size() == 1) return;
// the input and output may have dummy var. // the input and output may have dummy var.
...@@ -153,7 +153,7 @@ void ReduceOpHandle::RunImpl() { ...@@ -153,7 +153,7 @@ void ReduceOpHandle::RunImpl() {
{ {
auto out_var_handles = DynamicCast<VarHandle>(outputs_); auto out_var_handles = DynamicCast<VarHandle>(outputs_);
PADDLE_ENFORCE_EQ(out_var_handles.size(), 1, PADDLE_ENFORCE_EQ(out_var_handles.size(), 1UL,
"The number of output should be one."); "The number of output should be one.");
out_var_handle = out_var_handles.front(); out_var_handle = out_var_handles.front();
} }
......
...@@ -63,7 +63,7 @@ FeedFetchList ScopeBufferedSSAGraphExecutor::Run( ...@@ -63,7 +63,7 @@ FeedFetchList ScopeBufferedSSAGraphExecutor::Run(
eptr = std::current_exception(); eptr = std::current_exception();
} }
platform::RecordEvent e("ScopeBufferedSSAGraphExecutorAfterRun", nullptr); platform::RecordEvent e("ScopeBufferedSSAGraphExecutorAfterRun");
++drop_scope_counter_; ++drop_scope_counter_;
bool stream_end = false; bool stream_end = false;
......
...@@ -37,7 +37,7 @@ ThreadedSSAGraphExecutor::ThreadedSSAGraphExecutor( ...@@ -37,7 +37,7 @@ ThreadedSSAGraphExecutor::ThreadedSSAGraphExecutor(
FeedFetchList ThreadedSSAGraphExecutor::Run( FeedFetchList ThreadedSSAGraphExecutor::Run(
const std::vector<std::string> &fetch_tensors) { const std::vector<std::string> &fetch_tensors) {
std::unique_ptr<platform::RecordEvent> event( std::unique_ptr<platform::RecordEvent> event(
new platform::RecordEvent("ThreadedSSAGraphExecutorPrepare", nullptr)); new platform::RecordEvent("ThreadedSSAGraphExecutorPrepare"));
std::unordered_map<OpHandleBase *, size_t> pending_ops; std::unordered_map<OpHandleBase *, size_t> pending_ops;
std::unordered_set<VarHandleBase *> pending_vars; std::unordered_set<VarHandleBase *> pending_vars;
auto ready_vars = std::make_shared<BlockingQueue<VarHandleBase *>>(); auto ready_vars = std::make_shared<BlockingQueue<VarHandleBase *>>();
...@@ -219,7 +219,7 @@ void ThreadedSSAGraphExecutor::RunOp( ...@@ -219,7 +219,7 @@ void ThreadedSSAGraphExecutor::RunOp(
VLOG(10) << op << " " << op->Name() << " Done "; VLOG(10) << op << " " << op->Name() << " Done ";
running_ops_--; running_ops_--;
ready_var_q->Extend(op->Outputs()); ready_var_q->Extend(op->Outputs());
VLOG(10) << op << " " << op->Name() << "Signal posted"; VLOG(10) << op << " " << op->Name() << " Signal posted";
} catch (...) { } catch (...) {
exception_holder_.Catch(std::current_exception()); exception_holder_.Catch(std::current_exception());
} }
......
...@@ -102,6 +102,7 @@ cc_test(test_seqpool_concat_fuse_pass SRCS seqpool_concat_fuse_pass_tester.cc DE ...@@ -102,6 +102,7 @@ cc_test(test_seqpool_concat_fuse_pass SRCS seqpool_concat_fuse_pass_tester.cc DE
cc_test(test_is_test_pass SRCS is_test_pass_tester.cc DEPS is_test_pass) cc_test(test_is_test_pass SRCS is_test_pass_tester.cc DEPS is_test_pass)
if (WITH_MKLDNN) if (WITH_MKLDNN)
cc_test(test_depthwise_conv_mkldnn_pass SRCS mkldnn/depthwise_conv_mkldnn_pass_tester.cc DEPS depthwise_conv_mkldnn_pass) cc_test(test_depthwise_conv_mkldnn_pass SRCS mkldnn/depthwise_conv_mkldnn_pass_tester.cc DEPS depthwise_conv_mkldnn_pass)
cc_test(test_conv_bias_mkldnn_fuse_pass SRCS mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc DEPS conv_bias_mkldnn_fuse_pass naive_executor)
cc_test(test_conv_relu_mkldnn_fuse_pass SRCS mkldnn/conv_relu_mkldnn_fuse_pass_tester.cc DEPS conv_relu_mkldnn_fuse_pass) cc_test(test_conv_relu_mkldnn_fuse_pass SRCS mkldnn/conv_relu_mkldnn_fuse_pass_tester.cc DEPS conv_relu_mkldnn_fuse_pass)
cc_test(test_conv_elementwise_add_mkldnn_fuse_pass SRCS mkldnn/conv_elementwise_add_mkldnn_fuse_pass_tester.cc DEPS conv_elementwise_add_mkldnn_fuse_pass) cc_test(test_conv_elementwise_add_mkldnn_fuse_pass SRCS mkldnn/conv_elementwise_add_mkldnn_fuse_pass_tester.cc DEPS conv_elementwise_add_mkldnn_fuse_pass)
endif () endif ()
...@@ -22,7 +22,8 @@ namespace ir { ...@@ -22,7 +22,8 @@ namespace ir {
class AttentionLSTMFusePass : public FusePassBase { class AttentionLSTMFusePass : public FusePassBase {
protected: protected:
std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const; std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override;
}; };
} // namespace ir } // namespace ir
......
...@@ -31,7 +31,8 @@ class ConvAffineChannelFusePass : public FusePassBase { ...@@ -31,7 +31,8 @@ class ConvAffineChannelFusePass : public FusePassBase {
virtual ~ConvAffineChannelFusePass() {} virtual ~ConvAffineChannelFusePass() {}
protected: protected:
std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const; std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override;
const std::string name_scope_{"conv_affine_channel_fuse"}; const std::string name_scope_{"conv_affine_channel_fuse"};
}; };
...@@ -40,7 +41,8 @@ class ConvEltwiseAddAffineChannelFusePass : public FusePassBase { ...@@ -40,7 +41,8 @@ class ConvEltwiseAddAffineChannelFusePass : public FusePassBase {
virtual ~ConvEltwiseAddAffineChannelFusePass() {} virtual ~ConvEltwiseAddAffineChannelFusePass() {}
protected: protected:
std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const; std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override;
const std::string name_scope_{"conv_eltwiseadd_affine_channel_fuse"}; const std::string name_scope_{"conv_eltwiseadd_affine_channel_fuse"};
}; };
......
...@@ -169,7 +169,7 @@ std::unique_ptr<ir::Graph> ConvBNFusePass::ApplyImpl( ...@@ -169,7 +169,7 @@ std::unique_ptr<ir::Graph> ConvBNFusePass::ApplyImpl(
if (has_bias && conv->Op()->Input("Bias").size() > 0) { if (has_bias && conv->Op()->Input("Bias").size() > 0) {
// reuse existing conv bias node // reuse existing conv bias node
auto conv_bias_names = conv->Op()->Input("Bias"); auto conv_bias_names = conv->Op()->Input("Bias");
PADDLE_ENFORCE_EQ(conv_bias_names.size(), 1); PADDLE_ENFORCE_EQ(conv_bias_names.size(), 1UL);
auto* conv_bias_var = scope->FindVar(conv_bias_names[0]); auto* conv_bias_var = scope->FindVar(conv_bias_names[0]);
auto* conv_bias_tensor = conv_bias_var->GetMutable<LoDTensor>(); auto* conv_bias_tensor = conv_bias_var->GetMutable<LoDTensor>();
PADDLE_ENFORCE_EQ(conv_bias_tensor->dims(), PADDLE_ENFORCE_EQ(conv_bias_tensor->dims(),
......
...@@ -31,7 +31,8 @@ class ConvBNFusePass : public FusePassBase { ...@@ -31,7 +31,8 @@ class ConvBNFusePass : public FusePassBase {
virtual ~ConvBNFusePass() {} virtual ~ConvBNFusePass() {}
protected: protected:
std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const; std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override;
const std::string name_scope_{"conv_bn_fuse"}; const std::string name_scope_{"conv_bn_fuse"};
}; };
...@@ -40,7 +41,8 @@ class ConvEltwiseAddBNFusePass : public FusePassBase { ...@@ -40,7 +41,8 @@ class ConvEltwiseAddBNFusePass : public FusePassBase {
virtual ~ConvEltwiseAddBNFusePass() {} virtual ~ConvEltwiseAddBNFusePass() {}
protected: protected:
std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const; std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override;
const std::string name_scope_{"conv_eltwiseadd_bn_fuse"}; const std::string name_scope_{"conv_eltwiseadd_bn_fuse"};
}; };
......
...@@ -25,7 +25,8 @@ class ConvElementwiseAdd2ActFusePass : public FusePassBase { ...@@ -25,7 +25,8 @@ class ConvElementwiseAdd2ActFusePass : public FusePassBase {
virtual ~ConvElementwiseAdd2ActFusePass() {} virtual ~ConvElementwiseAdd2ActFusePass() {}
protected: protected:
std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const; std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override;
}; };
} // namespace ir } // namespace ir
......
...@@ -25,7 +25,8 @@ class ConvElementwiseAddActFusePass : public FusePassBase { ...@@ -25,7 +25,8 @@ class ConvElementwiseAddActFusePass : public FusePassBase {
virtual ~ConvElementwiseAddActFusePass() {} virtual ~ConvElementwiseAddActFusePass() {}
protected: protected:
std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const; std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override;
}; };
} // namespace ir } // namespace ir
......
...@@ -25,7 +25,8 @@ class ConvElementwiseAddFusePass : public FusePassBase { ...@@ -25,7 +25,8 @@ class ConvElementwiseAddFusePass : public FusePassBase {
virtual ~ConvElementwiseAddFusePass() {} virtual ~ConvElementwiseAddFusePass() {}
protected: protected:
std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const; std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override;
}; };
} // namespace ir } // namespace ir
......
...@@ -14,6 +14,8 @@ ...@@ -14,6 +14,8 @@
#pragma once #pragma once
#include <string>
#include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
...@@ -30,7 +32,8 @@ class EmbeddingFCLSTMFusePass : public FusePassBase { ...@@ -30,7 +32,8 @@ class EmbeddingFCLSTMFusePass : public FusePassBase {
virtual ~EmbeddingFCLSTMFusePass() {} virtual ~EmbeddingFCLSTMFusePass() {}
protected: protected:
std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const; std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override;
const std::string name_scope_{"embedding_fc_lstm_fuse"}; const std::string name_scope_{"embedding_fc_lstm_fuse"};
}; };
......
...@@ -12,6 +12,8 @@ ...@@ -12,6 +12,8 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#pragma once
#include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
...@@ -29,7 +31,8 @@ class FCFusePass : public FusePassBase { ...@@ -29,7 +31,8 @@ class FCFusePass : public FusePassBase {
virtual ~FCFusePass() {} virtual ~FCFusePass() {}
protected: protected:
std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const; std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override;
}; };
} // namespace ir } // namespace ir
......
...@@ -30,7 +30,8 @@ class FCGRUFusePass : public FusePassBase { ...@@ -30,7 +30,8 @@ class FCGRUFusePass : public FusePassBase {
virtual ~FCGRUFusePass() {} virtual ~FCGRUFusePass() {}
protected: protected:
std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const; std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override;
const std::string name_scope_{"fc_gru_fuse"}; const std::string name_scope_{"fc_gru_fuse"};
}; };
...@@ -41,7 +42,8 @@ class MulGRUFusePass : public FusePassBase { ...@@ -41,7 +42,8 @@ class MulGRUFusePass : public FusePassBase {
virtual ~MulGRUFusePass() {} virtual ~MulGRUFusePass() {}
protected: protected:
std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const; std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override;
const std::string name_scope_{"fc_nobias_gru_fuse"}; const std::string name_scope_{"fc_nobias_gru_fuse"};
}; };
......
...@@ -14,6 +14,8 @@ ...@@ -14,6 +14,8 @@
#pragma once #pragma once
#include <string>
#include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h" #include "paddle/fluid/framework/ir/graph_pattern_detector.h"
...@@ -30,7 +32,8 @@ class FCLstmFusePass : public FusePassBase { ...@@ -30,7 +32,8 @@ class FCLstmFusePass : public FusePassBase {
virtual ~FCLstmFusePass() {} virtual ~FCLstmFusePass() {}
protected: protected:
std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const; std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override;
const std::string name_scope_{"fc_lstm_fuse"}; const std::string name_scope_{"fc_lstm_fuse"};
}; };
...@@ -40,7 +43,8 @@ class MulLstmFusePass : public FusePassBase { ...@@ -40,7 +43,8 @@ class MulLstmFusePass : public FusePassBase {
virtual ~MulLstmFusePass() {} virtual ~MulLstmFusePass() {}
protected: protected:
std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const; std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override;
const std::string name_scope_{"fc_nobias_lstm_fuse"}; const std::string name_scope_{"fc_nobias_lstm_fuse"};
}; };
......
...@@ -32,7 +32,8 @@ class FuseElewiseAddActPass : public FusePassBase { ...@@ -32,7 +32,8 @@ class FuseElewiseAddActPass : public FusePassBase {
virtual ~FuseElewiseAddActPass() {} virtual ~FuseElewiseAddActPass() {}
protected: protected:
std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const; std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override;
std::unique_ptr<ir::Graph> FuseElewiseAddAct( std::unique_ptr<ir::Graph> FuseElewiseAddAct(
std::unique_ptr<ir::Graph> graph, std::unique_ptr<ir::Graph> graph,
......
...@@ -111,7 +111,7 @@ std::unique_ptr<ir::Graph> FuseReluDepthwiseConvPass::FuseReluDepthwiseConv( ...@@ -111,7 +111,7 @@ std::unique_ptr<ir::Graph> FuseReluDepthwiseConvPass::FuseReluDepthwiseConv(
xg_var = subgraph.at(xg)->Var(); xg_var = subgraph.at(xg)->Var();
} }
PADDLE_ENFORCE_EQ(layer_op->Input("Input").size(), 1); PADDLE_ENFORCE_EQ(layer_op->Input("Input").size(), 1UL);
PADDLE_ENFORCE_EQ(layer_op->Input("Input")[0], y_var->Name()); PADDLE_ENFORCE_EQ(layer_op->Input("Input")[0], y_var->Name());
layer_op->SetInput("Input", {x_var->Name()}); layer_op->SetInput("Input", {x_var->Name()});
subgraph.at(layer)->inputs.push_back(subgraph.at(x)); subgraph.at(layer)->inputs.push_back(subgraph.at(x));
...@@ -119,13 +119,13 @@ std::unique_ptr<ir::Graph> FuseReluDepthwiseConvPass::FuseReluDepthwiseConv( ...@@ -119,13 +119,13 @@ std::unique_ptr<ir::Graph> FuseReluDepthwiseConvPass::FuseReluDepthwiseConv(
VLOG(4) << "replace " << y_var->Name() << " -> " << x_var->Name(); VLOG(4) << "replace " << y_var->Name() << " -> " << x_var->Name();
if (!only_forward) { if (!only_forward) {
PADDLE_ENFORCE_EQ(layer_g_op->Input("Input").size(), 1); PADDLE_ENFORCE_EQ(layer_g_op->Input("Input").size(), 1UL);
PADDLE_ENFORCE_EQ(layer_g_op->Input("Input")[0], y_var->Name()); PADDLE_ENFORCE_EQ(layer_g_op->Input("Input")[0], y_var->Name());
layer_g_op->SetInput("Input", {x_var->Name()}); layer_g_op->SetInput("Input", {x_var->Name()});
subgraph.at(layer_g)->inputs.push_back(subgraph.at(x)); subgraph.at(layer_g)->inputs.push_back(subgraph.at(x));
subgraph.at(x)->outputs.push_back(subgraph.at(layer_g)); subgraph.at(x)->outputs.push_back(subgraph.at(layer_g));
PADDLE_ENFORCE_EQ(layer_g_op->Output(GradVarName("Input")).size(), 1); PADDLE_ENFORCE_EQ(layer_g_op->Output(GradVarName("Input")).size(), 1UL);
PADDLE_ENFORCE_EQ(layer_g_op->Output(GradVarName("Input"))[0], PADDLE_ENFORCE_EQ(layer_g_op->Output(GradVarName("Input"))[0],
yg_var->Name()); yg_var->Name());
layer_g_op->SetOutput(GradVarName("Input"), {xg_var->Name()}); layer_g_op->SetOutput(GradVarName("Input"), {xg_var->Name()});
......
...@@ -32,7 +32,8 @@ class FuseReluDepthwiseConvPass : public FusePassBase { ...@@ -32,7 +32,8 @@ class FuseReluDepthwiseConvPass : public FusePassBase {
virtual ~FuseReluDepthwiseConvPass() {} virtual ~FuseReluDepthwiseConvPass() {}
protected: protected:
std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const; std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override;
std::unique_ptr<ir::Graph> FuseReluDepthwiseConv( std::unique_ptr<ir::Graph> FuseReluDepthwiseConv(
std::unique_ptr<ir::Graph> graph, bool only_forward) const; std::unique_ptr<ir::Graph> graph, bool only_forward) const;
}; };
......
...@@ -26,6 +26,14 @@ limitations under the License. */ ...@@ -26,6 +26,14 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace framework { namespace framework {
namespace details {
// Using this attr is not recommended, because the graph should not depend on
// the program once it has been built.
constexpr char kAllOpDescs[] = "all_op_descs";
} // namespace details
namespace ir { namespace ir {
/* /*
...@@ -168,10 +176,13 @@ class Graph { ...@@ -168,10 +176,13 @@ class Graph {
return ret; return ret;
} }
void RemoveNode(ir::Node *node) { std::unique_ptr<ir::Node> RemoveNode(ir::Node *node) {
PADDLE_ENFORCE(node_set_.find(node) != node_set_.end()); PADDLE_ENFORCE(node_set_.find(node) != node_set_.end());
node_set_.erase(node); std::unique_ptr<ir::Node> ret;
ret.reset(nodes_.at(node).release());
nodes_.erase(node); nodes_.erase(node);
node_set_.erase(node);
return ret;
} }
// NOTE low performance, but simple and secure. // NOTE low performance, but simple and secure.
...@@ -184,13 +195,6 @@ class Graph { ...@@ -184,13 +195,6 @@ class Graph {
return nullptr; return nullptr;
} }
void ResolveHazard(
const std::map<std::string, std::vector<ir::Node *>> &var_nodes);
private:
std::map<std::string, std::vector<ir::Node *>> InitFromProgram(
const ProgramDesc &program);
// This method takes ownership of `node`. // This method takes ownership of `node`.
ir::Node *AddNode(ir::Node *node) { ir::Node *AddNode(ir::Node *node) {
PADDLE_ENFORCE(node_set_.find(node) == node_set_.end()); PADDLE_ENFORCE(node_set_.find(node) == node_set_.end());
...@@ -199,6 +203,13 @@ class Graph { ...@@ -199,6 +203,13 @@ class Graph {
return node; return node;
} }
void ResolveHazard(
const std::map<std::string, std::vector<ir::Node *>> &var_nodes);
private:
std::map<std::string, std::vector<ir::Node *>> InitFromProgram(
const ProgramDesc &program);
// NOTE: program_ shouldn't be exposed to user. // NOTE: program_ shouldn't be exposed to user.
const ProgramDesc program_; const ProgramDesc program_;
std::map<std::string, boost::any> attrs_; std::map<std::string, boost::any> attrs_;
......
...@@ -38,7 +38,7 @@ size_t PDPattern::id_ = 0UL; ...@@ -38,7 +38,7 @@ size_t PDPattern::id_ = 0UL;
PDNode *PDPattern::NewNode(const std::string &name) { PDNode *PDPattern::NewNode(const std::string &name) {
if (!name.empty()) { if (!name.empty()) {
PADDLE_ENFORCE_EQ(node_map_.count(name), 0, PADDLE_ENFORCE_EQ(node_map_.count(name), 0UL,
"PDNode's name should be unique, get duplicate [%s]", "PDNode's name should be unique, get duplicate [%s]",
name); name);
} }
...@@ -51,7 +51,7 @@ PDNode *PDPattern::NewNode(const std::string &name) { ...@@ -51,7 +51,7 @@ PDNode *PDPattern::NewNode(const std::string &name) {
PDNode *PDPattern::NewNode(PDNode::teller_t &&teller, const std::string &name) { PDNode *PDPattern::NewNode(PDNode::teller_t &&teller, const std::string &name) {
if (!name.empty()) { if (!name.empty()) {
PADDLE_ENFORCE_EQ(node_map_.count(name), 0, PADDLE_ENFORCE_EQ(node_map_.count(name), 0UL,
"PDNode's name should be unique, get duplicate [%s]", "PDNode's name should be unique, get duplicate [%s]",
name); name);
} }
......
...@@ -22,7 +22,8 @@ namespace ir { ...@@ -22,7 +22,8 @@ namespace ir {
class IdentityScaleOpCleanPass : public FusePassBase { class IdentityScaleOpCleanPass : public FusePassBase {
protected: protected:
std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const; std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override;
private: private:
virtual ~IdentityScaleOpCleanPass() = default; virtual ~IdentityScaleOpCleanPass() = default;
......
...@@ -60,7 +60,8 @@ class LockFreeOptimizePass : public Pass { ...@@ -60,7 +60,8 @@ class LockFreeOptimizePass : public Pass {
virtual ~LockFreeOptimizePass() {} virtual ~LockFreeOptimizePass() {}
protected: protected:
std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const; std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override;
private: private:
// Create a new sgd node via current optimizer node // Create a new sgd node via current optimizer node
......
...@@ -29,7 +29,8 @@ class ConvBiasFusePass : public FusePassBase { ...@@ -29,7 +29,8 @@ class ConvBiasFusePass : public FusePassBase {
virtual bool is_conv3d() const { return false; } virtual bool is_conv3d() const { return false; }
protected: protected:
std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const; std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override;
const std::string name_scope_{"conv_bias_mkldnn_fuse"}; const std::string name_scope_{"conv_bias_mkldnn_fuse"};
}; };
/* /*
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.h"
#include "paddle/fluid/framework/naive_executor.h"
#include "paddle/fluid/platform/place.h"
#include <gtest/gtest.h>
#include "paddle/fluid/framework/op_proto_maker.h"
namespace paddle {
namespace framework {
namespace ir {
// Appends one operator of the given `type` to block 0 of `prog`.
//
// - "conv2d": enables use_mkldnn, stores `name` in the "name" attribute and
//   wires inputs[0] -> "Input", inputs[1] -> "Filter". "Bias" is set to
//   inputs[2] when a third input is supplied and to an empty list otherwise.
// - "elementwise_add": enables use_mkldnn and wires inputs[0] -> "X",
//   inputs[1] -> "Y".
// - any other type: no inputs or mkldnn attribute are set.
//
// In every case `outputs` is bound to "Out" and the op role attribute is set
// to OpRole::kForward. Callers must pass at least two inputs for the two
// recognized op types; this is not checked here.
void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name,
           const std::vector<std::string>& inputs,
           const std::vector<std::string>& outputs) {
  auto* op_desc = prog->MutableBlock(0)->AppendOp();
  op_desc->SetType(type);

  const bool is_conv = (type == "conv2d");
  const bool is_eltwise_add = (type == "elementwise_add");

  if (is_conv) {
    op_desc->SetAttr("use_mkldnn", true);
    op_desc->SetAttr("name", name);
    op_desc->SetInput("Input", {inputs[0]});
    op_desc->SetInput("Filter", {inputs[1]});

    // The Bias slot is always present; it stays empty without a third input.
    std::vector<std::string> bias_args;
    if (inputs.size() > 2) {
      bias_args.push_back(inputs[2]);
    }
    op_desc->SetInput("Bias", bias_args);
  } else if (is_eltwise_add) {
    op_desc->SetAttr("use_mkldnn", true);
    op_desc->SetInput("X", {inputs[0]});
    op_desc->SetInput("Y", {inputs[1]});
  }

  op_desc->SetOutput("Out", outputs);
  op_desc->SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(),
                   static_cast<int>(OpRole::kForward));
}
// Builds the two-op program used by the tests:
//   (c, weights[, conv_bias]) -> conv2d          -> f
//   (f, eltwise_bias)         -> elementwise_add -> g
// When `convWithExistingBias` is true the conv2d op also receives a
// "conv_bias" input. All variables are LOD_TENSORs; "weights", "conv_bias"
// and "eltwise_bias" are marked persistable.
ProgramDesc BuildProgramDesc(bool convWithExistingBias) {
  ProgramDesc prog;

  std::vector<std::string> var_names{"c", "weights", "f", "eltwise_bias", "g"};
  if (convWithExistingBias) {
    var_names.push_back("conv_bias");
  }

  for (auto& var_name : var_names) {
    auto* var_desc = prog.MutableBlock(0)->Var(var_name);
    var_desc->SetType(proto::VarType::LOD_TENSOR);

    const bool is_parameter = var_name == "weights" ||
                              var_name == "conv_bias" ||
                              var_name == "eltwise_bias";
    if (is_parameter) {
      var_desc->SetPersistable(true);
    }
  }

  // conv+bias, both with MKL-DNN
  std::vector<std::string> conv_inputs{"c", "weights"};
  if (convWithExistingBias) {
    conv_inputs.push_back("conv_bias");
  }
  const std::vector<std::string> conv_outputs{"f"};
  SetOp(&prog, "conv2d", "conv", conv_inputs, conv_outputs);

  const std::vector<std::string> eltwise_inputs{"f", "eltwise_bias"};
  const std::vector<std::string> eltwise_outputs{"g"};
  SetOp(&prog, "elementwise_add", "eltwise", eltwise_inputs, eltwise_outputs);

  return prog;
}
// Looks up (or creates) the variable `var_name` in `scope`, obtains it as a
// LoDTensor and requests an FP32 buffer for it on `place` with the default
// allocator attribute.
// NOTE(review): the trailing `1` is presumably the requested size — confirm
// against Tensor::mutable_data.
void InitTensorHolder(Scope* scope, const paddle::platform::Place& place,
                      const char* var_name) {
  auto* tensor = scope->Var(var_name)->GetMutable<LoDTensor>();
  tensor->mutable_data(place, proto::VarType::FP32,
                       ::paddle::memory::Allocator::kDefault, 1);
}
// Shared test driver: builds the conv2d + elementwise_add program, applies
// "conv_bias_mkldnn_fuse_pass" to its graph, then checks that
//   * the graph holds two nodes fewer than before the pass, and
//   * exactly one conv2d op named "conv" ends up with a non-empty "Bias"
//     input while still having use_mkldnn enabled.
void MainTest(bool convWithExistingBias) {
  auto prog = BuildProgramDesc(convWithExistingBias);
  std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));

  auto place = paddle::platform::CPUPlace();
  NaiveExecutor exe{place};
  Scope scope;

  // Init scope, as it is used in pass
  exe.CreateVariables(prog, 0, true, &scope);
  if (convWithExistingBias) {
    InitTensorHolder(&scope, place, "conv_bias");
    InitTensorHolder(&scope, place, "eltwise_bias");
  }
  // The pass obtains the scope through this graph attribute.
  graph->Set(kParamScopeAttr, new framework::Scope*(&scope));

  auto pass = PassRegistry::Instance().Get("conv_bias_mkldnn_fuse_pass");

  const int original_nodes_num = graph->Nodes().size();
  graph = pass->Apply(std::move(graph));
  const int current_nodes_num = graph->Nodes().size();

  // Remove 3 Nodes: Conv, Bias, conv_out
  // Add 1 Node: ConvBias
  // => net change of -2 nodes.
  EXPECT_EQ(original_nodes_num - 2, current_nodes_num);

  // Assert conv_bias op in newly generated graph
  int conv_bias_count = 0;
  for (auto* node : graph->Nodes()) {
    // Only conv2d operator nodes are of interest.
    if (!node->IsOp() || node->Op()->Type() != "conv2d") {
      continue;
    }

    auto* op = node->Op();
    ASSERT_TRUE(op->HasAttr("use_mkldnn"));
    EXPECT_TRUE(boost::get<bool>(op->GetAttr("use_mkldnn")));

    // check if "conv" convolution is fused
    const auto conv_name = boost::get<std::string>(op->GetAttr("name"));
    if (conv_name != "conv") {
      continue;
    }

    auto input_names = op->InputNames();
    ASSERT_TRUE(std::find(input_names.begin(), input_names.end(), "Bias") !=
                input_names.end());

    // A non-empty Bias argument list means a bias is attached to the conv.
    auto bias_args = boost::get<std::vector<std::string>>(op->Input("Bias"));
    if (bias_args.size()) {
      ++conv_bias_count;
    }
  }
  EXPECT_EQ(conv_bias_count, 1);
}
// conv2d has no Bias input before the pass runs.
TEST(ConvBiasFusePass, bias_free_conv) {
  MainTest(false);
}

// conv2d already carries a "conv_bias" input before the pass runs.
TEST(ConvBiasFusePass, conv_with_existing_bias) {
  MainTest(true);
}
// The 3D variant of the pass must report itself as conv3d.
TEST(ConvBiasFusePass, conv3d) {
  Conv3DBiasFusePass pass;
  ASSERT_TRUE(pass.is_conv3d());
}
} // namespace ir
} // namespace framework
} // namespace paddle
USE_PASS(conv_bias_mkldnn_fuse_pass);
...@@ -31,7 +31,8 @@ class RepeatedFCReluFusePass : public FusePassBase { ...@@ -31,7 +31,8 @@ class RepeatedFCReluFusePass : public FusePassBase {
virtual ~RepeatedFCReluFusePass() {} virtual ~RepeatedFCReluFusePass() {}
protected: protected:
std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const; std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override;
const std::string name_scope_{"repeated_fc_relu_fuse"}; const std::string name_scope_{"repeated_fc_relu_fuse"};
}; };
......
...@@ -12,6 +12,8 @@ ...@@ -12,6 +12,8 @@
// See the License for the specific language governing permissions and // See the License for the specific language governing permissions and
// limitations under the License. // limitations under the License.
#pragma once
#include "paddle/fluid/framework/ir/fuse_pass_base.h" #include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/pass.h" #include "paddle/fluid/framework/ir/pass.h"
...@@ -25,7 +27,8 @@ class SeqConcatFcFusePass : public FusePassBase { ...@@ -25,7 +27,8 @@ class SeqConcatFcFusePass : public FusePassBase {
virtual ~SeqConcatFcFusePass() {} virtual ~SeqConcatFcFusePass() {}
protected: protected:
std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const; std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override;
}; };
} // namespace ir } // namespace ir
......
...@@ -28,7 +28,8 @@ class SeqConvEltAddReluFusePass : public FusePassBase { ...@@ -28,7 +28,8 @@ class SeqConvEltAddReluFusePass : public FusePassBase {
virtual ~SeqConvEltAddReluFusePass() {} virtual ~SeqConvEltAddReluFusePass() {}
protected: protected:
std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const; std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override;
const std::string name_scope_{"seqconv_eltadd_relu_fuse"}; const std::string name_scope_{"seqconv_eltadd_relu_fuse"};
}; };
......
...@@ -42,7 +42,8 @@ class SeqPoolConcatFusePass : public FusePassBase { ...@@ -42,7 +42,8 @@ class SeqPoolConcatFusePass : public FusePassBase {
virtual ~SeqPoolConcatFusePass() {} virtual ~SeqPoolConcatFusePass() {}
protected: protected:
std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const; std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override;
const std::string name_scope_{"seqpool_concat_fuse"}; const std::string name_scope_{"seqpool_concat_fuse"};
}; };
......
...@@ -31,7 +31,8 @@ class SquaredMatSubFusePass : public FusePassBase { ...@@ -31,7 +31,8 @@ class SquaredMatSubFusePass : public FusePassBase {
virtual ~SquaredMatSubFusePass() {} virtual ~SquaredMatSubFusePass() {}
protected: protected:
std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const; std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override;
const std::string name_scope_{"squared_mat_sub_fuse"}; const std::string name_scope_{"squared_mat_sub_fuse"};
}; };
......
...@@ -30,7 +30,8 @@ class TransposeFlattenConcatFusePass : public FusePassBase { ...@@ -30,7 +30,8 @@ class TransposeFlattenConcatFusePass : public FusePassBase {
virtual ~TransposeFlattenConcatFusePass() {} virtual ~TransposeFlattenConcatFusePass() {}
protected: protected:
std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const; std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override;
}; };
} // namespace ir } // namespace ir
......
...@@ -27,7 +27,7 @@ enum class OpRole { ...@@ -27,7 +27,7 @@ enum class OpRole {
kForward = 0x0000, kForward = 0x0000,
kBackward = 0x0001, kBackward = 0x0001,
kOptimize = 0x0002, kOptimize = 0x0002,
// RPC role is for send/recv releated op // RPC role is for send/recv related op
kRPC = 0x0004, kRPC = 0x0004,
// Dist role is for split_byref/split_selected_rows/concat // Dist role is for split_byref/split_selected_rows/concat
// used for distributed training. // used for distributed training.
......
...@@ -177,9 +177,7 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) { ...@@ -177,9 +177,7 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
// in concurrency scenerio. Here use an `if` to fix this issue. // in concurrency scenerio. Here use an `if` to fix this issue.
// Please not remove the `if`, ask @Superjomn if there are any concern. // Please not remove the `if`, ask @Superjomn if there are any concern.
if (platform::IsProfileEnabled()) { if (platform::IsProfileEnabled()) {
platform::DeviceContextPool& pool = platform::RecordEvent record_event(Type());
platform::DeviceContextPool::Instance();
platform::RecordEvent record_event(Type(), pool.Get(place));
RunImpl(scope, place); RunImpl(scope, place);
} else { } else {
RunImpl(scope, place); RunImpl(scope, place);
......
...@@ -21,6 +21,7 @@ limitations under the License. */ ...@@ -21,6 +21,7 @@ limitations under the License. */
#include "paddle/fluid/framework/ir/graph.h" #include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/details/all_reduce_deps_pass.h"
#include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h" #include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h"
#include "paddle/fluid/framework/details/multi_devices_helper.h" #include "paddle/fluid/framework/details/multi_devices_helper.h"
#include "paddle/fluid/framework/details/parallel_ssa_graph_executor.h" #include "paddle/fluid/framework/details/parallel_ssa_graph_executor.h"
...@@ -193,7 +194,6 @@ ParallelExecutor::ParallelExecutor( ...@@ -193,7 +194,6 @@ ParallelExecutor::ParallelExecutor(
member_->use_all_reduce_ = member_->use_all_reduce_ =
build_strategy.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce; build_strategy.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce;
member_->nranks_ = build_strategy.num_trainers_ * places.size(); member_->nranks_ = build_strategy.num_trainers_ * places.size();
if (!member_->use_all_reduce_) { if (!member_->use_all_reduce_) {
PADDLE_ENFORCE(places.size() > 1, PADDLE_ENFORCE(places.size() > 1,
"If you set build_strategy.reduce with 'Reduce'," "If you set build_strategy.reduce with 'Reduce',"
...@@ -221,9 +221,10 @@ ParallelExecutor::ParallelExecutor( ...@@ -221,9 +221,10 @@ ParallelExecutor::ParallelExecutor(
// choice the execution strategy. // choice the execution strategy.
build_strategy.enable_parallel_graph_ = build_strategy.enable_parallel_graph_ =
EnableParallelGraphExecution(main_program, exec_strategy, build_strategy); EnableParallelGraphExecution(main_program, exec_strategy, build_strategy);
if (build_strategy.enable_parallel_graph_)
VLOG(1) << "Enable ParallelGraph Execution: " VLOG(0) << "The Executor would execute the graph by ParallelGraph "
<< build_strategy.enable_parallel_graph_; "Execution which can get better performance,"
<< "you can force it off by env FLAGS_enable_parallel_graph=0";
if (member_->use_cuda_) { if (member_->use_cuda_) {
// Bcast Parameters to all GPUs // Bcast Parameters to all GPUs
...@@ -257,42 +258,27 @@ ParallelExecutor::ParallelExecutor( ...@@ -257,42 +258,27 @@ ParallelExecutor::ParallelExecutor(
// Step 2. Convert main_program to SSA form and dependency graph. Also, insert // Step 2. Convert main_program to SSA form and dependency graph. Also, insert
// ncclOp // ncclOp
std::vector<std::unique_ptr<ir::Graph>> graphs; std::unique_ptr<ir::Graph> graph;
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
if (build_strategy.enable_parallel_graph_) { graph = build_strategy.Apply(main_program, member_->places_, loss_var_name,
for (size_t i = 0; i < member_->places_.size(); ++i) { member_->local_scopes_, member_->nranks_,
std::unique_ptr<ir::Graph> graph = build_strategy.Apply( member_->use_cuda_, member_->nccl_ctxs_.get());
main_program, {member_->places_[i]}, loss_var_name,
{member_->local_scopes_[i]}, member_->nranks_, member_->use_cuda_,
member_->nccl_ctxs_.get());
graphs.push_back(std::move(graph));
}
} else {
std::unique_ptr<ir::Graph> graph = build_strategy.Apply(
main_program, member_->places_, loss_var_name, member_->local_scopes_,
member_->nranks_, member_->use_cuda_, member_->nccl_ctxs_.get());
graphs.push_back(std::move(graph));
}
#else #else
std::unique_ptr<ir::Graph> graph = build_strategy.Apply( graph = build_strategy.Apply(main_program, member_->places_, loss_var_name,
main_program, member_->places_, loss_var_name, member_->local_scopes_, member_->local_scopes_, member_->nranks_,
member_->nranks_, member_->use_cuda_); member_->use_cuda_);
graphs.push_back(std::move(graph));
#endif #endif
auto max_memory_size = GetEagerDeletionThreshold(); auto max_memory_size = GetEagerDeletionThreshold();
VLOG(10) << "Eager Deletion Threshold " VLOG(10) << "Eager Deletion Threshold "
<< static_cast<float>(max_memory_size) / (1 << 30); << static_cast<float>(max_memory_size) / (1 << 30);
if (max_memory_size >= 0) { if (max_memory_size >= 0) {
for (size_t i = 0; i < graphs.size(); ++i) { graph = member_->PrepareGCAndRefCnts(std::move(graph),
graphs[i] = member_->PrepareGCAndRefCnts( static_cast<size_t>(max_memory_size));
std::move(graphs[i]), static_cast<size_t>(max_memory_size));
}
} }
// Step 3. Create vars in each scope. Passes may also create new vars. // Step 3. Create vars in each scope. Passes may also create new vars.
// skip control vars and empty vars // skip control vars and empty vars
std::vector<details::VariableInfo> var_infos; std::vector<details::VariableInfo> var_infos;
for (auto &graph : graphs) {
for (auto &node : graph->Nodes()) { for (auto &node : graph->Nodes()) {
if (node->IsVar() && !node->IsCtrlVar() && node->Var()) { if (node->IsVar() && !node->IsCtrlVar() && node->Var()) {
var_infos.emplace_back(); var_infos.emplace_back();
...@@ -301,16 +287,15 @@ ParallelExecutor::ParallelExecutor( ...@@ -301,16 +287,15 @@ ParallelExecutor::ParallelExecutor(
var_infos.back().persistable_ = node->Var()->Persistable(); var_infos.back().persistable_ = node->Var()->Persistable();
} }
} }
}
// If the loss_var_name is given, the number of graph should be only one. // If the loss_var_name is given, the number of graph should be only one.
if (loss_var_name.size()) { if (loss_var_name.size()) {
size_t graph_num = ir::GraphNum(*graphs[0]); size_t graph_num = ir::GraphNum(*graph);
if (graph_num > 1) { if (graph_num > 1) {
LOG(WARNING) LOG(WARNING)
<< "The number of graph should be only one, " << "The number of graph should be only one, "
"but the current graph has " "but the current graph has "
<< ir::GraphNum(*graphs[0]) << ir::GraphNum(*graph)
<< " sub_graphs. If you want to see the nodes of the " << " sub_graphs. If you want to see the nodes of the "
"sub_graphs, you should use 'FLAGS_print_sub_graph_dir' " "sub_graphs, you should use 'FLAGS_print_sub_graph_dir' "
"to specify the output dir. NOTES: if you not do training, " "to specify the output dir. NOTES: if you not do training, "
...@@ -319,18 +304,25 @@ ParallelExecutor::ParallelExecutor( ...@@ -319,18 +304,25 @@ ParallelExecutor::ParallelExecutor(
} }
if (build_strategy.enable_parallel_graph_) { if (build_strategy.enable_parallel_graph_) {
#ifdef PADDLE_WITH_CUDA
// TODO(Yancey1989): Remove passing in the main_program when
// allreduce_seq_pass doesn't need it as the attr.
member_->executor_.reset(new details::ParallelSSAGraphExecutor( member_->executor_.reset(new details::ParallelSSAGraphExecutor(
exec_strategy, member_->local_scopes_, member_->places_, exec_strategy, member_->local_scopes_, member_->places_, main_program,
std::move(graphs))); std::move(graph)));
#else
PADDLE_THROW(
"Paddle should be compiled with CUDA for ParallelGraph Execution.");
#endif
} else { } else {
if (exec_strategy.type_ == ExecutionStrategy::kDefault) { if (exec_strategy.type_ == ExecutionStrategy::kDefault) {
member_->executor_.reset(new details::ThreadedSSAGraphExecutor( member_->executor_.reset(new details::ThreadedSSAGraphExecutor(
exec_strategy, member_->local_scopes_, member_->places_, exec_strategy, member_->local_scopes_, member_->places_,
std::move(graphs[0]))); std::move(graph)));
} else { } else {
member_->executor_.reset(new details::FastThreadedSSAGraphExecutor( member_->executor_.reset(new details::FastThreadedSSAGraphExecutor(
exec_strategy, member_->local_scopes_, member_->places_, exec_strategy, member_->local_scopes_, member_->places_,
std::move(graphs[0]))); std::move(graph)));
} }
} }
...@@ -482,7 +474,6 @@ bool ParallelExecutor::EnableParallelGraphExecution( ...@@ -482,7 +474,6 @@ bool ParallelExecutor::EnableParallelGraphExecution(
} }
if (!member_->use_all_reduce_ || !member_->use_cuda_) if (!member_->use_all_reduce_ || !member_->use_cuda_)
enable_parallel_graph = false;
if (build_strategy.enable_sequential_execution_ || if (build_strategy.enable_sequential_execution_ ||
exec_strategy.type_ == ExecutionStrategy::ExecutorType::kExperimental) exec_strategy.type_ == ExecutionStrategy::ExecutorType::kExperimental)
......
...@@ -89,7 +89,7 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { ...@@ -89,7 +89,7 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
CP_MEMBER(params_file_); CP_MEMBER(params_file_);
CP_MEMBER(model_from_memory_); // the memory model reuses prog_file_ and CP_MEMBER(model_from_memory_); // the memory model reuses prog_file_ and
// params_file_ fields. // params_file_ fields.
// Gpu releated. // Gpu related.
CP_MEMBER(use_gpu_); CP_MEMBER(use_gpu_);
CP_MEMBER(device_id_); CP_MEMBER(device_id_);
CP_MEMBER(memory_pool_init_size_mb_); CP_MEMBER(memory_pool_init_size_mb_);
...@@ -97,13 +97,13 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { ...@@ -97,13 +97,13 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
CP_MEMBER(enable_memory_optim_); CP_MEMBER(enable_memory_optim_);
CP_MEMBER(static_memory_optim_); CP_MEMBER(static_memory_optim_);
CP_MEMBER(static_memory_optim_force_update_); CP_MEMBER(static_memory_optim_force_update_);
// TensorRT releated. // TensorRT related.
CP_MEMBER(use_tensorrt_); CP_MEMBER(use_tensorrt_);
CP_MEMBER(tensorrt_workspace_size_); CP_MEMBER(tensorrt_workspace_size_);
CP_MEMBER(tensorrt_max_batchsize_); CP_MEMBER(tensorrt_max_batchsize_);
CP_MEMBER(tensorrt_min_subgraph_size_); CP_MEMBER(tensorrt_min_subgraph_size_);
CP_MEMBER(tensorrt_precision_mode_); CP_MEMBER(tensorrt_precision_mode_);
// MKLDNN releated. // MKLDNN related.
CP_MEMBER(use_mkldnn_); CP_MEMBER(use_mkldnn_);
CP_MEMBER(mkldnn_enabled_op_types_); CP_MEMBER(mkldnn_enabled_op_types_);
......
...@@ -392,7 +392,7 @@ std::unique_ptr<PaddlePredictor> CreatePaddlePredictor< ...@@ -392,7 +392,7 @@ std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
AnalysisConfig, PaddleEngineKind::kAnalysis>(const AnalysisConfig &config) { AnalysisConfig, PaddleEngineKind::kAnalysis>(const AnalysisConfig &config) {
VLOG(3) << "create AnalysisConfig"; VLOG(3) << "create AnalysisConfig";
if (config.use_gpu()) { if (config.use_gpu()) {
// 1. GPU memeroy // 1. GPU memory
PADDLE_ENFORCE_GT(config.memory_pool_init_size_mb(), 0.f); PADDLE_ENFORCE_GT(config.memory_pool_init_size_mb(), 0.f);
PADDLE_ENFORCE_GE(config.gpu_device_id(), 0, "Invalid device id %d", PADDLE_ENFORCE_GE(config.gpu_device_id(), 0, "Invalid device id %d",
config.gpu_device_id()); config.gpu_device_id());
...@@ -726,7 +726,7 @@ bool AnalysisPredictor::need_collect_var_shapes_for_memory_optim() { ...@@ -726,7 +726,7 @@ bool AnalysisPredictor::need_collect_var_shapes_for_memory_optim() {
return need; return need;
} }
std::string AnalysisPredictor::GetSeriazlizedProgram() const { std::string AnalysisPredictor::GetSerializedProgram() const {
return inference_program_->Proto()->SerializeAsString(); return inference_program_->Proto()->SerializeAsString();
} }
......
...@@ -74,7 +74,7 @@ class AnalysisPredictor : public PaddlePredictor { ...@@ -74,7 +74,7 @@ class AnalysisPredictor : public PaddlePredictor {
void SetMkldnnThreadID(int tid); void SetMkldnnThreadID(int tid);
std::string GetSeriazlizedProgram() const override; std::string GetSerializedProgram() const override;
protected: protected:
// For memory optimization. // For memory optimization.
......
...@@ -214,8 +214,8 @@ TEST(AnalysisPredictor, memory_optim) { ...@@ -214,8 +214,8 @@ TEST(AnalysisPredictor, memory_optim) {
{ {
// The first predictor help to cache the memory optimize strategy. // The first predictor help to cache the memory optimize strategy.
auto predictor = CreatePaddlePredictor<AnalysisConfig>(config); auto predictor = CreatePaddlePredictor<AnalysisConfig>(config);
LOG(INFO) << "serialized program: " << predictor->GetSeriazlizedProgram(); LOG(INFO) << "serialized program: " << predictor->GetSerializedProgram();
ASSERT_FALSE(predictor->GetSeriazlizedProgram().empty()); ASSERT_FALSE(predictor->GetSerializedProgram().empty());
// Run several times to check the parameters are not reused by mistake. // Run several times to check the parameters are not reused by mistake.
for (int i = 0; i < 5; i++) { for (int i = 0; i < 5; i++) {
......
...@@ -92,7 +92,7 @@ void PaddleBuf::Reset(void *data, size_t length) { ...@@ -92,7 +92,7 @@ void PaddleBuf::Reset(void *data, size_t length) {
void PaddleBuf::Free() { void PaddleBuf::Free() {
if (memory_owned_ && data_) { if (memory_owned_ && data_) {
PADDLE_ENFORCE_GT(length_, 0); PADDLE_ENFORCE_GT(length_, 0UL);
free(static_cast<char *>(data_)); free(static_cast<char *>(data_));
data_ = nullptr; data_ = nullptr;
length_ = 0; length_ = 0;
......
...@@ -290,7 +290,7 @@ std::unique_ptr<PaddlePredictor> CreatePaddlePredictor< ...@@ -290,7 +290,7 @@ std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
NativeConfig, PaddleEngineKind::kNative>(const NativeConfig &config) { NativeConfig, PaddleEngineKind::kNative>(const NativeConfig &config) {
VLOG(3) << "create NativePaddlePredictor"; VLOG(3) << "create NativePaddlePredictor";
if (config.use_gpu) { if (config.use_gpu) {
// 1. GPU memeroy // 1. GPU memory
PADDLE_ENFORCE_GE( PADDLE_ENFORCE_GE(
config.fraction_of_gpu_memory, 0.f, config.fraction_of_gpu_memory, 0.f,
"fraction_of_gpu_memory in the config should be set to range (0., 1.]"); "fraction_of_gpu_memory in the config should be set to range (0., 1.]");
......
...@@ -212,12 +212,12 @@ struct AnalysisConfig { ...@@ -212,12 +212,12 @@ struct AnalysisConfig {
std::string prog_file_; std::string prog_file_;
std::string params_file_; std::string params_file_;
// GPU releated. // GPU related.
bool use_gpu_{false}; bool use_gpu_{false};
int device_id_{0}; int device_id_{0};
uint64_t memory_pool_init_size_mb_{100}; // initial size is 100MB. uint64_t memory_pool_init_size_mb_{100}; // initial size is 100MB.
// TensorRT releated. // TensorRT related.
bool use_tensorrt_{false}; bool use_tensorrt_{false};
// For workspace_size, refer it from here: // For workspace_size, refer it from here:
// https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#troubleshooting // https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#troubleshooting
......
...@@ -248,7 +248,7 @@ class PaddlePredictor { ...@@ -248,7 +248,7 @@ class PaddlePredictor {
/** \brief Get the serialized model program that executes in inference phase. /** \brief Get the serialized model program that executes in inference phase.
* Its data type is ProgramDesc, which is a protobuf message. * Its data type is ProgramDesc, which is a protobuf message.
*/ */
virtual std::string GetSeriazlizedProgram() const { virtual std::string GetSerializedProgram() const {
assert(false); // Force raise error. assert(false); // Force raise error.
return "NotImplemented"; return "NotImplemented";
} }
......
...@@ -60,10 +60,13 @@ set(RNN2_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/rnn2") ...@@ -60,10 +60,13 @@ set(RNN2_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/rnn2")
download_model_and_data(${RNN2_INSTALL_DIR} "rnn2_model.tar.gz" "rnn2_data.txt.tar.gz") download_model_and_data(${RNN2_INSTALL_DIR} "rnn2_model.tar.gz" "rnn2_data.txt.tar.gz")
inference_analysis_api_test(test_analyzer_rnn2 ${RNN2_INSTALL_DIR} analyzer_rnn2_tester.cc) inference_analysis_api_test(test_analyzer_rnn2 ${RNN2_INSTALL_DIR} analyzer_rnn2_tester.cc)
# TODO(luotao, Superjom) Disable DAM test, temporarily fix
# https://github.com/PaddlePaddle/Paddle/issues/15032#issuecomment-455990914.
# After inference framework refactor, will reopen it.
# normal DAM # normal DAM
set(DAM_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/dam") set(DAM_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/dam")
download_model_and_data(${DAM_INSTALL_DIR} "DAM_model.tar.gz" "DAM_data.txt.tar.gz") download_model_and_data(${DAM_INSTALL_DIR} "DAM_model.tar.gz" "DAM_data.txt.tar.gz")
inference_analysis_api_test(test_analyzer_dam ${DAM_INSTALL_DIR} analyzer_dam_tester.cc EXTRA_DEPS legacy_allocator SERIAL) #inference_analysis_api_test(test_analyzer_dam ${DAM_INSTALL_DIR} analyzer_dam_tester.cc EXTRA_DEPS legacy_allocator SERIAL)
# small DAM # small DAM
set(DAM_SMALL_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/small_dam") set(DAM_SMALL_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/small_dam")
......
...@@ -56,14 +56,14 @@ struct DataRecord { ...@@ -56,14 +56,14 @@ struct DataRecord {
std::vector<float> slot_data; std::vector<float> slot_data;
split_to_float(data[1], ' ', &slot_data); split_to_float(data[1], ' ', &slot_data);
std::string name = data[0]; std::string name = data[0];
PADDLE_ENFORCE_EQ(slot_data.size() % 11, 0, PADDLE_ENFORCE_EQ(slot_data.size() % 11, 0UL,
"line %d, %s should be divisible", num_lines, name); "line %d, %s should be divisible", num_lines, name);
datasets[name].emplace_back(std::move(slot_data)); datasets[name].emplace_back(std::move(slot_data));
} }
num_samples = num_lines / num_slots; num_samples = num_lines / num_slots;
PADDLE_ENFORCE_EQ(num_samples * num_slots, static_cast<size_t>(num_lines), PADDLE_ENFORCE_EQ(num_samples * num_slots, static_cast<size_t>(num_lines),
"num samples should be divisible"); "num samples should be divisible");
PADDLE_ENFORCE_GT(num_samples, 0); PADDLE_ENFORCE_GT(num_samples, 0UL);
} }
void Prepare(int bs) { void Prepare(int bs) {
......
include(ExternalProject)
set(INFERENCE_URL "http://paddle-inference-dist.cdn.bcebos.com" CACHE STRING "inference download url") set(INFERENCE_URL "http://paddle-inference-dist.cdn.bcebos.com" CACHE STRING "inference download url")
set(INFERENCE_DEMO_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo" CACHE STRING set(INFERENCE_DEMO_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo" CACHE STRING
"A path setting inference demo download directories.") "A path setting inference demo download directories.")
function (inference_download install_dir url filename)
message(STATUS "Download inference test stuff from ${url}/${filename}") function(inference_download INSTALL_DIR URL FILENAME)
file(DOWNLOAD "${url}/${filename}" "${install_dir}/${filename}") message(STATUS "Download inference test stuff from ${URL}/${FILENAME}")
message(STATUS "finish downloading ${filename}") string(REGEX REPLACE "[-%.]" "_" FILENAME_EX ${FILENAME})
ExternalProject_Add(
extern_inference_download_${FILENAME_EX}
${EXTERNAL_PROJECT_LOG_ARGS}
PREFIX ${INSTALL_DIR}
URL ${URL}/${FILENAME}
DOWNLOAD_COMMAND wget -q -O ${INSTALL_DIR}/${FILENAME} ${URL}/${FILENAME}
DOWNLOAD_DIR ${INSTALL_DIR}
DOWNLOAD_NO_PROGRESS 1
CONFIGURE_COMMAND ""
BUILD_COMMAND ""
UPDATE_COMMAND ""
INSTALL_COMMAND ""
)
endfunction() endfunction()
function (inference_download_and_uncompress install_dir url filename) function(inference_download_and_uncompress INSTALL_DIR URL FILENAME)
inference_download(${install_dir} ${url} ${filename}) message(STATUS "Download inference test stuff from ${URL}/${FILENAME}")
execute_process( string(REGEX REPLACE "[-%.]" "_" FILENAME_EX ${FILENAME})
COMMAND ${CMAKE_COMMAND} -E tar xzf ${install_dir}/${filename} set(EXTERNAL_PROJECT_NAME "extern_inference_download_${FILENAME_EX}")
WORKING_DIRECTORY ${install_dir} set(UNPACK_DIR "${INSTALL_DIR}/src/${EXTERNAL_PROJECT_NAME}")
ExternalProject_Add(
${EXTERNAL_PROJECT_NAME}
${EXTERNAL_PROJECT_LOG_ARGS}
PREFIX ${INSTALL_DIR}
URL ${URL}/${FILENAME}
DOWNLOAD_DIR ${INSTALL_DIR}
DOWNLOAD_NO_PROGRESS 1
CONFIGURE_COMMAND ""
BUILD_COMMAND ""
UPDATE_COMMAND ""
INSTALL_COMMAND ${CMAKE_COMMAND} -E copy_directory ${UNPACK_DIR} ${INSTALL_DIR}
) )
endfunction() endfunction()
......
...@@ -171,9 +171,7 @@ void TestInference(const std::string& dirname, ...@@ -171,9 +171,7 @@ void TestInference(const std::string& dirname,
// Enable the profiler // Enable the profiler
paddle::platform::EnableProfiler(state); paddle::platform::EnableProfiler(state);
{ {
paddle::platform::RecordEvent record_event( paddle::platform::RecordEvent record_event("init_program");
"init_program",
paddle::platform::DeviceContextPool::Instance().Get(place));
inference_program = InitProgram(&executor, scope, dirname, is_combined); inference_program = InitProgram(&executor, scope, dirname, is_combined);
} }
...@@ -230,9 +228,7 @@ void TestInference(const std::string& dirname, ...@@ -230,9 +228,7 @@ void TestInference(const std::string& dirname,
// Run repeat times to profile the performance // Run repeat times to profile the performance
for (int i = 0; i < repeat; ++i) { for (int i = 0; i < repeat; ++i) {
paddle::platform::RecordEvent record_event( paddle::platform::RecordEvent record_event("run_inference");
"run_inference",
paddle::platform::DeviceContextPool::Instance().Get(place));
if (PrepareContext) { if (PrepareContext) {
// Note: if you change the inference_program, you need to call // Note: if you change the inference_program, you need to call
......
...@@ -356,7 +356,7 @@ void MemInfo::Minus(const size_t &size) { ...@@ -356,7 +356,7 @@ void MemInfo::Minus(const size_t &size) {
usage_ -= size; usage_ -= size;
} }
uint64_t MemInfo::GetPeakUsage() { return peak_usage_; } uint64_t MemInfo::GetPeakUsage() const { return peak_usage_; }
LegacyMemMonitor::~LegacyMemMonitor() { LegacyMemMonitor::~LegacyMemMonitor() {
for (auto &item : gpu_mem_info_) delete item.second; for (auto &item : gpu_mem_info_) delete item.second;
...@@ -380,10 +380,10 @@ void LegacyMemMonitor::Minus(const int &device, const size_t &size) { ...@@ -380,10 +380,10 @@ void LegacyMemMonitor::Minus(const int &device, const size_t &size) {
gpu_mem_info_[device]->Minus(size); gpu_mem_info_[device]->Minus(size);
} }
uint64_t LegacyMemMonitor::GetMemUsage(const int &device) { uint64_t LegacyMemMonitor::GetMemUsage(const int &device) const {
return gpu_mem_info_.find(device) == gpu_mem_info_.end() return gpu_mem_info_.find(device) == gpu_mem_info_.end()
? 0 ? 0
: gpu_mem_info_[device]->GetPeakUsage(); : gpu_mem_info_.at(device)->GetPeakUsage();
} }
void LegacyMemMonitor::PrintMemUsage() { void LegacyMemMonitor::PrintMemUsage() {
......
...@@ -27,20 +27,20 @@ namespace allocation { ...@@ -27,20 +27,20 @@ namespace allocation {
class MemInfo { class MemInfo {
public: public:
MemInfo() : usage_(0), peak_usage_(0) {} MemInfo() : usage_(0), peak_usage_(0) {}
MemInfo(const MemInfo &) = delete;
MemInfo &operator=(const MemInfo &) = delete;
// return a flag to indicate current operation will create a peak point or not // return a flag to indicate current operation will create a peak point or not
bool Add(const size_t &); bool Add(const size_t &);
void Minus(const size_t &); void Minus(const size_t &);
uint64_t GetPeakUsage(); uint64_t GetPeakUsage() const;
private: private:
/* current memory usage*/ /* current memory usage*/
uint64_t usage_; uint64_t usage_;
uint64_t peak_usage_; uint64_t peak_usage_;
std::mutex mutex_; std::mutex mutex_;
DISABLE_COPY_AND_ASSIGN(MemInfo);
}; };
class LegacyMemMonitor { class LegacyMemMonitor {
...@@ -56,11 +56,11 @@ class LegacyMemMonitor { ...@@ -56,11 +56,11 @@ class LegacyMemMonitor {
void Add(const int &, const size_t &); void Add(const int &, const size_t &);
void Minus(const int &, const size_t &); void Minus(const int &, const size_t &);
uint64_t GetMemUsage(const int &); uint64_t GetMemUsage(const int &) const;
void PrintMemUsage(); void PrintMemUsage();
protected: private:
MemUsage gpu_mem_info_; MemUsage gpu_mem_info_;
}; };
......
...@@ -97,3 +97,4 @@ if (WITH_PYTHON) ...@@ -97,3 +97,4 @@ if (WITH_PYTHON)
endif() endif()
set(GLOB_OP_LIB ${OP_LIBRARY} CACHE INTERNAL "Global OP library") set(GLOB_OP_LIB ${OP_LIBRARY} CACHE INTERNAL "Global OP library")
add_subdirectory(benchmark)
...@@ -11,6 +11,7 @@ limitations under the License. */ ...@@ -11,6 +11,7 @@ limitations under the License. */
#pragma once #pragma once
#include <glog/logging.h> #include <glog/logging.h>
#include <algorithm>
#include <string> #include <string>
#include <unordered_set> #include <unordered_set>
#include <utility> #include <utility>
...@@ -24,6 +25,7 @@ limitations under the License. */ ...@@ -24,6 +25,7 @@ limitations under the License. */
#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/detail/safe_ref.h" #include "paddle/fluid/operators/detail/safe_ref.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/platform/float16.h" #include "paddle/fluid/platform/float16.h"
#ifdef PADDLE_WITH_MKLDNN #ifdef PADDLE_WITH_MKLDNN
...@@ -301,8 +303,28 @@ template <typename T> ...@@ -301,8 +303,28 @@ template <typename T>
struct GeluFunctor : public BaseActivationFunctor<T> { struct GeluFunctor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Out> template <typename Device, typename X, typename Out>
void operator()(Device d, X x, Out out) const { void operator()(Device d, X x, Out out) const {
// Because the execution or device context cannot be passed in here, keep the
// macro for NVCC.
#if defined(PADDLE_WITH_MKLML) && !defined(_WIN32) && !defined(__APPLE__) && \
!defined(__OSX__) && !defined(PADDLE_WITH_CUDA)
auto x_data = x.data();
auto out_data = out.data();
int n = std::min(x.size(), out.size());
std::memset(out_data, 0, n * sizeof(T));
math::CBlas<T>::AXPY(n, static_cast<T>(M_SQRT1_2), x_data, 1, out_data, 1);
math::CBlas<T>::VMERF(n, out_data, out_data, VML_LA);
for (int i = 0; i < n; i++) {
out_data[i] += static_cast<T>(1);
}
math::CBlas<T>::VMUL(n, x_data, out_data, out_data);
for (int i = 0; i < n; i++) {
out_data[i] *= static_cast<T>(0.5);
}
#else
auto temp = (x * static_cast<T>(M_SQRT1_2)).erf(); auto temp = (x * static_cast<T>(M_SQRT1_2)).erf();
out.device(d) = x * static_cast<T>(0.5) * (static_cast<T>(1) + temp); out.device(d) = x * static_cast<T>(0.5) * (static_cast<T>(1) + temp);
#endif
} }
}; };
......
...@@ -293,7 +293,7 @@ class AttentionLSTMKernel : public framework::OpKernel<T> { ...@@ -293,7 +293,7 @@ class AttentionLSTMKernel : public framework::OpKernel<T> {
int len = x_lod[0][i + 1] - x_lod[0][i]; int len = x_lod[0][i + 1] - x_lod[0][i];
max_seq_len = max_seq_len < len ? len : max_seq_len; max_seq_len = max_seq_len < len ? len : max_seq_len;
} }
PADDLE_ENFORCE_EQ(x_lod.size(), 1, "Input(X)'s lod size must be 1."); PADDLE_ENFORCE_EQ(x_lod.size(), 1UL, "Input(X)'s lod size must be 1.");
PADDLE_ENFORCE_EQ(c0->dims()[0], N, "C0 dims should be %d x %d.", N, D); PADDLE_ENFORCE_EQ(c0->dims()[0], N, "C0 dims should be %d x %d.", N, D);
fc_out->Resize({max_seq_len, 1}); fc_out->Resize({max_seq_len, 1});
......
# Declares the `op_tester` cc_test target, built from op_tester.cc and
# op_tester_config.cc. Besides the explicitly listed dependencies it also
# depends on whatever GLOB_OP_LIB and GLOB_OPERATOR_DEPS expand to.
cc_test(op_tester SRCS op_tester.cc op_tester_config.cc
DEPS memory timer framework_proto proto_desc lod_tensor op_registry
device_context scope ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS})
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/benchmark/op_tester.h"
#include "gflags/gflags.h"
#include "gtest/gtest.h"
#include "paddle/fluid/framework/op_info.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/variable_helper.h"
#include "paddle/fluid/platform/init.h"
#include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/platform/timer.h"
#include "paddle/fluid/pybind/pybind.h"
namespace paddle {
namespace operators {
namespace benchmark {
DEFINE_string(op_config_list, "", "Path of op config file.");
void OpTester::Init(const std::string &filename) {
Init(OpTesterConfig(filename));
}
// Initializes the tester from `config`:
//   1. copies the config and builds the OpDesc (type plus input/output
//      variable descriptions) for config.op_type;
//   2. selects the place: CUDAPlace(device_id) when device_id >= 0,
//      CPUPlace otherwise;
//   3. initializes devices, creates a fresh scope, instantiates the operator
//      and creates its variables in that scope.
// Logs a FATAL message when the op type is not present in the OpInfoMap.
void OpTester::Init(const OpTesterConfig &config) {
  config_ = config;

  // The op must be registered before any description can be built for it.
  auto &registered_ops = framework::OpInfoMap::Instance();
  if (!registered_ops.Has(config_.op_type)) {
    LOG(FATAL) << "Op \"" << config_.op_type << "\" is not registered.";
  }
  type_ = config_.op_type;
  op_desc_.SetType(config_.op_type);
  CreateInputVarDesc();
  CreateOutputVarDesc();

  // A negative device id selects the CPU.
  if (config_.device_id < 0) {
    place_ = paddle::platform::CPUPlace();
  } else {
    place_ = paddle::platform::CUDAPlace(config_.device_id);
  }

  framework::InitDevices(false);
  scope_.reset(new paddle::framework::Scope());

  op_ = framework::OpRegistry::CreateOp(op_desc_);
  CreateVariables(scope_.get());
}
// Runs the configured operator `config_.repeat` times and records the average
// latency (in milliseconds) in `config_.runtime`.
//
// One untimed warm-up run is always executed first. When `config_.profile` is
// set, the profiler is enabled around the timed runs (CPU state on a CPU
// place, kAll on a CUDA place) and its report is emitted under the name
// "op_tester_profiler". Requesting a CUDA place in a build without
// PADDLE_WITH_CUDA throws.
void OpTester::Run() {
  if (config_.print_debug_string) {
    LOG(INFO) << DebugString();
  }

  // Warm up
  RunImpl();

  platform::Timer timer;
  const bool profiling = config_.profile != 0;
  if (profiling) {
    if (platform::is_cpu_place(place_)) {
      platform::EnableProfiler(platform::ProfilerState::kCPU);
    } else {
#ifdef PADDLE_WITH_CUDA
      platform::EnableProfiler(platform::ProfilerState::kAll);
      platform::SetDeviceId(config_.device_id);
#else
      PADDLE_THROW("'CUDAPlace' is not supported in CPU only device.");
#endif
    }
  }

  // The timed section is the same whether or not the profiler is active.
  timer.Start();
  for (int i = config_.repeat; i > 0; --i) {
    RunImpl();
  }
  timer.Pause();

  if (profiling) {
    platform::DisableProfiler(platform::EventSortingKey::kDefault,
                              "op_tester_profiler");
  }

  // With a non-positive repeat count nothing was timed; report 0 instead of
  // dividing by zero (or by a negative count).
  config_.runtime =
      config_.repeat > 0 ? timer.ElapsedMS() / config_.repeat : 0.0;
  LOG(INFO) << "=== Run " << config_.repeat
            << " times, latency: " << config_.runtime << " ms ===";
}
// Executes the operator once on `place_`, waits on the device context
// associated with that place, then calls DropKids() on the scope.
void OpTester::RunImpl() {
  op_->Run(*scope_, place_);
  auto *dev_ctx = platform::DeviceContextPool::Instance().Get(place_);
  dev_ctx->Wait();
  scope_->DropKids();
}
std::vector<std::string> OpTester::GetOpProtoInputNames() {
std::vector<std::string> input_names;
const framework::proto::OpProto &proto =
framework::OpInfoMap::Instance().Get(type_).Proto();
for (int i = 0; i != proto.inputs_size(); ++i) {
const auto &input = proto.inputs(i);
input_names.push_back(input.name());
}
return input_names;
}
std::vector<std::string> OpTester::GetOpProtoOutputNames() {
std::vector<std::string> output_names;
const framework::proto::OpProto &proto =
framework::OpInfoMap::Instance().Get(type_).Proto();
for (int i = 0; i != proto.outputs_size(); ++i) {
const auto &output = proto.outputs(i);
output_names.push_back(output.name());
}
return output_names;
}
void OpTester::CreateInputVarDesc() {
std::vector<std::string> input_names = GetOpProtoInputNames();
for (auto &name : input_names) {
const OpInputConfig *input = config_.GetInput(name);
if (input == nullptr) {
LOG(FATAL) << "The input " << name << " of op " << config_.op_type
<< " is not correctlly provided.";
}
std::string var_name = config_.op_type + "." + name;
framework::VarDesc *var = Var(var_name);
// Need to support more type
var->SetType(framework::proto::VarType::LOD_TENSOR);
var->SetPersistable(false);
var->SetDataType(framework::proto::VarType::FP32);
var->SetShape(input->dims);
op_desc_.SetInput(name, {var_name});
inputs_.push_back(var_name);
}
}
void OpTester::CreateOutputVarDesc() {
std::vector<std::string> output_names = GetOpProtoOutputNames();
for (auto &name : output_names) {
std::string var_name = config_.op_type + "." + name;
framework::VarDesc *var = Var(var_name);
// Need to support more type
var->SetType(framework::proto::VarType::LOD_TENSOR);
var->SetPersistable(false);
var->SetDataType(framework::proto::VarType::FP32);
op_desc_.SetOutput(name, {var_name});
outputs_.push_back(var_name);
}
}
// Returns the VarDesc registered under `name`, creating and storing a new one
// first when none exists yet. The returned pointer is owned by `vars_`.
framework::VarDesc *OpTester::Var(const std::string &name) {
  auto found = vars_.find(name);
  if (found == vars_.end()) {
    found = vars_
                .emplace(name, std::unique_ptr<framework::VarDesc>(
                                   new framework::VarDesc(name)))
                .first;
  }
  return found->second.get();
}
// Allocates `tensor` with the given `shape` on `place_` and fills it with
// values drawn uniformly from [lower, upper).
//
// Every call uses a new seed (a function-local counter starting at 100), so
// successive tensors receive different but reproducible data. For a non-CPU
// place the values are generated into a temporary CPU tensor and then copied
// to `tensor` with TensorCopySync.
template <typename T>
void OpTester::SetupTensor(framework::LoDTensor *tensor,
                           const std::vector<int64_t> &shape, T lower,
                           T upper) {
  static unsigned int seed = 100;
  std::mt19937 rng(seed++);
  std::uniform_real_distribution<double> uniform_dist(0, 1);

  // Shared fill routine. The index is 64-bit to match numel(), so element
  // counts above INT_MAX do not overflow the loop counter.
  auto fill_uniform = [&](T *data, int64_t numel) {
    for (int64_t i = 0; i < numel; ++i) {
      data[i] = static_cast<T>(uniform_dist(rng) * (upper - lower) + lower);
    }
  };

  T *ptr = tensor->mutable_data<T>(framework::make_ddim(shape), place_);
  if (platform::is_cpu_place(place_)) {
    fill_uniform(ptr, tensor->numel());
  } else {
    framework::LoDTensor cpu_tensor;
    T *cpu_ptr = cpu_tensor.mutable_data<T>(framework::make_ddim(shape),
                                            platform::CPUPlace());
    fill_uniform(cpu_ptr, cpu_tensor.numel());
    TensorCopySync(cpu_tensor, place_, tensor);
  }
}
// Materializes every described variable in `scope` and fills the input
// tensors with data.
//
// First pass: each VarDesc in `vars_` (except one named kEmptyVarName) gets a
// scope variable initialized to the described type. Second pass: every input
// tensor is allocated with its described shape and filled through
// SetupTensor<float> with bounds 0 and 1.
void OpTester::CreateVariables(framework::Scope *scope) {
  for (auto &entry : vars_) {
    auto &desc = entry.second;
    if (desc->Name() == framework::kEmptyVarName) {
      continue;
    }

    auto *variable = scope->Var(desc->Name());
    framework::InitializeVariable(variable, desc->GetType());
    if (desc->Persistable()) {
      VLOG(3) << "Create Variable " << desc->Name()
              << " global, which pointer is " << variable;
    } else {
      VLOG(3) << "Create Variable " << desc->Name()
              << " locally, which pointer is " << variable;
    }
  }

  // Allocate and fill the input tensors.
  for (auto &input_name : inputs_) {
    VLOG(3) << "Allocate memory for tensor " << input_name;
    const std::vector<int64_t> dims = vars_[input_name]->GetShape();
    auto *input_tensor =
        scope->Var(input_name)->GetMutable<framework::LoDTensor>();
    SetupTensor<float>(input_tensor, dims, 0.0f, 1.0f);
  }
}
// Returns a string of `count` space characters; a non-positive `count` yields
// an empty string.
static std::string GenSpaces(int count) {
  if (count <= 0) {
    return std::string();
  }
  return std::string(static_cast<std::string::size_type>(count), ' ');
}
// Renders the tester's variables and operator as indented, protobuf-text-like
// output: one "vars { ... }" section per entry in `vars_`, followed by a
// single "ops { ... }" section listing the op's inputs, outputs and type.
// The variable type and data type are always printed as LOD_TENSOR / FP32.
// `depth` is the current indentation width; it grows by one on every opening
// brace and shrinks by one before every closing brace.
std::string OpTester::DebugString() {
  std::stringstream out;
  int depth = 0;

  for (auto &entry : vars_) {
    auto &desc = entry.second;
    out << GenSpaces(depth++) << "vars {\n";
    out << GenSpaces(depth) << "name: \"" << desc->Name() << "\"\n";
    out << GenSpaces(depth++) << "type: {\n";
    out << GenSpaces(depth) << "type: LOD_TENSOR\n";
    out << GenSpaces(depth++) << "lod_tensor {\n";
    out << GenSpaces(depth++) << "tensor {\n";
    out << GenSpaces(depth) << "data_type: FP32\n";
    const std::vector<int64_t> dims = desc->GetShape();
    for (auto dim : dims) {
      out << GenSpaces(depth) << "dims: " << dim << "\n";
    }
    out << GenSpaces(--depth) << "}\n";  // closes tensor
    out << GenSpaces(--depth) << "}\n";  // closes lod_tensor
    out << GenSpaces(--depth) << "}\n";  // closes type
    out << GenSpaces(depth) << "persistable: " << desc->Persistable() << "\n";
    out << GenSpaces(--depth) << "}\n";  // closes vars
  }

  out << GenSpaces(depth++) << "ops {\n";
  for (auto &param : op_desc_.InputNames()) {
    out << GenSpaces(depth++) << "inputs {\n";
    out << GenSpaces(depth) << "parameters: \"" << param << "\"\n";
    out << GenSpaces(depth) << "arguments: \"" << op_desc_.Input(param)[0]
        << "\"\n";
    out << GenSpaces(--depth) << "}\n";
  }
  for (auto &param : op_desc_.OutputNames()) {
    out << GenSpaces(depth++) << "outputs {\n";
    out << GenSpaces(depth) << "parameters: \"" << param << "\"\n";
    out << GenSpaces(depth) << "arguments: \"" << op_desc_.Output(param)[0]
        << "\"\n";
    out << GenSpaces(--depth) << "}\n";
  }
  out << GenSpaces(depth) << "type: " << op_desc_.Type() << "\n";
  out << GenSpaces(--depth) << "}\n";

  return out.str();
}
// Entry point of the benchmark. When --op_config_list names a config file the
// tester is initialized from it; otherwise a built-in elementwise_add case
// with X of shape 64x64 and Y of shape 64x1 is used.
TEST(op_tester, base) {
  OpTester tester;
  if (FLAGS_op_config_list.empty()) {
    OpInputConfig x_input;
    x_input.name = "X";
    x_input.dims = {64, 64};

    OpInputConfig y_input;
    y_input.name = "Y";
    y_input.dims = {64, 1};

    OpTesterConfig config;
    config.op_type = "elementwise_add";
    config.inputs.push_back(x_input);
    config.inputs.push_back(y_input);
    tester.Init(config);
  } else {
    tester.Init(FLAGS_op_config_list);
  }
  tester.Run();
}
} // namespace benchmark
} // namespace operators
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include <vector>
#include "paddle/fluid/framework/ddim.h"
#include "paddle/fluid/framework/op_desc.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/operators/benchmark/op_tester_config.h"
namespace paddle {
namespace operators {
namespace benchmark {
// Benchmark driver for a single operator: builds the operator and its
// variables from an OpTesterConfig, runs it repeatedly and reports the
// average latency.
class OpTester {
public:
OpTester() {}
// Initializes the tester from a config file.
void Init(const std::string &filename);
// Initializes the tester from an in-memory config.
void Init(const OpTesterConfig &config);
// Performs one warm-up run, then the timed (optionally profiled) runs.
void Run();
// Returns a text dump of the variables and the operator description.
std::string DebugString();
private:
// Names of the inputs / outputs declared in the registered OpProto.
std::vector<std::string> GetOpProtoInputNames();
std::vector<std::string> GetOpProtoOutputNames();
// Create the VarDescs for the op's inputs / outputs and attach them to
// op_desc_.
void CreateInputVarDesc();
void CreateOutputVarDesc();
// Returns the VarDesc stored under `name`, creating it when missing.
framework::VarDesc *Var(const std::string &name);
// Creates all described variables in `scope` and fills the input tensors.
void CreateVariables(framework::Scope *scope);
// Allocates `input` with `shape` and fills it with uniformly distributed
// values between `lower` and `upper`.
template <typename T>
void SetupTensor(framework::LoDTensor *input,
const std::vector<int64_t> &shape, T lower, T upper);
// Runs the operator once and waits for the device to finish.
void RunImpl();
private:
// Copy of the configuration passed to Init.
OpTesterConfig config_;
// Operator type name, taken from config_.op_type.
std::string type_;
// Description the operator instance is created from.
framework::OpDesc op_desc_;
// All variable descriptions, keyed by variable name.
std::unordered_map<std::string, std::unique_ptr<framework::VarDesc>> vars_;
// Variable names of the op's inputs and outputs, in creation order.
std::vector<std::string> inputs_;
std::vector<std::string> outputs_;
// The operator under test.
std::unique_ptr<framework::OperatorBase> op_;
// Place the operator runs on.
platform::Place place_;
// Scope holding the variables the operator reads and writes.
std::unique_ptr<framework::Scope> scope_;
};
} // namespace benchmark
} // namespace operators
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/benchmark/op_tester_config.h"
#include <fstream>
#include "glog/logging.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace operators {
namespace benchmark {
static const char kStartSeparator[] = "{";
static const char kEndSeparator[] = "}";
static const char kSepBetweenItems[] = ";";
// Returns true when `str` begins with `substr`. An empty `substr` is a prefix
// of every string.
static bool StartWith(const std::string& str, const std::string& substr) {
  // Comparing only the leading substr.size() characters avoids scanning the
  // rest of `str`.
  return str.compare(0, substr.size(), substr) == 0;
}
// Returns true when `str` ends with `substr`. An empty `substr` is a suffix
// of every string.
static bool EndWith(const std::string& str, const std::string& substr) {
  // A suffix can never be longer than the string itself. Checking this first
  // also keeps the unsigned subtraction below from wrapping around.
  if (substr.length() > str.length()) {
    return false;
  }
  return str.compare(str.length() - substr.length(), substr.length(),
                     substr) == 0;
}
// Removes one trailing item separator (kSepBetweenItems) from `*str` when the
// string ends with it; otherwise leaves the string untouched.
static void EraseEndSep(std::string* str) {
  const std::string separator(kSepBetweenItems);
  if (!EndWith(*str, separator)) {
    return;
  }
  // Erase from the start of the separator through the end of the string.
  str->erase(str->length() - separator.length());
}
// Parses a dimension string such as "64x64" into its 64-bit dimensions.
//
// The tokens are separated by 'x'. An empty string yields an empty vector.
// Throws std::invalid_argument when a token does not start with a number and
// std::out_of_range when a value does not fit into a long long.
static std::vector<int64_t> ParseDims(std::string dims_str) {
  std::vector<int64_t> dims;
  std::string token;
  std::istringstream token_stream(dims_str);
  while (std::getline(token_stream, token, 'x')) {
    // Parse as 64-bit so that dimensions above INT_MAX are accepted, matching
    // the element type of the result.
    dims.push_back(static_cast<int64_t>(std::stoll(token)));
  }
  return dims;
}
// Reads one input description of the form
//   { name: X; dims: 64x64 }
// from `is`. Tokens are whitespace separated; the keys may be written with or
// without the trailing colon, and unknown tokens are skipped. Nothing is read
// beyond the first token unless it is the start separator "{".
//
// Parsing stops at the end separator "}" or as soon as the stream fails or is
// exhausted, so a truncated description cannot make the loop spin forever.
OpInputConfig::OpInputConfig(std::istream& is) {
  std::string sep;
  is >> sep;
  if (sep != kStartSeparator) {
    return;
  }

  while (is >> sep && sep != kEndSeparator) {
    if (sep == "name" || sep == "name:") {
      // Only post-process the value when one was actually read.
      if (is >> name) {
        EraseEndSep(&name);
      }
    } else if (sep == "dims" || sep == "dims:") {
      std::string dims_str;
      if (is >> dims_str) {
        dims = ParseDims(dims_str);
      }
    }
  }
}
// Opens `filename` in binary mode and parses the configuration from it.
// The PADDLE_ENFORCE check fails with "Cannot open file <filename>" when the
// stream is not in a good state after opening.
OpTesterConfig::OpTesterConfig(const std::string& filename) {
  std::ifstream config_file(filename, std::ios::in | std::ios::binary);
  const bool opened = !config_file.fail();
  PADDLE_ENFORCE(opened, "Cannot open file %s", filename.c_str());
  Init(config_file);
}
// Parses a tester configuration of the form
//   { op_type: elementwise_add device_id: -1 repeat: 10
//     input { name: X; dims: 64x64 } ... }
// from `is`. Tokens are whitespace separated; the keys may be written with or
// without the trailing colon, and unknown tokens are skipped. Every "input"
// key appends one OpInputConfig parsed from the stream. Nothing is read
// beyond the first token unless it is the start separator "{".
//
// Parsing stops at the end separator "}" or as soon as the stream fails or is
// exhausted (missing "}", non-numeric value for a numeric key, ...), so a
// malformed file cannot make the loop spin forever.
void OpTesterConfig::Init(std::istream& is) {
  std::string sep;
  is >> sep;
  if (sep != kStartSeparator) {
    return;
  }

  while (is >> sep && sep != kEndSeparator) {
    if (sep == "op_type" || sep == "op_type:") {
      is >> op_type;
    } else if (sep == "device_id" || sep == "device_id:") {
      is >> device_id;
    } else if (sep == "repeat" || sep == "repeat:") {
      is >> repeat;
    } else if (sep == "profile" || sep == "profile:") {
      is >> profile;
    } else if (sep == "print_debug_string" || sep == "print_debug_string:") {
      is >> print_debug_string;
    } else if (sep == "input" || sep == "input:") {
      inputs.emplace_back(is);
    }
  }
}
// Returns a pointer to the first configured input whose name equals `name`,
// or nullptr when there is none. The pointer refers to an element of `inputs`
// and is invalidated when that vector is modified.
const OpInputConfig* OpTesterConfig::GetInput(const std::string& name) {
  for (const auto& candidate : inputs) {
    if (candidate.name == name) {
      return &candidate;
    }
  }
  return nullptr;
}
} // namespace benchmark
} // namespace operators
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License"); Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
You may obtain a copy of the License at You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0 http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS, distributed under the License is distributed on an "AS IS" BASIS,
...@@ -12,27 +12,40 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,27 +12,40 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
/*
* This file contains the list of the ngraph operators for Paddle.
*
* ATTENTION: It requires some C++11 features, for lower version C++ or C, we
* might release another API.
*/
#pragma once #pragma once
#include "ops/accuracy_op.h" #include <istream>
#include "ops/activation_op.h" #include <string>
#include "ops/batch_norm_op.h" #include <vector>
#include "ops/binary_unary_op.h"
#include "ops/conv2d_op.h" namespace paddle {
#include "ops/cross_entropy_op.h" namespace operators {
#include "ops/elementwise_add_op.h" namespace benchmark {
#include "ops/fill_constant_op.h"
#include "ops/mean_op.h" struct OpInputConfig {
#include "ops/mul_op.h" OpInputConfig() {}
#include "ops/pool2d_op.h" explicit OpInputConfig(std::istream& is);
#include "ops/scale_op.h"
#include "ops/softmax_op.h" std::string name;
#include "ops/sum_op.h" std::vector<int64_t> dims;
#include "ops/top_k_op.h" };
struct OpTesterConfig {
OpTesterConfig() {}
explicit OpTesterConfig(const std::string& filename);
void Init(std::istream& is);
const OpInputConfig* GetInput(const std::string& name);
std::string op_type;
std::vector<OpInputConfig> inputs;
int device_id{-1}; // CPU: -1
int repeat{1};
int profile{0};
int print_debug_string{0};
double runtime{0.0};
};
} // namespace benchmark
} // namespace operators
} // namespace paddle
...@@ -52,7 +52,7 @@ class GetPlacesOp : public framework::OperatorBase { ...@@ -52,7 +52,7 @@ class GetPlacesOp : public framework::OperatorBase {
device_count = device_count =
is_gpu ? CUDADevCount() : std::thread::hardware_concurrency(); is_gpu ? CUDADevCount() : std::thread::hardware_concurrency();
} }
PADDLE_ENFORCE_NE(device_count, 0, "Cannot indicate %s device count", PADDLE_ENFORCE_NE(device_count, 0UL, "Cannot indicate %s device count",
is_gpu ? "GPU" : "CPU"); is_gpu ? "GPU" : "CPU");
auto out_var_name = Output("Out"); auto out_var_name = Output("Out");
......
...@@ -84,12 +84,12 @@ class CRFDecodingOp : public framework::OperatorWithKernel { ...@@ -84,12 +84,12 @@ class CRFDecodingOp : public framework::OperatorWithKernel {
"Output(ViterbiPath) should be not null."); "Output(ViterbiPath) should be not null.");
auto emission_dims = ctx->GetInputDim("Emission"); auto emission_dims = ctx->GetInputDim("Emission");
PADDLE_ENFORCE_EQ(emission_dims.size(), 2UL, PADDLE_ENFORCE_EQ(emission_dims.size(), 2,
"The Input(Emission) should be a 2-D tensor."); "The Input(Emission) should be a 2-D tensor.");
PADDLE_ENFORCE(emission_dims[0], "An empty mini-batch is not allowed."); PADDLE_ENFORCE(emission_dims[0], "An empty mini-batch is not allowed.");
auto transition_dims = ctx->GetInputDim("Transition"); auto transition_dims = ctx->GetInputDim("Transition");
PADDLE_ENFORCE_EQ(transition_dims.size(), 2UL, PADDLE_ENFORCE_EQ(transition_dims.size(), 2,
"The Input(Transition) should be a 2-D tensor."); "The Input(Transition) should be a 2-D tensor.");
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
transition_dims[0] - 2, transition_dims[1], transition_dims[0] - 2, transition_dims[1],
......
...@@ -85,7 +85,7 @@ class AnchorGeneratorOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -85,7 +85,7 @@ class AnchorGeneratorOpMaker : public framework::OpProtoAndCheckerMaker {
" For instance, the anchor size of 64 means the area of this anchor " " For instance, the anchor size of 64 means the area of this anchor "
"equals to 64**2.") "equals to 64**2.")
.AddCustomChecker([](const std::vector<float>& anchor_sizes) { .AddCustomChecker([](const std::vector<float>& anchor_sizes) {
PADDLE_ENFORCE_GT(anchor_sizes.size(), 0, PADDLE_ENFORCE_GT(anchor_sizes.size(), 0UL,
"Size of anchor_sizes must be at least 1."); "Size of anchor_sizes must be at least 1.");
for (size_t i = 0; i < anchor_sizes.size(); ++i) { for (size_t i = 0; i < anchor_sizes.size(); ++i) {
PADDLE_ENFORCE_GT(anchor_sizes[i], 0.0, PADDLE_ENFORCE_GT(anchor_sizes[i], 0.0,
...@@ -103,7 +103,7 @@ class AnchorGeneratorOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -103,7 +103,7 @@ class AnchorGeneratorOpMaker : public framework::OpProtoAndCheckerMaker {
"(vector<float>) List of variances to be used " "(vector<float>) List of variances to be used "
"in box regression deltas") "in box regression deltas")
.AddCustomChecker([](const std::vector<float>& variances) { .AddCustomChecker([](const std::vector<float>& variances) {
PADDLE_ENFORCE_EQ(variances.size(), 4, PADDLE_ENFORCE_EQ(variances.size(), 4UL,
"Must and only provide 4 variance."); "Must and only provide 4 variance.");
for (size_t i = 0; i < variances.size(); ++i) { for (size_t i = 0; i < variances.size(); ++i) {
PADDLE_ENFORCE_GT(variances[i], 0.0, PADDLE_ENFORCE_GT(variances[i], 0.0,
...@@ -117,7 +117,7 @@ class AnchorGeneratorOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -117,7 +117,7 @@ class AnchorGeneratorOpMaker : public framework::OpProtoAndCheckerMaker {
.SetDefault(std::vector<float>(2, 16.0)) .SetDefault(std::vector<float>(2, 16.0))
.AddCustomChecker([](const std::vector<float>& stride) { .AddCustomChecker([](const std::vector<float>& stride) {
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
stride.size(), 2, stride.size(), 2UL,
"Must and only provide 2 stride for width and height."); "Must and only provide 2 stride for width and height.");
for (size_t i = 0; i < stride.size(); ++i) { for (size_t i = 0; i < stride.size(); ++i) {
PADDLE_ENFORCE_GT(stride[i], 0.0, PADDLE_ENFORCE_GT(stride[i], 0.0,
......
...@@ -80,7 +80,7 @@ VarHandlePtr BRPCClient::AsyncSendVar(const std::string& ep, ...@@ -80,7 +80,7 @@ VarHandlePtr BRPCClient::AsyncSendVar(const std::string& ep,
google::protobuf::Closure* done = brpc::NewCallback( google::protobuf::Closure* done = brpc::NewCallback(
&HandleSendResponse, cntl, response, var_h, ch_ptr, ch_ctx, this); &HandleSendResponse, cntl, response, var_h, ch_ptr, ch_ctx, this);
platform::RecordRPCEvent record_event(method, p_ctx); platform::RecordRPCEvent record_event(method);
ch_ctx->stub->SendVariable(cntl, &request, response, done); ch_ctx->stub->SendVariable(cntl, &request, response, done);
...@@ -184,7 +184,7 @@ VarHandlePtr BRPCClient::_AsyncGetVar(const std::string& ep, ...@@ -184,7 +184,7 @@ VarHandlePtr BRPCClient::_AsyncGetVar(const std::string& ep,
google::protobuf::Closure* done = brpc::NewCallback( google::protobuf::Closure* done = brpc::NewCallback(
&HandleGetResponse, cntl, response, var_h, ch_ptr, ch_ctx, this); &HandleGetResponse, cntl, response, var_h, ch_ptr, ch_ctx, this);
platform::RecordRPCEvent record_event(method, p_ctx); platform::RecordRPCEvent record_event(method);
if (method_name == kGetMonomerRPC) { if (method_name == kGetMonomerRPC) {
ch_ctx->stub->GetMonomerVariable(cntl, &req, response, done); ch_ctx->stub->GetMonomerVariable(cntl, &req, response, done);
...@@ -272,7 +272,7 @@ VarHandlePtr BRPCClient::AsyncPrefetchVar(const std::string& ep, ...@@ -272,7 +272,7 @@ VarHandlePtr BRPCClient::AsyncPrefetchVar(const std::string& ep,
&cntl->request_attachment(), out_var_name_val, &cntl->request_attachment(), out_var_name_val,
false, 0, table_name_val); false, 0, table_name_val);
platform::RecordRPCEvent record_event(method, p_ctx); platform::RecordRPCEvent record_event(method);
google::protobuf::Closure* done = brpc::NewCallback( google::protobuf::Closure* done = brpc::NewCallback(
&HandleGetResponse, cntl, response, var_h, ch_ptr, ch_ctx, this); &HandleGetResponse, cntl, response, var_h, ch_ptr, ch_ctx, this);
...@@ -311,7 +311,7 @@ VarHandlePtr BRPCClient::AsyncSendFetchBarrier(const std::string& ep, ...@@ -311,7 +311,7 @@ VarHandlePtr BRPCClient::AsyncSendFetchBarrier(const std::string& ep,
VarHandlePtr var_h( VarHandlePtr var_h(
new VarHandle(ep, method, FETCH_BARRIER_MESSAGE, nullptr, nullptr)); new VarHandle(ep, method, FETCH_BARRIER_MESSAGE, nullptr, nullptr));
platform::RecordRPCEvent record_event(method, nullptr); platform::RecordRPCEvent record_event(method);
google::protobuf::Closure* done = brpc::NewCallback( google::protobuf::Closure* done = brpc::NewCallback(
&HandleFetchBarrierResponse, cntl, response, var_h, ch_ptr, ch_ctx, this); &HandleFetchBarrierResponse, cntl, response, var_h, ch_ptr, ch_ctx, this);
...@@ -406,7 +406,7 @@ VarHandlePtr BRPCClient::AsyncSendVarMessage( ...@@ -406,7 +406,7 @@ VarHandlePtr BRPCClient::AsyncSendVarMessage(
sendrecv::VoidMessage* response = new sendrecv::VoidMessage(); sendrecv::VoidMessage* response = new sendrecv::VoidMessage();
cntl->set_timeout_ms(time_out); cntl->set_timeout_ms(time_out);
platform::RecordRPCEvent record_event(method_name, nullptr); platform::RecordRPCEvent record_event(method_name);
VarHandlePtr var_h( VarHandlePtr var_h(
new VarHandle(ep, method_name, req.varname(), nullptr, nullptr)); new VarHandle(ep, method_name, req.varname(), nullptr, nullptr));
......
...@@ -89,7 +89,7 @@ VarHandlePtr GRPCClient::AsyncSendVar(const std::string& ep, ...@@ -89,7 +89,7 @@ VarHandlePtr GRPCClient::AsyncSendVar(const std::string& ep,
// stub context // stub context
s->response_call_back_ = nullptr; s->response_call_back_ = nullptr;
platform::RecordRPCEvent record_event(method, p_ctx); platform::RecordRPCEvent record_event(method);
auto call = s->stub_g_.PrepareUnaryCall( auto call = s->stub_g_.PrepareUnaryCall(
s->context_.get(), "/sendrecv.SendRecvService/SendVariable", req, &cq_); s->context_.get(), "/sendrecv.SendRecvService/SendVariable", req, &cq_);
...@@ -184,7 +184,7 @@ VarHandlePtr GRPCClient::_AsyncGetVar( ...@@ -184,7 +184,7 @@ VarHandlePtr GRPCClient::_AsyncGetVar(
// stub context // stub context
s->response_call_back_ = ProcGetResponse; s->response_call_back_ = ProcGetResponse;
platform::RecordRPCEvent record_event(method, p_ctx); platform::RecordRPCEvent record_event(method);
auto call = auto call =
s->stub_g_.PrepareUnaryCall(s->context_.get(), rpc_path, buf, &cq_); s->stub_g_.PrepareUnaryCall(s->context_.get(), rpc_path, buf, &cq_);
...@@ -235,7 +235,7 @@ VarHandlePtr GRPCClient::AsyncPrefetchVar(const std::string& ep, ...@@ -235,7 +235,7 @@ VarHandlePtr GRPCClient::AsyncPrefetchVar(const std::string& ep,
// stub context // stub context
s->response_call_back_ = ProcGetResponse; s->response_call_back_ = ProcGetResponse;
platform::RecordRPCEvent record_event(method, p_ctx); platform::RecordRPCEvent record_event(method);
auto call = s->stub_g_.PrepareUnaryCall( auto call = s->stub_g_.PrepareUnaryCall(
s->context_.get(), "/sendrecv.SendRecvService/PrefetchVariable", req, s->context_.get(), "/sendrecv.SendRecvService/PrefetchVariable", req,
...@@ -265,7 +265,7 @@ VarHandlePtr GRPCClient::AsyncSendBatchBarrier(const std::string& ep, ...@@ -265,7 +265,7 @@ VarHandlePtr GRPCClient::AsyncSendBatchBarrier(const std::string& ep,
sendrecv::VariableMessage req; sendrecv::VariableMessage req;
req.set_varname(BATCH_BARRIER_MESSAGE); req.set_varname(BATCH_BARRIER_MESSAGE);
platform::RecordRPCEvent record_event(method, nullptr); platform::RecordRPCEvent record_event(method);
auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_); auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_);
rpc->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s)); rpc->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
...@@ -290,7 +290,7 @@ VarHandlePtr GRPCClient::AsyncSendFetchBarrier(const std::string& ep, ...@@ -290,7 +290,7 @@ VarHandlePtr GRPCClient::AsyncSendFetchBarrier(const std::string& ep,
sendrecv::VariableMessage req; sendrecv::VariableMessage req;
req.set_varname(FETCH_BARRIER_MESSAGE); req.set_varname(FETCH_BARRIER_MESSAGE);
platform::RecordRPCEvent record_event(method, nullptr); platform::RecordRPCEvent record_event(method);
auto rpc = s->stub_->AsyncGetVariable(s->context_.get(), req, &cq_); auto rpc = s->stub_->AsyncGetVariable(s->context_.get(), req, &cq_);
rpc->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s)); rpc->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
...@@ -317,7 +317,7 @@ VarHandlePtr GRPCClient::AsyncGetMonomerBarrier(const std::string& ep, ...@@ -317,7 +317,7 @@ VarHandlePtr GRPCClient::AsyncGetMonomerBarrier(const std::string& ep,
sendrecv::VariableMessage req; sendrecv::VariableMessage req;
req.set_varname(var_name); req.set_varname(var_name);
platform::RecordRPCEvent record_event(method, nullptr); platform::RecordRPCEvent record_event(method);
auto rpc = s->stub_->AsyncGetMonomerBarrier(s->context_.get(), req, &cq_); auto rpc = s->stub_->AsyncGetMonomerBarrier(s->context_.get(), req, &cq_);
rpc->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s)); rpc->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
...@@ -342,7 +342,7 @@ VarHandlePtr GRPCClient::AsyncSendComplete(const std::string& ep, ...@@ -342,7 +342,7 @@ VarHandlePtr GRPCClient::AsyncSendComplete(const std::string& ep,
sendrecv::VariableMessage req; sendrecv::VariableMessage req;
req.set_varname(COMPLETE_MESSAGE); req.set_varname(COMPLETE_MESSAGE);
platform::RecordRPCEvent record_event(method, nullptr); platform::RecordRPCEvent record_event(method);
auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_); auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_);
rpc->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s)); rpc->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
...@@ -372,7 +372,7 @@ VarHandlePtr GRPCClient::AsyncCheckpointNotify(const std::string& ep, ...@@ -372,7 +372,7 @@ VarHandlePtr GRPCClient::AsyncCheckpointNotify(const std::string& ep,
req.set_varname(CHECKPOINT_SAVE_MESSAGE); req.set_varname(CHECKPOINT_SAVE_MESSAGE);
req.set_out_varname(dir); req.set_out_varname(dir);
platform::RecordRPCEvent record_event(method, nullptr); platform::RecordRPCEvent record_event(method);
auto rpc = s->stub_->AsyncCheckpointNotify(s->context_.get(), req, &cq_); auto rpc = s->stub_->AsyncCheckpointNotify(s->context_.get(), req, &cq_);
rpc->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s)); rpc->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
......
...@@ -38,7 +38,7 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var, ...@@ -38,7 +38,7 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
::grpc::ByteBuffer* msg, const std::string& out_name, ::grpc::ByteBuffer* msg, const std::string& out_name,
const int trainer_id, const int trainer_id,
const std::string& table_name) { const std::string& table_name) {
platform::RecordRPCEvent record_event("serial", &ctx); platform::RecordRPCEvent record_event("serial");
VarMsg request; VarMsg request;
TensorPayload* payload = nullptr; TensorPayload* payload = nullptr;
...@@ -147,7 +147,7 @@ void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg, ...@@ -147,7 +147,7 @@ void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg,
const platform::DeviceContext& ctx, const platform::DeviceContext& ctx,
const framework::Scope* scope, const framework::Scope* scope,
framework::Variable** var, int* trainer_id) { framework::Variable** var, int* trainer_id) {
platform::RecordRPCEvent record_event("deserial", &ctx); platform::RecordRPCEvent record_event("deserial");
operators::distributed::GRPCVariableResponse resp(scope, &ctx); operators::distributed::GRPCVariableResponse resp(scope, &ctx);
PADDLE_ENFORCE(resp.Parse(msg) == 0, "parse bytebuffer to tensor error!"); PADDLE_ENFORCE(resp.Parse(msg) == 0, "parse bytebuffer to tensor error!");
*var = resp.GetVar(); *var = resp.GetVar();
......
...@@ -47,7 +47,7 @@ void FCOp::InferShape(framework::InferShapeContext* ctx) const { ...@@ -47,7 +47,7 @@ void FCOp::InferShape(framework::InferShapeContext* ctx) const {
PADDLE_ENFORCE(in_dims.size() == 2 || in_dims.size() == 4, PADDLE_ENFORCE(in_dims.size() == 2 || in_dims.size() == 4,
"Fully Connected input should be 2-D or 4-D tensor."); "Fully Connected input should be 2-D or 4-D tensor.");
} }
PADDLE_ENFORCE_EQ(w_dims.size(), 2UL, PADDLE_ENFORCE_EQ(w_dims.size(), 2,
"Fully Connected input should be 2-D tensor."); "Fully Connected input should be 2-D tensor.");
int in_num_col_dims = ctx->Attrs().Get<int>("in_num_col_dims"); int in_num_col_dims = ctx->Attrs().Get<int>("in_num_col_dims");
PADDLE_ENFORCE_GT( PADDLE_ENFORCE_GT(
......
...@@ -47,10 +47,11 @@ struct EmbeddingVSumFunctor { ...@@ -47,10 +47,11 @@ struct EmbeddingVSumFunctor {
auto *output = output_t->mutable_data<T>(context.GetPlace()); auto *output = output_t->mutable_data<T>(context.GetPlace());
PADDLE_ENFORCE_LE(table_width * idx_width, out_width); PADDLE_ENFORCE_LE(table_width * idx_width, out_width);
PADDLE_ENFORCE_GT(ids_lod.size(), 1UL);
jit::emb_seq_pool_attr_t attr(table_height, table_width, 0, idx_width, jit::emb_seq_pool_attr_t attr(table_height, table_width, 0, idx_width,
out_width, jit::SeqPoolType::kSum); out_width, jit::SeqPoolType::kSum);
for (int64_t i = 0; i != ids_lod.size() - 1; ++i) { for (size_t i = 0; i != ids_lod.size() - 1; ++i) {
attr.index_height = ids_lod[i + 1] - ids_lod[i]; attr.index_height = ids_lod[i + 1] - ids_lod[i];
auto emb_seqpool = jit::Get<jit::kEmbSeqPool, jit::EmbSeqPoolTuples<T>, auto emb_seqpool = jit::Get<jit::kEmbSeqPool, jit::EmbSeqPoolTuples<T>,
platform::CPUPlace>(attr); platform::CPUPlace>(attr);
......
...@@ -37,7 +37,7 @@ void FusionRepeatedFCReluOp::InferShape( ...@@ -37,7 +37,7 @@ void FusionRepeatedFCReluOp::InferShape(
"Output(Out) of FusionRepeatedFCReluOp should not be null."); "Output(Out) of FusionRepeatedFCReluOp should not be null.");
auto i_dims = ctx->GetInputDim("X"); auto i_dims = ctx->GetInputDim("X");
PADDLE_ENFORCE_EQ(i_dims.size(), 2UL, "Input shape size should be 2"); PADDLE_ENFORCE_EQ(i_dims.size(), 2, "Input shape size should be 2");
auto w_dims = ctx->GetInputsDim("W"); auto w_dims = ctx->GetInputsDim("W");
auto b_dims = ctx->GetInputsDim("Bias"); auto b_dims = ctx->GetInputsDim("Bias");
...@@ -49,7 +49,7 @@ void FusionRepeatedFCReluOp::InferShape( ...@@ -49,7 +49,7 @@ void FusionRepeatedFCReluOp::InferShape(
"inpute width should be equal with weight height"); "inpute width should be equal with weight height");
for (size_t i = 1; i < sz; ++i) { for (size_t i = 1; i < sz; ++i) {
PADDLE_ENFORCE_EQ(w_dims[i].size(), 2UL, PADDLE_ENFORCE_EQ(w_dims[i].size(), 2,
"Every weight shape size should be 2."); "Every weight shape size should be 2.");
PADDLE_ENFORCE_EQ(framework::product(b_dims[i]), w_dims[i][1], PADDLE_ENFORCE_EQ(framework::product(b_dims[i]), w_dims[i][1],
"The length of Bias must be equal with w_dims[1]."); "The length of Bias must be equal with w_dims[1].");
......
...@@ -39,7 +39,7 @@ void FusionSeqExpandConcatFCOp::InferShape( ...@@ -39,7 +39,7 @@ void FusionSeqExpandConcatFCOp::InferShape(
auto ins_dims = ctx->GetInputsDim("X"); auto ins_dims = ctx->GetInputsDim("X");
auto w_dims = ctx->GetInputDim("FCWeight"); // (M0+M1+M2+..) x D auto w_dims = ctx->GetInputDim("FCWeight"); // (M0+M1+M2+..) x D
PADDLE_ENFORCE_EQ(w_dims.size(), 2UL, "Input(FCWeight)'s rank must be 2."); PADDLE_ENFORCE_EQ(w_dims.size(), 2, "Input(FCWeight)'s rank must be 2.");
const int D = w_dims[1]; const int D = w_dims[1];
int sum = ins_dims[0][1]; int sum = ins_dims[0][1];
for (size_t i = 1; i < ins_dims.size(); ++i) { for (size_t i = 1; i < ins_dims.size(); ++i) {
......
...@@ -39,7 +39,7 @@ void FusionSeqPoolConcatOp::InferShape( ...@@ -39,7 +39,7 @@ void FusionSeqPoolConcatOp::InferShape(
// The output height should be confirmed in Compute, // The output height should be confirmed in Compute,
// since input lod is not accessible here. // since input lod is not accessible here.
PADDLE_ENFORCE_EQ(ins_dims[0].size(), 2UL, PADDLE_ENFORCE_EQ(ins_dims[0].size(), 2,
"The dims size of first input should be 2."); "The dims size of first input should be 2.");
ctx->SetOutputDim("Out", {-1, ins_dims[0][axis] * static_cast<int>(n)}); ctx->SetOutputDim("Out", {-1, ins_dims[0][axis] * static_cast<int>(n)});
} }
......
...@@ -42,7 +42,7 @@ void FusionSquaredMatSubOp::InferShape( ...@@ -42,7 +42,7 @@ void FusionSquaredMatSubOp::InferShape(
auto y_dims = ctx->GetInputDim("Y"); auto y_dims = ctx->GetInputDim("Y");
PADDLE_ENFORCE_EQ(x_dims.size(), y_dims.size(), PADDLE_ENFORCE_EQ(x_dims.size(), y_dims.size(),
"Input tensors dims size should be equal."); "Input tensors dims size should be equal.");
PADDLE_ENFORCE_EQ(x_dims.size(), 2UL, "Input tensors should be a Matrix."); PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input tensors should be a Matrix.");
PADDLE_ENFORCE_EQ(x_dims[1], y_dims[0], "Inputs Matrix should be multiply."); PADDLE_ENFORCE_EQ(x_dims[1], y_dims[0], "Inputs Matrix should be multiply.");
ctx->SetOutputDim("SquaredX", x_dims); ctx->SetOutputDim("SquaredX", x_dims);
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License"); Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
* You may obtain a copy of the License at You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0 http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and See the License for the specific language governing permissions and
* limitations under the License. */ limitations under the License. */
#include <random> #include <random>
#include <string> #include <string>
...@@ -259,7 +259,7 @@ struct TestFuncWithRefer<jit::SeqPoolTuples<T>, std::vector<T>, std::vector<T>, ...@@ -259,7 +259,7 @@ struct TestFuncWithRefer<jit::SeqPoolTuples<T>, std::vector<T>, std::vector<T>,
const std::vector<T>& x, const std::vector<T>& yref, const std::vector<T>& x, const std::vector<T>& yref,
const typename jit::SeqPoolTuples<T>::attr_type& attr) { const typename jit::SeqPoolTuples<T>::attr_type& attr) {
EXPECT_TRUE(tgt != nullptr); EXPECT_TRUE(tgt != nullptr);
EXPECT_EQ(x.size() % yref.size(), 0); EXPECT_EQ(x.size() % yref.size(), static_cast<size_t>(0));
int w = yref.size(); int w = yref.size();
std::vector<T> y(w); std::vector<T> y(w);
const T* x_data = x.data(); const T* x_data = x.data();
......
...@@ -44,11 +44,11 @@ class LayerNormOp : public framework::OperatorWithKernel { ...@@ -44,11 +44,11 @@ class LayerNormOp : public framework::OperatorWithKernel {
int left = static_cast<int>(matrix_dim[0]); int left = static_cast<int>(matrix_dim[0]);
int right = static_cast<int>(matrix_dim[1]); int right = static_cast<int>(matrix_dim[1]);
if (ctx->HasInput("Scale")) { if (ctx->HasInput("Scale")) {
PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale").size(), 1UL); PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale").size(), 1);
PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale")[0], right); PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale")[0], right);
} }
if (ctx->HasInput("Bias")) { if (ctx->HasInput("Bias")) {
PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias").size(), 1UL); PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias").size(), 1);
PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias")[0], right); PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias")[0], right);
} }
......
...@@ -144,12 +144,12 @@ class LinearChainCRFOp : public framework::OperatorWithKernel { ...@@ -144,12 +144,12 @@ class LinearChainCRFOp : public framework::OperatorWithKernel {
"Output(LogLikelihood) should be not null."); "Output(LogLikelihood) should be not null.");
auto emission_dims = ctx->GetInputDim("Emission"); auto emission_dims = ctx->GetInputDim("Emission");
PADDLE_ENFORCE_EQ(emission_dims.size(), 2UL, PADDLE_ENFORCE_EQ(emission_dims.size(), 2,
"The Input(Emission) should be a 2-D tensor."); "The Input(Emission) should be a 2-D tensor.");
PADDLE_ENFORCE(emission_dims[0], "An empty mini-batch is not allowed."); PADDLE_ENFORCE(emission_dims[0], "An empty mini-batch is not allowed.");
auto transition_dims = ctx->GetInputDim("Transition"); auto transition_dims = ctx->GetInputDim("Transition");
PADDLE_ENFORCE_EQ(transition_dims.size(), 2UL, PADDLE_ENFORCE_EQ(transition_dims.size(), 2,
"The Input(Transition) should be a 2-D tensor."); "The Input(Transition) should be a 2-D tensor.");
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
transition_dims[0] - 2, transition_dims[1], transition_dims[0] - 2, transition_dims[1],
...@@ -202,13 +202,13 @@ class LinearChainCRFGradOp : public framework::OperatorWithKernel { ...@@ -202,13 +202,13 @@ class LinearChainCRFGradOp : public framework::OperatorWithKernel {
"Input(LogLikelihood@GRAD) shoudl be not null."); "Input(LogLikelihood@GRAD) shoudl be not null.");
auto emission_exps_dims = ctx->GetInputDim("EmissionExps"); auto emission_exps_dims = ctx->GetInputDim("EmissionExps");
PADDLE_ENFORCE_EQ(emission_exps_dims.size(), 2UL, PADDLE_ENFORCE_EQ(emission_exps_dims.size(), 2,
"The Input(EmissionExps) should be a 2-D tensor."); "The Input(EmissionExps) should be a 2-D tensor.");
PADDLE_ENFORCE(emission_exps_dims[0], PADDLE_ENFORCE(emission_exps_dims[0],
"An empty mini-batch is not allowed."); "An empty mini-batch is not allowed.");
auto transition_exps_dims = ctx->GetInputDim("TransitionExps"); auto transition_exps_dims = ctx->GetInputDim("TransitionExps");
PADDLE_ENFORCE_EQ(transition_exps_dims.size(), 2UL, PADDLE_ENFORCE_EQ(transition_exps_dims.size(), 2,
"The Input(TransitionExps) should be a 2-D tensor."); "The Input(TransitionExps) should be a 2-D tensor.");
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
transition_exps_dims[0] - 2, transition_exps_dims[1], transition_exps_dims[0] - 2, transition_exps_dims[1],
......
...@@ -184,6 +184,9 @@ class Blas { ...@@ -184,6 +184,9 @@ class Blas {
template <typename T> template <typename T>
void VINV(int n, const T* a, T* y) const; void VINV(int n, const T* a, T* y) const;
// Vectorized error function over `n` elements: reads from `a`, writes to `y`.
// `mode` is handed to the backend routine; see the per-device definitions for
// how (and whether) it is used.
template <typename T>
void VMERF(int n, const T* a, T* y, int64_t mode) const;
private: private:
const DeviceContext& context_; const DeviceContext& context_;
}; };
...@@ -290,6 +293,11 @@ class BlasT : private Blas<DeviceContext> { ...@@ -290,6 +293,11 @@ class BlasT : private Blas<DeviceContext> {
Base()->template VINV<T>(args...); Base()->template VINV<T>(args...);
} }
// Typed convenience wrapper: forwards all arguments unchanged to
// Blas<DeviceContext>::VMERF<T>, with T fixed by this BlasT instantiation.
template <typename... ARGS>
void VMERF(ARGS... args) const {
Base()->template VMERF<T>(args...);
}
private: private:
const Blas<DeviceContext>* Base() const { const Blas<DeviceContext>* Base() const {
return static_cast<const Blas<DeviceContext>*>(this); return static_cast<const Blas<DeviceContext>*>(this);
......
...@@ -123,6 +123,11 @@ struct CBlas<float> { ...@@ -123,6 +123,11 @@ struct CBlas<float> {
static void VINV(ARGS... args) { static void VINV(ARGS... args) {
platform::dynload::vsInv(args...); platform::dynload::vsInv(args...);
} }
// Single-precision VMERF: forwards every argument unchanged to the dynamically
// loaded vmsErf routine.
template <typename... ARGS>
static void VMERF(ARGS... args) {
platform::dynload::vmsErf(args...);
}
}; };
template <> template <>
...@@ -223,6 +228,11 @@ struct CBlas<double> { ...@@ -223,6 +228,11 @@ struct CBlas<double> {
static void VINV(ARGS... args) { static void VINV(ARGS... args) {
platform::dynload::vdInv(args...); platform::dynload::vdInv(args...);
} }
// Double-precision VMERF: forwards every argument unchanged to the dynamically
// loaded vmdErf routine.
template <typename... ARGS>
static void VMERF(ARGS... args) {
platform::dynload::vmdErf(args...);
}
}; };
#else #else
...@@ -625,6 +635,19 @@ void Blas<DeviceContext>::VINV(int n, const T *a, T *y) const { ...@@ -625,6 +635,19 @@ void Blas<DeviceContext>::VINV(int n, const T *a, T *y) const {
#endif #endif
} }
// CPU implementation of the element-wise error function: y[k] = erf(a[k]) for
// every k in [0, n).
//
// When PADDLE_WITH_MKLML is defined the whole call, including `mode`, is
// delegated to CBlas<T>::VMERF. Otherwise a scalar std::erf loop is used and
// `mode` is ignored. A non-positive `n` leaves `y` untouched in the fallback.
template <>
template <typename T>
void Blas<platform::CPUDeviceContext>::VMERF(int n, const T *a, T *y,
                                             int64_t mode) const {
#ifdef PADDLE_WITH_MKLML
  CBlas<T>::VMERF(n, a, y, mode);
#else
  const T *src = a;
  T *dst = y;
  // Walk both arrays in lock-step, one element per iteration.
  for (int remaining = n; remaining > 0; --remaining) {
    *dst++ = std::erf(*src++);
  }
#endif
}
} // namespace math } // namespace math
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -52,11 +52,6 @@ class MKLDNNActivationKernel ...@@ -52,11 +52,6 @@ class MKLDNNActivationKernel
"Wrong layout/format set for Input x tensor"); "Wrong layout/format set for Input x tensor");
Functor functor; Functor functor;
auto attrs = functor.GetAttrs();
for (auto &attr : attrs) {
*attr.second = ctx.Attr<float>(attr.first);
}
functor(ctx); functor(ctx);
} }
}; };
...@@ -76,11 +71,6 @@ class MKLDNNActivationGradKernel ...@@ -76,11 +71,6 @@ class MKLDNNActivationGradKernel
"is_test attribute should be set to False in training phase."); "is_test attribute should be set to False in training phase.");
Functor functor; Functor functor;
auto attrs = functor.GetAttrs();
for (auto &attr : attrs) {
*attr.second = ctx.Attr<float>(attr.first);
}
functor(ctx); functor(ctx);
} }
}; };
......
...@@ -2,4 +2,5 @@ if(WITH_NGRAPH) ...@@ -2,4 +2,5 @@ if(WITH_NGRAPH)
cc_library(ngraph_bridge SRCS ngraph_bridge.cc DEPS operator framework_proto ngraph) cc_library(ngraph_bridge SRCS ngraph_bridge.cc DEPS operator framework_proto ngraph)
cc_library(ngraph_engine SRCS ngraph_engine.cc DEPS ngraph_bridge framework_proto) cc_library(ngraph_engine SRCS ngraph_engine.cc DEPS ngraph_bridge framework_proto)
op_library(ngraph_engine_op DEPS ngraph_engine op_registry op_info device_context) op_library(ngraph_engine_op DEPS ngraph_engine op_registry op_info device_context)
add_subdirectory(ops)
endif() endif()
...@@ -19,49 +19,21 @@ limitations under the License. */ ...@@ -19,49 +19,21 @@ limitations under the License. */
#include "ngraph/ngraph.hpp" #include "ngraph/ngraph.hpp"
#include "paddle/fluid/operators/ngraph/ngraph_bridge.h" #include "paddle/fluid/operators/ngraph/ngraph_bridge.h"
#include "paddle/fluid/operators/ngraph/ngraph_ops.h" #include "paddle/fluid/operators/ngraph/ngraph_ops.h"
#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/ngraph_helper.h" #include "paddle/fluid/platform/ngraph_helper.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace NG_OPS = paddle::operators::ngraphs; bool NgraphBridge::isRegister(const std::string& str) {
std::map<std::string, return ops::NgraphSingleton::Lookup(str);
std::function<void(const std::shared_ptr<framework::OperatorBase>&, }
std::shared_ptr<std::unordered_map<
std::string, std::shared_ptr<ngraph::Node>>>)>>
NgraphBridge::NG_NODE_MAP = {
{"accuracy", NG_OPS::BuildAccuracyNode},
{"conv2d", NG_OPS::BuildConv2dNode},
{"conv2d_grad", NG_OPS::BuildConv2dGradNode},
{"batch_norm", NG_OPS::BuildBatchNormNode},
{"batch_norm_grad", NG_OPS::BuildBatchNormGradNode},
{"cross_entropy", NG_OPS::BuildCrossEntropyNode},
{"cross_entropy_grad", NG_OPS::BuildCrossEntropyGradNode},
{"elementwise_add", NG_OPS::BuildElementwiseAddNode},
{"elementwise_add_grad", NG_OPS::BuildElementwiseAddGradNode},
{"fill_constant", NG_OPS::BuildFillConstantNode},
{"mean", NG_OPS::BuildMeanNode},
{"mean_grad", NG_OPS::BuildMeanGradNode},
{"mul", NG_OPS::BuildMulNode},
{"mul_grad", NG_OPS::BuildMulGradNode},
{"pool2d", NG_OPS::BuildPool2dNode},
{"pool2d_grad", NG_OPS::BuildPool2dGradNode},
{"softmax", NG_OPS::BuildSoftmaxNode},
{"softmax_grad", NG_OPS::BuildSoftmaxGradNode},
{"scale", NG_OPS::BuildScaleNode},
{"sigmoid", NG_OPS::BuildUnaryNode<ngraph::op::Sigmoid>},
{"sum", NG_OPS::BuildSumNode},
{"relu", NG_OPS::BuildUnaryNode<ngraph::op::Relu>},
{"relu_grad", NG_OPS::BuildReluGradNode},
{"tanh", NG_OPS::BuildUnaryNode<ngraph::op::Tanh>},
{"tanh_grad", NG_OPS::BuildTanhGradNode},
{"top_k", NG_OPS::BuildTopKNode}};
void NgraphBridge::BuildNgNode( void NgraphBridge::BuildNgNode(
const std::shared_ptr<framework::OperatorBase>& op) { const std::shared_ptr<framework::OperatorBase>& op) {
auto& op_type = op->Type(); auto& op_type = op->Type();
NG_NODE_MAP[op_type](op, ngb_node_map_); ops::NgraphSingleton::BuildNode(ngb_node_map_, op, op_type);
} }
} // namespace operators } // namespace operators
......
...@@ -28,13 +28,6 @@ namespace operators { ...@@ -28,13 +28,6 @@ namespace operators {
class NgraphBridge { class NgraphBridge {
public: public:
static std::map<
std::string,
std::function<void(const std::shared_ptr<framework::OperatorBase>&,
std::shared_ptr<std::unordered_map<
std::string, std::shared_ptr<ngraph::Node>>>)>>
NG_NODE_MAP;
explicit NgraphBridge( explicit NgraphBridge(
std::shared_ptr< std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>> std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
...@@ -43,6 +36,8 @@ class NgraphBridge { ...@@ -43,6 +36,8 @@ class NgraphBridge {
void BuildNgNode(const std::shared_ptr<framework::OperatorBase>& op); void BuildNgNode(const std::shared_ptr<framework::OperatorBase>& op);
static bool isRegister(const std::string& str);
private: private:
std::shared_ptr< std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>> std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
......
...@@ -88,14 +88,12 @@ static std::vector<std::vector<int>> NgraphOpIntervals( ...@@ -88,14 +88,12 @@ static std::vector<std::vector<int>> NgraphOpIntervals(
int pivot = left; int pivot = left;
while (pivot < right) { while (pivot < right) {
auto op_type = ops.at(pivot)->Type(); auto op_type = ops.at(pivot)->Type();
if (NgraphBridge::NG_NODE_MAP.find(op_type) == if (NgraphBridge::isRegister(op_type)) {
NgraphBridge::NG_NODE_MAP.end()) {
++pivot; ++pivot;
} else { } else {
int start = pivot, end = start; int start = pivot, end = start;
while (pivot < right && while (pivot < right &&
(NgraphBridge::NG_NODE_MAP.find(ops.at(pivot)->Type()) != (!NgraphBridge::isRegister(ops.at(pivot)->Type()))) {
NgraphBridge::NG_NODE_MAP.end())) {
++pivot; ++pivot;
++end; ++end;
} }
......
# Generates ngraph_ops.h in the build tree: an umbrella header that includes
# every *.h file found in this directory.
file(GLOB LIST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.h")
set(pass_file "${PADDLE_BINARY_DIR}/paddle/fluid/operators/ngraph/ngraph_ops.h")
# file(WRITE) truncates the target, so it has to run first. Appending the
# "#pragma once" line before the WRITE (as was done previously) caused the
# include guard to be discarded from the generated header.
file(WRITE "${pass_file}" "// Generated by the /paddle/fluid/operators/ngraph/ops/CMakeLists.txt. DO NOT EDIT!\n\n")
file(APPEND "${pass_file}" "\#pragma once\n")
# One #include line per header discovered above.
foreach(OPS_NAME ${LIST_OPS})
  file(APPEND "${pass_file}" "\#include \"paddle/fluid/operators/ngraph/ops/${OPS_NAME}\"\n")
endforeach()
...@@ -17,6 +17,7 @@ limitations under the License. */ ...@@ -17,6 +17,7 @@ limitations under the License. */
#include <string> #include <string>
#include <vector> #include <vector>
#include "ngraph/ngraph.hpp" #include "ngraph/ngraph.hpp"
#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
#include "paddle/fluid/platform/ngraph_helper.h" #include "paddle/fluid/platform/ngraph_helper.h"
namespace paddle { namespace paddle {
...@@ -63,3 +64,5 @@ void BuildAccuracyNode( ...@@ -63,3 +64,5 @@ void BuildAccuracyNode(
} // namespace ngraphs } // namespace ngraphs
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
REGISTER_NG_OP(accuracy, BuildAccuracyNode);
...@@ -17,6 +17,7 @@ limitations under the License. */ ...@@ -17,6 +17,7 @@ limitations under the License. */
#include <string> #include <string>
#include "ngraph/ngraph.hpp" #include "ngraph/ngraph.hpp"
#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
#include "paddle/fluid/platform/ngraph_helper.h" #include "paddle/fluid/platform/ngraph_helper.h"
namespace paddle { namespace paddle {
...@@ -50,3 +51,6 @@ void BuildTanhGradNode( ...@@ -50,3 +51,6 @@ void BuildTanhGradNode(
} // namespace ngraphs } // namespace ngraphs
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
// Register the activation gradient converters. REGISTER_NG_OP stringifies its
// first argument and uses it as the registry key, so the spelling must be the
// exact operator type name ("tanh_grad", not "than_grad").
REGISTER_NG_OP(relu_grad, BuildReluGradNode);
REGISTER_NG_OP(tanh_grad, BuildTanhGradNode);
...@@ -20,6 +20,7 @@ limitations under the License. */ ...@@ -20,6 +20,7 @@ limitations under the License. */
#include "ngraph/ngraph.hpp" #include "ngraph/ngraph.hpp"
#include "paddle/fluid/operators/ngraph/ops/elementwise_node.h" #include "paddle/fluid/operators/ngraph/ops/elementwise_node.h"
#include "paddle/fluid/operators/ngraph/ops/elementwise_scalar_op.h" #include "paddle/fluid/operators/ngraph/ops/elementwise_scalar_op.h"
#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
#include "paddle/fluid/platform/ngraph_helper.h" #include "paddle/fluid/platform/ngraph_helper.h"
namespace paddle { namespace paddle {
...@@ -155,3 +156,6 @@ void BuildBatchNormGradNode( ...@@ -155,3 +156,6 @@ void BuildBatchNormGradNode(
} // namespace ngraphs } // namespace ngraphs
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
REGISTER_NG_OP(batch_norm, BuildBatchNormNode);
REGISTER_NG_OP(batch_norm_grad, BuildBatchNormGradNode);
...@@ -16,6 +16,7 @@ limitations under the License. */ ...@@ -16,6 +16,7 @@ limitations under the License. */
#include <string> #include <string>
#include "ngraph/ngraph.hpp" #include "ngraph/ngraph.hpp"
#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
#include "paddle/fluid/platform/ngraph_helper.h" #include "paddle/fluid/platform/ngraph_helper.h"
namespace paddle { namespace paddle {
...@@ -47,3 +48,7 @@ static void BuildUnaryNode( ...@@ -47,3 +48,7 @@ static void BuildUnaryNode(
} // namespace ngraphs } // namespace ngraphs
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
REGISTER_NG_OP(relu, BuildUnaryNode<ngraph::op::Relu>);
REGISTER_NG_OP(tanh, BuildUnaryNode<ngraph::op::Tanh>);
REGISTER_NG_OP(sigmoid, BuildUnaryNode<ngraph::op::Sigmoid>);
...@@ -17,6 +17,7 @@ limitations under the License. */ ...@@ -17,6 +17,7 @@ limitations under the License. */
#include <string> #include <string>
#include <vector> #include <vector>
#include "ngraph/ngraph.hpp" #include "ngraph/ngraph.hpp"
#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
#include "paddle/fluid/platform/ngraph_helper.h" #include "paddle/fluid/platform/ngraph_helper.h"
namespace paddle { namespace paddle {
...@@ -233,3 +234,6 @@ void BuildConv2dGradNode( ...@@ -233,3 +234,6 @@ void BuildConv2dGradNode(
} // namespace ngraphs } // namespace ngraphs
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
REGISTER_NG_OP(conv2d, BuildConv2dNode);
REGISTER_NG_OP(conv2d_grad, BuildConv2dGradNode);
...@@ -18,6 +18,7 @@ limitations under the License. */ ...@@ -18,6 +18,7 @@ limitations under the License. */
#include <string> #include <string>
#include "ngraph/ngraph.hpp" #include "ngraph/ngraph.hpp"
#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
#include "paddle/fluid/platform/ngraph_helper.h" #include "paddle/fluid/platform/ngraph_helper.h"
namespace paddle { namespace paddle {
...@@ -143,3 +144,6 @@ void BuildCrossEntropyGradNode( ...@@ -143,3 +144,6 @@ void BuildCrossEntropyGradNode(
} // namespace ngraphs } // namespace ngraphs
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
REGISTER_NG_OP(cross_entropy, BuildCrossEntropyNode);
REGISTER_NG_OP(cross_entropy_grad, BuildCrossEntropyGradNode);
...@@ -19,6 +19,7 @@ limitations under the License. */ ...@@ -19,6 +19,7 @@ limitations under the License. */
#include "ngraph/ngraph.hpp" #include "ngraph/ngraph.hpp"
#include "paddle/fluid/operators/ngraph/ops/elementwise_node.h" #include "paddle/fluid/operators/ngraph/ops/elementwise_node.h"
#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
#include "paddle/fluid/platform/ngraph_helper.h" #include "paddle/fluid/platform/ngraph_helper.h"
namespace paddle { namespace paddle {
...@@ -85,3 +86,6 @@ void BuildElementwiseAddGradNode( ...@@ -85,3 +86,6 @@ void BuildElementwiseAddGradNode(
} // namespace ngraphs } // namespace ngraphs
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
REGISTER_NG_OP(elementwise_add, BuildElementwiseAddNode);
REGISTER_NG_OP(elementwise_add_grad, BuildElementwiseAddGradNode);
...@@ -17,6 +17,7 @@ limitations under the License. */ ...@@ -17,6 +17,7 @@ limitations under the License. */
#include <string> #include <string>
#include <vector> #include <vector>
#include "ngraph/ngraph.hpp" #include "ngraph/ngraph.hpp"
#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
#include "paddle/fluid/platform/ngraph_helper.h" #include "paddle/fluid/platform/ngraph_helper.h"
namespace paddle { namespace paddle {
...@@ -55,3 +56,5 @@ void BuildFillConstantNode( ...@@ -55,3 +56,5 @@ void BuildFillConstantNode(
} // namespace ngraphs } // namespace ngraphs
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
REGISTER_NG_OP(fill_constant, BuildFillConstantNode);
...@@ -19,6 +19,7 @@ limitations under the License. */ ...@@ -19,6 +19,7 @@ limitations under the License. */
#include "ngraph/ngraph.hpp" #include "ngraph/ngraph.hpp"
#include "paddle/fluid/operators/ngraph/ops/elementwise_scalar_op.h" #include "paddle/fluid/operators/ngraph/ops/elementwise_scalar_op.h"
#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
#include "paddle/fluid/platform/ngraph_helper.h" #include "paddle/fluid/platform/ngraph_helper.h"
namespace paddle { namespace paddle {
...@@ -64,3 +65,6 @@ void BuildMeanGradNode( ...@@ -64,3 +65,6 @@ void BuildMeanGradNode(
} // namespace ngraphs } // namespace ngraphs
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
REGISTER_NG_OP(mean, BuildMeanNode);
REGISTER_NG_OP(mean_grad, BuildMeanGradNode);
/*Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include <vector>
#include "ngraph/ngraph.hpp"
#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
#include "paddle/fluid/platform/ngraph_helper.h"
namespace paddle {
namespace operators {
namespace ngraphs {
// Builds the nGraph sub-graph for the `momentum` op.
//
// Reads the "Param", "Grad", "Velocity" and "LearningRate" input nodes and the
// "mu" / "use_nesterov" attributes, then stores two output nodes:
//   VelocityOut = mu * Velocity + Grad
//   ParamOut    = Param - LearningRate * (Grad + mu * VelocityOut)  (nesterov)
//   ParamOut    = Param - LearningRate * VelocityOut                (otherwise)
//
// op:           operator whose inputs, outputs and attributes are read.
// ngb_node_map: map the input nodes are fetched from and the output nodes are
//               written to (via GetInputNode / SetOutputNode).
void BuildMomentumNode(
    const std::shared_ptr<paddle::framework::OperatorBase>& op,
    std::shared_ptr<
        std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
        ngb_node_map) {
  auto op_attrs = paddle::framework::AttrReader(op->Attrs());
  auto param = paddle::platform::GetInputNode(op, "Param", ngb_node_map);
  auto grad = paddle::platform::GetInputNode(op, "Grad", ngb_node_map);
  auto velocity = paddle::platform::GetInputNode(op, "Velocity", ngb_node_map);
  auto learning_rate =
      paddle::platform::GetInputNode(op, "LearningRate", ngb_node_map);

  auto mu = op_attrs.Get<float>("mu");
  bool use_nesterov = op_attrs.Get<bool>("use_nesterov");

  // f32 constant built from `mu` with the velocity's shape, so it can be
  // multiplied element-wise with velocity-shaped nodes.
  auto shape_velocity = ngraph::Shape{velocity->get_shape()};
  auto mu_create =
      ngraph::op::Constant::create(ngraph::element::f32, shape_velocity, {mu});

  auto vel_mul = std::make_shared<ngraph::op::Multiply>(velocity, mu_create);
  auto vel_out = std::make_shared<ngraph::op::Add>(vel_mul, grad);

  // Expands the learning-rate node to Param's shape. Both update rules need
  // the same sequence: reshape vel_out with FlattenTo2d(like), broadcast the
  // learning rate to that shape, flatten with FlattenTo1d, then reshape to
  // Param's shape.
  auto expand_learning_rate = [&](const ngraph::Shape& like) {
    auto shape_2d = paddle::platform::FlattenTo2d(like, 0);
    auto vel_reshape = paddle::platform::NgReshaper(vel_out, shape_2d);
    auto lr_bcast = std::make_shared<ngraph::op::Broadcast>(
        learning_rate, vel_reshape->get_shape(),
        ngraph::AxisSet{vel_reshape->get_shape().size() - 1});
    auto lr_1d = paddle::platform::FlattenTo1d(lr_bcast->get_shape(), 0);
    auto lr_flat = std::make_shared<ngraph::op::Reshape>(
        lr_bcast, ngraph::AxisVector{0, 1}, lr_1d);
    return std::make_shared<ngraph::op::Reshape>(
        lr_flat, ngraph::AxisVector{0}, param->get_shape());
  };

  if (use_nesterov) {
    // Nesterov step: Param - lr * (Grad + mu * VelocityOut).
    auto mul_res = std::make_shared<ngraph::op::Multiply>(vel_out, mu_create);
    auto add_res = std::make_shared<ngraph::op::Add>(grad, mul_res);
    auto lr_reshape = expand_learning_rate(add_res->get_shape());
    auto mul_res1 = std::make_shared<ngraph::op::Multiply>(add_res, lr_reshape);
    auto res = std::make_shared<ngraph::op::Subtract>(param, mul_res1);
    paddle::platform::SetOutputNode(op, "ParamOut", res, ngb_node_map);
  } else {
    // Plain momentum step: Param - lr * VelocityOut.
    auto lr_reshape = expand_learning_rate(vel_out->get_shape());
    auto mul_result =
        std::make_shared<ngraph::op::Multiply>(lr_reshape, vel_out);
    auto res = std::make_shared<ngraph::op::Subtract>(param, mul_result);
    paddle::platform::SetOutputNode(op, "ParamOut", res, ngb_node_map);
  }
  paddle::platform::SetOutputNode(op, "VelocityOut", vel_out, ngb_node_map);
}
} // namespace ngraphs
} // namespace operators
} // namespace paddle
REGISTER_NG_OP(momentum, BuildMomentumNode);
...@@ -16,6 +16,7 @@ limitations under the License. */ ...@@ -16,6 +16,7 @@ limitations under the License. */
#include <string> #include <string>
#include "ngraph/ngraph.hpp" #include "ngraph/ngraph.hpp"
#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
#include "paddle/fluid/platform/ngraph_helper.h" #include "paddle/fluid/platform/ngraph_helper.h"
namespace paddle { namespace paddle {
...@@ -130,3 +131,6 @@ static void BuildMulGradNode( ...@@ -130,3 +131,6 @@ static void BuildMulGradNode(
} // namespace ngraphs } // namespace ngraphs
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
REGISTER_NG_OP(mul, BuildMulNode);
REGISTER_NG_OP(mul_grad, BuildMulGradNode);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include <map>
#include <string>
#include <unordered_map>
#include "ngraph/node.hpp"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/operators/ngraph/ngraph_bridge.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace operators {
namespace ops {
// Process-wide registry that maps an operator type name to the function that
// builds the corresponding nGraph node(s). All members are static; the class
// cannot be instantiated or copied.
class NgraphSingleton {
  NgraphSingleton() = default;
  NgraphSingleton(NgraphSingleton const&) = delete;
  void operator=(NgraphSingleton const&) = delete;
  ~NgraphSingleton() = default;

  // op type name -> converter(op, node map).
  static std::map<
      std::string,
      std::function<void(const std::shared_ptr<framework::OperatorBase>&,
                         std::shared_ptr<std::unordered_map<
                             std::string, std::shared_ptr<ngraph::Node>>>)>>
      ng_node_maps_;

 public:
  // Stores `tf` as the converter for op type `name`, replacing any converter
  // previously registered under the same name.
  template <typename TF>
  static void Register(TF&& tf, const std::string& name) {
    ng_node_maps_[name] = tf;
  }

  // NOTE: the return value is inverted relative to what the name suggests.
  // Returns true when NO converter is registered for `name` and false when one
  // is. Kept as-is because existing callers rely on this sense.
  static bool Lookup(const std::string& name) {
    return ng_node_maps_.find(name) == ng_node_maps_.end();
  }

  // Runs the converter registered for `name` on `op`, passing it `ng_maps`.
  // Fails a PADDLE_ENFORCE check when `name` has no converter. The lookup uses
  // find() rather than operator[] so that an unknown name neither inserts an
  // empty entry into the registry nor invokes an empty std::function.
  static void BuildNode(
      const std::shared_ptr<std::unordered_map<
          std::string, std::shared_ptr<ngraph::Node>>>& ng_maps,
      const std::shared_ptr<framework::OperatorBase>& op,
      const std::string& name) {
    auto it = ng_node_maps_.find(name);
    PADDLE_ENFORCE(it != ng_node_maps_.end(),
                   "No nGraph converter is registered for op type '%s'.", name);
    it->second(op, ng_maps);
  }
};

// Storage for the registry.
// NOTE(review): this is a non-inline definition inside a header; including the
// header from more than one translation unit would define it more than once.
// Confirm it is only ever pulled into a single .cc file.
std::map<std::string,
         std::function<void(const std::shared_ptr<framework::OperatorBase>&,
                            std::shared_ptr<std::unordered_map<
                                std::string, std::shared_ptr<ngraph::Node>>>)>>
    NgraphSingleton::ng_node_maps_;
} // namespace ops
} // namespace operators
} // namespace paddle
// REGISTER_NG_OP(op_type__, Converter__)
//
// Registers paddle::operators::ngraphs::Converter__ with NgraphSingleton under
// the key #op_type__ (the first argument, stringified). It does so by defining
// a struct whose constructor performs the registration and a namespace-scope
// object of that struct, so the registration runs when that object is
// constructed. Use at global scope, once per op type: the generated struct and
// variable names are derived from op_type__.
#define REGISTER_NG_OP(op_type__, Converter__) \
struct ng_##op_type__##_converter { \
ng_##op_type__##_converter() { \
paddle::operators::ops::NgraphSingleton::Register( \
paddle::operators::ngraphs::Converter__, #op_type__); \
} \
}; \
ng_##op_type__##_converter ng_##op_type__##_converter__;
...@@ -18,6 +18,7 @@ limitations under the License. */ ...@@ -18,6 +18,7 @@ limitations under the License. */
#include <vector> #include <vector>
#include "ngraph/ngraph.hpp" #include "ngraph/ngraph.hpp"
#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
#include "paddle/fluid/platform/ngraph_helper.h" #include "paddle/fluid/platform/ngraph_helper.h"
namespace paddle { namespace paddle {
...@@ -172,3 +173,6 @@ void BuildPool2dGradNode( ...@@ -172,3 +173,6 @@ void BuildPool2dGradNode(
} // namespace ngraphs } // namespace ngraphs
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
REGISTER_NG_OP(pool2d, BuildPool2dNode);
REGISTER_NG_OP(pool2d_grad, BuildPool2dGradNode);
...@@ -17,6 +17,7 @@ limitations under the License. */ ...@@ -17,6 +17,7 @@ limitations under the License. */
#include <string> #include <string>
#include "ngraph/ngraph.hpp" #include "ngraph/ngraph.hpp"
#include "paddle/fluid/operators/ngraph/ops/elementwise_scalar_op.h" #include "paddle/fluid/operators/ngraph/ops/elementwise_scalar_op.h"
#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
#include "paddle/fluid/platform/ngraph_helper.h" #include "paddle/fluid/platform/ngraph_helper.h"
namespace paddle { namespace paddle {
...@@ -37,3 +38,5 @@ void BuildScaleNode( ...@@ -37,3 +38,5 @@ void BuildScaleNode(
} // namespace ngraphs } // namespace ngraphs
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
REGISTER_NG_OP(scale, BuildScaleNode);
...@@ -18,6 +18,7 @@ limitations under the License. */ ...@@ -18,6 +18,7 @@ limitations under the License. */
#include <vector> #include <vector>
#include "ngraph/ngraph.hpp" #include "ngraph/ngraph.hpp"
#include "paddle/fluid/operators/ngraph/ops/elementwise_scalar_op.h" #include "paddle/fluid/operators/ngraph/ops/elementwise_scalar_op.h"
#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
#include "paddle/fluid/platform/ngraph_helper.h" #include "paddle/fluid/platform/ngraph_helper.h"
namespace paddle { namespace paddle {
...@@ -72,3 +73,6 @@ void BuildSoftmaxGradNode( ...@@ -72,3 +73,6 @@ void BuildSoftmaxGradNode(
} // namespace ngraphs } // namespace ngraphs
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
REGISTER_NG_OP(softmax, BuildSoftmaxNode);
REGISTER_NG_OP(softmax_grad, BuildSoftmaxGradNode);
...@@ -16,6 +16,7 @@ limitations under the License. */ ...@@ -16,6 +16,7 @@ limitations under the License. */
#include <string> #include <string>
#include "ngraph/ngraph.hpp" #include "ngraph/ngraph.hpp"
#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
#include "paddle/fluid/platform/ngraph_helper.h" #include "paddle/fluid/platform/ngraph_helper.h"
namespace paddle { namespace paddle {
...@@ -42,3 +43,5 @@ void BuildTopKNode( ...@@ -42,3 +43,5 @@ void BuildTopKNode(
} // namespace ngraphs } // namespace ngraphs
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
REGISTER_NG_OP(top_k, BuildTopKNode);
...@@ -85,9 +85,7 @@ class ReadOp : public framework::OperatorBase { ...@@ -85,9 +85,7 @@ class ReadOp : public framework::OperatorBase {
std::vector<framework::LoDTensor> ins; std::vector<framework::LoDTensor> ins;
// For profiling // For profiling
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); platform::RecordEvent record_event(Type());
auto& ctx = *pool.Get(dev_place);
platform::RecordEvent record_event(Type(), &ctx);
reader->ReadNext(&ins); reader->ReadNext(&ins);
if (ins.empty()) { if (ins.empty()) {
......
...@@ -31,10 +31,10 @@ class SequenceEnumerateOp : public framework::OperatorWithKernel { ...@@ -31,10 +31,10 @@ class SequenceEnumerateOp : public framework::OperatorWithKernel {
const auto x_dims = ctx->GetInputDim("X"); const auto x_dims = ctx->GetInputDim("X");
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
x_dims.size(), 2UL, x_dims.size(), 2,
"Input(X) of SequenceEnumerate operator's rank should be 2."); "Input(X) of SequenceEnumerate operator's rank should be 2.");
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
x_dims[1], 1UL, x_dims[1], 1,
"Input(X) of SequenceEnumerate operator's 2nd dimension should be 1."); "Input(X) of SequenceEnumerate operator's 2nd dimension should be 1.");
const auto win_size = ctx->Attrs().Get<int>("win_size"); const auto win_size = ctx->Attrs().Get<int>("win_size");
......
...@@ -48,10 +48,10 @@ class SequenceExpandOp : public framework::OperatorWithKernel { ...@@ -48,10 +48,10 @@ class SequenceExpandOp : public framework::OperatorWithKernel {
auto& x_lod = x_var->Get<LoDTensor>().lod(); auto& x_lod = x_var->Get<LoDTensor>().lod();
auto& y_lod = y_var->Get<LoDTensor>().lod(); auto& y_lod = y_var->Get<LoDTensor>().lod();
PADDLE_ENFORCE_LE(x_lod.size(), 1, PADDLE_ENFORCE_LE(x_lod.size(), 1UL,
"Level number of Input(X)'s lod should not be " "Level number of Input(X)'s lod should not be "
"greater than 1."); "greater than 1.");
PADDLE_ENFORCE_GT(y_lod.size(), 0, PADDLE_ENFORCE_GT(y_lod.size(), 0UL,
"Level number of Input(Y)'s lod should be " "Level number of Input(Y)'s lod should be "
"greater than 0."); "greater than 0.");
PADDLE_ENFORCE( PADDLE_ENFORCE(
...@@ -69,7 +69,8 @@ class SequenceExpandOp : public framework::OperatorWithKernel { ...@@ -69,7 +69,8 @@ class SequenceExpandOp : public framework::OperatorWithKernel {
"size of Input(X)'s first level lod should be equal to " "size of Input(X)'s first level lod should be equal to "
"size of Input(Y)'s referred level lod."); "size of Input(Y)'s referred level lod.");
} else { } else {
PADDLE_ENFORCE_EQ(x_dims[0], y_lod[ref_level].size() - 1, PADDLE_ENFORCE_EQ(x_dims[0],
static_cast<int64_t>(y_lod[ref_level].size()) - 1,
"When Input(X)'s lod is null, the dims[0] of " "When Input(X)'s lod is null, the dims[0] of "
"Input(X) should match the " "Input(X) should match the "
"size of Input(Y)'s referred level lod."); "size of Input(Y)'s referred level lod.");
......
...@@ -35,14 +35,15 @@ class ShapeOp : public framework::OperatorWithKernel { ...@@ -35,14 +35,15 @@ class ShapeOp : public framework::OperatorWithKernel {
class ShapeOpMaker : public framework::OpProtoAndCheckerMaker { class ShapeOpMaker : public framework::OpProtoAndCheckerMaker {
public: public:
void Make() override { void Make() override {
AddInput("Input", "(Tensor), The input tensor."); AddInput("Input", "(LoDTensor), The input tensor.");
AddOutput("Out", AddOutput(
"(Tensor), The shape of input tensor, the data type of the shape" "Out",
"(LoDTensor), The shape of input tensor, the data type of the shape"
" is int32_t, will be on the same device with the input Tensor."); " is int32_t, will be on the same device with the input Tensor.");
AddComment(R"DOC( AddComment(R"DOC(
Shape Operator Shape Operator.
Get the shape of input tensor. Only support CPU input Tensor now. Return the shape of the input.
)DOC"); )DOC");
} }
}; };
......
...@@ -36,7 +36,7 @@ cc_test(cpu_info_test SRCS cpu_info_test.cc DEPS cpu_info) ...@@ -36,7 +36,7 @@ cc_test(cpu_info_test SRCS cpu_info_test.cc DEPS cpu_info)
nv_library(gpu_info SRCS gpu_info.cc DEPS gflags glog enforce) nv_library(gpu_info SRCS gpu_info.cc DEPS gflags glog enforce)
cc_library(place SRCS place.cc DEPS enforce boost lib_any) cc_library(place SRCS place.cc DEPS enforce boost)
cc_test(place_test SRCS place_test.cc DEPS place glog gflags) cc_test(place_test SRCS place_test.cc DEPS place glog gflags)
add_subdirectory(dynload) add_subdirectory(dynload)
...@@ -87,8 +87,12 @@ nv_test(transform_test SRCS transform_test.cu DEPS memory place device_context) ...@@ -87,8 +87,12 @@ nv_test(transform_test SRCS transform_test.cu DEPS memory place device_context)
cc_library(timer SRCS timer.cc) cc_library(timer SRCS timer.cc)
cc_test(timer_test SRCS timer_test.cc DEPS timer) cc_test(timer_test SRCS timer_test.cc DEPS timer)
cc_library(device_tracer SRCS device_tracer.cc DEPS boost profiler_proto framework_proto ${GPU_CTX_DEPS}) cc_library(device_tracer SRCS device_tracer.cc DEPS boost profiler_proto framework_proto device_context ${GPU_CTX_DEPS})
cc_library(profiler SRCS profiler.cc DEPS device_context device_tracer) if(WITH_GPU)
nv_library(profiler SRCS profiler.cc profiler.cu DEPS device_context device_tracer)
else()
cc_library(profiler SRCS profiler.cc DEPS device_context device_tracer)
endif()
cc_test(profiler_test SRCS profiler_test.cc DEPS profiler) cc_test(profiler_test SRCS profiler_test.cc DEPS profiler)
nv_test(float16_gpu_test SRCS float16_test.cu DEPS lod_tensor) nv_test(float16_gpu_test SRCS float16_test.cu DEPS lod_tensor)
......
...@@ -291,7 +291,7 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place) ...@@ -291,7 +291,7 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place)
if (dynload::HasCUDNN()) { if (dynload::HasCUDNN()) {
auto local_cudnn_version = cudnn_dso_ver / 100; auto local_cudnn_version = cudnn_dso_ver / 100;
auto compile_cudnn_version = CUDNN_VERSION / 100; auto compile_cudnn_version = CUDNN_VERSION / 100;
if (local_cudnn_version < compile_cudnn_version) { if (local_cudnn_version < static_cast<size_t>(compile_cudnn_version)) {
LOG_FIRST_N(WARNING, 1) LOG_FIRST_N(WARNING, 1)
<< "WARNING: device: " << place_.device << "WARNING: device: " << place_.device
<< ". The installed Paddle is compiled with CUDNN " << ". The installed Paddle is compiled with CUDNN "
......
...@@ -14,17 +14,23 @@ limitations under the License. */ ...@@ -14,17 +14,23 @@ limitations under the License. */
#include "paddle/fluid/platform/device_tracer.h" #include "paddle/fluid/platform/device_tracer.h"
#include <deque> #include <deque>
#include <forward_list>
#include <fstream> #include <fstream>
#include <list>
#include <map> #include <map>
#include <mutex> // NOLINT #include <mutex> // NOLINT
#include <numeric> #include <numeric>
#include <sstream>
#include <string> #include <string>
#include <thread> // NOLINT #include <thread> // NOLINT
#include <unordered_map>
#include <utility>
#include <vector> #include <vector>
#include "glog/logging.h" #include "glog/logging.h"
#include "google/protobuf/text_format.h" #include "google/protobuf/text_format.h"
#include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/string/printf.h" #include "paddle/fluid/string/printf.h"
namespace paddle { namespace paddle {
...@@ -33,17 +39,31 @@ namespace { ...@@ -33,17 +39,31 @@ namespace {
// Tracking the nested block stacks of each thread. // Tracking the nested block stacks of each thread.
thread_local std::deque<int> block_id_stack; thread_local std::deque<int> block_id_stack;
// Tracking the nested event stacks. // Tracking the nested event stacks.
thread_local std::deque<std::string> annotation_stack; thread_local std::deque<Event *> annotation_stack;
std::map<uint32_t, int32_t> system_thread_id_map;
std::once_flag tracer_once_flag; std::once_flag tracer_once_flag;
DeviceTracer *tracer = nullptr; DeviceTracer *tracer = nullptr;
// Logs a one-time warning suggesting a larger CUPTI buffer. Only the first
// call logs; later calls do nothing.
void PrintCuptiHint() {
  static bool hint_printed = false;
  if (!hint_printed) {
    hint_printed = true;
    LOG(WARNING) << "Invalid timestamp occured. Please try increasing the "
                    "FLAGS_multiple_of_cupti_buffer_size.";
  }
}
} // namespace } // namespace
#ifdef PADDLE_WITH_CUPTI #ifdef PADDLE_WITH_CUPTI
namespace { namespace {
// TODO(panyx0718): Revisit the buffer size here. // The experimental best performance is
uint64_t kBufSize = 32 * 1024; // the same size with CUPTI device buffer size(8M)
uint64_t kBufSize = 1024 * 1024 * 8;
uint64_t kAlignSize = 8; uint64_t kAlignSize = 8;
std::unordered_map<CUpti_CallbackId, std::string> runtime_cbid_str,
driver_cbid_str;
#define ALIGN_BUFFER(buffer, align) \ #define ALIGN_BUFFER(buffer, align) \
(((uintptr_t)(buffer) & ((align)-1)) \ (((uintptr_t)(buffer) & ((align)-1)) \
...@@ -92,15 +112,33 @@ std::string MemcpyKind(CUpti_ActivityMemcpyKind kind) { ...@@ -92,15 +112,33 @@ std::string MemcpyKind(CUpti_ActivityMemcpyKind kind) {
return "MEMCPY"; return "MEMCPY";
} }
// Returns the string stored in driver_cbid_str for `cbid`, or
// "Driver API <cbid>" when the table has no entry for it.
std::string DriverKind(CUpti_CallbackId cbid) {
  const auto found = driver_cbid_str.find(cbid);
  if (found != driver_cbid_str.end()) {
    return found->second;
  }
  return "Driver API " + std::to_string(cbid);
}
// Returns the string stored in runtime_cbid_str for `cbid`, or
// "Runtime API <cbid>" when the table has no entry for it.
std::string RuntimeKind(CUpti_CallbackId cbid) {
  const auto found = runtime_cbid_str.find(cbid);
  if (found != runtime_cbid_str.end()) {
    return found->second;
  }
  return "Runtime API " + std::to_string(cbid);
}
void EnableActivity() { void EnableActivity() {
// Device activity record is created when CUDA initializes, so we // Device activity record is created when CUDA initializes, so we
// want to enable it before cuInit() or any CUDA runtime call. // want to enable it before cuInit() or any CUDA runtime call.
CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MEMCPY)); CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MEMCPY));
CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_KERNEL)); CUPTI_CALL(
CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_DEVICE)); dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL));
CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MEMSET)); // CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_KERNEL));
CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_OVERHEAD)); CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_DRIVER));
CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_RUNTIME));
// We don't track these activities for now. // We don't track these activities for now.
// CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MEMSET));
// CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_OVERHEAD));
// CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_DEVICE));
// CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_CONTEXT)); // CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_CONTEXT));
// CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_DRIVER)); // CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_DRIVER));
// CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_RUNTIME)); // CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_RUNTIME));
...@@ -110,16 +148,17 @@ void EnableActivity() { ...@@ -110,16 +148,17 @@ void EnableActivity() {
void DisableActivity() { void DisableActivity() {
CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_MEMCPY)); CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_MEMCPY));
CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_KERNEL)); CUPTI_CALL(
CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_DEVICE)); dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL));
// CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_DEVICE));
// Disable all other activity record kinds. // Disable all other activity record kinds.
CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_CONTEXT)); // CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_CONTEXT));
CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_DRIVER)); CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_DRIVER));
CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_RUNTIME)); CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_RUNTIME));
CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_MEMSET)); // CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_MEMSET));
CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_NAME)); // CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_NAME));
CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_MARKER)); // CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_MARKER));
CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_OVERHEAD)); // CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_OVERHEAD));
} }
void CUPTIAPI bufferRequested(uint8_t **buffer, size_t *size, void CUPTIAPI bufferRequested(uint8_t **buffer, size_t *size,
...@@ -132,6 +171,11 @@ void CUPTIAPI bufferRequested(uint8_t **buffer, size_t *size, ...@@ -132,6 +171,11 @@ void CUPTIAPI bufferRequested(uint8_t **buffer, size_t *size,
void CUPTIAPI bufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t *buffer, void CUPTIAPI bufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t *buffer,
size_t size, size_t validSize) { size_t size, size_t validSize) {
static std::thread::id cupti_thread_id(0);
if (cupti_thread_id == std::thread::id(0))
cupti_thread_id = std::this_thread::get_id();
PADDLE_ENFORCE_EQ(std::this_thread::get_id(), cupti_thread_id,
"Only one thread is allowed to call bufferCompleted()");
CUptiResult status; CUptiResult status;
CUpti_Activity *record = NULL; CUpti_Activity *record = NULL;
if (validSize > 0) { if (validSize > 0) {
...@@ -168,6 +212,23 @@ void CUPTIAPI bufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t *buffer, ...@@ -168,6 +212,23 @@ void CUPTIAPI bufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t *buffer,
memcpy->correlationId, memcpy->bytes); memcpy->correlationId, memcpy->bytes);
break; break;
} }
case CUPTI_ACTIVITY_KIND_DRIVER: {
auto *api = reinterpret_cast<const CUpti_ActivityAPI *>(record);
if (api->start != 0 && api->end != 0)
// -1 device id represents CUDA api call
tracer->AddCPURecords(
DriverKind(api->cbid), api->start, api->end, -1,
GetThreadIdFromSystemThreadId(api->threadId));
break;
}
case CUPTI_ACTIVITY_KIND_RUNTIME: {
auto *api = reinterpret_cast<const CUpti_ActivityAPI *>(record);
if (api->start != 0 && api->end != 0)
tracer->AddCPURecords(
RuntimeKind(api->cbid), api->start, api->end, -1,
GetThreadIdFromSystemThreadId(api->threadId));
break;
}
default: { break; } default: { break; }
} }
} else if (status == CUPTI_ERROR_MAX_LIMIT_REACHED) { } else if (status == CUPTI_ERROR_MAX_LIMIT_REACHED) {
...@@ -183,21 +244,35 @@ void CUPTIAPI bufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t *buffer, ...@@ -183,21 +244,35 @@ void CUPTIAPI bufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t *buffer,
dynload::cuptiActivityGetNumDroppedRecords(ctx, streamId, &dropped)); dynload::cuptiActivityGetNumDroppedRecords(ctx, streamId, &dropped));
if (dropped != 0) { if (dropped != 0) {
fprintf(stderr, "Dropped %u activity records\n", (unsigned int)dropped); fprintf(stderr, "Dropped %u activity records\n", (unsigned int)dropped);
PrintCuptiHint();
} }
} }
free(buffer); free(buffer);
} }
void initCuptiCbidStr();
} // namespace } // namespace
#endif // PADDLE_WITH_CUPTI #endif // PADDLE_WITH_CUPTI
class DeviceTracerImpl : public DeviceTracer { class DeviceTracerImpl : public DeviceTracer {
public: public:
DeviceTracerImpl() : enabled_(false) {} DeviceTracerImpl() : enabled_(false) {
#ifdef PADDLE_WITH_CUPTI
initCuptiCbidStr();
#endif
}
void AddAnnotation(uint64_t id, const std::string &anno) { void AddAnnotation(uint32_t id, Event *event) {
thread_local std::forward_list<std::pair<uint32_t, Event *>>
*local_correlations_pairs = nullptr;
if (local_correlations_pairs == nullptr) {
std::lock_guard<std::mutex> l(trace_mu_); std::lock_guard<std::mutex> l(trace_mu_);
correlations_[id] = anno; correlations_pairs.emplace_front();
local_correlations_pairs = &correlations_pairs.front();
}
local_correlations_pairs->push_front(std::make_pair(id, event));
} }
void AddCPURecords(const std::string &anno, uint64_t start_ns, void AddCPURecords(const std::string &anno, uint64_t start_ns,
...@@ -206,8 +281,13 @@ class DeviceTracerImpl : public DeviceTracer { ...@@ -206,8 +281,13 @@ class DeviceTracerImpl : public DeviceTracer {
VLOG(1) << "Empty timeline annotation."; VLOG(1) << "Empty timeline annotation.";
return; return;
} }
thread_local std::forward_list<CPURecord> *local_cpu_records_ = nullptr;
if (local_cpu_records_ == nullptr) {
std::lock_guard<std::mutex> l(trace_mu_); std::lock_guard<std::mutex> l(trace_mu_);
cpu_records_.push_back( cpu_records_.emplace_front();
local_cpu_records_ = &cpu_records_.front();
}
local_cpu_records_->push_front(
CPURecord{anno, start_ns, end_ns, device_id, thread_id}); CPURecord{anno, start_ns, end_ns, device_id, thread_id});
} }
...@@ -215,12 +295,13 @@ class DeviceTracerImpl : public DeviceTracer { ...@@ -215,12 +295,13 @@ class DeviceTracerImpl : public DeviceTracer {
uint64_t end_ns, int64_t device_id, int64_t stream_id, uint64_t end_ns, int64_t device_id, int64_t stream_id,
uint32_t correlation_id, uint64_t bytes) { uint32_t correlation_id, uint64_t bytes) {
// 0 means timestamp information could not be collected for the kernel. // 0 means timestamp information could not be collected for the kernel.
if (start_ns == 0 || end_ns == 0) { if (start_ns == 0 || end_ns == 0 || start_ns == end_ns) {
VLOG(3) << name << " cannot be traced"; VLOG(3) << name << " cannot be traced";
PrintCuptiHint();
return; return;
} }
std::lock_guard<std::mutex> l(trace_mu_); // NOTE(liangdun): lock is not needed, only one thread call this function.
mem_records_.push_back(MemRecord{name, start_ns, end_ns, device_id, mem_records_.push_front(MemRecord{name, start_ns, end_ns, device_id,
stream_id, correlation_id, bytes}); stream_id, correlation_id, bytes});
} }
...@@ -228,12 +309,13 @@ class DeviceTracerImpl : public DeviceTracer { ...@@ -228,12 +309,13 @@ class DeviceTracerImpl : public DeviceTracer {
int64_t device_id, int64_t stream_id, int64_t device_id, int64_t stream_id,
uint32_t correlation_id) { uint32_t correlation_id) {
// 0 means timestamp information could not be collected for the kernel. // 0 means timestamp information could not be collected for the kernel.
if (start == 0 || end == 0) { if (start == 0 || end == 0 || start == end) {
VLOG(3) << correlation_id << " cannot be traced"; VLOG(3) << correlation_id << " cannot be traced";
PrintCuptiHint();
return; return;
} }
std::lock_guard<std::mutex> l(trace_mu_); // NOTE(liangdun): lock is not needed, only one thread call this function.
kernel_records_.push_back( kernel_records_.push_front(
KernelRecord{name, start, end, device_id, stream_id, correlation_id}); KernelRecord{name, start, end, device_id, stream_id, correlation_id});
} }
...@@ -263,25 +345,80 @@ class DeviceTracerImpl : public DeviceTracer { ...@@ -263,25 +345,80 @@ class DeviceTracerImpl : public DeviceTracer {
} else if (ret != CUPTI_SUCCESS) { } else if (ret != CUPTI_SUCCESS) {
fprintf(stderr, "Failed to create CUPTI subscriber.\n"); fprintf(stderr, "Failed to create CUPTI subscriber.\n");
} }
CUPTI_CALL( const std::vector<int> cbids {
dynload::cuptiEnableCallback(1, subscriber_, CUPTI_CB_DOMAIN_DRIVER_API, CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy_v3020,
CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel)); CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyAsync_v3020,
CUPTI_RUNTIME_TRACE_CBID_cudaLaunch_v3020,
CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernel_v7000
#if CUDA_VERSION >= 9000
,
CUPTI_RUNTIME_TRACE_CBID_cudaLaunchCooperativeKernel_v9000,
CUPTI_RUNTIME_TRACE_CBID_cudaLaunchCooperativeKernelMultiDevice_v9000
#endif
};
for (auto cbid : cbids)
CUPTI_CALL(dynload::cuptiEnableCallback(
1, subscriber_, CUPTI_CB_DOMAIN_RUNTIME_API, cbid));
CUPTI_CALL(dynload::cuptiGetTimestamp(&start_ns_)); CUPTI_CALL(dynload::cuptiGetTimestamp(&start_ns_));
#endif // PADDLE_WITH_CUPTI #endif // PADDLE_WITH_CUPTI
enabled_ = true; enabled_ = true;
} }
// Drops every record collected so far so the tracer can be used again.
// With CUPTI compiled in, outstanding activity buffers are force-flushed
// first, before trace_mu_ is acquired.
void Reset() {
#ifdef PADDLE_WITH_CUPTI
  CUPTI_CALL(
      dynload::cuptiActivityFlushAll(CUPTI_ACTIVITY_FLAG_FLUSH_FORCED));
#endif
  std::lock_guard<std::mutex> guard(trace_mu_);
  kernel_records_.clear();
  mem_records_.clear();
  correlations_.clear();
  // Only the inner lists are emptied; the outer lists keep their nodes.
  // AddAnnotation / AddCPURecords cache thread_local pointers to those nodes.
  for (auto &per_thread_pairs : correlations_pairs) {
    per_thread_pairs.clear();
  }
  for (auto &per_thread_records : cpu_records_) {
    per_thread_records.clear();
  }
}
// Adds the [start_ns, end_ns] span of every traced kernel and memcpy record
// to the Event correlated with it. Records whose correlation id has no
// (non-null) Event are skipped. Does nothing when built without CUPTI.
void GenEventKernelCudaElapsedTime() {
#ifdef PADDLE_WITH_CUPTI
  // Build the correlation-id -> Event table from the per-thread pair lists,
  // but only if it has not been populated yet.
  if (correlations_.empty()) {
    for (auto &thread_pairs : correlations_pairs) {
      for (auto &id_and_event : thread_pairs) {
        correlations_[id_and_event.first] = id_and_event.second;
      }
    }
  }

  // Returns the Event registered for a correlation id, or nullptr.
  auto lookup_event = [this](uint32_t correlation_id) -> Event * {
    auto found = correlations_.find(correlation_id);
    return found == correlations_.end() ? nullptr : found->second;
  };

  for (const KernelRecord &rec : kernel_records_) {
    if (Event *event = lookup_event(rec.correlation_id)) {
      event->AddCudaElapsedTime(rec.start_ns, rec.end_ns);
    }
  }
  for (const auto &rec : mem_records_) {
    if (Event *event = lookup_event(rec.correlation_id)) {
      event->AddCudaElapsedTime(rec.start_ns, rec.end_ns);
    }
  }
#endif
}
proto::Profile GenProfile(const std::string &profile_path) { proto::Profile GenProfile(const std::string &profile_path) {
int miss = 0, find = 0;
std::lock_guard<std::mutex> l(trace_mu_); std::lock_guard<std::mutex> l(trace_mu_);
proto::Profile profile_pb; proto::Profile profile_pb;
profile_pb.set_start_ns(start_ns_); profile_pb.set_start_ns(start_ns_);
profile_pb.set_end_ns(end_ns_); profile_pb.set_end_ns(end_ns_);
if (correlations_.empty())
for (auto &tmp : correlations_pairs)
for (auto &pair : tmp) correlations_[pair.first] = pair.second;
for (const KernelRecord &r : kernel_records_) { for (const KernelRecord &r : kernel_records_) {
auto *event = profile_pb.add_events(); auto *event = profile_pb.add_events();
event->set_type(proto::Event::GPUKernel); event->set_type(proto::Event::GPUKernel);
if (correlations_.find(r.correlation_id) != correlations_.end()) { auto c = correlations_.find(r.correlation_id);
event->set_name(correlations_.at(r.correlation_id)); if (c != correlations_.end() && c->second != nullptr) {
event->set_name(c->second->name());
event->set_detail_info(r.name);
find++;
} else { } else {
VLOG(10) << "Missing Kernel Event: " + r.name;
miss++;
event->set_name(r.name); event->set_name(r.name);
} }
event->set_start_ns(r.start_ns); event->set_start_ns(r.start_ns);
...@@ -289,8 +426,9 @@ class DeviceTracerImpl : public DeviceTracer { ...@@ -289,8 +426,9 @@ class DeviceTracerImpl : public DeviceTracer {
event->set_sub_device_id(r.stream_id); event->set_sub_device_id(r.stream_id);
event->set_device_id(r.device_id); event->set_device_id(r.device_id);
} }
VLOG(1) << "KernelRecord event miss: " << miss << " find: " << find;
for (const CPURecord &r : cpu_records_) { for (auto &tmp : cpu_records_)
for (const CPURecord &r : tmp) {
auto *event = profile_pb.add_events(); auto *event = profile_pb.add_events();
event->set_type(proto::Event::CPU); event->set_type(proto::Event::CPU);
event->set_name(r.name); event->set_name(r.name);
...@@ -299,21 +437,30 @@ class DeviceTracerImpl : public DeviceTracer { ...@@ -299,21 +437,30 @@ class DeviceTracerImpl : public DeviceTracer {
event->set_sub_device_id(r.thread_id); event->set_sub_device_id(r.thread_id);
event->set_device_id(r.device_id); event->set_device_id(r.device_id);
} }
miss = find = 0;
for (const MemRecord &r : mem_records_) { for (const MemRecord &r : mem_records_) {
auto *event = profile_pb.add_events(); auto *event = profile_pb.add_events();
event->set_type(proto::Event::GPUKernel); event->set_type(proto::Event::GPUKernel);
auto c = correlations_.find(r.correlation_id);
if (c != correlations_.end() && c->second != nullptr) {
event->set_name(c->second->name());
event->set_detail_info(r.name);
find++;
} else {
miss++;
event->set_name(r.name); event->set_name(r.name);
}
event->set_start_ns(r.start_ns); event->set_start_ns(r.start_ns);
event->set_end_ns(r.end_ns); event->set_end_ns(r.end_ns);
event->set_sub_device_id(r.stream_id); event->set_sub_device_id(r.stream_id);
event->set_device_id(r.device_id); event->set_device_id(r.device_id);
event->mutable_memcopy()->set_bytes(r.bytes); event->mutable_memcopy()->set_bytes(r.bytes);
} }
VLOG(1) << "MemRecord event miss: " << miss << " find: " << find;
std::ofstream profile_f; std::ofstream profile_f;
profile_f.open(profile_path, std::ios::out | std::ios::trunc); profile_f.open(profile_path,
std::string profile_str; std::ios::out | std::ios::trunc | std::ios::binary);
profile_pb.SerializeToString(&profile_str); profile_pb.SerializeToOstream(&profile_f);
profile_f << profile_str;
profile_f.close(); profile_f.close();
return profile_pb; return profile_pb;
} }
...@@ -321,12 +468,13 @@ class DeviceTracerImpl : public DeviceTracer { ...@@ -321,12 +468,13 @@ class DeviceTracerImpl : public DeviceTracer {
void Disable() { void Disable() {
#ifdef PADDLE_WITH_CUPTI #ifdef PADDLE_WITH_CUPTI
// flush might cause additional calls to DeviceTracker. // flush might cause additional calls to DeviceTracker.
dynload::cuptiActivityFlushAll(CUPTI_ACTIVITY_FLAG_FLUSH_FORCED); CUPTI_CALL(
dynload::cuptiActivityFlushAll(CUPTI_ACTIVITY_FLAG_FLUSH_FORCED));
#endif // PADDLE_WITH_CUPTI #endif // PADDLE_WITH_CUPTI
std::lock_guard<std::mutex> l(trace_mu_); std::lock_guard<std::mutex> l(trace_mu_);
#ifdef PADDLE_WITH_CUPTI #ifdef PADDLE_WITH_CUPTI
DisableActivity(); DisableActivity();
dynload::cuptiUnsubscribe(subscriber_); CUPTI_CALL(dynload::cuptiUnsubscribe(subscriber_));
CUPTI_CALL(dynload::cuptiGetTimestamp(&end_ns_)); CUPTI_CALL(dynload::cuptiGetTimestamp(&end_ns_));
#endif // PADDLE_WITH_CUPTI #endif // PADDLE_WITH_CUPTI
enabled_ = false; enabled_ = false;
...@@ -337,18 +485,10 @@ class DeviceTracerImpl : public DeviceTracer { ...@@ -337,18 +485,10 @@ class DeviceTracerImpl : public DeviceTracer {
static void CUPTIAPI ApiCallback(void *userdata, CUpti_CallbackDomain domain, static void CUPTIAPI ApiCallback(void *userdata, CUpti_CallbackDomain domain,
CUpti_CallbackId cbid, const void *cbdata) { CUpti_CallbackId cbid, const void *cbdata) {
auto *cbInfo = reinterpret_cast<const CUpti_CallbackData *>(cbdata); auto *cbInfo = reinterpret_cast<const CUpti_CallbackData *>(cbdata);
DeviceTracer *tracer = reinterpret_cast<DeviceTracer *>(userdata); DeviceTracerImpl *tracer = reinterpret_cast<DeviceTracerImpl *>(userdata);
if ((domain == CUPTI_CB_DOMAIN_DRIVER_API) &&
(cbid == CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel)) {
if (cbInfo->callbackSite == CUPTI_API_ENTER) { if (cbInfo->callbackSite == CUPTI_API_ENTER) {
const std::string anno = !annotation_stack.empty() Event *event = CurAnnotation();
? annotation_stack.back() tracer->AddAnnotation(cbInfo->correlationId, event);
: cbInfo->symbolName;
tracer->AddAnnotation(cbInfo->correlationId, anno);
}
} else {
VLOG(1) << "Unhandled API Callback for " << domain << " " << cbid;
} }
} }
CUpti_SubscriberHandle subscriber_; CUpti_SubscriberHandle subscriber_;
...@@ -357,10 +497,12 @@ class DeviceTracerImpl : public DeviceTracer { ...@@ -357,10 +497,12 @@ class DeviceTracerImpl : public DeviceTracer {
bool enabled_; bool enabled_;
uint64_t start_ns_; uint64_t start_ns_;
uint64_t end_ns_; uint64_t end_ns_;
std::vector<KernelRecord> kernel_records_; std::forward_list<KernelRecord> kernel_records_;
std::vector<MemRecord> mem_records_; std::forward_list<MemRecord> mem_records_;
std::vector<CPURecord> cpu_records_; std::forward_list<std::forward_list<CPURecord>> cpu_records_;
std::unordered_map<uint32_t, std::string> correlations_; std::forward_list<std::forward_list<std::pair<uint32_t, Event *>>>
correlations_pairs;
std::unordered_map<uint32_t, Event *> correlations_;
}; };
void CreateTracer(DeviceTracer **t) { *t = new DeviceTracerImpl(); } void CreateTracer(DeviceTracer **t) { *t = new DeviceTracerImpl(); }
...@@ -370,21 +512,106 @@ DeviceTracer *GetDeviceTracer() { ...@@ -370,21 +512,106 @@ DeviceTracer *GetDeviceTracer() {
return tracer; return tracer;
} }
void SetCurAnnotation(const std::string &anno) { void SetCurAnnotation(Event *event) { annotation_stack.push_back(event); }
annotation_stack.push_back(anno);
}
void ClearCurAnnotation() { annotation_stack.pop_back(); } void ClearCurAnnotation() { annotation_stack.pop_back(); }
std::string CurAnnotation() { Event *CurAnnotation() {
if (annotation_stack.empty()) return ""; if (annotation_stack.empty()) return nullptr;
return annotation_stack.back(); return annotation_stack.back();
} }
// Name of the Event on top of annotation_stack, or an empty string when the
// stack holds nothing.
std::string CurAnnotationName() {
  return annotation_stack.empty() ? std::string("")
                                  : annotation_stack.back()->name();
}
void SetCurBlock(int block_id) { block_id_stack.push_back(block_id); } void SetCurBlock(int block_id) { block_id_stack.push_back(block_id); }
void ClearCurBlock() { block_id_stack.pop_back(); } void ClearCurBlock() { block_id_stack.pop_back(); }
int BlockDepth() { return block_id_stack.size(); } int BlockDepth() { return block_id_stack.size(); }
// Returns the calling thread's std::thread::id as a 32-bit integer.
//
// The id is obtained by streaming std::this_thread::get_id() to text and
// parsing that text; the value is then truncated to 32 bits.
uint32_t GetCurSystemThreadId() {
  std::stringstream ss;
  ss << std::this_thread::get_id();
  // The textual form of std::thread::id is not specified by the standard.
  // Base 0 makes stoull accept plain decimal as before and also a
  // "0x"-prefixed form, which base 10 would silently parse as just "0".
  return static_cast<uint32_t>(std::stoull(ss.str(), nullptr, 0));
}
// Associates the calling thread's system thread id with the given id in
// system_thread_id_map, replacing any earlier entry for that thread.
void RecoreCurThreadId(int32_t id) {
  const auto system_tid = GetCurSystemThreadId();
  VLOG(1) << "RecoreCurThreadId: " << system_tid << " -> " << id;
  system_thread_id_map[system_tid] = id;
}
// Translates a system thread id into the id stored for it in
// system_thread_id_map.
int32_t GetThreadIdFromSystemThreadId(uint32_t id) {
  const auto entry = system_thread_id_map.find(id);
  if (entry == system_thread_id_map.end()) {
    // No event was recorded in this thread: hand the original id back.
    return static_cast<int32_t>(id);
  }
  return entry->second;
}
#ifdef PADDLE_WITH_CUPTI
namespace {
// Registers, in runtime_cbid_str, the printable name of each CUPTI runtime
// callback id listed below. Only the first call does any work; later calls
// return immediately.
void initCuptiCbidStr() {
  static bool called = false;
  if (called) return;
  called = true;
// Maps CUPTI_RUNTIME_TRACE_CBID_<cbid> to the literal string "<cbid>".
#define REGISTER_RUNTIME_CBID_STR(cbid) \
  runtime_cbid_str[CUPTI_RUNTIME_TRACE_CBID_##cbid] = #cbid

  REGISTER_RUNTIME_CBID_STR(cudaBindTexture_v3020);
  REGISTER_RUNTIME_CBID_STR(cudaConfigureCall_v3020);
  REGISTER_RUNTIME_CBID_STR(cudaDeviceGetAttribute_v5000);
  REGISTER_RUNTIME_CBID_STR(cudaDeviceGetStreamPriorityRange_v5050);
  REGISTER_RUNTIME_CBID_STR(cudaDeviceSynchronize_v3020);
  REGISTER_RUNTIME_CBID_STR(cudaDriverGetVersion_v3020);
  REGISTER_RUNTIME_CBID_STR(cudaEventCreateWithFlags_v3020);
  REGISTER_RUNTIME_CBID_STR(cudaEventDestroy_v3020);
  REGISTER_RUNTIME_CBID_STR(cudaEventQuery_v3020);
  REGISTER_RUNTIME_CBID_STR(cudaEventRecord_v3020);
  REGISTER_RUNTIME_CBID_STR(cudaFreeHost_v3020);
  REGISTER_RUNTIME_CBID_STR(cudaFree_v3020);
  REGISTER_RUNTIME_CBID_STR(cudaFuncGetAttributes_v3020);
  REGISTER_RUNTIME_CBID_STR(cudaGetDeviceCount_v3020);
  REGISTER_RUNTIME_CBID_STR(cudaGetDeviceProperties_v3020);
  REGISTER_RUNTIME_CBID_STR(cudaGetDevice_v3020);
  REGISTER_RUNTIME_CBID_STR(cudaGetErrorString_v3020);
  REGISTER_RUNTIME_CBID_STR(cudaGetLastError_v3020);
  REGISTER_RUNTIME_CBID_STR(cudaHostAlloc_v3020);
  REGISTER_RUNTIME_CBID_STR(cudaHostGetDevicePointer_v3020);
  REGISTER_RUNTIME_CBID_STR(cudaLaunchKernel_v7000);
  REGISTER_RUNTIME_CBID_STR(cudaMallocHost_v3020);
  REGISTER_RUNTIME_CBID_STR(cudaMalloc_v3020);
  REGISTER_RUNTIME_CBID_STR(cudaMemcpyAsync_v3020);
  REGISTER_RUNTIME_CBID_STR(cudaMemcpy_v3020);
  REGISTER_RUNTIME_CBID_STR(cudaMemsetAsync_v3020);
  REGISTER_RUNTIME_CBID_STR(cudaMemset_v3020);
  REGISTER_RUNTIME_CBID_STR(
      cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags_v7000);
  REGISTER_RUNTIME_CBID_STR(cudaPeekAtLastError_v3020);
  REGISTER_RUNTIME_CBID_STR(cudaRuntimeGetVersion_v3020);
  REGISTER_RUNTIME_CBID_STR(cudaSetDevice_v3020);
  REGISTER_RUNTIME_CBID_STR(cudaStreamCreate_v3020);
  REGISTER_RUNTIME_CBID_STR(cudaStreamCreateWithFlags_v5000);
  REGISTER_RUNTIME_CBID_STR(cudaStreamCreateWithPriority_v5050);
  REGISTER_RUNTIME_CBID_STR(cudaStreamDestroy_v5050);
  REGISTER_RUNTIME_CBID_STR(cudaStreamSynchronize_v3020);
  REGISTER_RUNTIME_CBID_STR(cudaStreamWaitEvent_v3020);
  REGISTER_RUNTIME_CBID_STR(cudaUnbindTexture_v3020);
  REGISTER_RUNTIME_CBID_STR(cudaSetupArgument_v3020);
  REGISTER_RUNTIME_CBID_STR(cudaLaunch_v3020);
  // The cooperative-launch ids are only registered for CUDA 9.0 and newer.
#if CUDA_VERSION >= 9000
  REGISTER_RUNTIME_CBID_STR(cudaLaunchCooperativeKernel_v9000);
  REGISTER_RUNTIME_CBID_STR(cudaLaunchCooperativeKernelMultiDevice_v9000);
#endif

#undef REGISTER_RUNTIME_CBID_STR
}
} // namespace
#endif // PADDLE_WITH_CUPTI
} // namespace platform } // namespace platform
} // namespace paddle } // namespace paddle
...@@ -32,6 +32,8 @@ inline uint64_t PosixInNsec() { ...@@ -32,6 +32,8 @@ inline uint64_t PosixInNsec() {
return 1000 * (static_cast<uint64_t>(tv.tv_sec) * 1000000 + tv.tv_usec); return 1000 * (static_cast<uint64_t>(tv.tv_sec) * 1000000 + tv.tv_usec);
} }
class Event;
// DeviceTracer performs the following tasks: // DeviceTracer performs the following tasks:
// 1. Register cuda callbacks for various events: kernel, memcpy, etc. // 1. Register cuda callbacks for various events: kernel, memcpy, etc.
// 2. Collect cuda statistics: start/end ts, memory, etc. // 2. Collect cuda statistics: start/end ts, memory, etc.
...@@ -68,11 +70,13 @@ class DeviceTracer { ...@@ -68,11 +70,13 @@ class DeviceTracer {
virtual void Enable() = 0; virtual void Enable() = 0;
// Needs to be called once after use. // Needs to be called once after use.
virtual void Disable() = 0; virtual void Disable() = 0;
// Needs to be called once before reuse.
virtual void Reset() = 0;
// Add a pair to correlate internal cuda id with high level // Add a pair to correlate internal cuda id with high level
// annotation (string). So cuda statistics can be represented by // annotation event(with string). So cuda statistics can be represented by
// human-readable annotations. // human-readable annotations.
virtual void AddAnnotation(uint64_t id, const std::string& anno) = 0; virtual void AddAnnotation(uint32_t id, Event* event) = 0;
virtual void AddMemRecords(const std::string& name, uint64_t start_ns, virtual void AddMemRecords(const std::string& name, uint64_t start_ns,
uint64_t end_ns, int64_t device_id, uint64_t end_ns, int64_t device_id,
...@@ -92,6 +96,9 @@ class DeviceTracer { ...@@ -92,6 +96,9 @@ class DeviceTracer {
// Generate a proto after done (Disabled). // Generate a proto after done (Disabled).
virtual proto::Profile GenProfile(const std::string& profile_path) = 0; virtual proto::Profile GenProfile(const std::string& profile_path) = 0;
// Add the GPU elapsed time of each traced kernel/memcpy record to the Event
// correlated with it.
virtual void GenEventKernelCudaElapsedTime() = 0;
virtual bool IsEnabled() = 0; virtual bool IsEnabled() = 0;
}; };
...@@ -99,14 +106,19 @@ class DeviceTracer { ...@@ -99,14 +106,19 @@ class DeviceTracer {
DeviceTracer* GetDeviceTracer(); DeviceTracer* GetDeviceTracer();
// Set a name for the cuda kernel operation being launched by the thread. // Set a name for the cuda kernel operation being launched by the thread.
void SetCurAnnotation(const std::string& anno); void SetCurAnnotation(Event* event);
// Clear the name after the operation is done. // Clear the name after the operation is done.
void ClearCurAnnotation(); void ClearCurAnnotation();
// Current name of the operation being run in the thread. // Current name of the operation being run in the thread.
std::string CurAnnotation(); std::string CurAnnotationName();
Event* CurAnnotation();
void SetCurBlock(int block_id); void SetCurBlock(int block_id);
void ClearCurBlock(); void ClearCurBlock();
int BlockDepth(); int BlockDepth();
// Set current thread id, so we can map the system thread id to thread id.
void RecoreCurThreadId(int32_t id);
int32_t GetThreadIdFromSystemThreadId(uint32_t id);
} // namespace platform } // namespace platform
} // namespace paddle } // namespace paddle
...@@ -86,6 +86,8 @@ extern void* mklml_dso_handle; ...@@ -86,6 +86,8 @@ extern void* mklml_dso_handle;
__macro(vdPowx); \ __macro(vdPowx); \
__macro(vsInv); \ __macro(vsInv); \
__macro(vdInv); \ __macro(vdInv); \
__macro(vmsErf); \
__macro(vmdErf); \
__macro(MKL_Set_Num_Threads) __macro(MKL_Set_Num_Threads)
MKLML_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_MKLML_WRAP); MKLML_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_MKLML_WRAP);
......
...@@ -31,6 +31,8 @@ limitations under the License. */ ...@@ -31,6 +31,8 @@ limitations under the License. */
#include <sstream> #include <sstream>
#include <stdexcept> #include <stdexcept>
#include <string> #include <string>
#include <type_traits>
#include <utility>
#include "glog/logging.h" #include "glog/logging.h"
#include "paddle/fluid/platform/macros.h" #include "paddle/fluid/platform/macros.h"
...@@ -280,16 +282,62 @@ inline void throw_on_error(ncclResult_t stat, const std::string& msg) { ...@@ -280,16 +282,62 @@ inline void throw_on_error(ncclResult_t stat, const std::string& msg) {
} \ } \
} while (0) } while (0)
#define __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, __CMP, __INV_CMP, ...) \ namespace details {
// Compile-time predicate: true when T is an arithmetic type.
template <typename T>
inline constexpr bool IsArithmetic() {
  return std::is_arithmetic<T>::value;
}

// Primary template, selected when both operands are arithmetic: both sides
// are converted to std::common_type<T1, T2>.
template <typename T1, typename T2, bool kIsArithmetic /* = true */>
struct TypeConverterImpl {
  using Type1 = typename std::common_type<T1, T2>::type;
  using Type2 = Type1;
};

// Selected when at least one operand is not arithmetic: each side keeps its
// own type, and std::common_type is never instantiated.
template <typename T1, typename T2>
struct TypeConverterImpl<T1, T2, false> {
  using Type1 = T1;
  using Type2 = T2;
};

// Chooses the comparison types for a (T1, T2) operand pair by dispatching to
// the matching TypeConverterImpl.
template <typename T1, typename T2>
struct TypeConverter {
 private:
  using Impl =
      TypeConverterImpl<T1, T2, (IsArithmetic<T1>() && IsArithmetic<T2>())>;

 public:
  using Type1 = typename Impl::Type1;
  using Type2 = typename Impl::Type2;
};

// "const Type1 &" for the left-hand operand of the pair.
template <typename T1, typename T2>
using CommonType1 = typename std::add_lvalue_reference<
    const typename TypeConverter<T1, T2>::Type1>::type;

// "const Type2 &" for the right-hand operand of the pair.
template <typename T1, typename T2>
using CommonType2 = typename std::add_lvalue_reference<
    const typename TypeConverter<T1, T2>::Type2>::type;
} // namespace details
#define __PADDLE_BINARY_COMPARE(__VAL1, __VAL2, __CMP, __INV_CMP, ...) \
do { \ do { \
auto __cond1__ = (__VAL0); \ auto __val1 = (__VAL1); \
auto __cond2__ = (__VAL1); \ auto __val2 = (__VAL2); \
if (UNLIKELY(!((__cond1__)__CMP(__cond2__)))) { \ using __TYPE1__ = decltype(__val1); \
using __TYPE2__ = decltype(__val2); \
using __COMMON_TYPE1__ = \
::paddle::platform::details::CommonType1<__TYPE1__, __TYPE2__>; \
using __COMMON_TYPE2__ = \
::paddle::platform::details::CommonType2<__TYPE1__, __TYPE2__>; \
bool __is_not_error = (static_cast<__COMMON_TYPE1__>(__val1))__CMP( \
static_cast<__COMMON_TYPE2__>(__val2)); \
if (UNLIKELY(!__is_not_error)) { \
PADDLE_THROW("Enforce failed. Expected %s " #__CMP \ PADDLE_THROW("Enforce failed. Expected %s " #__CMP \
" %s, but received %s:%s " #__INV_CMP " %s:%s.\n%s", \ " %s, but received %s:%s " #__INV_CMP " %s:%s.\n%s", \
#__VAL0, #__VAL1, #__VAL0, \ #__VAL1, #__VAL2, #__VAL1, \
::paddle::string::to_string(__cond1__), #__VAL1, \ ::paddle::string::to_string(__val1), #__VAL2, \
::paddle::string::to_string(__cond2__), \ ::paddle::string::to_string(__val2), \
::paddle::string::Sprintf(__VA_ARGS__)); \ ::paddle::string::Sprintf(__VA_ARGS__)); \
} \ } \
} while (0) } while (0)
......
...@@ -118,59 +118,58 @@ TEST(ENFORCE_GT, OK) { PADDLE_ENFORCE_GT(2, 1); } ...@@ -118,59 +118,58 @@ TEST(ENFORCE_GT, OK) { PADDLE_ENFORCE_GT(2, 1); }
TEST(ENFORCE_GT, FAIL) { TEST(ENFORCE_GT, FAIL) {
bool caught_exception = false; bool caught_exception = false;
try { try {
PADDLE_ENFORCE_GT(1, 2UL); PADDLE_ENFORCE_GT(1, 2);
} catch (paddle::platform::EnforceNotMet error) { } catch (paddle::platform::EnforceNotMet error) {
caught_exception = true; caught_exception = true;
EXPECT_TRUE(HasPrefix( EXPECT_TRUE(
StringPiece(error.what()), HasPrefix(StringPiece(error.what()),
"Enforce failed. Expected 1 > 2UL, but received 1:1 <= 2UL:2.")); "Enforce failed. Expected 1 > 2, but received 1:1 <= 2:2."));
} }
EXPECT_TRUE(caught_exception); EXPECT_TRUE(caught_exception);
} }
TEST(ENFORCE_GE, OK) { TEST(ENFORCE_GE, OK) {
PADDLE_ENFORCE_GE(2, 2UL); PADDLE_ENFORCE_GE(2, 2);
PADDLE_ENFORCE_GE(3, 2UL);
PADDLE_ENFORCE_GE(3, 2); PADDLE_ENFORCE_GE(3, 2);
PADDLE_ENFORCE_GE(3.21, 2UL); PADDLE_ENFORCE_GE(3.21, 2.0);
} }
TEST(ENFORCE_GE, FAIL) { TEST(ENFORCE_GE, FAIL) {
bool caught_exception = false; bool caught_exception = false;
try { try {
PADDLE_ENFORCE_GE(1, 2UL); PADDLE_ENFORCE_GE(1, 2);
} catch (paddle::platform::EnforceNotMet error) { } catch (paddle::platform::EnforceNotMet error) {
caught_exception = true; caught_exception = true;
EXPECT_TRUE(HasPrefix( EXPECT_TRUE(
StringPiece(error.what()), HasPrefix(StringPiece(error.what()),
"Enforce failed. Expected 1 >= 2UL, but received 1:1 < 2UL:2.")); "Enforce failed. Expected 1 >= 2, but received 1:1 < 2:2."));
} }
EXPECT_TRUE(caught_exception); EXPECT_TRUE(caught_exception);
} }
TEST(ENFORCE_LE, OK) { TEST(ENFORCE_LE, OK) {
PADDLE_ENFORCE_LE(1, 1); PADDLE_ENFORCE_LE(1, 1);
PADDLE_ENFORCE_LE(1, 1UL); PADDLE_ENFORCE_LE(1UL, 1UL);
PADDLE_ENFORCE_LE(2, 3UL); PADDLE_ENFORCE_LE(2, 3);
PADDLE_ENFORCE_LE(2UL, 3); PADDLE_ENFORCE_LE(2UL, 3UL);
PADDLE_ENFORCE_LE(2UL, 3.2); PADDLE_ENFORCE_LE(2.0, 3.2);
} }
TEST(ENFORCE_LE, FAIL) { TEST(ENFORCE_LE, FAIL) {
bool caught_exception = false; bool caught_exception = false;
try { try {
PADDLE_ENFORCE_GT(1, 2UL); PADDLE_ENFORCE_GT(1, 2);
} catch (paddle::platform::EnforceNotMet error) { } catch (paddle::platform::EnforceNotMet error) {
caught_exception = true; caught_exception = true;
EXPECT_TRUE(HasPrefix( EXPECT_TRUE(
StringPiece(error.what()), HasPrefix(StringPiece(error.what()),
"Enforce failed. Expected 1 > 2UL, but received 1:1 <= 2UL:2.")); "Enforce failed. Expected 1 > 2, but received 1:1 <= 2:2."));
} }
EXPECT_TRUE(caught_exception); EXPECT_TRUE(caught_exception);
} }
TEST(ENFORCE_LT, OK) { TEST(ENFORCE_LT, OK) {
PADDLE_ENFORCE_LT(3, 10); PADDLE_ENFORCE_LT(3, 10);
PADDLE_ENFORCE_LT(2, 3UL); PADDLE_ENFORCE_LT(2UL, 3UL);
PADDLE_ENFORCE_LT(2UL, 3); PADDLE_ENFORCE_LT(2, 3);
} }
TEST(ENFORCE_LT, FAIL) { TEST(ENFORCE_LT, FAIL) {
bool caught_exception = false; bool caught_exception = false;
...@@ -235,7 +234,13 @@ TEST(ENFORCE_USER_DEFINED_CLASS, EQ) { ...@@ -235,7 +234,13 @@ TEST(ENFORCE_USER_DEFINED_CLASS, EQ) {
TEST(ENFORCE_USER_DEFINED_CLASS, NE) { TEST(ENFORCE_USER_DEFINED_CLASS, NE) {
Dims a{{1, 2, 3, 4}}, b{{5, 6, 7, 8}}; Dims a{{1, 2, 3, 4}}, b{{5, 6, 7, 8}};
ASSERT_THROW(PADDLE_ENFORCE_EQ(a, b), paddle::platform::EnforceNotMet); bool caught_exception = false;
try {
PADDLE_ENFORCE_EQ(a, b);
} catch (paddle::platform::EnforceNotMet&) {
caught_exception = true;
}
EXPECT_TRUE(caught_exception);
} }
TEST(EOF_EXCEPTION, THROW_EOF) { TEST(EOF_EXCEPTION, THROW_EOF) {
......
...@@ -22,6 +22,7 @@ limitations under the License. */ ...@@ -22,6 +22,7 @@ limitations under the License. */
#include "paddle/fluid/string/split.h" #include "paddle/fluid/string/split.h"
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/cuda_device_guard.h" #include "paddle/fluid/platform/cuda_device_guard.h"
#include "paddle/fluid/platform/dynload/cupti.h"
#endif #endif
#include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/init.h" #include "paddle/fluid/platform/init.h"
...@@ -30,6 +31,9 @@ limitations under the License. */ ...@@ -30,6 +31,9 @@ limitations under the License. */
DEFINE_int32(paddle_num_threads, 1, DEFINE_int32(paddle_num_threads, 1,
"Number of threads for each paddle instance."); "Number of threads for each paddle instance.");
DEFINE_int32(multiple_of_cupti_buffer_size, 1,
"Multiple of the CUPTI device buffer size. If the timestamps have "
"been dropped when you are profiling, try increasing this value.");
namespace paddle { namespace paddle {
namespace framework { namespace framework {
...@@ -78,7 +82,32 @@ void InitP2P(std::vector<int> devices) { ...@@ -78,7 +82,32 @@ void InitP2P(std::vector<int> devices) {
#endif #endif
} }
// Scales CUPTI's device-side activity buffer attributes by
// FLAGS_multiple_of_cupti_buffer_size. Each attribute is read, multiplied and
// written back. Does nothing when the multiplier is 1 or when built without
// CUPTI.
//
// Raises (via PADDLE_ENFORCE*) when the multiplier is not positive or when a
// CUPTI get/set attribute call returns a non-zero status.
void InitCupti() {
#ifdef PADDLE_WITH_CUPTI
  // A multiplier of 1 leaves CUPTI's current values untouched.
  if (FLAGS_multiple_of_cupti_buffer_size == 1) return;
  // Zero would request empty buffers, and a negative value would wrap around
  // when multiplied into the size_t below, so reject both up front.
  PADDLE_ENFORCE_GT(
      FLAGS_multiple_of_cupti_buffer_size, 0,
      "FLAGS_multiple_of_cupti_buffer_size must be a positive integer.");
  size_t attrValue = 0, attrValueSize = sizeof(size_t);
// Reads attribute `attr`, multiplies it by the flag, logs the new value and
// writes it back.
#define MULTIPLY_ATTR_VALUE(attr)                                 \
  {                                                               \
    PADDLE_ENFORCE(!platform::dynload::cuptiActivityGetAttribute( \
        attr, &attrValueSize, &attrValue));                       \
    attrValue *= FLAGS_multiple_of_cupti_buffer_size;             \
    LOG(WARNING) << "Set " #attr " " << attrValue << " byte";     \
    PADDLE_ENFORCE(!platform::dynload::cuptiActivitySetAttribute( \
        attr, &attrValueSize, &attrValue));                       \
  }
  MULTIPLY_ATTR_VALUE(CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_SIZE);
  MULTIPLY_ATTR_VALUE(CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_SIZE_CDP);
  // The semaphore-pool attribute is only scaled for CUDA 9.0 and newer.
#if CUDA_VERSION >= 9000
  MULTIPLY_ATTR_VALUE(CUPTI_ACTIVITY_ATTR_PROFILING_SEMAPHORE_POOL_SIZE);
#endif
#undef MULTIPLY_ATTR_VALUE
#endif
}
void InitDevices(bool init_p2p) { void InitDevices(bool init_p2p) {
// CUPTI attribute should be set before any CUDA context is created (see CUPTI
// documentation about CUpti_ActivityAttribute).
InitCupti();
/*Init all available devices by default */ /*Init all available devices by default */
std::vector<int> devices; std::vector<int> devices;
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
......
...@@ -43,6 +43,13 @@ std::shared_ptr<ngraph::Node> Nchw2Nhwc(std::shared_ptr<ngraph::Node> in) { ...@@ -43,6 +43,13 @@ std::shared_ptr<ngraph::Node> Nchw2Nhwc(std::shared_ptr<ngraph::Node> in) {
return std::make_shared<ngraph::op::Reshape>(in, axis_vec, in_shape); return std::make_shared<ngraph::op::Reshape>(in, axis_vec, in_shape);
} }
// Collapses a shape into a one-dimensional shape whose single extent is the
// product of the dimensions in [begin(sh), end(sh) + num).
//
// NOTE(review): `num` is added to the end iterator, so it presumably has to
// be <= 0 and no smaller than -rank; nothing here checks it. Confirm against
// the callers.
ngraph::Shape FlattenTo1d(ngraph::Shape sh, int num) {
  // Seed with size_t{1}: std::accumulate takes its accumulator type from the
  // init argument, and an int seed would narrow the running product to int.
  const size_t flat_extent =
      std::accumulate(std::begin(sh), std::end(sh) + num, size_t{1},
                      std::multiplies<size_t>());
  return ngraph::Shape{flat_extent};
}
ngraph::Shape FlattenTo2d(ngraph::Shape sh, int num) { ngraph::Shape FlattenTo2d(ngraph::Shape sh, int num) {
auto x1 = std::accumulate(std::begin(sh), std::begin(sh) + num, 1, auto x1 = std::accumulate(std::begin(sh), std::begin(sh) + num, 1,
std::multiplies<size_t>()); std::multiplies<size_t>());
......
...@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/platform/profiler.h"
#include <algorithm> #include <algorithm>
#include <iomanip> #include <iomanip>
#include <limits> #include <limits>
...@@ -27,7 +29,6 @@ limitations under the License. */ ...@@ -27,7 +29,6 @@ limitations under the License. */
#include "paddle/fluid/framework/block_desc.h" #include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/platform/device_tracer.h" #include "paddle/fluid/platform/device_tracer.h"
#include "paddle/fluid/platform/port.h" #include "paddle/fluid/platform/port.h"
#include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/string/printf.h" #include "paddle/fluid/string/printf.h"
DEFINE_bool(enable_rpc_profiler, false, "Enable rpc profiler or not."); DEFINE_bool(enable_rpc_profiler, false, "Enable rpc profiler or not.");
...@@ -66,12 +67,13 @@ struct EventList { ...@@ -66,12 +67,13 @@ struct EventList {
((kEventSize + kEventAlign - 1) / kEventAlign * kEventAlign); ((kEventSize + kEventAlign - 1) / kEventAlign * kEventAlign);
template <typename... Args> template <typename... Args>
void Record(Args&&... args) { Event* Record(Args&&... args) {
if (event_blocks.empty() || event_blocks.front().size() == kNumBlock) { if (event_blocks.empty() || event_blocks.front().size() == kNumBlock) {
event_blocks.emplace_front(); event_blocks.emplace_front();
event_blocks.front().reserve(kNumBlock); event_blocks.front().reserve(kNumBlock);
} }
event_blocks.front().emplace_back(std::forward<Args>(args)...); event_blocks.front().emplace_back(std::forward<Args>(args)...);
return &event_blocks.front().back();
} }
std::vector<Event> Reduce() { std::vector<Event> Reduce() {
...@@ -98,21 +100,8 @@ inline uint64_t GetTimeInNsec() { ...@@ -98,21 +100,8 @@ inline uint64_t GetTimeInNsec() {
.count(); .count();
} }
Event::Event(EventType type, std::string name, uint32_t thread_id, Event::Event(EventType type, std::string name, uint32_t thread_id)
const DeviceContext* dev_ctx) : type_(type), name_(name), thread_id_(thread_id) {
: type_(type), name_(name), thread_id_(thread_id), has_cuda_(false) {
#ifdef PADDLE_WITH_CUDA
has_cuda_ = dev_ctx ? platform::is_gpu_place(dev_ctx->GetPlace()) : false;
if (has_cuda_) {
auto* cuda_dev_ctx = static_cast<const CUDADeviceContext*>(dev_ctx);
PADDLE_ENFORCE(cudaSetDevice(
boost::get<platform::CUDAPlace>(cuda_dev_ctx->GetPlace()).device));
PADDLE_ENFORCE(cudaGetDevice(&device_));
PADDLE_ENFORCE(cudaEventCreate(&event_));
auto stream = cuda_dev_ctx->stream();
PADDLE_ENFORCE(cudaEventRecord(event_, stream));
}
#endif
cpu_ns_ = GetTimeInNsec(); cpu_ns_ = GetTimeInNsec();
} }
...@@ -123,89 +112,70 @@ double Event::CpuElapsedMs(const Event& e) const { ...@@ -123,89 +112,70 @@ double Event::CpuElapsedMs(const Event& e) const {
} }
double Event::CudaElapsedMs(const Event& e) const { double Event::CudaElapsedMs(const Event& e) const {
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUPTI
if (!has_cuda_) return 0.0; return gpu_ns_ / 1000000.0;
PADDLE_ENFORCE(e.has_cuda() && has_cuda());
PADDLE_ENFORCE(e.device() == device());
PADDLE_ENFORCE(cudaEventSynchronize(event_));
PADDLE_ENFORCE(cudaEventSynchronize(e.event()));
float ms;
PADDLE_ENFORCE(cudaEventElapsedTime(&ms, event_, e.event()));
return ms;
#else #else
PADDLE_THROW("CUDA is not enabled"); LOG_FIRST_N(WARNING, 1) << "CUDA CUPTI is not enabled";
return 0;
#endif #endif
} }
#ifdef PADDLE_WITH_CUDA
static void ForEachDevice(std::function<void(int)> func) {
auto original_device = GetCurrentDeviceId();
int count = GetCUDADeviceCount();
for (int i = 0; i < count; i++) {
SetDeviceId(i);
func(i);
}
SetDeviceId(original_device);
}
#endif
inline EventList& GetEventList() { inline EventList& GetEventList() {
if (!g_event_list) { if (!g_event_list) {
std::lock_guard<std::mutex> guard(g_all_event_lists_mutex); std::lock_guard<std::mutex> guard(g_all_event_lists_mutex);
g_event_list = std::make_shared<EventList>(); g_event_list = std::make_shared<EventList>();
g_thread_id = g_next_thread_id++; g_thread_id = g_next_thread_id++;
g_all_event_lists.emplace_front(g_event_list); g_all_event_lists.emplace_front(g_event_list);
RecoreCurThreadId(g_thread_id);
} }
return *g_event_list; return *g_event_list;
} }
void Mark(const std::string& name, const DeviceContext* dev_ctx) { void Mark(const std::string& name) {
GetEventList().Record(EventType::kMark, name, g_thread_id, dev_ctx); GetEventList().Record(EventType::kMark, name, g_thread_id);
} }
void PushEvent(const std::string& name, const DeviceContext* dev_ctx) { Event* PushEvent(const std::string& name) {
GetEventList().Record(EventType::kPushRange, name, g_thread_id, dev_ctx); return GetEventList().Record(EventType::kPushRange, name, g_thread_id);
} }
void PopEvent(const std::string& name, const DeviceContext* dev_ctx) { void PopEvent(const std::string& name) {
GetEventList().Record(EventType::kPopRange, name, g_thread_id, dev_ctx); GetEventList().Record(EventType::kPopRange, name, g_thread_id);
} }
RecordEvent::RecordEvent(const std::string& name, const DeviceContext* dev_ctx) RecordEvent::RecordEvent(const std::string& name)
: is_enabled_(false), start_ns_(PosixInNsec()) { : is_enabled_(false), start_ns_(PosixInNsec()) {
if (g_state == ProfilerState::kDisabled) return; if (g_state == ProfilerState::kDisabled) return;
std::lock_guard<std::mutex> l(profiler_mu); // lock is not needed, the code below is thread-safe
is_enabled_ = true; is_enabled_ = true;
dev_ctx_ = dev_ctx;
name_ = name; name_ = name;
PushEvent(name_, dev_ctx_); Event* e = PushEvent(name_);
// Maybe need the same push/pop behavior. // Maybe need the same push/pop behavior.
SetCurAnnotation(name_); SetCurAnnotation(e);
} }
RecordEvent::~RecordEvent() { RecordEvent::~RecordEvent() {
if (g_state == ProfilerState::kDisabled || !is_enabled_) return; if (g_state == ProfilerState::kDisabled || !is_enabled_) return;
std::lock_guard<std::mutex> l(profiler_mu); // lock is not needed, the code below is thread-safe
DeviceTracer* tracer = GetDeviceTracer(); DeviceTracer* tracer = GetDeviceTracer();
if (tracer) { if (tracer) {
tracer->AddCPURecords(CurAnnotation(), start_ns_, PosixInNsec(), tracer->AddCPURecords(CurAnnotationName(), start_ns_, PosixInNsec(),
BlockDepth(), g_thread_id); BlockDepth(), g_thread_id);
} }
ClearCurAnnotation(); ClearCurAnnotation();
PopEvent(name_, dev_ctx_); PopEvent(name_);
} }
RecordRPCEvent::RecordRPCEvent(const std::string& name, RecordRPCEvent::RecordRPCEvent(const std::string& name) {
const DeviceContext* dev_ctx) {
if (FLAGS_enable_rpc_profiler) { if (FLAGS_enable_rpc_profiler) {
event_.reset(new platform::RecordEvent(name, dev_ctx)); event_.reset(new platform::RecordEvent(name));
} }
} }
RecordBlock::RecordBlock(int block_id) RecordBlock::RecordBlock(int block_id)
: is_enabled_(false), start_ns_(PosixInNsec()) { : is_enabled_(false), start_ns_(PosixInNsec()) {
std::lock_guard<std::mutex> l(profiler_mu); // lock is not needed, the code below is thread-safe
if (g_state == ProfilerState::kDisabled) return; if (g_state == ProfilerState::kDisabled) return;
is_enabled_ = true; is_enabled_ = true;
SetCurBlock(block_id); SetCurBlock(block_id);
...@@ -213,7 +183,7 @@ RecordBlock::RecordBlock(int block_id) ...@@ -213,7 +183,7 @@ RecordBlock::RecordBlock(int block_id)
} }
RecordBlock::~RecordBlock() { RecordBlock::~RecordBlock() {
std::lock_guard<std::mutex> l(profiler_mu); // lock is not needed, the code below is thread-safe
if (g_state == ProfilerState::kDisabled || !is_enabled_) return; if (g_state == ProfilerState::kDisabled || !is_enabled_) return;
DeviceTracer* tracer = GetDeviceTracer(); DeviceTracer* tracer = GetDeviceTracer();
if (tracer) { if (tracer) {
...@@ -225,11 +195,21 @@ RecordBlock::~RecordBlock() { ...@@ -225,11 +195,21 @@ RecordBlock::~RecordBlock() {
ClearCurBlock(); ClearCurBlock();
} }
void SynchronizeAllDevice() {
#ifdef PADDLE_WITH_CUDA
int count = GetCUDADeviceCount();
for (int i = 0; i < count; i++) {
SetDeviceId(i);
PADDLE_ENFORCE(cudaDeviceSynchronize());
}
#endif
}
void EnableProfiler(ProfilerState state) { void EnableProfiler(ProfilerState state) {
PADDLE_ENFORCE(state != ProfilerState::kDisabled, PADDLE_ENFORCE(state != ProfilerState::kDisabled,
"Can't enable profiling, since the input state is ", "Can't enable profiling, since the input state is ",
"ProfilerState::kDisabled"); "ProfilerState::kDisabled");
SynchronizeAllDevice();
std::lock_guard<std::mutex> l(profiler_mu); std::lock_guard<std::mutex> l(profiler_mu);
if (state == g_state) { if (state == g_state) {
return; return;
...@@ -238,23 +218,20 @@ void EnableProfiler(ProfilerState state) { ...@@ -238,23 +218,20 @@ void EnableProfiler(ProfilerState state) {
should_send_profile_state = true; should_send_profile_state = true;
GetDeviceTracer()->Enable(); GetDeviceTracer()->Enable();
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
if (g_state == ProfilerState::kCUDA) { if (g_state == ProfilerState::kCUDA || g_state == ProfilerState::kAll ||
g_state == ProfilerState::kCPU) {
// Generate some dummy events first to reduce the startup overhead. // Generate some dummy events first to reduce the startup overhead.
for (int i = 0; i < 5; i++) { DummyKernelAndEvent();
ForEachDevice([](int d) { GetDeviceTracer()->Reset();
DeviceContext* dev_ctx = new CUDADeviceContext(CUDAPlace(d));
Mark("_cuda_startup_", dev_ctx);
dev_ctx->Wait();
delete dev_ctx;
});
}
} }
#endif #endif
// Mark the profiling start. // Mark the profiling start.
Mark("_start_profiler_", nullptr); Mark("_start_profiler_");
} }
void ResetProfiler() { void ResetProfiler() {
SynchronizeAllDevice();
GetDeviceTracer()->Reset();
std::lock_guard<std::mutex> guard(g_all_event_lists_mutex); std::lock_guard<std::mutex> guard(g_all_event_lists_mutex);
for (auto it = g_all_event_lists.begin(); it != g_all_event_lists.end(); for (auto it = g_all_event_lists.begin(); it != g_all_event_lists.end();
++it) { ++it) {
...@@ -277,9 +254,11 @@ struct EventItem { ...@@ -277,9 +254,11 @@ struct EventItem {
std::string name; std::string name;
int calls; int calls;
double total_time; double total_time;
double min_time;
double max_time; double max_time;
double ave_time; double ave_time;
double min_time;
double cpu_time;
double gpu_time;
float ratio; float ratio;
}; };
...@@ -313,8 +292,12 @@ void PrintProfiler(const std::vector<std::vector<EventItem>>& events_table, ...@@ -313,8 +292,12 @@ void PrintProfiler(const std::vector<std::vector<EventItem>>& events_table,
// Output events table // Output events table
std::cout.setf(std::ios::left); std::cout.setf(std::ios::left);
std::cout << std::setw(name_width) << "Event" << std::setw(data_width) std::cout << std::setw(name_width) << "Event" << std::setw(data_width)
<< "Calls" << std::setw(data_width) << "Total" << "Calls" << std::setw(data_width) << "Total";
<< std::setw(data_width) << "Min." << std::setw(data_width) if (g_state == ProfilerState::kAll) {
std::cout << std::setw(data_width * 2) << "CPU Time (Ratio)"
<< std::setw(data_width * 2) << "GPU Time (Ratio)";
}
std::cout << std::setw(data_width) << "Min." << std::setw(data_width)
<< "Max." << std::setw(data_width) << "Ave." << "Max." << std::setw(data_width) << "Ave."
<< std::setw(data_width) << "Ratio." << std::endl; << std::setw(data_width) << "Ratio." << std::endl;
for (size_t i = 0; i < events_table.size(); ++i) { for (size_t i = 0; i < events_table.size(); ++i) {
...@@ -322,8 +305,18 @@ void PrintProfiler(const std::vector<std::vector<EventItem>>& events_table, ...@@ -322,8 +305,18 @@ void PrintProfiler(const std::vector<std::vector<EventItem>>& events_table,
const EventItem& event_item = events_table[i][j]; const EventItem& event_item = events_table[i][j];
std::cout << std::setw(name_width) << event_item.name std::cout << std::setw(name_width) << event_item.name
<< std::setw(data_width) << event_item.calls << std::setw(data_width) << event_item.calls
<< std::setw(data_width) << event_item.total_time << std::setw(data_width) << event_item.total_time;
<< std::setw(data_width) << event_item.min_time if (g_state == ProfilerState::kAll) {
std::cout << std::setw(data_width * 2)
<< string::Sprintf(
"%f (%f)", event_item.cpu_time,
(event_item.cpu_time / event_item.total_time))
<< std::setw(data_width * 2)
<< string::Sprintf(
"%f (%f)", event_item.gpu_time,
(event_item.gpu_time / event_item.total_time));
}
std::cout << std::setw(data_width) << event_item.min_time
<< std::setw(data_width) << event_item.max_time << std::setw(data_width) << event_item.max_time
<< std::setw(data_width) << event_item.ave_time << std::setw(data_width) << event_item.ave_time
<< std::setw(data_width) << event_item.ratio << std::endl; << std::setw(data_width) << event_item.ratio << std::endl;
...@@ -372,6 +365,18 @@ void ParseEvents(const std::vector<std::vector<Event>>& events, ...@@ -372,6 +365,18 @@ void ParseEvents(const std::vector<std::vector<Event>>& events,
return a.ave_time > b.ave_time; return a.ave_time > b.ave_time;
}; };
break; break;
case EventSortingKey::kGPUTime:
sorted_domain = "average time";
sorted_func = [](const EventItem& a, const EventItem& b) {
return a.gpu_time > b.gpu_time;
};
break;
case EventSortingKey::kCPUTime:
sorted_domain = "average time";
sorted_func = [](const EventItem& a, const EventItem& b) {
return a.cpu_time > b.cpu_time;
};
break;
default: default:
sorted_domain = "event first end time"; sorted_domain = "event first end time";
} }
...@@ -410,10 +415,17 @@ void ParseEvents(const std::vector<std::vector<Event>>& events, ...@@ -410,10 +415,17 @@ void ParseEvents(const std::vector<std::vector<Event>>& events,
} }
if (rit != pushed_events.rend()) { if (rit != pushed_events.rend()) {
double event_time = (g_state == ProfilerState::kCUDA || double event_time = 0;
g_state == ProfilerState::kAll) double gpu_time = rit->CudaElapsedMs((*analyze_events)[i][j]);
? rit->CudaElapsedMs((*analyze_events)[i][j]) double cpu_time = rit->CpuElapsedMs((*analyze_events)[i][j]);
: rit->CpuElapsedMs((*analyze_events)[i][j]); if (g_state == ProfilerState::kCUDA) {
event_time = gpu_time;
} else if (g_state == ProfilerState::kCPU) {
event_time = cpu_time;
} else {
event_time = gpu_time + cpu_time;
}
total += event_time; total += event_time;
std::string event_name; std::string event_name;
...@@ -430,7 +442,7 @@ void ParseEvents(const std::vector<std::vector<Event>>& events, ...@@ -430,7 +442,7 @@ void ParseEvents(const std::vector<std::vector<Event>>& events,
event_idx[event_name] = event_items.size(); event_idx[event_name] = event_items.size();
EventItem event_item = {event_name, 1, event_time, EventItem event_item = {event_name, 1, event_time,
event_time, event_time, event_time, event_time, event_time, event_time,
0.}; gpu_time, cpu_time, 0.};
event_items.push_back(event_item); event_items.push_back(event_item);
} else { } else {
int index = event_idx[event_name]; int index = event_idx[event_name];
...@@ -443,6 +455,8 @@ void ParseEvents(const std::vector<std::vector<Event>>& events, ...@@ -443,6 +455,8 @@ void ParseEvents(const std::vector<std::vector<Event>>& events,
// max time // max time
event_items[index].max_time = event_items[index].max_time =
std::max(event_time, event_items[index].max_time); std::max(event_time, event_items[index].max_time);
event_items[index].gpu_time += gpu_time;
event_items[index].cpu_time += cpu_time;
} }
// remove the push marker from the list // remove the push marker from the list
...@@ -481,20 +495,23 @@ void ParseEvents(const std::vector<std::vector<Event>>& events, ...@@ -481,20 +495,23 @@ void ParseEvents(const std::vector<std::vector<Event>>& events,
void DisableProfiler(EventSortingKey sorted_key, void DisableProfiler(EventSortingKey sorted_key,
const std::string& profile_path) { const std::string& profile_path) {
SynchronizeAllDevice();
std::lock_guard<std::mutex> l(profiler_mu); std::lock_guard<std::mutex> l(profiler_mu);
if (g_state == ProfilerState::kDisabled) return; if (g_state == ProfilerState::kDisabled) return;
// Mark the profiling stop. // Mark the profiling stop.
Mark("_stop_profiler_", nullptr); Mark("_stop_profiler_");
std::vector<std::vector<Event>> all_events = GetAllEvents();
ParseEvents(all_events, true, sorted_key);
ParseEvents(all_events, false, sorted_key);
ResetProfiler();
DeviceTracer* tracer = GetDeviceTracer(); DeviceTracer* tracer = GetDeviceTracer();
if (tracer->IsEnabled()) { if (tracer->IsEnabled()) {
tracer->Disable(); tracer->Disable();
tracer->GenProfile(profile_path); tracer->GenProfile(profile_path);
tracer->GenEventKernelCudaElapsedTime();
} }
std::vector<std::vector<Event>> all_events = GetAllEvents();
ParseEvents(all_events, true, sorted_key);
ParseEvents(all_events, false, sorted_key);
ResetProfiler();
g_state = ProfilerState::kDisabled; g_state = ProfilerState::kDisabled;
should_send_profile_state = true; should_send_profile_state = true;
} }
......
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/profiler.h"
#include <cuda.h>
namespace paddle {
namespace platform {
// Trivial kernel: stores 0 into the first int of the buffer `a`.
__global__ void DummyKernel(int *a) {
  *a = 0;
}
// Calls SetDeviceId(i) followed by func(i) for every index i in
// [0, GetCUDADeviceCount()), then re-selects the device id that
// GetCurrentDeviceId() reported on entry.
//
// func: callback that receives the device index.
// Any exception raised by `func` is propagated to the caller after the
// original device id has been re-selected.
static void ForEachDevice(std::function<void(int)> func) {
  const auto original_device = GetCurrentDeviceId();
  const int count = GetCUDADeviceCount();
  try {
    for (int i = 0; i < count; i++) {
      SetDeviceId(i);
      func(i);
    }
  } catch (...) {
    // Do not leave the caller on whichever device the failing iteration
    // selected.
    SetDeviceId(original_device);
    throw;
  }
  SetDeviceId(original_device);
}
void DummyKernelAndEvent() {
for (int i = 0; i < 5; i++) {
ForEachDevice([](int d) {
CUDADeviceContext *dev_ctx = new CUDADeviceContext(CUDAPlace(d));
Mark("_cuda_startup_");
int *ptr;
PADDLE_ENFORCE(cudaMalloc(&ptr, sizeof(int)));
DummyKernel<<<1, 1, 0, dev_ctx->stream()>>>(ptr);
dev_ctx->Wait();
PADDLE_ENFORCE(cudaFree(ptr));
delete dev_ctx;
});
}
}
} // namespace platform
} // namespace paddle
...@@ -28,17 +28,17 @@ class Event { ...@@ -28,17 +28,17 @@ class Event {
public: public:
// The DeviceContext is used to get the cuda stream. // The DeviceContext is used to get the cuda stream.
// If CPU profiling mode, can pass nullptr. // If CPU profiling mode, can pass nullptr.
Event(EventType type, std::string name, uint32_t thread_id, Event(EventType type, std::string name, uint32_t thread_id);
const DeviceContext* dev_ctx);
const EventType& type() const; const EventType& type() const;
std::string name() const { return name_; } std::string name() const { return name_; }
uint32_t thread_id() const { return thread_id_; } uint32_t thread_id() const { return thread_id_; }
bool has_cuda() const { return has_cuda_; }
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
#ifndef PADDLE_WITH_CUPTI
cudaEvent_t event() const { return event_; } cudaEvent_t event() const { return event_; }
int device() const { return device_; } int device() const { return device_; }
#endif
#endif #endif
double CpuElapsedMs(const Event& e) const; double CpuElapsedMs(const Event& e) const;
...@@ -49,11 +49,21 @@ class Event { ...@@ -49,11 +49,21 @@ class Event {
std::string name_; std::string name_;
uint32_t thread_id_; uint32_t thread_id_;
int64_t cpu_ns_; int64_t cpu_ns_;
bool has_cuda_;
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
#ifdef PADDLE_WITH_CUPTI
int64_t gpu_ns_ = 0;
public:
void AddCudaElapsedTime(int64_t start_ns, int64_t end_ns) {
gpu_ns_ += end_ns - start_ns;
}
private:
#else
cudaEvent_t event_ = nullptr; cudaEvent_t event_ = nullptr;
int device_ = -1; int device_ = -1;
#endif #endif
#endif
}; };
enum ProfilerState { enum ProfilerState {
...@@ -63,22 +73,19 @@ enum ProfilerState { ...@@ -63,22 +73,19 @@ enum ProfilerState {
kAll, // Profile both CPU and GPU. (Currently experimental). kAll, // Profile both CPU and GPU. (Currently experimental).
}; };
void Mark(const std::string& name, const DeviceContext* dev_ctx); void Mark(const std::string& name);
void PushEvent(const std::string& name, const DeviceContext* dev_ctx); Event* PushEvent(const std::string& name);
void PopEvent(const std::string& name, const DeviceContext* dev_ctx); void PopEvent(const std::string& name);
struct RecordEvent { struct RecordEvent {
// dev_ctx can be set to nullptr if device is cpu. explicit RecordEvent(const std::string& name);
RecordEvent(const std::string& name, const DeviceContext* dev_ctx);
~RecordEvent(); ~RecordEvent();
bool is_enabled_; bool is_enabled_;
uint64_t start_ns_; uint64_t start_ns_;
// The device context is used by Event to get the current cuda stream.
const DeviceContext* dev_ctx_;
// Event name // Event name
std::string name_; std::string name_;
// Need to distinguish name by op type, block_id, program_id and perhaps // Need to distinguish name by op type, block_id, program_id and perhaps
...@@ -88,8 +95,7 @@ struct RecordEvent { ...@@ -88,8 +95,7 @@ struct RecordEvent {
class RecordRPCEvent { class RecordRPCEvent {
public: public:
// dev_ctx can be set to nullptr if device is cpu. explicit RecordRPCEvent(const std::string& name);
RecordRPCEvent(const std::string& name, const DeviceContext* dev_ctx);
~RecordRPCEvent() {} ~RecordRPCEvent() {}
private: private:
...@@ -111,7 +117,16 @@ struct RecordBlock { ...@@ -111,7 +117,16 @@ struct RecordBlock {
std::vector<std::vector<Event>> GetAllEvents(); std::vector<std::vector<Event>> GetAllEvents();
// Candidate keys to sort the profiling report // Candidate keys to sort the profiling report
enum EventSortingKey { kDefault, kCalls, kTotal, kMin, kMax, kAve }; enum EventSortingKey {
kDefault,
kCalls,
kTotal,
kMin,
kMax,
kAve,
kCPUTime,
kGPUTime
};
// Enable the profiling function. // Enable the profiling function.
void EnableProfiler(ProfilerState state); void EnableProfiler(ProfilerState state);
...@@ -132,5 +147,9 @@ bool ShouldSendProfileState(); ...@@ -132,5 +147,9 @@ bool ShouldSendProfileState();
void SetProfileListener(); void SetProfileListener();
int64_t ListenerId(); int64_t ListenerId();
#ifdef PADDLE_WITH_CUDA
void DummyKernelAndEvent();
#endif
} // namespace platform } // namespace platform
} // namespace paddle } // namespace paddle
...@@ -31,6 +31,7 @@ message Event { ...@@ -31,6 +31,7 @@ message Event {
optional int64 sub_device_id = 6; optional int64 sub_device_id = 6;
optional MemCopy memcopy = 7; optional MemCopy memcopy = 7;
optional string detail_info = 9;
} }
message Profile { message Profile {
......
...@@ -23,76 +23,49 @@ TEST(Event, CpuElapsedTime) { ...@@ -23,76 +23,49 @@ TEST(Event, CpuElapsedTime) {
using paddle::platform::Event; using paddle::platform::Event;
using paddle::platform::EventType; using paddle::platform::EventType;
Event start_event(EventType::kPushRange, "test", 0, nullptr); Event start_event(EventType::kPushRange, "test", 0);
EXPECT_TRUE(start_event.has_cuda() == false);
int counter = 0; int counter = 0;
while (counter != 1000) { while (counter != 1000) {
counter++; counter++;
} }
Event stop_event(EventType::kPopRange, "test", 0, nullptr); Event stop_event(EventType::kPopRange, "test", 0);
EXPECT_GT(start_event.CpuElapsedMs(stop_event), 0); EXPECT_GT(start_event.CpuElapsedMs(stop_event), 0);
} }
#ifdef PADDLE_WITH_CUDA
TEST(Event, CudaElapsedTime) {
using paddle::platform::DeviceContext;
using paddle::platform::CUDADeviceContext;
using paddle::platform::CUDAPlace;
using paddle::platform::Event;
using paddle::platform::EventType;
DeviceContext* dev_ctx = new CUDADeviceContext(CUDAPlace(0));
Event start_event(EventType::kPushRange, "test", 0, dev_ctx);
EXPECT_TRUE(start_event.has_cuda() == true);
int counter = 0;
while (counter != 1000) {
counter++;
}
Event stop_event(EventType::kPopRange, "test", 0, dev_ctx);
EXPECT_GT(start_event.CudaElapsedMs(stop_event), 0);
}
#endif
TEST(RecordEvent, RecordEvent) { TEST(RecordEvent, RecordEvent) {
using paddle::platform::DeviceContext; using paddle::platform::DeviceContext;
using paddle::platform::Event; using paddle::platform::Event;
using paddle::platform::EventType; using paddle::platform::EventType;
using paddle::platform::RecordEvent; using paddle::platform::RecordEvent;
using paddle::platform::PushEvent;
using paddle::platform::PopEvent;
using paddle::platform::ProfilerState; using paddle::platform::ProfilerState;
using paddle::platform::EventSortingKey; using paddle::platform::EventSortingKey;
ProfilerState state = ProfilerState::kCPU; ProfilerState state = ProfilerState::kCPU;
DeviceContext* dev_ctx = nullptr;
#ifdef PADDLE_WITH_CUDA
using paddle::platform::CUDADeviceContext;
using paddle::platform::CUDAPlace;
state = ProfilerState::kCUDA;
dev_ctx =
new paddle::platform::CUDADeviceContext(paddle::platform::CUDAPlace(0));
#endif
EnableProfiler(state); EnableProfiler(state);
/* Usage 1: /* Usage 1:
* PushEvent(evt_name, dev_ctx); * PushEvent(evt_name);
* ... * ...
* code to be analyzed * code to be analyzed
* ... * ...
* PopEvent(evt_name, dev_ctx); * PopEvent(evt_name);
*/ */
LOG(INFO) << "Usage 1: PushEvent & PopEvent"; LOG(INFO) << "Usage 1: PushEvent & PopEvent";
for (int loop = 0; loop < 3; ++loop) { for (int loop = 0; loop < 3; ++loop) {
for (int i = 1; i < 5; ++i) { for (int i = 1; i < 5; ++i) {
std::string name = "op_" + std::to_string(i); std::string name = "op_" + std::to_string(i);
PushEvent(name, dev_ctx); PushEvent(name);
int counter = 1; int counter = 1;
while (counter != i * 1000) counter++; while (counter != i * 1000) counter++;
PopEvent(name, dev_ctx); PopEvent(name);
} }
} }
/* Usage 2: /* Usage 2:
* { * {
* RecordEvent record_event(name, dev_ctx); * RecordEvent record_event(name);
* ... * ...
* code to be analyzed * code to be analyzed
* ... * ...
...@@ -101,7 +74,7 @@ TEST(RecordEvent, RecordEvent) { ...@@ -101,7 +74,7 @@ TEST(RecordEvent, RecordEvent) {
LOG(INFO) << "Usage 2: RecordEvent"; LOG(INFO) << "Usage 2: RecordEvent";
for (int i = 1; i < 5; ++i) { for (int i = 1; i < 5; ++i) {
std::string name = "evs_op_" + std::to_string(i); std::string name = "evs_op_" + std::to_string(i);
RecordEvent record_event(name, dev_ctx); RecordEvent record_event(name);
int counter = 1; int counter = 1;
while (counter != i * 1000) counter++; while (counter != i * 1000) counter++;
} }
...@@ -123,20 +96,20 @@ TEST(RecordEvent, RecordEvent) { ...@@ -123,20 +96,20 @@ TEST(RecordEvent, RecordEvent) {
LOG(INFO) << "Usage 3: nested RecordEvent"; LOG(INFO) << "Usage 3: nested RecordEvent";
for (int i = 1; i < 5; ++i) { for (int i = 1; i < 5; ++i) {
std::string name = "ano_evs_op_" + std::to_string(i); std::string name = "ano_evs_op_" + std::to_string(i);
RecordEvent record_event(name, dev_ctx); RecordEvent record_event(name);
int counter = 1; int counter = 1;
while (counter != i * 100) counter++; while (counter != i * 100) counter++;
{ {
std::string nested_name = "nested_ano_evs_op_" + std::to_string(i); std::string nested_name = "nested_ano_evs_op_" + std::to_string(i);
RecordEvent nested_record_event(nested_name, dev_ctx); RecordEvent nested_record_event(nested_name);
int nested_counter = 1; int nested_counter = 1;
while (nested_counter != i * 100) nested_counter++; while (nested_counter != i * 100) nested_counter++;
} }
} }
// Bad Usage: // Bad Usage:
PushEvent("event_without_pop", dev_ctx); PushEvent("event_without_pop");
PopEvent("event_without_push", dev_ctx); PopEvent("event_without_push");
std::vector<std::vector<Event>> events = paddle::platform::GetAllEvents(); std::vector<std::vector<Event>> events = paddle::platform::GetAllEvents();
int cuda_startup_count = 0; int cuda_startup_count = 0;
......
...@@ -106,6 +106,11 @@ bool IsCompiledWithDIST() { ...@@ -106,6 +106,11 @@ bool IsCompiledWithDIST() {
#endif #endif
} }
template <typename PlaceType1, typename PlaceType2>
static inline bool IsSamePlace(const PlaceType1 &p1, const PlaceType2 &p2) {
return paddle::platform::Place(p1) == paddle::platform::Place(p2);
}
PYBIND11_MODULE(core, m) { PYBIND11_MODULE(core, m) {
// Not used, just make sure cpu_info.cc is linked. // Not used, just make sure cpu_info.cc is linked.
paddle::platform::CpuTotalPhysicalMemory(); paddle::platform::CpuTotalPhysicalMemory();
...@@ -732,23 +737,45 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -732,23 +737,45 @@ All parameter, weight, gradient are variables in Paddle.
PADDLE_THROW("Cannot use CUDAPlace in CPU only version"); PADDLE_THROW("Cannot use CUDAPlace in CPU only version");
#endif #endif
}) })
.def("_equals", &IsSamePlace<platform::CUDAPlace, platform::Place>)
.def("_equals", &IsSamePlace<platform::CUDAPlace, platform::CUDAPlace>)
.def("_equals", &IsSamePlace<platform::CUDAPlace, platform::CPUPlace>)
.def("_equals",
&IsSamePlace<platform::CUDAPlace, platform::CUDAPinnedPlace>)
.def("__str__", string::to_string<const platform::CUDAPlace &>); .def("__str__", string::to_string<const platform::CUDAPlace &>);
py::class_<paddle::platform::CPUPlace>(m, "CPUPlace") py::class_<paddle::platform::CPUPlace>(m, "CPUPlace")
.def(py::init<>()) .def(py::init<>())
.def("_equals", &IsSamePlace<platform::CPUPlace, platform::Place>)
.def("_equals", &IsSamePlace<platform::CPUPlace, platform::CUDAPlace>)
.def("_equals", &IsSamePlace<platform::CPUPlace, platform::CPUPlace>)
.def("_equals",
&IsSamePlace<platform::CPUPlace, platform::CUDAPinnedPlace>)
.def("__str__", string::to_string<const platform::CPUPlace &>); .def("__str__", string::to_string<const platform::CPUPlace &>);
py::class_<paddle::platform::CUDAPinnedPlace>(m, "CUDAPinnedPlace") py::class_<paddle::platform::CUDAPinnedPlace>(m, "CUDAPinnedPlace")
.def("__init__", .def("__init__",
[](platform::CUDAPinnedPlace &) { [](platform::CUDAPinnedPlace &self) {
#ifndef PADDLE_WITH_CUDA #ifndef PADDLE_WITH_CUDA
PADDLE_THROW("Cannot use CUDAPinnedPlace in CPU only version"); PADDLE_THROW("Cannot use CUDAPinnedPlace in CPU only version");
#endif #endif
new (&self) platform::CUDAPinnedPlace();
}) })
.def("_equals", &IsSamePlace<platform::CUDAPinnedPlace, platform::Place>)
.def("_equals",
&IsSamePlace<platform::CUDAPinnedPlace, platform::CUDAPlace>)
.def("_equals",
&IsSamePlace<platform::CUDAPinnedPlace, platform::CPUPlace>)
.def("_equals",
&IsSamePlace<platform::CUDAPinnedPlace, platform::CUDAPinnedPlace>)
.def("__str__", string::to_string<const platform::CUDAPinnedPlace &>); .def("__str__", string::to_string<const platform::CUDAPinnedPlace &>);
py::class_<platform::Place>(m, "Place") py::class_<platform::Place>(m, "Place")
.def(py::init<>()) .def(py::init<>())
.def("_equals", &IsSamePlace<platform::Place, platform::Place>)
.def("_equals", &IsSamePlace<platform::Place, platform::CUDAPlace>)
.def("_equals", &IsSamePlace<platform::Place, platform::CPUPlace>)
.def("_equals", &IsSamePlace<platform::Place, platform::CUDAPinnedPlace>)
.def("is_gpu_place", .def("is_gpu_place",
[](platform::Place &self) { return platform::is_gpu_place(self); }) [](platform::Place &self) { return platform::is_gpu_place(self); })
.def("gpu_device_id", .def("gpu_device_id",
......
...@@ -9,7 +9,6 @@ ...@@ -9,7 +9,6 @@
PADDLE_LIB=/paddle/lib/dir PADDLE_LIB=/paddle/lib/dir
cmake .. -DFLUID_INSTALL_DIR=$PADDLE_LIB \ cmake .. -DFLUID_INSTALL_DIR=$PADDLE_LIB \
-DCMAKE_BUILD_TYPE=Release \ -DCMAKE_BUILD_TYPE=Release \
-DWITH_FLUID_ONLY=ON \
-DWITH_GPU=OFF \ -DWITH_GPU=OFF \
-DWITH_STYLE_CHECK=OFF \ -DWITH_STYLE_CHECK=OFF \
-DWITH_MKL=OFF \ -DWITH_MKL=OFF \
......
...@@ -66,12 +66,10 @@ Users can specify the following Docker build arguments with either "ON" or "OFF" ...@@ -66,12 +66,10 @@ Users can specify the following Docker build arguments with either "ON" or "OFF"
| `WITH_AVX` | OFF | Set to "ON" to enable AVX support. | | `WITH_AVX` | OFF | Set to "ON" to enable AVX support. |
| `WITH_TESTING` | OFF | Build unit tests binaries. | | `WITH_TESTING` | OFF | Build unit tests binaries. |
| `WITH_MKL` | ON | Build with [Intel® MKL](https://software.intel.com/en-us/mkl) and [Intel® MKL-DNN](https://github.com/01org/mkl-dnn) support. | | `WITH_MKL` | ON | Build with [Intel® MKL](https://software.intel.com/en-us/mkl) and [Intel® MKL-DNN](https://github.com/01org/mkl-dnn) support. |
| `WITH_GOLANG` | OFF | Build fault-tolerant parameter server written in go. |
| `WITH_PYTHON` | ON | Build with python support. Turn this off if build is only for capi. | | `WITH_PYTHON` | ON | Build with python support. Turn this off if build is only for capi. |
| `WITH_STYLE_CHECK` | ON | Check the code style when building. | | `WITH_STYLE_CHECK` | ON | Check the code style when building. |
| `PYTHON_ABI` | "" | Build for different python ABI support, can be cp27-cp27m or cp27-cp27mu | | `PYTHON_ABI` | "" | Build for different python ABI support, can be cp27-cp27m or cp27-cp27mu |
| `RUN_TEST` | OFF | Run unit test immediately after the build. | | `RUN_TEST` | OFF | Run unit test immediately after the build. |
| `WITH_DOC` | OFF | Build docs after build binaries. |
| `WOBOQ` | OFF | Generate WOBOQ code viewer under `build/woboq_out` | | `WOBOQ` | OFF | Generate WOBOQ code viewer under `build/woboq_out` |
## Docker Images ## Docker Images
......
#!/bin/bash #!/bin/bash
## Print the first argument in purple (SGR 35), then reset attributes.
purple() {
    local msg="$1"
    echo -e "\033[35m${msg}\033[0m"
}
## Echo the first argument in green (ANSI SGR 32), then reset attributes.
## Backslash escapes inside the argument are interpreted because of `echo -e`.
function green(){
    local start="\033[32m" reset="\033[0m"
    echo -e "${start}${1}${reset}"
}
## Echo the first argument as a blinking error: red (SGR 31) + bold (SGR 01)
## + blink (SGR 05), followed by an attribute reset.
function bred(){
    local start="\033[31m\033[01m\033[05m" reset="\033[0m"
    echo -e "${start}${1}${reset}"
}
## Echo the first argument as a blinking warning: yellow (SGR 33) + bold (SGR 01)
## + blink (SGR 05), followed by an attribute reset.
function byellow(){
    local start="\033[33m\033[01m\033[05m" reset="\033[0m"
    echo -e "${start}${1}${reset}"
}
## Echo the first argument as an error: red (SGR 31) + bold (SGR 01),
## followed by an attribute reset.
function red(){
    local start="\033[31m\033[01m" reset="\033[0m"
    echo -e "${start}${1}${reset}"
}
## Echo the first argument as a warning: yellow (SGR 33) + bold (SGR 01),
## followed by an attribute reset.
function yellow(){
    local start="\033[33m\033[01m" reset="\033[0m"
    echo -e "${start}${1}${reset}"
}
path='http://paddlepaddle.org/download?url=' path='http://paddlepaddle.org/download?url='
#release_version=`curl -s https://pypi.org/project/paddlepaddle/|grep -E "/project/paddlepaddle/"|grep "release"|awk -F '/' '{print $(NF-1)}'|head -1` #release_version=`curl -s https://pypi.org/project/paddlepaddle/|grep -E "/project/paddlepaddle/"|grep "release"|awk -F '/' '{print $(NF-1)}'|head -1`
release_version=1.2.0 release_version=1.2.0
...@@ -228,16 +260,112 @@ function checkLinuxPaddleVersion(){ ...@@ -228,16 +260,112 @@ function checkLinuxPaddleVersion(){
done done
} }
function checkLinuxPip(){ function checkPythonVirtualenv(){
while true while true
do do
echo "请输入您要使用的pip目录(您可以另起终端,并使用which pip来查看):" read -p "
read -p "" pip_path 是否使用python virtualenv虚环境安装(y/n)": check_virtualenv
if [ "$pip_path" == "" -o ! -f "$pip_path" ];then case $check_virtualenv in
echo "检测结果:pip不存在,请重新输入" y)
echo "为您使用python虚环境安装"
;;
n)
break
;;
*)
continue
;;
esac
virtualenv_path=`which virtualenv 2>&1`
if [ "$virtualenv_path" == "" ];then
$python_path -m pip install virtualenv
if [ "$?" != '0' ];then
echo "安装虚拟环境失败,请检查本地环境"
fi
fi
while true
do
read -p "请输入虚拟环境名字:" virtualenv_name
if [ "$virtualenv_name" == "" ];then
echo "不能为空"
continue
fi
break
done
virtualenv -p $python_path ${virtualenv_name}
if [ "$?" != 0 ];then
echo "创建虚环境失败,请检查环境"
exit 2
fi
cd ${virtualenv_name}
source ./bin/activate
if [ "$?" == 0 ];then
use_virtualenv=
python_path=`which python`
break
else
echo "创建虚环境失败,请检查环境"
exit 2
fi
done
}
function checkLinuxPython(){
python_path=`which python 2>/dev/null`
while true
do
if [ "$python_path" == '' ];then
while true
do
read -p "没有找到默认的python版本,请输入要安装的python路径:" python_path
python_path=`$python_path -V`
if [ "$python_path" != "" ];then
break
else
echo "输入路径有误,未找到pyrhon"
fi
done
fi
python_version=`$python_path -V 2>&1|awk -F '[ .]' '{print $2$3}'`
pip_version=`$python_path -m pip -V|awk -F '[ .]' '{print $2}'`
while true
do
read -p "
找到python版本$python_version,使用请输入y,选择其他版本请输n(y/n):" check_python
case $check_python in
n)
read -p "请指定您的python路径:" new_python_path
python_V=`$new_python_path -V 2>/dev/null`
if [ "$python_V" != "" ];then
python_path=$new_python_path
python_version=`$python_path -V 2>&1|awk -F '[ .]' '{print $2$3}'`
pip_version=`python -m pip -V|awk -F '[ .]' '{print $2}'`
echo "您的python版本为${python_version}"
break
else
echo 输入有误,未找到python路径
fi
;;
y)
break
;;
*)
echo "输入有误,请重新输入."
continue continue
;;
esac
done
if [ "$pip_version" -lt 9 ];then
echo "您的pip版本小于9.0.1 请升级pip (pip install --upgrade pip)"
exit 0
fi fi
python_version=`$pip_path --version|awk -F "[ |)]" '{print $6}'|sed 's#\.##g'`
if [ "$python_version" == "27" ];then if [ "$python_version" == "27" ];then
uncode=`python -c "import pip._internal;print(pip._internal.pep425tags.get_supported())"|grep "cp27mu"` uncode=`python -c "import pip._internal;print(pip._internal.pep425tags.get_supported())"|grep "cp27mu"`
if [[ "$uncode" == "" ]];then if [[ "$uncode" == "" ]];then
...@@ -246,16 +374,12 @@ function checkLinuxPip(){ ...@@ -246,16 +374,12 @@ function checkLinuxPip(){
uncode=u uncode=u
fi fi
fi fi
if [ "$python_version" == "" ];then
echo "检测结果:pip不存在,请重新输入"
else
version_list=`echo "${python_list[@]}" | grep "$python_version" ` version_list=`echo "${python_list[@]}" | grep "$python_version" `
if [ "$version_list" != "" ];then if [ "$version_list" == "" ];then
echo "检测结果:找到python${python_version}版本" echo "找不到可用的 pip, 我们只支持Python27/35/36/37及其对应的pip, 请重新输入, 或使用ctrl + c退出 "
break
else else
echo "检测结果:找不到可用的 pip, 我们只支持Python27/35/36/37及其对应的pip, 请重新输入, 或使用ctrl + c退出 " break
fi
fi fi
done done
} }
...@@ -287,25 +411,36 @@ function PipLinuxInstall(){ ...@@ -287,25 +411,36 @@ function PipLinuxInstall(){
wheel_cpu_develop="http://paddle-wheel.bj.bcebos.com/latest-cpu-${AVX}-${math}/paddlepaddle-latest-cp${python_version}-cp${python_version}m${uncode}-linux_x86_64.whl" wheel_cpu_develop="http://paddle-wheel.bj.bcebos.com/latest-cpu-${AVX}-${math}/paddlepaddle-latest-cp${python_version}-cp${python_version}m${uncode}-linux_x86_64.whl"
wheel_gpu_develop="http://paddle-wheel.bj.bcebos.com/latest-gpu-cuda${CUDA}-cudnn${CUDNN}-${AVX}-${math}/paddlepaddle_gpu-latest-cp${python_version}-cp${python_version}m${uncode}-linux_x86_64.whl" wheel_gpu_develop="http://paddle-wheel.bj.bcebos.com/latest-gpu-cuda${CUDA}-cudnn${CUDNN}-${AVX}-${math}/paddlepaddle_gpu-latest-cp${python_version}-cp${python_version}m${uncode}-linux_x86_64.whl"
if [[ "$paddle_version" == "2" ]];then if [[ "$paddle_version" == "2" ]];then
if [[ "$GPU" == "gpu" ]];then if [[ "$GPU" == "gpu" ]];then
if [[ ${AVX} == "avx" ]];then if [[ ${AVX} == "avx" ]];then
rm -rf `echo $wheel_gpu_release|awk -F '/' '{print $NF}'` rm -rf `echo $wheel_gpu_release|awk -F '/' '{print $NF}'`
wget -q $wheel_gpu_release wget -q $wheel_gpu_release
if [ "$?" == "0" ];then if [ "$?" == "0" ];then
$pip_path install --user -i https://mirrors.aliyun.com/pypi/simple --trusted-host=mirrors.aliyun.com $wheel_gpu_release $python_path -m pip install ${use_virtualenv} -i https://mirrors.aliyun.com/pypi/simple --trusted-host=mirrors.aliyun.com $wheel_gpu_release
if [ "$?" == 0 ];then
echo 安装成功
else else
echo "paddlepaddle whl包下载失败" echo 安装失败
exit 1
fi
else
echo paddlepaddle whl包下载失败
exit 1 exit 1
fi fi
else else
rm -rf `echo $wheel_gpu_release_novax|awk -F '/' '{print $NF}'` rm -rf `echo $wheel_gpu_release_novax|awk -F '/' '{print $NF}'`
wget -q $wheel_gpu_release_novax wget -q $wheel_gpu_release_novax
if [ "$?" == "0" ];then if [ "$?" == "0" ];then
$pip_path install --user -i https://mirrors.aliyun.com/pypi/simple --trusted-host=mirrors.aliyun.com $wheel_gpu_release_noavx $python_path -m pip install ${use_virtualenv} -i https://mirrors.aliyun.com/pypi/simple --trusted-host=mirrors.aliyun.com $wheel_gpu_release_noavx
if [ "$?" == 0 ];then
echo 安装成功
else else
echo "paddlepaddle whl包下载失败" echo 安装失败
exit 1
fi
else
echo paddlepaddle whl包下载失败
exit 1 exit 1
fi fi
fi fi
...@@ -313,9 +448,15 @@ function PipLinuxInstall(){ ...@@ -313,9 +448,15 @@ function PipLinuxInstall(){
rm -rf `echo $wheel_cpu_release|awk -F '/' '{print $NF}'` rm -rf `echo $wheel_cpu_release|awk -F '/' '{print $NF}'`
wget -q $wheel_cpu_release wget -q $wheel_cpu_release
if [ "$?" == "0" ];then if [ "$?" == "0" ];then
$pip_path install --user -i https://mirrors.aliyun.com/pypi/simple --trusted-host=mirrors.aliyun.com $wheel_cpu_release $python_path -m pip install ${use_virtualenv} -i https://mirrors.aliyun.com/pypi/simple --trusted-host=mirrors.aliyun.com $wheel_cpu_release
if [ "$?" == 0 ];then
echo 安装成功
else
echo 安装失败
exit 1
fi
else else
echo "paddlepaddle whl包下载失败" echo paddlepaddle whl包下载失败
exit 1 exit 1
fi fi
fi fi
...@@ -324,18 +465,30 @@ function PipLinuxInstall(){ ...@@ -324,18 +465,30 @@ function PipLinuxInstall(){
rm -rf `echo $wheel_gpu_develop|awk -F '/' '{print $NF}'` rm -rf `echo $wheel_gpu_develop|awk -F '/' '{print $NF}'`
wget -q $wheel_gpu_develop wget -q $wheel_gpu_develop
if [ "$?" == "0" ];then if [ "$?" == "0" ];then
$pip_path install --user -i https://mirrors.aliyun.com/pypi/simple --trusted-host=mirrors.aliyun.com $wheel_gpu_develop $python_path -m pip install ${use_virtualenv} -i https://mirrors.aliyun.com/pypi/simple --trusted-host=mirrors.aliyun.com $wheel_gpu_develop
if [ "$?" == 0 ];then
echo 安装成功
else
echo 安装失败
exit 1
fi
else else
echo "paddlepaddle whl包下载失败" echo paddlepaddle whl包下载失败
exit 1 exit 1
fi fi
else else
rm -rf `echo $wheel_cpu_develop|awk -F '/' '{print $NF}'` rm -rf `echo $wheel_cpu_develop|awk -F '/' '{print $NF}'`
wget -q $wheel_cpu_develop wget -q $wheel_cpu_develop
if [ "$?" == "0" ];then if [ "$?" == "0" ];then
$pip_path install --user -i https://mirrors.aliyun.com/pypi/simple --trusted-host=mirrors.aliyun.com $wheel_cpu_develop $python_path -m pip install ${use_virtualenv} -i https://mirrors.aliyun.com/pypi/simple --trusted-host=mirrors.aliyun.com $wheel_cpu_develop
if [ "$?" == 0 ];then
echo 安装成功
else else
echo "paddlepaddle whl包下载失败" echo 安装失败
exit 1
fi
else
echo paddlepaddle whl包下载失败
exit 1 exit 1
fi fi
fi fi
...@@ -575,52 +728,72 @@ gpu_list=( ...@@ -575,52 +728,72 @@ gpu_list=(
echo echo
echo "Step 5. 检测pip版本" echo "Step 5. 检测pip版本"
echo echo
checkLinuxPip checkLinuxPython
echo echo
checkLinuxAVX checkLinuxAVX
echo
echo "Step 6.是否使用Python的虚拟环境"
use_virtualenv="--user"
checkPythonVirtualenv
echo "*********************2. 开始安装*****************************" echo "*********************2. 开始安装*****************************"
PipLinuxInstall PipLinuxInstall
if [ "$check_virtualenv" == 'y' ];then
echo "虚环境创建成功,请cd 进入${virtualenv_name}, 执行 source bin/activate 进入虚环境。退出虚环境执行 deactivate命令。
更多虚环境使用方法请参考virtualenv官网:https://virtualenv.pypa.io/en/latest/"
fi
}
## Reset the global Python selection state: python_root, python_brief_version
## and python_version are each set to the empty string.
function clearMacPythonEnv(){
    python_root=''
    python_brief_version=''
    python_version=''
}
function checkMacPython2(){ function checkMacPython2(){
while true while true
do do
read -p "
=> 未能在常规路径下找到Python2,请使用ctrl+c命令退出安装程序,并使用brew或pypi.org下载安装Python2(注意Python版本不能低于2.7.15)
如希望自定义Python路径,请输入路径:" python_root
echo
python_version=`$python_root --version 2>&1 1>&1` python_version=`$python_root --version 2>&1 1>&1`
if [ $? == "0" ];then if [[ $? == "0" ]];then
: if [ "$python_version" == "" ] || [ "$python_root" == "/usr/bin/python" -a "$python_version" == "Python 2.7.10" ];then
clearMacPythonEnv
else else
python_version=""
fi
check_python=`echo $python_version | grep "Python 2"` check_python=`echo $python_version | grep "Python 2"`
if [ "$python_version" == "" ] || [ "$python_root" == "/usr/bin/python" -a "$python_version" == "Python 2.7.10" ] ;then if [[ -n "$check_python" ]];then
python_version=""
elif [ -n "$check_python" ];then
while true while true
do do
read -p " echo -e " => 在您的环境中找到 \033[32m[ $python_version ]\033[0m, 确认使用此版本请输入y;如您希望自定义Python路径请输入n。请在这里输入(y/n)并回车: "
=> 在您的环境中找到 $python_version, 确认使用此版本请输入y;如您希望自定义Python路径请输入n。请在这里输入(y/n)并回车: " use_python read -p "" use_python
echo echo
use_python=`echo $use_python | tr 'A-Z' 'a-z'` use_python=`echo $use_python | tr 'A-Z' 'a-z'`
if [ "$use_python" == "y" ]||[ "$use_python" == "" ];then if [[ "$use_python" == "y" ]]||[[ "$use_python" == "" ]];then
use_python="y" use_python="y"
break break
elif [ "$use_python" == "n" ];then elif [[ "$use_python" == "n" ]];then
python_root="" clearMacPythonEnv
break break
else else
echo "输入错误,请重新输入(y/n)" red " 输入错误,请重新输入(y/n)"
fi fi
done done
if [ "$use_python" == "y" ];then if [[ "$use_python" == "y" ]];then
break return 0
fi fi
else else
echo "您输入Python的不是Python2" red " 您输入Python的不是Python2"
python_version="" clearMacPythonEnv
fi
fi
else
clearMacPythonEnv
red " => 未能在常规路径下找到可用的Python2,请使用ctrl+c命令退出安装程序,并使用brew或pypi.org下载安装Python2(注意Python版本不能低于2.7.15)"
read -p " 如希望自定义Python路径,请输入路径
如果希望重新选择Python版本,请回车:" python_root
echo
if [[ "$python_root" == "" ]];then
python_V=""
clearMacPythonEnv
return 1
fi
fi fi
done done
} }
...@@ -628,41 +801,48 @@ function checkMacPython2(){ ...@@ -628,41 +801,48 @@ function checkMacPython2(){
function checkMacPython3(){ function checkMacPython3(){
while true while true
do do
read -p "
=> 未能在常规路径下找到Python3,请使用ctrl+c命令退出安装程序,并使用brew或pypi.org下载Python3
如希望自定义Python路径,请输入路径:" python_root
python_version=`$python_root --version 2>&1 1>&1` python_version=`$python_root --version 2>&1 1>&1`
if [ $? == "0" ];then if [[ $? == "0" ]];then
: if [ "$python_version" == "" ] || [ "$python_root" == "/usr/bin/python" -a "$python_version" == "Python 2.7.10" ] ;then
clearMacPythonEnv
else else
python_version=""
fi
check_python=`echo $python_version | grep "Python 3"` check_python=`echo $python_version | grep "Python 3"`
if [ "$python_version" == "" ] || [ "$python_root" == "/usr/bin/python" -a "$python_version" == "Python 2.7.10" ] ;then if [[ -n "$check_python" ]];then
python_version=""
elif [ -n "$check_python" ] ;then
while true while true
do do
read -p " echo -e " => 在您的环境中找到 \033[32m[ $python_version ]\033[0m, 确认使用此版本请输入y;如您希望自定义Python路径请输入n。请在这里输入(y/n)并回车: "
=> 在您的环境中找到 $python_version, 确认使用此版本请输入y;如您希望自定义Python路径请输入n。请在这里输入(y/n)并回车: " use_python read -p "" use_python
echo echo
use_python=`echo $use_python | tr 'A-Z' 'a-z'` use_python=`echo $use_python | tr 'A-Z' 'a-z'`
if [ "$use_python" == "y" ]||[ "$use_python" == "" ];then if [[ "$use_python" == "y" ]]||[[ "$use_python" == "" ]];then
use_python="y" use_python="y"
break break
elif [ "$use_python" == "n" ];then elif [[ "$use_python" == "n" ]];then
python_root="" clearMacPythonEnv
break break
else else
echo "输入错误,请重新输入(y/n)" red " 输入错误,请重新输入(y/n)"
fi fi
done done
if [ "$use_python" == "y" ];then if [[ "$use_python" == "y" ]];then
break return 0
fi fi
else else
echo "您输入Python的不是Python3" red " 您输入Python的不是Python3"
python_version="" clearMacPythonEnv
fi
fi
else
clearMacPythonEnv
red " => 未能在常规路径下找到可用的Python3,请使用ctrl+c命令退出安装程序,并使用brew或pypi.org下载安装Python3(注意Python版本不能低于3.5.x)"
read -p " 如希望自定义Python路径,请输入路径
如果希望重新选择Python版本,请回车:" python_root
echo
if [[ "$python_root" == "" ]];then
python_V=""
clearMacPythonEnv
return 1
fi
fi fi
done done
} }
...@@ -672,105 +852,75 @@ function checkMacPaddleVersion(){ ...@@ -672,105 +852,75 @@ function checkMacPaddleVersion(){
do do
read -n1 -p "Step 2. 选择PaddlePaddle的版本,请按回车键继续..." read -n1 -p "Step 2. 选择PaddlePaddle的版本,请按回车键继续..."
echo echo
read -p " yellow " 1. 开发版:对应Github上develop分支,如您需要开发、或希望使用PaddlePaddle最新功能,请选用此版本"
1. 开发版:对应Github上develop分支,如您需要开发、或希望使用PaddlePaddle最新功能,请选用此版本 yellow " 2. 稳定版(推荐):如您无特殊开发需求,建议使用此版本,目前最新的版本号为 ${release_version}"
2. 稳定版(推荐):如您无特殊开发需求,建议使用此版本,目前最新的版本号为 ${release_version} read -p " => 请输入数字1或2。如输入其他字符或直接回车,将会默认选择【 2. 稳定版 】 。请在这里输入并回车:" paddle_version
if [[ "$paddle_version" == "1" ]]||[[ "$paddle_version" == "2" ]];then
=> 请输入数字1或2。如输入其他字符或直接回车,将会默认选择【 2. 稳定版 】 。请在这里输入并回车:" paddle_version
if [ "$paddle_version" == "1" ]||[ "$paddle_version" == "2" ];then
echo echo
echo "您选择了数字【"$paddle_version" 】" yellow " 您选择了数字【"$paddle_version" 】"
echo echo
break break
else else
paddle_version="2" paddle_version="2"
echo echo
echo "您选择了数字【2】" yellow " 您选择了数字【2】"
echo echo
break break
fi fi
done done
} }
function initCheckMacPython2(){
function checkMacPythonVersion(){
while true
do
read -n1 -p "Step 3. 选择Python版本,请按回车键继续..."
read -p "
2. 使用python 2.x
3. 使用python 3.x
=> 请输入数字2或3。如输入其他字符或直接回车,将会默认使用【Python 2 】。请在这里输入并回车:" python_V
echo echo
if [ "$python_V" == "" ];then yellow " 您选择了Python "$python_V",正在寻找符合要求的Python 2版本"
python_V="2"
fi
echo "您选择了数字【"$python_V"】,正在寻找符合您要求的Python版本,请按回车键继续..."
echo echo
if [ "$python_V" == "2" ];then
python_root=`which python2.7` python_root=`which python2.7`
if [ "$python_root" == "" ];then if [[ "$python_root" == "" ]];then
python_root=`which python` python_root=`which python`
fi fi
python_version=`$python_root --version 2>&1 1>&1`
if [ $? == "0" ];then
:
else
python_version=""
fi
if [ "$python_root" == "" ]||[ "$python_root" == "/usr/bin/python" -a "$python_version" == "Python 2.7.10" ]||[ "$python_root" == "/usr/bin/python2.7" -a "$python_version" == "Python 2.7.10" ];then
checkMacPython2 checkMacPython2
fi if [[ "$?" == "1" ]];then
while true return 1
do
read -p "
=> 在您的环境中找到 $python_version, 确认使用此版本请输入y;如您希望自定义Python路径请输入n。请在这里输入(y/n)并回车:" use_python
echo
use_python=`echo $use_python | tr 'A-Z' 'a-z'`
if [ "$use_python" == "y" ]||[ "$use_python" == "" ];then
break
elif [ "$use_python" == "n" ];then
python_root=""
checkMacPython2
break
else else
echo "输入错误,请重新输入(y/n)" return 0
fi fi
done }
elif [ "$python_V" == "3" ];then function initCheckMacPython3(){
python_root=`which python3`
python_version=`$python_root --version 2>&1 1>&1`
if [ $? == "0" ];then
:
else
python_version=""
fi
if [ "$python_root" == "" ]||[ "$python_root" == "/usr/bin/python" -a "$python_version" == "Python 2.7.10" ];then
checkMacPython3
fi
while true
do
read -p "
=> 在您的环境中找到 $python_version, 确认使用此版本请输入y;如您希望自定义Python路径请输入n。请在这里输入(y/n)并回车:" use_python
echo echo
use_python=`echo $use_python | tr 'A-Z' 'a-z'` yellow " 您选择了Python "$python_V",正在寻找符合您要求的Python 2版本"
if [ "$use_python" == "y" ]||[ "$use_python" == "" ];then echo
break python_root=`which python3`
elif [ "$use_python" == "n" ];then
checkMacPython3 checkMacPython3
break if [[ "$?" == "1" ]];then
return 1
else else
echo "输入错误,请重新输入(y/n)" return 0
fi
done
else
:
fi fi
}
function checkMacPip(){
if [[ "$python_V" == "2" ]]||[[ "$python_V" == "3" ]];then
if [ "$python_V" == "2" ]||[ "$python_V" == "3" ];then
python_brief_version=`$python_root -m pip -V |awk -F "[ |)]" '{print $6}'|sed 's#\.##g'` python_brief_version=`$python_root -m pip -V |awk -F "[ |)]" '{print $6}'|sed 's#\.##g'`
if [[ ${python_brief_version} == "" ]];then
red "您输入的python:${python_root} 对应的pip不可用,请检查此pip或重新选择其他python"
echo
return 1
fi
pip_version=`$python_root -m pip -V |awk -F '[ .]' '{print $2}'`
if [[ 9 -le ${pip_version} ]];then
:
else
red "您的pip版本过低,请安装pip 9.0.1及以上的版本"
echo
return 1
fi
if [[ "$python_brief_version" == "" ]];then
clearMacPythonEnv
red "您的 $python_root 对应的pip存在问题,请按ctrl + c退出后重新安装pip,或切换其他python版本"
echo
return 1
else
if [[ $python_brief_version == "27" ]];then if [[ $python_brief_version == "27" ]];then
uncode=`python -c "import pip._internal;print(pip._internal.pep425tags.get_supported())"|grep "cp27"` uncode=`python -c "import pip._internal;print(pip._internal.pep425tags.get_supported())"|grep "cp27"`
if [[ $uncode == "" ]];then if [[ $uncode == "" ]];then
...@@ -780,37 +930,82 @@ function checkMacPythonVersion(){ ...@@ -780,37 +930,82 @@ function checkMacPythonVersion(){
fi fi
fi fi
version_list=`echo "${python_list[@]}" | grep "$python_brief_version" ` version_list=`echo "${python_list[@]}" | grep "$python_brief_version" `
if [ "$version_list" != "" ];then if [[ "$version_list" != "" ]];then
break return 0
else
red "未找到可用的pip或pip3。PaddlePaddle目前支持:Python2.7/3.5/3.6/3.7及其对应的pip, 请重新输入,或使用ctrl + c退出"
echo
clearMacPythonEnv
return 1
fi
fi
fi
}
function checkMacPythonVersion(){
while true
do
read -n1 -p "Step 3. 选择Python版本,请按回车键继续..."
echo
yellow " 2. 使用python 2.x"
yellow " 3. 使用python 3.x"
read -p " => 请输入数字2或3。如输入其他字符或直接回车,将会默认使用【Python 2 】。请在这里输入并回车:" python_V
if [[ "$python_V" == "" ]];then
python_V="2"
fi
if [[ "$python_V" == "2" ]];then
initCheckMacPython2
if [[ "$?" == "0" ]];then
checkMacPip
if [[ "$?" == "0" ]];then
return 0
else
:
fi
else
:
fi
elif [[ "$python_V" == "3" ]];then
initCheckMacPython3
if [[ "$?" == "0" ]];then
checkMacPip
if [[ "$?" == "0" ]];then
return 0
else else
echo "未找到可用的pip或pip3。PaddlePaddle目前支持:Python2.7/3.5/3.6/3.7及其对应的pip, 请重新输入,或使用ctrl + c退出" :
fi
else
:
fi fi
else else
echo "输入错误,请重新输入" red "输入错误,请重新输入"
fi fi
done done
} }
function checkMacAVX(){ function checkMacAVX(){
read -n1 -p "Step 4. 检测您的Mac是否支持AVX指令集,请按回车键继续..." read -n1 -p "Step 4. 检测您的Mac是否支持AVX指令集,请按回车键继续..."
echo
if [[ $AVX != "" ]];then if [[ $AVX != "" ]];then
AVX="avx" AVX="avx"
echo "检测结果:支持" echo ""
green " 检测结果:支持"
echo ""
return 0
else else
read -n1 -p "检测结果:不支持。非常抱歉,PaddlePaddle在Mac系统暂不提供no_avx类型的安装包,您可以选择在Linux系统中安装no_avx版的PaddlePaddle, 请按回车键退出..." red " 检测结果:不支持。非常抱歉,PaddlePaddle在Mac系统暂不提供no_avx类型的安装包,您可以选择在Linux系统中安装no_avx版的PaddlePaddle, 请按回车键退出..."
exit
fi
echo echo
return 1
fi
} }
function checkMacGPU(){ function checkMacGPU(){
read -n1 -p "Step 5. 选择CPU/GPU版本,请按回车键继续..." read -n1 -p "Step 5. 选择CPU/GPU版本,请按回车键继续..."
echo echo
if [[ $GPU != "" ]];then if [[ $GPU != "" ]];then
echo "MacOS环境下,暂未提供GPU版本的PaddlePaddle安装包,将为您安装CPU版本的PaddlePaddle" yellow " MacOS环境下,暂未提供GPU版本的PaddlePaddle安装包,将为您安装CPU版本的PaddlePaddle"
else else
echo "MacOS环境下,暂未提供GPU版本的PaddlePaddle安装包,将为您安装CPU版本的PaddlePaddle" yellow " MacOS环境下,暂未提供GPU版本的PaddlePaddle安装包,将为您安装CPU版本的PaddlePaddle"
GPU=cpu GPU=cpu
fi fi
echo echo
...@@ -822,38 +1017,44 @@ function macos() { ...@@ -822,38 +1017,44 @@ function macos() {
while true while true
do do
checkMacPaddleVersion checkMacPaddleVersion
checkMacPythonVersion checkMacPythonVersion
checkMacAVX checkMacAVX
checkMacGPU checkMacGPU
echo "*********************2. 开始安装*****************************" green "*********************2. 开始安装*****************************"
echo echo
read -n1 -p "即将为您下载并安装PaddlePaddle,请按回车键继续..." yellow "即将为您下载并安装PaddlePaddle,请按回车键继续..."
read -n1 -p ""
echo echo
if [[ $paddle_version == "2" ]];then if [[ $paddle_version == "2" ]];then
$python_root -m pip install paddlepaddle $python_root -m pip install paddlepaddle
if [ $? == "0" ];then if [[ $? == "0" ]];then
echo "安装成功,可以使用: ${python_root} 来启动安装了PaddlePaddle的Python解释器" green "安装成功,可以使用: ${python_root} 来启动安装了PaddlePaddle的Python解释器"
break break
else else
rm $whl_cpu_release rm $whl_cpu_release
echo "未能正常安装PaddlePaddle,请尝试更换您输入的python路径,或者ctrl + c退出后请检查您使用的python对应的pip或pip源是否可用" red "未能正常安装PaddlePaddle,请尝试更换您输入的python路径,或者ctrl + c退出后请检查您使用的python对应的pip或pip源是否可用"
echo"" echo""
echo "==========================================================================================" echo "=========================================================================================="
echo"" echo""
exit 1 exit 1
fi fi
else else
if [ -f $whl_cpu_develop ];then if [[ -f $whl_cpu_develop ]];then
$python_root -m pip install $whl_cpu_develop $python_root -m pip install $whl_cpu_develop
if [ $? == "0" ];then if [[ $? == "0" ]];then
rm -rf $whl_cpu_develop rm -rf $whl_cpu_develop
echo "安装成功!小提示:可以使用: ${python_root} 来启动安装了PaddlePaddle的Python解释器" # TODO add install success check here
green "安装成功!小提示:可以使用: ${python_root} 来启动安装了PaddlePaddle的Python解释器"
break break
else else
echo "未能正常安装PaddlePaddle,请尝试更换您输入的python路径,或者ctrl + c退出后请检查您使用的python对应的pip或pip源是否可用" red "未能正常安装PaddlePaddle,请尝试更换您输入的python路径,或者ctrl + c退出后请检查您使用的python对应的pip或pip源是否可用"
echo"" echo""
echo "==========================================================================================" echo "=========================================================================================="
echo"" echo""
...@@ -861,15 +1062,15 @@ function macos() { ...@@ -861,15 +1062,15 @@ function macos() {
fi fi
else else
wget ${path}$whl_cpu_develop -O $whl_cpu_develop wget ${path}$whl_cpu_develop -O $whl_cpu_develop
if [ $? == "0" ];then if [[ $? == "0" ]];then
$python_root -m pip install $whl_cpu_develop $python_root -m pip install $whl_cpu_develop
if [ $? == "0" ];then if [[ $? == "0" ]];then
rm $wheel_cpu_develop rm $wheel_cpu_develop
echo "安装成功,可以使用: ${python_root} 来启动安装了PaddlePaddle的Python解释器" green "安装成功,可以使用: ${python_root} 来启动安装了PaddlePaddle的Python解释器"
break break
else else
rm $whl_cpu_release rm $whl_cpu_release
echo "未能正常安装PaddlePaddle,请尝试更换您输入的python路径,或者ctrl + c退出后请检查您使用的python对应的pip或pip源是否可用" red "未能正常安装PaddlePaddle,请尝试更换您输入的python路径,或者ctrl + c退出后请检查您使用的python对应的pip或pip源是否可用"
echo"" echo""
echo "==========================================================================================" echo "=========================================================================================="
echo"" echo""
...@@ -877,7 +1078,7 @@ function macos() { ...@@ -877,7 +1078,7 @@ function macos() {
fi fi
else else
rm $whl_cpu_develop rm $whl_cpu_develop
echo "未能正常安装PaddlePaddle,请检查您的网络 或者确认您是否安装有 wget,或者ctrl + c退出后反馈至https://github.com/PaddlePaddle/Paddle/issues" red "未能正常安装PaddlePaddle,请检查您的网络 或者确认您是否安装有 wget,或者ctrl + c退出后反馈至https://github.com/PaddlePaddle/Paddle/issues"
echo"" echo""
echo "==========================================================================================" echo "=========================================================================================="
echo"" echo""
...@@ -890,33 +1091,35 @@ function macos() { ...@@ -890,33 +1091,35 @@ function macos() {
function main() { function main() {
echo "*********************************" echo "*********************************"
echo "欢迎使用PaddlePaddle快速安装脚本" green "欢迎使用PaddlePaddle快速安装脚本"
echo "*********************************" echo "*********************************"
echo echo
echo "如果您在安装过程中遇到任何问题,请在https://github.com/PaddlePaddle/Paddle/issues反馈,我们的工作人员将会帮您答疑解惑" yellow "如果您在安装过程中遇到任何问题,请在https://github.com/PaddlePaddle/Paddle/issues反馈,我们的工作人员将会帮您答疑解惑"
echo echo
echo "本安装包将帮助您在Linux或Mac系统下安装PaddlePaddle,包括 1)安装前的准备和 2)开始安装 两部分" echo "本安装包将帮助您在Linux或Mac系统下安装PaddlePaddle,包括"
yellow "1)安装前的准备"
yellow "2)开始安装"
echo echo
read -n1 -p "请按回车键进行下一步..." read -n1 -p "请按回车键进行下一步..."
echo echo
echo echo
echo "*********************1. 安装前的准备*****************************" green "*********************1. 安装前的准备*****************************"
echo echo
echo "Step 1. 正在检测您的操作系统信息..." echo "Step 1. 正在检测您的操作系统信息..."
echo echo
SYSTEM=`uname -s` SYSTEM=`uname -s`
if [ "$SYSTEM" == "Darwin" ];then if [[ "$SYSTEM" == "Darwin" ]];then
echo "您的系统为:MAC OSX" yellow " 您的系统为:MAC OSX"
echo echo
macos macos
else else
echo "您的系统为:Linux" yellow " 您的系统为:Linux"
echo echo
OS=`cat /etc/issue|awk 'NR==1 {print $1}'` OS=`cat /etc/issue|awk 'NR==1 {print $1}'`
if [ $OS == "\S" ] || [ "$OS" == "CentOS" ] || [ $OS == "Ubuntu" ];then if [[ $OS == "\S" ]] || [[ "$OS" == "CentOS" ]] || [[ $OS == "Ubuntu" ]];then
linux linux
else else
echo "您的系统不在本安装包的支持范围,如您需要在windows环境下安装PaddlePaddle,请您参考PaddlePaddle官网的windows安装文档" red "您的系统不在本安装包的支持范围,如您需要在windows环境下安装PaddlePaddle,请您参考PaddlePaddle官网的windows安装文档"
fi fi
fi fi
} }
......
...@@ -87,7 +87,6 @@ function cmake_gen() { ...@@ -87,7 +87,6 @@ function cmake_gen() {
PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.5/bin/python3 PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.5/bin/python3
-DPYTHON_INCLUDE_DIR:PATH=/Library/Frameworks/Python.framework/Versions/3.5/include/python3.5m/ -DPYTHON_INCLUDE_DIR:PATH=/Library/Frameworks/Python.framework/Versions/3.5/include/python3.5m/
-DPYTHON_LIBRARY:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.5/lib/libpython3.5m.dylib" -DPYTHON_LIBRARY:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.5/lib/libpython3.5m.dylib"
WITH_FLUID_ONLY=${WITH_FLUID_ONLY:-ON}
pip3.5 uninstall -y protobuf pip3.5 uninstall -y protobuf
pip3.5 install --user -r ${PADDLE_ROOT}/python/requirements.txt pip3.5 install --user -r ${PADDLE_ROOT}/python/requirements.txt
else else
...@@ -101,7 +100,6 @@ function cmake_gen() { ...@@ -101,7 +100,6 @@ function cmake_gen() {
PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.6/bin/python3 PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.6/bin/python3
-DPYTHON_INCLUDE_DIR:PATH=/Library/Frameworks/Python.framework/Versions/3.6/include/python3.6m/ -DPYTHON_INCLUDE_DIR:PATH=/Library/Frameworks/Python.framework/Versions/3.6/include/python3.6m/
-DPYTHON_LIBRARY:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.6/lib/libpython3.6m.dylib" -DPYTHON_LIBRARY:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.6/lib/libpython3.6m.dylib"
WITH_FLUID_ONLY=${WITH_FLUID_ONLY:-ON}
pip3.6 uninstall -y protobuf pip3.6 uninstall -y protobuf
pip3.6 install --user -r ${PADDLE_ROOT}/python/requirements.txt pip3.6 install --user -r ${PADDLE_ROOT}/python/requirements.txt
else else
...@@ -115,7 +113,6 @@ function cmake_gen() { ...@@ -115,7 +113,6 @@ function cmake_gen() {
PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.7/bin/python3 PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.7/bin/python3
-DPYTHON_INCLUDE_DIR:PATH=/Library/Frameworks/Python.framework/Versions/3.7/include/python3.7m/ -DPYTHON_INCLUDE_DIR:PATH=/Library/Frameworks/Python.framework/Versions/3.7/include/python3.7m/
-DPYTHON_LIBRARY:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.7/lib/libpython3.7m.dylib" -DPYTHON_LIBRARY:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.7/lib/libpython3.7m.dylib"
WITH_FLUID_ONLY=${WITH_FLUID_ONLY:-ON}
pip3.7 uninstall -y protobuf pip3.7 uninstall -y protobuf
pip3.7 install --user -r ${PADDLE_ROOT}/python/requirements.txt pip3.7 install --user -r ${PADDLE_ROOT}/python/requirements.txt
else else
...@@ -202,7 +199,6 @@ function cmake_gen() { ...@@ -202,7 +199,6 @@ function cmake_gen() {
-DWITH_TESTING=${WITH_TESTING:-ON} -DWITH_TESTING=${WITH_TESTING:-ON}
-DCMAKE_MODULE_PATH=/opt/rocm/hip/cmake -DCMAKE_MODULE_PATH=/opt/rocm/hip/cmake
-DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
-DWITH_FLUID_ONLY=${WITH_FLUID_ONLY:-OFF}
-DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
-DWITH_CONTRIB=${WITH_CONTRIB:-ON} -DWITH_CONTRIB=${WITH_CONTRIB:-ON}
-DWITH_INFERENCE_API_TEST=${WITH_INFERENCE_API_TEST:-ON} -DWITH_INFERENCE_API_TEST=${WITH_INFERENCE_API_TEST:-ON}
...@@ -235,7 +231,6 @@ EOF ...@@ -235,7 +231,6 @@ EOF
-DCUDNN_ROOT=/usr/ \ -DCUDNN_ROOT=/usr/ \
-DWITH_TESTING=${WITH_TESTING:-ON} \ -DWITH_TESTING=${WITH_TESTING:-ON} \
-DCMAKE_MODULE_PATH=/opt/rocm/hip/cmake \ -DCMAKE_MODULE_PATH=/opt/rocm/hip/cmake \
-DWITH_FLUID_ONLY=${WITH_FLUID_ONLY:-OFF} \
-DCMAKE_EXPORT_COMPILE_COMMANDS=ON \ -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \
-DWITH_CONTRIB=${WITH_CONTRIB:-ON} \ -DWITH_CONTRIB=${WITH_CONTRIB:-ON} \
-DWITH_INFERENCE_API_TEST=${WITH_INFERENCE_API_TEST:-ON} \ -DWITH_INFERENCE_API_TEST=${WITH_INFERENCE_API_TEST:-ON} \
...@@ -398,9 +393,7 @@ EOF ...@@ -398,9 +393,7 @@ EOF
pip3.7 install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl pip3.7 install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl
fi fi
if [[ ${WITH_FLUID_ONLY:-OFF} == "OFF" ]] ; then
paddle version paddle version
fi
if [ "$1" == "cp27-cp27m" ]; then if [ "$1" == "cp27-cp27m" ]; then
pip uninstall -y paddlepaddle pip uninstall -y paddlepaddle
...@@ -555,7 +548,6 @@ EOF ...@@ -555,7 +548,6 @@ EOF
-DCMAKE_BUILD_TYPE=Release \ -DCMAKE_BUILD_TYPE=Release \
-DWITH_GPU=OFF \ -DWITH_GPU=OFF \
-DWITH_MKL=OFF \ -DWITH_MKL=OFF \
-DWITH_FLUID_ONLY=ON
local LIB_TYPE=$1 local LIB_TYPE=$1
case $LIB_TYPE in case $LIB_TYPE in
...@@ -631,13 +623,8 @@ EOF ...@@ -631,13 +623,8 @@ EOF
NCCL_DEPS="true" NCCL_DEPS="true"
fi fi
if [[ ${WITH_FLUID_ONLY:-OFF} == "OFF" ]]; then
PADDLE_VERSION="paddle version" PADDLE_VERSION="paddle version"
CMD='"paddle", "version"' CMD='"paddle", "version"'
else
PADDLE_VERSION="true"
CMD='"true"'
fi
if [ "$1" == "cp35-cp35m" ]; then if [ "$1" == "cp35-cp35m" ]; then
cat >> ${PADDLE_ROOT}/build/Dockerfile <<EOF cat >> ${PADDLE_ROOT}/build/Dockerfile <<EOF
...@@ -722,12 +709,6 @@ EOF ...@@ -722,12 +709,6 @@ EOF
EOF EOF
fi fi
if [[ ${WITH_GOLANG:-OFF} == "ON" ]]; then
cat >> ${PADDLE_ROOT}/build/Dockerfile <<EOF
ADD go/cmd/pserver/pserver /usr/bin/
ADD go/cmd/master/master /usr/bin/
EOF
fi
cat >> ${PADDLE_ROOT}/build/Dockerfile <<EOF cat >> ${PADDLE_ROOT}/build/Dockerfile <<EOF
# default command shows the paddle version and exit # default command shows the paddle version and exit
CMD [${CMD}] CMD [${CMD}]
......
...@@ -26,7 +26,6 @@ function start_build_docker() { ...@@ -26,7 +26,6 @@ function start_build_docker() {
-e WITH_GPU=ON \ -e WITH_GPU=ON \
-e CUDA_ARCH_NAME=Auto \ -e CUDA_ARCH_NAME=Auto \
-e WITH_AVX=ON \ -e WITH_AVX=ON \
-e WITH_GOLANG=OFF \
-e WITH_TESTING=ON \ -e WITH_TESTING=ON \
-e WITH_COVERAGE=ON \ -e WITH_COVERAGE=ON \
-e COVERALLS_UPLOAD=ON \ -e COVERALLS_UPLOAD=ON \
...@@ -35,7 +34,6 @@ function start_build_docker() { ...@@ -35,7 +34,6 @@ function start_build_docker() {
-e PADDLE_FRACTION_GPU_MEMORY_TO_USE=0.15 \ -e PADDLE_FRACTION_GPU_MEMORY_TO_USE=0.15 \
-e CUDA_VISIBLE_DEVICES=0,1 \ -e CUDA_VISIBLE_DEVICES=0,1 \
-e WITH_DISTRIBUTE=ON \ -e WITH_DISTRIBUTE=ON \
-e WITH_FLUID_ONLY=ON \
-e RUN_TEST=ON -e RUN_TEST=ON
EOL EOL
) )
......
...@@ -6,10 +6,7 @@ function version(){ ...@@ -6,10 +6,7 @@ function version(){
echo " with_gpu: @WITH_GPU@" echo " with_gpu: @WITH_GPU@"
echo " with_mkl: @WITH_MKL@" echo " with_mkl: @WITH_MKL@"
echo " with_mkldnn: @WITH_MKLDNN@" echo " with_mkldnn: @WITH_MKLDNN@"
echo " with_double: @WITH_DOUBLE@"
echo " with_python: @WITH_PYTHON@" echo " with_python: @WITH_PYTHON@"
echo " with_rdma: @WITH_RDMA@"
echo " with_timer: @WITH_TIMER@"
} }
function ver2num() { function ver2num() {
......
...@@ -4,18 +4,6 @@ set(PY_FILES paddle/__init__.py ...@@ -4,18 +4,6 @@ set(PY_FILES paddle/__init__.py
${UTILS_PY_FILES} ${UTILS_PY_FILES}
${FLUID_PY_FILES}) ${FLUID_PY_FILES})
set(MKL_SHARED_LIBS "")
set(MKL_DEPENDS "")
if(WITH_MKLML)
list(APPEND MKL_SHARED_LIBS ${MKLML_LIB} ${MKLML_IOMP_LIB})
list(APPEND MKL_DEPENDS mklml)
endif()
if(WITH_MKLDNN)
list(APPEND MKL_SHARED_LIBS "${MKLDNN_SHARED_LIB}")
list(APPEND MKL_DEPENDS mkldnn mkldnn_shared_lib)
endif()
if(WITH_GPU) if(WITH_GPU)
SET(PACKAGE_NAME "paddlepaddle-gpu") SET(PACKAGE_NAME "paddlepaddle-gpu")
else() else()
...@@ -42,7 +30,7 @@ IF(WIN32) ...@@ -42,7 +30,7 @@ IF(WIN32)
COMMAND ${CMAKE_COMMAND} -E env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel COMMAND ${CMAKE_COMMAND} -E env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel
COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_PYTHON_BUILD_DIR}/.timestamp COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_PYTHON_BUILD_DIR}/.timestamp
COMMAND ${CMAKE_COMMAND} -E remove_directory ${PADDLE_PYTHON_BUILD_DIR}/lib-python COMMAND ${CMAKE_COMMAND} -E remove_directory ${PADDLE_PYTHON_BUILD_DIR}/lib-python
DEPENDS copy_paddle_pybind ${FLUID_CORE} framework_py_proto profiler_py_proto ${PY_FILES} ${external_project_dependencies} ${COPY_PADDLE_MASTER}) DEPENDS copy_paddle_pybind ${FLUID_CORE} framework_py_proto profiler_py_proto ${PY_FILES})
ELSE(WIN32) ELSE(WIN32)
add_custom_command(OUTPUT ${PADDLE_PYTHON_BUILD_DIR}/.timestamp add_custom_command(OUTPUT ${PADDLE_PYTHON_BUILD_DIR}/.timestamp
COMMAND touch stub.cc COMMAND touch stub.cc
...@@ -51,11 +39,10 @@ ELSE(WIN32) ...@@ -51,11 +39,10 @@ ELSE(WIN32)
COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_PYTHON_BUILD_DIR}/.timestamp COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_PYTHON_BUILD_DIR}/.timestamp
COMMAND ${CMAKE_COMMAND} -E remove_directory ${PADDLE_PYTHON_BUILD_DIR}/lib-python COMMAND ${CMAKE_COMMAND} -E remove_directory ${PADDLE_PYTHON_BUILD_DIR}/lib-python
COMMAND ${CMAKE_COMMAND} -E copy_directory ${PADDLE_PYTHON_BUILD_DIR}/lib* ${PADDLE_PYTHON_BUILD_DIR}/lib-python COMMAND ${CMAKE_COMMAND} -E copy_directory ${PADDLE_PYTHON_BUILD_DIR}/lib* ${PADDLE_PYTHON_BUILD_DIR}/lib-python
DEPENDS copy_paddle_pybind ${FLUID_CORE} framework_py_proto profiler_py_proto ${PY_FILES} ${external_project_dependencies} ${COPY_PADDLE_MASTER}) DEPENDS copy_paddle_pybind ${FLUID_CORE} framework_py_proto profiler_py_proto ${PY_FILES})
ENDIF() ENDIF()
set(paddle_python_deps ${PADDLE_PYTHON_BUILD_DIR}/.timestamp ${MKL_DEPENDS} ${external_project_dependencies}) add_custom_target(paddle_python ALL DEPENDS ${PADDLE_PYTHON_BUILD_DIR}/.timestamp)
add_custom_target(paddle_python ALL DEPENDS ${paddle_python_deps})
set(PADDLE_PYTHON_PACKAGE_DIR ${CMAKE_CURRENT_BINARY_DIR}/dist/) set(PADDLE_PYTHON_PACKAGE_DIR ${CMAKE_CURRENT_BINARY_DIR}/dist/)
......
...@@ -131,7 +131,8 @@ def __bootstrap__(): ...@@ -131,7 +131,8 @@ def __bootstrap__():
'eager_delete_tensor_gb', 'fast_eager_deletion_mode', 'eager_delete_tensor_gb', 'fast_eager_deletion_mode',
'allocator_strategy', 'reader_queue_speed_test_mode', 'allocator_strategy', 'reader_queue_speed_test_mode',
'print_sub_graph_dir', 'pe_profile_fname', 'warpctc_dir', 'print_sub_graph_dir', 'pe_profile_fname', 'warpctc_dir',
'inner_op_parallelism', 'enable_parallel_graph' 'inner_op_parallelism', 'enable_parallel_graph',
'multiple_of_cupti_buffer_size'
] ]
if 'Darwin' not in sysstr: if 'Darwin' not in sysstr:
read_env_flags.append('use_pinned_memory') read_env_flags.append('use_pinned_memory')
......
...@@ -178,9 +178,9 @@ class CompiledProgram(object): ...@@ -178,9 +178,9 @@ class CompiledProgram(object):
# FIXME(dzhwinter): enable_inplace should be after memory_optimize # FIXME(dzhwinter): enable_inplace should be after memory_optimize
# if turn on python memory optimize, turn off the inplace_pass. # if turn on python memory optimize, turn off the inplace_pass.
if self._build_strategy.memory_optimize is None: if self._build_strategy.memory_optimize is None:
self._build_strategy.memory_optimize = False if main._is_mem_optimized else True self._build_strategy.memory_optimize = False if self._program._is_mem_optimized else True
if self._build_strategy.enable_inplace is None: if self._build_strategy.enable_inplace is None:
self._build_strategy.enable_inplace = False if main._is_mem_optimized else True self._build_strategy.enable_inplace = False if self._program._is_mem_optimized else True
if self._build_strategy.num_trainers > 1 and trainers_endpoints: if self._build_strategy.num_trainers > 1 and trainers_endpoints:
assert self._build_strategy.num_trainers == len( assert self._build_strategy.num_trainers == len(
...@@ -220,7 +220,7 @@ class CompiledProgram(object): ...@@ -220,7 +220,7 @@ class CompiledProgram(object):
if self._compiled: if self._compiled:
if scope and self._scope != scope: if scope and self._scope != scope:
raise ValueError("Cannot compile with different scope") raise ValueError("Cannot compile with different scope")
if place and self._place != place: if place and not self._place._equals(place):
raise ValueError("Cannot compile with different place") raise ValueError("Cannot compile with different place")
return self return self
self._compiled = True self._compiled = True
......
...@@ -766,7 +766,10 @@ def _load_distributed_persistables(executor, dirname, main_program=None): ...@@ -766,7 +766,10 @@ def _load_distributed_persistables(executor, dirname, main_program=None):
dtype=slice_var.dtype, dtype=slice_var.dtype,
persistable=True) persistable=True)
dim1_flatten = 1
if len(slice.shape) >= 2:
dim1_flatten = reduce(lambda x, y: x * y, slice.shape[1:]) dim1_flatten = reduce(lambda x, y: x * y, slice.shape[1:])
start = int(offset / dim1_flatten) start = int(offset / dim1_flatten)
end = int(offset / dim1_flatten + slice.shape[0]) end = int(offset / dim1_flatten + slice.shape[0])
......
...@@ -8744,16 +8744,17 @@ def slice(input, axes, starts, ends): ...@@ -8744,16 +8744,17 @@ def slice(input, axes, starts, ends):
return out return out
@templatedoc()
def shape(input): def shape(input):
""" """
${comment} **Shape Layer**
Get the shape of the input.
Args: Args:
input (Variable): ${input_comment} input (Variable): The input variable.
Returns: Returns:
out (Variable): ${out_comment} Variable: The shape of the input variable.
Examples: Examples:
.. code-block:: python .. code-block:: python
......
...@@ -649,6 +649,7 @@ class AdagradOptimizer(Optimizer): ...@@ -649,6 +649,7 @@ class AdagradOptimizer(Optimizer):
regularization: A Regularizer, such as regularization: A Regularizer, such as
fluid.regularizer.L2DecayRegularizer. fluid.regularizer.L2DecayRegularizer.
name: A optional name prefix. name: A optional name prefix.
initial_accumulator_value (float): Initial value for moment accumulator.
Examples: Examples:
.. code-block:: python .. code-block:: python
...@@ -662,7 +663,8 @@ class AdagradOptimizer(Optimizer): ...@@ -662,7 +663,8 @@ class AdagradOptimizer(Optimizer):
learning_rate, learning_rate,
epsilon=1.0e-6, epsilon=1.0e-6,
regularization=None, regularization=None,
name=None): name=None,
initial_accumulator_value=0.0):
assert learning_rate is not None assert learning_rate is not None
assert epsilon is not None assert epsilon is not None
super(AdagradOptimizer, self).__init__( super(AdagradOptimizer, self).__init__(
...@@ -671,6 +673,7 @@ class AdagradOptimizer(Optimizer): ...@@ -671,6 +673,7 @@ class AdagradOptimizer(Optimizer):
name=name) name=name)
self.type = "adagrad" self.type = "adagrad"
self._epsilon = epsilon self._epsilon = epsilon
self.initial_accumulator_value = initial_accumulator_value
def _create_accumulators(self, block, parameters): def _create_accumulators(self, block, parameters):
assert isinstance(block, framework.Block) assert isinstance(block, framework.Block)
...@@ -683,6 +686,16 @@ class AdagradOptimizer(Optimizer): ...@@ -683,6 +686,16 @@ class AdagradOptimizer(Optimizer):
moment_acc = self._get_accumulator(self._moment_acc_str, moment_acc = self._get_accumulator(self._moment_acc_str,
param_and_grad[0]) param_and_grad[0])
startup_block = framework.default_startup_program().global_block()
startup_block.append_op(
type='fill_constant',
inputs={},
outputs={'Out': [moment_acc]},
attrs={
'dtype': moment_acc.dtype,
'value': self.initial_accumulator_value,
'shape': moment_acc.shape,
})
# Create the adagrad optimizer op # Create the adagrad optimizer op
adagrad_op = block.append_op( adagrad_op = block.append_op(
......
...@@ -113,13 +113,12 @@ py_test_modules(test_ir_memory_optimize_transformer MODULES test_ir_memory_optim ...@@ -113,13 +113,12 @@ py_test_modules(test_ir_memory_optimize_transformer MODULES test_ir_memory_optim
endif() endif()
if(NOT APPLE) if(NOT APPLE)
py_test_modules(test_image_classification_resnet MODULES test_image_classification_resnet SERIAL) py_test_modules(test_image_classification_resnet MODULES test_image_classification_resnet SERIAL)
if(CMAKE_BUILD_TYPE STREQUAL "Debug") endif()
if(CMAKE_BUILD_TYPE STREQUAL "Debug")
# change the timeout from 600 to 1200, because in debug mode, this test need more time. # change the timeout from 600 to 1200, because in debug mode, this test need more time.
set_tests_properties(test_image_classification_resnet PROPERTIES TIMEOUT 1200) set_tests_properties(test_parallel_executor_seresnext PROPERTIES TIMEOUT 1200)
endif()
endif() endif()
if (WITH_NGRAPH) if (WITH_NGRAPH)
add_subdirectory(ngraph) add_subdirectory(ngraph)
endif() endif()
......
...@@ -18,8 +18,8 @@ import unittest ...@@ -18,8 +18,8 @@ import unittest
import numpy as np import numpy as np
import paddle.fluid.core as core import paddle.fluid.core as core
from paddle.fluid.tests.unittests.op_test import OpTest from paddle.fluid.tests.unittests.op_test import OpTest
from scipy.special import expit
from paddle.fluid.tests.unittests.test_activation_op import TestRelu, TestTanh, TestSqrt, TestAbs from paddle.fluid.tests.unittests.test_activation_op import TestRelu, TestTanh, TestSqrt, TestAbs
import paddle.fluid as fluid
class TestMKLDNNReluDim2(TestRelu): class TestMKLDNNReluDim2(TestRelu):
...@@ -97,5 +97,64 @@ class TestMKLDNNAbsDim4(TestAbs): ...@@ -97,5 +97,64 @@ class TestMKLDNNAbsDim4(TestAbs):
self.attrs = {"use_mkldnn": True} self.attrs = {"use_mkldnn": True}
# Check if primitives already exist in backward
class TestMKLDNNReluPrimitivesAlreadyExist(unittest.TestCase):
    """Run an MKL-DNN op together with its gradient op several times.

    Despite the class name, the op that is built is ``abs`` (with
    ``use_mkldnn=True``), and the expected gradient is the analytic
    gradient of ``abs``.  The program is executed twice in a row and the
    fetched ``x@GRAD`` of the last run is compared against the reference.
    """

    def __assert_close(self, tensor, np_array, msg, atol=1e-4):
        """Fail with ``msg`` unless both arrays match within ``atol``."""
        self.assertTrue(np.allclose(np.array(tensor), np_array, atol=atol), msg)

    def test_check_forward_backward(self):
        """Build ``abs`` + its grad op by hand and check ``x@GRAD``."""
        place = core.CPUPlace()

        # Fixed seed so the reference data is reproducible.
        np.random.seed(123)
        x = np.random.uniform(-1, 1, [2, 2]).astype(np.float32)
        out = np.abs(x)

        out_grad = np.random.random_sample(x.shape).astype(np.float32)
        x_grad = out_grad * np.sign(x)  # Abs grad calculation

        var_dict = {'x': x, 'out': out, 'out@GRAD': out_grad, 'x@GRAD': x_grad}

        program = fluid.Program()
        with fluid.program_guard(program):
            block = program.global_block()
            # Declare one float32 variable per reference array.
            for name, value in var_dict.items():
                block.create_var(name=name, dtype='float32', shape=value.shape)

            abs_op = block.append_op(
                type="abs",
                inputs={"X": block.var('x'), },
                outputs={"Out": block.var('out')},
                attrs={"use_mkldnn": True})

            # Generate backward op_desc
            grad_op_desc_list, _ = core.get_grad_op_desc(abs_op.desc, set(), [])
            grad_op_desc = grad_op_desc_list[0]
            new_op_desc = block.desc.append_op()
            new_op_desc.copy_from(grad_op_desc)
            for var_name in grad_op_desc.output_arg_names():
                block.desc.var(var_name.encode("ascii"))
            grad_op_desc.infer_var_type(block.desc)
            grad_op_desc.infer_shape(block.desc)
            for arg in grad_op_desc.output_arg_names():
                grad_var = block.desc.find_var(arg.encode("ascii"))
                grad_var.set_dtype(core.VarDesc.VarType.FP32)

            exe = fluid.Executor(place)

            # Do at least 2 iterations
            feed = {name: var_dict[name] for name in ['x', 'out@GRAD']}
            for _ in range(2):
                fetched = exe.run(program, feed=feed, fetch_list=['x@GRAD'])

            self.__assert_close(x_grad, fetched[0], "x@GRAD")
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. # Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
# #
# Licensed under the Apache License, Version 2.0 (the "License"); # Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License. # you may not use this file except in compliance with the License.
...@@ -15,261 +15,7 @@ ...@@ -15,261 +15,7 @@
from __future__ import print_function from __future__ import print_function
import unittest import unittest
import numpy as np from paddle.fluid.tests.unittests.test_cross_entropy_op import TestCrossEntropyOp, TestCrossEntropyOp2, TestCrossEntropyOp3, TestCrossEntropyOp4, TestCrossEntropyOp5, TestCrossEntropyOp6, TestCrossEntropyOp7
import paddle.fluid.core as core
from paddle.fluid.tests.unittests.op_test import OpTest, randomize_probability
class TestCrossEntropyOp(OpTest):
    """Test cross-entropy with discrete one-hot labels.

    ``setUp`` installs defaults and then calls a fixed sequence of hook
    methods (``init_dtype_type``, ``init_attr_type``, ``init_bs_class_num``,
    ``init_x``, ``init_label``, ``get_cross_entropy``) that subclasses
    override to vary the scenario.
    """

    def setUp(self):
        """Populate op type, inputs, expected outputs and attributes."""
        self.op_type = "cross_entropy"
        self.soft_label = False
        self.ignore_index = -100
        self.dtype = np.float64
        self.batch_size = 30
        self.class_num = 10
        self._cpu_only = True

        # Hooks run in this order; later hooks read what earlier ones set.
        self.init_dtype_type()
        self.init_attr_type()
        self.init_bs_class_num()
        self.init_x()
        self.init_label()
        self.get_cross_entropy()

        self.inputs = {"X": self.x, "Label": self.label}
        self.outputs = {"Y": self.cross_entropy}
        self.attrs = {
            "soft_label": self.soft_label,
            "ignore_index": self.ignore_index
        }

    def init_x(self):
        """Create the input ``self.x`` of size batch_size x class_num."""
        # presumably each row is a probability distribution -- confirm
        # against randomize_probability in op_test.
        self.x = randomize_probability(
            self.batch_size, self.class_num, dtype=self.dtype)

    def init_label(self):
        """Create int64 class-index labels of shape (batch_size, 1)."""
        self.label = np.random.randint(
            0, self.class_num, (self.batch_size, 1), dtype="int64")

    def get_cross_entropy(self):
        """Compute the reference ``-log(x[i][label[i]])`` per row."""
        self.cross_entropy = np.asmatrix(
            [[-np.log(self.x[i][self.label[i][0]])]
             for i in range(self.x.shape[0])],
            dtype="float64")

    def init_attr_type(self):
        """Hook: override to change ``soft_label`` / ``ignore_index``."""
        pass

    def init_dtype_type(self):
        """Hook: override to change ``self.dtype``."""
        pass

    def init_bs_class_num(self):
        """Hook: override to change ``batch_size`` / ``class_num``."""
        pass

    def test_check_output(self):
        self.check_output()

    def test_check_grad(self):
        self.check_grad(["X"], "Y", numeric_grad_delta=0.001)
class TestCrossEntropyOp2(TestCrossEntropyOp):
    """Test cross-entropy with vectorized soft labels."""

    def init_label(self):
        """Create float labels whose rows are normalised to sum to 1."""
        self.label = np.random.uniform(
            0.1, 1.0, [self.batch_size, self.class_num]).astype(self.dtype)
        self.label /= self.label.sum(axis=1, keepdims=True)

    def get_cross_entropy(self):
        """Reference: ``-sum(label * log(x))`` over the class axis."""
        self.cross_entropy = (-self.label * np.log(self.x)).sum(
            axis=1, keepdims=True).astype(self.dtype)

    def init_attr_type(self):
        self.soft_label = True

    def init_dtype_type(self):
        self.dtype = np.float32

    def init_bs_class_num(self):
        self.batch_size = 5
        self.class_num = 37

    def test_check_grad(self):
        # Uses a looser relative-error bound than the base class test.
        self.check_grad(
            ["X"], "Y", max_relative_error=0.05, numeric_grad_delta=0.001)
class TestCrossEntropyOp3(TestCrossEntropyOp):
    """Test cross-entropy with vectorized one-hot representation of labels."""

    def init_label(self):
        """Draw one class index per row and expand it to a one-hot matrix."""
        self.label_index = np.random.randint(0, self.class_num,
                                             (self.batch_size))
        self.label = np.zeros(self.x.shape).astype(self.dtype)
        self.label[np.arange(self.batch_size), self.label_index] = 1

    def get_cross_entropy(self):
        """Reference: ``-log`` of the probability at each row's hot index."""
        self.cross_entropy = np.asmatrix(
            [[-np.log(self.x[i][self.label_index[i]])]
             for i in range(self.x.shape[0])]).astype(self.dtype)

    def init_attr_type(self):
        self.soft_label = True

    def init_dtype_type(self):
        self.dtype = np.float32

    def init_bs_class_num(self):
        self.batch_size = 5
        self.class_num = 17

    def test_check_grad(self):
        # Uses a looser relative-error bound than the base class test.
        self.check_grad(
            ["X"], "Y", max_relative_error=0.05, numeric_grad_delta=0.001)
class TestCrossEntropyOp4(TestCrossEntropyOp):
    """Test high rank tensor cross-entropy with discrete one-hot labels."""

    def init_x(self):
        """Build a 2-D input, then reshape it to ``shape + [class_num]``."""
        self.shape = [10, 2, 4]
        self.ins_num = np.prod(np.array(self.shape))
        self.X_2d = randomize_probability(self.ins_num,
                                          self.class_num).astype(self.dtype)
        self.x = self.X_2d.reshape(self.shape + [self.class_num])

    def init_label(self):
        """Build int64 index labels in 2-D, then reshape to ``shape + [1]``."""
        self.label_2d = np.random.randint(
            0, self.class_num, (self.ins_num, 1), dtype="int64")
        self.label = self.label_2d.reshape(self.shape + [1])

    def get_cross_entropy(self):
        """Compute the reference on the 2-D views and reshape the result."""
        cross_entropy_2d = np.asmatrix(
            [[-np.log(self.X_2d[i][self.label_2d[i][0]])]
             for i in range(self.X_2d.shape[0])]).astype(self.dtype)
        self.cross_entropy = np.array(cross_entropy_2d).reshape(self.shape +
                                                                [1])

    def init_attr_type(self):
        self.soft_label = False

    def init_dtype_type(self):
        self.dtype = np.float64

    def init_bs_class_num(self):
        self.class_num = 10
class TestCrossEntropyOp5(TestCrossEntropyOp):
    """Test high rank tensor cross-entropy with vectorized soft labels."""

    def init_x(self):
        """Build a 2-D input, then reshape it to ``shape + [class_num]``."""
        self.shape = [4, 3]
        self.ins_num = np.prod(np.array(self.shape))
        self.X_2d = randomize_probability(self.ins_num,
                                          self.class_num).astype(self.dtype)
        self.x = self.X_2d.reshape(self.shape + [self.class_num])

    def init_label(self):
        """Build row-normalised soft labels in 2-D, then reshape like ``x``."""
        self.label_2d = np.random.uniform(
            0.1, 1.0, [self.ins_num, self.class_num]).astype(self.dtype)
        self.label_2d /= self.label_2d.sum(axis=1, keepdims=True)
        self.label = self.label_2d.reshape(self.shape + [self.class_num])

    def get_cross_entropy(self):
        """Reference: ``-sum(label * log(x))`` per row, reshaped to rank N."""
        cross_entropy_2d = (-self.label_2d * np.log(self.X_2d)).sum(
            axis=1, keepdims=True).astype(self.dtype)
        self.cross_entropy = np.array(cross_entropy_2d).reshape(self.shape +
                                                                [1])

    def init_attr_type(self):
        self.soft_label = True

    def init_dtype_type(self):
        self.dtype = np.float32

    def init_bs_class_num(self):
        self.class_num = 37

    def test_check_grad(self):
        # Uses a looser relative-error bound than the base class test.
        self.check_grad(
            ["X"], "Y", max_relative_error=0.05, numeric_grad_delta=0.001)
class TestCrossEntropyOp6(TestCrossEntropyOp):
    """Test high rank tensor cross-entropy with vectorized one-hot
    representation of labels.
    """

    def init_x(self):
        """Build a 2-D input, then reshape it to ``shape + [class_num]``."""
        self.shape = [4, 3, 2]
        self.ins_num = np.prod(np.array(self.shape))
        self.X_2d = randomize_probability(self.ins_num,
                                          self.class_num).astype(self.dtype)
        self.x = self.X_2d.reshape(self.shape + [self.class_num])

    def init_label(self):
        """Draw one index per row, one-hot encode it, reshape like ``x``."""
        self.label_index_2d = np.random.randint(
            0, self.class_num, (self.ins_num), dtype="int64")
        label_2d = np.zeros(self.X_2d.shape)
        label_2d[np.arange(self.ins_num), self.label_index_2d] = 1
        self.label = label_2d.reshape(self.shape + [self.class_num]).astype(
            self.dtype)

    def get_cross_entropy(self):
        """Reference: ``-log`` of the probability at each row's hot index."""
        cross_entropy_2d = np.asmatrix(
            [[-np.log(self.X_2d[i][self.label_index_2d[i]])]
             for i in range(self.X_2d.shape[0])])
        self.cross_entropy = np.array(cross_entropy_2d).reshape(
            self.shape + [1]).astype(self.dtype)

    def init_attr_type(self):
        self.soft_label = True

    def init_dtype_type(self):
        self.dtype = np.float32

    def init_bs_class_num(self):
        self.class_num = 17

    def test_check_grad(self):
        # Uses a looser relative-error bound than the base class test.
        self.check_grad(
            ["X"], "Y", max_relative_error=0.05, numeric_grad_delta=0.001)
class TestCrossEntropyOp7(TestCrossEntropyOp):
    """Test cross-entropy with ignore index."""

    def init_label(self):
        """Create int64 class-index labels of shape (batch_size, 1)."""
        self.label = np.random.randint(
            0, self.class_num, (self.batch_size, 1), dtype="int64")

    def get_cross_entropy(self):
        """Reference loss; rows whose label equals ``ignore_index`` get 0."""
        self.cross_entropy = np.asmatrix(
            [[-np.log(self.x[i][self.label[i][0]])]
             if self.label[i][0] != self.ignore_index else [0]
             for i in range(self.x.shape[0])]).astype(self.dtype)

    def init_attr_type(self):
        self.soft_label = False
        # 3 lies inside [0, class_num), so randomly drawn labels can hit it.
        self.ignore_index = 3

    def init_dtype_type(self):
        self.dtype = np.float64

    def init_bs_class_num(self):
        self.batch_size = 30
        self.class_num = 10
if __name__ == "__main__": if __name__ == "__main__":
unittest.main() unittest.main()
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
from paddle.fluid.tests.unittests.test_momentum_op import TestMomentumOp1, TestMomentumOp2, TestLarsMomentumOp, TestSparseMomentumOp, TestSparseMomentumOp2
# Run the imported test cases when this file is executed as a script.
if __name__ == '__main__':
    unittest.main()
...@@ -40,6 +40,8 @@ class SimpleLSTMRNN(fluid.imperative.Layer): ...@@ -40,6 +40,8 @@ class SimpleLSTMRNN(fluid.imperative.Layer):
self._dropout = dropout self._dropout = dropout
self._input = None self._input = None
self._num_steps = num_steps self._num_steps = num_steps
from paddle.fluid.layer_helper import LayerHelper
self._helper = LayerHelper('SimpleLSTMRNN', act="tanh")
def _build_once(self, input_embedding, init_hidden=None, init_cell=None): def _build_once(self, input_embedding, init_hidden=None, init_cell=None):
self.weight_1_arr = [] self.weight_1_arr = []
...@@ -50,17 +52,21 @@ class SimpleLSTMRNN(fluid.imperative.Layer): ...@@ -50,17 +52,21 @@ class SimpleLSTMRNN(fluid.imperative.Layer):
self.mask_array = [] self.mask_array = []
for i in range(self._num_layers): for i in range(self._num_layers):
weight_1 = fluid.layers.create_parameter( weight_1 = self._helper.create_parameter(
attr=fluid.ParamAttr(
initializer=fluid.initializer.UniformInitializer(
low=-self._init_scale, high=self._init_scale)),
shape=[self._hidden_size * 2, self._hidden_size * 4], shape=[self._hidden_size * 2, self._hidden_size * 4],
dtype="float32", dtype="float32",
name="fc_weight1_" + str(i),
default_initializer=fluid.initializer.UniformInitializer( default_initializer=fluid.initializer.UniformInitializer(
low=-self._init_scale, high=self._init_scale)) low=-self._init_scale, high=self._init_scale))
self.weight_1_arr.append(weight_1) self.weight_1_arr.append(weight_1)
bias_1 = fluid.layers.create_parameter( bias_1 = self._helper.create_parameter(
[self._hidden_size * 4], attr=fluid.ParamAttr(
initializer=fluid.initializer.UniformInitializer(
low=-self._init_scale, high=self._init_scale)),
shape=[self._hidden_size * 4],
dtype="float32", dtype="float32",
name="fc_bias1_" + str(i),
default_initializer=fluid.initializer.Constant(0.0)) default_initializer=fluid.initializer.Constant(0.0))
self.bias_arr.append(bias_1) self.bias_arr.append(bias_1)
...@@ -137,6 +143,8 @@ class PtbModel(fluid.imperative.Layer): ...@@ -137,6 +143,8 @@ class PtbModel(fluid.imperative.Layer):
self.num_layers = num_layers self.num_layers = num_layers
self.num_steps = num_steps self.num_steps = num_steps
self.dropout = dropout self.dropout = dropout
from paddle.fluid.layer_helper import LayerHelper
self._helper = LayerHelper('PtbModel', act="tanh")
self.simple_lstm_rnn = SimpleLSTMRNN( self.simple_lstm_rnn = SimpleLSTMRNN(
hidden_size, hidden_size,
num_steps, num_steps,
...@@ -151,16 +159,16 @@ class PtbModel(fluid.imperative.Layer): ...@@ -151,16 +159,16 @@ class PtbModel(fluid.imperative.Layer):
name='embedding_para', name='embedding_para',
initializer=fluid.initializer.UniformInitializer( initializer=fluid.initializer.UniformInitializer(
low=-init_scale, high=init_scale))) low=-init_scale, high=init_scale)))
self.softmax_weight = fluid.layers.create_parameter( self.softmax_weight = self._helper.create_parameter(
[self.hidden_size, self.vocab_size], attr=fluid.ParamAttr(),
shape=[self.hidden_size, self.vocab_size],
dtype="float32", dtype="float32",
name="softmax_weight",
default_initializer=fluid.initializer.UniformInitializer( default_initializer=fluid.initializer.UniformInitializer(
low=-self.init_scale, high=self.init_scale)) low=-self.init_scale, high=self.init_scale))
self.softmax_bias = fluid.layers.create_parameter( self.softmax_bias = self._helper.create_parameter(
[self.vocab_size], attr=fluid.ParamAttr(),
shape=[self.vocab_size],
dtype="float32", dtype="float32",
name='softmax_bias',
default_initializer=fluid.initializer.UniformInitializer( default_initializer=fluid.initializer.UniformInitializer(
low=-self.init_scale, high=self.init_scale)) low=-self.init_scale, high=self.init_scale))
...@@ -256,7 +264,6 @@ class TestImperativePtbRnn(unittest.TestCase): ...@@ -256,7 +264,6 @@ class TestImperativePtbRnn(unittest.TestCase):
with new_program_scope(): with new_program_scope():
fluid.default_startup_program().random_seed = seed fluid.default_startup_program().random_seed = seed
fluid.default_main_program().random_seed = seed fluid.default_main_program().random_seed = seed
# TODO: marsyang1993 Change seed to
ptb_model = PtbModel( ptb_model = PtbModel(
hidden_size=hidden_size, hidden_size=hidden_size,
vocab_size=vocab_size, vocab_size=vocab_size,
......
...@@ -173,13 +173,16 @@ def lod_multiclass_nms(boxes, scores, background, score_threshold, ...@@ -173,13 +173,16 @@ def lod_multiclass_nms(boxes, scores, background, score_threshold,
normalized, normalized,
shared=False) shared=False)
if nmsed_num == 0: if nmsed_num == 0:
#lod.append(1)
continue continue
lod.append(nmsed_num) lod.append(nmsed_num)
tmp_det_out = []
for c, indices in nmsed_outs.items(): for c, indices in nmsed_outs.items():
for idx in indices: for idx in indices:
xmin, ymin, xmax, ymax = box[idx, c, :] xmin, ymin, xmax, ymax = box[idx, c, :]
det_outs.append([c, score[idx][c], xmin, ymin, xmax, ymax]) tmp_det_out.append([c, score[idx][c], xmin, ymin, xmax, ymax])
sorted_det_out = sorted(
tmp_det_out, key=lambda tup: tup[0], reverse=False)
det_outs.extend(sorted_det_out)
if len(lod) == 0: if len(lod) == 0:
lod.append(1) lod.append(1)
......
...@@ -274,7 +274,7 @@ class TestAdagradOptimizer(unittest.TestCase): ...@@ -274,7 +274,7 @@ class TestAdagradOptimizer(unittest.TestCase):
# Check init_program # Check init_program
init_ops = init_program.global_block().ops init_ops = init_program.global_block().ops
self.assertEqual(len(init_ops), 2) self.assertEqual(len(init_ops), 3)
self.assertEqual(init_ops[0].type, "fill_constant") self.assertEqual(init_ops[0].type, "fill_constant")
self.assertAlmostEqual(init_ops[0].attr('value'), learning_rate) self.assertAlmostEqual(init_ops[0].attr('value'), learning_rate)
self.assertEqual(init_ops[1].type, "fill_constant") self.assertEqual(init_ops[1].type, "fill_constant")
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
import os
os.environ['FLAGS_enable_parallel_graph'] = str(1)
import paddle.fluid.core as core
import os
import paddle.fluid as fluid
from parallel_executor_test_base import TestParallelExecutorBase
def simple_fc_net(use_feed):
    """Build a small fully-connected classifier and return its mean loss.

    The network takes a 784-wide float32 ``image`` input and an int64
    ``label`` input, stacks four 200-unit ``tanh`` layers whose biases are
    initialised to 1.0, and ends in a 10-way softmax trained with
    cross-entropy.

    Args:
        use_feed: Not read by this function.

    Returns:
        The variable holding the mean cross-entropy loss.
    """
    img = fluid.layers.data(name='image', shape=[784], dtype='float32')
    label = fluid.layers.data(name='label', shape=[1], dtype='int64')

    hidden = img
    for _ in range(4):
        hidden = fluid.layers.fc(
            hidden,
            size=200,
            act='tanh',
            bias_attr=fluid.ParamAttr(
                initializer=fluid.initializer.Constant(value=1.0)))

    prediction = fluid.layers.fc(hidden, size=10, act='softmax')
    loss = fluid.layers.cross_entropy(input=prediction, label=label)
    loss = fluid.layers.mean(loss)
    return loss
class TestMNIST(TestParallelExecutorBase):
    """Convergence and single-vs-parallel accuracy checks on simple_fc_net."""

    @classmethod
    def setUpClass(cls):
        os.environ['CPU_NUM'] = str(4)

    def _init_data(self):
        """Return a fixed-seed (image, label) batch of 32 samples."""
        np.random.seed(5)
        img = np.random.random(size=[32, 784]).astype(np.float32)
        label = np.ones(shape=[32, 1], dtype='int64')
        return img, label

    # simple_fc
    def check_simple_fc_convergence(self, use_cuda, use_reduce=False):
        """Run the base-class convergence check on ``simple_fc_net``.

        Returns without checking anything when CUDA is requested but
        paddle was not compiled with CUDA.
        """
        if use_cuda and not core.is_compiled_with_cuda():
            return

        img, label = self._init_data()

        self.check_network_convergence(
            simple_fc_net,
            feed_dict={"image": img,
                       "label": label},
            use_cuda=use_cuda,
            use_reduce=use_reduce)

    def test_simple_fc(self):
        # use_cuda
        self.check_simple_fc_convergence(True)

    def check_simple_fc_parallel_accuracy(self, use_cuda):
        """Compare first/last losses of the single and parallel executors.

        Both runs use the same seed and the same batch; the mean of the
        parallel losses must match the single-executor loss within 1e-6.
        Returns without checking anything when CUDA is requested but
        paddle was not compiled with CUDA.
        """
        if use_cuda and not core.is_compiled_with_cuda():
            return

        img, label = self._init_data()
        feed = {"image": img, "label": label}

        single_first_loss, single_last_loss = self.check_network_convergence(
            method=simple_fc_net,
            seed=1,
            feed_dict=feed,
            use_cuda=use_cuda,
            use_parallel_executor=False)
        parallel_first_loss, parallel_last_loss = self.check_network_convergence(
            method=simple_fc_net,
            seed=1,
            feed_dict=feed,
            use_cuda=use_cuda,
            use_parallel_executor=True)

        # assertAlmostEquals is a deprecated alias that was removed in
        # Python 3.12; assertAlmostEqual is the supported spelling.
        self.assertAlmostEqual(
            np.mean(parallel_first_loss),
            single_first_loss,
            delta=1e-6, )
        self.assertAlmostEqual(
            np.mean(parallel_last_loss), single_last_loss, delta=1e-6)

    def test_simple_fc_parallel_accuracy(self):
        self.check_simple_fc_parallel_accuracy(True)
# Run the test cases when this file is executed as a script.
if __name__ == '__main__':
    unittest.main()
...@@ -16,15 +16,19 @@ from __future__ import print_function ...@@ -16,15 +16,19 @@ from __future__ import print_function
import unittest import unittest
import os import os
import tempfile
import numpy as np import numpy as np
import paddle.fluid as fluid import paddle.fluid as fluid
import paddle.fluid.profiler as profiler import paddle.fluid.profiler as profiler
import paddle.fluid.layers as layers import paddle.fluid.layers as layers
import paddle.fluid.core as core import paddle.fluid.core as core
import paddle.fluid.proto.profiler.profiler_pb2 as profiler_pb2
class TestProfiler(unittest.TestCase): class TestProfiler(unittest.TestCase):
def net_profiler(self, state, profile_path='/tmp/profile'): def net_profiler(self, state, use_parallel_executor=False):
profile_path = os.path.join(tempfile.gettempdir(), "profile")
open(profile_path, "w").write("")
startup_program = fluid.Program() startup_program = fluid.Program()
main_program = fluid.Program() main_program = fluid.Program()
...@@ -60,6 +64,11 @@ class TestProfiler(unittest.TestCase): ...@@ -60,6 +64,11 @@ class TestProfiler(unittest.TestCase):
place = fluid.CPUPlace() if state == 'CPU' else fluid.CUDAPlace(0) place = fluid.CPUPlace() if state == 'CPU' else fluid.CUDAPlace(0)
exe = fluid.Executor(place) exe = fluid.Executor(place)
exe.run(startup_program) exe.run(startup_program)
if use_parallel_executor:
pe = fluid.ParallelExecutor(
state != 'CPU',
loss_name=avg_cost.name,
main_program=main_program)
pass_acc_calculator = fluid.average.WeightedAverage() pass_acc_calculator = fluid.average.WeightedAverage()
with profiler.profiler(state, 'total', profile_path) as prof: with profiler.profiler(state, 'total', profile_path) as prof:
...@@ -69,6 +78,9 @@ class TestProfiler(unittest.TestCase): ...@@ -69,6 +78,9 @@ class TestProfiler(unittest.TestCase):
x = np.random.random((32, 784)).astype("float32") x = np.random.random((32, 784)).astype("float32")
y = np.random.randint(0, 10, (32, 1)).astype("int64") y = np.random.randint(0, 10, (32, 1)).astype("int64")
if use_parallel_executor:
pe.run(feed={'x': x, 'y': y}, fetch_list=[avg_cost.name])
continue
outs = exe.run(main_program, outs = exe.run(main_program,
feed={'x': x, feed={'x': x,
'y': y}, 'y': y},
...@@ -77,21 +89,37 @@ class TestProfiler(unittest.TestCase): ...@@ -77,21 +89,37 @@ class TestProfiler(unittest.TestCase):
b_size = np.array(outs[2]) b_size = np.array(outs[2])
pass_acc_calculator.add(value=acc, weight=b_size) pass_acc_calculator.add(value=acc, weight=b_size)
pass_acc = pass_acc_calculator.eval() pass_acc = pass_acc_calculator.eval()
data = open(profile_path, 'rb').read()
self.assertGreater(len(data), 0)
profile_pb = profiler_pb2.Profile()
profile_pb.ParseFromString(data)
self.assertGreater(len(profile_pb.events), 0)
for event in profile_pb.events:
if event.type == profiler_pb2.Event.GPUKernel:
if not event.detail_info and not event.name.startswith("MEM"):
raise Exception(
"Kernel %s missing event. Has this kernel been recorded by RecordEvent?"
% event.name)
elif event.type == profiler_pb2.Event.CPU and (
event.name.startswith("Driver API") or
event.name.startswith("Runtime API")):
print("Warning: unregister", event.name)
def test_cpu_profiler(self): def test_cpu_profiler(self):
self.net_profiler('CPU') self.net_profiler('CPU')
self.net_profiler('CPU', use_parallel_executor=True)
@unittest.skipIf(not core.is_compiled_with_cuda(), @unittest.skipIf(not core.is_compiled_with_cuda(),
"profiler is enabled only with GPU") "profiler is enabled only with GPU")
def test_cuda_profiler(self): def test_cuda_profiler(self):
self.net_profiler('GPU') self.net_profiler('GPU')
self.net_profiler('GPU', use_parallel_executor=True)
@unittest.skipIf(not core.is_compiled_with_cuda(), @unittest.skipIf(not core.is_compiled_with_cuda(),
"profiler is enabled only with GPU") "profiler is enabled only with GPU")
def test_all_profiler(self): def test_all_profiler(self):
self.net_profiler('All', '/tmp/profile_out') self.net_profiler('All')
with open('/tmp/profile_out', 'rb') as f: self.net_profiler('All', use_parallel_executor=True)
self.assertGreater(len(f.read()), 0)
if __name__ == '__main__': if __name__ == '__main__':
......
...@@ -1020,7 +1020,11 @@ class DistributeTranspiler(object): ...@@ -1020,7 +1020,11 @@ class DistributeTranspiler(object):
skip_dim0 = 0 skip_dim0 = 0
slice_vars = self.param_var_mapping[orig_var_name] slice_vars = self.param_var_mapping[orig_var_name]
orig_dim1_flatten = reduce(lambda x, y: x * y, slice_vars[0].shape[1:]) orig_dim1_flatten = 1
if len(slice_vars[0].shape) >= 2:
orig_dim1_flatten = reduce(lambda x, y: x * y,
slice_vars[0].shape[1:])
for slice_var in slice_vars[:block_idx]: for slice_var in slice_vars[:block_idx]:
skip_dim0 += slice_var.shape[0] skip_dim0 += slice_var.shape[0]
......
requests==2.9.2 requests==2.9.2
numpy>=1.12 numpy>=1.12
protobuf>=3.6 protobuf>=3.1.0
recordio>=0.1.0 recordio>=0.1.0
matplotlib==2.2.3 # TODO: let python3 paddlepaddle package use latest matplotlib matplotlib==2.2.3 # TODO: let python3 paddlepaddle package use latest matplotlib
rarfile rarfile
......
...@@ -24,3 +24,8 @@ sed 's/<baseimg>/9.0-cudnn7-devel-centos6/g' Dockerfile.x64 | \ ...@@ -24,3 +24,8 @@ sed 's/<baseimg>/9.0-cudnn7-devel-centos6/g' Dockerfile.x64 | \
sed 's/<NCCL_MAKE_OPTS>/NVCC_GENCODE="-gencode=arch=compute_35,code=sm_35 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_60,code=compute_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_62,code=sm_62 -gencode=arch=compute_70,code=sm_70"/g'> Dockerfile.tmp sed 's/<NCCL_MAKE_OPTS>/NVCC_GENCODE="-gencode=arch=compute_35,code=sm_35 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_60,code=compute_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_62,code=sm_62 -gencode=arch=compute_70,code=sm_70"/g'> Dockerfile.tmp
docker build -t ${REPO}/paddle_manylinux_devel:cuda9.0_cudnn7 -f Dockerfile.tmp . docker build -t ${REPO}/paddle_manylinux_devel:cuda9.0_cudnn7 -f Dockerfile.tmp .
docker push ${REPO}/paddle_manylinux_devel:cuda9.0_cudnn7 docker push ${REPO}/paddle_manylinux_devel:cuda9.0_cudnn7
sed 's/<baseimg>/10.0-devel-centos6/g' Dockerfile.x64 | \
sed 's/<NCCL_MAKE_OPTS>/NVCC_GENCODE="-gencode=arch=compute_35,code=sm_35 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_60,code=compute_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_62,code=sm_62 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75"/g'> Dockerfile.tmp
docker build -t ${REPO}/paddle_manylinux_devel:cuda10.0_cudnn7 -f Dockerfile.tmp .
docker push ${REPO}/paddle_manylinux_devel:cuda10.0_cudnn7
...@@ -107,11 +107,13 @@ curl-config --features ...@@ -107,11 +107,13 @@ curl-config --features
rm -rf /usr/local/ssl rm -rf /usr/local/ssl
# Install patchelf (latest with unreleased bug fixes) # Install patchelf (latest with unreleased bug fixes)
curl -sLO https://nixos.org/releases/patchelf/patchelf-0.9/patchelf-0.9.tar.gz # FIXME(typhoonzero): restore this when the link is fixed.
check_sha256sum patchelf-0.9.tar.gz $PATCHELF_HASH # curl -sLO http://nipy.bic.berkeley.edu/manylinux/patchelf-0.9njs2.tar.gz
tar -xzf patchelf-0.9.tar.gz # check_sha256sum patchelf-0.9njs2.tar.gz $PATCHELF_HASH
(cd patchelf-0.9 && ./configure && make && make install) # tar -xzf patchelf-0.9njs2.tar.gz
rm -rf patchelf-0.9.tar.gz patchelf-0.9 # (cd patchelf-0.9njs2 && ./configure && make && make install)
# rm -rf patchelf-0.9njs2.tar.gz patchelf-0.9njs2
yum install -y patchelf
# Install latest pypi release of auditwheel # Install latest pypi release of auditwheel
LD_LIBRARY_PATH="${ORIGINAL_LD_LIBRARY_PATH}:$(dirname ${PY35_BIN})/lib" $PY35_BIN/pip install auditwheel LD_LIBRARY_PATH="${ORIGINAL_LD_LIBRARY_PATH}:$(dirname ${PY35_BIN})/lib" $PY35_BIN/pip install auditwheel
......
...@@ -87,6 +87,8 @@ function do_cpython_build { ...@@ -87,6 +87,8 @@ function do_cpython_build {
# NOTE Make libpython shared library visible to python calls below # NOTE Make libpython shared library visible to python calls below
LD_LIBRARY_PATH="${prefix}/lib" ${prefix}/bin/python get-pip.py LD_LIBRARY_PATH="${prefix}/lib" ${prefix}/bin/python get-pip.py
LD_LIBRARY_PATH="${prefix}/lib" ${prefix}/bin/pip install wheel LD_LIBRARY_PATH="${prefix}/lib" ${prefix}/bin/pip install wheel
cd /
ls ${MY_DIR}
local abi_tag=$(LD_LIBRARY_PATH="${prefix}/lib" ${prefix}/bin/python ${MY_DIR}/python-tag-abi-tag.py) local abi_tag=$(LD_LIBRARY_PATH="${prefix}/lib" ${prefix}/bin/python ${MY_DIR}/python-tag-abi-tag.py)
ln -s ${prefix} /opt/python/${abi_tag} ln -s ${prefix} /opt/python/${abi_tag}
} }
......
...@@ -131,8 +131,12 @@ class Timeline(object): ...@@ -131,8 +131,12 @@ class Timeline(object):
if (k, event.device_id, "CPU") not in self._devices: if (k, event.device_id, "CPU") not in self._devices:
pid = self._allocate_pid() pid = self._allocate_pid()
self._devices[(k, event.device_id, "CPU")] = pid self._devices[(k, event.device_id, "CPU")] = pid
self._chrome_trace.emit_pid("%s:cpu:block:%d" % # -1 device id represents CUDA api call
(k, event.device_id), pid) if event.device_id == -1:
self._chrome_trace.emit_pid("%s:cuda_api" % k, pid)
else:
self._chrome_trace.emit_pid(
"%s:cpu:block:%d" % (k, event.device_id), pid)
elif event.type == profiler_pb2.Event.GPUKernel: elif event.type == profiler_pb2.Event.GPUKernel:
if (k, event.device_id, "GPUKernel") not in self._devices: if (k, event.device_id, "GPUKernel") not in self._devices:
pid = self._allocate_pid() pid = self._allocate_pid()
...@@ -150,7 +154,9 @@ class Timeline(object): ...@@ -150,7 +154,9 @@ class Timeline(object):
pid = self._devices[(k, event.device_id, type)] pid = self._devices[(k, event.device_id, type)]
args = {'name': event.name} args = {'name': event.name}
if event.memcopy.bytes > 0: if event.memcopy.bytes > 0:
args = {'mem_bytes': event.memcopy.bytes} args['mem_bytes'] = event.memcopy.bytes
if event.detail_info:
args['detail_info'] = event.detail_info
# TODO(panyx0718): Chrome tracing only handles ms. However, some # TODO(panyx0718): Chrome tracing only handles ms. However, some
# ops takes micro-seconds. Hence, we keep the ns here. # ops takes micro-seconds. Hence, we keep the ns here.
self._chrome_trace.emit_region( self._chrome_trace.emit_region(
...@@ -173,7 +179,7 @@ if args.timeline_path: ...@@ -173,7 +179,7 @@ if args.timeline_path:
profile_paths = profile_path.split(',') profile_paths = profile_path.split(',')
profile_dict = dict() profile_dict = dict()
if len(profile_paths) == 1: if len(profile_paths) == 1:
with open(profile_path, 'r') as f: with open(profile_path, 'rb') as f:
profile_s = f.read() profile_s = f.read()
profile_pb = profiler_pb2.Profile() profile_pb = profiler_pb2.Profile()
profile_pb.ParseFromString(profile_s) profile_pb.ParseFromString(profile_s)
...@@ -181,7 +187,7 @@ if len(profile_paths) == 1: ...@@ -181,7 +187,7 @@ if len(profile_paths) == 1:
else: else:
for profile_path in profile_paths: for profile_path in profile_paths:
k, v = profile_path.split('=') k, v = profile_path.split('=')
with open(v, 'r') as f: with open(v, 'rb') as f:
profile_s = f.read() profile_s = f.read()
profile_pb = profiler_pb2.Profile() profile_pb = profiler_pb2.Profile()
profile_pb.ParseFromString(profile_s) profile_pb.ParseFromString(profile_s)
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册