未验证 提交 98069d99 编写于 作者: L lujun 提交者: GitHub

Merge pull request #9 from PaddlePaddle/develop

merge to local
......@@ -44,6 +44,7 @@
| qingqing01 | Qing-Qing Dang |
| reyoung | Yang Yu |
| Sand3r- | Michal Gallus |
| sfraczek | Sylwester Fraczek |
| Superjom | Chun-Wei Yan |
| tensor-tang | Jian Tang |
| tianbingsz | Tian-Bing Xu |
......@@ -54,6 +55,7 @@
| wangyang59 | Yang Wang |
| wangzhen-nlp | Zhen Wang |
| wen-bo-yang | Wen-Bo Yang |
| wojtuss | Wojciech Uss |
| wwhu | Wei-Wei Hu |
| xinghai-sun | Xing-Hai Sun |
| Xreki | Yi-Qun Liu |
......
......@@ -54,23 +54,12 @@ option(WITH_NGRAPH "Compile PaddlePaddle with nGraph support." OFF)
option(WITH_DSO "Compile PaddlePaddle with dynamic linked CUDA" ON)
option(WITH_TESTING "Compile PaddlePaddle with unit testing" OFF)
option(WITH_PYTHON "Compile PaddlePaddle with python interpreter" ON)
option(WITH_DOUBLE "Compile PaddlePaddle with double precision" OFF)
option(WITH_RDMA "Compile PaddlePaddle with RDMA support" OFF)
option(WITH_TIMER "Compile PaddlePaddle with stats timer" OFF)
option(WITH_PROFILER "Compile PaddlePaddle with GPU profiler and gperftools" OFF)
option(WITH_JEMALLOC "Compile PaddlePaddle with jemalloc" OFF)
option(WITH_DOC "Compile PaddlePaddle with documentation" OFF)
option(WITH_COVERAGE "Compile PaddlePaddle with code coverage" OFF)
option(COVERALLS_UPLOAD "Package code coverage data to coveralls" OFF)
option(ON_TRAVIS "Exclude special unit test on Travis CI" OFF)
option(WITH_FLUID_ONLY "Compile PaddlePaddle fluid only" OFF)
option(WITH_GOLANG "Compile PaddlePaddle with GOLANG" OFF)
option(GLIDE_INSTALL "Download and install go dependencies " ON)
option(WITH_DISTRIBUTE "Compile with distributed support" OFF)
option(WITH_PSLIB "Compile with pslib support" OFF)
option(USE_EIGEN_FOR_BLAS "Use matrix multiplication in Eigen" OFF)
option(EIGEN_USE_THREADS "Compile with multi-threaded Eigen" OFF)
option(WITH_ARM_FP16 "Use half precision support on armv8.2-a cpu" OFF)
option(WITH_CONTRIB "Compile the third-party contributation" OFF)
option(REPLACE_ENFORCE_GLOG "Replace PADDLE_ENFORCE with glog/CHECK for better debug." OFF)
option(WITH_ANAKIN "Compile with Anakin library" OFF)
......@@ -105,8 +94,6 @@ endif()
if (WIN32)
set(WITH_DISTRIBUTE OFF CACHE STRING
"Disable DISTRIBUTE when compiling for Windows" FORCE)
set(WITH_FLUID_ONLY ON CACHE STRING
"Enable FLUID_ONLY when compiling for Windows" FORCE)
endif()
set(THIRD_PARTY_PATH "${CMAKE_BINARY_DIR}/third_party" CACHE STRING
......@@ -148,7 +135,6 @@ include(external/openblas) # download, build, install openblas
include(external/mkldnn) # download, build, install mkldnn
include(external/ngraph) # download, build, install nGraph
include(external/boost) # download boost
include(external/any) # download libn::any
include(external/eigen) # download eigen3
include(external/pybind11) # download pybind11
include(external/cares)
......@@ -225,7 +211,6 @@ include(generic) # simplify cmake module
include(package) # set paddle packages
include(ccache) # set ccache for compilation
include(util) # set unittest and link libs
include(rdma) # set rdma libraries
include(version) # set PADDLE_VERSION
include(coveralls) # set code coverage
include(inference_lib) # add paddle fluid inference libraries
......@@ -233,38 +218,11 @@ include(inference_lib) # add paddle fluid inference libraries
include_directories("${PADDLE_SOURCE_DIR}")
set(EXTERNAL_LIBS
gflags
glog
${CBLAS_LIBRARIES}
protobuf
zlib
${PYTHON_LIBRARIES}
)
if(WITH_PSLIB)
list(APPEND EXTERNAL_LIBS pslib)
list(APPEND EXTERNAL_LIBS pslib_brpc)
list(APPEND EXTERNAL_LIBS libmct)
endif(WITH_PSLIB)
if(WITH_AMD_GPU)
find_package(HIP)
include(hip)
endif(WITH_AMD_GPU)
if(WITH_MKLML)
list(APPEND EXTERNAL_LIBS ${MKLML_IOMP_LIB})
endif()
if(WITH_LIBXSMM)
list(APPEND EXTERNAL_LIBS ${LIBXSMM_LIBS})
endif()
if(WITH_MKLDNN)
list(APPEND EXTERNAL_LIBS ${MKLDNN_LIB})
endif()
set(PADDLE_PYTHON_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/python/build")
set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG")
......
......@@ -20,31 +20,10 @@ if(WITH_DSO)
add_definitions(-DPADDLE_USE_DSO)
endif(WITH_DSO)
if(WITH_DOUBLE)
add_definitions(-DPADDLE_TYPE_DOUBLE)
endif(WITH_DOUBLE)
if(WITH_ARM_FP16)
add_definitions(-DPADDLE_ARM_FP16)
add_definitions("-march=armv8.2-a+fp16+simd")
endif(WITH_ARM_FP16)
if(WITH_TESTING)
add_definitions(-DPADDLE_WITH_TESTING)
endif(WITH_TESTING)
if(NOT WITH_TIMER)
add_definitions(-DPADDLE_DISABLE_TIMER)
endif(NOT WITH_TIMER)
if(USE_EIGEN_FOR_BLAS)
add_definitions(-DPADDLE_USE_EIGEN_FOR_BLAS)
endif(USE_EIGEN_FOR_BLAS)
if(EIGEN_USE_THREADS)
add_definitions(-DEIGEN_USE_THREADS)
endif(EIGEN_USE_THREADS)
if(NOT WITH_PROFILER)
add_definitions(-DPADDLE_DISABLE_PROFILER)
endif(NOT WITH_PROFILER)
......@@ -78,10 +57,6 @@ if(WIN32)
endif(NOT MSVC)
endif(WIN32)
if(NOT WITH_GOLANG)
add_definitions(-DPADDLE_WITHOUT_GOLANG)
endif(NOT WITH_GOLANG)
if(WITH_PSLIB)
add_definitions(-DPADDLE_WITH_PSLIB)
endif()
......@@ -171,55 +146,6 @@ if(WITH_DISTRIBUTE)
add_definitions(-DPADDLE_WITH_DISTRIBUTE)
endif()
if(WITH_GOLANG)
# we need to symlink Paddle directory into GOPATH. If we
# don't do it and we have code that depends on Paddle, go
# get ./... will download a new Paddle repo from Github,
# without the changes in our current Paddle repo that we
# want to build.
set(GOPATH "${CMAKE_CURRENT_BINARY_DIR}/go")
file(MAKE_DIRECTORY ${GOPATH})
set(PADDLE_IN_GOPATH "${GOPATH}/src/github.com/PaddlePaddle/Paddle")
file(MAKE_DIRECTORY "${PADDLE_IN_GOPATH}")
set(PADDLE_GO_PATH "${CMAKE_SOURCE_DIR}/go")
add_custom_target(go_path)
add_custom_command(TARGET go_path
# Symlink Paddle directory into GOPATH
COMMAND mkdir -p ${PADDLE_IN_GOPATH}
COMMAND rm -rf ${PADDLE_IN_GOPATH}
COMMAND ln -sf ${CMAKE_SOURCE_DIR} ${PADDLE_IN_GOPATH}
# Automatically get all dependencies specified in the source code
# We can't run `go get -d ./...` for every target, because
# multiple `go get` can not run concurrently, but make need to be
# able to run with multiple jobs.
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}
)
if (GLIDE_INSTALL)
if(EXISTS $ENV{GOPATH}/bin/glide)
set(GLIDE "$ENV{GOPATH}/bin/glide")
else()
message(FATAL_ERROR "no glide executeble found: $ENV{GOPATH}/bin/glide")
endif()
# this command will only run when the file it depends is missing
# or has changed, or the output is missing.
add_custom_command(OUTPUT ${CMAKE_BINARY_DIR}/glide
COMMAND env GOPATH=${GOPATH} ${GLIDE} install
COMMAND touch ${CMAKE_BINARY_DIR}/glide
DEPENDS ${PADDLE_SOURCE_DIR}/go/glide.lock
WORKING_DIRECTORY "${PADDLE_IN_GOPATH}/go"
)
# depends on the custom command which outputs
# ${CMAKE_BINARY_DIR}/glide, the custom command does not need to
# run every time this target is built.
add_custom_target(go_vendor DEPENDS ${CMAKE_BINARY_DIR}/glide go_path)
endif()
endif(WITH_GOLANG)
if(WITH_GRPC)
add_definitions(-DPADDLE_WITH_GRPC)
endif(WITH_GRPC)
......
......@@ -168,10 +168,7 @@ elseif (${CUDA_VERSION} LESS 11.0) # CUDA 10.x
endif()
include_directories(${CUDA_INCLUDE_DIRS})
list(APPEND EXTERNAL_LIBS ${CUDA_LIBRARIES} ${CUDA_rt_LIBRARY})
if(NOT WITH_DSO)
# TODO(panyx0718): CUPTI only allows DSO?
list(APPEND EXTERNAL_LIBS ${CUDNN_LIBRARY} ${CUPTI_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY} ${NCCL_LIBRARY})
if(WIN32)
set_property(GLOBAL PROPERTY CUDA_MODULES ${CUDNN_LIBRARY} ${CUDA_CUBLAS_LIBRARIES} ${CUDA_curand_LIBRARY})
endif(WIN32)
......
......@@ -74,5 +74,3 @@ add_dependencies(anakin_shared extern_anakin)
add_library(anakin_saber SHARED IMPORTED GLOBAL)
set_property(TARGET anakin_saber PROPERTY IMPORTED_LOCATION ${ANAKIN_SABER_LIB})
add_dependencies(anakin_saber extern_anakin)
list(APPEND external_project_dependencies anakin_shared anakin_saber)
INCLUDE(ExternalProject)
SET(ANY_SOURCE_DIR ${THIRD_PARTY_PATH}/any)
INCLUDE_DIRECTORIES(${ANY_SOURCE_DIR}/src/extern_lib_any)
ExternalProject_Add(
extern_lib_any
${EXTERNAL_PROJECT_LOG_ARGS}
GIT_REPOSITORY "https://github.com/PaddlePaddle/any.git"
GIT_TAG "15595d8324be9e8a9a80d9ae442fdd12bd66df5d"
PREFIX ${ANY_SOURCE_DIR}
UPDATE_COMMAND ""
CONFIGURE_COMMAND ""
BUILD_COMMAND ""
INSTALL_COMMAND ""
TEST_COMMAND ""
)
if (${CMAKE_VERSION} VERSION_LESS "3.3.0")
set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/lib_any_dummy.c)
file(WRITE ${dummyfile} "const char * dummy_any = \"${dummyfile}\";")
add_library(lib_any STATIC ${dummyfile})
else()
add_library(lib_any INTERFACE)
endif()
add_dependencies(lib_any extern_lib_any)
add_definitions(-DANY_IMPL_ANY_CAST_MOVEABLE)
LIST(APPEND external_project_dependencies lib_any)
......@@ -57,5 +57,4 @@ else()
endif()
add_dependencies(boost ${BOOST_PROJECT})
list(APPEND external_project_dependencies boost)
set(Boost_INCLUDE_DIR ${BOOST_INCLUDE_DIR})
......@@ -69,5 +69,3 @@ SET_PROPERTY(TARGET brpc PROPERTY IMPORTED_LOCATION ${BRPC_LIBRARIES})
ADD_DEPENDENCIES(brpc extern_brpc)
add_definitions(-DBRPC_WITH_GLOG)
LIST(APPEND external_project_dependencies brpc)
......@@ -31,5 +31,3 @@ else()
endif()
add_dependencies(cub extern_cub)
LIST(APPEND external_project_dependencies cub)
......@@ -27,5 +27,3 @@ else()
endif()
add_dependencies(dlpack extern_dlpack)
LIST(APPEND external_project_dependencies dlpack)
......@@ -52,5 +52,3 @@ else()
endif()
add_dependencies(eigen3 extern_eigen3)
LIST(APPEND external_project_dependencies eigen3)
......@@ -61,8 +61,6 @@ ADD_LIBRARY(gflags STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET gflags PROPERTY IMPORTED_LOCATION ${GFLAGS_LIBRARIES})
ADD_DEPENDENCIES(gflags extern_gflags)
LIST(APPEND external_project_dependencies gflags)
# On Windows (including MinGW), the Shlwapi library is used by gflags if available.
if (WIN32)
include(CheckIncludeFileCXX)
......
......@@ -72,5 +72,3 @@ ADD_LIBRARY(glog STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET glog PROPERTY IMPORTED_LOCATION ${GLOG_LIBRARIES})
ADD_DEPENDENCIES(glog extern_glog gflags)
LINK_LIBRARIES(glog gflags)
LIST(APPEND external_project_dependencies glog)
......@@ -79,5 +79,4 @@ IF(WITH_TESTING OR (WITH_DISTRIBUTE AND NOT WITH_GRPC))
SET_PROPERTY(TARGET gtest_main PROPERTY IMPORTED_LOCATION ${GTEST_MAIN_LIBRARIES})
ADD_DEPENDENCIES(gtest_main extern_gtest)
LIST(APPEND external_project_dependencies gtest gtest_main)
ENDIF(WITH_TESTING OR (WITH_DISTRIBUTE AND NOT WITH_GRPC))
......@@ -39,6 +39,3 @@ ADD_DEPENDENCIES(extern_leveldb snappy)
ADD_LIBRARY(leveldb STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET leveldb PROPERTY IMPORTED_LOCATION ${LEVELDB_LIBRARIES})
ADD_DEPENDENCIES(leveldb extern_leveldb)
LIST(APPEND external_project_dependencies leveldb)
......@@ -72,7 +72,4 @@ else()
add_library(libmct INTERFACE)
endif()
#ADD_LIBRARY(libmct SHARED IMPORTED GLOBAL)
ADD_DEPENDENCIES(libmct ${LIBMCT_PROJECT})
LIST(APPEND external_project_dependencies libmct)
......@@ -53,5 +53,3 @@ MESSAGE(STATUS "Libxsmm library: ${LIBXSMM_LIBS}")
include_directories(${LIBXSMM_INCLUDE_DIR})
ADD_DEFINITIONS(-DPADDLE_WITH_LIBXSMM)
ADD_DEPENDENCIES(libxsmm extern_libxsmm)
LIST(APPEND external_project_dependencies libxsmm)
......@@ -89,7 +89,6 @@ SET_PROPERTY(TARGET shared_mkldnn PROPERTY IMPORTED_LOCATION ${MKLDNN_LIB})
ADD_DEPENDENCIES(shared_mkldnn ${MKLDNN_PROJECT})
MESSAGE(STATUS "MKLDNN library: ${MKLDNN_LIB}")
add_definitions(-DPADDLE_WITH_MKLDNN)
LIST(APPEND external_project_dependencies shared_mkldnn)
# generate a static dummy target to track mkldnn dependencies
# for cc_library(xxx SRCS xxx.c DEPS mkldnn)
......
......@@ -40,7 +40,9 @@ IF(WIN32)
SET(MKLML_SHARED_LIB ${MKLML_LIB_DIR}/mklml.dll)
SET(MKLML_SHARED_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5md.dll)
ELSE()
SET(MKLML_VER "mklml_lnx_${TIME_VERSION}" CACHE STRING "" FORCE)
#TODO(intel-huying):
# Now enable Erf function in mklml library temporarily, it will be updated as offical version later.
SET(MKLML_VER "VsErf_mklml_lnx_${TIME_VERSION}" CACHE STRING "" FORCE)
SET(MKLML_URL "http://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.tgz" CACHE STRING "" FORCE)
SET(MKLML_LIB ${MKLML_LIB_DIR}/libmklml_intel.so)
SET(MKLML_IOMP_LIB ${MKLML_LIB_DIR}/libiomp5.so)
......@@ -73,4 +75,3 @@ INCLUDE_DIRECTORIES(${MKLML_INC_DIR})
ADD_LIBRARY(mklml SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET mklml PROPERTY IMPORTED_LOCATION ${MKLML_LIB})
ADD_DEPENDENCIES(mklml ${MKLML_PROJECT})
LIST(APPEND external_project_dependencies mklml)
......@@ -77,4 +77,3 @@ add_dependencies(ngraph ${NGRAPH_PROJECT})
target_compile_definitions(ngraph INTERFACE -DPADDLE_WITH_NGRAPH)
target_include_directories(ngraph INTERFACE ${NGRAPH_INC_DIR})
target_link_libraries(ngraph INTERFACE ${NGRAPH_SHARED_LIB})
LIST(APPEND external_project_dependencies ngraph)
......@@ -11,11 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
IF(USE_EIGEN_FOR_BLAS)
return()
ENDIF(USE_EIGEN_FOR_BLAS)
INCLUDE(cblas)
IF(NOT ${CBLAS_FOUND})
......@@ -91,7 +86,6 @@ ENDIF()
IF(NOT ${CBLAS_FOUND})
ADD_DEPENDENCIES(cblas extern_openblas)
LIST(APPEND external_project_dependencies cblas)
ELSE()
IF("${CBLAS_PROVIDER}" STREQUAL "MKLML")
ADD_DEPENDENCIES(cblas mklml)
......
......@@ -129,7 +129,6 @@ macro(PROMPT_PROTOBUF_LIB)
ADD_DEPENDENCIES(protoc ${dep})
ENDFOREACH()
LIST(APPEND external_project_dependencies protobuf)
RETURN()
endmacro()
macro(SET_PROTOBUF_VERSION)
......@@ -203,7 +202,7 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST)
ENDIF()
SET(PROTOBUF_REPO "https://github.com/google/protobuf.git")
SET(PROTOBUF_TAG "v3.6.1")
SET(PROTOBUF_TAG "9f75c5aa851cd877fb0d93ccc31b8567a6706546")
ExternalProject_Add(
${TARGET_NAME}
......@@ -231,7 +230,7 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST)
)
ENDFUNCTION()
SET(PROTOBUF_VERSION 3.6.1)
SET(PROTOBUF_VERSION 3.1.0)
IF(NOT PROTOBUF_FOUND)
build_protobuf(extern_protobuf FALSE)
......
......@@ -70,4 +70,3 @@ ExternalProject_Add(
ADD_LIBRARY(pslib SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET pslib PROPERTY IMPORTED_LOCATION ${PSLIB_LIB})
ADD_DEPENDENCIES(pslib ${PSLIB_PROJECT})
LIST(APPEND external_project_dependencies pslib)
......@@ -70,4 +70,3 @@ ExternalProject_Add(
ADD_LIBRARY(pslib_brpc SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET pslib_brpc PROPERTY IMPORTED_LOCATION ${PSLIB_BRPC_LIB})
ADD_DEPENDENCIES(pslib_brpc ${PSLIB_BRPC_PROJECT})
LIST(APPEND external_project_dependencies pslib_brpc)
......@@ -74,8 +74,8 @@ IF(PYTHONINTERP_FOUND)
find_python_module(wheel REQUIRED)
find_python_module(google.protobuf REQUIRED)
FIND_PACKAGE(NumPy REQUIRED)
IF(${PY_GOOGLE.PROTOBUF_VERSION} AND ${PY_GOOGLE.PROTOBUF_VERSION} VERSION_LESS "3.6.1")
MESSAGE(FATAL_ERROR "Found Python Protobuf ${PY_GOOGLE.PROTOBUF_VERSION} < 3.6.1, "
IF(${PY_GOOGLE.PROTOBUF_VERSION} AND ${PY_GOOGLE.PROTOBUF_VERSION} VERSION_LESS "3.0.0")
MESSAGE(FATAL_ERROR "Found Python Protobuf ${PY_GOOGLE.PROTOBUF_VERSION} < 3.0.0, "
"please use pip to upgrade protobuf. pip install -U protobuf")
ENDIF()
ENDIF(PYTHONINTERP_FOUND)
......
......@@ -26,5 +26,3 @@ else()
endif()
add_dependencies(simple_threadpool extern_threadpool)
LIST(APPEND external_project_dependencies simple_threadpool)
......@@ -83,5 +83,3 @@ INCLUDE_DIRECTORIES(${THIRD_PARTY_PATH}/install) # For Paddle code to include wa
ADD_LIBRARY(warpctc SHARED IMPORTED GLOBAL)
SET_PROPERTY(TARGET warpctc PROPERTY IMPORTED_LOCATION ${WARPCTC_LIBRARIES})
ADD_DEPENDENCIES(warpctc extern_warpctc)
LIST(APPEND external_project_dependencies warpctc)
......@@ -55,4 +55,3 @@ else()
endif()
add_dependencies(xbyak ${XBYAK_PROJECT})
list(APPEND external_project_dependencies xbyak)
......@@ -71,5 +71,3 @@ add_library(xxhash STATIC IMPORTED GLOBAL)
set_property(TARGET xxhash PROPERTY IMPORTED_LOCATION ${XXHASH_LIBRARIES})
include_directories(${XXHASH_INCLUDE_DIR})
add_dependencies(xxhash extern_xxhash)
LIST(APPEND external_project_dependencies xxhash)
......@@ -57,5 +57,3 @@ ENDIF(WIN32)
ADD_LIBRARY(zlib STATIC IMPORTED GLOBAL)
SET_PROPERTY(TARGET zlib PROPERTY IMPORTED_LOCATION ${ZLIB_LIBRARIES})
ADD_DEPENDENCIES(zlib extern_zlib)
LIST(APPEND external_project_dependencies zlib)
......@@ -11,8 +11,6 @@ include_directories("/opt/rocm/rocrand/include")
include_directories("/opt/rocm/rccl/include")
include_directories("/opt/rocm/thrust")
list(APPEND EXTERNAL_LIBS "-L/opt/rocm/lib/ -lhip_hcc")
set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -fPIC -DPADDLE_WITH_HIP -std=c++11" )
if(WITH_DSO)
......@@ -31,22 +29,12 @@ if(WITH_GRPC)
set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_WITH_GRPC")
endif(WITH_GRPC)
if(NOT WITH_GOLANG)
set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_WITHOUT_GOLANG")
endif(NOT WITH_GOLANG)
if(WITH_MKLDNN)
set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_WITH_MKLDNN")
endif(WITH_MKLDNN)
set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DANY_IMPL_ANY_CAST_MOVEABLE")
if(NOT WITH_RDMA)
set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_DISABLE_RDMA")
endif(NOT WITH_RDMA)
if(CMAKE_BUILD_TYPE STREQUAL "Debug")
list(APPEND HIP_HCC_FLAGS ${CMAKE_CXX_FLAGS_DEBUG})
elseif(CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo")
......
# user should download rdma first from subversion repository
# execute following instruction to download svn mannally
# svn co https://svn.baidu.com/sys/ip/trunk/rdma/sockrdmav1 rdma/
# svn co https://svn.baidu.com/sys/ip/trunk/rdma/thirdparty rdma/
# we use static output in svn repositories to avoid implict bugs from not standard runtime env.
if(WITH_RDMA)
set(RDMA_ROOT $ENV{RDMA_ROOT} CACHE PATH "Folder contains RDMA sock library and thirdparty library")
function(generate_rdma_links)
#redirect to current DIR to isolate the pollution from system runtime environment
#it can benifits unified control for different gcc environment.
#e.g, by default gcc48 did not refer /usr/lib64 which could contain low version
#runtime libraries that will crash process while loading it. That redirect trick
#can fix it.
execute_process(
COMMAND mkdir -p librdma
COMMAND ln -s -f /usr/lib64/libibverbs.so.1.0.0 librdma/libibverbs.so.1
COMMAND ln -s -f /usr/lib64/libibverbs.so.1.0.0 librdma/libibverbs.so
COMMAND ln -s -f /usr/lib64/librdmacm.so.1.0.0 librdma/librdmacm.so.1
COMMAND ln -s -f /usr/lib64/librdmacm.so.1.0.0 librdma/librdmacm.so
COMMAND ln -s -f /lib64/libnl.so.1.1.4 librdma/libnl.so.1
COMMAND ln -s -f /lib64/libnl.so.1.1.4 librdma/libnl.so
WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}
)
endfunction(generate_rdma_links)
#check and set headers
find_path(RDMA_INC_SXISOCK sxi_sock.h PATHS ${RDMA_ROOT}/sockrdmav1/output/include)
find_path(RDMA_INC_XIO libxio.h PATHS ${RDMA_ROOT}/thirdparty/output/accelio)
find_path(RDMA_INC_EVENT event2 PATHS ${RDMA_ROOT}/thirdparty/output/libevent)
find_path(RDMA_INC_NUMA numa.h PATHS ${RDMA_ROOT}/thirdparty/output/libnuma)
#check and set libs
find_library(RDMA_LIB_SXISOCK NAMES sxisock PATHS ${RDMA_ROOT}/sockrdmav1/output)
find_library(RDMA_LIB_XIO NAMES xio PATHS ${RDMA_ROOT}/thirdparty/output/accelio)
find_library(RDMA_LIB_EVENT NAMES event PATHS ${RDMA_ROOT}/thirdparty/output/libevent)
find_library(RDMA_LIB_EVENT_CORE NAMES event_core PATHS ${RDMA_ROOT}/thirdparty/output/libevent)
find_library(RDMA_LIB_EVENT_EXTRA NAMES event_extra PATHS ${RDMA_ROOT}/thirdparty/output/libevent)
find_library(RDMA_LIB_EVENT_PTHREADS NAMES event_pthreads PATHS ${RDMA_ROOT}/thirdparty/output/libevent)
find_library(RDMA_LIB_NUMA NAMES numa PATHS ${RDMA_ROOT}/thirdparty/output/libnuma)
if(
RDMA_INC_SXISOCK AND
RDMA_INC_XIO AND
RDMA_INC_EVENT AND
RDMA_INC_NUMA AND
RDMA_LIB_SXISOCK AND
RDMA_LIB_XIO AND
RDMA_LIB_EVENT AND
RDMA_LIB_EVENT_CORE AND
RDMA_LIB_EVENT_EXTRA AND
RDMA_LIB_EVENT_PTHREADS AND
RDMA_LIB_NUMA
)
set(RDMA_INC_DIR
${RDMA_INC_SXISOCK}
${RDMA_INC_XIO}
${RDMA_INC_EVENT}
${RDMA_INC_NUMA})
set(RDMA_LIBS
${RDMA_LIB_SXISOCK}
${RDMA_LIB_XIO}
${RDMA_LIB_EVENT}
${RDMA_LIB_EVENT_CORE}
${RDMA_LIB_EVENT_EXTRA}
${RDMA_LIB_EVENT_PTHREADS}
${RDMA_LIB_NUMA}
)
set(RDMA_LD_FLAGS "-L./librdma -libverbs -lrdmacm -Xlinker -rpath ./librdma")
include_directories("${RDMA_INC_DIR}")
else()
#if this module is not called, RDMA_INC_DIR RDMA_LIBS will be null, so top module always refer this variable
message(FATAL_ERROR, "RDMA libraries are not found, try to set RDMA_ROOT or check all related libraries.")
endif()
else(WITH_RDMA)
set(RDMA_LIBS "")
set(RDMA_LD_FLAGS "")
add_definitions(-DPADDLE_DISABLE_RDMA)
endif(WITH_RDMA)
......@@ -33,6 +33,5 @@ if(TENSORRT_FOUND)
message(STATUS "Current TensorRT header is ${TENSORRT_INCLUDE_DIR}/NvInfer.h. "
"Current TensorRT version is v${TENSORRT_MAJOR_VERSION}. ")
include_directories(${TENSORRT_INCLUDE_DIR})
list(APPEND EXTERNAL_LIBS ${TENSORRT_LIBRARY})
add_definitions(-DPADDLE_WITH_TENSORRT)
endif()
......@@ -14,9 +14,7 @@ cmake .. -DWITH_AVX=OFF \
-DWITH_MKL=OFF \
-DWITH_GPU=ON \
-DWITH_TESTING=ON \
-DWITH_TIMER=ON \
-DWITH_PROFILER=ON \
-DWITH_FLUID_ONLY=ON
make -j `nproc`
pip install -U "$WHEEL_PATH/$(ls $WHEEL_PATH)"
......
......@@ -427,7 +427,7 @@ paddle.fluid.optimizer.MomentumOptimizer.__init__ ArgSpec(args=['self', 'learnin
paddle.fluid.optimizer.MomentumOptimizer.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None)
paddle.fluid.optimizer.MomentumOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None))
paddle.fluid.optimizer.MomentumOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.optimizer.AdagradOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(1e-06, None, None))
paddle.fluid.optimizer.AdagradOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'epsilon', 'regularization', 'name', 'initial_accumulator_value'], varargs=None, keywords=None, defaults=(1e-06, None, None, 0.0))
paddle.fluid.optimizer.AdagradOptimizer.apply_gradients ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None)
paddle.fluid.optimizer.AdagradOptimizer.backward ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None))
paddle.fluid.optimizer.AdagradOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
......
......@@ -30,8 +30,6 @@ namespace paddle {
namespace framework {
namespace details {
static constexpr char kAllOpDescs[] = "all_op_descs";
VarHandle* GetValidInput(const OpHandleBase* a) {
for (auto p : a->Inputs()) {
VarHandle* b = dynamic_cast<VarHandle*>(p);
......
......@@ -53,7 +53,7 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
#endif
void AllReduceOpHandle::RunImpl() {
platform::RecordEvent record_event(Name(), dev_ctxes_.cbegin()->second);
platform::RecordEvent record_event(Name());
WaitInputVarGenerated();
auto in_var_handles = DynamicCast<VarHandle>(this->Inputs());
......
......@@ -22,7 +22,7 @@ namespace framework {
namespace details {
void BroadcastOpHandle::RunImpl() {
platform::RecordEvent record_event(Name(), dev_ctxes_.begin()->second);
platform::RecordEvent record_event(Name());
if (places_.size() == 1) return;
......@@ -30,7 +30,7 @@ void BroadcastOpHandle::RunImpl() {
VarHandle *in_var_handle;
{
auto in_var_handles = DynamicCast<VarHandle>(inputs_);
PADDLE_ENFORCE_EQ(in_var_handles.size(), 1,
PADDLE_ENFORCE_EQ(in_var_handles.size(), 1UL,
"The number of input should be one.");
in_var_handle = in_var_handles[0];
}
......
......@@ -34,9 +34,11 @@ namespace details {
static inline bool SeqOnlyAllReduceOps(const BuildStrategy &strategy) {
// Should fix the allreduce op order if scheduling
// them in multiple threads or processes to avoid hang.
// NOTE: ParallelGraph would execute this pass on each graph, so
// don't need to append it here.
return (!strategy.enable_sequential_execution_ &&
strategy.num_trainers_ > 1) ||
strategy.enable_parallel_graph_;
strategy.num_trainers_ > 1) &&
!strategy.enable_parallel_graph_;
}
class ParallelExecutorPassBuilder : public ir::PassBuilder {
......
......@@ -86,7 +86,7 @@ std::vector<std::array<int, 3>> DataBalanceOpHandle::GetBalancePlan(
}
void DataBalanceOpHandle::RunImpl() {
PADDLE_ENFORCE_GT(places_.size(), 1,
PADDLE_ENFORCE_GT(places_.size(), 1UL,
"Data balance can only be enabled when the number of "
"places to run larger than 1.");
auto in_var_handles = DynamicCast<VarHandle>(this->Inputs());
......
......@@ -23,7 +23,7 @@ void FuseVarsOpHandle::RunImpl() {
auto in_var_handles = DynamicCast<VarHandle>(this->Inputs());
auto out_var_handles = DynamicCast<VarHandle>(this->Outputs());
PADDLE_ENFORCE_EQ(in_var_handles.size(), 0);
PADDLE_ENFORCE_EQ(in_var_handles.size(), 0UL);
PADDLE_ENFORCE_EQ(out_var_handles.size() - 1, inputs_numel_.size(), "");
auto scope = local_scope_->FindVar(kLocalExecScopeName)->Get<Scope *>();
......
......@@ -22,7 +22,7 @@ namespace framework {
namespace details {
void FusedBroadcastOpHandle::RunImpl() {
platform::RecordEvent record_event(Name(), dev_ctxes_.begin()->second);
platform::RecordEvent record_event(Name());
if (places_.size() == 1UL) return;
......
......@@ -129,7 +129,13 @@ size_t NodeSize(const VarDesc& node) {
}
size_t NodeSize(ir::Node* n) {
auto* desc = FindVarDescInBlock(n);
VarDesc* desc = nullptr;
// some op do not have block pointer
if (n->inputs[0]->Op() != nullptr) {
desc = FindVarDescInBlock(n);
} else {
desc = n->Var();
}
return NodeSize(*desc);
}
......
......@@ -29,8 +29,6 @@ namespace paddle {
namespace framework {
namespace details {
constexpr char kAllOpDescs[] = "all_op_descs";
std::vector<ir::Node*> SortOpLikeDescOrder(const ir::Graph& graph);
// NOTE(dzh): A ordered set for node reuse in memory optimize.
......
......@@ -194,7 +194,8 @@ void MemoryOptimizePass::SubGraphOptimize(OpDesc* op_desc) const {
// effect. Because it is a single op in graph. No need to
// update the ir nodes.
sub_op_desc->Rename(var->Name(), cache->Name());
if (sub_op_desc->Block()->HasVar(var->Name())) {
if (sub_op_desc->Block() != nullptr &&
sub_op_desc->Block()->HasVar(var->Name())) {
sub_op_desc->Block()->RemoveVar(var->Name());
}
}
......@@ -235,7 +236,13 @@ void MemoryOptimizePass::RenameVarInGraphDesc(const std::string& var,
auto* op_desc = op->Op();
op_desc->RenameInput(var, cache_var);
op_desc->RenameOutput(var, cache_var);
if (op_desc->Block()->HasVar(var)) op_desc->Block()->RemoveVar(var);
if (op_desc->Block() != nullptr) {
op_desc->Block()->RemoveVar(var);
} else {
LOG(WARNING) << "op " << op->Name() << " not know its block."
<< "Is the op_desc created without block pointer? "
<< "Can not find " << var << " in Block(0)";
}
op_desc->Flush();
}
}
......
......@@ -392,20 +392,32 @@ void MultiDevSSAGraphBuilderBase::CreateComputationalOp(ir::Graph *result,
void MultiDevSSAGraphBuilderBase::CreateAllReduceOp(
ir::Graph *result, const std::string &og) const {
OpHandleBase *op_handle = nullptr;
auto append_allreduce_op = [&](
const std::vector<Scope *> &scopes,
const std::vector<platform::Place> &places) -> OpHandleBase * {
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
result->Get<GraphOps>(kGraphOps).emplace_back(new AllReduceOpHandle(
result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation),
local_scopes_, places_, nccl_ctxs_));
scopes, places, nccl_ctxs_));
#else
result->Get<GraphOps>(kGraphOps).emplace_back(new AllReduceOpHandle(
result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation),
local_scopes_, places_));
scopes, places));
#endif
auto *op_handle = result->Get<GraphOps>(kGraphOps).back();
return result->Get<GraphOps>(kGraphOps).back();
};
if (!strategy_.enable_parallel_graph_)
op_handle = append_allreduce_op(local_scopes_, places_);
for (size_t i = 0; i < places_.size(); ++i) {
auto &p = places_[i];
SetCommunicationContext(op_handle, p);
if (strategy_.enable_parallel_graph_) {
op_handle = append_allreduce_op({local_scopes_[i]}, {places_[i]});
}
SetCommunicationContext(op_handle, places_[i]);
auto &vars = result->Get<GraphVars>(kGraphVars)[i][og];
PADDLE_ENFORCE(!vars.empty());
auto &prev_grad = vars.back();
......@@ -413,7 +425,7 @@ void MultiDevSSAGraphBuilderBase::CreateAllReduceOp(
auto var =
new VarHandle(result->CreateEmptyNode(og, ir::Node::Type::kVariable),
vars.size(), i, og, p);
vars.size(), i, og, places_[i]);
vars.emplace_back(var);
op_handle->AddOutput(var);
}
......
......@@ -36,13 +36,14 @@ namespace details {
// map from variable name to variables. The variables, who have the same name,
// will have a differsent version. The offset in the
// `std::vector<VarHandle*>` is the version of varaibles.
typedef std::vector<std::unordered_map<std::string, std::vector<VarHandle*>>>
typedef std::vector<std::unordered_map<std::string, std::vector<VarHandle *>>>
GraphVars;
const char kGraphVars[] = "vars";
// aux variables to represent dependency. Useful to resolve data hazard.
typedef std::unordered_set<VarHandleBase*> GraphDepVars;
typedef std::unordered_set<VarHandleBase *> GraphDepVars;
const char kGraphDepVars[] = "dep_vars";
} // namespace details
} // namespace framework
} // namespace paddle
......@@ -70,6 +70,9 @@ class OpHandleBase {
auto it = dev_ctxes_.find(place);
return it != dev_ctxes_.end() ? it->second : nullptr;
}
const std::map<platform::Place, platform::DeviceContext *> &DeviceContext() {
return dev_ctxes_;
}
void SetDeviceContext(platform::Place place, platform::DeviceContext *ctx_) {
dev_ctxes_[place] = ctx_;
......
......@@ -13,22 +13,92 @@
// limitations under the License.
#include "paddle/fluid/framework/details/parallel_ssa_graph_executor.h"
#include "paddle/fluid/framework/ir/graph_helper.h"
namespace paddle {
namespace framework {
namespace details {
std::vector<std::unique_ptr<ir::Graph>>
ParallelSSAGraphExecutor::SeparateMultiDevicesGraph(
std::unique_ptr<ir::Graph> &&graph) {
std::vector<std::unique_ptr<ir::Graph>> graphs;
graphs.reserve(places_.size());
for (size_t i = 0; i < places_.size(); ++i) {
ProgramDesc empty;
graphs.emplace_back(std::unique_ptr<ir::Graph>(new ir::Graph(empty)));
auto &g = graphs.back();
g->Set(kGraphVars, new GraphVars(1UL));
g->Set(kGraphDepVars, new GraphDepVars);
}
auto op_handles = ir::FilterByNodeWrapper<OpHandleBase>(*graph);
for (auto &op : op_handles) {
auto &dev_ctx = op->DeviceContext();
auto &p = dev_ctx.begin()->first;
int dev_id = boost::get<platform::CUDAPlace>(p).device;
auto &dev_dummys = graphs[dev_id]->Get<GraphDepVars>(kGraphDepVars);
graphs[dev_id]->AddNode(graph->RemoveNode(op->Node()).release());
for (auto &var : op->Inputs()) {
auto dummy_ptr = dynamic_cast<DummyVarHandle *>(var);
if (dummy_ptr) {
dev_dummys.insert(var);
if (graph->Nodes().count(var->Node()))
graphs[dev_id]->AddNode(graph->RemoveNode(var->Node()).release());
}
}
for (auto &var : op->Outputs()) {
auto dummy_ptr = dynamic_cast<DummyVarHandle *>(var);
if (dummy_ptr) {
dev_dummys.insert(var);
if (graph->Nodes().count(var->Node()))
graphs[dev_id]->AddNode(graph->RemoveNode(var->Node()).release());
}
}
}
for (size_t dev_id = 0; dev_id < places_.size(); ++dev_id) {
auto &dev_vars = graphs[dev_id]->Get<GraphVars>(kGraphVars)[0];
auto &origin_vars = graph->Get<GraphVars>(kGraphVars)[dev_id];
for (auto &name_pair : origin_vars) {
dev_vars.emplace(name_pair.first, name_pair.second);
for (auto &version_pair : name_pair.second) {
if (graph->Nodes().count(version_pair->Node())) {
graphs[dev_id]->AddNode(
graph->RemoveNode(version_pair->Node()).release());
}
}
}
}
return graphs;
}
ParallelSSAGraphExecutor::ParallelSSAGraphExecutor(
const ExecutionStrategy &strategy, const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places,
std::vector<std::unique_ptr<ir::Graph>> &&graphs)
const framework::ProgramDesc &main_prog, std::unique_ptr<ir::Graph> &&graph)
: strategy_(std::move(strategy)),
local_scopes_(std::move(local_scopes)),
pool_(places.size() >= 2 ? new ::ThreadPool(places.size()) : nullptr),
places_(std::move(places)),
graphs_(std::move(graphs)) {
main_prog_(main_prog),
// TODO(Yancey1989): Copying graphs is not safely since it deleted the
// attrs.
graphs_(SeparateMultiDevicesGraph(std::move(graph))) {
PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size());
auto seq_allreduce_pass =
ir::PassRegistry::Instance().Get("all_reduce_deps_pass");
seq_allreduce_pass->Erase(details::kAllOpDescs);
seq_allreduce_pass->Set<const std::vector<OpDesc *>>(
details::kAllOpDescs,
new std::vector<OpDesc *>(main_prog_.Block(0).AllOps()));
for (size_t i = 0; i < graphs_.size(); ++i) {
graphs_[i] = seq_allreduce_pass->Apply(std::move(graphs_[i]));
}
// set the correct size of thread pool to each device.
strategy_.num_threads_ = strategy_.num_threads_ < places_.size()
? 1UL
......@@ -37,7 +107,7 @@ ParallelSSAGraphExecutor::ParallelSSAGraphExecutor(
<< " to run the operators of the graph on each device.";
for (size_t i = 0; i < places.size(); ++i) {
executors_.emplace_back(new details::ThreadedSSAGraphExecutor(
strategy_, {local_scopes_[i]}, {places_[i]}, std::move(graphs_[i])));
strategy_, local_scopes_, {places_[i]}, std::move(graphs_.at(i))));
}
}
......
......@@ -18,7 +18,9 @@
#include <vector>
#include "ThreadPool.h"
#include "paddle/fluid/framework/details/multi_devices_helper.h"
#include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h"
#include "paddle/fluid/framework/ir/graph.h"
namespace paddle {
namespace framework {
......@@ -29,17 +31,23 @@ class ParallelSSAGraphExecutor : public SSAGraphExecutor {
ParallelSSAGraphExecutor(const ExecutionStrategy &strategy,
const std::vector<Scope *> &local_scopes,
const std::vector<platform::Place> &places,
std::vector<std::unique_ptr<ir::Graph>> &&graphs);
const framework::ProgramDesc &main_prog,
std::unique_ptr<ir::Graph> &&graph);
~ParallelSSAGraphExecutor() final = default;
const ir::Graph &Graph() const override { return *graphs_[0]; }
FeedFetchList Run(const std::vector<std::string> &fetch_tensors) override;
private:
std::vector<std::unique_ptr<ir::Graph>> SeparateMultiDevicesGraph(
std::unique_ptr<ir::Graph> &&graph);
ExecutionStrategy strategy_;
std::vector<Scope *> local_scopes_;
std::unique_ptr<::ThreadPool> pool_{nullptr};
std::vector<platform::Place> places_;
framework::ProgramDesc main_prog_;
std::vector<std::unique_ptr<ir::Graph>> graphs_;
std::vector<std::unique_ptr<details::ThreadedSSAGraphExecutor>> executors_;
......
......@@ -139,7 +139,7 @@ void ReduceOpHandle::GatherSelectedRows(
#endif
void ReduceOpHandle::RunImpl() {
platform::RecordEvent record_event(Name(), dev_ctxes_.cbegin()->second);
platform::RecordEvent record_event(Name());
if (places_.size() == 1) return;
// the input and output may have dummy var.
......@@ -153,7 +153,7 @@ void ReduceOpHandle::RunImpl() {
{
auto out_var_handles = DynamicCast<VarHandle>(outputs_);
PADDLE_ENFORCE_EQ(out_var_handles.size(), 1,
PADDLE_ENFORCE_EQ(out_var_handles.size(), 1UL,
"The number of output should be one.");
out_var_handle = out_var_handles.front();
}
......
......@@ -63,7 +63,7 @@ FeedFetchList ScopeBufferedSSAGraphExecutor::Run(
eptr = std::current_exception();
}
platform::RecordEvent e("ScopeBufferedSSAGraphExecutorAfterRun", nullptr);
platform::RecordEvent e("ScopeBufferedSSAGraphExecutorAfterRun");
++drop_scope_counter_;
bool stream_end = false;
......
......@@ -37,7 +37,7 @@ ThreadedSSAGraphExecutor::ThreadedSSAGraphExecutor(
FeedFetchList ThreadedSSAGraphExecutor::Run(
const std::vector<std::string> &fetch_tensors) {
std::unique_ptr<platform::RecordEvent> event(
new platform::RecordEvent("ThreadedSSAGraphExecutorPrepare", nullptr));
new platform::RecordEvent("ThreadedSSAGraphExecutorPrepare"));
std::unordered_map<OpHandleBase *, size_t> pending_ops;
std::unordered_set<VarHandleBase *> pending_vars;
auto ready_vars = std::make_shared<BlockingQueue<VarHandleBase *>>();
......@@ -219,7 +219,7 @@ void ThreadedSSAGraphExecutor::RunOp(
VLOG(10) << op << " " << op->Name() << " Done ";
running_ops_--;
ready_var_q->Extend(op->Outputs());
VLOG(10) << op << " " << op->Name() << "Signal posted";
VLOG(10) << op << " " << op->Name() << " Signal posted";
} catch (...) {
exception_holder_.Catch(std::current_exception());
}
......
......@@ -102,6 +102,7 @@ cc_test(test_seqpool_concat_fuse_pass SRCS seqpool_concat_fuse_pass_tester.cc DE
cc_test(test_is_test_pass SRCS is_test_pass_tester.cc DEPS is_test_pass)
if (WITH_MKLDNN)
cc_test(test_depthwise_conv_mkldnn_pass SRCS mkldnn/depthwise_conv_mkldnn_pass_tester.cc DEPS depthwise_conv_mkldnn_pass)
cc_test(test_conv_bias_mkldnn_fuse_pass SRCS mkldnn/conv_bias_mkldnn_fuse_pass_tester.cc DEPS conv_bias_mkldnn_fuse_pass naive_executor)
cc_test(test_conv_relu_mkldnn_fuse_pass SRCS mkldnn/conv_relu_mkldnn_fuse_pass_tester.cc DEPS conv_relu_mkldnn_fuse_pass)
cc_test(test_conv_elementwise_add_mkldnn_fuse_pass SRCS mkldnn/conv_elementwise_add_mkldnn_fuse_pass_tester.cc DEPS conv_elementwise_add_mkldnn_fuse_pass)
endif ()
......@@ -22,7 +22,8 @@ namespace ir {
class AttentionLSTMFusePass : public FusePassBase {
protected:
std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override;
};
} // namespace ir
......
......@@ -31,7 +31,8 @@ class ConvAffineChannelFusePass : public FusePassBase {
virtual ~ConvAffineChannelFusePass() {}
protected:
std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override;
const std::string name_scope_{"conv_affine_channel_fuse"};
};
......@@ -40,7 +41,8 @@ class ConvEltwiseAddAffineChannelFusePass : public FusePassBase {
virtual ~ConvEltwiseAddAffineChannelFusePass() {}
protected:
std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override;
const std::string name_scope_{"conv_eltwiseadd_affine_channel_fuse"};
};
......
......@@ -169,7 +169,7 @@ std::unique_ptr<ir::Graph> ConvBNFusePass::ApplyImpl(
if (has_bias && conv->Op()->Input("Bias").size() > 0) {
// reuse existing conv bias node
auto conv_bias_names = conv->Op()->Input("Bias");
PADDLE_ENFORCE_EQ(conv_bias_names.size(), 1);
PADDLE_ENFORCE_EQ(conv_bias_names.size(), 1UL);
auto* conv_bias_var = scope->FindVar(conv_bias_names[0]);
auto* conv_bias_tensor = conv_bias_var->GetMutable<LoDTensor>();
PADDLE_ENFORCE_EQ(conv_bias_tensor->dims(),
......
......@@ -31,7 +31,8 @@ class ConvBNFusePass : public FusePassBase {
virtual ~ConvBNFusePass() {}
protected:
std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override;
const std::string name_scope_{"conv_bn_fuse"};
};
......@@ -40,7 +41,8 @@ class ConvEltwiseAddBNFusePass : public FusePassBase {
virtual ~ConvEltwiseAddBNFusePass() {}
protected:
std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override;
const std::string name_scope_{"conv_eltwiseadd_bn_fuse"};
};
......
......@@ -25,7 +25,8 @@ class ConvElementwiseAdd2ActFusePass : public FusePassBase {
virtual ~ConvElementwiseAdd2ActFusePass() {}
protected:
std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override;
};
} // namespace ir
......
......@@ -25,7 +25,8 @@ class ConvElementwiseAddActFusePass : public FusePassBase {
virtual ~ConvElementwiseAddActFusePass() {}
protected:
std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override;
};
} // namespace ir
......
......@@ -25,7 +25,8 @@ class ConvElementwiseAddFusePass : public FusePassBase {
virtual ~ConvElementwiseAddFusePass() {}
protected:
std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override;
};
} // namespace ir
......
......@@ -14,6 +14,8 @@
#pragma once
#include <string>
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
......@@ -30,7 +32,8 @@ class EmbeddingFCLSTMFusePass : public FusePassBase {
virtual ~EmbeddingFCLSTMFusePass() {}
protected:
std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override;
const std::string name_scope_{"embedding_fc_lstm_fuse"};
};
......
......@@ -12,6 +12,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
......@@ -29,7 +31,8 @@ class FCFusePass : public FusePassBase {
virtual ~FCFusePass() {}
protected:
std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override;
};
} // namespace ir
......
......@@ -30,7 +30,8 @@ class FCGRUFusePass : public FusePassBase {
virtual ~FCGRUFusePass() {}
protected:
std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override;
const std::string name_scope_{"fc_gru_fuse"};
};
......@@ -41,7 +42,8 @@ class MulGRUFusePass : public FusePassBase {
virtual ~MulGRUFusePass() {}
protected:
std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override;
const std::string name_scope_{"fc_nobias_gru_fuse"};
};
......
......@@ -14,6 +14,8 @@
#pragma once
#include <string>
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
......@@ -30,7 +32,8 @@ class FCLstmFusePass : public FusePassBase {
virtual ~FCLstmFusePass() {}
protected:
std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override;
const std::string name_scope_{"fc_lstm_fuse"};
};
......@@ -40,7 +43,8 @@ class MulLstmFusePass : public FusePassBase {
virtual ~MulLstmFusePass() {}
protected:
std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override;
const std::string name_scope_{"fc_nobias_lstm_fuse"};
};
......
......@@ -32,7 +32,8 @@ class FuseElewiseAddActPass : public FusePassBase {
virtual ~FuseElewiseAddActPass() {}
protected:
std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override;
std::unique_ptr<ir::Graph> FuseElewiseAddAct(
std::unique_ptr<ir::Graph> graph,
......
......@@ -111,7 +111,7 @@ std::unique_ptr<ir::Graph> FuseReluDepthwiseConvPass::FuseReluDepthwiseConv(
xg_var = subgraph.at(xg)->Var();
}
PADDLE_ENFORCE_EQ(layer_op->Input("Input").size(), 1);
PADDLE_ENFORCE_EQ(layer_op->Input("Input").size(), 1UL);
PADDLE_ENFORCE_EQ(layer_op->Input("Input")[0], y_var->Name());
layer_op->SetInput("Input", {x_var->Name()});
subgraph.at(layer)->inputs.push_back(subgraph.at(x));
......@@ -119,13 +119,13 @@ std::unique_ptr<ir::Graph> FuseReluDepthwiseConvPass::FuseReluDepthwiseConv(
VLOG(4) << "replace " << y_var->Name() << " -> " << x_var->Name();
if (!only_forward) {
PADDLE_ENFORCE_EQ(layer_g_op->Input("Input").size(), 1);
PADDLE_ENFORCE_EQ(layer_g_op->Input("Input").size(), 1UL);
PADDLE_ENFORCE_EQ(layer_g_op->Input("Input")[0], y_var->Name());
layer_g_op->SetInput("Input", {x_var->Name()});
subgraph.at(layer_g)->inputs.push_back(subgraph.at(x));
subgraph.at(x)->outputs.push_back(subgraph.at(layer_g));
PADDLE_ENFORCE_EQ(layer_g_op->Output(GradVarName("Input")).size(), 1);
PADDLE_ENFORCE_EQ(layer_g_op->Output(GradVarName("Input")).size(), 1UL);
PADDLE_ENFORCE_EQ(layer_g_op->Output(GradVarName("Input"))[0],
yg_var->Name());
layer_g_op->SetOutput(GradVarName("Input"), {xg_var->Name()});
......
......@@ -32,7 +32,8 @@ class FuseReluDepthwiseConvPass : public FusePassBase {
virtual ~FuseReluDepthwiseConvPass() {}
protected:
std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override;
std::unique_ptr<ir::Graph> FuseReluDepthwiseConv(
std::unique_ptr<ir::Graph> graph, bool only_forward) const;
};
......
......@@ -26,6 +26,14 @@ limitations under the License. */
namespace paddle {
namespace framework {
namespace details {
// This attr is not recommended, because the graph should not dependence
// the program once it is built.
constexpr char kAllOpDescs[] = "all_op_descs";
} // namespace details
namespace ir {
/*
......@@ -168,10 +176,13 @@ class Graph {
return ret;
}
void RemoveNode(ir::Node *node) {
std::unique_ptr<ir::Node> RemoveNode(ir::Node *node) {
PADDLE_ENFORCE(node_set_.find(node) != node_set_.end());
node_set_.erase(node);
std::unique_ptr<ir::Node> ret;
ret.reset(nodes_.at(node).release());
nodes_.erase(node);
node_set_.erase(node);
return ret;
}
// NOTE low performance, but simple and secure.
......@@ -184,13 +195,6 @@ class Graph {
return nullptr;
}
void ResolveHazard(
const std::map<std::string, std::vector<ir::Node *>> &var_nodes);
private:
std::map<std::string, std::vector<ir::Node *>> InitFromProgram(
const ProgramDesc &program);
// This method takes ownership of `node`.
ir::Node *AddNode(ir::Node *node) {
PADDLE_ENFORCE(node_set_.find(node) == node_set_.end());
......@@ -199,6 +203,13 @@ class Graph {
return node;
}
void ResolveHazard(
const std::map<std::string, std::vector<ir::Node *>> &var_nodes);
private:
std::map<std::string, std::vector<ir::Node *>> InitFromProgram(
const ProgramDesc &program);
// NOTE: program_ shouldn't be exposed to user.
const ProgramDesc program_;
std::map<std::string, boost::any> attrs_;
......
......@@ -38,7 +38,7 @@ size_t PDPattern::id_ = 0UL;
PDNode *PDPattern::NewNode(const std::string &name) {
if (!name.empty()) {
PADDLE_ENFORCE_EQ(node_map_.count(name), 0,
PADDLE_ENFORCE_EQ(node_map_.count(name), 0UL,
"PDNode's name should be unique, get duplicate [%s]",
name);
}
......@@ -51,7 +51,7 @@ PDNode *PDPattern::NewNode(const std::string &name) {
PDNode *PDPattern::NewNode(PDNode::teller_t &&teller, const std::string &name) {
if (!name.empty()) {
PADDLE_ENFORCE_EQ(node_map_.count(name), 0,
PADDLE_ENFORCE_EQ(node_map_.count(name), 0UL,
"PDNode's name should be unique, get duplicate [%s]",
name);
}
......
......@@ -22,7 +22,8 @@ namespace ir {
class IdentityScaleOpCleanPass : public FusePassBase {
protected:
std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override;
private:
virtual ~IdentityScaleOpCleanPass() = default;
......
......@@ -60,7 +60,8 @@ class LockFreeOptimizePass : public Pass {
virtual ~LockFreeOptimizePass() {}
protected:
std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override;
private:
// Create a new sgd node via current optimizer node
......
......@@ -29,7 +29,8 @@ class ConvBiasFusePass : public FusePassBase {
virtual bool is_conv3d() const { return false; }
protected:
std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override;
const std::string name_scope_{"conv_bias_mkldnn_fuse"};
};
/*
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.h"
#include "paddle/fluid/framework/naive_executor.h"
#include "paddle/fluid/platform/place.h"
#include <gtest/gtest.h>
#include "paddle/fluid/framework/op_proto_maker.h"
namespace paddle {
namespace framework {
namespace ir {
void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name,
const std::vector<std::string>& inputs,
const std::vector<std::string>& outputs) {
auto* op = prog->MutableBlock(0)->AppendOp();
op->SetType(type);
if (type == "conv2d") {
op->SetAttr("use_mkldnn", true);
op->SetAttr("name", name);
op->SetInput("Input", {inputs[0]});
op->SetInput("Filter", {inputs[1]});
if (inputs.size() > 2)
op->SetInput("Bias", {inputs[2]});
else
op->SetInput("Bias", {});
} else if (type == "elementwise_add") {
op->SetAttr("use_mkldnn", true);
op->SetInput("X", {inputs[0]});
op->SetInput("Y", {inputs[1]});
}
op->SetOutput("Out", outputs);
op->SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(),
static_cast<int>(OpRole::kForward));
}
// (c, weights)->conv->f
// (f)->elementwise_add->g
ProgramDesc BuildProgramDesc(bool convWithExistingBias) {
ProgramDesc prog;
std::vector<std::string> nodes{"c", "weights", "f", "eltwise_bias", "g"};
if (convWithExistingBias) nodes.push_back("conv_bias");
for (auto& v : nodes) {
auto* var = prog.MutableBlock(0)->Var(v);
var->SetType(proto::VarType::LOD_TENSOR);
if (v == "weights" || v == "conv_bias" || v == "eltwise_bias") {
var->SetPersistable(true);
}
}
// conv+bias, both with MKL-DNN
if (convWithExistingBias) {
SetOp(&prog, "conv2d", "conv",
std::vector<std::string>({"c", "weights", "conv_bias"}),
std::vector<std::string>({"f"}));
} else {
SetOp(&prog, "conv2d", "conv", std::vector<std::string>({"c", "weights"}),
std::vector<std::string>({"f"}));
}
SetOp(&prog, "elementwise_add", "eltwise",
std::vector<std::string>({"f", "eltwise_bias"}),
std::vector<std::string>({"g"}));
return prog;
}
void InitTensorHolder(Scope* scope, const paddle::platform::Place& place,
const char* var_name) {
auto x = scope->Var(var_name);
auto tensor = x->GetMutable<LoDTensor>();
tensor->mutable_data(place, proto::VarType::FP32,
::paddle::memory::Allocator::kDefault, 1);
}
void MainTest(bool convWithExistingBias) {
auto prog = BuildProgramDesc(convWithExistingBias);
std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
auto place = paddle::platform::CPUPlace();
NaiveExecutor exe{place};
Scope scope;
// Init scope, as it is used in pass
exe.CreateVariables(prog, 0, true, &scope);
if (convWithExistingBias) {
InitTensorHolder(&scope, place, "conv_bias");
InitTensorHolder(&scope, place, "eltwise_bias");
}
graph->Set(kParamScopeAttr, new framework::Scope*(&scope));
auto pass = PassRegistry::Instance().Get("conv_bias_mkldnn_fuse_pass");
int original_nodes_num = graph->Nodes().size();
graph = pass->Apply(std::move(graph));
int current_nodes_num = graph->Nodes().size();
// Remove 3 Nodes: Conv, Bias, conv_out
// Add 1 Node: ConvBias
EXPECT_EQ(original_nodes_num - 2, current_nodes_num);
// Assert conv_bias op in newly generated graph
int conv_bias_count = 0;
for (auto* node : graph->Nodes()) {
if (node->IsOp() && node->Op()->Type() == "conv2d") {
auto* op = node->Op();
ASSERT_TRUE(op->HasAttr("use_mkldnn"));
EXPECT_TRUE(boost::get<bool>(op->GetAttr("use_mkldnn")));
// check if "conv" convolution is fused
auto op_name = boost::get<std::string>(op->GetAttr("name"));
if (op_name == "conv") {
auto input_names = op->InputNames();
ASSERT_TRUE(std::find(input_names.begin(), input_names.end(), "Bias") !=
input_names.end());
auto bias = boost::get<std::vector<std::string>>(op->Input("Bias"));
if (bias.size()) {
++conv_bias_count;
}
}
}
}
EXPECT_EQ(conv_bias_count, 1);
}
TEST(ConvBiasFusePass, bias_free_conv) { MainTest(false); }
TEST(ConvBiasFusePass, conv_with_existing_bias) { MainTest(true); }
TEST(ConvBiasFusePass, conv3d) {
Conv3DBiasFusePass pass;
ASSERT_TRUE(pass.is_conv3d());
}
} // namespace ir
} // namespace framework
} // namespace paddle
USE_PASS(conv_bias_mkldnn_fuse_pass);
......@@ -31,7 +31,8 @@ class RepeatedFCReluFusePass : public FusePassBase {
virtual ~RepeatedFCReluFusePass() {}
protected:
std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override;
const std::string name_scope_{"repeated_fc_relu_fuse"};
};
......
......@@ -12,6 +12,8 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/pass.h"
......@@ -25,7 +27,8 @@ class SeqConcatFcFusePass : public FusePassBase {
virtual ~SeqConcatFcFusePass() {}
protected:
std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override;
};
} // namespace ir
......
......@@ -28,7 +28,8 @@ class SeqConvEltAddReluFusePass : public FusePassBase {
virtual ~SeqConvEltAddReluFusePass() {}
protected:
std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override;
const std::string name_scope_{"seqconv_eltadd_relu_fuse"};
};
......
......@@ -42,7 +42,8 @@ class SeqPoolConcatFusePass : public FusePassBase {
virtual ~SeqPoolConcatFusePass() {}
protected:
std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override;
const std::string name_scope_{"seqpool_concat_fuse"};
};
......
......@@ -31,7 +31,8 @@ class SquaredMatSubFusePass : public FusePassBase {
virtual ~SquaredMatSubFusePass() {}
protected:
std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override;
const std::string name_scope_{"squared_mat_sub_fuse"};
};
......
......@@ -30,7 +30,8 @@ class TransposeFlattenConcatFusePass : public FusePassBase {
virtual ~TransposeFlattenConcatFusePass() {}
protected:
std::unique_ptr<ir::Graph> ApplyImpl(std::unique_ptr<ir::Graph> graph) const;
std::unique_ptr<ir::Graph> ApplyImpl(
std::unique_ptr<ir::Graph> graph) const override;
};
} // namespace ir
......
......@@ -27,7 +27,7 @@ enum class OpRole {
kForward = 0x0000,
kBackward = 0x0001,
kOptimize = 0x0002,
// RPC role is for send/recv releated op
// RPC role is for send/recv related op
kRPC = 0x0004,
// Dist role is for split_byref/split_selected_rows/concat
// used for distributed training.
......
......@@ -177,9 +177,7 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
// in concurrency scenerio. Here use an `if` to fix this issue.
// Please not remove the `if`, ask @Superjomn if there are any concern.
if (platform::IsProfileEnabled()) {
platform::DeviceContextPool& pool =
platform::DeviceContextPool::Instance();
platform::RecordEvent record_event(Type(), pool.Get(place));
platform::RecordEvent record_event(Type());
RunImpl(scope, place);
} else {
RunImpl(scope, place);
......
......@@ -21,6 +21,7 @@ limitations under the License. */
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/details/all_reduce_deps_pass.h"
#include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h"
#include "paddle/fluid/framework/details/multi_devices_helper.h"
#include "paddle/fluid/framework/details/parallel_ssa_graph_executor.h"
......@@ -193,7 +194,6 @@ ParallelExecutor::ParallelExecutor(
member_->use_all_reduce_ =
build_strategy.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce;
member_->nranks_ = build_strategy.num_trainers_ * places.size();
if (!member_->use_all_reduce_) {
PADDLE_ENFORCE(places.size() > 1,
"If you set build_strategy.reduce with 'Reduce',"
......@@ -221,9 +221,10 @@ ParallelExecutor::ParallelExecutor(
// choice the execution strategy.
build_strategy.enable_parallel_graph_ =
EnableParallelGraphExecution(main_program, exec_strategy, build_strategy);
VLOG(1) << "Enable ParallelGraph Execution: "
<< build_strategy.enable_parallel_graph_;
if (build_strategy.enable_parallel_graph_)
VLOG(0) << "The Executor would execute the graph by ParallelGraph "
"Execution which can get better performance,"
<< "you can force it off by env FLAGS_enable_parallel_graph=0";
if (member_->use_cuda_) {
// Bcast Parameters to all GPUs
......@@ -257,42 +258,27 @@ ParallelExecutor::ParallelExecutor(
// Step 2. Convert main_program to SSA form and dependency graph. Also, insert
// ncclOp
std::vector<std::unique_ptr<ir::Graph>> graphs;
std::unique_ptr<ir::Graph> graph;
#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
if (build_strategy.enable_parallel_graph_) {
for (size_t i = 0; i < member_->places_.size(); ++i) {
std::unique_ptr<ir::Graph> graph = build_strategy.Apply(
main_program, {member_->places_[i]}, loss_var_name,
{member_->local_scopes_[i]}, member_->nranks_, member_->use_cuda_,
member_->nccl_ctxs_.get());
graphs.push_back(std::move(graph));
}
} else {
std::unique_ptr<ir::Graph> graph = build_strategy.Apply(
main_program, member_->places_, loss_var_name, member_->local_scopes_,
member_->nranks_, member_->use_cuda_, member_->nccl_ctxs_.get());
graphs.push_back(std::move(graph));
}
graph = build_strategy.Apply(main_program, member_->places_, loss_var_name,
member_->local_scopes_, member_->nranks_,
member_->use_cuda_, member_->nccl_ctxs_.get());
#else
std::unique_ptr<ir::Graph> graph = build_strategy.Apply(
main_program, member_->places_, loss_var_name, member_->local_scopes_,
member_->nranks_, member_->use_cuda_);
graphs.push_back(std::move(graph));
graph = build_strategy.Apply(main_program, member_->places_, loss_var_name,
member_->local_scopes_, member_->nranks_,
member_->use_cuda_);
#endif
auto max_memory_size = GetEagerDeletionThreshold();
VLOG(10) << "Eager Deletion Threshold "
<< static_cast<float>(max_memory_size) / (1 << 30);
if (max_memory_size >= 0) {
for (size_t i = 0; i < graphs.size(); ++i) {
graphs[i] = member_->PrepareGCAndRefCnts(
std::move(graphs[i]), static_cast<size_t>(max_memory_size));
}
graph = member_->PrepareGCAndRefCnts(std::move(graph),
static_cast<size_t>(max_memory_size));
}
// Step 3. Create vars in each scope. Passes may also create new vars.
// skip control vars and empty vars
std::vector<details::VariableInfo> var_infos;
for (auto &graph : graphs) {
for (auto &node : graph->Nodes()) {
if (node->IsVar() && !node->IsCtrlVar() && node->Var()) {
var_infos.emplace_back();
......@@ -301,16 +287,15 @@ ParallelExecutor::ParallelExecutor(
var_infos.back().persistable_ = node->Var()->Persistable();
}
}
}
// If the loss_var_name is given, the number of graph should be only one.
if (loss_var_name.size()) {
size_t graph_num = ir::GraphNum(*graphs[0]);
size_t graph_num = ir::GraphNum(*graph);
if (graph_num > 1) {
LOG(WARNING)
<< "The number of graph should be only one, "
"but the current graph has "
<< ir::GraphNum(*graphs[0])
<< ir::GraphNum(*graph)
<< " sub_graphs. If you want to see the nodes of the "
"sub_graphs, you should use 'FLAGS_print_sub_graph_dir' "
"to specify the output dir. NOTES: if you not do training, "
......@@ -319,18 +304,25 @@ ParallelExecutor::ParallelExecutor(
}
if (build_strategy.enable_parallel_graph_) {
#ifdef PADDLE_WITH_CUDA
// TODO(Yancey1989): Remove passing in the main_program when
// allreduce_seq_pass doesn't need it as the attr.
member_->executor_.reset(new details::ParallelSSAGraphExecutor(
exec_strategy, member_->local_scopes_, member_->places_,
std::move(graphs)));
exec_strategy, member_->local_scopes_, member_->places_, main_program,
std::move(graph)));
#else
PADDLE_THROW(
"Paddle should be compiled with CUDA for ParallelGraph Execution.");
#endif
} else {
if (exec_strategy.type_ == ExecutionStrategy::kDefault) {
member_->executor_.reset(new details::ThreadedSSAGraphExecutor(
exec_strategy, member_->local_scopes_, member_->places_,
std::move(graphs[0])));
std::move(graph)));
} else {
member_->executor_.reset(new details::FastThreadedSSAGraphExecutor(
exec_strategy, member_->local_scopes_, member_->places_,
std::move(graphs[0])));
std::move(graph)));
}
}
......@@ -482,7 +474,6 @@ bool ParallelExecutor::EnableParallelGraphExecution(
}
if (!member_->use_all_reduce_ || !member_->use_cuda_)
enable_parallel_graph = false;
if (build_strategy.enable_sequential_execution_ ||
exec_strategy.type_ == ExecutionStrategy::ExecutorType::kExperimental)
......
......@@ -89,7 +89,7 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
CP_MEMBER(params_file_);
CP_MEMBER(model_from_memory_); // the memory model reuses prog_file_ and
// params_file_ fields.
// Gpu releated.
// Gpu related.
CP_MEMBER(use_gpu_);
CP_MEMBER(device_id_);
CP_MEMBER(memory_pool_init_size_mb_);
......@@ -97,13 +97,13 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
CP_MEMBER(enable_memory_optim_);
CP_MEMBER(static_memory_optim_);
CP_MEMBER(static_memory_optim_force_update_);
// TensorRT releated.
// TensorRT related.
CP_MEMBER(use_tensorrt_);
CP_MEMBER(tensorrt_workspace_size_);
CP_MEMBER(tensorrt_max_batchsize_);
CP_MEMBER(tensorrt_min_subgraph_size_);
CP_MEMBER(tensorrt_precision_mode_);
// MKLDNN releated.
// MKLDNN related.
CP_MEMBER(use_mkldnn_);
CP_MEMBER(mkldnn_enabled_op_types_);
......
......@@ -392,7 +392,7 @@ std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
AnalysisConfig, PaddleEngineKind::kAnalysis>(const AnalysisConfig &config) {
VLOG(3) << "create AnalysisConfig";
if (config.use_gpu()) {
// 1. GPU memeroy
// 1. GPU memory
PADDLE_ENFORCE_GT(config.memory_pool_init_size_mb(), 0.f);
PADDLE_ENFORCE_GE(config.gpu_device_id(), 0, "Invalid device id %d",
config.gpu_device_id());
......@@ -726,7 +726,7 @@ bool AnalysisPredictor::need_collect_var_shapes_for_memory_optim() {
return need;
}
std::string AnalysisPredictor::GetSeriazlizedProgram() const {
std::string AnalysisPredictor::GetSerializedProgram() const {
return inference_program_->Proto()->SerializeAsString();
}
......
......@@ -74,7 +74,7 @@ class AnalysisPredictor : public PaddlePredictor {
void SetMkldnnThreadID(int tid);
std::string GetSeriazlizedProgram() const override;
std::string GetSerializedProgram() const override;
protected:
// For memory optimization.
......
......@@ -214,8 +214,8 @@ TEST(AnalysisPredictor, memory_optim) {
{
// The first predictor help to cache the memory optimize strategy.
auto predictor = CreatePaddlePredictor<AnalysisConfig>(config);
LOG(INFO) << "serialized program: " << predictor->GetSeriazlizedProgram();
ASSERT_FALSE(predictor->GetSeriazlizedProgram().empty());
LOG(INFO) << "serialized program: " << predictor->GetSerializedProgram();
ASSERT_FALSE(predictor->GetSerializedProgram().empty());
// Run several times to check the parameters are not reused by mistake.
for (int i = 0; i < 5; i++) {
......
......@@ -92,7 +92,7 @@ void PaddleBuf::Reset(void *data, size_t length) {
void PaddleBuf::Free() {
if (memory_owned_ && data_) {
PADDLE_ENFORCE_GT(length_, 0);
PADDLE_ENFORCE_GT(length_, 0UL);
free(static_cast<char *>(data_));
data_ = nullptr;
length_ = 0;
......
......@@ -290,7 +290,7 @@ std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
NativeConfig, PaddleEngineKind::kNative>(const NativeConfig &config) {
VLOG(3) << "create NativePaddlePredictor";
if (config.use_gpu) {
// 1. GPU memeroy
// 1. GPU memory
PADDLE_ENFORCE_GE(
config.fraction_of_gpu_memory, 0.f,
"fraction_of_gpu_memory in the config should be set to range (0., 1.]");
......
......@@ -212,12 +212,12 @@ struct AnalysisConfig {
std::string prog_file_;
std::string params_file_;
// GPU releated.
// GPU related.
bool use_gpu_{false};
int device_id_{0};
uint64_t memory_pool_init_size_mb_{100}; // initial size is 100MB.
// TensorRT releated.
// TensorRT related.
bool use_tensorrt_{false};
// For workspace_size, refer it from here:
// https://docs.nvidia.com/deeplearning/sdk/tensorrt-developer-guide/index.html#troubleshooting
......
......@@ -248,7 +248,7 @@ class PaddlePredictor {
/** \brief Get the serialized model program that executes in inference phase.
* Its data type is ProgramDesc, which is a protobuf message.
*/
virtual std::string GetSeriazlizedProgram() const {
virtual std::string GetSerializedProgram() const {
assert(false); // Force raise error.
return "NotImplemented";
}
......
......@@ -60,10 +60,13 @@ set(RNN2_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/rnn2")
download_model_and_data(${RNN2_INSTALL_DIR} "rnn2_model.tar.gz" "rnn2_data.txt.tar.gz")
inference_analysis_api_test(test_analyzer_rnn2 ${RNN2_INSTALL_DIR} analyzer_rnn2_tester.cc)
# TODO(luotao, Superjom) Disable DAM test, temporarily fix
# https://github.com/PaddlePaddle/Paddle/issues/15032#issuecomment-455990914.
# After inference framework refactor, will reopen it.
# normal DAM
set(DAM_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/dam")
download_model_and_data(${DAM_INSTALL_DIR} "DAM_model.tar.gz" "DAM_data.txt.tar.gz")
inference_analysis_api_test(test_analyzer_dam ${DAM_INSTALL_DIR} analyzer_dam_tester.cc EXTRA_DEPS legacy_allocator SERIAL)
#inference_analysis_api_test(test_analyzer_dam ${DAM_INSTALL_DIR} analyzer_dam_tester.cc EXTRA_DEPS legacy_allocator SERIAL)
# small DAM
set(DAM_SMALL_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/small_dam")
......
......@@ -56,14 +56,14 @@ struct DataRecord {
std::vector<float> slot_data;
split_to_float(data[1], ' ', &slot_data);
std::string name = data[0];
PADDLE_ENFORCE_EQ(slot_data.size() % 11, 0,
PADDLE_ENFORCE_EQ(slot_data.size() % 11, 0UL,
"line %d, %s should be divisible", num_lines, name);
datasets[name].emplace_back(std::move(slot_data));
}
num_samples = num_lines / num_slots;
PADDLE_ENFORCE_EQ(num_samples * num_slots, static_cast<size_t>(num_lines),
"num samples should be divisible");
PADDLE_ENFORCE_GT(num_samples, 0);
PADDLE_ENFORCE_GT(num_samples, 0UL);
}
void Prepare(int bs) {
......
include(ExternalProject)
set(INFERENCE_URL "http://paddle-inference-dist.cdn.bcebos.com" CACHE STRING "inference download url")
set(INFERENCE_DEMO_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo" CACHE STRING
"A path setting inference demo download directories.")
function (inference_download install_dir url filename)
message(STATUS "Download inference test stuff from ${url}/${filename}")
file(DOWNLOAD "${url}/${filename}" "${install_dir}/${filename}")
message(STATUS "finish downloading ${filename}")
function(inference_download INSTALL_DIR URL FILENAME)
message(STATUS "Download inference test stuff from ${URL}/${FILENAME}")
string(REGEX REPLACE "[-%.]" "_" FILENAME_EX ${FILENAME})
ExternalProject_Add(
extern_inference_download_${FILENAME_EX}
${EXTERNAL_PROJECT_LOG_ARGS}
PREFIX ${INSTALL_DIR}
URL ${URL}/${FILENAME}
DOWNLOAD_COMMAND wget -q -O ${INSTALL_DIR}/${FILENAME} ${URL}/${FILENAME}
DOWNLOAD_DIR ${INSTALL_DIR}
DOWNLOAD_NO_PROGRESS 1
CONFIGURE_COMMAND ""
BUILD_COMMAND ""
UPDATE_COMMAND ""
INSTALL_COMMAND ""
)
endfunction()
function (inference_download_and_uncompress install_dir url filename)
inference_download(${install_dir} ${url} ${filename})
execute_process(
COMMAND ${CMAKE_COMMAND} -E tar xzf ${install_dir}/${filename}
WORKING_DIRECTORY ${install_dir}
function(inference_download_and_uncompress INSTALL_DIR URL FILENAME)
message(STATUS "Download inference test stuff from ${URL}/${FILENAME}")
string(REGEX REPLACE "[-%.]" "_" FILENAME_EX ${FILENAME})
set(EXTERNAL_PROJECT_NAME "extern_inference_download_${FILENAME_EX}")
set(UNPACK_DIR "${INSTALL_DIR}/src/${EXTERNAL_PROJECT_NAME}")
ExternalProject_Add(
${EXTERNAL_PROJECT_NAME}
${EXTERNAL_PROJECT_LOG_ARGS}
PREFIX ${INSTALL_DIR}
URL ${URL}/${FILENAME}
DOWNLOAD_DIR ${INSTALL_DIR}
DOWNLOAD_NO_PROGRESS 1
CONFIGURE_COMMAND ""
BUILD_COMMAND ""
UPDATE_COMMAND ""
INSTALL_COMMAND ${CMAKE_COMMAND} -E copy_directory ${UNPACK_DIR} ${INSTALL_DIR}
)
endfunction()
......
......@@ -171,9 +171,7 @@ void TestInference(const std::string& dirname,
// Enable the profiler
paddle::platform::EnableProfiler(state);
{
paddle::platform::RecordEvent record_event(
"init_program",
paddle::platform::DeviceContextPool::Instance().Get(place));
paddle::platform::RecordEvent record_event("init_program");
inference_program = InitProgram(&executor, scope, dirname, is_combined);
}
......@@ -230,9 +228,7 @@ void TestInference(const std::string& dirname,
// Run repeat times to profile the performance
for (int i = 0; i < repeat; ++i) {
paddle::platform::RecordEvent record_event(
"run_inference",
paddle::platform::DeviceContextPool::Instance().Get(place));
paddle::platform::RecordEvent record_event("run_inference");
if (PrepareContext) {
// Note: if you change the inference_program, you need to call
......
......@@ -356,7 +356,7 @@ void MemInfo::Minus(const size_t &size) {
usage_ -= size;
}
uint64_t MemInfo::GetPeakUsage() { return peak_usage_; }
uint64_t MemInfo::GetPeakUsage() const { return peak_usage_; }
LegacyMemMonitor::~LegacyMemMonitor() {
for (auto &item : gpu_mem_info_) delete item.second;
......@@ -380,10 +380,10 @@ void LegacyMemMonitor::Minus(const int &device, const size_t &size) {
gpu_mem_info_[device]->Minus(size);
}
uint64_t LegacyMemMonitor::GetMemUsage(const int &device) {
uint64_t LegacyMemMonitor::GetMemUsage(const int &device) const {
return gpu_mem_info_.find(device) == gpu_mem_info_.end()
? 0
: gpu_mem_info_[device]->GetPeakUsage();
: gpu_mem_info_.at(device)->GetPeakUsage();
}
void LegacyMemMonitor::PrintMemUsage() {
......
......@@ -27,20 +27,20 @@ namespace allocation {
class MemInfo {
public:
MemInfo() : usage_(0), peak_usage_(0) {}
MemInfo(const MemInfo &) = delete;
MemInfo &operator=(const MemInfo &) = delete;
// return a flag to indicate current operation will create a peak point or not
bool Add(const size_t &);
void Minus(const size_t &);
uint64_t GetPeakUsage();
uint64_t GetPeakUsage() const;
private:
/* current memory usage*/
uint64_t usage_;
uint64_t peak_usage_;
std::mutex mutex_;
DISABLE_COPY_AND_ASSIGN(MemInfo);
};
class LegacyMemMonitor {
......@@ -56,11 +56,11 @@ class LegacyMemMonitor {
void Add(const int &, const size_t &);
void Minus(const int &, const size_t &);
uint64_t GetMemUsage(const int &);
uint64_t GetMemUsage(const int &) const;
void PrintMemUsage();
protected:
private:
MemUsage gpu_mem_info_;
};
......
......@@ -97,3 +97,4 @@ if (WITH_PYTHON)
endif()
set(GLOB_OP_LIB ${OP_LIBRARY} CACHE INTERNAL "Global OP library")
add_subdirectory(benchmark)
......@@ -11,6 +11,7 @@ limitations under the License. */
#pragma once
#include <glog/logging.h>
#include <algorithm>
#include <string>
#include <unordered_set>
#include <utility>
......@@ -24,6 +25,7 @@ limitations under the License. */
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/detail/safe_ref.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/platform/float16.h"
#ifdef PADDLE_WITH_MKLDNN
......@@ -301,8 +303,28 @@ template <typename T>
struct GeluFunctor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Out>
void operator()(Device d, X x, Out out) const {
// Because the execute or device context can not be deliver here, it keep the
// marco for NVCC.
#if defined(PADDLE_WITH_MKLML) && !defined(_WIN32) && !defined(__APPLE__) && \
!defined(__OSX__) && !defined(PADDLE_WITH_CUDA)
auto x_data = x.data();
auto out_data = out.data();
int n = std::min(x.size(), out.size());
std::memset(out_data, 0, n * sizeof(T));
math::CBlas<T>::AXPY(n, static_cast<T>(M_SQRT1_2), x_data, 1, out_data, 1);
math::CBlas<T>::VMERF(n, out_data, out_data, VML_LA);
for (int i = 0; i < n; i++) {
out_data[i] += static_cast<T>(1);
}
math::CBlas<T>::VMUL(n, x_data, out_data, out_data);
for (int i = 0; i < n; i++) {
out_data[i] *= static_cast<T>(0.5);
}
#else
auto temp = (x * static_cast<T>(M_SQRT1_2)).erf();
out.device(d) = x * static_cast<T>(0.5) * (static_cast<T>(1) + temp);
#endif
}
};
......
......@@ -293,7 +293,7 @@ class AttentionLSTMKernel : public framework::OpKernel<T> {
int len = x_lod[0][i + 1] - x_lod[0][i];
max_seq_len = max_seq_len < len ? len : max_seq_len;
}
PADDLE_ENFORCE_EQ(x_lod.size(), 1, "Input(X)'s lod size must be 1.");
PADDLE_ENFORCE_EQ(x_lod.size(), 1UL, "Input(X)'s lod size must be 1.");
PADDLE_ENFORCE_EQ(c0->dims()[0], N, "C0 dims should be %d x %d.", N, D);
fc_out->Resize({max_seq_len, 1});
......
cc_test(op_tester SRCS op_tester.cc op_tester_config.cc
DEPS memory timer framework_proto proto_desc lod_tensor op_registry
device_context scope ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS})
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/benchmark/op_tester.h"
#include "gflags/gflags.h"
#include "gtest/gtest.h"
#include "paddle/fluid/framework/op_info.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/variable_helper.h"
#include "paddle/fluid/platform/init.h"
#include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/platform/timer.h"
#include "paddle/fluid/pybind/pybind.h"
namespace paddle {
namespace operators {
namespace benchmark {
DEFINE_string(op_config_list, "", "Path of op config file.");
void OpTester::Init(const std::string &filename) {
Init(OpTesterConfig(filename));
}
void OpTester::Init(const OpTesterConfig &config) {
config_ = config;
auto &op_desc_info = framework::OpInfoMap::Instance();
// Initialize the OpDesc
if (op_desc_info.Has(config_.op_type)) {
type_ = config_.op_type;
op_desc_.SetType(config_.op_type);
CreateInputVarDesc();
CreateOutputVarDesc();
} else {
LOG(FATAL) << "Op \"" << config_.op_type << "\" is not registered.";
}
if (config_.device_id >= 0) {
place_ = paddle::platform::CUDAPlace(config_.device_id);
} else {
place_ = paddle::platform::CPUPlace();
}
framework::InitDevices(false);
scope_.reset(new paddle::framework::Scope());
op_ = framework::OpRegistry::CreateOp(op_desc_);
CreateVariables(scope_.get());
}
void OpTester::Run() {
if (config_.print_debug_string) {
LOG(INFO) << DebugString();
}
// Warm up
RunImpl();
platform::Timer timer;
if (config_.profile) {
if (platform::is_cpu_place(place_)) {
platform::EnableProfiler(platform::ProfilerState::kCPU);
} else {
#ifdef PADDLE_WITH_CUDA
platform::EnableProfiler(platform::ProfilerState::kAll);
platform::SetDeviceId(config_.device_id);
#else
PADDLE_THROW("'CUDAPlace' is not supported in CPU only device.");
#endif
}
timer.Start();
for (int i = config_.repeat; i > 0; --i) {
RunImpl();
}
timer.Pause();
platform::DisableProfiler(platform::EventSortingKey::kDefault,
"op_tester_profiler");
} else {
timer.Start();
for (int i = config_.repeat; i > 0; --i) {
RunImpl();
}
timer.Pause();
}
config_.runtime = timer.ElapsedMS() / config_.repeat;
LOG(INFO) << "=== Run " << config_.repeat
<< " times, latency: " << config_.runtime << " ms ===";
}
void OpTester::RunImpl() {
op_->Run(*scope_, place_);
platform::DeviceContextPool::Instance().Get(place_)->Wait();
scope_->DropKids();
}
std::vector<std::string> OpTester::GetOpProtoInputNames() {
std::vector<std::string> input_names;
const framework::proto::OpProto &proto =
framework::OpInfoMap::Instance().Get(type_).Proto();
for (int i = 0; i != proto.inputs_size(); ++i) {
const auto &input = proto.inputs(i);
input_names.push_back(input.name());
}
return input_names;
}
std::vector<std::string> OpTester::GetOpProtoOutputNames() {
std::vector<std::string> output_names;
const framework::proto::OpProto &proto =
framework::OpInfoMap::Instance().Get(type_).Proto();
for (int i = 0; i != proto.outputs_size(); ++i) {
const auto &output = proto.outputs(i);
output_names.push_back(output.name());
}
return output_names;
}
void OpTester::CreateInputVarDesc() {
std::vector<std::string> input_names = GetOpProtoInputNames();
for (auto &name : input_names) {
const OpInputConfig *input = config_.GetInput(name);
if (input == nullptr) {
LOG(FATAL) << "The input " << name << " of op " << config_.op_type
<< " is not correctlly provided.";
}
std::string var_name = config_.op_type + "." + name;
framework::VarDesc *var = Var(var_name);
// Need to support more type
var->SetType(framework::proto::VarType::LOD_TENSOR);
var->SetPersistable(false);
var->SetDataType(framework::proto::VarType::FP32);
var->SetShape(input->dims);
op_desc_.SetInput(name, {var_name});
inputs_.push_back(var_name);
}
}
void OpTester::CreateOutputVarDesc() {
std::vector<std::string> output_names = GetOpProtoOutputNames();
for (auto &name : output_names) {
std::string var_name = config_.op_type + "." + name;
framework::VarDesc *var = Var(var_name);
// Need to support more type
var->SetType(framework::proto::VarType::LOD_TENSOR);
var->SetPersistable(false);
var->SetDataType(framework::proto::VarType::FP32);
op_desc_.SetOutput(name, {var_name});
outputs_.push_back(var_name);
}
}
framework::VarDesc *OpTester::Var(const std::string &name) {
auto it = vars_.find(name);
if (it != vars_.end()) {
return it->second.get();
}
auto *var = new framework::VarDesc(name);
vars_[name].reset(var);
return var;
}
template <typename T>
void OpTester::SetupTensor(framework::LoDTensor *tensor,
const std::vector<int64_t> &shape, T lower,
T upper) {
static unsigned int seed = 100;
std::mt19937 rng(seed++);
std::uniform_real_distribution<double> uniform_dist(0, 1);
T *ptr = tensor->mutable_data<T>(framework::make_ddim(shape), place_);
if (platform::is_cpu_place(place_)) {
for (int i = 0; i < tensor->numel(); ++i) {
ptr[i] = static_cast<T>(uniform_dist(rng) * (upper - lower) + lower);
}
} else {
framework::LoDTensor cpu_tensor;
T *cpu_ptr = cpu_tensor.mutable_data<T>(framework::make_ddim(shape),
platform::CPUPlace());
for (int i = 0; i < cpu_tensor.numel(); ++i) {
cpu_ptr[i] = static_cast<T>(uniform_dist(rng) * (upper - lower) + lower);
}
TensorCopySync(cpu_tensor, place_, tensor);
}
}
void OpTester::CreateVariables(framework::Scope *scope) {
for (auto &item : vars_) {
auto &var = item.second;
if (var->Name() == framework::kEmptyVarName) {
continue;
}
auto *ptr = scope->Var(var->Name());
framework::InitializeVariable(ptr, var->GetType());
if (var->Persistable()) {
VLOG(3) << "Create Variable " << var->Name()
<< " global, which pointer is " << ptr;
} else {
VLOG(3) << "Create Variable " << var->Name()
<< " locally, which pointer is " << ptr;
}
}
// Allocate memory for input tensor
for (auto &name : inputs_) {
VLOG(3) << "Allocate memory for tensor " << name;
auto &var_desc = vars_[name];
std::vector<int64_t> shape = var_desc->GetShape();
auto *var = scope->Var(name);
auto *tensor = var->GetMutable<framework::LoDTensor>();
SetupTensor<float>(tensor, shape, static_cast<float>(0.0),
static_cast<float>(1.0));
}
}
static std::string GenSpaces(int count) {
std::stringstream ss;
for (int i = 0; i < count; ++i) {
ss << " ";
}
return ss.str();
}
std::string OpTester::DebugString() {
std::stringstream ss;
int count = 0;
for (auto &item : vars_) {
auto &var = item.second;
ss << GenSpaces(count++) << "vars {\n";
ss << GenSpaces(count) << "name: \"" << var->Name() << "\"\n";
ss << GenSpaces(count++) << "type: {\n";
ss << GenSpaces(count) << "type: LOD_TENSOR\n";
ss << GenSpaces(count++) << "lod_tensor {\n";
ss << GenSpaces(count++) << "tensor {\n";
ss << GenSpaces(count) << "data_type: FP32\n";
std::vector<int64_t> shape = var->GetShape();
for (auto d : shape) {
ss << GenSpaces(count) << "dims: " << d << "\n";
}
ss << GenSpaces(--count) << "}\n";
ss << GenSpaces(--count) << "}\n";
ss << GenSpaces(--count) << "}\n";
ss << GenSpaces(count) << "persistable: " << var->Persistable() << "\n";
ss << GenSpaces(--count) << "}\n";
}
ss << GenSpaces(count++) << "ops {\n";
for (auto &name : op_desc_.InputNames()) {
ss << GenSpaces(count++) << "inputs {\n";
ss << GenSpaces(count) << "parameters: \"" << name << "\"\n";
ss << GenSpaces(count) << "arguments: \"" << op_desc_.Input(name)[0]
<< "\"\n";
ss << GenSpaces(--count) << "}\n";
}
for (auto &name : op_desc_.OutputNames()) {
ss << GenSpaces(count++) << "outputs {\n";
ss << GenSpaces(count) << "parameters: \"" << name << "\"\n";
ss << GenSpaces(count) << "arguments: \"" << op_desc_.Output(name)[0]
<< "\"\n";
ss << GenSpaces(--count) << "}\n";
}
ss << GenSpaces(count) << "type: " << op_desc_.Type() << "\n";
ss << GenSpaces(--count) << "}\n";
return ss.str();
}
TEST(op_tester, base) {
OpTester tester;
if (!FLAGS_op_config_list.empty()) {
tester.Init(FLAGS_op_config_list);
} else {
OpTesterConfig config;
config.op_type = "elementwise_add";
config.inputs.resize(2);
config.inputs[0].name = "X";
config.inputs[0].dims = {64, 64};
config.inputs[1].name = "Y";
config.inputs[1].dims = {64, 1};
tester.Init(config);
}
tester.Run();
}
} // namespace benchmark
} // namespace operators
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include <vector>
#include "paddle/fluid/framework/ddim.h"
#include "paddle/fluid/framework/op_desc.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/operators/benchmark/op_tester_config.h"
namespace paddle {
namespace operators {
namespace benchmark {
class OpTester {
public:
OpTester() {}
void Init(const std::string &filename);
void Init(const OpTesterConfig &config);
void Run();
std::string DebugString();
private:
std::vector<std::string> GetOpProtoInputNames();
std::vector<std::string> GetOpProtoOutputNames();
void CreateInputVarDesc();
void CreateOutputVarDesc();
framework::VarDesc *Var(const std::string &name);
void CreateVariables(framework::Scope *scope);
template <typename T>
void SetupTensor(framework::LoDTensor *input,
const std::vector<int64_t> &shape, T lower, T upper);
void RunImpl();
private:
OpTesterConfig config_;
std::string type_;
framework::OpDesc op_desc_;
std::unordered_map<std::string, std::unique_ptr<framework::VarDesc>> vars_;
std::vector<std::string> inputs_;
std::vector<std::string> outputs_;
std::unique_ptr<framework::OperatorBase> op_;
platform::Place place_;
std::unique_ptr<framework::Scope> scope_;
};
} // namespace benchmark
} // namespace operators
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/benchmark/op_tester_config.h"
#include <fstream>
#include "glog/logging.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace operators {
namespace benchmark {
static const char kStartSeparator[] = "{";
static const char kEndSeparator[] = "}";
static const char kSepBetweenItems[] = ";";
static bool StartWith(const std::string& str, const std::string& substr) {
return str.find(substr) == 0;
}
static bool EndWith(const std::string& str, const std::string& substr) {
return str.rfind(substr) == (str.length() - substr.length());
}
static void EraseEndSep(std::string* str) {
std::string substr = kSepBetweenItems;
if (EndWith(*str, substr)) {
str->erase(str->length() - substr.length(), str->length());
}
}
static std::vector<int64_t> ParseDims(std::string dims_str) {
std::vector<int64_t> dims;
std::string token;
std::istringstream token_stream(dims_str);
while (std::getline(token_stream, token, 'x')) {
dims.push_back(std::stoi(token));
}
return dims;
}
OpInputConfig::OpInputConfig(std::istream& is) {
std::string sep;
is >> sep;
if (sep == kStartSeparator) {
while (sep != kEndSeparator) {
is >> sep;
if (sep == "name" || sep == "name:") {
is >> name;
EraseEndSep(&name);
} else if (sep == "dims" || sep == "dims:") {
std::string dims_str;
is >> dims_str;
dims = ParseDims(dims_str);
}
}
}
}
OpTesterConfig::OpTesterConfig(const std::string& filename) {
std::ifstream fin(filename, std::ios::in | std::ios::binary);
PADDLE_ENFORCE(static_cast<bool>(fin), "Cannot open file %s",
filename.c_str());
Init(fin);
}
void OpTesterConfig::Init(std::istream& is) {
std::string sep;
is >> sep;
if (sep == kStartSeparator) {
while (sep != kEndSeparator) {
is >> sep;
if (sep == "op_type" || sep == "op_type:") {
is >> op_type;
} else if (sep == "device_id" || sep == "device_id:") {
is >> device_id;
} else if (sep == "repeat" || sep == "repeat:") {
is >> repeat;
} else if (sep == "profile" || sep == "profile:") {
is >> profile;
} else if (sep == "print_debug_string" || sep == "print_debug_string:") {
is >> print_debug_string;
} else if (sep == "input" || sep == "input:") {
OpInputConfig input_config(is);
inputs.push_back(input_config);
}
}
}
}
const OpInputConfig* OpTesterConfig::GetInput(const std::string& name) {
for (size_t i = 0; i < inputs.size(); ++i) {
if (inputs[i].name == name) {
return &inputs[i];
}
}
return nullptr;
}
} // namespace benchmark
} // namespace operators
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
......@@ -12,27 +12,40 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
/*
* This file contains the list of the ngraph operators for Paddle.
*
* ATTENTION: It requires some C++11 features, for lower version C++ or C, we
* might release another API.
*/
#pragma once
#include "ops/accuracy_op.h"
#include "ops/activation_op.h"
#include "ops/batch_norm_op.h"
#include "ops/binary_unary_op.h"
#include "ops/conv2d_op.h"
#include "ops/cross_entropy_op.h"
#include "ops/elementwise_add_op.h"
#include "ops/fill_constant_op.h"
#include "ops/mean_op.h"
#include "ops/mul_op.h"
#include "ops/pool2d_op.h"
#include "ops/scale_op.h"
#include "ops/softmax_op.h"
#include "ops/sum_op.h"
#include "ops/top_k_op.h"
#include <istream>
#include <string>
#include <vector>
namespace paddle {
namespace operators {
namespace benchmark {
struct OpInputConfig {
OpInputConfig() {}
explicit OpInputConfig(std::istream& is);
std::string name;
std::vector<int64_t> dims;
};
struct OpTesterConfig {
OpTesterConfig() {}
explicit OpTesterConfig(const std::string& filename);
void Init(std::istream& is);
const OpInputConfig* GetInput(const std::string& name);
std::string op_type;
std::vector<OpInputConfig> inputs;
int device_id{-1}; // CPU: -1
int repeat{1};
int profile{0};
int print_debug_string{0};
double runtime{0.0};
};
} // namespace benchmark
} // namespace operators
} // namespace paddle
......@@ -52,7 +52,7 @@ class GetPlacesOp : public framework::OperatorBase {
device_count =
is_gpu ? CUDADevCount() : std::thread::hardware_concurrency();
}
PADDLE_ENFORCE_NE(device_count, 0, "Cannot indicate %s device count",
PADDLE_ENFORCE_NE(device_count, 0UL, "Cannot indicate %s device count",
is_gpu ? "GPU" : "CPU");
auto out_var_name = Output("Out");
......
......@@ -84,12 +84,12 @@ class CRFDecodingOp : public framework::OperatorWithKernel {
"Output(ViterbiPath) should be not null.");
auto emission_dims = ctx->GetInputDim("Emission");
PADDLE_ENFORCE_EQ(emission_dims.size(), 2UL,
PADDLE_ENFORCE_EQ(emission_dims.size(), 2,
"The Input(Emission) should be a 2-D tensor.");
PADDLE_ENFORCE(emission_dims[0], "An empty mini-batch is not allowed.");
auto transition_dims = ctx->GetInputDim("Transition");
PADDLE_ENFORCE_EQ(transition_dims.size(), 2UL,
PADDLE_ENFORCE_EQ(transition_dims.size(), 2,
"The Input(Transition) should be a 2-D tensor.");
PADDLE_ENFORCE_EQ(
transition_dims[0] - 2, transition_dims[1],
......
......@@ -85,7 +85,7 @@ class AnchorGeneratorOpMaker : public framework::OpProtoAndCheckerMaker {
" For instance, the anchor size of 64 means the area of this anchor "
"equals to 64**2.")
.AddCustomChecker([](const std::vector<float>& anchor_sizes) {
PADDLE_ENFORCE_GT(anchor_sizes.size(), 0,
PADDLE_ENFORCE_GT(anchor_sizes.size(), 0UL,
"Size of anchor_sizes must be at least 1.");
for (size_t i = 0; i < anchor_sizes.size(); ++i) {
PADDLE_ENFORCE_GT(anchor_sizes[i], 0.0,
......@@ -103,7 +103,7 @@ class AnchorGeneratorOpMaker : public framework::OpProtoAndCheckerMaker {
"(vector<float>) List of variances to be used "
"in box regression deltas")
.AddCustomChecker([](const std::vector<float>& variances) {
PADDLE_ENFORCE_EQ(variances.size(), 4,
PADDLE_ENFORCE_EQ(variances.size(), 4UL,
"Must and only provide 4 variance.");
for (size_t i = 0; i < variances.size(); ++i) {
PADDLE_ENFORCE_GT(variances[i], 0.0,
......@@ -117,7 +117,7 @@ class AnchorGeneratorOpMaker : public framework::OpProtoAndCheckerMaker {
.SetDefault(std::vector<float>(2, 16.0))
.AddCustomChecker([](const std::vector<float>& stride) {
PADDLE_ENFORCE_EQ(
stride.size(), 2,
stride.size(), 2UL,
"Must and only provide 2 stride for width and height.");
for (size_t i = 0; i < stride.size(); ++i) {
PADDLE_ENFORCE_GT(stride[i], 0.0,
......
......@@ -80,7 +80,7 @@ VarHandlePtr BRPCClient::AsyncSendVar(const std::string& ep,
google::protobuf::Closure* done = brpc::NewCallback(
&HandleSendResponse, cntl, response, var_h, ch_ptr, ch_ctx, this);
platform::RecordRPCEvent record_event(method, p_ctx);
platform::RecordRPCEvent record_event(method);
ch_ctx->stub->SendVariable(cntl, &request, response, done);
......@@ -184,7 +184,7 @@ VarHandlePtr BRPCClient::_AsyncGetVar(const std::string& ep,
google::protobuf::Closure* done = brpc::NewCallback(
&HandleGetResponse, cntl, response, var_h, ch_ptr, ch_ctx, this);
platform::RecordRPCEvent record_event(method, p_ctx);
platform::RecordRPCEvent record_event(method);
if (method_name == kGetMonomerRPC) {
ch_ctx->stub->GetMonomerVariable(cntl, &req, response, done);
......@@ -272,7 +272,7 @@ VarHandlePtr BRPCClient::AsyncPrefetchVar(const std::string& ep,
&cntl->request_attachment(), out_var_name_val,
false, 0, table_name_val);
platform::RecordRPCEvent record_event(method, p_ctx);
platform::RecordRPCEvent record_event(method);
google::protobuf::Closure* done = brpc::NewCallback(
&HandleGetResponse, cntl, response, var_h, ch_ptr, ch_ctx, this);
......@@ -311,7 +311,7 @@ VarHandlePtr BRPCClient::AsyncSendFetchBarrier(const std::string& ep,
VarHandlePtr var_h(
new VarHandle(ep, method, FETCH_BARRIER_MESSAGE, nullptr, nullptr));
platform::RecordRPCEvent record_event(method, nullptr);
platform::RecordRPCEvent record_event(method);
google::protobuf::Closure* done = brpc::NewCallback(
&HandleFetchBarrierResponse, cntl, response, var_h, ch_ptr, ch_ctx, this);
......@@ -406,7 +406,7 @@ VarHandlePtr BRPCClient::AsyncSendVarMessage(
sendrecv::VoidMessage* response = new sendrecv::VoidMessage();
cntl->set_timeout_ms(time_out);
platform::RecordRPCEvent record_event(method_name, nullptr);
platform::RecordRPCEvent record_event(method_name);
VarHandlePtr var_h(
new VarHandle(ep, method_name, req.varname(), nullptr, nullptr));
......
......@@ -89,7 +89,7 @@ VarHandlePtr GRPCClient::AsyncSendVar(const std::string& ep,
// stub context
s->response_call_back_ = nullptr;
platform::RecordRPCEvent record_event(method, p_ctx);
platform::RecordRPCEvent record_event(method);
auto call = s->stub_g_.PrepareUnaryCall(
s->context_.get(), "/sendrecv.SendRecvService/SendVariable", req, &cq_);
......@@ -184,7 +184,7 @@ VarHandlePtr GRPCClient::_AsyncGetVar(
// stub context
s->response_call_back_ = ProcGetResponse;
platform::RecordRPCEvent record_event(method, p_ctx);
platform::RecordRPCEvent record_event(method);
auto call =
s->stub_g_.PrepareUnaryCall(s->context_.get(), rpc_path, buf, &cq_);
......@@ -235,7 +235,7 @@ VarHandlePtr GRPCClient::AsyncPrefetchVar(const std::string& ep,
// stub context
s->response_call_back_ = ProcGetResponse;
platform::RecordRPCEvent record_event(method, p_ctx);
platform::RecordRPCEvent record_event(method);
auto call = s->stub_g_.PrepareUnaryCall(
s->context_.get(), "/sendrecv.SendRecvService/PrefetchVariable", req,
......@@ -265,7 +265,7 @@ VarHandlePtr GRPCClient::AsyncSendBatchBarrier(const std::string& ep,
sendrecv::VariableMessage req;
req.set_varname(BATCH_BARRIER_MESSAGE);
platform::RecordRPCEvent record_event(method, nullptr);
platform::RecordRPCEvent record_event(method);
auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_);
rpc->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
......@@ -290,7 +290,7 @@ VarHandlePtr GRPCClient::AsyncSendFetchBarrier(const std::string& ep,
sendrecv::VariableMessage req;
req.set_varname(FETCH_BARRIER_MESSAGE);
platform::RecordRPCEvent record_event(method, nullptr);
platform::RecordRPCEvent record_event(method);
auto rpc = s->stub_->AsyncGetVariable(s->context_.get(), req, &cq_);
rpc->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
......@@ -317,7 +317,7 @@ VarHandlePtr GRPCClient::AsyncGetMonomerBarrier(const std::string& ep,
sendrecv::VariableMessage req;
req.set_varname(var_name);
platform::RecordRPCEvent record_event(method, nullptr);
platform::RecordRPCEvent record_event(method);
auto rpc = s->stub_->AsyncGetMonomerBarrier(s->context_.get(), req, &cq_);
rpc->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
......@@ -342,7 +342,7 @@ VarHandlePtr GRPCClient::AsyncSendComplete(const std::string& ep,
sendrecv::VariableMessage req;
req.set_varname(COMPLETE_MESSAGE);
platform::RecordRPCEvent record_event(method, nullptr);
platform::RecordRPCEvent record_event(method);
auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_);
rpc->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
......@@ -372,7 +372,7 @@ VarHandlePtr GRPCClient::AsyncCheckpointNotify(const std::string& ep,
req.set_varname(CHECKPOINT_SAVE_MESSAGE);
req.set_out_varname(dir);
platform::RecordRPCEvent record_event(method, nullptr);
platform::RecordRPCEvent record_event(method);
auto rpc = s->stub_->AsyncCheckpointNotify(s->context_.get(), req, &cq_);
rpc->Finish(&s->reply_, &s->status_, reinterpret_cast<void*>(s));
......
......@@ -38,7 +38,7 @@ void SerializeToByteBuffer(const std::string& name, framework::Variable* var,
::grpc::ByteBuffer* msg, const std::string& out_name,
const int trainer_id,
const std::string& table_name) {
platform::RecordRPCEvent record_event("serial", &ctx);
platform::RecordRPCEvent record_event("serial");
VarMsg request;
TensorPayload* payload = nullptr;
......@@ -147,7 +147,7 @@ void DeserializeFromByteBuffer(const ::grpc::ByteBuffer& msg,
const platform::DeviceContext& ctx,
const framework::Scope* scope,
framework::Variable** var, int* trainer_id) {
platform::RecordRPCEvent record_event("deserial", &ctx);
platform::RecordRPCEvent record_event("deserial");
operators::distributed::GRPCVariableResponse resp(scope, &ctx);
PADDLE_ENFORCE(resp.Parse(msg) == 0, "parse bytebuffer to tensor error!");
*var = resp.GetVar();
......
......@@ -47,7 +47,7 @@ void FCOp::InferShape(framework::InferShapeContext* ctx) const {
PADDLE_ENFORCE(in_dims.size() == 2 || in_dims.size() == 4,
"Fully Connected input should be 2-D or 4-D tensor.");
}
PADDLE_ENFORCE_EQ(w_dims.size(), 2UL,
PADDLE_ENFORCE_EQ(w_dims.size(), 2,
"Fully Connected input should be 2-D tensor.");
int in_num_col_dims = ctx->Attrs().Get<int>("in_num_col_dims");
PADDLE_ENFORCE_GT(
......
......@@ -47,10 +47,11 @@ struct EmbeddingVSumFunctor {
auto *output = output_t->mutable_data<T>(context.GetPlace());
PADDLE_ENFORCE_LE(table_width * idx_width, out_width);
PADDLE_ENFORCE_GT(ids_lod.size(), 1UL);
jit::emb_seq_pool_attr_t attr(table_height, table_width, 0, idx_width,
out_width, jit::SeqPoolType::kSum);
for (int64_t i = 0; i != ids_lod.size() - 1; ++i) {
for (size_t i = 0; i != ids_lod.size() - 1; ++i) {
attr.index_height = ids_lod[i + 1] - ids_lod[i];
auto emb_seqpool = jit::Get<jit::kEmbSeqPool, jit::EmbSeqPoolTuples<T>,
platform::CPUPlace>(attr);
......
......@@ -37,7 +37,7 @@ void FusionRepeatedFCReluOp::InferShape(
"Output(Out) of FusionRepeatedFCReluOp should not be null.");
auto i_dims = ctx->GetInputDim("X");
PADDLE_ENFORCE_EQ(i_dims.size(), 2UL, "Input shape size should be 2");
PADDLE_ENFORCE_EQ(i_dims.size(), 2, "Input shape size should be 2");
auto w_dims = ctx->GetInputsDim("W");
auto b_dims = ctx->GetInputsDim("Bias");
......@@ -49,7 +49,7 @@ void FusionRepeatedFCReluOp::InferShape(
"inpute width should be equal with weight height");
for (size_t i = 1; i < sz; ++i) {
PADDLE_ENFORCE_EQ(w_dims[i].size(), 2UL,
PADDLE_ENFORCE_EQ(w_dims[i].size(), 2,
"Every weight shape size should be 2.");
PADDLE_ENFORCE_EQ(framework::product(b_dims[i]), w_dims[i][1],
"The length of Bias must be equal with w_dims[1].");
......
......@@ -39,7 +39,7 @@ void FusionSeqExpandConcatFCOp::InferShape(
auto ins_dims = ctx->GetInputsDim("X");
auto w_dims = ctx->GetInputDim("FCWeight"); // (M0+M1+M2+..) x D
PADDLE_ENFORCE_EQ(w_dims.size(), 2UL, "Input(FCWeight)'s rank must be 2.");
PADDLE_ENFORCE_EQ(w_dims.size(), 2, "Input(FCWeight)'s rank must be 2.");
const int D = w_dims[1];
int sum = ins_dims[0][1];
for (size_t i = 1; i < ins_dims.size(); ++i) {
......
......@@ -39,7 +39,7 @@ void FusionSeqPoolConcatOp::InferShape(
// The output height should be confirmed in Compute,
// since input lod is not accessible here.
PADDLE_ENFORCE_EQ(ins_dims[0].size(), 2UL,
PADDLE_ENFORCE_EQ(ins_dims[0].size(), 2,
"The dims size of first input should be 2.");
ctx->SetOutputDim("Out", {-1, ins_dims[0][axis] * static_cast<int>(n)});
}
......
......@@ -42,7 +42,7 @@ void FusionSquaredMatSubOp::InferShape(
auto y_dims = ctx->GetInputDim("Y");
PADDLE_ENFORCE_EQ(x_dims.size(), y_dims.size(),
"Input tensors dims size should be equal.");
PADDLE_ENFORCE_EQ(x_dims.size(), 2UL, "Input tensors should be a Matrix.");
PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input tensors should be a Matrix.");
PADDLE_ENFORCE_EQ(x_dims[1], y_dims[0], "Inputs Matrix should be multiply.");
ctx->SetOutputDim("SquaredX", x_dims);
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License. */
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <random>
#include <string>
......@@ -259,7 +259,7 @@ struct TestFuncWithRefer<jit::SeqPoolTuples<T>, std::vector<T>, std::vector<T>,
const std::vector<T>& x, const std::vector<T>& yref,
const typename jit::SeqPoolTuples<T>::attr_type& attr) {
EXPECT_TRUE(tgt != nullptr);
EXPECT_EQ(x.size() % yref.size(), 0);
EXPECT_EQ(x.size() % yref.size(), static_cast<size_t>(0));
int w = yref.size();
std::vector<T> y(w);
const T* x_data = x.data();
......
......@@ -44,11 +44,11 @@ class LayerNormOp : public framework::OperatorWithKernel {
int left = static_cast<int>(matrix_dim[0]);
int right = static_cast<int>(matrix_dim[1]);
if (ctx->HasInput("Scale")) {
PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale").size(), 1UL);
PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale").size(), 1);
PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale")[0], right);
}
if (ctx->HasInput("Bias")) {
PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias").size(), 1UL);
PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias").size(), 1);
PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias")[0], right);
}
......
......@@ -144,12 +144,12 @@ class LinearChainCRFOp : public framework::OperatorWithKernel {
"Output(LogLikelihood) should be not null.");
auto emission_dims = ctx->GetInputDim("Emission");
PADDLE_ENFORCE_EQ(emission_dims.size(), 2UL,
PADDLE_ENFORCE_EQ(emission_dims.size(), 2,
"The Input(Emission) should be a 2-D tensor.");
PADDLE_ENFORCE(emission_dims[0], "An empty mini-batch is not allowed.");
auto transition_dims = ctx->GetInputDim("Transition");
PADDLE_ENFORCE_EQ(transition_dims.size(), 2UL,
PADDLE_ENFORCE_EQ(transition_dims.size(), 2,
"The Input(Transition) should be a 2-D tensor.");
PADDLE_ENFORCE_EQ(
transition_dims[0] - 2, transition_dims[1],
......@@ -202,13 +202,13 @@ class LinearChainCRFGradOp : public framework::OperatorWithKernel {
"Input(LogLikelihood@GRAD) shoudl be not null.");
auto emission_exps_dims = ctx->GetInputDim("EmissionExps");
PADDLE_ENFORCE_EQ(emission_exps_dims.size(), 2UL,
PADDLE_ENFORCE_EQ(emission_exps_dims.size(), 2,
"The Input(EmissionExps) should be a 2-D tensor.");
PADDLE_ENFORCE(emission_exps_dims[0],
"An empty mini-batch is not allowed.");
auto transition_exps_dims = ctx->GetInputDim("TransitionExps");
PADDLE_ENFORCE_EQ(transition_exps_dims.size(), 2UL,
PADDLE_ENFORCE_EQ(transition_exps_dims.size(), 2,
"The Input(TransitionExps) should be a 2-D tensor.");
PADDLE_ENFORCE_EQ(
transition_exps_dims[0] - 2, transition_exps_dims[1],
......
......@@ -184,6 +184,9 @@ class Blas {
template <typename T>
void VINV(int n, const T* a, T* y) const;
template <typename T>
void VMERF(int n, const T* a, T* y, int64_t mode) const;
private:
const DeviceContext& context_;
};
......@@ -290,6 +293,11 @@ class BlasT : private Blas<DeviceContext> {
Base()->template VINV<T>(args...);
}
template <typename... ARGS>
void VMERF(ARGS... args) const {
Base()->template VMERF<T>(args...);
}
private:
const Blas<DeviceContext>* Base() const {
return static_cast<const Blas<DeviceContext>*>(this);
......
......@@ -123,6 +123,11 @@ struct CBlas<float> {
static void VINV(ARGS... args) {
platform::dynload::vsInv(args...);
}
template <typename... ARGS>
static void VMERF(ARGS... args) {
platform::dynload::vmsErf(args...);
}
};
template <>
......@@ -223,6 +228,11 @@ struct CBlas<double> {
static void VINV(ARGS... args) {
platform::dynload::vdInv(args...);
}
template <typename... ARGS>
static void VMERF(ARGS... args) {
platform::dynload::vmdErf(args...);
}
};
#else
......@@ -625,6 +635,19 @@ void Blas<DeviceContext>::VINV(int n, const T *a, T *y) const {
#endif
}
template <>
template <typename T>
void Blas<platform::CPUDeviceContext>::VMERF(int n, const T *a, T *y,
int64_t mode) const {
#ifdef PADDLE_WITH_MKLML
CBlas<T>::VMERF(n, a, y, mode);
#else
for (int i = 0; i < n; ++i) {
y[i] = std::erf(a[i]);
}
#endif
}
} // namespace math
} // namespace operators
} // namespace paddle
......@@ -52,11 +52,6 @@ class MKLDNNActivationKernel
"Wrong layout/format set for Input x tensor");
Functor functor;
auto attrs = functor.GetAttrs();
for (auto &attr : attrs) {
*attr.second = ctx.Attr<float>(attr.first);
}
functor(ctx);
}
};
......@@ -76,11 +71,6 @@ class MKLDNNActivationGradKernel
"is_test attribute should be set to False in training phase.");
Functor functor;
auto attrs = functor.GetAttrs();
for (auto &attr : attrs) {
*attr.second = ctx.Attr<float>(attr.first);
}
functor(ctx);
}
};
......
......@@ -2,4 +2,5 @@ if(WITH_NGRAPH)
cc_library(ngraph_bridge SRCS ngraph_bridge.cc DEPS operator framework_proto ngraph)
cc_library(ngraph_engine SRCS ngraph_engine.cc DEPS ngraph_bridge framework_proto)
op_library(ngraph_engine_op DEPS ngraph_engine op_registry op_info device_context)
add_subdirectory(ops)
endif()
......@@ -19,49 +19,21 @@ limitations under the License. */
#include "ngraph/ngraph.hpp"
#include "paddle/fluid/operators/ngraph/ngraph_bridge.h"
#include "paddle/fluid/operators/ngraph/ngraph_ops.h"
#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/ngraph_helper.h"
namespace paddle {
namespace operators {
namespace NG_OPS = paddle::operators::ngraphs;
std::map<std::string,
std::function<void(const std::shared_ptr<framework::OperatorBase>&,
std::shared_ptr<std::unordered_map<
std::string, std::shared_ptr<ngraph::Node>>>)>>
NgraphBridge::NG_NODE_MAP = {
{"accuracy", NG_OPS::BuildAccuracyNode},
{"conv2d", NG_OPS::BuildConv2dNode},
{"conv2d_grad", NG_OPS::BuildConv2dGradNode},
{"batch_norm", NG_OPS::BuildBatchNormNode},
{"batch_norm_grad", NG_OPS::BuildBatchNormGradNode},
{"cross_entropy", NG_OPS::BuildCrossEntropyNode},
{"cross_entropy_grad", NG_OPS::BuildCrossEntropyGradNode},
{"elementwise_add", NG_OPS::BuildElementwiseAddNode},
{"elementwise_add_grad", NG_OPS::BuildElementwiseAddGradNode},
{"fill_constant", NG_OPS::BuildFillConstantNode},
{"mean", NG_OPS::BuildMeanNode},
{"mean_grad", NG_OPS::BuildMeanGradNode},
{"mul", NG_OPS::BuildMulNode},
{"mul_grad", NG_OPS::BuildMulGradNode},
{"pool2d", NG_OPS::BuildPool2dNode},
{"pool2d_grad", NG_OPS::BuildPool2dGradNode},
{"softmax", NG_OPS::BuildSoftmaxNode},
{"softmax_grad", NG_OPS::BuildSoftmaxGradNode},
{"scale", NG_OPS::BuildScaleNode},
{"sigmoid", NG_OPS::BuildUnaryNode<ngraph::op::Sigmoid>},
{"sum", NG_OPS::BuildSumNode},
{"relu", NG_OPS::BuildUnaryNode<ngraph::op::Relu>},
{"relu_grad", NG_OPS::BuildReluGradNode},
{"tanh", NG_OPS::BuildUnaryNode<ngraph::op::Tanh>},
{"tanh_grad", NG_OPS::BuildTanhGradNode},
{"top_k", NG_OPS::BuildTopKNode}};
bool NgraphBridge::isRegister(const std::string& str) {
return ops::NgraphSingleton::Lookup(str);
}
void NgraphBridge::BuildNgNode(
const std::shared_ptr<framework::OperatorBase>& op) {
auto& op_type = op->Type();
NG_NODE_MAP[op_type](op, ngb_node_map_);
ops::NgraphSingleton::BuildNode(ngb_node_map_, op, op_type);
}
} // namespace operators
......
......@@ -28,13 +28,6 @@ namespace operators {
class NgraphBridge {
public:
static std::map<
std::string,
std::function<void(const std::shared_ptr<framework::OperatorBase>&,
std::shared_ptr<std::unordered_map<
std::string, std::shared_ptr<ngraph::Node>>>)>>
NG_NODE_MAP;
explicit NgraphBridge(
std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
......@@ -43,6 +36,8 @@ class NgraphBridge {
void BuildNgNode(const std::shared_ptr<framework::OperatorBase>& op);
static bool isRegister(const std::string& str);
private:
std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
......
......@@ -88,14 +88,12 @@ static std::vector<std::vector<int>> NgraphOpIntervals(
int pivot = left;
while (pivot < right) {
auto op_type = ops.at(pivot)->Type();
if (NgraphBridge::NG_NODE_MAP.find(op_type) ==
NgraphBridge::NG_NODE_MAP.end()) {
if (NgraphBridge::isRegister(op_type)) {
++pivot;
} else {
int start = pivot, end = start;
while (pivot < right &&
(NgraphBridge::NG_NODE_MAP.find(ops.at(pivot)->Type()) !=
NgraphBridge::NG_NODE_MAP.end())) {
(!NgraphBridge::isRegister(ops.at(pivot)->Type()))) {
++pivot;
++end;
}
......
file(GLOB LIST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.h")
set(pass_file ${PADDLE_BINARY_DIR}/paddle/fluid/operators/ngraph/ngraph_ops.h)
file(APPEND ${pass_file} "\#pragma once\n")
file(WRITE ${pass_file} "// Generated by the /paddle/fluid/operators/ngraph/ops/CMakeLists.txt. DO NOT EDIT!\n\n")
foreach(OPS_NAME ${LIST_OPS})
file(APPEND ${pass_file} "\#include \"paddle/fluid/operators/ngraph/ops/${OPS_NAME}\"\n")
endforeach(OPS_NAME)
......@@ -17,6 +17,7 @@ limitations under the License. */
#include <string>
#include <vector>
#include "ngraph/ngraph.hpp"
#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
#include "paddle/fluid/platform/ngraph_helper.h"
namespace paddle {
......@@ -63,3 +64,5 @@ void BuildAccuracyNode(
} // namespace ngraphs
} // namespace operators
} // namespace paddle
REGISTER_NG_OP(accuracy, BuildAccuracyNode);
......@@ -17,6 +17,7 @@ limitations under the License. */
#include <string>
#include "ngraph/ngraph.hpp"
#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
#include "paddle/fluid/platform/ngraph_helper.h"
namespace paddle {
......@@ -50,3 +51,6 @@ void BuildTanhGradNode(
} // namespace ngraphs
} // namespace operators
} // namespace paddle
REGISTER_NG_OP(relu_grad, BuildReluGradNode);
REGISTER_NG_OP(than_grad, BuildTanhGradNode);
......@@ -20,6 +20,7 @@ limitations under the License. */
#include "ngraph/ngraph.hpp"
#include "paddle/fluid/operators/ngraph/ops/elementwise_node.h"
#include "paddle/fluid/operators/ngraph/ops/elementwise_scalar_op.h"
#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
#include "paddle/fluid/platform/ngraph_helper.h"
namespace paddle {
......@@ -155,3 +156,6 @@ void BuildBatchNormGradNode(
} // namespace ngraphs
} // namespace operators
} // namespace paddle
REGISTER_NG_OP(batch_norm, BuildBatchNormNode);
REGISTER_NG_OP(batch_norm_grad, BuildBatchNormGradNode);
......@@ -16,6 +16,7 @@ limitations under the License. */
#include <string>
#include "ngraph/ngraph.hpp"
#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
#include "paddle/fluid/platform/ngraph_helper.h"
namespace paddle {
......@@ -47,3 +48,7 @@ static void BuildUnaryNode(
} // namespace ngraphs
} // namespace operators
} // namespace paddle
REGISTER_NG_OP(relu, BuildUnaryNode<ngraph::op::Relu>);
REGISTER_NG_OP(tanh, BuildUnaryNode<ngraph::op::Tanh>);
REGISTER_NG_OP(sigmoid, BuildUnaryNode<ngraph::op::Sigmoid>);
......@@ -17,6 +17,7 @@ limitations under the License. */
#include <string>
#include <vector>
#include "ngraph/ngraph.hpp"
#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
#include "paddle/fluid/platform/ngraph_helper.h"
namespace paddle {
......@@ -233,3 +234,6 @@ void BuildConv2dGradNode(
} // namespace ngraphs
} // namespace operators
} // namespace paddle
REGISTER_NG_OP(conv2d, BuildConv2dNode);
REGISTER_NG_OP(conv2d_grad, BuildConv2dGradNode);
......@@ -18,6 +18,7 @@ limitations under the License. */
#include <string>
#include "ngraph/ngraph.hpp"
#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
#include "paddle/fluid/platform/ngraph_helper.h"
namespace paddle {
......@@ -143,3 +144,6 @@ void BuildCrossEntropyGradNode(
} // namespace ngraphs
} // namespace operators
} // namespace paddle
REGISTER_NG_OP(cross_entropy, BuildCrossEntropyNode);
REGISTER_NG_OP(cross_entropy_grad, BuildCrossEntropyGradNode);
......@@ -19,6 +19,7 @@ limitations under the License. */
#include "ngraph/ngraph.hpp"
#include "paddle/fluid/operators/ngraph/ops/elementwise_node.h"
#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
#include "paddle/fluid/platform/ngraph_helper.h"
namespace paddle {
......@@ -85,3 +86,6 @@ void BuildElementwiseAddGradNode(
} // namespace ngraphs
} // namespace operators
} // namespace paddle
REGISTER_NG_OP(elementwise_add, BuildElementwiseAddNode);
REGISTER_NG_OP(elementwise_add_grad, BuildElementwiseAddGradNode);
......@@ -17,6 +17,7 @@ limitations under the License. */
#include <string>
#include <vector>
#include "ngraph/ngraph.hpp"
#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
#include "paddle/fluid/platform/ngraph_helper.h"
namespace paddle {
......@@ -55,3 +56,5 @@ void BuildFillConstantNode(
} // namespace ngraphs
} // namespace operators
} // namespace paddle
REGISTER_NG_OP(fill_constant, BuildFillConstantNode);
......@@ -19,6 +19,7 @@ limitations under the License. */
#include "ngraph/ngraph.hpp"
#include "paddle/fluid/operators/ngraph/ops/elementwise_scalar_op.h"
#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
#include "paddle/fluid/platform/ngraph_helper.h"
namespace paddle {
......@@ -64,3 +65,6 @@ void BuildMeanGradNode(
} // namespace ngraphs
} // namespace operators
} // namespace paddle
REGISTER_NG_OP(mean, BuildMeanNode);
REGISTER_NG_OP(mean_grad, BuildMeanGradNode);
/*Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include <vector>
#include "ngraph/ngraph.hpp"
#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
#include "paddle/fluid/platform/ngraph_helper.h"
namespace paddle {
namespace operators {
namespace ngraphs {
void BuildMomentumNode(
const std::shared_ptr<paddle::framework::OperatorBase>& op,
std::shared_ptr<
std::unordered_map<std::string, std::shared_ptr<ngraph::Node>>>
ngb_node_map) {
auto op_attrs = paddle::framework::AttrReader(op->Attrs());
auto param = paddle::platform::GetInputNode(op, "Param", ngb_node_map);
auto grad = paddle::platform::GetInputNode(op, "Grad", ngb_node_map);
auto velocity = paddle::platform::GetInputNode(op, "Velocity", ngb_node_map);
auto learning_rate =
paddle::platform::GetInputNode(op, "LearningRate", ngb_node_map);
auto mu = op_attrs.Get<float>("mu");
bool use_nesterov = op_attrs.Get<bool>("use_nesterov");
auto param_shape = param->get_shape();
auto velocity_shape = velocity->get_shape();
auto grad_shape = grad->get_shape();
auto lr_shape = learning_rate->get_shape();
auto shape_velocity = ngraph::Shape{velocity_shape};
auto mu_create =
ngraph::op::Constant::create(ngraph::element::f32, shape_velocity, {mu});
auto vel_mul = std::make_shared<ngraph::op::Multiply>(velocity, mu_create);
auto vel_out = std::make_shared<ngraph::op::Add>(vel_mul, grad);
ngraph::NodeVector result;
if (use_nesterov) {
auto mul_res = std::make_shared<ngraph::op::Multiply>(vel_out, mu_create);
auto add_res = std::make_shared<ngraph::op::Add>(grad, mul_res);
auto add_2d = paddle::platform::FlattenTo2d(add_res->get_shape(), 0);
auto vel_reshape = paddle::platform::NgReshaper(vel_out, add_2d);
auto lr_bcast = std::make_shared<ngraph::op::Broadcast>(
learning_rate, vel_reshape->get_shape(),
ngraph::AxisSet{vel_reshape->get_shape().size() - 1});
auto lr_1d = paddle::platform::FlattenTo1d(lr_bcast->get_shape(), 0);
auto lr_reshape = std::make_shared<ngraph::op::Reshape>(
lr_bcast, ngraph::AxisVector{0, 1}, lr_1d);
lr_reshape = std::make_shared<ngraph::op::Reshape>(
lr_reshape, ngraph::AxisVector{0}, param->get_shape());
auto mul_res1 = std::make_shared<ngraph::op::Multiply>(add_res, lr_reshape);
auto res = std::make_shared<ngraph::op::Subtract>(param, mul_res1);
paddle::platform::SetOutputNode(op, "ParamOut", res, ngb_node_map);
} else {
auto vel_2d = paddle::platform::FlattenTo2d(vel_out->get_shape(), 0);
auto vel_reshape = paddle::platform::NgReshaper(vel_out, vel_2d);
auto lr_bcast = std::make_shared<ngraph::op::Broadcast>(
learning_rate, vel_reshape->get_shape(),
ngraph::AxisSet{vel_reshape->get_shape().size() - 1});
auto lr_1d = paddle::platform::FlattenTo1d(lr_bcast->get_shape(), 0);
auto lr_reshape = std::make_shared<ngraph::op::Reshape>(
lr_bcast, ngraph::AxisVector{0, 1}, lr_1d);
lr_reshape = std::make_shared<ngraph::op::Reshape>(
lr_reshape, ngraph::AxisVector{0}, param->get_shape());
auto mul_result =
std::make_shared<ngraph::op::Multiply>(lr_reshape, vel_out);
auto res = std::make_shared<ngraph::op::Subtract>(param, mul_result);
paddle::platform::SetOutputNode(op, "ParamOut", res, ngb_node_map);
}
paddle::platform::SetOutputNode(op, "VelocityOut", vel_out, ngb_node_map);
}
} // namespace ngraphs
} // namespace operators
} // namespace paddle
REGISTER_NG_OP(momentum, BuildMomentumNode);
......@@ -16,6 +16,7 @@ limitations under the License. */
#include <string>
#include "ngraph/ngraph.hpp"
#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
#include "paddle/fluid/platform/ngraph_helper.h"
namespace paddle {
......@@ -130,3 +131,6 @@ static void BuildMulGradNode(
} // namespace ngraphs
} // namespace operators
} // namespace paddle
REGISTER_NG_OP(mul, BuildMulNode);
REGISTER_NG_OP(mul_grad, BuildMulGradNode);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include <map>
#include <string>
#include <unordered_map>
#include "ngraph/node.hpp"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/operators/ngraph/ngraph_bridge.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace operators {
namespace ops {
class NgraphSingleton {
NgraphSingleton() = default;
NgraphSingleton(NgraphSingleton const&) = delete;
void operator=(NgraphSingleton const) = delete;
~NgraphSingleton() = default;
static std::map<
std::string,
std::function<void(const std::shared_ptr<framework::OperatorBase>&,
std::shared_ptr<std::unordered_map<
std::string, std::shared_ptr<ngraph::Node>>>)>>
ng_node_maps_;
public:
template <typename TF>
static void Register(TF&& tf, const std::string& name) {
ng_node_maps_[name] = tf;
}
static bool Lookup(const std::string& name) {
auto it = ng_node_maps_.find(name);
if (it == ng_node_maps_.end()) {
return true;
}
return false;
}
static void BuildNode(
const std::shared_ptr<std::unordered_map<
std::string, std::shared_ptr<ngraph::Node>>>& ng_maps,
const std::shared_ptr<framework::OperatorBase>& op,
const std::string& name) {
ng_node_maps_[name](op, ng_maps);
}
};
std::map<std::string,
std::function<void(const std::shared_ptr<framework::OperatorBase>&,
std::shared_ptr<std::unordered_map<
std::string, std::shared_ptr<ngraph::Node>>>)>>
NgraphSingleton::ng_node_maps_;
} // namespace ops
} // namespace operators
} // namespace paddle
#define REGISTER_NG_OP(op_type__, Converter__) \
struct ng_##op_type__##_converter { \
ng_##op_type__##_converter() { \
paddle::operators::ops::NgraphSingleton::Register( \
paddle::operators::ngraphs::Converter__, #op_type__); \
} \
}; \
ng_##op_type__##_converter ng_##op_type__##_converter__;
......@@ -18,6 +18,7 @@ limitations under the License. */
#include <vector>
#include "ngraph/ngraph.hpp"
#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
#include "paddle/fluid/platform/ngraph_helper.h"
namespace paddle {
......@@ -172,3 +173,6 @@ void BuildPool2dGradNode(
} // namespace ngraphs
} // namespace operators
} // namespace paddle
REGISTER_NG_OP(pool2d, BuildPool2dNode);
REGISTER_NG_OP(pool2d_grad, BuildPool2dGradNode);
......@@ -17,6 +17,7 @@ limitations under the License. */
#include <string>
#include "ngraph/ngraph.hpp"
#include "paddle/fluid/operators/ngraph/ops/elementwise_scalar_op.h"
#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
#include "paddle/fluid/platform/ngraph_helper.h"
namespace paddle {
......@@ -37,3 +38,5 @@ void BuildScaleNode(
} // namespace ngraphs
} // namespace operators
} // namespace paddle
REGISTER_NG_OP(scale, BuildScaleNode);
......@@ -18,6 +18,7 @@ limitations under the License. */
#include <vector>
#include "ngraph/ngraph.hpp"
#include "paddle/fluid/operators/ngraph/ops/elementwise_scalar_op.h"
#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
#include "paddle/fluid/platform/ngraph_helper.h"
namespace paddle {
......@@ -72,3 +73,6 @@ void BuildSoftmaxGradNode(
} // namespace ngraphs
} // namespace operators
} // namespace paddle
REGISTER_NG_OP(softmax, BuildSoftmaxNode);
REGISTER_NG_OP(softmax_grad, BuildSoftmaxGradNode);
......@@ -16,6 +16,7 @@ limitations under the License. */
#include <string>
#include "ngraph/ngraph.hpp"
#include "paddle/fluid/operators/ngraph/ops/op_bridge.h"
#include "paddle/fluid/platform/ngraph_helper.h"
namespace paddle {
......@@ -42,3 +43,5 @@ void BuildTopKNode(
} // namespace ngraphs
} // namespace operators
} // namespace paddle
REGISTER_NG_OP(top_k, BuildTopKNode);
......@@ -85,9 +85,7 @@ class ReadOp : public framework::OperatorBase {
std::vector<framework::LoDTensor> ins;
// For profiling
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
auto& ctx = *pool.Get(dev_place);
platform::RecordEvent record_event(Type(), &ctx);
platform::RecordEvent record_event(Type());
reader->ReadNext(&ins);
if (ins.empty()) {
......
......@@ -31,10 +31,10 @@ class SequenceEnumerateOp : public framework::OperatorWithKernel {
const auto x_dims = ctx->GetInputDim("X");
PADDLE_ENFORCE_EQ(
x_dims.size(), 2UL,
x_dims.size(), 2,
"Input(X) of SequenceEnumerate operator's rank should be 2.");
PADDLE_ENFORCE_EQ(
x_dims[1], 1UL,
x_dims[1], 1,
"Input(X) of SequenceEnumerate operator's 2nd dimension should be 1.");
const auto win_size = ctx->Attrs().Get<int>("win_size");
......
......@@ -48,10 +48,10 @@ class SequenceExpandOp : public framework::OperatorWithKernel {
auto& x_lod = x_var->Get<LoDTensor>().lod();
auto& y_lod = y_var->Get<LoDTensor>().lod();
PADDLE_ENFORCE_LE(x_lod.size(), 1,
PADDLE_ENFORCE_LE(x_lod.size(), 1UL,
"Level number of Input(X)'s lod should not be "
"greater than 1.");
PADDLE_ENFORCE_GT(y_lod.size(), 0,
PADDLE_ENFORCE_GT(y_lod.size(), 0UL,
"Level number of Input(Y)'s lod should be "
"greater than 0.");
PADDLE_ENFORCE(
......@@ -69,7 +69,8 @@ class SequenceExpandOp : public framework::OperatorWithKernel {
"size of Input(X)'s first level lod should be equal to "
"size of Input(Y)'s referred level lod.");
} else {
PADDLE_ENFORCE_EQ(x_dims[0], y_lod[ref_level].size() - 1,
PADDLE_ENFORCE_EQ(x_dims[0],
static_cast<int64_t>(y_lod[ref_level].size()) - 1,
"When Input(X)'s lod is null, the dims[0] of "
"Input(X) should match the "
"size of Input(Y)'s referred level lod.");
......
......@@ -35,14 +35,15 @@ class ShapeOp : public framework::OperatorWithKernel {
class ShapeOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("Input", "(Tensor), The input tensor.");
AddOutput("Out",
"(Tensor), The shape of input tensor, the data type of the shape"
AddInput("Input", "(LoDTensor), The input tensor.");
AddOutput(
"Out",
"(LoDTensor), The shape of input tensor, the data type of the shape"
" is int32_t, will be on the same device with the input Tensor.");
AddComment(R"DOC(
Shape Operator
Shape Operator.
Get the shape of input tensor. Only support CPU input Tensor now.
Return the shape of the input.
)DOC");
}
};
......
......@@ -36,7 +36,7 @@ cc_test(cpu_info_test SRCS cpu_info_test.cc DEPS cpu_info)
nv_library(gpu_info SRCS gpu_info.cc DEPS gflags glog enforce)
cc_library(place SRCS place.cc DEPS enforce boost lib_any)
cc_library(place SRCS place.cc DEPS enforce boost)
cc_test(place_test SRCS place_test.cc DEPS place glog gflags)
add_subdirectory(dynload)
......@@ -87,8 +87,12 @@ nv_test(transform_test SRCS transform_test.cu DEPS memory place device_context)
cc_library(timer SRCS timer.cc)
cc_test(timer_test SRCS timer_test.cc DEPS timer)
cc_library(device_tracer SRCS device_tracer.cc DEPS boost profiler_proto framework_proto ${GPU_CTX_DEPS})
cc_library(profiler SRCS profiler.cc DEPS device_context device_tracer)
cc_library(device_tracer SRCS device_tracer.cc DEPS boost profiler_proto framework_proto device_context ${GPU_CTX_DEPS})
if(WITH_GPU)
nv_library(profiler SRCS profiler.cc profiler.cu DEPS device_context device_tracer)
else()
cc_library(profiler SRCS profiler.cc DEPS device_context device_tracer)
endif()
cc_test(profiler_test SRCS profiler_test.cc DEPS profiler)
nv_test(float16_gpu_test SRCS float16_test.cu DEPS lod_tensor)
......
......@@ -291,7 +291,7 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place)
if (dynload::HasCUDNN()) {
auto local_cudnn_version = cudnn_dso_ver / 100;
auto compile_cudnn_version = CUDNN_VERSION / 100;
if (local_cudnn_version < compile_cudnn_version) {
if (local_cudnn_version < static_cast<size_t>(compile_cudnn_version)) {
LOG_FIRST_N(WARNING, 1)
<< "WARNING: device: " << place_.device
<< ". The installed Paddle is compiled with CUDNN "
......
......@@ -14,17 +14,23 @@ limitations under the License. */
#include "paddle/fluid/platform/device_tracer.h"
#include <deque>
#include <forward_list>
#include <fstream>
#include <list>
#include <map>
#include <mutex> // NOLINT
#include <numeric>
#include <sstream>
#include <string>
#include <thread> // NOLINT
#include <unordered_map>
#include <utility>
#include <vector>
#include "glog/logging.h"
#include "google/protobuf/text_format.h"
#include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/string/printf.h"
namespace paddle {
......@@ -33,17 +39,31 @@ namespace {
// Tracking the nested block stacks of each thread.
thread_local std::deque<int> block_id_stack;
// Tracking the nested event stacks.
thread_local std::deque<std::string> annotation_stack;
thread_local std::deque<Event *> annotation_stack;
std::map<uint32_t, int32_t> system_thread_id_map;
std::once_flag tracer_once_flag;
DeviceTracer *tracer = nullptr;
void PrintCuptiHint() {
static bool showed = false;
if (showed) return;
showed = true;
LOG(WARNING) << "Invalid timestamp occured. Please try increasing the "
"FLAGS_multiple_of_cupti_buffer_size.";
}
} // namespace
#ifdef PADDLE_WITH_CUPTI
namespace {
// TODO(panyx0718): Revisit the buffer size here.
uint64_t kBufSize = 32 * 1024;
// The experimental best performance is
// the same size with CUPTI device buffer size(8M)
uint64_t kBufSize = 1024 * 1024 * 8;
uint64_t kAlignSize = 8;
std::unordered_map<CUpti_CallbackId, std::string> runtime_cbid_str,
driver_cbid_str;
#define ALIGN_BUFFER(buffer, align) \
(((uintptr_t)(buffer) & ((align)-1)) \
......@@ -92,15 +112,33 @@ std::string MemcpyKind(CUpti_ActivityMemcpyKind kind) {
return "MEMCPY";
}
std::string DriverKind(CUpti_CallbackId cbid) {
auto iter = driver_cbid_str.find(cbid);
if (iter == driver_cbid_str.end())
return "Driver API " + std::to_string(cbid);
return iter->second;
}
std::string RuntimeKind(CUpti_CallbackId cbid) {
auto iter = runtime_cbid_str.find(cbid);
if (iter == runtime_cbid_str.end())
return "Runtime API " + std::to_string(cbid);
return iter->second;
}
void EnableActivity() {
// Device activity record is created when CUDA initializes, so we
// want to enable it before cuInit() or any CUDA runtime call.
CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MEMCPY));
CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_KERNEL));
CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_DEVICE));
CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MEMSET));
CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_OVERHEAD));
CUPTI_CALL(
dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL));
// CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_KERNEL));
CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_DRIVER));
CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_RUNTIME));
// We don't track these activities for now.
// CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_MEMSET));
// CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_OVERHEAD));
// CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_DEVICE));
// CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_CONTEXT));
// CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_DRIVER));
// CUPTI_CALL(dynload::cuptiActivityEnable(CUPTI_ACTIVITY_KIND_RUNTIME));
......@@ -110,16 +148,17 @@ void EnableActivity() {
void DisableActivity() {
CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_MEMCPY));
CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_KERNEL));
CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_DEVICE));
CUPTI_CALL(
dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL));
// CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_DEVICE));
// Disable all other activity record kinds.
CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_CONTEXT));
// CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_CONTEXT));
CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_DRIVER));
CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_RUNTIME));
CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_MEMSET));
CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_NAME));
CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_MARKER));
CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_OVERHEAD));
// CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_MEMSET));
// CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_NAME));
// CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_MARKER));
// CUPTI_CALL(dynload::cuptiActivityDisable(CUPTI_ACTIVITY_KIND_OVERHEAD));
}
void CUPTIAPI bufferRequested(uint8_t **buffer, size_t *size,
......@@ -132,6 +171,11 @@ void CUPTIAPI bufferRequested(uint8_t **buffer, size_t *size,
void CUPTIAPI bufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t *buffer,
size_t size, size_t validSize) {
static std::thread::id cupti_thread_id(0);
if (cupti_thread_id == std::thread::id(0))
cupti_thread_id = std::this_thread::get_id();
PADDLE_ENFORCE_EQ(std::this_thread::get_id(), cupti_thread_id,
"Only one thread is allowed to call bufferCompleted()");
CUptiResult status;
CUpti_Activity *record = NULL;
if (validSize > 0) {
......@@ -168,6 +212,23 @@ void CUPTIAPI bufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t *buffer,
memcpy->correlationId, memcpy->bytes);
break;
}
case CUPTI_ACTIVITY_KIND_DRIVER: {
auto *api = reinterpret_cast<const CUpti_ActivityAPI *>(record);
if (api->start != 0 && api->end != 0)
// -1 device id represents CUDA api call
tracer->AddCPURecords(
DriverKind(api->cbid), api->start, api->end, -1,
GetThreadIdFromSystemThreadId(api->threadId));
break;
}
case CUPTI_ACTIVITY_KIND_RUNTIME: {
auto *api = reinterpret_cast<const CUpti_ActivityAPI *>(record);
if (api->start != 0 && api->end != 0)
tracer->AddCPURecords(
RuntimeKind(api->cbid), api->start, api->end, -1,
GetThreadIdFromSystemThreadId(api->threadId));
break;
}
default: { break; }
}
} else if (status == CUPTI_ERROR_MAX_LIMIT_REACHED) {
......@@ -183,21 +244,35 @@ void CUPTIAPI bufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t *buffer,
dynload::cuptiActivityGetNumDroppedRecords(ctx, streamId, &dropped));
if (dropped != 0) {
fprintf(stderr, "Dropped %u activity records\n", (unsigned int)dropped);
PrintCuptiHint();
}
}
free(buffer);
}
void initCuptiCbidStr();
} // namespace
#endif // PADDLE_WITH_CUPTI
class DeviceTracerImpl : public DeviceTracer {
public:
DeviceTracerImpl() : enabled_(false) {}
DeviceTracerImpl() : enabled_(false) {
#ifdef PADDLE_WITH_CUPTI
initCuptiCbidStr();
#endif
}
void AddAnnotation(uint64_t id, const std::string &anno) {
void AddAnnotation(uint32_t id, Event *event) {
thread_local std::forward_list<std::pair<uint32_t, Event *>>
*local_correlations_pairs = nullptr;
if (local_correlations_pairs == nullptr) {
std::lock_guard<std::mutex> l(trace_mu_);
correlations_[id] = anno;
correlations_pairs.emplace_front();
local_correlations_pairs = &correlations_pairs.front();
}
local_correlations_pairs->push_front(std::make_pair(id, event));
}
void AddCPURecords(const std::string &anno, uint64_t start_ns,
......@@ -206,8 +281,13 @@ class DeviceTracerImpl : public DeviceTracer {
VLOG(1) << "Empty timeline annotation.";
return;
}
thread_local std::forward_list<CPURecord> *local_cpu_records_ = nullptr;
if (local_cpu_records_ == nullptr) {
std::lock_guard<std::mutex> l(trace_mu_);
cpu_records_.push_back(
cpu_records_.emplace_front();
local_cpu_records_ = &cpu_records_.front();
}
local_cpu_records_->push_front(
CPURecord{anno, start_ns, end_ns, device_id, thread_id});
}
......@@ -215,12 +295,13 @@ class DeviceTracerImpl : public DeviceTracer {
uint64_t end_ns, int64_t device_id, int64_t stream_id,
uint32_t correlation_id, uint64_t bytes) {
// 0 means timestamp information could not be collected for the kernel.
if (start_ns == 0 || end_ns == 0) {
if (start_ns == 0 || end_ns == 0 || start_ns == end_ns) {
VLOG(3) << name << " cannot be traced";
PrintCuptiHint();
return;
}
std::lock_guard<std::mutex> l(trace_mu_);
mem_records_.push_back(MemRecord{name, start_ns, end_ns, device_id,
// NOTE(liangdun): lock is not needed, only one thread call this function.
mem_records_.push_front(MemRecord{name, start_ns, end_ns, device_id,
stream_id, correlation_id, bytes});
}
......@@ -228,12 +309,13 @@ class DeviceTracerImpl : public DeviceTracer {
int64_t device_id, int64_t stream_id,
uint32_t correlation_id) {
// 0 means timestamp information could not be collected for the kernel.
if (start == 0 || end == 0) {
if (start == 0 || end == 0 || start == end) {
VLOG(3) << correlation_id << " cannot be traced";
PrintCuptiHint();
return;
}
std::lock_guard<std::mutex> l(trace_mu_);
kernel_records_.push_back(
// NOTE(liangdun): lock is not needed, only one thread call this function.
kernel_records_.push_front(
KernelRecord{name, start, end, device_id, stream_id, correlation_id});
}
......@@ -263,25 +345,80 @@ class DeviceTracerImpl : public DeviceTracer {
} else if (ret != CUPTI_SUCCESS) {
fprintf(stderr, "Failed to create CUPTI subscriber.\n");
}
CUPTI_CALL(
dynload::cuptiEnableCallback(1, subscriber_, CUPTI_CB_DOMAIN_DRIVER_API,
CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel));
const std::vector<int> cbids {
CUPTI_RUNTIME_TRACE_CBID_cudaMemcpy_v3020,
CUPTI_RUNTIME_TRACE_CBID_cudaMemcpyAsync_v3020,
CUPTI_RUNTIME_TRACE_CBID_cudaLaunch_v3020,
CUPTI_RUNTIME_TRACE_CBID_cudaLaunchKernel_v7000
#if CUDA_VERSION >= 9000
,
CUPTI_RUNTIME_TRACE_CBID_cudaLaunchCooperativeKernel_v9000,
CUPTI_RUNTIME_TRACE_CBID_cudaLaunchCooperativeKernelMultiDevice_v9000
#endif
};
for (auto cbid : cbids)
CUPTI_CALL(dynload::cuptiEnableCallback(
1, subscriber_, CUPTI_CB_DOMAIN_RUNTIME_API, cbid));
CUPTI_CALL(dynload::cuptiGetTimestamp(&start_ns_));
#endif // PADDLE_WITH_CUPTI
enabled_ = true;
}
void Reset() {
#ifdef PADDLE_WITH_CUPTI
CUPTI_CALL(
dynload::cuptiActivityFlushAll(CUPTI_ACTIVITY_FLAG_FLUSH_FORCED));
#endif
std::lock_guard<std::mutex> l(trace_mu_);
kernel_records_.clear();
mem_records_.clear();
correlations_.clear();
for (auto &tmp : correlations_pairs) tmp.clear();
for (auto &tmp : cpu_records_) tmp.clear();
}
void GenEventKernelCudaElapsedTime() {
#ifdef PADDLE_WITH_CUPTI
if (correlations_.empty())
for (auto &tmp : correlations_pairs)
for (auto &pair : tmp) correlations_[pair.first] = pair.second;
for (const KernelRecord &r : kernel_records_) {
auto c = correlations_.find(r.correlation_id);
if (c != correlations_.end() && c->second != nullptr) {
Event *e = c->second;
e->AddCudaElapsedTime(r.start_ns, r.end_ns);
}
}
for (const auto &r : mem_records_) {
auto c = correlations_.find(r.correlation_id);
if (c != correlations_.end() && c->second != nullptr) {
Event *e = c->second;
e->AddCudaElapsedTime(r.start_ns, r.end_ns);
}
}
#endif
}
proto::Profile GenProfile(const std::string &profile_path) {
int miss = 0, find = 0;
std::lock_guard<std::mutex> l(trace_mu_);
proto::Profile profile_pb;
profile_pb.set_start_ns(start_ns_);
profile_pb.set_end_ns(end_ns_);
if (correlations_.empty())
for (auto &tmp : correlations_pairs)
for (auto &pair : tmp) correlations_[pair.first] = pair.second;
for (const KernelRecord &r : kernel_records_) {
auto *event = profile_pb.add_events();
event->set_type(proto::Event::GPUKernel);
if (correlations_.find(r.correlation_id) != correlations_.end()) {
event->set_name(correlations_.at(r.correlation_id));
auto c = correlations_.find(r.correlation_id);
if (c != correlations_.end() && c->second != nullptr) {
event->set_name(c->second->name());
event->set_detail_info(r.name);
find++;
} else {
VLOG(10) << "Missing Kernel Event: " + r.name;
miss++;
event->set_name(r.name);
}
event->set_start_ns(r.start_ns);
......@@ -289,8 +426,9 @@ class DeviceTracerImpl : public DeviceTracer {
event->set_sub_device_id(r.stream_id);
event->set_device_id(r.device_id);
}
for (const CPURecord &r : cpu_records_) {
VLOG(1) << "KernelRecord event miss: " << miss << " find: " << find;
for (auto &tmp : cpu_records_)
for (const CPURecord &r : tmp) {
auto *event = profile_pb.add_events();
event->set_type(proto::Event::CPU);
event->set_name(r.name);
......@@ -299,21 +437,30 @@ class DeviceTracerImpl : public DeviceTracer {
event->set_sub_device_id(r.thread_id);
event->set_device_id(r.device_id);
}
miss = find = 0;
for (const MemRecord &r : mem_records_) {
auto *event = profile_pb.add_events();
event->set_type(proto::Event::GPUKernel);
auto c = correlations_.find(r.correlation_id);
if (c != correlations_.end() && c->second != nullptr) {
event->set_name(c->second->name());
event->set_detail_info(r.name);
find++;
} else {
miss++;
event->set_name(r.name);
}
event->set_start_ns(r.start_ns);
event->set_end_ns(r.end_ns);
event->set_sub_device_id(r.stream_id);
event->set_device_id(r.device_id);
event->mutable_memcopy()->set_bytes(r.bytes);
}
VLOG(1) << "MemRecord event miss: " << miss << " find: " << find;
std::ofstream profile_f;
profile_f.open(profile_path, std::ios::out | std::ios::trunc);
std::string profile_str;
profile_pb.SerializeToString(&profile_str);
profile_f << profile_str;
profile_f.open(profile_path,
std::ios::out | std::ios::trunc | std::ios::binary);
profile_pb.SerializeToOstream(&profile_f);
profile_f.close();
return profile_pb;
}
......@@ -321,12 +468,13 @@ class DeviceTracerImpl : public DeviceTracer {
void Disable() {
#ifdef PADDLE_WITH_CUPTI
// flush might cause additional calls to DeviceTracker.
dynload::cuptiActivityFlushAll(CUPTI_ACTIVITY_FLAG_FLUSH_FORCED);
CUPTI_CALL(
dynload::cuptiActivityFlushAll(CUPTI_ACTIVITY_FLAG_FLUSH_FORCED));
#endif // PADDLE_WITH_CUPTI
std::lock_guard<std::mutex> l(trace_mu_);
#ifdef PADDLE_WITH_CUPTI
DisableActivity();
dynload::cuptiUnsubscribe(subscriber_);
CUPTI_CALL(dynload::cuptiUnsubscribe(subscriber_));
CUPTI_CALL(dynload::cuptiGetTimestamp(&end_ns_));
#endif // PADDLE_WITH_CUPTI
enabled_ = false;
......@@ -337,18 +485,10 @@ class DeviceTracerImpl : public DeviceTracer {
static void CUPTIAPI ApiCallback(void *userdata, CUpti_CallbackDomain domain,
CUpti_CallbackId cbid, const void *cbdata) {
auto *cbInfo = reinterpret_cast<const CUpti_CallbackData *>(cbdata);
DeviceTracer *tracer = reinterpret_cast<DeviceTracer *>(userdata);
if ((domain == CUPTI_CB_DOMAIN_DRIVER_API) &&
(cbid == CUPTI_DRIVER_TRACE_CBID_cuLaunchKernel)) {
DeviceTracerImpl *tracer = reinterpret_cast<DeviceTracerImpl *>(userdata);
if (cbInfo->callbackSite == CUPTI_API_ENTER) {
const std::string anno = !annotation_stack.empty()
? annotation_stack.back()
: cbInfo->symbolName;
tracer->AddAnnotation(cbInfo->correlationId, anno);
}
} else {
VLOG(1) << "Unhandled API Callback for " << domain << " " << cbid;
Event *event = CurAnnotation();
tracer->AddAnnotation(cbInfo->correlationId, event);
}
}
CUpti_SubscriberHandle subscriber_;
......@@ -357,10 +497,12 @@ class DeviceTracerImpl : public DeviceTracer {
bool enabled_;
uint64_t start_ns_;
uint64_t end_ns_;
std::vector<KernelRecord> kernel_records_;
std::vector<MemRecord> mem_records_;
std::vector<CPURecord> cpu_records_;
std::unordered_map<uint32_t, std::string> correlations_;
std::forward_list<KernelRecord> kernel_records_;
std::forward_list<MemRecord> mem_records_;
std::forward_list<std::forward_list<CPURecord>> cpu_records_;
std::forward_list<std::forward_list<std::pair<uint32_t, Event *>>>
correlations_pairs;
std::unordered_map<uint32_t, Event *> correlations_;
};
void CreateTracer(DeviceTracer **t) { *t = new DeviceTracerImpl(); }
......@@ -370,21 +512,106 @@ DeviceTracer *GetDeviceTracer() {
return tracer;
}
void SetCurAnnotation(const std::string &anno) {
annotation_stack.push_back(anno);
}
void SetCurAnnotation(Event *event) { annotation_stack.push_back(event); }
void ClearCurAnnotation() { annotation_stack.pop_back(); }
std::string CurAnnotation() {
if (annotation_stack.empty()) return "";
Event *CurAnnotation() {
if (annotation_stack.empty()) return nullptr;
return annotation_stack.back();
}
std::string CurAnnotationName() {
if (annotation_stack.empty()) return "";
return annotation_stack.back()->name();
}
void SetCurBlock(int block_id) { block_id_stack.push_back(block_id); }
void ClearCurBlock() { block_id_stack.pop_back(); }
int BlockDepth() { return block_id_stack.size(); }
uint32_t GetCurSystemThreadId() {
std::stringstream ss;
ss << std::this_thread::get_id();
uint32_t id = static_cast<uint32_t>(std::stoull(ss.str()));
return id;
}
void RecoreCurThreadId(int32_t id) {
auto gid = GetCurSystemThreadId();
VLOG(1) << "RecoreCurThreadId: " << gid << " -> " << id;
system_thread_id_map[gid] = id;
}
int32_t GetThreadIdFromSystemThreadId(uint32_t id) {
auto it = system_thread_id_map.find(id);
if (it != system_thread_id_map.end()) return it->second;
// return origin id if no event is recorded in this thread.
return static_cast<int32_t>(id);
}
#ifdef PADDLE_WITH_CUPTI
namespace {
void initCuptiCbidStr() {
static bool called = false;
if (called) return;
called = true;
#define REGISTER_RUNTIME_CBID_STR(cbid) \
runtime_cbid_str[CUPTI_RUNTIME_TRACE_CBID_##cbid] = #cbid
REGISTER_RUNTIME_CBID_STR(cudaBindTexture_v3020);
REGISTER_RUNTIME_CBID_STR(cudaConfigureCall_v3020);
REGISTER_RUNTIME_CBID_STR(cudaDeviceGetAttribute_v5000);
REGISTER_RUNTIME_CBID_STR(cudaDeviceGetStreamPriorityRange_v5050);
REGISTER_RUNTIME_CBID_STR(cudaDeviceSynchronize_v3020);
REGISTER_RUNTIME_CBID_STR(cudaDriverGetVersion_v3020);
REGISTER_RUNTIME_CBID_STR(cudaEventCreateWithFlags_v3020);
REGISTER_RUNTIME_CBID_STR(cudaEventDestroy_v3020);
REGISTER_RUNTIME_CBID_STR(cudaEventDestroy_v3020);
REGISTER_RUNTIME_CBID_STR(cudaEventQuery_v3020);
REGISTER_RUNTIME_CBID_STR(cudaEventRecord_v3020);
REGISTER_RUNTIME_CBID_STR(cudaFreeHost_v3020);
REGISTER_RUNTIME_CBID_STR(cudaFree_v3020);
REGISTER_RUNTIME_CBID_STR(cudaFuncGetAttributes_v3020);
REGISTER_RUNTIME_CBID_STR(cudaGetDeviceCount_v3020);
REGISTER_RUNTIME_CBID_STR(cudaGetDeviceProperties_v3020);
REGISTER_RUNTIME_CBID_STR(cudaGetDevice_v3020);
REGISTER_RUNTIME_CBID_STR(cudaGetErrorString_v3020);
REGISTER_RUNTIME_CBID_STR(cudaGetLastError_v3020);
REGISTER_RUNTIME_CBID_STR(cudaHostAlloc_v3020);
REGISTER_RUNTIME_CBID_STR(cudaHostGetDevicePointer_v3020);
REGISTER_RUNTIME_CBID_STR(cudaLaunchKernel_v7000);
REGISTER_RUNTIME_CBID_STR(cudaMallocHost_v3020);
REGISTER_RUNTIME_CBID_STR(cudaMalloc_v3020);
REGISTER_RUNTIME_CBID_STR(cudaMemcpyAsync_v3020);
REGISTER_RUNTIME_CBID_STR(cudaMemcpy_v3020);
REGISTER_RUNTIME_CBID_STR(cudaMemsetAsync_v3020);
REGISTER_RUNTIME_CBID_STR(cudaMemset_v3020);
REGISTER_RUNTIME_CBID_STR(
cudaOccupancyMaxActiveBlocksPerMultiprocessorWithFlags_v7000);
REGISTER_RUNTIME_CBID_STR(cudaPeekAtLastError_v3020);
REGISTER_RUNTIME_CBID_STR(cudaRuntimeGetVersion_v3020);
REGISTER_RUNTIME_CBID_STR(cudaSetDevice_v3020);
REGISTER_RUNTIME_CBID_STR(cudaStreamCreate_v3020);
REGISTER_RUNTIME_CBID_STR(cudaStreamCreateWithFlags_v5000);
REGISTER_RUNTIME_CBID_STR(cudaStreamCreateWithPriority_v5050);
REGISTER_RUNTIME_CBID_STR(cudaStreamDestroy_v5050);
REGISTER_RUNTIME_CBID_STR(cudaStreamSynchronize_v3020);
REGISTER_RUNTIME_CBID_STR(cudaStreamWaitEvent_v3020);
REGISTER_RUNTIME_CBID_STR(cudaUnbindTexture_v3020);
REGISTER_RUNTIME_CBID_STR(cudaSetupArgument_v3020);
REGISTER_RUNTIME_CBID_STR(cudaLaunch_v3020);
#if CUDA_VERSION >= 9000
REGISTER_RUNTIME_CBID_STR(cudaLaunchCooperativeKernel_v9000);
REGISTER_RUNTIME_CBID_STR(cudaLaunchCooperativeKernelMultiDevice_v9000);
#endif
#undef REGISTER_RUNTIME_CBID_STR
}
} // namespace
#endif // PADDLE_WITH_CUPTI
} // namespace platform
} // namespace paddle
......@@ -32,6 +32,8 @@ inline uint64_t PosixInNsec() {
return 1000 * (static_cast<uint64_t>(tv.tv_sec) * 1000000 + tv.tv_usec);
}
class Event;
// DeviceTracer performs the following tasks:
// 1. Register cuda callbacks for various events: kernel, memcpy, etc.
// 2. Collect cuda statistics: start/end ts, memory, etc.
......@@ -68,11 +70,13 @@ class DeviceTracer {
virtual void Enable() = 0;
// Needs to be called once after use.
virtual void Disable() = 0;
// Needs to be called once before reuse.
virtual void Reset() = 0;
// Add a pair to correlate internal cuda id with high level
// annotation (string). So cuda statistics can be represented by
// annotation event(with string). So cuda statistics can be represented by
// human-readable annotations.
virtual void AddAnnotation(uint64_t id, const std::string& anno) = 0;
virtual void AddAnnotation(uint32_t id, Event* event) = 0;
virtual void AddMemRecords(const std::string& name, uint64_t start_ns,
uint64_t end_ns, int64_t device_id,
......@@ -92,6 +96,9 @@ class DeviceTracer {
// Generate a proto after done (Disabled).
virtual proto::Profile GenProfile(const std::string& profile_path) = 0;
// generate kernel elapsed time into Event
virtual void GenEventKernelCudaElapsedTime() = 0;
virtual bool IsEnabled() = 0;
};
......@@ -99,14 +106,19 @@ class DeviceTracer {
DeviceTracer* GetDeviceTracer();
// Set a name for the cuda kernel operation being launched by the thread.
void SetCurAnnotation(const std::string& anno);
void SetCurAnnotation(Event* event);
// Clear the name after the operation is done.
void ClearCurAnnotation();
// Current name of the operation being run in the thread.
std::string CurAnnotation();
std::string CurAnnotationName();
Event* CurAnnotation();
void SetCurBlock(int block_id);
void ClearCurBlock();
int BlockDepth();
// Set current thread id, so we can map the system thread id to thread id.
void RecoreCurThreadId(int32_t id);
int32_t GetThreadIdFromSystemThreadId(uint32_t id);
} // namespace platform
} // namespace paddle
......@@ -86,6 +86,8 @@ extern void* mklml_dso_handle;
__macro(vdPowx); \
__macro(vsInv); \
__macro(vdInv); \
__macro(vmsErf); \
__macro(vmdErf); \
__macro(MKL_Set_Num_Threads)
MKLML_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_MKLML_WRAP);
......
......@@ -31,6 +31,8 @@ limitations under the License. */
#include <sstream>
#include <stdexcept>
#include <string>
#include <type_traits>
#include <utility>
#include "glog/logging.h"
#include "paddle/fluid/platform/macros.h"
......@@ -280,16 +282,62 @@ inline void throw_on_error(ncclResult_t stat, const std::string& msg) {
} \
} while (0)
#define __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, __CMP, __INV_CMP, ...) \
namespace details {
template <typename T>
inline constexpr bool IsArithmetic() {
return std::is_arithmetic<T>::value;
}
template <typename T1, typename T2, bool kIsArithmetic /* = true */>
struct TypeConverterImpl {
using Type1 = typename std::common_type<T1, T2>::type;
using Type2 = Type1;
};
template <typename T1, typename T2>
struct TypeConverterImpl<T1, T2, false> {
using Type1 = T1;
using Type2 = T2;
};
template <typename T1, typename T2>
struct TypeConverter {
private:
static constexpr bool kIsArithmetic =
IsArithmetic<T1>() && IsArithmetic<T2>();
public:
using Type1 = typename TypeConverterImpl<T1, T2, kIsArithmetic>::Type1;
using Type2 = typename TypeConverterImpl<T1, T2, kIsArithmetic>::Type2;
};
template <typename T1, typename T2>
using CommonType1 = typename std::add_lvalue_reference<
typename std::add_const<typename TypeConverter<T1, T2>::Type1>::type>::type;
template <typename T1, typename T2>
using CommonType2 = typename std::add_lvalue_reference<
typename std::add_const<typename TypeConverter<T1, T2>::Type2>::type>::type;
} // namespace details
#define __PADDLE_BINARY_COMPARE(__VAL1, __VAL2, __CMP, __INV_CMP, ...) \
do { \
auto __cond1__ = (__VAL0); \
auto __cond2__ = (__VAL1); \
if (UNLIKELY(!((__cond1__)__CMP(__cond2__)))) { \
auto __val1 = (__VAL1); \
auto __val2 = (__VAL2); \
using __TYPE1__ = decltype(__val1); \
using __TYPE2__ = decltype(__val2); \
using __COMMON_TYPE1__ = \
::paddle::platform::details::CommonType1<__TYPE1__, __TYPE2__>; \
using __COMMON_TYPE2__ = \
::paddle::platform::details::CommonType2<__TYPE1__, __TYPE2__>; \
bool __is_not_error = (static_cast<__COMMON_TYPE1__>(__val1))__CMP( \
static_cast<__COMMON_TYPE2__>(__val2)); \
if (UNLIKELY(!__is_not_error)) { \
PADDLE_THROW("Enforce failed. Expected %s " #__CMP \
" %s, but received %s:%s " #__INV_CMP " %s:%s.\n%s", \
#__VAL0, #__VAL1, #__VAL0, \
::paddle::string::to_string(__cond1__), #__VAL1, \
::paddle::string::to_string(__cond2__), \
#__VAL1, #__VAL2, #__VAL1, \
::paddle::string::to_string(__val1), #__VAL2, \
::paddle::string::to_string(__val2), \
::paddle::string::Sprintf(__VA_ARGS__)); \
} \
} while (0)
......
......@@ -118,59 +118,58 @@ TEST(ENFORCE_GT, OK) { PADDLE_ENFORCE_GT(2, 1); }
TEST(ENFORCE_GT, FAIL) {
bool caught_exception = false;
try {
PADDLE_ENFORCE_GT(1, 2UL);
PADDLE_ENFORCE_GT(1, 2);
} catch (paddle::platform::EnforceNotMet error) {
caught_exception = true;
EXPECT_TRUE(HasPrefix(
StringPiece(error.what()),
"Enforce failed. Expected 1 > 2UL, but received 1:1 <= 2UL:2."));
EXPECT_TRUE(
HasPrefix(StringPiece(error.what()),
"Enforce failed. Expected 1 > 2, but received 1:1 <= 2:2."));
}
EXPECT_TRUE(caught_exception);
}
TEST(ENFORCE_GE, OK) {
PADDLE_ENFORCE_GE(2, 2UL);
PADDLE_ENFORCE_GE(3, 2UL);
PADDLE_ENFORCE_GE(2, 2);
PADDLE_ENFORCE_GE(3, 2);
PADDLE_ENFORCE_GE(3.21, 2UL);
PADDLE_ENFORCE_GE(3.21, 2.0);
}
TEST(ENFORCE_GE, FAIL) {
bool caught_exception = false;
try {
PADDLE_ENFORCE_GE(1, 2UL);
PADDLE_ENFORCE_GE(1, 2);
} catch (paddle::platform::EnforceNotMet error) {
caught_exception = true;
EXPECT_TRUE(HasPrefix(
StringPiece(error.what()),
"Enforce failed. Expected 1 >= 2UL, but received 1:1 < 2UL:2."));
EXPECT_TRUE(
HasPrefix(StringPiece(error.what()),
"Enforce failed. Expected 1 >= 2, but received 1:1 < 2:2."));
}
EXPECT_TRUE(caught_exception);
}
TEST(ENFORCE_LE, OK) {
PADDLE_ENFORCE_LE(1, 1);
PADDLE_ENFORCE_LE(1, 1UL);
PADDLE_ENFORCE_LE(2, 3UL);
PADDLE_ENFORCE_LE(2UL, 3);
PADDLE_ENFORCE_LE(2UL, 3.2);
PADDLE_ENFORCE_LE(1UL, 1UL);
PADDLE_ENFORCE_LE(2, 3);
PADDLE_ENFORCE_LE(2UL, 3UL);
PADDLE_ENFORCE_LE(2.0, 3.2);
}
TEST(ENFORCE_LE, FAIL) {
bool caught_exception = false;
try {
PADDLE_ENFORCE_GT(1, 2UL);
PADDLE_ENFORCE_GT(1, 2);
} catch (paddle::platform::EnforceNotMet error) {
caught_exception = true;
EXPECT_TRUE(HasPrefix(
StringPiece(error.what()),
"Enforce failed. Expected 1 > 2UL, but received 1:1 <= 2UL:2."));
EXPECT_TRUE(
HasPrefix(StringPiece(error.what()),
"Enforce failed. Expected 1 > 2, but received 1:1 <= 2:2."));
}
EXPECT_TRUE(caught_exception);
}
TEST(ENFORCE_LT, OK) {
PADDLE_ENFORCE_LT(3, 10);
PADDLE_ENFORCE_LT(2, 3UL);
PADDLE_ENFORCE_LT(2UL, 3);
PADDLE_ENFORCE_LT(2UL, 3UL);
PADDLE_ENFORCE_LT(2, 3);
}
TEST(ENFORCE_LT, FAIL) {
bool caught_exception = false;
......@@ -235,7 +234,13 @@ TEST(ENFORCE_USER_DEFINED_CLASS, EQ) {
TEST(ENFORCE_USER_DEFINED_CLASS, NE) {
Dims a{{1, 2, 3, 4}}, b{{5, 6, 7, 8}};
ASSERT_THROW(PADDLE_ENFORCE_EQ(a, b), paddle::platform::EnforceNotMet);
bool caught_exception = false;
try {
PADDLE_ENFORCE_EQ(a, b);
} catch (paddle::platform::EnforceNotMet&) {
caught_exception = true;
}
EXPECT_TRUE(caught_exception);
}
TEST(EOF_EXCEPTION, THROW_EOF) {
......
......@@ -22,6 +22,7 @@ limitations under the License. */
#include "paddle/fluid/string/split.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/cuda_device_guard.h"
#include "paddle/fluid/platform/dynload/cupti.h"
#endif
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/init.h"
......@@ -30,6 +31,9 @@ limitations under the License. */
DEFINE_int32(paddle_num_threads, 1,
"Number of threads for each paddle instance.");
DEFINE_int32(multiple_of_cupti_buffer_size, 1,
"Multiple of the CUPTI device buffer size. If the timestamps have "
"been dropped when you are profiling, try increasing this value.");
namespace paddle {
namespace framework {
......@@ -78,7 +82,32 @@ void InitP2P(std::vector<int> devices) {
#endif
}
void InitCupti() {
#ifdef PADDLE_WITH_CUPTI
if (FLAGS_multiple_of_cupti_buffer_size == 1) return;
size_t attrValue = 0, attrValueSize = sizeof(size_t);
#define MULTIPLY_ATTR_VALUE(attr) \
{ \
PADDLE_ENFORCE(!platform::dynload::cuptiActivityGetAttribute( \
attr, &attrValueSize, &attrValue)); \
attrValue *= FLAGS_multiple_of_cupti_buffer_size; \
LOG(WARNING) << "Set " #attr " " << attrValue << " byte"; \
PADDLE_ENFORCE(!platform::dynload::cuptiActivitySetAttribute( \
attr, &attrValueSize, &attrValue)); \
}
MULTIPLY_ATTR_VALUE(CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_SIZE);
MULTIPLY_ATTR_VALUE(CUPTI_ACTIVITY_ATTR_DEVICE_BUFFER_SIZE_CDP);
#if CUDA_VERSION >= 9000
MULTIPLY_ATTR_VALUE(CUPTI_ACTIVITY_ATTR_PROFILING_SEMAPHORE_POOL_SIZE);
#endif
#undef MULTIPLY_ATTR_VALUE
#endif
}
void InitDevices(bool init_p2p) {
// CUPTI attribute should be set before any CUDA context is created (see CUPTI
// documentation about CUpti_ActivityAttribute).
InitCupti();
/*Init all available devices by default */
std::vector<int> devices;
#ifdef PADDLE_WITH_CUDA
......
......@@ -43,6 +43,13 @@ std::shared_ptr<ngraph::Node> Nchw2Nhwc(std::shared_ptr<ngraph::Node> in) {
return std::make_shared<ngraph::op::Reshape>(in, axis_vec, in_shape);
}
ngraph::Shape FlattenTo1d(ngraph::Shape sh, int num) {
auto x1 = std::accumulate(std::begin(sh), std::end(sh) + num, 1,
std::multiplies<size_t>());
size_t x1_l = (size_t)x1;
return ngraph::Shape{x1_l};
}
ngraph::Shape FlattenTo2d(ngraph::Shape sh, int num) {
auto x1 = std::accumulate(std::begin(sh), std::begin(sh) + num, 1,
std::multiplies<size_t>());
......
......@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/profiler.h"
#include <algorithm>
#include <iomanip>
#include <limits>
......@@ -27,7 +29,6 @@ limitations under the License. */
#include "paddle/fluid/framework/block_desc.h"
#include "paddle/fluid/platform/device_tracer.h"
#include "paddle/fluid/platform/port.h"
#include "paddle/fluid/platform/profiler.h"
#include "paddle/fluid/string/printf.h"
DEFINE_bool(enable_rpc_profiler, false, "Enable rpc profiler or not.");
......@@ -66,12 +67,13 @@ struct EventList {
((kEventSize + kEventAlign - 1) / kEventAlign * kEventAlign);
template <typename... Args>
void Record(Args&&... args) {
Event* Record(Args&&... args) {
if (event_blocks.empty() || event_blocks.front().size() == kNumBlock) {
event_blocks.emplace_front();
event_blocks.front().reserve(kNumBlock);
}
event_blocks.front().emplace_back(std::forward<Args>(args)...);
return &event_blocks.front().back();
}
std::vector<Event> Reduce() {
......@@ -98,21 +100,8 @@ inline uint64_t GetTimeInNsec() {
.count();
}
Event::Event(EventType type, std::string name, uint32_t thread_id,
const DeviceContext* dev_ctx)
: type_(type), name_(name), thread_id_(thread_id), has_cuda_(false) {
#ifdef PADDLE_WITH_CUDA
has_cuda_ = dev_ctx ? platform::is_gpu_place(dev_ctx->GetPlace()) : false;
if (has_cuda_) {
auto* cuda_dev_ctx = static_cast<const CUDADeviceContext*>(dev_ctx);
PADDLE_ENFORCE(cudaSetDevice(
boost::get<platform::CUDAPlace>(cuda_dev_ctx->GetPlace()).device));
PADDLE_ENFORCE(cudaGetDevice(&device_));
PADDLE_ENFORCE(cudaEventCreate(&event_));
auto stream = cuda_dev_ctx->stream();
PADDLE_ENFORCE(cudaEventRecord(event_, stream));
}
#endif
Event::Event(EventType type, std::string name, uint32_t thread_id)
: type_(type), name_(name), thread_id_(thread_id) {
cpu_ns_ = GetTimeInNsec();
}
......@@ -123,89 +112,70 @@ double Event::CpuElapsedMs(const Event& e) const {
}
double Event::CudaElapsedMs(const Event& e) const {
#ifdef PADDLE_WITH_CUDA
if (!has_cuda_) return 0.0;
PADDLE_ENFORCE(e.has_cuda() && has_cuda());
PADDLE_ENFORCE(e.device() == device());
PADDLE_ENFORCE(cudaEventSynchronize(event_));
PADDLE_ENFORCE(cudaEventSynchronize(e.event()));
float ms;
PADDLE_ENFORCE(cudaEventElapsedTime(&ms, event_, e.event()));
return ms;
#ifdef PADDLE_WITH_CUPTI
return gpu_ns_ / 1000000.0;
#else
PADDLE_THROW("CUDA is not enabled");
LOG_FIRST_N(WARNING, 1) << "CUDA CUPTI is not enabled";
return 0;
#endif
}
#ifdef PADDLE_WITH_CUDA
static void ForEachDevice(std::function<void(int)> func) {
auto original_device = GetCurrentDeviceId();
int count = GetCUDADeviceCount();
for (int i = 0; i < count; i++) {
SetDeviceId(i);
func(i);
}
SetDeviceId(original_device);
}
#endif
inline EventList& GetEventList() {
if (!g_event_list) {
std::lock_guard<std::mutex> guard(g_all_event_lists_mutex);
g_event_list = std::make_shared<EventList>();
g_thread_id = g_next_thread_id++;
g_all_event_lists.emplace_front(g_event_list);
RecoreCurThreadId(g_thread_id);
}
return *g_event_list;
}
void Mark(const std::string& name, const DeviceContext* dev_ctx) {
GetEventList().Record(EventType::kMark, name, g_thread_id, dev_ctx);
void Mark(const std::string& name) {
GetEventList().Record(EventType::kMark, name, g_thread_id);
}
void PushEvent(const std::string& name, const DeviceContext* dev_ctx) {
GetEventList().Record(EventType::kPushRange, name, g_thread_id, dev_ctx);
Event* PushEvent(const std::string& name) {
return GetEventList().Record(EventType::kPushRange, name, g_thread_id);
}
void PopEvent(const std::string& name, const DeviceContext* dev_ctx) {
GetEventList().Record(EventType::kPopRange, name, g_thread_id, dev_ctx);
void PopEvent(const std::string& name) {
GetEventList().Record(EventType::kPopRange, name, g_thread_id);
}
RecordEvent::RecordEvent(const std::string& name, const DeviceContext* dev_ctx)
RecordEvent::RecordEvent(const std::string& name)
: is_enabled_(false), start_ns_(PosixInNsec()) {
if (g_state == ProfilerState::kDisabled) return;
std::lock_guard<std::mutex> l(profiler_mu);
// lock is not needed, the code below is thread-safe
is_enabled_ = true;
dev_ctx_ = dev_ctx;
name_ = name;
PushEvent(name_, dev_ctx_);
Event* e = PushEvent(name_);
// Maybe need the same push/pop behavior.
SetCurAnnotation(name_);
SetCurAnnotation(e);
}
RecordEvent::~RecordEvent() {
if (g_state == ProfilerState::kDisabled || !is_enabled_) return;
std::lock_guard<std::mutex> l(profiler_mu);
// lock is not needed, the code below is thread-safe
DeviceTracer* tracer = GetDeviceTracer();
if (tracer) {
tracer->AddCPURecords(CurAnnotation(), start_ns_, PosixInNsec(),
tracer->AddCPURecords(CurAnnotationName(), start_ns_, PosixInNsec(),
BlockDepth(), g_thread_id);
}
ClearCurAnnotation();
PopEvent(name_, dev_ctx_);
PopEvent(name_);
}
RecordRPCEvent::RecordRPCEvent(const std::string& name,
const DeviceContext* dev_ctx) {
RecordRPCEvent::RecordRPCEvent(const std::string& name) {
if (FLAGS_enable_rpc_profiler) {
event_.reset(new platform::RecordEvent(name, dev_ctx));
event_.reset(new platform::RecordEvent(name));
}
}
RecordBlock::RecordBlock(int block_id)
: is_enabled_(false), start_ns_(PosixInNsec()) {
std::lock_guard<std::mutex> l(profiler_mu);
// lock is not needed, the code below is thread-safe
if (g_state == ProfilerState::kDisabled) return;
is_enabled_ = true;
SetCurBlock(block_id);
......@@ -213,7 +183,7 @@ RecordBlock::RecordBlock(int block_id)
}
RecordBlock::~RecordBlock() {
std::lock_guard<std::mutex> l(profiler_mu);
// lock is not needed, the code below is thread-safe
if (g_state == ProfilerState::kDisabled || !is_enabled_) return;
DeviceTracer* tracer = GetDeviceTracer();
if (tracer) {
......@@ -225,11 +195,21 @@ RecordBlock::~RecordBlock() {
ClearCurBlock();
}
void SynchronizeAllDevice() {
#ifdef PADDLE_WITH_CUDA
int count = GetCUDADeviceCount();
for (int i = 0; i < count; i++) {
SetDeviceId(i);
PADDLE_ENFORCE(cudaDeviceSynchronize());
}
#endif
}
void EnableProfiler(ProfilerState state) {
PADDLE_ENFORCE(state != ProfilerState::kDisabled,
"Can't enable profiling, since the input state is ",
"ProfilerState::kDisabled");
SynchronizeAllDevice();
std::lock_guard<std::mutex> l(profiler_mu);
if (state == g_state) {
return;
......@@ -238,23 +218,20 @@ void EnableProfiler(ProfilerState state) {
should_send_profile_state = true;
GetDeviceTracer()->Enable();
#ifdef PADDLE_WITH_CUDA
if (g_state == ProfilerState::kCUDA) {
if (g_state == ProfilerState::kCUDA || g_state == ProfilerState::kAll ||
g_state == ProfilerState::kCPU) {
// Generate some dummy events first to reduce the startup overhead.
for (int i = 0; i < 5; i++) {
ForEachDevice([](int d) {
DeviceContext* dev_ctx = new CUDADeviceContext(CUDAPlace(d));
Mark("_cuda_startup_", dev_ctx);
dev_ctx->Wait();
delete dev_ctx;
});
}
DummyKernelAndEvent();
GetDeviceTracer()->Reset();
}
#endif
// Mark the profiling start.
Mark("_start_profiler_", nullptr);
Mark("_start_profiler_");
}
void ResetProfiler() {
SynchronizeAllDevice();
GetDeviceTracer()->Reset();
std::lock_guard<std::mutex> guard(g_all_event_lists_mutex);
for (auto it = g_all_event_lists.begin(); it != g_all_event_lists.end();
++it) {
......@@ -277,9 +254,11 @@ struct EventItem {
std::string name;
int calls;
double total_time;
double min_time;
double max_time;
double ave_time;
double min_time;
double cpu_time;
double gpu_time;
float ratio;
};
......@@ -313,8 +292,12 @@ void PrintProfiler(const std::vector<std::vector<EventItem>>& events_table,
// Output events table
std::cout.setf(std::ios::left);
std::cout << std::setw(name_width) << "Event" << std::setw(data_width)
<< "Calls" << std::setw(data_width) << "Total"
<< std::setw(data_width) << "Min." << std::setw(data_width)
<< "Calls" << std::setw(data_width) << "Total";
if (g_state == ProfilerState::kAll) {
std::cout << std::setw(data_width * 2) << "CPU Time (Ratio)"
<< std::setw(data_width * 2) << "GPU Time (Ratio)";
}
std::cout << std::setw(data_width) << "Min." << std::setw(data_width)
<< "Max." << std::setw(data_width) << "Ave."
<< std::setw(data_width) << "Ratio." << std::endl;
for (size_t i = 0; i < events_table.size(); ++i) {
......@@ -322,8 +305,18 @@ void PrintProfiler(const std::vector<std::vector<EventItem>>& events_table,
const EventItem& event_item = events_table[i][j];
std::cout << std::setw(name_width) << event_item.name
<< std::setw(data_width) << event_item.calls
<< std::setw(data_width) << event_item.total_time
<< std::setw(data_width) << event_item.min_time
<< std::setw(data_width) << event_item.total_time;
if (g_state == ProfilerState::kAll) {
std::cout << std::setw(data_width * 2)
<< string::Sprintf(
"%f (%f)", event_item.cpu_time,
(event_item.cpu_time / event_item.total_time))
<< std::setw(data_width * 2)
<< string::Sprintf(
"%f (%f)", event_item.gpu_time,
(event_item.gpu_time / event_item.total_time));
}
std::cout << std::setw(data_width) << event_item.min_time
<< std::setw(data_width) << event_item.max_time
<< std::setw(data_width) << event_item.ave_time
<< std::setw(data_width) << event_item.ratio << std::endl;
......@@ -372,6 +365,18 @@ void ParseEvents(const std::vector<std::vector<Event>>& events,
return a.ave_time > b.ave_time;
};
break;
case EventSortingKey::kGPUTime:
sorted_domain = "average time";
sorted_func = [](const EventItem& a, const EventItem& b) {
return a.gpu_time > b.gpu_time;
};
break;
case EventSortingKey::kCPUTime:
sorted_domain = "average time";
sorted_func = [](const EventItem& a, const EventItem& b) {
return a.cpu_time > b.cpu_time;
};
break;
default:
sorted_domain = "event first end time";
}
......@@ -410,10 +415,17 @@ void ParseEvents(const std::vector<std::vector<Event>>& events,
}
if (rit != pushed_events.rend()) {
double event_time = (g_state == ProfilerState::kCUDA ||
g_state == ProfilerState::kAll)
? rit->CudaElapsedMs((*analyze_events)[i][j])
: rit->CpuElapsedMs((*analyze_events)[i][j]);
double event_time = 0;
double gpu_time = rit->CudaElapsedMs((*analyze_events)[i][j]);
double cpu_time = rit->CpuElapsedMs((*analyze_events)[i][j]);
if (g_state == ProfilerState::kCUDA) {
event_time = gpu_time;
} else if (g_state == ProfilerState::kCPU) {
event_time = cpu_time;
} else {
event_time = gpu_time + cpu_time;
}
total += event_time;
std::string event_name;
......@@ -430,7 +442,7 @@ void ParseEvents(const std::vector<std::vector<Event>>& events,
event_idx[event_name] = event_items.size();
EventItem event_item = {event_name, 1, event_time,
event_time, event_time, event_time,
0.};
gpu_time, cpu_time, 0.};
event_items.push_back(event_item);
} else {
int index = event_idx[event_name];
......@@ -443,6 +455,8 @@ void ParseEvents(const std::vector<std::vector<Event>>& events,
// max time
event_items[index].max_time =
std::max(event_time, event_items[index].max_time);
event_items[index].gpu_time += gpu_time;
event_items[index].cpu_time += cpu_time;
}
// remove the push marker from the list
......@@ -481,20 +495,23 @@ void ParseEvents(const std::vector<std::vector<Event>>& events,
void DisableProfiler(EventSortingKey sorted_key,
const std::string& profile_path) {
SynchronizeAllDevice();
std::lock_guard<std::mutex> l(profiler_mu);
if (g_state == ProfilerState::kDisabled) return;
// Mark the profiling stop.
Mark("_stop_profiler_", nullptr);
Mark("_stop_profiler_");
std::vector<std::vector<Event>> all_events = GetAllEvents();
ParseEvents(all_events, true, sorted_key);
ParseEvents(all_events, false, sorted_key);
ResetProfiler();
DeviceTracer* tracer = GetDeviceTracer();
if (tracer->IsEnabled()) {
tracer->Disable();
tracer->GenProfile(profile_path);
tracer->GenEventKernelCudaElapsedTime();
}
std::vector<std::vector<Event>> all_events = GetAllEvents();
ParseEvents(all_events, true, sorted_key);
ParseEvents(all_events, false, sorted_key);
ResetProfiler();
g_state = ProfilerState::kDisabled;
should_send_profile_state = true;
}
......
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/profiler.h"
#include <cuda.h>
namespace paddle {
namespace platform {
__global__ void DummyKernel(int *a) { a[0] = 0; }
static void ForEachDevice(std::function<void(int)> func) {
auto original_device = GetCurrentDeviceId();
int count = GetCUDADeviceCount();
for (int i = 0; i < count; i++) {
SetDeviceId(i);
func(i);
}
SetDeviceId(original_device);
}
void DummyKernelAndEvent() {
for (int i = 0; i < 5; i++) {
ForEachDevice([](int d) {
CUDADeviceContext *dev_ctx = new CUDADeviceContext(CUDAPlace(d));
Mark("_cuda_startup_");
int *ptr;
PADDLE_ENFORCE(cudaMalloc(&ptr, sizeof(int)));
DummyKernel<<<1, 1, 0, dev_ctx->stream()>>>(ptr);
dev_ctx->Wait();
PADDLE_ENFORCE(cudaFree(ptr));
delete dev_ctx;
});
}
}
} // namespace platform
} // namespace paddle
......@@ -28,17 +28,17 @@ class Event {
public:
// The DeviceContext is used to get the cuda stream.
// If CPU profiling mode, can pass nullptr.
Event(EventType type, std::string name, uint32_t thread_id,
const DeviceContext* dev_ctx);
Event(EventType type, std::string name, uint32_t thread_id);
const EventType& type() const;
std::string name() const { return name_; }
uint32_t thread_id() const { return thread_id_; }
bool has_cuda() const { return has_cuda_; }
#ifdef PADDLE_WITH_CUDA
#ifndef PADDLE_WITH_CUPTI
cudaEvent_t event() const { return event_; }
int device() const { return device_; }
#endif
#endif
double CpuElapsedMs(const Event& e) const;
......@@ -49,11 +49,21 @@ class Event {
std::string name_;
uint32_t thread_id_;
int64_t cpu_ns_;
bool has_cuda_;
#ifdef PADDLE_WITH_CUDA
#ifdef PADDLE_WITH_CUPTI
int64_t gpu_ns_ = 0;
public:
void AddCudaElapsedTime(int64_t start_ns, int64_t end_ns) {
gpu_ns_ += end_ns - start_ns;
}
private:
#else
cudaEvent_t event_ = nullptr;
int device_ = -1;
#endif
#endif
};
enum ProfilerState {
......@@ -63,22 +73,19 @@ enum ProfilerState {
kAll, // Profile both CPU and GPU. (Currently experimental).
};
void Mark(const std::string& name, const DeviceContext* dev_ctx);
void Mark(const std::string& name);
void PushEvent(const std::string& name, const DeviceContext* dev_ctx);
Event* PushEvent(const std::string& name);
void PopEvent(const std::string& name, const DeviceContext* dev_ctx);
void PopEvent(const std::string& name);
struct RecordEvent {
// dev_ctx can be set to nullptr if device is cpu.
RecordEvent(const std::string& name, const DeviceContext* dev_ctx);
explicit RecordEvent(const std::string& name);
~RecordEvent();
bool is_enabled_;
uint64_t start_ns_;
// The device context is used by Event to get the current cuda stream.
const DeviceContext* dev_ctx_;
// Event name
std::string name_;
// Need to distinguish name by op type, block_id, program_id and perhaps
......@@ -88,8 +95,7 @@ struct RecordEvent {
class RecordRPCEvent {
public:
// dev_ctx can be set to nullptr if device is cpu.
RecordRPCEvent(const std::string& name, const DeviceContext* dev_ctx);
explicit RecordRPCEvent(const std::string& name);
~RecordRPCEvent() {}
private:
......@@ -111,7 +117,16 @@ struct RecordBlock {
std::vector<std::vector<Event>> GetAllEvents();
// Candidate keys to sort the profiling report
enum EventSortingKey { kDefault, kCalls, kTotal, kMin, kMax, kAve };
enum EventSortingKey {
kDefault,
kCalls,
kTotal,
kMin,
kMax,
kAve,
kCPUTime,
kGPUTime
};
// Enable the profiling function.
void EnableProfiler(ProfilerState state);
......@@ -132,5 +147,9 @@ bool ShouldSendProfileState();
void SetProfileListener();
int64_t ListenerId();
#ifdef PADDLE_WITH_CUDA
void DummyKernelAndEvent();
#endif
} // namespace platform
} // namespace paddle
......@@ -31,6 +31,7 @@ message Event {
optional int64 sub_device_id = 6;
optional MemCopy memcopy = 7;
optional string detail_info = 9;
}
message Profile {
......
......@@ -23,76 +23,49 @@ TEST(Event, CpuElapsedTime) {
using paddle::platform::Event;
using paddle::platform::EventType;
Event start_event(EventType::kPushRange, "test", 0, nullptr);
EXPECT_TRUE(start_event.has_cuda() == false);
Event start_event(EventType::kPushRange, "test", 0);
int counter = 0;
while (counter != 1000) {
counter++;
}
Event stop_event(EventType::kPopRange, "test", 0, nullptr);
Event stop_event(EventType::kPopRange, "test", 0);
EXPECT_GT(start_event.CpuElapsedMs(stop_event), 0);
}
#ifdef PADDLE_WITH_CUDA
TEST(Event, CudaElapsedTime) {
using paddle::platform::DeviceContext;
using paddle::platform::CUDADeviceContext;
using paddle::platform::CUDAPlace;
using paddle::platform::Event;
using paddle::platform::EventType;
DeviceContext* dev_ctx = new CUDADeviceContext(CUDAPlace(0));
Event start_event(EventType::kPushRange, "test", 0, dev_ctx);
EXPECT_TRUE(start_event.has_cuda() == true);
int counter = 0;
while (counter != 1000) {
counter++;
}
Event stop_event(EventType::kPopRange, "test", 0, dev_ctx);
EXPECT_GT(start_event.CudaElapsedMs(stop_event), 0);
}
#endif
TEST(RecordEvent, RecordEvent) {
using paddle::platform::DeviceContext;
using paddle::platform::Event;
using paddle::platform::EventType;
using paddle::platform::RecordEvent;
using paddle::platform::PushEvent;
using paddle::platform::PopEvent;
using paddle::platform::ProfilerState;
using paddle::platform::EventSortingKey;
ProfilerState state = ProfilerState::kCPU;
DeviceContext* dev_ctx = nullptr;
#ifdef PADDLE_WITH_CUDA
using paddle::platform::CUDADeviceContext;
using paddle::platform::CUDAPlace;
state = ProfilerState::kCUDA;
dev_ctx =
new paddle::platform::CUDADeviceContext(paddle::platform::CUDAPlace(0));
#endif
EnableProfiler(state);
/* Usage 1:
* PushEvent(evt_name, dev_ctx);
* PushEvent(evt_name);
* ...
* code to be analyzed
* ...
* PopEvent(evt_name, dev_ctx);
* PopEvent(evt_name);
*/
LOG(INFO) << "Usage 1: PushEvent & PopEvent";
for (int loop = 0; loop < 3; ++loop) {
for (int i = 1; i < 5; ++i) {
std::string name = "op_" + std::to_string(i);
PushEvent(name, dev_ctx);
PushEvent(name);
int counter = 1;
while (counter != i * 1000) counter++;
PopEvent(name, dev_ctx);
PopEvent(name);
}
}
/* Usage 2:
* {
* RecordEvent record_event(name, dev_ctx);
* RecordEvent record_event(name);
* ...
* code to be analyzed
* ...
......@@ -101,7 +74,7 @@ TEST(RecordEvent, RecordEvent) {
LOG(INFO) << "Usage 2: RecordEvent";
for (int i = 1; i < 5; ++i) {
std::string name = "evs_op_" + std::to_string(i);
RecordEvent record_event(name, dev_ctx);
RecordEvent record_event(name);
int counter = 1;
while (counter != i * 1000) counter++;
}
......@@ -123,20 +96,20 @@ TEST(RecordEvent, RecordEvent) {
LOG(INFO) << "Usage 3: nested RecordEvent";
for (int i = 1; i < 5; ++i) {
std::string name = "ano_evs_op_" + std::to_string(i);
RecordEvent record_event(name, dev_ctx);
RecordEvent record_event(name);
int counter = 1;
while (counter != i * 100) counter++;
{
std::string nested_name = "nested_ano_evs_op_" + std::to_string(i);
RecordEvent nested_record_event(nested_name, dev_ctx);
RecordEvent nested_record_event(nested_name);
int nested_counter = 1;
while (nested_counter != i * 100) nested_counter++;
}
}
// Bad Usage:
PushEvent("event_without_pop", dev_ctx);
PopEvent("event_without_push", dev_ctx);
PushEvent("event_without_pop");
PopEvent("event_without_push");
std::vector<std::vector<Event>> events = paddle::platform::GetAllEvents();
int cuda_startup_count = 0;
......
......@@ -106,6 +106,11 @@ bool IsCompiledWithDIST() {
#endif
}
template <typename PlaceType1, typename PlaceType2>
static inline bool IsSamePlace(const PlaceType1 &p1, const PlaceType2 &p2) {
return paddle::platform::Place(p1) == paddle::platform::Place(p2);
}
PYBIND11_MODULE(core, m) {
// Not used, just make sure cpu_info.cc is linked.
paddle::platform::CpuTotalPhysicalMemory();
......@@ -732,23 +737,45 @@ All parameter, weight, gradient are variables in Paddle.
PADDLE_THROW("Cannot use CUDAPlace in CPU only version");
#endif
})
.def("_equals", &IsSamePlace<platform::CUDAPlace, platform::Place>)
.def("_equals", &IsSamePlace<platform::CUDAPlace, platform::CUDAPlace>)
.def("_equals", &IsSamePlace<platform::CUDAPlace, platform::CPUPlace>)
.def("_equals",
&IsSamePlace<platform::CUDAPlace, platform::CUDAPinnedPlace>)
.def("__str__", string::to_string<const platform::CUDAPlace &>);
py::class_<paddle::platform::CPUPlace>(m, "CPUPlace")
.def(py::init<>())
.def("_equals", &IsSamePlace<platform::CPUPlace, platform::Place>)
.def("_equals", &IsSamePlace<platform::CPUPlace, platform::CUDAPlace>)
.def("_equals", &IsSamePlace<platform::CPUPlace, platform::CPUPlace>)
.def("_equals",
&IsSamePlace<platform::CPUPlace, platform::CUDAPinnedPlace>)
.def("__str__", string::to_string<const platform::CPUPlace &>);
py::class_<paddle::platform::CUDAPinnedPlace>(m, "CUDAPinnedPlace")
.def("__init__",
[](platform::CUDAPinnedPlace &) {
[](platform::CUDAPinnedPlace &self) {
#ifndef PADDLE_WITH_CUDA
PADDLE_THROW("Cannot use CUDAPinnedPlace in CPU only version");
#endif
new (&self) platform::CUDAPinnedPlace();
})
.def("_equals", &IsSamePlace<platform::CUDAPinnedPlace, platform::Place>)
.def("_equals",
&IsSamePlace<platform::CUDAPinnedPlace, platform::CUDAPlace>)
.def("_equals",
&IsSamePlace<platform::CUDAPinnedPlace, platform::CPUPlace>)
.def("_equals",
&IsSamePlace<platform::CUDAPinnedPlace, platform::CUDAPinnedPlace>)
.def("__str__", string::to_string<const platform::CUDAPinnedPlace &>);
py::class_<platform::Place>(m, "Place")
.def(py::init<>())
.def("_equals", &IsSamePlace<platform::Place, platform::Place>)
.def("_equals", &IsSamePlace<platform::Place, platform::CUDAPlace>)
.def("_equals", &IsSamePlace<platform::Place, platform::CPUPlace>)
.def("_equals", &IsSamePlace<platform::Place, platform::CUDAPinnedPlace>)
.def("is_gpu_place",
[](platform::Place &self) { return platform::is_gpu_place(self); })
.def("gpu_device_id",
......
......@@ -9,7 +9,6 @@
PADDLE_LIB=/paddle/lib/dir
cmake .. -DFLUID_INSTALL_DIR=$PADDLE_LIB \
-DCMAKE_BUILD_TYPE=Release \
-DWITH_FLUID_ONLY=ON \
-DWITH_GPU=OFF \
-DWITH_STYLE_CHECK=OFF \
-DWITH_MKL=OFF \
......
......@@ -66,12 +66,10 @@ Users can specify the following Docker build arguments with either "ON" or "OFF"
| `WITH_AVX` | OFF | Set to "ON" to enable AVX support. |
| `WITH_TESTING` | OFF | Build unit tests binaries. |
| `WITH_MKL` | ON | Build with [Intel® MKL](https://software.intel.com/en-us/mkl) and [Intel® MKL-DNN](https://github.com/01org/mkl-dnn) support. |
| `WITH_GOLANG` | OFF | Build fault-tolerant parameter server written in go. |
| `WITH_PYTHON` | ON | Build with python support. Turn this off if build is only for capi. |
| `WITH_STYLE_CHECK` | ON | Check the code style when building. |
| `PYTHON_ABI` | "" | Build for different python ABI support, can be cp27-cp27m or cp27-cp27mu |
| `RUN_TEST` | OFF | Run unit test immediently after the build. |
| `WITH_DOC` | OFF | Build docs after build binaries. |
| `WOBOQ` | OFF | Generate WOBOQ code viewer under `build/woboq_out` |
## Docker Images
......
#!/bin/bash
## purple to echo
function purple(){
echo -e "\033[35m$1\033[0m"
}
## green to echo
function green(){
echo -e "\033[32m$1\033[0m"
}
## Error to warning with blink
function bred(){
echo -e "\033[31m\033[01m\033[05m$1\033[0m"
}
## Error to warning with blink
function byellow(){
echo -e "\033[33m\033[01m\033[05m$1\033[0m"
}
## Error
function red(){
echo -e "\033[31m\033[01m$1\033[0m"
}
## warning
function yellow(){
echo -e "\033[33m\033[01m$1\033[0m"
}
path='http://paddlepaddle.org/download?url='
#release_version=`curl -s https://pypi.org/project/paddlepaddle/|grep -E "/project/paddlepaddle/"|grep "release"|awk -F '/' '{print $(NF-1)}'|head -1`
release_version=1.2.0
......@@ -228,16 +260,112 @@ function checkLinuxPaddleVersion(){
done
}
function checkLinuxPip(){
function checkPythonVirtualenv(){
while true
do
echo "请输入您要使用的pip目录(您可以另起终端,并使用which pip来查看):"
read -p "" pip_path
if [ "$pip_path" == "" -o ! -f "$pip_path" ];then
echo "检测结果:pip不存在,请重新输入"
read -p "
是否使用python virtualenv虚环境安装(y/n)": check_virtualenv
case $check_virtualenv in
y)
echo "为您使用python虚环境安装"
;;
n)
break
;;
*)
continue
;;
esac
virtualenv_path=`which virtualenv 2>&1`
if [ "$virtualenv_path" == "" ];then
$python_path -m pip install virtualenv
if [ "$?" != '0' ];then
echo "安装虚拟环境失败,请检查本地环境"
fi
fi
while true
do
read -p "请输入虚拟环境名字:" virtualenv_name
if [ "$virtualenv_name" == "" ];then
echo "不能为空"
continue
fi
break
done
virtualenv -p $python_path ${virtualenv_name}
if [ "$?" != 0 ];then
echo "创建虚环境失败,请检查环境"
exit 2
fi
cd ${virtualenv_name}
source ./bin/activate
if [ "$?" == 0 ];then
use_virtualenv=
python_path=`which python`
break
else
echo "创建虚环境失败,请检查环境"
exit 2
fi
done
}
function checkLinuxPython(){
python_path=`which python 2>/dev/null`
while true
do
if [ "$python_path" == '' ];then
while true
do
read -p "没有找到默认的python版本,请输入要安装的python路径:" python_path
python_path=`$python_path -V`
if [ "$python_path" != "" ];then
break
else
echo "输入路径有误,未找到pyrhon"
fi
done
fi
python_version=`$python_path -V 2>&1|awk -F '[ .]' '{print $2$3}'`
pip_version=`$python_path -m pip -V|awk -F '[ .]' '{print $2}'`
while true
do
read -p "
找到python版本$python_version,使用请输入y,选择其他版本请输n(y/n):" check_python
case $check_python in
n)
read -p "请指定您的python路径:" new_python_path
python_V=`$new_python_path -V 2>/dev/null`
if [ "$python_V" != "" ];then
python_path=$new_python_path
python_version=`$python_path -V 2>&1|awk -F '[ .]' '{print $2$3}'`
pip_version=`python -m pip -V|awk -F '[ .]' '{print $2}'`
echo "您的python版本为${python_version}"
break
else
echo 输入有误,未找到python路径
fi
;;
y)
break
;;
*)
echo "输入有误,请重新输入."
continue
;;
esac
done
if [ "$pip_version" -lt 9 ];then
echo "您的pip版本小于9.0.1 请升级pip (pip install --upgrade pip)"
exit 0
fi
python_version=`$pip_path --version|awk -F "[ |)]" '{print $6}'|sed 's#\.##g'`
if [ "$python_version" == "27" ];then
uncode=`python -c "import pip._internal;print(pip._internal.pep425tags.get_supported())"|grep "cp27mu"`
if [[ "$uncode" == "" ]];then
......@@ -246,16 +374,12 @@ function checkLinuxPip(){
uncode=u
fi
fi
if [ "$python_version" == "" ];then
echo "检测结果:pip不存在,请重新输入"
else
version_list=`echo "${python_list[@]}" | grep "$python_version" `
if [ "$version_list" != "" ];then
echo "检测结果:找到python${python_version}版本"
break
if [ "$version_list" == "" ];then
echo "找不到可用的 pip, 我们只支持Python27/35/36/37及其对应的pip, 请重新输入, 或使用ctrl + c退出 "
else
echo "检测结果:找不到可用的 pip, 我们只支持Python27/35/36/37及其对应的pip, 请重新输入, 或使用ctrl + c退出 "
fi
break
fi
done
}
......@@ -287,25 +411,36 @@ function PipLinuxInstall(){
wheel_cpu_develop="http://paddle-wheel.bj.bcebos.com/latest-cpu-${AVX}-${math}/paddlepaddle-latest-cp${python_version}-cp${python_version}m${uncode}-linux_x86_64.whl"
wheel_gpu_develop="http://paddle-wheel.bj.bcebos.com/latest-gpu-cuda${CUDA}-cudnn${CUDNN}-${AVX}-${math}/paddlepaddle_gpu-latest-cp${python_version}-cp${python_version}m${uncode}-linux_x86_64.whl"
if [[ "$paddle_version" == "2" ]];then
if [[ "$GPU" == "gpu" ]];then
if [[ ${AVX} == "avx" ]];then
rm -rf `echo $wheel_gpu_release|awk -F '/' '{print $NF}'`
wget -q $wheel_gpu_release
if [ "$?" == "0" ];then
$pip_path install --user -i https://mirrors.aliyun.com/pypi/simple --trusted-host=mirrors.aliyun.com $wheel_gpu_release
$python_path -m pip install ${use_virtualenv} -i https://mirrors.aliyun.com/pypi/simple --trusted-host=mirrors.aliyun.com $wheel_gpu_release
if [ "$?" == 0 ];then
echo 安装成功
else
echo "paddlepaddle whl包下载失败"
echo 安装失败
exit 1
fi
else
echo paddlepaddle whl包下载失败
exit 1
fi
else
rm -rf `echo $wheel_gpu_release_novax|awk -F '/' '{print $NF}'`
wget -q $wheel_gpu_release_novax
if [ "$?" == "0" ];then
$pip_path install --user -i https://mirrors.aliyun.com/pypi/simple --trusted-host=mirrors.aliyun.com $wheel_gpu_release_noavx
$python_path -m pip install ${use_virtualenv} -i https://mirrors.aliyun.com/pypi/simple --trusted-host=mirrors.aliyun.com $wheel_gpu_release_noavx
if [ "$?" == 0 ];then
echo 安装成功
else
echo "paddlepaddle whl包下载失败"
echo 安装失败
exit 1
fi
else
echo paddlepaddle whl包下载失败
exit 1
fi
fi
......@@ -313,9 +448,15 @@ function PipLinuxInstall(){
rm -rf `echo $wheel_cpu_release|awk -F '/' '{print $NF}'`
wget -q $wheel_cpu_release
if [ "$?" == "0" ];then
$pip_path install --user -i https://mirrors.aliyun.com/pypi/simple --trusted-host=mirrors.aliyun.com $wheel_cpu_release
$python_path -m pip install ${use_virtualenv} -i https://mirrors.aliyun.com/pypi/simple --trusted-host=mirrors.aliyun.com $wheel_cpu_release
if [ "$?" == 0 ];then
echo 安装成功
else
echo 安装失败
exit 1
fi
else
echo "paddlepaddle whl包下载失败"
echo paddlepaddle whl包下载失败
exit 1
fi
fi
......@@ -324,18 +465,30 @@ function PipLinuxInstall(){
rm -rf `echo $wheel_gpu_develop|awk -F '/' '{print $NF}'`
wget -q $wheel_gpu_develop
if [ "$?" == "0" ];then
$pip_path install --user -i https://mirrors.aliyun.com/pypi/simple --trusted-host=mirrors.aliyun.com $wheel_gpu_develop
$python_path -m pip install ${use_virtualenv} -i https://mirrors.aliyun.com/pypi/simple --trusted-host=mirrors.aliyun.com $wheel_gpu_develop
if [ "$?" == 0 ];then
echo 安装成功
else
echo 安装失败
exit 1
fi
else
echo "paddlepaddle whl包下载失败"
echo paddlepaddle whl包下载失败
exit 1
fi
else
rm -rf `echo $wheel_cpu_develop|awk -F '/' '{print $NF}'`
wget -q $wheel_cpu_develop
if [ "$?" == "0" ];then
$pip_path install --user -i https://mirrors.aliyun.com/pypi/simple --trusted-host=mirrors.aliyun.com $wheel_cpu_develop
$python_path -m pip install ${use_virtualenv} -i https://mirrors.aliyun.com/pypi/simple --trusted-host=mirrors.aliyun.com $wheel_cpu_develop
if [ "$?" == 0 ];then
echo 安装成功
else
echo "paddlepaddle whl包下载失败"
echo 安装失败
exit 1
fi
else
echo paddlepaddle whl包下载失败
exit 1
fi
fi
......@@ -575,52 +728,72 @@ gpu_list=(
echo
echo "Step 5. 检测pip版本"
echo
checkLinuxPip
checkLinuxPython
echo
checkLinuxAVX
echo
echo "Step 6.是否使用Python的虚拟环境"
use_virtualenv="--user"
checkPythonVirtualenv
echo "*********************2. 开始安装*****************************"
PipLinuxInstall
if [ "$check_virtualenv" == 'y' ];then
echo "虚环境创建成功,请cd 进入${virtualenv_name}, 执行 source bin/activate 进入虚环境。退出虚环境执行 deactivate命令。
更多虚环境使用方法请参考virtualenv官网:https://virtualenv.pypa.io/en/latest/"
fi
}
function clearMacPythonEnv(){
python_version=""
python_brief_version=""
python_root=""
}
function checkMacPython2(){
while true
do
read -p "
=> 未能在常规路径下找到Python2,请使用ctrl+c命令退出安装程序,并使用brew或pypi.org下载安装Python2(注意Python版本不能低于2.7.15)
如希望自定义Python路径,请输入路径:" python_root
echo
python_version=`$python_root --version 2>&1 1>&1`
if [ $? == "0" ];then
:
if [[ $? == "0" ]];then
if [ "$python_version" == "" ] || [ "$python_root" == "/usr/bin/python" -a "$python_version" == "Python 2.7.10" ];then
clearMacPythonEnv
else
python_version=""
fi
check_python=`echo $python_version | grep "Python 2"`
if [ "$python_version" == "" ] || [ "$python_root" == "/usr/bin/python" -a "$python_version" == "Python 2.7.10" ] ;then
python_version=""
elif [ -n "$check_python" ];then
if [[ -n "$check_python" ]];then
while true
do
read -p "
=> 在您的环境中找到 $python_version, 确认使用此版本请输入y;如您希望自定义Python路径请输入n。请在这里输入(y/n)并回车: " use_python
echo -e " => 在您的环境中找到 \033[32m[ $python_version ]\033[0m, 确认使用此版本请输入y;如您希望自定义Python路径请输入n。请在这里输入(y/n)并回车: "
read -p "" use_python
echo
use_python=`echo $use_python | tr 'A-Z' 'a-z'`
if [ "$use_python" == "y" ]||[ "$use_python" == "" ];then
if [[ "$use_python" == "y" ]]||[[ "$use_python" == "" ]];then
use_python="y"
break
elif [ "$use_python" == "n" ];then
python_root=""
elif [[ "$use_python" == "n" ]];then
clearMacPythonEnv
break
else
echo "输入错误,请重新输入(y/n)"
red " 输入错误,请重新输入(y/n)"
fi
done
if [ "$use_python" == "y" ];then
break
if [[ "$use_python" == "y" ]];then
return 0
fi
else
echo "您输入Python的不是Python2"
python_version=""
red " 您输入Python的不是Python2"
clearMacPythonEnv
fi
fi
else
clearMacPythonEnv
red " => 未能在常规路径下找到可用的Python2,请使用ctrl+c命令退出安装程序,并使用brew或pypi.org下载安装Python2(注意Python版本不能低于2.7.15)"
read -p " 如希望自定义Python路径,请输入路径
如果希望重新选择Python版本,请回车:" python_root
echo
if [[ "$python_root" == "" ]];then
python_V=""
clearMacPythonEnv
return 1
fi
fi
done
}
......@@ -628,41 +801,48 @@ function checkMacPython2(){
function checkMacPython3(){
while true
do
read -p "
=> 未能在常规路径下找到Python3,请使用ctrl+c命令退出安装程序,并使用brew或pypi.org下载Python3
如希望自定义Python路径,请输入路径:" python_root
python_version=`$python_root --version 2>&1 1>&1`
if [ $? == "0" ];then
:
if [[ $? == "0" ]];then
if [ "$python_version" == "" ] || [ "$python_root" == "/usr/bin/python" -a "$python_version" == "Python 2.7.10" ] ;then
clearMacPythonEnv
else
python_version=""
fi
check_python=`echo $python_version | grep "Python 3"`
if [ "$python_version" == "" ] || [ "$python_root" == "/usr/bin/python" -a "$python_version" == "Python 2.7.10" ] ;then
python_version=""
elif [ -n "$check_python" ] ;then
if [[ -n "$check_python" ]];then
while true
do
read -p "
=> 在您的环境中找到 $python_version, 确认使用此版本请输入y;如您希望自定义Python路径请输入n。请在这里输入(y/n)并回车: " use_python
echo -e " => 在您的环境中找到 \033[32m[ $python_version ]\033[0m, 确认使用此版本请输入y;如您希望自定义Python路径请输入n。请在这里输入(y/n)并回车: "
read -p "" use_python
echo
use_python=`echo $use_python | tr 'A-Z' 'a-z'`
if [ "$use_python" == "y" ]||[ "$use_python" == "" ];then
if [[ "$use_python" == "y" ]]||[[ "$use_python" == "" ]];then
use_python="y"
break
elif [ "$use_python" == "n" ];then
python_root=""
elif [[ "$use_python" == "n" ]];then
clearMacPythonEnv
break
else
echo "输入错误,请重新输入(y/n)"
red " 输入错误,请重新输入(y/n)"
fi
done
if [ "$use_python" == "y" ];then
break
if [[ "$use_python" == "y" ]];then
return 0
fi
else
echo "您输入Python的不是Python3"
python_version=""
red " 您输入Python的不是Python3"
clearMacPythonEnv
fi
fi
else
clearMacPythonEnv
red " => 未能在常规路径下找到可用的Python3,请使用ctrl+c命令退出安装程序,并使用brew或pypi.org下载安装Python3(注意Python版本不能低于3.5.x)"
read -p " 如希望自定义Python路径,请输入路径
如果希望重新选择Python版本,请回车:" python_root
echo
if [[ "$python_root" == "" ]];then
python_V=""
clearMacPythonEnv
return 1
fi
fi
done
}
......@@ -672,105 +852,75 @@ function checkMacPaddleVersion(){
do
read -n1 -p "Step 2. 选择PaddlePaddle的版本,请按回车键继续..."
echo
read -p "
1. 开发版:对应Github上develop分支,如您需要开发、或希望使用PaddlePaddle最新功能,请选用此版本
2. 稳定版(推荐):如您无特殊开发需求,建议使用此版本,目前最新的版本号为 ${release_version}
=> 请输入数字1或2。如输入其他字符或直接回车,将会默认选择【 2. 稳定版 】 。请在这里输入并回车:" paddle_version
if [ "$paddle_version" == "1" ]||[ "$paddle_version" == "2" ];then
yellow " 1. 开发版:对应Github上develop分支,如您需要开发、或希望使用PaddlePaddle最新功能,请选用此版本"
yellow " 2. 稳定版(推荐):如您无特殊开发需求,建议使用此版本,目前最新的版本号为 ${release_version}"
read -p " => 请输入数字1或2。如输入其他字符或直接回车,将会默认选择【 2. 稳定版 】 。请在这里输入并回车:" paddle_version
if [[ "$paddle_version" == "1" ]]||[[ "$paddle_version" == "2" ]];then
echo
echo "您选择了数字【"$paddle_version" 】"
yellow " 您选择了数字【"$paddle_version" 】"
echo
break
else
paddle_version="2"
echo
echo "您选择了数字【2】"
yellow " 您选择了数字【2】"
echo
break
fi
done
}
function checkMacPythonVersion(){
while true
do
read -n1 -p "Step 3. 选择Python版本,请按回车键继续..."
read -p "
2. 使用python 2.x
3. 使用python 3.x
=> 请输入数字2或3。如输入其他字符或直接回车,将会默认使用【Python 2 】。请在这里输入并回车:" python_V
function initCheckMacPython2(){
echo
if [ "$python_V" == "" ];then
python_V="2"
fi
echo "您选择了数字【"$python_V"】,正在寻找符合您要求的Python版本,请按回车键继续..."
yellow " 您选择了Python "$python_V",正在寻找符合要求的Python 2版本"
echo
if [ "$python_V" == "2" ];then
python_root=`which python2.7`
if [ "$python_root" == "" ];then
if [[ "$python_root" == "" ]];then
python_root=`which python`
fi
python_version=`$python_root --version 2>&1 1>&1`
if [ $? == "0" ];then
:
else
python_version=""
fi
if [ "$python_root" == "" ]||[ "$python_root" == "/usr/bin/python" -a "$python_version" == "Python 2.7.10" ]||[ "$python_root" == "/usr/bin/python2.7" -a "$python_version" == "Python 2.7.10" ];then
checkMacPython2
fi
while true
do
read -p "
=> 在您的环境中找到 $python_version, 确认使用此版本请输入y;如您希望自定义Python路径请输入n。请在这里输入(y/n)并回车:" use_python
echo
use_python=`echo $use_python | tr 'A-Z' 'a-z'`
if [ "$use_python" == "y" ]||[ "$use_python" == "" ];then
break
elif [ "$use_python" == "n" ];then
python_root=""
checkMacPython2
break
if [[ "$?" == "1" ]];then
return 1
else
echo "输入错误,请重新输入(y/n)"
return 0
fi
done
}
elif [ "$python_V" == "3" ];then
python_root=`which python3`
python_version=`$python_root --version 2>&1 1>&1`
if [ $? == "0" ];then
:
else
python_version=""
fi
if [ "$python_root" == "" ]||[ "$python_root" == "/usr/bin/python" -a "$python_version" == "Python 2.7.10" ];then
checkMacPython3
fi
while true
do
read -p "
=> 在您的环境中找到 $python_version, 确认使用此版本请输入y;如您希望自定义Python路径请输入n。请在这里输入(y/n)并回车:" use_python
function initCheckMacPython3(){
echo
use_python=`echo $use_python | tr 'A-Z' 'a-z'`
if [ "$use_python" == "y" ]||[ "$use_python" == "" ];then
break
elif [ "$use_python" == "n" ];then
yellow " 您选择了Python "$python_V",正在寻找符合您要求的Python 2版本"
echo
python_root=`which python3`
checkMacPython3
break
if [[ "$?" == "1" ]];then
return 1
else
echo "输入错误,请重新输入(y/n)"
fi
done
else
:
return 0
fi
}
function checkMacPip(){
if [[ "$python_V" == "2" ]]||[[ "$python_V" == "3" ]];then
if [ "$python_V" == "2" ]||[ "$python_V" == "3" ];then
python_brief_version=`$python_root -m pip -V |awk -F "[ |)]" '{print $6}'|sed 's#\.##g'`
if [[ ${python_brief_version} == "" ]];then
red "您输入的python:${python_root} 对应的pip不可用,请检查此pip或重新选择其他python"
echo
return 1
fi
pip_version=`$python_root -m pip -V |awk -F '[ .]' '{print $2}'`
if [[ 9 -le ${pip_version} ]];then
:
else
red "您的pip版本过低,请安装pip 9.0.1及以上的版本"
echo
return 1
fi
if [[ "$python_brief_version" == "" ]];then
clearMacPythonEnv
red "您的 $python_root 对应的pip存在问题,请按ctrl + c退出后重新安装pip,或切换其他python版本"
echo
return 1
else
if [[ $python_brief_version == "27" ]];then
uncode=`python -c "import pip._internal;print(pip._internal.pep425tags.get_supported())"|grep "cp27"`
if [[ $uncode == "" ]];then
......@@ -780,37 +930,82 @@ function checkMacPythonVersion(){
fi
fi
version_list=`echo "${python_list[@]}" | grep "$python_brief_version" `
if [ "$version_list" != "" ];then
break
if [[ "$version_list" != "" ]];then
return 0
else
red "未找到可用的pip或pip3。PaddlePaddle目前支持:Python2.7/3.5/3.6/3.7及其对应的pip, 请重新输入,或使用ctrl + c退出"
echo
clearMacPythonEnv
return 1
fi
fi
fi
}
function checkMacPythonVersion(){
while true
do
read -n1 -p "Step 3. 选择Python版本,请按回车键继续..."
echo
yellow " 2. 使用python 2.x"
yellow " 3. 使用python 3.x"
read -p " => 请输入数字2或3。如输入其他字符或直接回车,将会默认使用【Python 2 】。请在这里输入并回车:" python_V
if [[ "$python_V" == "" ]];then
python_V="2"
fi
if [[ "$python_V" == "2" ]];then
initCheckMacPython2
if [[ "$?" == "0" ]];then
checkMacPip
if [[ "$?" == "0" ]];then
return 0
else
:
fi
else
:
fi
elif [[ "$python_V" == "3" ]];then
initCheckMacPython3
if [[ "$?" == "0" ]];then
checkMacPip
if [[ "$?" == "0" ]];then
return 0
else
echo "未找到可用的pip或pip3。PaddlePaddle目前支持:Python2.7/3.5/3.6/3.7及其对应的pip, 请重新输入,或使用ctrl + c退出"
:
fi
else
:
fi
else
echo "输入错误,请重新输入"
red "输入错误,请重新输入"
fi
done
}
function checkMacAVX(){
read -n1 -p "Step 4. 检测您的Mac是否支持AVX指令集,请按回车键继续..."
echo
if [[ $AVX != "" ]];then
AVX="avx"
echo "检测结果:支持"
echo ""
green " 检测结果:支持"
echo ""
return 0
else
read -n1 -p "检测结果:不支持。非常抱歉,PaddlePaddle在Mac系统暂不提供no_avx类型的安装包,您可以选择在Linux系统中安装no_avx版的PaddlePaddle, 请按回车键退出..."
exit
fi
red " 检测结果:不支持。非常抱歉,PaddlePaddle在Mac系统暂不提供no_avx类型的安装包,您可以选择在Linux系统中安装no_avx版的PaddlePaddle, 请按回车键退出..."
echo
return 1
fi
}
function checkMacGPU(){
read -n1 -p "Step 5. 选择CPU/GPU版本,请按回车键继续..."
echo
if [[ $GPU != "" ]];then
echo "MacOS环境下,暂未提供GPU版本的PaddlePaddle安装包,将为您安装CPU版本的PaddlePaddle"
yellow " MacOS环境下,暂未提供GPU版本的PaddlePaddle安装包,将为您安装CPU版本的PaddlePaddle"
else
echo "MacOS环境下,暂未提供GPU版本的PaddlePaddle安装包,将为您安装CPU版本的PaddlePaddle"
yellow " MacOS环境下,暂未提供GPU版本的PaddlePaddle安装包,将为您安装CPU版本的PaddlePaddle"
GPU=cpu
fi
echo
......@@ -822,38 +1017,44 @@ function macos() {
while true
do
checkMacPaddleVersion
checkMacPythonVersion
checkMacAVX
checkMacGPU
echo "*********************2. 开始安装*****************************"
green "*********************2. 开始安装*****************************"
echo
read -n1 -p "即将为您下载并安装PaddlePaddle,请按回车键继续..."
yellow "即将为您下载并安装PaddlePaddle,请按回车键继续..."
read -n1 -p ""
echo
if [[ $paddle_version == "2" ]];then
$python_root -m pip install paddlepaddle
if [ $? == "0" ];then
echo "安装成功,可以使用: ${python_root} 来启动安装了PaddlePaddle的Python解释器"
if [[ $? == "0" ]];then
green "安装成功,可以使用: ${python_root} 来启动安装了PaddlePaddle的Python解释器"
break
else
rm $whl_cpu_release
echo "未能正常安装PaddlePaddle,请尝试更换您输入的python路径,或者ctrl + c退出后请检查您使用的python对应的pip或pip源是否可用"
red "未能正常安装PaddlePaddle,请尝试更换您输入的python路径,或者ctrl + c退出后请检查您使用的python对应的pip或pip源是否可用"
echo""
echo "=========================================================================================="
echo""
exit 1
fi
else
if [ -f $whl_cpu_develop ];then
if [[ -f $whl_cpu_develop ]];then
$python_root -m pip install $whl_cpu_develop
if [ $? == "0" ];then
if [[ $? == "0" ]];then
rm -rf $whl_cpu_develop
echo "安装成功!小提示:可以使用: ${python_root} 来启动安装了PaddlePaddle的Python解释器"
# TODO add install success check here
green "安装成功!小提示:可以使用: ${python_root} 来启动安装了PaddlePaddle的Python解释器"
break
else
echo "未能正常安装PaddlePaddle,请尝试更换您输入的python路径,或者ctrl + c退出后请检查您使用的python对应的pip或pip源是否可用"
red "未能正常安装PaddlePaddle,请尝试更换您输入的python路径,或者ctrl + c退出后请检查您使用的python对应的pip或pip源是否可用"
echo""
echo "=========================================================================================="
echo""
......@@ -861,15 +1062,15 @@ function macos() {
fi
else
wget ${path}$whl_cpu_develop -O $whl_cpu_develop
if [ $? == "0" ];then
if [[ $? == "0" ]];then
$python_root -m pip install $whl_cpu_develop
if [ $? == "0" ];then
if [[ $? == "0" ]];then
rm $wheel_cpu_develop
echo "安装成功,可以使用: ${python_root} 来启动安装了PaddlePaddle的Python解释器"
green "安装成功,可以使用: ${python_root} 来启动安装了PaddlePaddle的Python解释器"
break
else
rm $whl_cpu_release
echo "未能正常安装PaddlePaddle,请尝试更换您输入的python路径,或者ctrl + c退出后请检查您使用的python对应的pip或pip源是否可用"
red "未能正常安装PaddlePaddle,请尝试更换您输入的python路径,或者ctrl + c退出后请检查您使用的python对应的pip或pip源是否可用"
echo""
echo "=========================================================================================="
echo""
......@@ -877,7 +1078,7 @@ function macos() {
fi
else
rm $whl_cpu_develop
echo "未能正常安装PaddlePaddle,请检查您的网络 或者确认您是否安装有 wget,或者ctrl + c退出后反馈至https://github.com/PaddlePaddle/Paddle/issues"
red "未能正常安装PaddlePaddle,请检查您的网络 或者确认您是否安装有 wget,或者ctrl + c退出后反馈至https://github.com/PaddlePaddle/Paddle/issues"
echo""
echo "=========================================================================================="
echo""
......@@ -890,33 +1091,35 @@ function macos() {
function main() {
echo "*********************************"
echo "欢迎使用PaddlePaddle快速安装脚本"
green "欢迎使用PaddlePaddle快速安装脚本"
echo "*********************************"
echo
echo "如果您在安装过程中遇到任何问题,请在https://github.com/PaddlePaddle/Paddle/issues反馈,我们的工作人员将会帮您答疑解惑"
yellow "如果您在安装过程中遇到任何问题,请在https://github.com/PaddlePaddle/Paddle/issues反馈,我们的工作人员将会帮您答疑解惑"
echo
echo "本安装包将帮助您在Linux或Mac系统下安装PaddlePaddle,包括 1)安装前的准备和 2)开始安装 两部分"
echo "本安装包将帮助您在Linux或Mac系统下安装PaddlePaddle,包括"
yellow "1)安装前的准备"
yellow "2)开始安装"
echo
read -n1 -p "请按回车键进行下一步..."
echo
echo
echo "*********************1. 安装前的准备*****************************"
green "*********************1. 安装前的准备*****************************"
echo
echo "Step 1. 正在检测您的操作系统信息..."
echo
SYSTEM=`uname -s`
if [ "$SYSTEM" == "Darwin" ];then
echo "您的系统为:MAC OSX"
if [[ "$SYSTEM" == "Darwin" ]];then
yellow " 您的系统为:MAC OSX"
echo
macos
else
echo "您的系统为:Linux"
yellow " 您的系统为:Linux"
echo
OS=`cat /etc/issue|awk 'NR==1 {print $1}'`
if [ $OS == "\S" ] || [ "$OS" == "CentOS" ] || [ $OS == "Ubuntu" ];then
if [[ $OS == "\S" ]] || [[ "$OS" == "CentOS" ]] || [[ $OS == "Ubuntu" ]];then
linux
else
echo "您的系统不在本安装包的支持范围,如您需要在windows环境下安装PaddlePaddle,请您参考PaddlePaddle官网的windows安装文档"
red "您的系统不在本安装包的支持范围,如您需要在windows环境下安装PaddlePaddle,请您参考PaddlePaddle官网的windows安装文档"
fi
fi
}
......
......@@ -87,7 +87,6 @@ function cmake_gen() {
PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.5/bin/python3
-DPYTHON_INCLUDE_DIR:PATH=/Library/Frameworks/Python.framework/Versions/3.5/include/python3.5m/
-DPYTHON_LIBRARY:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.5/lib/libpython3.5m.dylib"
WITH_FLUID_ONLY=${WITH_FLUID_ONLY:-ON}
pip3.5 uninstall -y protobuf
pip3.5 install --user -r ${PADDLE_ROOT}/python/requirements.txt
else
......@@ -101,7 +100,6 @@ function cmake_gen() {
PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.6/bin/python3
-DPYTHON_INCLUDE_DIR:PATH=/Library/Frameworks/Python.framework/Versions/3.6/include/python3.6m/
-DPYTHON_LIBRARY:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.6/lib/libpython3.6m.dylib"
WITH_FLUID_ONLY=${WITH_FLUID_ONLY:-ON}
pip3.6 uninstall -y protobuf
pip3.6 install --user -r ${PADDLE_ROOT}/python/requirements.txt
else
......@@ -115,7 +113,6 @@ function cmake_gen() {
PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.7/bin/python3
-DPYTHON_INCLUDE_DIR:PATH=/Library/Frameworks/Python.framework/Versions/3.7/include/python3.7m/
-DPYTHON_LIBRARY:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.7/lib/libpython3.7m.dylib"
WITH_FLUID_ONLY=${WITH_FLUID_ONLY:-ON}
pip3.7 uninstall -y protobuf
pip3.7 install --user -r ${PADDLE_ROOT}/python/requirements.txt
else
......@@ -202,7 +199,6 @@ function cmake_gen() {
-DWITH_TESTING=${WITH_TESTING:-ON}
-DCMAKE_MODULE_PATH=/opt/rocm/hip/cmake
-DCMAKE_EXPORT_COMPILE_COMMANDS=ON
-DWITH_FLUID_ONLY=${WITH_FLUID_ONLY:-OFF}
-DCMAKE_EXPORT_COMPILE_COMMANDS=ON
-DWITH_CONTRIB=${WITH_CONTRIB:-ON}
-DWITH_INFERENCE_API_TEST=${WITH_INFERENCE_API_TEST:-ON}
......@@ -235,7 +231,6 @@ EOF
-DCUDNN_ROOT=/usr/ \
-DWITH_TESTING=${WITH_TESTING:-ON} \
-DCMAKE_MODULE_PATH=/opt/rocm/hip/cmake \
-DWITH_FLUID_ONLY=${WITH_FLUID_ONLY:-OFF} \
-DCMAKE_EXPORT_COMPILE_COMMANDS=ON \
-DWITH_CONTRIB=${WITH_CONTRIB:-ON} \
-DWITH_INFERENCE_API_TEST=${WITH_INFERENCE_API_TEST:-ON} \
......@@ -398,9 +393,7 @@ EOF
pip3.7 install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl
fi
if [[ ${WITH_FLUID_ONLY:-OFF} == "OFF" ]] ; then
paddle version
fi
if [ "$1" == "cp27-cp27m" ]; then
pip uninstall -y paddlepaddle
......@@ -555,7 +548,6 @@ EOF
-DCMAKE_BUILD_TYPE=Release \
-DWITH_GPU=OFF \
-DWITH_MKL=OFF \
-DWITH_FLUID_ONLY=ON
local LIB_TYPE=$1
case $LIB_TYPE in
......@@ -631,13 +623,8 @@ EOF
NCCL_DEPS="true"
fi
if [[ ${WITH_FLUID_ONLY:-OFF} == "OFF" ]]; then
PADDLE_VERSION="paddle version"
CMD='"paddle", "version"'
else
PADDLE_VERSION="true"
CMD='"true"'
fi
if [ "$1" == "cp35-cp35m" ]; then
cat >> ${PADDLE_ROOT}/build/Dockerfile <<EOF
......@@ -722,12 +709,6 @@ EOF
EOF
fi
if [[ ${WITH_GOLANG:-OFF} == "ON" ]]; then
cat >> ${PADDLE_ROOT}/build/Dockerfile <<EOF
ADD go/cmd/pserver/pserver /usr/bin/
ADD go/cmd/master/master /usr/bin/
EOF
fi
cat >> ${PADDLE_ROOT}/build/Dockerfile <<EOF
# default command shows the paddle version and exit
CMD [${CMD}]
......
......@@ -26,7 +26,6 @@ function start_build_docker() {
-e WITH_GPU=ON \
-e CUDA_ARCH_NAME=Auto \
-e WITH_AVX=ON \
-e WITH_GOLANG=OFF \
-e WITH_TESTING=ON \
-e WITH_COVERAGE=ON \
-e COVERALLS_UPLOAD=ON \
......@@ -35,7 +34,6 @@ function start_build_docker() {
-e PADDLE_FRACTION_GPU_MEMORY_TO_USE=0.15 \
-e CUDA_VISIBLE_DEVICES=0,1 \
-e WITH_DISTRIBUTE=ON \
-e WITH_FLUID_ONLY=ON \
-e RUN_TEST=ON
EOL
)
......
......@@ -6,10 +6,7 @@ function version(){
echo " with_gpu: @WITH_GPU@"
echo " with_mkl: @WITH_MKL@"
echo " with_mkldnn: @WITH_MKLDNN@"
echo " with_double: @WITH_DOUBLE@"
echo " with_python: @WITH_PYTHON@"
echo " with_rdma: @WITH_RDMA@"
echo " with_timer: @WITH_TIMER@"
}
function ver2num() {
......
......@@ -4,18 +4,6 @@ set(PY_FILES paddle/__init__.py
${UTILS_PY_FILES}
${FLUID_PY_FILES})
set(MKL_SHARED_LIBS "")
set(MKL_DEPENDS "")
if(WITH_MKLML)
list(APPEND MKL_SHARED_LIBS ${MKLML_LIB} ${MKLML_IOMP_LIB})
list(APPEND MKL_DEPENDS mklml)
endif()
if(WITH_MKLDNN)
list(APPEND MKL_SHARED_LIBS "${MKLDNN_SHARED_LIB}")
list(APPEND MKL_DEPENDS mkldnn mkldnn_shared_lib)
endif()
if(WITH_GPU)
SET(PACKAGE_NAME "paddlepaddle-gpu")
else()
......@@ -42,7 +30,7 @@ IF(WIN32)
COMMAND ${CMAKE_COMMAND} -E env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel
COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_PYTHON_BUILD_DIR}/.timestamp
COMMAND ${CMAKE_COMMAND} -E remove_directory ${PADDLE_PYTHON_BUILD_DIR}/lib-python
DEPENDS copy_paddle_pybind ${FLUID_CORE} framework_py_proto profiler_py_proto ${PY_FILES} ${external_project_dependencies} ${COPY_PADDLE_MASTER})
DEPENDS copy_paddle_pybind ${FLUID_CORE} framework_py_proto profiler_py_proto ${PY_FILES})
ELSE(WIN32)
add_custom_command(OUTPUT ${PADDLE_PYTHON_BUILD_DIR}/.timestamp
COMMAND touch stub.cc
......@@ -51,11 +39,10 @@ ELSE(WIN32)
COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_PYTHON_BUILD_DIR}/.timestamp
COMMAND ${CMAKE_COMMAND} -E remove_directory ${PADDLE_PYTHON_BUILD_DIR}/lib-python
COMMAND ${CMAKE_COMMAND} -E copy_directory ${PADDLE_PYTHON_BUILD_DIR}/lib* ${PADDLE_PYTHON_BUILD_DIR}/lib-python
DEPENDS copy_paddle_pybind ${FLUID_CORE} framework_py_proto profiler_py_proto ${PY_FILES} ${external_project_dependencies} ${COPY_PADDLE_MASTER})
DEPENDS copy_paddle_pybind ${FLUID_CORE} framework_py_proto profiler_py_proto ${PY_FILES})
ENDIF()
set(paddle_python_deps ${PADDLE_PYTHON_BUILD_DIR}/.timestamp ${MKL_DEPENDS} ${external_project_dependencies})
add_custom_target(paddle_python ALL DEPENDS ${paddle_python_deps})
add_custom_target(paddle_python ALL DEPENDS ${PADDLE_PYTHON_BUILD_DIR}/.timestamp)
set(PADDLE_PYTHON_PACKAGE_DIR ${CMAKE_CURRENT_BINARY_DIR}/dist/)
......
......@@ -131,7 +131,8 @@ def __bootstrap__():
'eager_delete_tensor_gb', 'fast_eager_deletion_mode',
'allocator_strategy', 'reader_queue_speed_test_mode',
'print_sub_graph_dir', 'pe_profile_fname', 'warpctc_dir',
'inner_op_parallelism', 'enable_parallel_graph'
'inner_op_parallelism', 'enable_parallel_graph',
'multiple_of_cupti_buffer_size'
]
if 'Darwin' not in sysstr:
read_env_flags.append('use_pinned_memory')
......
......@@ -178,9 +178,9 @@ class CompiledProgram(object):
# FIXME(dzhwinter): enable_inplace should be after memory_optimize
# if turn on python memory optimize, turn off the inplace_pass.
if self._build_strategy.memory_optimize is None:
self._build_strategy.memory_optimize = False if main._is_mem_optimized else True
self._build_strategy.memory_optimize = False if self._program._is_mem_optimized else True
if self._build_strategy.enable_inplace is None:
self._build_strategy.enable_inplace = False if main._is_mem_optimized else True
self._build_strategy.enable_inplace = False if self._program._is_mem_optimized else True
if self._build_strategy.num_trainers > 1 and trainers_endpoints:
assert self._build_strategy.num_trainers == len(
......@@ -220,7 +220,7 @@ class CompiledProgram(object):
if self._compiled:
if scope and self._scope != scope:
raise ValueError("Cannot compile with different scope")
if place and self._place != place:
if place and not self._place._equals(place):
raise ValueError("Cannot compile with different place")
return self
self._compiled = True
......
......@@ -766,7 +766,10 @@ def _load_distributed_persistables(executor, dirname, main_program=None):
dtype=slice_var.dtype,
persistable=True)
dim1_flatten = 1
if len(slice.shape) >= 2:
dim1_flatten = reduce(lambda x, y: x * y, slice.shape[1:])
start = int(offset / dim1_flatten)
end = int(offset / dim1_flatten + slice.shape[0])
......
......@@ -8744,16 +8744,17 @@ def slice(input, axes, starts, ends):
return out
@templatedoc()
def shape(input):
"""
${comment}
**Shape Layer**
Get the shape of the input.
Args:
input (Variable): ${input_comment}
input (Variable): The input variable.
Returns:
out (Variable): ${out_comment}
Variable: The shape of the input variable.
Examples:
.. code-block:: python
......
......@@ -649,6 +649,7 @@ class AdagradOptimizer(Optimizer):
regularization: A Regularizer, such as
fluid.regularizer.L2DecayRegularizer.
name: A optional name prefix.
initial_accumulator_value (float): Initial value for moment accumulator.
Examples:
.. code-block:: python
......@@ -662,7 +663,8 @@ class AdagradOptimizer(Optimizer):
learning_rate,
epsilon=1.0e-6,
regularization=None,
name=None):
name=None,
initial_accumulator_value=0.0):
assert learning_rate is not None
assert epsilon is not None
super(AdagradOptimizer, self).__init__(
......@@ -671,6 +673,7 @@ class AdagradOptimizer(Optimizer):
name=name)
self.type = "adagrad"
self._epsilon = epsilon
self.initial_accumulator_value = initial_accumulator_value
def _create_accumulators(self, block, parameters):
assert isinstance(block, framework.Block)
......@@ -683,6 +686,16 @@ class AdagradOptimizer(Optimizer):
moment_acc = self._get_accumulator(self._moment_acc_str,
param_and_grad[0])
startup_block = framework.default_startup_program().global_block()
startup_block.append_op(
type='fill_constant',
inputs={},
outputs={'Out': [moment_acc]},
attrs={
'dtype': moment_acc.dtype,
'value': self.initial_accumulator_value,
'shape': moment_acc.shape,
})
# Create the adagrad optimizer op
adagrad_op = block.append_op(
......
......@@ -113,13 +113,12 @@ py_test_modules(test_ir_memory_optimize_transformer MODULES test_ir_memory_optim
endif()
if(NOT APPLE)
py_test_modules(test_image_classification_resnet MODULES test_image_classification_resnet SERIAL)
if(CMAKE_BUILD_TYPE STREQUAL "Debug")
endif()
if(CMAKE_BUILD_TYPE STREQUAL "Debug")
# change the timeout from 600 to 1200, because in debug mode, this test need more time.
set_tests_properties(test_image_classification_resnet PROPERTIES TIMEOUT 1200)
endif()
set_tests_properties(test_parallel_executor_seresnext PROPERTIES TIMEOUT 1200)
endif()
if (WITH_NGRAPH)
add_subdirectory(ngraph)
endif()
......
......@@ -18,8 +18,8 @@ import unittest
import numpy as np
import paddle.fluid.core as core
from paddle.fluid.tests.unittests.op_test import OpTest
from scipy.special import expit
from paddle.fluid.tests.unittests.test_activation_op import TestRelu, TestTanh, TestSqrt, TestAbs
import paddle.fluid as fluid
class TestMKLDNNReluDim2(TestRelu):
......@@ -97,5 +97,64 @@ class TestMKLDNNAbsDim4(TestAbs):
self.attrs = {"use_mkldnn": True}
# Check if primitives already exist in backward
class TestMKLDNNReluPrimitivesAlreadyExist(unittest.TestCase):
def __assert_close(self, tensor, np_array, msg, atol=1e-4):
self.assertTrue(np.allclose(np.array(tensor), np_array, atol=atol), msg)
def test_check_forward_backward(self):
place = core.CPUPlace()
np.random.seed(123)
x = np.random.uniform(-1, 1, [2, 2]).astype(np.float32)
out = np.abs(x)
out_grad = np.random.random_sample(x.shape).astype(np.float32)
x_grad = out_grad * np.sign(x) # Abs grad calculation
var_dict = {'x': x, 'out': out, 'out@GRAD': out_grad, 'x@GRAD': x_grad}
var_names = list(var_dict.keys())
ground_truth = {name: var_dict[name] for name in var_names}
program = fluid.Program()
with fluid.program_guard(program):
block = program.global_block()
for name in ground_truth:
block.create_var(
name=name, dtype='float32', shape=ground_truth[name].shape)
relu_op = block.append_op(
type="abs",
inputs={"X": block.var('x'), },
outputs={"Out": block.var('out')},
attrs={"use_mkldnn": True})
# Generate backward op_desc
grad_op_desc_list, op_grad_to_var = core.get_grad_op_desc(
relu_op.desc, set(), [])
grad_op_desc = grad_op_desc_list[0]
new_op_desc = block.desc.append_op()
new_op_desc.copy_from(grad_op_desc)
for var_name in grad_op_desc.output_arg_names():
block.desc.var(var_name.encode("ascii"))
grad_op_desc.infer_var_type(block.desc)
grad_op_desc.infer_shape(block.desc)
for arg in grad_op_desc.output_arg_names():
grad_var = block.desc.find_var(arg.encode("ascii"))
grad_var.set_dtype(core.VarDesc.VarType.FP32)
exe = fluid.Executor(place)
# Do at least 2 iterations
for i in range(2):
out = exe.run(
program,
feed={name: var_dict[name]
for name in ['x', 'out@GRAD']},
fetch_list=['x@GRAD'])
self.__assert_close(x_grad, out[0], "x@GRAD")
if __name__ == '__main__':
unittest.main()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......@@ -15,261 +15,7 @@
from __future__ import print_function
import unittest
import numpy as np
import paddle.fluid.core as core
from paddle.fluid.tests.unittests.op_test import OpTest, randomize_probability
class TestCrossEntropyOp(OpTest):
"""Test cross-entropy with discrete one-hot labels.
"""
def setUp(self):
self.op_type = "cross_entropy"
self.soft_label = False
self.ignore_index = -100
self.dtype = np.float64
self.batch_size = 30
self.class_num = 10
self._cpu_only = True
self.init_dtype_type()
self.init_attr_type()
self.init_bs_class_num()
self.init_x()
self.init_label()
self.get_cross_entropy()
self.inputs = {"X": self.x, "Label": self.label}
self.outputs = {"Y": self.cross_entropy}
self.attrs = {
"soft_label": self.soft_label,
"ignore_index": self.ignore_index
}
def init_x(self):
self.x = randomize_probability(
self.batch_size, self.class_num, dtype=self.dtype)
def init_label(self):
self.label = np.random.randint(
0, self.class_num, (self.batch_size, 1), dtype="int64")
def get_cross_entropy(self):
self.cross_entropy = np.asmatrix(
[[-np.log(self.x[i][self.label[i][0]])]
for i in range(self.x.shape[0])],
dtype="float64")
def init_attr_type(self):
pass
def init_dtype_type(self):
pass
def init_bs_class_num(self):
pass
def test_check_output(self):
self.check_output()
def test_check_grad(self):
self.check_grad(["X"], "Y", numeric_grad_delta=0.001)
class TestCrossEntropyOp2(TestCrossEntropyOp):
"""Test cross-entropy with vectorized soft labels.
"""
def init_label(self):
self.label = np.random.uniform(
0.1, 1.0, [self.batch_size, self.class_num]).astype(self.dtype)
self.label /= self.label.sum(axis=1, keepdims=True)
def get_cross_entropy(self):
self.cross_entropy = (-self.label * np.log(self.x)).sum(
axis=1, keepdims=True).astype(self.dtype)
def init_attr_type(self):
self.soft_label = True
def init_dtype_type(self):
self.dtype = np.float32
def init_bs_class_num(self):
self.batch_size = 5
self.class_num = 37
def test_check_grad(self):
self.check_grad(
["X"], "Y", max_relative_error=0.05, numeric_grad_delta=0.001)
class TestCrossEntropyOp3(TestCrossEntropyOp):
"""Test cross-entropy with vectorized one-hot representation of labels.
"""
def init_label(self):
self.label_index = np.random.randint(0, self.class_num,
(self.batch_size))
self.label = np.zeros(self.x.shape).astype(self.dtype)
self.label[np.arange(self.batch_size), self.label_index] = 1
def get_cross_entropy(self):
self.cross_entropy = np.asmatrix(
[[-np.log(self.x[i][self.label_index[i]])]
for i in range(self.x.shape[0])]).astype(self.dtype)
def init_attr_type(self):
self.soft_label = True
def init_dtype_type(self):
self.dtype = np.float32
def init_bs_class_num(self):
self.batch_size = 5
self.class_num = 17
def test_check_grad(self):
self.check_grad(
["X"], "Y", max_relative_error=0.05, numeric_grad_delta=0.001)
class TestCrossEntropyOp4(TestCrossEntropyOp):
"""Test high rank tensor cross-entropy with discrete one-hot labels.
"""
def init_x(self):
self.shape = [10, 2, 4]
self.ins_num = np.prod(np.array(self.shape))
self.X_2d = randomize_probability(self.ins_num,
self.class_num).astype(self.dtype)
self.x = self.X_2d.reshape(self.shape + [self.class_num])
def init_label(self):
self.label_2d = np.random.randint(
0, self.class_num, (self.ins_num, 1), dtype="int64")
self.label = self.label_2d.reshape(self.shape + [1])
def get_cross_entropy(self):
cross_entropy_2d = np.asmatrix(
[[-np.log(self.X_2d[i][self.label_2d[i][0]])]
for i in range(self.X_2d.shape[0])]).astype(self.dtype)
self.cross_entropy = np.array(cross_entropy_2d).reshape(self.shape +
[1])
def init_attr_type(self):
self.soft_label = False
def init_dtype_type(self):
self.dtype = np.float64
def init_bs_class_num(self):
self.class_num = 10
class TestCrossEntropyOp5(TestCrossEntropyOp):
"""Test high rank tensor cross-entropy with vectorized soft labels.
"""
def init_x(self):
self.shape = [4, 3]
self.ins_num = np.prod(np.array(self.shape))
self.X_2d = randomize_probability(self.ins_num,
self.class_num).astype(self.dtype)
self.x = self.X_2d.reshape(self.shape + [self.class_num])
def init_label(self):
self.label_2d = np.random.uniform(
0.1, 1.0, [self.ins_num, self.class_num]).astype(self.dtype)
self.label_2d /= self.label_2d.sum(axis=1, keepdims=True)
self.label = self.label_2d.reshape(self.shape + [self.class_num])
def get_cross_entropy(self):
cross_entropy_2d = (-self.label_2d * np.log(self.X_2d)).sum(
axis=1, keepdims=True).astype(self.dtype)
self.cross_entropy = np.array(cross_entropy_2d).reshape(self.shape +
[1])
def init_attr_type(self):
self.soft_label = True
def init_dtype_type(self):
self.dtype = np.float32
def init_bs_class_num(self):
self.class_num = 37
def test_check_grad(self):
self.check_grad(
["X"], "Y", max_relative_error=0.05, numeric_grad_delta=0.001)
class TestCrossEntropyOp6(TestCrossEntropyOp):
"""Test high rank tensor cross-entropy with vectorized one-hot representation of labels.
"""
def init_x(self):
self.shape = [4, 3, 2]
self.ins_num = np.prod(np.array(self.shape))
self.X_2d = randomize_probability(self.ins_num,
self.class_num).astype(self.dtype)
self.x = self.X_2d.reshape(self.shape + [self.class_num])
def init_label(self):
self.label_index_2d = np.random.randint(
0, self.class_num, (self.ins_num), dtype="int64")
label_2d = np.zeros(self.X_2d.shape)
label_2d[np.arange(self.ins_num), self.label_index_2d] = 1
self.label = label_2d.reshape(self.shape + [self.class_num]).astype(
self.dtype)
def get_cross_entropy(self):
cross_entropy_2d = np.asmatrix(
[[-np.log(self.X_2d[i][self.label_index_2d[i]])]
for i in range(self.X_2d.shape[0])])
self.cross_entropy = np.array(cross_entropy_2d).reshape(
self.shape + [1]).astype(self.dtype)
def init_attr_type(self):
self.soft_label = True
def init_dtype_type(self):
self.dtype = np.float32
def init_bs_class_num(self):
self.class_num = 17
def test_check_grad(self):
self.check_grad(
["X"], "Y", max_relative_error=0.05, numeric_grad_delta=0.001)
class TestCrossEntropyOp7(TestCrossEntropyOp):
"""Test cross-entropy with ignore index.
"""
def init_label(self):
self.label = np.random.randint(
0, self.class_num, (self.batch_size, 1), dtype="int64")
def get_cross_entropy(self):
self.cross_entropy = np.asmatrix(
[[-np.log(self.x[i][self.label[i][0]])]
if self.label[i][0] != self.ignore_index else [0]
for i in range(self.x.shape[0])]).astype(self.dtype)
def init_attr_type(self):
self.soft_label = False
self.ignore_index = 3
def init_dtype_type(self):
self.dtype = np.float64
def init_bs_class_num(self):
self.batch_size = 30
self.class_num = 10
from paddle.fluid.tests.unittests.test_cross_entropy_op import TestCrossEntropyOp, TestCrossEntropyOp2, TestCrossEntropyOp3, TestCrossEntropyOp4, TestCrossEntropyOp5, TestCrossEntropyOp6, TestCrossEntropyOp7
if __name__ == "__main__":
unittest.main()
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
from paddle.fluid.tests.unittests.test_momentum_op import TestMomentumOp1, TestMomentumOp2, TestLarsMomentumOp, TestSparseMomentumOp, TestSparseMomentumOp2
if __name__ == '__main__':
unittest.main()
......@@ -40,6 +40,8 @@ class SimpleLSTMRNN(fluid.imperative.Layer):
self._dropout = dropout
self._input = None
self._num_steps = num_steps
from paddle.fluid.layer_helper import LayerHelper
self._helper = LayerHelper('SimpleLSTMRNN', act="tanh")
def _build_once(self, input_embedding, init_hidden=None, init_cell=None):
self.weight_1_arr = []
......@@ -50,17 +52,21 @@ class SimpleLSTMRNN(fluid.imperative.Layer):
self.mask_array = []
for i in range(self._num_layers):
weight_1 = fluid.layers.create_parameter(
weight_1 = self._helper.create_parameter(
attr=fluid.ParamAttr(
initializer=fluid.initializer.UniformInitializer(
low=-self._init_scale, high=self._init_scale)),
shape=[self._hidden_size * 2, self._hidden_size * 4],
dtype="float32",
name="fc_weight1_" + str(i),
default_initializer=fluid.initializer.UniformInitializer(
low=-self._init_scale, high=self._init_scale))
self.weight_1_arr.append(weight_1)
bias_1 = fluid.layers.create_parameter(
[self._hidden_size * 4],
bias_1 = self._helper.create_parameter(
attr=fluid.ParamAttr(
initializer=fluid.initializer.UniformInitializer(
low=-self._init_scale, high=self._init_scale)),
shape=[self._hidden_size * 4],
dtype="float32",
name="fc_bias1_" + str(i),
default_initializer=fluid.initializer.Constant(0.0))
self.bias_arr.append(bias_1)
......@@ -137,6 +143,8 @@ class PtbModel(fluid.imperative.Layer):
self.num_layers = num_layers
self.num_steps = num_steps
self.dropout = dropout
from paddle.fluid.layer_helper import LayerHelper
self._helper = LayerHelper('PtbModel', act="tanh")
self.simple_lstm_rnn = SimpleLSTMRNN(
hidden_size,
num_steps,
......@@ -151,16 +159,16 @@ class PtbModel(fluid.imperative.Layer):
name='embedding_para',
initializer=fluid.initializer.UniformInitializer(
low=-init_scale, high=init_scale)))
self.softmax_weight = fluid.layers.create_parameter(
[self.hidden_size, self.vocab_size],
self.softmax_weight = self._helper.create_parameter(
attr=fluid.ParamAttr(),
shape=[self.hidden_size, self.vocab_size],
dtype="float32",
name="softmax_weight",
default_initializer=fluid.initializer.UniformInitializer(
low=-self.init_scale, high=self.init_scale))
self.softmax_bias = fluid.layers.create_parameter(
[self.vocab_size],
self.softmax_bias = self._helper.create_parameter(
attr=fluid.ParamAttr(),
shape=[self.vocab_size],
dtype="float32",
name='softmax_bias',
default_initializer=fluid.initializer.UniformInitializer(
low=-self.init_scale, high=self.init_scale))
......@@ -256,7 +264,6 @@ class TestImperativePtbRnn(unittest.TestCase):
with new_program_scope():
fluid.default_startup_program().random_seed = seed
fluid.default_main_program().random_seed = seed
# TODO: marsyang1993 Change seed to
ptb_model = PtbModel(
hidden_size=hidden_size,
vocab_size=vocab_size,
......
......@@ -173,13 +173,16 @@ def lod_multiclass_nms(boxes, scores, background, score_threshold,
normalized,
shared=False)
if nmsed_num == 0:
#lod.append(1)
continue
lod.append(nmsed_num)
tmp_det_out = []
for c, indices in nmsed_outs.items():
for idx in indices:
xmin, ymin, xmax, ymax = box[idx, c, :]
det_outs.append([c, score[idx][c], xmin, ymin, xmax, ymax])
tmp_det_out.append([c, score[idx][c], xmin, ymin, xmax, ymax])
sorted_det_out = sorted(
tmp_det_out, key=lambda tup: tup[0], reverse=False)
det_outs.extend(sorted_det_out)
if len(lod) == 0:
lod.append(1)
......
......@@ -274,7 +274,7 @@ class TestAdagradOptimizer(unittest.TestCase):
# Check init_program
init_ops = init_program.global_block().ops
self.assertEqual(len(init_ops), 2)
self.assertEqual(len(init_ops), 3)
self.assertEqual(init_ops[0].type, "fill_constant")
self.assertAlmostEqual(init_ops[0].attr('value'), learning_rate)
self.assertEqual(init_ops[1].type, "fill_constant")
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
import os
os.environ['FLAGS_enable_parallel_graph'] = str(1)
import paddle.fluid.core as core
import os
import paddle.fluid as fluid
from parallel_executor_test_base import TestParallelExecutorBase
def simple_fc_net(use_feed):
img = fluid.layers.data(name='image', shape=[784], dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
hidden = img
for _ in range(4):
hidden = fluid.layers.fc(
hidden,
size=200,
act='tanh',
bias_attr=fluid.ParamAttr(
initializer=fluid.initializer.Constant(value=1.0)))
prediction = fluid.layers.fc(hidden, size=10, act='softmax')
loss = fluid.layers.cross_entropy(input=prediction, label=label)
loss = fluid.layers.mean(loss)
return loss
class TestMNIST(TestParallelExecutorBase):
@classmethod
def setUpClass(cls):
os.environ['CPU_NUM'] = str(4)
def _init_data(self):
np.random.seed(5)
img = np.random.random(size=[32, 784]).astype(np.float32)
label = np.ones(shape=[32, 1], dtype='int64')
return img, label
# simple_fc
def check_simple_fc_convergence(self, use_cuda, use_reduce=False):
if use_cuda and not core.is_compiled_with_cuda():
return
img, label = self._init_data()
self.check_network_convergence(
simple_fc_net,
feed_dict={"image": img,
"label": label},
use_cuda=use_cuda,
use_reduce=use_reduce)
def test_simple_fc(self):
# use_cuda
self.check_simple_fc_convergence(True)
def check_simple_fc_parallel_accuracy(self, use_cuda):
if use_cuda and not core.is_compiled_with_cuda():
return
img, label = self._init_data()
single_first_loss, single_last_loss = self.check_network_convergence(
method=simple_fc_net,
seed=1,
feed_dict={"image": img,
"label": label},
use_cuda=use_cuda,
use_parallel_executor=False)
parallel_first_loss, parallel_last_loss = self.check_network_convergence(
method=simple_fc_net,
seed=1,
feed_dict={"image": img,
"label": label},
use_cuda=use_cuda,
use_parallel_executor=True)
self.assertAlmostEquals(
np.mean(parallel_first_loss),
single_first_loss,
delta=1e-6, )
self.assertAlmostEquals(
np.mean(parallel_last_loss), single_last_loss, delta=1e-6)
def test_simple_fc_parallel_accuracy(self):
self.check_simple_fc_parallel_accuracy(True)
if __name__ == '__main__':
unittest.main()
......@@ -16,15 +16,19 @@ from __future__ import print_function
import unittest
import os
import tempfile
import numpy as np
import paddle.fluid as fluid
import paddle.fluid.profiler as profiler
import paddle.fluid.layers as layers
import paddle.fluid.core as core
import paddle.fluid.proto.profiler.profiler_pb2 as profiler_pb2
class TestProfiler(unittest.TestCase):
def net_profiler(self, state, profile_path='/tmp/profile'):
def net_profiler(self, state, use_parallel_executor=False):
profile_path = os.path.join(tempfile.gettempdir(), "profile")
open(profile_path, "w").write("")
startup_program = fluid.Program()
main_program = fluid.Program()
......@@ -60,6 +64,11 @@ class TestProfiler(unittest.TestCase):
place = fluid.CPUPlace() if state == 'CPU' else fluid.CUDAPlace(0)
exe = fluid.Executor(place)
exe.run(startup_program)
if use_parallel_executor:
pe = fluid.ParallelExecutor(
state != 'CPU',
loss_name=avg_cost.name,
main_program=main_program)
pass_acc_calculator = fluid.average.WeightedAverage()
with profiler.profiler(state, 'total', profile_path) as prof:
......@@ -69,6 +78,9 @@ class TestProfiler(unittest.TestCase):
x = np.random.random((32, 784)).astype("float32")
y = np.random.randint(0, 10, (32, 1)).astype("int64")
if use_parallel_executor:
pe.run(feed={'x': x, 'y': y}, fetch_list=[avg_cost.name])
continue
outs = exe.run(main_program,
feed={'x': x,
'y': y},
......@@ -77,21 +89,37 @@ class TestProfiler(unittest.TestCase):
b_size = np.array(outs[2])
pass_acc_calculator.add(value=acc, weight=b_size)
pass_acc = pass_acc_calculator.eval()
data = open(profile_path, 'rb').read()
self.assertGreater(len(data), 0)
profile_pb = profiler_pb2.Profile()
profile_pb.ParseFromString(data)
self.assertGreater(len(profile_pb.events), 0)
for event in profile_pb.events:
if event.type == profiler_pb2.Event.GPUKernel:
if not event.detail_info and not event.name.startswith("MEM"):
raise Exception(
"Kernel %s missing event. Has this kernel been recorded by RecordEvent?"
% event.name)
elif event.type == profiler_pb2.Event.CPU and (
event.name.startswith("Driver API") or
event.name.startswith("Runtime API")):
print("Warning: unregister", event.name)
def test_cpu_profiler(self):
self.net_profiler('CPU')
self.net_profiler('CPU', use_parallel_executor=True)
@unittest.skipIf(not core.is_compiled_with_cuda(),
"profiler is enabled only with GPU")
def test_cuda_profiler(self):
self.net_profiler('GPU')
self.net_profiler('GPU', use_parallel_executor=True)
@unittest.skipIf(not core.is_compiled_with_cuda(),
"profiler is enabled only with GPU")
def test_all_profiler(self):
self.net_profiler('All', '/tmp/profile_out')
with open('/tmp/profile_out', 'rb') as f:
self.assertGreater(len(f.read()), 0)
self.net_profiler('All')
self.net_profiler('All', use_parallel_executor=True)
if __name__ == '__main__':
......
......@@ -1020,7 +1020,11 @@ class DistributeTranspiler(object):
skip_dim0 = 0
slice_vars = self.param_var_mapping[orig_var_name]
orig_dim1_flatten = reduce(lambda x, y: x * y, slice_vars[0].shape[1:])
orig_dim1_flatten = 1
if len(slice_vars[0].shape) >= 2:
orig_dim1_flatten = reduce(lambda x, y: x * y,
slice_vars[0].shape[1:])
for slice_var in slice_vars[:block_idx]:
skip_dim0 += slice_var.shape[0]
......
requests==2.9.2
numpy>=1.12
protobuf>=3.6
protobuf>=3.1.0
recordio>=0.1.0
matplotlib==2.2.3 # TODO: let python3 paddlepaddle package use latest matplotlib
rarfile
......
......@@ -24,3 +24,8 @@ sed 's/<baseimg>/9.0-cudnn7-devel-centos6/g' Dockerfile.x64 | \
sed 's/<NCCL_MAKE_OPTS>/NVCC_GENCODE="-gencode=arch=compute_35,code=sm_35 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_60,code=compute_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_62,code=sm_62 -gencode=arch=compute_70,code=sm_70"/g'> Dockerfile.tmp
docker build -t ${REPO}/paddle_manylinux_devel:cuda9.0_cudnn7 -f Dockerfile.tmp .
docker push ${REPO}/paddle_manylinux_devel:cuda9.0_cudnn7
sed 's/<baseimg>/10.0-devel-centos6/g' Dockerfile.x64 | \
sed 's/<NCCL_MAKE_OPTS>/NVCC_GENCODE="-gencode=arch=compute_35,code=sm_35 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_60,code=compute_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_62,code=sm_62 -gencode=arch=compute_70,code=sm_70 -gencode=arch=compute_75,code=sm_75"/g'> Dockerfile.tmp
docker build -t ${REPO}/paddle_manylinux_devel:cuda10.0_cudnn7 -f Dockerfile.tmp .
docker push ${REPO}/paddle_manylinux_devel:cuda10.0_cudnn7
......@@ -107,11 +107,13 @@ curl-config --features
rm -rf /usr/local/ssl
# Install patchelf (latest with unreleased bug fixes)
curl -sLO https://nixos.org/releases/patchelf/patchelf-0.9/patchelf-0.9.tar.gz
check_sha256sum patchelf-0.9.tar.gz $PATCHELF_HASH
tar -xzf patchelf-0.9.tar.gz
(cd patchelf-0.9 && ./configure && make && make install)
rm -rf patchelf-0.9.tar.gz patchelf-0.9
# FIXME(typhoonzero): restore this when the link is fixed.
# curl -sLO http://nipy.bic.berkeley.edu/manylinux/patchelf-0.9njs2.tar.gz
# check_sha256sum patchelf-0.9njs2.tar.gz $PATCHELF_HASH
# tar -xzf patchelf-0.9njs2.tar.gz
# (cd patchelf-0.9njs2 && ./configure && make && make install)
# rm -rf patchelf-0.9njs2.tar.gz patchelf-0.9njs2
yum install -y patchelf
# Install latest pypi release of auditwheel
LD_LIBRARY_PATH="${ORIGINAL_LD_LIBRARY_PATH}:$(dirname ${PY35_BIN})/lib" $PY35_BIN/pip install auditwheel
......
......@@ -87,6 +87,8 @@ function do_cpython_build {
# NOTE Make libpython shared library visible to python calls below
LD_LIBRARY_PATH="${prefix}/lib" ${prefix}/bin/python get-pip.py
LD_LIBRARY_PATH="${prefix}/lib" ${prefix}/bin/pip install wheel
cd /
ls ${MY_DIR}
local abi_tag=$(LD_LIBRARY_PATH="${prefix}/lib" ${prefix}/bin/python ${MY_DIR}/python-tag-abi-tag.py)
ln -s ${prefix} /opt/python/${abi_tag}
}
......
......@@ -131,8 +131,12 @@ class Timeline(object):
if (k, event.device_id, "CPU") not in self._devices:
pid = self._allocate_pid()
self._devices[(k, event.device_id, "CPU")] = pid
self._chrome_trace.emit_pid("%s:cpu:block:%d" %
(k, event.device_id), pid)
# -1 device id represents CUDA api call
if event.device_id == -1:
self._chrome_trace.emit_pid("%s:cuda_api" % k, pid)
else:
self._chrome_trace.emit_pid(
"%s:cpu:block:%d" % (k, event.device_id), pid)
elif event.type == profiler_pb2.Event.GPUKernel:
if (k, event.device_id, "GPUKernel") not in self._devices:
pid = self._allocate_pid()
......@@ -150,7 +154,9 @@ class Timeline(object):
pid = self._devices[(k, event.device_id, type)]
args = {'name': event.name}
if event.memcopy.bytes > 0:
args = {'mem_bytes': event.memcopy.bytes}
args['mem_bytes'] = event.memcopy.bytes
if event.detail_info:
args['detail_info'] = event.detail_info
# TODO(panyx0718): Chrome tracing only handles ms. However, some
# ops takes micro-seconds. Hence, we keep the ns here.
self._chrome_trace.emit_region(
......@@ -173,7 +179,7 @@ if args.timeline_path:
profile_paths = profile_path.split(',')
profile_dict = dict()
if len(profile_paths) == 1:
with open(profile_path, 'r') as f:
with open(profile_path, 'rb') as f:
profile_s = f.read()
profile_pb = profiler_pb2.Profile()
profile_pb.ParseFromString(profile_s)
......@@ -181,7 +187,7 @@ if len(profile_paths) == 1:
else:
for profile_path in profile_paths:
k, v = profile_path.split('=')
with open(v, 'r') as f:
with open(v, 'rb') as f:
profile_s = f.read()
profile_pb = profiler_pb2.Profile()
profile_pb.ParseFromString(profile_s)
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册