Commit 896ed573, authored by jingqinghe

update

@@ -28,7 +28,10 @@ include(generic) # simplify cmake module
 # TODO(Shibo Tao): remove find_package(CUDA) completely.
 find_package(CUDA QUIET)
 option(WITH_GPU "Compile PaddlePaddle with NVIDIA GPU" ${CUDA_FOUND})
+option(WITH_XPU "Compile PaddlePaddle with BAIDU KUNLUN" OFF)
+if (WITH_GPU AND WITH_XPU)
+  message(FATAL_ERROR "Error when compile GPU and XPU at the same time")
+endif()
 # cmake 3.12, 3.13, 3.14 will append gcc link options to nvcc, and nvcc doesn't recognize them.
 if(WITH_GPU AND (${CMAKE_VERSION} VERSION_GREATER_EQUAL 3.12) AND (${CMAKE_VERSION} VERSION_LESS 3.15))
   message(FATAL_ERROR "cmake ${CMAKE_VERSION} is not supported when WITH_GPU=ON because of bug https://cmake.org/pipermail/cmake/2018-September/068195.html. "
...
@@ -33,7 +33,7 @@ pip install paddlepaddle
 # Linux GPU cuda10cudnn7
 pip install paddlepaddle-gpu
 # Linux GPU cuda9cudnn7
-pip install paddlepaddle-gpu==1.8.3.post97
+pip install paddlepaddle-gpu==1.8.4.post97
 ```
 It is recommended to read [this doc](https://www.paddlepaddle.org.cn/documentation/docs/en/beginners_guide/install/index_en.html) on our website.
...
@@ -30,7 +30,7 @@ pip install paddlepaddle
 # Linux GPU cuda10cudnn7
 pip install paddlepaddle-gpu
 # Linux GPU cuda9cudnn7
-pip install paddlepaddle-gpu==1.8.3.post97
+pip install paddlepaddle-gpu==1.8.4.post97
 ```
 For more installation details, see the official [installation guide](http://www.paddlepaddle.org.cn/documentation/docs/zh/1.8/beginners_guide/install/index_cn.html).
...
@@ -63,6 +63,11 @@ if(WITH_BOX_PS)
   add_definitions(-DPADDLE_WITH_BOX_PS)
 endif()
+if(WITH_XPU)
+  message(STATUS "Compile with XPU!")
+  add_definitions(-DPADDLE_WITH_XPU)
+endif()
 if(WITH_GPU)
   add_definitions(-DPADDLE_WITH_CUDA)
   add_definitions(-DEIGEN_USE_GPU)
...
@@ -61,6 +61,10 @@ function(detect_installed_gpus out_variable)
     if(NOT CUDA_gpu_detect_output)
       message(STATUS "Automatic GPU detection failed. Building for all known architectures.")
       set(${out_variable} ${paddle_known_gpu_archs} PARENT_SCOPE)
+      #Todo: fix Automatic GPU detection failed on windows
+      if(WIN32)
+        set(${out_variable} "61 75" PARENT_SCOPE)
+      endif()
     else()
       set(${out_variable} ${CUDA_gpu_detect_output} PARENT_SCOPE)
     endif()
...
@@ -17,7 +17,7 @@ include(ExternalProject)
 set(CUB_PREFIX_DIR ${THIRD_PARTY_PATH}/cub)
 set(CUB_SOURCE_DIR ${THIRD_PARTY_PATH}/cub/src/extern_cub)
 set(CUB_REPOSITORY https://github.com/NVlabs/cub.git)
-set(CUB_TAG 1.9.8)
+set(CUB_TAG 1.8.0)
 cache_third_party(extern_cub
     REPOSITORY    ${CUB_REPOSITORY}
...
@@ -14,13 +14,21 @@
 INCLUDE(ExternalProject)
+execute_process(COMMAND bash -c "gcc -dumpversion" OUTPUT_VARIABLE GCC_VERSION)
 SET(GLOO_PROJECT "extern_gloo")
 IF((NOT DEFINED GLOO_VER) OR (NOT DEFINED GLOO_URL))
   MESSAGE(STATUS "use pre defined download url")
   SET(GLOO_VER "master" CACHE STRING "" FORCE)
   SET(GLOO_NAME "gloo" CACHE STRING "" FORCE)
-  SET(GLOO_URL "https://pslib.bj.bcebos.com/gloo.tar.gz" CACHE STRING "" FORCE)
+  if(${GCC_VERSION} VERSION_EQUAL "8.2.0")
+    SET(GLOO_URL "https://fleet.bj.bcebos.com/gloo/gloo.tar.gz.gcc8" CACHE STRING "" FORCE)
+  else()
+    SET(GLOO_URL "https://fleet.bj.bcebos.com/gloo/gloo.tar.gz.gcc482" CACHE STRING "" FORCE)
+  endif()
 ENDIF()
 MESSAGE(STATUS "GLOO_NAME: ${GLOO_NAME}, GLOO_URL: ${GLOO_URL}")
 SET(GLOO_SOURCE_DIR "${THIRD_PARTY_PATH}/gloo")
 SET(GLOO_DOWNLOAD_DIR "${GLOO_SOURCE_DIR}/src/${GLOO_PROJECT}")
...
if (NOT WITH_XPU)
return()
endif()
INCLUDE(ExternalProject)
SET(XPU_PROJECT "extern_xpu")
SET(XPU_URL "https://kunlun1.su.bcebos.com/xpu.tar.gz" CACHE STRING "" FORCE)
SET(XPU_SOURCE_DIR "${THIRD_PARTY_PATH}/xpu")
SET(XPU_DOWNLOAD_DIR "${XPU_SOURCE_DIR}/src/${XPU_PROJECT}")
SET(XPU_INSTALL_DIR "${THIRD_PARTY_PATH}/install/xpu")
SET(XPU_API_INC_DIR "${THIRD_PARTY_PATH}/install/xpu/api/include")
SET(XPU_RUNTIME_INC_DIR "${THIRD_PARTY_PATH}/install/xpu/runtime/include")
SET(XPU_LIB_DIR "${THIRD_PARTY_PATH}/install/xpu/lib")
SET(XPU_API_LIB_NAME "libxpuapi.so")
SET(XPU_RT_LIB_NAME "libxpurt.so")
SET(XPU_SIM_LIB_NAME "libxpusim.so")
SET(XPU_API_LIB "${XPU_LIB_DIR}/${XPU_API_LIB_NAME}")
SET(XPU_RT_LIB "${XPU_LIB_DIR}/${XPU_RT_LIB_NAME}")
SET(XPU_SIM_LIB "${XPU_LIB_DIR}/${XPU_SIM_LIB_NAME}")
SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${XPU_INSTALL_DIR}/lib")
INCLUDE_DIRECTORIES(${XPU_API_INC_DIR})
INCLUDE_DIRECTORIES(${XPU_RUNTIME_INC_DIR})
FILE(WRITE ${XPU_DOWNLOAD_DIR}/CMakeLists.txt
"PROJECT(XPU)\n"
"cmake_minimum_required(VERSION 3.0)\n"
"install(DIRECTORY xpu/api xpu/runtime xpu/lib \n"
" DESTINATION ${XPU_INSTALL_DIR})\n")
ExternalProject_Add(
${XPU_PROJECT}
${EXTERNAL_PROJECT_LOG_ARGS}
PREFIX ${XPU_SOURCE_DIR}
DOWNLOAD_DIR ${XPU_DOWNLOAD_DIR}
DOWNLOAD_COMMAND wget --no-check-certificate ${XPU_URL} -c -q -O xpu.tar.gz
&& tar xvf xpu.tar.gz
DOWNLOAD_NO_PROGRESS 1
UPDATE_COMMAND ""
CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${XPU_INSTALL_ROOT}
CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${XPU_INSTALL_ROOT}
)
ADD_LIBRARY(shared_xpuapi SHARED IMPORTED GLOBAL)
set_property(TARGET shared_xpuapi PROPERTY IMPORTED_LOCATION "${XPU_API_LIB}")
# generate a static dummy target to track xpulib dependencies
# for cc_library(xxx SRCS xxx.c DEPS xpulib)
generate_dummy_static_lib(LIB_NAME "xpulib" GENERATOR "xpu.cmake")
TARGET_LINK_LIBRARIES(xpulib ${XPU_API_LIB} ${XPU_RT_LIB} ${XPU_SIM_LIB})
ADD_DEPENDENCIES(xpulib ${XPU_PROJECT})
@@ -384,8 +384,8 @@ function(cc_test_run TARGET_NAME)
     set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cpu_deterministic=true)
     set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true)
     set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true)
-    # No unit test should exceed 10 minutes.
-    set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 600)
+    # No unit test should exceed 2 minutes.
+    set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 120)
   endif()
 endfunction()
@@ -743,8 +743,8 @@ function(py_test TARGET_NAME)
                WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
     endif()
-    # No unit test should exceed 10 minutes.
-    set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 600)
+    # No unit test should exceed 2 minutes.
+    set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 120)
   endif()
 endfunction()
...
@@ -8,6 +8,7 @@ function(op_library TARGET)
   set(hip_cu_srcs)
   set(miopen_hip_cc_srcs)
   set(cu_cc_srcs)
+  set(xpu_cc_srcs)
   set(cudnn_cu_cc_srcs)
   set(cudnn_cu_srcs)
   set(CUDNN_FILE)
@@ -60,6 +61,12 @@ function(op_library TARGET)
         list(APPEND mkldnn_cc_srcs mkldnn/${MKLDNN_FILE}.cc)
       endif()
     endif()
+    if(WITH_XPU)
+      string(REPLACE "_op" "_xpu_op" XPU_FILE "${TARGET}")
+      if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/xpu/${XPU_FILE}.cc)
+        list(APPEND xpu_cc_srcs xpu/${XPU_FILE}.cc)
+      endif()
+    endif()
   else()
     foreach(src ${op_library_SRCS})
       if (${src} MATCHES ".*\\.hip.cu$")
@@ -76,6 +83,8 @@ function(op_library TARGET)
         list(APPEND mkldnn_cc_srcs ${src})
       elseif(${src} MATCHES ".*\\.cu.cc$")
         list(APPEND cu_cc_srcs ${src})
+      elseif(WITH_XPU AND ${src} MATCHES ".*_xpu_op.cc$")
+        list(APPEND xpu_cc_srcs ${src})
       elseif(${src} MATCHES ".*\\.cc$")
         list(APPEND cc_srcs ${src})
       else()
@@ -109,7 +118,7 @@ function(op_library TARGET)
     hip_library(${TARGET} SRCS ${cc_srcs} ${hip_cu_srcs} ${miopen_hip_cc_srcs} ${mkldnn_cc_srcs} DEPS ${op_library_DEPS}
       ${op_common_deps})
   else()
-    cc_library(${TARGET} SRCS ${cc_srcs} ${mkldnn_cc_srcs} DEPS ${op_library_DEPS}
+    cc_library(${TARGET} SRCS ${cc_srcs} ${mkldnn_cc_srcs} ${xpu_cc_srcs} DEPS ${op_library_DEPS}
       ${op_common_deps})
   endif()
@@ -150,10 +159,11 @@ function(op_library TARGET)
   list(LENGTH cu_srcs cu_srcs_len)
   list(LENGTH cu_cc_srcs cu_cc_srcs_len)
   list(LENGTH mkldnn_cc_srcs mkldnn_cc_srcs_len)
+  list(LENGTH xpu_cc_srcs xpu_cc_srcs_len)
   list(LENGTH hip_cu_srcs hip_cu_srcs_len)
   list(LENGTH miopen_hip_cc_srcs miopen_hip_cc_srcs_len)
   if (${pybind_flag} EQUAL 0 AND ${mkldnn_cc_srcs_len} EQUAL 0 AND ${cu_srcs_len} EQUAL 0 AND ${cu_cc_srcs_len} EQUAL 0 AND
-    ${hip_cu_srcs_len} EQUAL 0 AND ${miopen_hip_cc_srcs_len} EQUAL 0)
+    ${hip_cu_srcs_len} EQUAL 0 AND ${miopen_hip_cc_srcs_len} EQUAL 0 AND ${xpu_cc_srcs_len} EQUAL 0)
     file(APPEND ${pybind_file} "USE_CPU_ONLY_OP(${TARGET});\n")
     set(pybind_flag 1)
   endif()
@@ -179,6 +189,9 @@ function(op_library TARGET)
     file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, MIOPEN);\n")
   endif()
+  if (WITH_XPU AND ${xpu_cc_srcs_len} GREATER 0)
+    file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, XPU);\n")
+  endif()
   # pybind USE_OP_DEVICE_KERNEL for MKLDNN
   if (WITH_MKLDNN AND ${mkldnn_cc_srcs_len} GREATER 0)
     # Append first implemented MKLDNN activation operator
@@ -228,6 +241,7 @@ function(register_operators)
   file(GLOB OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*_op.cc")
   string(REPLACE "_mkldnn" "" OPS "${OPS}")
+  string(REPLACE "_xpu" "" OPS "${OPS}")
   string(REPLACE ".cc" "" OPS "${OPS}")
   list(REMOVE_DUPLICATES OPS)
   list(LENGTH register_operators_DEPS register_operators_DEPS_len)
...
@@ -250,6 +250,11 @@ if(WITH_GPU)
     file_download_and_uncompress(${CUDAERROR_URL} "cudaerror") # download file cudaErrorMessage
 endif(WITH_GPU)
+if(WITH_XPU)
+    include(external/xpu)  # download, build, install xpu
+    list(APPEND third_party_deps extern_xpu)
+endif(WITH_XPU)
 if(WITH_PSLIB)
     include(external/pslib)  # download, build, install pslib
     list(APPEND third_party_deps extern_pslib)
@@ -263,10 +268,6 @@ if(WITH_PSLIB)
     endif()
 endif(WITH_PSLIB)
-if(NOT WIN32 AND NOT APPLE)
-    include(external/gloo)
-    list(APPEND third_party_deps extern_gloo)
-endif()
 if(WITH_BOX_PS)
     include(external/box_ps)
@@ -274,6 +275,11 @@ if(WITH_BOX_PS)
 endif(WITH_BOX_PS)
 if(WITH_DISTRIBUTE)
+    if(WITH_GLOO)
+        include(external/gloo)
+        list(APPEND third_party_deps extern_gloo)
+    endif()
     if(WITH_GRPC)
         list(APPEND third_party_deps extern_grpc)
     else()
...
@@ -122,6 +122,10 @@ cc_library(data_transform SRCS data_transform.cc DEPS math_function tensor
 cc_library(attribute SRCS attribute.cc DEPS framework_proto boost)
 cc_test(program_desc_test SRCS program_desc_test.cc DEPS proto_desc
 device_context)
+cc_library(op_version_registry SRCS op_version_registry.cc DEPS framework_proto boost)
+cc_test(op_version_registry_test SRCS op_version_registry_test.cc DEPS op_version_registry)
 cc_library(op_proto_maker SRCS op_proto_maker.cc DEPS framework_proto attribute glog)
 cc_test(op_proto_maker_test SRCS op_proto_maker_test.cc DEPS op_proto_maker)
 cc_library(no_need_buffer_vars_inference SRCS no_need_buffer_vars_inference.cc DEPS attribute device_context)
@@ -268,6 +272,7 @@ cc_test(op_compatible_info_test SRCS op_compatible_info_test.cc DEPS op_compatib
 cc_library(save_load_util SRCS save_load_util DEPS tensor scope layer)
 cc_test(save_load_util_test SRCS save_load_util_test.cc DEPS save_load_util tensor scope layer)
+cc_library(generator SRCS generator.cc)
 # Get the current working branch
 execute_process(
...
@@ -70,6 +70,11 @@ struct DLContextVisitor : public boost::static_visitor<::DLContext> {
     return ctx;
   }
+  inline ::DLContext operator()(const platform::XPUPlace &place) const {
+    PADDLE_THROW(
+        platform::errors::Unimplemented("platform::XPUPlace is not supported"));
+  }
   inline ::DLContext operator()(const platform::CUDAPlace &place) const {
 #ifdef PADDLE_WITH_CUDA
     ::DLContext ctx;
...
@@ -444,8 +444,8 @@ void Executor::RunPartialPreparedContext(ExecutorPrepareContext* ctx,
   int64_t max_memory_size = GetEagerDeletionThreshold();
   std::unique_ptr<GarbageCollector> gc;
   if (!ctx->force_disable_gc_ && max_memory_size >= 0) {
-#ifdef PADDLE_WITH_CUDA
     if (platform::is_gpu_place(place_)) {
+#ifdef PADDLE_WITH_CUDA
       if (IsFastEagerDeletionModeEnabled()) {
         gc.reset(new UnsafeFastGPUGarbageCollector(
             BOOST_GET_CONST(platform::CUDAPlace, place_), max_memory_size));
@@ -453,14 +453,23 @@ void Executor::RunPartialPreparedContext(ExecutorPrepareContext* ctx,
         gc.reset(new DefaultStreamGarbageCollector(
             BOOST_GET_CONST(platform::CUDAPlace, place_), max_memory_size));
       }
-    } else if (platform::is_cpu_place(place_)) {
+#else
+      PADDLE_THROW(
+          platform::errors::Unimplemented("No GPU gc found in CPU/XPU paddle"));
 #endif
+    } else if (platform::is_cpu_place(place_)) {
       gc.reset(new CPUGarbageCollector(
           BOOST_GET_CONST(platform::CPUPlace, place_), max_memory_size));
-#ifdef PADDLE_WITH_CUDA
-    }
+    } else if (platform::is_xpu_place(place_)) {
+#ifdef PADDLE_WITH_XPU
+      gc.reset(new XPUGarbageCollector(
+          BOOST_GET_CONST(platform::XPUPlace, place_), max_memory_size));
+#else
+      PADDLE_THROW(
+          platform::errors::Unimplemented("No XPU gc found in CPU/GPU paddle"));
 #endif
     }
+  }
   for (int64_t i = start_op_index; i < end_op_index; ++i) {
     auto& op = ctx->ops_[i];
...
@@ -19,6 +19,6 @@ else()
   cc_library(gloo_wrapper SRCS gloo_wrapper.cc DEPS framework_proto variable_helper scope)
 endif(WITH_GLOO)
-cc_library(heter_wrapper SRCS heter_wrapper.cc DEPS framework_proto device_context)
+cc_library(heter_wrapper SRCS heter_wrapper.cc DEPS framework_proto device_context heter_service_proto)
 cc_test(test_fleet SRCS test_fleet.cc DEPS fleet_wrapper gloo_wrapper fs shell)
@@ -50,6 +50,15 @@ void CPUGarbageCollector::ClearCallback(const std::function<void()> &callback) {
   callback();
 }
+#ifdef PADDLE_WITH_XPU
+XPUGarbageCollector::XPUGarbageCollector(const platform::XPUPlace &place,
+                                         size_t max_memory_size)
+    : GarbageCollector(place, max_memory_size) {}
+
+void XPUGarbageCollector::ClearCallback(const std::function<void()> &callback) {
+  callback();
+}
+#endif
 #ifdef PADDLE_WITH_CUDA
 UnsafeFastGPUGarbageCollector::UnsafeFastGPUGarbageCollector(
     const platform::CUDAPlace &place, size_t max_memory_size)
...
@@ -59,6 +59,16 @@ class CPUGarbageCollector : public GarbageCollector {
   void ClearCallback(const std::function<void()> &callback) override;
 };
+#ifdef PADDLE_WITH_XPU
+class XPUGarbageCollector : public GarbageCollector {
+ public:
+  XPUGarbageCollector(const platform::XPUPlace &place, size_t max_memory_size);
+
+ protected:
+  void ClearCallback(const std::function<void()> &callback) override;
+};
+#endif
 #ifdef PADDLE_WITH_CUDA
 class UnsafeFastGPUGarbageCollector : public GarbageCollector {
  public:
...
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <deque>
#include <memory>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include "paddle/fluid/framework/generator.h"
namespace paddle {
namespace framework {
std::shared_ptr<Generator> Generator::gen_instance_ = NULL;
GeneratorState* Generator::GetState() {
std::lock_guard<std::mutex> lock(this->mutex);
return this->state_.get();
}
void Generator::SetState(GeneratorState* state_in) {
std::lock_guard<std::mutex> lock(this->mutex);
*this->state_ = *state_in;
}
uint64_t Generator::GetCurrentSeed() {
std::lock_guard<std::mutex> lock(this->mutex);
return this->state_->current_seed;
}
uint64_t Generator::Seed() {
std::lock_guard<std::mutex> lock(this->mutex);
uint64_t seed;
std::random_device de;
seed = ((((uint64_t)de()) << 32) + de()) & 0x1FFFFFFFFFFFFF;
this->state_->current_seed = seed;
std::seed_seq seq({seed});
this->state_->cpu_engine.seed(seq);
return this->state_->current_seed;
}
void Generator::SetCurrentSeed(uint64_t seed) {
std::lock_guard<std::mutex> lock(this->mutex);
this->state_->current_seed = uint64_t(seed);
std::seed_seq seq({seed});
this->state_->cpu_engine.seed(seq);
}
std::mt19937_64& Generator::GetCPUEngine() {
std::lock_guard<std::mutex> lock(this->mutex);
return this->state_->cpu_engine;
}
void Generator::SetCPUEngine(std::mt19937_64 engine) {
std::lock_guard<std::mutex> lock(this->mutex);
this->state_->cpu_engine = std::mt19937_64(engine);
}
uint64_t Generator::Random64() {
std::lock_guard<std::mutex> lock(this->mutex);
return this->state_->cpu_engine();
}
} // namespace framework
} // namespace paddle
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <stdint.h>
#include <atomic>
#include <deque>
#include <iostream> // temp for debug
#include <memory>
#include <mutex> // NOLINT
#include <random>
#include <typeinfo>
#include <utility>
namespace paddle {
namespace framework {
struct GeneratorState {
int64_t device = -1;
uint64_t current_seed = 34342423252;
std::mt19937_64 cpu_engine;
};
struct Generator {
Generator() {
GeneratorState default_gen_state_cpu;
default_gen_state_cpu.device = -1;
default_gen_state_cpu.current_seed = 34342423252;
std::seed_seq seq({34342423252});
default_gen_state_cpu.cpu_engine = std::mt19937_64(seq);
this->state_ = std::make_shared<GeneratorState>(default_gen_state_cpu);
}
explicit Generator(GeneratorState state_in)
: state_{std::make_shared<GeneratorState>(state_in)} {}
Generator(const Generator& other)
: Generator(other, std::lock_guard<std::mutex>(other.mutex)) {}
// get random state
GeneratorState* GetState();
// set random state
void SetState(GeneratorState* state_in);
// get current seed
uint64_t GetCurrentSeed();
// random a seed and get
uint64_t Seed();
// set seed
void SetCurrentSeed(uint64_t seed);
// get cpu engine
std::mt19937_64& GetCPUEngine();
// set cpu engine
void SetCPUEngine(std::mt19937_64 engine);
uint64_t Random64();
bool is_init_py = false;
// CPU Generator singleton
static std::shared_ptr<Generator> GetInstance() {
if (NULL == gen_instance_) {
gen_instance_.reset(new paddle::framework::Generator());
}
return gen_instance_;
}
static std::shared_ptr<Generator> GetInstanceX() {
if (NULL == gen_instance_) {
gen_instance_.reset(new paddle::framework::Generator());
}
gen_instance_->is_init_py = true;
return gen_instance_;
}
private:
static std::shared_ptr<Generator> gen_instance_;
std::shared_ptr<GeneratorState> state_;
mutable std::mutex mutex;
Generator(const Generator& other, const std::lock_guard<std::mutex>&)
: state_(std::make_shared<GeneratorState>(*(other.state_))) {}
};
} // namespace framework
} // namespace paddle
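As an aside, here is a minimal usage sketch of the CPU Generator singleton declared in the header above. The standalone main() wrapper is illustrative only and is not part of this commit; it assumes the generator.h shown here is available at its normal include path and simply exercises GetInstance(), SetCurrentSeed(), GetCurrentSeed(), and Random64() as declared.

// Hypothetical standalone example (not in this commit): exercises the
// Generator API declared in paddle/fluid/framework/generator.h.
#include <cstdint>
#include <iostream>
#include "paddle/fluid/framework/generator.h"

int main() {
  // Grab the process-wide CPU generator singleton.
  auto gen = paddle::framework::Generator::GetInstance();

  // Reseed the underlying std::mt19937_64 engine and read the seed back.
  gen->SetCurrentSeed(42);
  std::cout << "seed = " << gen->GetCurrentSeed() << std::endl;

  // Draw two 64-bit random numbers; the sequence is deterministic for a
  // fixed seed because it comes from the seeded CPU engine.
  std::uint64_t a = gen->Random64();
  std::uint64_t b = gen->Random64();
  std::cout << a << " " << b << std::endl;
  return 0;
}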
@@ -368,3 +368,7 @@ REGISTER_PASS(conv_transpose_bn_fuse_pass,
               paddle::framework::ir::ConvTransposeBNFusePass);
 REGISTER_PASS(conv_transpose_eltwiseadd_bn_fuse_pass,
               paddle::framework::ir::ConvTransposeEltwiseAddBNFusePass);
+REGISTER_PASS(depthwise_conv_bn_fuse_pass,
+              paddle::framework::ir::DepthwiseConvBNFusePass);
+REGISTER_PASS(depthwise_conv_eltwiseadd_bn_fuse_pass,
+              paddle::framework::ir::DepthwiseConvEltwiseAddBNFusePass);
@@ -56,6 +56,16 @@ class ConvTransposeEltwiseAddBNFusePass : public ConvEltwiseAddBNFusePass {
   std::string conv_type() const { return "conv2d_transpose"; }
 };
+class DepthwiseConvBNFusePass : public ConvBNFusePass {
+ public:
+  std::string conv_type() const { return "depthwise_conv2d"; }
+};
+
+class DepthwiseConvEltwiseAddBNFusePass : public ConvEltwiseAddBNFusePass {
+ public:
+  std::string conv_type() const { return "depthwise_conv2d"; }
+};
+
 }  // namespace ir
 }  // namespace framework
 }  // namespace paddle
@@ -309,7 +309,8 @@ std::vector<std::vector<Node *>> SubgraphDetector::ExtractSubGraphs() {
     BriefNode *brief_node = itr.second;
     if (!Agent(brief_node->node).marked()) {
-      VLOG(4) << brief_node->node->id() << " node not a trt candidate.";
+      VLOG(4) << brief_node->node->id() << " node named "
+              << brief_node->node->Name() << " is not a trt candidate.";
       continue;
     }
...
@@ -59,6 +59,8 @@ inline LibraryType StringToLibraryType(const char* ctype) {
     // CPU, CUDA, PLAIN are same library type.
   } else if (s == std::string("CPU")) {
     return LibraryType::kPlain;
+  } else if (s == std::string("XPU")) {
+    return LibraryType::kPlain;
   } else if (s == std::string("CUDA")) {
     return LibraryType::kPlain;
   } else {
...
@@ -78,21 +78,37 @@ class CompileTimeInferShapeContext : public InferShapeContext {
   void ShareDim(const std::string &in, const std::string &out, size_t i = 0,
                 size_t j = 0) override {
-    PADDLE_ENFORCE_LT(i, Inputs(in).size());
-    PADDLE_ENFORCE_LT(j, Outputs(out).size());
+    PADDLE_ENFORCE_LT(i, Inputs(in).size(),
+                      platform::errors::InvalidArgument(
+                          "The input variable index is out of range, expected "
+                          "index less than %d, but received index is %d.",
+                          Inputs(in).size(), i));
+    PADDLE_ENFORCE_LT(j, Outputs(out).size(),
+                      platform::errors::InvalidArgument(
+                          "The output variable index is out of range, expected "
+                          "index less than %d, but received index is %d.",
+                          Outputs(out).size(), j));
     std::string input_n = Inputs(in)[i];
     std::string output_n = Outputs(out)[j];
-    PADDLE_ENFORCE(input_n != framework::kEmptyVarName, "The %s[%d] is @EMPTY@",
-                   in, i);
-    PADDLE_ENFORCE(output_n != framework::kEmptyVarName,
-                   "The %s[%d] is @EMPTY@", out, j);
+    PADDLE_ENFORCE_NE(input_n, framework::kEmptyVarName,
+                      platform::errors::InvalidArgument(
+                          "The input variable %s[%d] is empty.", in, i));
+    PADDLE_ENFORCE_NE(output_n, framework::kEmptyVarName,
+                      platform::errors::InvalidArgument(
+                          "The output variable %s[%d] is empty.", out, j));
     auto *in_var = block_.FindVarRecursive(input_n);
     auto *out_var = block_.FindVarRecursive(output_n);
-    PADDLE_ENFORCE(in_var->GetType() == out_var->GetType(),
-                   "The type of %s and %s is not the same.", input_n, output_n);
+    PADDLE_ENFORCE_EQ(
+        in_var->GetType(), out_var->GetType(),
+        platform::errors::InvalidArgument(
+            "The type of input %s and output %s do not match. The input type "
+            "is %s, output type is %s.",
+            input_n, output_n, DataTypeToString(in_var->GetType()),
+            DataTypeToString(out_var->GetType())));
     SetDim(output_n, GetDim(input_n));
   }
@@ -126,12 +142,22 @@ class CompileTimeInferShapeContext : public InferShapeContext {
   void ShareLoD(const std::string &in, const std::string &out, size_t i = 0,
                 size_t j = 0) const override {
-    PADDLE_ENFORCE_LT(i, Inputs(in).size());
-    PADDLE_ENFORCE_LT(j, Outputs(out).size());
-    PADDLE_ENFORCE(Inputs(in)[i] != framework::kEmptyVarName,
-                   "The %s[%d] is @EMPTY@", in, i);
-    PADDLE_ENFORCE(Outputs(out)[j] != framework::kEmptyVarName,
-                   "The %s[%d] is @EMPTY@", out, j);
+    PADDLE_ENFORCE_LT(i, Inputs(in).size(),
+                      platform::errors::InvalidArgument(
+                          "The input variable index is out of range, expected "
+                          "index less than %d, but received index is %d.",
+                          Inputs(in).size(), i));
+    PADDLE_ENFORCE_LT(j, Outputs(out).size(),
+                      platform::errors::InvalidArgument(
+                          "The output variable index is out of range, expected "
+                          "index less than %d, but received index is %d.",
+                          Outputs(out).size(), j));
+    PADDLE_ENFORCE_NE(Inputs(in)[i], framework::kEmptyVarName,
+                      platform::errors::InvalidArgument(
+                          "The input variable %s[%d] is empty.", in, i));
+    PADDLE_ENFORCE_NE(Outputs(out)[j], framework::kEmptyVarName,
+                      platform::errors::InvalidArgument(
+                          "The output variable %s[%d] is empty.", out, j));
     auto *in_var = block_.FindVarRecursive(Inputs(in)[i]);
     auto *out_var = block_.FindVarRecursive(Outputs(out)[j]);
     if (in_var->GetType() != proto::VarType::LOD_TENSOR &&
@@ -144,30 +170,38 @@ class CompileTimeInferShapeContext : public InferShapeContext {
   int32_t GetLoDLevel(const std::string &in, size_t i = 0) const override {
     PADDLE_ENFORCE_LT(i, Inputs(in).size(),
-                      "Input %s of operator %s only has %d elements.", in,
-                      op_.Type(), Inputs(in).size());
+                      platform::errors::InvalidArgument(
+                          "The input variable index is out of range, input "
+                          "variable %s of operator %s only has %d elements.",
+                          in, op_.Type(), Inputs(in).size()));
     PADDLE_ENFORCE_NE(Inputs(in)[i], framework::kEmptyVarName,
-                      "Input %s[%d] of operator %s is @EMPTY@", in, op_.Type(),
-                      i);
+                      platform::errors::InvalidArgument(
+                          "The input variable %s[%d] of operator %s is empty.",
+                          in, i, op_.Type()));
     auto *in_var = block_.FindVarRecursive(Inputs(in)[i]);
     PADDLE_ENFORCE_NOT_NULL(
-        in_var, "Input %s[%d] of operator %s should not be nullptr.", in,
-        op_.Type(), i);
+        in_var, platform::errors::NotFound(
+                    "The input variable %s[%d] of operator %s is not found.",
+                    in, i, op_.Type()));
     return in_var->GetLoDLevel();
   }

   void SetLoDLevel(const std::string &out, int32_t lod_level,
                    size_t j = 0) const override {
     PADDLE_ENFORCE_LT(j, Outputs(out).size(),
-                      "Output %s of operator %s only has %d elements.", out,
-                      op_.Type(), Outputs(out).size());
+                      platform::errors::InvalidArgument(
+                          "The output variable index is out of range, output "
+                          "variable %s of operator %s only has %d elements.",
+                          out, op_.Type(), Outputs(out).size()));
     PADDLE_ENFORCE_NE(Outputs(out)[j], framework::kEmptyVarName,
-                      "Output %s[%d] of operator %s is @EMPTY@", out,
-                      op_.Type(), j);
+                      platform::errors::InvalidArgument(
+                          "The output variable %s[%d] of operator %s is empty.",
+                          out, j, op_.Type()));
     auto *out_var = block_.FindVarRecursive(Outputs(out)[j]);
     PADDLE_ENFORCE_NOT_NULL(
-        out_var, "Output %s[%d] of operator %s should not be nullptr.", out,
-        op_.Type(), j);
+        out_var, platform::errors::NotFound(
+                     "The output variable %s[%d] of operator %s is not found.",
+                     out, j, op_.Type()));
     if (lod_level >= 0) {
       out_var->SetLoDLevel(lod_level);
     }
@@ -200,8 +234,10 @@ class CompileTimeInferShapeContext : public InferShapeContext {
   DDim GetInputDim(const std::string &name) const override {
     const std::vector<std::string> &arg_names = Inputs(name);
     PADDLE_ENFORCE_EQ(arg_names.size(), 1UL,
-                      "Input(%s) should hold one element, but now it holds %d",
-                      name, arg_names.size());
+                      platform::errors::InvalidArgument(
+                          "The input(%s) should hold only one element, but now "
+                          "it holds %d elements.",
+                          name, arg_names.size()));
     return this->GetDim(arg_names[0]);
   }
@@ -225,8 +261,10 @@ class CompileTimeInferShapeContext : public InferShapeContext {
   void SetOutputDim(const std::string &name, const DDim &dim) override {
     auto arg_names = Outputs(name);
     PADDLE_ENFORCE_EQ(arg_names.size(), 1UL,
-                      "Output(%s) should hold one element, but now it holds %d",
-                      name, arg_names.size());
+                      platform::errors::InvalidArgument(
+                          "The iutput(%s) should hold only one element, but "
+                          "now it holds %d elements.",
+                          name, arg_names.size()));
     SetDim(arg_names[0], dim);
   }
@@ -252,7 +290,8 @@ class CompileTimeInferShapeContext : public InferShapeContext {
   DDim GetDim(const std::string &name) const {
     auto var = block_.FindVarRecursive(name);
-    PADDLE_ENFORCE(var != nullptr, "Cannot find variable %s", name);
+    PADDLE_ENFORCE_NOT_NULL(
+        var, platform::errors::NotFound("Variable %s is not found.", name));
     DDim res;
     try {
       auto shape = var->GetShape();
@@ -278,7 +317,11 @@ class CompileTimeInferShapeContext : public InferShapeContext {
   void SetDims(const std::vector<std::string> &names,
                const std::vector<DDim> &dims) {
     size_t length = names.size();
-    PADDLE_ENFORCE_EQ(length, dims.size());
+    PADDLE_ENFORCE_EQ(length, dims.size(),
+                      platform::errors::InvalidArgument(
+                          "The input variables number(%d) and input dimensions "
+                          "number(%d) do not match.",
+                          length, dims.size()));
     for (size_t i = 0; i < length; ++i) {
       if (names[i] == framework::kEmptyVarName) {
         continue;
@@ -364,8 +407,10 @@ proto::OpDesc *OpDesc::Proto() {
 const std::vector<std::string> &OpDesc::Input(const std::string &name) const {
   auto it = inputs_.find(name);
-  PADDLE_ENFORCE(it != inputs_.end(), "Input %s cannot be found in Op %s", name,
-                 Type());
+  PADDLE_ENFORCE_NE(
+      it, inputs_.end(),
+      platform::errors::NotFound("Input %s cannot be found in operator %s.",
+                                 name, Type()));
   return it->second;
 }
@@ -385,8 +430,10 @@ void OpDesc::SetInput(const std::string &param_name,
 const std::vector<std::string> &OpDesc::Output(const std::string &name) const {
   auto it = outputs_.find(name);
-  PADDLE_ENFORCE(it != outputs_.end(), "Output %s cannot be found in Op %s",
-                 name, Type());
+  PADDLE_ENFORCE_NE(
+      it, outputs_.end(),
+      platform::errors::NotFound("Output %s cannot be found in operator %s.",
+                                 name, Type()));
   return it->second;
 }
@@ -427,7 +474,8 @@ bool OpDesc::HasProtoAttr(const std::string &name) const {
 proto::AttrType OpDesc::GetAttrType(const std::string &name) const {
   auto it = attrs_.find(name);
-  PADDLE_ENFORCE(it != attrs_.end(), "Attribute %s is not found", name);
+  PADDLE_ENFORCE_NE(it, attrs_.end(), platform::errors::NotFound(
+                                          "Attribute %s is not found.", name));
   return static_cast<proto::AttrType>(it->second.which() - 1);
 }
@@ -492,7 +540,8 @@ void OpDesc::SetAttr(const std::string &name, const Attribute &v) {
       return;
     }
     default:
-      PADDLE_THROW("Wrong attr type %d", attr.type());
+      PADDLE_THROW(platform::errors::Unimplemented(
+          "Unsupported attribute type (code %d).", attr.type()));
   }
   need_update_ = true;
   return;
@@ -529,7 +578,8 @@ void OpDesc::SetAttrMap(
 Attribute OpDesc::GetAttr(const std::string &name) const {
   auto it = attrs_.find(name);
-  PADDLE_ENFORCE(it != attrs_.end(), "Attribute %s is not found", name);
+  PADDLE_ENFORCE_NE(it, attrs_.end(), platform::errors::NotFound(
+                                          "Attribute %s is not found.", name));
   return it->second;
 }
@@ -543,7 +593,8 @@ const proto::OpProto::Attr &OpDesc::GetProtoAttr(
     }
   }
-  PADDLE_THROW("Attribute %s is not found in proto %s", name, proto.type());
+  PADDLE_THROW(platform::errors::NotFound(
+      "Attribute %s is not found in proto %s.", name, proto.type()));
 }

 Attribute OpDesc::GetNullableAttr(const std::string &name) const {
@@ -557,7 +608,10 @@ Attribute OpDesc::GetNullableAttr(const std::string &name) const {
 std::vector<int> OpDesc::GetBlocksAttrIds(const std::string &name) const {
   auto it = attrs_.find(name);
-  PADDLE_ENFORCE(it != attrs_.end(), "Attribute %s is not found", name);
+  PADDLE_ENFORCE_NE(
+      it, attrs_.end(),
+      platform::errors::NotFound(
+          "Attribute `%s` is not found in operator `%s`.", name, desc_.type()));
   auto blocks = BOOST_GET_CONST(std::vector<BlockDesc *>, it->second);
   std::vector<int> ids;
@@ -570,7 +624,10 @@ std::vector<int> OpDesc::GetBlocksAttrIds(const std::string &name) const {
 int OpDesc::GetBlockAttrId(const std::string &name) const {
   auto it = attrs_.find(name);
-  PADDLE_ENFORCE(it != attrs_.end(), "Attribute %s is not found", name);
+  PADDLE_ENFORCE_NE(
+      it, attrs_.end(),
+      platform::errors::NotFound(
+          "Attribute `%s` is not found in operator `%s`.", name, desc_.type()));
   return BOOST_GET_CONST(BlockDesc *, it->second)->ID();
 }
@@ -657,7 +714,11 @@ struct SetAttrDescVisitor : public boost::static_visitor<void> {
     VectorToRepeated(v, attr_->mutable_longs());
   }
-  void operator()(boost::blank) const { PADDLE_THROW("Unexpected branch"); }
+  void operator()(boost::blank) const {
+    PADDLE_THROW(platform::errors::Unavailable(
+        "Unsupported calling method of SetAttrDescVisitor object for "
+        "`boosst::blank` type."));
+  }
 };

 void OpDesc::Flush() {
@@ -691,8 +752,9 @@ void OpDesc::Flush() {
 }

 void OpDesc::CheckAttrs() {
-  PADDLE_ENFORCE(!Type().empty(),
-                 "CheckAttr() can not be called before type is set.");
+  PADDLE_ENFORCE_EQ(Type().empty(), false,
+                    platform::errors::PreconditionNotMet(
+                        "CheckAttrs() can not be called before type is set."));
   auto *checker = OpInfoMap::Instance().Get(Type()).Checker();
   if (checker == nullptr) {
     // checker is not configured. That operator could be generated by Paddle,
@@ -707,8 +769,10 @@ void OpDesc::InferShape(const BlockDesc &block) const {
   try {
     VLOG(3) << "CompileTime infer shape on " << Type();
     auto &infer_shape = OpInfoMap::Instance().Get(this->Type()).infer_shape_;
-    PADDLE_ENFORCE(static_cast<bool>(infer_shape),
-                   "%s's infer_shape has not been registered", this->Type());
+    PADDLE_ENFORCE_EQ(
+        static_cast<bool>(infer_shape), true,
+        platform::errors::NotFound(
+            "Operator %s's infer_shape is not registered.", this->Type()));
     CompileTimeInferShapeContext ctx(*this, block);
     if (VLOG_IS_ON(10)) {
       std::ostringstream sout;
@@ -758,10 +822,10 @@ bool CompileTimeInferShapeContext::HasInput(const std::string &name) const {
   if (length == 0) {
     return false;
   }
-  PADDLE_ENFORCE_EQ(length, 1UL,
-                    "Input(%s) should have only one value, "
-                    "but it have %d now",
-                    name, length);
+  PADDLE_ENFORCE_EQ(length, 1UL, platform::errors::InvalidArgument(
+                                     "Input(%s) should have only one value, "
+                                     "but it has %d values now.",
+                                     name, length));
   return block_.HasVarRecursive(input_names[0]);
 }
@@ -774,10 +838,10 @@ bool CompileTimeInferShapeContext::HasOutput(const std::string &name) const {
   if (length == 0) {
     return false;
   }
-  PADDLE_ENFORCE_EQ(length, 1UL,
-                    "Output(%s) should have only one value, "
-                    "but it have %d now",
-                    name, length);
+  PADDLE_ENFORCE_EQ(length, 1UL, platform::errors::InvalidArgument(
+                                     "Output(%s) should have only one value, "
+                                     "but it has %d values now.",
+                                     name, length));
   return block_.HasVarRecursive(output_names[0]);
 }
@@ -826,7 +890,8 @@ std::vector<std::string> CompileTimeInferShapeContext::Outputs(
 std::vector<DDim> CompileTimeInferShapeContext::GetRepeatedDims(
     const std::string &name) const {
   auto var = block_.FindVarRecursive(name);
-  PADDLE_ENFORCE(var != nullptr, "Cannot find variable %s", name);
+  PADDLE_ENFORCE_NOT_NULL(
+      var, platform::errors::NotFound("Variable %s is not found.", name));
   std::vector<DDim> res;
   try {
     auto shapes = var->GetShapes();
@@ -848,7 +913,8 @@ void CompileTimeInferShapeContext::SetDim(const std::string &name,
 void CompileTimeInferShapeContext::SetRepeatedDims(
     const std::string &name, const std::vector<DDim> &dims) {
   auto var = block_.FindVarRecursive(name);
-  PADDLE_ENFORCE(var != nullptr, "Cannot find variable %s", name);
+  PADDLE_ENFORCE_NOT_NULL(
+      var, platform::errors::NotFound("Variable %s is not found.", name));
   std::vector<std::vector<int64_t>> dim_vec(dims.size());
   std::transform(dims.begin(), dims.end(), dim_vec.begin(), vectorize<>);
   var->SetShapes(dim_vec);
...
@@ -268,6 +268,9 @@ struct OpKernelRegistrarFunctorEx<PlaceType, false, I,
 #define REGISTER_OP_CPU_KERNEL(op_type, ...) \
   REGISTER_OP_KERNEL(op_type, CPU, ::paddle::platform::CPUPlace, __VA_ARGS__)
+#define REGISTER_OP_XPU_KERNEL(op_type, ...) \
+  REGISTER_OP_KERNEL(op_type, XPU, ::paddle::platform::XPUPlace, __VA_ARGS__)
 #define REGISTER_OP_KERNEL_EX(op_type, library_type, place_class, \
                               customized_name, \
                               customized_type_value, \
@@ -298,6 +301,12 @@ struct OpKernelRegistrarFunctorEx<PlaceType, false, I,
       ::paddle::framework::OpKernelType::kDefaultCustomizedTypeValue, \
       __VA_ARGS__)
+#define REGISTER_OP_XPU_KERNEL_FUNCTOR(op_type, ...) \
+  REGISTER_OP_KERNEL_EX( \
+      op_type, XPU, ::paddle::platform::XPUPlace, DEFAULT_TYPE, \
+      ::paddle::framework::OpKernelType::kDefaultCustomizedTypeValue, \
+      __VA_ARGS__)
 /**
  * Macro to mark what Operator and Kernel
  * we will use and tell the compiler to
...
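To make the intent of the two new macros concrete, a hedged sketch of how a kernel might be registered with REGISTER_OP_XPU_KERNEL follows. The operator name my_scale and the kernel class MyScaleXPUKernel are placeholders invented for illustration and do not appear in this commit; only the macro itself comes from the diff above.

// Hypothetical illustration only: "my_scale" and MyScaleXPUKernel are
// placeholder names; only REGISTER_OP_XPU_KERNEL comes from this commit.
#include "paddle/fluid/framework/op_registry.h"

namespace paddle {
namespace operators {

template <typename T>
class MyScaleXPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    // A real kernel would read its inputs from ctx and launch XPU work here.
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
// Expands to REGISTER_OP_KERNEL(my_scale, XPU, ::paddle::platform::XPUPlace, ...).
REGISTER_OP_XPU_KERNEL(my_scale, ops::MyScaleXPUKernel<float>);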
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_version_registry.h"
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <memory>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>
#include <boost/any.hpp>
#include "paddle/fluid/framework/framework.pb.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace framework {
namespace compatible {
struct OpUpdateRecord {
enum class Type { kInvalid = 0, kModifyAttr, kNewAttr };
Type type_;
std::string remark_;
};
struct ModifyAttr : OpUpdateRecord {
ModifyAttr(const std::string& name, const std::string& remark,
boost::any default_value)
: OpUpdateRecord({Type::kModifyAttr, remark}),
name_(name),
default_value_(default_value) {
// TODO(Shixiaowei02): Check the data type with proto::OpDesc.
}
private:
std::string name_;
boost::any default_value_;
};
struct NewAttr : OpUpdateRecord {
NewAttr(const std::string& name, const std::string& remark,
boost::any default_value)
: OpUpdateRecord({Type::kNewAttr, remark}),
name_(name),
default_value_(default_value) {}
private:
std::string name_;
boost::any default_value_;
};
class OpVersionDesc {
public:
OpVersionDesc& ModifyAttr(const std::string& name, const std::string& remark,
boost::any default_value) {
infos_.push_back(std::shared_ptr<OpUpdateRecord>(
new compatible::ModifyAttr(name, remark, default_value)));
return *this;
}
OpVersionDesc& NewAttr(const std::string& name, const std::string& remark,
boost::any default_value) {
infos_.push_back(std::shared_ptr<OpUpdateRecord>(
new compatible::NewAttr(name, remark, default_value)));
return *this;
}
private:
std::vector<std::shared_ptr<OpUpdateRecord>> infos_;
};
class OpVersion {
public:
OpVersion& AddCheckpoint(const std::string& note,
const OpVersionDesc& op_version_desc) {
checkpoints_.push_back(Checkpoint({note, op_version_desc}));
return *this;
}
private:
struct Checkpoint {
std::string note_;
OpVersionDesc op_version_desc_;
};
std::vector<Checkpoint> checkpoints_;
};
class OpVersionRegistrar {
public:
static OpVersionRegistrar& GetInstance() {
static OpVersionRegistrar instance;
return instance;
}
OpVersion& Register(const std::string& op_type) {
if (op_version_map_.find(op_type) != op_version_map_.end()) {
PADDLE_THROW("'%s' is registered in operator version more than once.",
op_type);
}
op_version_map_.insert({op_type, OpVersion()});
return op_version_map_[op_type];
}
private:
std::unordered_map<std::string, OpVersion> op_version_map_;
OpVersionRegistrar() = default;
OpVersionRegistrar& operator=(const OpVersionRegistrar&) = delete;
};
} // namespace compatible
} // namespace framework
} // namespace paddle
#define REGISTER_OP_VERSION(op_type) \
static paddle::framework::compatible::OpVersion \
RegisterOpVersion__##op_type = \
paddle::framework::compatible::OpVersionRegistrar::GetInstance() \
.Register(#op_type)
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <glog/logging.h>
#include <gtest/gtest.h>
#include "paddle/fluid/framework/op_version_registry.h"
namespace paddle {
namespace framework {
namespace compatible {
TEST(test_operator_version, test_operator_version) {
REGISTER_OP_VERSION(test__)
.AddCheckpoint(
R"ROC(
Upgrade reshape, modified one attribute [axis] and add a new attribute [size].
)ROC",
framework::compatible::OpVersionDesc()
.ModifyAttr("axis",
"Increased from the original one method to two.", -1)
.NewAttr("size",
"In order to represent a two-dimensional rectangle, the "
"parameter size is added.",
0))
.AddCheckpoint(
R"ROC(
Add a new attribute [height]
)ROC",
framework::compatible::OpVersionDesc().NewAttr(
"height",
"In order to represent a two-dimensional rectangle, the "
"parameter height is added.",
0));
}
} // namespace compatible
} // namespace framework
} // namespace paddle
...@@ -34,6 +34,9 @@ limitations under the License. */ ...@@ -34,6 +34,9 @@ limitations under the License. */
#include "paddle/fluid/framework/unused_var_check.h" #include "paddle/fluid/framework/unused_var_check.h"
#include "paddle/fluid/framework/var_type.h" #include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
#ifdef PADDLE_WITH_XPU
#include "paddle/fluid/platform/xpu_info.h"
#endif
#ifdef PADDLE_WITH_MKLDNN #ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h" #include "paddle/fluid/platform/mkldnn_helper.h"
...@@ -165,6 +168,14 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) { ...@@ -165,6 +168,14 @@ void OperatorBase::Run(const Scope& scope, const platform::Place& place) {
#else #else
auto dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device; auto dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device;
platform::SetDeviceId(dev_id); platform::SetDeviceId(dev_id);
#endif
} else if (platform::is_xpu_place(place)) {
#ifndef PADDLE_WITH_XPU
PADDLE_THROW(platform::errors::Unimplemented(
"Cannot run operator on place %s", place));
#else
auto dev_id = BOOST_GET_CONST(platform::XPUPlace, place).device;
platform::SetXPUDeviceId(dev_id);
#endif #endif
} }
...@@ -1109,6 +1120,16 @@ void OperatorWithKernel::ChooseKernel(const RuntimeContext& ctx, ...@@ -1109,6 +1120,16 @@ void OperatorWithKernel::ChooseKernel(const RuntimeContext& ctx,
expected_kernel_key.data_layout_ = DataLayout::kAnyLayout; expected_kernel_key.data_layout_ = DataLayout::kAnyLayout;
kernel_iter = kernels.find(expected_kernel_key); kernel_iter = kernels.find(expected_kernel_key);
} }
#endif
#ifdef PADDLE_WITH_XPU
if (kernel_iter == kernels.end() &&
is_xpu_place(expected_kernel_key.place_)) {
VLOG(3) << "missing XPU kernel: " << type_
<< ", expected_kernel_key:" << expected_kernel_key
<< ", fallbacking to CPU one!";
expected_kernel_key.place_ = platform::CPUPlace();
kernel_iter = kernels.find(expected_kernel_key);
}
#endif #endif
if (kernel_iter == kernels.end()) { if (kernel_iter == kernels.end()) {
PADDLE_THROW("op %s does not have kernel for %s", type_, PADDLE_THROW("op %s does not have kernel for %s", type_,
......
...@@ -449,6 +449,9 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places, ...@@ -449,6 +449,9 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
const BuildStrategy &build_strategy, const BuildStrategy &build_strategy,
ir::Graph *graph) ir::Graph *graph)
: member_(new ParallelExecutorPrivate(places, scope)) { : member_(new ParallelExecutorPrivate(places, scope)) {
PADDLE_ENFORCE(places.size() > 0 && !is_xpu_place(places[0]),
platform::errors::Unavailable(
"XPU is not supported in ParallelExecutor"));
ir::InitReaderQueueDeviceCount(graph, *(member_->global_scope_), ir::InitReaderQueueDeviceCount(graph, *(member_->global_scope_),
member_->places_.size()); member_->places_.size());
member_->use_cuda_ = exec_strategy.use_cuda_; member_->use_cuda_ = exec_strategy.use_cuda_;
......
...@@ -210,6 +210,23 @@ void prune_impl(const proto::ProgramDesc& input, proto::ProgramDesc* output, ...@@ -210,6 +210,23 @@ void prune_impl(const proto::ProgramDesc& input, proto::ProgramDesc* output,
should_run.push_back(true); should_run.push_back(true);
} else { } else {
should_run.push_back(false); should_run.push_back(false);
// If an op's output modifies feed vars, the op should not be pruned.
// For example, in the transformer structure, the third output returned
// by the beam_search op is generally assigned to a feed var. Pruning that
// assign op would cause an error.
if (parent_block_id != -1) {
bool flag = false;
for (auto& var : op_desc.outputs()) {
for (auto& argu : var.arguments()) {
if (feed_var_names.count(argu)) {
flag = true;
}
}
}
if (flag) {
should_run.back() = true;
}
}
} }
} }
......
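The block added above keeps an op alive whenever one of its outputs is also a feed variable. As a reading aid only, the same check factored into a standalone helper might look like the sketch below; the name `OutputTouchesFeedVar` is hypothetical, not part of this patch, and the snippet assumes the surrounding prune.cc context (namespace `paddle::framework`, `proto::OpDesc`).

```
// Hypothetical helper equivalent to the flag computation in the hunk above:
// returns true if any output argument of the op is also a feed variable.
bool OutputTouchesFeedVar(const proto::OpDesc& op_desc,
                          const std::set<std::string>& feed_var_names) {
  for (auto& var : op_desc.outputs()) {
    for (auto& argu : var.arguments()) {
      if (feed_var_names.count(argu)) return true;
    }
  }
  return false;
}
```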
...@@ -185,3 +185,34 @@ TEST(Prune, recurrrent_op) { ...@@ -185,3 +185,34 @@ TEST(Prune, recurrrent_op) {
EXPECT_EQ(pruned.blocks(0).ops_size(), 2); EXPECT_EQ(pruned.blocks(0).ops_size(), 2);
EXPECT_EQ(pruned.blocks(1).ops_size(), 1); EXPECT_EQ(pruned.blocks(1).ops_size(), 1);
} }
// If an op's output modifies feed vars, the op should not be pruned.
TEST(Prune, recurrrent_op_2) {
f::ProgramDesc program;
f::BlockDesc *block = program.MutableBlock(0);
f::BlockDesc *sub_block = program.AppendBlock(*block);
AddOp("one_two", {{"input", {"a"}}}, {{"output", {"b", "c"}}},
f::AttributeMap{}, block);
std::vector<std::string> state_var_name(1, "y");
AddOp("recurrent", {{"input", {"b", "c"}}}, {{"output", {"b1, c1"}}},
{{"ex_states", state_var_name},
{"states", state_var_name},
{"sub_block", sub_block}},
block);
EXPECT_TRUE(sub_block != nullptr);
AddOp("rnn_memory_helper", {{"input", {"x"}}}, {{"output", {"a"}}},
f::AttributeMap{}, sub_block);
f::proto::ProgramDesc *pdesc = program.Proto();
pdesc->mutable_blocks(0)->mutable_ops(1)->set_is_target(true);
f::proto::ProgramDesc pruned;
std::set<std::string> feed_var_names = {"x", "a"};
f::Prune(*pdesc, feed_var_names, &pruned);
EXPECT_EQ(pruned.blocks_size(), 2);
EXPECT_EQ(pruned.blocks(0).ops_size(), 2);
EXPECT_EQ(pruned.blocks(1).ops_size(), 1);
}
...@@ -76,6 +76,13 @@ void TensorFromStream(std::istream& is, Tensor* tensor, ...@@ -76,6 +76,13 @@ void TensorFromStream(std::istream& is, Tensor* tensor,
const platform::DeviceContext& dev_ctx, const platform::DeviceContext& dev_ctx,
const size_t& seek, const std::vector<int64_t>& shape); const size_t& seek, const std::vector<int64_t>& shape);
// store the bool result tensor in out tensor
void TensorContainsNANV2(const framework::Tensor& tensor,
framework::Tensor* out);
void TensorContainsInfV2(const framework::Tensor& tensor,
framework::Tensor* out);
void TensorIsfiniteV2(const framework::Tensor& tensor, framework::Tensor* out);
// convert dlpack's DLTensor to tensor // convert dlpack's DLTensor to tensor
void TensorFromDLPack(const ::DLTensor& dl_tensor, framework::Tensor* dst); void TensorFromDLPack(const ::DLTensor& dl_tensor, framework::Tensor* dst);
......
...@@ -76,6 +76,13 @@ class TensorAddFunctor : public boost::static_visitor<> { ...@@ -76,6 +76,13 @@ class TensorAddFunctor : public boost::static_visitor<> {
blas.AXPY(numel_, 1., x_, y_); blas.AXPY(numel_, 1., x_, y_);
} }
void operator()(const platform::XPUPlace& place) {
PADDLE_THROW(platform::errors::PermissionDenied(
"Gradient accumulation on place (%s) "
"is not supported in imperative mode",
place));
}
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
void operator()(const platform::CUDAPlace& place) { void operator()(const platform::CUDAPlace& place) {
platform::CUDADeviceContext* ctx = platform::CUDADeviceContext* ctx =
......
...@@ -42,11 +42,6 @@ static void PrepareData(const platform::Place& place, ...@@ -42,11 +42,6 @@ static void PrepareData(const platform::Place& place,
for (const auto& var_base : name_pair.second) { for (const auto& var_base : name_pair.second) {
const auto* tensor = GetTensorFromVar(var_base->Var()); const auto* tensor = GetTensorFromVar(var_base->Var());
if (tensor && tensor->IsInitialized()) { if (tensor && tensor->IsInitialized()) {
auto tmp_place = tensor->place();
// TODO(jiabin): Support transform data layout when we Verify it on more
// tests
if (!(tmp_place == place)) {
auto kernel_type_for_var = op.GetKernelTypeForVar( auto kernel_type_for_var = op.GetKernelTypeForVar(
name_pair.first, *tensor, expected_kernel_key); name_pair.first, *tensor, expected_kernel_key);
if (!NeedTransform(kernel_type_for_var, expected_kernel_key)) { if (!NeedTransform(kernel_type_for_var, expected_kernel_key)) {
...@@ -62,7 +57,6 @@ static void PrepareData(const platform::Place& place, ...@@ -62,7 +57,6 @@ static void PrepareData(const platform::Place& place,
} }
} }
} }
}
} }
PreparedOp::PreparedOp(const framework::OperatorBase& op, PreparedOp::PreparedOp(const framework::OperatorBase& op,
...@@ -93,12 +87,26 @@ PreparedOp PrepareOpImpl(const NameVarMap<VarType>& ins, ...@@ -93,12 +87,26 @@ PreparedOp PrepareOpImpl(const NameVarMap<VarType>& ins,
auto& kernels = kernels_iter->second; auto& kernels = kernels_iter->second;
framework::RuntimeContext ctx({}, {}); framework::RuntimeContext ctx({}, {});
#ifdef PADDLE_WITH_MKLDNN
// The MKLDNN variant of the code reads attributes in some of the
// GetKernelTypeForVar and GetKernelType functions, so we need to copy the
// attributes there. The const qualifier of Attrs() has to be discarded to
// overwrite it.
auto& mutable_op_attrs = const_cast<framework::AttributeMap&>(op.Attrs());
mutable_op_attrs = attrs;
#endif
auto expected_kernel_key = auto expected_kernel_key =
op.GetExpectedKernelType(DygraphExecutionContext<VarType>( op.GetExpectedKernelType(DygraphExecutionContext<VarType>(
op, framework::Scope(), *dev_ctx, ctx, ins, outs, attrs)); op, framework::Scope(), *dev_ctx, ctx, ins, outs, attrs));
VLOG(3) << "expected_kernel_key:" << expected_kernel_key; VLOG(3) << "expected_kernel_key:" << expected_kernel_key;
auto kernel_iter = kernels.find(expected_kernel_key); auto kernel_iter = kernels.find(expected_kernel_key);
#ifdef PADDLE_WITH_XPU
if (kernel_iter == kernels.end() &&
is_xpu_place(expected_kernel_key.place_)) {
expected_kernel_key.place_ = platform::CPUPlace();
kernel_iter = kernels.find(expected_kernel_key);
}
#endif
// TODO(jiabin): Add operator.cc's line 1000 part back when we need that case // TODO(jiabin): Add operator.cc's line 1000 part back when we need that case
PADDLE_ENFORCE_NE(kernel_iter, kernels.end(), PADDLE_ENFORCE_NE(kernel_iter, kernels.end(),
platform::errors::NotFound( platform::errors::NotFound(
......
...@@ -176,7 +176,7 @@ TEST(test_prepare_op, test_prepare_data) { ...@@ -176,7 +176,7 @@ TEST(test_prepare_op, test_prepare_data) {
} }
#endif #endif
TEST(test_prepare_op, test_prepare_data_same_place) { void TestPrepareDataSamePlace(framework::AttributeMap attr_map) {
std::shared_ptr<imperative::VarBase> vin( std::shared_ptr<imperative::VarBase> vin(
new imperative::VarBase(false, "vin")); new imperative::VarBase(false, "vin"));
std::shared_ptr<imperative::VarBase> vout( std::shared_ptr<imperative::VarBase> vout(
...@@ -198,7 +198,6 @@ TEST(test_prepare_op, test_prepare_data_same_place) { ...@@ -198,7 +198,6 @@ TEST(test_prepare_op, test_prepare_data_same_place) {
var_pair out_pair = var_pair("Out", vb_vector(1, vout)); var_pair out_pair = var_pair("Out", vb_vector(1, vout));
imperative::NameVarBaseMap ins = {x_pair}; imperative::NameVarBaseMap ins = {x_pair};
imperative::NameVarBaseMap outs = {out_pair}; imperative::NameVarBaseMap outs = {out_pair};
framework::AttributeMap attr_map;
const std::string op_type = "relu"; const std::string op_type = "relu";
const auto& info = framework::OpInfoMap::Instance().Get(op_type); const auto& info = framework::OpInfoMap::Instance().Get(op_type);
if (info.Checker()) info.Checker()->Check(&attr_map); if (info.Checker()) info.Checker()->Check(&attr_map);
...@@ -222,8 +221,21 @@ TEST(test_prepare_op, test_prepare_data_same_place) { ...@@ -222,8 +221,21 @@ TEST(test_prepare_op, test_prepare_data_same_place) {
} }
} }
} }
TEST(test_prepare_op, test_prepare_data_same_place) {
TestPrepareDataSamePlace({});
}
#ifdef PADDLE_WITH_MKLDNN
TEST(test_prepare_op, test_prepare_data_cpu_mkldnn) {
TestPrepareDataSamePlace({{"use_mkldnn", true}});
}
#endif
} // namespace imperative } // namespace imperative
} // namespace paddle } // namespace paddle
USE_OP(split); USE_OP(split);
USE_OP(relu); USE_OP(relu);
#ifdef PADDLE_WITH_MKLDNN
USE_OP_DEVICE_KERNEL(relu, MKLDNN);
#endif
...@@ -83,7 +83,12 @@ nvinfer1::Dims Vec2TRT_Dims(const std::vector<T>& shape, std::string input, ...@@ -83,7 +83,12 @@ nvinfer1::Dims Vec2TRT_Dims(const std::vector<T>& shape, std::string input,
} else if (shape.size() == 3UL) { } else if (shape.size() == 3UL) {
return nvinfer1::Dims3(shape[0], shape[1], shape[2]); return nvinfer1::Dims3(shape[0], shape[1], shape[2]);
} }
return nvinfer1::Dims4(shape[0], shape[1], 1, 1); nvinfer1::Dims dims;
dims.nbDims = shape.size();
for (size_t i = 0; i < shape.size(); i++) {
dims.d[i] = shape[i];
}
return dims;
} }
} }
} // NOLINT } // NOLINT
......
...@@ -24,6 +24,8 @@ struct SimpleOpTypeSetTeller : public Teller { ...@@ -24,6 +24,8 @@ struct SimpleOpTypeSetTeller : public Teller {
#if IS_TRT_VERSION_GE(5130) #if IS_TRT_VERSION_GE(5130)
teller_set.insert("relu6"); teller_set.insert("relu6");
teller_set.insert("hard_sigmoid"); teller_set.insert("hard_sigmoid");
int8_teller_set.insert("relu6");
int8_teller_set.insert("hard_sigmoid");
#endif #endif
#if IS_TRT_VERSION_GE(6000) #if IS_TRT_VERSION_GE(6000)
teller_set.insert("fused_embedding_eltwise_layernorm"); teller_set.insert("fused_embedding_eltwise_layernorm");
...@@ -53,11 +55,11 @@ struct SimpleOpTypeSetTeller : public Teller { ...@@ -53,11 +55,11 @@ struct SimpleOpTypeSetTeller : public Teller {
"elementwise_add", "elementwise_add",
"leaky_relu", "leaky_relu",
"fc", "fc",
"relu6",
"concat", "concat",
"scale", "scale",
"elementwise_mul", "elementwise_mul",
"conv2d_transpose"}; "conv2d_transpose",
"hard_swish"};
std::unordered_set<std::string> teller_set{ std::unordered_set<std::string> teller_set{
"mul", "mul",
"conv2d", "conv2d",
......
...@@ -76,6 +76,16 @@ nvinfer1::DimsExprs EmbEltwiseLayernormPluginDynamic<T>::getOutputDimensions( ...@@ -76,6 +76,16 @@ nvinfer1::DimsExprs EmbEltwiseLayernormPluginDynamic<T>::getOutputDimensions(
return ret; return ret;
} }
template <typename T>
void EmbEltwiseLayernormPluginDynamic<T>::terminate() {
for (auto ptr : embs_gpu_) {
if (ptr) cudaFree(ptr);
}
if (bias_gpu_) cudaFree(bias_gpu_);
if (scale_gpu_) cudaFree(scale_gpu_);
}
template <typename T> template <typename T>
bool EmbEltwiseLayernormPluginDynamic<T>::supportsFormatCombination( bool EmbEltwiseLayernormPluginDynamic<T>::supportsFormatCombination(
int pos, const nvinfer1::PluginTensorDesc *in_out, int nb_inputs, int pos, const nvinfer1::PluginTensorDesc *in_out, int nb_inputs,
...@@ -153,7 +163,7 @@ int EmbEltwiseLayernormPluginDynamic<T>::enqueue( ...@@ -153,7 +163,7 @@ int EmbEltwiseLayernormPluginDynamic<T>::enqueue(
int64_t *emb_ptr_gpu_d = int64_t *emb_ptr_gpu_d =
emb_ptr_tensor.mutable_data<int64_t>(platform::CUDAPlace(device_id)); emb_ptr_tensor.mutable_data<int64_t>(platform::CUDAPlace(device_id));
std::vector<int64_t> in_ptr, emb_ptr; std::vector<uintptr_t> in_ptr, emb_ptr;
for (int i = 0; i < input_num; i++) { for (int i = 0; i < input_num; i++) {
in_ptr.push_back(reinterpret_cast<uintptr_t>(inputs[i])); in_ptr.push_back(reinterpret_cast<uintptr_t>(inputs[i]));
emb_ptr.push_back(reinterpret_cast<uintptr_t>(embs_gpu_[i])); emb_ptr.push_back(reinterpret_cast<uintptr_t>(embs_gpu_[i]));
......
...@@ -81,9 +81,13 @@ class EmbEltwiseLayernormPluginDynamic : public DynamicPluginTensorRT { ...@@ -81,9 +81,13 @@ class EmbEltwiseLayernormPluginDynamic : public DynamicPluginTensorRT {
} }
nvinfer1::IPluginV2DynamicExt* clone() const override { nvinfer1::IPluginV2DynamicExt* clone() const override {
return new EmbEltwiseLayernormPluginDynamic( auto ptr = new EmbEltwiseLayernormPluginDynamic(
embs_, bias_, scale_, emb_sizes_, bias_size_, scale_size_, hidden_size_, embs_, bias_, scale_, emb_sizes_, bias_size_, scale_size_, hidden_size_,
eps_); eps_);
ptr->embs_gpu_ = embs_gpu_;
ptr->bias_gpu_ = bias_gpu_;
ptr->scale_gpu_ = scale_gpu_;
return ptr;
} }
const char* getPluginType() const override { const char* getPluginType() const override {
...@@ -111,6 +115,7 @@ class EmbEltwiseLayernormPluginDynamic : public DynamicPluginTensorRT { ...@@ -111,6 +115,7 @@ class EmbEltwiseLayernormPluginDynamic : public DynamicPluginTensorRT {
return sum_num; return sum_num;
} }
void terminate() override;
void serialize(void* buffer) const override { void serialize(void* buffer) const override {
// SerializeValue(&buffer, with_fp16_); // SerializeValue(&buffer, with_fp16_);
SerializeValue(&buffer, emb_sizes_); SerializeValue(&buffer, emb_sizes_);
......
...@@ -80,6 +80,12 @@ int PReluPlugin::enqueue(int batch_size, const void *const *inputs, ...@@ -80,6 +80,12 @@ int PReluPlugin::enqueue(int batch_size, const void *const *inputs,
#if IS_TRT_VERSION_GE(6000) #if IS_TRT_VERSION_GE(6000)
void PReluPluginDynamic::terminate() {
if (p_gpu_weight_) {
cudaFree(p_gpu_weight_);
}
}
int PReluPluginDynamic::initialize() { int PReluPluginDynamic::initialize() {
cudaMalloc(&p_gpu_weight_, sizeof(float) * weight_.size()); cudaMalloc(&p_gpu_weight_, sizeof(float) * weight_.size());
cudaMemcpy(p_gpu_weight_, weight_.data(), weight_.size() * sizeof(float), cudaMemcpy(p_gpu_weight_, weight_.data(), weight_.size() * sizeof(float),
......
...@@ -102,12 +102,15 @@ class PReluPluginDynamic : public DynamicPluginTensorRT { ...@@ -102,12 +102,15 @@ class PReluPluginDynamic : public DynamicPluginTensorRT {
} }
~PReluPluginDynamic() { cudaFree(p_gpu_weight_); } ~PReluPluginDynamic() { cudaFree(p_gpu_weight_); }
nvinfer1::IPluginV2DynamicExt* clone() const override { nvinfer1::IPluginV2DynamicExt* clone() const override {
return new PReluPluginDynamic(weight_.data(), weight_.size(), mode_); auto ptr = new PReluPluginDynamic(weight_.data(), weight_.size(), mode_);
ptr->p_gpu_weight_ = p_gpu_weight_;
return ptr;
} }
const char* getPluginType() const override { return "prelu_plugin"; } const char* getPluginType() const override { return "prelu_plugin"; }
int getNbOutputs() const override { return 1; } int getNbOutputs() const override { return 1; }
int initialize() override; int initialize() override;
void terminate() override;
size_t getSerializationSize() const override; size_t getSerializationSize() const override;
void serialize(void* buffer) const override; void serialize(void* buffer) const override;
......
...@@ -51,8 +51,11 @@ class SkipLayerNormPluginDynamic : public DynamicPluginTensorRT { ...@@ -51,8 +51,11 @@ class SkipLayerNormPluginDynamic : public DynamicPluginTensorRT {
} }
nvinfer1::IPluginV2DynamicExt* clone() const override { nvinfer1::IPluginV2DynamicExt* clone() const override {
return new SkipLayerNormPluginDynamic( auto ptr = new SkipLayerNormPluginDynamic(
bias_.data(), scale_.data(), bias_size_, scale_size_, eps_, ban_fp16_); bias_.data(), scale_.data(), bias_size_, scale_size_, eps_, ban_fp16_);
ptr->bias_gpu_ = bias_gpu_;
ptr->scale_gpu_ = scale_gpu_;
return ptr;
} }
const char* getPluginType() const override { return "skip_layernorm_plugin"; } const char* getPluginType() const override { return "skip_layernorm_plugin"; }
......
...@@ -471,19 +471,10 @@ if(WITH_GPU AND TENSORRT_FOUND) ...@@ -471,19 +471,10 @@ if(WITH_GPU AND TENSORRT_FOUND)
inference_download_and_uncompress(${TEST_TRT_ERNIE_MODEL} ${INFERENCE_URL}/tensorrt_test "ernie_model_4_unserialized.tgz") inference_download_and_uncompress(${TEST_TRT_ERNIE_MODEL} ${INFERENCE_URL}/tensorrt_test "ernie_model_4_unserialized.tgz")
endif() endif()
inference_analysis_test(test_trt_dynamic_shape_ernie_serialize SRCS trt_dynamic_shape_ernie_deserialize_test.cc inference_analysis_test(test_trt_dynamic_shape_ernie_ser_deser SRCS trt_dynamic_shape_ernie_deserialize_test.cc
EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
ARGS --infer_model=${TEST_TRT_ERNIE_MODEL}/ernie_model_4_unserialized) ARGS --infer_model=${TEST_TRT_ERNIE_MODEL}/ernie_model_4_unserialized)
set(TEST_TRT_ERNIE_SER_MODEL "${TRT_MODEL_INSTALL_DIR}/ernie_test/ernie_model_4_serialized/")
if (NOT EXISTS ${TEST_TRT_ERNIE_SER_MODEL})
inference_download_and_uncompress(${TEST_TRT_ERNIE_MODEL} ${INFERENCE_URL}/tensorrt_test "ernie_model_4_serialized.tgz")
endif()
inference_analysis_test(test_trt_dynamic_shape_ernie_deserialize SRCS trt_dynamic_shape_ernie_deserialize_test.cc
EXTRA_DEPS ${INFERENCE_EXTRA_DEPS}
ARGS --infer_model=${TEST_TRT_ERNIE_MODEL}/ernie_model_4_serialized)
endif() endif()
set(LITE_MODEL_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/lite") set(LITE_MODEL_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/lite")
......
...@@ -66,7 +66,7 @@ TEST(AnalysisPredictor, use_gpu) { ...@@ -66,7 +66,7 @@ TEST(AnalysisPredictor, use_gpu) {
float* data_o = static_cast<float*>(outputs[0].data.data()); float* data_o = static_cast<float*>(outputs[0].data.data());
for (size_t j = 0; j < outputs[0].data.length() / sizeof(float); j += 10) { for (size_t j = 0; j < outputs[0].data.length() / sizeof(float); j += 10) {
EXPECT_NEAR((data_o[j] - truth_values[j / 10]) / truth_values[j / 10], 0., EXPECT_NEAR((data_o[j] - truth_values[j / 10]) / truth_values[j / 10], 0.,
10e-5); 12e-5);
} }
} }
......
...@@ -123,8 +123,11 @@ void trt_ernie(bool with_fp16, std::vector<float> result) { ...@@ -123,8 +123,11 @@ void trt_ernie(bool with_fp16, std::vector<float> result) {
config.EnableTensorRtEngine(1 << 30, 1, 5, precision, true, false); config.EnableTensorRtEngine(1 << 30, 1, 5, precision, true, false);
config.SetTRTDynamicShapeInfo(min_input_shape, max_input_shape, config.SetTRTDynamicShapeInfo(min_input_shape, max_input_shape,
opt_input_shape); opt_input_shape);
AnalysisConfig* config_deser = new AnalysisConfig(config);
std::vector<float> out_data; std::vector<float> out_data;
run(config, &out_data); run(config, &out_data); // serialize
run(*config_deser, &out_data); // deserialize
for (size_t i = 0; i < out_data.size(); i++) { for (size_t i = 0; i < out_data.size(); i++) {
EXPECT_NEAR(result[i], out_data[i], 1e-6); EXPECT_NEAR(result[i], out_data[i], 1e-6);
} }
......
...@@ -126,7 +126,7 @@ void trt_ernie(bool with_fp16, std::vector<float> result) { ...@@ -126,7 +126,7 @@ void trt_ernie(bool with_fp16, std::vector<float> result) {
std::vector<float> out_data; std::vector<float> out_data;
run(config, &out_data); run(config, &out_data);
for (size_t i = 0; i < out_data.size(); i++) { for (size_t i = 0; i < out_data.size(); i++) {
EXPECT_NEAR(result[i], out_data[i], 1e-6); EXPECT_NEAR(result[i], out_data[i], 1e-5);
} }
} }
......
...@@ -23,6 +23,8 @@ cc_library(retry_allocator SRCS retry_allocator.cc DEPS allocator) ...@@ -23,6 +23,8 @@ cc_library(retry_allocator SRCS retry_allocator.cc DEPS allocator)
nv_library(pinned_allocator SRCS pinned_allocator.cc DEPS allocator) nv_library(pinned_allocator SRCS pinned_allocator.cc DEPS allocator)
if (WITH_GPU) if (WITH_GPU)
set(AllocatorFacadeDeps gpu_info cuda_allocator pinned_allocator cuda_device_guard thread_local_allocator) set(AllocatorFacadeDeps gpu_info cuda_allocator pinned_allocator cuda_device_guard thread_local_allocator)
elseif(WITH_XPU)
set(AllocatorFacadeDeps xpu_info)
else () else ()
set(AllocatorFacadeDeps) set(AllocatorFacadeDeps)
endif() endif()
......
...@@ -39,6 +39,9 @@ ...@@ -39,6 +39,9 @@
#include "paddle/fluid/platform/cuda_device_guard.h" #include "paddle/fluid/platform/cuda_device_guard.h"
#include "paddle/fluid/platform/gpu_info.h" #include "paddle/fluid/platform/gpu_info.h"
#endif #endif
#ifdef PADDLE_WITH_XPU
#include "paddle/fluid/platform/xpu_info.h"
#endif
DEFINE_int64( DEFINE_int64(
gpu_allocator_retry_time, 10000, gpu_allocator_retry_time, 10000,
...@@ -62,6 +65,11 @@ class AllocatorFacadePrivate { ...@@ -62,6 +65,11 @@ class AllocatorFacadePrivate {
switch (strategy) { switch (strategy) {
case AllocatorStrategy::kNaiveBestFit: { case AllocatorStrategy::kNaiveBestFit: {
InitNaiveBestFitCPUAllocator(); InitNaiveBestFitCPUAllocator();
#ifdef PADDLE_WITH_XPU
for (int dev_id = 0; dev_id < platform::GetXPUDeviceCount(); ++dev_id) {
InitNaiveBestFitXPUAllocator(platform::XPUPlace(dev_id));
}
#endif
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount(); for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount();
++dev_id) { ++dev_id) {
...@@ -74,6 +82,11 @@ class AllocatorFacadePrivate { ...@@ -74,6 +82,11 @@ class AllocatorFacadePrivate {
case AllocatorStrategy::kAutoGrowth: { case AllocatorStrategy::kAutoGrowth: {
InitNaiveBestFitCPUAllocator(); InitNaiveBestFitCPUAllocator();
#ifdef PADDLE_WITH_XPU
for (int dev_id = 0; dev_id < platform::GetXPUDeviceCount(); ++dev_id) {
InitNaiveBestFitXPUAllocator(platform::XPUPlace(dev_id));
}
#endif
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount(); for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount();
++dev_id) { ++dev_id) {
...@@ -86,6 +99,11 @@ class AllocatorFacadePrivate { ...@@ -86,6 +99,11 @@ class AllocatorFacadePrivate {
case AllocatorStrategy::kThreadLocal: { case AllocatorStrategy::kThreadLocal: {
InitNaiveBestFitCPUAllocator(); InitNaiveBestFitCPUAllocator();
#ifdef PADDLE_WITH_XPU
for (int dev_id = 0; dev_id < platform::GetXPUDeviceCount(); ++dev_id) {
InitNaiveBestFitXPUAllocator(platform::XPUPlace(dev_id));
}
#endif
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount(); for (int dev_id = 0; dev_id < platform::GetCUDADeviceCount();
++dev_id) { ++dev_id) {
...@@ -127,6 +145,13 @@ class AllocatorFacadePrivate { ...@@ -127,6 +145,13 @@ class AllocatorFacadePrivate {
private: private:
void InitSystemAllocators() { void InitSystemAllocators() {
system_allocators_[platform::CPUPlace()] = std::make_shared<CPUAllocator>(); system_allocators_[platform::CPUPlace()] = std::make_shared<CPUAllocator>();
#ifdef PADDLE_WITH_XPU
int device_count = platform::GetXPUDeviceCount();
for (int i = 0; i < device_count; ++i) {
platform::XPUPlace p(i);
system_allocators_[p] = std::make_shared<NaiveBestFitAllocator>(p);
}
#endif
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
system_allocators_[platform::CUDAPinnedPlace()] = system_allocators_[platform::CUDAPinnedPlace()] =
std::make_shared<CPUPinnedAllocator>(); std::make_shared<CPUPinnedAllocator>();
...@@ -164,6 +189,12 @@ class AllocatorFacadePrivate { ...@@ -164,6 +189,12 @@ class AllocatorFacadePrivate {
} }
#endif #endif
#ifdef PADDLE_WITH_XPU
void InitNaiveBestFitXPUAllocator(platform::XPUPlace p) {
allocators_[p] = std::make_shared<NaiveBestFitAllocator>(p);
}
#endif
class ZeroSizeAllocator : public Allocator { class ZeroSizeAllocator : public Allocator {
public: public:
explicit ZeroSizeAllocator(platform::Place place) : place_(place) {} explicit ZeroSizeAllocator(platform::Place place) : place_(place) {}
...@@ -191,6 +222,12 @@ class AllocatorFacadePrivate { ...@@ -191,6 +222,12 @@ class AllocatorFacadePrivate {
} }
places.emplace_back(platform::CUDAPinnedPlace()); places.emplace_back(platform::CUDAPinnedPlace());
#endif #endif
#ifdef PADDLE_WITH_XPU
int device_count = platform::GetXPUDeviceCount();
for (int dev_id = 0; dev_id < device_count; ++dev_id) {
places.emplace_back(platform::XPUPlace(dev_id));
}
#endif
for (auto& p : places) { for (auto& p : places) {
zero_size_allocators_[p] = std::make_shared<ZeroSizeAllocator>(p); zero_size_allocators_[p] = std::make_shared<ZeroSizeAllocator>(p);
......
...@@ -29,6 +29,9 @@ ...@@ -29,6 +29,9 @@
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/cuda_device_guard.h" #include "paddle/fluid/platform/cuda_device_guard.h"
#endif #endif
#ifdef PADDLE_WITH_XPU
#include "paddle/fluid/platform/xpu_header.h"
#endif
DEFINE_bool(init_allocated_mem, false, DEFINE_bool(init_allocated_mem, false,
"It is a mistake that the values of the memory allocated by " "It is a mistake that the values of the memory allocated by "
...@@ -101,6 +104,100 @@ size_t Used<platform::CPUPlace>(const platform::CPUPlace &place) { ...@@ -101,6 +104,100 @@ size_t Used<platform::CPUPlace>(const platform::CPUPlace &place) {
return GetCPUBuddyAllocator()->Used(); return GetCPUBuddyAllocator()->Used();
} }
template <>
void *Alloc<platform::XPUPlace>(const platform::XPUPlace &place, size_t size) {
#ifdef PADDLE_WITH_XPU
VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place);
void *p = nullptr;
int dev_id = -1;
int ret = xpu_current_device(&dev_id);
PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
if (dev_id >= 64) {
// if dev_id >= 64, the device is a simulator device, -64 to get real dev_id
dev_id -= 64;
}
ret = xpu_set_device(place.device);
PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
ret = xpu_malloc(reinterpret_cast<void **>(&p), size);
PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
if (FLAGS_init_allocated_mem) {
PADDLE_THROW(platform::errors::Unimplemented(
"xpu memory FLAGS_init_allocated_mem is not implemented."));
}
ret = xpu_set_device(dev_id);
PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
VLOG(10) << " pointer=" << p;
return p;
#else
PADDLE_THROW(
platform::errors::PermissionDenied("'XPUPlace' is not supported."));
return nullptr;
#endif
}
template <>
void Free<platform::XPUPlace>(const platform::XPUPlace &place, void *p,
size_t size) {
#ifdef PADDLE_WITH_XPU
VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place);
VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place);
int dev_id = -1;
int ret = xpu_current_device(&dev_id);
PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
if (dev_id >= 64) {
// if dev_id >= 64, the device is a simulator device, -64 to get real dev_id
dev_id -= 64;
}
ret = xpu_set_device(place.device);
PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
xpu_free(p);
ret = xpu_set_device(dev_id);
PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
#else
PADDLE_THROW(
platform::errors::PermissionDenied("'XPUPlace' is not supported."));
#endif
}
template <>
size_t Used<platform::XPUPlace>(const platform::XPUPlace &place) {
#ifdef PADDLE_WITH_XPU
printf("Used func return 0 for XPUPlace\n");
return 0;
#else
PADDLE_THROW(
platform::errors::PermissionDenied("'XPUPlace' is not supported."));
#endif
}
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
class GPUBuddyAllocatorList { class GPUBuddyAllocatorList {
private: private:
......
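Alloc and Free above repeat the same sequence: query the current device with xpu_current_device, subtract 64 when the id refers to a simulator device, switch to the target card with xpu_set_device, do the work, then restore the previous device. A minimal sketch of that pattern as an RAII helper is shown below; the class name `XPUDeviceGuard` is an assumption, it is not part of this patch and only illustrates the save/set/restore convention used by the code above.

```
// Illustrative sketch only (assumes an XPU build and the xpu_* APIs used above).
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/xpu_header.h"

class XPUDeviceGuard {
 public:
  explicit XPUDeviceGuard(int target_dev_id) {
    int ret = xpu_current_device(&prev_dev_id_);
    PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
                      paddle::platform::errors::External(
                          "xpu_current_device failed with %d", ret));
    // Ids >= 64 denote simulator devices; subtract 64 to get the real id.
    if (prev_dev_id_ >= 64) prev_dev_id_ -= 64;
    ret = xpu_set_device(target_dev_id);
    PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
                      paddle::platform::errors::External(
                          "xpu_set_device failed with %d", ret));
  }
  ~XPUDeviceGuard() { xpu_set_device(prev_dev_id_); }

 private:
  int prev_dev_id_ = -1;
};
```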
...@@ -18,6 +18,10 @@ limitations under the License. */ ...@@ -18,6 +18,10 @@ limitations under the License. */
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/profiler.h" #include "paddle/fluid/platform/profiler.h"
#ifdef PADDLE_WITH_XPU
#include "paddle/fluid/platform/xpu_header.h"
#endif
namespace paddle { namespace paddle {
namespace memory { namespace memory {
...@@ -29,6 +33,169 @@ void Copy<platform::CPUPlace, platform::CPUPlace>(platform::CPUPlace, void* dst, ...@@ -29,6 +33,169 @@ void Copy<platform::CPUPlace, platform::CPUPlace>(platform::CPUPlace, void* dst,
std::memcpy(dst, src, num); std::memcpy(dst, src, num);
} }
#ifdef PADDLE_WITH_XPU
template <>
void Copy<platform::XPUPlace, platform::CPUPlace>(platform::XPUPlace dst_place,
void* dst,
platform::CPUPlace src_place,
const void* src, size_t num) {
if (num <= 0) {
VLOG(0) << "memcpy XPU_HOST_TO_DEVICE size <= 0 (" << num << ")";
return;
}
int dev_id = -1;
int ret = xpu_current_device(&dev_id);
PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
if (dev_id >= 64) {
// if dev_id >= 64, the device is a simulator device, -64 to get real dev_id
dev_id -= 64;
}
if (dev_id != dst_place.device) {
ret = xpu_set_device(dst_place.device);
PADDLE_ENFORCE_EQ(
ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
}
ret = xpu_memcpy(dst, src, num, XPUMemcpyKind::XPU_HOST_TO_DEVICE);
PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
if (dev_id != dst_place.device) {
ret = xpu_set_device(dev_id);
PADDLE_ENFORCE_EQ(
ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
}
}
template <>
void Copy<platform::CPUPlace, platform::XPUPlace>(platform::CPUPlace dst_place,
void* dst,
platform::XPUPlace src_place,
const void* src, size_t num) {
if (num <= 0) {
VLOG(0) << "memcpy XPU_DEVICE_TO_HOST size <= 0 (" << num << ")";
return;
}
int dev_id = -1;
int ret = xpu_current_device(&dev_id);
PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
if (dev_id >= 64) {
// if dev_id >= 64, the device is a simulator device, -64 to get real dev_id
dev_id -= 64;
}
if (dev_id != src_place.device) {
ret = xpu_set_device(src_place.device);
PADDLE_ENFORCE_EQ(
ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
}
ret = xpu_memcpy(dst, src, num, XPUMemcpyKind::XPU_DEVICE_TO_HOST);
PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
if (dev_id != src_place.device) {
ret = xpu_set_device(dev_id);
PADDLE_ENFORCE_EQ(
ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
}
}
template <>
void Copy<platform::XPUPlace, platform::XPUPlace>(platform::XPUPlace dst_place,
void* dst,
platform::XPUPlace src_place,
const void* src, size_t num) {
if (num <= 0) {
VLOG(0) << "memcpy XPU_DEVICE_TO_DEVICE size <= 0 (" << num << ")";
return;
}
int dev_id = -1;
int ret = xpu_current_device(&dev_id);
PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
if (dev_id >= 64) {
// if dev_id >= 64, the device is a simulator device, -64 to get real dev_id
dev_id -= 64;
}
if (dev_id != src_place.device || dev_id != dst_place.device) {
ret = xpu_set_device(src_place.device);
PADDLE_ENFORCE_EQ(
ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
void* tmp = malloc(num);
ret = xpu_memcpy(tmp, src, num, XPUMemcpyKind::XPU_DEVICE_TO_HOST);
PADDLE_ENFORCE_EQ(
ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
ret = xpu_set_device(dst_place.device);
PADDLE_ENFORCE_EQ(
ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
ret = xpu_memcpy(dst, tmp, num, XPUMemcpyKind::XPU_HOST_TO_DEVICE);
PADDLE_ENFORCE_EQ(
ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
ret = xpu_set_device(dev_id);
PADDLE_ENFORCE_EQ(
ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
free(tmp);
} else {
int ret = xpu_memcpy(dst, src, num, XPUMemcpyKind::XPU_DEVICE_TO_DEVICE);
PADDLE_ENFORCE_EQ(
ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
}
}
#endif
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
static constexpr size_t kMaxGpuAsyncCopyBytes = 64 * 1024; // 64K static constexpr size_t kMaxGpuAsyncCopyBytes = 64 * 1024; // 64K
......
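The three specializations above cover host-to-device, device-to-host, and device-to-device copies; note that the XPU-to-XPU path stages the data through a temporary host buffer whenever the two places are on different cards. A hedged usage sketch follows; it assumes an XPU build and that `dev0`/`dev1` were already allocated on XPU devices 0 and 1, for example with the Alloc<platform::XPUPlace> shown earlier.

```
#include <vector>
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/platform/place.h"

// Usage sketch only; dev0 and dev1 are assumed to be valid buffers already
// allocated on XPU devices 0 and 1.
void CopyRoundTrip(void* dev0, void* dev1) {
  const size_t bytes = 1024 * sizeof(float);
  std::vector<float> host(1024, 1.0f);
  paddle::platform::CPUPlace cpu;
  paddle::platform::XPUPlace xpu0(0), xpu1(1);

  paddle::memory::Copy(xpu0, dev0, cpu, host.data(), bytes);  // XPU_HOST_TO_DEVICE
  paddle::memory::Copy(xpu1, dev1, xpu0, dev0, bytes);  // staged through a host buffer
  paddle::memory::Copy(cpu, host.data(), xpu1, dev1, bytes);  // XPU_DEVICE_TO_HOST
}
```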
...@@ -88,7 +88,9 @@ endif() ...@@ -88,7 +88,9 @@ endif()
cc_library(common_infer_shape_functions SRCS common_infer_shape_functions.cc DEPS operator) cc_library(common_infer_shape_functions SRCS common_infer_shape_functions.cc DEPS operator)
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} selected_rows_functor selected_rows lod_tensor maxouting unpooling pooling lod_rank_table context_project sequence_pooling executor device_memory_aligment) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} selected_rows_functor selected_rows
lod_tensor maxouting unpooling pooling lod_rank_table context_project
sequence_pooling executor device_memory_aligment generator)
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc)
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_functor memory jit_kernel_helper concat_and_split cross_entropy softmax vol2col im2col sampler sample_prob tree2col) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_functor memory jit_kernel_helper concat_and_split cross_entropy softmax vol2col im2col sampler sample_prob tree2col)
set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions beam_search fc matrix_inverse) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions beam_search fc matrix_inverse)
...@@ -121,7 +123,7 @@ cc_test(beam_search_decode_op_test SRCS beam_search_decode_op_test.cc DEPS lod_t ...@@ -121,7 +123,7 @@ cc_test(beam_search_decode_op_test SRCS beam_search_decode_op_test.cc DEPS lod_t
cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor memory) cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor memory)
cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op) cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op)
cc_test(save_load_combine_op_test SRCS save_load_combine_op_test.cc DEPS save_combine_op load_combine_op) cc_test(save_load_combine_op_test SRCS save_load_combine_op_test.cc DEPS save_combine_op load_combine_op)
nv_test(dropout_op_test SRCS dropout_op_test.cc DEPS dropout_op tensor) nv_test(dropout_op_test SRCS dropout_op_test.cc DEPS dropout_op tensor generator)
if (WITH_GPU) if (WITH_GPU)
nv_test(test_leaky_relu_grad_grad_functor SRCS test_leaky_relu_grad_grad_functor.cc test_leaky_relu_grad_grad_functor.cu DEPS tensor device_context eigen3) nv_test(test_leaky_relu_grad_grad_functor SRCS test_leaky_relu_grad_grad_functor.cc test_leaky_relu_grad_grad_functor.cu DEPS tensor device_context eigen3)
else() else()
......
...@@ -219,7 +219,7 @@ $$out = \\frac{1}{\\sqrt{x}}$$ ...@@ -219,7 +219,7 @@ $$out = \\frac{1}{\\sqrt{x}}$$
)DOC"; )DOC";
UNUSED constexpr char AbsDoc[] = R"DOC( UNUSED constexpr char AbsDoc[] = R"DOC(
Abs Activation Operator. Abs Operator.
$$out = |x|$$ $$out = |x|$$
...@@ -242,6 +242,9 @@ $$out = \\left \\lfloor x \\right \\rfloor$$ ...@@ -242,6 +242,9 @@ $$out = \\left \\lfloor x \\right \\rfloor$$
UNUSED constexpr char CosDoc[] = R"DOC( UNUSED constexpr char CosDoc[] = R"DOC(
Cosine Operator. Computes cosine of x element-wise. Cosine Operator. Computes cosine of x element-wise.
Input range is `(-inf, inf)` and output range is `[-1,1]`.
Returns `nan` if the input is out of boundary (e.g. `inf`).
$$out = cos(x)$$ $$out = cos(x)$$
)DOC"; )DOC";
...@@ -314,13 +317,6 @@ $$out = x^2$$ ...@@ -314,13 +317,6 @@ $$out = x^2$$
)DOC"; )DOC";
UNUSED constexpr char SoftplusDoc[] = R"DOC(
Softplus Activation Operator.
$$out = \ln(1 + e^{x})$$
)DOC";
UNUSED constexpr char SoftsignDoc[] = R"DOC( UNUSED constexpr char SoftsignDoc[] = R"DOC(
Softsign Activation Operator. Softsign Activation Operator.
...@@ -334,7 +330,7 @@ class AcosOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -334,7 +330,7 @@ class AcosOpMaker : public framework::OpProtoAndCheckerMaker {
AddInput("X", "Input of acos operator"); AddInput("X", "Input of acos operator");
AddOutput("Out", "Output of acos operator"); AddOutput("Out", "Output of acos operator");
AddComment(R"DOC( AddComment(R"DOC(
Arccosine Activation Operator. Arccosine Operator.
$$out = \cos^{-1}(x)$$ $$out = \cos^{-1}(x)$$
...@@ -348,7 +344,7 @@ class AsinOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -348,7 +344,7 @@ class AsinOpMaker : public framework::OpProtoAndCheckerMaker {
AddInput("X", "Input of asin operator"); AddInput("X", "Input of asin operator");
AddOutput("Out", "Output of asin operator"); AddOutput("Out", "Output of asin operator");
AddComment(R"DOC( AddComment(R"DOC(
Arcsine Activation Operator. Arcsine Operator.
$$out = \sin^{-1}(x)$$ $$out = \sin^{-1}(x)$$
...@@ -362,9 +358,9 @@ class AtanOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -362,9 +358,9 @@ class AtanOpMaker : public framework::OpProtoAndCheckerMaker {
AddInput("X", "Input of atan operator"); AddInput("X", "Input of atan operator");
AddOutput("Out", "Output of atan operator"); AddOutput("Out", "Output of atan operator");
AddComment(R"DOC( AddComment(R"DOC(
Arctanh Activation Operator. Arctangent Operator.
$$out = \tanh^{-1}(x)$$ $$out = \tan^{-1}(x)$$
)DOC"); )DOC");
} }
...@@ -393,6 +389,36 @@ $$out = \max(x, \alpha * x)$$ ...@@ -393,6 +389,36 @@ $$out = \max(x, \alpha * x)$$
} }
}; };
class SoftplusOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X",
"Input of Softplus operator, an N-D Tensor, with data type "
"float32, float64 or float16.");
AddOutput(
"Out",
"Output of Softplus operator, a Tensor with shape same as input.");
AddAttr<float>("beta", "The value of beta for Softplus.").SetDefault(1.0f);
AddAttr<float>("threshold", "The value of threshold for Softplus.")
.SetDefault(20.0f);
AddAttr<bool>("use_mkldnn",
"(bool, default false) Only used in mkldnn kernel.")
.SetDefault(false);
AddAttr<bool>(
"use_cudnn",
"(bool, default false) Only used in cudnn kernel, need install cudnn.")
.SetDefault(false);
AddComment(R"DOC(
:strong:`Softplus Activation Operator`
.. math::
out = \frac{1}{\beta} * \log(1 + \exp(\beta * x)) \\
\text{For numerical stability, the implementation reverts to the linear function when: } x \times \beta > threshold.
)DOC");
}
};
class SoftShrinkOpMaker : public framework::OpProtoAndCheckerMaker { class SoftShrinkOpMaker : public framework::OpProtoAndCheckerMaker {
public: public:
void Make() override { void Make() override {
...@@ -669,7 +695,6 @@ REGISTER_ACTIVATION_OP_MAKER(Reciprocal, ReciprocalDoc); ...@@ -669,7 +695,6 @@ REGISTER_ACTIVATION_OP_MAKER(Reciprocal, ReciprocalDoc);
REGISTER_ACTIVATION_OP_MAKER(Log, LogDoc); REGISTER_ACTIVATION_OP_MAKER(Log, LogDoc);
REGISTER_ACTIVATION_OP_MAKER(Log1p, Log1pDoc); REGISTER_ACTIVATION_OP_MAKER(Log1p, Log1pDoc);
REGISTER_ACTIVATION_OP_MAKER(Square, SquareDoc); REGISTER_ACTIVATION_OP_MAKER(Square, SquareDoc);
REGISTER_ACTIVATION_OP_MAKER(Softplus, SoftplusDoc);
REGISTER_ACTIVATION_OP_MAKER(Softsign, SoftsignDoc); REGISTER_ACTIVATION_OP_MAKER(Softsign, SoftsignDoc);
template <ActBwdOpFwdDeps kDepValue> template <ActBwdOpFwdDeps kDepValue>
...@@ -756,8 +781,8 @@ class ReluDoubleGradMaker : public ::paddle::framework::SingleGradOpMaker<T> { ...@@ -756,8 +781,8 @@ class ReluDoubleGradMaker : public ::paddle::framework::SingleGradOpMaker<T> {
} }
}; };
// leaky_relu Grad: dx=dy if y>=0 else alpha * dy // leaky_relu Grad: dx=dy if x>=0 else alpha * dy
// leaky_relu GradGrad: ddy=ddx if y>=0 else alpha * ddx // leaky_relu GradGrad: ddy=ddx if x>=0 else alpha * ddx
template <typename T> template <typename T>
class LeakyReluDoubleGradMaker class LeakyReluDoubleGradMaker
: public ::paddle::framework::SingleGradOpMaker<T> { : public ::paddle::framework::SingleGradOpMaker<T> {
...@@ -767,8 +792,8 @@ class LeakyReluDoubleGradMaker ...@@ -767,8 +792,8 @@ class LeakyReluDoubleGradMaker
protected: protected:
void Apply(GradOpPtr<T> op) const override { void Apply(GradOpPtr<T> op) const override {
op->SetType("leaky_relu_grad_grad"); op->SetType("leaky_relu_grad_grad");
// input1: Out // input1: X
op->SetInput("Out", this->Input("Out")); op->SetInput("X", this->Input("X"));
// X@GRAD@GRAD: ddx // X@GRAD@GRAD: ddx
op->SetInput("DDX", this->OutputGrad(framework::GradVarName("X"))); op->SetInput("DDX", this->OutputGrad(framework::GradVarName("X")));
op->SetAttrMap(this->Attrs()); op->SetAttrMap(this->Attrs());
......
...@@ -388,9 +388,9 @@ struct HardShrinkFunctor : public BaseActivationFunctor<T> { ...@@ -388,9 +388,9 @@ struct HardShrinkFunctor : public BaseActivationFunctor<T> {
} }
template <typename Device, typename X, typename Out> template <typename Device, typename X, typename Out>
void operator()(Device d, X x, Out out) const { void operator()(Device d, X x, Out out) const {
auto temp1 = (x < static_cast<T>(threshold * -1)).template cast<T>(); auto temp1 = x < static_cast<T>(threshold * -1.f);
auto temp2 = (x > static_cast<T>(threshold)).template cast<T>(); auto temp2 = x > static_cast<T>(threshold);
out.device(d) = x * (temp1 + temp2); out.device(d) = x * (temp1 + temp2 > 0).template cast<T>();
} }
}; };
...@@ -405,9 +405,9 @@ struct HardShrinkGradFunctor : public BaseActivationFunctor<T> { ...@@ -405,9 +405,9 @@ struct HardShrinkGradFunctor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Out, typename dOut, template <typename Device, typename X, typename Out, typename dOut,
typename dX> typename dX>
void operator()(Device d, X x, Out out, dOut dout, dX dx) const { void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
auto temp1 = (x < static_cast<T>(threshold * -1)).template cast<T>(); auto temp1 = x < static_cast<T>(threshold * -1.f);
auto temp2 = (x > static_cast<T>(threshold)).template cast<T>(); auto temp2 = x > static_cast<T>(threshold);
dx.device(d) = dout * (temp1 + temp2).template cast<T>(); dx.device(d) = dout * (temp1 + temp2 > 0).template cast<T>();
} }
static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
...@@ -975,32 +975,46 @@ struct HardSwishGradFunctor : public BaseActivationFunctor<T> { ...@@ -975,32 +975,46 @@ struct HardSwishGradFunctor : public BaseActivationFunctor<T> {
static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
}; };
// softplus(x) = log(1 + exp(x)) // For numerical stability, using the following formula instead of softplus(x) =
// When x is a very large positive number, exp(x) may explode to inf, // log(1 + exp(x))
// Using trick below for numerical stability // softplus(x) = log(1 + exp(beta * x)) / beta when beta * x <= threshold(beta =
// https://hips.seas.harvard.edu/blog/2013/01/09/computing-log-sum-exp/ // 1, threshold = 20 by default), otherwise x
// Then: softplus(x) = max(x, 0) + log(exp(-max(x, 0)) + exp(x - max(x, 0)))
template <typename T> template <typename T>
struct SoftplusFunctor : public BaseActivationFunctor<T> { struct SoftplusFunctor : public BaseActivationFunctor<T> {
float beta;
float threshold;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"beta", &beta}, {"threshold", &threshold}};
}
template <typename Device, typename X, typename Out> template <typename Device, typename X, typename Out>
void operator()(Device d, X x, Out out) { void operator()(Device d, X x, Out out) {
auto temp = x.cwiseMax(static_cast<T>(0)); // temp = max(x, 0) auto x_beta = static_cast<T>(beta) * x;
out.device(d) = temp + (((-temp).exp() + (x - temp).exp()).log()); out.device(d) = (x_beta > static_cast<T>(threshold))
.select(x, (static_cast<T>(1) + x_beta.exp()).log() /
static_cast<T>(beta));
} }
}; };
// d(softplus(x))/dx = exp(x) / (1 + exp(x)) // For numerical stability, using the following formula instead of
// For numerical stability: // d(softplus(x))/dx = 1 / (1 + exp(-x))
// d(softplus(x))/dx = exp(x - max(x, 0)) / (exp(-max(x, 0)) + // d(softplus(x))/dx = 1 / (1 + exp(-beta * x)) when beta * x <= threshold(beta
// exp(x - max(x, 0))) // = 1, threshold = 20 by default), otherwise 1
template <typename T> template <typename T>
struct SoftplusGradFunctor : public BaseActivationFunctor<T> { struct SoftplusGradFunctor : public BaseActivationFunctor<T> {
float beta;
float threshold;
typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
return {{"beta", &beta}, {"threshold", &threshold}};
}
template <typename Device, typename X, typename Out, typename dOut, template <typename Device, typename X, typename Out, typename dOut,
typename dX> typename dX>
void operator()(Device d, X x, Out out, dOut dout, dX dx) { void operator()(Device d, X x, Out out, dOut dout, dX dx) {
auto temp = x.cwiseMax(static_cast<T>(0)); // temp = max(x, 0) auto x_beta = static_cast<T>(beta) * x;
dx.device(d) = dx.device(d) =
dout * ((x - temp).exp() / ((-temp).exp() + (x - temp).exp())); (x_beta > static_cast<T>(threshold))
.select(dout, dout / (static_cast<T>(1) + (-x_beta).exp()));
} }
static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
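In scalar form, the functors above compute softplus(x) = log(1 + exp(beta * x)) / beta while beta * x stays below the threshold and fall back to the identity x (with gradient 1) above it, which avoids evaluating exp of a huge argument. Below is a small standalone sanity check of that piecewise definition, using the defaults beta = 1 and threshold = 20 from SoftplusOpMaker; it is a sketch only, not part of this patch.

```
#include <cmath>
#include <cstdio>

// Scalar reference of the piecewise softplus used by SoftplusFunctor /
// SoftplusGradFunctor above (beta = 1, threshold = 20 are the op defaults).
double softplus(double x, double beta = 1.0, double threshold = 20.0) {
  const double xb = beta * x;
  return xb > threshold ? x : std::log1p(std::exp(xb)) / beta;
}

double softplus_grad(double x, double beta = 1.0, double threshold = 20.0) {
  const double xb = beta * x;
  return xb > threshold ? 1.0 : 1.0 / (1.0 + std::exp(-xb));
}

int main() {
  // Large inputs no longer overflow, since exp(1000) is never evaluated.
  std::printf("%g %g\n", softplus(1000.0), softplus_grad(1000.0));  // 1000 1
  std::printf("%g %g\n", softplus(0.0), softplus_grad(0.0));        // 0.693147 0.5
  return 0;
}
```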
...@@ -1070,7 +1084,11 @@ struct LeakyReluFunctor : public BaseActivationFunctor<T> { ...@@ -1070,7 +1084,11 @@ struct LeakyReluFunctor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Out> template <typename Device, typename X, typename Out>
void operator()(Device d, X x, Out out) const { void operator()(Device d, X x, Out out) const {
if (alpha < 1.f) {
out.device(d) = x.cwiseMax(static_cast<T>(alpha) * x); out.device(d) = x.cwiseMax(static_cast<T>(alpha) * x);
} else {
out.device(d) = x.cwiseMin(static_cast<T>(alpha) * x);
}
} }
}; };
...@@ -1084,12 +1102,12 @@ struct LeakyReluGradFunctor : public BaseActivationFunctor<T> { ...@@ -1084,12 +1102,12 @@ struct LeakyReluGradFunctor : public BaseActivationFunctor<T> {
typename dX> typename dX>
void operator()(Device d, X x, Out out, dOut dout, dX dx) const { void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
auto temp1 = auto temp1 =
static_cast<T>(alpha) * (out <= static_cast<T>(0)).template cast<T>(); static_cast<T>(alpha) * (x < static_cast<T>(0)).template cast<T>();
auto temp2 = (out > static_cast<T>(0)).template cast<T>(); auto temp2 = (x >= static_cast<T>(0)).template cast<T>();
dx.device(d) = dout * (temp1 + temp2).template cast<T>(); dx.device(d) = dout * (temp1 + temp2).template cast<T>();
} }
static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
}; };
template <typename T> template <typename T>
...@@ -1116,9 +1134,20 @@ struct ELUGradFunctor : public BaseActivationFunctor<T> { ...@@ -1116,9 +1134,20 @@ struct ELUGradFunctor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Out, typename dOut, template <typename Device, typename X, typename Out, typename dOut,
typename dX> typename dX>
void operator()(Device d, X x, Out out, dOut dout, dX dx) const { void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
dx.device(d) = dout * (x > static_cast<T>(0)).template cast<T>() + auto temp_a_pos = static_cast<T>(alpha > 0);
dout * static_cast<T>(alpha) * x.exp() * auto temp_a_neg = static_cast<T>(alpha <= 0);
(x <= static_cast<T>(0)).template cast<T>(); auto temp_x_pos = (x > static_cast<T>(0)).template cast<T>();
auto temp_x_neg = (x <= static_cast<T>(0)).template cast<T>();
// dx = dout, if alpha > 0 and x > 0
// dx = dout * alpha * x.exp(), if alpha > 0 and x <= 0
// dx = dout * (1 + alpha * x.exp()), if alpha <= 0 and x > 0
// dx = 0, if alpha <= 0 and x <=0
dx.device(d) =
dout * temp_a_pos * temp_x_pos +
dout * static_cast<T>(alpha) * x.exp() * temp_a_pos * temp_x_neg +
dout * (static_cast<T>(1) + static_cast<T>(alpha) * x.exp()) *
temp_a_neg * temp_x_pos;
} }
static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
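The rewritten ELU gradient distinguishes the sign of alpha as well as the sign of x, exactly as the four cases in the comment above describe. A scalar sketch of those cases, with dout folded in (illustration only, not part of this patch):

```
#include <cmath>

// Scalar form of the four cases listed in ELUGradFunctor above.
double elu_grad(double x, double dout, double alpha) {
  if (alpha > 0.0) {
    return x > 0.0 ? dout : dout * alpha * std::exp(x);
  }
  return x > 0.0 ? dout * (1.0 + alpha * std::exp(x)) : 0.0;
}
```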
...@@ -1437,18 +1466,18 @@ struct LeakyReluGradGradFunctor : public BaseActivationFunctor<T> { ...@@ -1437,18 +1466,18 @@ struct LeakyReluGradGradFunctor : public BaseActivationFunctor<T> {
auto* d = dev.eigen_device(); auto* d = dev.eigen_device();
auto ddx = framework::EigenVector<T>::Flatten( auto ddx = framework::EigenVector<T>::Flatten(
GET_DATA_SAFELY(ddX, "Input", "DDX", "LeakyReluGradGrad")); GET_DATA_SAFELY(ddX, "Input", "DDX", "LeakyReluGradGrad"));
auto out = framework::EigenVector<T>::Flatten( auto x = framework::EigenVector<T>::Flatten(
GET_DATA_SAFELY(Out, "Output", "Out", "LeakyReluGradGrad")); GET_DATA_SAFELY(X, "Input", "X", "LeakyReluGradGrad"));
auto ddout = framework::EigenVector<T>::Flatten( auto ddout = framework::EigenVector<T>::Flatten(
GET_DATA_SAFELY(ddOut, "Output", "DOut", "LeakyReluGradGrad")); GET_DATA_SAFELY(ddOut, "Output", "DOut", "LeakyReluGradGrad"));
ddout.device(*d) = ddx * ddout.device(*d) =
((out > static_cast<T>(0)).template cast<T>() + ddx *
static_cast<T>(alpha) * ((x > static_cast<T>(0)).template cast<T>() +
(out <= static_cast<T>(0)).template cast<T>()) static_cast<T>(alpha) * (x <= static_cast<T>(0)).template cast<T>())
.template cast<T>(); .template cast<T>();
} }
} }
static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepOut; } static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; }
}; };
template <typename T> template <typename T>
......
...@@ -28,10 +28,15 @@ using Tensor = framework::Tensor; ...@@ -28,10 +28,15 @@ using Tensor = framework::Tensor;
template <typename T> template <typename T>
struct Linspace<paddle::platform::CPUDeviceContext, T> { struct Linspace<paddle::platform::CPUDeviceContext, T> {
void operator()(T start, T end, int count, framework::Tensor* numbers, void operator()(T start, T end, int count, bool align_corners,
framework::Tensor* numbers,
const framework::ExecutionContext& ctx) { const framework::ExecutionContext& ctx) {
T* number_data = numbers->mutable_data<T>({count}, platform::CPUPlace()); T* number_data = numbers->mutable_data<T>({count}, platform::CPUPlace());
T slice = (end - start) / (T)(count - 1); T slice = (end - start) / (T)(count - 1);
if (!align_corners) {
slice = (end - start) / (T)count;
start *= (T)(count - 1) / (T)count;
}
for (int i = 0; i < count; ++i) { for (int i = 0; i < count; ++i) {
number_data[i] = start + (T)i * slice; number_data[i] = start + (T)i * slice;
} }
...@@ -130,6 +135,10 @@ class AffineGridOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -130,6 +135,10 @@ class AffineGridOpMaker : public framework::OpProtoAndCheckerMaker {
"use_cudnn", "use_cudnn",
"(bool, default false) Only used in cudnn kernel, need install cudnn") "(bool, default false) Only used in cudnn kernel, need install cudnn")
.SetDefault(true); .SetDefault(true);
AddAttr<bool>("align_corners",
"(bool, default false) Whether to align the corners of input"
"and ouput.")
.SetDefault(true);
AddAttr<std::vector<int>>( AddAttr<std::vector<int>>(
"output_shape", "output_shape",
"The target output image shape with format [N, C, H, W].") "The target output image shape with format [N, C, H, W].")
...@@ -164,10 +173,12 @@ class AffineGridOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -164,10 +173,12 @@ class AffineGridOpMaker : public framework::OpProtoAndCheckerMaker {
[-1. -0.5 0. 0.5 1. ] [-1. -0.5 0. 0.5 1. ]
[-1. -0.5 0. 0.5 1. ] [-1. -0.5 0. 0.5 1. ]
[-1. -0.5 0. 0.5 1. ]]] [-1. -0.5 0. 0.5 1. ]]]
C[0] is the coordinates in height axis and C[1] is the coordinates in width axis. C[0] is the coordinates in height axis and C[1] is the coordinates in
width axis.
Step2: Step2:
Transpose and reshape C to shape [H * W, 2] and append ones to last dimension. Then we get: Transpose and reshape C to shape [H * W, 2] and append ones to last
dimension. Then we get:
C_ = [[-1. -1. 1. ] C_ = [[-1. -1. 1. ]
[-0.5 -1. 1. ] [-0.5 -1. 1. ]
[ 0. -1. 1. ] [ 0. -1. 1. ]
......
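With align_corners left at true, the Linspace functor spaces count points from start to end inclusive (slice = (end - start) / (count - 1)); with align_corners = false it uses slice = (end - start) / count and scales start by (count - 1) / count, which shifts the samples toward cell centers. A standalone sketch of the two modes for start = -1, end = 1, count = 5 follows; the CUDA functor in the new affine_grid_op.cu below uses the same arithmetic.

```
#include <cstdio>
#include <vector>

// Scalar version of the Linspace arithmetic used by the CPU and CUDA functors.
std::vector<float> linspace(float start, float end, int count, bool align_corners) {
  float slice = (end - start) / static_cast<float>(count - 1);
  if (!align_corners) {
    slice = (end - start) / static_cast<float>(count);
    start *= static_cast<float>(count - 1) / static_cast<float>(count);
  }
  std::vector<float> out(count);
  for (int i = 0; i < count; ++i) out[i] = start + static_cast<float>(i) * slice;
  return out;
}

int main() {
  // align_corners = true : -1.0 -0.5  0.0  0.5  1.0
  // align_corners = false: -0.8 -0.4  0.0  0.4  0.8
  for (bool ac : {true, false}) {
    for (float v : linspace(-1.f, 1.f, 5, ac)) std::printf("% .1f ", v);
    std::printf("\n");
  }
  return 0;
}
```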
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/affine_grid_op.h"
#include "paddle/fluid/platform/cuda_device_function.h"
#include "paddle/fluid/platform/gpu_info.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
template <typename T>
__global__ void LinspaceKernel(T start, T step, int64_t size, T* out) {
CUDA_KERNEL_LOOP(index, size) { out[index] = start + step * index; }
}
template <typename T>
struct Linspace<paddle::platform::CUDADeviceContext, T> {
void operator()(T start, T end, int count, bool align_corners,
framework::Tensor* numbers,
const framework::ExecutionContext& ctx) {
T* number_data = numbers->mutable_data<T>({count}, ctx.GetPlace());
T slice = (end - start) / (T)(count - 1);
if (!align_corners) {
slice = (end - start) / (T)count;
start *= (T)(count - 1) / (T)count;
}
auto stream = ctx.cuda_device_context().stream();
int block = 512;
int grid = (count + block - 1) / block;
LinspaceKernel<T><<<grid, block, 0, stream>>>(start, slice, count,
number_data);
}
};
template <typename T>
__global__ void affine_grid_kernel(const int count, int n, int out_h, int out_w,
T h_start, T w_start, T h_step, T w_step,
const T* theta, // N, 2, 3
T* output) {
CUDA_KERNEL_LOOP(index, count) {
int w = index % out_w;
int h = (index / out_w) % out_h;
int n = index / (out_w * out_h);
T h_coor = h_step * static_cast<T>(h) + static_cast<T>(h_start);
T w_coor = w_step * static_cast<T>(w) + static_cast<T>(w_start);
int theta_offset = n * 6; // 2 * 3;
// affine from (h_coor, w_coor) to (x, y)
output[index * 2] = theta[theta_offset] * h_coor +
theta[theta_offset + 1] * w_coor +
theta[theta_offset + 2];
output[index * 2 + 1] = theta[theta_offset + 3] * h_coor +
theta[theta_offset + 4] * w_coor +
theta[theta_offset + 5];
}
}
template <typename T>
__global__ void affine_grid_grad_kernel(const int count, int n, int out_h,
int out_w, T h_start, T w_start,
T h_step, T w_step,
const T* out_grad, // N, H, W, 2
T* theta_grad) { // N, 2, 3
CUDA_KERNEL_LOOP(index, count) {
int w = index % out_w;
int h = (index / out_w) % out_h;
int n = index / (out_w * out_h);
T h_coor = h_step * static_cast<T>(h) + static_cast<T>(h_start);
T w_coor = w_step * static_cast<T>(w) + static_cast<T>(w_start);
int theta_offset = n * 6; // 2 * 3;
T out_grad_x = out_grad[index * 2];
atomicAdd(theta_grad + theta_offset, out_grad_x * h_coor);
atomicAdd(theta_grad + theta_offset + 1, out_grad_x * w_coor);
atomicAdd(theta_grad + theta_offset + 2, out_grad_x);
T out_grad_y = out_grad[index * 2 + 1];
atomicAdd(theta_grad + theta_offset + 3, out_grad_y * h_coor);
atomicAdd(theta_grad + theta_offset + 4, out_grad_y * w_coor);
atomicAdd(theta_grad + theta_offset + 5, out_grad_y);
}
}
template <typename T>
class AffineGridOpCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* theta = ctx.Input<Tensor>("Theta");
int n = theta->dims()[0];
auto size_attr = ctx.Attr<std::vector<int>>("output_shape");
auto align_corners = ctx.Attr<bool>("align_corners");
int h = 0;
int w = 0;
if (size_attr.size() == 0) {
auto* output_shape = ctx.Input<Tensor>("OutputShape");
Tensor h_sizes;
framework::TensorCopy(*output_shape, platform::CPUPlace(), &h_sizes);
const int* h_size_data = h_sizes.data<int>();
h = h_size_data[2];
w = h_size_data[3];
} else {
h = size_attr[2];
w = size_attr[3];
}
auto* output = ctx.Output<Tensor>("Output");
T* out_data = output->mutable_data<T>({n, h, w, 2}, ctx.GetPlace());
T h_step;
T w_step;
T h_start = -1;
T w_start = -1;
if (align_corners) {
h_step = static_cast<T>(2) / static_cast<T>(h - 1);
w_step = static_cast<T>(2) / static_cast<T>(w - 1);
} else {
h_step = static_cast<T>(2) / static_cast<T>(h);
w_step = static_cast<T>(2) / static_cast<T>(w);
h_start *= static_cast<T>(h - 1) / static_cast<T>(h);
w_start *= static_cast<T>(w - 1) / static_cast<T>(w);
}
const int count = n * h * w;
int block = 512;
int grid = (count + block - 1) / block;
auto cu_stream = ctx.cuda_device_context().stream();
affine_grid_kernel<<<grid, block, 0, cu_stream>>>(
count, n, h, w, h_start, w_start, h_step, w_step,
theta->data<T>(), // N, 2, 3
out_data);
}
};
template <typename T>
class AffineGridGradOpCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto output_grad = ctx.Input<Tensor>(framework::GradVarName("Output"));
auto theta_grad = ctx.Output<Tensor>(framework::GradVarName("Theta"));
int n = output_grad->dims()[0];
auto size_attr = ctx.Attr<std::vector<int>>("output_shape");
auto align_corners = ctx.Attr<bool>("align_corners");
int h = 0;
int w = 0;
if (size_attr.size() == 0) {
auto* output_shape = ctx.Input<Tensor>("OutputShape");
Tensor h_sizes;
framework::TensorCopy(*output_shape, platform::CPUPlace(), &h_sizes);
const int* h_size_data = h_sizes.data<int>();
h = h_size_data[2];
w = h_size_data[3];
} else {
h = size_attr[2];
w = size_attr[3];
}
T* theta_grad_data = theta_grad->mutable_data<T>({n, 2, 3}, ctx.GetPlace());
math::SetConstant<paddle::platform::CUDADeviceContext, T>()(
ctx.cuda_device_context(), theta_grad, static_cast<T>(0));
T h_step;
T w_step;
T h_start = -1;
T w_start = -1;
if (align_corners) {
h_step = static_cast<T>(2) / static_cast<T>(h - 1);
w_step = static_cast<T>(2) / static_cast<T>(w - 1);
} else {
h_step = static_cast<T>(2) / static_cast<T>(h);
w_step = static_cast<T>(2) / static_cast<T>(w);
h_start *= static_cast<T>(h - 1) / static_cast<T>(h);
w_start *= static_cast<T>(w - 1) / static_cast<T>(w);
}
const int count = n * h * w;
VLOG(3) << "count: " << count << "; h_step: " << h_step
<< "; w_step: " << w_step << "; h_start: " << h_start
<< "; w_start: " << w_start;
int block = 512;
int grid = (count + block - 1) / block;
auto cu_stream = ctx.cuda_device_context().stream();
affine_grid_grad_kernel<<<grid, block, 0, cu_stream>>>(
count, n, h, w, h_start, w_start, h_step, w_step,
output_grad->data<T>(), theta_grad_data);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(affine_grid, ops::AffineGridOpCUDAKernel<float>,
ops::AffineGridOpCUDAKernel<double>);
REGISTER_OP_CUDA_KERNEL(affine_grid_grad,
ops::AffineGridGradOpCUDAKernel<float>,
ops::AffineGridGradOpCUDAKernel<double>);
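The Linspace functor and the CUDA kernels above compute each normalized coordinate as `start + step * index`, with a different `start`/`step` pair when `align_corners` is false. A minimal NumPy sketch of that formula follows; it is an illustration only, and `grid_coords` is a hypothetical helper name.

```python
import numpy as np

def grid_coords(size, align_corners):
    """Normalized coordinates produced by the start/step logic in the kernels above."""
    start = -1.0
    if align_corners:
        step = 2.0 / (size - 1)
    else:
        step = 2.0 / size
        start *= (size - 1) / size   # shrink the range so samples sit at pixel centers
    return start + step * np.arange(size)

print(grid_coords(5, True))    # [-1.  -0.5  0.   0.5  1. ]
print(grid_coords(5, False))   # [-0.8 -0.4  0.   0.4  0.8]
```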
...@@ -37,12 +37,13 @@ using Array4 = Eigen::DSizes<int64_t, 4>; ...@@ -37,12 +37,13 @@ using Array4 = Eigen::DSizes<int64_t, 4>;
*/ */
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
struct Linspace { struct Linspace {
void operator()(T start, T end, int count, framework::Tensor* numbers, void operator()(T start, T end, int count, bool align_corners,
framework::Tensor* numbers,
const framework::ExecutionContext& ctx); const framework::ExecutionContext& ctx);
}; };
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
inline void GetIdxMap(int n, int h, int w, Tensor* grid, inline void GetIdxMap(int n, int h, int w, bool align_corners, Tensor* grid,
const framework::ExecutionContext& ctx) { const framework::ExecutionContext& ctx) {
auto& place = *ctx.template device_context<DeviceContext>().eigen_device(); auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
grid->mutable_data<T>({n, h, w, 3}, ctx.GetPlace()); grid->mutable_data<T>({n, h, w, 3}, ctx.GetPlace());
...@@ -50,16 +51,19 @@ inline void GetIdxMap(int n, int h, int w, Tensor* grid, ...@@ -50,16 +51,19 @@ inline void GetIdxMap(int n, int h, int w, Tensor* grid,
// Get indexes of height with shape [height, width, 1] // Get indexes of height with shape [height, width, 1]
Tensor h_idx; Tensor h_idx;
Linspace<DeviceContext, T> linspace; Linspace<DeviceContext, T> linspace;
linspace((T)-1, (T)1, h, &h_idx, ctx); linspace((T)-1, (T)1, h, align_corners, &h_idx, ctx);
auto h_idx_t = EigenTensor<T, 1>::From(h_idx); auto h_idx_t = EigenTensor<T, 1>::From(h_idx);
// Get indexes of width with shape [height, width, 1] // Get indexes of width with shape [height, width, 1]
Tensor w_idx; Tensor w_idx;
linspace((T)-1, (T)1, w, &w_idx, ctx); linspace((T)-1, (T)1, w, align_corners, &w_idx, ctx);
auto w_idx_t = EigenTensor<T, 1>::From(w_idx); auto w_idx_t = EigenTensor<T, 1>::From(w_idx);
// Get constant ones tensor with shape [height, width, 1] // Get constant ones tensor with shape [height, width, 1]
Tensor ones; Tensor ones;
ones.mutable_data<T>({h, w, 1}, ctx.GetPlace()); ones.mutable_data<T>({h, w, 1}, ctx.GetPlace());
auto ones_t = EigenTensor<T, 3>::From(ones).setConstant((T)1);
math::SetConstant<DeviceContext, T>()(
ctx.template device_context<DeviceContext>(), &ones, static_cast<T>(1));
auto ones_t = EigenTensor<T, 3>::From(ones);
// Get grid tensor with shape [n, h, w, 3] by concatenating h_idx, w_idx and // Get grid tensor with shape [n, h, w, 3] by concatenating h_idx, w_idx and
// ones // ones
Tensor w_idx_map; Tensor w_idx_map;
...@@ -74,11 +78,9 @@ inline void GetIdxMap(int n, int h, int w, Tensor* grid, ...@@ -74,11 +78,9 @@ inline void GetIdxMap(int n, int h, int w, Tensor* grid,
Tensor w_h_one_idx_map; Tensor w_h_one_idx_map;
w_h_one_idx_map.mutable_data<T>({h, w, 3}, ctx.GetPlace()); w_h_one_idx_map.mutable_data<T>({h, w, 3}, ctx.GetPlace());
auto w_h_one_idx_map_t = EigenTensor<T, 3>::From(w_h_one_idx_map); auto w_h_one_idx_map_t = EigenTensor<T, 3>::From(w_h_one_idx_map);
w_idx_map_t.device(place) = w_idx_t.reshape(Array2(1, w)) w_idx_map_t.device(place) = w_idx_t.reshape(Array2(1, w))
.broadcast(Array2(h, 1)) .broadcast(Array2(h, 1))
.reshape(Array3(h, w, 1)); .reshape(Array3(h, w, 1));
h_idx_map_t.device(place) = h_idx_t.reshape(Array2(1, h)) h_idx_map_t.device(place) = h_idx_t.reshape(Array2(1, h))
.broadcast(Array2(w, 1)) .broadcast(Array2(w, 1))
.shuffle(Array2(1, 0)) .shuffle(Array2(1, 0))
...@@ -97,6 +99,7 @@ class AffineGridOpKernel : public framework::OpKernel<T> { ...@@ -97,6 +99,7 @@ class AffineGridOpKernel : public framework::OpKernel<T> {
auto* theta = ctx.Input<Tensor>("Theta"); auto* theta = ctx.Input<Tensor>("Theta");
int n = theta->dims()[0]; int n = theta->dims()[0];
auto size_attr = ctx.Attr<std::vector<int>>("output_shape"); auto size_attr = ctx.Attr<std::vector<int>>("output_shape");
auto align_corners = ctx.Attr<bool>("align_corners");
int h = 0; int h = 0;
int w = 0; int w = 0;
if (size_attr.size() == 0) { if (size_attr.size() == 0) {
...@@ -116,7 +119,7 @@ class AffineGridOpKernel : public framework::OpKernel<T> { ...@@ -116,7 +119,7 @@ class AffineGridOpKernel : public framework::OpKernel<T> {
ctx.template device_context<DeviceContext>(), output, ctx.template device_context<DeviceContext>(), output,
static_cast<T>(0)); static_cast<T>(0));
Tensor grid; Tensor grid;
GetIdxMap<DeviceContext, T>(n, h, w, &grid, ctx); GetIdxMap<DeviceContext, T>(n, h, w, align_corners, &grid, ctx);
// output = grid * theta.T // output = grid * theta.T
// TODO(wanghaoshuang): Refine batched matrix multiply // TODO(wanghaoshuang): Refine batched matrix multiply
auto blas = math::GetBlas<DeviceContext, T>(ctx); auto blas = math::GetBlas<DeviceContext, T>(ctx);
...@@ -140,6 +143,7 @@ class AffineGridGradOpKernel : public framework::OpKernel<T> { ...@@ -140,6 +143,7 @@ class AffineGridGradOpKernel : public framework::OpKernel<T> {
auto theta_grad = ctx.Output<Tensor>(framework::GradVarName("Theta")); auto theta_grad = ctx.Output<Tensor>(framework::GradVarName("Theta"));
int n = output_grad->dims()[0]; int n = output_grad->dims()[0];
auto size_attr = ctx.Attr<std::vector<int>>("output_shape"); auto size_attr = ctx.Attr<std::vector<int>>("output_shape");
auto align_corners = ctx.Attr<bool>("align_corners");
int h = 0; int h = 0;
int w = 0; int w = 0;
if (size_attr.size() == 0) { if (size_attr.size() == 0) {
...@@ -158,7 +162,7 @@ class AffineGridGradOpKernel : public framework::OpKernel<T> { ...@@ -158,7 +162,7 @@ class AffineGridGradOpKernel : public framework::OpKernel<T> {
ctx.template device_context<DeviceContext>(), theta_grad, ctx.template device_context<DeviceContext>(), theta_grad,
static_cast<T>(0)); static_cast<T>(0));
Tensor grid; Tensor grid;
GetIdxMap<DeviceContext, T>(n, h, w, &grid, ctx); GetIdxMap<DeviceContext, T>(n, h, w, align_corners, &grid, ctx);
// output = grid * theta.T // output = grid * theta.T
// TODO(wanghaoshuang): Refine batched matrix multiply // TODO(wanghaoshuang): Refine batched matrix multiply
auto blas = math::GetBlas<DeviceContext, T>(ctx); auto blas = math::GetBlas<DeviceContext, T>(ctx);
......
...@@ -22,9 +22,11 @@ namespace operators { ...@@ -22,9 +22,11 @@ namespace operators {
class AllcloseOpMaker : public framework::OpProtoAndCheckerMaker { class AllcloseOpMaker : public framework::OpProtoAndCheckerMaker {
public: public:
void Make() override { void Make() override {
AddInput("Input", "The first input tensor to compare."); AddInput("Input",
AddInput("Other", "The second input tensor to compare."); "The input tensor, it's data type should be float32, float64.");
AddOutput("Out", "The output tensor of allclose op."); AddInput("Other",
"The input tensor, it's data type should be float32, float64.");
AddOutput("Out", "The output tensor, it's data type is bool.");
AddAttr<float>("rtol", "The relative tolerance. Default: :math:`1e-5` .") AddAttr<float>("rtol", "The relative tolerance. Default: :math:`1e-5` .")
.SetDefault(1e-5); .SetDefault(1e-5);
...@@ -36,11 +38,12 @@ class AllcloseOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -36,11 +38,12 @@ class AllcloseOpMaker : public framework::OpProtoAndCheckerMaker {
.SetDefault(false); .SetDefault(false);
AddComment(R"DOC( AddComment(R"DOC(
This operator checks if all :math:`input` and :math:`other` satisfy the condition: This operator checks if all :math:`x` and :math:`y` satisfy the condition:
:math:`\left| input - other \right| \leq atol + rtol \times \left| other \right|` .. math::
\left| x - y \right| \leq atol + rtol \times \left| y \right|
elementwise, for all elements of :math:`input` and :math:`other`. The behaviour of this elementwise, for all elements of :math:`x` and :math:`y`. The behaviour of this
operator is analogous to :math:`numpy.allclose`, namely that it returns :math:`True` if operator is analogous to :math:`numpy.allclose`, namely that it returns :math:`True` if
two tensors are elementwise equal within a tolerance. two tensors are elementwise equal within a tolerance.
)DOC"); )DOC");
......
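The condition in the comment above, |x - y| <= atol + rtol * |y| reduced with a logical AND over all elements, can be written directly in NumPy. A minimal sketch, assuming the same defaults as numpy.allclose (the op's atol default is not visible in this hunk, so treating it as 1e-8 is an assumption):

```python
import numpy as np

def allclose_ref(x, y, rtol=1e-5, atol=1e-8, equal_nan=False):
    """Elementwise |x - y| <= atol + rtol * |y|, reduced with all()."""
    close = np.abs(x - y) <= atol + rtol * np.abs(y)
    if equal_nan:
        close = close | (np.isnan(x) & np.isnan(y))
    return bool(np.all(close))

x = np.array([1.0, 2.0, 3.0])
y = np.array([1.0, 2.0, 3.0 + 1e-7])
print(allclose_ref(x, y), np.allclose(x, y))   # both True for the default tolerances
```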
...@@ -12,18 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,18 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/arg_min_max_op_base.h" #include "paddle/fluid/operators/arg_min_max_op_base.cu.h"
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
arg_max, arg_max, paddle::operators::ArgMinMaxOpCUDAKernel<float, cub::ArgMax>,
paddle::operators::ArgMaxKernel<paddle::platform::CUDADeviceContext, float>, paddle::operators::ArgMinMaxOpCUDAKernel<double, cub::ArgMax>,
paddle::operators::ArgMaxKernel<paddle::platform::CUDADeviceContext, paddle::operators::ArgMinMaxOpCUDAKernel<int64_t, cub::ArgMax>,
double>, paddle::operators::ArgMinMaxOpCUDAKernel<int32_t, cub::ArgMax>,
paddle::operators::ArgMaxKernel<paddle::platform::CUDADeviceContext, paddle::operators::ArgMinMaxOpCUDAKernel<int8_t, cub::ArgMax>);
int64_t>,
paddle::operators::ArgMaxKernel<paddle::platform::CUDADeviceContext,
int32_t>,
paddle::operators::ArgMaxKernel<paddle::platform::CUDADeviceContext,
int16_t>,
paddle::operators::ArgMaxKernel<paddle::platform::CUDADeviceContext,
uint8_t>);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifdef __NVCC__
#include <cub/cub.cuh>
#include <limits>
#include <string>
#include <typeinfo>
#include <vector>
#include "paddle/fluid/framework/ddim.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/operators/transpose_op.h"
#include "paddle/fluid/platform/device_context.h"
namespace paddle {
namespace operators {
namespace { // NOLINT
template <typename K, typename V>
using KeyValuePair = cub::KeyValuePair<K, V>;
using Tensor = framework::Tensor;
} // end namespace
#define FIXED_BLOCK_DIM_CASE_BASE(log2_block_dim, ...) \
case (1 << (log2_block_dim)): { \
constexpr auto kBlockDim = (1 << (log2_block_dim)); \
__VA_ARGS__; \
} break
#define FIXED_BLOCK_DIM_CASE(...) \
FIXED_BLOCK_DIM_CASE_BASE(10, ##__VA_ARGS__); \
FIXED_BLOCK_DIM_CASE_BASE(9, ##__VA_ARGS__); \
FIXED_BLOCK_DIM_CASE_BASE(8, ##__VA_ARGS__); \
FIXED_BLOCK_DIM_CASE_BASE(7, ##__VA_ARGS__); \
FIXED_BLOCK_DIM_CASE_BASE(6, ##__VA_ARGS__); \
FIXED_BLOCK_DIM_CASE_BASE(5, ##__VA_ARGS__); \
FIXED_BLOCK_DIM_CASE_BASE(4, ##__VA_ARGS__); \
FIXED_BLOCK_DIM_CASE_BASE(3, ##__VA_ARGS__);
template <typename T, typename IndType, class Reducer, size_t BlockDim>
__global__ void ArgCUDAKernel(const int64_t height, // n * h
const int64_t width, // c
const int64_t post_size, // h
const Reducer reducer, const T init, const T* in,
IndType* out) {
typedef cub::BlockReduce<KeyValuePair<int, T>, BlockDim> BlockReduce;
__shared__ typename BlockReduce::TempStorage temp_storage;
for (int idx = blockIdx.x; idx < height; idx += gridDim.x) {
KeyValuePair<int, T> kv_pair = {-1, init};
int h = idx / post_size;
int w = idx % post_size;
for (int k = threadIdx.x; k < width; k += blockDim.x) {
kv_pair =
reducer({k, in[h * width * post_size + k * post_size + w]}, kv_pair);
}
kv_pair = BlockReduce(temp_storage).Reduce(kv_pair, reducer);
if (threadIdx.x == 0) {
out[idx] = static_cast<IndType>(kv_pair.key);
}
__syncthreads();
}
}
template <typename T, typename IndType, class Reducer>
void ComputeFullArg(const platform::CUDADeviceContext& ctx, const Tensor& input,
Tensor* indices, const int64_t pre, const int64_t post,
const int64_t n) {
auto cu_stream = ctx.stream();
auto ComputeBlockSize = [](int64_t col) {
if (col > 512)
return 1024;
else if (col > 256)
return 512;
else if (col > 128)
return 256;
else if (col > 64)
return 128;
else if (col > 32)
return 64;
else if (col > 16)
return 32;
else if (col > 8)
return 16;
else
return 8;
};
int64_t max_grid_dimx = ctx.GetCUDAMaxGridDimSize().x;
int64_t height = pre * post;
int64_t width = n;
int64_t grid_size = height < max_grid_dimx ? height : max_grid_dimx;
const T* in_data = input.data<T>();
IndType* out_data = indices->mutable_data<IndType>(ctx.GetPlace());
if (typeid(Reducer) == typeid(cub::ArgMax)) {
switch (ComputeBlockSize(width)) {
FIXED_BLOCK_DIM_CASE(
ArgCUDAKernel<T, IndType, Reducer,
kBlockDim><<<grid_size, kBlockDim, 0, cu_stream>>>(
height, width, post, Reducer(), std::numeric_limits<T>::lowest(),
in_data, out_data));
}
} else {
switch (ComputeBlockSize(width)) {
FIXED_BLOCK_DIM_CASE(
ArgCUDAKernel<T, IndType, Reducer,
kBlockDim><<<grid_size, kBlockDim, 0, cu_stream>>>(
height, width, post, Reducer(), std::numeric_limits<T>::max(),
in_data, out_data));
}
}
}
template <typename T, class Reducer>
struct VisitDataCudaArgMinMaxFunctor {
const framework::ExecutionContext& ctx;
explicit VisitDataCudaArgMinMaxFunctor(const framework::ExecutionContext& ctx)
: ctx(ctx) {}
template <typename IndType>
void apply() const {
auto* input = ctx.Input<Tensor>("X");
auto* output = ctx.Output<Tensor>("Out");
int axis = ctx.Attr<int64_t>("axis");
const bool& flatten = ctx.Attr<bool>("flatten");
framework::DDim input_dims;
if (flatten) {
input_dims = framework::make_ddim({input->numel()});
// if flatten, the axis is just 0
axis = 0;
} else {
input_dims = input->dims();
if (axis < 0) axis += input->dims().size();
}
int64_t numel = input->numel();
int64_t groups = numel / input_dims[axis];
int64_t pre = 1;
int64_t post = 1;
int64_t n = input_dims[axis];
for (int i = 0; i < axis; i++) {
pre *= input_dims[i];
}
for (int i = axis + 1; i < input_dims.size(); i++) {
post *= input_dims[i];
}
const auto& dev_ctx = ctx.cuda_device_context();
ComputeFullArg<T, IndType, Reducer>(dev_ctx, *input, output, pre, post, n);
}
};
template <typename T, class Reducer>
class ArgMinMaxOpCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto& dtype = ctx.Attr<int>("dtype");
if (dtype < 0) {
framework::VisitDataType(static_cast<framework::proto::VarType::Type>(
framework::proto::VarType::INT64),
VisitDataCudaArgMinMaxFunctor<T, Reducer>(ctx));
return;
}
framework::VisitDataType(
static_cast<framework::proto::VarType::Type>(dtype),
VisitDataCudaArgMinMaxFunctor<T, Reducer>(ctx));
}
};
#endif
} // namespace operators
} // namespace paddle
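`ComputeFullArg` above flattens the input into a (pre, n, post) view and reduces over the middle dimension; the kernel's `in[h * width * post + k * post + w]` indexing is row-major indexing of exactly that view. A NumPy sketch of the same decomposition (illustrative only; `argmax_pre_post` is a hypothetical name):

```python
import numpy as np

def argmax_pre_post(x, axis):
    """Mirror the (pre, n, post) flattening used by ComputeFullArg above."""
    dims = x.shape
    pre = int(np.prod(dims[:axis], dtype=np.int64))
    n = dims[axis]
    post = int(np.prod(dims[axis + 1:], dtype=np.int64))
    flat = x.reshape(pre, n, post)
    out = np.empty((pre, post), dtype=np.int64)
    for h in range(pre):        # each grid index in the kernel is one (h, w) pair;
        for w in range(post):   # the block's threads cooperate on the reduction over k
            out[h, w] = np.argmax(flat[h, :, w])
    return out.reshape(dims[:axis] + dims[axis + 1:])

x = np.random.rand(2, 3, 4)
print(np.array_equal(argmax_pre_post(x, 1), np.argmax(x, axis=1)))   # True
```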
...@@ -38,8 +38,9 @@ struct ArgMinMaxFunctor {}; ...@@ -38,8 +38,9 @@ struct ArgMinMaxFunctor {};
struct ArgMinMaxFunctor<DeviceContext, T, Tout, Rank, \ struct ArgMinMaxFunctor<DeviceContext, T, Tout, Rank, \
enum_argminmax_value> { \ enum_argminmax_value> { \
void operator()(const DeviceContext& ctx, const framework::LoDTensor& in, \ void operator()(const DeviceContext& ctx, const framework::LoDTensor& in, \
framework::LoDTensor* out, int64_t axis, bool keepdims) { \ framework::LoDTensor* out, framework::DDim x_dims, \
auto in_eigen = framework::EigenTensor<T, Rank>::From(in); \ int64_t axis, bool keepdims) { \
auto in_eigen = framework::EigenTensor<T, Rank>::From(in, x_dims); \
if (keepdims) { \ if (keepdims) { \
auto out_eigen = framework::EigenTensor<Tout, Rank>::From(*out); \ auto out_eigen = framework::EigenTensor<Tout, Rank>::From(*out); \
out_eigen.device(*(ctx.eigen_device())) = \ out_eigen.device(*(ctx.eigen_device())) = \
...@@ -68,16 +69,26 @@ struct VisitDataArgMinMaxFunctor { ...@@ -68,16 +69,26 @@ struct VisitDataArgMinMaxFunctor {
out.template mutable_data<Tout>(ctx.GetPlace()); out.template mutable_data<Tout>(ctx.GetPlace());
auto axis = ctx.Attr<int64_t>("axis"); auto axis = ctx.Attr<int64_t>("axis");
auto keepdims = ctx.Attr<bool>("keepdims"); auto keepdims = ctx.Attr<bool>("keepdims");
auto x_rank = x.dims().size(); const bool& flatten = ctx.Attr<bool>("flatten");
if (axis < 0) axis += x_rank;
// if flatten, construct the new dims for the calculation
framework::DDim x_dims;
if (flatten) {
x_dims = framework::make_ddim({x.numel()});
// if flatten, the axis is just 0
axis = 0;
} else {
x_dims = x.dims();
if (axis < 0) axis += x_dims.size();
}
auto& dev_ctx = ctx.template device_context<DeviceContext>(); auto& dev_ctx = ctx.template device_context<DeviceContext>();
#define CALL_ARG_MINMAX_FUNCTOR(rank) \ #define CALL_ARG_MINMAX_FUNCTOR(rank) \
ArgMinMaxFunctor<DeviceContext, T, Tout, rank, EnumArgMinMaxValue> \ ArgMinMaxFunctor<DeviceContext, T, Tout, rank, EnumArgMinMaxValue> \
functor##rank; \ functor##rank; \
functor##rank(dev_ctx, x, &out, axis, keepdims) functor##rank(dev_ctx, x, &out, x_dims, axis, keepdims)
switch (x.dims().size()) { switch (x_dims.size()) {
case 1: case 1:
CALL_ARG_MINMAX_FUNCTOR(1); CALL_ARG_MINMAX_FUNCTOR(1);
break; break;
...@@ -141,6 +152,7 @@ class ArgMinMaxOp : public framework::OperatorWithKernel { ...@@ -141,6 +152,7 @@ class ArgMinMaxOp : public framework::OperatorWithKernel {
const auto& x_dims = ctx->GetInputDim("X"); const auto& x_dims = ctx->GetInputDim("X");
int64_t axis = ctx->Attrs().Get<int64_t>("axis"); int64_t axis = ctx->Attrs().Get<int64_t>("axis");
bool keepdims = ctx->Attrs().Get<bool>("keepdims"); bool keepdims = ctx->Attrs().Get<bool>("keepdims");
const bool& flatten = ctx->Attrs().Get<bool>("flatten");
PADDLE_ENFORCE_GE(axis, -x_dims.size(), PADDLE_ENFORCE_GE(axis, -x_dims.size(),
platform::errors::InvalidArgument( platform::errors::InvalidArgument(
...@@ -152,14 +164,21 @@ class ArgMinMaxOp : public framework::OperatorWithKernel { ...@@ -152,14 +164,21 @@ class ArgMinMaxOp : public framework::OperatorWithKernel {
platform::errors::InvalidArgument( platform::errors::InvalidArgument(
"'axis'(%d) must be less than Rank(X)(%d).", axis, x_dims.size())); "'axis'(%d) must be less than Rank(X)(%d).", axis, x_dims.size()));
std::vector<int64_t> vec;
if (flatten) {
// if flatten, only one element is returned
if (keepdims) {
vec.emplace_back(static_cast<int64_t>(1));
}
} else {
auto x_rank = x_dims.size(); auto x_rank = x_dims.size();
if (axis < 0) axis += x_rank; if (axis < 0) axis += x_rank;
std::vector<int64_t> vec; for (int64_t i = 0; i < axis; i++) vec.emplace_back(x_dims[i]);
for (int64_t i = 0; i < axis; i++) vec.push_back(x_dims[i]);
if (keepdims) { if (keepdims) {
vec.push_back(static_cast<int64_t>(1)); vec.emplace_back(static_cast<int64_t>(1));
}
for (int64_t i = axis + 1; i < x_rank; i++) vec.emplace_back(x_dims[i]);
} }
for (int64_t i = axis + 1; i < x_rank; i++) vec.push_back(x_dims[i]);
ctx->SetOutputDim("Out", framework::make_ddim(vec)); ctx->SetOutputDim("Out", framework::make_ddim(vec));
} }
}; };
...@@ -176,6 +195,9 @@ class BaseArgMinMaxOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -176,6 +195,9 @@ class BaseArgMinMaxOpMaker : public framework::OpProtoAndCheckerMaker {
AddAttr<int64_t>("axis", "The axis in which to compute the arg indics."); AddAttr<int64_t>("axis", "The axis in which to compute the arg indics.");
AddAttr<bool>("keepdims", "Keep the dim that to reduce.").SetDefault(false); AddAttr<bool>("keepdims", "Keep the dim that to reduce.").SetDefault(false);
AddAttr<int>("dtype", "Keep the dim that to reduce.").SetDefault(-1); AddAttr<int>("dtype", "Keep the dim that to reduce.").SetDefault(-1);
AddAttr<bool>("flatten",
"Flatten the input value, and search the min or max indices")
.SetDefault(false);
AddComment(string::Sprintf(R"DOC( AddComment(string::Sprintf(R"DOC(
%s Operator. %s Operator.
......
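The `flatten` attribute added above changes only the shape logic: the search runs over all elements with the axis fixed to 0, and the output is a scalar, or `[1]` when `keepdims` is set. A minimal NumPy sketch of that behaviour (not the Paddle API):

```python
import numpy as np

def argmax_with_flatten(x, axis=0, keepdims=False, flatten=False):
    """Output-shape behaviour of the flatten attribute added above."""
    if flatten:
        idx = np.argmax(x.reshape(-1))   # treat the input as 1-D, axis becomes 0
        return np.array([idx]) if keepdims else np.array(idx)
    out = np.argmax(x, axis=axis)
    return np.expand_dims(out, axis) if keepdims else out

x = np.arange(6).reshape(2, 3)
print(argmax_with_flatten(x, flatten=True))                  # 5, shape ()
print(argmax_with_flatten(x, flatten=True, keepdims=True))   # [5], shape (1,)
print(argmax_with_flatten(x, axis=1).shape)                  # (2,)
```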
...@@ -12,18 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,18 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/arg_min_max_op_base.h" #include "paddle/fluid/operators/arg_min_max_op_base.cu.h"
REGISTER_OP_CUDA_KERNEL( REGISTER_OP_CUDA_KERNEL(
arg_min, arg_min, paddle::operators::ArgMinMaxOpCUDAKernel<float, cub::ArgMin>,
paddle::operators::ArgMinKernel<paddle::platform::CUDADeviceContext, float>, paddle::operators::ArgMinMaxOpCUDAKernel<double, cub::ArgMin>,
paddle::operators::ArgMinKernel<paddle::platform::CUDADeviceContext, paddle::operators::ArgMinMaxOpCUDAKernel<int64_t, cub::ArgMin>,
double>, paddle::operators::ArgMinMaxOpCUDAKernel<int32_t, cub::ArgMin>,
paddle::operators::ArgMinKernel<paddle::platform::CUDADeviceContext, paddle::operators::ArgMinMaxOpCUDAKernel<int8_t, cub::ArgMin>);
int64_t>,
paddle::operators::ArgMinKernel<paddle::platform::CUDADeviceContext,
int32_t>,
paddle::operators::ArgMinKernel<paddle::platform::CUDADeviceContext,
int16_t>,
paddle::operators::ArgMinKernel<paddle::platform::CUDADeviceContext,
uint8_t>);
...@@ -32,22 +32,29 @@ class BCELossOp : public framework::OperatorWithKernel { ...@@ -32,22 +32,29 @@ class BCELossOp : public framework::OperatorWithKernel {
OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "BCELoss"); OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "BCELoss");
auto x_dims = ctx->GetInputDim("X"); auto x_dims = ctx->GetInputDim("X");
auto label_dims = ctx->GetInputDim("Label"); auto labels_dims = ctx->GetInputDim("Label");
PADDLE_ENFORCE_EQ(
x_dims.size(), label_dims.size(), int rank = x_dims.size();
PADDLE_ENFORCE_EQ(rank, labels_dims.size(),
platform::errors::InvalidArgument( platform::errors::InvalidArgument(
"Input(X) and Input(Label) shall have the same shape.")); "Input(X) and Input(Label) shall have the same rank."
bool contain_unknown_dim = framework::contain_unknown_dim(x_dims) || "But received: the rank of Input(X) is [%d], "
framework::contain_unknown_dim(label_dims); "the rank of Input(Label) is [%d].",
bool check = ctx->IsRuntime() || !contain_unknown_dim; rank, labels_dims.size()));
bool check = true;
if ((!ctx->IsRuntime()) && (framework::product(x_dims) <= 0 ||
framework::product(labels_dims) <= 0)) {
check = false;
}
if (check) { if (check) {
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(x_dims, labels_dims,
x_dims.size(), label_dims.size(),
platform::errors::InvalidArgument( platform::errors::InvalidArgument(
"ShapeError: Input(X) and Input(Label) shall have the same shape " "Input(X) and Input(Label) shall have the same "
"But received: the shape of Input(X) is [%s], the shape of " "shape. But received: the shape of Input(X) is "
"Input(Label) is [%s].", "[%s], the shape of Input(Label) is [%s].",
x_dims, label_dims)); x_dims, labels_dims));
} }
ctx->ShareDim("X", "Out"); ctx->ShareDim("X", "Out");
...@@ -76,20 +83,31 @@ class BCELossGradOp : public framework::OperatorWithKernel { ...@@ -76,20 +83,31 @@ class BCELossGradOp : public framework::OperatorWithKernel {
framework::GradVarName("X"), "BCELossGrad"); framework::GradVarName("X"), "BCELossGrad");
auto x_dims = ctx->GetInputDim("X"); auto x_dims = ctx->GetInputDim("X");
auto labels_dims = ctx->GetInputDim("Label");
auto dout_dims = ctx->GetInputDim(framework::GradVarName("Out")); auto dout_dims = ctx->GetInputDim(framework::GradVarName("Out"));
bool contain_unknown_dim = framework::contain_unknown_dim(x_dims) ||
framework::contain_unknown_dim(dout_dims); bool check = true;
bool check = ctx->IsRuntime() || !contain_unknown_dim; if ((!ctx->IsRuntime()) && (framework::product(x_dims) <= 0 ||
framework::product(labels_dims) <= 0)) {
check = false;
}
if (check) { if (check) {
PADDLE_ENFORCE_EQ(x_dims, labels_dims,
platform::errors::InvalidArgument(
"Input(X) and Input(Label) shall have the same "
"shape. But received: the shape of Input(X) is "
"[%s], the shape of Input(Label) is [%s].",
x_dims, labels_dims));
PADDLE_ENFORCE_EQ(x_dims, dout_dims, PADDLE_ENFORCE_EQ(x_dims, dout_dims,
platform::errors::InvalidArgument( platform::errors::InvalidArgument(
"ShapeError:The Input(X) and Input(Out@Grad) " "Input(X) and Input(Out@Grad) shall have the same "
"should have the same " "shape. But received: the shape of Input(X) is "
"shape, But received: the shape of Input(X) is " "[%s], the shape of Input(Out@Grad) is [%s].",
"[%s], the shape of "
"Input(Out@GRAD) is [%s].",
x_dims, dout_dims)); x_dims, dout_dims));
} }
ctx->SetOutputDim(framework::GradVarName("X"), x_dims); ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
ctx->ShareLoD("X", framework::GradVarName("X")); ctx->ShareLoD("X", framework::GradVarName("X"));
} }
......
...@@ -67,7 +67,8 @@ class BCELossCUDAKernel : public framework::OpKernel<T> { ...@@ -67,7 +67,8 @@ class BCELossCUDAKernel : public framework::OpKernel<T> {
auto x_data = x->data<T>(); auto x_data = x->data<T>();
auto out_data = out->mutable_data<T>(ctx.GetPlace()); auto out_data = out->mutable_data<T>(ctx.GetPlace());
int x_numel = x->numel(); auto x_numel = x->numel();
platform::GpuLaunchConfig config = platform::GpuLaunchConfig config =
platform::getGpuLaunchConfig(x_numel, ctx); platform::getGpuLaunchConfig(x_numel, ctx);
...@@ -75,7 +76,7 @@ class BCELossCUDAKernel : public framework::OpKernel<T> { ...@@ -75,7 +76,7 @@ class BCELossCUDAKernel : public framework::OpKernel<T> {
framework::TensorCopy(*x, platform::CPUPlace(), &x_cpu); framework::TensorCopy(*x, platform::CPUPlace(), &x_cpu);
T* x_cpu_data = x_cpu.data<T>(); T* x_cpu_data = x_cpu.data<T>();
for (int i = 0; i < x_numel; ++i) { for (int64_t i = 0; i < x_numel; ++i) {
PADDLE_ENFORCE_GE( PADDLE_ENFORCE_GE(
x_cpu_data[i], static_cast<T>(0), x_cpu_data[i], static_cast<T>(0),
platform::errors::InvalidArgument( platform::errors::InvalidArgument(
......
...@@ -34,11 +34,11 @@ class BCELossOpKernel : public framework::OpKernel<T> { ...@@ -34,11 +34,11 @@ class BCELossOpKernel : public framework::OpKernel<T> {
auto x_data = x->data<T>(); auto x_data = x->data<T>();
auto label_data = labels->data<T>(); auto label_data = labels->data<T>();
auto out_data = out->mutable_data<T>(ctx.GetPlace()); auto out_data = out->mutable_data<T>(ctx.GetPlace());
int x_numel = x->numel(); auto x_numel = x->numel();
// out = -(label * ln(x) + (1 - label) * ln(1 - x)) = (label - 1) * ln(1 - // out = -(label * ln(x) + (1 - label) * ln(1 - x)) = (label - 1) * ln(1 -
// x) - label * ln(x) // x) - label * ln(x)
for (int i = 0; i < x_numel; ++i) { for (int64_t i = 0; i < x_numel; ++i) {
PADDLE_ENFORCE_GE( PADDLE_ENFORCE_GE(
x_data[i], static_cast<T>(0), x_data[i], static_cast<T>(0),
platform::errors::InvalidArgument( platform::errors::InvalidArgument(
......
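The comment in the kernel above gives the elementwise loss as -(label * ln(x) + (1 - label) * ln(1 - x)), with the input required to lie in [0, 1] (the PADDLE_ENFORCE_GE/LE checks). A small NumPy reference, offered as a sketch; `bce_loss_ref` is a hypothetical helper:

```python
import numpy as np

def bce_loss_ref(x, label):
    """Elementwise -(label*ln(x) + (1-label)*ln(1-x)); x must lie in [0, 1]."""
    if np.any((x < 0) | (x > 1)):
        raise ValueError("the input of bce_loss should be in [0, 1]")
    return (label - 1.0) * np.log(1.0 - x) - label * np.log(x)

x = np.array([0.2, 0.8])
label = np.array([0.0, 1.0])
print(bce_loss_ref(x, label))   # [-ln(0.8), -ln(0.8)] ~= [0.2231, 0.2231]
```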
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/bernoulli_op.h"
#include <algorithm>
#include <string>
#include "paddle/fluid/framework/generator.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/operators/common_infer_shape_functions.h"
namespace paddle {
namespace operators {
class BernoulliOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X",
"A tensor with probabilities for generating the random binary "
"number");
AddOutput("Out", "A Tensor filled with random binary number");
AddComment(R"DOC(
This OP returns a Tensor filled with random binary (0 or 1) numbers sampled from a Bernoulli distribution.
Out ~ Bernoulli(X)
)DOC");
}
};
class BernoulliOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override {
return UnaryOpUnchangedInferShape(ctx);
}
};
// It seems that Eigen::Tensor::random will SEGFAULT on GPU.
// Use std::random on CPU and thrust::random (Thrust is the parallel algorithms
// library shipped with CUDA) to implement the uniform sampling.
template <typename T>
class BernoulliOpKernel<platform::CPUDeviceContext, T>
: public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
const auto x = ctx.Input<framework::Tensor>("X");
auto out = ctx.Output<framework::Tensor>("Out");
auto *in_data = x->data<T>();
auto *out_data = out->mutable_data<T>(ctx.GetPlace());
int64_t size = x->numel();
std::uniform_real_distribution<T> dist(0.0, 1.0);
auto gen_ptr = framework::Generator::GetInstance();
std::mt19937_64 &gen_engine = gen_ptr->GetCPUEngine();
for (int64_t i = 0; i < size; ++i) {
out_data[i] = BernoulliFunctor(in_data[i], dist(gen_engine));
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OPERATOR(
bernoulli, ops::BernoulliOp, ops::BernoulliOpMaker,
paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
REGISTER_OP_CPU_KERNEL(bernoulli,
ops::BernoulliOpKernel<plat::CPUDeviceContext, float>,
ops::BernoulliOpKernel<plat::CPUDeviceContext, double>);
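The CPU kernel above draws one uniform number per element and emits 1 when it falls below the probability, which is what `BernoulliFunctor` encodes. A minimal NumPy sketch of that sampling rule (illustrative; the fixed seed and the helper name are assumptions of the example):

```python
import numpy as np

def bernoulli_ref(p, rng=None):
    """out[i] = 1.0 if u[i] < p[i] with u ~ Uniform(0, 1), as in BernoulliFunctor."""
    if rng is None:
        rng = np.random.default_rng(0)
    if np.any((p < 0) | (p > 1)):
        raise ValueError("probabilities must lie in [0, 1]")
    return (rng.uniform(size=p.shape) < p).astype(p.dtype)

p = np.array([0.0, 0.3, 1.0])
print(bernoulli_ref(p))   # the first entry is always 0.0, the last is always 1.0
```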
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <thrust/execution_policy.h>
#include <thrust/random.h>
#include <thrust/transform.h>
#include "paddle/fluid/framework/generator.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/operators/bernoulli_op.h"
#include "paddle/fluid/platform/transform.h"
namespace paddle {
namespace operators {
// It can be made consistent with the CPU kernel once a CUDA Generator is provided.
template <typename T>
struct BernoulliCudaFunctor {
unsigned int seed_;
__host__ __device__ BernoulliCudaFunctor(int seed) : seed_(seed) {}
__host__ __device__ T operator()(const unsigned int n, const T p) const {
thrust::minstd_rand rng;
rng.seed(seed_);
thrust::uniform_real_distribution<T> dist(0.0, 1.0);
rng.discard(n);
return static_cast<T>(dist(rng) < p);
}
};
template <typename T>
class BernoulliOpKernel<platform::CUDADeviceContext, T>
: public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
std::random_device rd;
auto seed = rd();
const auto x = ctx.Input<framework::Tensor>("X");
auto out = ctx.Output<framework::Tensor>("Out");
auto* in_data = x->data<T>();
auto* out_data = out->mutable_data<T>(ctx.GetPlace());
int64_t size = x->numel();
thrust::counting_iterator<unsigned int> index_sequence_begin(0);
platform::Transform<platform::CUDADeviceContext> trans;
auto* context =
static_cast<const platform::CUDADeviceContext*>(&ctx.device_context());
trans(*context, index_sequence_begin, index_sequence_begin + size, in_data,
out_data, BernoulliCudaFunctor<T>(seed));
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(
bernoulli, ops::BernoulliOpKernel<plat::CUDADeviceContext, float>,
ops::BernoulliOpKernel<plat::CUDADeviceContext, double>);
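The CUDA kernel above gives every output index its own random stream (seed the engine, then `discard(n)`), so the result does not depend on how threads are scheduled. The sketch below illustrates the same counter-based idea with NumPy generators keyed by (seed, index); it is not the minstd_rand scheme used above, just an analogy, and `bernoulli_counter_based` is a hypothetical name.

```python
import numpy as np

def bernoulli_counter_based(p, seed):
    """Each element i draws from a stream keyed by (seed, i), mimicking rng.discard(n)."""
    flat = p.reshape(-1)
    u = np.array([np.random.default_rng([seed, i]).uniform() for i in range(flat.size)])
    return (u < flat).astype(p.dtype).reshape(p.shape)

p = np.array([0.1, 0.5, 0.9])
print(bernoulli_counter_based(p, seed=123))
print(bernoulli_counter_based(p, seed=123))   # identical: independent of evaluation order
```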
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/hostdevice.h"
namespace paddle {
namespace operators {
/**
* Samples a bernoulli distribution given a probability input
*/
template <typename T>
inline HOSTDEVICE T BernoulliFunctor(T p, T rand) {
PADDLE_ENFORCE_LE(p, 1, platform::errors::OutOfRange(
"The probability should be <= 1, but got %f", p));
PADDLE_ENFORCE_GE(p, 0, platform::errors::OutOfRange(
"The probability should be >= 0, but got %f", p));
return static_cast<T>(rand < p);
}
template <typename DeviceContext, typename T>
class BernoulliOpKernel;
} // namespace operators
} // namespace paddle
...@@ -63,7 +63,6 @@ class CholeskyGPUKernel : public framework::OpKernel<T> { ...@@ -63,7 +63,6 @@ class CholeskyGPUKernel : public framework::OpKernel<T> {
for_range(matrix_band_part_functor); for_range(matrix_band_part_functor);
} }
// TODO(guosheng): Add callback to check info
auto info = memory::Alloc(dev_ctx, sizeof(int) * batch_count); auto info = memory::Alloc(dev_ctx, sizeof(int) * batch_count);
auto* info_ptr = reinterpret_cast<int*>(info->ptr()); auto* info_ptr = reinterpret_cast<int*>(info->ptr());
...@@ -96,6 +95,20 @@ class CholeskyGPUKernel : public framework::OpKernel<T> { ...@@ -96,6 +95,20 @@ class CholeskyGPUKernel : public framework::OpKernel<T> {
#if CUDA_VERSION >= 9020 && !defined(_WIN32) #if CUDA_VERSION >= 9020 && !defined(_WIN32)
} }
#endif #endif
// check the info
std::vector<int> error_info; // only used to check that the input is positive definite
error_info.resize(batch_count);
memory::Copy(platform::CPUPlace(), error_info.data(),
BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()),
info_ptr, sizeof(int) * batch_count, dev_ctx.stream());
for (int i = 0; i < batch_count; ++i) {
PADDLE_ENFORCE_EQ(error_info[i], 0,
platform::errors::PreconditionNotMet(
"For batch [%d]: U(%d, %d) is zero, singular U.", i,
error_info[i], error_info[i]));
}
} }
void Potrf(const platform::CUDADeviceContext& dev_ctx, cublasFillMode_t uplo, void Potrf(const platform::CUDADeviceContext& dev_ctx, cublasFillMode_t uplo,
......
...@@ -59,22 +59,24 @@ class CholeskyCPUKernel : public framework::OpKernel<T> { ...@@ -59,22 +59,24 @@ class CholeskyCPUKernel : public framework::OpKernel<T> {
Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>, Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>,
Eigen::UpLoType::Upper> Eigen::UpLoType::Upper>
llt_decomposition(input); llt_decomposition(input);
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(llt_decomposition.info(), Eigen::Success,
llt_decomposition.info(), Eigen::Success,
platform::errors::InvalidArgument( platform::errors::InvalidArgument(
"Cholesky decomposition was not successful. The input matrice " "Cholesky decomposition was not successful. The "
"might not be not be positive definite.")); "%d-th input matrice "
"might not be not be positive definite.",
i));
output = llt_decomposition.matrixU(); output = llt_decomposition.matrixU();
} else { } else {
Eigen::LLT< Eigen::LLT<
Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>, Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>,
Eigen::UpLoType::Lower> Eigen::UpLoType::Lower>
llt_decomposition(input); llt_decomposition(input);
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(llt_decomposition.info(), Eigen::Success,
llt_decomposition.info(), Eigen::Success,
platform::errors::InvalidArgument( platform::errors::InvalidArgument(
"Cholesky decomposition was not successful. The input matrice " "Cholesky decomposition was not successful. The "
"might not be not be positive definite.")); "%d-th input matrice "
"might not be not be positive definite.",
i));
output = llt_decomposition.matrixL(); output = llt_decomposition.matrixL();
} }
} }
......
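Both the cuSOLVER `info` check added above and the Eigen `info() == Eigen::Success` checks in this hunk fail when the input is not positive definite, and the `upper` attribute selects whether U (A = U^T U) or L (A = L L^T) is returned. A NumPy sketch of the same behaviour; np.linalg.cholesky returns the lower factor and raises for indefinite input:

```python
import numpy as np

a = np.array([[4.0, 2.0],
              [2.0, 3.0]])            # symmetric positive definite
lower = np.linalg.cholesky(a)         # L with a = L @ L.T   (the upper=False branch)
upper = np.linalg.cholesky(a).T       # U with a = U.T @ U   (the upper=True branch)
print(np.allclose(lower @ lower.T, a), np.allclose(upper.T @ upper, a))   # True True

bad = np.array([[1.0, 2.0],
                [2.0, 1.0]])          # indefinite: eigenvalues 3 and -1
try:
    np.linalg.cholesky(bad)
except np.linalg.LinAlgError as e:    # analogous to the info / Eigen::Success checks above
    print("not positive definite:", e)
```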
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/c_reduce_op.h"
namespace paddle {
namespace operators {
class CReduceMaxOpMaker : public CReduceOpMaker {
protected:
std::string GetName() const override { return "Max"; }
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_WITHOUT_GRADIENT(c_reduce_max, ops::CReduceOp,
ops::CReduceMaxOpMaker);
REGISTER_OP_CPU_KERNEL(c_reduce_max,
ops::CReduceOpCPUKernel<ops::kRedMax, float>,
ops::CReduceOpCPUKernel<ops::kRedMax, double>,
ops::CReduceOpCPUKernel<ops::kRedMax, int>,
ops::CReduceOpCPUKernel<ops::kRedMax, int64_t>,
ops::CReduceOpCPUKernel<ops::kRedMax, plat::float16>);
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/c_reduce_op.h"
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(c_reduce_max,
ops::CReduceOpCUDAKernel<ops::kRedMax, float>,
ops::CReduceOpCUDAKernel<ops::kRedMax, double>,
ops::CReduceOpCUDAKernel<ops::kRedMax, int>,
ops::CReduceOpCUDAKernel<ops::kRedMax, int64_t>,
ops::CReduceOpCUDAKernel<ops::kRedMax, plat::float16>)
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/c_reduce_op.h"
namespace paddle {
namespace operators {
class CReduceMinOpMaker : public CReduceOpMaker {
protected:
std::string GetName() const override { return "Min"; }
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_WITHOUT_GRADIENT(c_reduce_min, ops::CReduceOp,
ops::CReduceMinOpMaker);
REGISTER_OP_CPU_KERNEL(c_reduce_min,
ops::CReduceOpCPUKernel<ops::kRedMin, float>,
ops::CReduceOpCPUKernel<ops::kRedMin, double>,
ops::CReduceOpCPUKernel<ops::kRedMin, int>,
ops::CReduceOpCPUKernel<ops::kRedMin, int64_t>,
ops::CReduceOpCPUKernel<ops::kRedMin, plat::float16>);
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/c_reduce_op.h"
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(c_reduce_min,
ops::CReduceOpCUDAKernel<ops::kRedMin, float>,
ops::CReduceOpCUDAKernel<ops::kRedMin, double>,
ops::CReduceOpCUDAKernel<ops::kRedMin, int>,
ops::CReduceOpCUDAKernel<ops::kRedMin, int64_t>,
ops::CReduceOpCUDAKernel<ops::kRedMin, plat::float16>)
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include <string>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/ddim.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h"
#if defined(PADDLE_WITH_NCCL)
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/nccl_helper.h"
#endif
namespace paddle {
namespace operators {
enum ReduceType { kRedSum, kRedMax, kRedMin, kRedProd };
class CReduceOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(
OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace());
}
};
template <ReduceType red_type, typename T>
class CReduceOpCPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE_EQ(
true, false,
platform::errors::Unavailable("Unimplemented CReduceOpCPUKernel now."));
}
};
template <ReduceType red_type, typename T>
class CReduceOpCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
#if defined(PADDLE_WITH_NCCL)
auto in = ctx.Input<framework::Tensor>("X");
auto out = ctx.Output<framework::Tensor>("Out");
auto place = ctx.GetPlace();
ncclDataType_t dtype = platform::ToNCCLDataType(in->type());
int64_t numel = in->numel();
const void* sendbuff = in->data<void>();
out->Resize(in->dims());
void* recvbuff = out->mutable_data<T>(place);
int rid = ctx.Attr<int>("ring_id");
int root = ctx.Attr<int>("root_id");
auto comm = platform::NCCLCommContext::Instance().Get(rid, place);
cudaStream_t stream = nullptr;
if (ctx.Attr<bool>("use_calc_stream")) {
auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
stream = static_cast<platform::CUDADeviceContext*>(dev_ctx)->stream();
} else {
stream = comm->stream();
}
ncclRedOp_t nccl_red_type = ncclSum;
switch (red_type) {
case kRedSum:
nccl_red_type = ncclSum;
break;
case kRedMax:
nccl_red_type = ncclMax;
break;
case kRedMin:
nccl_red_type = ncclMin;
break;
case kRedProd:
nccl_red_type = ncclProd;
break;
default:
PADDLE_ENFORCE_EQ(true, false, platform::errors::InvalidArgument(
"red_type must be one of kRedSum, "
"kRedMax, kRedMin, kRedProd."));
}
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclReduce(
sendbuff, recvbuff, numel, dtype, nccl_red_type, root, comm->comm(),
stream));
#else
PADDLE_ENFORCE_EQ(true, false,
platform::errors::Unavailable(
"PaddlePaddle should compile with GPU.."));
#endif
}
};
class CReduceOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() {
AddInput("X", "(Tensor), tensor to be reduced.");
AddOutput("Out", "(Tensor) the reduced result.");
AddAttr<int>("ring_id", "(int default 0) communication ring id.")
.SetDefault(0);
AddAttr<int>("root_id", "(int default 0) root id.").SetDefault(0);
AddAttr<bool>(
"use_calc_stream",
"(bool default false) eject CUDA operations to calculation stream.")
.SetDefault(false);
AddComment(string::Sprintf(R"DOC(
CReduce %s Operator
Call collective Reduce with reduce type %s. If input and output are
the same variable, in-place reduce will be used.
)DOC",
GetName(), GetName()));
}
protected:
virtual std::string GetName() const = 0;
};
} // namespace operators
} // namespace paddle
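The c_reduce_* ops above combine one tensor per rank with sum/max/min/prod and deliver the result to `root_id`; on the other ranks the receive buffer is left unspecified (NCCL reduce semantics). A single-process NumPy sketch of that semantics (`c_reduce_sim` is a hypothetical stand-in, not a distributed implementation):

```python
import numpy as np

def c_reduce_sim(rank_tensors, red_type="sum", root_id=0):
    """Combine one tensor per rank elementwise; only the root keeps the result."""
    ops = {"sum": np.sum, "max": np.max, "min": np.min, "prod": np.prod}
    stacked = np.stack(rank_tensors)             # shape [nranks, ...]
    reduced = ops[red_type](stacked, axis=0)
    # Non-root ranks get a buffer of the same shape whose content is unspecified.
    return [reduced if r == root_id else np.empty_like(reduced)
            for r in range(len(rank_tensors))]

ranks = [np.array([1.0, 2.0]), np.array([3.0, 4.0]), np.array([5.0, 6.0])]
print(c_reduce_sim(ranks, "sum", root_id=0)[0])   # [ 9. 12.]
print(c_reduce_sim(ranks, "max", root_id=0)[0])   # [5. 6.]
```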
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/c_reduce_op.h"
namespace paddle {
namespace operators {
class CReduceProdOpMaker : public CReduceOpMaker {
protected:
std::string GetName() const override { return "Prod"; }
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_WITHOUT_GRADIENT(c_reduce_prod, ops::CReduceOp,
ops::CReduceProdOpMaker);
REGISTER_OP_CPU_KERNEL(c_reduce_prod,
ops::CReduceOpCPUKernel<ops::kRedProd, float>,
ops::CReduceOpCPUKernel<ops::kRedProd, double>,
ops::CReduceOpCPUKernel<ops::kRedProd, int>,
ops::CReduceOpCPUKernel<ops::kRedProd, int64_t>,
ops::CReduceOpCPUKernel<ops::kRedProd, plat::float16>)
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/c_reduce_op.h"
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(c_reduce_prod,
ops::CReduceOpCUDAKernel<ops::kRedProd, float>,
ops::CReduceOpCUDAKernel<ops::kRedProd, double>,
ops::CReduceOpCUDAKernel<ops::kRedProd, int>,
ops::CReduceOpCUDAKernel<ops::kRedProd, int64_t>,
ops::CReduceOpCUDAKernel<ops::kRedProd, plat::float16>)
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/c_reduce_op.h"
namespace paddle {
namespace operators {
class CReduceSumOpMaker : public CReduceOpMaker {
protected:
std::string GetName() const override { return "Sum"; }
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_WITHOUT_GRADIENT(c_reduce_sum, ops::CReduceOp,
ops::CReduceSumOpMaker);
REGISTER_OP_CPU_KERNEL(c_reduce_sum,
ops::CReduceOpCPUKernel<ops::kRedSum, float>,
ops::CReduceOpCPUKernel<ops::kRedSum, double>,
ops::CReduceOpCPUKernel<ops::kRedSum, int>,
ops::CReduceOpCPUKernel<ops::kRedSum, int64_t>,
ops::CReduceOpCPUKernel<ops::kRedSum, plat::float16>)
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/c_reduce_op.h"
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(c_reduce_sum,
ops::CReduceOpCUDAKernel<ops::kRedSum, float>,
ops::CReduceOpCUDAKernel<ops::kRedSum, double>,
ops::CReduceOpCUDAKernel<ops::kRedSum, int>,
ops::CReduceOpCUDAKernel<ops::kRedSum, int64_t>,
ops::CReduceOpCUDAKernel<ops::kRedSum, plat::float16>)
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/c_scatter_op.h"
namespace paddle {
namespace operators {
class CScatterOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "CScatter");
OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "CScatter");
int root_id = ctx->Attrs().Get<int>("root");
int ring_id = ctx->Attrs().Get<int>("ring_id");
int nranks = ctx->Attrs().Get<int>("nranks");
PADDLE_ENFORCE_GE(nranks, 2,
platform::errors::InvalidArgument(
"The number of ranks (%d) must be greater than 1 "
"to use collective op (c_scatter op).",
nranks));
PADDLE_ENFORCE_GE(
root_id, 0,
platform::errors::InvalidArgument(
"The root_id (%d) for c_scatter_op must be non-negative.",
root_id));
PADDLE_ENFORCE_GE(
ring_id, 0,
platform::errors::InvalidArgument(
"The ring_id (%d) for c_scatter_op must be non-negative.",
ring_id));
framework::DDim dim = ctx->GetInputDim("X");
dim[0] = dim[0] / nranks;
if (dim[0] < 0) dim[0] = -1;
ctx->SetOutputDim("Out", dim);
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(
OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace());
}
};
class CScatterOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() {
AddInput("X", "(Tensor) tensor to be broadcasted.");
AddOutput("Out", "(Tensor) the result of broadcast.");
AddAttr<int>("ring_id", "(int default 0) nccl communication ring id.")
.SetDefault(0);
AddAttr<int>("root", "(int default 0) root id for broadcasting.")
.SetDefault(0);
AddAttr<int>("nranks", "(int default 1) number of ranks.").SetDefault(0);
AddAttr<bool>(
"use_calc_stream",
"(bool default false) eject CUDA operations to calculation stream.")
.SetDefault(false);
AddComment(R"DOC(
CScatter Operator
Scatter the source tensor from the root rank to all participants.
)DOC");
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_WITHOUT_GRADIENT(c_scatter, ops::CScatterOp, ops::CScatterOpMaker);
REGISTER_OP_CPU_KERNEL(c_scatter, ops::CScatterOpCPUKernel<float>,
ops::CScatterOpCPUKernel<double>,
ops::CScatterOpCPUKernel<int>,
ops::CScatterOpCPUKernel<int64_t>,
ops::CScatterOpCPUKernel<plat::float16>);
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/c_scatter_op.h"
#if defined(PADDLE_WITH_NCCL)
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/nccl_helper.h"
#endif
namespace paddle {
namespace operators {
template <typename T>
class CScatterOpCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
#if defined(PADDLE_WITH_NCCL)
auto x = ctx.Input<framework::LoDTensor>("X");
auto out = ctx.Output<framework::LoDTensor>("Out");
int numel = x->numel();
ncclDataType_t dtype = platform::ToNCCLDataType(x->type());
int nranks = ctx.Attr<int>("nranks");
int root_id = ctx.Attr<int>("root");
int ring_id = ctx.Attr<int>("ring_id");
auto place = ctx.GetPlace();
auto comm = platform::NCCLCommContext::Instance().Get(ring_id, place);
PADDLE_ENFORCE_EQ(nranks, comm->nranks(),
platform::errors::InvalidArgument(
"The number of ranks (%d) you set of must "
"be equal to comm->nranks (%d).",
nranks, comm->nranks()));
PADDLE_ENFORCE_GE(
root_id, 0,
platform::errors::InvalidArgument(
"The root_id (%d) for c_scatter_op must be non-negative.",
root_id));
PADDLE_ENFORCE_GE(
ring_id, 0,
platform::errors::InvalidArgument(
"The ring_id (%d) for c_scatter_op must be non-negative.",
ring_id));
cudaStream_t stream = nullptr;
if (ctx.Attr<bool>("use_calc_stream")) {
auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
stream = static_cast<platform::CUDADeviceContext*>(dev_ctx)->stream();
} else {
stream = comm->stream();
}
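    // The kernel broadcasts the whole input from the root rank with ncclBcast
    // into a temporary buffer, and every rank then copies only its own shard
    // of the first dimension into Out.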
framework::DDim x_dims = x->dims();
framework::DDim out_dims(x_dims);
framework::Tensor temp;
auto out_ptr = temp.mutable_data<T>(out_dims, place);
if (root_id == comm->rank()) {
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclBcast(
reinterpret_cast<void*>(const_cast<T*>(x->data<T>())), numel, dtype,
root_id, comm->comm(), stream));
framework::TensorCopy(*static_cast<const framework::Tensor*>(x), place,
*platform::DeviceContextPool::Instance().Get(place),
static_cast<framework::Tensor*>(&temp));
} else {
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclBcast(
out_ptr, numel, dtype, root_id, comm->comm(), stream));
}
out_dims[0] = out_dims[0] / nranks;
auto start_index = out_dims[0] * comm->rank();
auto end_index = start_index + out_dims[0];
temp = temp.Slice(start_index, end_index);
temp.Resize(out_dims);
out->mutable_data<T>(out_dims, place);
framework::TensorCopySync(*static_cast<const framework::Tensor*>(&temp),
place, static_cast<framework::Tensor*>(out));
out->Resize(out_dims);
#else
PADDLE_ENFORCE_EQ(
true, false,
platform::errors::Unavailable("PaddlePaddle should compile with GPU."));
#endif
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(c_scatter, ops::CScatterOpCUDAKernel<float>,
ops::CScatterOpCUDAKernel<double>,
ops::CScatterOpCUDAKernel<int>,
ops::CScatterOpCUDAKernel<int64_t>,
ops::CScatterOpCUDAKernel<plat::float16>);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
template <typename T>
class CScatterOpCPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE_EQ(true, false,
platform::errors::Unavailable(
"Unimplemented cpu kernel for CScatterOp."));
}
};
} // namespace operators
} // namespace paddle
...@@ -196,7 +196,7 @@ framework::OpKernelType ConvOp::GetKernelTypeForVar( ...@@ -196,7 +196,7 @@ framework::OpKernelType ConvOp::GetKernelTypeForVar(
auto ar = paddle::framework::AttrReader(attrs); auto ar = paddle::framework::AttrReader(attrs);
const std::string data_format = ar.Get<std::string>("data_format"); const std::string data_format = ar.Get<std::string>("data_format");
auto dl = framework::StringToDataLayout(data_format); auto dl = framework::StringToDataLayout(data_format);
// Some models may have intentionally set "AnyLayout" for pool // Some models may have intentionally set "AnyLayout" for conv
// op. Treat this as NCHW (default data_format value) // op. Treat this as NCHW (default data_format value)
if (dl != framework::DataLayout::kAnyLayout) { if (dl != framework::DataLayout::kAnyLayout) {
return framework::OpKernelType(expected_kernel_type.data_type_, return framework::OpKernelType(expected_kernel_type.data_type_,
......
...@@ -37,6 +37,8 @@ void ConvTransposeOp::InferShape(framework::InferShapeContext* ctx) const { ...@@ -37,6 +37,8 @@ void ConvTransposeOp::InferShape(framework::InferShapeContext* ctx) const {
auto filter_dims = ctx->GetInputDim("Filter"); auto filter_dims = ctx->GetInputDim("Filter");
std::vector<int> output_size = std::vector<int> output_size =
ctx->Attrs().Get<std::vector<int>>("output_size"); ctx->Attrs().Get<std::vector<int>>("output_size");
std::vector<int> output_padding =
ctx->Attrs().Get<std::vector<int>>("output_padding");
std::vector<int> strides = ctx->Attrs().Get<std::vector<int>>("strides"); std::vector<int> strides = ctx->Attrs().Get<std::vector<int>>("strides");
std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings"); std::vector<int> paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
std::vector<int> dilations = ctx->Attrs().Get<std::vector<int>>("dilations"); std::vector<int> dilations = ctx->Attrs().Get<std::vector<int>>("dilations");
...@@ -78,6 +80,12 @@ void ConvTransposeOp::InferShape(framework::InferShapeContext* ctx) const { ...@@ -78,6 +80,12 @@ void ConvTransposeOp::InferShape(framework::InferShapeContext* ctx) const {
platform::errors::InvalidArgument( platform::errors::InvalidArgument(
"The Attr(output_size) and Attr(stride) of Op(conv_transpose) " "The Attr(output_size) and Attr(stride) of Op(conv_transpose) "
"should be the same.")); "should be the same."));
if (output_padding.size())
PADDLE_ENFORCE_EQ(
output_padding.size(), strides.size(),
platform::errors::InvalidArgument(
"The Attr(output_padding) and Attr(stride) of Op(conv_transpose) "
"should be the same."));
const int64_t C = const int64_t C =
(data_layout != DataLayout::kNHWC ? in_dims[1] (data_layout != DataLayout::kNHWC ? in_dims[1]
...@@ -136,6 +144,27 @@ void ConvTransposeOp::InferShape(framework::InferShapeContext* ctx) const { ...@@ -136,6 +144,27 @@ void ConvTransposeOp::InferShape(framework::InferShapeContext* ctx) const {
infer_shape + strides[i])); infer_shape + strides[i]));
} }
output_shape.push_back(output_size[i]); output_shape.push_back(output_size[i]);
} else if (output_padding.size()) {
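          // output_padding adds extra size to one side of each spatial
          // dimension of the inferred output shape; each entry must be
          // smaller than max(stride, dilation) of that dimension.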
if (ctx->IsRuntime()) {
PADDLE_ENFORCE_GE(
output_padding[i], 0,
platform::errors::InvalidArgument(
"output_padding of Op(ConvTransposeOp) should not be "
"less than the 0. But received output_padding = "
"[%s], whose dim %d is less than 0",
framework::make_ddim(output_padding), i));
PADDLE_ENFORCE_LT(
output_padding[i], std::max(strides[i], dilations[i]),
platform::errors::InvalidArgument(
"output_padding of Op(ConvTransposeOp) should be less "
"than either stride or dilation. But received output_size = "
"[%s], "
"whose dim %d is not less than either stride (%d) or "
"dilation (%d)",
framework::make_ddim(output_size), i, strides[i],
dilations[i]));
}
output_shape.push_back((infer_shape + output_padding[i]));
} else { } else {
output_shape.push_back(infer_shape); output_shape.push_back(infer_shape);
} }
...@@ -223,10 +252,14 @@ void Conv2DTransposeOpMaker::Make() { ...@@ -223,10 +252,14 @@ void Conv2DTransposeOpMaker::Make() {
"The format of output tensor is X (one-dimensional) of size equal" "The format of output tensor is X (one-dimensional) of size equal"
"to the number of output channels. Only used with MKL-DNN.") "to the number of output channels. Only used with MKL-DNN.")
.AsDispensable(); .AsDispensable();
AddOutput("Output", AddOutput("Output",
"(Tensor) The output tensor of convolution transpose operator. " "(Tensor) The output tensor of convolution transpose operator. "
"The format of output tensor is the same as input tensor."); "The format of output tensor is the same as input tensor.");
AddAttr<std::vector<int>>("output_padding",
"(vector<int> default: []), Additional size added "
"to one side of each dimension in the output "
"shape")
.SetDefault({});
AddAttr<std::vector<int>>("output_size", AddAttr<std::vector<int>>("output_size",
"(vector<int> default: []), the " "(vector<int> default: []), the "
"size of the output tensor") "size of the output tensor")
...@@ -338,6 +371,11 @@ void Conv3DTransposeOpMaker::Make() { ...@@ -338,6 +371,11 @@ void Conv3DTransposeOpMaker::Make() {
"Where N is batch size, C is " "Where N is batch size, C is "
"the number of channels, D is the depth of the feature, H is the " "the number of channels, D is the depth of the feature, H is the "
"height of the feature, and W is the width of the feature."); "height of the feature, and W is the width of the feature.");
AddAttr<std::vector<int>>("output_padding",
"(vector<int> default: []), Additional size added "
"to one side of each dimension in the output "
"shape")
.SetDefault({});
AddAttr<std::vector<int>>("output_size", AddAttr<std::vector<int>>("output_size",
"(vector<int> default: []), the " "(vector<int> default: []), the "
"size of the output tensor") "size of the output tensor")
......
...@@ -24,34 +24,62 @@ class CudnnLSTMOp : public framework::OperatorWithKernel { ...@@ -24,34 +24,62 @@ class CudnnLSTMOp : public framework::OperatorWithKernel {
using framework::OperatorWithKernel::OperatorWithKernel; using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override { void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("Input"), OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "CudnnLSTM");
"Input(Input) of LSTM should not be null."); OP_INOUT_CHECK(ctx->HasInput("W"), "Input", "W", "CudnnLSTM");
PADDLE_ENFORCE(ctx->HasInput("W"), OP_INOUT_CHECK(ctx->HasInput("InitH"), "Input", "InitH", "CudnnLSTM");
"Input(Weight) of LSTM should not be null."); OP_INOUT_CHECK(ctx->HasInput("InitC"), "Input", "InitC", "CudnnLSTM");
PADDLE_ENFORCE(ctx->HasInput("InitH"), OP_INOUT_CHECK(ctx->HasOutput("Reserve"), "Output", "Reserve", "CudnnLSTM");
"Input(init_h) of LSTM should not be null."); OP_INOUT_CHECK(ctx->HasOutput("StateOut"), "Output", "StateOut",
PADDLE_ENFORCE(ctx->HasInput("InitC"), "CudnnLSTM");
"Input(init_c) of LSTM should not be null."); OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "CudnnLSTM");
PADDLE_ENFORCE(ctx->HasInput("Cache"), OP_INOUT_CHECK(ctx->HasOutput("LastH"), "Output", "LastH", "CudnnLSTM");
"Input(Cache) of LSTM should not be null."); OP_INOUT_CHECK(ctx->HasOutput("LastC"), "Output", "LastC", "CudnnLSTM");
PADDLE_ENFORCE(ctx->HasOutput("Out"),
"Output(Out) of LSTM should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("last_h"),
"Output(last_h) of LSTM should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("last_c"),
"Output(last_c) of LSTM should not be null.");
auto in_dims = ctx->GetInputDim("Input"); auto in_dims = ctx->GetInputDim("Input");
PADDLE_ENFORCE_EQ(in_dims.size(), 3, "Input(X)'s rank must be 3."); auto init_dims = ctx->GetInputDim("InitH");
PADDLE_ENFORCE_EQ(in_dims.size(), 3,
platform::errors::InvalidArgument(
"The rank of Input in CudnnLSTM must be 3. But "
"received Input's rank is %d.",
in_dims.size()));
PADDLE_ENFORCE_EQ(init_dims.size(), 3,
platform::errors::InvalidArgument(
"The rank of InitH in CudnnLSTM must be 3. But "
"received InitH's rank is %d.",
init_dims.size()));
PADDLE_ENFORCE_EQ(in_dims[1], init_dims[1],
platform::errors::InvalidArgument(
"The in_dims[1] (Input dims) and init_dims[1] (InitH "
"dims) should be equal. But "
"received in_dims[1] is %d and init_dims[1] is %d.",
in_dims[1], init_dims[1]));
PADDLE_ENFORCE_EQ(in_dims[2], init_dims[2],
platform::errors::InvalidArgument(
"The in_dims[2] (Input dims) and init_dims[2] (InitH "
"dims) should be equal. But "
"received in_dims[2] is %d and init_dims[2] is %d.",
in_dims[2], init_dims[2]));
auto out_dims = in_dims; auto out_dims = in_dims;
auto hidden_size = ctx->Attrs().Get<int>("hidden_size"); auto hidden_size = ctx->Attrs().Get<int>("hidden_size");
out_dims[2] = hidden_size; bool is_bidirec = ctx->Attrs().Get<bool>("is_bidirec");
out_dims[2] = is_bidirec ? hidden_size * 2 : hidden_size;
auto last_dims = init_dims;
last_dims[0] = is_bidirec ? last_dims[0] * 2 : last_dims[0];
ctx->SetOutputDim("Out", out_dims); ctx->SetOutputDim("Out", out_dims);
ctx->SetOutputDim("last_h", ctx->GetInputDim("InitH")); ctx->SetOutputDim("LastH", last_dims);
ctx->SetOutputDim("last_c", ctx->GetInputDim("InitC")); ctx->SetOutputDim("LastC", last_dims);
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(
OperatorWithKernel::IndicateVarDataType(ctx, "Input"),
ctx.device_context());
} }
}; };
...@@ -84,33 +112,31 @@ class CudnnLSTMOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -84,33 +112,31 @@ class CudnnLSTMOpMaker : public framework::OpProtoAndCheckerMaker {
"(Tensor) the learnable hidden-hidden weights." "(Tensor) the learnable hidden-hidden weights."
" The shape is (N), where N is total weight size of the LSTM. " " The shape is (N), where N is total weight size of the LSTM. "
" cudnn concatenate all the weight to one Tensor"); " cudnn concatenate all the weight to one Tensor");
AddInput("Cache", AddOutput("Reserve",
"The cache of dropout op, a RAW type variable including random " "(Tensor, a temporary output Tensor to store the reserve_data "
"number generator states and some descriptors, which is used in " "of cudnn kernel.")
"cudnn kernel.") .AsIntermediate();
.AsDispensable(); AddOutput("StateOut",
"Share memory with State. "
"Store the global drop state when training");
AddOutput("Out", AddOutput("Out",
"(Tensor) the hidden state of LSTM operator. " "(Tensor) the hidden state of LSTM operator. "
"The shape is ( seq_len x batch_size x hidden_size) if " "The shape is ( seq_len x batch_size x hidden_size) if "
"is_bidirec is False" "is_bidirec is False"
"and When is_bidirec is True, the shape will be ( seq_len x " "and When is_bidirec is True, the shape will be ( seq_len x "
"batch_size x hidden_size * 2) "); "batch_size x hidden_size * 2) ");
AddOutput("last_h", AddOutput("LastH",
"(Tensor) the hidden state of the last step. " "(Tensor) the hidden state of the last step. "
"The shape is ( num_layers x batch_size x hidden_size) if " "The shape is ( num_layers x batch_size x hidden_size) if "
"is_bidirec is False" "is_bidirec is False"
"and When is_bidirec is True, the shape will be (num_layers*2 x " "and When is_bidirec is True, the shape will be (num_layers*2 x "
"batch_size x hidden_size)"); "batch_size x hidden_size)");
AddOutput("last_c", AddOutput("LastC",
"(Tensor) the cell state of the last step" "(Tensor) the cell state of the last step"
"The shape is ( num_layers x batch_size x hidden_size) if " "The shape is ( num_layers x batch_size x hidden_size) if "
"is_bidirec is False" "is_bidirec is False"
"and When is_bidirect is True, the shape will be (num_layers*2 x " "and When is_bidirect is True, the shape will be (num_layers*2 x "
"batch_size x hidden_size*2)"); "batch_size x hidden_size*2)");
AddAttr<int>("max_len",
"max length of the LSTM op"
"the first dim of the Input can NOT be greater than max_len")
.SetDefault(20);
AddAttr<float>( AddAttr<float>(
"dropout_prob", "dropout_prob",
"dropout prob of the dropout op" "dropout prob of the dropout op"
...@@ -120,14 +146,14 @@ class CudnnLSTMOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -120,14 +146,14 @@ class CudnnLSTMOpMaker : public framework::OpProtoAndCheckerMaker {
AddAttr<bool>("is_bidirec", AddAttr<bool>("is_bidirec",
"is_bidirec" "is_bidirec"
"if it is bidirectional rnn" "if it is bidirectional rnn"
"The will affect the shape of the Out, last_h, and last_c") "The will affect the shape of the Out, LastH, and LastC")
.SetDefault(false); .SetDefault(false);
AddAttr<int>("input_size", "input size ot the Input Tensor").SetDefault(10); AddAttr<int>("input_size", "input size ot the Input Tensor").SetDefault(10);
AddAttr<int>("hidden_size", "hidden size of the LSTM").SetDefault(100); AddAttr<int>("hidden_size", "hidden size of the LSTM").SetDefault(100);
AddAttr<int>("num_layers", "the total layer number of the LSTM") AddAttr<int>("num_layers", "the total layer number of the LSTM")
.SetDefault(1); .SetDefault(1);
AddAttr<bool>("is_test", "True if in test phase.").SetDefault(false); AddAttr<bool>("is_test", "True if in test phase.").SetDefault(false);
AddAttr<int>("seed", "seed to used if fix_seed is True").SetDefault(-1); AddAttr<int>("seed", "seed to used if fix_seed is True").SetDefault(0);
AddComment(R"DOC( AddComment(R"DOC(
CUDNN LSTM implementation CUDNN LSTM implementation
...@@ -172,16 +198,10 @@ class CudnnLSTMGradOp : public framework::OperatorWithKernel { ...@@ -172,16 +198,10 @@ class CudnnLSTMGradOp : public framework::OperatorWithKernel {
using framework::OperatorWithKernel::OperatorWithKernel; using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override { void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("Input"), OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "CudnnLSTMGrad");
"Input(Input) of LSTM should not be null."); OP_INOUT_CHECK(ctx->HasInput("W"), "Input", "W", "CudnnLSTMGrad");
PADDLE_ENFORCE(ctx->HasInput("W"), "Input(W) of LSTM should not be null."); OP_INOUT_CHECK(ctx->HasInput("InitH"), "Input", "InitH", "CudnnLSTMGrad");
PADDLE_ENFORCE(ctx->HasInput("Cache"), OP_INOUT_CHECK(ctx->HasInput("InitC"), "Input", "InitC", "CudnnLSTMGrad");
"Input(last_c) of LSTM should not be null.");
PADDLE_ENFORCE(ctx->HasInput("InitH"),
"Input(init_h) of LSTM should not be null.");
PADDLE_ENFORCE(ctx->HasInput("InitC"),
"Input(init_c) of LSTM should not be null.");
auto SetOutGradDim = [&ctx](const std::string& name) { auto SetOutGradDim = [&ctx](const std::string& name) {
auto g_name = framework::GradVarName(name); auto g_name = framework::GradVarName(name);
...@@ -195,6 +215,12 @@ class CudnnLSTMGradOp : public framework::OperatorWithKernel { ...@@ -195,6 +215,12 @@ class CudnnLSTMGradOp : public framework::OperatorWithKernel {
SetOutGradDim("InitH"); SetOutGradDim("InitH");
SetOutGradDim("InitC"); SetOutGradDim("InitC");
} }
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(OperatorWithKernel::IndicateVarDataType(
ctx, framework::GradVarName("Out")),
ctx.device_context());
}
}; };
template <typename T> template <typename T>
...@@ -209,13 +235,12 @@ class CudnnLSTMGradOpMaker : public framework::SingleGradOpMaker<T> { ...@@ -209,13 +235,12 @@ class CudnnLSTMGradOpMaker : public framework::SingleGradOpMaker<T> {
op->SetInput("InitH", this->Input("InitH")); op->SetInput("InitH", this->Input("InitH"));
op->SetInput("InitC", this->Input("InitC")); op->SetInput("InitC", this->Input("InitC"));
op->SetInput("W", this->Input("W")); op->SetInput("W", this->Input("W"));
if (this->HasInput("Cache")) { op->SetInput("Reserve", this->Output("Reserve"));
op->SetInput("Cache", this->Input("Cache")); op->SetInput("StateOut", this->Output("StateOut"));
}
op->SetInput("Out", this->Output("Out")); op->SetInput("Out", this->Output("Out"));
op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out"));
op->SetInput(framework::GradVarName("last_c"), this->OutputGrad("last_c")); op->SetInput(framework::GradVarName("LastC"), this->OutputGrad("LastC"));
op->SetInput(framework::GradVarName("last_h"), this->OutputGrad("last_h")); op->SetInput(framework::GradVarName("LastH"), this->OutputGrad("LastH"));
op->SetOutput(framework::GradVarName("Input"), this->InputGrad("Input")); op->SetOutput(framework::GradVarName("Input"), this->InputGrad("Input"));
op->SetOutput(framework::GradVarName("W"), this->InputGrad("W")); op->SetOutput(framework::GradVarName("W"), this->InputGrad("W"));
......
...@@ -15,6 +15,7 @@ limitations under the License. */ ...@@ -15,6 +15,7 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/cudnn_rnn_cache.h" #include "paddle/fluid/operators/cudnn_rnn_cache.h"
#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/platform/cudnn_desc.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -33,8 +34,10 @@ class CudnnLSTMGPUKernel : public framework::OpKernel<T> { ...@@ -33,8 +34,10 @@ class CudnnLSTMGPUKernel : public framework::OpKernel<T> {
auto w = ctx.Input<Tensor>("W"); auto w = ctx.Input<Tensor>("W");
Tensor *out = ctx.Output<Tensor>("Out"); Tensor *out = ctx.Output<Tensor>("Out");
Tensor *last_h = ctx.Output<Tensor>("last_h"); Tensor *last_h = ctx.Output<Tensor>("LastH");
Tensor *last_c = ctx.Output<Tensor>("last_c"); Tensor *last_c = ctx.Output<Tensor>("LastC");
Tensor *reserve = ctx.Output<Tensor>("Reserve");
Tensor *state_out = ctx.Output<Tensor>("StateOut");
const T *x_data = x->data<T>(); const T *x_data = x->data<T>();
const T *init_h_data = init_h->data<T>(); const T *init_h_data = init_h->data<T>();
...@@ -46,72 +49,56 @@ class CudnnLSTMGPUKernel : public framework::OpKernel<T> { ...@@ -46,72 +49,56 @@ class CudnnLSTMGPUKernel : public framework::OpKernel<T> {
T *last_h_data = last_h->mutable_data<T>(ctx.GetPlace()); T *last_h_data = last_h->mutable_data<T>(ctx.GetPlace());
T *last_c_data = last_c->mutable_data<T>(ctx.GetPlace()); T *last_c_data = last_c->mutable_data<T>(ctx.GetPlace());
size_t max_len = ctx.Attr<int>("max_len");
float dropout_prob = ctx.Attr<float>("dropout_prob"); float dropout_prob = ctx.Attr<float>("dropout_prob");
bool is_bidirec = ctx.Attr<bool>("is_bidirec"); bool is_bidirec = ctx.Attr<bool>("is_bidirec");
int input_size = ctx.Attr<int>("input_size");
int hidden_size = ctx.Attr<int>("hidden_size"); int hidden_size = ctx.Attr<int>("hidden_size");
int num_layers = ctx.Attr<int>("num_layers"); int num_layers = ctx.Attr<int>("num_layers");
bool is_test = ctx.Attr<bool>("is_test"); bool is_test = ctx.Attr<bool>("is_test");
int seed = ctx.Attr<int>("seed");
auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>(); auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
auto handle = dev_ctx.cudnn_handle(); auto handle = dev_ctx.cudnn_handle();
auto *cache_var = ctx.InputVar("Cache");
if (!cache_var) { CudnnRNNCache *cudnn_rnn_cache = new CudnnRNNCache();
// The RAW type cache variable wouldn't be created and broadcasted on
// multi-devices before the first running.
// use parent scope to make cache persistable
auto *scope = const_cast<framework::Scope *>(ctx.scope().parent());
auto cache_var_name = ctx.InputNames("Cache")[0];
cache_var = scope->Var(cache_var_name);
}
CudnnRNNCache *cudnn_rnn_cache = nullptr;
if (cache_var->IsInitialized()) {
// const_cast is usually bad.
cudnn_rnn_cache = const_cast<framework::Variable *>(cache_var)
->GetMutable<CudnnRNNCache>();
} else {
// const_cast is usually bad.
cudnn_rnn_cache = const_cast<framework::Variable *>(cache_var)
->GetMutable<CudnnRNNCache>();
std::random_device rnd;
int seed = ctx.Attr<int>("seed");
if (seed == -1) {
seed = rnd();
}
auto input_w_numel = w->numel(); auto input_w_numel = w->numel();
auto seq_len = x->dims()[0];
auto batch_size = x->dims()[1]; auto batch_size = x->dims()[1];
cudnn_rnn_cache->init(handle, ctx.GetPlace(), max_len, batch_size, auto input_dim = x->dims()[2];
input_size, hidden_size, num_layers, dropout_prob, size_t reserve_size;
is_bidirec, seed, input_w_numel); bool state_initialized = state_out->IsInitialized() ? true : false;
} cudnnDataType_t cudnn_type = platform::ToCudnnDataType(
framework::ToDataType(std::type_index(typeid(T))));
auto run_seq_len = x->dims()[0]; cudnn_rnn_cache->init(handle, ctx.GetPlace(), seq_len, batch_size,
input_dim, hidden_size, num_layers, dropout_prob,
is_bidirec, seed, input_w_numel, &reserve_size,
state_out, state_initialized, cudnn_type);
auto *reserve_data = reserve->mutable_data<uint8_t>(
{static_cast<int64_t>(reserve_size)}, ctx.GetPlace());
if (is_test) { if (is_test) {
// for inference // for inference
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNForwardInference( PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNForwardInference(
handle, cudnn_rnn_cache->rnn_desc_, run_seq_len, handle, cudnn_rnn_cache->rnn_desc_, seq_len, cudnn_rnn_cache->x_desc_,
cudnn_rnn_cache->x_desc_, x_data, cudnn_rnn_cache->hx_desc_, x_data, cudnn_rnn_cache->hx_desc_, init_h_data,
init_h_data, cudnn_rnn_cache->cx_desc_, init_c_data, cudnn_rnn_cache->cx_desc_, init_c_data, cudnn_rnn_cache->w_desc_,
cudnn_rnn_cache->w_desc_, w_data, cudnn_rnn_cache->y_desc_, out_data, w_data, cudnn_rnn_cache->y_desc_, out_data, cudnn_rnn_cache->hy_desc_,
cudnn_rnn_cache->hy_desc_, last_h_data, cudnn_rnn_cache->cy_desc_, last_h_data, cudnn_rnn_cache->cy_desc_, last_c_data,
last_c_data, cudnn_rnn_cache->workspace_data_.data<uint8_t>(), cudnn_rnn_cache->workspace_data_.data<uint8_t>(),
cudnn_rnn_cache->workspace_size_)); cudnn_rnn_cache->workspace_size_));
} else { } else {
// for train // for train
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNForwardTraining( PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNForwardTraining(
handle, cudnn_rnn_cache->rnn_desc_, run_seq_len, handle, cudnn_rnn_cache->rnn_desc_, seq_len, cudnn_rnn_cache->x_desc_,
cudnn_rnn_cache->x_desc_, x_data, cudnn_rnn_cache->hx_desc_, x_data, cudnn_rnn_cache->hx_desc_, init_h_data,
init_h_data, cudnn_rnn_cache->cx_desc_, init_c_data, cudnn_rnn_cache->cx_desc_, init_c_data, cudnn_rnn_cache->w_desc_,
cudnn_rnn_cache->w_desc_, w_data, cudnn_rnn_cache->y_desc_, out_data, w_data, cudnn_rnn_cache->y_desc_, out_data, cudnn_rnn_cache->hy_desc_,
cudnn_rnn_cache->hy_desc_, last_h_data, cudnn_rnn_cache->cy_desc_, last_h_data, cudnn_rnn_cache->cy_desc_, last_c_data,
last_c_data, cudnn_rnn_cache->workspace_data_.data<uint8_t>(), cudnn_rnn_cache->workspace_data_.data<uint8_t>(),
cudnn_rnn_cache->workspace_size_, cudnn_rnn_cache->workspace_size_, reserve_data, reserve_size));
cudnn_rnn_cache->reserve_data_.data<uint8_t>(),
cudnn_rnn_cache->reserve_size_));
} }
delete cudnn_rnn_cache;
} }
}; };
...@@ -123,15 +110,13 @@ class CudnnLSTMGPUGradKernel : public framework::OpKernel<T> { ...@@ -123,15 +110,13 @@ class CudnnLSTMGPUGradKernel : public framework::OpKernel<T> {
auto *weight = ctx.Input<Tensor>("W"); auto *weight = ctx.Input<Tensor>("W");
auto *init_h = ctx.Input<Tensor>("InitH"); auto *init_h = ctx.Input<Tensor>("InitH");
auto *init_c = ctx.Input<Tensor>("InitC"); auto *init_c = ctx.Input<Tensor>("InitC");
// auto * last_h = ctx.Input<Tensor>("last_h"); auto *reserve = ctx.Input<Tensor>("Reserve");
// auto * last_c = ctx.Input<Tensor>("last_c"); auto *state_out = ctx.Input<Tensor>("StateOut");
auto *out = ctx.Input<Tensor>("Out"); auto *out = ctx.Input<Tensor>("Out");
auto *out_grad = ctx.Input<Tensor>(framework::GradVarName("Out")); auto *out_grad = ctx.Input<Tensor>(framework::GradVarName("Out"));
auto *last_h_grad = ctx.Input<Tensor>(framework::GradVarName("last_h")); auto *last_h_grad = ctx.Input<Tensor>(framework::GradVarName("LastH"));
auto *last_c_grad = ctx.Input<Tensor>(framework::GradVarName("last_c")); auto *last_c_grad = ctx.Input<Tensor>(framework::GradVarName("LastC"));
// auto* init_h = ctx.Input<Tensor>("init_h");
// auto* init_c = ctx.Input<Tensor>("init_c");
auto *in_grad = ctx.Output<Tensor>(framework::GradVarName("Input")); auto *in_grad = ctx.Output<Tensor>(framework::GradVarName("Input"));
auto *weight_grad = ctx.Output<Tensor>(framework::GradVarName("W")); auto *weight_grad = ctx.Output<Tensor>(framework::GradVarName("W"));
...@@ -140,116 +125,75 @@ class CudnnLSTMGPUGradKernel : public framework::OpKernel<T> { ...@@ -140,116 +125,75 @@ class CudnnLSTMGPUGradKernel : public framework::OpKernel<T> {
auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>(); auto &dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
auto handle = dev_ctx.cudnn_handle(); auto handle = dev_ctx.cudnn_handle();
auto *cache_var = ctx.InputVar("Cache");
PADDLE_ENFORCE(cache_var->IsInitialized());
CudnnRNNCache *cudnn_rnn_cache =
const_cast<framework::Variable *>(cache_var)
->GetMutable<CudnnRNNCache>();
auto input_dims = input->dims(); auto input_dims = input->dims();
auto init_h_dims = init_h->dims(); auto init_h_dims = init_h->dims();
auto init_c_dims = init_c->dims(); auto init_c_dims = init_c->dims();
in_grad->mutable_data<T>(ctx.GetPlace());
weight_grad->mutable_data<T>(ctx.GetPlace()); auto *weight_data = weight->data<T>();
auto *init_h_data = init_h->data<T>();
auto *init_c_data = init_c->data<T>();
auto *out_data = out->data<T>();
auto *out_grad_data = out_grad->data<T>();
auto *last_h_grad_data = last_h_grad->data<T>();
auto *last_c_grad_data = last_c_grad->data<T>();
math::SetConstant<paddle::platform::CUDADeviceContext, T> zero; math::SetConstant<paddle::platform::CUDADeviceContext, T> zero;
zero(dev_ctx, in_grad, static_cast<T>(0.0)); weight_grad->mutable_data<T>(ctx.GetPlace());
zero(dev_ctx, weight_grad, static_cast<T>(0.0)); zero(dev_ctx, weight_grad, static_cast<T>(0.0));
T *init_h_grad_data = NULL; in_grad->mutable_data<T>(input_dims, ctx.GetPlace());
if (init_h_grad == nullptr) { auto *in_grad_data = in_grad->data<T>();
Tensor init_h_grad_temp;
init_h_grad_temp.mutable_data<T>(init_h_dims, ctx.GetPlace());
zero(dev_ctx, &init_h_grad_temp, static_cast<T>(0.0));
init_h_grad_data = init_h_grad_temp.data<T>();
} else {
init_h_grad->mutable_data<T>(init_h_dims, ctx.GetPlace()); init_h_grad->mutable_data<T>(init_h_dims, ctx.GetPlace());
zero(dev_ctx, init_h_grad, static_cast<T>(0.0)); auto *init_h_grad_data = init_h_grad->data<T>();
init_h_grad_data = init_h_grad->data<T>();
}
T *init_c_grad_data = NULL;
if (init_c_grad == nullptr) {
Tensor init_c_grad_temp;
init_c_grad_temp.mutable_data<T>(init_c_dims, ctx.GetPlace());
zero(dev_ctx, &init_c_grad_temp, static_cast<T>(0.0));
init_c_grad_data = init_c_grad_temp.data<T>();
} else {
init_c_grad->mutable_data<T>(init_c_dims, ctx.GetPlace()); init_c_grad->mutable_data<T>(init_c_dims, ctx.GetPlace());
zero(dev_ctx, init_c_grad, static_cast<T>(0.0)); auto *init_c_grad_data = init_c_grad->data<T>();
init_c_grad_data = init_c_grad->data<T>();
}
const T *last_h_grad_data = NULL;
if (last_h_grad == nullptr) {
Tensor last_h_grad_temp;
last_h_grad_temp.mutable_data<T>(init_h_dims, ctx.GetPlace());
zero(dev_ctx, &last_h_grad_temp, static_cast<T>(0.0));
last_h_grad_data = (const T *)last_h_grad_temp.data<T>();
} else {
last_h_grad_data = last_h_grad->data<T>();
}
const T *last_c_grad_data = NULL;
if (last_c_grad == nullptr) {
Tensor last_c_grad_temp;
last_c_grad_temp.mutable_data<T>(init_c_dims, ctx.GetPlace());
zero(dev_ctx, &last_c_grad_temp, static_cast<T>(0.0));
last_c_grad_data = (const T *)last_c_grad_temp.data<T>();
} else {
last_c_grad_data = last_c_grad->data<T>();
}
const T *out_grad_data = NULL;
if (out_grad == nullptr) {
Tensor out_grad_temp;
out_grad_temp.mutable_data<T>(out->dims(), ctx.GetPlace());
zero(dev_ctx, &out_grad_temp, static_cast<T>(0.0));
out_grad_data = (const T *)out_grad_temp.data<T>(); float dropout_prob = ctx.Attr<float>("dropout_prob");
} else { bool is_bidirec = ctx.Attr<bool>("is_bidirec");
out_grad_data = out_grad->data<T>(); int hidden_size = ctx.Attr<int>("hidden_size");
} int num_layers = ctx.Attr<int>("num_layers");
int seed = ctx.Attr<int>("seed");
// zero( dev_ctx, last_h_grad, static_cast<T>(0.0)); CudnnRNNCache *cudnn_rnn_cache = new CudnnRNNCache();
// zero( dev_ctx, last_c_grad, static_cast<T>(0.0));
auto out_data = out->data<T>(); auto input_w_numel = weight->numel();
// auto out_grad_data = out_grad->data<T>(); auto seq_len = input_dims[0];
auto weight_data = weight->data<T>(); auto batch_size = input->dims()[1];
auto init_h_data = init_h->data<T>(); auto input_dim = input->dims()[2];
auto init_c_data = init_c->data<T>(); size_t reserve_size;
auto in_grad_data = in_grad->data<T>(); cudnnDataType_t cudnn_type = platform::ToCudnnDataType(
framework::ToDataType(std::type_index(typeid(T))));
cudnn_rnn_cache->init(handle, ctx.GetPlace(), seq_len, batch_size,
input_dim, hidden_size, num_layers, dropout_prob,
is_bidirec, seed, input_w_numel, &reserve_size,
const_cast<Tensor *>(state_out), true, cudnn_type);
auto work_data = cudnn_rnn_cache->workspace_data_.data<uint8_t>(); auto work_data = cudnn_rnn_cache->workspace_data_.data<uint8_t>();
auto reserve_data = cudnn_rnn_cache->reserve_data_.data<uint8_t>(); const uint8_t *reserve_data = reserve->data<uint8_t>();
auto run_seq_len = input_dims[0];
PADDLE_ENFORCE_LE((size_t)run_seq_len, cudnn_rnn_cache->max_length_,
"cudnn running seq_len CAN not greater max_lengh");
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNBackwardData( PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNBackwardData(
handle, cudnn_rnn_cache->rnn_desc_, run_seq_len, handle, cudnn_rnn_cache->rnn_desc_, seq_len, cudnn_rnn_cache->y_desc_,
cudnn_rnn_cache->y_desc_, out_data, cudnn_rnn_cache->dy_desc_, out_data, cudnn_rnn_cache->y_desc_, out_grad_data,
out_grad_data, cudnn_rnn_cache->dhy_desc_, last_h_grad_data, cudnn_rnn_cache->hy_desc_, last_h_grad_data, cudnn_rnn_cache->cy_desc_,
cudnn_rnn_cache->dcy_desc_, last_c_grad_data, cudnn_rnn_cache->w_desc_, last_c_grad_data, cudnn_rnn_cache->w_desc_, weight_data,
weight_data, cudnn_rnn_cache->hx_desc_, init_h_data, cudnn_rnn_cache->hx_desc_, init_h_data, cudnn_rnn_cache->cx_desc_,
cudnn_rnn_cache->cx_desc_, init_c_data, cudnn_rnn_cache->dx_desc_, init_c_data, cudnn_rnn_cache->x_desc_, in_grad_data,
in_grad_data, cudnn_rnn_cache->dhx_desc_, init_h_grad_data, cudnn_rnn_cache->hx_desc_, init_h_grad_data, cudnn_rnn_cache->cx_desc_,
cudnn_rnn_cache->dcx_desc_, init_c_grad_data, work_data, init_c_grad_data, work_data, cudnn_rnn_cache->workspace_size_,
cudnn_rnn_cache->workspace_size_, reserve_data, const_cast<uint8_t *>(reserve_data), reserve_size));
cudnn_rnn_cache->reserve_size_));
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNBackwardWeights( PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNBackwardWeights(
handle, cudnn_rnn_cache->rnn_desc_, run_seq_len, handle, cudnn_rnn_cache->rnn_desc_, seq_len, cudnn_rnn_cache->x_desc_,
cudnn_rnn_cache->x_desc_, input->data<T>(), cudnn_rnn_cache->hx_desc_, input->data<T>(), cudnn_rnn_cache->hx_desc_, init_h->data<T>(),
init_h->data<T>(), cudnn_rnn_cache->y_desc_, out->data<T>(), cudnn_rnn_cache->y_desc_, out->data<T>(),
cudnn_rnn_cache->workspace_data_.data<uint8_t>(), cudnn_rnn_cache->workspace_data_.data<uint8_t>(),
cudnn_rnn_cache->workspace_size_, cudnn_rnn_cache->dw_desc_, cudnn_rnn_cache->workspace_size_, cudnn_rnn_cache->w_desc_,
weight_grad->data<T>(), cudnn_rnn_cache->reserve_data_.data<uint8_t>(), weight_grad->data<T>(), const_cast<uint8_t *>(reserve_data),
cudnn_rnn_cache->reserve_size_)); reserve_size));
delete cudnn_rnn_cache;
} }
}; };
...@@ -257,5 +201,7 @@ class CudnnLSTMGPUGradKernel : public framework::OpKernel<T> { ...@@ -257,5 +201,7 @@ class CudnnLSTMGPUGradKernel : public framework::OpKernel<T> {
} // namespace paddle } // namespace paddle
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(cudnn_lstm, ops::CudnnLSTMGPUKernel<float>); REGISTER_OP_CUDA_KERNEL(cudnn_lstm, ops::CudnnLSTMGPUKernel<float>,
REGISTER_OP_CUDA_KERNEL(cudnn_lstm_grad, ops::CudnnLSTMGPUGradKernel<float>); ops::CudnnLSTMGPUKernel<double>);
REGISTER_OP_CUDA_KERNEL(cudnn_lstm_grad, ops::CudnnLSTMGPUGradKernel<float>,
ops::CudnnLSTMGPUGradKernel<double>);
...@@ -68,9 +68,20 @@ class CVMOpKernel : public framework::OpKernel<T> { ...@@ -68,9 +68,20 @@ class CVMOpKernel : public framework::OpKernel<T> {
    // for Input X that does not have LoD information. // for Input X that does not have LoD information.
if (x->NumLevels() == 0) { if (x->NumLevels() == 0) {
if (use_cvm) {
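      // use_cvm without LoD: keep the first two entries in log space,
      // y[0] = log(x[0] + 1) and y[1] = log(x[1] + 1) - y[0]; the remaining
      // entries are copied through unchanged.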
for (int i = 0; i < batch_size; i++) {
int cursor = i * item_size;
y_data[cursor] = log(x_data[cursor] + 1);
y_data[cursor + 1] = log(x_data[cursor + 1] + 1) - y_data[cursor];
for (int j = 2; j < item_size; j++) {
y_data[cursor + j] = x_data[cursor + j];
}
}
} else {
for (int i = 0; i < batch_size; i++) { for (int i = 0; i < batch_size; i++) {
CvmComputeKernel(use_cvm, item_size, &x_data, &y_data); CvmComputeKernel(use_cvm, item_size, &x_data, &y_data);
} }
}
} else { } else {
auto lod = x->lod()[0]; auto lod = x->lod()[0];
for (size_t i = 0; i < lod.size() - 1; ++i) { for (size_t i = 0; i < lod.size() - 1; ++i) {
......
...@@ -222,6 +222,7 @@ class BipartiteMatchKernel : public framework::OpKernel<T> { ...@@ -222,6 +222,7 @@ class BipartiteMatchKernel : public framework::OpKernel<T> {
} else { } else {
auto lod = dist_mat->lod().back(); auto lod = dist_mat->lod().back();
for (size_t i = 0; i < lod.size() - 1; ++i) { for (size_t i = 0; i < lod.size() - 1; ++i) {
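          // Only run the matching on non-empty LoD segments; empty segments
          // are skipped.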
if (lod[i + 1] > lod[i]) {
Tensor one_ins = dist_mat->Slice(lod[i], lod[i + 1]); Tensor one_ins = dist_mat->Slice(lod[i], lod[i + 1]);
BipartiteMatch(one_ins, indices + i * col, dist + i * col); BipartiteMatch(one_ins, indices + i * col, dist + i * col);
if (type == "per_prediction") { if (type == "per_prediction") {
...@@ -230,6 +231,7 @@ class BipartiteMatchKernel : public framework::OpKernel<T> { ...@@ -230,6 +231,7 @@ class BipartiteMatchKernel : public framework::OpKernel<T> {
} }
} }
} }
}
}; };
class BipartiteMatchOpMaker : public framework::OpProtoAndCheckerMaker { class BipartiteMatchOpMaker : public framework::OpProtoAndCheckerMaker {
......
(Two file diffs are collapsed here and not shown.)
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h"
namespace paddle {
namespace operators {
using DDim = framework::DDim;
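// ComputeStride returns the distance, in elements, between two consecutive
// indices along `axis` for a row-major (C-contiguous) tensor of shape `dims`,
// i.e. the product of all dimensions after `axis`.
// For example, with dims = {2, 3, 4}: stride(0) = 12, stride(1) = 4,
// stride(2) = 1.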
static inline int ComputeStride(int axis, DDim dims) {
int size = 1;
for (int i = axis + 1; i < dims.size(); i++) {
size *= dims[i];
}
return size;
}
} // namespace operators
} // namespace paddle
...@@ -61,7 +61,7 @@ cc_test(varhandle_test SRCS varhandle_test.cc DEPS profiler scope) ...@@ -61,7 +61,7 @@ cc_test(varhandle_test SRCS varhandle_test.cc DEPS profiler scope)
cc_library(parameter_prefetch SRCS parameter_prefetch.cc DEPS sendrecvop_rpc memory) cc_library(parameter_prefetch SRCS parameter_prefetch.cc DEPS sendrecvop_rpc memory)
cc_library(parameter_send SRCS parameter_send.cc DEPS sendrecvop_rpc memory) cc_library(parameter_send SRCS parameter_send.cc DEPS sendrecvop_rpc memory)
cc_library(parameter_recv SRCS parameter_recv.cc DEPS sendrecvop_rpc memory) cc_library(parameter_recv SRCS parameter_recv.cc DEPS sendrecvop_rpc memory)
cc_library(communicator SRCS communicator.cc DEPS scope selected_rows tensor variable_helper selected_rows_functor simple_threadpool parameter_send parameter_recv) cc_library(communicator SRCS communicator.cc DEPS scope selected_rows tensor variable_helper selected_rows_functor simple_threadpool parameter_send parameter_recv generator)
cc_test(communicator_test SRCS communicator_test.cc DEPS communicator) cc_test(communicator_test SRCS communicator_test.cc DEPS communicator)
if(WITH_GPU) if(WITH_GPU)
cc_test(collective_server_test SRCS collective_server_test.cc cc_test(collective_server_test SRCS collective_server_test.cc
......
...@@ -44,7 +44,7 @@ class RecvSaveOp : public framework::OperatorWithKernel { ...@@ -44,7 +44,7 @@ class RecvSaveOp : public framework::OperatorWithKernel {
const framework::ExecutionContext &ctx) const override { const framework::ExecutionContext &ctx) const override {
return framework::OpKernelType( return framework::OpKernelType(
framework::proto::VarType::Type(ctx.Attr<int>("dtype")), framework::proto::VarType::Type(ctx.Attr<int>("dtype")),
ctx.GetPlace()); platform::CPUPlace());
} }
}; };
......
(The remaining file diffs in this commit are collapsed and not shown.)