Commit 4bfbc591 authored by G guosheng

Merge branch 'develop' of https://github.com/PaddlePaddle/paddle into enhance-ReshapeOp

@@ -36,6 +36,7 @@ include(simd)
################################ Configurations #######################################
option(WITH_GPU "Compile PaddlePaddle with NVIDIA GPU" ${CUDA_FOUND})
+option(WITH_AMD_GPU "Compile PaddlePaddle with AMD GPU" OFF)
option(WITH_AVX "Compile PaddlePaddle with AVX intrinsics" ${AVX_FOUND})
option(WITH_MKL "Compile PaddlePaddle with MKL support." ${AVX_FOUND})
option(WITH_DSO "Compile PaddlePaddle with dynamic linked CUDA" ON)
@@ -180,6 +181,11 @@ if(WITH_GPU)
  include(cuda)
endif(WITH_GPU)
+if(WITH_AMD_GPU)
+  find_package(HIP)
+  include(hip)
+endif(WITH_AMD_GPU)
if(WITH_MKLML)
  list(APPEND EXTERNAL_LIBS ${MKLML_IOMP_LIB})
endif()
......
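With the new `WITH_AMD_GPU` option in place, an AMD build is selected at configure time. A minimal, hypothetical configure invocation (the flag names come from the options above; the build directory is a placeholder):

```
# Hypothetical: configure a ROCm/HIP build instead of a CUDA build.
cmake .. -DWITH_AMD_GPU=ON -DWITH_GPU=OFF
```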
@@ -57,11 +57,7 @@ if(NOT WITH_GOLANG)
  add_definitions(-DPADDLE_WITHOUT_GOLANG)
endif(NOT WITH_GOLANG)
-if(NOT WITH_GPU)
-  add_definitions(-DHPPL_STUB_FUNC)
-  list(APPEND CMAKE_CXX_SOURCE_FILE_EXTENSIONS cu)
-else()
+if(WITH_GPU)
  add_definitions(-DPADDLE_WITH_CUDA)
  FIND_PACKAGE(CUDA REQUIRED)
@@ -84,7 +80,14 @@ else()
  # Include cuda and cudnn
  include_directories(${CUDNN_INCLUDE_DIR})
  include_directories(${CUDA_TOOLKIT_INCLUDE})
-endif(NOT WITH_GPU)
+elseif(WITH_AMD_GPU)
+  add_definitions(-DPADDLE_WITH_HIP)
+  set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -D__HIP_PLATFORM_HCC__")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D__HIP_PLATFORM_HCC__")
+else()
+  add_definitions(-DHPPL_STUB_FUNC)
+  list(APPEND CMAKE_CXX_SOURCE_FILE_EXTENSIONS cu)
+endif()
if (WITH_MKLML AND MKLML_IOMP_LIB)
  message(STATUS "Enable Intel OpenMP with ${MKLML_IOMP_LIB}")
......
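The definitions set above are what device-specific code paths key on. A minimal illustrative sketch (the guarded bodies are placeholders, not code from this change):

```
// C++ sketch, illustrative only: branching on the compile definitions above.
#if defined(PADDLE_WITH_CUDA)
// built for NVIDIA GPUs; CUDA and cuDNN headers are available
#elif defined(PADDLE_WITH_HIP)
// built for AMD GPUs via HIP/ROCm; __HIP_PLATFORM_HCC__ is also defined
#else
// CPU-only build; HPPL_STUB_FUNC stubs out the GPU entry points
#endif
```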
@@ -24,7 +24,7 @@ set(BOOST_PROJECT "extern_boost")
# So we use 1.41.0 here.
set(BOOST_VER "1.41.0")
set(BOOST_TAR "boost_1_41_0")
-set(BOOST_URL "http://paddlepaddledeps.s3-website-us-west-1.amazonaws.com/${BOOST_TAR}.tar.gz")
+set(BOOST_URL "http://paddlepaddledeps.bj.bcebos.com/${BOOST_TAR}.tar.gz")
set(BOOST_SOURCES_DIR ${THIRD_PARTY_PATH}/boost)
set(BOOST_DOWNLOAD_DIR "${BOOST_SOURCES_DIR}/src/${BOOST_PROJECT}")
set(BOOST_INCLUDE_DIR "${BOOST_DOWNLOAD_DIR}/${BOOST_TAR}" CACHE PATH "boost include directory." FORCE)
......
@@ -4,18 +4,33 @@ SET(EIGEN_SOURCE_DIR ${THIRD_PARTY_PATH}/eigen3)
SET(EIGEN_INCLUDE_DIR ${EIGEN_SOURCE_DIR}/src/extern_eigen3)
INCLUDE_DIRECTORIES(${EIGEN_INCLUDE_DIR})
-ExternalProject_Add(
-    extern_eigen3
-    ${EXTERNAL_PROJECT_LOG_ARGS}
-    GIT_REPOSITORY "https://github.com/RLovelett/eigen.git"
-    GIT_TAG 70661066beef694cadf6c304d0d07e0758825c10
-    PREFIX ${EIGEN_SOURCE_DIR}
-    UPDATE_COMMAND ""
-    CONFIGURE_COMMAND ""
-    BUILD_COMMAND ""
-    INSTALL_COMMAND ""
-    TEST_COMMAND ""
-)
+if(WITH_AMD_GPU)
+    ExternalProject_Add(
+        extern_eigen3
+        ${EXTERNAL_PROJECT_LOG_ARGS}
+        GIT_REPOSITORY "https://github.com/sabreshao/hipeigen.git"
+        GIT_TAG 0cba03ff9f8f9f70bbd92ac5857b031aa8fed6f9
+        PREFIX ${EIGEN_SOURCE_DIR}
+        UPDATE_COMMAND ""
+        CONFIGURE_COMMAND ""
+        BUILD_COMMAND ""
+        INSTALL_COMMAND ""
+        TEST_COMMAND ""
+    )
+else()
+    ExternalProject_Add(
+        extern_eigen3
+        ${EXTERNAL_PROJECT_LOG_ARGS}
+        GIT_REPOSITORY "https://github.com/RLovelett/eigen.git"
+        GIT_TAG 70661066beef694cadf6c304d0d07e0758825c10
+        PREFIX ${EIGEN_SOURCE_DIR}
+        UPDATE_COMMAND ""
+        CONFIGURE_COMMAND ""
+        BUILD_COMMAND ""
+        INSTALL_COMMAND ""
+        TEST_COMMAND ""
+    )
+endif()
if (${CMAKE_VERSION} VERSION_LESS "3.3.0")
    set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/eigen3_dummy.c)
......
@@ -34,7 +34,7 @@ SET(MKLML_DOWNLOAD_DIR "${MKLML_SOURCE_DIR}/src/${MKLML_PROJECT}")
SET(MKLML_DST_DIR "mklml")
SET(MKLML_INSTALL_ROOT "${THIRD_PARTY_PATH}/install")
SET(MKLML_INSTALL_DIR ${MKLML_INSTALL_ROOT}/${MKLML_DST_DIR})
-SET(MKLML_ROOT ${MKLML_INSTALL_DIR}/${MKLML_VER})
+SET(MKLML_ROOT ${MKLML_INSTALL_DIR})
SET(MKLML_INC_DIR ${MKLML_ROOT}/include)
SET(MKLML_LIB_DIR ${MKLML_ROOT}/lib)
SET(MKLML_LIB ${MKLML_LIB_DIR}/libmklml_intel.so)
@@ -46,7 +46,7 @@ INCLUDE_DIRECTORIES(${MKLML_INC_DIR})
FILE(WRITE ${MKLML_DOWNLOAD_DIR}/CMakeLists.txt
  "PROJECT(MKLML)\n"
  "cmake_minimum_required(VERSION 3.0)\n"
-  "install(DIRECTORY ${MKLML_VER}\n"
+  "install(DIRECTORY ${MKLML_VER}/include ${MKLML_VER}/lib \n"
  "        DESTINATION ${MKLML_DST_DIR})\n")
ExternalProject_Add(
......
@@ -317,6 +317,82 @@ function(nv_test TARGET_NAME)
  endif()
endfunction(nv_test)
+function(hip_library TARGET_NAME)
+  if (WITH_AMD_GPU)
+    set(options STATIC static SHARED shared)
+    set(oneValueArgs "")
+    set(multiValueArgs SRCS DEPS)
+    cmake_parse_arguments(hip_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+    set(_sources ${hip_library_SRCS})
+    HIP_PREPARE_TARGET_COMMANDS(${TARGET_NAME} OBJ _generated_files _source_files ${_sources} HIPCC_OPTIONS ${_hipcc_options} HCC_OPTIONS ${_hcc_options} NVCC_OPTIONS ${_nvcc_options})
+    if(_source_files)
+      list(REMOVE_ITEM _sources ${_source_files})
+    endif()
+    if(hip_library_SRCS)
+      if (hip_library_SHARED OR hip_library_shared) # build *.so
+        add_library(${TARGET_NAME} SHARED ${_cmake_options} ${_generated_files} ${_sources})
+        set_target_properties(${TARGET_NAME} PROPERTIES LINKER_LANGUAGE HIP)
+      else()
+        add_library(${TARGET_NAME} STATIC ${_cmake_options} ${_generated_files} ${_sources})
+        set_target_properties(${TARGET_NAME} PROPERTIES LINKER_LANGUAGE CXX)
+        target_link_libraries(${TARGET_NAME} /opt/rocm/hip/lib/libhip_hcc.so /opt/rocm/hip/lib/libhip_device.a)
+        find_fluid_modules(${TARGET_NAME})
+      endif()
+      if (hip_library_DEPS)
+        add_dependencies(${TARGET_NAME} ${hip_library_DEPS})
+        target_link_libraries(${TARGET_NAME} ${hip_library_DEPS})
+      endif()
+      # cpplint code style
+      foreach(source_file ${hip_library_SRCS})
+        string(REGEX REPLACE "\\.[^.]*$" "" source ${source_file})
+        if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
+          list(APPEND hip_library_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h)
+        endif()
+      endforeach()
+      add_style_check_target(${TARGET_NAME} ${hip_library_SRCS} ${hip_library_HEADERS})
+    else(hip_library_SRCS)
+      if (hip_library_DEPS)
+        merge_static_libs(${TARGET_NAME} ${hip_library_DEPS})
+      else()
+        message(FATAL_ERROR "Please specify source file or library in hip_library.")
+      endif()
+    endif(hip_library_SRCS)
+  endif()
+endfunction(hip_library)
+function(hip_binary TARGET_NAME)
+  if (WITH_AMD_GPU)
+    set(options "")
+    set(oneValueArgs "")
+    set(multiValueArgs SRCS DEPS)
+    cmake_parse_arguments(hip_binary "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+    hip_add_executable(${TARGET_NAME} ${hip_binary_SRCS})
+    if(hip_binary_DEPS)
+      target_link_libraries(${TARGET_NAME} ${hip_binary_DEPS})
+      add_dependencies(${TARGET_NAME} ${hip_binary_DEPS})
+    endif()
+  endif()
+endfunction(hip_binary)
+function(hip_test TARGET_NAME)
+  if (WITH_AMD_GPU AND WITH_TESTING)
+    set(options "")
+    set(oneValueArgs "")
+    set(multiValueArgs SRCS DEPS)
+    cmake_parse_arguments(hip_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
+    set(_sources ${hip_test_SRCS})
+    HIP_PREPARE_TARGET_COMMANDS(${TARGET_NAME} OBJ _generated_files _source_files ${_sources} HIPCC_OPTIONS ${_hipcc_options} HCC_OPTIONS ${_hcc_options} NVCC_OPTIONS ${_nvcc_options})
+    if(_source_files)
+      list(REMOVE_ITEM _sources ${_source_files})
+    endif()
+    add_executable(${TARGET_NAME} ${_cmake_options} ${_generated_files} ${_sources})
+    set_target_properties(${TARGET_NAME} PROPERTIES LINKER_LANGUAGE HIP)
+    target_link_libraries(${TARGET_NAME} ${hip_test_DEPS} paddle_gtest_main paddle_memory gtest gflags)
+    add_dependencies(${TARGET_NAME} ${hip_test_DEPS} paddle_gtest_main paddle_memory gtest gflags)
+    add_test(${TARGET_NAME} ${TARGET_NAME})
+  endif()
+endfunction(hip_test)
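These helpers mirror the existing nv_library/nv_binary/nv_test wrappers, so operator targets can switch between CUDA and HIP builds without changing their CMake calls. A hypothetical usage sketch (target and file names are placeholders):

```
# Hypothetical targets; *.hip.cu sources are compiled with hipcc.
hip_library(my_hip_lib SRCS my_kernel.hip.cu DEPS tensor)
hip_binary(my_hip_tool SRCS my_tool.hip.cu DEPS my_hip_lib)
hip_test(my_hip_lib_test SRCS my_hip_lib_test.hip.cu DEPS my_hip_lib)
```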
function(go_library TARGET_NAME)
  set(options STATIC static SHARED shared)
  set(oneValueArgs "")
......
if(NOT WITH_AMD_GPU)
return()
endif()
include_directories("/opt/rocm/include")
include_directories("/opt/rocm/hipblas/include")
include_directories("/opt/rocm/hiprand/include")
include_directories("/opt/rocm/rocrand/include")
include_directories("/opt/rocm/rccl/include")
include_directories("/opt/rocm/thrust")
list(APPEND EXTERNAL_LIBS "-L/opt/rocm/lib/ -lhip_hcc")
set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -fPIC -DPADDLE_WITH_HIP -std=c++14")
if(WITH_DSO)
set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_USE_DSO")
endif(WITH_DSO)
if(WITH_DOUBLE)
set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_TYPE_DOUBLE")
endif(WITH_DOUBLE)
if(WITH_TESTING)
set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_WITH_TESTING")
endif(WITH_TESTING)
if(CMAKE_BUILD_TYPE STREQUAL "Debug")
list(APPEND HIP_HCC_FLAGS ${CMAKE_CXX_FLAGS_DEBUG})
elseif(CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo")
list(APPEND HIP_HCC_FLAGS ${CMAKE_CXX_FLAGS_RELWITHDEBINFO})
elseif(CMAKE_BUILD_TYPE STREQUAL "MinSizeRel")
list(APPEND HIP_HCC_FLAGS ${CMAKE_CXX_FLAGS_MINSIZEREL})
endif()
if("x${HCC_HOME}" STREQUAL "x")
set(HCC_HOME "/opt/rocm/hcc")
endif()
set(CMAKE_HIP_LINK_EXECUTABLE "${HIP_HIPCC_CMAKE_LINKER_HELPER} ${HCC_HOME} <FLAGS> <CMAKE_CXX_LINK_FLAGS> <LINK_FLAGS> <OBJECTS> -o <TARGET> <LINK_LIBRARIES>")
set(CMAKE_HIP_CREATE_SHARED_LIBRARY "${HIP_HIPCC_CMAKE_LINKER_HELPER} ${HCC_HOME} <CMAKE_CXX_LINK_FLAGS> <LINK_FLAGS> <OBJECTS> -o <TARGET> <LINK_LIBRARIES> -shared")
set(CMAKE_HIP_CREATE_SHARED_MODULE "${HIP_HIPCC_CMAKE_LINKER_HELPER} ${HCC_HOME} <CMAKE_CXX_LINK_FLAGS> <LINK_FLAGS> <OBJECTS> -o <TARGET> <LINK_LIBRARIES> -shared")
@@ -69,6 +69,12 @@ if(NOT CBLAS_FOUND)
    SRCS ${CBLAS_INSTALL_DIR}/lib ${CBLAS_INSTALL_DIR}/include
    DSTS ${dst_dir} ${dst_dir}
  )
+elseif (WITH_MKLML)
+  set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/mklml")
+  copy(mklml_lib
+    SRCS ${MKLML_LIB} ${MKLML_IOMP_LIB} ${MKLML_INC_DIR}
+    DSTS ${dst_dir}/lib ${dst_dir}/lib ${dst_dir}
+  )
endif()
# paddle fluid module
......
# go_op Design
## Introduction
The **go_op** allows users of PaddlePaddle to run program blocks on a detached
thread. It works in conjunction with CSP operators (channel_send,
channel_receive, channel_open, channel_close, and select) to allow users to
concurrently process data and communicate easily between different threads.
## How to use it
```
channel = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR)
with fluid.Go():
# Send a tensor of value 99 to "channel" on a detached thread
tensor = fill_constant(shape=[1], dtype='int', value=99)
tensor.stop_gradient = True
fluid.channel_send(channel, tensor)
# Receive sent tensor from "channel" on the main thread
result = fill_constant(shape=[1], dtype='int', value=-1)
fluid.channel_recv(channel, result)
```
The go operator can be accessed by using the fluid.Go() control flow. This
will create a new sub block, where the user can add additional operators
to be run on the thread.
**Note:** Since back propagation is currently not supported in the go_op, users
should ensure that operators in the go block do not require gradient
calculations.
## How it Works
Similar to other control blocks, go_op will create a sub block and add it
as a child to the current block. Operators and variables defined in this
block will be added to the go sub_block.
In addition, the go operator will create a new child scope whose parent is
the global scope. Please refer to [block captures](#block-captures) for more
information.
When the Paddle executor runs go_op, go_op will take the sub_block and pass it to
the executor.run method (along with a newly created local scope) on a detached
thread, as sketched below.
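The sketch below is a minimal illustration only, assuming the `Executor::Run(program, scope, block_id, create_local_scope, create_vars)` signature used in the concurrency test later in this change; `GoOpRunImpl` and the flag values are placeholders, not the actual operator code.

```
// C++ sketch: run the sub-block on a detached thread with a fresh child scope.
#include <thread>

void GoOpRunImpl(paddle::framework::Executor *executor,
                 const paddle::framework::ProgramDesc &program,
                 paddle::framework::Scope *parent_scope, int sub_block_id) {
  // Child scope, so the detached block can outlive its parent block.
  paddle::framework::Scope *local_scope = &parent_scope->NewScope();
  std::thread([=]() {
    executor->Run(program, local_scope, sub_block_id,
                  false /*create_local_scope*/, false /*create_vars*/);
  }).detach();  // go_op returns immediately; the thread is never joined
}
```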
An example of the generated program description is shown below. Take note of
the **go_op** in particular. It is added as an operator in the current
block (in this example, block0). The **go_op** contains a `sub_block`
attribute, which points to the id of the block that will be executed in a
detached thread.
```
blocks {
idx: 0
parent_idx: -1
vars {
name: "return_value"
type {
type: LOD_TENSOR
lod_tensor {
tensor {
data_type: INT64
}
}
}
}
vars {
name: "status_recv"
type {
type: LOD_TENSOR
lod_tensor {
tensor {
data_type: BOOL
}
}
}
}
...
ops {
outputs {
parameter: "Out"
arguments: "channel"
}
type: "channel_create"
attrs {
name: "data_type"
type: INT
i: 7
}
attrs {
name: "capacity"
type: INT
i: 0
}
}
ops {
inputs {
parameter: "X"
arguments: "channel"
}
type: "go"
attrs {
name: "sub_block"
type: BLOCK
block_idx: 1
}
}
ops {
inputs {
parameter: "Channel"
arguments: "channel"
}
outputs {
parameter: "Out"
arguments: "return_value"
}
outputs {
parameter: "Status"
arguments: "status_recv"
}
type: "channel_recv"
}
...
}
blocks {
idx: 1
parent_idx: 0
vars {
name: "status"
type {
type: LOD_TENSOR
lod_tensor {
tensor {
data_type: BOOL
}
}
}
}
...
ops {
outputs {
parameter: "Out"
arguments: "fill_constant_1.tmp_0"
}
type: "fill_constant"
attrs {
name: "force_cpu"
type: BOOLEAN
b: false
}
attrs {
name: "value"
type: FLOAT
f: 99.0
}
attrs {
name: "shape"
type: INTS
ints: 1
}
attrs {
name: "dtype"
type: INT
i: 3
}
}
ops {
inputs {
parameter: "Channel"
arguments: "channel"
}
inputs {
parameter: "X"
arguments: "fill_constant_1.tmp_0"
}
outputs {
parameter: "Status"
arguments: "status"
}
type: "channel_send"
attrs {
name: "copy"
type: BOOLEAN
b: false
}
}
```
## Current Limitations
#### <a name="block-captures"></a>Scopes and block captures:
Paddle utilizes [scopes](./../concepts/scope.md) to store variables used in a
block. When a block is executed, a new local scope is created from the parent
scope (i.e. the scope derived from the parent block) and associated with the new
child block. After the block finishes executing, the local scope and
all associated variables in the scope are deleted.
This works well in a single-threaded scenario; however, with the introduction of
go_op, a child block may continue to execute even after the parent block has
exited. If the go_op tries to access variables located in the parent block's
scope, it may receive a segmentation fault because the parent scope may have
been deleted.
We need to implement block closures in order to prevent access to parent
scope variables from causing a segmentation fault. As a temporary workaround,
please ensure that all variables accessed in the go block are not destructed
before they are accessed. Currently, the go_op will explicitly enforce
this requirement and raise an exception if a variable could not be found in
the scope.
Please refer to [Closure issue](https://github.com/PaddlePaddle/Paddle/issues/8502)
for more details.
#### Green Threads
Golang utilizes `green threads`, which are a mechanism for the runtime library to
manage multiple threads (instead of natively by the OS). Green threads usually
allow for faster thread creation and switching, as there is less overhead
when spawning these threads. For the first version of CSP, we only support
OS threads.
#### Backward Propagation:
go_op currently does not support backward propagation. Please use go_op with
non-training operators.
Development
------------
PaddlePaddle adheres to the following three sections of code and documentation specifications.

PaddlePaddle uses Git for version control, and Docker for its build and test environment. The code base includes CUDA, C++, Python, Shell, and other programming languages, which must comply with the Google C++ Style Guide and PEP 8; style is checked by an automatic inspection tool, and code comments need to follow the Doxygen specification. Code that does not meet the style requirements will fail to compile. We provide the following guidelines for the use of Git, build tests, and code development.

.. toctree::
   :maxdepth: 1

   contribute_to_paddle_en.md
PaddlePaddle is well documented in English and Chinese. We recommend using the English version of the documents and problem descriptions. The design documents focus on problem descriptions and backgrounds, followed by solutions. As documents are generated by Sphinx, code comments should comply with the Sphinx documentation standard. We recommend using the paddlepaddle.org tool to compile, generate, and preview documents locally. Please refer to:

.. toctree::
   :maxdepth: 1

   write_docs_en.rst
PaddlePaddle V2 defines new operations by adding new Layers. You can implement various complex layers by combining basic APIs to satisfy most applications. If you want to customize a layer, please refer to the following, and you are welcome to propose a patch:

.. toctree::
   :maxdepth: 1

   new_layer_en.rst
@@ -6,32 +6,32 @@ PaddlePaddle provides the users the ability to flexibly set various command line
.. toctree::
   :maxdepth: 1

-   cmd_parameter/index_cn.rst
+   cmd_parameter/index_en.rst

PaddlePaddle supports distributed training tasks on fabric clusters, MPI clusters, and Kubernetes clusters. For detailed configuration and usage instructions, refer to:

.. toctree::
   :maxdepth: 1

-   cluster/index_cn.rst
+   cluster/index_en.rst

PaddlePaddle provides a C-API for inference. We provide the following guidelines for using the C-API:

.. toctree::
   :maxdepth: 1

-   capi/index_cn.rst
+   capi/index_en.rst

PaddlePaddle supports a variety of flexible and efficient recurrent neural networks. For details, please refer to:

.. toctree::
   :maxdepth: 1

-   rnn/index_cn.rst
+   rnn/index_en.rst

For how to use the built-in timing tool, nvprof, or nvvp to run performance analysis and tuning, please refer to:

.. toctree::
   :maxdepth: 1

-   optimization/gpu_profiling_cn.rst
+   optimization/gpu_profiling_en.rst
@@ -34,7 +34,7 @@ class Channel {
 public:
  virtual bool CanSend() = 0;
  virtual bool CanReceive() = 0;
-  virtual bool Send(T*) = 0;
+  virtual void Send(T*) = 0;
  virtual bool Receive(T*) = 0;
  virtual size_t Cap() = 0;
  virtual void Lock() = 0;
@@ -84,69 +84,81 @@ class ChannelHolder {
  }

  template <typename T>
-  bool Send(T* data) {
-    if (!IsInitialized()) return false;
-    PADDLE_ENFORCE_EQ(holder_->Type(), std::type_index(typeid(T)));
+  void Send(T* data) {
+    PADDLE_ENFORCE_EQ(IsInitialized(), true,
+                      "The Channel hasn't been initialized");
+    PADDLE_ENFORCE_EQ(
+        holder_->Type(), std::type_index(typeid(T)),
+        "Channel type is not same as the type of the data being sent");
    // Static cast should be safe because we have ensured that types are same
    Channel<T>* channel = static_cast<Channel<T>*>(holder_->Ptr());
-    return channel != nullptr ? channel->Send(data) : false;
+    PADDLE_ENFORCE_EQ(channel != nullptr, true, "Channel should not be null.");
+    channel->Send(data);
  }

  template <typename T>
  bool Receive(T* data) {
-    if (!IsInitialized()) return false;
-    PADDLE_ENFORCE_EQ(holder_->Type(), std::type_index(typeid(T)));
+    PADDLE_ENFORCE_EQ(IsInitialized(), true,
+                      "The Channel hasn't been initialized");
+    PADDLE_ENFORCE_EQ(
+        holder_->Type(), std::type_index(typeid(T)),
+        "Channel type is not same as the type of the data being sent");
    Channel<T>* channel = static_cast<Channel<T>*>(holder_->Ptr());
-    return channel != nullptr ? channel->Receive(data) : false;
+    PADDLE_ENFORCE_EQ(channel != nullptr, true, "Channel should not be null.");
+    return channel->Receive(data);
  }

  bool IsClosed() {
-    if (IsInitialized()) {
-      return holder_->IsClosed();
-    }
-    return false;
+    PADDLE_ENFORCE_EQ(IsInitialized(), true,
+                      "The Channel hasn't been initialized");
+    return holder_->IsClosed();
  }

  bool CanSend() {
-    if (IsInitialized()) {
-      return holder_->CanSend();
-    }
-    return false;
+    PADDLE_ENFORCE_EQ(IsInitialized(), true,
+                      "The Channel hasn't been initialized");
+    return holder_->CanSend();
  }

  bool CanReceive() {
-    if (IsInitialized()) {
-      return holder_->CanReceive();
-    }
-    return false;
+    PADDLE_ENFORCE_EQ(IsInitialized(), true,
+                      "The Channel hasn't been initialized");
+    return holder_->CanReceive();
  }

  void close() {
-    if (IsInitialized()) holder_->Close();
+    PADDLE_ENFORCE_EQ(IsInitialized(), true,
+                      "The Channel hasn't been initialized");
+    holder_->Close();
  }

  size_t Cap() {
-    if (IsInitialized()) return holder_->Cap();
-    return -1;
+    PADDLE_ENFORCE_EQ(IsInitialized(), true,
+                      "The Channel hasn't been initialized");
+    return holder_->Cap();
  }

  void Lock() {
-    if (IsInitialized()) holder_->Lock();
+    PADDLE_ENFORCE_EQ(IsInitialized(), true,
+                      "The Channel hasn't been initialized");
+    holder_->Lock();
  }

  void Unlock() {
-    if (IsInitialized()) holder_->Unlock();
+    PADDLE_ENFORCE_EQ(IsInitialized(), true,
+                      "The Channel hasn't been initialized");
+    holder_->Unlock();
  }

  template <typename T>
  void AddToSendQ(const void* referrer, T* data,
                  std::shared_ptr<std::condition_variable_any> cond,
                  std::function<bool(ChannelAction)> cb) {
-    if (IsInitialized()) {
-      Channel<T>* channel = static_cast<Channel<T>*>(holder_->Ptr());
-      if (channel != nullptr) {
-        channel->AddToSendQ(referrer, data, cond, cb);
-      }
+    PADDLE_ENFORCE_EQ(IsInitialized(), true,
+                      "The Channel hasn't been initialized");
+    Channel<T>* channel = static_cast<Channel<T>*>(holder_->Ptr());
+    if (channel != nullptr) {
+      channel->AddToSendQ(referrer, data, cond, cb);
    }
  }
@@ -154,26 +166,31 @@ class ChannelHolder {
  void AddToReceiveQ(const void* referrer, T* data,
                     std::shared_ptr<std::condition_variable_any> cond,
                     std::function<bool(ChannelAction)> cb) {
-    if (IsInitialized()) {
-      Channel<T>* channel = static_cast<Channel<T>*>(holder_->Ptr());
-      if (channel != nullptr) {
-        channel->AddToReceiveQ(referrer, data, cond, cb);
-      }
+    PADDLE_ENFORCE_EQ(IsInitialized(), true,
+                      "The Channel hasn't been initialized");
+    Channel<T>* channel = static_cast<Channel<T>*>(holder_->Ptr());
+    if (channel != nullptr) {
+      channel->AddToReceiveQ(referrer, data, cond, cb);
    }
  }

  void RemoveFromSendQ(const void* referrer) {
-    if (IsInitialized()) holder_->RemoveFromSendQ(referrer);
+    PADDLE_ENFORCE_EQ(IsInitialized(), true,
+                      "The Channel hasn't been initialized");
+    holder_->RemoveFromSendQ(referrer);
  }

  void RemoveFromReceiveQ(const void* referrer) {
-    if (IsInitialized()) holder_->RemoveFromReceiveQ(referrer);
+    PADDLE_ENFORCE_EQ(IsInitialized(), true,
+                      "The Channel hasn't been initialized");
+    holder_->RemoveFromReceiveQ(referrer);
  }

  inline bool IsInitialized() const { return holder_ != nullptr; }

  inline const std::type_index Type() {
-    PADDLE_ENFORCE_EQ(IsInitialized(), true);
+    PADDLE_ENFORCE_EQ(IsInitialized(), true,
+                      "The Channel hasn't been initialized");
    return holder_->Type();
  }
......
@@ -31,7 +31,7 @@ class ChannelImpl : public paddle::framework::Channel<T> {
 public:
  virtual bool CanSend();
  virtual bool CanReceive();
-  virtual bool Send(T *);
+  virtual void Send(T *);
  virtual bool Receive(T *);
  virtual size_t Cap() { return cap_; }
  virtual void Lock();
@@ -76,10 +76,9 @@ class ChannelImpl : public paddle::framework::Channel<T> {
    }
  };

-  bool send_return(bool value) {
+  void send_return() {
    send_ctr--;
    destructor_cond_.notify_all();
-    return value;
  }

  bool recv_return(bool value) {
@@ -118,15 +117,15 @@ bool ChannelImpl<T>::CanReceive() {
}

template <typename T>
-bool ChannelImpl<T>::Send(T *item) {
+void ChannelImpl<T>::Send(T *item) {
  send_ctr++;
  std::unique_lock<std::recursive_mutex> lock{mu_};

-  // If channel is closed, do nothing
+  // If channel is closed, throw exception
  if (closed_) {
    lock.unlock();
-    // TODO(abhinavarora) Should panic on closed channel
-    return send_return(false);
+    send_return();
+    PADDLE_THROW("Cannot send on closed channel");
  }

  // If there is a receiver, directly pass the value we want
@@ -143,7 +142,7 @@ bool ChannelImpl<T>::Send(T *item) {
    if (m->callback != nullptr) do_send = m->callback(ChannelAction::SEND);
    if (do_send)
      *(m->data) = std::move(*item);
-    else
+    else {
      // We cannot do the data transfer because
      // this QueueMessage was added by Select
      // and some other case was executed.
@@ -151,12 +150,17 @@ bool ChannelImpl<T>::Send(T *item) {
      // We do not care about notifying other
      // because they would have been notified
      // by the executed select case.
-      return send_return(Send(item));
+      lock.unlock();
+      Send(item);
+      send_return();
+      return;
+    }

    // Wake up the blocked process and unlock
    m->Notify();
    lock.unlock();
-    return send_return(true);
+    send_return();
+    return;
  }

  // Unbuffered channel will always bypass this
@@ -167,7 +171,8 @@ bool ChannelImpl<T>::Send(T *item) {
    buf_.push_back(std::move(*item));
    // Release lock and return true
    lock.unlock();
-    return send_return(true);
+    send_return();
+    return;
  }

  // Block on channel, because some receiver will complete
@@ -175,8 +180,12 @@ bool ChannelImpl<T>::Send(T *item) {
  auto m = std::make_shared<QueueMessage>(item);
  sendq.push_back(m);
  m->Wait(lock);
-  // TODO(abhinavarora) Should panic on closed channel
-  return send_return(!m->chan_closed);
+  if (m->chan_closed) {
+    lock.unlock();
+    send_return();
+    PADDLE_THROW("Cannot send on closed channel");
+  }
+  send_return();
}

template <typename T>
......
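For callers, the practical effect is that Send no longer reports failure through a return value; sending on a closed (or uninitialized) channel now surfaces as a `paddle::platform::EnforceNotMet` exception. A minimal sketch using the `MakeChannel`/`CloseChannel` helpers exercised by the tests below:

```
// C++ sketch of the new Send contract: void return, throws when closed.
auto ch = paddle::framework::MakeChannel<int>(10);  // buffered, capacity 10
int value = 1;
ch->Send(&value);                      // succeeds: the buffer has room
paddle::framework::CloseChannel(ch);
try {
  ch->Send(&value);                    // now throws instead of returning false
} catch (paddle::platform::EnforceNotMet &e) {
  // handle the closed-channel error
}
delete ch;
```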
@@ -16,7 +16,6 @@ limitations under the License. */
#include <chrono>
#include <thread>
#include "gtest/gtest.h"

using paddle::framework::Channel;
@@ -41,7 +40,7 @@ void RecevingOrderEqualToSendingOrder(Channel<int> *ch) {
  unsigned sum_send = 0;
  std::thread t([&]() {
    for (int i = 0; i < 5; i++) {
-      EXPECT_EQ(ch->Send(&i), true);
+      ch->Send(&i);
      sum_send += i;
    }
  });
@@ -61,7 +60,7 @@ TEST(Channel, SufficientBufferSizeDoesntBlock) {
  const size_t buffer_size = 10;
  auto ch = MakeChannel<size_t>(buffer_size);
  for (size_t i = 0; i < buffer_size; ++i) {
-    EXPECT_EQ(ch->Send(&i), true);  // should not block
+    ch->Send(&i);
  }

  size_t out;
@@ -82,7 +81,7 @@ void SendReceiveWithACloseChannelShouldPanic(Channel<size_t> *ch) {
  const size_t data = 5;
  std::thread send_thread{[&]() {
    size_t i = data;
-    EXPECT_EQ(ch->Send(&i), true);  // should not block
+    ch->Send(&i);  // should not block
  }};

  std::thread recv_thread{[&]() {
@@ -94,12 +93,18 @@ void SendReceiveWithACloseChannelShouldPanic(Channel<size_t> *ch) {
  send_thread.join();
  recv_thread.join();

-  // After closing send should return false. Receive should
-  // also return false as there is no data in queue.
+  // After closing, send should panic. Receive should
+  // also return false as there is no data in queue.
  CloseChannel(ch);
  send_thread = std::thread{[&]() {
    size_t i = data;
-    EXPECT_EQ(ch->Send(&i), false);  // should return false
+    bool is_exception = false;
+    try {
+      ch->Send(&i);
+    } catch (paddle::platform::EnforceNotMet e) {
+      is_exception = true;
+    }
+    EXPECT_EQ(is_exception, true);
  }};
  recv_thread = std::thread{[&]() {
    size_t i;
@@ -129,7 +134,7 @@ TEST(Channel, ReceiveFromBufferedChannelReturnResidualValuesTest) {
  auto ch = MakeChannel<size_t>(buffer_size);

  for (size_t i = 0; i < buffer_size; ++i) {
-    EXPECT_EQ(ch->Send(&i), true);  // sending should not block
+    ch->Send(&i);  // sending should not block
  }

  size_t out;
@@ -160,9 +165,16 @@ TEST(Channel, ConcurrentSendNonConcurrentReceiveWithSufficientBufferSize) {
    // Try to write more than buffer size.
    for (size_t i = 0; i < 2 * buffer_size; ++i) {
      if (i < buffer_size)
-        EXPECT_EQ(ch->Send(&i), true);  // should block after 10 iterations
-      else
-        EXPECT_EQ(ch->Send(&i), false);
+        ch->Send(&i);  // should block after 10 iterations
+      else {
+        bool is_exception = false;
+        try {
+          ch->Send(&i);
+        } catch (paddle::platform::EnforceNotMet e) {
+          is_exception = true;
+        }
+        EXPECT_EQ(is_exception, true);
+      }
    }
  });
  std::this_thread::sleep_for(std::chrono::milliseconds(200));  // wait 0.2 sec
@@ -231,7 +243,13 @@ void ChannelCloseUnblocksSendersTest(Channel<int> *ch, bool isBuffered) {
    t[i] = std::thread(
        [&](bool *ended, bool *success) {
          int data = 10;
-          *success = ch->Send(&data);
+          bool is_exception = false;
+          try {
+            ch->Send(&data);
+          } catch (paddle::platform::EnforceNotMet e) {
+            is_exception = true;
+          }
+          *success = !is_exception;
          *ended = true;
        },
        &thread_ended[i], &send_success[i]);
@@ -316,8 +334,11 @@ TEST(Channel, UnbufferedLessReceiveMoreSendTest) {
    // Try to send more number of times
    // than receivers
    for (int i = 0; i < 4; i++) {
-      ch->Send(&i);
-      sum_send += i;
+      try {
+        ch->Send(&i);
+        sum_send += i;
+      } catch (paddle::platform::EnforceNotMet e) {
+      }
    }
  });
  for (int i = 0; i < 3; i++) {
@@ -382,7 +403,13 @@ void ChannelDestroyUnblockSenders(Channel<int> *ch, bool isBuffered) {
    t[i] = std::thread(
        [&](bool *ended, bool *success) {
          int data = 10;
-          *success = ch->Send(&data);
+          bool is_exception = false;
+          try {
+            ch->Send(&data);
+          } catch (paddle::platform::EnforceNotMet e) {
+            is_exception = true;
+          }
+          *success = !is_exception;
          *ended = true;
        },
        &thread_ended[i], &send_success[i]);
@@ -508,7 +535,7 @@ void ChannelHolderSendReceive(ChannelHolder *ch) {
  unsigned sum_send = 0;
  std::thread t([&]() {
    for (int i = 0; i < 5; i++) {
-      EXPECT_EQ(ch->Send(&i), true);
+      ch->Send(&i);
      sum_send += i;
    }
  });
@@ -541,8 +568,22 @@ TEST(ChannelHolder, ChannelUninitializedTest) {
  ChannelHolder *ch = new ChannelHolder();
  EXPECT_EQ(ch->IsInitialized(), false);
  int i = 10;
-  EXPECT_EQ(ch->Send(&i), false);
-  EXPECT_EQ(ch->Receive(&i), false);
+  bool send_exception = false;
+  try {
+    ch->Send(&i);
+  } catch (paddle::platform::EnforceNotMet e) {
+    send_exception = true;
+  }
+  EXPECT_EQ(send_exception, true);
+
+  bool recv_exception = false;
+  try {
+    ch->Receive(&i);
+  } catch (paddle::platform::EnforceNotMet e) {
+    recv_exception = true;
+  }
+  EXPECT_EQ(recv_exception, true);

  bool is_exception = false;
  try {
    ch->Type();
@@ -669,7 +710,13 @@ void ChannelHolderCloseUnblocksSendersTest(ChannelHolder *ch, bool isBuffered) {
    t[i] = std::thread(
        [&](bool *ended, bool *success) {
          int data = 10;
-          *success = ch->Send(&data);
+          bool is_exception = false;
+          try {
+            ch->Send(&data);
+          } catch (paddle::platform::EnforceNotMet e) {
+            is_exception = true;
+          }
+          *success = !is_exception;
          *ended = true;
        },
        &thread_ended[i], &send_success[i]);
@@ -760,7 +807,13 @@ void ChannelHolderDestroyUnblockSenders(ChannelHolder *ch, bool isBuffered) {
    t[i] = std::thread(
        [&](bool *ended, bool *success) {
          int data = 10;
-          *success = ch->Send(&data);
+          bool is_exception = false;
+          try {
+            ch->Send(&data);
+          } catch (paddle::platform::EnforceNotMet e) {
+            is_exception = true;
+          }
+          *success = !is_exception;
          *ended = true;
        },
        &thread_ended[i], &send_success[i]);
......
@@ -150,8 +150,9 @@ void AddFibonacciSelect(Scope *scope, p::CPUPlace *place, ProgramDesc *program,
  // Select block
  AddOp("select", {{"X", {dataChanName, quitChanName}},
                   {"case_to_execute", {"caseToExecute"}}},
-        {}, {{"sub_block", casesBlock},
-             {"cases", std::vector<std::string>{case0Config, case1Config}}},
+        {{"Out", {}}},
+        {{"sub_block", casesBlock},
+         {"cases", std::vector<std::string>{case0Config, case1Config}}},
        whileBlock);

  scope->Var("stepScopes");
@@ -209,9 +210,8 @@ TEST(Concurrency, Go_Op) {
  executor.Run(program, &scope, 0, true, true);

-  // After we call executor.run, the Go operator should do a channel_send to set
-  // the
-  // "result" variable to 99
+  // After we call executor.run, the Go operator should do a channel_send to
+  // set the "result" variable to 99.
  auto *finalData = tensor.data<int>();
  EXPECT_EQ(finalData[0], 99);
}
......
@@ -45,10 +45,11 @@ class Tensor {
  friend struct EigenVector;

 public:
-  Tensor() : offset_(0) {}
+  Tensor() : offset_(0), is_pinned_(false) {}

  /*! Constructor with place should only be used in pybind. */
-  explicit Tensor(const platform::Place& place) : offset_(0) {
+  explicit Tensor(const platform::Place& place)
+      : offset_(0), is_pinned_(false) {
    holder_->set_place(place);
  }
@@ -69,11 +70,12 @@ class Tensor {
   * @note If not exist, then allocation.
   */
  template <typename T>
-  inline T* mutable_data(platform::Place place);
+  inline T* mutable_data(platform::Place place, bool is_pinned = false);

-  inline void* mutable_data(platform::Place place, std::type_index type);
+  inline void* mutable_data(platform::Place place, std::type_index type,
+                            bool is_pinned = false);

-  inline void* mutable_data(platform::Place place);
+  inline void* mutable_data(platform::Place place, bool is_pinned = false);

  /**
   * @brief Return a pointer to mutable memory block.
@@ -84,7 +86,8 @@ class Tensor {
   * @note If not exist, then allocation.
   */
  template <typename T>
-  inline T* mutable_data(DDim dims, platform::Place place);
+  inline T* mutable_data(DDim dims, platform::Place place,
+                         bool is_pinned = false);

  /*! Return the dimensions of the memory block. */
  inline const DDim& dims() const;
@@ -92,6 +95,9 @@ class Tensor {
  /*! Return the numel of the memory block. */
  inline int64_t numel() const;

+  /*! Return whether the memory block is pinned. */
+  inline bool isPinned() const;

  /*! Resize the dimensions of the memory block. */
  inline Tensor& Resize(const DDim& dims);
@@ -146,12 +152,14 @@ class Tensor {
  template <typename Place>
  struct PlaceholderImpl : public Placeholder {
-    PlaceholderImpl(Place place, size_t size, std::type_index type)
-        : ptr_(static_cast<uint8_t*>(memory::Alloc(place, size)),
-               memory::PODDeleter<uint8_t, Place>(place)),
+    PlaceholderImpl(Place place, size_t size, std::type_index type,
+                    bool is_pinned = false)
+        : ptr_(static_cast<uint8_t*>(memory::Alloc(place, size, is_pinned)),
+               memory::PODDeleter<uint8_t, Place>(place, is_pinned)),
          place_(place),
          size_(size),
-          type_(type) {
+          type_(type),
+          is_pinned_(is_pinned) {
      PADDLE_ENFORCE_NOT_NULL(ptr_, "Insufficient %s memory to allocation.",
                              (is_cpu_place(place_) ? "CPU" : "GPU"));
    }
@@ -174,6 +182,9 @@ class Tensor {
    /* the current type of memory */
    std::type_index type_;

+    /*! use pinned memory or not. */
+    bool is_pinned_;
  };

  /*! holds the memory block if allocated. */
@@ -208,6 +219,7 @@ class Tensor {
   * PlaceHolder::ptr_ and where the tensor data really begins.
   */
  size_t offset_;
+  bool is_pinned_;
};

inline void Tensor::switch_place(platform::Place new_place) {
......
@@ -101,52 +101,55 @@ inline T* Tensor::data() {
}

template <typename T>
-inline T* Tensor::mutable_data(DDim dims, platform::Place place) {
+inline T* Tensor::mutable_data(DDim dims, platform::Place place,
+                               bool is_pinned) {
  static_assert(std::is_pod<T>::value, "T must be POD");
  Resize(dims);
-  return mutable_data<T>(place);
+  return mutable_data<T>(place, is_pinned);
}

template <typename T>
-inline T* Tensor::mutable_data(platform::Place place) {
+inline T* Tensor::mutable_data(platform::Place place, bool is_pinned) {
  static_assert(std::is_pod<T>::value, "T must be POD");
-  return reinterpret_cast<T*>(mutable_data(place, typeid(T)));
+  return reinterpret_cast<T*>(mutable_data(place, typeid(T), is_pinned));
}

-inline void* Tensor::mutable_data(platform::Place place, std::type_index type) {
+inline void* Tensor::mutable_data(platform::Place place, std::type_index type,
+                                  bool is_pinned) {
  if (holder_ != nullptr) {
    holder_->set_type(type);
  }
-  PADDLE_ENFORCE_GT(
-      numel(), 0,
-      "When calling this method, the Tensor's numel must be larger than zero. "
-      "Please check Tensor::Resize has been called first.");
+  PADDLE_ENFORCE_GE(numel(), 0,
+                    "When calling this method, the Tensor's numel must be "
+                    "equal or larger than zero. "
+                    "Please check Tensor::Resize has been called first.");
  int64_t size = numel() * SizeOfType(type);
  /* some versions of boost::variant don't have operator!= */
  if (holder_ == nullptr || !(holder_->place() == place) ||
      holder_->size() < size + offset_) {
    if (platform::is_cpu_place(place)) {
      holder_.reset(new PlaceholderImpl<platform::CPUPlace>(
-          boost::get<platform::CPUPlace>(place), size, type));
+          boost::get<platform::CPUPlace>(place), size, type, is_pinned));
    } else if (platform::is_gpu_place(place)) {
#ifndef PADDLE_WITH_CUDA
      PADDLE_THROW("'CUDAPlace' is not supported in CPU only device.");
    }
#else
      holder_.reset(new PlaceholderImpl<platform::CUDAPlace>(
-          boost::get<platform::CUDAPlace>(place), size, type));
+          boost::get<platform::CUDAPlace>(place), size, type, is_pinned));
    }
#endif
    offset_ = 0;
+    is_pinned_ = is_pinned;
  }
  return reinterpret_cast<void*>(reinterpret_cast<uintptr_t>(holder_->ptr()) +
                                 offset_);
}

-inline void* Tensor::mutable_data(platform::Place place) {
+inline void* Tensor::mutable_data(platform::Place place, bool is_pinned) {
  PADDLE_ENFORCE(this->holder_ != nullptr,
                 "Cannot invoke mutable data if current hold nothing");
-  return mutable_data(place, holder_->type());
+  return mutable_data(place, holder_->type(), is_pinned);
}

inline Tensor& Tensor::ShareDataWith(const Tensor& src) {
@@ -188,6 +191,8 @@ inline const DDim& Tensor::dims() const { return dims_; }

inline int64_t Tensor::numel() const { return product(dims_); }

+inline bool Tensor::isPinned() const { return is_pinned_; }

inline Tensor ReshapeToMatrix(const Tensor& src, int num_col_dims) {
  Tensor res;
  res.ShareDataWith(src);
......
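A minimal usage sketch of the new `is_pinned` parameter (illustrative; the shape and place are placeholders):

```
// C++ sketch: back a tensor with pinned (page-locked) memory so later
// host<->device copies can use fast DMA transfers.
paddle::framework::Tensor t;
float *p = t.mutable_data<float>(paddle::framework::make_ddim({2, 3}),
                                 paddle::platform::CUDAPlace(0),
                                 true /*is_pinned*/);
// t.isPinned() now reports true for this allocation.
```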
@@ -119,6 +119,50 @@ void GPUAllocator::Free(void* p, size_t size, size_t index) {
bool GPUAllocator::UseGpu() const { return true; }

+// PINNED memory allows direct DMA transfers by the GPU to and from system
+// memory. It's locked to a physical address.
+void* CUDAPinnedAllocator::Alloc(size_t& index, size_t size) {
+  if (size <= 0) return nullptr;
+  void* p;
+  // NOTE: here, we use GpuMaxAllocSize() as the maximum memory size
+  // of host pinned allocation. Allocating too much would reduce
+  // the amount of memory available to the underlying system for paging.
+  size_t usable = paddle::platform::GpuMaxAllocSize() - fallback_alloc_size_;
+  if (size > usable) return nullptr;
+
+  // PINNED memory is visible to all CUDA contexts.
+  cudaError_t result = cudaMallocHost(&p, size);
+  if (result == cudaSuccess) {
+    index = 1;
+    fallback_alloc_size_ += size;
+    return p;
+  }
+  return nullptr;
+}
+
+void CUDAPinnedAllocator::Free(void* p, size_t size, size_t index) {
+  cudaError_t err;
+  PADDLE_ASSERT(index == 1);
+  PADDLE_ASSERT(fallback_alloc_size_ >= size);
+  fallback_alloc_size_ -= size;
+  err = cudaFreeHost(p);
+
+  // Purposefully allow cudaErrorCudartUnloading, because
+  // that is returned if you ever call cudaFreeHost after the
+  // driver has already shutdown. This happens only if the
+  // process is terminating, in which case we don't care if
+  // cudaFreeHost succeeds.
+  if (err != cudaErrorCudartUnloading) {
+    PADDLE_ENFORCE(err, "cudaFreeHost failed in GPUPinnedAllocator::Free.");
+  }
+}
+
+bool CUDAPinnedAllocator::UseGpu() const { return true; }
#endif
}  // namespace detail
......
@@ -51,6 +51,18 @@ class GPUAllocator : public SystemAllocator {
  size_t gpu_alloc_size_ = 0;
  size_t fallback_alloc_size_ = 0;
};

+class CUDAPinnedAllocator : public SystemAllocator {
+ public:
+  virtual void* Alloc(size_t& index, size_t size);
+  virtual void Free(void* p, size_t size, size_t index);
+  virtual bool UseGpu() const;
+
+ private:
+  size_t gpu_alloc_size_ =
+      0;  // TODO(zcd): how to define the upper limit of CUDAPinnedMemory?
+  size_t fallback_alloc_size_ = 0;
+};
#endif
}  // namespace detail
......
@@ -38,7 +38,8 @@ BuddyAllocator* GetCPUBuddyAllocator() {
}

template <>
-void* Alloc<platform::CPUPlace>(platform::CPUPlace place, size_t size) {
+void* Alloc<platform::CPUPlace>(platform::CPUPlace place, size_t size,
+                                bool is_pinned) {
  VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place);
  void* p = GetCPUBuddyAllocator()->Alloc(size);
  VLOG(10) << " pointer=" << p;
@@ -46,7 +47,8 @@ void* Alloc<platform::CPUPlace>(platform::CPUPlace place, size_t size) {
}

template <>
-void Free<platform::CPUPlace>(platform::CPUPlace place, void* p) {
+void Free<platform::CPUPlace>(platform::CPUPlace place, void* p,
+                              bool is_pinned) {
  VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place);
  GetCPUBuddyAllocator()->Free(p);
}
@@ -82,15 +84,47 @@ BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) {
  return as[gpu_id];
}

+BuddyAllocator* GetCUDAPinnedBuddyAllocator(int gpu_id) {
+  static BuddyAllocator** as = NULL;
+  if (as == NULL) {
+    int gpu_num = platform::GetCUDADeviceCount();
+    as = new BuddyAllocator*[gpu_num];
+    for (int gpu = 0; gpu < gpu_num; gpu++) {
+      as[gpu] = nullptr;
+    }
+  }
+  platform::SetDeviceId(gpu_id);
+  if (!as[gpu_id]) {
+    as[gpu_id] = new BuddyAllocator(new detail::CUDAPinnedAllocator,
+                                    platform::GpuMinChunkSize(),
+                                    platform::GpuMaxChunkSize());
+    VLOG(10) << "\n\nNOTE: each GPU device use "
+             << FLAGS_fraction_of_gpu_memory_to_use * 100
+             << "% of GPU memory.\n"
+             << "You can set GFlags environment variable '"
+             << "FLAGS_fraction_of_gpu_memory_to_use"
+             << "' to change the fraction of GPU usage.\n\n";
+  }
+  return as[gpu_id];
+}

template <>
size_t Used<platform::CUDAPlace>(platform::CUDAPlace place) {
  return GetGPUBuddyAllocator(place.device)->Used();
}

template <>
-void* Alloc<platform::CUDAPlace>(platform::CUDAPlace place, size_t size) {
-  auto* buddy_allocator = GetGPUBuddyAllocator(place.device);
-  auto* ptr = buddy_allocator->Alloc(size);
+void* Alloc<platform::CUDAPlace>(platform::CUDAPlace place, size_t size,
+                                 bool is_pinned) {
+  void* ptr;
+  if (is_pinned) {
+    auto* buddy_allocator = GetCUDAPinnedBuddyAllocator(place.device);
+    ptr = buddy_allocator->Alloc(size);
+  } else {
+    auto* buddy_allocator = GetGPUBuddyAllocator(place.device);
+    ptr = buddy_allocator->Alloc(size);
+  }
+
  if (ptr == nullptr) {
    int cur_dev = platform::GetCurrentDeviceId();
    platform::SetDeviceId(place.device);
@@ -108,8 +142,13 @@ void* Alloc<platform::CUDAPlace>(platform::CUDAPlace place, size_t size) {
}

template <>
-void Free<platform::CUDAPlace>(platform::CUDAPlace place, void* p) {
-  GetGPUBuddyAllocator(place.device)->Free(p);
+void Free<platform::CUDAPlace>(platform::CUDAPlace place, void* p,
+                               bool is_pinned) {
+  if (is_pinned) {
+    GetCUDAPinnedBuddyAllocator(place.device)->Free(p);
+  } else {
+    GetGPUBuddyAllocator(place.device)->Free(p);
+  }
}
#endif
......
@@ -33,7 +33,7 @@ namespace memory {
 * address is valid or not.
 */
template <typename Place>
-void* Alloc(Place place, size_t size);
+void* Alloc(Place place, size_t size, bool is_pinned = false);

/**
 * \brief Free memory block in one place.
@@ -43,7 +43,7 @@ void* Alloc(Place place, size_t size);
 *
 */
template <typename Place>
-void Free(Place place, void* ptr);
+void Free(Place place, void* ptr, bool is_pinned = false);

/**
 * \brief Total size of used memory in one place.
@@ -74,11 +74,13 @@ class PODDeleter {
  static_assert(std::is_pod<T>::value, "T must be POD");

 public:
-  explicit PODDeleter(Place place) : place_(place) {}
-  void operator()(T* ptr) { Free(place_, static_cast<void*>(ptr)); }
+  explicit PODDeleter(Place place, bool is_pinned = false)
+      : place_(place), is_pinned_(is_pinned) {}
+  void operator()(T* ptr) { Free(place_, static_cast<void*>(ptr), is_pinned_); }

 private:
  Place place_;
+  bool is_pinned_;
};

/**
......
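A sketch of how the extended Alloc/Free/PODDeleter fit together (illustrative; the buffer size and place are placeholders):

```
// C++ sketch: pair a pinned allocation with a deleter carrying the same
// is_pinned flag, so Free() releases it through the pinned allocator.
#include <memory>

void PinnedBufferExample() {
  paddle::platform::CUDAPlace place(0);
  using Deleter = paddle::memory::PODDeleter<float, paddle::platform::CUDAPlace>;
  std::unique_ptr<float, Deleter> buf(
      static_cast<float *>(
          paddle::memory::Alloc(place, 1024 * sizeof(float), true /*is_pinned*/)),
      Deleter(place, true /*is_pinned*/));
}  // buf's deleter runs here: Free(place, ptr, /*is_pinned=*/true)
```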
@@ -59,7 +59,7 @@ TEST(BuddyAllocator, CPUMultAlloc) {
  EXPECT_EQ(total_size, 0UL);

  for (auto size :
-       {128, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304}) {
+       {0, 128, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304}) {
    ps[paddle::memory::Alloc(cpu, size)] = size;

    // Buddy Allocator doesn't manage too large memory chunk
@@ -117,7 +117,7 @@ TEST(BuddyAllocator, GPUMultAlloc) {
  EXPECT_EQ(total_size, 0UL);

  for (auto size :
-       {128, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304}) {
+       {0, 128, 256, 1024, 4096, 16384, 65536, 262144, 1048576, 4194304}) {
    ps[paddle::memory::Alloc(gpu, size)] = size;

    // Buddy Allocator doesn't manage too large memory chunk
......
@@ -12,6 +12,8 @@ function(op_library TARGET)
  set(OP_LIBRARY ${TARGET} ${OP_LIBRARY} PARENT_SCOPE)
  set(cc_srcs)
  set(cu_srcs)
+  set(hip_cu_srcs)
+  set(miopen_hip_cc_srcs)
  set(cu_cc_srcs)
  set(cudnn_cu_cc_srcs)
  set(CUDNN_FILE)
@@ -36,10 +38,19 @@ function(op_library TARGET)
    if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu)
      list(APPEND cu_srcs ${TARGET}.cu)
    endif()
+    if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.hip.cu)
+      list(APPEND hip_cu_srcs ${TARGET}.hip.cu)
+    endif()
    string(REPLACE "_op" "_cudnn_op" CUDNN_FILE "${TARGET}")
    if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${CUDNN_FILE}.cu.cc)
      list(APPEND cudnn_cu_cc_srcs ${CUDNN_FILE}.cu.cc)
    endif()
+    if(WITH_AMD_GPU)
+      string(REPLACE "_op" "_miopen_op" MIOPEN_FILE "${TARGET}")
+      if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${MIOPEN_FILE}.hip.cc)
+        list(APPEND miopen_hip_cc_srcs ${MIOPEN_FILE}.hip.cc)
+      endif()
+    endif()
    if(WITH_MKLDNN)
      string(REPLACE "_op" "_mkldnn_op" MKLDNN_FILE "${TARGET}")
      if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${MKLDNN_FILE}.cc)
@@ -48,10 +59,14 @@ function(op_library TARGET)
    endif()
  else()
    foreach(src ${op_library_SRCS})
-      if (${src} MATCHES ".*\\.cu$")
+      if (${src} MATCHES ".*\\.hip.cu$")
+        list(APPEND hip_cu_srcs ${src})
+      elseif (${src} MATCHES ".*\\.cu$")
        list(APPEND cu_srcs ${src})
      elseif(${src} MATCHES ".*_cudnn_op.cu.cc$")
        list(APPEND cudnn_cu_cc_srcs ${src})
+      elseif(WITH_AMD_GPU AND ${src} MATCHES ".*_miopen_op.hip.cc$")
+        list(APPEND miopen_hip_cc_srcs ${src})
      elseif(WITH_MKLDNN AND ${src} MATCHES ".*_mkldnn_op.cc$")
        list(APPEND mkldnn_cc_srcs ${src})
      elseif(${src} MATCHES ".*\\.cu.cc$")
@@ -76,6 +91,9 @@ function(op_library TARGET)
  if (WITH_GPU)
    nv_library(${TARGET} SRCS ${cc_srcs} ${cu_cc_srcs} ${cudnn_cu_cc_srcs} ${mkldnn_cc_srcs} ${cu_srcs} DEPS ${op_library_DEPS}
      ${op_common_deps})
+  elseif (WITH_AMD_GPU)
+    hip_library(${TARGET} SRCS ${cc_srcs} ${hip_cu_srcs} ${miopen_hip_cc_srcs} ${mkldnn_cc_srcs} DEPS ${op_library_DEPS}
+      ${op_common_deps})
  else()
    cc_library(${TARGET} SRCS ${cc_srcs} ${mkldnn_cc_srcs} DEPS ${op_library_DEPS}
      ${op_common_deps})
...@@ -88,7 +106,7 @@ function(op_library TARGET) ...@@ -88,7 +106,7 @@ function(op_library TARGET)
endif() endif()
endforeach() endforeach()
# The registration of USE_OP, please refer to paddle/framework/op_registry.h. # The registration of USE_OP, please refer to paddle/fluid/framework/op_registry.h.
# Note that it's enough to just add one operator to pybind in a *_op.cc file. # Note that it's enough to just add one operator to pybind in a *_op.cc file.
# And for detailed pybind information, please see the generated paddle/pybind/pybind.h. # And for detailed pybind information, please see the generated paddle/pybind/pybind.h.
file(READ ${TARGET}.cc TARGET_CONTENT) file(READ ${TARGET}.cc TARGET_CONTENT)
...@@ -114,7 +132,10 @@ function(op_library TARGET) ...@@ -114,7 +132,10 @@ function(op_library TARGET)
list(LENGTH cu_srcs cu_srcs_len) list(LENGTH cu_srcs cu_srcs_len)
list(LENGTH cu_cc_srcs cu_cc_srcs_len) list(LENGTH cu_cc_srcs cu_cc_srcs_len)
list(LENGTH mkldnn_cc_srcs mkldnn_cc_srcs_len) list(LENGTH mkldnn_cc_srcs mkldnn_cc_srcs_len)
if (${pybind_flag} EQUAL 0 AND ${mkldnn_cc_srcs_len} EQUAL 0 AND ${cu_srcs_len} EQUAL 0 AND ${cu_cc_srcs_len} EQUAL 0) list(LENGTH hip_cu_srcs hip_cu_srcs_len)
list(LENGTH miopen_hip_cc_srcs miopen_hip_cc_srcs_len)
if (${pybind_flag} EQUAL 0 AND ${mkldnn_cc_srcs_len} EQUAL 0 AND ${cu_srcs_len} EQUAL 0 AND ${cu_cc_srcs_len} EQUAL 0 AND
${hip_cu_srcs_len} EQUAL 0 AND ${miopen_hip_cc_srcs_len} EQUAL 0)
file(APPEND ${pybind_file} "USE_CPU_ONLY_OP(${TARGET});\n") file(APPEND ${pybind_file} "USE_CPU_ONLY_OP(${TARGET});\n")
set(pybind_flag 1) set(pybind_flag 1)
endif() endif()
...@@ -125,9 +146,19 @@ function(op_library TARGET) ...@@ -125,9 +146,19 @@ function(op_library TARGET)
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, CUDNN);\n") file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, CUDNN);\n")
endif() endif()
# pybind USE_OP_DEVICE_KERNEL for MIOPEN
if (WITH_AMD_GPU AND ${miopen_hip_cc_srcs_len} GREATER 0)
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, MIOPEN);\n")
endif()
# pybind USE_OP_DEVICE_KERNEL for MKLDNN # pybind USE_OP_DEVICE_KERNEL for MKLDNN
if (WITH_MKLDNN AND ${mkldnn_cc_srcs_len} GREATER 0) if (WITH_MKLDNN AND ${mkldnn_cc_srcs_len} GREATER 0)
# Append first implemented MKLDNN activation operator
if (${MKLDNN_FILE} STREQUAL "activation_mkldnn_op")
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(relu, MKLDNN);\n")
else()
file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, MKLDNN);\n") file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, MKLDNN);\n")
endif()
endif() endif()
# pybind USE_OP # pybind USE_OP
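Taken together, the file(APPEND ...) branches above emit one registration line per operator into the generated pybind header. Roughly, the output looks like the following (the operator names here are illustrative, not taken from this commit):

// Illustrative contents of the generated pybind file after these branches run.
USE_CPU_ONLY_OP(my_cpu_op);              // no CUDA, HIP, or MKLDNN sources found
USE_OP_DEVICE_KERNEL(my_op, CUDNN);      // a *_cudnn_op.cu.cc source was present
USE_OP_DEVICE_KERNEL(my_op, MIOPEN);     // a *_miopen_op.hip.cc source was present
USE_OP_DEVICE_KERNEL(relu, MKLDNN);      // the special-cased first MKLDNN activation op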
...@@ -156,9 +187,13 @@ if(WITH_DISTRIBUTE) ...@@ -156,9 +187,13 @@ if(WITH_DISTRIBUTE)
set_source_files_properties(recv_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(recv_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
op_library(listen_and_serv_op DEPS ${DISTRIBUTE_DEPS}) op_library(listen_and_serv_op DEPS ${DISTRIBUTE_DEPS})
set_source_files_properties(listen_and_serv_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set_source_files_properties(listen_and_serv_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
op_library(send_vars_op DEPS ${DISTRIBUTE_DEPS})
set_source_files_properties(send_vars_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
op_library(send_barrier_op DEPS ${DISTRIBUTE_DEPS})
set_source_files_properties(send_barrier_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS send_op listen_and_serv_op sum_op executor) cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS send_op listen_and_serv_op sum_op executor)
else() else()
set(DEPS_OPS ${DEPS_OPS} send_op recv_op listen_and_serv_op) set(DEPS_OPS ${DEPS_OPS} send_op recv_op listen_and_serv_op send_vars_op send_barrier_op)
endif() endif()
op_library(cond_op DEPS framework_proto tensor net_op) op_library(cond_op DEPS framework_proto tensor net_op)
...@@ -229,3 +264,4 @@ cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor paddle_memor ...@@ -229,3 +264,4 @@ cc_test(strided_memcpy_test SRCS strided_memcpy_test.cc DEPS tensor paddle_memor
cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op) cc_test(save_load_op_test SRCS save_load_op_test.cc DEPS save_op load_op)
cc_test(save_load_combine_op_test SRCS save_load_combine_op_test.cc DEPS save_combine_op load_combine_op) cc_test(save_load_combine_op_test SRCS save_load_combine_op_test.cc DEPS save_combine_op load_combine_op)
nv_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context) nv_test(nccl_op_test SRCS nccl_op_test.cu.cc DEPS nccl_op gpu_info device_context)
nv_test(dropout_op_test SRCS dropout_op_test.cc DEPS dropout_op tensor)
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "mkldnn.hpp"
#include "mkldnn_activation_op.h"
#include "paddle/fluid/operators/activation_op.h"
namespace paddle {
namespace operators {
using paddle::framework::Tensor;
using paddle::platform::MKLDNNDeviceContext;
namespace {
template <typename T, typename ExecContext>
void eltwise_forward(const ExecContext &ctx, mkldnn::algorithm algorithm,
const T alpha = 0, const T beta = 0) {
PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
"It must use CPUPlace.");
auto &dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
const auto &mkldnn_engine = dev_ctx.GetEngine();
// get buffers
const auto *src = ctx.template Input<Tensor>("X");
const auto *src_data = src->template data<T>();
auto *dst = ctx.template Output<Tensor>("Out");
const T *dst_data = dst->template mutable_data<T>(ctx.GetPlace());
// get memory dim
PADDLE_ENFORCE(src->dims().size() == 4,
"Input dim must be with 4, i.e. NCHW");
std::vector<int> src_tz = framework::vectorize2int(src->dims());
// create memory description
// TODO(kbinias-intel): support more formats
auto data_md = platform::MKLDNNMemDesc(src_tz, mkldnn::memory::f32,
mkldnn::memory::format::nchw);
// create memory primitives
auto src_memory = mkldnn::memory({data_md, mkldnn_engine}, (void *)src_data);
auto dst_memory = mkldnn::memory({data_md, mkldnn_engine}, (void *)dst_data);
auto forward_desc = mkldnn::eltwise_forward::desc(
mkldnn::prop_kind::forward_training, algorithm, data_md, alpha, beta);
// save prim desc into global device context to be referred in backward path
const std::string key = ctx.op().Output("Out");
const std::string key_eltwise_pd = key + "@eltwise_pd";
auto forward_pd = std::make_shared<mkldnn::eltwise_forward::primitive_desc>(
forward_desc, mkldnn_engine);
dev_ctx.SetBlob(key_eltwise_pd, forward_pd);
auto eltwise = mkldnn::eltwise_forward(*forward_pd, src_memory, dst_memory);
// push primitive to stream and wait until it's executed
std::vector<mkldnn::primitive> pipeline = {eltwise};
mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
}
template <typename T, typename ExecContext>
void eltwise_grad(const ExecContext &ctx, mkldnn::algorithm algorithm,
const T alpha = 0, const T beta = 0) {
auto &dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
const auto &mkldnn_engine = dev_ctx.GetEngine();
// get buffers
const auto *x = ctx.template Input<Tensor>("X");
const auto *src = x->template data<T>();
auto *dout = ctx.template Input<Tensor>(framework::GradVarName("Out"));
const auto *diff_dst = dout->template data<T>();
auto *dx =
ctx.template Output<framework::Tensor>(framework::GradVarName("X"));
const T *diff_src = dx->template mutable_data<T>(ctx.GetPlace());
// get memory dim
std::vector<int> src_tz = framework::vectorize2int(x->dims());
// create memory description
auto data_md = platform::MKLDNNMemDesc(src_tz, mkldnn::memory::f32,
mkldnn::memory::format::nchw);
// create memory primitives
auto src_memory = mkldnn::memory({data_md, mkldnn_engine}, (void *)src);
auto diff_src_memory =
mkldnn::memory({data_md, mkldnn_engine}, (void *)diff_src);
auto diff_dst_memory =
mkldnn::memory({data_md, mkldnn_engine}, (void *)diff_dst);
auto backward_desc =
mkldnn::eltwise_backward::desc(algorithm, data_md, data_md, alpha, beta);
// retrieve eltwise primitive desc from device context
const std::string key = ctx.op().Input("Out");
const std::string key_eltwise_pd = key + "@eltwise_pd";
const std::shared_ptr<void> forward_pd = dev_ctx.GetBlob(key_eltwise_pd);
PADDLE_ENFORCE(forward_pd != nullptr,
"Fail to find eltwise_pd in device context");
auto *p_forward_pd =
static_cast<mkldnn::eltwise_forward::primitive_desc *>(forward_pd.get());
auto eltwise_bwd_prim_desc = mkldnn::eltwise_backward::primitive_desc(
backward_desc, mkldnn_engine, *p_forward_pd);
auto eltwise_bwd = mkldnn::eltwise_backward(eltwise_bwd_prim_desc, src_memory,
diff_dst_memory, diff_src_memory);
// push primitive to stream and wait until it's executed
std::vector<mkldnn::primitive> pipeline = {eltwise_bwd};
mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
}
} // anonymous namespace
template <typename T, mkldnn::algorithm algorithm>
struct MKLDNNActivationFunc : public BaseActivationFunctor<T> {
template <typename ExecContext>
void operator()(const ExecContext &ctx) const {
eltwise_forward<T>(ctx, algorithm);
}
};
template <typename T, mkldnn::algorithm algorithm>
struct MKLDNNActivationGradFunc : public BaseActivationFunctor<T> {
template <typename ExecContext>
void operator()(const ExecContext &ctx) const {
eltwise_grad<T>(ctx, algorithm);
}
};
template <typename T>
using ReluMkldnnFunctor =
MKLDNNActivationFunc<T, mkldnn::algorithm::eltwise_relu>;
template <typename T>
using TanhMkldnnFunctor =
MKLDNNActivationFunc<T, mkldnn::algorithm::eltwise_tanh>;
template <typename T>
using SqrtMkldnnFunctor =
MKLDNNActivationFunc<T, mkldnn::algorithm::eltwise_sqrt>;
template <typename T>
using AbsMkldnnFunctor =
MKLDNNActivationFunc<T, mkldnn::algorithm::eltwise_abs>;
template <typename T>
using ReluMkldnnGradFunctor =
MKLDNNActivationGradFunc<T, mkldnn::algorithm::eltwise_relu>;
template <typename T>
using TanhMkldnnGradFunctor =
MKLDNNActivationGradFunc<T, mkldnn::algorithm::eltwise_tanh>;
template <typename T>
using SqrtMkldnnGradFunctor =
MKLDNNActivationGradFunc<T, mkldnn::algorithm::eltwise_sqrt>;
template <typename T>
using AbsMkldnnGradFunctor =
MKLDNNActivationGradFunc<T, mkldnn::algorithm::eltwise_abs>;
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
#define REGISTER_ACTIVATION_MKLDNN_KERNEL(act_type, functor, grad_functor) \
REGISTER_OP_KERNEL(act_type, MKLDNN, ::paddle::platform::CPUPlace, \
ops::MKLDNNActivationKernel<ops::functor<float>>); \
REGISTER_OP_KERNEL( \
act_type##_grad, MKLDNN, ::paddle::platform::CPUPlace, \
ops::MKLDNNActivationGradKernel<ops::grad_functor<float>>);
#define FOR_EACH_MKLDNN_KERNEL_FUNCTOR(__macro) \
__macro(relu, ReluMkldnnFunctor, ReluMkldnnGradFunctor); \
__macro(tanh, TanhMkldnnFunctor, TanhMkldnnGradFunctor); \
__macro(sqrt, SqrtMkldnnFunctor, SqrtMkldnnGradFunctor); \
__macro(abs, AbsMkldnnFunctor, AbsMkldnnGradFunctor);
FOR_EACH_MKLDNN_KERNEL_FUNCTOR(REGISTER_ACTIVATION_MKLDNN_KERNEL);
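For reference, the FOR_EACH_MKLDNN_KERNEL_FUNCTOR line expands the registration macro once per activation; for relu it is exactly equivalent to writing:

// Expansion of REGISTER_ACTIVATION_MKLDNN_KERNEL(relu, ReluMkldnnFunctor, ReluMkldnnGradFunctor).
REGISTER_OP_KERNEL(relu, MKLDNN, ::paddle::platform::CPUPlace,
                   ops::MKLDNNActivationKernel<ops::ReluMkldnnFunctor<float>>);
REGISTER_OP_KERNEL(relu_grad, MKLDNN, ::paddle::platform::CPUPlace,
                   ops::MKLDNNActivationGradKernel<ops::ReluMkldnnGradFunctor<float>>);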
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License"); Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/activation_op.h" #include "paddle/fluid/operators/activation_op.h"
#include "paddle/fluid/operators/mkldnn_activation_op.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -87,6 +88,9 @@ class ReluOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -87,6 +88,9 @@ class ReluOpMaker : public framework::OpProtoAndCheckerMaker {
: framework::OpProtoAndCheckerMaker(proto, op_checker) { : framework::OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", "Input of Relu operator"); AddInput("X", "Input of Relu operator");
AddOutput("Out", "Output of Relu operator"); AddOutput("Out", "Output of Relu operator");
AddAttr<bool>("use_mkldnn",
"(bool, default false) Only used in mkldnn kernel")
.SetDefault(false);
AddComment(R"DOC( AddComment(R"DOC(
Relu Activation Operator. Relu Activation Operator.
...@@ -140,6 +144,9 @@ class TanhOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -140,6 +144,9 @@ class TanhOpMaker : public framework::OpProtoAndCheckerMaker {
: framework::OpProtoAndCheckerMaker(proto, op_checker) { : framework::OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", "Input of Tanh operator"); AddInput("X", "Input of Tanh operator");
AddOutput("Out", "Output of Tanh operator"); AddOutput("Out", "Output of Tanh operator");
AddAttr<bool>("use_mkldnn",
"(bool, default false) Only used in mkldnn kernel")
.SetDefault(false);
AddComment(R"DOC( AddComment(R"DOC(
Tanh Activation Operator. Tanh Activation Operator.
...@@ -193,6 +200,9 @@ class SqrtOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -193,6 +200,9 @@ class SqrtOpMaker : public framework::OpProtoAndCheckerMaker {
: framework::OpProtoAndCheckerMaker(proto, op_checker) { : framework::OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", "Input of Sqrt operator"); AddInput("X", "Input of Sqrt operator");
AddOutput("Out", "Output of Sqrt operator"); AddOutput("Out", "Output of Sqrt operator");
AddAttr<bool>("use_mkldnn",
"(bool, default false) Only used in mkldnn kernel")
.SetDefault(false);
AddComment(R"DOC( AddComment(R"DOC(
Sqrt Activation Operator. Sqrt Activation Operator.
...@@ -208,6 +218,9 @@ class AbsOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -208,6 +218,9 @@ class AbsOpMaker : public framework::OpProtoAndCheckerMaker {
: framework::OpProtoAndCheckerMaker(proto, op_checker) { : framework::OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", "Input of Abs operator"); AddInput("X", "Input of Abs operator");
AddOutput("Out", "Output of Abs operator"); AddOutput("Out", "Output of Abs operator");
AddAttr<bool>("use_mkldnn",
"(bool, default false) Only used in mkldnn kernel")
.SetDefault(false);
AddComment(R"DOC( AddComment(R"DOC(
Abs Activation Operator. Abs Activation Operator.
...@@ -524,11 +537,11 @@ REGISTER_OP(logsigmoid, ops::ActivationOp, ops::LogSigmoidOpMaker, ...@@ -524,11 +537,11 @@ REGISTER_OP(logsigmoid, ops::ActivationOp, ops::LogSigmoidOpMaker,
REGISTER_OP(exp, ops::ActivationOp, ops::ExpOpMaker, exp_grad, REGISTER_OP(exp, ops::ActivationOp, ops::ExpOpMaker, exp_grad,
ops::ActivationOpGrad); ops::ActivationOpGrad);
REGISTER_OP(relu, ops::ActivationOp, ops::ReluOpMaker, relu_grad, REGISTER_OP(relu, ops::ActivationWithMKLDNNOp, ops::ReluOpMaker, relu_grad,
ops::ActivationOpGrad); ops::ActivationWithMKLDNNOpGrad);
REGISTER_OP(tanh, ops::ActivationOp, ops::TanhOpMaker, tanh_grad, REGISTER_OP(tanh, ops::ActivationWithMKLDNNOp, ops::TanhOpMaker, tanh_grad,
ops::ActivationOpGrad); ops::ActivationWithMKLDNNOpGrad);
REGISTER_OP(tanh_shrink, ops::ActivationOp, ops::TanhShrinkOpMaker, REGISTER_OP(tanh_shrink, ops::ActivationOp, ops::TanhShrinkOpMaker,
tanh_shrink_grad, ops::ActivationOpGrad); tanh_shrink_grad, ops::ActivationOpGrad);
...@@ -536,11 +549,11 @@ REGISTER_OP(tanh_shrink, ops::ActivationOp, ops::TanhShrinkOpMaker, ...@@ -536,11 +549,11 @@ REGISTER_OP(tanh_shrink, ops::ActivationOp, ops::TanhShrinkOpMaker,
REGISTER_OP(softshrink, ops::ActivationOp, ops::SoftShrinkOpMaker, REGISTER_OP(softshrink, ops::ActivationOp, ops::SoftShrinkOpMaker,
softshrink_grad, ops::ActivationOpGrad); softshrink_grad, ops::ActivationOpGrad);
REGISTER_OP(sqrt, ops::ActivationOp, ops::SqrtOpMaker, sqrt_grad, REGISTER_OP(sqrt, ops::ActivationWithMKLDNNOp, ops::SqrtOpMaker, sqrt_grad,
ops::ActivationOpGrad); ops::ActivationWithMKLDNNOpGrad);
REGISTER_OP(abs, ops::ActivationOp, ops::AbsOpMaker, abs_grad, REGISTER_OP(abs, ops::ActivationWithMKLDNNOp, ops::AbsOpMaker, abs_grad,
ops::ActivationOpGrad); ops::ActivationWithMKLDNNOpGrad);
REGISTER_OP(ceil, ops::ActivationOp, ops::CeilOpMaker, ceil_grad, REGISTER_OP(ceil, ops::ActivationOp, ops::CeilOpMaker, ceil_grad,
ops::ActivationOpGrad); ops::ActivationOpGrad);
......
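The four re-registered ops now go through ActivationWithMKLDNNOp / ActivationWithMKLDNNOpGrad, whose definition is not shown in this hunk. By analogy with GetExpectedLRNKernel added elsewhere in this commit, a plausible sketch of the dispatch such an op performs follows; treat every detail here as an assumption rather than the committed code:

// Hypothetical sketch of ActivationWithMKLDNNOp's kernel selection.
framework::OpKernelType GetExpectedKernelType(
    const framework::ExecutionContext& ctx) const override {
  framework::LibraryType library{framework::LibraryType::kPlain};
#ifdef PADDLE_WITH_MKLDNN
  if (platform::CanMKLDNNBeUsed(ctx)) {  // consults the use_mkldnn attribute
    library = framework::LibraryType::kMKLDNN;
  }
#endif
  return framework::OpKernelType(
      framework::ToDataType(ctx.Input<Tensor>("X")->type()), ctx.GetPlace(),
      framework::DataLayout::kAnyLayout, library);
}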
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. /* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License"); Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
...@@ -17,6 +17,10 @@ limitations under the License. */ ...@@ -17,6 +17,10 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/detail/safe_ref.h" #include "paddle/fluid/operators/detail/safe_ref.h"
#ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h"
#endif
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -23,21 +23,10 @@ limitations under the License. */ ...@@ -23,21 +23,10 @@ limitations under the License. */
static constexpr char Channel[] = "Channel"; static constexpr char Channel[] = "Channel";
static constexpr char X[] = "X"; static constexpr char X[] = "X";
static constexpr char Status[] = "Status";
static constexpr char copy[] = "copy";
namespace paddle { namespace paddle {
namespace operators { namespace operators {
void SetSendStatus(const platform::Place &dev_place,
framework::Variable &status_var, bool status) {
auto cpu = platform::CPUPlace();
auto status_tensor =
status_var.GetMutable<framework::LoDTensor>()->mutable_data<bool>({1},
cpu);
status_tensor[0] = status;
}
class ChannelSendOp : public framework::OperatorBase { class ChannelSendOp : public framework::OperatorBase {
public: public:
ChannelSendOp(const std::string &type, ChannelSendOp(const std::string &type,
...@@ -51,9 +40,6 @@ class ChannelSendOp : public framework::OperatorBase { ...@@ -51,9 +40,6 @@ class ChannelSendOp : public framework::OperatorBase {
"Input(Channel) of ChannelSendOp should not be null."); "Input(Channel) of ChannelSendOp should not be null.");
PADDLE_ENFORCE(ctx->HasInput(X), PADDLE_ENFORCE(ctx->HasInput(X),
"Input(X) of ChannelSendOp should not be null."); "Input(X) of ChannelSendOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput(Status),
"Output(Status) of ChannelSendOp should not be null.");
ctx->SetOutputDim("Status", {1});
} }
private: private:
...@@ -65,10 +51,7 @@ class ChannelSendOp : public framework::OperatorBase { ...@@ -65,10 +51,7 @@ class ChannelSendOp : public framework::OperatorBase {
auto input_var = scope.FindVar(Input(X)); auto input_var = scope.FindVar(Input(X));
// Send the input data through the channel. // Send the input data through the channel.
bool ok = concurrency::ChannelSend(ch, input_var); concurrency::ChannelSend(ch, input_var);
// Set the status output of the `ChannelSend` call.
SetSendStatus(dev_place, *scope.FindVar(Output(Status)), ok);
} }
}; };
...@@ -82,12 +65,6 @@ class ChannelSendOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -82,12 +65,6 @@ class ChannelSendOpMaker : public framework::OpProtoAndCheckerMaker {
.AsDuplicable(); .AsDuplicable();
AddInput(X, "(Variable) The value which gets sent by the channel.") AddInput(X, "(Variable) The value which gets sent by the channel.")
.AsDuplicable(); .AsDuplicable();
AddOutput(Status,
"(Tensor) An LoD Tensor that returns a boolean status of the"
"result of the send operation.")
.AsDuplicable();
AddAttr<bool>(copy, "(bool, default false) Should copy before send")
.SetDefault(false);
AddComment(R"DOC( AddComment(R"DOC(
)DOC"); )DOC");
} }
......
...@@ -17,20 +17,20 @@ limitations under the License. */ ...@@ -17,20 +17,20 @@ limitations under the License. */
namespace poc = paddle::operators::concurrency; namespace poc = paddle::operators::concurrency;
bool poc::ChannelSend(framework::ChannelHolder *ch, framework::Variable *var) { void poc::ChannelSend(framework::ChannelHolder *ch, framework::Variable *var) {
auto type = framework::ToVarType(var->Type()); auto type = framework::ToVarType(var->Type());
if (type == framework::proto::VarType_Type_LOD_TENSOR) if (type == framework::proto::VarType_Type_LOD_TENSOR)
return ch->Send(var->GetMutable<framework::LoDTensor>()); ch->Send(var->GetMutable<framework::LoDTensor>());
else if (type == framework::proto::VarType_Type_LOD_RANK_TABLE) else if (type == framework::proto::VarType_Type_LOD_RANK_TABLE)
return ch->Send(var->GetMutable<framework::LoDRankTable>()); ch->Send(var->GetMutable<framework::LoDRankTable>());
else if (type == framework::proto::VarType_Type_LOD_TENSOR_ARRAY) else if (type == framework::proto::VarType_Type_LOD_TENSOR_ARRAY)
return ch->Send(var->GetMutable<framework::LoDTensorArray>()); ch->Send(var->GetMutable<framework::LoDTensorArray>());
else if (type == framework::proto::VarType_Type_SELECTED_ROWS) else if (type == framework::proto::VarType_Type_SELECTED_ROWS)
return ch->Send(var->GetMutable<framework::SelectedRows>()); ch->Send(var->GetMutable<framework::SelectedRows>());
else if (type == framework::proto::VarType_Type_READER) else if (type == framework::proto::VarType_Type_READER)
return ch->Send(var->GetMutable<framework::ReaderHolder>()); ch->Send(var->GetMutable<framework::ReaderHolder>());
else if (type == framework::proto::VarType_Type_CHANNEL) else if (type == framework::proto::VarType_Type_CHANNEL)
return ch->Send(var->GetMutable<framework::ChannelHolder>()); ch->Send(var->GetMutable<framework::ChannelHolder>());
else else
PADDLE_THROW("ChannelSend:Unsupported type"); PADDLE_THROW("ChannelSend:Unsupported type");
} }
......
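With the boolean status gone, callers stop threading a result through the scope; a failed send now surfaces through PADDLE_THROW instead. A minimal call-site sketch, with the variable name illustrative:

// New-style call site: no Status tensor to allocate, no bool to propagate.
framework::Variable* input_var = scope.FindVar("x");  // "x" is illustrative
concurrency::ChannelSend(ch, input_var);  // throws on unsupported variable types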
...@@ -21,7 +21,7 @@ namespace paddle { ...@@ -21,7 +21,7 @@ namespace paddle {
namespace operators { namespace operators {
namespace concurrency { namespace concurrency {
bool ChannelSend(framework::ChannelHolder *ch, framework::Variable *var); void ChannelSend(framework::ChannelHolder *ch, framework::Variable *var);
bool ChannelReceive(framework::ChannelHolder *ch, framework::Variable *var); bool ChannelReceive(framework::ChannelHolder *ch, framework::Variable *var);
void ChannelAddToSendQ(framework::ChannelHolder *ch, const void *referrer, void ChannelAddToSendQ(framework::ChannelHolder *ch, const void *referrer,
......
...@@ -146,8 +146,9 @@ class GrpcByteBufferSource ...@@ -146,8 +146,9 @@ class GrpcByteBufferSource
class GrpcByteBufferSourceWrapper : public Source { class GrpcByteBufferSourceWrapper : public Source {
public: public:
GrpcByteBufferSourceWrapper(GrpcByteBufferSource* source) : source_(source) {} explicit GrpcByteBufferSourceWrapper(GrpcByteBufferSource* source)
virtual ::google::protobuf::io::ZeroCopyInputStream* contents() override { : source_(source) {}
::google::protobuf::io::ZeroCopyInputStream* contents() override {
return source_; return source_;
} }
......
...@@ -49,9 +49,8 @@ bool RPCClient::AsyncSendVariable(const std::string& ep, ...@@ -49,9 +49,8 @@ bool RPCClient::AsyncSendVariable(const std::string& ep,
s->Prepare(var_h, time_out); s->Prepare(var_h, time_out);
s->response_call_back_ = NULL; s->response_call_back_ = NULL;
auto call = std::move(s->stub_g_.PrepareUnaryCall( auto call = s->stub_g_.PrepareUnaryCall(
s->context_.get(), "/sendrecv.SendRecvService/SendVariable", req, s->context_.get(), "/sendrecv.SendRecvService/SendVariable", req, &cq_);
&cq_));
call->StartCall(); call->StartCall();
call->Finish(&s->reply_, &s->status_, (void*)s); call->Finish(&s->reply_, &s->status_, (void*)s);
}); });
...@@ -107,8 +106,8 @@ bool RPCClient::AsyncGetVariable(const std::string& ep, ...@@ -107,8 +106,8 @@ bool RPCClient::AsyncGetVariable(const std::string& ep,
::grpc::ByteBuffer buf; ::grpc::ByteBuffer buf;
RequestToByteBuffer<sendrecv::VariableMessage>(req, &buf); RequestToByteBuffer<sendrecv::VariableMessage>(req, &buf);
auto call = std::move(s->stub_g_.PrepareUnaryCall( auto call = s->stub_g_.PrepareUnaryCall(
s->context_.get(), "/sendrecv.SendRecvService/GetVariable", buf, &cq_)); s->context_.get(), "/sendrecv.SendRecvService/GetVariable", buf, &cq_);
call->StartCall(); call->StartCall();
call->Finish(&s->reply_, &s->status_, (void*)s); call->Finish(&s->reply_, &s->status_, (void*)s);
}); });
......
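Dropping std::move around PrepareUnaryCall is more than a style fix: wrapping a returned temporary in std::move is redundant and can actively block copy elision, which is why compilers flag it under -Wpessimizing-move. A self-contained illustration of the same pattern:

#include <memory>
#include <utility>

std::unique_ptr<int> make_value() { return std::make_unique<int>(42); }

void demo() {
  auto a = make_value();             // fine: the returned temporary is already an rvalue
  auto b = std::move(make_value());  // pessimizing: redundant move may suppress elision
  (void)a;
  (void)b;
}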
...@@ -21,15 +21,11 @@ limitations under the License. */ ...@@ -21,15 +21,11 @@ limitations under the License. */
#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/selected_rows.h"
#include "paddle/fluid/framework/var_type.h" #include "paddle/fluid/framework/var_type.h"
#include "paddle/fluid/operators/detail/sendrecvop_utils.h" #include "paddle/fluid/operators/detail/grpc_service.h"
#include "paddle/fluid/operators/detail/simple_block_queue.h"
#include "paddle/fluid/operators/detail/send_recv.grpc.pb.h" #include "paddle/fluid/operators/detail/send_recv.grpc.pb.h"
#include "paddle/fluid/operators/detail/send_recv.pb.h" #include "paddle/fluid/operators/detail/send_recv.pb.h"
#include "paddle/fluid/operators/detail/sendrecvop_utils.h"
#include "paddle/fluid/operators/detail/grpc_service.h" #include "paddle/fluid/operators/detail/simple_block_queue.h"
//#include <grpc/support/log.h>
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -81,7 +81,7 @@ void RunSerdeTestSelectedRows(platform::Place place) { ...@@ -81,7 +81,7 @@ void RunSerdeTestSelectedRows(platform::Place place) {
// operators::detail::DeserializeFromByteBuffer(msg, ctx, &var2); // operators::detail::DeserializeFromByteBuffer(msg, ctx, &var2);
framework::Scope scope; framework::Scope scope;
scope.Var("myvar"); scope.Var("myvar");
operators::detail::TensorResponse resp(&scope, &ctx); operators::detail::VariableResponse resp(&scope, &ctx);
EXPECT_EQ(resp.Parse(msg), 0); EXPECT_EQ(resp.Parse(msg), 0);
framework::Variable* var2 = resp.GetVar(); framework::Variable* var2 = resp.GetVar();
...@@ -166,7 +166,7 @@ void RunTestLodTensor(platform::Place place, int from_type = 0) { ...@@ -166,7 +166,7 @@ void RunTestLodTensor(platform::Place place, int from_type = 0) {
// deserialize zero-copy // deserialize zero-copy
framework::Scope scope; framework::Scope scope;
scope.Var("myvar"); scope.Var("myvar");
operators::detail::TensorResponse resp(&scope, &ctx); operators::detail::VariableResponse resp(&scope, &ctx);
if (from_type == 0) { if (from_type == 0) {
EXPECT_EQ(resp.Parse(msg), 0); EXPECT_EQ(resp.Parse(msg), 0);
} else { } else {
...@@ -194,24 +194,23 @@ void RunTestLodTensor(platform::Place place, int from_type = 0) { ...@@ -194,24 +194,23 @@ void RunTestLodTensor(platform::Place place, int from_type = 0) {
for (int i = 0; i < tensor_numel; ++i) EXPECT_FLOAT_EQ(tensor_data2[i], 31.9); for (int i = 0; i < tensor_numel; ++i) EXPECT_FLOAT_EQ(tensor_data2[i], 31.9);
} }
TEST(LodTensor, GPU) { TEST(LodTensor, Run) {
platform::CUDAPlace place;
RunTestLodTensor(place);
RunTestLodTensor(place, 1);
}
TEST(LodTensor, CPU) {
platform::CPUPlace place; platform::CPUPlace place;
RunTestLodTensor(place); RunTestLodTensor(place);
RunTestLodTensor(place, 1); RunTestLodTensor(place, 1);
#ifdef PADDLE_WITH_CUDA
platform::CUDAPlace gpu(0);
RunTestLodTensor(gpu);
RunTestLodTensor(gpu, 1);
#endif
} }
TEST(SelectedRows, CPU) { TEST(SelectedRows, Run) {
platform::CPUPlace place; platform::CPUPlace place;
RunSerdeTestSelectedRows(place); RunSerdeTestSelectedRows(place);
}
TEST(SelectedRows, GPU) { #ifdef PADDLE_WITH_CUDA
platform::CUDAPlace place; platform::CUDAPlace gpu;
RunSerdeTestSelectedRows(place); RunSerdeTestSelectedRows(gpu);
#endif
} }
...@@ -36,9 +36,9 @@ class VariableResponse { ...@@ -36,9 +36,9 @@ class VariableResponse {
public: public:
VariableResponse(const framework::Scope* scope, VariableResponse(const framework::Scope* scope,
const platform::DeviceContext* dev_ctx) const platform::DeviceContext* dev_ctx)
: scope_(scope), dev_ctx_(dev_ctx){}; : scope_(scope), dev_ctx_(dev_ctx) {}
virtual ~VariableResponse(){}; virtual ~VariableResponse() {}
// return: // return:
// 0:ok. // 0:ok.
......
...@@ -33,6 +33,7 @@ __global__ void RandomGenerator(const size_t n, const int seed, ...@@ -33,6 +33,7 @@ __global__ void RandomGenerator(const size_t n, const int seed,
int idx = blockDim.x * blockIdx.x + threadIdx.x; int idx = blockDim.x * blockIdx.x + threadIdx.x;
for (; idx < n; idx += blockDim.x * gridDim.x) { for (; idx < n; idx += blockDim.x * gridDim.x) {
rng.discard(idx);
if (dist(rng) < dropout_prob) { if (dist(rng) < dropout_prob) {
mask_data[idx] = static_cast<T>(0); mask_data[idx] = static_cast<T>(0);
} else { } else {
...@@ -54,9 +55,6 @@ class GPUDropoutKernel : public framework::OpKernel<T> { ...@@ -54,9 +55,6 @@ class GPUDropoutKernel : public framework::OpKernel<T> {
y->mutable_data<T>(context.GetPlace()); y->mutable_data<T>(context.GetPlace());
float dropout_prob = context.Attr<float>("dropout_prob"); float dropout_prob = context.Attr<float>("dropout_prob");
auto X = EigenMatrix<T>::Reshape(*x, 1);
auto Y = EigenMatrix<T>::Reshape(*y, 1);
auto& place = *context.template device_context<Place>().eigen_device(); auto& place = *context.template device_context<Place>().eigen_device();
if (!context.Attr<bool>("is_test")) { if (!context.Attr<bool>("is_test")) {
auto* mask = context.Output<Tensor>("Mask"); auto* mask = context.Output<Tensor>("Mask");
...@@ -75,6 +73,8 @@ class GPUDropoutKernel : public framework::OpKernel<T> { ...@@ -75,6 +73,8 @@ class GPUDropoutKernel : public framework::OpKernel<T> {
T><<<grid, threads, 0, context.cuda_device_context().stream()>>>( T><<<grid, threads, 0, context.cuda_device_context().stream()>>>(
size, seed, dropout_prob, x_data, mask_data, y_data); size, seed, dropout_prob, x_data, mask_data, y_data);
} else { } else {
auto X = EigenMatrix<T>::Reshape(*x, 1);
auto Y = EigenMatrix<T>::Reshape(*y, 1);
Y.device(place) = X * static_cast<T>(1.0f - dropout_prob); Y.device(place) = X * static_cast<T>(1.0f - dropout_prob);
} }
} }
......
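The added rng.discard(idx) is the substantive fix here: every thread seeds an identical engine, so without advancing the stream, threads starting from the same state would draw correlated values and the dropout mask would repeat in patterns. Skipping ahead by the element index gives each position its own draw. A condensed sketch of the kernel's pattern; the Thrust engine choice is an assumption, since the hunk only shows the rng/dist calls:

// Sketch of the per-element RNG pattern (engine and types assumed).
#include <thrust/random.h>

__global__ void RandomGeneratorSketch(const size_t n, const int seed,
                                      const float dropout_prob, const float* src,
                                      float* mask, float* dst) {
  thrust::minstd_rand rng;
  rng.seed(seed);
  thrust::uniform_real_distribution<float> dist(0, 1);
  int idx = blockDim.x * blockIdx.x + threadIdx.x;
  for (; idx < n; idx += blockDim.x * gridDim.x) {
    rng.discard(idx);  // advance the shared stream so this element gets its own draw
    mask[idx] = dist(rng) < dropout_prob ? 0.0f : 1.0f;
    dst[idx] = src[idx] * mask[idx];
  }
}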
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <unistd.h>
#include <string>
#include <thread>
#include "gtest/gtest.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/operators/dropout_op.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/string/printf.h"
namespace f = paddle::framework;
namespace p = paddle::platform;
namespace m = paddle::operators::math;
USE_OP(dropout);
void Compare(f::Scope& scope, p::DeviceContext& ctx) {
// init
auto var = scope.Var("X");
auto tensor = var->GetMutable<f::LoDTensor>();
tensor->Resize({10, 10});
std::vector<float> init;
for (int64_t i = 0; i < 10 * 10; ++i) {
init.push_back(1.0);
}
TensorFromVector(init, ctx, tensor);
auto place = ctx.GetPlace();
auto out_var = scope.Var("Out");
auto out_tensor = out_var->GetMutable<f::LoDTensor>();
out_tensor->Resize({10, 10});
out_tensor->mutable_data<float>(place); // allocate
auto mask_var = scope.Var("Mask");
auto mask_tensor = mask_var->GetMutable<f::LoDTensor>();
mask_tensor->Resize({10, 10});
mask_tensor->mutable_data<float>(place); // allocate
// run
f::AttributeMap attrs;
float dropout_prob = 0.5;
attrs.insert({"fix_seed", 1});
attrs.insert({"seed", 3});
attrs.insert({"dropout_prob", dropout_prob});
auto dropout_op = f::OpRegistry::CreateOp(
"dropout", {{"X", {"X"}}}, {{"Out", {"Out"}}, {"Mask", {"Mask"}}}, attrs);
dropout_op->Run(scope, place);
std::vector<float> out_vec;
TensorToVector(*out_tensor, ctx, &out_vec);
std::vector<float> std_out = {
0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1,
1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0,
1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1,
1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0,
1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1};
EXPECT_EQ(out_vec.size(), std_out.size());
for (uint32_t i = 0; i < out_vec.size(); i++) {
EXPECT_EQ(out_vec[i], std_out[i]);
}
}
TEST(Dropout, CPUDense) {
f::Scope scope;
p::CPUPlace place;
p::CPUDeviceContext ctx(place);
Compare(scope, ctx);
}
TEST(Dropout, GPUDense) {
f::Scope scope;
p::CUDAPlace place;
p::CUDADeviceContext ctx(place);
Compare(scope, ctx);
}
...@@ -22,6 +22,103 @@ limitations under the License. */ ...@@ -22,6 +22,103 @@ limitations under the License. */
namespace paddle { namespace paddle {
namespace operators { namespace operators {
// Wrap RowwiseMean and ColwiseMean.
// Reuse the CPU code and replace the GPU code with cublas_gemv, which is
// significantly faster. Unlike RowwiseMean and ColwiseMean, this
// implementation only considers the 2D case.
template <typename DeviceContext, typename T>
struct RowwiseMean2D {
RowwiseMean2D(int left, int right, const platform::DeviceContext& dev_ctx);
void operator()(const platform::DeviceContext& context,
const framework::Tensor& input, framework::Tensor* vec);
};
#ifdef PADDLE_WITH_CUDA
template <typename T>
class RowwiseMean2D<platform::CUDADeviceContext, T> {
public:
RowwiseMean2D(int left, int right, const platform::DeviceContext& dev_ctx)
: left_(left), right_(right) {
framework::DDim ones_dim({right_});
divisor_.mutable_data<T>(ones_dim, dev_ctx.GetPlace());
math::set_constant(dev_ctx, &divisor_, 1.0 / right);
}
void operator()(const platform::CUDADeviceContext& context,
const framework::Tensor& input, framework::Tensor* out) {
math::gemv<platform::CUDADeviceContext, T>(
context, false, left_, right_, 1., input.data<T>(), divisor_.data<T>(),
0., out->data<T>());
}
private:
int left_;
int right_;
framework::Tensor divisor_;
};
#endif
template <typename T>
class RowwiseMean2D<platform::CPUDeviceContext, T> {
public:
RowwiseMean2D(int left, int right, const platform::DeviceContext& dev_ctx) {}
void operator()(const platform::CPUDeviceContext& context,
const framework::Tensor& input, framework::Tensor* out) {
row_mean_(context, input, out);
}
private:
math::RowwiseMean<platform::CPUDeviceContext, T> row_mean_;
};
template <typename DeviceContext, typename T>
struct ColwiseSum2D {
ColwiseSum2D(int left, int right, const platform::DeviceContext& dev_ctx);
void operator()(const platform::DeviceContext& context,
const framework::Tensor& input, framework::Tensor* vec);
};
#ifdef PADDLE_WITH_CUDA
template <typename T>
class ColwiseSum2D<platform::CUDADeviceContext, T> {
public:
ColwiseSum2D(int left, int right, const platform::DeviceContext& dev_ctx)
: left_(left), right_(right) {
framework::DDim ones_dim({left_});
divisor_.mutable_data<T>(ones_dim, dev_ctx.GetPlace());
math::set_constant(dev_ctx, &divisor_, 1.0);
}
void operator()(const platform::CUDADeviceContext& context,
const framework::Tensor& input, framework::Tensor* out) {
math::gemv<platform::CUDADeviceContext, T>(
context, true, left_, right_, 1., input.data<T>(), divisor_.data<T>(),
0., out->data<T>());
}
private:
int left_;
int right_;
framework::Tensor divisor_;
};
#endif
template <typename T>
class ColwiseSum2D<platform::CPUDeviceContext, T> {
public:
ColwiseSum2D(int left, int right, const platform::DeviceContext& dev_ctx) {}
void operator()(const platform::CPUDeviceContext& context,
const framework::Tensor& input, framework::Tensor* out) {
col_wise_(context, input, out);
}
private:
math::ColwiseSum<platform::CPUDeviceContext, T> col_wise_;
};
template <typename T> template <typename T>
struct SubAndSquareFunctor { struct SubAndSquareFunctor {
inline HOSTDEVICE T operator()(T a, T b) const { return (a - b) * (a - b); } inline HOSTDEVICE T operator()(T a, T b) const { return (a - b) * (a - b); }
...@@ -67,15 +164,15 @@ using DataLayout = framework::DataLayout; ...@@ -67,15 +164,15 @@ using DataLayout = framework::DataLayout;
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
class LayerNormKernel : public framework::OpKernel<T> { class LayerNormKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext &ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
const float epsilon = ctx.Attr<float>("epsilon"); const float epsilon = ctx.Attr<float>("epsilon");
auto *scale = ctx.Input<Tensor>("Scale"); auto* scale = ctx.Input<Tensor>("Scale");
auto *bias = ctx.Input<Tensor>("Bias"); auto* bias = ctx.Input<Tensor>("Bias");
auto x = *ctx.Input<Tensor>("X"); auto x = *ctx.Input<Tensor>("X");
auto *y = ctx.Output<Tensor>("Y"); auto* y = ctx.Output<Tensor>("Y");
auto *mean = ctx.Output<Tensor>("Mean"); auto* mean = ctx.Output<Tensor>("Mean");
auto *var = ctx.Output<Tensor>("Variance"); auto* var = ctx.Output<Tensor>("Variance");
const auto begin_norm_axis = ctx.Attr<int>("begin_norm_axis"); const auto begin_norm_axis = ctx.Attr<int>("begin_norm_axis");
const auto x_dims = x.dims(); const auto x_dims = x.dims();
...@@ -94,8 +191,8 @@ class LayerNormKernel : public framework::OpKernel<T> { ...@@ -94,8 +191,8 @@ class LayerNormKernel : public framework::OpKernel<T> {
out.ShareDataWith(*y); out.ShareDataWith(*y);
out.Resize(matrix_shape); out.Resize(matrix_shape);
auto &dev_ctx = ctx.template device_context<DeviceContext>(); auto& dev_ctx = ctx.template device_context<DeviceContext>();
math::RowwiseMean<DeviceContext, T> row_mean; RowwiseMean2D<DeviceContext, T> row_mean(left, right, ctx.device_context());
// get mean // get mean
row_mean(dev_ctx, x, mean); row_mean(dev_ctx, x, mean);
...@@ -126,31 +223,32 @@ class LayerNormKernel : public framework::OpKernel<T> { ...@@ -126,31 +223,32 @@ class LayerNormKernel : public framework::OpKernel<T> {
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
class LayerNormGradKernel : public framework::OpKernel<T> { class LayerNormGradKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext &ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
const float epsilon = ctx.Attr<float>("epsilon"); const float epsilon = ctx.Attr<float>("epsilon");
auto x = *ctx.Input<Tensor>("X"); auto x = *ctx.Input<Tensor>("X");
auto *y = ctx.Input<Tensor>("Y"); auto* y = ctx.Input<Tensor>("Y");
auto *mean = ctx.Input<Tensor>("Mean"); auto* mean = ctx.Input<Tensor>("Mean");
auto *var = ctx.Input<Tensor>("Variance"); auto* var = ctx.Input<Tensor>("Variance");
auto *scale = ctx.Input<Tensor>("Scale"); auto* scale = ctx.Input<Tensor>("Scale");
auto *bias = ctx.Input<Tensor>("Bias"); auto* bias = ctx.Input<Tensor>("Bias");
auto d_y = *ctx.Input<Tensor>(framework::GradVarName("Y")); auto d_y = *ctx.Input<Tensor>(framework::GradVarName("Y"));
const auto begin_norm_axis = ctx.Attr<int>("begin_norm_axis"); const auto begin_norm_axis = ctx.Attr<int>("begin_norm_axis");
// init output // init output
auto *d_x = ctx.Output<Tensor>(framework::GradVarName("X")); auto* d_x = ctx.Output<Tensor>(framework::GradVarName("X"));
auto *d_scale = ctx.Output<Tensor>(framework::GradVarName("Scale")); auto* d_scale = ctx.Output<Tensor>(framework::GradVarName("Scale"));
auto *d_bias = ctx.Output<Tensor>(framework::GradVarName("Bias")); auto* d_bias = ctx.Output<Tensor>(framework::GradVarName("Bias"));
const auto &x_dims = x.dims(); const auto& x_dims = x.dims();
auto matrix_dim = framework::flatten_to_2d(x_dims, begin_norm_axis); auto matrix_dim = framework::flatten_to_2d(x_dims, begin_norm_axis);
int left = static_cast<int>(matrix_dim[0]); int left = static_cast<int>(matrix_dim[0]);
int right = static_cast<int>(matrix_dim[1]); int right = static_cast<int>(matrix_dim[1]);
framework::DDim matrix_shape({left, right}); framework::DDim matrix_shape({left, right});
d_y.Resize(matrix_shape); d_y.Resize(matrix_shape);
auto &dev_ctx = ctx.template device_context<DeviceContext>(); auto& dev_ctx = ctx.template device_context<DeviceContext>();
math::ColwiseSum<DeviceContext, T> colwise_sum; ColwiseSum2D<DeviceContext, T> colwise_sum(left, right,
ctx.device_context());
Tensor temp; Tensor temp;
Tensor temp_norm; Tensor temp_norm;
...@@ -190,7 +288,8 @@ class LayerNormGradKernel : public framework::OpKernel<T> { ...@@ -190,7 +288,8 @@ class LayerNormGradKernel : public framework::OpKernel<T> {
Tensor temp_vec; Tensor temp_vec;
temp_vec.mutable_data<T>(vec_shape, ctx.GetPlace()); temp_vec.mutable_data<T>(vec_shape, ctx.GetPlace());
math::RowwiseMean<DeviceContext, T> row_mean; RowwiseMean2D<DeviceContext, T> row_mean(left, right,
ctx.device_context());
if (d_scale) { if (d_scale) {
// dy_dx // dy_dx
......
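The gemv trick behind the new RowwiseMean2D and ColwiseSum2D specializations is a standard identity: for X of shape L×R, multiplying by a constant vector turns a reduction into a matrix-vector product, which cuBLAS executes much faster than a generic elementwise reduction. In the notation of the code above, divisor_ plays the role of d (filled with 1/right) for the row mean and of e (filled with 1) for the column sum:

\mathrm{rowmean}(X)_i = \frac{1}{R}\sum_{j=1}^{R} X_{ij} = (X d)_i, \qquad d_j = \tfrac{1}{R}

\mathrm{colsum}(X)_j = \sum_{i=1}^{L} X_{ij} = (X^{\top} e)_j, \qquad e_i = 1

The first identity corresponds to the non-transposed gemv call, the second to the transposed one.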
...@@ -139,26 +139,25 @@ class ListenAndServOp : public framework::OperatorBase { ...@@ -139,26 +139,25 @@ class ListenAndServOp : public framework::OperatorBase {
// should be global ops. // should be global ops.
// NOTE: if is_gpu_place, CUDA kernels are launched by multiple threads // and this will still work.
// and this will still work. // and this will still work.
std::vector<std::future<void>> fs; std::vector<std::future<void>> fs;
// block0 contains only listen_and_serv op, start run from block1. // block0 contains only listen_and_serv op, start run from block1.
for (int blkid = 1; blkid < num_blocks - 1; ++blkid) { for (int blkid = 1; blkid < num_blocks - 1; ++blkid) {
fs.push_back(framework::Async([&executor, &program, &recv_scope, fs.push_back(
blkid]() { framework::Async([&executor, &program, &recv_scope, blkid]() {
int run_block = blkid; // thread local int run_block = blkid; // thread local
try { try {
executor.Run(*program, &recv_scope, run_block, executor.Run(*program, &recv_scope, run_block, false, false);
false /*create_local_scope*/, false /*create_vars*/); } catch (std::exception &e) {
} catch (std::exception &e) { LOG(ERROR) << "run sub program error " << e.what();
LOG(ERROR) << "run sub program error " << e.what(); }
} }));
}));
} }
for (int i = 0; i < num_blocks - 2; ++i) fs[i].wait(); for (int i = 0; i < num_blocks - 2; ++i) fs[i].wait();
// Run global block at final step, or block1 if there are only 2 blocks // Run global block at final step, or block1 if there are only 2 blocks
if (num_blocks >= 2) { if (num_blocks >= 2) {
try { try {
executor.Run(*program, &recv_scope, num_blocks - 1, executor.Run(*program, &recv_scope, num_blocks - 1, false, false);
false /*create_local_scope*/, false /*create_vars*/);
} catch (std::exception &e) { } catch (std::exception &e) {
LOG(ERROR) << "run sub program error " << e.what(); LOG(ERROR) << "run sub program error " << e.what();
} }
...@@ -177,6 +176,10 @@ class ListenAndServOp : public framework::OperatorBase { ...@@ -177,6 +176,10 @@ class ListenAndServOp : public framework::OperatorBase {
rpc_service_->WaitClientGet(fan_in); rpc_service_->WaitClientGet(fan_in);
sparse_vars.clear(); sparse_vars.clear();
} // while(true) } // while(true)
// for (int i = 0; i < num_blocks; ++i) {
// delete blk_ctx_list[i];
// }
} }
protected: protected:
......
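The reformatted loop is a plain fan-out/join: one future per intermediate block, all run concurrently, all joined before the final block executes. The same shape in standard C++, with std::async standing in for framework::Async and the executor call reduced to a placeholder:

#include <future>
#include <vector>

void RunBlocksConcurrently(int num_blocks) {
  std::vector<std::future<void>> fs;
  for (int blkid = 1; blkid < num_blocks - 1; ++blkid) {
    fs.push_back(std::async(std::launch::async, [blkid] {
      (void)blkid;  // placeholder for executor.Run(*program, &recv_scope, blkid, false, false);
    }));
  }
  for (auto& f : fs) f.wait();  // join all sub-blocks
  // placeholder for the final executor.Run(..., num_blocks - 1, ...) call
}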
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/operators/lrn_op.h"
#include "paddle/fluid/platform/mkldnn_helper.h"
namespace paddle {
namespace operators {
using paddle::framework::Tensor;
using paddle::platform::MKLDNNDeviceContext;
namespace {
template <typename T, typename... Args>
std::shared_ptr<T> insert_to_context(const std::string& key,
const MKLDNNDeviceContext& dev_ctx,
Args&&... args) {
auto p = std::static_pointer_cast<T, void>(dev_ctx.GetBlob(key));
if (!p) {
p = std::make_shared<T>(args...);
dev_ctx.SetBlob(key, std::static_pointer_cast<void, T>(p));
}
return p;
}
template <typename... Args>
void run_primitive(Args&&... args) {
auto forward_op = mkldnn::lrn_forward{args...};
std::vector<mkldnn::primitive> pipeline = {forward_op};
mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
}
} // namespace
template <typename T>
class LRNMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
public:
void Compute(const paddle::framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE(std::is_same<T, float>::value,
"MKLDNN LRN must use float data.");
PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
"MKLDNN LRN must use CPUPlace.");
auto& dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
const auto& mkldnn_engine = dev_ctx.GetEngine();
auto x = ctx.Input<Tensor>("X");
auto out = ctx.Output<Tensor>("Out");
auto mid = ctx.Output<Tensor>("MidOut");
auto input_data = x->data<T>();
auto output_data = out->mutable_data<T>(ctx.GetPlace());
mid->mutable_data<T>(ctx.GetPlace());
const int n = ctx.Attr<int>("n");
const float alpha = ctx.Attr<float>("alpha");
const float beta = ctx.Attr<float>("beta");
const float k = ctx.Attr<float>("k");
const bool is_test = ctx.Attr<bool>("is_test");
auto e_mid = framework::EigenTensor<T, 4>::From(*mid);
e_mid = e_mid.constant(k);
auto dims = paddle::framework::vectorize2int(x->dims());
auto src_md = paddle::platform::MKLDNNMemDesc(
dims, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw);
auto dst_md = paddle::platform::MKLDNNMemDesc(
dims, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw);
auto forward_desc = mkldnn::lrn_forward::desc{mkldnn::prop_kind::forward,
mkldnn::lrn_across_channels,
src_md,
n,
alpha,
beta,
k};
auto src_memory_pd = mkldnn::memory::primitive_desc{src_md, mkldnn_engine};
auto dst_memory = mkldnn::memory{{dst_md, mkldnn_engine},
static_cast<void*>(output_data)};
if (!is_test) {
const std::string key = ctx.op().Output("Out");
const std::string key_src_memory = key + "@lrn_src_memory";
const std::string key_pd = key + "@lrn_pd";
const std::string key_workspace_memory = key + "@lrn_workspace_memory";
auto forward_pd = insert_to_context<mkldnn::lrn_forward::primitive_desc>(
key_pd, dev_ctx, forward_desc, mkldnn_engine);
auto src_memory = insert_to_context<mkldnn::memory>(
key_src_memory, dev_ctx, src_memory_pd);
src_memory->set_data_handle(
static_cast<void*>(const_cast<T*>(input_data)));
auto workspace_memory = insert_to_context<mkldnn::memory>(
key_workspace_memory, dev_ctx,
forward_pd->workspace_primitive_desc());
run_primitive(*forward_pd, *src_memory, *workspace_memory, dst_memory);
} else {
auto forward_pd =
mkldnn::lrn_forward::primitive_desc{forward_desc, mkldnn_engine};
auto src_memory = mkldnn::memory{
src_memory_pd, static_cast<void*>(const_cast<T*>(input_data))};
auto workspace_memory =
mkldnn::memory{forward_pd.workspace_primitive_desc()};
run_primitive(forward_pd, src_memory, workspace_memory, dst_memory);
}
}
};
template <typename T>
class LRNMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
public:
void Compute(const paddle::framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE(std::is_same<T, float>::value,
"MKLDNN LRN must use float data.");
PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
"MKLDNN LRN must use CPUPlace.");
PADDLE_ENFORCE(
!ctx.Attr<bool>("is_test"),
"is_test attribute should be set to False in training phase.");
auto x = ctx.Input<Tensor>("X");
auto out_grad = ctx.Input<Tensor>(framework::GradVarName("Out"));
auto x_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
const std::string key = ctx.op().Input("Out");
const std::string key_src_memory = key + "@lrn_src_memory";
const std::string key_pd = key + "@lrn_pd";
const std::string key_workspace_memory = key + "@lrn_workspace_memory";
const int n = ctx.Attr<int>("n");
const float alpha = ctx.Attr<float>("alpha");
const float beta = ctx.Attr<float>("beta");
const float k = ctx.Attr<float>("k");
auto& dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
const auto& mkldnn_engine = dev_ctx.GetEngine();
auto x_grad_data = x_grad->mutable_data<T>(ctx.GetPlace());
auto out_grad_data = out_grad->data<T>();
auto dims = paddle::framework::vectorize2int(x->dims());
auto src_md = paddle::platform::MKLDNNMemDesc(
dims, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw);
auto diff_src_md = paddle::platform::MKLDNNMemDesc(
dims, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw);
auto diff_dst_md = paddle::platform::MKLDNNMemDesc(
dims, mkldnn::memory::data_type::f32, mkldnn::memory::format::nchw);
auto diff_dst_memory =
mkldnn::memory{{diff_dst_md, mkldnn_engine},
static_cast<void*>(const_cast<float*>(out_grad_data))};
auto diff_src_memory = mkldnn::memory{{diff_src_md, mkldnn_engine},
static_cast<void*>(x_grad_data)};
auto backward_desc = mkldnn::lrn_backward::desc{
mkldnn::lrn_across_channels, src_md, diff_src_md, n, alpha, beta, k};
auto forward_pd = dev_ctx.GetBlob(key_pd);
auto backward_pd = mkldnn::lrn_backward::primitive_desc{
backward_desc, mkldnn_engine,
*static_cast<mkldnn::lrn_forward::primitive_desc*>(forward_pd.get())};
std::shared_ptr<void> workspace_memory =
dev_ctx.GetBlob(key_workspace_memory);
auto src_memory = dev_ctx.GetBlob(key_src_memory);
auto backward_op = mkldnn::lrn_backward{
backward_pd, *static_cast<mkldnn::memory*>(src_memory.get()),
diff_dst_memory, *static_cast<mkldnn::memory*>(workspace_memory.get()),
diff_src_memory};
std::vector<mkldnn::primitive> pipeline = {backward_op};
mkldnn::stream(mkldnn::stream::kind::eager).submit(pipeline).wait();
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_KERNEL(lrn, MKLDNN, paddle::platform::CPUPlace,
ops::LRNMKLDNNOpKernel<float>);
REGISTER_OP_KERNEL(lrn_grad, MKLDNN, paddle::platform::CPUPlace,
ops::LRNMKLDNNGradOpKernel<float>);
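The forward and backward kernels above communicate through the device context's blob cache: the forward pass stores its primitive descriptor, source memory, and workspace under keys derived from the name of the "Out" variable (ctx.op().Output("Out")), and the backward pass rebuilds the same keys from ctx.op().Input("Out"), which names the same graph variable. A minimal standalone analogue of that handshake, with the container a stand-in for MKLDNNDeviceContext storage:

#include <iostream>
#include <map>
#include <memory>
#include <string>

std::map<std::string, std::shared_ptr<void>> blobs;  // stand-in for dev_ctx storage

void SetBlob(const std::string& key, std::shared_ptr<void> v) { blobs[key] = v; }
std::shared_ptr<void> GetBlob(const std::string& key) {
  auto it = blobs.find(key);
  return it == blobs.end() ? nullptr : it->second;
}

int main() {
  const std::string key = "out_var@lrn_pd";  // forward: Output("Out") + suffix
  SetBlob(key, std::make_shared<int>(7));    // forward caches its primitive desc
  // backward: Input("Out") resolves to the same variable name, so keys match
  auto pd = std::static_pointer_cast<int>(GetBlob(key));
  std::cout << (pd ? *pd : -1) << "\n";      // prints 7
  return 0;
}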
...@@ -13,6 +13,9 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,9 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/lrn_op.h" #include "paddle/fluid/operators/lrn_op.h"
#ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h"
#endif
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -116,6 +119,26 @@ struct LRNGradFunctor<platform::CPUDeviceContext, T> { ...@@ -116,6 +119,26 @@ struct LRNGradFunctor<platform::CPUDeviceContext, T> {
template struct LRNGradFunctor<platform::CPUDeviceContext, float>; template struct LRNGradFunctor<platform::CPUDeviceContext, float>;
template struct LRNGradFunctor<platform::CPUDeviceContext, double>; template struct LRNGradFunctor<platform::CPUDeviceContext, double>;
namespace {
framework::OpKernelType GetExpectedLRNKernel(
const framework::ExecutionContext& ctx) {
framework::LibraryType library_{framework::LibraryType::kPlain};
#ifdef PADDLE_WITH_MKLDNN
if (library_ == framework::LibraryType::kPlain &&
platform::CanMKLDNNBeUsed(ctx)) {
library_ = framework::LibraryType::kMKLDNN;
}
#endif
std::string data_format = ctx.Attr<std::string>("data_format");
// TODO(pzelazko-intel): enable MKLDNN layout when it's ready
framework::DataLayout layout_ = framework::StringToDataLayout(data_format);
return framework::OpKernelType(
framework::ToDataType(ctx.Input<Tensor>("X")->type()), ctx.GetPlace(),
layout_, library_);
}
} // namespace
class LRNOp : public framework::OperatorWithKernel { class LRNOp : public framework::OperatorWithKernel {
public: public:
using framework::OperatorWithKernel::OperatorWithKernel; using framework::OperatorWithKernel::OperatorWithKernel;
...@@ -132,8 +155,13 @@ class LRNOp : public framework::OperatorWithKernel { ...@@ -132,8 +155,13 @@ class LRNOp : public framework::OperatorWithKernel {
PADDLE_ENFORCE_EQ(x_dim.size(), 4, "Input(X)'s rank of LRNOp should be 4."); PADDLE_ENFORCE_EQ(x_dim.size(), 4, "Input(X)'s rank of LRNOp should be 4.");
ctx->SetOutputDim("Out", x_dim); ctx->SetOutputDim("Out", x_dim);
ctx->SetOutputDim("MidOut", x_dim);
ctx->ShareLoD("X", /*->*/ "Out"); ctx->ShareLoD("X", /*->*/ "Out");
ctx->SetOutputDim("MidOut", x_dim);
}
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return GetExpectedLRNKernel(ctx);
} }
}; };
...@@ -176,6 +204,17 @@ class LRNOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -176,6 +204,17 @@ class LRNOpMaker : public framework::OpProtoAndCheckerMaker {
"beta is the power number.") "beta is the power number.")
.SetDefault(0.75) .SetDefault(0.75)
.GreaterThan(0.0); .GreaterThan(0.0);
AddAttr<bool>("use_mkldnn",
"(bool, default false) Only used in mkldnn kernel")
.SetDefault(false);
AddAttr<std::string>(
"data_format",
"(string, default NCHW) Only used in "
"An optional string from: \"NHWC\", \"NCHW\". "
"Defaults to \"NHWC\". Specify the data format of the output data, "
"the input will be transformed automatically. ")
.SetDefault("AnyLayout");
AddAttr<bool>("is_test", "").SetDefault(false);
AddComment(R"DOC( AddComment(R"DOC(
Local Response Normalization Operator. Local Response Normalization Operator.
...@@ -223,8 +262,12 @@ class LRNOpGrad : public framework::OperatorWithKernel { ...@@ -223,8 +262,12 @@ class LRNOpGrad : public framework::OperatorWithKernel {
auto x_dims = ctx->GetInputDim("X"); auto x_dims = ctx->GetInputDim("X");
ctx->SetOutputDim(framework::GradVarName("X"), x_dims); ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
} }
};
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return GetExpectedLRNKernel(ctx);
}
};
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
......
...@@ -6,6 +6,7 @@ function(math_library TARGET) ...@@ -6,6 +6,7 @@ function(math_library TARGET)
# But it handles splitting GPU/CPU code and linking some common libraries. # But it handles splitting GPU/CPU code and linking some common libraries.
set(cc_srcs) set(cc_srcs)
set(cu_srcs) set(cu_srcs)
set(hip_srcs)
set(math_common_deps device_context framework_proto) set(math_common_deps device_context framework_proto)
set(multiValueArgs DEPS) set(multiValueArgs DEPS)
cmake_parse_arguments(math_library "${options}" "${oneValueArgs}" cmake_parse_arguments(math_library "${options}" "${oneValueArgs}"
...@@ -17,10 +18,15 @@ function(math_library TARGET) ...@@ -17,10 +18,15 @@ function(math_library TARGET)
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu) if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu)
list(APPEND cu_srcs ${TARGET}.cu) list(APPEND cu_srcs ${TARGET}.cu)
endif() endif()
if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.hip.cu)
list(APPEND hip_srcs ${TARGET}.hip.cu)
endif()
list(LENGTH cc_srcs cc_srcs_len) list(LENGTH cc_srcs cc_srcs_len)
if (WITH_GPU) if (WITH_GPU)
nv_library(${TARGET} SRCS ${cc_srcs} ${cu_srcs} DEPS ${math_library_DEPS} ${math_common_deps}) nv_library(${TARGET} SRCS ${cc_srcs} ${cu_srcs} DEPS ${math_library_DEPS} ${math_common_deps})
elseif (WITH_AMD_GPU)
hip_library(${TARGET} SRCS ${cc_srcs} ${hip_srcs} DEPS ${math_library_DEPS} ${math_common_deps})
elseif(${cc_srcs_len} GREATER 0) elseif(${cc_srcs_len} GREATER 0)
cc_library(${TARGET} SRCS ${cc_srcs} DEPS ${math_library_DEPS} ${math_common_deps}) cc_library(${TARGET} SRCS ${cc_srcs} DEPS ${math_library_DEPS} ${math_common_deps})
endif() endif()
......
...@@ -20,7 +20,7 @@ namespace math { ...@@ -20,7 +20,7 @@ namespace math {
/* /*
* All tensors' dimension should be the same and the values of * All tensors' rank should be the same and the size of
* each dimension are the same, except the axis dimension. * each dimension must be the same, except the axis dimension.
*/ */
template <typename T> template <typename T>
class ConcatFunctor<platform::CPUDeviceContext, T> { class ConcatFunctor<platform::CPUDeviceContext, T> {
...@@ -63,7 +63,7 @@ class ConcatFunctor<platform::CPUDeviceContext, T> { ...@@ -63,7 +63,7 @@ class ConcatFunctor<platform::CPUDeviceContext, T> {
/* /*
* All tensors' dimension should be the same and the values of * All tensors' rank should be the same and the size of
* each dimension are the same, except the axis dimension. * each dimension must be the same, except the axis dimension.
*/ */
template <typename T> template <typename T>
class ConcatGradFunctor<platform::CPUDeviceContext, T> { class ConcatGradFunctor<platform::CPUDeviceContext, T> {
......
...@@ -66,68 +66,66 @@ __global__ void KernelConcat(T** inputs, const int* input_cols, int col_size, ...@@ -66,68 +66,66 @@ __global__ void KernelConcat(T** inputs, const int* input_cols, int col_size,
} }
template <typename T> template <typename T>
__global__ void KernelConcat(T** inputs, const int input_col, __global__ void KernelConcat(T** inputs_data, const int fixed_in_col,
const int output_rows, const int output_cols, const int out_rows, const int out_cols,
T* output) { T* output_data) {
int tid_x = blockIdx.x * blockDim.x + threadIdx.x; int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
double inv_input_col = 1.0 / input_col; for (; tid_x < out_cols; tid_x += blockDim.x * gridDim.x) {
for (; tid_x < output_cols; tid_x += blockDim.x * gridDim.x) { int split = tid_x * 1.0 / fixed_in_col;
int split = tid_x * inv_input_col; int in_offset = tid_x - split * fixed_in_col;
int in_offset = tid_x - split * input_col; T* input_ptr = inputs_data[split];
T* input_ptr = inputs[split];
int tid_y = blockIdx.y * blockDim.y + threadIdx.y; int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
for (; tid_y < output_rows; tid_y += blockDim.y * gridDim.y) { for (; tid_y < out_rows; tid_y += blockDim.y * gridDim.y) {
output[tid_y * output_cols + tid_x] = output_data[tid_y * out_cols + tid_x] =
input_ptr[tid_y * input_col + in_offset]; input_ptr[tid_y * fixed_in_col + in_offset];
} }
} }
} }
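In the fixed-width variant above, every input contributes fixed_in_col columns, so the kernel recovers the source tensor and the local column from a flat output column with one integer division. A small Python sketch of the index arithmetic:

def concat_col_to_input(tid_x, fixed_in_col):
    split = tid_x // fixed_in_col             # which input tensor
    in_offset = tid_x - split * fixed_in_col  # column inside that input
    return split, in_offset

# Two inputs of 3 columns each: output column 4 lives in input 1, column 1.
assert concat_col_to_input(4, 3) == (1, 1)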
template <typename T> template <typename T>
__global__ void KernelConcatGrad(const T* input, const int input_row, __global__ void KernelConcatGrad(const T* input_data, const int in_row,
const int input_col, const int* output_cols, const int in_col, const int* out_cols,
int col_size, T** outputs) { int out_cols_size, T** outputs_data) {
int tid_x = blockIdx.x * blockDim.x + threadIdx.x; int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
int segment = upper_bound<int>(output_cols, col_size, tid_x) - 1; int segment = upper_bound<int>(out_cols, out_cols_size, tid_x) - 1;
int curr_offset = output_cols[segment]; int curr_offset = out_cols[segment];
int curr_segment = segment; int curr_segment = segment;
for (; tid_x < input_col; tid_x += blockDim.x * gridDim.x) { for (; tid_x < in_col; tid_x += blockDim.x * gridDim.x) {
T curr_col_offset; T curr_col_offset;
while ((curr_col_offset = output_cols[curr_segment + 1]) <= tid_x) { while ((curr_col_offset = out_cols[curr_segment + 1]) <= tid_x) {
curr_offset = curr_col_offset; curr_offset = curr_col_offset;
++curr_segment; ++curr_segment;
} }
int local_col = tid_x - curr_offset; int local_col = tid_x - curr_offset;
int segment_width = curr_col_offset - curr_offset; int segment_width = curr_col_offset - curr_offset;
T* output_ptr = outputs[curr_segment]; T* output_ptr = outputs_data[curr_segment];
int tid_y = blockIdx.y * blockDim.y + threadIdx.y; int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
for (; tid_y < input_row; tid_y += blockDim.y * gridDim.y) for (; tid_y < in_row; tid_y += blockDim.y * gridDim.y)
output_ptr[tid_y * segment_width + local_col] = output_ptr[tid_y * segment_width + local_col] =
input[tid_y * input_col + tid_x]; input_data[tid_y * in_col + tid_x];
} }
} }
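When the outputs have unequal widths, the grad kernel instead searches the prefix-sum table out_cols with upper_bound to find the segment whose half-open column range contains tid_x, and carries curr_segment across the grid-stride loop so consecutive columns rarely re-search. The same lookup in Python, using bisect for the upper bound:

import bisect

def col_to_segment(tid_x, out_cols):
    # out_cols holds cumulative column offsets, e.g. [0, 3, 5, 9].
    segment = bisect.bisect_right(out_cols, tid_x) - 1
    local_col = tid_x - out_cols[segment]
    return segment, local_col

# Column 4 falls in the half-open range [3, 5): output 1, local column 1.
assert col_to_segment(4, [0, 3, 5, 9]) == (1, 1)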
template <typename T> template <typename T>
__global__ void KernelConcatGrad(const T* input, const int input_row, __global__ void KernelConcatGrad(const T* input_data, const int in_row,
const int input_col, const int output_cols, const int in_col, const int fixed_out_col,
T** outputs) { T** outputs_data) {
int tid_x = blockIdx.x * blockDim.x + threadIdx.x; int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
double inv_input_col = 1.0 / input_col; for (; tid_x < in_col; tid_x += blockDim.x * gridDim.x) {
for (; tid_x < input_col; tid_x += blockDim.x * gridDim.x) { int split = tid_x / fixed_out_col;
int split = tid_x * inv_input_col; int in_offset = tid_x - split * fixed_out_col;
int in_offset = tid_x - split * input_col; T* output_ptr = outputs_data[split];
T* output_ptr = outputs[split];
int tid_y = blockIdx.y * blockDim.y + threadIdx.y; int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
for (; tid_y < input_row; tid_y += blockDim.y * gridDim.y) for (; tid_y < in_row; tid_y += blockDim.y * gridDim.y)
output_ptr[tid_y * output_cols + in_offset] = output_ptr[tid_y * fixed_out_col + in_offset] =
input[tid_y * input_col + tid_x]; input_data[tid_y * in_col + tid_x];
} }
} }
/* /*
* All tensors' dimension should be the same and the values of * All tensors' rank should be the same and the size of
* each dimension are the same, except the axis dimension. * each dimension must be the same, except the axis dimension.
*/ */
template <typename T> template <typename T>
class ConcatFunctor<platform::CUDADeviceContext, T> { class ConcatFunctor<platform::CUDADeviceContext, T> {
...@@ -136,41 +134,40 @@ class ConcatFunctor<platform::CUDADeviceContext, T> { ...@@ -136,41 +134,40 @@ class ConcatFunctor<platform::CUDADeviceContext, T> {
const std::vector<framework::Tensor>& input, const int axis, const std::vector<framework::Tensor>& input, const int axis,
framework::Tensor* output) { framework::Tensor* output) {
// TODO(zcd): Add input data validity checking // TODO(zcd): Add input data validity checking
int num = input.size(); int in_num = input.size();
int rows = 1; int in_row = 1;
auto dim_0 = input[0].dims(); auto dim_0 = input[0].dims();
for (int i = 0; i < axis; ++i) { for (int i = 0; i < axis; ++i) {
rows *= dim_0[i]; in_row *= dim_0[i];
} }
int cols = input[0].numel() / rows; int in_col = input[0].numel() / in_row;
int out_rows = rows, out_cols = 0; int out_row = in_row, out_col = 0;
framework::Vector<int16_t> inputs_data(num * sizeof(T*) / 2); framework::Vector<int16_t> inputs_data(in_num * sizeof(T*) / 2);
framework::Vector<int> inputs_cols(num + 1); framework::Vector<int> inputs_col(in_num + 1);
inputs_cols[0] = 0;
T** inputs_ptr = reinterpret_cast<T**>(inputs_data.data()); T** inputs_ptr = reinterpret_cast<T**>(inputs_data.data());
inputs_col[0] = 0;
bool sameShape = true; bool sameShape = true;
for (int i = 0; i < num; ++i) { for (int i = 0; i < in_num; ++i) {
int t_cols = input[i].numel() / rows; int t_cols = input[i].numel() / in_row;
if (sameShape) { if (sameShape) {
if (t_cols != cols) sameShape = false; if (t_cols != in_col) sameShape = false;
} }
out_cols += t_cols; out_col += t_cols;
inputs_cols[i + 1] = out_cols; inputs_col[i + 1] = out_col;
inputs_ptr[i] = const_cast<T*>(input[i].data<T>()); inputs_ptr[i] = const_cast<T*>(input[i].data<T>());
} }
T** ins_gpu = T** dev_ins_data =
reinterpret_cast<T**>(inputs_data.CUDAMutableData(context.GetPlace())); reinterpret_cast<T**>(inputs_data.CUDAMutableData(context.GetPlace()));
const int* ins_col_gpu = inputs_cols.CUDAData(context.GetPlace());
// computation // computation
// set the thread block and grid according to CurrentDeviceId // set the thread block and grid according to CurrentDeviceId
const int kThreadsPerBlock = 1024; const int kThreadsPerBlock = 1024;
int block_cols = kThreadsPerBlock; int block_cols = kThreadsPerBlock;
if (out_cols < kThreadsPerBlock) { // block_cols is aligned by 32. if (out_col < kThreadsPerBlock) { // block_cols is aligned by 32.
block_cols = ((out_cols + 31) >> 5) << 5; block_cols = ((out_col + 31) >> 5) << 5;
} }
int block_rows = kThreadsPerBlock / block_cols; int block_rows = kThreadsPerBlock / block_cols;
dim3 block_size = dim3(block_cols, block_rows, 1); dim3 block_size = dim3(block_cols, block_rows, 1);
...@@ -179,25 +176,26 @@ class ConcatFunctor<platform::CUDADeviceContext, T> { ...@@ -179,25 +176,26 @@ class ConcatFunctor<platform::CUDADeviceContext, T> {
int max_blocks = std::max(max_threads / kThreadsPerBlock, 1); int max_blocks = std::max(max_threads / kThreadsPerBlock, 1);
int grid_cols = int grid_cols =
std::min((out_cols + block_cols - 1) / block_cols, max_blocks); std::min((out_col + block_cols - 1) / block_cols, max_blocks);
int grid_rows = int grid_rows =
std::min(max_blocks / grid_cols, std::max(out_rows / block_rows, 1)); std::min(max_blocks / grid_cols, std::max(out_row / block_rows, 1));
dim3 grid_size = dim3(grid_cols, grid_rows, 1); dim3 grid_size = dim3(grid_cols, grid_rows, 1);
if (sameShape) { if (sameShape) {
KernelConcat<<<grid_size, block_size, 0, context.stream()>>>( KernelConcat<<<grid_size, block_size, 0, context.stream()>>>(
ins_gpu, cols, out_rows, out_cols, output->data<T>()); dev_ins_data, in_col, out_row, out_col, output->data<T>());
} else { } else {
const int* dev_ins_col_data = inputs_col.CUDAData(context.GetPlace());
KernelConcat<<<grid_size, block_size, 0, context.stream()>>>( KernelConcat<<<grid_size, block_size, 0, context.stream()>>>(
ins_gpu, ins_col_gpu, static_cast<int>(inputs_cols.size()), out_rows, dev_ins_data, dev_ins_col_data, static_cast<int>(inputs_col.size()),
out_cols, output->data<T>()); out_row, out_col, output->data<T>());
} }
} }
}; };
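The launch configuration in ConcatFunctor packs kThreadsPerBlock = 1024 threads per block, rounds the column dimension up to a multiple of 32 (one warp) when the output is narrow, and caps the grid by the device's thread budget. A sketch of the arithmetic, where max_threads stands for whatever context.GetMaxPhysicalThreadCount() reports:

def launch_config(out_col, out_row, max_threads, k_threads=1024):
    block_cols = k_threads
    if out_col < k_threads:                  # round up to a multiple of 32
        block_cols = ((out_col + 31) >> 5) << 5
    block_rows = k_threads // block_cols
    max_blocks = max(max_threads // k_threads, 1)
    grid_cols = min((out_col + block_cols - 1) // block_cols, max_blocks)
    grid_rows = min(max_blocks // grid_cols, max(out_row // block_rows, 1))
    return (block_cols, block_rows), (grid_cols, grid_rows)

# A 100-column, 64-row output on a device reporting 2048 * 68 threads:
assert launch_config(100, 64, 2048 * 68) == ((128, 8), (1, 8))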
/* /*
* All tensors' dimension should be the same and the values of * All tensors' rank should be the same and the size of
* each dimension are the same, except the axis dimension. * each dimension must be the same, except the axis dimension.
*/ */
template <typename T> template <typename T>
class ConcatGradFunctor<platform::CUDADeviceContext, T> { class ConcatGradFunctor<platform::CUDADeviceContext, T> {
...@@ -206,41 +204,40 @@ class ConcatGradFunctor<platform::CUDADeviceContext, T> { ...@@ -206,41 +204,40 @@ class ConcatGradFunctor<platform::CUDADeviceContext, T> {
const framework::Tensor& input, const int axis, const framework::Tensor& input, const int axis,
std::vector<framework::Tensor>& outputs) { std::vector<framework::Tensor>& outputs) {
// TODO(zcd): Add input data validity checking // TODO(zcd): Add input data validity checking
int num = outputs.size(); int o_num = outputs.size();
int input_row = 1; int out_row = 1;
auto dim_0 = outputs[0].dims(); auto dim_0 = outputs[0].dims();
for (int i = 0; i < axis; ++i) { for (int i = 0; i < axis; ++i) {
input_row *= dim_0[i]; out_row *= dim_0[i];
} }
int output_col_0 = outputs[0].numel() / input_row; int out_col = outputs[0].numel() / out_row;
int input_col = 0; int in_col = 0, in_row = out_row;
bool sameShape = true; bool sameShape = true;
framework::Vector<int16_t> outputs_data(num * sizeof(T*) / 2); framework::Vector<int16_t> outputs_data(o_num * sizeof(T*) / 2);
framework::Vector<int> outputs_cols(num + 1); framework::Vector<int> outputs_cols(o_num + 1);
outputs_cols[0] = 0;
T** outputs_ptr = reinterpret_cast<T**>(outputs_data.data()); T** outputs_ptr = reinterpret_cast<T**>(outputs_data.data());
for (int i = 0; i < num; ++i) { outputs_cols[0] = 0;
int t_col = outputs[i].numel() / input_row; for (int i = 0; i < o_num; ++i) {
int t_col = outputs[i].numel() / out_row;
if (sameShape) { if (sameShape) {
if (t_col != output_col_0) sameShape = false; if (t_col != out_col) sameShape = false;
} }
input_col += t_col; in_col += t_col;
outputs_cols[i + 1] = input_col; outputs_cols[i + 1] = in_col;
outputs_ptr[i] = outputs[i].data<T>(); outputs_ptr[i] = outputs[i].data<T>();
} }
T** outs_gpu = T** dev_out_gpu_data =
reinterpret_cast<T**>(outputs_data.CUDAMutableData(context.GetPlace())); reinterpret_cast<T**>(outputs_data.CUDAMutableData(context.GetPlace()));
const int* outs_col_gpu = outputs_cols.CUDAData(context.GetPlace());
// computation // computation
const int kThreadsPerBlock = 1024; const int kThreadsPerBlock = 1024;
int block_cols = kThreadsPerBlock; int block_cols = kThreadsPerBlock;
if (input_col < kThreadsPerBlock) { // block_cols is aligned by 32. if (in_col < kThreadsPerBlock) { // block_cols is aligned by 32.
block_cols = ((input_col + 31) >> 5) << 5; block_cols = ((in_col + 31) >> 5) << 5;
} }
int block_rows = kThreadsPerBlock / block_cols; int block_rows = kThreadsPerBlock / block_cols;
dim3 block_size = dim3(block_cols, block_rows, 1); dim3 block_size = dim3(block_cols, block_rows, 1);
...@@ -249,18 +246,19 @@ class ConcatGradFunctor<platform::CUDADeviceContext, T> { ...@@ -249,18 +246,19 @@ class ConcatGradFunctor<platform::CUDADeviceContext, T> {
int max_blocks = std::max(max_threads / kThreadsPerBlock, 1); int max_blocks = std::max(max_threads / kThreadsPerBlock, 1);
int grid_cols = int grid_cols =
std::min((input_col + block_cols - 1) / block_cols, max_blocks); std::min((in_col + block_cols - 1) / block_cols, max_blocks);
int grid_rows = int grid_rows =
std::min(max_blocks / grid_cols, std::max(input_row / block_rows, 1)); std::min(max_blocks / grid_cols, std::max(out_row / block_rows, 1));
dim3 grid_size = dim3(grid_cols, grid_rows, 1); dim3 grid_size = dim3(grid_cols, grid_rows, 1);
if (sameShape) { if (sameShape) {
KernelConcatGrad<<<grid_size, block_size, 0, context.stream()>>>( KernelConcatGrad<<<grid_size, block_size, 0, context.stream()>>>(
input.data<T>(), input_row, input_col, output_col_0, outs_gpu); input.data<T>(), in_row, in_col, out_col, dev_out_gpu_data);
} else { } else {
const int* dev_outs_col_data = outputs_cols.CUDAData(context.GetPlace());
KernelConcatGrad<<<grid_size, block_size, 0, context.stream()>>>( KernelConcatGrad<<<grid_size, block_size, 0, context.stream()>>>(
input.data<T>(), input_row, input_col, outs_col_gpu, input.data<T>(), in_row, in_col, dev_outs_col_data,
static_cast<int>(outputs_cols.size()), outs_gpu); static_cast<int>(outputs_cols.size()), dev_out_gpu_data);
} }
} }
}; };
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <hip/hip_runtime.h>
...@@ -19,13 +19,6 @@ limitations under the License. */ ...@@ -19,13 +19,6 @@ limitations under the License. */
#include <mkl_vml_functions.h> #include <mkl_vml_functions.h>
#endif #endif
#ifdef PADDLE_USE_ATLAS
extern "C" {
#include <cblas.h>
#include <clapack.h>
}
#endif
#ifdef PADDLE_USE_OPENBLAS #ifdef PADDLE_USE_OPENBLAS
#include <cblas.h> #include <cblas.h>
#include <lapacke.h> #include <lapacke.h>
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/detail/safe_ref.h"
#ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h"
#endif
namespace paddle {
namespace operators {
template <typename Functor>
class MKLDNNActivationKernel
: public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
public:
void Compute(const framework::ExecutionContext& context) const override {
PADDLE_ENFORCE(context.Input<framework::Tensor>("X") != nullptr,
"Cannot get input tensor X, variable name = %s",
context.op().Input("X"));
PADDLE_ENFORCE(context.Output<framework::Tensor>("Out") != nullptr,
"Cannot find output tensor Out, variable name = %s",
context.op().Output("Out"));
Functor functor;
auto attrs = functor.GetAttrs();
for (auto& attr : attrs) {
*attr.second = context.Attr<float>(attr.first);
}
functor(context);
}
};
template <typename Functor>
class MKLDNNActivationGradKernel
: public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
public:
void Compute(const framework::ExecutionContext& context) const override {
Functor functor;
auto attrs = functor.GetAttrs();
for (auto& attr : attrs) {
*attr.second = context.Attr<float>(attr.first);
}
functor(context);
}
};
namespace {
framework::OpKernelType GetKernelType(
const framework::ExecutionContext& ctx,
const framework::OperatorWithKernel& oper) {
framework::LibraryType library{framework::LibraryType::kPlain};
#ifdef PADDLE_WITH_MKLDNN
if (library == framework::LibraryType::kPlain &&
platform::CanMKLDNNBeUsed(ctx)) {
library = framework::LibraryType::kMKLDNN;
}
#endif
framework::DataLayout layout = framework::DataLayout::kAnyLayout;
return framework::OpKernelType(
framework::ToDataType(ctx.Input<framework::Tensor>("X")->type()),
ctx.GetPlace(), layout, library);
}
} // anonymous namespace
class ActivationWithMKLDNNOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
ctx->ShareLoD("X", /*->*/ "Out");
}
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return GetKernelType(ctx, *this);
}
};
class ActivationWithMKLDNNOpGrad : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("Out"));
}
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return GetKernelType(ctx, *this);
}
};
} // namespace operators
} // namespace paddle
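Note how the kernels stay generic over activation types: each functor publishes (attribute name, float*) pairs through GetAttrs(), and Compute fills those slots from the op's attributes before invoking the functor. A toy Python sketch of that contract, with one-element lists standing in for the float pointers:

def bind_attrs(functor_attrs, op_attrs):
    # Mirrors the attribute loop in MKLDNNActivationKernel::Compute.
    for name, slot in functor_attrs:
        slot[0] = op_attrs[name]

alpha = [0.0]  # plays the role of the functor's float* member
bind_attrs([("alpha", alpha)], {"alpha": 0.5})
assert alpha == [0.5]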
...@@ -144,7 +144,12 @@ class ParallelDoOp : public framework::OperatorBase { ...@@ -144,7 +144,12 @@ class ParallelDoOp : public framework::OperatorBase {
PADDLE_ENFORCE(scope.FindVar(param)->IsType<LoDTensor>(), PADDLE_ENFORCE(scope.FindVar(param)->IsType<LoDTensor>(),
"Only support parameter type as LoDTensor"); "Only support parameter type as LoDTensor");
auto &src = scope.FindVar(param)->Get<LoDTensor>(); auto &src = scope.FindVar(param)->Get<LoDTensor>();
for (size_t i = 0; i < sub_scopes.size(); ++i) {
auto *sub_scope0 = sub_scopes[0];
auto *dst0 = sub_scope0->Var(param)->GetMutable<LoDTensor>();
dst0->ShareDataWith(src);
for (size_t i = 1; i < sub_scopes.size(); ++i) {
auto &place = places[i]; auto &place = places[i];
auto *sub_scope = sub_scopes[i]; auto *sub_scope = sub_scopes[i];
auto *dst = sub_scope->Var(param)->GetMutable<LoDTensor>(); auto *dst = sub_scope->Var(param)->GetMutable<LoDTensor>();
......
...@@ -166,7 +166,9 @@ void DoubleBufferReader::PrefetchThreadFunc() { ...@@ -166,7 +166,9 @@ void DoubleBufferReader::PrefetchThreadFunc() {
std::swap(gpu_batch, batch.payloads_); std::swap(gpu_batch, batch.payloads_);
} }
if (!buffer_->Send(&batch)) { try {
buffer_->Send(&batch);
} catch (const paddle::platform::EnforceNotMet& e) {
VLOG(5) << "WARNING: The double buffer channel has been closed. The " VLOG(5) << "WARNING: The double buffer channel has been closed. The "
"prefetch thread will terminate."; "prefetch thread will terminate.";
break; break;
......
...@@ -146,14 +146,19 @@ void MultipleReader::PrefetchThreadFunc(std::string file_name, ...@@ -146,14 +146,19 @@ void MultipleReader::PrefetchThreadFunc(std::string file_name,
while (reader->HasNext()) { while (reader->HasNext()) {
std::vector<framework::LoDTensor> ins; std::vector<framework::LoDTensor> ins;
reader->ReadNext(&ins); reader->ReadNext(&ins);
if (!buffer_->Send(&ins)) { try {
buffer_->Send(&ins);
} catch (const paddle::platform::EnforceNotMet& e) {
VLOG(5) << "WARNING: The buffer channel has been closed. The prefetch " VLOG(5) << "WARNING: The buffer channel has been closed. The prefetch "
"thread of file '" "thread of file '"
<< file_name << "' will terminate."; << file_name << "' will terminate.";
break; break;
} }
} }
if (!available_thread_idx_->Send(&thread_idx)) {
try {
available_thread_idx_->Send(&thread_idx);
} catch (const paddle::platform::EnforceNotMet& e) {
VLOG(5) << "WARNING: The available_thread_idx_ channel has been closed. " VLOG(5) << "WARNING: The available_thread_idx_ channel has been closed. "
"Fail to send thread_idx."; "Fail to send thread_idx.";
} }
......
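Both reader changes above replace the old boolean contract of Send() with an exception: once the channel is closed, Send raises EnforceNotMet and the prefetch loop exits through the handler. A toy Python analogue of the new control flow (this Channel class is illustrative only, not the fluid one):

class ClosedChannel(Exception):
    pass

class Channel(object):
    def __init__(self):
        self.closed = False
        self.items = []

    def send(self, item):
        if self.closed:          # plays the role of EnforceNotMet
            raise ClosedChannel()
        self.items.append(item)

def prefetch(channel, batches):
    for batch in batches:
        try:
            channel.send(batch)
        except ClosedChannel:
            break                # the prefetch thread terminates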
...@@ -27,6 +27,7 @@ namespace operators { ...@@ -27,6 +27,7 @@ namespace operators {
static constexpr char kX[] = "X"; static constexpr char kX[] = "X";
static constexpr char kCaseToExecute[] = "case_to_execute"; static constexpr char kCaseToExecute[] = "case_to_execute";
static constexpr char kOutputs[] = "Out";
static constexpr char kCases[] = "cases"; static constexpr char kCases[] = "cases";
static constexpr char kCasesBlock[] = "sub_block"; static constexpr char kCasesBlock[] = "sub_block";
...@@ -388,6 +389,10 @@ class SelectOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -388,6 +389,10 @@ class SelectOpMaker : public framework::OpProtoAndCheckerMaker {
"(Int) The variable the sets the index of the case to execute, " "(Int) The variable the sets the index of the case to execute, "
"after evaluating the channels being sent to and received from") "after evaluating the channels being sent to and received from")
.AsDuplicable(); .AsDuplicable();
AddOutput(kOutputs,
"A set of variables, which will be assigned with values "
"generated by the operators inside the cases of Select Op.")
.AsDuplicable();
AddAttr<std::vector<std::string>>(kCases, AddAttr<std::vector<std::string>>(kCases,
"(String vector) Serialized list of" "(String vector) Serialized list of"
"all cases in the select op. Each" "all cases in the select op. Each"
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <ostream>
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/framework.pb.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h"
#include <future>
#include "paddle/fluid/operators/detail/grpc_client.h"
namespace paddle {
namespace operators {
class SendBarrierOp : public framework::OperatorBase {
public:
SendBarrierOp(const std::string& type,
const framework::VariableNameMap& inputs,
const framework::VariableNameMap& outputs,
const framework::AttributeMap& attrs)
: OperatorBase(type, inputs, outputs, attrs) {}
void RunImpl(const framework::Scope& scope,
const platform::Place& place) const override {
std::vector<std::string> eps = Attr<std::vector<std::string>>("endpoints");
auto client_var_name = Output("RPCClient");
PADDLE_ENFORCE_NOT_NULL(scope.FindVar(client_var_name),
"Can not find variable '%s' in the scope.",
client_var_name);
auto* client_var = scope.FindVar(client_var_name);
detail::RPCClient* rpc_client = client_var->GetMutable<detail::RPCClient>();
// need to wait before sending send_barrier message
PADDLE_ENFORCE(rpc_client->Wait());
for (auto& ep : eps) {
VLOG(3) << "send barrier, ep: " << ep;
rpc_client->AsyncSendBatchBarrier(ep);
}
PADDLE_ENFORCE(rpc_client->Wait());
}
};
class SendBarrierOpMaker : public framework::OpProtoAndCheckerMaker {
public:
SendBarrierOpMaker(OpProto* proto, OpAttrChecker* op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddOutput("RPCClient",
"(RPCClient) The RPC client object which is"
"initialized at most once.");
AddComment(R"DOC(
SendBarrier operator
This operator sends a barrier signal to the listen_and_serv op, so that
the parameter server knows all variables for this batch have been sent.
)DOC");
AddAttr<std::vector<std::string>>("endpoints",
"(string vector, default 127.0.0.1:6164)"
"Server endpoints to send variables to.")
.SetDefault({"127.0.0.1:6164"});
}
};
class SendBarrierOpVarTypeInference : public framework::VarTypeInference {
public:
void operator()(const framework::OpDesc& op_desc,
framework::BlockDesc* block) const override {
auto out_var_name = op_desc.Output("RPCClient").front();
auto& out_var = block->FindRecursiveOrCreateVar(out_var_name);
auto var_type = framework::proto::VarType::RAW;
out_var.SetType(var_type);
}
};
class SendBarrierOpShapeInference : public framework::InferShapeBase {
public:
void operator()(framework::InferShapeContext* ctx) const override {}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(send_barrier, ops::SendBarrierOp,
paddle::framework::EmptyGradOpMaker, ops::SendBarrierOpMaker,
ops::SendBarrierOpVarTypeInference,
ops::SendBarrierOpShapeInference);
...@@ -21,6 +21,7 @@ limitations under the License. */ ...@@ -21,6 +21,7 @@ limitations under the License. */
#include <future> #include <future>
#include "paddle/fluid/operators/detail/grpc_client.h" #include "paddle/fluid/operators/detail/grpc_client.h"
#include "paddle/fluid/platform/profiler.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -59,6 +60,9 @@ class SendOp : public framework::OperatorBase { ...@@ -59,6 +60,9 @@ class SendOp : public framework::OperatorBase {
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
auto& ctx = *pool.Get(place); auto& ctx = *pool.Get(place);
// For profiling
platform::RecordEvent record_event(Type(), &ctx);
auto client_var_name = Output("RPCClient"); auto client_var_name = Output("RPCClient");
PADDLE_ENFORCE_NOT_NULL(scope.FindVar(client_var_name), PADDLE_ENFORCE_NOT_NULL(scope.FindVar(client_var_name),
"Can not find variable '%s' in the scope.", "Can not find variable '%s' in the scope.",
...@@ -68,7 +72,7 @@ class SendOp : public framework::OperatorBase { ...@@ -68,7 +72,7 @@ class SendOp : public framework::OperatorBase {
for (size_t i = 0; i < ins.size(); i++) { for (size_t i = 0; i < ins.size(); i++) {
if (NeedSend(scope, ins[i])) { if (NeedSend(scope, ins[i])) {
VLOG(3) << "sending " << ins[i] << " to " << epmap[i]; VLOG(2) << "sending " << ins[i] << " to " << epmap[i];
rpc_client->AsyncSendVariable(epmap[i], ctx, scope, ins[i]); rpc_client->AsyncSendVariable(epmap[i], ctx, scope, ins[i]);
} else { } else {
VLOG(3) << "don't send no-initialied variable: " << ins[i]; VLOG(3) << "don't send no-initialied variable: " << ins[i];
...@@ -77,20 +81,20 @@ class SendOp : public framework::OperatorBase { ...@@ -77,20 +81,20 @@ class SendOp : public framework::OperatorBase {
PADDLE_ENFORCE(rpc_client->Wait()); PADDLE_ENFORCE(rpc_client->Wait());
for (auto& ep : endpoints) { for (auto& ep : endpoints) {
VLOG(3) << "batch barrier, ep: " << ep; VLOG(2) << "batch barrier, ep: " << ep;
rpc_client->AsyncSendBatchBarrier(ep); rpc_client->AsyncSendBatchBarrier(ep);
} }
PADDLE_ENFORCE(rpc_client->Wait()); PADDLE_ENFORCE(rpc_client->Wait());
if (outs.size() > 0) { if (outs.size() > 0) {
for (size_t i = 0; i < outs.size(); i++) { for (size_t i = 0; i < outs.size(); i++) {
VLOG(3) << "getting " << outs[i] << " from " << epmap[i]; VLOG(2) << "getting " << outs[i] << " from " << epmap[i];
rpc_client->AsyncGetVariable(epmap[i], ctx, scope, outs[i]); rpc_client->AsyncGetVariable(epmap[i], ctx, scope, outs[i]);
} }
PADDLE_ENFORCE(rpc_client->Wait()); PADDLE_ENFORCE(rpc_client->Wait());
// tell pservers that current trainer have called fetch // tell pservers that current trainer have called fetch
for (auto& ep : endpoints) { for (auto& ep : endpoints) {
VLOG(3) << "send fetch barrier, ep: " << ep; VLOG(2) << "send fetch barrier, ep: " << ep;
rpc_client->AsyncSendFetchBarrier(ep); rpc_client->AsyncSendFetchBarrier(ep);
} }
PADDLE_ENFORCE(rpc_client->Wait()); PADDLE_ENFORCE(rpc_client->Wait());
......
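Taken together, SendOp::RunImpl performs a fixed four-phase handshake per mini-batch: push gradients, signal a batch barrier, pull updated parameters, then signal a fetch barrier, waiting on the client between phases. The sketch below mirrors that order with a hypothetical Python client whose method names shadow detail::RPCClient (none of these Python names exist in fluid):

def send_round(rpc, ins, outs, epmap, endpoints):
    for var, ep in zip(ins, epmap):
        rpc.async_send_variable(ep, var)   # push gradients
    rpc.wait()
    for ep in endpoints:
        rpc.async_send_batch_barrier(ep)   # "all sends for this batch done"
    rpc.wait()
    for var, ep in zip(outs, epmap):
        rpc.async_get_variable(ep, var)    # pull updated parameters
    rpc.wait()
    for ep in endpoints:
        rpc.async_send_fetch_barrier(ep)   # "this trainer has fetched"
    rpc.wait()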
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <ostream>
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/framework.pb.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h"
#include <future>
#include "paddle/fluid/operators/detail/grpc_client.h"
namespace paddle {
namespace operators {
static bool NeedSend(const framework::Scope& scope,
const std::string& varname) {
auto* var = scope.FindVar(varname);
PADDLE_ENFORCE_NOT_NULL(var, "Cannot find variable '%s' on the send side.",
varname);
if (var->IsType<framework::LoDTensor>()) {
return var->Get<framework::LoDTensor>().IsInitialized();
} else if (var->IsType<framework::SelectedRows>()) {
return var->Get<framework::SelectedRows>().rows().size() > 0UL;
} else {
PADDLE_THROW(
"Variable type in send side should be in "
"[LodTensor, SelectedRows]");
}
return false;
}
class SendVarsOp : public framework::OperatorBase {
public:
SendVarsOp(const std::string& type, const framework::VariableNameMap& inputs,
const framework::VariableNameMap& outputs,
const framework::AttributeMap& attrs)
: OperatorBase(type, inputs, outputs, attrs) {}
void RunImpl(const framework::Scope& scope,
const platform::Place& place) const override {
auto ins = Inputs("X");
std::vector<std::string> epmap = Attr<std::vector<std::string>>("epmap");
int sync_send = Attr<int>("sync_send");
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
auto& ctx = *pool.Get(place);
auto client_var_name = Output("RPCClient");
PADDLE_ENFORCE_NOT_NULL(scope.FindVar(client_var_name),
"Can not find variable '%s' in the scope.",
client_var_name);
auto* client_var = scope.FindVar(client_var_name);
detail::RPCClient* rpc_client = client_var->GetMutable<detail::RPCClient>();
for (size_t i = 0; i < ins.size(); i++) {
if (NeedSend(scope, ins[i])) {
VLOG(3) << "sending " << ins[i] << " to " << epmap[i];
// TODO(Yancey1989): we need to use an IO threadpool which has
// a larger number of threads than the computing threadpool.
rpc_client->AsyncSendVariable(epmap[i], ctx, scope, ins[i]);
} else {
VLOG(3) << "don't send no-initialied variable: " << ins[i];
}
}
if (sync_send) {
rpc_client->Wait();
}
}
};
class SendVarsOpMaker : public framework::OpProtoAndCheckerMaker {
public:
SendVarsOpMaker(OpProto* proto, OpAttrChecker* op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("X", "(Tensor, SelectedRows) Input variables to be sent")
.AsDuplicable();
AddOutput("RPCClient",
"(RPCClient) The RPC client object which will be"
"initialized at most once.");
AddComment(R"DOC(
Send operator
This operator will send variables to listen_and_serve op at the parameter server.
)DOC");
AddAttr<int>("ync_send",
"(int, default 0)"
"sync send or async send.")
.SetDefault(0);
AddAttr<std::vector<std::string>>("epmap",
"(string vector, default 127.0.0.1:6164)"
"Server endpoints in the order of input "
"variables for mapping")
.SetDefault({"127.0.0.1:6164"});
}
};
class SendVarsOpVarTypeInference : public framework::VarTypeInference {
public:
void operator()(const framework::OpDesc& op_desc,
framework::BlockDesc* block) const override {
auto out_var_name = op_desc.Output("RPCClient").front();
auto& out_var = block->FindRecursiveOrCreateVar(out_var_name);
auto var_type = framework::proto::VarType::RAW;
out_var.SetType(var_type);
}
};
class SendVarsOpShapeInference : public framework::InferShapeBase {
public:
void operator()(framework::InferShapeContext* ctx) const override {}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(send_vars, ops::SendVarsOp,
paddle::framework::EmptyGradOpMaker, ops::SendVarsOpMaker,
ops::SendVarsOpVarTypeInference,
ops::SendVarsOpShapeInference);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/split_ids_op.h"
namespace paddle {
namespace operators {
class SplitIdsOpMaker : public framework::OpProtoAndCheckerMaker {
public:
SplitIdsOpMaker(OpProto *proto, OpAttrChecker *op_checker)
: OpProtoAndCheckerMaker(proto, op_checker) {
AddInput("Ids", "(LoDTensor) the input ids with shape{batch_num, 1}");
AddOutput("Out", "(LoDTensor) The outputs of the input Ids.")
.AsDuplicable();
AddComment(R"DOC(
Split a LoDTensor of Ids into multiple LoDTensors; the number of outputs equals the number of pservers.
Example:
Input:
X = [1,2,3,4,5,6]
Out(3 output):
out0 = [3, 6]
out1 = [1, 4]
out2 = [2, 5]
)DOC");
}
};
class SplitIdsOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("Ids"), "SplitIdsOp must have input Ids.");
PADDLE_ENFORCE(ctx->HasOutputs("Out"), "SplitIdsOp must have output Out.");
auto ids_var_type = ctx->GetInputsVarType("Ids").front();
PADDLE_ENFORCE_EQ(ids_var_type, framework::proto::VarType::LOD_TENSOR);
auto ids_dims = ctx->GetInputDim("Ids");
PADDLE_ENFORCE_EQ(ids_dims.size(), 2);
PADDLE_ENFORCE_EQ(ids_dims[1], 1);
}
};
class SplitIdsOpInferVarType : public framework::VarTypeInference {
public:
void operator()(const framework::OpDesc &op_desc,
framework::BlockDesc *block) const override {
for (auto &out_var : op_desc.Output("Out")) {
block->Var(out_var)->SetType(framework::proto::VarType::LOD_TENSOR);
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(split_ids, ops::SplitIdsOp, ops::SplitIdsOpMaker,
ops::SplitIdsOpInferVarType);
REGISTER_OP_CPU_KERNEL(
split_ids, ops::SplitIdsOpKernel<paddle::platform::CPUPlace, int64_t>);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/selected_rows_functor.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class SplitIdsOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto place = ctx.GetPlace();
if (!platform::is_cpu_place(place)) {
PADDLE_THROW("SplitIds do not support GPU kernel");
}
const auto* ids_t = ctx.Input<framework::LoDTensor>("Ids");
auto& ids_dims = ids_t->dims();
auto outs = ctx.MultiOutput<framework::LoDTensor>("Out");
const T* ids = ids_t->data<T>();
const size_t shard_num = outs.size();
std::vector<std::vector<T>> out_ids;
out_ids.resize(outs.size());
// Split ids into shards: each id goes to shard id % shard_num.
for (size_t i = 0; i < ids_dims[0]; ++i) {
T id = ids[i];
size_t shard_id = static_cast<size_t>(id) % shard_num;
out_ids[shard_id].push_back(id);
}
// create tensor for each shard and send to parameter server
for (size_t i = 0; i < out_ids.size(); ++i) {
auto* shard_t = outs[i];
const auto& ids = out_ids[i];  // avoid copying the shard's ids
auto* shard_data = shard_t->mutable_data<T>(
    framework::make_ddim({static_cast<int64_t>(ids.size()), 1}), place);
for (size_t j = 0; j < ids.size(); ++j) {
  shard_data[j] = ids[j];
}
}
}
};
} // namespace operators
} // namespace paddle
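The sharding rule in the kernel is plain modulo hashing: each id lands in shard id % shard_num, which reproduces the Out example in the op's DOC. In Python:

def split_ids(ids, shard_num):
    shards = [[] for _ in range(shard_num)]
    for i in ids:
        shards[i % shard_num].append(i)
    return shards

assert split_ids([1, 2, 3, 4, 5, 6], 3) == [[3, 6], [1, 4], [2, 5]]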
...@@ -49,7 +49,7 @@ nv_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_ ...@@ -49,7 +49,7 @@ nv_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_
nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda) nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda)
nv_test(transform_test SRCS transform_test.cu DEPS paddle_memory place device_context) nv_test(transform_test SRCS transform_test.cu DEPS paddle_memory place device_context)
cc_library(device_tracer SRCS device_tracer.cc DEPS profiler_proto ${GPU_CTX_DEPS}) cc_library(device_tracer SRCS device_tracer.cc DEPS boost profiler_proto ${GPU_CTX_DEPS})
cc_library(profiler SRCS profiler.cc DEPS device_context device_tracer) cc_library(profiler SRCS profiler.cc DEPS device_context device_tracer)
cc_test(profiler_test SRCS profiler_test.cc DEPS profiler) cc_test(profiler_test SRCS profiler_test.cc DEPS profiler)
......
if(WITH_PYTHON) if(WITH_PYTHON)
cc_library(paddle_pybind SHARED if(WITH_AMD_GPU)
SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc hip_library(paddle_pybind SHARED
DEPS pybind python backward proto_desc paddle_memory executor prune init profiler feed_fetch_method SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc
${GLOB_OP_LIB}) DEPS pybind python backward proto_desc paddle_memory executor prune init profiler feed_fetch_method
if(NOT APPLE AND NOT ANDROID) ${GLOB_OP_LIB})
target_link_libraries(paddle_pybind rt) else()
endif(NOT APPLE AND NOT ANDROID) cc_library(paddle_pybind SHARED
SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc
DEPS pybind python backward proto_desc paddle_memory executor prune init profiler feed_fetch_method
${GLOB_OP_LIB})
if(NOT APPLE AND NOT ANDROID)
target_link_libraries(paddle_pybind rt)
endif(NOT APPLE AND NOT ANDROID)
endif(WITH_AMD_GPU)
endif(WITH_PYTHON) endif(WITH_PYTHON)
...@@ -59,17 +59,10 @@ void* lapack_dso_handle = nullptr; ...@@ -59,17 +59,10 @@ void* lapack_dso_handle = nullptr;
} __name; // struct DynLoad__##__name } __name; // struct DynLoad__##__name
#endif #endif
#ifdef PADDLE_USE_ATLAS #define PADDLE_SGETRF LAPACKE_sgetrf
#define PADDLE_SGETRF clapack_sgetrf #define PADDLE_DGETRF LAPACKE_dgetrf
#define PADDLE_DGETRF clapack_dgetrf #define PADDLE_SGETRI LAPACKE_sgetri
#define PADDLE_SGETRI clapack_sgetri #define PADDLE_DGETRI LAPACKE_dgetri
#define PADDLE_DGETRI clapack_dgetri
#else
#define PADDLE_SGETRF LAPACKE_sgetrf
#define PADDLE_DGETRF LAPACKE_dgetrf
#define PADDLE_SGETRI LAPACKE_sgetri
#define PADDLE_DGETRI LAPACKE_dgetri
#endif
#define LAPACK_ROUTINE_EACH(__macro) \ #define LAPACK_ROUTINE_EACH(__macro) \
__macro(PADDLE_SGETRF) \ __macro(PADDLE_SGETRF) \
......
...@@ -21,7 +21,7 @@ limitations under the License. */ ...@@ -21,7 +21,7 @@ limitations under the License. */
#include <mkl_vml_functions.h> #include <mkl_vml_functions.h>
#endif #endif
#if defined(PADDLE_USE_ATLAS) || defined(PADDLE_USE_VECLIB) #if defined(PADDLE_USE_VECLIB)
extern "C" { extern "C" {
#include <cblas.h> #include <cblas.h>
#include <clapack.h> #include <clapack.h>
......
...@@ -37,6 +37,7 @@ function cmake_gen() { ...@@ -37,6 +37,7 @@ function cmake_gen() {
-DWITH_DSO=ON -DWITH_DSO=ON
-DWITH_DOC=${WITH_DOC:-OFF} -DWITH_DOC=${WITH_DOC:-OFF}
-DWITH_GPU=${WITH_GPU:-OFF} -DWITH_GPU=${WITH_GPU:-OFF}
-DWITH_AMD_GPU=${WITH_AMD_GPU:-OFF}
-DWITH_DISTRIBUTE=${WITH_DISTRIBUTE:-OFF} -DWITH_DISTRIBUTE=${WITH_DISTRIBUTE:-OFF}
-DWITH_MKL=${WITH_MKL:-ON} -DWITH_MKL=${WITH_MKL:-ON}
-DWITH_AVX=${WITH_AVX:-OFF} -DWITH_AVX=${WITH_AVX:-OFF}
...@@ -50,6 +51,7 @@ function cmake_gen() { ...@@ -50,6 +51,7 @@ function cmake_gen() {
-DWITH_STYLE_CHECK=${WITH_STYLE_CHECK:-ON} -DWITH_STYLE_CHECK=${WITH_STYLE_CHECK:-ON}
-DWITH_TESTING=${WITH_TESTING:-ON} -DWITH_TESTING=${WITH_TESTING:-ON}
-DWITH_FAST_BUNDLE_TEST=ON -DWITH_FAST_BUNDLE_TEST=ON
-DCMAKE_MODULE_PATH=/opt/rocm/hip/cmake
-DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
======================================== ========================================
EOF EOF
...@@ -62,6 +64,7 @@ EOF ...@@ -62,6 +64,7 @@ EOF
-DWITH_DSO=ON \ -DWITH_DSO=ON \
-DWITH_DOC=${WITH_DOC:-OFF} \ -DWITH_DOC=${WITH_DOC:-OFF} \
-DWITH_GPU=${WITH_GPU:-OFF} \ -DWITH_GPU=${WITH_GPU:-OFF} \
-DWITH_AMD_GPU=${WITH_AMD_GPU:-OFF} \
-DWITH_DISTRIBUTE=${WITH_DISTRIBUTE:-OFF} \ -DWITH_DISTRIBUTE=${WITH_DISTRIBUTE:-OFF} \
-DWITH_MKL=${WITH_MKL:-ON} \ -DWITH_MKL=${WITH_MKL:-ON} \
-DWITH_AVX=${WITH_AVX:-OFF} \ -DWITH_AVX=${WITH_AVX:-OFF} \
...@@ -74,6 +77,7 @@ EOF ...@@ -74,6 +77,7 @@ EOF
-DWITH_STYLE_CHECK=${WITH_STYLE_CHECK:-ON} \ -DWITH_STYLE_CHECK=${WITH_STYLE_CHECK:-ON} \
-DWITH_TESTING=${WITH_TESTING:-ON} \ -DWITH_TESTING=${WITH_TESTING:-ON} \
-DWITH_FAST_BUNDLE_TEST=ON \ -DWITH_FAST_BUNDLE_TEST=ON \
-DCMAKE_MODULE_PATH=/opt/rocm/hip/cmake \
-DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DCMAKE_EXPORT_COMPILE_COMMANDS=ON
} }
......
...@@ -12,7 +12,8 @@ ...@@ -12,7 +12,8 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
from layers.control_flow import BlockGuard, Select from layers.control_flow import BlockGuard, equal
from .framework import Operator
from layer_helper import LayerHelper, unique_name from layer_helper import LayerHelper, unique_name
from layers import fill_constant from layers import fill_constant
import core import core
...@@ -75,6 +76,206 @@ class Go(BlockGuard): ...@@ -75,6 +76,206 @@ class Go(BlockGuard):
attrs={'sub_block': go_block}) attrs={'sub_block': go_block})
class SelectCase(object):
DEFAULT = 0
SEND = 1
RECEIVE = 2
def __init__(self,
select,
case_idx,
case_to_execute,
channel_action_fn=None,
channel=None,
value=None,
is_copy=False):
self.select = select
self.helper = LayerHelper('conditional_block')
self.main_program = self.helper.main_program
self.is_scalar_condition = True
self.case_to_execute = case_to_execute
self.idx = case_idx
# Since we aren't going to use the `channel_send` or `channel_recv`
# functions directly, we just need to capture the name.
self.action = (self.SEND
if channel_action_fn.__name__ == ('channel_send') else
self.RECEIVE) if channel_action_fn else self.DEFAULT
X = value
if self.action == self.SEND and is_copy:
# We create a copy of the data we want to send
copied_X = self.select.parent_block.create_var(
name=unique_name.generate(value.name + '_copy'),
type=value.type,
dtype=value.dtype,
shape=value.shape,
lod_level=value.lod_level,
capacity=value.capacity
if hasattr(value, 'capacity') else None, )
self.select.parent_block.append_op(
type="assign", inputs={"X": value}, outputs={"Out": copied_X})
X = copied_X
self.value = X
self.channel = channel
def __enter__(self):
self.block = self.main_program.create_block()
def construct_op(self):
main_program = self.helper.main_program
cases_block = main_program.current_block()
inner_outputs = set()
input_set = set()
params = set()
for op in self.block.ops:
# Iterate over all operators, get all the inputs
# and add as input to the SelectCase operator.
for iname in op.input_names:
for in_var_name in op.input(iname):
if in_var_name not in inner_outputs:
input_set.add(in_var_name)
for oname in op.output_names:
for out_var_name in op.output(oname):
inner_outputs.add(out_var_name)
param_list = [
cases_block.var(each_name) for each_name in params
if each_name not in input_set
]
# Iterate over all operators, get all the outputs
# add to the output list of SelectCase operator only if
# they exist in the parent block.
out_vars = []
for inner_out_name in inner_outputs:
if inner_out_name in cases_block.vars:
out_vars.append(cases_block.var(inner_out_name))
# First, create an op that will determine whether or not this is the
# conditional variable to execute.
should_execute_block = equal(
fill_constant(
shape=[1], dtype=core.VarDesc.VarType.INT32, value=self.idx),
self.case_to_execute)
step_scope = cases_block.create_var(
type=core.VarDesc.VarType.STEP_SCOPES)
cases_block.append_op(
type='conditional_block',
inputs={'X': [should_execute_block],
'Params': param_list},
outputs={'Out': out_vars,
'Scope': [step_scope]},
attrs={
'sub_block': self.block,
'is_scalar_condition': self.is_scalar_condition
})
return '%s,%s,%s,%s' % (self.idx, self.action, self.channel.name
if self.channel else '', self.value.name
if self.value else '')
def __exit__(self, exc_type, exc_val, exc_tb):
self.main_program.rollback()
if exc_type is not None:
return False # re-raise exception
return True
class Select(BlockGuard):
def __init__(self, name=None):
self.helper = LayerHelper('select', name=name)
self.parent_block = self.helper.main_program.current_block()
self.cases = []
super(Select, self).__init__(self.helper.main_program)
self.case_to_execute = fill_constant(
shape=[1], dtype=core.VarDesc.VarType.INT32, value=-1)
def __enter__(self):
super(Select, self).__enter__()
return self
def case(self, channel_action_fn, channel, value, is_copy=False):
"""Create a new block for this condition.
"""
select_case = SelectCase(self,
len(self.cases), self.case_to_execute,
channel_action_fn, channel, value, is_copy)
self.cases.append(select_case)
return select_case
def default(self):
"""Create a default case block for this condition.
"""
default_case = SelectCase(self, len(self.cases), self.case_to_execute)
self.cases.append(default_case)
return default_case
def __exit__(self, exc_type, exc_val, exc_tb):
if exc_type is not None:
return False
# Create a select op and another block to wrap its
# case blocks.
select_block = self.helper.main_program.current_block()
parent_block = self.helper.main_program.block(select_block.parent_idx)
# Construct each case op, inside the newly created select block.
serialized_cases = []
for case in self.cases:
serialized_cases.append(case.construct_op())
intermediate = set()
params = set()
for case_block in select_block.ops:
if case_block.attrs and 'sub_block' in case_block.attrs:
for each_op in case_block.attrs['sub_block'].ops:
assert isinstance(each_op, Operator)
for iname in each_op.input_names:
for in_var_name in each_op.input(iname):
if in_var_name not in intermediate:
params.add(in_var_name)
for oname in each_op.output_names:
for out_var_name in each_op.output(oname):
intermediate.add(out_var_name)
out_list = [
parent_block.var(var_name) for var_name in parent_block.vars
if var_name in intermediate
]
X = [select_block.var_recursive(x_name) for x_name in params]
# Needs to be used by `equal` inside the cases block.
X.append(self.case_to_execute)
# Construct the select op.
parent_block.append_op(
type='select',
inputs={'X': X,
'case_to_execute': self.case_to_execute},
attrs={'sub_block': select_block,
'cases': serialized_cases},
outputs={'Out': out_list})
return super(Select, self).__exit__(exc_type, exc_val, exc_tb)
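A minimal sketch of driving the relocated Select from user code, assuming these names remain importable from paddle.fluid.concurrency as this file defines them; the dtype and the empty case bodies are illustrative only:

from paddle.fluid import core
from paddle.fluid.layers import fill_constant
from paddle.fluid.concurrency import Select, make_channel, channel_send

ch = make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR)
x = fill_constant(shape=[1], dtype=core.VarDesc.VarType.INT32, value=10)

with Select() as select:
    with select.case(channel_send, ch, x, is_copy=True):
        pass  # ops built here run when the send case is chosen
    with select.default():
        pass  # ops built here run when no channel is ready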
def make_channel(dtype, capacity=0): def make_channel(dtype, capacity=0):
""" """
Helps implementation of a concurrent program by creating a "channel" of Helps implementation of a concurrent program by creating a "channel" of
...@@ -159,35 +360,26 @@ def channel_send(channel, value, is_copy=False): ...@@ -159,35 +360,26 @@ def channel_send(channel, value, is_copy=False):
main_program = helper.main_program main_program = helper.main_program
channel_send_block = main_program.current_block() channel_send_block = main_program.current_block()
status = helper.create_variable(
name=unique_name.generate('status'),
type=core.VarDesc.VarType.LOD_TENSOR,
dtype=core.VarDesc.VarType.BOOL)
X = value X = value
if is_copy is True: if is_copy:
copied_X = helper.create_variable( copied_X = helper.create_variable(
name=unique_name.generate(value.name + '_copy'), name=unique_name.generate(value.name + '_copy'),
type=value.type, type=value.type,
dtype=value.dtype, dtype=value.dtype,
shape=value.shape, shape=value.shape,
lod_level=value.lod_level, lod_level=value.lod_level,
capacity=value.capacity) capacity=value.capacity if hasattr(value, 'capacity') else None)
assign_op = channel_send_block.append_op( assign_op = channel_send_block.append_op(
type="assign_op", inputs={"X": value}, outputs={"Out": copied_X}) type="assign", inputs={"X": value}, outputs={"Out": copied_X})
X = copied_X X = copied_X
channel_send_op = channel_send_block.append_op( channel_send_block.append_op(
type="channel_send", type="channel_send", inputs={
inputs={
"Channel": channel, "Channel": channel,
"X": X, "X": X,
}, })
outputs={"Status": status})
return status
def channel_recv(channel, return_value): def channel_recv(channel, return_value):
......
...@@ -565,6 +565,8 @@ class DistributeTranspiler: ...@@ -565,6 +565,8 @@ class DistributeTranspiler:
orig_var_name = "" orig_var_name = ""
if suff_idx >= 0: if suff_idx >= 0:
orig_var_name = varname[:suff_idx] orig_var_name = varname[:suff_idx]
else:
orig_var_name = varname
return orig_var_name return orig_var_name
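The added else branch makes the helper total: a name without the trainer suffix now passes through unchanged instead of collapsing to an empty string. A sketch of the behavior (the actual suffix marker is computed in the elided lines above; ".trainer_" is an assumption here):

def orig_varname(varname, marker=".trainer_"):
    suff_idx = varname.find(marker)
    if suff_idx >= 0:
        return varname[:suff_idx]
    return varname                       # the newly added branch

assert orig_varname("fc_0.w_0.trainer_0") == "fc_0.w_0"
assert orig_varname("fc_0.w_0") == "fc_0.w_0"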
def _append_pserver_ops(self, optimize_block, opt_op, endpoint, def _append_pserver_ops(self, optimize_block, opt_op, endpoint,
...@@ -579,7 +581,8 @@ class DistributeTranspiler: ...@@ -579,7 +581,8 @@ class DistributeTranspiler:
grad_block = None grad_block = None
for g in self.param_grad_ep_mapping[endpoint]["grads"]: for g in self.param_grad_ep_mapping[endpoint]["grads"]:
if same_or_split_var( if same_or_split_var(
self._orig_varname(g.name), opt_op.input(key)[0]): self._orig_varname(g.name),
self._orig_varname(opt_op.input(key)[0])):
grad_block = g grad_block = g
break break
if not grad_block: if not grad_block:
...@@ -750,7 +753,7 @@ class DistributeTranspiler: ...@@ -750,7 +753,7 @@ class DistributeTranspiler:
param_names = [ param_names = [
p.name for p in self.param_grad_ep_mapping[endpoint]["params"] p.name for p in self.param_grad_ep_mapping[endpoint]["params"]
] ]
if op.input("Param") in param_names: if op.input("Param")[0] in param_names:
return True return True
else: else:
for n in param_names: for n in param_names:
......
...@@ -403,6 +403,8 @@ class LayerHelper(object): ...@@ -403,6 +403,8 @@ class LayerHelper(object):
if 'use_mkldnn' in self.kwargs: if 'use_mkldnn' in self.kwargs:
act['use_mkldnn'] = self.kwargs.get('use_mkldnn') act['use_mkldnn'] = self.kwargs.get('use_mkldnn')
act_type = act.pop('type') act_type = act.pop('type')
if 'use_mkldnn' in self.kwargs:
act['use_mkldnn'] = self.kwargs.get('use_mkldnn')
self.append_op( self.append_op(
type=act_type, type=act_type,
inputs={"X": [input_var]}, inputs={"X": [input_var]},
......
...@@ -16,7 +16,7 @@ import contextlib ...@@ -16,7 +16,7 @@ import contextlib
from layer_function_generator import autodoc from layer_function_generator import autodoc
from tensor import assign, fill_constant from tensor import assign, fill_constant
from .. import core from .. import core
from ..framework import Program, Variable, Operator, Block from ..framework import Program, Variable, Operator
from ..layer_helper import LayerHelper, unique_name from ..layer_helper import LayerHelper, unique_name
from ops import logical_and, logical_not, logical_or from ops import logical_and, logical_not, logical_or
...@@ -29,7 +29,6 @@ __all__ = [ ...@@ -29,7 +29,6 @@ __all__ = [
'WhileGuard', 'WhileGuard',
'While', 'While',
'Switch', 'Switch',
'Select',
'lod_rank_table', 'lod_rank_table',
'max_sequence_len', 'max_sequence_len',
'topk', 'topk',
...@@ -1212,186 +1211,6 @@ class Switch(object): ...@@ -1212,186 +1211,6 @@ class Switch(object):
return True return True
class SelectCase(object):
DEFAULT = 0
SEND = 1
RECEIVE = 2
def __init__(self,
case_idx,
case_to_execute,
channel_action_fn=None,
channel=None,
value=None):
self.helper = LayerHelper('conditional_block')
self.main_program = self.helper.main_program
self.is_scalar_condition = True
self.case_to_execute = case_to_execute
self.idx = case_idx
# Since we aren't going to use the `channel_send` or `channel_recv`
# functions directly, we just need to capture the name.
self.action = (self.SEND
if channel_action_fn.__name__ == ('channel_send') else
self.RECEIVE) if channel_action_fn else (self.DEFAULT)
self.value = value
self.channel = channel
def __enter__(self):
self.block = self.main_program.create_block()
def construct_op(self):
main_program = self.helper.main_program
cases_block = main_program.current_block()
inner_outputs = set()
input_set = set()
params = set()
for op in self.block.ops:
# Iterate over all operators, get all the inputs
# and add as input to the SelectCase operator.
for iname in op.input_names:
for in_var_name in op.input(iname):
if in_var_name not in inner_outputs:
input_set.add(in_var_name)
for oname in op.output_names:
for out_var_name in op.output(oname):
inner_outputs.add(out_var_name)
param_list = [
cases_block.var(each_name) for each_name in params
if each_name not in input_set
]
# Iterate over all operators, get all the outputs
# add to the output list of SelectCase operator only if
# they exist in the parent block.
out_vars = []
for inner_out_name in inner_outputs:
if inner_out_name in cases_block.vars:
out_vars.append(cases_block.var(inner_out_name))
# First, create an op that will determine whether or not this is the
# conditional variable to execute.
should_execute_block = equal(
fill_constant(
shape=[1], dtype=core.VarDesc.VarType.INT32, value=self.idx),
self.case_to_execute)
step_scope = cases_block.create_var(
type=core.VarDesc.VarType.STEP_SCOPES)
cases_block.append_op(
type='conditional_block',
inputs={'X': [should_execute_block],
'Params': param_list},
outputs={'Out': out_vars,
'Scope': [step_scope]},
attrs={
'sub_block': self.block,
'is_scalar_condition': self.is_scalar_condition
})
return '%s,%s,%s,%s' % (self.idx, self.action, self.channel.name
if self.channel else '', self.value.name
if self.value else '')
def __exit__(self, exc_type, exc_val, exc_tb):
self.main_program.rollback()
if exc_type is not None:
return False # re-raise exception
return True
class Select(BlockGuard):
def __init__(self, name=None):
self.helper = LayerHelper('select', name=name)
self.cases = []
super(Select, self).__init__(self.helper.main_program)
self.case_to_execute = fill_constant(
shape=[1], dtype=core.VarDesc.VarType.INT32, value=-1)
def __enter__(self):
super(Select, self).__enter__()
return self
def case(self, channel_action_fn, channel, value):
"""Create a new block for this condition.
"""
select_case = SelectCase(
len(self.cases), self.case_to_execute, channel_action_fn, channel,
value)
self.cases.append(select_case)
return select_case
def default(self):
"""Create a default case block for this condition.
"""
default_case = SelectCase(len(self.cases), self.case_to_execute)
self.cases.append(default_case)
return default_case
def __exit__(self, exc_type, exc_val, exc_tb):
if exc_type is not None:
return False
# Create a select op and another block to wrap its
# case blocks.
select_block = self.helper.main_program.current_block()
parent_block = self.helper.main_program.block(select_block.parent_idx)
# Construct each case op, inside the newly created select block.
serialized_cases = []
for case in self.cases:
serialized_cases.append(case.construct_op())
intermediate = set()
params = set()
for case_block in select_block.ops:
if case_block.attrs and 'sub_block' in case_block.attrs:
for each_op in case_block.attrs['sub_block'].ops:
assert isinstance(each_op, Operator)
for iname in each_op.input_names:
for in_var_name in each_op.input(iname):
if in_var_name not in intermediate:
params.add(in_var_name)
for oname in each_op.output_names:
for out_var_name in each_op.output(oname):
intermediate.add(out_var_name)
# TODO(varunarora): Figure out if defining output is needed.
out_list = [
parent_block.var(var_name) for var_name in parent_block.vars
if var_name in intermediate
]
X = [select_block.var_recursive(x_name) for x_name in params]
# Needs to be used by `equal` inside the cases block.
X.append(self.case_to_execute)
# Construct the select op.
parent_block.append_op(
type='select',
inputs={'X': X,
'case_to_execute': self.case_to_execute},
attrs={'sub_block': select_block,
'cases': serialized_cases},
outputs={})
return super(Select, self).__exit__(exc_type, exc_val, exc_tb)
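For reference, the removed CSP-style Select API composed with channels roughly as in the following sketch (the channel setup is hypothetical; the pattern mirrors the test usage further down in this commit):

import paddle.fluid as fluid
from paddle.fluid import core

# Sketch only: how the Select/SelectCase classes deleted above were used.
ch = fluid.make_channel(dtype=core.VarDesc.VarType.LOD_TENSOR, capacity=1)
x = fluid.layers.fill_constant(shape=[1], dtype='int32', value=0)

with fluid.Select() as select:
    # Body runs if the send on `ch` is the case chosen to execute.
    with select.case(fluid.channel_send, ch, x):
        pass
    # Body runs if no channel action is ready.
    with select.default():
        pass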
class IfElseBlockGuard(object): class IfElseBlockGuard(object):
def __init__(self, is_true, ifelse): def __init__(self, is_true, ifelse):
if not isinstance(ifelse, IfElse): if not isinstance(ifelse, IfElse):
......
...@@ -133,6 +133,7 @@ def detection_output(loc, ...@@ -133,6 +133,7 @@ def detection_output(loc,
scores = nn.softmax(input=scores) scores = nn.softmax(input=scores)
scores = nn.reshape(x=scores, shape=old_shape) scores = nn.reshape(x=scores, shape=old_shape)
scores = nn.transpose(scores, perm=[0, 2, 1]) scores = nn.transpose(scores, perm=[0, 2, 1])
scores.stop_gradient = True
nmsed_outs = helper.create_tmp_variable(dtype=decoded_box.dtype) nmsed_outs = helper.create_tmp_variable(dtype=decoded_box.dtype)
helper.append_op( helper.append_op(
type="multiclass_nms", type="multiclass_nms",
...@@ -147,6 +148,7 @@ def detection_output(loc, ...@@ -147,6 +148,7 @@ def detection_output(loc,
'score_threshold': score_threshold, 'score_threshold': score_threshold,
'nms_eta': 1.0 'nms_eta': 1.0
}) })
nmsed_outs.stop_gradient = True
return nmsed_outs return nmsed_outs
...@@ -836,4 +838,6 @@ def multi_box_head(inputs, ...@@ -836,4 +838,6 @@ def multi_box_head(inputs,
mbox_locs_concat = tensor.concat(mbox_locs, axis=1) mbox_locs_concat = tensor.concat(mbox_locs, axis=1)
mbox_confs_concat = tensor.concat(mbox_confs, axis=1) mbox_confs_concat = tensor.concat(mbox_confs, axis=1)
box.stop_gradient = True
var.stop_gradient = True
return mbox_locs_concat, mbox_confs_concat, box, var return mbox_locs_concat, mbox_confs_concat, box, var
...@@ -113,9 +113,9 @@ class ListenAndServ(object): ...@@ -113,9 +113,9 @@ class ListenAndServ(object):
which can receive variables from clients and run a block. which can receive variables from clients and run a block.
""" """
def __init__(self, endpoint, fan_in=1, optimizer_mode=True): def __init__(self, endpoint, inputs, fan_in=1, optimizer_mode=True):
self.helper = LayerHelper("listen_and_serv") self.helper = LayerHelper("listen_and_serv")
self.inputs = [] self.inputs = inputs
self.outputs = [] self.outputs = []
self.endpoint = endpoint self.endpoint = endpoint
self.fan_in = fan_in self.fan_in = fan_in
...@@ -160,18 +160,13 @@ class ListenAndServ(object): ...@@ -160,18 +160,13 @@ class ListenAndServ(object):
current_block = main_program.current_block() current_block = main_program.current_block()
parent_block = self.parent_block() parent_block = self.parent_block()
params, grads = self.get_params_and_grads()
param_names = [p.name for p in params]
grad_names = [g.name for g in grads]
parent_block.append_op( parent_block.append_op(
type='listen_and_serv', type='listen_and_serv',
inputs={}, inputs={"X": self.inputs},
outputs={}, outputs={},
attrs={ attrs={
'endpoint': self.endpoint, 'endpoint': self.endpoint,
'Fanin': self.fan_in, 'Fanin': self.fan_in,
'ParamList': param_names,
'GradList': grad_names,
'OptimizeBlock': current_block 'OptimizeBlock': current_block
}) })
...@@ -196,10 +191,14 @@ def Send(endpoints, send_vars, get_vars): ...@@ -196,10 +191,14 @@ def Send(endpoints, send_vars, get_vars):
endpoints = list(set(epmap)) endpoints = list(set(epmap))
helper = LayerHelper("Send", **locals()) helper = LayerHelper("Send", **locals())
rpc_client_var = default_main_program().global_block().create_var(
name="RPC_CLIENT_VAR", persistable=True, type=core.VarDesc.VarType.RAW)
helper.append_op( helper.append_op(
type="send", type="send",
inputs={"X": send_vars}, inputs={"X": send_vars},
outputs={"Out": get_vars}, outputs={"Out": get_vars,
"RPCClient": rpc_client_var},
attrs={"endpoints": endpoints, attrs={"endpoints": endpoints,
"epmap": epmap}) "epmap": epmap})
......
...@@ -75,6 +75,7 @@ __all__ = [ ...@@ -75,6 +75,7 @@ __all__ = [
'autoincreased_step_counter', 'autoincreased_step_counter',
'reshape', 'reshape',
'lod_reset', 'lod_reset',
'lrn',
] ]
...@@ -3508,3 +3509,73 @@ def lod_reset(x, y=None, target_lod=None): ...@@ -3508,3 +3509,73 @@ def lod_reset(x, y=None, target_lod=None):
raise ValueError("y and target_lod should not be both None.") raise ValueError("y and target_lod should not be both None.")
return out return out
def lrn(input, n=5, k=1.0, alpha=1e-4, beta=0.75, name=None):
"""
Local Response Normalization Layer. This layer performs a type of
"lateral inhibition" by normalizing over local input regions.
The formula is as follows:
.. math::
Output(i, x, y) = Input(i, x, y) / \left(
k + \alpha \sum\limits^{\min(C, i + n/2)}_{j = \max(0, i - n/2)}
(Input(j, x, y))^2 \right)^{\beta}
In the above equation:
* :math:`n`: The number of channels to sum over.
* :math:`k`: The offset (avoid being divided by 0).
* :math:`\alpha`: The scaling parameter.
* :math:`\beta`: The exponent parameter.
Refer to `ImageNet Classification with Deep Convolutional Neural Networks
<https://papers.nips.cc/paper/4824-imagenet-classification-with-deep-convolutional-neural-networks.pdf>`_
Args:
input (Variable): The input tensor of this layer; the rank of the input tensor must be 4.
n (int, default 5): The number of channels to sum over.
k (float, default 1.0): An offset (usually positive to avoid dividing by 0).
alpha (float, default 1e-4): The scaling parameter.
beta (float, default 0.75): The exponent.
name (str, default None): A name for this operation.
Raises:
ValueError: If rank of the input tensor is not 4.
Returns:
A tensor variable storing the transformation result.
Examples:
.. code-block:: python
data = fluid.layers.data(name="data", shape=[3, 112, 112], dtype="float32")
lrn = fluid.layers.lrn(input=data)
"""
helper = LayerHelper('lrn', **locals())
dtype = helper.input_dtype()
input_shape = input.shape
dims = len(input_shape)
if dims != 4:
raise ValueError(
"dims of input must be 4(not %d), and it's order must be NCHW" %
(dims))
mid_out = helper.create_tmp_variable(dtype=dtype, stop_gradient=True)
lrn_out = helper.create_tmp_variable(dtype)
helper.append_op(
type="lrn",
inputs={"X": input},
outputs={
"Out": lrn_out,
"MidOut": mid_out,
},
attrs={"n": n,
"k": k,
"alpha": alpha,
"beta": beta})
return lrn_out
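As a sanity check on the docstring formula, here is a minimal NumPy sketch of the same normalization (the upper window bound is assumed inclusive, matching min(C, i + n/2)):

import numpy as np

def lrn_reference(x, n=5, k=1.0, alpha=1e-4, beta=0.75):
    # x is NCHW; each channel is normalized by the sum of squares over a
    # window of n neighboring channels, per the formula above.
    N, C, H, W = x.shape
    out = np.empty_like(x)
    squared = x ** 2
    for i in range(C):
        lo = max(0, i - n // 2)
        hi = min(C, i + n // 2 + 1)
        denom = (k + alpha * squared[:, lo:hi].sum(axis=1)) ** beta
        out[:, i] = x[:, i] / denom
    return out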
...@@ -173,16 +173,10 @@ class TestRoutineOp(unittest.TestCase): ...@@ -173,16 +173,10 @@ class TestRoutineOp(unittest.TestCase):
with while_op.block(): with while_op.block():
result2 = fill_constant( result2 = fill_constant(
shape=[1], dtype=core.VarDesc.VarType.INT32, value=0) shape=[1], dtype=core.VarDesc.VarType.INT32, value=0)
x_to_send_tmp = fill_constant(
shape=[1], dtype=core.VarDesc.VarType.INT32, value=0)
# TODO(abhinav): Need to perform copy when doing a channel send.
# Once this is complete, we can remove these lines
assign(input=x, output=x_to_send_tmp)
with fluid.Select() as select: with fluid.Select() as select:
with select.case(fluid.channel_send, channel, with select.case(
x_to_send_tmp): fluid.channel_send, channel, x, is_copy=True):
assign(input=x, output=x_tmp) assign(input=x, output=x_tmp)
assign(input=y, output=x) assign(input=y, output=x)
assign(elementwise_add(x=x_tmp, y=y), output=y) assign(elementwise_add(x=x_tmp, y=y), output=y)
...@@ -230,21 +224,12 @@ class TestRoutineOp(unittest.TestCase): ...@@ -230,21 +224,12 @@ class TestRoutineOp(unittest.TestCase):
core.VarDesc.VarType.LOD_TENSOR, core.VarDesc.VarType.LOD_TENSOR,
core.VarDesc.VarType.FP64) core.VarDesc.VarType.FP64)
pong_result = self._create_tensor('pong_return_value',
core.VarDesc.VarType.LOD_TENSOR,
core.VarDesc.VarType.FP64)
def ping(ch, message): def ping(ch, message):
message_to_send_tmp = fill_constant( fluid.channel_send(ch, message, is_copy=True)
shape=[1], dtype=core.VarDesc.VarType.FP64, value=0)
assign(input=message, output=message_to_send_tmp)
fluid.channel_send(ch, message_to_send_tmp)
def pong(ch1, ch2): def pong(ch1, ch2):
fluid.channel_recv(ch1, ping_result) fluid.channel_recv(ch1, ping_result)
assign(input=ping_result, output=pong_result) fluid.channel_send(ch2, ping_result, is_copy=True)
fluid.channel_send(ch2, pong_result)
pings = fluid.make_channel( pings = fluid.make_channel(
dtype=core.VarDesc.VarType.LOD_TENSOR, capacity=1) dtype=core.VarDesc.VarType.LOD_TENSOR, capacity=1)
......
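The new is_copy=True flag subsumes the temporary-variable dance deleted above; assuming channel_send clones the variable internally, the two patterns are equivalent:

# Before: copy into a temporary, then send the temporary.
x_to_send_tmp = fill_constant(shape=[1], dtype=core.VarDesc.VarType.INT32, value=0)
assign(input=x, output=x_to_send_tmp)
fluid.channel_send(channel, x_to_send_tmp)

# After: let channel_send perform the copy itself.
fluid.channel_send(channel, x, is_copy=True)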
...@@ -506,5 +506,54 @@ class TestSwish(OpTest): ...@@ -506,5 +506,54 @@ class TestSwish(OpTest):
self.check_grad(['X'], 'Out', max_relative_error=0.008) self.check_grad(['X'], 'Out', max_relative_error=0.008)
#--------------------test MKLDNN--------------------
class TestMKLDNNRelu(TestRelu):
def setUp(self):
super(TestMKLDNNRelu, self).setUp()
x = np.random.uniform(-1, 1, [2, 4, 3, 5]).astype("float32")
# The same reason as in TestAbs: keep inputs away from zero, where the gradient is ill-defined.
x[np.abs(x) < 0.005] = 0.02
out = np.maximum(x, 0)
self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)}
self.outputs = {'Out': out}
self.attrs = {"use_mkldnn": True}
class TestMKLDNNTanh(TestTanh):
def setUp(self):
super(TestMKLDNNTanh, self).setUp()
self.inputs = {
'X': np.random.uniform(0.1, 1, [2, 4, 3, 5]).astype("float32")
}
self.outputs = {'Out': np.tanh(self.inputs['X'])}
self.attrs = {"use_mkldnn": True}
class TestMKLDNNSqrt(TestSqrt):
def setUp(self):
super(TestMKLDNNSqrt, self).setUp()
self.inputs = {
'X': np.random.uniform(0.1, 1, [2, 4, 3, 5]).astype("float32")
}
self.outputs = {'Out': np.sqrt(self.inputs['X'])}
self.attrs = {"use_mkldnn": True}
class TestMKLDNNAbs(TestAbs):
def setUp(self):
super(TestMKLDNNAbs, self).setUp()
x = np.random.uniform(-1, 1, [2, 4, 3, 5]).astype("float32")
# The same reason as in TestAbs: avoid inputs near zero, where the gradient of abs is undefined.
x[np.abs(x) < 0.005] = 0.02
self.inputs = {'X': x}
self.outputs = {'Out': np.abs(self.inputs['X'])}
self.attrs = {"use_mkldnn": True}
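Each MKLDNN variant above reuses the base case's output and gradient checks and only flips the use_mkldnn attribute; assuming a TestSigmoid base exists in this file like the others, one more variant would be just:

class TestMKLDNNSigmoid(TestSigmoid):
    # Hypothetical sketch: inherit inputs, outputs and gradient checks,
    # and only switch the kernel selection via the use_mkldnn attr.
    def setUp(self):
        super(TestMKLDNNSigmoid, self).setUp()
        self.attrs = {"use_mkldnn": True}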
if __name__ == "__main__": if __name__ == "__main__":
unittest.main() unittest.main()
...@@ -20,19 +20,35 @@ from op_test import OpTest ...@@ -20,19 +20,35 @@ from op_test import OpTest
class TestConcatOp(OpTest): class TestConcatOp(OpTest):
def setUp(self): def setUp(self):
self.op_type = "concat" self.op_type = "concat"
x0 = np.random.random((2, 1, 4, 5)).astype('float32') self.init_test_data()
x1 = np.random.random((2, 2, 4, 5)).astype('float32') self.inputs = {'X': [('x0', self.x0), ('x1', self.x1), ('x2', self.x2)]}
x2 = np.random.random((2, 3, 4, 5)).astype('float32') self.attrs = {'axis': self.axis}
axis = 1 self.outputs = {
self.inputs = {'X': [('x0', x0), ('x1', x1), ('x2', x2)]} 'Out': np.concatenate(
self.attrs = {'axis': axis} (self.x0, self.x1, self.x2), axis=self.axis)
self.outputs = {'Out': np.concatenate((x0, x1, x2), axis=axis)} }
def test_check_output(self): def test_check_output(self):
self.check_output() self.check_output()
def test_check_grad(self): def test_check_grad(self):
self.check_grad(['x0'], 'Out') self.check_grad(['x0'], 'Out')
self.check_grad(['x1'], 'Out')
self.check_grad(['x2'], 'Out')
def init_test_data(self):
self.x0 = np.random.random((2, 1, 4, 5)).astype('float32')
self.x1 = np.random.random((2, 2, 4, 5)).astype('float32')
self.x2 = np.random.random((2, 3, 4, 5)).astype('float32')
self.axis = 1
class TestConcatOp2(OpTest):
def init_test_data(self):
self.x0 = np.random.random((2, 3, 4, 5)).astype('float32')
self.x1 = np.random.random((2, 3, 4, 5)).astype('float32')
self.x2 = np.random.random((2, 3, 4, 5)).astype('float32')
self.axis = 1
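With the data factored into init_test_data, further variants only override that hook; for example, a hypothetical case concatenating along axis 0:

class TestConcatOpAxis0(TestConcatOp):
    # Hypothetical variant: shapes differ along axis 0 instead of axis 1.
    def init_test_data(self):
        self.x0 = np.random.random((1, 3, 4, 5)).astype('float32')
        self.x1 = np.random.random((2, 3, 4, 5)).astype('float32')
        self.x2 = np.random.random((3, 3, 4, 5)).astype('float32')
        self.axis = 0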
if __name__ == '__main__': if __name__ == '__main__':
......
...@@ -231,6 +231,13 @@ class TestBook(unittest.TestCase): ...@@ -231,6 +231,13 @@ class TestBook(unittest.TestCase):
self.assertIsNotNone(layers.softmax(hid)) self.assertIsNotNone(layers.softmax(hid))
print(str(program)) print(str(program))
def test_lrn(self):
program = Program()
with program_guard(program):
data = layers.data(name='data', shape=[6, 2, 2], dtype='float32')
self.assertIsNotNone(layers.lrn(data))
print(str(program))
def test_get_places(self): def test_get_places(self):
program = Program() program = Program()
with program_guard(program): with program_guard(program):
......
...@@ -87,5 +87,34 @@ class TestLRNOp(OpTest): ...@@ -87,5 +87,34 @@ class TestLRNOp(OpTest):
self.check_grad(['X'], 'Out', max_relative_error=0.01) self.check_grad(['X'], 'Out', max_relative_error=0.01)
class TestLRNMKLDNNOp(TestLRNOp):
def get_attrs(self):
attrs = TestLRNOp.get_attrs(self)
attrs['use_mkldnn'] = True
return attrs
def test_check_output(self):
self.check_output(atol=0.002)
class TestLRNMKLDNNOpWithIsTest(TestLRNMKLDNNOp):
def get_attrs(self):
attrs = TestLRNMKLDNNOp.get_attrs(self)
attrs['is_test'] = True
return attrs
def test_check_grad_normal(self):
def check_raise_is_test():
try:
self.check_grad(['X'], 'Out', max_relative_error=0.01)
except Exception as e:
t = \
"is_test attribute should be set to False in training phase."
if t in str(e):
raise AttributeError
self.assertRaises(AttributeError, check_raise_is_test)
if __name__ == "__main__": if __name__ == "__main__":
unittest.main() unittest.main()
...@@ -38,14 +38,15 @@ class TestRecvOp(unittest.TestCase): ...@@ -38,14 +38,15 @@ class TestRecvOp(unittest.TestCase):
def init_serv(self, place): def init_serv(self, place):
main = fluid.Program() main = fluid.Program()
with fluid.program_guard(main): with fluid.program_guard(main):
x = layers.data( serv = layers.ListenAndServ(
shape=[32, 32], "127.0.0.1:6174", ["X"], optimizer_mode=False)
dtype='float32',
name="X",
append_batch_size=False)
fluid.initializer.Constant(value=1.0)(x, main.global_block())
serv = layers.ListenAndServ("127.0.0.1:6174", optimizer_mode=False)
with serv.do(): with serv.do():
x = layers.data(
shape=[32, 32],
dtype='float32',
name="X",
append_batch_size=False)
fluid.initializer.Constant(value=1.0)(x, main.global_block())
o = layers.scale(x=x, scale=10.0) o = layers.scale(x=x, scale=10.0)
main.global_block().create_var( main.global_block().create_var(
name=o.name, persistable=False, dtype=o.dtype, shape=o.shape) name=o.name, persistable=False, dtype=o.dtype, shape=o.shape)
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
from op_test import OpTest
class TestSplitIdsOp(OpTest):
def setUp(self):
self.op_type = "split_ids"
ids = np.array([[0], [2], [2], [3], [5], [5], [6]]).astype('int64')
out0 = np.array([[0], [3], [6]]).astype('int64')
out1 = np.array([[]]).astype('int64')
out2 = np.array([[2], [2], [5], [5]]).astype('int64')
self.inputs = {'Ids': ids}
self.outputs = {'Out': [('out0', out0), ('out1', out1), ('out2', out2)]}
def test_check_output(self):
self.check_output()
if __name__ == '__main__':
unittest.main()
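The expected outputs are consistent with sharding each id by modulo over the number of outputs; a reference sketch (the modulo rule is an assumption inferred from the fixture, not taken from the operator source):

import numpy as np

def split_ids_reference(ids, num_shards=3):
    # Assumed rule: id goes to shard (id % num_shards), input order kept.
    shards = [[] for _ in range(num_shards)]
    for v in ids.flatten():
        shards[int(v) % num_shards].append([int(v)])
    return [np.array(s if s else [[]], dtype='int64') for s in shards]

# Reproduces out0=[[0],[3],[6]], out1=[[]], out2=[[2],[2],[5],[5]] above.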
...@@ -126,7 +126,6 @@ class TestTensor(unittest.TestCase): ...@@ -126,7 +126,6 @@ class TestTensor(unittest.TestCase):
def test_lod_tensor_gpu_init(self): def test_lod_tensor_gpu_init(self):
if not core.is_compiled_with_cuda(): if not core.is_compiled_with_cuda():
return return
scope = core.Scope()
place = core.CUDAPlace(0) place = core.CUDAPlace(0)
lod_py = [[0, 2, 5], [0, 2, 4, 5]] lod_py = [[0, 2, 5], [0, 2, 4, 5]]
lod_tensor = core.LoDTensor() lod_tensor = core.LoDTensor()
...@@ -144,6 +143,25 @@ class TestTensor(unittest.TestCase): ...@@ -144,6 +143,25 @@ class TestTensor(unittest.TestCase):
self.assertAlmostEqual(2.0, lod_v[0, 0, 0, 1]) self.assertAlmostEqual(2.0, lod_v[0, 0, 0, 1])
self.assertListEqual(lod_py, lod_tensor.lod()) self.assertListEqual(lod_py, lod_tensor.lod())
def test_empty_tensor(self):
place = core.CPUPlace()
scope = core.Scope()
var = scope.var("test_tensor")
tensor = var.get_tensor()
tensor.set_dims([0, 1])
tensor.alloc_float(place)
tensor_array = numpy.array(tensor)
self.assertEqual((0, 1), tensor_array.shape)
if core.is_compiled_with_cuda():
gpu_place = core.CUDAPlace(0)
tensor.alloc_float(gpu_place)
tensor_array = numpy.array(tensor)
self.assertEqual((0, 1), tensor_array.shape)
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
...@@ -77,7 +77,7 @@ class SoftmaxActivation(BaseActivation): ...@@ -77,7 +77,7 @@ class SoftmaxActivation(BaseActivation):
.. math:: .. math::
P(y=j|x) = \\frac{e^{x_j}} {\\sum^K_{k=1} e^{x_j} } P(y=j|x) = \\frac{e^{x_j}} {\\sum^K_{k=1} e^{x_k} }
""" """
def __init__(self): def __init__(self):
......
...@@ -16,7 +16,7 @@ Creator package contains some simple reader creator, which could ...@@ -16,7 +16,7 @@ Creator package contains some simple reader creator, which could
be used in user program. be used in user program.
""" """
__all__ = ['np_array', 'text_file', "cloud_reader"] __all__ = ['np_array', 'text_file', 'recordio', 'cloud_reader']
def np_array(x): def np_array(x):
......